{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 87450,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005717552887364208,
"grad_norm": 9.984894752502441,
"learning_rate": 9.999969012132217e-07,
"loss": 1.852,
"step": 100
},
{
"epoch": 0.011435105774728416,
"grad_norm": 6.384902000427246,
"learning_rate": 9.999873506403478e-07,
"loss": 1.1916,
"step": 200
},
{
"epoch": 0.017152658662092625,
"grad_norm": 5.624429225921631,
"learning_rate": 9.999713471140152e-07,
"loss": 1.1066,
"step": 300
},
{
"epoch": 0.022870211549456832,
"grad_norm": 5.947996616363525,
"learning_rate": 9.99948890840769e-07,
"loss": 1.0538,
"step": 400
},
{
"epoch": 0.02858776443682104,
"grad_norm": 5.026174068450928,
"learning_rate": 9.999199821104353e-07,
"loss": 1.0143,
"step": 500
},
{
"epoch": 0.03430531732418525,
"grad_norm": 6.598933696746826,
"learning_rate": 9.99884621296117e-07,
"loss": 0.9908,
"step": 600
},
{
"epoch": 0.040022870211549454,
"grad_norm": 4.971883296966553,
"learning_rate": 9.99842808854189e-07,
"loss": 0.9857,
"step": 700
},
{
"epoch": 0.045740423098913664,
"grad_norm": 5.7355570793151855,
"learning_rate": 9.997945453242922e-07,
"loss": 0.9578,
"step": 800
},
{
"epoch": 0.051457975986277875,
"grad_norm": 5.731710910797119,
"learning_rate": 9.997398313293272e-07,
"loss": 0.9344,
"step": 900
},
{
"epoch": 0.05717552887364208,
"grad_norm": 6.716394901275635,
"learning_rate": 9.996786675754455e-07,
"loss": 0.9511,
"step": 1000
},
{
"epoch": 0.06289308176100629,
"grad_norm": 7.5202178955078125,
"learning_rate": 9.996110548520408e-07,
"loss": 0.9319,
"step": 1100
},
{
"epoch": 0.0686106346483705,
"grad_norm": 6.257505416870117,
"learning_rate": 9.995369940317388e-07,
"loss": 0.9303,
"step": 1200
},
{
"epoch": 0.07432818753573471,
"grad_norm": 8.514854431152344,
"learning_rate": 9.994564860703857e-07,
"loss": 0.9233,
"step": 1300
},
{
"epoch": 0.08004574042309891,
"grad_norm": 7.62768030166626,
"learning_rate": 9.993695320070358e-07,
"loss": 0.9061,
"step": 1400
},
{
"epoch": 0.08576329331046312,
"grad_norm": 7.282722473144531,
"learning_rate": 9.992761329639389e-07,
"loss": 0.9147,
"step": 1500
},
{
"epoch": 0.09148084619782733,
"grad_norm": 6.965019226074219,
"learning_rate": 9.991762901465247e-07,
"loss": 0.9013,
"step": 1600
},
{
"epoch": 0.09719839908519154,
"grad_norm": 8.01344108581543,
"learning_rate": 9.990700048433879e-07,
"loss": 0.8811,
"step": 1700
},
{
"epoch": 0.10291595197255575,
"grad_norm": 7.747630596160889,
"learning_rate": 9.989572784262714e-07,
"loss": 0.8737,
"step": 1800
},
{
"epoch": 0.10863350485991996,
"grad_norm": 6.288477897644043,
"learning_rate": 9.988381123500485e-07,
"loss": 0.8984,
"step": 1900
},
{
"epoch": 0.11435105774728416,
"grad_norm": 9.441058158874512,
"learning_rate": 9.987125081527047e-07,
"loss": 0.8659,
"step": 2000
},
{
"epoch": 0.12006861063464837,
"grad_norm": 7.924680233001709,
"learning_rate": 9.98580467455317e-07,
"loss": 0.884,
"step": 2100
},
{
"epoch": 0.12578616352201258,
"grad_norm": 7.8524322509765625,
"learning_rate": 9.984419919620333e-07,
"loss": 0.8653,
"step": 2200
},
{
"epoch": 0.13150371640937678,
"grad_norm": 7.980896949768066,
"learning_rate": 9.982970834600508e-07,
"loss": 0.8732,
"step": 2300
},
{
"epoch": 0.137221269296741,
"grad_norm": 9.265905380249023,
"learning_rate": 9.981457438195925e-07,
"loss": 0.8934,
"step": 2400
},
{
"epoch": 0.1429388221841052,
"grad_norm": 7.617094039916992,
"learning_rate": 9.97987974993883e-07,
"loss": 0.8902,
"step": 2500
},
{
"epoch": 0.14865637507146942,
"grad_norm": 9.14963436126709,
"learning_rate": 9.978237790191236e-07,
"loss": 0.887,
"step": 2600
},
{
"epoch": 0.15437392795883362,
"grad_norm": 8.893723487854004,
"learning_rate": 9.97653158014466e-07,
"loss": 0.8648,
"step": 2700
},
{
"epoch": 0.16009148084619781,
"grad_norm": 9.325992584228516,
"learning_rate": 9.974761141819848e-07,
"loss": 0.8782,
"step": 2800
},
{
"epoch": 0.16580903373356204,
"grad_norm": 9.336038589477539,
"learning_rate": 9.972926498066484e-07,
"loss": 0.8657,
"step": 2900
},
{
"epoch": 0.17152658662092624,
"grad_norm": 9.833565711975098,
"learning_rate": 9.971027672562918e-07,
"loss": 0.8674,
"step": 3000
},
{
"epoch": 0.17724413950829046,
"grad_norm": 8.262983322143555,
"learning_rate": 9.969064689815828e-07,
"loss": 0.8531,
"step": 3100
},
{
"epoch": 0.18296169239565466,
"grad_norm": 8.745858192443848,
"learning_rate": 9.967037575159929e-07,
"loss": 0.9019,
"step": 3200
},
{
"epoch": 0.18867924528301888,
"grad_norm": 9.545065879821777,
"learning_rate": 9.964946354757638e-07,
"loss": 0.8313,
"step": 3300
},
{
"epoch": 0.19439679817038308,
"grad_norm": 10.856668472290039,
"learning_rate": 9.962791055598731e-07,
"loss": 0.8339,
"step": 3400
},
{
"epoch": 0.20011435105774728,
"grad_norm": 8.230608940124512,
"learning_rate": 9.960571705500005e-07,
"loss": 0.8456,
"step": 3500
},
{
"epoch": 0.2058319039451115,
"grad_norm": 10.353604316711426,
"learning_rate": 9.958288333104907e-07,
"loss": 0.8514,
"step": 3600
},
{
"epoch": 0.2115494568324757,
"grad_norm": 9.70090389251709,
"learning_rate": 9.95594096788318e-07,
"loss": 0.872,
"step": 3700
},
{
"epoch": 0.21726700971983992,
"grad_norm": 8.226770401000977,
"learning_rate": 9.953529640130459e-07,
"loss": 0.8532,
"step": 3800
},
{
"epoch": 0.22298456260720412,
"grad_norm": 10.802261352539062,
"learning_rate": 9.95105438096791e-07,
"loss": 0.8183,
"step": 3900
},
{
"epoch": 0.22870211549456831,
"grad_norm": 9.883532524108887,
"learning_rate": 9.948515222341802e-07,
"loss": 0.8244,
"step": 4000
},
{
"epoch": 0.23441966838193254,
"grad_norm": 9.516398429870605,
"learning_rate": 9.94591219702311e-07,
"loss": 0.8464,
"step": 4100
},
{
"epoch": 0.24013722126929674,
"grad_norm": 9.146637916564941,
"learning_rate": 9.943245338607086e-07,
"loss": 0.838,
"step": 4200
},
{
"epoch": 0.24585477415666096,
"grad_norm": 9.489537239074707,
"learning_rate": 9.94051468151283e-07,
"loss": 0.8245,
"step": 4300
},
{
"epoch": 0.25157232704402516,
"grad_norm": 9.833827018737793,
"learning_rate": 9.937720260982834e-07,
"loss": 0.8361,
"step": 4400
},
{
"epoch": 0.25728987993138935,
"grad_norm": 13.346197128295898,
"learning_rate": 9.934862113082547e-07,
"loss": 0.8464,
"step": 4500
},
{
"epoch": 0.26300743281875355,
"grad_norm": 8.923548698425293,
"learning_rate": 9.93194027469989e-07,
"loss": 0.8166,
"step": 4600
},
{
"epoch": 0.2687249857061178,
"grad_norm": 8.640154838562012,
"learning_rate": 9.928954783544794e-07,
"loss": 0.8253,
"step": 4700
},
{
"epoch": 0.274442538593482,
"grad_norm": 12.083985328674316,
"learning_rate": 9.9259056781487e-07,
"loss": 0.8319,
"step": 4800
},
{
"epoch": 0.2801600914808462,
"grad_norm": 8.075922012329102,
"learning_rate": 9.92279299786408e-07,
"loss": 0.8371,
"step": 4900
},
{
"epoch": 0.2858776443682104,
"grad_norm": 8.545024871826172,
"learning_rate": 9.919616782863908e-07,
"loss": 0.8319,
"step": 5000
},
{
"epoch": 0.2915951972555746,
"grad_norm": 8.820647239685059,
"learning_rate": 9.916377074141157e-07,
"loss": 0.8474,
"step": 5100
},
{
"epoch": 0.29731275014293884,
"grad_norm": 8.804427146911621,
"learning_rate": 9.913073913508266e-07,
"loss": 0.8183,
"step": 5200
},
{
"epoch": 0.30303030303030304,
"grad_norm": 8.17628002166748,
"learning_rate": 9.909707343596596e-07,
"loss": 0.8122,
"step": 5300
},
{
"epoch": 0.30874785591766724,
"grad_norm": 9.780845642089844,
"learning_rate": 9.906277407855883e-07,
"loss": 0.8329,
"step": 5400
},
{
"epoch": 0.31446540880503143,
"grad_norm": 9.164664268493652,
"learning_rate": 9.902784150553678e-07,
"loss": 0.805,
"step": 5500
},
{
"epoch": 0.32018296169239563,
"grad_norm": 10.00500774383545,
"learning_rate": 9.899227616774776e-07,
"loss": 0.823,
"step": 5600
},
{
"epoch": 0.3259005145797599,
"grad_norm": 10.257847785949707,
"learning_rate": 9.895607852420636e-07,
"loss": 0.8209,
"step": 5700
},
{
"epoch": 0.3316180674671241,
"grad_norm": 12.896175384521484,
"learning_rate": 9.891924904208774e-07,
"loss": 0.815,
"step": 5800
},
{
"epoch": 0.3373356203544883,
"grad_norm": 12.980055809020996,
"learning_rate": 9.888178819672186e-07,
"loss": 0.8061,
"step": 5900
},
{
"epoch": 0.34305317324185247,
"grad_norm": 9.492401123046875,
"learning_rate": 9.884369647158711e-07,
"loss": 0.8187,
"step": 6000
},
{
"epoch": 0.34877072612921667,
"grad_norm": 10.031133651733398,
"learning_rate": 9.880497435830418e-07,
"loss": 0.8317,
"step": 6100
},
{
"epoch": 0.3544882790165809,
"grad_norm": 10.923727989196777,
"learning_rate": 9.87656223566297e-07,
"loss": 0.8279,
"step": 6200
},
{
"epoch": 0.3602058319039451,
"grad_norm": 11.254301071166992,
"learning_rate": 9.872564097444981e-07,
"loss": 0.8274,
"step": 6300
},
{
"epoch": 0.3659233847913093,
"grad_norm": 11.593939781188965,
"learning_rate": 9.868503072777356e-07,
"loss": 0.8018,
"step": 6400
},
{
"epoch": 0.3716409376786735,
"grad_norm": 12.787039756774902,
"learning_rate": 9.864379214072626e-07,
"loss": 0.8114,
"step": 6500
},
{
"epoch": 0.37735849056603776,
"grad_norm": 9.75014877319336,
"learning_rate": 9.860192574554274e-07,
"loss": 0.8398,
"step": 6600
},
{
"epoch": 0.38307604345340196,
"grad_norm": 9.99585247039795,
"learning_rate": 9.855943208256046e-07,
"loss": 0.8166,
"step": 6700
},
{
"epoch": 0.38879359634076616,
"grad_norm": 9.503119468688965,
"learning_rate": 9.851631170021257e-07,
"loss": 0.7923,
"step": 6800
},
{
"epoch": 0.39451114922813035,
"grad_norm": 9.30905818939209,
"learning_rate": 9.84725651550208e-07,
"loss": 0.8128,
"step": 6900
},
{
"epoch": 0.40022870211549455,
"grad_norm": 12.592491149902344,
"learning_rate": 9.842819301158825e-07,
"loss": 0.8064,
"step": 7000
},
{
"epoch": 0.4059462550028588,
"grad_norm": 11.191341400146484,
"learning_rate": 9.838319584259217e-07,
"loss": 0.7924,
"step": 7100
},
{
"epoch": 0.411663807890223,
"grad_norm": 9.108357429504395,
"learning_rate": 9.833757422877653e-07,
"loss": 0.7678,
"step": 7200
},
{
"epoch": 0.4173813607775872,
"grad_norm": 9.786158561706543,
"learning_rate": 9.829132875894453e-07,
"loss": 0.8139,
"step": 7300
},
{
"epoch": 0.4230989136649514,
"grad_norm": 9.721033096313477,
"learning_rate": 9.8244460029951e-07,
"loss": 0.8294,
"step": 7400
},
{
"epoch": 0.4288164665523156,
"grad_norm": 12.101911544799805,
"learning_rate": 9.819696864669466e-07,
"loss": 0.8122,
"step": 7500
},
{
"epoch": 0.43453401943967984,
"grad_norm": 10.104898452758789,
"learning_rate": 9.814885522211044e-07,
"loss": 0.7911,
"step": 7600
},
{
"epoch": 0.44025157232704404,
"grad_norm": 10.600286483764648,
"learning_rate": 9.810012037716142e-07,
"loss": 0.8108,
"step": 7700
},
{
"epoch": 0.44596912521440824,
"grad_norm": 9.540319442749023,
"learning_rate": 9.805076474083085e-07,
"loss": 0.8296,
"step": 7800
},
{
"epoch": 0.45168667810177243,
"grad_norm": 8.930343627929688,
"learning_rate": 9.800078895011414e-07,
"loss": 0.8172,
"step": 7900
},
{
"epoch": 0.45740423098913663,
"grad_norm": 10.50644302368164,
"learning_rate": 9.795019365001047e-07,
"loss": 0.8063,
"step": 8000
},
{
"epoch": 0.4631217838765009,
"grad_norm": 10.2007417678833,
"learning_rate": 9.789897949351463e-07,
"loss": 0.8141,
"step": 8100
},
{
"epoch": 0.4688393367638651,
"grad_norm": 9.180241584777832,
"learning_rate": 9.784714714160844e-07,
"loss": 0.7992,
"step": 8200
},
{
"epoch": 0.4745568896512293,
"grad_norm": 13.358614921569824,
"learning_rate": 9.779469726325235e-07,
"loss": 0.7994,
"step": 8300
},
{
"epoch": 0.48027444253859347,
"grad_norm": 8.851704597473145,
"learning_rate": 9.774163053537675e-07,
"loss": 0.8179,
"step": 8400
},
{
"epoch": 0.48599199542595767,
"grad_norm": 11.237141609191895,
"learning_rate": 9.768794764287319e-07,
"loss": 0.7957,
"step": 8500
},
{
"epoch": 0.4917095483133219,
"grad_norm": 12.960529327392578,
"learning_rate": 9.76336492785856e-07,
"loss": 0.7971,
"step": 8600
},
{
"epoch": 0.4974271012006861,
"grad_norm": 15.1110258102417,
"learning_rate": 9.75787361433014e-07,
"loss": 0.8132,
"step": 8700
},
{
"epoch": 0.5031446540880503,
"grad_norm": 11.033527374267578,
"learning_rate": 9.752320894574232e-07,
"loss": 0.8141,
"step": 8800
},
{
"epoch": 0.5088622069754145,
"grad_norm": 14.08973503112793,
"learning_rate": 9.74670684025553e-07,
"loss": 0.801,
"step": 8900
},
{
"epoch": 0.5145797598627787,
"grad_norm": 11.95508098602295,
"learning_rate": 9.74103152383033e-07,
"loss": 0.8061,
"step": 9000
},
{
"epoch": 0.5202973127501429,
"grad_norm": 11.355822563171387,
"learning_rate": 9.73529501854559e-07,
"loss": 0.7923,
"step": 9100
},
{
"epoch": 0.5260148656375071,
"grad_norm": 12.26524829864502,
"learning_rate": 9.729497398437991e-07,
"loss": 0.7977,
"step": 9200
},
{
"epoch": 0.5317324185248714,
"grad_norm": 9.304167747497559,
"learning_rate": 9.723638738332967e-07,
"loss": 0.7951,
"step": 9300
},
{
"epoch": 0.5374499714122356,
"grad_norm": 14.560432434082031,
"learning_rate": 9.71771911384375e-07,
"loss": 0.7921,
"step": 9400
},
{
"epoch": 0.5431675242995998,
"grad_norm": 9.710556983947754,
"learning_rate": 9.711738601370406e-07,
"loss": 0.796,
"step": 9500
},
{
"epoch": 0.548885077186964,
"grad_norm": 11.41674518585205,
"learning_rate": 9.705697278098815e-07,
"loss": 0.8159,
"step": 9600
},
{
"epoch": 0.5546026300743282,
"grad_norm": 10.389808654785156,
"learning_rate": 9.69959522199971e-07,
"loss": 0.8113,
"step": 9700
},
{
"epoch": 0.5603201829616924,
"grad_norm": 10.79917049407959,
"learning_rate": 9.69343251182765e-07,
"loss": 0.7919,
"step": 9800
},
{
"epoch": 0.5660377358490566,
"grad_norm": 13.113627433776855,
"learning_rate": 9.687209227120013e-07,
"loss": 0.7975,
"step": 9900
},
{
"epoch": 0.5717552887364208,
"grad_norm": 13.61053466796875,
"learning_rate": 9.68092544819596e-07,
"loss": 0.8035,
"step": 10000
},
{
"epoch": 0.577472841623785,
"grad_norm": 12.462188720703125,
"learning_rate": 9.674581256155413e-07,
"loss": 0.7696,
"step": 10100
},
{
"epoch": 0.5831903945111492,
"grad_norm": 10.420384407043457,
"learning_rate": 9.668176732877992e-07,
"loss": 0.7682,
"step": 10200
},
{
"epoch": 0.5889079473985135,
"grad_norm": 10.863661766052246,
"learning_rate": 9.661711961021971e-07,
"loss": 0.7512,
"step": 10300
},
{
"epoch": 0.5946255002858777,
"grad_norm": 11.13371753692627,
"learning_rate": 9.655187024023205e-07,
"loss": 0.7939,
"step": 10400
},
{
"epoch": 0.6003430531732419,
"grad_norm": 10.809890747070312,
"learning_rate": 9.648602006094056e-07,
"loss": 0.7964,
"step": 10500
},
{
"epoch": 0.6060606060606061,
"grad_norm": 10.5729398727417,
"learning_rate": 9.641956992222297e-07,
"loss": 0.8021,
"step": 10600
},
{
"epoch": 0.6117781589479703,
"grad_norm": 10.581096649169922,
"learning_rate": 9.635252068170032e-07,
"loss": 0.7723,
"step": 10700
},
{
"epoch": 0.6174957118353345,
"grad_norm": 9.770636558532715,
"learning_rate": 9.628487320472575e-07,
"loss": 0.7809,
"step": 10800
},
{
"epoch": 0.6232132647226987,
"grad_norm": 11.409103393554688,
"learning_rate": 9.621662836437339e-07,
"loss": 0.8104,
"step": 10900
},
{
"epoch": 0.6289308176100629,
"grad_norm": 11.642632484436035,
"learning_rate": 9.61477870414271e-07,
"loss": 0.7712,
"step": 11000
},
{
"epoch": 0.6346483704974271,
"grad_norm": 11.671961784362793,
"learning_rate": 9.607835012436903e-07,
"loss": 0.7691,
"step": 11100
},
{
"epoch": 0.6403659233847913,
"grad_norm": 12.849811553955078,
"learning_rate": 9.600831850936823e-07,
"loss": 0.8127,
"step": 11200
},
{
"epoch": 0.6460834762721556,
"grad_norm": 12.369158744812012,
"learning_rate": 9.593769310026914e-07,
"loss": 0.7885,
"step": 11300
},
{
"epoch": 0.6518010291595198,
"grad_norm": 11.423858642578125,
"learning_rate": 9.58664748085797e-07,
"loss": 0.7764,
"step": 11400
},
{
"epoch": 0.657518582046884,
"grad_norm": 13.777689933776855,
"learning_rate": 9.579466455345984e-07,
"loss": 0.7833,
"step": 11500
},
{
"epoch": 0.6632361349342482,
"grad_norm": 12.836466789245605,
"learning_rate": 9.572226326170947e-07,
"loss": 0.7872,
"step": 11600
},
{
"epoch": 0.6689536878216124,
"grad_norm": 11.667618751525879,
"learning_rate": 9.564927186775657e-07,
"loss": 0.7793,
"step": 11700
},
{
"epoch": 0.6746712407089765,
"grad_norm": 11.850467681884766,
"learning_rate": 9.557569131364512e-07,
"loss": 0.7635,
"step": 11800
},
{
"epoch": 0.6803887935963407,
"grad_norm": 11.954856872558594,
"learning_rate": 9.550152254902288e-07,
"loss": 0.7885,
"step": 11900
},
{
"epoch": 0.6861063464837049,
"grad_norm": 12.209726333618164,
"learning_rate": 9.54267665311293e-07,
"loss": 0.8116,
"step": 12000
},
{
"epoch": 0.6918238993710691,
"grad_norm": 15.221282005310059,
"learning_rate": 9.535142422478295e-07,
"loss": 0.7969,
"step": 12100
},
{
"epoch": 0.6975414522584333,
"grad_norm": 13.073972702026367,
"learning_rate": 9.527549660236924e-07,
"loss": 0.775,
"step": 12200
},
{
"epoch": 0.7032590051457976,
"grad_norm": 11.428675651550293,
"learning_rate": 9.519898464382779e-07,
"loss": 0.7997,
"step": 12300
},
{
"epoch": 0.7089765580331618,
"grad_norm": 12.806885719299316,
"learning_rate": 9.512188933663979e-07,
"loss": 0.7469,
"step": 12400
},
{
"epoch": 0.714694110920526,
"grad_norm": 11.91589641571045,
"learning_rate": 9.504421167581529e-07,
"loss": 0.7899,
"step": 12500
},
{
"epoch": 0.7204116638078902,
"grad_norm": 14.660259246826172,
"learning_rate": 9.496595266388027e-07,
"loss": 0.7725,
"step": 12600
},
{
"epoch": 0.7261292166952544,
"grad_norm": 12.04018783569336,
"learning_rate": 9.488711331086387e-07,
"loss": 0.7732,
"step": 12700
},
{
"epoch": 0.7318467695826186,
"grad_norm": 12.394238471984863,
"learning_rate": 9.480769463428513e-07,
"loss": 0.8006,
"step": 12800
},
{
"epoch": 0.7375643224699828,
"grad_norm": 10.931447982788086,
"learning_rate": 9.472769765914003e-07,
"loss": 0.7648,
"step": 12900
},
{
"epoch": 0.743281875357347,
"grad_norm": 12.420381546020508,
"learning_rate": 9.464712341788826e-07,
"loss": 0.772,
"step": 13000
},
{
"epoch": 0.7489994282447112,
"grad_norm": 11.603715896606445,
"learning_rate": 9.456597295043971e-07,
"loss": 0.786,
"step": 13100
},
{
"epoch": 0.7547169811320755,
"grad_norm": 11.238115310668945,
"learning_rate": 9.448424730414131e-07,
"loss": 0.7662,
"step": 13200
},
{
"epoch": 0.7604345340194397,
"grad_norm": 13.849480628967285,
"learning_rate": 9.440194753376332e-07,
"loss": 0.7558,
"step": 13300
},
{
"epoch": 0.7661520869068039,
"grad_norm": 13.335151672363281,
"learning_rate": 9.431907470148577e-07,
"loss": 0.7562,
"step": 13400
},
{
"epoch": 0.7718696397941681,
"grad_norm": 13.283151626586914,
"learning_rate": 9.423562987688478e-07,
"loss": 0.7767,
"step": 13500
},
{
"epoch": 0.7775871926815323,
"grad_norm": 11.497271537780762,
"learning_rate": 9.415161413691875e-07,
"loss": 0.7544,
"step": 13600
},
{
"epoch": 0.7833047455688965,
"grad_norm": 13.979777336120605,
"learning_rate": 9.406702856591441e-07,
"loss": 0.7311,
"step": 13700
},
{
"epoch": 0.7890222984562607,
"grad_norm": 13.748780250549316,
"learning_rate": 9.398187425555291e-07,
"loss": 0.7717,
"step": 13800
},
{
"epoch": 0.7947398513436249,
"grad_norm": 11.056880950927734,
"learning_rate": 9.389615230485564e-07,
"loss": 0.7417,
"step": 13900
},
{
"epoch": 0.8004574042309891,
"grad_norm": 11.314404487609863,
"learning_rate": 9.380986382017011e-07,
"loss": 0.7675,
"step": 14000
},
{
"epoch": 0.8061749571183533,
"grad_norm": 10.690199851989746,
"learning_rate": 9.372300991515565e-07,
"loss": 0.7664,
"step": 14100
},
{
"epoch": 0.8118925100057176,
"grad_norm": 14.313260078430176,
"learning_rate": 9.363559171076902e-07,
"loss": 0.7533,
"step": 14200
},
{
"epoch": 0.8176100628930818,
"grad_norm": 13.0926513671875,
"learning_rate": 9.354761033524999e-07,
"loss": 0.7845,
"step": 14300
},
{
"epoch": 0.823327615780446,
"grad_norm": 13.171210289001465,
"learning_rate": 9.345906692410671e-07,
"loss": 0.7785,
"step": 14400
},
{
"epoch": 0.8290451686678102,
"grad_norm": 11.313733100891113,
"learning_rate": 9.336996262010113e-07,
"loss": 0.7562,
"step": 14500
},
{
"epoch": 0.8347627215551744,
"grad_norm": 13.027429580688477,
"learning_rate": 9.328029857323418e-07,
"loss": 0.764,
"step": 14600
},
{
"epoch": 0.8404802744425386,
"grad_norm": 11.618389129638672,
"learning_rate": 9.319007594073099e-07,
"loss": 0.7552,
"step": 14700
},
{
"epoch": 0.8461978273299028,
"grad_norm": 12.712207794189453,
"learning_rate": 9.309929588702592e-07,
"loss": 0.7638,
"step": 14800
},
{
"epoch": 0.851915380217267,
"grad_norm": 14.38819408416748,
"learning_rate": 9.300795958374752e-07,
"loss": 0.7716,
"step": 14900
},
{
"epoch": 0.8576329331046312,
"grad_norm": 13.714140892028809,
"learning_rate": 9.291606820970345e-07,
"loss": 0.778,
"step": 15000
},
{
"epoch": 0.8633504859919954,
"grad_norm": 10.53870964050293,
"learning_rate": 9.282362295086525e-07,
"loss": 0.782,
"step": 15100
},
{
"epoch": 0.8690680388793597,
"grad_norm": 20.9774227142334,
"learning_rate": 9.273062500035296e-07,
"loss": 0.7657,
"step": 15200
},
{
"epoch": 0.8747855917667239,
"grad_norm": 11.294500350952148,
"learning_rate": 9.263707555841989e-07,
"loss": 0.7585,
"step": 15300
},
{
"epoch": 0.8805031446540881,
"grad_norm": 13.51392936706543,
"learning_rate": 9.254297583243695e-07,
"loss": 0.7648,
"step": 15400
},
{
"epoch": 0.8862206975414523,
"grad_norm": 12.258986473083496,
"learning_rate": 9.244832703687718e-07,
"loss": 0.7744,
"step": 15500
},
{
"epoch": 0.8919382504288165,
"grad_norm": 13.734026908874512,
"learning_rate": 9.235313039330001e-07,
"loss": 0.7544,
"step": 15600
},
{
"epoch": 0.8976558033161807,
"grad_norm": 13.782454490661621,
"learning_rate": 9.225738713033555e-07,
"loss": 0.7536,
"step": 15700
},
{
"epoch": 0.9033733562035449,
"grad_norm": 11.396181106567383,
"learning_rate": 9.216109848366872e-07,
"loss": 0.7411,
"step": 15800
},
{
"epoch": 0.9090909090909091,
"grad_norm": 11.917641639709473,
"learning_rate": 9.206426569602326e-07,
"loss": 0.7821,
"step": 15900
},
{
"epoch": 0.9148084619782733,
"grad_norm": 12.97021198272705,
"learning_rate": 9.196689001714572e-07,
"loss": 0.7743,
"step": 16000
},
{
"epoch": 0.9205260148656375,
"grad_norm": 16.050718307495117,
"learning_rate": 9.186897270378935e-07,
"loss": 0.774,
"step": 16100
},
{
"epoch": 0.9262435677530018,
"grad_norm": 12.98830795288086,
"learning_rate": 9.177051501969786e-07,
"loss": 0.7544,
"step": 16200
},
{
"epoch": 0.931961120640366,
"grad_norm": 11.758196830749512,
"learning_rate": 9.167151823558908e-07,
"loss": 0.733,
"step": 16300
},
{
"epoch": 0.9376786735277302,
"grad_norm": 14.87873649597168,
"learning_rate": 9.157198362913865e-07,
"loss": 0.7548,
"step": 16400
},
{
"epoch": 0.9433962264150944,
"grad_norm": 11.650275230407715,
"learning_rate": 9.14719124849634e-07,
"loss": 0.7642,
"step": 16500
},
{
"epoch": 0.9491137793024585,
"grad_norm": 12.603800773620605,
"learning_rate": 9.137130609460491e-07,
"loss": 0.7729,
"step": 16600
},
{
"epoch": 0.9548313321898227,
"grad_norm": 13.641478538513184,
"learning_rate": 9.12701657565127e-07,
"loss": 0.7663,
"step": 16700
},
{
"epoch": 0.9605488850771869,
"grad_norm": 12.762767791748047,
"learning_rate": 9.116849277602762e-07,
"loss": 0.7667,
"step": 16800
},
{
"epoch": 0.9662664379645511,
"grad_norm": 15.196799278259277,
"learning_rate": 9.106628846536486e-07,
"loss": 0.7676,
"step": 16900
},
{
"epoch": 0.9719839908519153,
"grad_norm": 13.028931617736816,
"learning_rate": 9.096355414359714e-07,
"loss": 0.7681,
"step": 17000
},
{
"epoch": 0.9777015437392796,
"grad_norm": 15.02116584777832,
"learning_rate": 9.086029113663756e-07,
"loss": 0.7487,
"step": 17100
},
{
"epoch": 0.9834190966266438,
"grad_norm": 12.758607864379883,
"learning_rate": 9.075650077722263e-07,
"loss": 0.7502,
"step": 17200
},
{
"epoch": 0.989136649514008,
"grad_norm": 12.625545501708984,
"learning_rate": 9.065218440489493e-07,
"loss": 0.7363,
"step": 17300
},
{
"epoch": 0.9948542024013722,
"grad_norm": 13.428793907165527,
"learning_rate": 9.054734336598592e-07,
"loss": 0.7604,
"step": 17400
},
{
"epoch": 1.0005717552887363,
"grad_norm": 14.363871574401855,
"learning_rate": 9.044197901359854e-07,
"loss": 0.7669,
"step": 17500
},
{
"epoch": 1.0062893081761006,
"grad_norm": 12.019951820373535,
"learning_rate": 9.033609270758968e-07,
"loss": 0.7553,
"step": 17600
},
{
"epoch": 1.012006861063465,
"grad_norm": 13.016924858093262,
"learning_rate": 9.022968581455275e-07,
"loss": 0.7455,
"step": 17700
},
{
"epoch": 1.017724413950829,
"grad_norm": 12.672630310058594,
"learning_rate": 9.012275970779994e-07,
"loss": 0.77,
"step": 17800
},
{
"epoch": 1.0234419668381933,
"grad_norm": 13.483732223510742,
"learning_rate": 9.001531576734455e-07,
"loss": 0.7572,
"step": 17900
},
{
"epoch": 1.0291595197255574,
"grad_norm": 11.392430305480957,
"learning_rate": 8.990735537988315e-07,
"loss": 0.7518,
"step": 18000
},
{
"epoch": 1.0348770726129217,
"grad_norm": 11.757925033569336,
"learning_rate": 8.979887993877768e-07,
"loss": 0.7558,
"step": 18100
},
{
"epoch": 1.0405946255002858,
"grad_norm": 12.2052583694458,
"learning_rate": 8.968989084403755e-07,
"loss": 0.7539,
"step": 18200
},
{
"epoch": 1.0463121783876501,
"grad_norm": 14.284133911132812,
"learning_rate": 8.958038950230144e-07,
"loss": 0.7254,
"step": 18300
},
{
"epoch": 1.0520297312750142,
"grad_norm": 13.372608184814453,
"learning_rate": 8.947037732681921e-07,
"loss": 0.7109,
"step": 18400
},
{
"epoch": 1.0577472841623785,
"grad_norm": 14.469101905822754,
"learning_rate": 8.93598557374337e-07,
"loss": 0.7479,
"step": 18500
},
{
"epoch": 1.0634648370497426,
"grad_norm": 11.611613273620605,
"learning_rate": 8.924882616056231e-07,
"loss": 0.7355,
"step": 18600
},
{
"epoch": 1.069182389937107,
"grad_norm": 11.983068466186523,
"learning_rate": 8.913729002917872e-07,
"loss": 0.751,
"step": 18700
},
{
"epoch": 1.0748999428244712,
"grad_norm": 12.077164649963379,
"learning_rate": 8.902524878279424e-07,
"loss": 0.7583,
"step": 18800
},
{
"epoch": 1.0806174957118353,
"grad_norm": 11.417193412780762,
"learning_rate": 8.891270386743937e-07,
"loss": 0.7598,
"step": 18900
},
{
"epoch": 1.0863350485991996,
"grad_norm": 14.211779594421387,
"learning_rate": 8.879965673564504e-07,
"loss": 0.7403,
"step": 19000
},
{
"epoch": 1.0920526014865637,
"grad_norm": 12.398454666137695,
"learning_rate": 8.868610884642394e-07,
"loss": 0.7388,
"step": 19100
},
{
"epoch": 1.097770154373928,
"grad_norm": 10.495121002197266,
"learning_rate": 8.857206166525163e-07,
"loss": 0.7328,
"step": 19200
},
{
"epoch": 1.103487707261292,
"grad_norm": 14.72545051574707,
"learning_rate": 8.845751666404764e-07,
"loss": 0.7537,
"step": 19300
},
{
"epoch": 1.1092052601486564,
"grad_norm": 11.759736061096191,
"learning_rate": 8.834247532115651e-07,
"loss": 0.7537,
"step": 19400
},
{
"epoch": 1.1149228130360207,
"grad_norm": 13.681090354919434,
"learning_rate": 8.822693912132865e-07,
"loss": 0.7541,
"step": 19500
},
{
"epoch": 1.1206403659233848,
"grad_norm": 15.426194190979004,
"learning_rate": 8.811090955570126e-07,
"loss": 0.7636,
"step": 19600
},
{
"epoch": 1.126357918810749,
"grad_norm": 13.35466194152832,
"learning_rate": 8.799438812177895e-07,
"loss": 0.7409,
"step": 19700
},
{
"epoch": 1.1320754716981132,
"grad_norm": 16.639293670654297,
"learning_rate": 8.787737632341456e-07,
"loss": 0.7467,
"step": 19800
},
{
"epoch": 1.1377930245854775,
"grad_norm": 13.953729629516602,
"learning_rate": 8.775987567078969e-07,
"loss": 0.7399,
"step": 19900
},
{
"epoch": 1.1435105774728416,
"grad_norm": 11.683821678161621,
"learning_rate": 8.764188768039517e-07,
"loss": 0.7449,
"step": 20000
},
{
"epoch": 1.1492281303602059,
"grad_norm": 15.29350757598877,
"learning_rate": 8.752341387501158e-07,
"loss": 0.7468,
"step": 20100
},
{
"epoch": 1.15494568324757,
"grad_norm": 14.116186141967773,
"learning_rate": 8.740445578368947e-07,
"loss": 0.7351,
"step": 20200
},
{
"epoch": 1.1606632361349343,
"grad_norm": 12.669661521911621,
"learning_rate": 8.728501494172975e-07,
"loss": 0.7454,
"step": 20300
},
{
"epoch": 1.1663807890222984,
"grad_norm": 14.350497245788574,
"learning_rate": 8.71650928906638e-07,
"loss": 0.7339,
"step": 20400
},
{
"epoch": 1.1720983419096627,
"grad_norm": 13.84665584564209,
"learning_rate": 8.704469117823363e-07,
"loss": 0.7398,
"step": 20500
},
{
"epoch": 1.177815894797027,
"grad_norm": 11.469983100891113,
"learning_rate": 8.692381135837181e-07,
"loss": 0.7573,
"step": 20600
},
{
"epoch": 1.183533447684391,
"grad_norm": 18.06913185119629,
"learning_rate": 8.680245499118154e-07,
"loss": 0.7581,
"step": 20700
},
{
"epoch": 1.1892510005717554,
"grad_norm": 14.282899856567383,
"learning_rate": 8.668062364291639e-07,
"loss": 0.73,
"step": 20800
},
{
"epoch": 1.1949685534591195,
"grad_norm": 13.120650291442871,
"learning_rate": 8.655831888596023e-07,
"loss": 0.7204,
"step": 20900
},
{
"epoch": 1.2006861063464838,
"grad_norm": 11.917327880859375,
"learning_rate": 8.643554229880676e-07,
"loss": 0.7479,
"step": 21000
},
{
"epoch": 1.2064036592338478,
"grad_norm": 12.158246994018555,
"learning_rate": 8.631229546603928e-07,
"loss": 0.7578,
"step": 21100
},
{
"epoch": 1.2121212121212122,
"grad_norm": 11.658825874328613,
"learning_rate": 8.618857997831021e-07,
"loss": 0.7443,
"step": 21200
},
{
"epoch": 1.2178387650085762,
"grad_norm": 14.301033973693848,
"learning_rate": 8.606439743232047e-07,
"loss": 0.7212,
"step": 21300
},
{
"epoch": 1.2235563178959405,
"grad_norm": 17.928747177124023,
"learning_rate": 8.593974943079903e-07,
"loss": 0.7565,
"step": 21400
},
{
"epoch": 1.2292738707833046,
"grad_norm": 15.283846855163574,
"learning_rate": 8.581463758248206e-07,
"loss": 0.7541,
"step": 21500
},
{
"epoch": 1.234991423670669,
"grad_norm": 14.58877182006836,
"learning_rate": 8.56890635020923e-07,
"loss": 0.7422,
"step": 21600
},
{
"epoch": 1.2407089765580332,
"grad_norm": 12.548053741455078,
"learning_rate": 8.556302881031813e-07,
"loss": 0.7326,
"step": 21700
},
{
"epoch": 1.2464265294453973,
"grad_norm": 17.030057907104492,
"learning_rate": 8.543653513379272e-07,
"loss": 0.7404,
"step": 21800
},
{
"epoch": 1.2521440823327616,
"grad_norm": 12.616168975830078,
"learning_rate": 8.530958410507296e-07,
"loss": 0.7262,
"step": 21900
},
{
"epoch": 1.2578616352201257,
"grad_norm": 12.804983139038086,
"learning_rate": 8.518217736261848e-07,
"loss": 0.7273,
"step": 22000
},
{
"epoch": 1.26357918810749,
"grad_norm": 16.434965133666992,
"learning_rate": 8.505431655077045e-07,
"loss": 0.7326,
"step": 22100
},
{
"epoch": 1.2692967409948541,
"grad_norm": 15.001092910766602,
"learning_rate": 8.492600331973032e-07,
"loss": 0.7413,
"step": 22200
},
{
"epoch": 1.2750142938822184,
"grad_norm": 12.592192649841309,
"learning_rate": 8.479723932553864e-07,
"loss": 0.7229,
"step": 22300
},
{
"epoch": 1.2807318467695827,
"grad_norm": 12.405608177185059,
"learning_rate": 8.466802623005354e-07,
"loss": 0.7575,
"step": 22400
},
{
"epoch": 1.2864493996569468,
"grad_norm": 13.2118501663208,
"learning_rate": 8.453836570092941e-07,
"loss": 0.7368,
"step": 22500
},
{
"epoch": 1.292166952544311,
"grad_norm": 14.132501602172852,
"learning_rate": 8.44082594115953e-07,
"loss": 0.7418,
"step": 22600
},
{
"epoch": 1.2978845054316752,
"grad_norm": 12.394464492797852,
"learning_rate": 8.427770904123336e-07,
"loss": 0.7769,
"step": 22700
},
{
"epoch": 1.3036020583190395,
"grad_norm": 14.223743438720703,
"learning_rate": 8.414671627475716e-07,
"loss": 0.7402,
"step": 22800
},
{
"epoch": 1.3093196112064036,
"grad_norm": 11.712512969970703,
"learning_rate": 8.401528280278987e-07,
"loss": 0.7132,
"step": 22900
},
{
"epoch": 1.315037164093768,
"grad_norm": 14.974958419799805,
"learning_rate": 8.388341032164261e-07,
"loss": 0.7513,
"step": 23000
},
{
"epoch": 1.320754716981132,
"grad_norm": 12.907052993774414,
"learning_rate": 8.375110053329239e-07,
"loss": 0.7286,
"step": 23100
},
{
"epoch": 1.3264722698684963,
"grad_norm": 14.681177139282227,
"learning_rate": 8.36183551453602e-07,
"loss": 0.7272,
"step": 23200
},
{
"epoch": 1.3321898227558604,
"grad_norm": 14.21299934387207,
"learning_rate": 8.348517587108904e-07,
"loss": 0.7421,
"step": 23300
},
{
"epoch": 1.3379073756432247,
"grad_norm": 14.745231628417969,
"learning_rate": 8.335156442932167e-07,
"loss": 0.7193,
"step": 23400
},
{
"epoch": 1.343624928530589,
"grad_norm": 13.499730110168457,
"learning_rate": 8.321752254447858e-07,
"loss": 0.734,
"step": 23500
},
{
"epoch": 1.349342481417953,
"grad_norm": 14.754764556884766,
"learning_rate": 8.308305194653562e-07,
"loss": 0.7312,
"step": 23600
},
{
"epoch": 1.3550600343053174,
"grad_norm": 13.786625862121582,
"learning_rate": 8.294815437100171e-07,
"loss": 0.7445,
"step": 23700
},
{
"epoch": 1.3607775871926815,
"grad_norm": 13.24636459350586,
"learning_rate": 8.281283155889643e-07,
"loss": 0.753,
"step": 23800
},
{
"epoch": 1.3664951400800458,
"grad_norm": 18.36018943786621,
"learning_rate": 8.267708525672763e-07,
"loss": 0.7211,
"step": 23900
},
{
"epoch": 1.3722126929674099,
"grad_norm": 13.684020042419434,
"learning_rate": 8.254091721646872e-07,
"loss": 0.7305,
"step": 24000
},
{
"epoch": 1.3779302458547742,
"grad_norm": 14.666474342346191,
"learning_rate": 8.240432919553624e-07,
"loss": 0.7094,
"step": 24100
},
{
"epoch": 1.3836477987421385,
"grad_norm": 14.115777015686035,
"learning_rate": 8.226732295676708e-07,
"loss": 0.7347,
"step": 24200
},
{
"epoch": 1.3893653516295026,
"grad_norm": 13.236510276794434,
"learning_rate": 8.212990026839571e-07,
"loss": 0.7294,
"step": 24300
},
{
"epoch": 1.3950829045168667,
"grad_norm": 14.913257598876953,
"learning_rate": 8.199206290403146e-07,
"loss": 0.7548,
"step": 24400
},
{
"epoch": 1.400800457404231,
"grad_norm": 13.897913932800293,
"learning_rate": 8.185381264263549e-07,
"loss": 0.7123,
"step": 24500
},
{
"epoch": 1.4065180102915953,
"grad_norm": 11.049424171447754,
"learning_rate": 8.171515126849797e-07,
"loss": 0.7351,
"step": 24600
},
{
"epoch": 1.4122355631789594,
"grad_norm": 14.041512489318848,
"learning_rate": 8.157608057121499e-07,
"loss": 0.7156,
"step": 24700
},
{
"epoch": 1.4179531160663237,
"grad_norm": 14.78030014038086,
"learning_rate": 8.143660234566537e-07,
"loss": 0.7234,
"step": 24800
},
{
"epoch": 1.4236706689536878,
"grad_norm": 14.697408676147461,
"learning_rate": 8.129671839198769e-07,
"loss": 0.7369,
"step": 24900
},
{
"epoch": 1.429388221841052,
"grad_norm": 19.855506896972656,
"learning_rate": 8.115643051555687e-07,
"loss": 0.7242,
"step": 25000
},
{
"epoch": 1.4351057747284162,
"grad_norm": 12.898676872253418,
"learning_rate": 8.101574052696105e-07,
"loss": 0.7337,
"step": 25100
},
{
"epoch": 1.4408233276157805,
"grad_norm": 17.080387115478516,
"learning_rate": 8.087465024197801e-07,
"loss": 0.7563,
"step": 25200
},
{
"epoch": 1.4465408805031448,
"grad_norm": 14.034418106079102,
"learning_rate": 8.073316148155194e-07,
"loss": 0.7053,
"step": 25300
},
{
"epoch": 1.4522584333905089,
"grad_norm": 12.006173133850098,
"learning_rate": 8.059127607176979e-07,
"loss": 0.7416,
"step": 25400
},
{
"epoch": 1.457975986277873,
"grad_norm": 13.194514274597168,
"learning_rate": 8.044899584383776e-07,
"loss": 0.7358,
"step": 25500
},
{
"epoch": 1.4636935391652373,
"grad_norm": 13.047014236450195,
"learning_rate": 8.030632263405772e-07,
"loss": 0.7316,
"step": 25600
},
{
"epoch": 1.4694110920526016,
"grad_norm": 16.628822326660156,
"learning_rate": 8.016325828380342e-07,
"loss": 0.7257,
"step": 25700
},
{
"epoch": 1.4751286449399656,
"grad_norm": 13.5772705078125,
"learning_rate": 8.001980463949672e-07,
"loss": 0.7531,
"step": 25800
},
{
"epoch": 1.48084619782733,
"grad_norm": 11.860750198364258,
"learning_rate": 7.987596355258388e-07,
"loss": 0.7302,
"step": 25900
},
{
"epoch": 1.486563750714694,
"grad_norm": 17.01180648803711,
"learning_rate": 7.973173687951151e-07,
"loss": 0.7251,
"step": 26000
},
{
"epoch": 1.4922813036020584,
"grad_norm": 14.781421661376953,
"learning_rate": 7.958712648170276e-07,
"loss": 0.7333,
"step": 26100
},
{
"epoch": 1.4979988564894224,
"grad_norm": 13.350909233093262,
"learning_rate": 7.944213422553315e-07,
"loss": 0.7062,
"step": 26200
},
{
"epoch": 1.5037164093767867,
"grad_norm": 15.630850791931152,
"learning_rate": 7.92967619823066e-07,
"loss": 0.7294,
"step": 26300
},
{
"epoch": 1.509433962264151,
"grad_norm": 16.793577194213867,
"learning_rate": 7.915101162823119e-07,
"loss": 0.7447,
"step": 26400
},
{
"epoch": 1.5151515151515151,
"grad_norm": 16.42220115661621,
"learning_rate": 7.900488504439504e-07,
"loss": 0.7164,
"step": 26500
},
{
"epoch": 1.5208690680388792,
"grad_norm": 15.00322437286377,
"learning_rate": 7.885838411674192e-07,
"loss": 0.7303,
"step": 26600
},
{
"epoch": 1.5265866209262435,
"grad_norm": 11.874865531921387,
"learning_rate": 7.871151073604704e-07,
"loss": 0.739,
"step": 26700
},
{
"epoch": 1.5323041738136078,
"grad_norm": 14.509276390075684,
"learning_rate": 7.856426679789252e-07,
"loss": 0.7344,
"step": 26800
},
{
"epoch": 1.538021726700972,
"grad_norm": 14.377605438232422,
"learning_rate": 7.841665420264299e-07,
"loss": 0.7339,
"step": 26900
},
{
"epoch": 1.5437392795883362,
"grad_norm": 13.096054077148438,
"learning_rate": 7.826867485542106e-07,
"loss": 0.7146,
"step": 27000
},
{
"epoch": 1.5494568324757005,
"grad_norm": 13.225566864013672,
"learning_rate": 7.812033066608272e-07,
"loss": 0.7201,
"step": 27100
},
{
"epoch": 1.5551743853630646,
"grad_norm": 13.55895709991455,
"learning_rate": 7.797162354919272e-07,
"loss": 0.7703,
"step": 27200
},
{
"epoch": 1.5608919382504287,
"grad_norm": 13.646178245544434,
"learning_rate": 7.782255542399983e-07,
"loss": 0.7374,
"step": 27300
},
{
"epoch": 1.566609491137793,
"grad_norm": 14.485557556152344,
"learning_rate": 7.767312821441205e-07,
"loss": 0.7473,
"step": 27400
},
{
"epoch": 1.5723270440251573,
"grad_norm": 13.934147834777832,
"learning_rate": 7.752334384897185e-07,
"loss": 0.7222,
"step": 27500
},
{
"epoch": 1.5780445969125214,
"grad_norm": 17.379066467285156,
"learning_rate": 7.737320426083118e-07,
"loss": 0.7526,
"step": 27600
},
{
"epoch": 1.5837621497998855,
"grad_norm": 14.709135055541992,
"learning_rate": 7.722271138772665e-07,
"loss": 0.7433,
"step": 27700
},
{
"epoch": 1.5894797026872498,
"grad_norm": 14.56612491607666,
"learning_rate": 7.70718671719544e-07,
"loss": 0.731,
"step": 27800
},
{
"epoch": 1.5951972555746141,
"grad_norm": 12.300278663635254,
"learning_rate": 7.692067356034506e-07,
"loss": 0.724,
"step": 27900
},
{
"epoch": 1.6009148084619782,
"grad_norm": 14.767439842224121,
"learning_rate": 7.676913250423873e-07,
"loss": 0.7301,
"step": 28000
},
{
"epoch": 1.6066323613493425,
"grad_norm": 13.967864990234375,
"learning_rate": 7.66172459594596e-07,
"loss": 0.7166,
"step": 28100
},
{
"epoch": 1.6123499142367068,
"grad_norm": 13.520434379577637,
"learning_rate": 7.64650158862909e-07,
"loss": 0.7041,
"step": 28200
},
{
"epoch": 1.618067467124071,
"grad_norm": 15.095074653625488,
"learning_rate": 7.631244424944948e-07,
"loss": 0.7034,
"step": 28300
},
{
"epoch": 1.623785020011435,
"grad_norm": 15.181931495666504,
"learning_rate": 7.615953301806048e-07,
"loss": 0.719,
"step": 28400
},
{
"epoch": 1.6295025728987993,
"grad_norm": 14.636883735656738,
"learning_rate": 7.600628416563199e-07,
"loss": 0.7329,
"step": 28500
},
{
"epoch": 1.6352201257861636,
"grad_norm": 13.246421813964844,
"learning_rate": 7.585269967002946e-07,
"loss": 0.7387,
"step": 28600
},
{
"epoch": 1.6409376786735277,
"grad_norm": 13.783883094787598,
"learning_rate": 7.56987815134502e-07,
"loss": 0.7445,
"step": 28700
},
{
"epoch": 1.6466552315608918,
"grad_norm": 13.172192573547363,
"learning_rate": 7.554453168239793e-07,
"loss": 0.7222,
"step": 28800
},
{
"epoch": 1.6523727844482563,
"grad_norm": 12.598628997802734,
"learning_rate": 7.538995216765693e-07,
"loss": 0.7329,
"step": 28900
},
{
"epoch": 1.6580903373356204,
"grad_norm": 13.531826972961426,
"learning_rate": 7.523504496426651e-07,
"loss": 0.7389,
"step": 29000
},
{
"epoch": 1.6638078902229845,
"grad_norm": 12.720640182495117,
"learning_rate": 7.507981207149523e-07,
"loss": 0.7135,
"step": 29100
},
{
"epoch": 1.6695254431103488,
"grad_norm": 16.165489196777344,
"learning_rate": 7.492425549281499e-07,
"loss": 0.7352,
"step": 29200
},
{
"epoch": 1.675242995997713,
"grad_norm": 15.355998992919922,
"learning_rate": 7.476837723587532e-07,
"loss": 0.7273,
"step": 29300
},
{
"epoch": 1.6809605488850772,
"grad_norm": 15.340593338012695,
"learning_rate": 7.461217931247741e-07,
"loss": 0.7118,
"step": 29400
},
{
"epoch": 1.6866781017724413,
"grad_norm": 11.760583877563477,
"learning_rate": 7.445566373854812e-07,
"loss": 0.7161,
"step": 29500
},
{
"epoch": 1.6923956546598056,
"grad_norm": 14.857872009277344,
"learning_rate": 7.429883253411395e-07,
"loss": 0.7319,
"step": 29600
},
{
"epoch": 1.6981132075471699,
"grad_norm": 13.463951110839844,
"learning_rate": 7.414168772327507e-07,
"loss": 0.7473,
"step": 29700
},
{
"epoch": 1.703830760434534,
"grad_norm": 14.559701919555664,
"learning_rate": 7.398423133417906e-07,
"loss": 0.7039,
"step": 29800
},
{
"epoch": 1.7095483133218983,
"grad_norm": 17.633832931518555,
"learning_rate": 7.382646539899487e-07,
"loss": 0.7348,
"step": 29900
},
{
"epoch": 1.7152658662092626,
"grad_norm": 21.451581954956055,
"learning_rate": 7.366839195388643e-07,
"loss": 0.7153,
"step": 30000
},
{
"epoch": 1.7209834190966267,
"grad_norm": 15.963432312011719,
"learning_rate": 7.351001303898658e-07,
"loss": 0.7109,
"step": 30100
},
{
"epoch": 1.7267009719839908,
"grad_norm": 16.46929168701172,
"learning_rate": 7.335133069837053e-07,
"loss": 0.723,
"step": 30200
},
{
"epoch": 1.732418524871355,
"grad_norm": 14.31004524230957,
"learning_rate": 7.319234698002963e-07,
"loss": 0.7476,
"step": 30300
},
{
"epoch": 1.7381360777587194,
"grad_norm": 15.023375511169434,
"learning_rate": 7.303306393584486e-07,
"loss": 0.7405,
"step": 30400
},
{
"epoch": 1.7438536306460835,
"grad_norm": 13.826839447021484,
"learning_rate": 7.287348362156034e-07,
"loss": 0.7251,
"step": 30500
},
{
"epoch": 1.7495711835334475,
"grad_norm": 14.537930488586426,
"learning_rate": 7.271360809675688e-07,
"loss": 0.731,
"step": 30600
},
{
"epoch": 1.7552887364208118,
"grad_norm": 15.455423355102539,
"learning_rate": 7.255343942482534e-07,
"loss": 0.7238,
"step": 30700
},
{
"epoch": 1.7610062893081762,
"grad_norm": 13.903472900390625,
"learning_rate": 7.239297967293995e-07,
"loss": 0.7282,
"step": 30800
},
{
"epoch": 1.7667238421955402,
"grad_norm": 11.774541854858398,
"learning_rate": 7.223223091203174e-07,
"loss": 0.7391,
"step": 30900
},
{
"epoch": 1.7724413950829045,
"grad_norm": 16.479053497314453,
"learning_rate": 7.207119521676173e-07,
"loss": 0.7151,
"step": 31000
},
{
"epoch": 1.7781589479702689,
"grad_norm": 12.094974517822266,
"learning_rate": 7.190987466549423e-07,
"loss": 0.7351,
"step": 31100
},
{
"epoch": 1.783876500857633,
"grad_norm": 16.851329803466797,
"learning_rate": 7.17482713402699e-07,
"loss": 0.7326,
"step": 31200
},
{
"epoch": 1.789594053744997,
"grad_norm": 15.386981964111328,
"learning_rate": 7.158638732677898e-07,
"loss": 0.7278,
"step": 31300
},
{
"epoch": 1.7953116066323613,
"grad_norm": 21.271484375,
"learning_rate": 7.142422471433435e-07,
"loss": 0.7202,
"step": 31400
},
{
"epoch": 1.8010291595197256,
"grad_norm": 14.556968688964844,
"learning_rate": 7.126178559584453e-07,
"loss": 0.7188,
"step": 31500
},
{
"epoch": 1.8067467124070897,
"grad_norm": 14.779976844787598,
"learning_rate": 7.109907206778672e-07,
"loss": 0.6977,
"step": 31600
},
{
"epoch": 1.8124642652944538,
"grad_norm": 14.60632610321045,
"learning_rate": 7.093608623017965e-07,
"loss": 0.7082,
"step": 31700
},
{
"epoch": 1.8181818181818183,
"grad_norm": 11.756601333618164,
"learning_rate": 7.077283018655662e-07,
"loss": 0.7348,
"step": 31800
},
{
"epoch": 1.8238993710691824,
"grad_norm": 14.042952537536621,
"learning_rate": 7.060930604393825e-07,
"loss": 0.7188,
"step": 31900
},
{
"epoch": 1.8296169239565465,
"grad_norm": 12.664731979370117,
"learning_rate": 7.044551591280525e-07,
"loss": 0.7124,
"step": 32000
},
{
"epoch": 1.8353344768439108,
"grad_norm": 12.528229713439941,
"learning_rate": 7.028146190707131e-07,
"loss": 0.715,
"step": 32100
},
{
"epoch": 1.8410520297312751,
"grad_norm": 17.832958221435547,
"learning_rate": 7.011714614405576e-07,
"loss": 0.7424,
"step": 32200
},
{
"epoch": 1.8467695826186392,
"grad_norm": 14.167093276977539,
"learning_rate": 6.995257074445614e-07,
"loss": 0.6857,
"step": 32300
},
{
"epoch": 1.8524871355060033,
"grad_norm": 14.986869812011719,
"learning_rate": 6.978773783232099e-07,
"loss": 0.7286,
"step": 32400
},
{
"epoch": 1.8582046883933676,
"grad_norm": 13.462069511413574,
"learning_rate": 6.962264953502237e-07,
"loss": 0.7278,
"step": 32500
},
{
"epoch": 1.863922241280732,
"grad_norm": 12.811728477478027,
"learning_rate": 6.945730798322835e-07,
"loss": 0.725,
"step": 32600
},
{
"epoch": 1.869639794168096,
"grad_norm": 15.677468299865723,
"learning_rate": 6.929171531087561e-07,
"loss": 0.7358,
"step": 32700
},
{
"epoch": 1.8753573470554603,
"grad_norm": 15.89470386505127,
"learning_rate": 6.912587365514182e-07,
"loss": 0.7404,
"step": 32800
},
{
"epoch": 1.8810748999428246,
"grad_norm": 13.49212646484375,
"learning_rate": 6.895978515641814e-07,
"loss": 0.6961,
"step": 32900
},
{
"epoch": 1.8867924528301887,
"grad_norm": 15.530927658081055,
"learning_rate": 6.879345195828145e-07,
"loss": 0.72,
"step": 33000
},
{
"epoch": 1.8925100057175528,
"grad_norm": 15.449943542480469,
"learning_rate": 6.862687620746688e-07,
"loss": 0.7107,
"step": 33100
},
{
"epoch": 1.898227558604917,
"grad_norm": 12.777170181274414,
"learning_rate": 6.846006005383992e-07,
"loss": 0.7049,
"step": 33200
},
{
"epoch": 1.9039451114922814,
"grad_norm": 15.19930362701416,
"learning_rate": 6.829300565036882e-07,
"loss": 0.6905,
"step": 33300
},
{
"epoch": 1.9096626643796455,
"grad_norm": 14.737263679504395,
"learning_rate": 6.812571515309667e-07,
"loss": 0.7479,
"step": 33400
},
{
"epoch": 1.9153802172670096,
"grad_norm": 17.35552215576172,
"learning_rate": 6.795819072111369e-07,
"loss": 0.7247,
"step": 33500
},
{
"epoch": 1.9210977701543739,
"grad_norm": 15.307523727416992,
"learning_rate": 6.779043451652932e-07,
"loss": 0.7392,
"step": 33600
},
{
"epoch": 1.9268153230417382,
"grad_norm": 12.777303695678711,
"learning_rate": 6.762244870444427e-07,
"loss": 0.7179,
"step": 33700
},
{
"epoch": 1.9325328759291023,
"grad_norm": 13.537027359008789,
"learning_rate": 6.745423545292267e-07,
"loss": 0.7123,
"step": 33800
},
{
"epoch": 1.9382504288164666,
"grad_norm": 14.208198547363281,
"learning_rate": 6.7285796932964e-07,
"loss": 0.7474,
"step": 33900
},
{
"epoch": 1.943967981703831,
"grad_norm": 15.080704689025879,
"learning_rate": 6.711713531847512e-07,
"loss": 0.7285,
"step": 34000
},
{
"epoch": 1.949685534591195,
"grad_norm": 15.996607780456543,
"learning_rate": 6.694825278624219e-07,
"loss": 0.7365,
"step": 34100
},
{
"epoch": 1.955403087478559,
"grad_norm": 19.516117095947266,
"learning_rate": 6.677915151590259e-07,
"loss": 0.7181,
"step": 34200
},
{
"epoch": 1.9611206403659234,
"grad_norm": 13.266532897949219,
"learning_rate": 6.660983368991679e-07,
"loss": 0.7201,
"step": 34300
},
{
"epoch": 1.9668381932532877,
"grad_norm": 15.131275177001953,
"learning_rate": 6.644030149354017e-07,
"loss": 0.7209,
"step": 34400
},
{
"epoch": 1.9725557461406518,
"grad_norm": 13.388816833496094,
"learning_rate": 6.627055711479486e-07,
"loss": 0.7382,
"step": 34500
},
{
"epoch": 1.9782732990280159,
"grad_norm": 13.799139976501465,
"learning_rate": 6.61006027444414e-07,
"loss": 0.707,
"step": 34600
},
{
"epoch": 1.9839908519153804,
"grad_norm": 13.762521743774414,
"learning_rate": 6.593044057595059e-07,
"loss": 0.7365,
"step": 34700
},
{
"epoch": 1.9897084048027445,
"grad_norm": 15.104240417480469,
"learning_rate": 6.576007280547509e-07,
"loss": 0.7561,
"step": 34800
},
{
"epoch": 1.9954259576901086,
"grad_norm": 19.18963623046875,
"learning_rate": 6.558950163182111e-07,
"loss": 0.7286,
"step": 34900
},
{
"epoch": 2.0011435105774726,
"grad_norm": 13.76323413848877,
"learning_rate": 6.541872925642004e-07,
"loss": 0.7234,
"step": 35000
},
{
"epoch": 2.006861063464837,
"grad_norm": 16.3411808013916,
"learning_rate": 6.52477578833e-07,
"loss": 0.7251,
"step": 35100
},
{
"epoch": 2.0125786163522013,
"grad_norm": 13.213664054870605,
"learning_rate": 6.507658971905746e-07,
"loss": 0.7245,
"step": 35200
},
{
"epoch": 2.0182961692395653,
"grad_norm": 14.91565227508545,
"learning_rate": 6.490522697282872e-07,
"loss": 0.6989,
"step": 35300
},
{
"epoch": 2.02401372212693,
"grad_norm": 16.91631507873535,
"learning_rate": 6.473367185626134e-07,
"loss": 0.7126,
"step": 35400
},
{
"epoch": 2.029731275014294,
"grad_norm": 16.931249618530273,
"learning_rate": 6.456192658348573e-07,
"loss": 0.6899,
"step": 35500
},
{
"epoch": 2.035448827901658,
"grad_norm": 15.521553993225098,
"learning_rate": 6.438999337108647e-07,
"loss": 0.7277,
"step": 35600
},
{
"epoch": 2.041166380789022,
"grad_norm": 17.19545555114746,
"learning_rate": 6.421787443807371e-07,
"loss": 0.7121,
"step": 35700
},
{
"epoch": 2.0468839336763867,
"grad_norm": 18.010568618774414,
"learning_rate": 6.404557200585463e-07,
"loss": 0.6942,
"step": 35800
},
{
"epoch": 2.0526014865637507,
"grad_norm": 15.198273658752441,
"learning_rate": 6.387308829820459e-07,
"loss": 0.7072,
"step": 35900
},
{
"epoch": 2.058319039451115,
"grad_norm": 14.581058502197266,
"learning_rate": 6.370042554123859e-07,
"loss": 0.681,
"step": 36000
},
{
"epoch": 2.0640365923384794,
"grad_norm": 13.858463287353516,
"learning_rate": 6.352758596338249e-07,
"loss": 0.7045,
"step": 36100
},
{
"epoch": 2.0697541452258434,
"grad_norm": 15.719825744628906,
"learning_rate": 6.335457179534422e-07,
"loss": 0.7093,
"step": 36200
},
{
"epoch": 2.0754716981132075,
"grad_norm": 16.084144592285156,
"learning_rate": 6.318138527008503e-07,
"loss": 0.703,
"step": 36300
},
{
"epoch": 2.0811892510005716,
"grad_norm": 15.696854591369629,
"learning_rate": 6.300802862279063e-07,
"loss": 0.7312,
"step": 36400
},
{
"epoch": 2.086906803887936,
"grad_norm": 15.007280349731445,
"learning_rate": 6.283450409084237e-07,
"loss": 0.7069,
"step": 36500
},
{
"epoch": 2.0926243567753002,
"grad_norm": 18.342792510986328,
"learning_rate": 6.266081391378838e-07,
"loss": 0.6863,
"step": 36600
},
{
"epoch": 2.0983419096626643,
"grad_norm": 16.63268280029297,
"learning_rate": 6.248696033331463e-07,
"loss": 0.6961,
"step": 36700
},
{
"epoch": 2.1040594625500284,
"grad_norm": 15.034991264343262,
"learning_rate": 6.231294559321599e-07,
"loss": 0.7175,
"step": 36800
},
{
"epoch": 2.109777015437393,
"grad_norm": 17.893192291259766,
"learning_rate": 6.213877193936734e-07,
"loss": 0.7064,
"step": 36900
},
{
"epoch": 2.115494568324757,
"grad_norm": 22.963558197021484,
"learning_rate": 6.19644416196945e-07,
"loss": 0.7146,
"step": 37000
},
{
"epoch": 2.121212121212121,
"grad_norm": 16.66696548461914,
"learning_rate": 6.178995688414529e-07,
"loss": 0.7199,
"step": 37100
},
{
"epoch": 2.126929674099485,
"grad_norm": 17.46118927001953,
"learning_rate": 6.161531998466041e-07,
"loss": 0.6935,
"step": 37200
},
{
"epoch": 2.1326472269868497,
"grad_norm": 15.085572242736816,
"learning_rate": 6.144053317514446e-07,
"loss": 0.6958,
"step": 37300
},
{
"epoch": 2.138364779874214,
"grad_norm": 12.658440589904785,
"learning_rate": 6.126559871143681e-07,
"loss": 0.7337,
"step": 37400
},
{
"epoch": 2.144082332761578,
"grad_norm": 14.13125228881836,
"learning_rate": 6.109051885128248e-07,
"loss": 0.7458,
"step": 37500
},
{
"epoch": 2.1497998856489424,
"grad_norm": 13.70114517211914,
"learning_rate": 6.091529585430301e-07,
"loss": 0.6969,
"step": 37600
},
{
"epoch": 2.1555174385363065,
"grad_norm": 14.79975414276123,
"learning_rate": 6.07399319819673e-07,
"loss": 0.7079,
"step": 37700
},
{
"epoch": 2.1612349914236706,
"grad_norm": 16.53200912475586,
"learning_rate": 6.056442949756242e-07,
"loss": 0.693,
"step": 37800
},
{
"epoch": 2.1669525443110347,
"grad_norm": 16.908708572387695,
"learning_rate": 6.038879066616441e-07,
"loss": 0.7188,
"step": 37900
},
{
"epoch": 2.172670097198399,
"grad_norm": 14.79818058013916,
"learning_rate": 6.021301775460903e-07,
"loss": 0.6782,
"step": 38000
},
{
"epoch": 2.1783876500857633,
"grad_norm": 16.381088256835938,
"learning_rate": 6.003711303146249e-07,
"loss": 0.729,
"step": 38100
},
{
"epoch": 2.1841052029731274,
"grad_norm": 14.344415664672852,
"learning_rate": 5.986107876699221e-07,
"loss": 0.707,
"step": 38200
},
{
"epoch": 2.189822755860492,
"grad_norm": 14.784208297729492,
"learning_rate": 5.968491723313753e-07,
"loss": 0.6842,
"step": 38300
},
{
"epoch": 2.195540308747856,
"grad_norm": 14.890427589416504,
"learning_rate": 5.950863070348029e-07,
"loss": 0.6967,
"step": 38400
},
{
"epoch": 2.20125786163522,
"grad_norm": 17.429248809814453,
"learning_rate": 5.933222145321561e-07,
"loss": 0.701,
"step": 38500
},
{
"epoch": 2.206975414522584,
"grad_norm": 16.934354782104492,
"learning_rate": 5.915569175912244e-07,
"loss": 0.7107,
"step": 38600
},
{
"epoch": 2.2126929674099487,
"grad_norm": 16.012706756591797,
"learning_rate": 5.897904389953423e-07,
"loss": 0.7052,
"step": 38700
},
{
"epoch": 2.218410520297313,
"grad_norm": 14.395899772644043,
"learning_rate": 5.880228015430948e-07,
"loss": 0.7291,
"step": 38800
},
{
"epoch": 2.224128073184677,
"grad_norm": 15.9004545211792,
"learning_rate": 5.862540280480237e-07,
"loss": 0.7147,
"step": 38900
},
{
"epoch": 2.2298456260720414,
"grad_norm": 16.330738067626953,
"learning_rate": 5.844841413383324e-07,
"loss": 0.7343,
"step": 39000
},
{
"epoch": 2.2355631789594055,
"grad_norm": 15.460098266601562,
"learning_rate": 5.82713164256592e-07,
"loss": 0.7122,
"step": 39100
},
{
"epoch": 2.2412807318467696,
"grad_norm": 16.06856918334961,
"learning_rate": 5.809411196594462e-07,
"loss": 0.7078,
"step": 39200
},
{
"epoch": 2.2469982847341337,
"grad_norm": 14.644042015075684,
"learning_rate": 5.791680304173158e-07,
"loss": 0.7013,
"step": 39300
},
{
"epoch": 2.252715837621498,
"grad_norm": 14.263148307800293,
"learning_rate": 5.773939194141051e-07,
"loss": 0.7021,
"step": 39400
},
{
"epoch": 2.2584333905088623,
"grad_norm": 14.338480949401855,
"learning_rate": 5.756188095469043e-07,
"loss": 0.7025,
"step": 39500
},
{
"epoch": 2.2641509433962264,
"grad_norm": 15.25626277923584,
"learning_rate": 5.738427237256959e-07,
"loss": 0.6988,
"step": 39600
},
{
"epoch": 2.2698684962835904,
"grad_norm": 13.902752876281738,
"learning_rate": 5.720656848730582e-07,
"loss": 0.7105,
"step": 39700
},
{
"epoch": 2.275586049170955,
"grad_norm": 16.74924659729004,
"learning_rate": 5.702877159238692e-07,
"loss": 0.7226,
"step": 39800
},
{
"epoch": 2.281303602058319,
"grad_norm": 17.56850814819336,
"learning_rate": 5.685088398250113e-07,
"loss": 0.7081,
"step": 39900
},
{
"epoch": 2.287021154945683,
"grad_norm": 13.788533210754395,
"learning_rate": 5.66729079535075e-07,
"loss": 0.7198,
"step": 40000
},
{
"epoch": 2.2927387078330472,
"grad_norm": 15.556499481201172,
"learning_rate": 5.649484580240616e-07,
"loss": 0.7314,
"step": 40100
},
{
"epoch": 2.2984562607204118,
"grad_norm": 15.630646705627441,
"learning_rate": 5.63166998273088e-07,
"loss": 0.6885,
"step": 40200
},
{
"epoch": 2.304173813607776,
"grad_norm": 17.20891571044922,
"learning_rate": 5.613847232740897e-07,
"loss": 0.6971,
"step": 40300
},
{
"epoch": 2.30989136649514,
"grad_norm": 21.685649871826172,
"learning_rate": 5.596016560295241e-07,
"loss": 0.6983,
"step": 40400
},
{
"epoch": 2.3156089193825045,
"grad_norm": 15.817377090454102,
"learning_rate": 5.578178195520728e-07,
"loss": 0.7057,
"step": 40500
},
{
"epoch": 2.3213264722698685,
"grad_norm": 15.319486618041992,
"learning_rate": 5.560332368643462e-07,
"loss": 0.6795,
"step": 40600
},
{
"epoch": 2.3270440251572326,
"grad_norm": 14.907487869262695,
"learning_rate": 5.542479309985849e-07,
"loss": 0.6917,
"step": 40700
},
{
"epoch": 2.3327615780445967,
"grad_norm": 16.08393096923828,
"learning_rate": 5.52461924996363e-07,
"loss": 0.677,
"step": 40800
},
{
"epoch": 2.3384791309319612,
"grad_norm": 15.164365768432617,
"learning_rate": 5.506752419082911e-07,
"loss": 0.6969,
"step": 40900
},
{
"epoch": 2.3441966838193253,
"grad_norm": 16.06779670715332,
"learning_rate": 5.48887904793718e-07,
"loss": 0.7075,
"step": 41000
},
{
"epoch": 2.3499142367066894,
"grad_norm": 16.33745002746582,
"learning_rate": 5.470999367204338e-07,
"loss": 0.7135,
"step": 41100
},
{
"epoch": 2.355631789594054,
"grad_norm": 17.381423950195312,
"learning_rate": 5.453113607643719e-07,
"loss": 0.7374,
"step": 41200
},
{
"epoch": 2.361349342481418,
"grad_norm": 14.613143920898438,
"learning_rate": 5.435222000093109e-07,
"loss": 0.7104,
"step": 41300
},
{
"epoch": 2.367066895368782,
"grad_norm": 14.394537925720215,
"learning_rate": 5.417324775465773e-07,
"loss": 0.7123,
"step": 41400
},
{
"epoch": 2.372784448256146,
"grad_norm": 14.066143035888672,
"learning_rate": 5.399422164747469e-07,
"loss": 0.7207,
"step": 41500
},
{
"epoch": 2.3785020011435107,
"grad_norm": 16.41046714782715,
"learning_rate": 5.381514398993471e-07,
"loss": 0.7217,
"step": 41600
},
{
"epoch": 2.384219554030875,
"grad_norm": 17.5543270111084,
"learning_rate": 5.363601709325584e-07,
"loss": 0.7061,
"step": 41700
},
{
"epoch": 2.389937106918239,
"grad_norm": 16.649717330932617,
"learning_rate": 5.345684326929159e-07,
"loss": 0.718,
"step": 41800
},
{
"epoch": 2.3956546598056034,
"grad_norm": 17.401193618774414,
"learning_rate": 5.327762483050121e-07,
"loss": 0.726,
"step": 41900
},
{
"epoch": 2.4013722126929675,
"grad_norm": 17.420589447021484,
"learning_rate": 5.309836408991965e-07,
"loss": 0.714,
"step": 42000
},
{
"epoch": 2.4070897655803316,
"grad_norm": 17.177000045776367,
"learning_rate": 5.291906336112793e-07,
"loss": 0.7106,
"step": 42100
},
{
"epoch": 2.4128073184676957,
"grad_norm": 14.774796485900879,
"learning_rate": 5.273972495822304e-07,
"loss": 0.7013,
"step": 42200
},
{
"epoch": 2.4185248713550602,
"grad_norm": 16.312162399291992,
"learning_rate": 5.256035119578833e-07,
"loss": 0.7074,
"step": 42300
},
{
"epoch": 2.4242424242424243,
"grad_norm": 17.404407501220703,
"learning_rate": 5.238094438886344e-07,
"loss": 0.6916,
"step": 42400
},
{
"epoch": 2.4299599771297884,
"grad_norm": 15.71784782409668,
"learning_rate": 5.220150685291445e-07,
"loss": 0.6817,
"step": 42500
},
{
"epoch": 2.4356775300171525,
"grad_norm": 15.496809005737305,
"learning_rate": 5.202204090380416e-07,
"loss": 0.7052,
"step": 42600
},
{
"epoch": 2.441395082904517,
"grad_norm": 13.677003860473633,
"learning_rate": 5.184254885776195e-07,
"loss": 0.7252,
"step": 42700
},
{
"epoch": 2.447112635791881,
"grad_norm": 15.640375137329102,
"learning_rate": 5.166303303135408e-07,
"loss": 0.6826,
"step": 42800
},
{
"epoch": 2.452830188679245,
"grad_norm": 18.025440216064453,
"learning_rate": 5.14834957414537e-07,
"loss": 0.6858,
"step": 42900
},
{
"epoch": 2.4585477415666093,
"grad_norm": 13.545587539672852,
"learning_rate": 5.1303939305211e-07,
"loss": 0.6925,
"step": 43000
},
{
"epoch": 2.464265294453974,
"grad_norm": 16.854537963867188,
"learning_rate": 5.112436604002324e-07,
"loss": 0.7173,
"step": 43100
},
{
"epoch": 2.469982847341338,
"grad_norm": 15.970281600952148,
"learning_rate": 5.094477826350491e-07,
"loss": 0.7095,
"step": 43200
},
{
"epoch": 2.475700400228702,
"grad_norm": 15.472623825073242,
"learning_rate": 5.076517829345777e-07,
"loss": 0.7186,
"step": 43300
},
{
"epoch": 2.4814179531160665,
"grad_norm": 19.989017486572266,
"learning_rate": 5.058556844784098e-07,
"loss": 0.6928,
"step": 43400
},
{
"epoch": 2.4871355060034306,
"grad_norm": 19.59756851196289,
"learning_rate": 5.04059510447411e-07,
"loss": 0.709,
"step": 43500
},
{
"epoch": 2.4928530588907947,
"grad_norm": 16.710494995117188,
"learning_rate": 5.022632840234227e-07,
"loss": 0.7031,
"step": 43600
},
{
"epoch": 2.4985706117781588,
"grad_norm": 16.303966522216797,
"learning_rate": 5.004670283889626e-07,
"loss": 0.6951,
"step": 43700
},
{
"epoch": 2.5042881646655233,
"grad_norm": 16.28445053100586,
"learning_rate": 4.986707667269252e-07,
"loss": 0.7017,
"step": 43800
},
{
"epoch": 2.5100057175528874,
"grad_norm": 16.038339614868164,
"learning_rate": 4.968745222202824e-07,
"loss": 0.7348,
"step": 43900
},
{
"epoch": 2.5157232704402515,
"grad_norm": 15.450057983398438,
"learning_rate": 4.950783180517855e-07,
"loss": 0.696,
"step": 44000
},
{
"epoch": 2.521440823327616,
"grad_norm": 15.47207260131836,
"learning_rate": 4.932821774036647e-07,
"loss": 0.7014,
"step": 44100
},
{
"epoch": 2.52715837621498,
"grad_norm": 18.941865921020508,
"learning_rate": 4.914861234573305e-07,
"loss": 0.6922,
"step": 44200
},
{
"epoch": 2.532875929102344,
"grad_norm": 17.57488441467285,
"learning_rate": 4.896901793930745e-07,
"loss": 0.7082,
"step": 44300
},
{
"epoch": 2.5385934819897082,
"grad_norm": 15.198995590209961,
"learning_rate": 4.878943683897696e-07,
"loss": 0.7016,
"step": 44400
},
{
"epoch": 2.5443110348770728,
"grad_norm": 18.978601455688477,
"learning_rate": 4.860987136245723e-07,
"loss": 0.708,
"step": 44500
},
{
"epoch": 2.550028587764437,
"grad_norm": 15.620323181152344,
"learning_rate": 4.843032382726217e-07,
"loss": 0.6936,
"step": 44600
},
{
"epoch": 2.555746140651801,
"grad_norm": 15.971785545349121,
"learning_rate": 4.82507965506742e-07,
"loss": 0.7204,
"step": 44700
},
{
"epoch": 2.5614636935391655,
"grad_norm": 17.761568069458008,
"learning_rate": 4.807129184971428e-07,
"loss": 0.6965,
"step": 44800
},
{
"epoch": 2.5671812464265296,
"grad_norm": 16.433975219726562,
"learning_rate": 4.789181204111195e-07,
"loss": 0.6959,
"step": 44900
},
{
"epoch": 2.5728987993138936,
"grad_norm": 14.489997863769531,
"learning_rate": 4.771235944127554e-07,
"loss": 0.6675,
"step": 45000
},
{
"epoch": 2.5786163522012577,
"grad_norm": 17.199810028076172,
"learning_rate": 4.753293636626217e-07,
"loss": 0.6907,
"step": 45100
},
{
"epoch": 2.584333905088622,
"grad_norm": 14.820212364196777,
"learning_rate": 4.735354513174794e-07,
"loss": 0.6955,
"step": 45200
},
{
"epoch": 2.5900514579759863,
"grad_norm": 15.466897964477539,
"learning_rate": 4.717418805299801e-07,
"loss": 0.727,
"step": 45300
},
{
"epoch": 2.5957690108633504,
"grad_norm": 13.702413558959961,
"learning_rate": 4.6994867444836684e-07,
"loss": 0.6923,
"step": 45400
},
{
"epoch": 2.601486563750715,
"grad_norm": 16.696453094482422,
"learning_rate": 4.681558562161759e-07,
"loss": 0.7204,
"step": 45500
},
{
"epoch": 2.607204116638079,
"grad_norm": 12.730810165405273,
"learning_rate": 4.6636344897193824e-07,
"loss": 0.7209,
"step": 45600
},
{
"epoch": 2.612921669525443,
"grad_norm": 18.060680389404297,
"learning_rate": 4.645714758488797e-07,
"loss": 0.6942,
"step": 45700
},
{
"epoch": 2.618639222412807,
"grad_norm": 13.605435371398926,
"learning_rate": 4.627799599746238e-07,
"loss": 0.6714,
"step": 45800
},
{
"epoch": 2.6243567753001713,
"grad_norm": 16.512327194213867,
"learning_rate": 4.6098892447089274e-07,
"loss": 0.7124,
"step": 45900
},
{
"epoch": 2.630074328187536,
"grad_norm": 17.260540008544922,
"learning_rate": 4.5919839245320876e-07,
"loss": 0.6999,
"step": 46000
},
{
"epoch": 2.6357918810749,
"grad_norm": 19.792367935180664,
"learning_rate": 4.574083870305961e-07,
"loss": 0.7026,
"step": 46100
},
{
"epoch": 2.641509433962264,
"grad_norm": 15.92784309387207,
"learning_rate": 4.556189313052824e-07,
"loss": 0.7325,
"step": 46200
},
{
"epoch": 2.6472269868496285,
"grad_norm": 13.952754020690918,
"learning_rate": 4.5383004837240137e-07,
"loss": 0.7057,
"step": 46300
},
{
"epoch": 2.6529445397369926,
"grad_norm": 14.797870635986328,
"learning_rate": 4.5204176131969316e-07,
"loss": 0.7064,
"step": 46400
},
{
"epoch": 2.6586620926243567,
"grad_norm": 14.710382461547852,
"learning_rate": 4.502540932272082e-07,
"loss": 0.6909,
"step": 46500
},
{
"epoch": 2.664379645511721,
"grad_norm": 14.503265380859375,
"learning_rate": 4.4846706716700816e-07,
"loss": 0.6937,
"step": 46600
},
{
"epoch": 2.6700971983990853,
"grad_norm": 17.75704002380371,
"learning_rate": 4.466807062028685e-07,
"loss": 0.7037,
"step": 46700
},
{
"epoch": 2.6758147512864494,
"grad_norm": 19.279897689819336,
"learning_rate": 4.4489503338998085e-07,
"loss": 0.7048,
"step": 46800
},
{
"epoch": 2.6815323041738135,
"grad_norm": 18.44795036315918,
"learning_rate": 4.431100717746554e-07,
"loss": 0.7152,
"step": 46900
},
{
"epoch": 2.687249857061178,
"grad_norm": 14.337074279785156,
"learning_rate": 4.4132584439402343e-07,
"loss": 0.7076,
"step": 47000
},
{
"epoch": 2.692967409948542,
"grad_norm": 17.55514144897461,
"learning_rate": 4.3954237427573994e-07,
"loss": 0.6757,
"step": 47100
},
{
"epoch": 2.698684962835906,
"grad_norm": 16.805578231811523,
"learning_rate": 4.377596844376864e-07,
"loss": 0.7015,
"step": 47200
},
{
"epoch": 2.7044025157232703,
"grad_norm": 13.724994659423828,
"learning_rate": 4.359777978876742e-07,
"loss": 0.7132,
"step": 47300
},
{
"epoch": 2.710120068610635,
"grad_norm": 15.69048023223877,
"learning_rate": 4.341967376231471e-07,
"loss": 0.6707,
"step": 47400
},
{
"epoch": 2.715837621497999,
"grad_norm": 15.279611587524414,
"learning_rate": 4.324165266308846e-07,
"loss": 0.7217,
"step": 47500
},
{
"epoch": 2.721555174385363,
"grad_norm": 14.593955039978027,
"learning_rate": 4.3063718788670523e-07,
"loss": 0.6932,
"step": 47600
},
{
"epoch": 2.7272727272727275,
"grad_norm": 17.884376525878906,
"learning_rate": 4.288587443551705e-07,
"loss": 0.7203,
"step": 47700
},
{
"epoch": 2.7329902801600916,
"grad_norm": 14.831436157226562,
"learning_rate": 4.270812189892873e-07,
"loss": 0.6927,
"step": 47800
},
{
"epoch": 2.7387078330474557,
"grad_norm": 13.828678131103516,
"learning_rate": 4.253046347302133e-07,
"loss": 0.726,
"step": 47900
},
{
"epoch": 2.7444253859348198,
"grad_norm": 16.05724334716797,
"learning_rate": 4.235290145069594e-07,
"loss": 0.6964,
"step": 48000
},
{
"epoch": 2.750142938822184,
"grad_norm": 12.63790512084961,
"learning_rate": 4.2175438123609475e-07,
"loss": 0.6944,
"step": 48100
},
{
"epoch": 2.7558604917095484,
"grad_norm": 16.55657386779785,
"learning_rate": 4.199807578214506e-07,
"loss": 0.6944,
"step": 48200
},
{
"epoch": 2.7615780445969125,
"grad_norm": 16.512378692626953,
"learning_rate": 4.182081671538248e-07,
"loss": 0.6999,
"step": 48300
},
{
"epoch": 2.767295597484277,
"grad_norm": 16.38055992126465,
"learning_rate": 4.1643663211068645e-07,
"loss": 0.6956,
"step": 48400
},
{
"epoch": 2.773013150371641,
"grad_norm": 19.406944274902344,
"learning_rate": 4.1466617555588e-07,
"loss": 0.6961,
"step": 48500
},
{
"epoch": 2.778730703259005,
"grad_norm": 15.556472778320312,
"learning_rate": 4.1289682033933114e-07,
"loss": 0.7167,
"step": 48600
},
{
"epoch": 2.7844482561463693,
"grad_norm": 19.273683547973633,
"learning_rate": 4.1112858929675145e-07,
"loss": 0.7002,
"step": 48700
},
{
"epoch": 2.7901658090337333,
"grad_norm": 16.052841186523438,
"learning_rate": 4.093615052493433e-07,
"loss": 0.7183,
"step": 48800
},
{
"epoch": 2.795883361921098,
"grad_norm": 21.037002563476562,
"learning_rate": 4.0759559100350605e-07,
"loss": 0.685,
"step": 48900
},
{
"epoch": 2.801600914808462,
"grad_norm": 14.775420188903809,
"learning_rate": 4.0583086935054136e-07,
"loss": 0.706,
"step": 49000
},
{
"epoch": 2.807318467695826,
"grad_norm": 14.18565559387207,
"learning_rate": 4.040673630663583e-07,
"loss": 0.6957,
"step": 49100
},
{
"epoch": 2.8130360205831906,
"grad_norm": 17.286367416381836,
"learning_rate": 4.023050949111809e-07,
"loss": 0.7291,
"step": 49200
},
{
"epoch": 2.8187535734705547,
"grad_norm": 14.268885612487793,
"learning_rate": 4.0054408762925343e-07,
"loss": 0.7156,
"step": 49300
},
{
"epoch": 2.8244711263579187,
"grad_norm": 16.985570907592773,
"learning_rate": 3.9878436394854685e-07,
"loss": 0.6913,
"step": 49400
},
{
"epoch": 2.830188679245283,
"grad_norm": 19.517696380615234,
"learning_rate": 3.970259465804658e-07,
"loss": 0.7179,
"step": 49500
},
{
"epoch": 2.8359062321326474,
"grad_norm": 20.2652530670166,
"learning_rate": 3.952688582195553e-07,
"loss": 0.7018,
"step": 49600
},
{
"epoch": 2.8416237850200115,
"grad_norm": 19.40163803100586,
"learning_rate": 3.9351312154320787e-07,
"loss": 0.6981,
"step": 49700
},
{
"epoch": 2.8473413379073755,
"grad_norm": 17.079002380371094,
"learning_rate": 3.9175875921137094e-07,
"loss": 0.6888,
"step": 49800
},
{
"epoch": 2.85305889079474,
"grad_norm": 16.94386100769043,
"learning_rate": 3.90005793866254e-07,
"loss": 0.6734,
"step": 49900
},
{
"epoch": 2.858776443682104,
"grad_norm": 18.46063804626465,
"learning_rate": 3.882542481320373e-07,
"loss": 0.7051,
"step": 50000
},
{
"epoch": 2.8644939965694682,
"grad_norm": 18.862567901611328,
"learning_rate": 3.865041446145788e-07,
"loss": 0.7099,
"step": 50100
},
{
"epoch": 2.8702115494568323,
"grad_norm": 16.764009475708008,
"learning_rate": 3.847555059011231e-07,
"loss": 0.6957,
"step": 50200
},
{
"epoch": 2.8759291023441964,
"grad_norm": 18.46530532836914,
"learning_rate": 3.830083545600097e-07,
"loss": 0.6949,
"step": 50300
},
{
"epoch": 2.881646655231561,
"grad_norm": 14.511909484863281,
"learning_rate": 3.812627131403815e-07,
"loss": 0.6842,
"step": 50400
},
{
"epoch": 2.887364208118925,
"grad_norm": 14.872234344482422,
"learning_rate": 3.795186041718941e-07,
"loss": 0.6798,
"step": 50500
},
{
"epoch": 2.8930817610062896,
"grad_norm": 13.427173614501953,
"learning_rate": 3.7777605016442514e-07,
"loss": 0.7098,
"step": 50600
},
{
"epoch": 2.8987993138936536,
"grad_norm": 17.797651290893555,
"learning_rate": 3.7603507360778324e-07,
"loss": 0.6901,
"step": 50700
},
{
"epoch": 2.9045168667810177,
"grad_norm": 18.58063507080078,
"learning_rate": 3.7429569697141827e-07,
"loss": 0.6859,
"step": 50800
},
{
"epoch": 2.910234419668382,
"grad_norm": 16.973247528076172,
"learning_rate": 3.7255794270413123e-07,
"loss": 0.6969,
"step": 50900
},
{
"epoch": 2.915951972555746,
"grad_norm": 15.047221183776855,
"learning_rate": 3.708218332337841e-07,
"loss": 0.6913,
"step": 51000
},
{
"epoch": 2.9216695254431104,
"grad_norm": 16.692338943481445,
"learning_rate": 3.6908739096701145e-07,
"loss": 0.695,
"step": 51100
},
{
"epoch": 2.9273870783304745,
"grad_norm": 15.817334175109863,
"learning_rate": 3.6735463828892956e-07,
"loss": 0.6864,
"step": 51200
},
{
"epoch": 2.933104631217839,
"grad_norm": 16.537561416625977,
"learning_rate": 3.65623597562849e-07,
"loss": 0.6994,
"step": 51300
},
{
"epoch": 2.938822184105203,
"grad_norm": 17.43041229248047,
"learning_rate": 3.6389429112998574e-07,
"loss": 0.7183,
"step": 51400
},
{
"epoch": 2.944539736992567,
"grad_norm": 14.238208770751953,
"learning_rate": 3.62166741309172e-07,
"loss": 0.711,
"step": 51500
},
{
"epoch": 2.9502572898799313,
"grad_norm": 18.8483829498291,
"learning_rate": 3.6044097039656917e-07,
"loss": 0.6999,
"step": 51600
},
{
"epoch": 2.9559748427672954,
"grad_norm": 18.139741897583008,
"learning_rate": 3.587170006653794e-07,
"loss": 0.7067,
"step": 51700
},
{
"epoch": 2.96169239565466,
"grad_norm": 16.7657470703125,
"learning_rate": 3.569948543655588e-07,
"loss": 0.7044,
"step": 51800
},
{
"epoch": 2.967409948542024,
"grad_norm": 15.640189170837402,
"learning_rate": 3.55274553723529e-07,
"loss": 0.7125,
"step": 51900
},
{
"epoch": 2.973127501429388,
"grad_norm": 17.32538414001465,
"learning_rate": 3.535561209418918e-07,
"loss": 0.6741,
"step": 52000
},
{
"epoch": 2.9788450543167526,
"grad_norm": 16.29108428955078,
"learning_rate": 3.51839578199142e-07,
"loss": 0.6876,
"step": 52100
},
{
"epoch": 2.9845626072041167,
"grad_norm": 16.50261116027832,
"learning_rate": 3.5012494764938095e-07,
"loss": 0.6672,
"step": 52200
},
{
"epoch": 2.990280160091481,
"grad_norm": 18.512115478515625,
"learning_rate": 3.4841225142203045e-07,
"loss": 0.6906,
"step": 52300
},
{
"epoch": 2.995997712978845,
"grad_norm": 17.311607360839844,
"learning_rate": 3.4670151162154825e-07,
"loss": 0.7176,
"step": 52400
},
{
"epoch": 3.0017152658662094,
"grad_norm": 17.8248348236084,
"learning_rate": 3.4499275032714116e-07,
"loss": 0.7088,
"step": 52500
},
{
"epoch": 3.0074328187535735,
"grad_norm": 17.059553146362305,
"learning_rate": 3.4328598959248134e-07,
"loss": 0.6826,
"step": 52600
},
{
"epoch": 3.0131503716409376,
"grad_norm": 18.721872329711914,
"learning_rate": 3.415812514454215e-07,
"loss": 0.6721,
"step": 52700
},
{
"epoch": 3.018867924528302,
"grad_norm": 16.92786979675293,
"learning_rate": 3.398785578877098e-07,
"loss": 0.709,
"step": 52800
},
{
"epoch": 3.024585477415666,
"grad_norm": 16.176301956176758,
"learning_rate": 3.38177930894707e-07,
"loss": 0.6862,
"step": 52900
},
{
"epoch": 3.0303030303030303,
"grad_norm": 16.669635772705078,
"learning_rate": 3.36479392415102e-07,
"loss": 0.7259,
"step": 53000
},
{
"epoch": 3.0360205831903944,
"grad_norm": 18.89837074279785,
"learning_rate": 3.3478296437062905e-07,
"loss": 0.6883,
"step": 53100
},
{
"epoch": 3.041738136077759,
"grad_norm": 18.024032592773438,
"learning_rate": 3.330886686557841e-07,
"loss": 0.6985,
"step": 53200
},
{
"epoch": 3.047455688965123,
"grad_norm": 16.023338317871094,
"learning_rate": 3.313965271375433e-07,
"loss": 0.7058,
"step": 53300
},
{
"epoch": 3.053173241852487,
"grad_norm": 16.61005401611328,
"learning_rate": 3.2970656165508017e-07,
"loss": 0.6842,
"step": 53400
},
{
"epoch": 3.058890794739851,
"grad_norm": 19.768972396850586,
"learning_rate": 3.280187940194836e-07,
"loss": 0.685,
"step": 53500
},
{
"epoch": 3.0646083476272157,
"grad_norm": 18.239892959594727,
"learning_rate": 3.263332460134767e-07,
"loss": 0.694,
"step": 53600
},
{
"epoch": 3.0703259005145798,
"grad_norm": 20.76768684387207,
"learning_rate": 3.246499393911356e-07,
"loss": 0.6915,
"step": 53700
},
{
"epoch": 3.076043453401944,
"grad_norm": 15.302906036376953,
"learning_rate": 3.229688958776086e-07,
"loss": 0.6968,
"step": 53800
},
{
"epoch": 3.0817610062893084,
"grad_norm": 17.009716033935547,
"learning_rate": 3.212901371688353e-07,
"loss": 0.6893,
"step": 53900
},
{
"epoch": 3.0874785591766725,
"grad_norm": 16.54390525817871,
"learning_rate": 3.1961368493126784e-07,
"loss": 0.6914,
"step": 54000
},
{
"epoch": 3.0931961120640366,
"grad_norm": 16.515085220336914,
"learning_rate": 3.179395608015898e-07,
"loss": 0.6964,
"step": 54100
},
{
"epoch": 3.0989136649514006,
"grad_norm": 17.82352638244629,
"learning_rate": 3.1626778638643816e-07,
"loss": 0.6875,
"step": 54200
},
{
"epoch": 3.104631217838765,
"grad_norm": 26.154184341430664,
"learning_rate": 3.1459838326212375e-07,
"loss": 0.6829,
"step": 54300
},
{
"epoch": 3.1103487707261293,
"grad_norm": 15.472237586975098,
"learning_rate": 3.1293137297435294e-07,
"loss": 0.6771,
"step": 54400
},
{
"epoch": 3.1160663236134933,
"grad_norm": 19.009233474731445,
"learning_rate": 3.1126677703794995e-07,
"loss": 0.6897,
"step": 54500
},
{
"epoch": 3.1217838765008574,
"grad_norm": 19.676128387451172,
"learning_rate": 3.09604616936578e-07,
"loss": 0.6954,
"step": 54600
},
{
"epoch": 3.127501429388222,
"grad_norm": 15.318999290466309,
"learning_rate": 3.079449141224636e-07,
"loss": 0.683,
"step": 54700
},
{
"epoch": 3.133218982275586,
"grad_norm": 16.714824676513672,
"learning_rate": 3.0628769001611873e-07,
"loss": 0.6736,
"step": 54800
},
{
"epoch": 3.13893653516295,
"grad_norm": 19.067108154296875,
"learning_rate": 3.0463296600606434e-07,
"loss": 0.691,
"step": 54900
},
{
"epoch": 3.1446540880503147,
"grad_norm": 17.589868545532227,
"learning_rate": 3.029807634485551e-07,
"loss": 0.6967,
"step": 55000
},
{
"epoch": 3.1503716409376787,
"grad_norm": 15.109379768371582,
"learning_rate": 3.013311036673025e-07,
"loss": 0.6896,
"step": 55100
},
{
"epoch": 3.156089193825043,
"grad_norm": 16.637935638427734,
"learning_rate": 2.9968400795320125e-07,
"loss": 0.6745,
"step": 55200
},
{
"epoch": 3.161806746712407,
"grad_norm": 15.541582107543945,
"learning_rate": 2.9803949756405254e-07,
"loss": 0.6814,
"step": 55300
},
{
"epoch": 3.1675242995997714,
"grad_norm": 15.901352882385254,
"learning_rate": 2.9639759372429166e-07,
"loss": 0.6652,
"step": 55400
},
{
"epoch": 3.1732418524871355,
"grad_norm": 15.980581283569336,
"learning_rate": 2.947583176247128e-07,
"loss": 0.6916,
"step": 55500
},
{
"epoch": 3.1789594053744996,
"grad_norm": 20.715810775756836,
"learning_rate": 2.93121690422196e-07,
"loss": 0.7056,
"step": 55600
},
{
"epoch": 3.184676958261864,
"grad_norm": 15.07290267944336,
"learning_rate": 2.914877332394339e-07,
"loss": 0.6879,
"step": 55700
},
{
"epoch": 3.1903945111492282,
"grad_norm": 16.82330322265625,
"learning_rate": 2.8985646716465915e-07,
"loss": 0.6915,
"step": 55800
},
{
"epoch": 3.1961120640365923,
"grad_norm": 18.356937408447266,
"learning_rate": 2.882279132513731e-07,
"loss": 0.6875,
"step": 55900
},
{
"epoch": 3.2018296169239564,
"grad_norm": 15.063898086547852,
"learning_rate": 2.8660209251807177e-07,
"loss": 0.6848,
"step": 56000
},
{
"epoch": 3.207547169811321,
"grad_norm": 18.487459182739258,
"learning_rate": 2.8497902594797785e-07,
"loss": 0.6903,
"step": 56100
},
{
"epoch": 3.213264722698685,
"grad_norm": 14.276288986206055,
"learning_rate": 2.83358734488767e-07,
"loss": 0.6849,
"step": 56200
},
{
"epoch": 3.218982275586049,
"grad_norm": 16.27110481262207,
"learning_rate": 2.8174123905229897e-07,
"loss": 0.6939,
"step": 56300
},
{
"epoch": 3.224699828473413,
"grad_norm": 17.376869201660156,
"learning_rate": 2.8012656051434776e-07,
"loss": 0.6865,
"step": 56400
},
{
"epoch": 3.2304173813607777,
"grad_norm": 18.101781845092773,
"learning_rate": 2.785147197143312e-07,
"loss": 0.6857,
"step": 56500
},
{
"epoch": 3.236134934248142,
"grad_norm": 19.654117584228516,
"learning_rate": 2.7690573745504304e-07,
"loss": 0.6939,
"step": 56600
},
{
"epoch": 3.241852487135506,
"grad_norm": 16.06427001953125,
"learning_rate": 2.752996345023836e-07,
"loss": 0.6955,
"step": 56700
},
{
"epoch": 3.2475700400228704,
"grad_norm": 16.123897552490234,
"learning_rate": 2.7369643158509303e-07,
"loss": 0.7136,
"step": 56800
},
{
"epoch": 3.2532875929102345,
"grad_norm": 17.64776611328125,
"learning_rate": 2.72096149394482e-07,
"loss": 0.6974,
"step": 56900
},
{
"epoch": 3.2590051457975986,
"grad_norm": 15.063745498657227,
"learning_rate": 2.704988085841661e-07,
"loss": 0.681,
"step": 57000
},
{
"epoch": 3.2647226986849627,
"grad_norm": 16.657075881958008,
"learning_rate": 2.689044297697988e-07,
"loss": 0.7049,
"step": 57100
},
{
"epoch": 3.270440251572327,
"grad_norm": 15.873147964477539,
"learning_rate": 2.6731303352880523e-07,
"loss": 0.6895,
"step": 57200
},
{
"epoch": 3.2761578044596913,
"grad_norm": 17.87798500061035,
"learning_rate": 2.6572464040011674e-07,
"loss": 0.6888,
"step": 57300
},
{
"epoch": 3.2818753573470554,
"grad_norm": 15.981317520141602,
"learning_rate": 2.6413927088390554e-07,
"loss": 0.6955,
"step": 57400
},
{
"epoch": 3.2875929102344195,
"grad_norm": 17.505796432495117,
"learning_rate": 2.625569454413212e-07,
"loss": 0.6938,
"step": 57500
},
{
"epoch": 3.293310463121784,
"grad_norm": 15.487998962402344,
"learning_rate": 2.6097768449422473e-07,
"loss": 0.6791,
"step": 57600
},
{
"epoch": 3.299028016009148,
"grad_norm": 13.96382999420166,
"learning_rate": 2.59401508424927e-07,
"loss": 0.6829,
"step": 57700
},
{
"epoch": 3.304745568896512,
"grad_norm": 17.845163345336914,
"learning_rate": 2.578284375759241e-07,
"loss": 0.6899,
"step": 57800
},
{
"epoch": 3.3104631217838767,
"grad_norm": 16.549325942993164,
"learning_rate": 2.5625849224963557e-07,
"loss": 0.6748,
"step": 57900
},
{
"epoch": 3.316180674671241,
"grad_norm": 17.144500732421875,
"learning_rate": 2.5469169270814226e-07,
"loss": 0.6887,
"step": 58000
},
{
"epoch": 3.321898227558605,
"grad_norm": 16.58336067199707,
"learning_rate": 2.531280591729247e-07,
"loss": 0.6905,
"step": 58100
},
{
"epoch": 3.327615780445969,
"grad_norm": 19.05868911743164,
"learning_rate": 2.515676118246025e-07,
"loss": 0.6878,
"step": 58200
},
{
"epoch": 3.3333333333333335,
"grad_norm": 15.123451232910156,
"learning_rate": 2.5001037080267316e-07,
"loss": 0.6896,
"step": 58300
},
{
"epoch": 3.3390508862206976,
"grad_norm": 14.63602352142334,
"learning_rate": 2.484563562052532e-07,
"loss": 0.7065,
"step": 58400
},
{
"epoch": 3.3447684391080617,
"grad_norm": 17.01044273376465,
"learning_rate": 2.4690558808881745e-07,
"loss": 0.6992,
"step": 58500
},
{
"epoch": 3.350485991995426,
"grad_norm": 14.306794166564941,
"learning_rate": 2.4535808646794125e-07,
"loss": 0.7112,
"step": 58600
},
{
"epoch": 3.3562035448827903,
"grad_norm": 14.732017517089844,
"learning_rate": 2.438138713150418e-07,
"loss": 0.6792,
"step": 58700
},
{
"epoch": 3.3619210977701544,
"grad_norm": 16.027528762817383,
"learning_rate": 2.4227296256012013e-07,
"loss": 0.681,
"step": 58800
},
{
"epoch": 3.3676386506575184,
"grad_norm": 19.412641525268555,
"learning_rate": 2.407353800905047e-07,
"loss": 0.7146,
"step": 58900
},
{
"epoch": 3.373356203544883,
"grad_norm": 17.197065353393555,
"learning_rate": 2.3920114375059336e-07,
"loss": 0.6998,
"step": 59000
},
{
"epoch": 3.379073756432247,
"grad_norm": 19.924795150756836,
"learning_rate": 2.3767027334159883e-07,
"loss": 0.6898,
"step": 59100
},
{
"epoch": 3.384791309319611,
"grad_norm": 18.47333335876465,
"learning_rate": 2.3614278862129168e-07,
"loss": 0.708,
"step": 59200
},
{
"epoch": 3.3905088622069752,
"grad_norm": 19.336467742919922,
"learning_rate": 2.3461870930374667e-07,
"loss": 0.6849,
"step": 59300
},
{
"epoch": 3.3962264150943398,
"grad_norm": 17.760784149169922,
"learning_rate": 2.3309805505908664e-07,
"loss": 0.6956,
"step": 59400
},
{
"epoch": 3.401943967981704,
"grad_norm": 16.625778198242188,
"learning_rate": 2.3158084551323064e-07,
"loss": 0.69,
"step": 59500
},
{
"epoch": 3.407661520869068,
"grad_norm": 16.27397346496582,
"learning_rate": 2.300671002476392e-07,
"loss": 0.6804,
"step": 59600
},
{
"epoch": 3.413379073756432,
"grad_norm": 17.837446212768555,
"learning_rate": 2.2855683879906184e-07,
"loss": 0.6639,
"step": 59700
},
{
"epoch": 3.4190966266437965,
"grad_norm": 13.911274909973145,
"learning_rate": 2.2705008065928567e-07,
"loss": 0.6826,
"step": 59800
},
{
"epoch": 3.4248141795311606,
"grad_norm": 14.893251419067383,
"learning_rate": 2.2554684527488278e-07,
"loss": 0.6715,
"step": 59900
},
{
"epoch": 3.4305317324185247,
"grad_norm": 15.816070556640625,
"learning_rate": 2.2404715204695995e-07,
"loss": 0.6893,
"step": 60000
},
{
"epoch": 3.4362492853058892,
"grad_norm": 14.913555145263672,
"learning_rate": 2.2255102033090794e-07,
"loss": 0.6882,
"step": 60100
},
{
"epoch": 3.4419668381932533,
"grad_norm": 18.053449630737305,
"learning_rate": 2.210584694361522e-07,
"loss": 0.6733,
"step": 60200
},
{
"epoch": 3.4476843910806174,
"grad_norm": 17.731204986572266,
"learning_rate": 2.195695186259028e-07,
"loss": 0.6832,
"step": 60300
},
{
"epoch": 3.4534019439679815,
"grad_norm": 18.586484909057617,
"learning_rate": 2.1808418711690635e-07,
"loss": 0.6723,
"step": 60400
},
{
"epoch": 3.459119496855346,
"grad_norm": 16.404525756835938,
"learning_rate": 2.1660249407919828e-07,
"loss": 0.6914,
"step": 60500
},
{
"epoch": 3.46483704974271,
"grad_norm": 17.5385684967041,
"learning_rate": 2.151244586358547e-07,
"loss": 0.6789,
"step": 60600
},
{
"epoch": 3.470554602630074,
"grad_norm": 14.214513778686523,
"learning_rate": 2.1365009986274607e-07,
"loss": 0.7167,
"step": 60700
},
{
"epoch": 3.4762721555174387,
"grad_norm": 16.127033233642578,
"learning_rate": 2.1217943678829063e-07,
"loss": 0.6924,
"step": 60800
},
{
"epoch": 3.481989708404803,
"grad_norm": 18.360797882080078,
"learning_rate": 2.1071248839320975e-07,
"loss": 0.6815,
"step": 60900
},
{
"epoch": 3.487707261292167,
"grad_norm": 15.16903018951416,
"learning_rate": 2.0924927361028145e-07,
"loss": 0.6873,
"step": 61000
},
{
"epoch": 3.493424814179531,
"grad_norm": 15.084009170532227,
"learning_rate": 2.0778981132409756e-07,
"loss": 0.7026,
"step": 61100
},
{
"epoch": 3.4991423670668955,
"grad_norm": 17.53474235534668,
"learning_rate": 2.0633412037081894e-07,
"loss": 0.6972,
"step": 61200
},
{
"epoch": 3.5048599199542596,
"grad_norm": 14.676159858703613,
"learning_rate": 2.0488221953793278e-07,
"loss": 0.6897,
"step": 61300
},
{
"epoch": 3.5105774728416237,
"grad_norm": 15.6426420211792,
"learning_rate": 2.0343412756401023e-07,
"loss": 0.6925,
"step": 61400
},
{
"epoch": 3.516295025728988,
"grad_norm": 17.42006492614746,
"learning_rate": 2.0198986313846406e-07,
"loss": 0.67,
"step": 61500
},
{
"epoch": 3.5220125786163523,
"grad_norm": 17.58439826965332,
"learning_rate": 2.0054944490130844e-07,
"loss": 0.7124,
"step": 61600
},
{
"epoch": 3.5277301315037164,
"grad_norm": 16.255352020263672,
"learning_rate": 1.9911289144291704e-07,
"loss": 0.6698,
"step": 61700
},
{
"epoch": 3.5334476843910805,
"grad_norm": 19.97213363647461,
"learning_rate": 1.9768022130378454e-07,
"loss": 0.6892,
"step": 61800
},
{
"epoch": 3.5391652372784446,
"grad_norm": 18.21723747253418,
"learning_rate": 1.962514529742859e-07,
"loss": 0.6773,
"step": 61900
},
{
"epoch": 3.544882790165809,
"grad_norm": 16.968833923339844,
"learning_rate": 1.9482660489443876e-07,
"loss": 0.6759,
"step": 62000
},
{
"epoch": 3.550600343053173,
"grad_norm": 22.698190689086914,
"learning_rate": 1.9340569545366487e-07,
"loss": 0.6938,
"step": 62100
},
{
"epoch": 3.5563178959405377,
"grad_norm": 16.91656494140625,
"learning_rate": 1.9198874299055345e-07,
"loss": 0.6757,
"step": 62200
},
{
"epoch": 3.562035448827902,
"grad_norm": 14.955994606018066,
"learning_rate": 1.9057576579262362e-07,
"loss": 0.6737,
"step": 62300
},
{
"epoch": 3.567753001715266,
"grad_norm": 13.794143676757812,
"learning_rate": 1.891667820960887e-07,
"loss": 0.7063,
"step": 62400
},
{
"epoch": 3.57347055460263,
"grad_norm": 18.18063735961914,
"learning_rate": 1.8776181008562165e-07,
"loss": 0.6624,
"step": 62500
},
{
"epoch": 3.579188107489994,
"grad_norm": 18.60443878173828,
"learning_rate": 1.863608678941187e-07,
"loss": 0.7291,
"step": 62600
},
{
"epoch": 3.5849056603773586,
"grad_norm": 20.440473556518555,
"learning_rate": 1.8496397360246735e-07,
"loss": 0.6804,
"step": 62700
},
{
"epoch": 3.5906232132647227,
"grad_norm": 18.79050064086914,
"learning_rate": 1.8357114523931078e-07,
"loss": 0.681,
"step": 62800
},
{
"epoch": 3.5963407661520868,
"grad_norm": 17.849271774291992,
"learning_rate": 1.8218240078081737e-07,
"loss": 0.715,
"step": 62900
},
{
"epoch": 3.6020583190394513,
"grad_norm": 16.68122673034668,
"learning_rate": 1.8079775815044724e-07,
"loss": 0.6771,
"step": 63000
},
{
"epoch": 3.6077758719268154,
"grad_norm": 21.269062042236328,
"learning_rate": 1.7941723521872114e-07,
"loss": 0.6802,
"step": 63100
},
{
"epoch": 3.6134934248141795,
"grad_norm": 16.997730255126953,
"learning_rate": 1.780408498029906e-07,
"loss": 0.6978,
"step": 63200
},
{
"epoch": 3.6192109777015435,
"grad_norm": 16.483427047729492,
"learning_rate": 1.7666861966720698e-07,
"loss": 0.6705,
"step": 63300
},
{
"epoch": 3.624928530588908,
"grad_norm": 18.589954376220703,
"learning_rate": 1.7530056252169274e-07,
"loss": 0.6788,
"step": 63400
},
{
"epoch": 3.630646083476272,
"grad_norm": 16.00810432434082,
"learning_rate": 1.7393669602291244e-07,
"loss": 0.6839,
"step": 63500
},
{
"epoch": 3.6363636363636362,
"grad_norm": 18.412017822265625,
"learning_rate": 1.7257703777324595e-07,
"loss": 0.7016,
"step": 63600
},
{
"epoch": 3.6420811892510008,
"grad_norm": 18.97647476196289,
"learning_rate": 1.712216053207597e-07,
"loss": 0.6791,
"step": 63700
},
{
"epoch": 3.647798742138365,
"grad_norm": 16.424823760986328,
"learning_rate": 1.6987041615898152e-07,
"loss": 0.6766,
"step": 63800
},
{
"epoch": 3.653516295025729,
"grad_norm": 16.81319808959961,
"learning_rate": 1.6852348772667406e-07,
"loss": 0.7148,
"step": 63900
},
{
"epoch": 3.659233847913093,
"grad_norm": 18.625141143798828,
"learning_rate": 1.6718083740761013e-07,
"loss": 0.681,
"step": 64000
},
{
"epoch": 3.6649514008004576,
"grad_norm": 20.720613479614258,
"learning_rate": 1.6584248253034804e-07,
"loss": 0.6653,
"step": 64100
},
{
"epoch": 3.6706689536878216,
"grad_norm": 19.35886573791504,
"learning_rate": 1.6450844036800814e-07,
"loss": 0.7049,
"step": 64200
},
{
"epoch": 3.6763865065751857,
"grad_norm": 15.835514068603516,
"learning_rate": 1.6317872813805033e-07,
"loss": 0.679,
"step": 64300
},
{
"epoch": 3.6821040594625503,
"grad_norm": 17.737224578857422,
"learning_rate": 1.6185336300205073e-07,
"loss": 0.6774,
"step": 64400
},
{
"epoch": 3.6878216123499143,
"grad_norm": 15.835929870605469,
"learning_rate": 1.6053236206548143e-07,
"loss": 0.6842,
"step": 64500
},
{
"epoch": 3.6935391652372784,
"grad_norm": 16.491268157958984,
"learning_rate": 1.5921574237748868e-07,
"loss": 0.7032,
"step": 64600
},
{
"epoch": 3.6992567181246425,
"grad_norm": 19.976850509643555,
"learning_rate": 1.5790352093067365e-07,
"loss": 0.6972,
"step": 64700
},
{
"epoch": 3.7049742710120066,
"grad_norm": 17.204557418823242,
"learning_rate": 1.5659571466087253e-07,
"loss": 0.6979,
"step": 64800
},
{
"epoch": 3.710691823899371,
"grad_norm": 16.28580093383789,
"learning_rate": 1.5529234044693818e-07,
"loss": 0.672,
"step": 64900
},
{
"epoch": 3.716409376786735,
"grad_norm": 16.73383903503418,
"learning_rate": 1.5399341511052278e-07,
"loss": 0.6713,
"step": 65000
},
{
"epoch": 3.7221269296740997,
"grad_norm": 16.786861419677734,
"learning_rate": 1.5269895541585965e-07,
"loss": 0.6799,
"step": 65100
},
{
"epoch": 3.727844482561464,
"grad_norm": 16.983375549316406,
"learning_rate": 1.51408978069548e-07,
"loss": 0.6916,
"step": 65200
},
{
"epoch": 3.733562035448828,
"grad_norm": 15.858549118041992,
"learning_rate": 1.5012349972033634e-07,
"loss": 0.6709,
"step": 65300
},
{
"epoch": 3.739279588336192,
"grad_norm": 19.247562408447266,
"learning_rate": 1.488425369589087e-07,
"loss": 0.7158,
"step": 65400
},
{
"epoch": 3.744997141223556,
"grad_norm": 15.711267471313477,
"learning_rate": 1.4756610631766896e-07,
"loss": 0.6888,
"step": 65500
},
{
"epoch": 3.7507146941109206,
"grad_norm": 19.656904220581055,
"learning_rate": 1.4629422427052934e-07,
"loss": 0.6687,
"step": 65600
},
{
"epoch": 3.7564322469982847,
"grad_norm": 16.6679630279541,
"learning_rate": 1.4502690723269633e-07,
"loss": 0.6728,
"step": 65700
},
{
"epoch": 3.762149799885649,
"grad_norm": 15.8047456741333,
"learning_rate": 1.4376417156045923e-07,
"loss": 0.6943,
"step": 65800
},
{
"epoch": 3.7678673527730133,
"grad_norm": 17.548009872436523,
"learning_rate": 1.425060335509798e-07,
"loss": 0.6843,
"step": 65900
},
{
"epoch": 3.7735849056603774,
"grad_norm": 18.866233825683594,
"learning_rate": 1.4125250944208039e-07,
"loss": 0.6661,
"step": 66000
},
{
"epoch": 3.7793024585477415,
"grad_norm": 16.393108367919922,
"learning_rate": 1.4000361541203637e-07,
"loss": 0.6903,
"step": 66100
},
{
"epoch": 3.7850200114351056,
"grad_norm": 19.90047836303711,
"learning_rate": 1.3875936757936496e-07,
"loss": 0.7255,
"step": 66200
},
{
"epoch": 3.79073756432247,
"grad_norm": 17.67254638671875,
"learning_rate": 1.3751978200261954e-07,
"loss": 0.686,
"step": 66300
},
{
"epoch": 3.796455117209834,
"grad_norm": 18.827899932861328,
"learning_rate": 1.3628487468018085e-07,
"loss": 0.6793,
"step": 66400
},
{
"epoch": 3.8021726700971983,
"grad_norm": 20.450260162353516,
"learning_rate": 1.3505466155005092e-07,
"loss": 0.6828,
"step": 66500
},
{
"epoch": 3.807890222984563,
"grad_norm": 18.889638900756836,
"learning_rate": 1.3382915848964777e-07,
"loss": 0.7031,
"step": 66600
},
{
"epoch": 3.813607775871927,
"grad_norm": 19.9688663482666,
"learning_rate": 1.3260838131559987e-07,
"loss": 0.6785,
"step": 66700
},
{
"epoch": 3.819325328759291,
"grad_norm": 17.95281982421875,
"learning_rate": 1.3139234578354235e-07,
"loss": 0.7063,
"step": 66800
},
{
"epoch": 3.825042881646655,
"grad_norm": 15.684403419494629,
"learning_rate": 1.3018106758791358e-07,
"loss": 0.6715,
"step": 66900
},
{
"epoch": 3.8307604345340196,
"grad_norm": 17.38626480102539,
"learning_rate": 1.2897456236175298e-07,
"loss": 0.701,
"step": 67000
},
{
"epoch": 3.8364779874213837,
"grad_norm": 16.06013298034668,
"learning_rate": 1.2777284567649825e-07,
"loss": 0.6707,
"step": 67100
},
{
"epoch": 3.8421955403087478,
"grad_norm": 16.085487365722656,
"learning_rate": 1.2657593304178583e-07,
"loss": 0.6858,
"step": 67200
},
{
"epoch": 3.8479130931961123,
"grad_norm": 18.255733489990234,
"learning_rate": 1.2538383990524937e-07,
"loss": 0.686,
"step": 67300
},
{
"epoch": 3.8536306460834764,
"grad_norm": 20.442323684692383,
"learning_rate": 1.2419658165232122e-07,
"loss": 0.6726,
"step": 67400
},
{
"epoch": 3.8593481989708405,
"grad_norm": 16.613813400268555,
"learning_rate": 1.230141736060335e-07,
"loss": 0.6884,
"step": 67500
},
{
"epoch": 3.8650657518582046,
"grad_norm": 16.380739212036133,
"learning_rate": 1.2183663102682029e-07,
"loss": 0.6694,
"step": 67600
},
{
"epoch": 3.8707833047455686,
"grad_norm": 17.78510093688965,
"learning_rate": 1.206639691123213e-07,
"loss": 0.6988,
"step": 67700
},
{
"epoch": 3.876500857632933,
"grad_norm": 16.328201293945312,
"learning_rate": 1.1949620299718467e-07,
"loss": 0.6882,
"step": 67800
},
{
"epoch": 3.8822184105202973,
"grad_norm": 17.736520767211914,
"learning_rate": 1.1833334775287273e-07,
"loss": 0.6861,
"step": 67900
},
{
"epoch": 3.887935963407662,
"grad_norm": 18.87068748474121,
"learning_rate": 1.1717541838746659e-07,
"loss": 0.6827,
"step": 68000
},
{
"epoch": 3.893653516295026,
"grad_norm": 21.540847778320312,
"learning_rate": 1.160224298454729e-07,
"loss": 0.7021,
"step": 68100
},
{
"epoch": 3.89937106918239,
"grad_norm": 15.944188117980957,
"learning_rate": 1.148743970076309e-07,
"loss": 0.7025,
"step": 68200
},
{
"epoch": 3.905088622069754,
"grad_norm": 17.067222595214844,
"learning_rate": 1.1373133469072033e-07,
"loss": 0.6799,
"step": 68300
},
{
"epoch": 3.910806174957118,
"grad_norm": 17.428482055664062,
"learning_rate": 1.1259325764737049e-07,
"loss": 0.6897,
"step": 68400
},
{
"epoch": 3.9165237278444827,
"grad_norm": 15.801222801208496,
"learning_rate": 1.1146018056586903e-07,
"loss": 0.6857,
"step": 68500
},
{
"epoch": 3.9222412807318467,
"grad_norm": 19.24358558654785,
"learning_rate": 1.1033211806997367e-07,
"loss": 0.6919,
"step": 68600
},
{
"epoch": 3.927958833619211,
"grad_norm": 17.26407814025879,
"learning_rate": 1.0920908471872192e-07,
"loss": 0.6943,
"step": 68700
},
{
"epoch": 3.9336763865065754,
"grad_norm": 17.91105842590332,
"learning_rate": 1.0809109500624486e-07,
"loss": 0.6927,
"step": 68800
},
{
"epoch": 3.9393939393939394,
"grad_norm": 21.539033889770508,
"learning_rate": 1.0697816336157805e-07,
"loss": 0.6827,
"step": 68900
},
{
"epoch": 3.9451114922813035,
"grad_norm": 15.918169021606445,
"learning_rate": 1.0587030414847753e-07,
"loss": 0.6805,
"step": 69000
},
{
"epoch": 3.9508290451686676,
"grad_norm": 18.38698959350586,
"learning_rate": 1.0476753166523278e-07,
"loss": 0.7088,
"step": 69100
},
{
"epoch": 3.956546598056032,
"grad_norm": 16.94473648071289,
"learning_rate": 1.0366986014448276e-07,
"loss": 0.6869,
"step": 69200
},
{
"epoch": 3.9622641509433962,
"grad_norm": 16.196962356567383,
"learning_rate": 1.0257730375303264e-07,
"loss": 0.6828,
"step": 69300
},
{
"epoch": 3.9679817038307603,
"grad_norm": 19.227867126464844,
"learning_rate": 1.0148987659166986e-07,
"loss": 0.6786,
"step": 69400
},
{
"epoch": 3.973699256718125,
"grad_norm": 15.71776294708252,
"learning_rate": 1.0040759269498373e-07,
"loss": 0.6987,
"step": 69500
},
{
"epoch": 3.979416809605489,
"grad_norm": 15.973411560058594,
"learning_rate": 9.933046603118229e-08,
"loss": 0.6943,
"step": 69600
},
{
"epoch": 3.985134362492853,
"grad_norm": 18.862775802612305,
"learning_rate": 9.825851050191402e-08,
"loss": 0.6896,
"step": 69700
},
{
"epoch": 3.990851915380217,
"grad_norm": 15.70366382598877,
"learning_rate": 9.719173994208718e-08,
"loss": 0.7044,
"step": 69800
},
{
"epoch": 3.9965694682675816,
"grad_norm": 17.324993133544922,
"learning_rate": 9.613016811969144e-08,
"loss": 0.6801,
"step": 69900
},
{
"epoch": 4.002287021154945,
"grad_norm": 16.205799102783203,
"learning_rate": 9.507380873562082e-08,
"loss": 0.6951,
"step": 70000
},
{
"epoch": 4.00800457404231,
"grad_norm": 17.790599822998047,
"learning_rate": 9.402267542349595e-08,
"loss": 0.6811,
"step": 70100
},
{
"epoch": 4.013722126929674,
"grad_norm": 15.449918746948242,
"learning_rate": 9.297678174948875e-08,
"loss": 0.6645,
"step": 70200
},
{
"epoch": 4.019439679817038,
"grad_norm": 24.729137420654297,
"learning_rate": 9.193614121214699e-08,
"loss": 0.668,
"step": 70300
},
{
"epoch": 4.0251572327044025,
"grad_norm": 18.29893684387207,
"learning_rate": 9.090076724222063e-08,
"loss": 0.7008,
"step": 70400
},
{
"epoch": 4.030874785591767,
"grad_norm": 17.138538360595703,
"learning_rate": 8.987067320248753e-08,
"loss": 0.6688,
"step": 70500
},
{
"epoch": 4.036592338479131,
"grad_norm": 18.376750946044922,
"learning_rate": 8.884587238758207e-08,
"loss": 0.6829,
"step": 70600
},
{
"epoch": 4.042309891366495,
"grad_norm": 17.92237663269043,
"learning_rate": 8.78263780238227e-08,
"loss": 0.7007,
"step": 70700
},
{
"epoch": 4.04802744425386,
"grad_norm": 16.868654251098633,
"learning_rate": 8.681220326904165e-08,
"loss": 0.6996,
"step": 70800
},
{
"epoch": 4.053744997141224,
"grad_norm": 17.588733673095703,
"learning_rate": 8.5803361212415e-08,
"loss": 0.6894,
"step": 70900
},
{
"epoch": 4.059462550028588,
"grad_norm": 18.68837547302246,
"learning_rate": 8.47998648742937e-08,
"loss": 0.6716,
"step": 71000
},
{
"epoch": 4.065180102915952,
"grad_norm": 19.368438720703125,
"learning_rate": 8.380172720603601e-08,
"loss": 0.6771,
"step": 71100
},
{
"epoch": 4.070897655803316,
"grad_norm": 18.050640106201172,
"learning_rate": 8.280896108983943e-08,
"loss": 0.6803,
"step": 71200
},
{
"epoch": 4.07661520869068,
"grad_norm": 16.39789581298828,
"learning_rate": 8.182157933857543e-08,
"loss": 0.6801,
"step": 71300
},
{
"epoch": 4.082332761578044,
"grad_norm": 19.105905532836914,
"learning_rate": 8.083959469562324e-08,
"loss": 0.6868,
"step": 71400
},
{
"epoch": 4.088050314465409,
"grad_norm": 18.83084487915039,
"learning_rate": 7.986301983470595e-08,
"loss": 0.6655,
"step": 71500
},
{
"epoch": 4.093767867352773,
"grad_norm": 15.07455062866211,
"learning_rate": 7.889186735972647e-08,
"loss": 0.6782,
"step": 71600
},
{
"epoch": 4.099485420240137,
"grad_norm": 22.17110252380371,
"learning_rate": 7.79261498046056e-08,
"loss": 0.668,
"step": 71700
},
{
"epoch": 4.1052029731275015,
"grad_norm": 15.451409339904785,
"learning_rate": 7.69658796331194e-08,
"loss": 0.6651,
"step": 71800
},
{
"epoch": 4.110920526014866,
"grad_norm": 17.61263656616211,
"learning_rate": 7.60110692387388e-08,
"loss": 0.695,
"step": 71900
},
{
"epoch": 4.11663807890223,
"grad_norm": 15.308425903320312,
"learning_rate": 7.506173094446982e-08,
"loss": 0.6937,
"step": 72000
},
{
"epoch": 4.122355631789594,
"grad_norm": 20.97743034362793,
"learning_rate": 7.41178770026939e-08,
"loss": 0.6808,
"step": 72100
},
{
"epoch": 4.128073184676959,
"grad_norm": 18.173051834106445,
"learning_rate": 7.317951959501056e-08,
"loss": 0.6914,
"step": 72200
},
{
"epoch": 4.133790737564323,
"grad_norm": 20.474937438964844,
"learning_rate": 7.224667083207925e-08,
"loss": 0.6714,
"step": 72300
},
{
"epoch": 4.139508290451687,
"grad_norm": 17.615968704223633,
"learning_rate": 7.131934275346408e-08,
"loss": 0.6595,
"step": 72400
},
{
"epoch": 4.145225843339051,
"grad_norm": 18.85563087463379,
"learning_rate": 7.039754732747766e-08,
"loss": 0.6724,
"step": 72500
},
{
"epoch": 4.150943396226415,
"grad_norm": 22.671489715576172,
"learning_rate": 6.948129645102674e-08,
"loss": 0.6994,
"step": 72600
},
{
"epoch": 4.156660949113779,
"grad_norm": 18.428152084350586,
"learning_rate": 6.857060194945924e-08,
"loss": 0.6705,
"step": 72700
},
{
"epoch": 4.162378502001143,
"grad_norm": 16.202274322509766,
"learning_rate": 6.766547557641084e-08,
"loss": 0.6793,
"step": 72800
},
{
"epoch": 4.168096054888507,
"grad_norm": 16.106399536132812,
"learning_rate": 6.676592901365408e-08,
"loss": 0.6542,
"step": 72900
},
{
"epoch": 4.173813607775872,
"grad_norm": 22.502248764038086,
"learning_rate": 6.587197387094645e-08,
"loss": 0.7127,
"step": 73000
},
{
"epoch": 4.179531160663236,
"grad_norm": 20.502391815185547,
"learning_rate": 6.4983621685882e-08,
"loss": 0.7128,
"step": 73100
},
{
"epoch": 4.1852487135506005,
"grad_norm": 21.65897560119629,
"learning_rate": 6.410088392374119e-08,
"loss": 0.6901,
"step": 73200
},
{
"epoch": 4.1909662664379645,
"grad_norm": 16.635141372680664,
"learning_rate": 6.322377197734379e-08,
"loss": 0.6806,
"step": 73300
},
{
"epoch": 4.196683819325329,
"grad_norm": 20.783342361450195,
"learning_rate": 6.235229716690132e-08,
"loss": 0.6817,
"step": 73400
},
{
"epoch": 4.202401372212693,
"grad_norm": 16.46611785888672,
"learning_rate": 6.148647073987113e-08,
"loss": 0.686,
"step": 73500
},
{
"epoch": 4.208118925100057,
"grad_norm": 17.907161712646484,
"learning_rate": 6.062630387081102e-08,
"loss": 0.6759,
"step": 73600
},
{
"epoch": 4.213836477987422,
"grad_norm": 17.944095611572266,
"learning_rate": 5.977180766123547e-08,
"loss": 0.7087,
"step": 73700
},
{
"epoch": 4.219554030874786,
"grad_norm": 15.839191436767578,
"learning_rate": 5.892299313947213e-08,
"loss": 0.6782,
"step": 73800
},
{
"epoch": 4.22527158376215,
"grad_norm": 21.773887634277344,
"learning_rate": 5.807987126051922e-08,
"loss": 0.6756,
"step": 73900
},
{
"epoch": 4.230989136649514,
"grad_norm": 16.483123779296875,
"learning_rate": 5.7242452905904625e-08,
"loss": 0.6674,
"step": 74000
},
{
"epoch": 4.236706689536878,
"grad_norm": 20.822376251220703,
"learning_rate": 5.6410748883545137e-08,
"loss": 0.7021,
"step": 74100
},
{
"epoch": 4.242424242424242,
"grad_norm": 21.443439483642578,
"learning_rate": 5.558476992760686e-08,
"loss": 0.6816,
"step": 74200
},
{
"epoch": 4.248141795311606,
"grad_norm": 20.461912155151367,
"learning_rate": 5.476452669836712e-08,
"loss": 0.6659,
"step": 74300
},
{
"epoch": 4.25385934819897,
"grad_norm": 18.985849380493164,
"learning_rate": 5.3950029782076356e-08,
"loss": 0.7025,
"step": 74400
},
{
"epoch": 4.259576901086335,
"grad_norm": 17.57728385925293,
"learning_rate": 5.31412896908221e-08,
"loss": 0.6988,
"step": 74500
},
{
"epoch": 4.265294453973699,
"grad_norm": 18.530332565307617,
"learning_rate": 5.2338316862392596e-08,
"loss": 0.68,
"step": 74600
},
{
"epoch": 4.2710120068610635,
"grad_norm": 21.527196884155273,
"learning_rate": 5.154112166014274e-08,
"loss": 0.6725,
"step": 74700
},
{
"epoch": 4.276729559748428,
"grad_norm": 21.66538429260254,
"learning_rate": 5.0749714372859744e-08,
"loss": 0.6753,
"step": 74800
},
{
"epoch": 4.282447112635792,
"grad_norm": 23.227149963378906,
"learning_rate": 4.9964105214631016e-08,
"loss": 0.6791,
"step": 74900
},
{
"epoch": 4.288164665523156,
"grad_norm": 17.627233505249023,
"learning_rate": 4.9184304324711433e-08,
"loss": 0.6875,
"step": 75000
},
{
"epoch": 4.29388221841052,
"grad_norm": 16.310253143310547,
"learning_rate": 4.841032176739363e-08,
"loss": 0.6664,
"step": 75100
},
{
"epoch": 4.299599771297885,
"grad_norm": 17.00972557067871,
"learning_rate": 4.764216753187694e-08,
"loss": 0.6977,
"step": 75200
},
{
"epoch": 4.305317324185249,
"grad_norm": 20.538524627685547,
"learning_rate": 4.6879851532139194e-08,
"loss": 0.6803,
"step": 75300
},
{
"epoch": 4.311034877072613,
"grad_norm": 16.814311981201172,
"learning_rate": 4.61233836068089e-08,
"loss": 0.6922,
"step": 75400
},
{
"epoch": 4.316752429959977,
"grad_norm": 23.158313751220703,
"learning_rate": 4.537277351903734e-08,
"loss": 0.6774,
"step": 75500
},
{
"epoch": 4.322469982847341,
"grad_norm": 14.462933540344238,
"learning_rate": 4.4628030956373966e-08,
"loss": 0.6544,
"step": 75600
},
{
"epoch": 4.328187535734705,
"grad_norm": 21.14085578918457,
"learning_rate": 4.388916553063965e-08,
"loss": 0.6752,
"step": 75700
},
{
"epoch": 4.333905088622069,
"grad_norm": 21.701650619506836,
"learning_rate": 4.315618677780436e-08,
"loss": 0.6853,
"step": 75800
},
{
"epoch": 4.339622641509434,
"grad_norm": 17.62489128112793,
"learning_rate": 4.242910415786288e-08,
"loss": 0.702,
"step": 75900
},
{
"epoch": 4.345340194396798,
"grad_norm": 18.198381423950195,
"learning_rate": 4.170792705471315e-08,
"loss": 0.6784,
"step": 76000
},
{
"epoch": 4.3510577472841625,
"grad_norm": 19.804584503173828,
"learning_rate": 4.0992664776035414e-08,
"loss": 0.7006,
"step": 76100
},
{
"epoch": 4.356775300171527,
"grad_norm": 17.359596252441406,
"learning_rate": 4.0283326553171346e-08,
"loss": 0.6784,
"step": 76200
},
{
"epoch": 4.362492853058891,
"grad_norm": 15.833115577697754,
"learning_rate": 3.9579921541005925e-08,
"loss": 0.6723,
"step": 76300
},
{
"epoch": 4.368210405946255,
"grad_norm": 16.290063858032227,
"learning_rate": 3.888245881784813e-08,
"loss": 0.6656,
"step": 76400
},
{
"epoch": 4.373927958833619,
"grad_norm": 16.540603637695312,
"learning_rate": 3.8190947385314865e-08,
"loss": 0.6761,
"step": 76500
},
{
"epoch": 4.379645511720984,
"grad_norm": 15.980658531188965,
"learning_rate": 3.750539616821402e-08,
"loss": 0.6656,
"step": 76600
},
{
"epoch": 4.385363064608348,
"grad_norm": 16.78759002685547,
"learning_rate": 3.682581401442969e-08,
"loss": 0.6996,
"step": 76700
},
{
"epoch": 4.391080617495712,
"grad_norm": 14.298684120178223,
"learning_rate": 3.6152209694807755e-08,
"loss": 0.6911,
"step": 76800
},
{
"epoch": 4.396798170383076,
"grad_norm": 16.176973342895508,
"learning_rate": 3.548459190304276e-08,
"loss": 0.6678,
"step": 76900
},
{
"epoch": 4.40251572327044,
"grad_norm": 16.40897560119629,
"learning_rate": 3.4822969255565934e-08,
"loss": 0.6915,
"step": 77000
},
{
"epoch": 4.408233276157804,
"grad_norm": 23.246614456176758,
"learning_rate": 3.41673502914333e-08,
"loss": 0.6919,
"step": 77100
},
{
"epoch": 4.413950829045168,
"grad_norm": 18.167871475219727,
"learning_rate": 3.351774347221653e-08,
"loss": 0.7224,
"step": 77200
},
{
"epoch": 4.419668381932533,
"grad_norm": 19.697547912597656,
"learning_rate": 3.287415718189268e-08,
"loss": 0.711,
"step": 77300
},
{
"epoch": 4.425385934819897,
"grad_norm": 17.16083335876465,
"learning_rate": 3.223659972673681e-08,
"loss": 0.6496,
"step": 77400
},
{
"epoch": 4.4311034877072615,
"grad_norm": 15.293824195861816,
"learning_rate": 3.160507933521422e-08,
"loss": 0.6649,
"step": 77500
},
{
"epoch": 4.436821040594626,
"grad_norm": 16.652938842773438,
"learning_rate": 3.097960415787443e-08,
"loss": 0.6901,
"step": 77600
},
{
"epoch": 4.44253859348199,
"grad_norm": 19.576251983642578,
"learning_rate": 3.036018226724607e-08,
"loss": 0.6654,
"step": 77700
},
{
"epoch": 4.448256146369354,
"grad_norm": 18.951457977294922,
"learning_rate": 2.9746821657732678e-08,
"loss": 0.6804,
"step": 77800
},
{
"epoch": 4.453973699256718,
"grad_norm": 20.799861907958984,
"learning_rate": 2.9139530245509526e-08,
"loss": 0.6934,
"step": 77900
},
{
"epoch": 4.459691252144083,
"grad_norm": 14.459927558898926,
"learning_rate": 2.8538315868421203e-08,
"loss": 0.6595,
"step": 78000
},
{
"epoch": 4.465408805031447,
"grad_norm": 16.431798934936523,
"learning_rate": 2.794318628588094e-08,
"loss": 0.6773,
"step": 78100
},
{
"epoch": 4.471126357918811,
"grad_norm": 15.203049659729004,
"learning_rate": 2.7354149178769936e-08,
"loss": 0.7064,
"step": 78200
},
{
"epoch": 4.476843910806175,
"grad_norm": 22.0187931060791,
"learning_rate": 2.6771212149338807e-08,
"loss": 0.6787,
"step": 78300
},
{
"epoch": 4.482561463693539,
"grad_norm": 17.56171226501465,
"learning_rate": 2.619438272110863e-08,
"loss": 0.6941,
"step": 78400
},
{
"epoch": 4.488279016580903,
"grad_norm": 20.990859985351562,
"learning_rate": 2.5623668338774885e-08,
"loss": 0.7069,
"step": 78500
},
{
"epoch": 4.493996569468267,
"grad_norm": 21.19357681274414,
"learning_rate": 2.5059076368110444e-08,
"loss": 0.6983,
"step": 78600
},
{
"epoch": 4.499714122355631,
"grad_norm": 16.244356155395508,
"learning_rate": 2.4500614095871174e-08,
"loss": 0.6788,
"step": 78700
},
{
"epoch": 4.505431675242996,
"grad_norm": 18.686336517333984,
"learning_rate": 2.3948288729701506e-08,
"loss": 0.6861,
"step": 78800
},
{
"epoch": 4.5111492281303605,
"grad_norm": 18.47435760498047,
"learning_rate": 2.3402107398041492e-08,
"loss": 0.6557,
"step": 78900
},
{
"epoch": 4.5168667810177245,
"grad_norm": 17.009096145629883,
"learning_rate": 2.286207715003502e-08,
"loss": 0.6917,
"step": 79000
},
{
"epoch": 4.522584333905089,
"grad_norm": 17.560272216796875,
"learning_rate": 2.2328204955438357e-08,
"loss": 0.705,
"step": 79100
},
{
"epoch": 4.528301886792453,
"grad_norm": 20.05234146118164,
"learning_rate": 2.180049770453085e-08,
"loss": 0.6759,
"step": 79200
},
{
"epoch": 4.534019439679817,
"grad_norm": 19.148618698120117,
"learning_rate": 2.1278962208025332e-08,
"loss": 0.668,
"step": 79300
},
{
"epoch": 4.539736992567181,
"grad_norm": 18.14238929748535,
"learning_rate": 2.0763605196980615e-08,
"loss": 0.6781,
"step": 79400
},
{
"epoch": 4.545454545454545,
"grad_norm": 18.119190216064453,
"learning_rate": 2.0254433322714758e-08,
"loss": 0.6533,
"step": 79500
},
{
"epoch": 4.55117209834191,
"grad_norm": 16.649980545043945,
"learning_rate": 1.9751453156718666e-08,
"loss": 0.6872,
"step": 79600
},
{
"epoch": 4.556889651229274,
"grad_norm": 15.812389373779297,
"learning_rate": 1.9254671190571948e-08,
"loss": 0.6863,
"step": 79700
},
{
"epoch": 4.562607204116638,
"grad_norm": 14.970600128173828,
"learning_rate": 1.876409383585842e-08,
"loss": 0.6896,
"step": 79800
},
{
"epoch": 4.568324757004002,
"grad_norm": 18.37346649169922,
"learning_rate": 1.827972742408407e-08,
"loss": 0.6693,
"step": 79900
},
{
"epoch": 4.574042309891366,
"grad_norm": 20.24564552307129,
"learning_rate": 1.7801578206594725e-08,
"loss": 0.6849,
"step": 80000
},
{
"epoch": 4.57975986277873,
"grad_norm": 19.068187713623047,
"learning_rate": 1.7329652354496016e-08,
"loss": 0.6902,
"step": 80100
},
{
"epoch": 4.5854774156660945,
"grad_norm": 18.084623336791992,
"learning_rate": 1.686395595857304e-08,
"loss": 0.6784,
"step": 80200
},
{
"epoch": 4.591194968553459,
"grad_norm": 16.805601119995117,
"learning_rate": 1.640449502921215e-08,
"loss": 0.6657,
"step": 80300
},
{
"epoch": 4.5969125214408235,
"grad_norm": 16.83079719543457,
"learning_rate": 1.595127549632347e-08,
"loss": 0.6951,
"step": 80400
},
{
"epoch": 4.602630074328188,
"grad_norm": 18.01809310913086,
"learning_rate": 1.5504303209263935e-08,
"loss": 0.6749,
"step": 80500
},
{
"epoch": 4.608347627215552,
"grad_norm": 16.533803939819336,
"learning_rate": 1.5063583936762325e-08,
"loss": 0.6836,
"step": 80600
},
{
"epoch": 4.614065180102916,
"grad_norm": 17.195920944213867,
"learning_rate": 1.4629123366844354e-08,
"loss": 0.6721,
"step": 80700
},
{
"epoch": 4.61978273299028,
"grad_norm": 17.525428771972656,
"learning_rate": 1.420092710675963e-08,
"loss": 0.688,
"step": 80800
},
{
"epoch": 4.625500285877644,
"grad_norm": 17.868335723876953,
"learning_rate": 1.3779000682909103e-08,
"loss": 0.6875,
"step": 80900
},
{
"epoch": 4.631217838765009,
"grad_norm": 14.877182006835938,
"learning_rate": 1.3363349540773561e-08,
"loss": 0.6591,
"step": 81000
},
{
"epoch": 4.636935391652373,
"grad_norm": 22.093046188354492,
"learning_rate": 1.2953979044843633e-08,
"loss": 0.6894,
"step": 81100
},
{
"epoch": 4.642652944539737,
"grad_norm": 19.40646743774414,
"learning_rate": 1.2550894478550678e-08,
"loss": 0.6662,
"step": 81200
},
{
"epoch": 4.648370497427101,
"grad_norm": 17.37119483947754,
"learning_rate": 1.2154101044198118e-08,
"loss": 0.6887,
"step": 81300
},
{
"epoch": 4.654088050314465,
"grad_norm": 17.71015739440918,
"learning_rate": 1.1763603862894544e-08,
"loss": 0.6801,
"step": 81400
},
{
"epoch": 4.659805603201829,
"grad_norm": 17.027597427368164,
"learning_rate": 1.137940797448783e-08,
"loss": 0.6865,
"step": 81500
},
{
"epoch": 4.665523156089193,
"grad_norm": 24.06705665588379,
"learning_rate": 1.1001518337499793e-08,
"loss": 0.6884,
"step": 81600
},
{
"epoch": 4.671240708976558,
"grad_norm": 18.24313735961914,
"learning_rate": 1.0629939829062351e-08,
"loss": 0.693,
"step": 81700
},
{
"epoch": 4.6769582618639225,
"grad_norm": 19.492963790893555,
"learning_rate": 1.0264677244854303e-08,
"loss": 0.6848,
"step": 81800
},
{
"epoch": 4.682675814751287,
"grad_norm": 18.08428955078125,
"learning_rate": 9.90573529903993e-09,
"loss": 0.6924,
"step": 81900
},
{
"epoch": 4.688393367638651,
"grad_norm": 14.725008964538574,
"learning_rate": 9.553118624207824e-09,
"loss": 0.7114,
"step": 82000
},
{
"epoch": 4.694110920526015,
"grad_norm": 20.670289993286133,
"learning_rate": 9.206831771310986e-09,
"loss": 0.6478,
"step": 82100
},
{
"epoch": 4.699828473413379,
"grad_norm": 16.0545597076416,
"learning_rate": 8.866879209608436e-09,
"loss": 0.7017,
"step": 82200
},
{
"epoch": 4.705546026300743,
"grad_norm": 18.441730499267578,
"learning_rate": 8.533265326607253e-09,
"loss": 0.6937,
"step": 82300
},
{
"epoch": 4.711263579188108,
"grad_norm": 22.546123504638672,
"learning_rate": 8.20599442800618e-09,
"loss": 0.6759,
"step": 82400
},
{
"epoch": 4.716981132075472,
"grad_norm": 16.173694610595703,
"learning_rate": 7.88507073763972e-09,
"loss": 0.6623,
"step": 82500
},
{
"epoch": 4.722698684962836,
"grad_norm": 19.89776039123535,
"learning_rate": 7.570498397424018e-09,
"loss": 0.695,
"step": 82600
},
{
"epoch": 4.7284162378502,
"grad_norm": 19.8604736328125,
"learning_rate": 7.262281467303011e-09,
"loss": 0.6925,
"step": 82700
},
{
"epoch": 4.734133790737564,
"grad_norm": 19.60188865661621,
"learning_rate": 6.960423925196468e-09,
"loss": 0.6604,
"step": 82800
},
{
"epoch": 4.739851343624928,
"grad_norm": 16.95119857788086,
"learning_rate": 6.66492966694826e-09,
"loss": 0.6576,
"step": 82900
},
{
"epoch": 4.745568896512292,
"grad_norm": 26.87150001525879,
"learning_rate": 6.3758025062761736e-09,
"loss": 0.6714,
"step": 83000
},
{
"epoch": 4.751286449399657,
"grad_norm": 15.25060749053955,
"learning_rate": 6.093046174723004e-09,
"loss": 0.6928,
"step": 83100
},
{
"epoch": 4.7570040022870215,
"grad_norm": 19.705732345581055,
"learning_rate": 5.816664321607767e-09,
"loss": 0.6735,
"step": 83200
},
{
"epoch": 4.762721555174386,
"grad_norm": 14.579095840454102,
"learning_rate": 5.54666051397934e-09,
"loss": 0.6814,
"step": 83300
},
{
"epoch": 4.76843910806175,
"grad_norm": 19.118043899536133,
"learning_rate": 5.283038236569837e-09,
"loss": 0.6787,
"step": 83400
},
{
"epoch": 4.774156660949114,
"grad_norm": 18.051437377929688,
"learning_rate": 5.025800891749865e-09,
"loss": 0.6772,
"step": 83500
},
{
"epoch": 4.779874213836478,
"grad_norm": 17.763010025024414,
"learning_rate": 4.774951799484672e-09,
"loss": 0.6716,
"step": 83600
},
{
"epoch": 4.785591766723842,
"grad_norm": 18.0462646484375,
"learning_rate": 4.530494197291179e-09,
"loss": 0.6761,
"step": 83700
},
{
"epoch": 4.791309319611207,
"grad_norm": 16.567672729492188,
"learning_rate": 4.292431240196403e-09,
"loss": 0.6927,
"step": 83800
},
{
"epoch": 4.797026872498571,
"grad_norm": 15.077381134033203,
"learning_rate": 4.060766000696214e-09,
"loss": 0.6906,
"step": 83900
},
{
"epoch": 4.802744425385935,
"grad_norm": 18.41817855834961,
"learning_rate": 3.8355014687164175e-09,
"loss": 0.6884,
"step": 84000
},
{
"epoch": 4.808461978273299,
"grad_norm": 17.96348762512207,
"learning_rate": 3.6166405515735686e-09,
"loss": 0.6593,
"step": 84100
},
{
"epoch": 4.814179531160663,
"grad_norm": 16.820146560668945,
"learning_rate": 3.404186073937776e-09,
"loss": 0.6994,
"step": 84200
},
{
"epoch": 4.819897084048027,
"grad_norm": 16.160144805908203,
"learning_rate": 3.1981407777961767e-09,
"loss": 0.6702,
"step": 84300
},
{
"epoch": 4.825614636935391,
"grad_norm": 16.64735221862793,
"learning_rate": 2.998507322417465e-09,
"loss": 0.6854,
"step": 84400
},
{
"epoch": 4.8313321898227555,
"grad_norm": 18.906169891357422,
"learning_rate": 2.805288284317475e-09,
"loss": 0.6773,
"step": 84500
},
{
"epoch": 4.8370497427101204,
"grad_norm": 15.96215534210205,
"learning_rate": 2.618486157226374e-09,
"loss": 0.6826,
"step": 84600
},
{
"epoch": 4.8427672955974845,
"grad_norm": 17.746631622314453,
"learning_rate": 2.4381033520559648e-09,
"loss": 0.6786,
"step": 84700
},
{
"epoch": 4.848484848484849,
"grad_norm": 15.410120964050293,
"learning_rate": 2.264142196868768e-09,
"loss": 0.6698,
"step": 84800
},
{
"epoch": 4.854202401372213,
"grad_norm": 20.618608474731445,
"learning_rate": 2.0966049368481566e-09,
"loss": 0.722,
"step": 84900
},
{
"epoch": 4.859919954259577,
"grad_norm": 16.922340393066406,
"learning_rate": 1.9354937342690446e-09,
"loss": 0.7022,
"step": 85000
},
{
"epoch": 4.865637507146941,
"grad_norm": 17.488685607910156,
"learning_rate": 1.7808106684703005e-09,
"loss": 0.6848,
"step": 85100
},
{
"epoch": 4.871355060034305,
"grad_norm": 15.6874418258667,
"learning_rate": 1.6325577358276e-09,
"loss": 0.6674,
"step": 85200
},
{
"epoch": 4.877072612921669,
"grad_norm": 17.592735290527344,
"learning_rate": 1.4907368497279471e-09,
"loss": 0.6863,
"step": 85300
},
{
"epoch": 4.882790165809034,
"grad_norm": 18.772600173950195,
"learning_rate": 1.355349840544806e-09,
"loss": 0.6809,
"step": 85400
},
{
"epoch": 4.888507718696398,
"grad_norm": 19.88614273071289,
"learning_rate": 1.226398455614508e-09,
"loss": 0.6594,
"step": 85500
},
{
"epoch": 4.894225271583762,
"grad_norm": 18.379648208618164,
"learning_rate": 1.1038843592137136e-09,
"loss": 0.6746,
"step": 85600
},
{
"epoch": 4.899942824471126,
"grad_norm": 16.48954963684082,
"learning_rate": 9.878091325379311e-10,
"loss": 0.6912,
"step": 85700
},
{
"epoch": 4.90566037735849,
"grad_norm": 17.712480545043945,
"learning_rate": 8.781742736811426e-10,
"loss": 0.6921,
"step": 85800
},
{
"epoch": 4.9113779302458545,
"grad_norm": 17.105844497680664,
"learning_rate": 7.749811976164866e-10,
"loss": 0.6805,
"step": 85900
},
{
"epoch": 4.9170954831332185,
"grad_norm": 19.131834030151367,
"learning_rate": 6.782312361778286e-10,
"loss": 0.6824,
"step": 86000
},
{
"epoch": 4.9228130360205835,
"grad_norm": 17.468740463256836,
"learning_rate": 5.879256380427744e-10,
"loss": 0.6737,
"step": 86100
},
{
"epoch": 4.928530588907948,
"grad_norm": 21.079586029052734,
"learning_rate": 5.040655687164608e-10,
"loss": 0.6573,
"step": 86200
},
{
"epoch": 4.934248141795312,
"grad_norm": 18.417434692382812,
"learning_rate": 4.2665211051651216e-10,
"loss": 0.6848,
"step": 86300
},
{
"epoch": 4.939965694682676,
"grad_norm": 20.74665069580078,
"learning_rate": 3.5568626255910726e-10,
"loss": 0.6718,
"step": 86400
},
{
"epoch": 4.94568324757004,
"grad_norm": 19.413671493530273,
"learning_rate": 2.911689407459894e-10,
"loss": 0.676,
"step": 86500
},
{
"epoch": 4.951400800457404,
"grad_norm": 17.327878952026367,
"learning_rate": 2.3310097775280926e-10,
"loss": 0.676,
"step": 86600
},
{
"epoch": 4.957118353344768,
"grad_norm": 16.513090133666992,
"learning_rate": 1.8148312301830006e-10,
"loss": 0.6781,
"step": 86700
},
{
"epoch": 4.962835906232133,
"grad_norm": 20.207876205444336,
"learning_rate": 1.363160427344523e-10,
"loss": 0.6768,
"step": 86800
},
{
"epoch": 4.968553459119497,
"grad_norm": 15.743894577026367,
"learning_rate": 9.760031983824246e-11,
"loss": 0.6985,
"step": 86900
},
{
"epoch": 4.974271012006861,
"grad_norm": 19.615123748779297,
"learning_rate": 6.533645400375044e-11,
"loss": 0.6948,
"step": 87000
},
{
"epoch": 4.979988564894225,
"grad_norm": 16.00938606262207,
"learning_rate": 3.952486163594226e-11,
"loss": 0.6806,
"step": 87100
},
{
"epoch": 4.985706117781589,
"grad_norm": 19.39284324645996,
"learning_rate": 2.016587586534113e-11,
"loss": 0.6762,
"step": 87200
},
{
"epoch": 4.991423670668953,
"grad_norm": 18.533754348754883,
"learning_rate": 7.259746543530898e-12,
"loss": 0.6853,
"step": 87300
},
{
"epoch": 4.9971412235563175,
"grad_norm": 17.494525909423828,
"learning_rate": 8.066402399364846e-13,
"loss": 0.6858,
"step": 87400
},
{
"epoch": 5.0,
"step": 87450,
"total_flos": 1.0778410613996421e+19,
"train_loss": 0.7259524769071445,
"train_runtime": 192674.39,
"train_samples_per_second": 1.815,
"train_steps_per_second": 0.454
}
],
"logging_steps": 100,
"max_steps": 87450,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0778410613996421e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}