ArabianGPT1.5B-QA-FT / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9999943955298747,
"eval_steps": 500,
"global_step": 267642,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0037363134169146645,
"grad_norm": 72942.6796875,
"learning_rate": 9.999655549964018e-05,
"loss": 4.1478,
"step": 500
},
{
"epoch": 0.007472626833829329,
"grad_norm": 67110.5,
"learning_rate": 9.998622247314402e-05,
"loss": 3.6349,
"step": 1000
},
{
"epoch": 0.011208940250743994,
"grad_norm": 73347.7890625,
"learning_rate": 9.996900234419608e-05,
"loss": 3.4653,
"step": 1500
},
{
"epoch": 0.014945253667658658,
"grad_norm": 70530.5546875,
"learning_rate": 9.994489748538595e-05,
"loss": 3.3608,
"step": 2000
},
{
"epoch": 0.018681567084573324,
"grad_norm": 69644.8515625,
"learning_rate": 9.991391121788143e-05,
"loss": 3.2989,
"step": 2500
},
{
"epoch": 0.022417880501487988,
"grad_norm": 79574.8203125,
"learning_rate": 9.987604781097091e-05,
"loss": 3.1788,
"step": 3000
},
{
"epoch": 0.02615419391840265,
"grad_norm": 77448.375,
"learning_rate": 9.983131248147513e-05,
"loss": 3.1802,
"step": 3500
},
{
"epoch": 0.029890507335317316,
"grad_norm": 82995.328125,
"learning_rate": 9.977971139302844e-05,
"loss": 3.1495,
"step": 4000
},
{
"epoch": 0.03362682075223198,
"grad_norm": 77637.0625,
"learning_rate": 9.972125165522955e-05,
"loss": 3.0534,
"step": 4500
},
{
"epoch": 0.03736313416914665,
"grad_norm": 71801.3515625,
"learning_rate": 9.965594132266197e-05,
"loss": 3.0471,
"step": 5000
},
{
"epoch": 0.04109944758606131,
"grad_norm": 73166.8125,
"learning_rate": 9.958378939378425e-05,
"loss": 3.0568,
"step": 5500
},
{
"epoch": 0.044835761002975975,
"grad_norm": 82521.0390625,
"learning_rate": 9.950480580969022e-05,
"loss": 2.9987,
"step": 6000
},
{
"epoch": 0.048572074419890636,
"grad_norm": 73557.609375,
"learning_rate": 9.941900145273922e-05,
"loss": 2.9518,
"step": 6500
},
{
"epoch": 0.0523083878368053,
"grad_norm": 76367.0078125,
"learning_rate": 9.932638814505676e-05,
"loss": 2.9528,
"step": 7000
},
{
"epoch": 0.056044701253719964,
"grad_norm": 80728.796875,
"learning_rate": 9.922697864690573e-05,
"loss": 2.9501,
"step": 7500
},
{
"epoch": 0.05978101467063463,
"grad_norm": 71664.421875,
"learning_rate": 9.912078665492822e-05,
"loss": 2.9449,
"step": 8000
},
{
"epoch": 0.0635173280875493,
"grad_norm": 75999.953125,
"learning_rate": 9.90078268002584e-05,
"loss": 2.8971,
"step": 8500
},
{
"epoch": 0.06725364150446396,
"grad_norm": 67755.296875,
"learning_rate": 9.888811464650668e-05,
"loss": 2.8525,
"step": 9000
},
{
"epoch": 0.07098995492137862,
"grad_norm": 76703.8203125,
"learning_rate": 9.876166668761532e-05,
"loss": 2.8725,
"step": 9500
},
{
"epoch": 0.0747262683382933,
"grad_norm": 74384.6640625,
"learning_rate": 9.862850034558593e-05,
"loss": 2.8761,
"step": 10000
},
{
"epoch": 0.07846258175520796,
"grad_norm": 59161.4453125,
"learning_rate": 9.848863396807901e-05,
"loss": 2.8503,
"step": 10500
},
{
"epoch": 0.08219889517212262,
"grad_norm": 66133.3359375,
"learning_rate": 9.834208682588608e-05,
"loss": 2.8198,
"step": 11000
},
{
"epoch": 0.08593520858903728,
"grad_norm": 65402.43359375,
"learning_rate": 9.81888791102745e-05,
"loss": 2.8278,
"step": 11500
},
{
"epoch": 0.08967152200595195,
"grad_norm": 73789.2578125,
"learning_rate": 9.802903193020554e-05,
"loss": 2.808,
"step": 12000
},
{
"epoch": 0.09340783542286661,
"grad_norm": 71568.953125,
"learning_rate": 9.786256730942595e-05,
"loss": 2.7821,
"step": 12500
},
{
"epoch": 0.09714414883978127,
"grad_norm": 68736.6953125,
"learning_rate": 9.768950818343359e-05,
"loss": 2.7827,
"step": 13000
},
{
"epoch": 0.10088046225669595,
"grad_norm": 73235.984375,
"learning_rate": 9.750987839631733e-05,
"loss": 2.797,
"step": 13500
},
{
"epoch": 0.1046167756736106,
"grad_norm": 72581.59375,
"learning_rate": 9.732370269747181e-05,
"loss": 2.7515,
"step": 14000
},
{
"epoch": 0.10835308909052527,
"grad_norm": 77676.46875,
"learning_rate": 9.713100673818754e-05,
"loss": 2.7451,
"step": 14500
},
{
"epoch": 0.11208940250743993,
"grad_norm": 78359.078125,
"learning_rate": 9.693181706811652e-05,
"loss": 2.7544,
"step": 15000
},
{
"epoch": 0.1158257159243546,
"grad_norm": 67356.484375,
"learning_rate": 9.672616113161434e-05,
"loss": 2.7054,
"step": 15500
},
{
"epoch": 0.11956202934126926,
"grad_norm": 76034.65625,
"learning_rate": 9.651406726395895e-05,
"loss": 2.7419,
"step": 16000
},
{
"epoch": 0.12329834275818392,
"grad_norm": 77247.296875,
"learning_rate": 9.629556468744647e-05,
"loss": 2.7274,
"step": 16500
},
{
"epoch": 0.1270346561750986,
"grad_norm": 90000.84375,
"learning_rate": 9.607068350736501e-05,
"loss": 2.7118,
"step": 17000
},
{
"epoch": 0.13077096959201326,
"grad_norm": 82970.1953125,
"learning_rate": 9.583945470784683e-05,
"loss": 2.7036,
"step": 17500
},
{
"epoch": 0.13450728300892792,
"grad_norm": 70716.7578125,
"learning_rate": 9.560191014759922e-05,
"loss": 2.6936,
"step": 18000
},
{
"epoch": 0.13824359642584258,
"grad_norm": 69805.4140625,
"learning_rate": 9.535808255551515e-05,
"loss": 2.6633,
"step": 18500
},
{
"epoch": 0.14197990984275724,
"grad_norm": 69154.890625,
"learning_rate": 9.510800552616374e-05,
"loss": 2.6765,
"step": 19000
},
{
"epoch": 0.1457162232596719,
"grad_norm": 70633.8515625,
"learning_rate": 9.485171351516171e-05,
"loss": 2.6623,
"step": 19500
},
{
"epoch": 0.1494525366765866,
"grad_norm": 85893.5390625,
"learning_rate": 9.458924183442602e-05,
"loss": 2.6699,
"step": 20000
},
{
"epoch": 0.15318885009350125,
"grad_norm": 72002.125,
"learning_rate": 9.43206266473086e-05,
"loss": 2.6461,
"step": 20500
},
{
"epoch": 0.1569251635104159,
"grad_norm": 78129.734375,
"learning_rate": 9.404590496361382e-05,
"loss": 2.6687,
"step": 21000
},
{
"epoch": 0.16066147692733057,
"grad_norm": 64102.87890625,
"learning_rate": 9.376511463449922e-05,
"loss": 2.659,
"step": 21500
},
{
"epoch": 0.16439779034424523,
"grad_norm": 61640.66796875,
"learning_rate": 9.347829434726035e-05,
"loss": 2.6069,
"step": 22000
},
{
"epoch": 0.1681341037611599,
"grad_norm": 77315.6484375,
"learning_rate": 9.318548362000054e-05,
"loss": 2.6231,
"step": 22500
},
{
"epoch": 0.17187041717807455,
"grad_norm": 62667.19921875,
"learning_rate": 9.288672279618603e-05,
"loss": 2.6435,
"step": 23000
},
{
"epoch": 0.17560673059498924,
"grad_norm": 72954.6015625,
"learning_rate": 9.258205303908738e-05,
"loss": 2.6336,
"step": 23500
},
{
"epoch": 0.1793430440119039,
"grad_norm": 71380.78125,
"learning_rate": 9.227151632610811e-05,
"loss": 2.6389,
"step": 24000
},
{
"epoch": 0.18307935742881856,
"grad_norm": 72969.21875,
"learning_rate": 9.195515544300105e-05,
"loss": 2.6162,
"step": 24500
},
{
"epoch": 0.18681567084573322,
"grad_norm": 70472.6171875,
"learning_rate": 9.163301397797319e-05,
"loss": 2.6384,
"step": 25000
},
{
"epoch": 0.19055198426264788,
"grad_norm": 74207.796875,
"learning_rate": 9.130513631568021e-05,
"loss": 2.5962,
"step": 25500
},
{
"epoch": 0.19428829767956254,
"grad_norm": 66946.59375,
"learning_rate": 9.097156763111118e-05,
"loss": 2.607,
"step": 26000
},
{
"epoch": 0.1980246110964772,
"grad_norm": 75940.734375,
"learning_rate": 9.063235388336423e-05,
"loss": 2.5973,
"step": 26500
},
{
"epoch": 0.2017609245133919,
"grad_norm": 63575.37890625,
"learning_rate": 9.028754180931441e-05,
"loss": 2.6065,
"step": 27000
},
{
"epoch": 0.20549723793030655,
"grad_norm": 79057.234375,
"learning_rate": 8.993717891717425e-05,
"loss": 2.5717,
"step": 27500
},
{
"epoch": 0.2092335513472212,
"grad_norm": 72744.7578125,
"learning_rate": 8.958131347994808e-05,
"loss": 2.6101,
"step": 28000
},
{
"epoch": 0.21296986476413587,
"grad_norm": 77414.359375,
"learning_rate": 8.921999452878093e-05,
"loss": 2.5807,
"step": 28500
},
{
"epoch": 0.21670617818105053,
"grad_norm": 75271.28125,
"learning_rate": 8.885327184620314e-05,
"loss": 2.5952,
"step": 29000
},
{
"epoch": 0.2204424915979652,
"grad_norm": 69727.734375,
"learning_rate": 8.848119595927116e-05,
"loss": 2.587,
"step": 29500
},
{
"epoch": 0.22417880501487986,
"grad_norm": 83907.5859375,
"learning_rate": 8.810381813260606e-05,
"loss": 2.5377,
"step": 30000
},
{
"epoch": 0.22791511843179454,
"grad_norm": 93582.2265625,
"learning_rate": 8.772119036133022e-05,
"loss": 2.5808,
"step": 30500
},
{
"epoch": 0.2316514318487092,
"grad_norm": 69250.0703125,
"learning_rate": 8.733336536390349e-05,
"loss": 2.554,
"step": 31000
},
{
"epoch": 0.23538774526562387,
"grad_norm": 72463.203125,
"learning_rate": 8.69403965748596e-05,
"loss": 2.5899,
"step": 31500
},
{
"epoch": 0.23912405868253853,
"grad_norm": 71343.4765625,
"learning_rate": 8.654233813744392e-05,
"loss": 2.5627,
"step": 32000
},
{
"epoch": 0.2428603720994532,
"grad_norm": 78745.25,
"learning_rate": 8.613924489615373e-05,
"loss": 2.5353,
"step": 32500
},
{
"epoch": 0.24659668551636785,
"grad_norm": 81588.1875,
"learning_rate": 8.573117238918161e-05,
"loss": 2.5696,
"step": 33000
},
{
"epoch": 0.25033299893328254,
"grad_norm": 66313.8984375,
"learning_rate": 8.53181768407634e-05,
"loss": 2.5745,
"step": 33500
},
{
"epoch": 0.2540693123501972,
"grad_norm": 70478.8046875,
"learning_rate": 8.490031515343177e-05,
"loss": 2.5485,
"step": 34000
},
{
"epoch": 0.25780562576711186,
"grad_norm": 81928.53125,
"learning_rate": 8.447764490017597e-05,
"loss": 2.4977,
"step": 34500
},
{
"epoch": 0.2615419391840265,
"grad_norm": 65777.4921875,
"learning_rate": 8.405022431650959e-05,
"loss": 2.5664,
"step": 35000
},
{
"epoch": 0.2652782526009412,
"grad_norm": 81822.8828125,
"learning_rate": 8.361811229244682e-05,
"loss": 2.5249,
"step": 35500
},
{
"epoch": 0.26901456601785584,
"grad_norm": 66773.1328125,
"learning_rate": 8.318136836438853e-05,
"loss": 2.5385,
"step": 36000
},
{
"epoch": 0.2727508794347705,
"grad_norm": 78679.25,
"learning_rate": 8.274005270691942e-05,
"loss": 2.499,
"step": 36500
},
{
"epoch": 0.27648719285168516,
"grad_norm": 68307.140625,
"learning_rate": 8.229422612451711e-05,
"loss": 2.4764,
"step": 37000
},
{
"epoch": 0.2802235062685998,
"grad_norm": 67303.9453125,
"learning_rate": 8.184395004317458e-05,
"loss": 2.5165,
"step": 37500
},
{
"epoch": 0.2839598196855145,
"grad_norm": 69105.96875,
"learning_rate": 8.138928650193676e-05,
"loss": 2.5159,
"step": 38000
},
{
"epoch": 0.28769613310242914,
"grad_norm": 75505.984375,
"learning_rate": 8.093029814435293e-05,
"loss": 2.5268,
"step": 38500
},
{
"epoch": 0.2914324465193438,
"grad_norm": 80771.171875,
"learning_rate": 8.046704820984558e-05,
"loss": 2.5296,
"step": 39000
},
{
"epoch": 0.2951687599362585,
"grad_norm": 71207.0546875,
"learning_rate": 7.999960052499737e-05,
"loss": 2.518,
"step": 39500
},
{
"epoch": 0.2989050733531732,
"grad_norm": 69106.34375,
"learning_rate": 7.952801949475706e-05,
"loss": 2.4906,
"step": 40000
},
{
"epoch": 0.30264138677008784,
"grad_norm": 82679.9296875,
"learning_rate": 7.905237009356578e-05,
"loss": 2.4743,
"step": 40500
},
{
"epoch": 0.3063777001870025,
"grad_norm": 84038.28125,
"learning_rate": 7.857271785640486e-05,
"loss": 2.4726,
"step": 41000
},
{
"epoch": 0.31011401360391716,
"grad_norm": 75876.3203125,
"learning_rate": 7.808912886976641e-05,
"loss": 2.5081,
"step": 41500
},
{
"epoch": 0.3138503270208318,
"grad_norm": 78494.3046875,
"learning_rate": 7.760166976254804e-05,
"loss": 2.4795,
"step": 42000
},
{
"epoch": 0.3175866404377465,
"grad_norm": 68793.765625,
"learning_rate": 7.711040769687248e-05,
"loss": 2.4883,
"step": 42500
},
{
"epoch": 0.32132295385466114,
"grad_norm": 78543.03125,
"learning_rate": 7.661541035883427e-05,
"loss": 2.4752,
"step": 43000
},
{
"epoch": 0.3250592672715758,
"grad_norm": 78304.4140625,
"learning_rate": 7.611674594917376e-05,
"loss": 2.4747,
"step": 43500
},
{
"epoch": 0.32879558068849046,
"grad_norm": 77926.90625,
"learning_rate": 7.561448317388045e-05,
"loss": 2.4666,
"step": 44000
},
{
"epoch": 0.3325318941054051,
"grad_norm": 76948.8671875,
"learning_rate": 7.510869123472679e-05,
"loss": 2.4724,
"step": 44500
},
{
"epoch": 0.3362682075223198,
"grad_norm": 75918.4921875,
"learning_rate": 7.459943981973343e-05,
"loss": 2.4476,
"step": 45000
},
{
"epoch": 0.34000452093923444,
"grad_norm": 66944.8125,
"learning_rate": 7.408679909356764e-05,
"loss": 2.4982,
"step": 45500
},
{
"epoch": 0.3437408343561491,
"grad_norm": 68671.390625,
"learning_rate": 7.357083968787606e-05,
"loss": 2.4996,
"step": 46000
},
{
"epoch": 0.3474771477730638,
"grad_norm": 74974.25,
"learning_rate": 7.305163269155305e-05,
"loss": 2.44,
"step": 46500
},
{
"epoch": 0.3512134611899785,
"grad_norm": 83594.984375,
"learning_rate": 7.2529249640946e-05,
"loss": 2.466,
"step": 47000
},
{
"epoch": 0.35494977460689314,
"grad_norm": 67277.0546875,
"learning_rate": 7.200376250999917e-05,
"loss": 2.4398,
"step": 47500
},
{
"epoch": 0.3586860880238078,
"grad_norm": 64984.64453125,
"learning_rate": 7.1475243700337e-05,
"loss": 2.4536,
"step": 48000
},
{
"epoch": 0.36242240144072246,
"grad_norm": 76212.953125,
"learning_rate": 7.094376603128872e-05,
"loss": 2.4643,
"step": 48500
},
{
"epoch": 0.3661587148576371,
"grad_norm": 78486.7109375,
"learning_rate": 7.040940272985521e-05,
"loss": 2.4468,
"step": 49000
},
{
"epoch": 0.3698950282745518,
"grad_norm": 90899.328125,
"learning_rate": 6.987222742061982e-05,
"loss": 2.458,
"step": 49500
},
{
"epoch": 0.37363134169146645,
"grad_norm": 69692.6328125,
"learning_rate": 6.933231411560442e-05,
"loss": 2.4331,
"step": 50000
},
{
"epoch": 0.3773676551083811,
"grad_norm": 74715.71875,
"learning_rate": 6.878973720407191e-05,
"loss": 2.4379,
"step": 50500
},
{
"epoch": 0.38110396852529577,
"grad_norm": 72629.9140625,
"learning_rate": 6.824457144227698e-05,
"loss": 2.4424,
"step": 51000
},
{
"epoch": 0.3848402819422104,
"grad_norm": 74704.6328125,
"learning_rate": 6.769689194316615e-05,
"loss": 2.4905,
"step": 51500
},
{
"epoch": 0.3885765953591251,
"grad_norm": 75688.8125,
"learning_rate": 6.714677416602868e-05,
"loss": 2.4413,
"step": 52000
},
{
"epoch": 0.39231290877603975,
"grad_norm": 75325.578125,
"learning_rate": 6.659429390609984e-05,
"loss": 2.4505,
"step": 52500
},
{
"epoch": 0.3960492221929544,
"grad_norm": 70411.6953125,
"learning_rate": 6.603952728411776e-05,
"loss": 2.4358,
"step": 53000
},
{
"epoch": 0.3997855356098691,
"grad_norm": 71785.53125,
"learning_rate": 6.54825507358356e-05,
"loss": 2.4271,
"step": 53500
},
{
"epoch": 0.4035218490267838,
"grad_norm": 76337.5703125,
"learning_rate": 6.492344100149026e-05,
"loss": 2.4536,
"step": 54000
},
{
"epoch": 0.40725816244369845,
"grad_norm": 65393.2890625,
"learning_rate": 6.436227511522892e-05,
"loss": 2.4404,
"step": 54500
},
{
"epoch": 0.4109944758606131,
"grad_norm": 88779.578125,
"learning_rate": 6.379913039449547e-05,
"loss": 2.4277,
"step": 55000
},
{
"epoch": 0.41473078927752777,
"grad_norm": 70318.0546875,
"learning_rate": 6.32340844293777e-05,
"loss": 2.4209,
"step": 55500
},
{
"epoch": 0.4184671026944424,
"grad_norm": 77500.03125,
"learning_rate": 6.266721507191674e-05,
"loss": 2.4218,
"step": 56000
},
{
"epoch": 0.4222034161113571,
"grad_norm": 89454.5703125,
"learning_rate": 6.209860042538087e-05,
"loss": 2.409,
"step": 56500
},
{
"epoch": 0.42593972952827175,
"grad_norm": 72870.5,
"learning_rate": 6.152831883350426e-05,
"loss": 2.3841,
"step": 57000
},
{
"epoch": 0.4296760429451864,
"grad_norm": 75137.0703125,
"learning_rate": 6.095644886969283e-05,
"loss": 2.3922,
"step": 57500
},
{
"epoch": 0.43341235636210107,
"grad_norm": 64499.1328125,
"learning_rate": 6.038306932619846e-05,
"loss": 2.4042,
"step": 58000
},
{
"epoch": 0.43714866977901573,
"grad_norm": 74131.78125,
"learning_rate": 5.980825920326287e-05,
"loss": 2.3981,
"step": 58500
},
{
"epoch": 0.4408849831959304,
"grad_norm": 75569.671875,
"learning_rate": 5.923209769823308e-05,
"loss": 2.4412,
"step": 59000
},
{
"epoch": 0.44462129661284505,
"grad_norm": 94198.984375,
"learning_rate": 5.865466419464957e-05,
"loss": 2.4383,
"step": 59500
},
{
"epoch": 0.4483576100297597,
"grad_norm": 72894.984375,
"learning_rate": 5.807603825130873e-05,
"loss": 2.3944,
"step": 60000
},
{
"epoch": 0.45209392344667443,
"grad_norm": 75193.0234375,
"learning_rate": 5.7496299591301406e-05,
"loss": 2.3847,
"step": 60500
},
{
"epoch": 0.4558302368635891,
"grad_norm": 115278.234375,
"learning_rate": 5.6915528091028494e-05,
"loss": 2.3914,
"step": 61000
},
{
"epoch": 0.45956655028050375,
"grad_norm": 80114.7578125,
"learning_rate": 5.633380376919567e-05,
"loss": 2.394,
"step": 61500
},
{
"epoch": 0.4633028636974184,
"grad_norm": 77552.9921875,
"learning_rate": 5.5751206775788354e-05,
"loss": 2.3919,
"step": 62000
},
{
"epoch": 0.46703917711433307,
"grad_norm": 81552.4375,
"learning_rate": 5.516781738102871e-05,
"loss": 2.396,
"step": 62500
},
{
"epoch": 0.47077549053124773,
"grad_norm": 89432.1015625,
"learning_rate": 5.458371596431592e-05,
"loss": 2.4214,
"step": 63000
},
{
"epoch": 0.4745118039481624,
"grad_norm": 90061.796875,
"learning_rate": 5.3998983003151595e-05,
"loss": 2.3796,
"step": 63500
},
{
"epoch": 0.47824811736507705,
"grad_norm": 84587.1953125,
"learning_rate": 5.341369906205154e-05,
"loss": 2.39,
"step": 64000
},
{
"epoch": 0.4819844307819917,
"grad_norm": 63710.203125,
"learning_rate": 5.2827944781445583e-05,
"loss": 2.4094,
"step": 64500
},
{
"epoch": 0.4857207441989064,
"grad_norm": 69039.109375,
"learning_rate": 5.224180086656693e-05,
"loss": 2.3963,
"step": 65000
},
{
"epoch": 0.48945705761582103,
"grad_norm": 90888.671875,
"learning_rate": 5.165534807633261e-05,
"loss": 2.3701,
"step": 65500
},
{
"epoch": 0.4931933710327357,
"grad_norm": 75439.6875,
"learning_rate": 5.1068667212216527e-05,
"loss": 2.3588,
"step": 66000
},
{
"epoch": 0.49692968444965036,
"grad_norm": 77958.296875,
"learning_rate": 5.048183910711655e-05,
"loss": 2.3613,
"step": 66500
},
{
"epoch": 0.5006659978665651,
"grad_norm": 60339.5859375,
"learning_rate": 4.9894944614217454e-05,
"loss": 2.3501,
"step": 67000
},
{
"epoch": 0.5044023112834797,
"grad_norm": 88602.6015625,
"learning_rate": 4.930806459585094e-05,
"loss": 2.408,
"step": 67500
},
{
"epoch": 0.5081386247003944,
"grad_norm": 82704.09375,
"learning_rate": 4.872127991235435e-05,
"loss": 2.3974,
"step": 68000
},
{
"epoch": 0.511874938117309,
"grad_norm": 69181.0625,
"learning_rate": 4.8134671410929855e-05,
"loss": 2.337,
"step": 68500
},
{
"epoch": 0.5156112515342237,
"grad_norm": 83461.8984375,
"learning_rate": 4.75483199145052e-05,
"loss": 2.3468,
"step": 69000
},
{
"epoch": 0.5193475649511383,
"grad_norm": 70060.1875,
"learning_rate": 4.6962306210598024e-05,
"loss": 2.3686,
"step": 69500
},
{
"epoch": 0.523083878368053,
"grad_norm": 80763.5703125,
"learning_rate": 4.637671104018486e-05,
"loss": 2.3697,
"step": 70000
},
{
"epoch": 0.5268201917849676,
"grad_norm": 75881.0,
"learning_rate": 4.5791615086576726e-05,
"loss": 2.3789,
"step": 70500
},
{
"epoch": 0.5305565052018824,
"grad_norm": 68510.5390625,
"learning_rate": 4.5207098964302556e-05,
"loss": 2.3812,
"step": 71000
},
{
"epoch": 0.5342928186187971,
"grad_norm": 90292.1953125,
"learning_rate": 4.462324320800205e-05,
"loss": 2.3558,
"step": 71500
},
{
"epoch": 0.5380291320357117,
"grad_norm": 69267.453125,
"learning_rate": 4.4040128261329724e-05,
"loss": 2.3751,
"step": 72000
},
{
"epoch": 0.5417654454526264,
"grad_norm": 85016.8203125,
"learning_rate": 4.345783446587135e-05,
"loss": 2.3635,
"step": 72500
},
{
"epoch": 0.545501758869541,
"grad_norm": 103424.7109375,
"learning_rate": 4.2876442050074436e-05,
"loss": 2.3367,
"step": 73000
},
{
"epoch": 0.5492380722864557,
"grad_norm": 83033.2265625,
"learning_rate": 4.229603111819437e-05,
"loss": 2.371,
"step": 73500
},
{
"epoch": 0.5529743857033703,
"grad_norm": 89505.2890625,
"learning_rate": 4.1716681639257725e-05,
"loss": 2.3537,
"step": 74000
},
{
"epoch": 0.556710699120285,
"grad_norm": 86427.5390625,
"learning_rate": 4.113847343604404e-05,
"loss": 2.3518,
"step": 74500
},
{
"epoch": 0.5604470125371996,
"grad_norm": 95231.3203125,
"learning_rate": 4.05614861740879e-05,
"loss": 2.3475,
"step": 75000
},
{
"epoch": 0.5641833259541144,
"grad_norm": 73089.09375,
"learning_rate": 3.998579935070253e-05,
"loss": 2.3467,
"step": 75500
},
{
"epoch": 0.567919639371029,
"grad_norm": 82384.2109375,
"learning_rate": 3.941149228402674e-05,
"loss": 2.3278,
"step": 76000
},
{
"epoch": 0.5716559527879437,
"grad_norm": 84582.859375,
"learning_rate": 3.883864410209648e-05,
"loss": 2.3535,
"step": 76500
},
{
"epoch": 0.5753922662048583,
"grad_norm": 69513.640625,
"learning_rate": 3.826733373194247e-05,
"loss": 2.3257,
"step": 77000
},
{
"epoch": 0.579128579621773,
"grad_norm": 73027.9453125,
"learning_rate": 3.769763988871572e-05,
"loss": 2.3447,
"step": 77500
},
{
"epoch": 0.5828648930386876,
"grad_norm": 71039.1171875,
"learning_rate": 3.712964106484218e-05,
"loss": 2.3147,
"step": 78000
},
{
"epoch": 0.5866012064556023,
"grad_norm": 80346.1796875,
"learning_rate": 3.6563415519207985e-05,
"loss": 2.3144,
"step": 78500
},
{
"epoch": 0.590337519872517,
"grad_norm": 84110.34375,
"learning_rate": 3.599904126637692e-05,
"loss": 2.3333,
"step": 79000
},
{
"epoch": 0.5940738332894316,
"grad_norm": 82063.09375,
"learning_rate": 3.54365960658417e-05,
"loss": 2.3283,
"step": 79500
},
{
"epoch": 0.5978101467063464,
"grad_norm": 84510.8984375,
"learning_rate": 3.487615741131013e-05,
"loss": 2.3397,
"step": 80000
},
{
"epoch": 0.601546460123261,
"grad_norm": 74272.46875,
"learning_rate": 3.4317802520028117e-05,
"loss": 2.326,
"step": 80500
},
{
"epoch": 0.6052827735401757,
"grad_norm": 73607.890625,
"learning_rate": 3.37616083221406e-05,
"loss": 2.3238,
"step": 81000
},
{
"epoch": 0.6090190869570903,
"grad_norm": 84208.0078125,
"learning_rate": 3.320765145009216e-05,
"loss": 2.3112,
"step": 81500
},
{
"epoch": 0.612755400374005,
"grad_norm": 80192.90625,
"learning_rate": 3.265600822806866e-05,
"loss": 2.3361,
"step": 82000
},
{
"epoch": 0.6164917137909196,
"grad_norm": 83479.984375,
"learning_rate": 3.2106754661481086e-05,
"loss": 2.3383,
"step": 82500
},
{
"epoch": 0.6202280272078343,
"grad_norm": 88273.1015625,
"learning_rate": 3.1559966426493804e-05,
"loss": 2.3059,
"step": 83000
},
{
"epoch": 0.6239643406247489,
"grad_norm": 98511.15625,
"learning_rate": 3.101571885959766e-05,
"loss": 2.3113,
"step": 83500
},
{
"epoch": 0.6277006540416636,
"grad_norm": 76336.8046875,
"learning_rate": 3.047408694723032e-05,
"loss": 2.3254,
"step": 84000
},
{
"epoch": 0.6314369674585782,
"grad_norm": 80944.7734375,
"learning_rate": 2.99351453154444e-05,
"loss": 2.3428,
"step": 84500
},
{
"epoch": 0.635173280875493,
"grad_norm": 72115.8125,
"learning_rate": 2.9398968219625706e-05,
"loss": 2.3156,
"step": 85000
},
{
"epoch": 0.6389095942924077,
"grad_norm": 87770.3125,
"learning_rate": 2.8865629534262216e-05,
"loss": 2.3041,
"step": 85500
},
{
"epoch": 0.6426459077093223,
"grad_norm": 80694.6171875,
"learning_rate": 2.83352027427657e-05,
"loss": 2.3478,
"step": 86000
},
{
"epoch": 0.646382221126237,
"grad_norm": 77460.1171875,
"learning_rate": 2.7807760927347102e-05,
"loss": 2.302,
"step": 86500
},
{
"epoch": 0.6501185345431516,
"grad_norm": 82131.8671875,
"learning_rate": 2.7283376758947333e-05,
"loss": 2.3156,
"step": 87000
},
{
"epoch": 0.6538548479600663,
"grad_norm": 90870.65625,
"learning_rate": 2.6762122487224684e-05,
"loss": 2.331,
"step": 87500
},
{
"epoch": 0.6575911613769809,
"grad_norm": 98921.2109375,
"learning_rate": 2.624406993060019e-05,
"loss": 2.3073,
"step": 88000
},
{
"epoch": 0.6613274747938956,
"grad_norm": 73837.2109375,
"learning_rate": 2.5729290466362565e-05,
"loss": 2.301,
"step": 88500
},
{
"epoch": 0.6650637882108102,
"grad_norm": 82211.34375,
"learning_rate": 2.521785502083384e-05,
"loss": 2.3103,
"step": 89000
},
{
"epoch": 0.668800101627725,
"grad_norm": 86962.1484375,
"learning_rate": 2.4709834059597026e-05,
"loss": 2.3154,
"step": 89500
},
{
"epoch": 0.6725364150446396,
"grad_norm": 79155.3515625,
"learning_rate": 2.420529757778746e-05,
"loss": 2.348,
"step": 90000
},
{
"epoch": 0.6762727284615543,
"grad_norm": 97560.8125,
"learning_rate": 2.3704315090448886e-05,
"loss": 2.3417,
"step": 90500
},
{
"epoch": 0.6800090418784689,
"grad_norm": 124897.3203125,
"learning_rate": 2.3206955622955623e-05,
"loss": 2.3122,
"step": 91000
},
{
"epoch": 0.6837453552953836,
"grad_norm": 88038.5625,
"learning_rate": 2.271328770150227e-05,
"loss": 2.311,
"step": 91500
},
{
"epoch": 0.6874816687122982,
"grad_norm": 89480.2890625,
"learning_rate": 2.222337934366214e-05,
"loss": 2.3002,
"step": 92000
},
{
"epoch": 0.6912179821292129,
"grad_norm": 81038.4921875,
"learning_rate": 2.173729804901581e-05,
"loss": 2.3231,
"step": 92500
},
{
"epoch": 0.6949542955461276,
"grad_norm": 91312.8984375,
"learning_rate": 2.1255110789851084e-05,
"loss": 2.3125,
"step": 93000
},
{
"epoch": 0.6986906089630422,
"grad_norm": 84064.609375,
"learning_rate": 2.0776884001935437e-05,
"loss": 2.3379,
"step": 93500
},
{
"epoch": 0.702426922379957,
"grad_norm": 87531.078125,
"learning_rate": 2.0302683575362608e-05,
"loss": 2.2962,
"step": 94000
},
{
"epoch": 0.7061632357968716,
"grad_norm": 84962.34375,
"learning_rate": 1.983257484547421e-05,
"loss": 2.3032,
"step": 94500
},
{
"epoch": 0.7098995492137863,
"grad_norm": 88890.75,
"learning_rate": 1.9366622583857803e-05,
"loss": 2.3138,
"step": 95000
},
{
"epoch": 0.7136358626307009,
"grad_norm": 93013.0,
"learning_rate": 1.890489098942267e-05,
"loss": 2.3128,
"step": 95500
},
{
"epoch": 0.7173721760476156,
"grad_norm": 79075.3515625,
"learning_rate": 1.8447443679554567e-05,
"loss": 2.3013,
"step": 96000
},
{
"epoch": 0.7211084894645302,
"grad_norm": 94954.1640625,
"learning_rate": 1.7994343681350422e-05,
"loss": 2.2775,
"step": 96500
},
{
"epoch": 0.7248448028814449,
"grad_norm": 79968.1171875,
"learning_rate": 1.7545653422934537e-05,
"loss": 2.2979,
"step": 97000
},
{
"epoch": 0.7285811162983595,
"grad_norm": 90937.109375,
"learning_rate": 1.7101434724857147e-05,
"loss": 2.3466,
"step": 97500
},
{
"epoch": 0.7323174297152742,
"grad_norm": 85592.625,
"learning_rate": 1.6661748791576855e-05,
"loss": 2.3029,
"step": 98000
},
{
"epoch": 0.7360537431321889,
"grad_norm": 95187.1484375,
"learning_rate": 1.622665620302789e-05,
"loss": 2.3074,
"step": 98500
},
{
"epoch": 0.7397900565491036,
"grad_norm": 87762.859375,
"learning_rate": 1.5796216906273388e-05,
"loss": 2.2883,
"step": 99000
},
{
"epoch": 0.7435263699660182,
"grad_norm": 102495.0234375,
"learning_rate": 1.537049020724581e-05,
"loss": 2.2852,
"step": 99500
},
{
"epoch": 0.7472626833829329,
"grad_norm": 92540.0078125,
"learning_rate": 1.4949534762575912e-05,
"loss": 2.2914,
"step": 100000
},
{
"epoch": 0.7509989967998476,
"grad_norm": 81275.109375,
"learning_rate": 1.4533408571510904e-05,
"loss": 2.3119,
"step": 100500
},
{
"epoch": 0.7547353102167622,
"grad_norm": 82922.2578125,
"learning_rate": 1.4122168967923366e-05,
"loss": 2.3118,
"step": 101000
},
{
"epoch": 0.7584716236336769,
"grad_norm": 107727.1171875,
"learning_rate": 1.3715872612411806e-05,
"loss": 2.286,
"step": 101500
},
{
"epoch": 0.7622079370505915,
"grad_norm": 84810.40625,
"learning_rate": 1.3314575484493946e-05,
"loss": 2.3174,
"step": 102000
},
{
"epoch": 0.7659442504675062,
"grad_norm": 78007.40625,
"learning_rate": 1.2918332874893869e-05,
"loss": 2.3229,
"step": 102500
},
{
"epoch": 0.7696805638844209,
"grad_norm": 95875.2421875,
"learning_rate": 1.252719937792397e-05,
"loss": 2.2979,
"step": 103000
},
{
"epoch": 0.7734168773013356,
"grad_norm": 83561.375,
"learning_rate": 1.2141228883963118e-05,
"loss": 2.2667,
"step": 103500
},
{
"epoch": 0.7771531907182502,
"grad_norm": 73712.7578125,
"learning_rate": 1.1760474572031538e-05,
"loss": 2.2876,
"step": 104000
},
{
"epoch": 0.7808895041351649,
"grad_norm": 90907.0234375,
"learning_rate": 1.1384988902463806e-05,
"loss": 2.2799,
"step": 104500
},
{
"epoch": 0.7846258175520795,
"grad_norm": 84601.046875,
"learning_rate": 1.101482360968088e-05,
"loss": 2.3109,
"step": 105000
},
{
"epoch": 0.7883621309689942,
"grad_norm": 89593.640625,
"learning_rate": 1.0650029695062103e-05,
"loss": 2.3071,
"step": 105500
},
{
"epoch": 0.7920984443859088,
"grad_norm": 85944.5390625,
"learning_rate": 1.0290657419918308e-05,
"loss": 2.3275,
"step": 106000
},
{
"epoch": 0.7958347578028235,
"grad_norm": 71977.34375,
"learning_rate": 9.936756298566724e-06,
"loss": 2.3108,
"step": 106500
},
{
"epoch": 0.7995710712197382,
"grad_norm": 91883.7109375,
"learning_rate": 9.588375091508945e-06,
"loss": 2.2756,
"step": 107000
},
{
"epoch": 0.8033073846366529,
"grad_norm": 81686.0078125,
"learning_rate": 9.245561798712704e-06,
"loss": 2.3064,
"step": 107500
},
{
"epoch": 0.8070436980535676,
"grad_norm": 86408.875,
"learning_rate": 8.908363652998414e-06,
"loss": 2.3031,
"step": 108000
},
{
"epoch": 0.8107800114704822,
"grad_norm": 83010.9765625,
"learning_rate": 8.576827113531427e-06,
"loss": 2.2797,
"step": 108500
},
{
"epoch": 0.8145163248873969,
"grad_norm": 74965.84375,
"learning_rate": 8.25099785942094e-06,
"loss": 2.3011,
"step": 109000
},
{
"epoch": 0.8182526383043115,
"grad_norm": 91110.0859375,
"learning_rate": 7.930920783426265e-06,
"loss": 2.2572,
"step": 109500
},
{
"epoch": 0.8219889517212262,
"grad_norm": 96117.6640625,
"learning_rate": 7.616639985771556e-06,
"loss": 2.276,
"step": 110000
},
{
"epoch": 0.8257252651381408,
"grad_norm": 66883.046875,
"learning_rate": 7.30819876806963e-06,
"loss": 2.2586,
"step": 110500
},
{
"epoch": 0.8294615785550555,
"grad_norm": 86336.21875,
"learning_rate": 7.005639627355881e-06,
"loss": 2.2796,
"step": 111000
},
{
"epoch": 0.8331978919719701,
"grad_norm": 100607.5703125,
"learning_rate": 6.709004250233097e-06,
"loss": 2.2635,
"step": 111500
},
{
"epoch": 0.8369342053888849,
"grad_norm": 90908.2890625,
"learning_rate": 6.418333507127788e-06,
"loss": 2.2598,
"step": 112000
},
{
"epoch": 0.8406705188057995,
"grad_norm": 101526.234375,
"learning_rate": 6.133667446659131e-06,
"loss": 2.2648,
"step": 112500
},
{
"epoch": 0.8444068322227142,
"grad_norm": 87100.0234375,
"learning_rate": 5.855045290121047e-06,
"loss": 2.3005,
"step": 113000
},
{
"epoch": 0.8481431456396288,
"grad_norm": 95362.3984375,
"learning_rate": 5.58250542607826e-06,
"loss": 2.2747,
"step": 113500
},
{
"epoch": 0.8518794590565435,
"grad_norm": 105692.5625,
"learning_rate": 5.316085405077159e-06,
"loss": 2.2808,
"step": 114000
},
{
"epoch": 0.8556157724734582,
"grad_norm": 99888.703125,
"learning_rate": 5.055821934472071e-06,
"loss": 2.2707,
"step": 114500
},
{
"epoch": 0.8593520858903728,
"grad_norm": 89097.7421875,
"learning_rate": 4.801750873367728e-06,
"loss": 2.2996,
"step": 115000
},
{
"epoch": 0.8630883993072875,
"grad_norm": 94061.9453125,
"learning_rate": 4.553907227678594e-06,
"loss": 2.2585,
"step": 115500
},
{
"epoch": 0.8668247127242021,
"grad_norm": 89231.015625,
"learning_rate": 4.312325145305729e-06,
"loss": 2.2712,
"step": 116000
},
{
"epoch": 0.8705610261411169,
"grad_norm": 92370.8125,
"learning_rate": 4.077037911431913e-06,
"loss": 2.277,
"step": 116500
},
{
"epoch": 0.8742973395580315,
"grad_norm": 95057.8828125,
"learning_rate": 3.84807794393563e-06,
"loss": 2.2705,
"step": 117000
},
{
"epoch": 0.8780336529749462,
"grad_norm": 76279.6015625,
"learning_rate": 3.6254767889244813e-06,
"loss": 2.2692,
"step": 117500
},
{
"epoch": 0.8817699663918608,
"grad_norm": 99320.4375,
"learning_rate": 3.4092651163888154e-06,
"loss": 2.2841,
"step": 118000
},
{
"epoch": 0.8855062798087755,
"grad_norm": 97605.59375,
"learning_rate": 3.1994727159759986e-06,
"loss": 2.2942,
"step": 118500
},
{
"epoch": 0.8892425932256901,
"grad_norm": 87163.4765625,
"learning_rate": 2.996128492885969e-06,
"loss": 2.2918,
"step": 119000
},
{
"epoch": 0.8929789066426048,
"grad_norm": 78705.2421875,
"learning_rate": 2.7992604638886975e-06,
"loss": 2.259,
"step": 119500
},
{
"epoch": 0.8967152200595194,
"grad_norm": 94322.765625,
"learning_rate": 2.608895753464069e-06,
"loss": 2.2589,
"step": 120000
},
{
"epoch": 0.9004515334764341,
"grad_norm": 73016.4140625,
"learning_rate": 2.4250605900646204e-06,
"loss": 2.2938,
"step": 120500
},
{
"epoch": 0.9041878468933489,
"grad_norm": 90040.5546875,
"learning_rate": 2.2477803025018228e-06,
"loss": 2.2982,
"step": 121000
},
{
"epoch": 0.9079241603102635,
"grad_norm": 102110.484375,
"learning_rate": 2.0770793164562373e-06,
"loss": 2.2578,
"step": 121500
},
{
"epoch": 0.9116604737271782,
"grad_norm": 78370.1796875,
"learning_rate": 1.9129811511121697e-06,
"loss": 2.2854,
"step": 122000
},
{
"epoch": 0.9153967871440928,
"grad_norm": 87597.2578125,
"learning_rate": 1.7555084159172176e-06,
"loss": 2.2657,
"step": 122500
},
{
"epoch": 0.9191331005610075,
"grad_norm": 89568.2109375,
"learning_rate": 1.6046828074670894e-06,
"loss": 2.2674,
"step": 123000
},
{
"epoch": 0.9228694139779221,
"grad_norm": 80686.5234375,
"learning_rate": 1.4605251065162973e-06,
"loss": 2.2634,
"step": 123500
},
{
"epoch": 0.9266057273948368,
"grad_norm": 76351.1796875,
"learning_rate": 1.3230551751149568e-06,
"loss": 2.2889,
"step": 124000
},
{
"epoch": 0.9303420408117514,
"grad_norm": 82895.4921875,
"learning_rate": 1.1922919538721876e-06,
"loss": 2.2188,
"step": 124500
},
{
"epoch": 0.9340783542286661,
"grad_norm": 81276.09375,
"learning_rate": 1.0682534593464967e-06,
"loss": 2.268,
"step": 125000
},
{
"epoch": 0.9378146676455807,
"grad_norm": 83888.390625,
"learning_rate": 9.509567815634446e-07,
"loss": 2.2762,
"step": 125500
},
{
"epoch": 0.9415509810624955,
"grad_norm": 87934.8125,
"learning_rate": 8.404180816609819e-07,
"loss": 2.2759,
"step": 126000
},
{
"epoch": 0.9452872944794101,
"grad_norm": 89569.15625,
"learning_rate": 7.366525896627841e-07,
"loss": 2.2696,
"step": 126500
},
{
"epoch": 0.9490236078963248,
"grad_norm": 79137.96875,
"learning_rate": 6.396746023798139e-07,
"loss": 2.2951,
"step": 127000
},
{
"epoch": 0.9527599213132394,
"grad_norm": 74010.8984375,
"learning_rate": 5.49497481440564e-07,
"loss": 2.2875,
"step": 127500
},
{
"epoch": 0.9564962347301541,
"grad_norm": 70001.59375,
"learning_rate": 4.661336514500625e-07,
"loss": 2.2514,
"step": 128000
},
{
"epoch": 0.9602325481470688,
"grad_norm": 86343.359375,
"learning_rate": 3.895945982779925e-07,
"loss": 2.3027,
"step": 128500
},
{
"epoch": 0.9639688615639834,
"grad_norm": 92609.6171875,
"learning_rate": 3.19890867476208e-07,
"loss": 2.2857,
"step": 129000
},
{
"epoch": 0.9677051749808981,
"grad_norm": 91932.953125,
"learning_rate": 2.570320628257461e-07,
"loss": 2.2814,
"step": 129500
},
{
"epoch": 0.9714414883978127,
"grad_norm": 81636.1015625,
"learning_rate": 2.010268450136077e-07,
"loss": 2.2758,
"step": 130000
},
{
"epoch": 0.9751778018147275,
"grad_norm": 79906.6015625,
"learning_rate": 1.518829304395175e-07,
"loss": 2.2944,
"step": 130500
},
{
"epoch": 0.9789141152316421,
"grad_norm": 76946.6484375,
"learning_rate": 1.0960709015272464e-07,
"loss": 2.2305,
"step": 131000
},
{
"epoch": 0.9826504286485568,
"grad_norm": 81980.828125,
"learning_rate": 7.420514891911535e-08,
"loss": 2.2803,
"step": 131500
},
{
"epoch": 0.9863867420654714,
"grad_norm": 90421.125,
"learning_rate": 4.5681984418666224e-08,
"loss": 2.2767,
"step": 132000
},
{
"epoch": 0.9901230554823861,
"grad_norm": 81403.0390625,
"learning_rate": 2.404152657338732e-08,
"loss": 2.2399,
"step": 132500
},
{
"epoch": 0.9938593688993007,
"grad_norm": 98386.671875,
"learning_rate": 9.286757005871893e-09,
"loss": 2.2422,
"step": 133000
},
{
"epoch": 0.9975956823162154,
"grad_norm": 95571.890625,
"learning_rate": 1.4197086284917227e-09,
"loss": 2.2956,
"step": 133500
},
{
"epoch": 1.0013376002032555,
"grad_norm": 99771.578125,
"learning_rate": 4.9894944614217454e-05,
"loss": 2.211,
"step": 134000
},
{
"epoch": 1.00507391362017,
"grad_norm": 105597.21875,
"learning_rate": 4.960149774176924e-05,
"loss": 2.2131,
"step": 134500
},
{
"epoch": 1.0088102270370847,
"grad_norm": 87634.5,
"learning_rate": 4.930806459585094e-05,
"loss": 2.2453,
"step": 135000
},
{
"epoch": 1.0125465404539995,
"grad_norm": 86854.4375,
"learning_rate": 4.901465528385536e-05,
"loss": 2.1825,
"step": 135500
},
{
"epoch": 1.016282853870914,
"grad_norm": 75373.640625,
"learning_rate": 4.872127991235435e-05,
"loss": 2.1953,
"step": 136000
},
{
"epoch": 1.0200191672878287,
"grad_norm": 89905.53125,
"learning_rate": 4.842794858675067e-05,
"loss": 2.223,
"step": 136500
},
{
"epoch": 1.0237554807047435,
"grad_norm": 74529.2734375,
"learning_rate": 4.8134671410929855e-05,
"loss": 2.2232,
"step": 137000
},
{
"epoch": 1.0274917941216581,
"grad_norm": 86540.96875,
"learning_rate": 4.7841458486912313e-05,
"loss": 2.229,
"step": 137500
},
{
"epoch": 1.0312281075385727,
"grad_norm": 79359.9140625,
"learning_rate": 4.75483199145052e-05,
"loss": 2.2484,
"step": 138000
},
{
"epoch": 1.0349644209554874,
"grad_norm": 88735.703125,
"learning_rate": 4.725526579095468e-05,
"loss": 2.2141,
"step": 138500
},
{
"epoch": 1.0387007343724022,
"grad_norm": 84116.59375,
"learning_rate": 4.6962306210598024e-05,
"loss": 2.2362,
"step": 139000
},
{
"epoch": 1.0424370477893168,
"grad_norm": 85959.921875,
"learning_rate": 4.66694512645159e-05,
"loss": 2.2703,
"step": 139500
},
{
"epoch": 1.0461733612062314,
"grad_norm": 76734.796875,
"learning_rate": 4.637671104018486e-05,
"loss": 2.2324,
"step": 140000
},
{
"epoch": 1.049909674623146,
"grad_norm": 82724.0078125,
"learning_rate": 4.608409562112983e-05,
"loss": 2.2406,
"step": 140500
},
{
"epoch": 1.0536459880400608,
"grad_norm": 85404.375,
"learning_rate": 4.5791615086576726e-05,
"loss": 2.2049,
"step": 141000
},
{
"epoch": 1.0573823014569754,
"grad_norm": 73961.2890625,
"learning_rate": 4.549927951110542e-05,
"loss": 2.2176,
"step": 141500
},
{
"epoch": 1.06111861487389,
"grad_norm": 83664.9921875,
"learning_rate": 4.5207098964302556e-05,
"loss": 2.2473,
"step": 142000
},
{
"epoch": 1.0648549282908046,
"grad_norm": 89258.0390625,
"learning_rate": 4.491508351041477e-05,
"loss": 2.227,
"step": 142500
},
{
"epoch": 1.0685912417077195,
"grad_norm": 99317.4296875,
"learning_rate": 4.462324320800205e-05,
"loss": 2.2237,
"step": 143000
},
{
"epoch": 1.072327555124634,
"grad_norm": 90441.765625,
"learning_rate": 4.433158810959122e-05,
"loss": 2.2321,
"step": 143500
},
{
"epoch": 1.0760638685415487,
"grad_norm": 69907.328125,
"learning_rate": 4.4040128261329724e-05,
"loss": 2.2319,
"step": 144000
},
{
"epoch": 1.0798001819584635,
"grad_norm": 87409.109375,
"learning_rate": 4.3748873702639536e-05,
"loss": 2.2581,
"step": 144500
},
{
"epoch": 1.083536495375378,
"grad_norm": 91680.8828125,
"learning_rate": 4.345783446587135e-05,
"loss": 2.2381,
"step": 145000
},
{
"epoch": 1.0872728087922927,
"grad_norm": 84339.4140625,
"learning_rate": 4.316702057595907e-05,
"loss": 2.2398,
"step": 145500
},
{
"epoch": 1.0910091222092073,
"grad_norm": 99060.578125,
"learning_rate": 4.2876442050074436e-05,
"loss": 2.2251,
"step": 146000
},
{
"epoch": 1.0947454356261221,
"grad_norm": 103543.2421875,
"learning_rate": 4.258610889728198e-05,
"loss": 2.2536,
"step": 146500
},
{
"epoch": 1.0984817490430367,
"grad_norm": 89170.1953125,
"learning_rate": 4.229603111819437e-05,
"loss": 2.1822,
"step": 147000
},
{
"epoch": 1.1022180624599514,
"grad_norm": 95436.765625,
"learning_rate": 4.2006218704627756e-05,
"loss": 2.2338,
"step": 147500
},
{
"epoch": 1.105954375876866,
"grad_norm": 86420.7265625,
"learning_rate": 4.1716681639257725e-05,
"loss": 2.2153,
"step": 148000
},
{
"epoch": 1.1096906892937808,
"grad_norm": 76673.3359375,
"learning_rate": 4.1427429895275446e-05,
"loss": 2.2289,
"step": 148500
},
{
"epoch": 1.1134270027106954,
"grad_norm": 101906.8515625,
"learning_rate": 4.113847343604404e-05,
"loss": 2.2297,
"step": 149000
},
{
"epoch": 1.11716331612761,
"grad_norm": 88695.703125,
"learning_rate": 4.084982221475554e-05,
"loss": 2.2219,
"step": 149500
},
{
"epoch": 1.1208996295445246,
"grad_norm": 100434.46875,
"learning_rate": 4.05614861740879e-05,
"loss": 2.2344,
"step": 150000
},
{
"epoch": 1.1246359429614394,
"grad_norm": 90131.671875,
"learning_rate": 4.0273475245862594e-05,
"loss": 2.2137,
"step": 150500
},
{
"epoch": 1.128372256378354,
"grad_norm": 84749.8828125,
"learning_rate": 3.998579935070253e-05,
"loss": 2.2358,
"step": 151000
},
{
"epoch": 1.1321085697952686,
"grad_norm": 80951.0546875,
"learning_rate": 3.969846839769027e-05,
"loss": 2.2327,
"step": 151500
},
{
"epoch": 1.1358448832121835,
"grad_norm": 82394.359375,
"learning_rate": 3.941149228402674e-05,
"loss": 2.2448,
"step": 152000
},
{
"epoch": 1.139581196629098,
"grad_norm": 108267.5546875,
"learning_rate": 3.912488089469036e-05,
"loss": 2.211,
"step": 152500
},
{
"epoch": 1.1433175100460127,
"grad_norm": 73743.703125,
"learning_rate": 3.883864410209648e-05,
"loss": 2.2159,
"step": 153000
},
{
"epoch": 1.1470538234629273,
"grad_norm": 105991.84375,
"learning_rate": 3.855279176575732e-05,
"loss": 2.2191,
"step": 153500
},
{
"epoch": 1.150790136879842,
"grad_norm": 88528.921875,
"learning_rate": 3.826733373194247e-05,
"loss": 2.2397,
"step": 154000
},
{
"epoch": 1.1545264502967567,
"grad_norm": 99099.5390625,
"learning_rate": 3.798227983333955e-05,
"loss": 2.2083,
"step": 154500
},
{
"epoch": 1.1582627637136713,
"grad_norm": 96959.421875,
"learning_rate": 3.769763988871572e-05,
"loss": 2.2146,
"step": 155000
},
{
"epoch": 1.1619990771305861,
"grad_norm": 96111.9375,
"learning_rate": 3.741342370257932e-05,
"loss": 2.2421,
"step": 155500
},
{
"epoch": 1.1657353905475007,
"grad_norm": 80256.734375,
"learning_rate": 3.712964106484218e-05,
"loss": 2.2199,
"step": 156000
},
{
"epoch": 1.1694717039644154,
"grad_norm": 84992.546875,
"learning_rate": 3.684630175048249e-05,
"loss": 2.184,
"step": 156500
},
{
"epoch": 1.17320801738133,
"grad_norm": 94191.3671875,
"learning_rate": 3.6563415519207985e-05,
"loss": 2.2505,
"step": 157000
},
{
"epoch": 1.1769443307982446,
"grad_norm": 76174.2421875,
"learning_rate": 3.62809921151198e-05,
"loss": 2.2143,
"step": 157500
},
{
"epoch": 1.1806806442151594,
"grad_norm": 89977.34375,
"learning_rate": 3.599904126637692e-05,
"loss": 2.2338,
"step": 158000
},
{
"epoch": 1.184416957632074,
"grad_norm": 107886.5390625,
"learning_rate": 3.571757268486096e-05,
"loss": 2.2516,
"step": 158500
},
{
"epoch": 1.1881532710489886,
"grad_norm": 60902.64453125,
"learning_rate": 3.54365960658417e-05,
"loss": 2.197,
"step": 159000
},
{
"epoch": 1.1918895844659034,
"grad_norm": 91395.515625,
"learning_rate": 3.5156121087643154e-05,
"loss": 2.2238,
"step": 159500
},
{
"epoch": 1.195625897882818,
"grad_norm": 90301.53125,
"learning_rate": 3.487615741131013e-05,
"loss": 2.2317,
"step": 160000
},
{
"epoch": 1.1993622112997326,
"grad_norm": 91005.0625,
"learning_rate": 3.459671468027553e-05,
"loss": 2.2019,
"step": 160500
},
{
"epoch": 1.2030985247166472,
"grad_norm": 81654.03125,
"learning_rate": 3.4317802520028117e-05,
"loss": 2.2199,
"step": 161000
},
{
"epoch": 1.206834838133562,
"grad_norm": 100322.15625,
"learning_rate": 3.403943053778097e-05,
"loss": 2.2668,
"step": 161500
},
{
"epoch": 1.2105711515504767,
"grad_norm": 84712.84375,
"learning_rate": 3.37616083221406e-05,
"loss": 2.215,
"step": 162000
},
{
"epoch": 1.2143074649673913,
"grad_norm": 93723.7734375,
"learning_rate": 3.348434544277663e-05,
"loss": 2.2052,
"step": 162500
},
{
"epoch": 1.218043778384306,
"grad_norm": 98371.0859375,
"learning_rate": 3.320765145009216e-05,
"loss": 2.1767,
"step": 163000
},
{
"epoch": 1.2217800918012207,
"grad_norm": 78824.3359375,
"learning_rate": 3.293153587489488e-05,
"loss": 2.2341,
"step": 163500
},
{
"epoch": 1.2255164052181353,
"grad_norm": 93536.6640625,
"learning_rate": 3.265600822806866e-05,
"loss": 2.1954,
"step": 164000
},
{
"epoch": 1.22925271863505,
"grad_norm": 89381.3203125,
"learning_rate": 3.2381078000245965e-05,
"loss": 2.2026,
"step": 164500
},
{
"epoch": 1.2329890320519645,
"grad_norm": 110292.1484375,
"learning_rate": 3.2106754661481086e-05,
"loss": 2.2022,
"step": 165000
},
{
"epoch": 1.2367253454688794,
"grad_norm": 89096.8359375,
"learning_rate": 3.183304766092374e-05,
"loss": 2.1618,
"step": 165500
},
{
"epoch": 1.240461658885794,
"grad_norm": 97123.8671875,
"learning_rate": 3.1559966426493804e-05,
"loss": 2.192,
"step": 166000
},
{
"epoch": 1.2441979723027086,
"grad_norm": 86452.890625,
"learning_rate": 3.128752036455631e-05,
"loss": 2.203,
"step": 166500
},
{
"epoch": 1.2479342857196234,
"grad_norm": 99330.265625,
"learning_rate": 3.101571885959766e-05,
"loss": 2.1653,
"step": 167000
},
{
"epoch": 1.251670599136538,
"grad_norm": 82720.1015625,
"learning_rate": 3.0744571273902345e-05,
"loss": 2.1893,
"step": 167500
},
{
"epoch": 1.2554069125534526,
"grad_norm": 98328.6328125,
"learning_rate": 3.047408694723032e-05,
"loss": 2.2091,
"step": 168000
},
{
"epoch": 1.2591432259703672,
"grad_norm": 89367.40625,
"learning_rate": 3.0204275196495414e-05,
"loss": 2.1961,
"step": 168500
},
{
"epoch": 1.262879539387282,
"grad_norm": 95675.3203125,
"learning_rate": 2.99351453154444e-05,
"loss": 2.1824,
"step": 169000
},
{
"epoch": 1.2666158528041966,
"grad_norm": 83745.3359375,
"learning_rate": 2.9666706574336834e-05,
"loss": 2.1793,
"step": 169500
},
{
"epoch": 1.2703521662211112,
"grad_norm": 92235.09375,
"learning_rate": 2.9398968219625706e-05,
"loss": 2.2034,
"step": 170000
},
{
"epoch": 1.274088479638026,
"grad_norm": 79515.6328125,
"learning_rate": 2.913193947363908e-05,
"loss": 2.2091,
"step": 170500
},
{
"epoch": 1.2778247930549407,
"grad_norm": 92163.1640625,
"learning_rate": 2.8865629534262216e-05,
"loss": 2.2082,
"step": 171000
},
{
"epoch": 1.2815611064718553,
"grad_norm": 90488.1484375,
"learning_rate": 2.8600047574620998e-05,
"loss": 2.1976,
"step": 171500
},
{
"epoch": 1.2852974198887699,
"grad_norm": 93587.21875,
"learning_rate": 2.83352027427657e-05,
"loss": 2.2092,
"step": 172000
},
{
"epoch": 1.2890337333056845,
"grad_norm": 90367.0546875,
"learning_rate": 2.8071104161356087e-05,
"loss": 2.2086,
"step": 172500
},
{
"epoch": 1.2927700467225993,
"grad_norm": 94576.171875,
"learning_rate": 2.7807760927347102e-05,
"loss": 2.2071,
"step": 173000
},
{
"epoch": 1.296506360139514,
"grad_norm": 84362.703125,
"learning_rate": 2.7545182111675466e-05,
"loss": 2.1646,
"step": 173500
},
{
"epoch": 1.3002426735564285,
"grad_norm": 99929.3984375,
"learning_rate": 2.7283376758947333e-05,
"loss": 2.1721,
"step": 174000
},
{
"epoch": 1.3039789869733434,
"grad_norm": 99786.953125,
"learning_rate": 2.7022353887126668e-05,
"loss": 2.2208,
"step": 174500
},
{
"epoch": 1.307715300390258,
"grad_norm": 118692.8046875,
"learning_rate": 2.6762122487224684e-05,
"loss": 2.2036,
"step": 175000
},
{
"epoch": 1.3114516138071726,
"grad_norm": 88971.234375,
"learning_rate": 2.6502691522990032e-05,
"loss": 2.2018,
"step": 175500
},
{
"epoch": 1.3151879272240872,
"grad_norm": 106580.484375,
"learning_rate": 2.624406993060019e-05,
"loss": 2.1668,
"step": 176000
},
{
"epoch": 1.318924240641002,
"grad_norm": 94986.0390625,
"learning_rate": 2.598626661835354e-05,
"loss": 2.1782,
"step": 176500
},
{
"epoch": 1.3226605540579166,
"grad_norm": 87692.828125,
"learning_rate": 2.5729290466362565e-05,
"loss": 2.251,
"step": 177000
},
{
"epoch": 1.3263968674748312,
"grad_norm": 93603.875,
"learning_rate": 2.5473150326248006e-05,
"loss": 2.2143,
"step": 177500
},
{
"epoch": 1.330133180891746,
"grad_norm": 98669.8828125,
"learning_rate": 2.521785502083384e-05,
"loss": 2.1962,
"step": 178000
},
{
"epoch": 1.3338694943086606,
"grad_norm": 77537.9609375,
"learning_rate": 2.4963413343843522e-05,
"loss": 2.1533,
"step": 178500
},
{
"epoch": 1.3376058077255752,
"grad_norm": 107931.5625,
"learning_rate": 2.4709834059597026e-05,
"loss": 2.1715,
"step": 179000
},
{
"epoch": 1.3413421211424899,
"grad_norm": 91190.8125,
"learning_rate": 2.4457125902708872e-05,
"loss": 2.1757,
"step": 179500
},
{
"epoch": 1.3450784345594045,
"grad_norm": 84562.8046875,
"learning_rate": 2.420529757778746e-05,
"loss": 2.1666,
"step": 180000
},
{
"epoch": 1.3488147479763193,
"grad_norm": 99736.1640625,
"learning_rate": 2.3954357759134995e-05,
"loss": 2.1948,
"step": 180500
},
{
"epoch": 1.3525510613932339,
"grad_norm": 93177.078125,
"learning_rate": 2.3704315090448886e-05,
"loss": 2.1868,
"step": 181000
},
{
"epoch": 1.3562873748101485,
"grad_norm": 77689.5625,
"learning_rate": 2.345517818452394e-05,
"loss": 2.2091,
"step": 181500
},
{
"epoch": 1.3600236882270633,
"grad_norm": 92219.8671875,
"learning_rate": 2.3206955622955623e-05,
"loss": 2.1795,
"step": 182000
},
{
"epoch": 1.363760001643978,
"grad_norm": 96159.34375,
"learning_rate": 2.2959655955844656e-05,
"loss": 2.2186,
"step": 182500
},
{
"epoch": 1.3674963150608925,
"grad_norm": 84869.453125,
"learning_rate": 2.271328770150227e-05,
"loss": 2.1516,
"step": 183000
},
{
"epoch": 1.3712326284778071,
"grad_norm": 79080.1484375,
"learning_rate": 2.2467859346156968e-05,
"loss": 2.2031,
"step": 183500
},
{
"epoch": 1.374968941894722,
"grad_norm": 107557.9765625,
"learning_rate": 2.222337934366214e-05,
"loss": 2.1703,
"step": 184000
},
{
"epoch": 1.3787052553116366,
"grad_norm": 104010.9296875,
"learning_rate": 2.1979856115204845e-05,
"loss": 2.1637,
"step": 184500
},
{
"epoch": 1.3824415687285512,
"grad_norm": 93606.03125,
"learning_rate": 2.173729804901581e-05,
"loss": 2.1746,
"step": 185000
},
{
"epoch": 1.386177882145466,
"grad_norm": 100350.8515625,
"learning_rate": 2.1495713500080443e-05,
"loss": 2.1847,
"step": 185500
},
{
"epoch": 1.3899141955623806,
"grad_norm": 91779.265625,
"learning_rate": 2.1255110789851084e-05,
"loss": 2.1576,
"step": 186000
},
{
"epoch": 1.3936505089792952,
"grad_norm": 112973.15625,
"learning_rate": 2.1015498205960283e-05,
"loss": 2.177,
"step": 186500
},
{
"epoch": 1.3973868223962098,
"grad_norm": 91850.71875,
"learning_rate": 2.0776884001935437e-05,
"loss": 2.1904,
"step": 187000
},
{
"epoch": 1.4011231358131244,
"grad_norm": 92866.7265625,
"learning_rate": 2.0539276396914448e-05,
"loss": 2.1891,
"step": 187500
},
{
"epoch": 1.4048594492300392,
"grad_norm": 94535.1640625,
"learning_rate": 2.0302683575362608e-05,
"loss": 2.1768,
"step": 188000
},
{
"epoch": 1.4085957626469539,
"grad_norm": 99327.2890625,
"learning_rate": 2.0067113686790694e-05,
"loss": 2.195,
"step": 188500
},
{
"epoch": 1.4123320760638687,
"grad_norm": 95519.703125,
"learning_rate": 1.983257484547421e-05,
"loss": 2.1735,
"step": 189000
},
{
"epoch": 1.4160683894807833,
"grad_norm": 93955.65625,
"learning_rate": 1.9599075130173977e-05,
"loss": 2.2019,
"step": 189500
},
{
"epoch": 1.419804702897698,
"grad_norm": 143901.203125,
"learning_rate": 1.9366622583857803e-05,
"loss": 2.1768,
"step": 190000
},
{
"epoch": 1.4235410163146125,
"grad_norm": 77758.40625,
"learning_rate": 1.913522521342338e-05,
"loss": 2.1481,
"step": 190500
},
{
"epoch": 1.427277329731527,
"grad_norm": 102854.71875,
"learning_rate": 1.890489098942267e-05,
"loss": 2.1364,
"step": 191000
},
{
"epoch": 1.431013643148442,
"grad_norm": 76422.46875,
"learning_rate": 1.8675627845787126e-05,
"loss": 2.1446,
"step": 191500
},
{
"epoch": 1.4347499565653565,
"grad_norm": 103994.8359375,
"learning_rate": 1.8447443679554567e-05,
"loss": 2.1618,
"step": 192000
},
{
"epoch": 1.4384862699822711,
"grad_norm": 93646.765625,
"learning_rate": 1.8220346350597123e-05,
"loss": 2.2005,
"step": 192500
},
{
"epoch": 1.442222583399186,
"grad_norm": 104911.5234375,
"learning_rate": 1.7994343681350422e-05,
"loss": 2.185,
"step": 193000
},
{
"epoch": 1.4459588968161006,
"grad_norm": 103320.125,
"learning_rate": 1.7769443456544322e-05,
"loss": 2.1904,
"step": 193500
},
{
"epoch": 1.4496952102330152,
"grad_norm": 88957.3203125,
"learning_rate": 1.7545653422934537e-05,
"loss": 2.1924,
"step": 194000
},
{
"epoch": 1.4534315236499298,
"grad_norm": 98682.171875,
"learning_rate": 1.732298128903597e-05,
"loss": 2.1546,
"step": 194500
},
{
"epoch": 1.4571678370668444,
"grad_norm": 93626.0234375,
"learning_rate": 1.7101434724857147e-05,
"loss": 2.1653,
"step": 195000
},
{
"epoch": 1.4609041504837592,
"grad_norm": 84740.2265625,
"learning_rate": 1.6881021361635947e-05,
"loss": 2.1835,
"step": 195500
},
{
"epoch": 1.4646404639006738,
"grad_norm": 95559.859375,
"learning_rate": 1.6661748791576855e-05,
"loss": 2.1716,
"step": 196000
},
{
"epoch": 1.4683767773175886,
"grad_norm": 105746.453125,
"learning_rate": 1.6443624567589377e-05,
"loss": 2.1467,
"step": 196500
},
{
"epoch": 1.4721130907345032,
"grad_norm": 93445.3515625,
"learning_rate": 1.622665620302789e-05,
"loss": 2.1673,
"step": 197000
},
{
"epoch": 1.4758494041514179,
"grad_norm": 88159.8125,
"learning_rate": 1.6010851171432874e-05,
"loss": 2.1727,
"step": 197500
},
{
"epoch": 1.4795857175683325,
"grad_norm": 93973.4375,
"learning_rate": 1.5796216906273388e-05,
"loss": 2.1824,
"step": 198000
},
{
"epoch": 1.483322030985247,
"grad_norm": 95876.78125,
"learning_rate": 1.5582760800691155e-05,
"loss": 2.1858,
"step": 198500
},
{
"epoch": 1.487058344402162,
"grad_norm": 90519.78125,
"learning_rate": 1.537049020724581e-05,
"loss": 2.1984,
"step": 199000
},
{
"epoch": 1.4907946578190765,
"grad_norm": 88962.28125,
"learning_rate": 1.515941243766169e-05,
"loss": 2.2229,
"step": 199500
},
{
"epoch": 1.494530971235991,
"grad_norm": 113852.828125,
"learning_rate": 1.4949534762575912e-05,
"loss": 2.157,
"step": 200000
},
{
"epoch": 1.498267284652906,
"grad_norm": 97744.3984375,
"learning_rate": 1.4740864411288013e-05,
"loss": 2.1711,
"step": 200500
},
{
"epoch": 1.5020035980698205,
"grad_norm": 88245.1640625,
"learning_rate": 1.4533408571510904e-05,
"loss": 2.1867,
"step": 201000
},
{
"epoch": 1.5057399114867351,
"grad_norm": 89220.171875,
"learning_rate": 1.4327174389123227e-05,
"loss": 2.1382,
"step": 201500
},
{
"epoch": 1.5094762249036497,
"grad_norm": 108794.328125,
"learning_rate": 1.4122168967923366e-05,
"loss": 2.1608,
"step": 202000
},
{
"epoch": 1.5132125383205643,
"grad_norm": 108413.6640625,
"learning_rate": 1.3918399369384556e-05,
"loss": 2.1657,
"step": 202500
},
{
"epoch": 1.5169488517374792,
"grad_norm": 92711.84375,
"learning_rate": 1.3715872612411806e-05,
"loss": 2.1385,
"step": 203000
},
{
"epoch": 1.5206851651543938,
"grad_norm": 97720.765625,
"learning_rate": 1.3514595673100084e-05,
"loss": 2.1467,
"step": 203500
},
{
"epoch": 1.5244214785713086,
"grad_norm": 86545.078125,
"learning_rate": 1.3314575484493946e-05,
"loss": 2.152,
"step": 204000
},
{
"epoch": 1.5281577919882232,
"grad_norm": 96816.7421875,
"learning_rate": 1.3115818936348889e-05,
"loss": 2.1636,
"step": 204500
},
{
"epoch": 1.5318941054051378,
"grad_norm": 100713.7734375,
"learning_rate": 1.2918332874893869e-05,
"loss": 2.1784,
"step": 205000
},
{
"epoch": 1.5356304188220524,
"grad_norm": 97874.359375,
"learning_rate": 1.2722124102595518e-05,
"loss": 2.1903,
"step": 205500
},
{
"epoch": 1.539366732238967,
"grad_norm": 91848.2734375,
"learning_rate": 1.252719937792397e-05,
"loss": 2.1524,
"step": 206000
},
{
"epoch": 1.5431030456558816,
"grad_norm": 95539.515625,
"learning_rate": 1.2333565415119835e-05,
"loss": 2.1813,
"step": 206500
},
{
"epoch": 1.5468393590727965,
"grad_norm": 92111.3984375,
"learning_rate": 1.2141228883963118e-05,
"loss": 2.1615,
"step": 207000
},
{
"epoch": 1.5505756724897113,
"grad_norm": 90152.640625,
"learning_rate": 1.195019640954339e-05,
"loss": 2.1615,
"step": 207500
},
{
"epoch": 1.554311985906626,
"grad_norm": 96634.609375,
"learning_rate": 1.1760474572031538e-05,
"loss": 2.1455,
"step": 208000
},
{
"epoch": 1.5580482993235405,
"grad_norm": 102015.8984375,
"learning_rate": 1.1572069906453265e-05,
"loss": 2.1508,
"step": 208500
},
{
"epoch": 1.561784612740455,
"grad_norm": 90662.0859375,
"learning_rate": 1.1384988902463806e-05,
"loss": 2.1443,
"step": 209000
},
{
"epoch": 1.5655209261573697,
"grad_norm": 101081.84375,
"learning_rate": 1.1199238004124519e-05,
"loss": 2.191,
"step": 209500
},
{
"epoch": 1.5692572395742843,
"grad_norm": 86159.140625,
"learning_rate": 1.101482360968088e-05,
"loss": 2.1739,
"step": 210000
},
{
"epoch": 1.5729935529911991,
"grad_norm": 96140.9296875,
"learning_rate": 1.0831752071342033e-05,
"loss": 2.1597,
"step": 210500
},
{
"epoch": 1.5767298664081137,
"grad_norm": 93931.6484375,
"learning_rate": 1.0650029695062103e-05,
"loss": 2.1704,
"step": 211000
},
{
"epoch": 1.5804661798250286,
"grad_norm": 96550.375,
"learning_rate": 1.0469662740322888e-05,
"loss": 2.1762,
"step": 211500
},
{
"epoch": 1.5842024932419432,
"grad_norm": 91985.375,
"learning_rate": 1.0290657419918308e-05,
"loss": 2.1537,
"step": 212000
},
{
"epoch": 1.5879388066588578,
"grad_norm": 99446.453125,
"learning_rate": 1.0113019899740345e-05,
"loss": 2.1567,
"step": 212500
},
{
"epoch": 1.5916751200757724,
"grad_norm": 90499.4453125,
"learning_rate": 9.936756298566724e-06,
"loss": 2.1542,
"step": 213000
},
{
"epoch": 1.595411433492687,
"grad_norm": 91874.203125,
"learning_rate": 9.761872687850093e-06,
"loss": 2.1701,
"step": 213500
},
{
"epoch": 1.5991477469096018,
"grad_norm": 101494.265625,
"learning_rate": 9.588375091508945e-06,
"loss": 2.1785,
"step": 214000
},
{
"epoch": 1.6028840603265164,
"grad_norm": 101492.890625,
"learning_rate": 9.41626948572007e-06,
"loss": 2.1681,
"step": 214500
},
{
"epoch": 1.6066203737434313,
"grad_norm": 97085.359375,
"learning_rate": 9.245561798712704e-06,
"loss": 2.163,
"step": 215000
},
{
"epoch": 1.6103566871603459,
"grad_norm": 101823.4609375,
"learning_rate": 9.076257910564402e-06,
"loss": 2.1538,
"step": 215500
},
{
"epoch": 1.6140930005772605,
"grad_norm": 91494.078125,
"learning_rate": 8.908363652998414e-06,
"loss": 2.1235,
"step": 216000
},
{
"epoch": 1.617829313994175,
"grad_norm": 85581.09375,
"learning_rate": 8.74188480918282e-06,
"loss": 2.1736,
"step": 216500
},
{
"epoch": 1.6215656274110897,
"grad_norm": 92024.046875,
"learning_rate": 8.576827113531427e-06,
"loss": 2.177,
"step": 217000
},
{
"epoch": 1.6253019408280043,
"grad_norm": 74753.4296875,
"learning_rate": 8.413196251506073e-06,
"loss": 2.143,
"step": 217500
},
{
"epoch": 1.629038254244919,
"grad_norm": 107585.3125,
"learning_rate": 8.25099785942094e-06,
"loss": 2.1522,
"step": 218000
},
{
"epoch": 1.6327745676618337,
"grad_norm": 94180.15625,
"learning_rate": 8.090237524248346e-06,
"loss": 2.1343,
"step": 218500
},
{
"epoch": 1.6365108810787485,
"grad_norm": 102109.1953125,
"learning_rate": 7.930920783426265e-06,
"loss": 2.1643,
"step": 219000
},
{
"epoch": 1.6402471944956631,
"grad_norm": 107961.2109375,
"learning_rate": 7.773053124667712e-06,
"loss": 2.1483,
"step": 219500
},
{
"epoch": 1.6439835079125777,
"grad_norm": 89340.9296875,
"learning_rate": 7.616639985771556e-06,
"loss": 2.1602,
"step": 220000
},
{
"epoch": 1.6477198213294924,
"grad_norm": 103068.6015625,
"learning_rate": 7.461686754435332e-06,
"loss": 2.1549,
"step": 220500
},
{
"epoch": 1.651456134746407,
"grad_norm": 101308.2421875,
"learning_rate": 7.30819876806963e-06,
"loss": 2.1645,
"step": 221000
},
{
"epoch": 1.6551924481633218,
"grad_norm": 116589.4140625,
"learning_rate": 7.156181313614185e-06,
"loss": 2.1538,
"step": 221500
},
{
"epoch": 1.6589287615802364,
"grad_norm": 107515.390625,
"learning_rate": 7.005639627355881e-06,
"loss": 2.1506,
"step": 222000
},
{
"epoch": 1.6626650749971512,
"grad_norm": 105536.6484375,
"learning_rate": 6.856578894748284e-06,
"loss": 2.1563,
"step": 222500
},
{
"epoch": 1.6664013884140658,
"grad_norm": 125030.3046875,
"learning_rate": 6.709004250233097e-06,
"loss": 2.1574,
"step": 223000
},
{
"epoch": 1.6701377018309804,
"grad_norm": 95777.0546875,
"learning_rate": 6.562920777063231e-06,
"loss": 2.1502,
"step": 223500
},
{
"epoch": 1.673874015247895,
"grad_norm": 125466.265625,
"learning_rate": 6.418333507127788e-06,
"loss": 2.1569,
"step": 224000
},
{
"epoch": 1.6776103286648096,
"grad_norm": 82233.953125,
"learning_rate": 6.275247420778685e-06,
"loss": 2.1315,
"step": 224500
},
{
"epoch": 1.6813466420817242,
"grad_norm": 86722.7890625,
"learning_rate": 6.133667446659131e-06,
"loss": 2.1662,
"step": 225000
},
{
"epoch": 1.685082955498639,
"grad_norm": 95386.34375,
"learning_rate": 5.993598461533845e-06,
"loss": 2.147,
"step": 225500
},
{
"epoch": 1.6888192689155537,
"grad_norm": 82564.6484375,
"learning_rate": 5.855045290121047e-06,
"loss": 2.1426,
"step": 226000
},
{
"epoch": 1.6925555823324685,
"grad_norm": 86084.6875,
"learning_rate": 5.71801270492634e-06,
"loss": 2.1517,
"step": 226500
},
{
"epoch": 1.696291895749383,
"grad_norm": 90130.1796875,
"learning_rate": 5.58250542607826e-06,
"loss": 2.143,
"step": 227000
},
{
"epoch": 1.7000282091662977,
"grad_norm": 95907.765625,
"learning_rate": 5.448528121165691e-06,
"loss": 2.1631,
"step": 227500
},
{
"epoch": 1.7037645225832123,
"grad_norm": 88561.578125,
"learning_rate": 5.316085405077159e-06,
"loss": 2.135,
"step": 228000
},
{
"epoch": 1.707500836000127,
"grad_norm": 94637.078125,
"learning_rate": 5.185181839841746e-06,
"loss": 2.1314,
"step": 228500
},
{
"epoch": 1.7112371494170417,
"grad_norm": 96587.3125,
"learning_rate": 5.055821934472071e-06,
"loss": 2.1399,
"step": 229000
},
{
"epoch": 1.7149734628339564,
"grad_norm": 91652.40625,
"learning_rate": 4.92801014480892e-06,
"loss": 2.1798,
"step": 229500
},
{
"epoch": 1.7187097762508712,
"grad_norm": 104303.1015625,
"learning_rate": 4.801750873367728e-06,
"loss": 2.1749,
"step": 230000
},
{
"epoch": 1.7224460896677858,
"grad_norm": 85994.984375,
"learning_rate": 4.677048469187045e-06,
"loss": 2.161,
"step": 230500
},
{
"epoch": 1.7261824030847004,
"grad_norm": 105747.375,
"learning_rate": 4.553907227678594e-06,
"loss": 2.1273,
"step": 231000
},
{
"epoch": 1.729918716501615,
"grad_norm": 108115.8125,
"learning_rate": 4.432331390479416e-06,
"loss": 2.1493,
"step": 231500
},
{
"epoch": 1.7336550299185296,
"grad_norm": 97434.2109375,
"learning_rate": 4.312325145305729e-06,
"loss": 2.1339,
"step": 232000
},
{
"epoch": 1.7373913433354442,
"grad_norm": 100293.0546875,
"learning_rate": 4.193892625808659e-06,
"loss": 2.1501,
"step": 232500
},
{
"epoch": 1.741127656752359,
"grad_norm": 102609.921875,
"learning_rate": 4.077037911431913e-06,
"loss": 2.1309,
"step": 233000
},
{
"epoch": 1.7448639701692736,
"grad_norm": 115093.4453125,
"learning_rate": 3.961765027271197e-06,
"loss": 2.144,
"step": 233500
},
{
"epoch": 1.7486002835861885,
"grad_norm": 85552.25,
"learning_rate": 3.84807794393563e-06,
"loss": 2.0989,
"step": 234000
},
{
"epoch": 1.752336597003103,
"grad_norm": 104233.3359375,
"learning_rate": 3.735980577410908e-06,
"loss": 2.1801,
"step": 234500
},
{
"epoch": 1.7560729104200177,
"grad_norm": 103465.9375,
"learning_rate": 3.6254767889244813e-06,
"loss": 2.131,
"step": 235000
},
{
"epoch": 1.7598092238369323,
"grad_norm": 100807.5546875,
"learning_rate": 3.516570384812523e-06,
"loss": 2.1221,
"step": 235500
},
{
"epoch": 1.7635455372538469,
"grad_norm": 104049.0078125,
"learning_rate": 3.4092651163888154e-06,
"loss": 2.1382,
"step": 236000
},
{
"epoch": 1.7672818506707617,
"grad_norm": 100944.1015625,
"learning_rate": 3.303564679815557e-06,
"loss": 2.1736,
"step": 236500
},
{
"epoch": 1.7710181640876763,
"grad_norm": 104766.390625,
"learning_rate": 3.1994727159759986e-06,
"loss": 2.1125,
"step": 237000
},
{
"epoch": 1.7747544775045911,
"grad_norm": 95535.8515625,
"learning_rate": 3.0969928103490976e-06,
"loss": 2.1478,
"step": 237500
},
{
"epoch": 1.7784907909215057,
"grad_norm": 89555.0859375,
"learning_rate": 2.996128492885969e-06,
"loss": 2.1244,
"step": 238000
},
{
"epoch": 1.7822271043384204,
"grad_norm": 101483.0546875,
"learning_rate": 2.8968832378882926e-06,
"loss": 2.1287,
"step": 238500
},
{
"epoch": 1.785963417755335,
"grad_norm": 84929.1640625,
"learning_rate": 2.7992604638886975e-06,
"loss": 2.1421,
"step": 239000
},
{
"epoch": 1.7896997311722496,
"grad_norm": 96690.8984375,
"learning_rate": 2.703263533532929e-06,
"loss": 2.1681,
"step": 239500
},
{
"epoch": 1.7934360445891642,
"grad_norm": 105319.703125,
"learning_rate": 2.608895753464069e-06,
"loss": 2.1331,
"step": 240000
},
{
"epoch": 1.797172358006079,
"grad_norm": 101327.21875,
"learning_rate": 2.516160374208648e-06,
"loss": 2.1551,
"step": 240500
},
{
"epoch": 1.8009086714229936,
"grad_norm": 94917.8125,
"learning_rate": 2.4250605900646204e-06,
"loss": 2.1703,
"step": 241000
},
{
"epoch": 1.8046449848399084,
"grad_norm": 97282.4765625,
"learning_rate": 2.3355995389914342e-06,
"loss": 2.1457,
"step": 241500
},
{
"epoch": 1.808381298256823,
"grad_norm": 113476.78125,
"learning_rate": 2.2477803025018228e-06,
"loss": 2.1237,
"step": 242000
},
{
"epoch": 1.8121176116737376,
"grad_norm": 99987.9921875,
"learning_rate": 2.1616059055557636e-06,
"loss": 2.1792,
"step": 242500
},
{
"epoch": 1.8158539250906522,
"grad_norm": 110651.3828125,
"learning_rate": 2.0770793164562373e-06,
"loss": 2.135,
"step": 243000
},
{
"epoch": 1.8195902385075668,
"grad_norm": 96608.578125,
"learning_rate": 1.994203446746962e-06,
"loss": 2.1681,
"step": 243500
},
{
"epoch": 1.8233265519244817,
"grad_norm": 108624.8046875,
"learning_rate": 1.9129811511121697e-06,
"loss": 2.1457,
"step": 244000
},
{
"epoch": 1.8270628653413963,
"grad_norm": 113863.421875,
"learning_rate": 1.8334152272782069e-06,
"loss": 2.1444,
"step": 244500
},
{
"epoch": 1.830799178758311,
"grad_norm": 100759.1875,
"learning_rate": 1.7555084159172176e-06,
"loss": 2.1633,
"step": 245000
},
{
"epoch": 1.8345354921752257,
"grad_norm": 104006.0859375,
"learning_rate": 1.6792634005526943e-06,
"loss": 2.1561,
"step": 245500
},
{
"epoch": 1.8382718055921403,
"grad_norm": 93557.3203125,
"learning_rate": 1.6046828074670894e-06,
"loss": 2.1397,
"step": 246000
},
{
"epoch": 1.842008119009055,
"grad_norm": 92767.28125,
"learning_rate": 1.531769205611322e-06,
"loss": 2.1375,
"step": 246500
},
{
"epoch": 1.8457444324259695,
"grad_norm": 115375.0546875,
"learning_rate": 1.4605251065162973e-06,
"loss": 2.1149,
"step": 247000
},
{
"epoch": 1.8494807458428841,
"grad_norm": 85288.765625,
"learning_rate": 1.3909529642064045e-06,
"loss": 2.1298,
"step": 247500
},
{
"epoch": 1.853217059259799,
"grad_norm": 92626.7890625,
"learning_rate": 1.3230551751149568e-06,
"loss": 2.1488,
"step": 248000
},
{
"epoch": 1.8569533726767136,
"grad_norm": 105719.265625,
"learning_rate": 1.2568340780016953e-06,
"loss": 2.1397,
"step": 248500
},
{
"epoch": 1.8606896860936284,
"grad_norm": 89167.5703125,
"learning_rate": 1.1922919538721876e-06,
"loss": 2.1225,
"step": 249000
},
{
"epoch": 1.864425999510543,
"grad_norm": 100311.046875,
"learning_rate": 1.1294310258992747e-06,
"loss": 2.1445,
"step": 249500
},
{
"epoch": 1.8681623129274576,
"grad_norm": 87286.234375,
"learning_rate": 1.0682534593464967e-06,
"loss": 2.1369,
"step": 250000
},
{
"epoch": 1.8718986263443722,
"grad_norm": 103840.65625,
"learning_rate": 1.0087613614934987e-06,
"loss": 2.1134,
"step": 250500
},
{
"epoch": 1.8756349397612868,
"grad_norm": 97787.40625,
"learning_rate": 9.509567815634446e-07,
"loss": 2.1302,
"step": 251000
},
{
"epoch": 1.8793712531782016,
"grad_norm": 119866.546875,
"learning_rate": 8.948417106524554e-07,
"loss": 2.1391,
"step": 251500
},
{
"epoch": 1.8831075665951162,
"grad_norm": 101651.5234375,
"learning_rate": 8.404180816609819e-07,
"loss": 2.1628,
"step": 252000
},
{
"epoch": 1.886843880012031,
"grad_norm": 103326.453125,
"learning_rate": 7.876877692272799e-07,
"loss": 2.1341,
"step": 252500
},
{
"epoch": 1.8905801934289457,
"grad_norm": 100837.203125,
"learning_rate": 7.366525896627841e-07,
"loss": 2.125,
"step": 253000
},
{
"epoch": 1.8943165068458603,
"grad_norm": 101825.6015625,
"learning_rate": 6.873143008895633e-07,
"loss": 2.116,
"step": 253500
},
{
"epoch": 1.8980528202627749,
"grad_norm": 84607.0234375,
"learning_rate": 6.396746023798139e-07,
"loss": 2.1599,
"step": 254000
},
{
"epoch": 1.9017891336796895,
"grad_norm": 85267.1328125,
"learning_rate": 5.937351350972287e-07,
"loss": 2.1605,
"step": 254500
},
{
"epoch": 1.905525447096604,
"grad_norm": 112646.515625,
"learning_rate": 5.49497481440564e-07,
"loss": 2.1479,
"step": 255000
},
{
"epoch": 1.909261760513519,
"grad_norm": 101483.7578125,
"learning_rate": 5.069631651890783e-07,
"loss": 2.1306,
"step": 255500
},
{
"epoch": 1.9129980739304335,
"grad_norm": 104185.2265625,
"learning_rate": 4.661336514500625e-07,
"loss": 2.1222,
"step": 256000
},
{
"epoch": 1.9167343873473484,
"grad_norm": 83210.1484375,
"learning_rate": 4.270103466083697e-07,
"loss": 2.1573,
"step": 256500
},
{
"epoch": 1.920470700764263,
"grad_norm": 109216.5703125,
"learning_rate": 3.895945982779925e-07,
"loss": 2.166,
"step": 257000
},
{
"epoch": 1.9242070141811776,
"grad_norm": 100591.0546875,
"learning_rate": 3.5388769525560585e-07,
"loss": 2.1291,
"step": 257500
},
{
"epoch": 1.9279433275980922,
"grad_norm": 98919.7421875,
"learning_rate": 3.19890867476208e-07,
"loss": 2.1449,
"step": 258000
},
{
"epoch": 1.9316796410150068,
"grad_norm": 90432.4765625,
"learning_rate": 2.876052859707379e-07,
"loss": 2.1195,
"step": 258500
},
{
"epoch": 1.9354159544319216,
"grad_norm": 94623.453125,
"learning_rate": 2.570320628257461e-07,
"loss": 2.1409,
"step": 259000
},
{
"epoch": 1.9391522678488362,
"grad_norm": 112643.9765625,
"learning_rate": 2.2817225114507567e-07,
"loss": 2.1521,
"step": 259500
},
{
"epoch": 1.942888581265751,
"grad_norm": 83827.5078125,
"learning_rate": 2.010268450136077e-07,
"loss": 2.1273,
"step": 260000
},
{
"epoch": 1.9466248946826656,
"grad_norm": 97442.9453125,
"learning_rate": 1.7559677946300533e-07,
"loss": 2.1454,
"step": 260500
},
{
"epoch": 1.9503612080995802,
"grad_norm": 92114.421875,
"learning_rate": 1.518829304395175e-07,
"loss": 2.127,
"step": 261000
},
{
"epoch": 1.9540975215164949,
"grad_norm": 95378.5078125,
"learning_rate": 1.2988611477378066e-07,
"loss": 2.1213,
"step": 261500
},
{
"epoch": 1.9578338349334095,
"grad_norm": 87441.7890625,
"learning_rate": 1.0960709015272464e-07,
"loss": 2.1427,
"step": 262000
},
{
"epoch": 1.961570148350324,
"grad_norm": 87605.1640625,
"learning_rate": 9.104655509343252e-08,
"loss": 2.1541,
"step": 262500
},
{
"epoch": 1.9653064617672389,
"grad_norm": 101794.53125,
"learning_rate": 7.420514891911535e-08,
"loss": 2.152,
"step": 263000
},
{
"epoch": 1.9690427751841535,
"grad_norm": 94773.6328125,
"learning_rate": 5.908345173705754e-08,
"loss": 2.1471,
"step": 263500
},
{
"epoch": 1.9727790886010683,
"grad_norm": 105178.625,
"learning_rate": 4.5681984418666224e-08,
"loss": 2.1364,
"step": 264000
},
{
"epoch": 1.976515402017983,
"grad_norm": 97559.484375,
"learning_rate": 3.400120858149669e-08,
"loss": 2.17,
"step": 264500
},
{
"epoch": 1.9802517154348975,
"grad_norm": 120999.3125,
"learning_rate": 2.404152657338732e-08,
"loss": 2.1488,
"step": 265000
},
{
"epoch": 1.9839880288518121,
"grad_norm": 117999.5859375,
"learning_rate": 1.5803281458570684e-08,
"loss": 2.1481,
"step": 265500
},
{
"epoch": 1.9877243422687267,
"grad_norm": 94607.5078125,
"learning_rate": 9.286757005871893e-09,
"loss": 2.1104,
"step": 266000
},
{
"epoch": 1.9914606556856416,
"grad_norm": 94297.2109375,
"learning_rate": 4.492177678938614e-09,
"loss": 2.1349,
"step": 266500
},
{
"epoch": 1.9951969691025562,
"grad_norm": 95649.546875,
"learning_rate": 1.4197086284917227e-09,
"loss": 2.1224,
"step": 267000
},
{
"epoch": 1.998933282519471,
"grad_norm": 116585.421875,
"learning_rate": 6.945568664651347e-11,
"loss": 2.1757,
"step": 267500
}
],
"logging_steps": 500,
"max_steps": 267642,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.913150854495206e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}