{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9999943955298747, "eval_steps": 500, "global_step": 267642, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0037363134169146645, "grad_norm": 72942.6796875, "learning_rate": 9.999655549964018e-05, "loss": 4.1478, "step": 500 }, { "epoch": 0.007472626833829329, "grad_norm": 67110.5, "learning_rate": 9.998622247314402e-05, "loss": 3.6349, "step": 1000 }, { "epoch": 0.011208940250743994, "grad_norm": 73347.7890625, "learning_rate": 9.996900234419608e-05, "loss": 3.4653, "step": 1500 }, { "epoch": 0.014945253667658658, "grad_norm": 70530.5546875, "learning_rate": 9.994489748538595e-05, "loss": 3.3608, "step": 2000 }, { "epoch": 0.018681567084573324, "grad_norm": 69644.8515625, "learning_rate": 9.991391121788143e-05, "loss": 3.2989, "step": 2500 }, { "epoch": 0.022417880501487988, "grad_norm": 79574.8203125, "learning_rate": 9.987604781097091e-05, "loss": 3.1788, "step": 3000 }, { "epoch": 0.02615419391840265, "grad_norm": 77448.375, "learning_rate": 9.983131248147513e-05, "loss": 3.1802, "step": 3500 }, { "epoch": 0.029890507335317316, "grad_norm": 82995.328125, "learning_rate": 9.977971139302844e-05, "loss": 3.1495, "step": 4000 }, { "epoch": 0.03362682075223198, "grad_norm": 77637.0625, "learning_rate": 9.972125165522955e-05, "loss": 3.0534, "step": 4500 }, { "epoch": 0.03736313416914665, "grad_norm": 71801.3515625, "learning_rate": 9.965594132266197e-05, "loss": 3.0471, "step": 5000 }, { "epoch": 0.04109944758606131, "grad_norm": 73166.8125, "learning_rate": 9.958378939378425e-05, "loss": 3.0568, "step": 5500 }, { "epoch": 0.044835761002975975, "grad_norm": 82521.0390625, "learning_rate": 9.950480580969022e-05, "loss": 2.9987, "step": 6000 }, { "epoch": 0.048572074419890636, "grad_norm": 73557.609375, "learning_rate": 9.941900145273922e-05, "loss": 2.9518, "step": 6500 }, { "epoch": 0.0523083878368053, "grad_norm": 76367.0078125, "learning_rate": 9.932638814505676e-05, "loss": 2.9528, "step": 7000 }, { "epoch": 0.056044701253719964, "grad_norm": 80728.796875, "learning_rate": 9.922697864690573e-05, "loss": 2.9501, "step": 7500 }, { "epoch": 0.05978101467063463, "grad_norm": 71664.421875, "learning_rate": 9.912078665492822e-05, "loss": 2.9449, "step": 8000 }, { "epoch": 0.0635173280875493, "grad_norm": 75999.953125, "learning_rate": 9.90078268002584e-05, "loss": 2.8971, "step": 8500 }, { "epoch": 0.06725364150446396, "grad_norm": 67755.296875, "learning_rate": 9.888811464650668e-05, "loss": 2.8525, "step": 9000 }, { "epoch": 0.07098995492137862, "grad_norm": 76703.8203125, "learning_rate": 9.876166668761532e-05, "loss": 2.8725, "step": 9500 }, { "epoch": 0.0747262683382933, "grad_norm": 74384.6640625, "learning_rate": 9.862850034558593e-05, "loss": 2.8761, "step": 10000 }, { "epoch": 0.07846258175520796, "grad_norm": 59161.4453125, "learning_rate": 9.848863396807901e-05, "loss": 2.8503, "step": 10500 }, { "epoch": 0.08219889517212262, "grad_norm": 66133.3359375, "learning_rate": 9.834208682588608e-05, "loss": 2.8198, "step": 11000 }, { "epoch": 0.08593520858903728, "grad_norm": 65402.43359375, "learning_rate": 9.81888791102745e-05, "loss": 2.8278, "step": 11500 }, { "epoch": 0.08967152200595195, "grad_norm": 73789.2578125, "learning_rate": 9.802903193020554e-05, "loss": 2.808, "step": 12000 }, { "epoch": 0.09340783542286661, "grad_norm": 71568.953125, "learning_rate": 9.786256730942595e-05, "loss": 2.7821, "step": 12500 }, { "epoch": 0.09714414883978127, "grad_norm": 68736.6953125, "learning_rate": 9.768950818343359e-05, "loss": 2.7827, "step": 13000 }, { "epoch": 0.10088046225669595, "grad_norm": 73235.984375, "learning_rate": 9.750987839631733e-05, "loss": 2.797, "step": 13500 }, { "epoch": 0.1046167756736106, "grad_norm": 72581.59375, "learning_rate": 9.732370269747181e-05, "loss": 2.7515, "step": 14000 }, { "epoch": 0.10835308909052527, "grad_norm": 77676.46875, "learning_rate": 9.713100673818754e-05, "loss": 2.7451, "step": 14500 }, { "epoch": 0.11208940250743993, "grad_norm": 78359.078125, "learning_rate": 9.693181706811652e-05, "loss": 2.7544, "step": 15000 }, { "epoch": 0.1158257159243546, "grad_norm": 67356.484375, "learning_rate": 9.672616113161434e-05, "loss": 2.7054, "step": 15500 }, { "epoch": 0.11956202934126926, "grad_norm": 76034.65625, "learning_rate": 9.651406726395895e-05, "loss": 2.7419, "step": 16000 }, { "epoch": 0.12329834275818392, "grad_norm": 77247.296875, "learning_rate": 9.629556468744647e-05, "loss": 2.7274, "step": 16500 }, { "epoch": 0.1270346561750986, "grad_norm": 90000.84375, "learning_rate": 9.607068350736501e-05, "loss": 2.7118, "step": 17000 }, { "epoch": 0.13077096959201326, "grad_norm": 82970.1953125, "learning_rate": 9.583945470784683e-05, "loss": 2.7036, "step": 17500 }, { "epoch": 0.13450728300892792, "grad_norm": 70716.7578125, "learning_rate": 9.560191014759922e-05, "loss": 2.6936, "step": 18000 }, { "epoch": 0.13824359642584258, "grad_norm": 69805.4140625, "learning_rate": 9.535808255551515e-05, "loss": 2.6633, "step": 18500 }, { "epoch": 0.14197990984275724, "grad_norm": 69154.890625, "learning_rate": 9.510800552616374e-05, "loss": 2.6765, "step": 19000 }, { "epoch": 0.1457162232596719, "grad_norm": 70633.8515625, "learning_rate": 9.485171351516171e-05, "loss": 2.6623, "step": 19500 }, { "epoch": 0.1494525366765866, "grad_norm": 85893.5390625, "learning_rate": 9.458924183442602e-05, "loss": 2.6699, "step": 20000 }, { "epoch": 0.15318885009350125, "grad_norm": 72002.125, "learning_rate": 9.43206266473086e-05, "loss": 2.6461, "step": 20500 }, { "epoch": 0.1569251635104159, "grad_norm": 78129.734375, "learning_rate": 9.404590496361382e-05, "loss": 2.6687, "step": 21000 }, { "epoch": 0.16066147692733057, "grad_norm": 64102.87890625, "learning_rate": 9.376511463449922e-05, "loss": 2.659, "step": 21500 }, { "epoch": 0.16439779034424523, "grad_norm": 61640.66796875, "learning_rate": 9.347829434726035e-05, "loss": 2.6069, "step": 22000 }, { "epoch": 0.1681341037611599, "grad_norm": 77315.6484375, "learning_rate": 9.318548362000054e-05, "loss": 2.6231, "step": 22500 }, { "epoch": 0.17187041717807455, "grad_norm": 62667.19921875, "learning_rate": 9.288672279618603e-05, "loss": 2.6435, "step": 23000 }, { "epoch": 0.17560673059498924, "grad_norm": 72954.6015625, "learning_rate": 9.258205303908738e-05, "loss": 2.6336, "step": 23500 }, { "epoch": 0.1793430440119039, "grad_norm": 71380.78125, "learning_rate": 9.227151632610811e-05, "loss": 2.6389, "step": 24000 }, { "epoch": 0.18307935742881856, "grad_norm": 72969.21875, "learning_rate": 9.195515544300105e-05, "loss": 2.6162, "step": 24500 }, { "epoch": 0.18681567084573322, "grad_norm": 70472.6171875, "learning_rate": 9.163301397797319e-05, "loss": 2.6384, "step": 25000 }, { "epoch": 0.19055198426264788, "grad_norm": 74207.796875, "learning_rate": 9.130513631568021e-05, "loss": 2.5962, "step": 25500 }, { "epoch": 0.19428829767956254, "grad_norm": 66946.59375, "learning_rate": 9.097156763111118e-05, "loss": 2.607, "step": 26000 }, { "epoch": 0.1980246110964772, "grad_norm": 75940.734375, "learning_rate": 9.063235388336423e-05, "loss": 2.5973, "step": 26500 }, { "epoch": 0.2017609245133919, "grad_norm": 63575.37890625, "learning_rate": 9.028754180931441e-05, "loss": 2.6065, "step": 27000 }, { "epoch": 0.20549723793030655, "grad_norm": 79057.234375, "learning_rate": 8.993717891717425e-05, "loss": 2.5717, "step": 27500 }, { "epoch": 0.2092335513472212, "grad_norm": 72744.7578125, "learning_rate": 8.958131347994808e-05, "loss": 2.6101, "step": 28000 }, { "epoch": 0.21296986476413587, "grad_norm": 77414.359375, "learning_rate": 8.921999452878093e-05, "loss": 2.5807, "step": 28500 }, { "epoch": 0.21670617818105053, "grad_norm": 75271.28125, "learning_rate": 8.885327184620314e-05, "loss": 2.5952, "step": 29000 }, { "epoch": 0.2204424915979652, "grad_norm": 69727.734375, "learning_rate": 8.848119595927116e-05, "loss": 2.587, "step": 29500 }, { "epoch": 0.22417880501487986, "grad_norm": 83907.5859375, "learning_rate": 8.810381813260606e-05, "loss": 2.5377, "step": 30000 }, { "epoch": 0.22791511843179454, "grad_norm": 93582.2265625, "learning_rate": 8.772119036133022e-05, "loss": 2.5808, "step": 30500 }, { "epoch": 0.2316514318487092, "grad_norm": 69250.0703125, "learning_rate": 8.733336536390349e-05, "loss": 2.554, "step": 31000 }, { "epoch": 0.23538774526562387, "grad_norm": 72463.203125, "learning_rate": 8.69403965748596e-05, "loss": 2.5899, "step": 31500 }, { "epoch": 0.23912405868253853, "grad_norm": 71343.4765625, "learning_rate": 8.654233813744392e-05, "loss": 2.5627, "step": 32000 }, { "epoch": 0.2428603720994532, "grad_norm": 78745.25, "learning_rate": 8.613924489615373e-05, "loss": 2.5353, "step": 32500 }, { "epoch": 0.24659668551636785, "grad_norm": 81588.1875, "learning_rate": 8.573117238918161e-05, "loss": 2.5696, "step": 33000 }, { "epoch": 0.25033299893328254, "grad_norm": 66313.8984375, "learning_rate": 8.53181768407634e-05, "loss": 2.5745, "step": 33500 }, { "epoch": 0.2540693123501972, "grad_norm": 70478.8046875, "learning_rate": 8.490031515343177e-05, "loss": 2.5485, "step": 34000 }, { "epoch": 0.25780562576711186, "grad_norm": 81928.53125, "learning_rate": 8.447764490017597e-05, "loss": 2.4977, "step": 34500 }, { "epoch": 0.2615419391840265, "grad_norm": 65777.4921875, "learning_rate": 8.405022431650959e-05, "loss": 2.5664, "step": 35000 }, { "epoch": 0.2652782526009412, "grad_norm": 81822.8828125, "learning_rate": 8.361811229244682e-05, "loss": 2.5249, "step": 35500 }, { "epoch": 0.26901456601785584, "grad_norm": 66773.1328125, "learning_rate": 8.318136836438853e-05, "loss": 2.5385, "step": 36000 }, { "epoch": 0.2727508794347705, "grad_norm": 78679.25, "learning_rate": 8.274005270691942e-05, "loss": 2.499, "step": 36500 }, { "epoch": 0.27648719285168516, "grad_norm": 68307.140625, "learning_rate": 8.229422612451711e-05, "loss": 2.4764, "step": 37000 }, { "epoch": 0.2802235062685998, "grad_norm": 67303.9453125, "learning_rate": 8.184395004317458e-05, "loss": 2.5165, "step": 37500 }, { "epoch": 0.2839598196855145, "grad_norm": 69105.96875, "learning_rate": 8.138928650193676e-05, "loss": 2.5159, "step": 38000 }, { "epoch": 0.28769613310242914, "grad_norm": 75505.984375, "learning_rate": 8.093029814435293e-05, "loss": 2.5268, "step": 38500 }, { "epoch": 0.2914324465193438, "grad_norm": 80771.171875, "learning_rate": 8.046704820984558e-05, "loss": 2.5296, "step": 39000 }, { "epoch": 0.2951687599362585, "grad_norm": 71207.0546875, "learning_rate": 7.999960052499737e-05, "loss": 2.518, "step": 39500 }, { "epoch": 0.2989050733531732, "grad_norm": 69106.34375, "learning_rate": 7.952801949475706e-05, "loss": 2.4906, "step": 40000 }, { "epoch": 0.30264138677008784, "grad_norm": 82679.9296875, "learning_rate": 7.905237009356578e-05, "loss": 2.4743, "step": 40500 }, { "epoch": 0.3063777001870025, "grad_norm": 84038.28125, "learning_rate": 7.857271785640486e-05, "loss": 2.4726, "step": 41000 }, { "epoch": 0.31011401360391716, "grad_norm": 75876.3203125, "learning_rate": 7.808912886976641e-05, "loss": 2.5081, "step": 41500 }, { "epoch": 0.3138503270208318, "grad_norm": 78494.3046875, "learning_rate": 7.760166976254804e-05, "loss": 2.4795, "step": 42000 }, { "epoch": 0.3175866404377465, "grad_norm": 68793.765625, "learning_rate": 7.711040769687248e-05, "loss": 2.4883, "step": 42500 }, { "epoch": 0.32132295385466114, "grad_norm": 78543.03125, "learning_rate": 7.661541035883427e-05, "loss": 2.4752, "step": 43000 }, { "epoch": 0.3250592672715758, "grad_norm": 78304.4140625, "learning_rate": 7.611674594917376e-05, "loss": 2.4747, "step": 43500 }, { "epoch": 0.32879558068849046, "grad_norm": 77926.90625, "learning_rate": 7.561448317388045e-05, "loss": 2.4666, "step": 44000 }, { "epoch": 0.3325318941054051, "grad_norm": 76948.8671875, "learning_rate": 7.510869123472679e-05, "loss": 2.4724, "step": 44500 }, { "epoch": 0.3362682075223198, "grad_norm": 75918.4921875, "learning_rate": 7.459943981973343e-05, "loss": 2.4476, "step": 45000 }, { "epoch": 0.34000452093923444, "grad_norm": 66944.8125, "learning_rate": 7.408679909356764e-05, "loss": 2.4982, "step": 45500 }, { "epoch": 0.3437408343561491, "grad_norm": 68671.390625, "learning_rate": 7.357083968787606e-05, "loss": 2.4996, "step": 46000 }, { "epoch": 0.3474771477730638, "grad_norm": 74974.25, "learning_rate": 7.305163269155305e-05, "loss": 2.44, "step": 46500 }, { "epoch": 0.3512134611899785, "grad_norm": 83594.984375, "learning_rate": 7.2529249640946e-05, "loss": 2.466, "step": 47000 }, { "epoch": 0.35494977460689314, "grad_norm": 67277.0546875, "learning_rate": 7.200376250999917e-05, "loss": 2.4398, "step": 47500 }, { "epoch": 0.3586860880238078, "grad_norm": 64984.64453125, "learning_rate": 7.1475243700337e-05, "loss": 2.4536, "step": 48000 }, { "epoch": 0.36242240144072246, "grad_norm": 76212.953125, "learning_rate": 7.094376603128872e-05, "loss": 2.4643, "step": 48500 }, { "epoch": 0.3661587148576371, "grad_norm": 78486.7109375, "learning_rate": 7.040940272985521e-05, "loss": 2.4468, "step": 49000 }, { "epoch": 0.3698950282745518, "grad_norm": 90899.328125, "learning_rate": 6.987222742061982e-05, "loss": 2.458, "step": 49500 }, { "epoch": 0.37363134169146645, "grad_norm": 69692.6328125, "learning_rate": 6.933231411560442e-05, "loss": 2.4331, "step": 50000 }, { "epoch": 0.3773676551083811, "grad_norm": 74715.71875, "learning_rate": 6.878973720407191e-05, "loss": 2.4379, "step": 50500 }, { "epoch": 0.38110396852529577, "grad_norm": 72629.9140625, "learning_rate": 6.824457144227698e-05, "loss": 2.4424, "step": 51000 }, { "epoch": 0.3848402819422104, "grad_norm": 74704.6328125, "learning_rate": 6.769689194316615e-05, "loss": 2.4905, "step": 51500 }, { "epoch": 0.3885765953591251, "grad_norm": 75688.8125, "learning_rate": 6.714677416602868e-05, "loss": 2.4413, "step": 52000 }, { "epoch": 0.39231290877603975, "grad_norm": 75325.578125, "learning_rate": 6.659429390609984e-05, "loss": 2.4505, "step": 52500 }, { "epoch": 0.3960492221929544, "grad_norm": 70411.6953125, "learning_rate": 6.603952728411776e-05, "loss": 2.4358, "step": 53000 }, { "epoch": 0.3997855356098691, "grad_norm": 71785.53125, "learning_rate": 6.54825507358356e-05, "loss": 2.4271, "step": 53500 }, { "epoch": 0.4035218490267838, "grad_norm": 76337.5703125, "learning_rate": 6.492344100149026e-05, "loss": 2.4536, "step": 54000 }, { "epoch": 0.40725816244369845, "grad_norm": 65393.2890625, "learning_rate": 6.436227511522892e-05, "loss": 2.4404, "step": 54500 }, { "epoch": 0.4109944758606131, "grad_norm": 88779.578125, "learning_rate": 6.379913039449547e-05, "loss": 2.4277, "step": 55000 }, { "epoch": 0.41473078927752777, "grad_norm": 70318.0546875, "learning_rate": 6.32340844293777e-05, "loss": 2.4209, "step": 55500 }, { "epoch": 0.4184671026944424, "grad_norm": 77500.03125, "learning_rate": 6.266721507191674e-05, "loss": 2.4218, "step": 56000 }, { "epoch": 0.4222034161113571, "grad_norm": 89454.5703125, "learning_rate": 6.209860042538087e-05, "loss": 2.409, "step": 56500 }, { "epoch": 0.42593972952827175, "grad_norm": 72870.5, "learning_rate": 6.152831883350426e-05, "loss": 2.3841, "step": 57000 }, { "epoch": 0.4296760429451864, "grad_norm": 75137.0703125, "learning_rate": 6.095644886969283e-05, "loss": 2.3922, "step": 57500 }, { "epoch": 0.43341235636210107, "grad_norm": 64499.1328125, "learning_rate": 6.038306932619846e-05, "loss": 2.4042, "step": 58000 }, { "epoch": 0.43714866977901573, "grad_norm": 74131.78125, "learning_rate": 5.980825920326287e-05, "loss": 2.3981, "step": 58500 }, { "epoch": 0.4408849831959304, "grad_norm": 75569.671875, "learning_rate": 5.923209769823308e-05, "loss": 2.4412, "step": 59000 }, { "epoch": 0.44462129661284505, "grad_norm": 94198.984375, "learning_rate": 5.865466419464957e-05, "loss": 2.4383, "step": 59500 }, { "epoch": 0.4483576100297597, "grad_norm": 72894.984375, "learning_rate": 5.807603825130873e-05, "loss": 2.3944, "step": 60000 }, { "epoch": 0.45209392344667443, "grad_norm": 75193.0234375, "learning_rate": 5.7496299591301406e-05, "loss": 2.3847, "step": 60500 }, { "epoch": 0.4558302368635891, "grad_norm": 115278.234375, "learning_rate": 5.6915528091028494e-05, "loss": 2.3914, "step": 61000 }, { "epoch": 0.45956655028050375, "grad_norm": 80114.7578125, "learning_rate": 5.633380376919567e-05, "loss": 2.394, "step": 61500 }, { "epoch": 0.4633028636974184, "grad_norm": 77552.9921875, "learning_rate": 5.5751206775788354e-05, "loss": 2.3919, "step": 62000 }, { "epoch": 0.46703917711433307, "grad_norm": 81552.4375, "learning_rate": 5.516781738102871e-05, "loss": 2.396, "step": 62500 }, { "epoch": 0.47077549053124773, "grad_norm": 89432.1015625, "learning_rate": 5.458371596431592e-05, "loss": 2.4214, "step": 63000 }, { "epoch": 0.4745118039481624, "grad_norm": 90061.796875, "learning_rate": 5.3998983003151595e-05, "loss": 2.3796, "step": 63500 }, { "epoch": 0.47824811736507705, "grad_norm": 84587.1953125, "learning_rate": 5.341369906205154e-05, "loss": 2.39, "step": 64000 }, { "epoch": 0.4819844307819917, "grad_norm": 63710.203125, "learning_rate": 5.2827944781445583e-05, "loss": 2.4094, "step": 64500 }, { "epoch": 0.4857207441989064, "grad_norm": 69039.109375, "learning_rate": 5.224180086656693e-05, "loss": 2.3963, "step": 65000 }, { "epoch": 0.48945705761582103, "grad_norm": 90888.671875, "learning_rate": 5.165534807633261e-05, "loss": 2.3701, "step": 65500 }, { "epoch": 0.4931933710327357, "grad_norm": 75439.6875, "learning_rate": 5.1068667212216527e-05, "loss": 2.3588, "step": 66000 }, { "epoch": 0.49692968444965036, "grad_norm": 77958.296875, "learning_rate": 5.048183910711655e-05, "loss": 2.3613, "step": 66500 }, { "epoch": 0.5006659978665651, "grad_norm": 60339.5859375, "learning_rate": 4.9894944614217454e-05, "loss": 2.3501, "step": 67000 }, { "epoch": 0.5044023112834797, "grad_norm": 88602.6015625, "learning_rate": 4.930806459585094e-05, "loss": 2.408, "step": 67500 }, { "epoch": 0.5081386247003944, "grad_norm": 82704.09375, "learning_rate": 4.872127991235435e-05, "loss": 2.3974, "step": 68000 }, { "epoch": 0.511874938117309, "grad_norm": 69181.0625, "learning_rate": 4.8134671410929855e-05, "loss": 2.337, "step": 68500 }, { "epoch": 0.5156112515342237, "grad_norm": 83461.8984375, "learning_rate": 4.75483199145052e-05, "loss": 2.3468, "step": 69000 }, { "epoch": 0.5193475649511383, "grad_norm": 70060.1875, "learning_rate": 4.6962306210598024e-05, "loss": 2.3686, "step": 69500 }, { "epoch": 0.523083878368053, "grad_norm": 80763.5703125, "learning_rate": 4.637671104018486e-05, "loss": 2.3697, "step": 70000 }, { "epoch": 0.5268201917849676, "grad_norm": 75881.0, "learning_rate": 4.5791615086576726e-05, "loss": 2.3789, "step": 70500 }, { "epoch": 0.5305565052018824, "grad_norm": 68510.5390625, "learning_rate": 4.5207098964302556e-05, "loss": 2.3812, "step": 71000 }, { "epoch": 0.5342928186187971, "grad_norm": 90292.1953125, "learning_rate": 4.462324320800205e-05, "loss": 2.3558, "step": 71500 }, { "epoch": 0.5380291320357117, "grad_norm": 69267.453125, "learning_rate": 4.4040128261329724e-05, "loss": 2.3751, "step": 72000 }, { "epoch": 0.5417654454526264, "grad_norm": 85016.8203125, "learning_rate": 4.345783446587135e-05, "loss": 2.3635, "step": 72500 }, { "epoch": 0.545501758869541, "grad_norm": 103424.7109375, "learning_rate": 4.2876442050074436e-05, "loss": 2.3367, "step": 73000 }, { "epoch": 0.5492380722864557, "grad_norm": 83033.2265625, "learning_rate": 4.229603111819437e-05, "loss": 2.371, "step": 73500 }, { "epoch": 0.5529743857033703, "grad_norm": 89505.2890625, "learning_rate": 4.1716681639257725e-05, "loss": 2.3537, "step": 74000 }, { "epoch": 0.556710699120285, "grad_norm": 86427.5390625, "learning_rate": 4.113847343604404e-05, "loss": 2.3518, "step": 74500 }, { "epoch": 0.5604470125371996, "grad_norm": 95231.3203125, "learning_rate": 4.05614861740879e-05, "loss": 2.3475, "step": 75000 }, { "epoch": 0.5641833259541144, "grad_norm": 73089.09375, "learning_rate": 3.998579935070253e-05, "loss": 2.3467, "step": 75500 }, { "epoch": 0.567919639371029, "grad_norm": 82384.2109375, "learning_rate": 3.941149228402674e-05, "loss": 2.3278, "step": 76000 }, { "epoch": 0.5716559527879437, "grad_norm": 84582.859375, "learning_rate": 3.883864410209648e-05, "loss": 2.3535, "step": 76500 }, { "epoch": 0.5753922662048583, "grad_norm": 69513.640625, "learning_rate": 3.826733373194247e-05, "loss": 2.3257, "step": 77000 }, { "epoch": 0.579128579621773, "grad_norm": 73027.9453125, "learning_rate": 3.769763988871572e-05, "loss": 2.3447, "step": 77500 }, { "epoch": 0.5828648930386876, "grad_norm": 71039.1171875, "learning_rate": 3.712964106484218e-05, "loss": 2.3147, "step": 78000 }, { "epoch": 0.5866012064556023, "grad_norm": 80346.1796875, "learning_rate": 3.6563415519207985e-05, "loss": 2.3144, "step": 78500 }, { "epoch": 0.590337519872517, "grad_norm": 84110.34375, "learning_rate": 3.599904126637692e-05, "loss": 2.3333, "step": 79000 }, { "epoch": 0.5940738332894316, "grad_norm": 82063.09375, "learning_rate": 3.54365960658417e-05, "loss": 2.3283, "step": 79500 }, { "epoch": 0.5978101467063464, "grad_norm": 84510.8984375, "learning_rate": 3.487615741131013e-05, "loss": 2.3397, "step": 80000 }, { "epoch": 0.601546460123261, "grad_norm": 74272.46875, "learning_rate": 3.4317802520028117e-05, "loss": 2.326, "step": 80500 }, { "epoch": 0.6052827735401757, "grad_norm": 73607.890625, "learning_rate": 3.37616083221406e-05, "loss": 2.3238, "step": 81000 }, { "epoch": 0.6090190869570903, "grad_norm": 84208.0078125, "learning_rate": 3.320765145009216e-05, "loss": 2.3112, "step": 81500 }, { "epoch": 0.612755400374005, "grad_norm": 80192.90625, "learning_rate": 3.265600822806866e-05, "loss": 2.3361, "step": 82000 }, { "epoch": 0.6164917137909196, "grad_norm": 83479.984375, "learning_rate": 3.2106754661481086e-05, "loss": 2.3383, "step": 82500 }, { "epoch": 0.6202280272078343, "grad_norm": 88273.1015625, "learning_rate": 3.1559966426493804e-05, "loss": 2.3059, "step": 83000 }, { "epoch": 0.6239643406247489, "grad_norm": 98511.15625, "learning_rate": 3.101571885959766e-05, "loss": 2.3113, "step": 83500 }, { "epoch": 0.6277006540416636, "grad_norm": 76336.8046875, "learning_rate": 3.047408694723032e-05, "loss": 2.3254, "step": 84000 }, { "epoch": 0.6314369674585782, "grad_norm": 80944.7734375, "learning_rate": 2.99351453154444e-05, "loss": 2.3428, "step": 84500 }, { "epoch": 0.635173280875493, "grad_norm": 72115.8125, "learning_rate": 2.9398968219625706e-05, "loss": 2.3156, "step": 85000 }, { "epoch": 0.6389095942924077, "grad_norm": 87770.3125, "learning_rate": 2.8865629534262216e-05, "loss": 2.3041, "step": 85500 }, { "epoch": 0.6426459077093223, "grad_norm": 80694.6171875, "learning_rate": 2.83352027427657e-05, "loss": 2.3478, "step": 86000 }, { "epoch": 0.646382221126237, "grad_norm": 77460.1171875, "learning_rate": 2.7807760927347102e-05, "loss": 2.302, "step": 86500 }, { "epoch": 0.6501185345431516, "grad_norm": 82131.8671875, "learning_rate": 2.7283376758947333e-05, "loss": 2.3156, "step": 87000 }, { "epoch": 0.6538548479600663, "grad_norm": 90870.65625, "learning_rate": 2.6762122487224684e-05, "loss": 2.331, "step": 87500 }, { "epoch": 0.6575911613769809, "grad_norm": 98921.2109375, "learning_rate": 2.624406993060019e-05, "loss": 2.3073, "step": 88000 }, { "epoch": 0.6613274747938956, "grad_norm": 73837.2109375, "learning_rate": 2.5729290466362565e-05, "loss": 2.301, "step": 88500 }, { "epoch": 0.6650637882108102, "grad_norm": 82211.34375, "learning_rate": 2.521785502083384e-05, "loss": 2.3103, "step": 89000 }, { "epoch": 0.668800101627725, "grad_norm": 86962.1484375, "learning_rate": 2.4709834059597026e-05, "loss": 2.3154, "step": 89500 }, { "epoch": 0.6725364150446396, "grad_norm": 79155.3515625, "learning_rate": 2.420529757778746e-05, "loss": 2.348, "step": 90000 }, { "epoch": 0.6762727284615543, "grad_norm": 97560.8125, "learning_rate": 2.3704315090448886e-05, "loss": 2.3417, "step": 90500 }, { "epoch": 0.6800090418784689, "grad_norm": 124897.3203125, "learning_rate": 2.3206955622955623e-05, "loss": 2.3122, "step": 91000 }, { "epoch": 0.6837453552953836, "grad_norm": 88038.5625, "learning_rate": 2.271328770150227e-05, "loss": 2.311, "step": 91500 }, { "epoch": 0.6874816687122982, "grad_norm": 89480.2890625, "learning_rate": 2.222337934366214e-05, "loss": 2.3002, "step": 92000 }, { "epoch": 0.6912179821292129, "grad_norm": 81038.4921875, "learning_rate": 2.173729804901581e-05, "loss": 2.3231, "step": 92500 }, { "epoch": 0.6949542955461276, "grad_norm": 91312.8984375, "learning_rate": 2.1255110789851084e-05, "loss": 2.3125, "step": 93000 }, { "epoch": 0.6986906089630422, "grad_norm": 84064.609375, "learning_rate": 2.0776884001935437e-05, "loss": 2.3379, "step": 93500 }, { "epoch": 0.702426922379957, "grad_norm": 87531.078125, "learning_rate": 2.0302683575362608e-05, "loss": 2.2962, "step": 94000 }, { "epoch": 0.7061632357968716, "grad_norm": 84962.34375, "learning_rate": 1.983257484547421e-05, "loss": 2.3032, "step": 94500 }, { "epoch": 0.7098995492137863, "grad_norm": 88890.75, "learning_rate": 1.9366622583857803e-05, "loss": 2.3138, "step": 95000 }, { "epoch": 0.7136358626307009, "grad_norm": 93013.0, "learning_rate": 1.890489098942267e-05, "loss": 2.3128, "step": 95500 }, { "epoch": 0.7173721760476156, "grad_norm": 79075.3515625, "learning_rate": 1.8447443679554567e-05, "loss": 2.3013, "step": 96000 }, { "epoch": 0.7211084894645302, "grad_norm": 94954.1640625, "learning_rate": 1.7994343681350422e-05, "loss": 2.2775, "step": 96500 }, { "epoch": 0.7248448028814449, "grad_norm": 79968.1171875, "learning_rate": 1.7545653422934537e-05, "loss": 2.2979, "step": 97000 }, { "epoch": 0.7285811162983595, "grad_norm": 90937.109375, "learning_rate": 1.7101434724857147e-05, "loss": 2.3466, "step": 97500 }, { "epoch": 0.7323174297152742, "grad_norm": 85592.625, "learning_rate": 1.6661748791576855e-05, "loss": 2.3029, "step": 98000 }, { "epoch": 0.7360537431321889, "grad_norm": 95187.1484375, "learning_rate": 1.622665620302789e-05, "loss": 2.3074, "step": 98500 }, { "epoch": 0.7397900565491036, "grad_norm": 87762.859375, "learning_rate": 1.5796216906273388e-05, "loss": 2.2883, "step": 99000 }, { "epoch": 0.7435263699660182, "grad_norm": 102495.0234375, "learning_rate": 1.537049020724581e-05, "loss": 2.2852, "step": 99500 }, { "epoch": 0.7472626833829329, "grad_norm": 92540.0078125, "learning_rate": 1.4949534762575912e-05, "loss": 2.2914, "step": 100000 }, { "epoch": 0.7509989967998476, "grad_norm": 81275.109375, "learning_rate": 1.4533408571510904e-05, "loss": 2.3119, "step": 100500 }, { "epoch": 0.7547353102167622, "grad_norm": 82922.2578125, "learning_rate": 1.4122168967923366e-05, "loss": 2.3118, "step": 101000 }, { "epoch": 0.7584716236336769, "grad_norm": 107727.1171875, "learning_rate": 1.3715872612411806e-05, "loss": 2.286, "step": 101500 }, { "epoch": 0.7622079370505915, "grad_norm": 84810.40625, "learning_rate": 1.3314575484493946e-05, "loss": 2.3174, "step": 102000 }, { "epoch": 0.7659442504675062, "grad_norm": 78007.40625, "learning_rate": 1.2918332874893869e-05, "loss": 2.3229, "step": 102500 }, { "epoch": 0.7696805638844209, "grad_norm": 95875.2421875, "learning_rate": 1.252719937792397e-05, "loss": 2.2979, "step": 103000 }, { "epoch": 0.7734168773013356, "grad_norm": 83561.375, "learning_rate": 1.2141228883963118e-05, "loss": 2.2667, "step": 103500 }, { "epoch": 0.7771531907182502, "grad_norm": 73712.7578125, "learning_rate": 1.1760474572031538e-05, "loss": 2.2876, "step": 104000 }, { "epoch": 0.7808895041351649, "grad_norm": 90907.0234375, "learning_rate": 1.1384988902463806e-05, "loss": 2.2799, "step": 104500 }, { "epoch": 0.7846258175520795, "grad_norm": 84601.046875, "learning_rate": 1.101482360968088e-05, "loss": 2.3109, "step": 105000 }, { "epoch": 0.7883621309689942, "grad_norm": 89593.640625, "learning_rate": 1.0650029695062103e-05, "loss": 2.3071, "step": 105500 }, { "epoch": 0.7920984443859088, "grad_norm": 85944.5390625, "learning_rate": 1.0290657419918308e-05, "loss": 2.3275, "step": 106000 }, { "epoch": 0.7958347578028235, "grad_norm": 71977.34375, "learning_rate": 9.936756298566724e-06, "loss": 2.3108, "step": 106500 }, { "epoch": 0.7995710712197382, "grad_norm": 91883.7109375, "learning_rate": 9.588375091508945e-06, "loss": 2.2756, "step": 107000 }, { "epoch": 0.8033073846366529, "grad_norm": 81686.0078125, "learning_rate": 9.245561798712704e-06, "loss": 2.3064, "step": 107500 }, { "epoch": 0.8070436980535676, "grad_norm": 86408.875, "learning_rate": 8.908363652998414e-06, "loss": 2.3031, "step": 108000 }, { "epoch": 0.8107800114704822, "grad_norm": 83010.9765625, "learning_rate": 8.576827113531427e-06, "loss": 2.2797, "step": 108500 }, { "epoch": 0.8145163248873969, "grad_norm": 74965.84375, "learning_rate": 8.25099785942094e-06, "loss": 2.3011, "step": 109000 }, { "epoch": 0.8182526383043115, "grad_norm": 91110.0859375, "learning_rate": 7.930920783426265e-06, "loss": 2.2572, "step": 109500 }, { "epoch": 0.8219889517212262, "grad_norm": 96117.6640625, "learning_rate": 7.616639985771556e-06, "loss": 2.276, "step": 110000 }, { "epoch": 0.8257252651381408, "grad_norm": 66883.046875, "learning_rate": 7.30819876806963e-06, "loss": 2.2586, "step": 110500 }, { "epoch": 0.8294615785550555, "grad_norm": 86336.21875, "learning_rate": 7.005639627355881e-06, "loss": 2.2796, "step": 111000 }, { "epoch": 0.8331978919719701, "grad_norm": 100607.5703125, "learning_rate": 6.709004250233097e-06, "loss": 2.2635, "step": 111500 }, { "epoch": 0.8369342053888849, "grad_norm": 90908.2890625, "learning_rate": 6.418333507127788e-06, "loss": 2.2598, "step": 112000 }, { "epoch": 0.8406705188057995, "grad_norm": 101526.234375, "learning_rate": 6.133667446659131e-06, "loss": 2.2648, "step": 112500 }, { "epoch": 0.8444068322227142, "grad_norm": 87100.0234375, "learning_rate": 5.855045290121047e-06, "loss": 2.3005, "step": 113000 }, { "epoch": 0.8481431456396288, "grad_norm": 95362.3984375, "learning_rate": 5.58250542607826e-06, "loss": 2.2747, "step": 113500 }, { "epoch": 0.8518794590565435, "grad_norm": 105692.5625, "learning_rate": 5.316085405077159e-06, "loss": 2.2808, "step": 114000 }, { "epoch": 0.8556157724734582, "grad_norm": 99888.703125, "learning_rate": 5.055821934472071e-06, "loss": 2.2707, "step": 114500 }, { "epoch": 0.8593520858903728, "grad_norm": 89097.7421875, "learning_rate": 4.801750873367728e-06, "loss": 2.2996, "step": 115000 }, { "epoch": 0.8630883993072875, "grad_norm": 94061.9453125, "learning_rate": 4.553907227678594e-06, "loss": 2.2585, "step": 115500 }, { "epoch": 0.8668247127242021, "grad_norm": 89231.015625, "learning_rate": 4.312325145305729e-06, "loss": 2.2712, "step": 116000 }, { "epoch": 0.8705610261411169, "grad_norm": 92370.8125, "learning_rate": 4.077037911431913e-06, "loss": 2.277, "step": 116500 }, { "epoch": 0.8742973395580315, "grad_norm": 95057.8828125, "learning_rate": 3.84807794393563e-06, "loss": 2.2705, "step": 117000 }, { "epoch": 0.8780336529749462, "grad_norm": 76279.6015625, "learning_rate": 3.6254767889244813e-06, "loss": 2.2692, "step": 117500 }, { "epoch": 0.8817699663918608, "grad_norm": 99320.4375, "learning_rate": 3.4092651163888154e-06, "loss": 2.2841, "step": 118000 }, { "epoch": 0.8855062798087755, "grad_norm": 97605.59375, "learning_rate": 3.1994727159759986e-06, "loss": 2.2942, "step": 118500 }, { "epoch": 0.8892425932256901, "grad_norm": 87163.4765625, "learning_rate": 2.996128492885969e-06, "loss": 2.2918, "step": 119000 }, { "epoch": 0.8929789066426048, "grad_norm": 78705.2421875, "learning_rate": 2.7992604638886975e-06, "loss": 2.259, "step": 119500 }, { "epoch": 0.8967152200595194, "grad_norm": 94322.765625, "learning_rate": 2.608895753464069e-06, "loss": 2.2589, "step": 120000 }, { "epoch": 0.9004515334764341, "grad_norm": 73016.4140625, "learning_rate": 2.4250605900646204e-06, "loss": 2.2938, "step": 120500 }, { "epoch": 0.9041878468933489, "grad_norm": 90040.5546875, "learning_rate": 2.2477803025018228e-06, "loss": 2.2982, "step": 121000 }, { "epoch": 0.9079241603102635, "grad_norm": 102110.484375, "learning_rate": 2.0770793164562373e-06, "loss": 2.2578, "step": 121500 }, { "epoch": 0.9116604737271782, "grad_norm": 78370.1796875, "learning_rate": 1.9129811511121697e-06, "loss": 2.2854, "step": 122000 }, { "epoch": 0.9153967871440928, "grad_norm": 87597.2578125, "learning_rate": 1.7555084159172176e-06, "loss": 2.2657, "step": 122500 }, { "epoch": 0.9191331005610075, "grad_norm": 89568.2109375, "learning_rate": 1.6046828074670894e-06, "loss": 2.2674, "step": 123000 }, { "epoch": 0.9228694139779221, "grad_norm": 80686.5234375, "learning_rate": 1.4605251065162973e-06, "loss": 2.2634, "step": 123500 }, { "epoch": 0.9266057273948368, "grad_norm": 76351.1796875, "learning_rate": 1.3230551751149568e-06, "loss": 2.2889, "step": 124000 }, { "epoch": 0.9303420408117514, "grad_norm": 82895.4921875, "learning_rate": 1.1922919538721876e-06, "loss": 2.2188, "step": 124500 }, { "epoch": 0.9340783542286661, "grad_norm": 81276.09375, "learning_rate": 1.0682534593464967e-06, "loss": 2.268, "step": 125000 }, { "epoch": 0.9378146676455807, "grad_norm": 83888.390625, "learning_rate": 9.509567815634446e-07, "loss": 2.2762, "step": 125500 }, { "epoch": 0.9415509810624955, "grad_norm": 87934.8125, "learning_rate": 8.404180816609819e-07, "loss": 2.2759, "step": 126000 }, { "epoch": 0.9452872944794101, "grad_norm": 89569.15625, "learning_rate": 7.366525896627841e-07, "loss": 2.2696, "step": 126500 }, { "epoch": 0.9490236078963248, "grad_norm": 79137.96875, "learning_rate": 6.396746023798139e-07, "loss": 2.2951, "step": 127000 }, { "epoch": 0.9527599213132394, "grad_norm": 74010.8984375, "learning_rate": 5.49497481440564e-07, "loss": 2.2875, "step": 127500 }, { "epoch": 0.9564962347301541, "grad_norm": 70001.59375, "learning_rate": 4.661336514500625e-07, "loss": 2.2514, "step": 128000 }, { "epoch": 0.9602325481470688, "grad_norm": 86343.359375, "learning_rate": 3.895945982779925e-07, "loss": 2.3027, "step": 128500 }, { "epoch": 0.9639688615639834, "grad_norm": 92609.6171875, "learning_rate": 3.19890867476208e-07, "loss": 2.2857, "step": 129000 }, { "epoch": 0.9677051749808981, "grad_norm": 91932.953125, "learning_rate": 2.570320628257461e-07, "loss": 2.2814, "step": 129500 }, { "epoch": 0.9714414883978127, "grad_norm": 81636.1015625, "learning_rate": 2.010268450136077e-07, "loss": 2.2758, "step": 130000 }, { "epoch": 0.9751778018147275, "grad_norm": 79906.6015625, "learning_rate": 1.518829304395175e-07, "loss": 2.2944, "step": 130500 }, { "epoch": 0.9789141152316421, "grad_norm": 76946.6484375, "learning_rate": 1.0960709015272464e-07, "loss": 2.2305, "step": 131000 }, { "epoch": 0.9826504286485568, "grad_norm": 81980.828125, "learning_rate": 7.420514891911535e-08, "loss": 2.2803, "step": 131500 }, { "epoch": 0.9863867420654714, "grad_norm": 90421.125, "learning_rate": 4.5681984418666224e-08, "loss": 2.2767, "step": 132000 }, { "epoch": 0.9901230554823861, "grad_norm": 81403.0390625, "learning_rate": 2.404152657338732e-08, "loss": 2.2399, "step": 132500 }, { "epoch": 0.9938593688993007, "grad_norm": 98386.671875, "learning_rate": 9.286757005871893e-09, "loss": 2.2422, "step": 133000 }, { "epoch": 0.9975956823162154, "grad_norm": 95571.890625, "learning_rate": 1.4197086284917227e-09, "loss": 2.2956, "step": 133500 }, { "epoch": 1.0013376002032555, "grad_norm": 99771.578125, "learning_rate": 4.9894944614217454e-05, "loss": 2.211, "step": 134000 }, { "epoch": 1.00507391362017, "grad_norm": 105597.21875, "learning_rate": 4.960149774176924e-05, "loss": 2.2131, "step": 134500 }, { "epoch": 1.0088102270370847, "grad_norm": 87634.5, "learning_rate": 4.930806459585094e-05, "loss": 2.2453, "step": 135000 }, { "epoch": 1.0125465404539995, "grad_norm": 86854.4375, "learning_rate": 4.901465528385536e-05, "loss": 2.1825, "step": 135500 }, { "epoch": 1.016282853870914, "grad_norm": 75373.640625, "learning_rate": 4.872127991235435e-05, "loss": 2.1953, "step": 136000 }, { "epoch": 1.0200191672878287, "grad_norm": 89905.53125, "learning_rate": 4.842794858675067e-05, "loss": 2.223, "step": 136500 }, { "epoch": 1.0237554807047435, "grad_norm": 74529.2734375, "learning_rate": 4.8134671410929855e-05, "loss": 2.2232, "step": 137000 }, { "epoch": 1.0274917941216581, "grad_norm": 86540.96875, "learning_rate": 4.7841458486912313e-05, "loss": 2.229, "step": 137500 }, { "epoch": 1.0312281075385727, "grad_norm": 79359.9140625, "learning_rate": 4.75483199145052e-05, "loss": 2.2484, "step": 138000 }, { "epoch": 1.0349644209554874, "grad_norm": 88735.703125, "learning_rate": 4.725526579095468e-05, "loss": 2.2141, "step": 138500 }, { "epoch": 1.0387007343724022, "grad_norm": 84116.59375, "learning_rate": 4.6962306210598024e-05, "loss": 2.2362, "step": 139000 }, { "epoch": 1.0424370477893168, "grad_norm": 85959.921875, "learning_rate": 4.66694512645159e-05, "loss": 2.2703, "step": 139500 }, { "epoch": 1.0461733612062314, "grad_norm": 76734.796875, "learning_rate": 4.637671104018486e-05, "loss": 2.2324, "step": 140000 }, { "epoch": 1.049909674623146, "grad_norm": 82724.0078125, "learning_rate": 4.608409562112983e-05, "loss": 2.2406, "step": 140500 }, { "epoch": 1.0536459880400608, "grad_norm": 85404.375, "learning_rate": 4.5791615086576726e-05, "loss": 2.2049, "step": 141000 }, { "epoch": 1.0573823014569754, "grad_norm": 73961.2890625, "learning_rate": 4.549927951110542e-05, "loss": 2.2176, "step": 141500 }, { "epoch": 1.06111861487389, "grad_norm": 83664.9921875, "learning_rate": 4.5207098964302556e-05, "loss": 2.2473, "step": 142000 }, { "epoch": 1.0648549282908046, "grad_norm": 89258.0390625, "learning_rate": 4.491508351041477e-05, "loss": 2.227, "step": 142500 }, { "epoch": 1.0685912417077195, "grad_norm": 99317.4296875, "learning_rate": 4.462324320800205e-05, "loss": 2.2237, "step": 143000 }, { "epoch": 1.072327555124634, "grad_norm": 90441.765625, "learning_rate": 4.433158810959122e-05, "loss": 2.2321, "step": 143500 }, { "epoch": 1.0760638685415487, "grad_norm": 69907.328125, "learning_rate": 4.4040128261329724e-05, "loss": 2.2319, "step": 144000 }, { "epoch": 1.0798001819584635, "grad_norm": 87409.109375, "learning_rate": 4.3748873702639536e-05, "loss": 2.2581, "step": 144500 }, { "epoch": 1.083536495375378, "grad_norm": 91680.8828125, "learning_rate": 4.345783446587135e-05, "loss": 2.2381, "step": 145000 }, { "epoch": 1.0872728087922927, "grad_norm": 84339.4140625, "learning_rate": 4.316702057595907e-05, "loss": 2.2398, "step": 145500 }, { "epoch": 1.0910091222092073, "grad_norm": 99060.578125, "learning_rate": 4.2876442050074436e-05, "loss": 2.2251, "step": 146000 }, { "epoch": 1.0947454356261221, "grad_norm": 103543.2421875, "learning_rate": 4.258610889728198e-05, "loss": 2.2536, "step": 146500 }, { "epoch": 1.0984817490430367, "grad_norm": 89170.1953125, "learning_rate": 4.229603111819437e-05, "loss": 2.1822, "step": 147000 }, { "epoch": 1.1022180624599514, "grad_norm": 95436.765625, "learning_rate": 4.2006218704627756e-05, "loss": 2.2338, "step": 147500 }, { "epoch": 1.105954375876866, "grad_norm": 86420.7265625, "learning_rate": 4.1716681639257725e-05, "loss": 2.2153, "step": 148000 }, { "epoch": 1.1096906892937808, "grad_norm": 76673.3359375, "learning_rate": 4.1427429895275446e-05, "loss": 2.2289, "step": 148500 }, { "epoch": 1.1134270027106954, "grad_norm": 101906.8515625, "learning_rate": 4.113847343604404e-05, "loss": 2.2297, "step": 149000 }, { "epoch": 1.11716331612761, "grad_norm": 88695.703125, "learning_rate": 4.084982221475554e-05, "loss": 2.2219, "step": 149500 }, { "epoch": 1.1208996295445246, "grad_norm": 100434.46875, "learning_rate": 4.05614861740879e-05, "loss": 2.2344, "step": 150000 }, { "epoch": 1.1246359429614394, "grad_norm": 90131.671875, "learning_rate": 4.0273475245862594e-05, "loss": 2.2137, "step": 150500 }, { "epoch": 1.128372256378354, "grad_norm": 84749.8828125, "learning_rate": 3.998579935070253e-05, "loss": 2.2358, "step": 151000 }, { "epoch": 1.1321085697952686, "grad_norm": 80951.0546875, "learning_rate": 3.969846839769027e-05, "loss": 2.2327, "step": 151500 }, { "epoch": 1.1358448832121835, "grad_norm": 82394.359375, "learning_rate": 3.941149228402674e-05, "loss": 2.2448, "step": 152000 }, { "epoch": 1.139581196629098, "grad_norm": 108267.5546875, "learning_rate": 3.912488089469036e-05, "loss": 2.211, "step": 152500 }, { "epoch": 1.1433175100460127, "grad_norm": 73743.703125, "learning_rate": 3.883864410209648e-05, "loss": 2.2159, "step": 153000 }, { "epoch": 1.1470538234629273, "grad_norm": 105991.84375, "learning_rate": 3.855279176575732e-05, "loss": 2.2191, "step": 153500 }, { "epoch": 1.150790136879842, "grad_norm": 88528.921875, "learning_rate": 3.826733373194247e-05, "loss": 2.2397, "step": 154000 }, { "epoch": 1.1545264502967567, "grad_norm": 99099.5390625, "learning_rate": 3.798227983333955e-05, "loss": 2.2083, "step": 154500 }, { "epoch": 1.1582627637136713, "grad_norm": 96959.421875, "learning_rate": 3.769763988871572e-05, "loss": 2.2146, "step": 155000 }, { "epoch": 1.1619990771305861, "grad_norm": 96111.9375, "learning_rate": 3.741342370257932e-05, "loss": 2.2421, "step": 155500 }, { "epoch": 1.1657353905475007, "grad_norm": 80256.734375, "learning_rate": 3.712964106484218e-05, "loss": 2.2199, "step": 156000 }, { "epoch": 1.1694717039644154, "grad_norm": 84992.546875, "learning_rate": 3.684630175048249e-05, "loss": 2.184, "step": 156500 }, { "epoch": 1.17320801738133, "grad_norm": 94191.3671875, "learning_rate": 3.6563415519207985e-05, "loss": 2.2505, "step": 157000 }, { "epoch": 1.1769443307982446, "grad_norm": 76174.2421875, "learning_rate": 3.62809921151198e-05, "loss": 2.2143, "step": 157500 }, { "epoch": 1.1806806442151594, "grad_norm": 89977.34375, "learning_rate": 3.599904126637692e-05, "loss": 2.2338, "step": 158000 }, { "epoch": 1.184416957632074, "grad_norm": 107886.5390625, "learning_rate": 3.571757268486096e-05, "loss": 2.2516, "step": 158500 }, { "epoch": 1.1881532710489886, "grad_norm": 60902.64453125, "learning_rate": 3.54365960658417e-05, "loss": 2.197, "step": 159000 }, { "epoch": 1.1918895844659034, "grad_norm": 91395.515625, "learning_rate": 3.5156121087643154e-05, "loss": 2.2238, "step": 159500 }, { "epoch": 1.195625897882818, "grad_norm": 90301.53125, "learning_rate": 3.487615741131013e-05, "loss": 2.2317, "step": 160000 }, { "epoch": 1.1993622112997326, "grad_norm": 91005.0625, "learning_rate": 3.459671468027553e-05, "loss": 2.2019, "step": 160500 }, { "epoch": 1.2030985247166472, "grad_norm": 81654.03125, "learning_rate": 3.4317802520028117e-05, "loss": 2.2199, "step": 161000 }, { "epoch": 1.206834838133562, "grad_norm": 100322.15625, "learning_rate": 3.403943053778097e-05, "loss": 2.2668, "step": 161500 }, { "epoch": 1.2105711515504767, "grad_norm": 84712.84375, "learning_rate": 3.37616083221406e-05, "loss": 2.215, "step": 162000 }, { "epoch": 1.2143074649673913, "grad_norm": 93723.7734375, "learning_rate": 3.348434544277663e-05, "loss": 2.2052, "step": 162500 }, { "epoch": 1.218043778384306, "grad_norm": 98371.0859375, "learning_rate": 3.320765145009216e-05, "loss": 2.1767, "step": 163000 }, { "epoch": 1.2217800918012207, "grad_norm": 78824.3359375, "learning_rate": 3.293153587489488e-05, "loss": 2.2341, "step": 163500 }, { "epoch": 1.2255164052181353, "grad_norm": 93536.6640625, "learning_rate": 3.265600822806866e-05, "loss": 2.1954, "step": 164000 }, { "epoch": 1.22925271863505, "grad_norm": 89381.3203125, "learning_rate": 3.2381078000245965e-05, "loss": 2.2026, "step": 164500 }, { "epoch": 1.2329890320519645, "grad_norm": 110292.1484375, "learning_rate": 3.2106754661481086e-05, "loss": 2.2022, "step": 165000 }, { "epoch": 1.2367253454688794, "grad_norm": 89096.8359375, "learning_rate": 3.183304766092374e-05, "loss": 2.1618, "step": 165500 }, { "epoch": 1.240461658885794, "grad_norm": 97123.8671875, "learning_rate": 3.1559966426493804e-05, "loss": 2.192, "step": 166000 }, { "epoch": 1.2441979723027086, "grad_norm": 86452.890625, "learning_rate": 3.128752036455631e-05, "loss": 2.203, "step": 166500 }, { "epoch": 1.2479342857196234, "grad_norm": 99330.265625, "learning_rate": 3.101571885959766e-05, "loss": 2.1653, "step": 167000 }, { "epoch": 1.251670599136538, "grad_norm": 82720.1015625, "learning_rate": 3.0744571273902345e-05, "loss": 2.1893, "step": 167500 }, { "epoch": 1.2554069125534526, "grad_norm": 98328.6328125, "learning_rate": 3.047408694723032e-05, "loss": 2.2091, "step": 168000 }, { "epoch": 1.2591432259703672, "grad_norm": 89367.40625, "learning_rate": 3.0204275196495414e-05, "loss": 2.1961, "step": 168500 }, { "epoch": 1.262879539387282, "grad_norm": 95675.3203125, "learning_rate": 2.99351453154444e-05, "loss": 2.1824, "step": 169000 }, { "epoch": 1.2666158528041966, "grad_norm": 83745.3359375, "learning_rate": 2.9666706574336834e-05, "loss": 2.1793, "step": 169500 }, { "epoch": 1.2703521662211112, "grad_norm": 92235.09375, "learning_rate": 2.9398968219625706e-05, "loss": 2.2034, "step": 170000 }, { "epoch": 1.274088479638026, "grad_norm": 79515.6328125, "learning_rate": 2.913193947363908e-05, "loss": 2.2091, "step": 170500 }, { "epoch": 1.2778247930549407, "grad_norm": 92163.1640625, "learning_rate": 2.8865629534262216e-05, "loss": 2.2082, "step": 171000 }, { "epoch": 1.2815611064718553, "grad_norm": 90488.1484375, "learning_rate": 2.8600047574620998e-05, "loss": 2.1976, "step": 171500 }, { "epoch": 1.2852974198887699, "grad_norm": 93587.21875, "learning_rate": 2.83352027427657e-05, "loss": 2.2092, "step": 172000 }, { "epoch": 1.2890337333056845, "grad_norm": 90367.0546875, "learning_rate": 2.8071104161356087e-05, "loss": 2.2086, "step": 172500 }, { "epoch": 1.2927700467225993, "grad_norm": 94576.171875, "learning_rate": 2.7807760927347102e-05, "loss": 2.2071, "step": 173000 }, { "epoch": 1.296506360139514, "grad_norm": 84362.703125, "learning_rate": 2.7545182111675466e-05, "loss": 2.1646, "step": 173500 }, { "epoch": 1.3002426735564285, "grad_norm": 99929.3984375, "learning_rate": 2.7283376758947333e-05, "loss": 2.1721, "step": 174000 }, { "epoch": 1.3039789869733434, "grad_norm": 99786.953125, "learning_rate": 2.7022353887126668e-05, "loss": 2.2208, "step": 174500 }, { "epoch": 1.307715300390258, "grad_norm": 118692.8046875, "learning_rate": 2.6762122487224684e-05, "loss": 2.2036, "step": 175000 }, { "epoch": 1.3114516138071726, "grad_norm": 88971.234375, "learning_rate": 2.6502691522990032e-05, "loss": 2.2018, "step": 175500 }, { "epoch": 1.3151879272240872, "grad_norm": 106580.484375, "learning_rate": 2.624406993060019e-05, "loss": 2.1668, "step": 176000 }, { "epoch": 1.318924240641002, "grad_norm": 94986.0390625, "learning_rate": 2.598626661835354e-05, "loss": 2.1782, "step": 176500 }, { "epoch": 1.3226605540579166, "grad_norm": 87692.828125, "learning_rate": 2.5729290466362565e-05, "loss": 2.251, "step": 177000 }, { "epoch": 1.3263968674748312, "grad_norm": 93603.875, "learning_rate": 2.5473150326248006e-05, "loss": 2.2143, "step": 177500 }, { "epoch": 1.330133180891746, "grad_norm": 98669.8828125, "learning_rate": 2.521785502083384e-05, "loss": 2.1962, "step": 178000 }, { "epoch": 1.3338694943086606, "grad_norm": 77537.9609375, "learning_rate": 2.4963413343843522e-05, "loss": 2.1533, "step": 178500 }, { "epoch": 1.3376058077255752, "grad_norm": 107931.5625, "learning_rate": 2.4709834059597026e-05, "loss": 2.1715, "step": 179000 }, { "epoch": 1.3413421211424899, "grad_norm": 91190.8125, "learning_rate": 2.4457125902708872e-05, "loss": 2.1757, "step": 179500 }, { "epoch": 1.3450784345594045, "grad_norm": 84562.8046875, "learning_rate": 2.420529757778746e-05, "loss": 2.1666, "step": 180000 }, { "epoch": 1.3488147479763193, "grad_norm": 99736.1640625, "learning_rate": 2.3954357759134995e-05, "loss": 2.1948, "step": 180500 }, { "epoch": 1.3525510613932339, "grad_norm": 93177.078125, "learning_rate": 2.3704315090448886e-05, "loss": 2.1868, "step": 181000 }, { "epoch": 1.3562873748101485, "grad_norm": 77689.5625, "learning_rate": 2.345517818452394e-05, "loss": 2.2091, "step": 181500 }, { "epoch": 1.3600236882270633, "grad_norm": 92219.8671875, "learning_rate": 2.3206955622955623e-05, "loss": 2.1795, "step": 182000 }, { "epoch": 1.363760001643978, "grad_norm": 96159.34375, "learning_rate": 2.2959655955844656e-05, "loss": 2.2186, "step": 182500 }, { "epoch": 1.3674963150608925, "grad_norm": 84869.453125, "learning_rate": 2.271328770150227e-05, "loss": 2.1516, "step": 183000 }, { "epoch": 1.3712326284778071, "grad_norm": 79080.1484375, "learning_rate": 2.2467859346156968e-05, "loss": 2.2031, "step": 183500 }, { "epoch": 1.374968941894722, "grad_norm": 107557.9765625, "learning_rate": 2.222337934366214e-05, "loss": 2.1703, "step": 184000 }, { "epoch": 1.3787052553116366, "grad_norm": 104010.9296875, "learning_rate": 2.1979856115204845e-05, "loss": 2.1637, "step": 184500 }, { "epoch": 1.3824415687285512, "grad_norm": 93606.03125, "learning_rate": 2.173729804901581e-05, "loss": 2.1746, "step": 185000 }, { "epoch": 1.386177882145466, "grad_norm": 100350.8515625, "learning_rate": 2.1495713500080443e-05, "loss": 2.1847, "step": 185500 }, { "epoch": 1.3899141955623806, "grad_norm": 91779.265625, "learning_rate": 2.1255110789851084e-05, "loss": 2.1576, "step": 186000 }, { "epoch": 1.3936505089792952, "grad_norm": 112973.15625, "learning_rate": 2.1015498205960283e-05, "loss": 2.177, "step": 186500 }, { "epoch": 1.3973868223962098, "grad_norm": 91850.71875, "learning_rate": 2.0776884001935437e-05, "loss": 2.1904, "step": 187000 }, { "epoch": 1.4011231358131244, "grad_norm": 92866.7265625, "learning_rate": 2.0539276396914448e-05, "loss": 2.1891, "step": 187500 }, { "epoch": 1.4048594492300392, "grad_norm": 94535.1640625, "learning_rate": 2.0302683575362608e-05, "loss": 2.1768, "step": 188000 }, { "epoch": 1.4085957626469539, "grad_norm": 99327.2890625, "learning_rate": 2.0067113686790694e-05, "loss": 2.195, "step": 188500 }, { "epoch": 1.4123320760638687, "grad_norm": 95519.703125, "learning_rate": 1.983257484547421e-05, "loss": 2.1735, "step": 189000 }, { "epoch": 1.4160683894807833, "grad_norm": 93955.65625, "learning_rate": 1.9599075130173977e-05, "loss": 2.2019, "step": 189500 }, { "epoch": 1.419804702897698, "grad_norm": 143901.203125, "learning_rate": 1.9366622583857803e-05, "loss": 2.1768, "step": 190000 }, { "epoch": 1.4235410163146125, "grad_norm": 77758.40625, "learning_rate": 1.913522521342338e-05, "loss": 2.1481, "step": 190500 }, { "epoch": 1.427277329731527, "grad_norm": 102854.71875, "learning_rate": 1.890489098942267e-05, "loss": 2.1364, "step": 191000 }, { "epoch": 1.431013643148442, "grad_norm": 76422.46875, "learning_rate": 1.8675627845787126e-05, "loss": 2.1446, "step": 191500 }, { "epoch": 1.4347499565653565, "grad_norm": 103994.8359375, "learning_rate": 1.8447443679554567e-05, "loss": 2.1618, "step": 192000 }, { "epoch": 1.4384862699822711, "grad_norm": 93646.765625, "learning_rate": 1.8220346350597123e-05, "loss": 2.2005, "step": 192500 }, { "epoch": 1.442222583399186, "grad_norm": 104911.5234375, "learning_rate": 1.7994343681350422e-05, "loss": 2.185, "step": 193000 }, { "epoch": 1.4459588968161006, "grad_norm": 103320.125, "learning_rate": 1.7769443456544322e-05, "loss": 2.1904, "step": 193500 }, { "epoch": 1.4496952102330152, "grad_norm": 88957.3203125, "learning_rate": 1.7545653422934537e-05, "loss": 2.1924, "step": 194000 }, { "epoch": 1.4534315236499298, "grad_norm": 98682.171875, "learning_rate": 1.732298128903597e-05, "loss": 2.1546, "step": 194500 }, { "epoch": 1.4571678370668444, "grad_norm": 93626.0234375, "learning_rate": 1.7101434724857147e-05, "loss": 2.1653, "step": 195000 }, { "epoch": 1.4609041504837592, "grad_norm": 84740.2265625, "learning_rate": 1.6881021361635947e-05, "loss": 2.1835, "step": 195500 }, { "epoch": 1.4646404639006738, "grad_norm": 95559.859375, "learning_rate": 1.6661748791576855e-05, "loss": 2.1716, "step": 196000 }, { "epoch": 1.4683767773175886, "grad_norm": 105746.453125, "learning_rate": 1.6443624567589377e-05, "loss": 2.1467, "step": 196500 }, { "epoch": 1.4721130907345032, "grad_norm": 93445.3515625, "learning_rate": 1.622665620302789e-05, "loss": 2.1673, "step": 197000 }, { "epoch": 1.4758494041514179, "grad_norm": 88159.8125, "learning_rate": 1.6010851171432874e-05, "loss": 2.1727, "step": 197500 }, { "epoch": 1.4795857175683325, "grad_norm": 93973.4375, "learning_rate": 1.5796216906273388e-05, "loss": 2.1824, "step": 198000 }, { "epoch": 1.483322030985247, "grad_norm": 95876.78125, "learning_rate": 1.5582760800691155e-05, "loss": 2.1858, "step": 198500 }, { "epoch": 1.487058344402162, "grad_norm": 90519.78125, "learning_rate": 1.537049020724581e-05, "loss": 2.1984, "step": 199000 }, { "epoch": 1.4907946578190765, "grad_norm": 88962.28125, "learning_rate": 1.515941243766169e-05, "loss": 2.2229, "step": 199500 }, { "epoch": 1.494530971235991, "grad_norm": 113852.828125, "learning_rate": 1.4949534762575912e-05, "loss": 2.157, "step": 200000 }, { "epoch": 1.498267284652906, "grad_norm": 97744.3984375, "learning_rate": 1.4740864411288013e-05, "loss": 2.1711, "step": 200500 }, { "epoch": 1.5020035980698205, "grad_norm": 88245.1640625, "learning_rate": 1.4533408571510904e-05, "loss": 2.1867, "step": 201000 }, { "epoch": 1.5057399114867351, "grad_norm": 89220.171875, "learning_rate": 1.4327174389123227e-05, "loss": 2.1382, "step": 201500 }, { "epoch": 1.5094762249036497, "grad_norm": 108794.328125, "learning_rate": 1.4122168967923366e-05, "loss": 2.1608, "step": 202000 }, { "epoch": 1.5132125383205643, "grad_norm": 108413.6640625, "learning_rate": 1.3918399369384556e-05, "loss": 2.1657, "step": 202500 }, { "epoch": 1.5169488517374792, "grad_norm": 92711.84375, "learning_rate": 1.3715872612411806e-05, "loss": 2.1385, "step": 203000 }, { "epoch": 1.5206851651543938, "grad_norm": 97720.765625, "learning_rate": 1.3514595673100084e-05, "loss": 2.1467, "step": 203500 }, { "epoch": 1.5244214785713086, "grad_norm": 86545.078125, "learning_rate": 1.3314575484493946e-05, "loss": 2.152, "step": 204000 }, { "epoch": 1.5281577919882232, "grad_norm": 96816.7421875, "learning_rate": 1.3115818936348889e-05, "loss": 2.1636, "step": 204500 }, { "epoch": 1.5318941054051378, "grad_norm": 100713.7734375, "learning_rate": 1.2918332874893869e-05, "loss": 2.1784, "step": 205000 }, { "epoch": 1.5356304188220524, "grad_norm": 97874.359375, "learning_rate": 1.2722124102595518e-05, "loss": 2.1903, "step": 205500 }, { "epoch": 1.539366732238967, "grad_norm": 91848.2734375, "learning_rate": 1.252719937792397e-05, "loss": 2.1524, "step": 206000 }, { "epoch": 1.5431030456558816, "grad_norm": 95539.515625, "learning_rate": 1.2333565415119835e-05, "loss": 2.1813, "step": 206500 }, { "epoch": 1.5468393590727965, "grad_norm": 92111.3984375, "learning_rate": 1.2141228883963118e-05, "loss": 2.1615, "step": 207000 }, { "epoch": 1.5505756724897113, "grad_norm": 90152.640625, "learning_rate": 1.195019640954339e-05, "loss": 2.1615, "step": 207500 }, { "epoch": 1.554311985906626, "grad_norm": 96634.609375, "learning_rate": 1.1760474572031538e-05, "loss": 2.1455, "step": 208000 }, { "epoch": 1.5580482993235405, "grad_norm": 102015.8984375, "learning_rate": 1.1572069906453265e-05, "loss": 2.1508, "step": 208500 }, { "epoch": 1.561784612740455, "grad_norm": 90662.0859375, "learning_rate": 1.1384988902463806e-05, "loss": 2.1443, "step": 209000 }, { "epoch": 1.5655209261573697, "grad_norm": 101081.84375, "learning_rate": 1.1199238004124519e-05, "loss": 2.191, "step": 209500 }, { "epoch": 1.5692572395742843, "grad_norm": 86159.140625, "learning_rate": 1.101482360968088e-05, "loss": 2.1739, "step": 210000 }, { "epoch": 1.5729935529911991, "grad_norm": 96140.9296875, "learning_rate": 1.0831752071342033e-05, "loss": 2.1597, "step": 210500 }, { "epoch": 1.5767298664081137, "grad_norm": 93931.6484375, "learning_rate": 1.0650029695062103e-05, "loss": 2.1704, "step": 211000 }, { "epoch": 1.5804661798250286, "grad_norm": 96550.375, "learning_rate": 1.0469662740322888e-05, "loss": 2.1762, "step": 211500 }, { "epoch": 1.5842024932419432, "grad_norm": 91985.375, "learning_rate": 1.0290657419918308e-05, "loss": 2.1537, "step": 212000 }, { "epoch": 1.5879388066588578, "grad_norm": 99446.453125, "learning_rate": 1.0113019899740345e-05, "loss": 2.1567, "step": 212500 }, { "epoch": 1.5916751200757724, "grad_norm": 90499.4453125, "learning_rate": 9.936756298566724e-06, "loss": 2.1542, "step": 213000 }, { "epoch": 1.595411433492687, "grad_norm": 91874.203125, "learning_rate": 9.761872687850093e-06, "loss": 2.1701, "step": 213500 }, { "epoch": 1.5991477469096018, "grad_norm": 101494.265625, "learning_rate": 9.588375091508945e-06, "loss": 2.1785, "step": 214000 }, { "epoch": 1.6028840603265164, "grad_norm": 101492.890625, "learning_rate": 9.41626948572007e-06, "loss": 2.1681, "step": 214500 }, { "epoch": 1.6066203737434313, "grad_norm": 97085.359375, "learning_rate": 9.245561798712704e-06, "loss": 2.163, "step": 215000 }, { "epoch": 1.6103566871603459, "grad_norm": 101823.4609375, "learning_rate": 9.076257910564402e-06, "loss": 2.1538, "step": 215500 }, { "epoch": 1.6140930005772605, "grad_norm": 91494.078125, "learning_rate": 8.908363652998414e-06, "loss": 2.1235, "step": 216000 }, { "epoch": 1.617829313994175, "grad_norm": 85581.09375, "learning_rate": 8.74188480918282e-06, "loss": 2.1736, "step": 216500 }, { "epoch": 1.6215656274110897, "grad_norm": 92024.046875, "learning_rate": 8.576827113531427e-06, "loss": 2.177, "step": 217000 }, { "epoch": 1.6253019408280043, "grad_norm": 74753.4296875, "learning_rate": 8.413196251506073e-06, "loss": 2.143, "step": 217500 }, { "epoch": 1.629038254244919, "grad_norm": 107585.3125, "learning_rate": 8.25099785942094e-06, "loss": 2.1522, "step": 218000 }, { "epoch": 1.6327745676618337, "grad_norm": 94180.15625, "learning_rate": 8.090237524248346e-06, "loss": 2.1343, "step": 218500 }, { "epoch": 1.6365108810787485, "grad_norm": 102109.1953125, "learning_rate": 7.930920783426265e-06, "loss": 2.1643, "step": 219000 }, { "epoch": 1.6402471944956631, "grad_norm": 107961.2109375, "learning_rate": 7.773053124667712e-06, "loss": 2.1483, "step": 219500 }, { "epoch": 1.6439835079125777, "grad_norm": 89340.9296875, "learning_rate": 7.616639985771556e-06, "loss": 2.1602, "step": 220000 }, { "epoch": 1.6477198213294924, "grad_norm": 103068.6015625, "learning_rate": 7.461686754435332e-06, "loss": 2.1549, "step": 220500 }, { "epoch": 1.651456134746407, "grad_norm": 101308.2421875, "learning_rate": 7.30819876806963e-06, "loss": 2.1645, "step": 221000 }, { "epoch": 1.6551924481633218, "grad_norm": 116589.4140625, "learning_rate": 7.156181313614185e-06, "loss": 2.1538, "step": 221500 }, { "epoch": 1.6589287615802364, "grad_norm": 107515.390625, "learning_rate": 7.005639627355881e-06, "loss": 2.1506, "step": 222000 }, { "epoch": 1.6626650749971512, "grad_norm": 105536.6484375, "learning_rate": 6.856578894748284e-06, "loss": 2.1563, "step": 222500 }, { "epoch": 1.6664013884140658, "grad_norm": 125030.3046875, "learning_rate": 6.709004250233097e-06, "loss": 2.1574, "step": 223000 }, { "epoch": 1.6701377018309804, "grad_norm": 95777.0546875, "learning_rate": 6.562920777063231e-06, "loss": 2.1502, "step": 223500 }, { "epoch": 1.673874015247895, "grad_norm": 125466.265625, "learning_rate": 6.418333507127788e-06, "loss": 2.1569, "step": 224000 }, { "epoch": 1.6776103286648096, "grad_norm": 82233.953125, "learning_rate": 6.275247420778685e-06, "loss": 2.1315, "step": 224500 }, { "epoch": 1.6813466420817242, "grad_norm": 86722.7890625, "learning_rate": 6.133667446659131e-06, "loss": 2.1662, "step": 225000 }, { "epoch": 1.685082955498639, "grad_norm": 95386.34375, "learning_rate": 5.993598461533845e-06, "loss": 2.147, "step": 225500 }, { "epoch": 1.6888192689155537, "grad_norm": 82564.6484375, "learning_rate": 5.855045290121047e-06, "loss": 2.1426, "step": 226000 }, { "epoch": 1.6925555823324685, "grad_norm": 86084.6875, "learning_rate": 5.71801270492634e-06, "loss": 2.1517, "step": 226500 }, { "epoch": 1.696291895749383, "grad_norm": 90130.1796875, "learning_rate": 5.58250542607826e-06, "loss": 2.143, "step": 227000 }, { "epoch": 1.7000282091662977, "grad_norm": 95907.765625, "learning_rate": 5.448528121165691e-06, "loss": 2.1631, "step": 227500 }, { "epoch": 1.7037645225832123, "grad_norm": 88561.578125, "learning_rate": 5.316085405077159e-06, "loss": 2.135, "step": 228000 }, { "epoch": 1.707500836000127, "grad_norm": 94637.078125, "learning_rate": 5.185181839841746e-06, "loss": 2.1314, "step": 228500 }, { "epoch": 1.7112371494170417, "grad_norm": 96587.3125, "learning_rate": 5.055821934472071e-06, "loss": 2.1399, "step": 229000 }, { "epoch": 1.7149734628339564, "grad_norm": 91652.40625, "learning_rate": 4.92801014480892e-06, "loss": 2.1798, "step": 229500 }, { "epoch": 1.7187097762508712, "grad_norm": 104303.1015625, "learning_rate": 4.801750873367728e-06, "loss": 2.1749, "step": 230000 }, { "epoch": 1.7224460896677858, "grad_norm": 85994.984375, "learning_rate": 4.677048469187045e-06, "loss": 2.161, "step": 230500 }, { "epoch": 1.7261824030847004, "grad_norm": 105747.375, "learning_rate": 4.553907227678594e-06, "loss": 2.1273, "step": 231000 }, { "epoch": 1.729918716501615, "grad_norm": 108115.8125, "learning_rate": 4.432331390479416e-06, "loss": 2.1493, "step": 231500 }, { "epoch": 1.7336550299185296, "grad_norm": 97434.2109375, "learning_rate": 4.312325145305729e-06, "loss": 2.1339, "step": 232000 }, { "epoch": 1.7373913433354442, "grad_norm": 100293.0546875, "learning_rate": 4.193892625808659e-06, "loss": 2.1501, "step": 232500 }, { "epoch": 1.741127656752359, "grad_norm": 102609.921875, "learning_rate": 4.077037911431913e-06, "loss": 2.1309, "step": 233000 }, { "epoch": 1.7448639701692736, "grad_norm": 115093.4453125, "learning_rate": 3.961765027271197e-06, "loss": 2.144, "step": 233500 }, { "epoch": 1.7486002835861885, "grad_norm": 85552.25, "learning_rate": 3.84807794393563e-06, "loss": 2.0989, "step": 234000 }, { "epoch": 1.752336597003103, "grad_norm": 104233.3359375, "learning_rate": 3.735980577410908e-06, "loss": 2.1801, "step": 234500 }, { "epoch": 1.7560729104200177, "grad_norm": 103465.9375, "learning_rate": 3.6254767889244813e-06, "loss": 2.131, "step": 235000 }, { "epoch": 1.7598092238369323, "grad_norm": 100807.5546875, "learning_rate": 3.516570384812523e-06, "loss": 2.1221, "step": 235500 }, { "epoch": 1.7635455372538469, "grad_norm": 104049.0078125, "learning_rate": 3.4092651163888154e-06, "loss": 2.1382, "step": 236000 }, { "epoch": 1.7672818506707617, "grad_norm": 100944.1015625, "learning_rate": 3.303564679815557e-06, "loss": 2.1736, "step": 236500 }, { "epoch": 1.7710181640876763, "grad_norm": 104766.390625, "learning_rate": 3.1994727159759986e-06, "loss": 2.1125, "step": 237000 }, { "epoch": 1.7747544775045911, "grad_norm": 95535.8515625, "learning_rate": 3.0969928103490976e-06, "loss": 2.1478, "step": 237500 }, { "epoch": 1.7784907909215057, "grad_norm": 89555.0859375, "learning_rate": 2.996128492885969e-06, "loss": 2.1244, "step": 238000 }, { "epoch": 1.7822271043384204, "grad_norm": 101483.0546875, "learning_rate": 2.8968832378882926e-06, "loss": 2.1287, "step": 238500 }, { "epoch": 1.785963417755335, "grad_norm": 84929.1640625, "learning_rate": 2.7992604638886975e-06, "loss": 2.1421, "step": 239000 }, { "epoch": 1.7896997311722496, "grad_norm": 96690.8984375, "learning_rate": 2.703263533532929e-06, "loss": 2.1681, "step": 239500 }, { "epoch": 1.7934360445891642, "grad_norm": 105319.703125, "learning_rate": 2.608895753464069e-06, "loss": 2.1331, "step": 240000 }, { "epoch": 1.797172358006079, "grad_norm": 101327.21875, "learning_rate": 2.516160374208648e-06, "loss": 2.1551, "step": 240500 }, { "epoch": 1.8009086714229936, "grad_norm": 94917.8125, "learning_rate": 2.4250605900646204e-06, "loss": 2.1703, "step": 241000 }, { "epoch": 1.8046449848399084, "grad_norm": 97282.4765625, "learning_rate": 2.3355995389914342e-06, "loss": 2.1457, "step": 241500 }, { "epoch": 1.808381298256823, "grad_norm": 113476.78125, "learning_rate": 2.2477803025018228e-06, "loss": 2.1237, "step": 242000 }, { "epoch": 1.8121176116737376, "grad_norm": 99987.9921875, "learning_rate": 2.1616059055557636e-06, "loss": 2.1792, "step": 242500 }, { "epoch": 1.8158539250906522, "grad_norm": 110651.3828125, "learning_rate": 2.0770793164562373e-06, "loss": 2.135, "step": 243000 }, { "epoch": 1.8195902385075668, "grad_norm": 96608.578125, "learning_rate": 1.994203446746962e-06, "loss": 2.1681, "step": 243500 }, { "epoch": 1.8233265519244817, "grad_norm": 108624.8046875, "learning_rate": 1.9129811511121697e-06, "loss": 2.1457, "step": 244000 }, { "epoch": 1.8270628653413963, "grad_norm": 113863.421875, "learning_rate": 1.8334152272782069e-06, "loss": 2.1444, "step": 244500 }, { "epoch": 1.830799178758311, "grad_norm": 100759.1875, "learning_rate": 1.7555084159172176e-06, "loss": 2.1633, "step": 245000 }, { "epoch": 1.8345354921752257, "grad_norm": 104006.0859375, "learning_rate": 1.6792634005526943e-06, "loss": 2.1561, "step": 245500 }, { "epoch": 1.8382718055921403, "grad_norm": 93557.3203125, "learning_rate": 1.6046828074670894e-06, "loss": 2.1397, "step": 246000 }, { "epoch": 1.842008119009055, "grad_norm": 92767.28125, "learning_rate": 1.531769205611322e-06, "loss": 2.1375, "step": 246500 }, { "epoch": 1.8457444324259695, "grad_norm": 115375.0546875, "learning_rate": 1.4605251065162973e-06, "loss": 2.1149, "step": 247000 }, { "epoch": 1.8494807458428841, "grad_norm": 85288.765625, "learning_rate": 1.3909529642064045e-06, "loss": 2.1298, "step": 247500 }, { "epoch": 1.853217059259799, "grad_norm": 92626.7890625, "learning_rate": 1.3230551751149568e-06, "loss": 2.1488, "step": 248000 }, { "epoch": 1.8569533726767136, "grad_norm": 105719.265625, "learning_rate": 1.2568340780016953e-06, "loss": 2.1397, "step": 248500 }, { "epoch": 1.8606896860936284, "grad_norm": 89167.5703125, "learning_rate": 1.1922919538721876e-06, "loss": 2.1225, "step": 249000 }, { "epoch": 1.864425999510543, "grad_norm": 100311.046875, "learning_rate": 1.1294310258992747e-06, "loss": 2.1445, "step": 249500 }, { "epoch": 1.8681623129274576, "grad_norm": 87286.234375, "learning_rate": 1.0682534593464967e-06, "loss": 2.1369, "step": 250000 }, { "epoch": 1.8718986263443722, "grad_norm": 103840.65625, "learning_rate": 1.0087613614934987e-06, "loss": 2.1134, "step": 250500 }, { "epoch": 1.8756349397612868, "grad_norm": 97787.40625, "learning_rate": 9.509567815634446e-07, "loss": 2.1302, "step": 251000 }, { "epoch": 1.8793712531782016, "grad_norm": 119866.546875, "learning_rate": 8.948417106524554e-07, "loss": 2.1391, "step": 251500 }, { "epoch": 1.8831075665951162, "grad_norm": 101651.5234375, "learning_rate": 8.404180816609819e-07, "loss": 2.1628, "step": 252000 }, { "epoch": 1.886843880012031, "grad_norm": 103326.453125, "learning_rate": 7.876877692272799e-07, "loss": 2.1341, "step": 252500 }, { "epoch": 1.8905801934289457, "grad_norm": 100837.203125, "learning_rate": 7.366525896627841e-07, "loss": 2.125, "step": 253000 }, { "epoch": 1.8943165068458603, "grad_norm": 101825.6015625, "learning_rate": 6.873143008895633e-07, "loss": 2.116, "step": 253500 }, { "epoch": 1.8980528202627749, "grad_norm": 84607.0234375, "learning_rate": 6.396746023798139e-07, "loss": 2.1599, "step": 254000 }, { "epoch": 1.9017891336796895, "grad_norm": 85267.1328125, "learning_rate": 5.937351350972287e-07, "loss": 2.1605, "step": 254500 }, { "epoch": 1.905525447096604, "grad_norm": 112646.515625, "learning_rate": 5.49497481440564e-07, "loss": 2.1479, "step": 255000 }, { "epoch": 1.909261760513519, "grad_norm": 101483.7578125, "learning_rate": 5.069631651890783e-07, "loss": 2.1306, "step": 255500 }, { "epoch": 1.9129980739304335, "grad_norm": 104185.2265625, "learning_rate": 4.661336514500625e-07, "loss": 2.1222, "step": 256000 }, { "epoch": 1.9167343873473484, "grad_norm": 83210.1484375, "learning_rate": 4.270103466083697e-07, "loss": 2.1573, "step": 256500 }, { "epoch": 1.920470700764263, "grad_norm": 109216.5703125, "learning_rate": 3.895945982779925e-07, "loss": 2.166, "step": 257000 }, { "epoch": 1.9242070141811776, "grad_norm": 100591.0546875, "learning_rate": 3.5388769525560585e-07, "loss": 2.1291, "step": 257500 }, { "epoch": 1.9279433275980922, "grad_norm": 98919.7421875, "learning_rate": 3.19890867476208e-07, "loss": 2.1449, "step": 258000 }, { "epoch": 1.9316796410150068, "grad_norm": 90432.4765625, "learning_rate": 2.876052859707379e-07, "loss": 2.1195, "step": 258500 }, { "epoch": 1.9354159544319216, "grad_norm": 94623.453125, "learning_rate": 2.570320628257461e-07, "loss": 2.1409, "step": 259000 }, { "epoch": 1.9391522678488362, "grad_norm": 112643.9765625, "learning_rate": 2.2817225114507567e-07, "loss": 2.1521, "step": 259500 }, { "epoch": 1.942888581265751, "grad_norm": 83827.5078125, "learning_rate": 2.010268450136077e-07, "loss": 2.1273, "step": 260000 }, { "epoch": 1.9466248946826656, "grad_norm": 97442.9453125, "learning_rate": 1.7559677946300533e-07, "loss": 2.1454, "step": 260500 }, { "epoch": 1.9503612080995802, "grad_norm": 92114.421875, "learning_rate": 1.518829304395175e-07, "loss": 2.127, "step": 261000 }, { "epoch": 1.9540975215164949, "grad_norm": 95378.5078125, "learning_rate": 1.2988611477378066e-07, "loss": 2.1213, "step": 261500 }, { "epoch": 1.9578338349334095, "grad_norm": 87441.7890625, "learning_rate": 1.0960709015272464e-07, "loss": 2.1427, "step": 262000 }, { "epoch": 1.961570148350324, "grad_norm": 87605.1640625, "learning_rate": 9.104655509343252e-08, "loss": 2.1541, "step": 262500 }, { "epoch": 1.9653064617672389, "grad_norm": 101794.53125, "learning_rate": 7.420514891911535e-08, "loss": 2.152, "step": 263000 }, { "epoch": 1.9690427751841535, "grad_norm": 94773.6328125, "learning_rate": 5.908345173705754e-08, "loss": 2.1471, "step": 263500 }, { "epoch": 1.9727790886010683, "grad_norm": 105178.625, "learning_rate": 4.5681984418666224e-08, "loss": 2.1364, "step": 264000 }, { "epoch": 1.976515402017983, "grad_norm": 97559.484375, "learning_rate": 3.400120858149669e-08, "loss": 2.17, "step": 264500 }, { "epoch": 1.9802517154348975, "grad_norm": 120999.3125, "learning_rate": 2.404152657338732e-08, "loss": 2.1488, "step": 265000 }, { "epoch": 1.9839880288518121, "grad_norm": 117999.5859375, "learning_rate": 1.5803281458570684e-08, "loss": 2.1481, "step": 265500 }, { "epoch": 1.9877243422687267, "grad_norm": 94607.5078125, "learning_rate": 9.286757005871893e-09, "loss": 2.1104, "step": 266000 }, { "epoch": 1.9914606556856416, "grad_norm": 94297.2109375, "learning_rate": 4.492177678938614e-09, "loss": 2.1349, "step": 266500 }, { "epoch": 1.9951969691025562, "grad_norm": 95649.546875, "learning_rate": 1.4197086284917227e-09, "loss": 2.1224, "step": 267000 }, { "epoch": 1.998933282519471, "grad_norm": 116585.421875, "learning_rate": 6.945568664651347e-11, "loss": 2.1757, "step": 267500 } ], "logging_steps": 500, "max_steps": 267642, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.913150854495206e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }