|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9999943955298747, |
|
"eval_steps": 500, |
|
"global_step": 267642, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0037363134169146645, |
|
"grad_norm": 72942.6796875, |
|
"learning_rate": 9.999655549964018e-05, |
|
"loss": 4.1478, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.007472626833829329, |
|
"grad_norm": 67110.5, |
|
"learning_rate": 9.998622247314402e-05, |
|
"loss": 3.6349, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.011208940250743994, |
|
"grad_norm": 73347.7890625, |
|
"learning_rate": 9.996900234419608e-05, |
|
"loss": 3.4653, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.014945253667658658, |
|
"grad_norm": 70530.5546875, |
|
"learning_rate": 9.994489748538595e-05, |
|
"loss": 3.3608, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.018681567084573324, |
|
"grad_norm": 69644.8515625, |
|
"learning_rate": 9.991391121788143e-05, |
|
"loss": 3.2989, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.022417880501487988, |
|
"grad_norm": 79574.8203125, |
|
"learning_rate": 9.987604781097091e-05, |
|
"loss": 3.1788, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.02615419391840265, |
|
"grad_norm": 77448.375, |
|
"learning_rate": 9.983131248147513e-05, |
|
"loss": 3.1802, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.029890507335317316, |
|
"grad_norm": 82995.328125, |
|
"learning_rate": 9.977971139302844e-05, |
|
"loss": 3.1495, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.03362682075223198, |
|
"grad_norm": 77637.0625, |
|
"learning_rate": 9.972125165522955e-05, |
|
"loss": 3.0534, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.03736313416914665, |
|
"grad_norm": 71801.3515625, |
|
"learning_rate": 9.965594132266197e-05, |
|
"loss": 3.0471, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.04109944758606131, |
|
"grad_norm": 73166.8125, |
|
"learning_rate": 9.958378939378425e-05, |
|
"loss": 3.0568, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.044835761002975975, |
|
"grad_norm": 82521.0390625, |
|
"learning_rate": 9.950480580969022e-05, |
|
"loss": 2.9987, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.048572074419890636, |
|
"grad_norm": 73557.609375, |
|
"learning_rate": 9.941900145273922e-05, |
|
"loss": 2.9518, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.0523083878368053, |
|
"grad_norm": 76367.0078125, |
|
"learning_rate": 9.932638814505676e-05, |
|
"loss": 2.9528, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.056044701253719964, |
|
"grad_norm": 80728.796875, |
|
"learning_rate": 9.922697864690573e-05, |
|
"loss": 2.9501, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.05978101467063463, |
|
"grad_norm": 71664.421875, |
|
"learning_rate": 9.912078665492822e-05, |
|
"loss": 2.9449, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.0635173280875493, |
|
"grad_norm": 75999.953125, |
|
"learning_rate": 9.90078268002584e-05, |
|
"loss": 2.8971, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.06725364150446396, |
|
"grad_norm": 67755.296875, |
|
"learning_rate": 9.888811464650668e-05, |
|
"loss": 2.8525, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.07098995492137862, |
|
"grad_norm": 76703.8203125, |
|
"learning_rate": 9.876166668761532e-05, |
|
"loss": 2.8725, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.0747262683382933, |
|
"grad_norm": 74384.6640625, |
|
"learning_rate": 9.862850034558593e-05, |
|
"loss": 2.8761, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.07846258175520796, |
|
"grad_norm": 59161.4453125, |
|
"learning_rate": 9.848863396807901e-05, |
|
"loss": 2.8503, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.08219889517212262, |
|
"grad_norm": 66133.3359375, |
|
"learning_rate": 9.834208682588608e-05, |
|
"loss": 2.8198, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.08593520858903728, |
|
"grad_norm": 65402.43359375, |
|
"learning_rate": 9.81888791102745e-05, |
|
"loss": 2.8278, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.08967152200595195, |
|
"grad_norm": 73789.2578125, |
|
"learning_rate": 9.802903193020554e-05, |
|
"loss": 2.808, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.09340783542286661, |
|
"grad_norm": 71568.953125, |
|
"learning_rate": 9.786256730942595e-05, |
|
"loss": 2.7821, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.09714414883978127, |
|
"grad_norm": 68736.6953125, |
|
"learning_rate": 9.768950818343359e-05, |
|
"loss": 2.7827, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.10088046225669595, |
|
"grad_norm": 73235.984375, |
|
"learning_rate": 9.750987839631733e-05, |
|
"loss": 2.797, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.1046167756736106, |
|
"grad_norm": 72581.59375, |
|
"learning_rate": 9.732370269747181e-05, |
|
"loss": 2.7515, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.10835308909052527, |
|
"grad_norm": 77676.46875, |
|
"learning_rate": 9.713100673818754e-05, |
|
"loss": 2.7451, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.11208940250743993, |
|
"grad_norm": 78359.078125, |
|
"learning_rate": 9.693181706811652e-05, |
|
"loss": 2.7544, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.1158257159243546, |
|
"grad_norm": 67356.484375, |
|
"learning_rate": 9.672616113161434e-05, |
|
"loss": 2.7054, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.11956202934126926, |
|
"grad_norm": 76034.65625, |
|
"learning_rate": 9.651406726395895e-05, |
|
"loss": 2.7419, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.12329834275818392, |
|
"grad_norm": 77247.296875, |
|
"learning_rate": 9.629556468744647e-05, |
|
"loss": 2.7274, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.1270346561750986, |
|
"grad_norm": 90000.84375, |
|
"learning_rate": 9.607068350736501e-05, |
|
"loss": 2.7118, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.13077096959201326, |
|
"grad_norm": 82970.1953125, |
|
"learning_rate": 9.583945470784683e-05, |
|
"loss": 2.7036, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.13450728300892792, |
|
"grad_norm": 70716.7578125, |
|
"learning_rate": 9.560191014759922e-05, |
|
"loss": 2.6936, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.13824359642584258, |
|
"grad_norm": 69805.4140625, |
|
"learning_rate": 9.535808255551515e-05, |
|
"loss": 2.6633, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.14197990984275724, |
|
"grad_norm": 69154.890625, |
|
"learning_rate": 9.510800552616374e-05, |
|
"loss": 2.6765, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.1457162232596719, |
|
"grad_norm": 70633.8515625, |
|
"learning_rate": 9.485171351516171e-05, |
|
"loss": 2.6623, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.1494525366765866, |
|
"grad_norm": 85893.5390625, |
|
"learning_rate": 9.458924183442602e-05, |
|
"loss": 2.6699, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.15318885009350125, |
|
"grad_norm": 72002.125, |
|
"learning_rate": 9.43206266473086e-05, |
|
"loss": 2.6461, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.1569251635104159, |
|
"grad_norm": 78129.734375, |
|
"learning_rate": 9.404590496361382e-05, |
|
"loss": 2.6687, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.16066147692733057, |
|
"grad_norm": 64102.87890625, |
|
"learning_rate": 9.376511463449922e-05, |
|
"loss": 2.659, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.16439779034424523, |
|
"grad_norm": 61640.66796875, |
|
"learning_rate": 9.347829434726035e-05, |
|
"loss": 2.6069, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.1681341037611599, |
|
"grad_norm": 77315.6484375, |
|
"learning_rate": 9.318548362000054e-05, |
|
"loss": 2.6231, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.17187041717807455, |
|
"grad_norm": 62667.19921875, |
|
"learning_rate": 9.288672279618603e-05, |
|
"loss": 2.6435, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.17560673059498924, |
|
"grad_norm": 72954.6015625, |
|
"learning_rate": 9.258205303908738e-05, |
|
"loss": 2.6336, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.1793430440119039, |
|
"grad_norm": 71380.78125, |
|
"learning_rate": 9.227151632610811e-05, |
|
"loss": 2.6389, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.18307935742881856, |
|
"grad_norm": 72969.21875, |
|
"learning_rate": 9.195515544300105e-05, |
|
"loss": 2.6162, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.18681567084573322, |
|
"grad_norm": 70472.6171875, |
|
"learning_rate": 9.163301397797319e-05, |
|
"loss": 2.6384, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.19055198426264788, |
|
"grad_norm": 74207.796875, |
|
"learning_rate": 9.130513631568021e-05, |
|
"loss": 2.5962, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.19428829767956254, |
|
"grad_norm": 66946.59375, |
|
"learning_rate": 9.097156763111118e-05, |
|
"loss": 2.607, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.1980246110964772, |
|
"grad_norm": 75940.734375, |
|
"learning_rate": 9.063235388336423e-05, |
|
"loss": 2.5973, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.2017609245133919, |
|
"grad_norm": 63575.37890625, |
|
"learning_rate": 9.028754180931441e-05, |
|
"loss": 2.6065, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.20549723793030655, |
|
"grad_norm": 79057.234375, |
|
"learning_rate": 8.993717891717425e-05, |
|
"loss": 2.5717, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.2092335513472212, |
|
"grad_norm": 72744.7578125, |
|
"learning_rate": 8.958131347994808e-05, |
|
"loss": 2.6101, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.21296986476413587, |
|
"grad_norm": 77414.359375, |
|
"learning_rate": 8.921999452878093e-05, |
|
"loss": 2.5807, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.21670617818105053, |
|
"grad_norm": 75271.28125, |
|
"learning_rate": 8.885327184620314e-05, |
|
"loss": 2.5952, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.2204424915979652, |
|
"grad_norm": 69727.734375, |
|
"learning_rate": 8.848119595927116e-05, |
|
"loss": 2.587, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.22417880501487986, |
|
"grad_norm": 83907.5859375, |
|
"learning_rate": 8.810381813260606e-05, |
|
"loss": 2.5377, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.22791511843179454, |
|
"grad_norm": 93582.2265625, |
|
"learning_rate": 8.772119036133022e-05, |
|
"loss": 2.5808, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.2316514318487092, |
|
"grad_norm": 69250.0703125, |
|
"learning_rate": 8.733336536390349e-05, |
|
"loss": 2.554, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.23538774526562387, |
|
"grad_norm": 72463.203125, |
|
"learning_rate": 8.69403965748596e-05, |
|
"loss": 2.5899, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.23912405868253853, |
|
"grad_norm": 71343.4765625, |
|
"learning_rate": 8.654233813744392e-05, |
|
"loss": 2.5627, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.2428603720994532, |
|
"grad_norm": 78745.25, |
|
"learning_rate": 8.613924489615373e-05, |
|
"loss": 2.5353, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.24659668551636785, |
|
"grad_norm": 81588.1875, |
|
"learning_rate": 8.573117238918161e-05, |
|
"loss": 2.5696, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.25033299893328254, |
|
"grad_norm": 66313.8984375, |
|
"learning_rate": 8.53181768407634e-05, |
|
"loss": 2.5745, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.2540693123501972, |
|
"grad_norm": 70478.8046875, |
|
"learning_rate": 8.490031515343177e-05, |
|
"loss": 2.5485, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.25780562576711186, |
|
"grad_norm": 81928.53125, |
|
"learning_rate": 8.447764490017597e-05, |
|
"loss": 2.4977, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.2615419391840265, |
|
"grad_norm": 65777.4921875, |
|
"learning_rate": 8.405022431650959e-05, |
|
"loss": 2.5664, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.2652782526009412, |
|
"grad_norm": 81822.8828125, |
|
"learning_rate": 8.361811229244682e-05, |
|
"loss": 2.5249, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.26901456601785584, |
|
"grad_norm": 66773.1328125, |
|
"learning_rate": 8.318136836438853e-05, |
|
"loss": 2.5385, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.2727508794347705, |
|
"grad_norm": 78679.25, |
|
"learning_rate": 8.274005270691942e-05, |
|
"loss": 2.499, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.27648719285168516, |
|
"grad_norm": 68307.140625, |
|
"learning_rate": 8.229422612451711e-05, |
|
"loss": 2.4764, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.2802235062685998, |
|
"grad_norm": 67303.9453125, |
|
"learning_rate": 8.184395004317458e-05, |
|
"loss": 2.5165, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.2839598196855145, |
|
"grad_norm": 69105.96875, |
|
"learning_rate": 8.138928650193676e-05, |
|
"loss": 2.5159, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.28769613310242914, |
|
"grad_norm": 75505.984375, |
|
"learning_rate": 8.093029814435293e-05, |
|
"loss": 2.5268, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.2914324465193438, |
|
"grad_norm": 80771.171875, |
|
"learning_rate": 8.046704820984558e-05, |
|
"loss": 2.5296, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.2951687599362585, |
|
"grad_norm": 71207.0546875, |
|
"learning_rate": 7.999960052499737e-05, |
|
"loss": 2.518, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.2989050733531732, |
|
"grad_norm": 69106.34375, |
|
"learning_rate": 7.952801949475706e-05, |
|
"loss": 2.4906, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.30264138677008784, |
|
"grad_norm": 82679.9296875, |
|
"learning_rate": 7.905237009356578e-05, |
|
"loss": 2.4743, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.3063777001870025, |
|
"grad_norm": 84038.28125, |
|
"learning_rate": 7.857271785640486e-05, |
|
"loss": 2.4726, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.31011401360391716, |
|
"grad_norm": 75876.3203125, |
|
"learning_rate": 7.808912886976641e-05, |
|
"loss": 2.5081, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.3138503270208318, |
|
"grad_norm": 78494.3046875, |
|
"learning_rate": 7.760166976254804e-05, |
|
"loss": 2.4795, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.3175866404377465, |
|
"grad_norm": 68793.765625, |
|
"learning_rate": 7.711040769687248e-05, |
|
"loss": 2.4883, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.32132295385466114, |
|
"grad_norm": 78543.03125, |
|
"learning_rate": 7.661541035883427e-05, |
|
"loss": 2.4752, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.3250592672715758, |
|
"grad_norm": 78304.4140625, |
|
"learning_rate": 7.611674594917376e-05, |
|
"loss": 2.4747, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.32879558068849046, |
|
"grad_norm": 77926.90625, |
|
"learning_rate": 7.561448317388045e-05, |
|
"loss": 2.4666, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.3325318941054051, |
|
"grad_norm": 76948.8671875, |
|
"learning_rate": 7.510869123472679e-05, |
|
"loss": 2.4724, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 0.3362682075223198, |
|
"grad_norm": 75918.4921875, |
|
"learning_rate": 7.459943981973343e-05, |
|
"loss": 2.4476, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.34000452093923444, |
|
"grad_norm": 66944.8125, |
|
"learning_rate": 7.408679909356764e-05, |
|
"loss": 2.4982, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 0.3437408343561491, |
|
"grad_norm": 68671.390625, |
|
"learning_rate": 7.357083968787606e-05, |
|
"loss": 2.4996, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.3474771477730638, |
|
"grad_norm": 74974.25, |
|
"learning_rate": 7.305163269155305e-05, |
|
"loss": 2.44, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 0.3512134611899785, |
|
"grad_norm": 83594.984375, |
|
"learning_rate": 7.2529249640946e-05, |
|
"loss": 2.466, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.35494977460689314, |
|
"grad_norm": 67277.0546875, |
|
"learning_rate": 7.200376250999917e-05, |
|
"loss": 2.4398, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 0.3586860880238078, |
|
"grad_norm": 64984.64453125, |
|
"learning_rate": 7.1475243700337e-05, |
|
"loss": 2.4536, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.36242240144072246, |
|
"grad_norm": 76212.953125, |
|
"learning_rate": 7.094376603128872e-05, |
|
"loss": 2.4643, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 0.3661587148576371, |
|
"grad_norm": 78486.7109375, |
|
"learning_rate": 7.040940272985521e-05, |
|
"loss": 2.4468, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.3698950282745518, |
|
"grad_norm": 90899.328125, |
|
"learning_rate": 6.987222742061982e-05, |
|
"loss": 2.458, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 0.37363134169146645, |
|
"grad_norm": 69692.6328125, |
|
"learning_rate": 6.933231411560442e-05, |
|
"loss": 2.4331, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.3773676551083811, |
|
"grad_norm": 74715.71875, |
|
"learning_rate": 6.878973720407191e-05, |
|
"loss": 2.4379, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 0.38110396852529577, |
|
"grad_norm": 72629.9140625, |
|
"learning_rate": 6.824457144227698e-05, |
|
"loss": 2.4424, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 0.3848402819422104, |
|
"grad_norm": 74704.6328125, |
|
"learning_rate": 6.769689194316615e-05, |
|
"loss": 2.4905, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 0.3885765953591251, |
|
"grad_norm": 75688.8125, |
|
"learning_rate": 6.714677416602868e-05, |
|
"loss": 2.4413, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.39231290877603975, |
|
"grad_norm": 75325.578125, |
|
"learning_rate": 6.659429390609984e-05, |
|
"loss": 2.4505, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 0.3960492221929544, |
|
"grad_norm": 70411.6953125, |
|
"learning_rate": 6.603952728411776e-05, |
|
"loss": 2.4358, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 0.3997855356098691, |
|
"grad_norm": 71785.53125, |
|
"learning_rate": 6.54825507358356e-05, |
|
"loss": 2.4271, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 0.4035218490267838, |
|
"grad_norm": 76337.5703125, |
|
"learning_rate": 6.492344100149026e-05, |
|
"loss": 2.4536, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.40725816244369845, |
|
"grad_norm": 65393.2890625, |
|
"learning_rate": 6.436227511522892e-05, |
|
"loss": 2.4404, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 0.4109944758606131, |
|
"grad_norm": 88779.578125, |
|
"learning_rate": 6.379913039449547e-05, |
|
"loss": 2.4277, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 0.41473078927752777, |
|
"grad_norm": 70318.0546875, |
|
"learning_rate": 6.32340844293777e-05, |
|
"loss": 2.4209, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 0.4184671026944424, |
|
"grad_norm": 77500.03125, |
|
"learning_rate": 6.266721507191674e-05, |
|
"loss": 2.4218, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.4222034161113571, |
|
"grad_norm": 89454.5703125, |
|
"learning_rate": 6.209860042538087e-05, |
|
"loss": 2.409, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 0.42593972952827175, |
|
"grad_norm": 72870.5, |
|
"learning_rate": 6.152831883350426e-05, |
|
"loss": 2.3841, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 0.4296760429451864, |
|
"grad_norm": 75137.0703125, |
|
"learning_rate": 6.095644886969283e-05, |
|
"loss": 2.3922, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 0.43341235636210107, |
|
"grad_norm": 64499.1328125, |
|
"learning_rate": 6.038306932619846e-05, |
|
"loss": 2.4042, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.43714866977901573, |
|
"grad_norm": 74131.78125, |
|
"learning_rate": 5.980825920326287e-05, |
|
"loss": 2.3981, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 0.4408849831959304, |
|
"grad_norm": 75569.671875, |
|
"learning_rate": 5.923209769823308e-05, |
|
"loss": 2.4412, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 0.44462129661284505, |
|
"grad_norm": 94198.984375, |
|
"learning_rate": 5.865466419464957e-05, |
|
"loss": 2.4383, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 0.4483576100297597, |
|
"grad_norm": 72894.984375, |
|
"learning_rate": 5.807603825130873e-05, |
|
"loss": 2.3944, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 0.45209392344667443, |
|
"grad_norm": 75193.0234375, |
|
"learning_rate": 5.7496299591301406e-05, |
|
"loss": 2.3847, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 0.4558302368635891, |
|
"grad_norm": 115278.234375, |
|
"learning_rate": 5.6915528091028494e-05, |
|
"loss": 2.3914, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 0.45956655028050375, |
|
"grad_norm": 80114.7578125, |
|
"learning_rate": 5.633380376919567e-05, |
|
"loss": 2.394, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 0.4633028636974184, |
|
"grad_norm": 77552.9921875, |
|
"learning_rate": 5.5751206775788354e-05, |
|
"loss": 2.3919, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 0.46703917711433307, |
|
"grad_norm": 81552.4375, |
|
"learning_rate": 5.516781738102871e-05, |
|
"loss": 2.396, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 0.47077549053124773, |
|
"grad_norm": 89432.1015625, |
|
"learning_rate": 5.458371596431592e-05, |
|
"loss": 2.4214, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 0.4745118039481624, |
|
"grad_norm": 90061.796875, |
|
"learning_rate": 5.3998983003151595e-05, |
|
"loss": 2.3796, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 0.47824811736507705, |
|
"grad_norm": 84587.1953125, |
|
"learning_rate": 5.341369906205154e-05, |
|
"loss": 2.39, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 0.4819844307819917, |
|
"grad_norm": 63710.203125, |
|
"learning_rate": 5.2827944781445583e-05, |
|
"loss": 2.4094, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 0.4857207441989064, |
|
"grad_norm": 69039.109375, |
|
"learning_rate": 5.224180086656693e-05, |
|
"loss": 2.3963, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 0.48945705761582103, |
|
"grad_norm": 90888.671875, |
|
"learning_rate": 5.165534807633261e-05, |
|
"loss": 2.3701, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 0.4931933710327357, |
|
"grad_norm": 75439.6875, |
|
"learning_rate": 5.1068667212216527e-05, |
|
"loss": 2.3588, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 0.49692968444965036, |
|
"grad_norm": 77958.296875, |
|
"learning_rate": 5.048183910711655e-05, |
|
"loss": 2.3613, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 0.5006659978665651, |
|
"grad_norm": 60339.5859375, |
|
"learning_rate": 4.9894944614217454e-05, |
|
"loss": 2.3501, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 0.5044023112834797, |
|
"grad_norm": 88602.6015625, |
|
"learning_rate": 4.930806459585094e-05, |
|
"loss": 2.408, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 0.5081386247003944, |
|
"grad_norm": 82704.09375, |
|
"learning_rate": 4.872127991235435e-05, |
|
"loss": 2.3974, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 0.511874938117309, |
|
"grad_norm": 69181.0625, |
|
"learning_rate": 4.8134671410929855e-05, |
|
"loss": 2.337, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 0.5156112515342237, |
|
"grad_norm": 83461.8984375, |
|
"learning_rate": 4.75483199145052e-05, |
|
"loss": 2.3468, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 0.5193475649511383, |
|
"grad_norm": 70060.1875, |
|
"learning_rate": 4.6962306210598024e-05, |
|
"loss": 2.3686, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 0.523083878368053, |
|
"grad_norm": 80763.5703125, |
|
"learning_rate": 4.637671104018486e-05, |
|
"loss": 2.3697, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 0.5268201917849676, |
|
"grad_norm": 75881.0, |
|
"learning_rate": 4.5791615086576726e-05, |
|
"loss": 2.3789, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 0.5305565052018824, |
|
"grad_norm": 68510.5390625, |
|
"learning_rate": 4.5207098964302556e-05, |
|
"loss": 2.3812, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 0.5342928186187971, |
|
"grad_norm": 90292.1953125, |
|
"learning_rate": 4.462324320800205e-05, |
|
"loss": 2.3558, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 0.5380291320357117, |
|
"grad_norm": 69267.453125, |
|
"learning_rate": 4.4040128261329724e-05, |
|
"loss": 2.3751, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 0.5417654454526264, |
|
"grad_norm": 85016.8203125, |
|
"learning_rate": 4.345783446587135e-05, |
|
"loss": 2.3635, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 0.545501758869541, |
|
"grad_norm": 103424.7109375, |
|
"learning_rate": 4.2876442050074436e-05, |
|
"loss": 2.3367, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 0.5492380722864557, |
|
"grad_norm": 83033.2265625, |
|
"learning_rate": 4.229603111819437e-05, |
|
"loss": 2.371, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 0.5529743857033703, |
|
"grad_norm": 89505.2890625, |
|
"learning_rate": 4.1716681639257725e-05, |
|
"loss": 2.3537, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 0.556710699120285, |
|
"grad_norm": 86427.5390625, |
|
"learning_rate": 4.113847343604404e-05, |
|
"loss": 2.3518, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 0.5604470125371996, |
|
"grad_norm": 95231.3203125, |
|
"learning_rate": 4.05614861740879e-05, |
|
"loss": 2.3475, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 0.5641833259541144, |
|
"grad_norm": 73089.09375, |
|
"learning_rate": 3.998579935070253e-05, |
|
"loss": 2.3467, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 0.567919639371029, |
|
"grad_norm": 82384.2109375, |
|
"learning_rate": 3.941149228402674e-05, |
|
"loss": 2.3278, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 0.5716559527879437, |
|
"grad_norm": 84582.859375, |
|
"learning_rate": 3.883864410209648e-05, |
|
"loss": 2.3535, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 0.5753922662048583, |
|
"grad_norm": 69513.640625, |
|
"learning_rate": 3.826733373194247e-05, |
|
"loss": 2.3257, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 0.579128579621773, |
|
"grad_norm": 73027.9453125, |
|
"learning_rate": 3.769763988871572e-05, |
|
"loss": 2.3447, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 0.5828648930386876, |
|
"grad_norm": 71039.1171875, |
|
"learning_rate": 3.712964106484218e-05, |
|
"loss": 2.3147, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 0.5866012064556023, |
|
"grad_norm": 80346.1796875, |
|
"learning_rate": 3.6563415519207985e-05, |
|
"loss": 2.3144, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 0.590337519872517, |
|
"grad_norm": 84110.34375, |
|
"learning_rate": 3.599904126637692e-05, |
|
"loss": 2.3333, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 0.5940738332894316, |
|
"grad_norm": 82063.09375, |
|
"learning_rate": 3.54365960658417e-05, |
|
"loss": 2.3283, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 0.5978101467063464, |
|
"grad_norm": 84510.8984375, |
|
"learning_rate": 3.487615741131013e-05, |
|
"loss": 2.3397, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 0.601546460123261, |
|
"grad_norm": 74272.46875, |
|
"learning_rate": 3.4317802520028117e-05, |
|
"loss": 2.326, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 0.6052827735401757, |
|
"grad_norm": 73607.890625, |
|
"learning_rate": 3.37616083221406e-05, |
|
"loss": 2.3238, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 0.6090190869570903, |
|
"grad_norm": 84208.0078125, |
|
"learning_rate": 3.320765145009216e-05, |
|
"loss": 2.3112, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 0.612755400374005, |
|
"grad_norm": 80192.90625, |
|
"learning_rate": 3.265600822806866e-05, |
|
"loss": 2.3361, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 0.6164917137909196, |
|
"grad_norm": 83479.984375, |
|
"learning_rate": 3.2106754661481086e-05, |
|
"loss": 2.3383, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 0.6202280272078343, |
|
"grad_norm": 88273.1015625, |
|
"learning_rate": 3.1559966426493804e-05, |
|
"loss": 2.3059, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 0.6239643406247489, |
|
"grad_norm": 98511.15625, |
|
"learning_rate": 3.101571885959766e-05, |
|
"loss": 2.3113, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 0.6277006540416636, |
|
"grad_norm": 76336.8046875, |
|
"learning_rate": 3.047408694723032e-05, |
|
"loss": 2.3254, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 0.6314369674585782, |
|
"grad_norm": 80944.7734375, |
|
"learning_rate": 2.99351453154444e-05, |
|
"loss": 2.3428, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 0.635173280875493, |
|
"grad_norm": 72115.8125, |
|
"learning_rate": 2.9398968219625706e-05, |
|
"loss": 2.3156, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 0.6389095942924077, |
|
"grad_norm": 87770.3125, |
|
"learning_rate": 2.8865629534262216e-05, |
|
"loss": 2.3041, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 0.6426459077093223, |
|
"grad_norm": 80694.6171875, |
|
"learning_rate": 2.83352027427657e-05, |
|
"loss": 2.3478, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 0.646382221126237, |
|
"grad_norm": 77460.1171875, |
|
"learning_rate": 2.7807760927347102e-05, |
|
"loss": 2.302, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 0.6501185345431516, |
|
"grad_norm": 82131.8671875, |
|
"learning_rate": 2.7283376758947333e-05, |
|
"loss": 2.3156, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 0.6538548479600663, |
|
"grad_norm": 90870.65625, |
|
"learning_rate": 2.6762122487224684e-05, |
|
"loss": 2.331, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 0.6575911613769809, |
|
"grad_norm": 98921.2109375, |
|
"learning_rate": 2.624406993060019e-05, |
|
"loss": 2.3073, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 0.6613274747938956, |
|
"grad_norm": 73837.2109375, |
|
"learning_rate": 2.5729290466362565e-05, |
|
"loss": 2.301, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 0.6650637882108102, |
|
"grad_norm": 82211.34375, |
|
"learning_rate": 2.521785502083384e-05, |
|
"loss": 2.3103, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 0.668800101627725, |
|
"grad_norm": 86962.1484375, |
|
"learning_rate": 2.4709834059597026e-05, |
|
"loss": 2.3154, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 0.6725364150446396, |
|
"grad_norm": 79155.3515625, |
|
"learning_rate": 2.420529757778746e-05, |
|
"loss": 2.348, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 0.6762727284615543, |
|
"grad_norm": 97560.8125, |
|
"learning_rate": 2.3704315090448886e-05, |
|
"loss": 2.3417, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 0.6800090418784689, |
|
"grad_norm": 124897.3203125, |
|
"learning_rate": 2.3206955622955623e-05, |
|
"loss": 2.3122, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 0.6837453552953836, |
|
"grad_norm": 88038.5625, |
|
"learning_rate": 2.271328770150227e-05, |
|
"loss": 2.311, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 0.6874816687122982, |
|
"grad_norm": 89480.2890625, |
|
"learning_rate": 2.222337934366214e-05, |
|
"loss": 2.3002, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 0.6912179821292129, |
|
"grad_norm": 81038.4921875, |
|
"learning_rate": 2.173729804901581e-05, |
|
"loss": 2.3231, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 0.6949542955461276, |
|
"grad_norm": 91312.8984375, |
|
"learning_rate": 2.1255110789851084e-05, |
|
"loss": 2.3125, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 0.6986906089630422, |
|
"grad_norm": 84064.609375, |
|
"learning_rate": 2.0776884001935437e-05, |
|
"loss": 2.3379, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 0.702426922379957, |
|
"grad_norm": 87531.078125, |
|
"learning_rate": 2.0302683575362608e-05, |
|
"loss": 2.2962, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 0.7061632357968716, |
|
"grad_norm": 84962.34375, |
|
"learning_rate": 1.983257484547421e-05, |
|
"loss": 2.3032, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 0.7098995492137863, |
|
"grad_norm": 88890.75, |
|
"learning_rate": 1.9366622583857803e-05, |
|
"loss": 2.3138, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 0.7136358626307009, |
|
"grad_norm": 93013.0, |
|
"learning_rate": 1.890489098942267e-05, |
|
"loss": 2.3128, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 0.7173721760476156, |
|
"grad_norm": 79075.3515625, |
|
"learning_rate": 1.8447443679554567e-05, |
|
"loss": 2.3013, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 0.7211084894645302, |
|
"grad_norm": 94954.1640625, |
|
"learning_rate": 1.7994343681350422e-05, |
|
"loss": 2.2775, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 0.7248448028814449, |
|
"grad_norm": 79968.1171875, |
|
"learning_rate": 1.7545653422934537e-05, |
|
"loss": 2.2979, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 0.7285811162983595, |
|
"grad_norm": 90937.109375, |
|
"learning_rate": 1.7101434724857147e-05, |
|
"loss": 2.3466, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 0.7323174297152742, |
|
"grad_norm": 85592.625, |
|
"learning_rate": 1.6661748791576855e-05, |
|
"loss": 2.3029, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 0.7360537431321889, |
|
"grad_norm": 95187.1484375, |
|
"learning_rate": 1.622665620302789e-05, |
|
"loss": 2.3074, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 0.7397900565491036, |
|
"grad_norm": 87762.859375, |
|
"learning_rate": 1.5796216906273388e-05, |
|
"loss": 2.2883, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 0.7435263699660182, |
|
"grad_norm": 102495.0234375, |
|
"learning_rate": 1.537049020724581e-05, |
|
"loss": 2.2852, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 0.7472626833829329, |
|
"grad_norm": 92540.0078125, |
|
"learning_rate": 1.4949534762575912e-05, |
|
"loss": 2.2914, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 0.7509989967998476, |
|
"grad_norm": 81275.109375, |
|
"learning_rate": 1.4533408571510904e-05, |
|
"loss": 2.3119, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 0.7547353102167622, |
|
"grad_norm": 82922.2578125, |
|
"learning_rate": 1.4122168967923366e-05, |
|
"loss": 2.3118, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 0.7584716236336769, |
|
"grad_norm": 107727.1171875, |
|
"learning_rate": 1.3715872612411806e-05, |
|
"loss": 2.286, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 0.7622079370505915, |
|
"grad_norm": 84810.40625, |
|
"learning_rate": 1.3314575484493946e-05, |
|
"loss": 2.3174, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 0.7659442504675062, |
|
"grad_norm": 78007.40625, |
|
"learning_rate": 1.2918332874893869e-05, |
|
"loss": 2.3229, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 0.7696805638844209, |
|
"grad_norm": 95875.2421875, |
|
"learning_rate": 1.252719937792397e-05, |
|
"loss": 2.2979, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 0.7734168773013356, |
|
"grad_norm": 83561.375, |
|
"learning_rate": 1.2141228883963118e-05, |
|
"loss": 2.2667, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 0.7771531907182502, |
|
"grad_norm": 73712.7578125, |
|
"learning_rate": 1.1760474572031538e-05, |
|
"loss": 2.2876, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 0.7808895041351649, |
|
"grad_norm": 90907.0234375, |
|
"learning_rate": 1.1384988902463806e-05, |
|
"loss": 2.2799, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 0.7846258175520795, |
|
"grad_norm": 84601.046875, |
|
"learning_rate": 1.101482360968088e-05, |
|
"loss": 2.3109, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 0.7883621309689942, |
|
"grad_norm": 89593.640625, |
|
"learning_rate": 1.0650029695062103e-05, |
|
"loss": 2.3071, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 0.7920984443859088, |
|
"grad_norm": 85944.5390625, |
|
"learning_rate": 1.0290657419918308e-05, |
|
"loss": 2.3275, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 0.7958347578028235, |
|
"grad_norm": 71977.34375, |
|
"learning_rate": 9.936756298566724e-06, |
|
"loss": 2.3108, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 0.7995710712197382, |
|
"grad_norm": 91883.7109375, |
|
"learning_rate": 9.588375091508945e-06, |
|
"loss": 2.2756, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 0.8033073846366529, |
|
"grad_norm": 81686.0078125, |
|
"learning_rate": 9.245561798712704e-06, |
|
"loss": 2.3064, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 0.8070436980535676, |
|
"grad_norm": 86408.875, |
|
"learning_rate": 8.908363652998414e-06, |
|
"loss": 2.3031, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 0.8107800114704822, |
|
"grad_norm": 83010.9765625, |
|
"learning_rate": 8.576827113531427e-06, |
|
"loss": 2.2797, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 0.8145163248873969, |
|
"grad_norm": 74965.84375, |
|
"learning_rate": 8.25099785942094e-06, |
|
"loss": 2.3011, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 0.8182526383043115, |
|
"grad_norm": 91110.0859375, |
|
"learning_rate": 7.930920783426265e-06, |
|
"loss": 2.2572, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 0.8219889517212262, |
|
"grad_norm": 96117.6640625, |
|
"learning_rate": 7.616639985771556e-06, |
|
"loss": 2.276, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 0.8257252651381408, |
|
"grad_norm": 66883.046875, |
|
"learning_rate": 7.30819876806963e-06, |
|
"loss": 2.2586, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 0.8294615785550555, |
|
"grad_norm": 86336.21875, |
|
"learning_rate": 7.005639627355881e-06, |
|
"loss": 2.2796, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 0.8331978919719701, |
|
"grad_norm": 100607.5703125, |
|
"learning_rate": 6.709004250233097e-06, |
|
"loss": 2.2635, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 0.8369342053888849, |
|
"grad_norm": 90908.2890625, |
|
"learning_rate": 6.418333507127788e-06, |
|
"loss": 2.2598, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 0.8406705188057995, |
|
"grad_norm": 101526.234375, |
|
"learning_rate": 6.133667446659131e-06, |
|
"loss": 2.2648, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 0.8444068322227142, |
|
"grad_norm": 87100.0234375, |
|
"learning_rate": 5.855045290121047e-06, |
|
"loss": 2.3005, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 0.8481431456396288, |
|
"grad_norm": 95362.3984375, |
|
"learning_rate": 5.58250542607826e-06, |
|
"loss": 2.2747, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 0.8518794590565435, |
|
"grad_norm": 105692.5625, |
|
"learning_rate": 5.316085405077159e-06, |
|
"loss": 2.2808, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 0.8556157724734582, |
|
"grad_norm": 99888.703125, |
|
"learning_rate": 5.055821934472071e-06, |
|
"loss": 2.2707, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 0.8593520858903728, |
|
"grad_norm": 89097.7421875, |
|
"learning_rate": 4.801750873367728e-06, |
|
"loss": 2.2996, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 0.8630883993072875, |
|
"grad_norm": 94061.9453125, |
|
"learning_rate": 4.553907227678594e-06, |
|
"loss": 2.2585, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 0.8668247127242021, |
|
"grad_norm": 89231.015625, |
|
"learning_rate": 4.312325145305729e-06, |
|
"loss": 2.2712, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 0.8705610261411169, |
|
"grad_norm": 92370.8125, |
|
"learning_rate": 4.077037911431913e-06, |
|
"loss": 2.277, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 0.8742973395580315, |
|
"grad_norm": 95057.8828125, |
|
"learning_rate": 3.84807794393563e-06, |
|
"loss": 2.2705, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 0.8780336529749462, |
|
"grad_norm": 76279.6015625, |
|
"learning_rate": 3.6254767889244813e-06, |
|
"loss": 2.2692, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 0.8817699663918608, |
|
"grad_norm": 99320.4375, |
|
"learning_rate": 3.4092651163888154e-06, |
|
"loss": 2.2841, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 0.8855062798087755, |
|
"grad_norm": 97605.59375, |
|
"learning_rate": 3.1994727159759986e-06, |
|
"loss": 2.2942, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 0.8892425932256901, |
|
"grad_norm": 87163.4765625, |
|
"learning_rate": 2.996128492885969e-06, |
|
"loss": 2.2918, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 0.8929789066426048, |
|
"grad_norm": 78705.2421875, |
|
"learning_rate": 2.7992604638886975e-06, |
|
"loss": 2.259, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 0.8967152200595194, |
|
"grad_norm": 94322.765625, |
|
"learning_rate": 2.608895753464069e-06, |
|
"loss": 2.2589, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 0.9004515334764341, |
|
"grad_norm": 73016.4140625, |
|
"learning_rate": 2.4250605900646204e-06, |
|
"loss": 2.2938, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 0.9041878468933489, |
|
"grad_norm": 90040.5546875, |
|
"learning_rate": 2.2477803025018228e-06, |
|
"loss": 2.2982, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 0.9079241603102635, |
|
"grad_norm": 102110.484375, |
|
"learning_rate": 2.0770793164562373e-06, |
|
"loss": 2.2578, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 0.9116604737271782, |
|
"grad_norm": 78370.1796875, |
|
"learning_rate": 1.9129811511121697e-06, |
|
"loss": 2.2854, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 0.9153967871440928, |
|
"grad_norm": 87597.2578125, |
|
"learning_rate": 1.7555084159172176e-06, |
|
"loss": 2.2657, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 0.9191331005610075, |
|
"grad_norm": 89568.2109375, |
|
"learning_rate": 1.6046828074670894e-06, |
|
"loss": 2.2674, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 0.9228694139779221, |
|
"grad_norm": 80686.5234375, |
|
"learning_rate": 1.4605251065162973e-06, |
|
"loss": 2.2634, |
|
"step": 123500 |
|
}, |
|
{ |
|
"epoch": 0.9266057273948368, |
|
"grad_norm": 76351.1796875, |
|
"learning_rate": 1.3230551751149568e-06, |
|
"loss": 2.2889, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 0.9303420408117514, |
|
"grad_norm": 82895.4921875, |
|
"learning_rate": 1.1922919538721876e-06, |
|
"loss": 2.2188, |
|
"step": 124500 |
|
}, |
|
{ |
|
"epoch": 0.9340783542286661, |
|
"grad_norm": 81276.09375, |
|
"learning_rate": 1.0682534593464967e-06, |
|
"loss": 2.268, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 0.9378146676455807, |
|
"grad_norm": 83888.390625, |
|
"learning_rate": 9.509567815634446e-07, |
|
"loss": 2.2762, |
|
"step": 125500 |
|
}, |
|
{ |
|
"epoch": 0.9415509810624955, |
|
"grad_norm": 87934.8125, |
|
"learning_rate": 8.404180816609819e-07, |
|
"loss": 2.2759, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 0.9452872944794101, |
|
"grad_norm": 89569.15625, |
|
"learning_rate": 7.366525896627841e-07, |
|
"loss": 2.2696, |
|
"step": 126500 |
|
}, |
|
{ |
|
"epoch": 0.9490236078963248, |
|
"grad_norm": 79137.96875, |
|
"learning_rate": 6.396746023798139e-07, |
|
"loss": 2.2951, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 0.9527599213132394, |
|
"grad_norm": 74010.8984375, |
|
"learning_rate": 5.49497481440564e-07, |
|
"loss": 2.2875, |
|
"step": 127500 |
|
}, |
|
{ |
|
"epoch": 0.9564962347301541, |
|
"grad_norm": 70001.59375, |
|
"learning_rate": 4.661336514500625e-07, |
|
"loss": 2.2514, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 0.9602325481470688, |
|
"grad_norm": 86343.359375, |
|
"learning_rate": 3.895945982779925e-07, |
|
"loss": 2.3027, |
|
"step": 128500 |
|
}, |
|
{ |
|
"epoch": 0.9639688615639834, |
|
"grad_norm": 92609.6171875, |
|
"learning_rate": 3.19890867476208e-07, |
|
"loss": 2.2857, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 0.9677051749808981, |
|
"grad_norm": 91932.953125, |
|
"learning_rate": 2.570320628257461e-07, |
|
"loss": 2.2814, |
|
"step": 129500 |
|
}, |
|
{ |
|
"epoch": 0.9714414883978127, |
|
"grad_norm": 81636.1015625, |
|
"learning_rate": 2.010268450136077e-07, |
|
"loss": 2.2758, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 0.9751778018147275, |
|
"grad_norm": 79906.6015625, |
|
"learning_rate": 1.518829304395175e-07, |
|
"loss": 2.2944, |
|
"step": 130500 |
|
}, |
|
{ |
|
"epoch": 0.9789141152316421, |
|
"grad_norm": 76946.6484375, |
|
"learning_rate": 1.0960709015272464e-07, |
|
"loss": 2.2305, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 0.9826504286485568, |
|
"grad_norm": 81980.828125, |
|
"learning_rate": 7.420514891911535e-08, |
|
"loss": 2.2803, |
|
"step": 131500 |
|
}, |
|
{ |
|
"epoch": 0.9863867420654714, |
|
"grad_norm": 90421.125, |
|
"learning_rate": 4.5681984418666224e-08, |
|
"loss": 2.2767, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 0.9901230554823861, |
|
"grad_norm": 81403.0390625, |
|
"learning_rate": 2.404152657338732e-08, |
|
"loss": 2.2399, |
|
"step": 132500 |
|
}, |
|
{ |
|
"epoch": 0.9938593688993007, |
|
"grad_norm": 98386.671875, |
|
"learning_rate": 9.286757005871893e-09, |
|
"loss": 2.2422, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 0.9975956823162154, |
|
"grad_norm": 95571.890625, |
|
"learning_rate": 1.4197086284917227e-09, |
|
"loss": 2.2956, |
|
"step": 133500 |
|
}, |
|
{ |
|
"epoch": 1.0013376002032555, |
|
"grad_norm": 99771.578125, |
|
"learning_rate": 4.9894944614217454e-05, |
|
"loss": 2.211, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 1.00507391362017, |
|
"grad_norm": 105597.21875, |
|
"learning_rate": 4.960149774176924e-05, |
|
"loss": 2.2131, |
|
"step": 134500 |
|
}, |
|
{ |
|
"epoch": 1.0088102270370847, |
|
"grad_norm": 87634.5, |
|
"learning_rate": 4.930806459585094e-05, |
|
"loss": 2.2453, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 1.0125465404539995, |
|
"grad_norm": 86854.4375, |
|
"learning_rate": 4.901465528385536e-05, |
|
"loss": 2.1825, |
|
"step": 135500 |
|
}, |
|
{ |
|
"epoch": 1.016282853870914, |
|
"grad_norm": 75373.640625, |
|
"learning_rate": 4.872127991235435e-05, |
|
"loss": 2.1953, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 1.0200191672878287, |
|
"grad_norm": 89905.53125, |
|
"learning_rate": 4.842794858675067e-05, |
|
"loss": 2.223, |
|
"step": 136500 |
|
}, |
|
{ |
|
"epoch": 1.0237554807047435, |
|
"grad_norm": 74529.2734375, |
|
"learning_rate": 4.8134671410929855e-05, |
|
"loss": 2.2232, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 1.0274917941216581, |
|
"grad_norm": 86540.96875, |
|
"learning_rate": 4.7841458486912313e-05, |
|
"loss": 2.229, |
|
"step": 137500 |
|
}, |
|
{ |
|
"epoch": 1.0312281075385727, |
|
"grad_norm": 79359.9140625, |
|
"learning_rate": 4.75483199145052e-05, |
|
"loss": 2.2484, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 1.0349644209554874, |
|
"grad_norm": 88735.703125, |
|
"learning_rate": 4.725526579095468e-05, |
|
"loss": 2.2141, |
|
"step": 138500 |
|
}, |
|
{ |
|
"epoch": 1.0387007343724022, |
|
"grad_norm": 84116.59375, |
|
"learning_rate": 4.6962306210598024e-05, |
|
"loss": 2.2362, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 1.0424370477893168, |
|
"grad_norm": 85959.921875, |
|
"learning_rate": 4.66694512645159e-05, |
|
"loss": 2.2703, |
|
"step": 139500 |
|
}, |
|
{ |
|
"epoch": 1.0461733612062314, |
|
"grad_norm": 76734.796875, |
|
"learning_rate": 4.637671104018486e-05, |
|
"loss": 2.2324, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 1.049909674623146, |
|
"grad_norm": 82724.0078125, |
|
"learning_rate": 4.608409562112983e-05, |
|
"loss": 2.2406, |
|
"step": 140500 |
|
}, |
|
{ |
|
"epoch": 1.0536459880400608, |
|
"grad_norm": 85404.375, |
|
"learning_rate": 4.5791615086576726e-05, |
|
"loss": 2.2049, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 1.0573823014569754, |
|
"grad_norm": 73961.2890625, |
|
"learning_rate": 4.549927951110542e-05, |
|
"loss": 2.2176, |
|
"step": 141500 |
|
}, |
|
{ |
|
"epoch": 1.06111861487389, |
|
"grad_norm": 83664.9921875, |
|
"learning_rate": 4.5207098964302556e-05, |
|
"loss": 2.2473, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 1.0648549282908046, |
|
"grad_norm": 89258.0390625, |
|
"learning_rate": 4.491508351041477e-05, |
|
"loss": 2.227, |
|
"step": 142500 |
|
}, |
|
{ |
|
"epoch": 1.0685912417077195, |
|
"grad_norm": 99317.4296875, |
|
"learning_rate": 4.462324320800205e-05, |
|
"loss": 2.2237, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 1.072327555124634, |
|
"grad_norm": 90441.765625, |
|
"learning_rate": 4.433158810959122e-05, |
|
"loss": 2.2321, |
|
"step": 143500 |
|
}, |
|
{ |
|
"epoch": 1.0760638685415487, |
|
"grad_norm": 69907.328125, |
|
"learning_rate": 4.4040128261329724e-05, |
|
"loss": 2.2319, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 1.0798001819584635, |
|
"grad_norm": 87409.109375, |
|
"learning_rate": 4.3748873702639536e-05, |
|
"loss": 2.2581, |
|
"step": 144500 |
|
}, |
|
{ |
|
"epoch": 1.083536495375378, |
|
"grad_norm": 91680.8828125, |
|
"learning_rate": 4.345783446587135e-05, |
|
"loss": 2.2381, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 1.0872728087922927, |
|
"grad_norm": 84339.4140625, |
|
"learning_rate": 4.316702057595907e-05, |
|
"loss": 2.2398, |
|
"step": 145500 |
|
}, |
|
{ |
|
"epoch": 1.0910091222092073, |
|
"grad_norm": 99060.578125, |
|
"learning_rate": 4.2876442050074436e-05, |
|
"loss": 2.2251, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 1.0947454356261221, |
|
"grad_norm": 103543.2421875, |
|
"learning_rate": 4.258610889728198e-05, |
|
"loss": 2.2536, |
|
"step": 146500 |
|
}, |
|
{ |
|
"epoch": 1.0984817490430367, |
|
"grad_norm": 89170.1953125, |
|
"learning_rate": 4.229603111819437e-05, |
|
"loss": 2.1822, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 1.1022180624599514, |
|
"grad_norm": 95436.765625, |
|
"learning_rate": 4.2006218704627756e-05, |
|
"loss": 2.2338, |
|
"step": 147500 |
|
}, |
|
{ |
|
"epoch": 1.105954375876866, |
|
"grad_norm": 86420.7265625, |
|
"learning_rate": 4.1716681639257725e-05, |
|
"loss": 2.2153, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 1.1096906892937808, |
|
"grad_norm": 76673.3359375, |
|
"learning_rate": 4.1427429895275446e-05, |
|
"loss": 2.2289, |
|
"step": 148500 |
|
}, |
|
{ |
|
"epoch": 1.1134270027106954, |
|
"grad_norm": 101906.8515625, |
|
"learning_rate": 4.113847343604404e-05, |
|
"loss": 2.2297, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 1.11716331612761, |
|
"grad_norm": 88695.703125, |
|
"learning_rate": 4.084982221475554e-05, |
|
"loss": 2.2219, |
|
"step": 149500 |
|
}, |
|
{ |
|
"epoch": 1.1208996295445246, |
|
"grad_norm": 100434.46875, |
|
"learning_rate": 4.05614861740879e-05, |
|
"loss": 2.2344, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 1.1246359429614394, |
|
"grad_norm": 90131.671875, |
|
"learning_rate": 4.0273475245862594e-05, |
|
"loss": 2.2137, |
|
"step": 150500 |
|
}, |
|
{ |
|
"epoch": 1.128372256378354, |
|
"grad_norm": 84749.8828125, |
|
"learning_rate": 3.998579935070253e-05, |
|
"loss": 2.2358, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 1.1321085697952686, |
|
"grad_norm": 80951.0546875, |
|
"learning_rate": 3.969846839769027e-05, |
|
"loss": 2.2327, |
|
"step": 151500 |
|
}, |
|
{ |
|
"epoch": 1.1358448832121835, |
|
"grad_norm": 82394.359375, |
|
"learning_rate": 3.941149228402674e-05, |
|
"loss": 2.2448, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 1.139581196629098, |
|
"grad_norm": 108267.5546875, |
|
"learning_rate": 3.912488089469036e-05, |
|
"loss": 2.211, |
|
"step": 152500 |
|
}, |
|
{ |
|
"epoch": 1.1433175100460127, |
|
"grad_norm": 73743.703125, |
|
"learning_rate": 3.883864410209648e-05, |
|
"loss": 2.2159, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 1.1470538234629273, |
|
"grad_norm": 105991.84375, |
|
"learning_rate": 3.855279176575732e-05, |
|
"loss": 2.2191, |
|
"step": 153500 |
|
}, |
|
{ |
|
"epoch": 1.150790136879842, |
|
"grad_norm": 88528.921875, |
|
"learning_rate": 3.826733373194247e-05, |
|
"loss": 2.2397, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 1.1545264502967567, |
|
"grad_norm": 99099.5390625, |
|
"learning_rate": 3.798227983333955e-05, |
|
"loss": 2.2083, |
|
"step": 154500 |
|
}, |
|
{ |
|
"epoch": 1.1582627637136713, |
|
"grad_norm": 96959.421875, |
|
"learning_rate": 3.769763988871572e-05, |
|
"loss": 2.2146, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 1.1619990771305861, |
|
"grad_norm": 96111.9375, |
|
"learning_rate": 3.741342370257932e-05, |
|
"loss": 2.2421, |
|
"step": 155500 |
|
}, |
|
{ |
|
"epoch": 1.1657353905475007, |
|
"grad_norm": 80256.734375, |
|
"learning_rate": 3.712964106484218e-05, |
|
"loss": 2.2199, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 1.1694717039644154, |
|
"grad_norm": 84992.546875, |
|
"learning_rate": 3.684630175048249e-05, |
|
"loss": 2.184, |
|
"step": 156500 |
|
}, |
|
{ |
|
"epoch": 1.17320801738133, |
|
"grad_norm": 94191.3671875, |
|
"learning_rate": 3.6563415519207985e-05, |
|
"loss": 2.2505, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 1.1769443307982446, |
|
"grad_norm": 76174.2421875, |
|
"learning_rate": 3.62809921151198e-05, |
|
"loss": 2.2143, |
|
"step": 157500 |
|
}, |
|
{ |
|
"epoch": 1.1806806442151594, |
|
"grad_norm": 89977.34375, |
|
"learning_rate": 3.599904126637692e-05, |
|
"loss": 2.2338, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 1.184416957632074, |
|
"grad_norm": 107886.5390625, |
|
"learning_rate": 3.571757268486096e-05, |
|
"loss": 2.2516, |
|
"step": 158500 |
|
}, |
|
{ |
|
"epoch": 1.1881532710489886, |
|
"grad_norm": 60902.64453125, |
|
"learning_rate": 3.54365960658417e-05, |
|
"loss": 2.197, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 1.1918895844659034, |
|
"grad_norm": 91395.515625, |
|
"learning_rate": 3.5156121087643154e-05, |
|
"loss": 2.2238, |
|
"step": 159500 |
|
}, |
|
{ |
|
"epoch": 1.195625897882818, |
|
"grad_norm": 90301.53125, |
|
"learning_rate": 3.487615741131013e-05, |
|
"loss": 2.2317, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 1.1993622112997326, |
|
"grad_norm": 91005.0625, |
|
"learning_rate": 3.459671468027553e-05, |
|
"loss": 2.2019, |
|
"step": 160500 |
|
}, |
|
{ |
|
"epoch": 1.2030985247166472, |
|
"grad_norm": 81654.03125, |
|
"learning_rate": 3.4317802520028117e-05, |
|
"loss": 2.2199, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 1.206834838133562, |
|
"grad_norm": 100322.15625, |
|
"learning_rate": 3.403943053778097e-05, |
|
"loss": 2.2668, |
|
"step": 161500 |
|
}, |
|
{ |
|
"epoch": 1.2105711515504767, |
|
"grad_norm": 84712.84375, |
|
"learning_rate": 3.37616083221406e-05, |
|
"loss": 2.215, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 1.2143074649673913, |
|
"grad_norm": 93723.7734375, |
|
"learning_rate": 3.348434544277663e-05, |
|
"loss": 2.2052, |
|
"step": 162500 |
|
}, |
|
{ |
|
"epoch": 1.218043778384306, |
|
"grad_norm": 98371.0859375, |
|
"learning_rate": 3.320765145009216e-05, |
|
"loss": 2.1767, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 1.2217800918012207, |
|
"grad_norm": 78824.3359375, |
|
"learning_rate": 3.293153587489488e-05, |
|
"loss": 2.2341, |
|
"step": 163500 |
|
}, |
|
{ |
|
"epoch": 1.2255164052181353, |
|
"grad_norm": 93536.6640625, |
|
"learning_rate": 3.265600822806866e-05, |
|
"loss": 2.1954, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 1.22925271863505, |
|
"grad_norm": 89381.3203125, |
|
"learning_rate": 3.2381078000245965e-05, |
|
"loss": 2.2026, |
|
"step": 164500 |
|
}, |
|
{ |
|
"epoch": 1.2329890320519645, |
|
"grad_norm": 110292.1484375, |
|
"learning_rate": 3.2106754661481086e-05, |
|
"loss": 2.2022, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 1.2367253454688794, |
|
"grad_norm": 89096.8359375, |
|
"learning_rate": 3.183304766092374e-05, |
|
"loss": 2.1618, |
|
"step": 165500 |
|
}, |
|
{ |
|
"epoch": 1.240461658885794, |
|
"grad_norm": 97123.8671875, |
|
"learning_rate": 3.1559966426493804e-05, |
|
"loss": 2.192, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 1.2441979723027086, |
|
"grad_norm": 86452.890625, |
|
"learning_rate": 3.128752036455631e-05, |
|
"loss": 2.203, |
|
"step": 166500 |
|
}, |
|
{ |
|
"epoch": 1.2479342857196234, |
|
"grad_norm": 99330.265625, |
|
"learning_rate": 3.101571885959766e-05, |
|
"loss": 2.1653, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 1.251670599136538, |
|
"grad_norm": 82720.1015625, |
|
"learning_rate": 3.0744571273902345e-05, |
|
"loss": 2.1893, |
|
"step": 167500 |
|
}, |
|
{ |
|
"epoch": 1.2554069125534526, |
|
"grad_norm": 98328.6328125, |
|
"learning_rate": 3.047408694723032e-05, |
|
"loss": 2.2091, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 1.2591432259703672, |
|
"grad_norm": 89367.40625, |
|
"learning_rate": 3.0204275196495414e-05, |
|
"loss": 2.1961, |
|
"step": 168500 |
|
}, |
|
{ |
|
"epoch": 1.262879539387282, |
|
"grad_norm": 95675.3203125, |
|
"learning_rate": 2.99351453154444e-05, |
|
"loss": 2.1824, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 1.2666158528041966, |
|
"grad_norm": 83745.3359375, |
|
"learning_rate": 2.9666706574336834e-05, |
|
"loss": 2.1793, |
|
"step": 169500 |
|
}, |
|
{ |
|
"epoch": 1.2703521662211112, |
|
"grad_norm": 92235.09375, |
|
"learning_rate": 2.9398968219625706e-05, |
|
"loss": 2.2034, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 1.274088479638026, |
|
"grad_norm": 79515.6328125, |
|
"learning_rate": 2.913193947363908e-05, |
|
"loss": 2.2091, |
|
"step": 170500 |
|
}, |
|
{ |
|
"epoch": 1.2778247930549407, |
|
"grad_norm": 92163.1640625, |
|
"learning_rate": 2.8865629534262216e-05, |
|
"loss": 2.2082, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 1.2815611064718553, |
|
"grad_norm": 90488.1484375, |
|
"learning_rate": 2.8600047574620998e-05, |
|
"loss": 2.1976, |
|
"step": 171500 |
|
}, |
|
{ |
|
"epoch": 1.2852974198887699, |
|
"grad_norm": 93587.21875, |
|
"learning_rate": 2.83352027427657e-05, |
|
"loss": 2.2092, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 1.2890337333056845, |
|
"grad_norm": 90367.0546875, |
|
"learning_rate": 2.8071104161356087e-05, |
|
"loss": 2.2086, |
|
"step": 172500 |
|
}, |
|
{ |
|
"epoch": 1.2927700467225993, |
|
"grad_norm": 94576.171875, |
|
"learning_rate": 2.7807760927347102e-05, |
|
"loss": 2.2071, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 1.296506360139514, |
|
"grad_norm": 84362.703125, |
|
"learning_rate": 2.7545182111675466e-05, |
|
"loss": 2.1646, |
|
"step": 173500 |
|
}, |
|
{ |
|
"epoch": 1.3002426735564285, |
|
"grad_norm": 99929.3984375, |
|
"learning_rate": 2.7283376758947333e-05, |
|
"loss": 2.1721, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 1.3039789869733434, |
|
"grad_norm": 99786.953125, |
|
"learning_rate": 2.7022353887126668e-05, |
|
"loss": 2.2208, |
|
"step": 174500 |
|
}, |
|
{ |
|
"epoch": 1.307715300390258, |
|
"grad_norm": 118692.8046875, |
|
"learning_rate": 2.6762122487224684e-05, |
|
"loss": 2.2036, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 1.3114516138071726, |
|
"grad_norm": 88971.234375, |
|
"learning_rate": 2.6502691522990032e-05, |
|
"loss": 2.2018, |
|
"step": 175500 |
|
}, |
|
{ |
|
"epoch": 1.3151879272240872, |
|
"grad_norm": 106580.484375, |
|
"learning_rate": 2.624406993060019e-05, |
|
"loss": 2.1668, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 1.318924240641002, |
|
"grad_norm": 94986.0390625, |
|
"learning_rate": 2.598626661835354e-05, |
|
"loss": 2.1782, |
|
"step": 176500 |
|
}, |
|
{ |
|
"epoch": 1.3226605540579166, |
|
"grad_norm": 87692.828125, |
|
"learning_rate": 2.5729290466362565e-05, |
|
"loss": 2.251, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 1.3263968674748312, |
|
"grad_norm": 93603.875, |
|
"learning_rate": 2.5473150326248006e-05, |
|
"loss": 2.2143, |
|
"step": 177500 |
|
}, |
|
{ |
|
"epoch": 1.330133180891746, |
|
"grad_norm": 98669.8828125, |
|
"learning_rate": 2.521785502083384e-05, |
|
"loss": 2.1962, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 1.3338694943086606, |
|
"grad_norm": 77537.9609375, |
|
"learning_rate": 2.4963413343843522e-05, |
|
"loss": 2.1533, |
|
"step": 178500 |
|
}, |
|
{ |
|
"epoch": 1.3376058077255752, |
|
"grad_norm": 107931.5625, |
|
"learning_rate": 2.4709834059597026e-05, |
|
"loss": 2.1715, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 1.3413421211424899, |
|
"grad_norm": 91190.8125, |
|
"learning_rate": 2.4457125902708872e-05, |
|
"loss": 2.1757, |
|
"step": 179500 |
|
}, |
|
{ |
|
"epoch": 1.3450784345594045, |
|
"grad_norm": 84562.8046875, |
|
"learning_rate": 2.420529757778746e-05, |
|
"loss": 2.1666, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 1.3488147479763193, |
|
"grad_norm": 99736.1640625, |
|
"learning_rate": 2.3954357759134995e-05, |
|
"loss": 2.1948, |
|
"step": 180500 |
|
}, |
|
{ |
|
"epoch": 1.3525510613932339, |
|
"grad_norm": 93177.078125, |
|
"learning_rate": 2.3704315090448886e-05, |
|
"loss": 2.1868, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 1.3562873748101485, |
|
"grad_norm": 77689.5625, |
|
"learning_rate": 2.345517818452394e-05, |
|
"loss": 2.2091, |
|
"step": 181500 |
|
}, |
|
{ |
|
"epoch": 1.3600236882270633, |
|
"grad_norm": 92219.8671875, |
|
"learning_rate": 2.3206955622955623e-05, |
|
"loss": 2.1795, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 1.363760001643978, |
|
"grad_norm": 96159.34375, |
|
"learning_rate": 2.2959655955844656e-05, |
|
"loss": 2.2186, |
|
"step": 182500 |
|
}, |
|
{ |
|
"epoch": 1.3674963150608925, |
|
"grad_norm": 84869.453125, |
|
"learning_rate": 2.271328770150227e-05, |
|
"loss": 2.1516, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 1.3712326284778071, |
|
"grad_norm": 79080.1484375, |
|
"learning_rate": 2.2467859346156968e-05, |
|
"loss": 2.2031, |
|
"step": 183500 |
|
}, |
|
{ |
|
"epoch": 1.374968941894722, |
|
"grad_norm": 107557.9765625, |
|
"learning_rate": 2.222337934366214e-05, |
|
"loss": 2.1703, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 1.3787052553116366, |
|
"grad_norm": 104010.9296875, |
|
"learning_rate": 2.1979856115204845e-05, |
|
"loss": 2.1637, |
|
"step": 184500 |
|
}, |
|
{ |
|
"epoch": 1.3824415687285512, |
|
"grad_norm": 93606.03125, |
|
"learning_rate": 2.173729804901581e-05, |
|
"loss": 2.1746, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 1.386177882145466, |
|
"grad_norm": 100350.8515625, |
|
"learning_rate": 2.1495713500080443e-05, |
|
"loss": 2.1847, |
|
"step": 185500 |
|
}, |
|
{ |
|
"epoch": 1.3899141955623806, |
|
"grad_norm": 91779.265625, |
|
"learning_rate": 2.1255110789851084e-05, |
|
"loss": 2.1576, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 1.3936505089792952, |
|
"grad_norm": 112973.15625, |
|
"learning_rate": 2.1015498205960283e-05, |
|
"loss": 2.177, |
|
"step": 186500 |
|
}, |
|
{ |
|
"epoch": 1.3973868223962098, |
|
"grad_norm": 91850.71875, |
|
"learning_rate": 2.0776884001935437e-05, |
|
"loss": 2.1904, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 1.4011231358131244, |
|
"grad_norm": 92866.7265625, |
|
"learning_rate": 2.0539276396914448e-05, |
|
"loss": 2.1891, |
|
"step": 187500 |
|
}, |
|
{ |
|
"epoch": 1.4048594492300392, |
|
"grad_norm": 94535.1640625, |
|
"learning_rate": 2.0302683575362608e-05, |
|
"loss": 2.1768, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 1.4085957626469539, |
|
"grad_norm": 99327.2890625, |
|
"learning_rate": 2.0067113686790694e-05, |
|
"loss": 2.195, |
|
"step": 188500 |
|
}, |
|
{ |
|
"epoch": 1.4123320760638687, |
|
"grad_norm": 95519.703125, |
|
"learning_rate": 1.983257484547421e-05, |
|
"loss": 2.1735, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 1.4160683894807833, |
|
"grad_norm": 93955.65625, |
|
"learning_rate": 1.9599075130173977e-05, |
|
"loss": 2.2019, |
|
"step": 189500 |
|
}, |
|
{ |
|
"epoch": 1.419804702897698, |
|
"grad_norm": 143901.203125, |
|
"learning_rate": 1.9366622583857803e-05, |
|
"loss": 2.1768, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 1.4235410163146125, |
|
"grad_norm": 77758.40625, |
|
"learning_rate": 1.913522521342338e-05, |
|
"loss": 2.1481, |
|
"step": 190500 |
|
}, |
|
{ |
|
"epoch": 1.427277329731527, |
|
"grad_norm": 102854.71875, |
|
"learning_rate": 1.890489098942267e-05, |
|
"loss": 2.1364, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 1.431013643148442, |
|
"grad_norm": 76422.46875, |
|
"learning_rate": 1.8675627845787126e-05, |
|
"loss": 2.1446, |
|
"step": 191500 |
|
}, |
|
{ |
|
"epoch": 1.4347499565653565, |
|
"grad_norm": 103994.8359375, |
|
"learning_rate": 1.8447443679554567e-05, |
|
"loss": 2.1618, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 1.4384862699822711, |
|
"grad_norm": 93646.765625, |
|
"learning_rate": 1.8220346350597123e-05, |
|
"loss": 2.2005, |
|
"step": 192500 |
|
}, |
|
{ |
|
"epoch": 1.442222583399186, |
|
"grad_norm": 104911.5234375, |
|
"learning_rate": 1.7994343681350422e-05, |
|
"loss": 2.185, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 1.4459588968161006, |
|
"grad_norm": 103320.125, |
|
"learning_rate": 1.7769443456544322e-05, |
|
"loss": 2.1904, |
|
"step": 193500 |
|
}, |
|
{ |
|
"epoch": 1.4496952102330152, |
|
"grad_norm": 88957.3203125, |
|
"learning_rate": 1.7545653422934537e-05, |
|
"loss": 2.1924, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 1.4534315236499298, |
|
"grad_norm": 98682.171875, |
|
"learning_rate": 1.732298128903597e-05, |
|
"loss": 2.1546, |
|
"step": 194500 |
|
}, |
|
{ |
|
"epoch": 1.4571678370668444, |
|
"grad_norm": 93626.0234375, |
|
"learning_rate": 1.7101434724857147e-05, |
|
"loss": 2.1653, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 1.4609041504837592, |
|
"grad_norm": 84740.2265625, |
|
"learning_rate": 1.6881021361635947e-05, |
|
"loss": 2.1835, |
|
"step": 195500 |
|
}, |
|
{ |
|
"epoch": 1.4646404639006738, |
|
"grad_norm": 95559.859375, |
|
"learning_rate": 1.6661748791576855e-05, |
|
"loss": 2.1716, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 1.4683767773175886, |
|
"grad_norm": 105746.453125, |
|
"learning_rate": 1.6443624567589377e-05, |
|
"loss": 2.1467, |
|
"step": 196500 |
|
}, |
|
{ |
|
"epoch": 1.4721130907345032, |
|
"grad_norm": 93445.3515625, |
|
"learning_rate": 1.622665620302789e-05, |
|
"loss": 2.1673, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 1.4758494041514179, |
|
"grad_norm": 88159.8125, |
|
"learning_rate": 1.6010851171432874e-05, |
|
"loss": 2.1727, |
|
"step": 197500 |
|
}, |
|
{ |
|
"epoch": 1.4795857175683325, |
|
"grad_norm": 93973.4375, |
|
"learning_rate": 1.5796216906273388e-05, |
|
"loss": 2.1824, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 1.483322030985247, |
|
"grad_norm": 95876.78125, |
|
"learning_rate": 1.5582760800691155e-05, |
|
"loss": 2.1858, |
|
"step": 198500 |
|
}, |
|
{ |
|
"epoch": 1.487058344402162, |
|
"grad_norm": 90519.78125, |
|
"learning_rate": 1.537049020724581e-05, |
|
"loss": 2.1984, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 1.4907946578190765, |
|
"grad_norm": 88962.28125, |
|
"learning_rate": 1.515941243766169e-05, |
|
"loss": 2.2229, |
|
"step": 199500 |
|
}, |
|
{ |
|
"epoch": 1.494530971235991, |
|
"grad_norm": 113852.828125, |
|
"learning_rate": 1.4949534762575912e-05, |
|
"loss": 2.157, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 1.498267284652906, |
|
"grad_norm": 97744.3984375, |
|
"learning_rate": 1.4740864411288013e-05, |
|
"loss": 2.1711, |
|
"step": 200500 |
|
}, |
|
{ |
|
"epoch": 1.5020035980698205, |
|
"grad_norm": 88245.1640625, |
|
"learning_rate": 1.4533408571510904e-05, |
|
"loss": 2.1867, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 1.5057399114867351, |
|
"grad_norm": 89220.171875, |
|
"learning_rate": 1.4327174389123227e-05, |
|
"loss": 2.1382, |
|
"step": 201500 |
|
}, |
|
{ |
|
"epoch": 1.5094762249036497, |
|
"grad_norm": 108794.328125, |
|
"learning_rate": 1.4122168967923366e-05, |
|
"loss": 2.1608, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 1.5132125383205643, |
|
"grad_norm": 108413.6640625, |
|
"learning_rate": 1.3918399369384556e-05, |
|
"loss": 2.1657, |
|
"step": 202500 |
|
}, |
|
{ |
|
"epoch": 1.5169488517374792, |
|
"grad_norm": 92711.84375, |
|
"learning_rate": 1.3715872612411806e-05, |
|
"loss": 2.1385, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 1.5206851651543938, |
|
"grad_norm": 97720.765625, |
|
"learning_rate": 1.3514595673100084e-05, |
|
"loss": 2.1467, |
|
"step": 203500 |
|
}, |
|
{ |
|
"epoch": 1.5244214785713086, |
|
"grad_norm": 86545.078125, |
|
"learning_rate": 1.3314575484493946e-05, |
|
"loss": 2.152, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 1.5281577919882232, |
|
"grad_norm": 96816.7421875, |
|
"learning_rate": 1.3115818936348889e-05, |
|
"loss": 2.1636, |
|
"step": 204500 |
|
}, |
|
{ |
|
"epoch": 1.5318941054051378, |
|
"grad_norm": 100713.7734375, |
|
"learning_rate": 1.2918332874893869e-05, |
|
"loss": 2.1784, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 1.5356304188220524, |
|
"grad_norm": 97874.359375, |
|
"learning_rate": 1.2722124102595518e-05, |
|
"loss": 2.1903, |
|
"step": 205500 |
|
}, |
|
{ |
|
"epoch": 1.539366732238967, |
|
"grad_norm": 91848.2734375, |
|
"learning_rate": 1.252719937792397e-05, |
|
"loss": 2.1524, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 1.5431030456558816, |
|
"grad_norm": 95539.515625, |
|
"learning_rate": 1.2333565415119835e-05, |
|
"loss": 2.1813, |
|
"step": 206500 |
|
}, |
|
{ |
|
"epoch": 1.5468393590727965, |
|
"grad_norm": 92111.3984375, |
|
"learning_rate": 1.2141228883963118e-05, |
|
"loss": 2.1615, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 1.5505756724897113, |
|
"grad_norm": 90152.640625, |
|
"learning_rate": 1.195019640954339e-05, |
|
"loss": 2.1615, |
|
"step": 207500 |
|
}, |
|
{ |
|
"epoch": 1.554311985906626, |
|
"grad_norm": 96634.609375, |
|
"learning_rate": 1.1760474572031538e-05, |
|
"loss": 2.1455, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 1.5580482993235405, |
|
"grad_norm": 102015.8984375, |
|
"learning_rate": 1.1572069906453265e-05, |
|
"loss": 2.1508, |
|
"step": 208500 |
|
}, |
|
{ |
|
"epoch": 1.561784612740455, |
|
"grad_norm": 90662.0859375, |
|
"learning_rate": 1.1384988902463806e-05, |
|
"loss": 2.1443, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 1.5655209261573697, |
|
"grad_norm": 101081.84375, |
|
"learning_rate": 1.1199238004124519e-05, |
|
"loss": 2.191, |
|
"step": 209500 |
|
}, |
|
{ |
|
"epoch": 1.5692572395742843, |
|
"grad_norm": 86159.140625, |
|
"learning_rate": 1.101482360968088e-05, |
|
"loss": 2.1739, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 1.5729935529911991, |
|
"grad_norm": 96140.9296875, |
|
"learning_rate": 1.0831752071342033e-05, |
|
"loss": 2.1597, |
|
"step": 210500 |
|
}, |
|
{ |
|
"epoch": 1.5767298664081137, |
|
"grad_norm": 93931.6484375, |
|
"learning_rate": 1.0650029695062103e-05, |
|
"loss": 2.1704, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 1.5804661798250286, |
|
"grad_norm": 96550.375, |
|
"learning_rate": 1.0469662740322888e-05, |
|
"loss": 2.1762, |
|
"step": 211500 |
|
}, |
|
{ |
|
"epoch": 1.5842024932419432, |
|
"grad_norm": 91985.375, |
|
"learning_rate": 1.0290657419918308e-05, |
|
"loss": 2.1537, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 1.5879388066588578, |
|
"grad_norm": 99446.453125, |
|
"learning_rate": 1.0113019899740345e-05, |
|
"loss": 2.1567, |
|
"step": 212500 |
|
}, |
|
{ |
|
"epoch": 1.5916751200757724, |
|
"grad_norm": 90499.4453125, |
|
"learning_rate": 9.936756298566724e-06, |
|
"loss": 2.1542, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 1.595411433492687, |
|
"grad_norm": 91874.203125, |
|
"learning_rate": 9.761872687850093e-06, |
|
"loss": 2.1701, |
|
"step": 213500 |
|
}, |
|
{ |
|
"epoch": 1.5991477469096018, |
|
"grad_norm": 101494.265625, |
|
"learning_rate": 9.588375091508945e-06, |
|
"loss": 2.1785, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 1.6028840603265164, |
|
"grad_norm": 101492.890625, |
|
"learning_rate": 9.41626948572007e-06, |
|
"loss": 2.1681, |
|
"step": 214500 |
|
}, |
|
{ |
|
"epoch": 1.6066203737434313, |
|
"grad_norm": 97085.359375, |
|
"learning_rate": 9.245561798712704e-06, |
|
"loss": 2.163, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 1.6103566871603459, |
|
"grad_norm": 101823.4609375, |
|
"learning_rate": 9.076257910564402e-06, |
|
"loss": 2.1538, |
|
"step": 215500 |
|
}, |
|
{ |
|
"epoch": 1.6140930005772605, |
|
"grad_norm": 91494.078125, |
|
"learning_rate": 8.908363652998414e-06, |
|
"loss": 2.1235, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 1.617829313994175, |
|
"grad_norm": 85581.09375, |
|
"learning_rate": 8.74188480918282e-06, |
|
"loss": 2.1736, |
|
"step": 216500 |
|
}, |
|
{ |
|
"epoch": 1.6215656274110897, |
|
"grad_norm": 92024.046875, |
|
"learning_rate": 8.576827113531427e-06, |
|
"loss": 2.177, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 1.6253019408280043, |
|
"grad_norm": 74753.4296875, |
|
"learning_rate": 8.413196251506073e-06, |
|
"loss": 2.143, |
|
"step": 217500 |
|
}, |
|
{ |
|
"epoch": 1.629038254244919, |
|
"grad_norm": 107585.3125, |
|
"learning_rate": 8.25099785942094e-06, |
|
"loss": 2.1522, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 1.6327745676618337, |
|
"grad_norm": 94180.15625, |
|
"learning_rate": 8.090237524248346e-06, |
|
"loss": 2.1343, |
|
"step": 218500 |
|
}, |
|
{ |
|
"epoch": 1.6365108810787485, |
|
"grad_norm": 102109.1953125, |
|
"learning_rate": 7.930920783426265e-06, |
|
"loss": 2.1643, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 1.6402471944956631, |
|
"grad_norm": 107961.2109375, |
|
"learning_rate": 7.773053124667712e-06, |
|
"loss": 2.1483, |
|
"step": 219500 |
|
}, |
|
{ |
|
"epoch": 1.6439835079125777, |
|
"grad_norm": 89340.9296875, |
|
"learning_rate": 7.616639985771556e-06, |
|
"loss": 2.1602, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 1.6477198213294924, |
|
"grad_norm": 103068.6015625, |
|
"learning_rate": 7.461686754435332e-06, |
|
"loss": 2.1549, |
|
"step": 220500 |
|
}, |
|
{ |
|
"epoch": 1.651456134746407, |
|
"grad_norm": 101308.2421875, |
|
"learning_rate": 7.30819876806963e-06, |
|
"loss": 2.1645, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 1.6551924481633218, |
|
"grad_norm": 116589.4140625, |
|
"learning_rate": 7.156181313614185e-06, |
|
"loss": 2.1538, |
|
"step": 221500 |
|
}, |
|
{ |
|
"epoch": 1.6589287615802364, |
|
"grad_norm": 107515.390625, |
|
"learning_rate": 7.005639627355881e-06, |
|
"loss": 2.1506, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 1.6626650749971512, |
|
"grad_norm": 105536.6484375, |
|
"learning_rate": 6.856578894748284e-06, |
|
"loss": 2.1563, |
|
"step": 222500 |
|
}, |
|
{ |
|
"epoch": 1.6664013884140658, |
|
"grad_norm": 125030.3046875, |
|
"learning_rate": 6.709004250233097e-06, |
|
"loss": 2.1574, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 1.6701377018309804, |
|
"grad_norm": 95777.0546875, |
|
"learning_rate": 6.562920777063231e-06, |
|
"loss": 2.1502, |
|
"step": 223500 |
|
}, |
|
{ |
|
"epoch": 1.673874015247895, |
|
"grad_norm": 125466.265625, |
|
"learning_rate": 6.418333507127788e-06, |
|
"loss": 2.1569, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 1.6776103286648096, |
|
"grad_norm": 82233.953125, |
|
"learning_rate": 6.275247420778685e-06, |
|
"loss": 2.1315, |
|
"step": 224500 |
|
}, |
|
{ |
|
"epoch": 1.6813466420817242, |
|
"grad_norm": 86722.7890625, |
|
"learning_rate": 6.133667446659131e-06, |
|
"loss": 2.1662, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 1.685082955498639, |
|
"grad_norm": 95386.34375, |
|
"learning_rate": 5.993598461533845e-06, |
|
"loss": 2.147, |
|
"step": 225500 |
|
}, |
|
{ |
|
"epoch": 1.6888192689155537, |
|
"grad_norm": 82564.6484375, |
|
"learning_rate": 5.855045290121047e-06, |
|
"loss": 2.1426, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 1.6925555823324685, |
|
"grad_norm": 86084.6875, |
|
"learning_rate": 5.71801270492634e-06, |
|
"loss": 2.1517, |
|
"step": 226500 |
|
}, |
|
{ |
|
"epoch": 1.696291895749383, |
|
"grad_norm": 90130.1796875, |
|
"learning_rate": 5.58250542607826e-06, |
|
"loss": 2.143, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 1.7000282091662977, |
|
"grad_norm": 95907.765625, |
|
"learning_rate": 5.448528121165691e-06, |
|
"loss": 2.1631, |
|
"step": 227500 |
|
}, |
|
{ |
|
"epoch": 1.7037645225832123, |
|
"grad_norm": 88561.578125, |
|
"learning_rate": 5.316085405077159e-06, |
|
"loss": 2.135, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 1.707500836000127, |
|
"grad_norm": 94637.078125, |
|
"learning_rate": 5.185181839841746e-06, |
|
"loss": 2.1314, |
|
"step": 228500 |
|
}, |
|
{ |
|
"epoch": 1.7112371494170417, |
|
"grad_norm": 96587.3125, |
|
"learning_rate": 5.055821934472071e-06, |
|
"loss": 2.1399, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 1.7149734628339564, |
|
"grad_norm": 91652.40625, |
|
"learning_rate": 4.92801014480892e-06, |
|
"loss": 2.1798, |
|
"step": 229500 |
|
}, |
|
{ |
|
"epoch": 1.7187097762508712, |
|
"grad_norm": 104303.1015625, |
|
"learning_rate": 4.801750873367728e-06, |
|
"loss": 2.1749, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 1.7224460896677858, |
|
"grad_norm": 85994.984375, |
|
"learning_rate": 4.677048469187045e-06, |
|
"loss": 2.161, |
|
"step": 230500 |
|
}, |
|
{ |
|
"epoch": 1.7261824030847004, |
|
"grad_norm": 105747.375, |
|
"learning_rate": 4.553907227678594e-06, |
|
"loss": 2.1273, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 1.729918716501615, |
|
"grad_norm": 108115.8125, |
|
"learning_rate": 4.432331390479416e-06, |
|
"loss": 2.1493, |
|
"step": 231500 |
|
}, |
|
{ |
|
"epoch": 1.7336550299185296, |
|
"grad_norm": 97434.2109375, |
|
"learning_rate": 4.312325145305729e-06, |
|
"loss": 2.1339, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 1.7373913433354442, |
|
"grad_norm": 100293.0546875, |
|
"learning_rate": 4.193892625808659e-06, |
|
"loss": 2.1501, |
|
"step": 232500 |
|
}, |
|
{ |
|
"epoch": 1.741127656752359, |
|
"grad_norm": 102609.921875, |
|
"learning_rate": 4.077037911431913e-06, |
|
"loss": 2.1309, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 1.7448639701692736, |
|
"grad_norm": 115093.4453125, |
|
"learning_rate": 3.961765027271197e-06, |
|
"loss": 2.144, |
|
"step": 233500 |
|
}, |
|
{ |
|
"epoch": 1.7486002835861885, |
|
"grad_norm": 85552.25, |
|
"learning_rate": 3.84807794393563e-06, |
|
"loss": 2.0989, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 1.752336597003103, |
|
"grad_norm": 104233.3359375, |
|
"learning_rate": 3.735980577410908e-06, |
|
"loss": 2.1801, |
|
"step": 234500 |
|
}, |
|
{ |
|
"epoch": 1.7560729104200177, |
|
"grad_norm": 103465.9375, |
|
"learning_rate": 3.6254767889244813e-06, |
|
"loss": 2.131, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 1.7598092238369323, |
|
"grad_norm": 100807.5546875, |
|
"learning_rate": 3.516570384812523e-06, |
|
"loss": 2.1221, |
|
"step": 235500 |
|
}, |
|
{ |
|
"epoch": 1.7635455372538469, |
|
"grad_norm": 104049.0078125, |
|
"learning_rate": 3.4092651163888154e-06, |
|
"loss": 2.1382, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 1.7672818506707617, |
|
"grad_norm": 100944.1015625, |
|
"learning_rate": 3.303564679815557e-06, |
|
"loss": 2.1736, |
|
"step": 236500 |
|
}, |
|
{ |
|
"epoch": 1.7710181640876763, |
|
"grad_norm": 104766.390625, |
|
"learning_rate": 3.1994727159759986e-06, |
|
"loss": 2.1125, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 1.7747544775045911, |
|
"grad_norm": 95535.8515625, |
|
"learning_rate": 3.0969928103490976e-06, |
|
"loss": 2.1478, |
|
"step": 237500 |
|
}, |
|
{ |
|
"epoch": 1.7784907909215057, |
|
"grad_norm": 89555.0859375, |
|
"learning_rate": 2.996128492885969e-06, |
|
"loss": 2.1244, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 1.7822271043384204, |
|
"grad_norm": 101483.0546875, |
|
"learning_rate": 2.8968832378882926e-06, |
|
"loss": 2.1287, |
|
"step": 238500 |
|
}, |
|
{ |
|
"epoch": 1.785963417755335, |
|
"grad_norm": 84929.1640625, |
|
"learning_rate": 2.7992604638886975e-06, |
|
"loss": 2.1421, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 1.7896997311722496, |
|
"grad_norm": 96690.8984375, |
|
"learning_rate": 2.703263533532929e-06, |
|
"loss": 2.1681, |
|
"step": 239500 |
|
}, |
|
{ |
|
"epoch": 1.7934360445891642, |
|
"grad_norm": 105319.703125, |
|
"learning_rate": 2.608895753464069e-06, |
|
"loss": 2.1331, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 1.797172358006079, |
|
"grad_norm": 101327.21875, |
|
"learning_rate": 2.516160374208648e-06, |
|
"loss": 2.1551, |
|
"step": 240500 |
|
}, |
|
{ |
|
"epoch": 1.8009086714229936, |
|
"grad_norm": 94917.8125, |
|
"learning_rate": 2.4250605900646204e-06, |
|
"loss": 2.1703, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 1.8046449848399084, |
|
"grad_norm": 97282.4765625, |
|
"learning_rate": 2.3355995389914342e-06, |
|
"loss": 2.1457, |
|
"step": 241500 |
|
}, |
|
{ |
|
"epoch": 1.808381298256823, |
|
"grad_norm": 113476.78125, |
|
"learning_rate": 2.2477803025018228e-06, |
|
"loss": 2.1237, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 1.8121176116737376, |
|
"grad_norm": 99987.9921875, |
|
"learning_rate": 2.1616059055557636e-06, |
|
"loss": 2.1792, |
|
"step": 242500 |
|
}, |
|
{ |
|
"epoch": 1.8158539250906522, |
|
"grad_norm": 110651.3828125, |
|
"learning_rate": 2.0770793164562373e-06, |
|
"loss": 2.135, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 1.8195902385075668, |
|
"grad_norm": 96608.578125, |
|
"learning_rate": 1.994203446746962e-06, |
|
"loss": 2.1681, |
|
"step": 243500 |
|
}, |
|
{ |
|
"epoch": 1.8233265519244817, |
|
"grad_norm": 108624.8046875, |
|
"learning_rate": 1.9129811511121697e-06, |
|
"loss": 2.1457, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 1.8270628653413963, |
|
"grad_norm": 113863.421875, |
|
"learning_rate": 1.8334152272782069e-06, |
|
"loss": 2.1444, |
|
"step": 244500 |
|
}, |
|
{ |
|
"epoch": 1.830799178758311, |
|
"grad_norm": 100759.1875, |
|
"learning_rate": 1.7555084159172176e-06, |
|
"loss": 2.1633, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 1.8345354921752257, |
|
"grad_norm": 104006.0859375, |
|
"learning_rate": 1.6792634005526943e-06, |
|
"loss": 2.1561, |
|
"step": 245500 |
|
}, |
|
{ |
|
"epoch": 1.8382718055921403, |
|
"grad_norm": 93557.3203125, |
|
"learning_rate": 1.6046828074670894e-06, |
|
"loss": 2.1397, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 1.842008119009055, |
|
"grad_norm": 92767.28125, |
|
"learning_rate": 1.531769205611322e-06, |
|
"loss": 2.1375, |
|
"step": 246500 |
|
}, |
|
{ |
|
"epoch": 1.8457444324259695, |
|
"grad_norm": 115375.0546875, |
|
"learning_rate": 1.4605251065162973e-06, |
|
"loss": 2.1149, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 1.8494807458428841, |
|
"grad_norm": 85288.765625, |
|
"learning_rate": 1.3909529642064045e-06, |
|
"loss": 2.1298, |
|
"step": 247500 |
|
}, |
|
{ |
|
"epoch": 1.853217059259799, |
|
"grad_norm": 92626.7890625, |
|
"learning_rate": 1.3230551751149568e-06, |
|
"loss": 2.1488, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 1.8569533726767136, |
|
"grad_norm": 105719.265625, |
|
"learning_rate": 1.2568340780016953e-06, |
|
"loss": 2.1397, |
|
"step": 248500 |
|
}, |
|
{ |
|
"epoch": 1.8606896860936284, |
|
"grad_norm": 89167.5703125, |
|
"learning_rate": 1.1922919538721876e-06, |
|
"loss": 2.1225, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 1.864425999510543, |
|
"grad_norm": 100311.046875, |
|
"learning_rate": 1.1294310258992747e-06, |
|
"loss": 2.1445, |
|
"step": 249500 |
|
}, |
|
{ |
|
"epoch": 1.8681623129274576, |
|
"grad_norm": 87286.234375, |
|
"learning_rate": 1.0682534593464967e-06, |
|
"loss": 2.1369, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 1.8718986263443722, |
|
"grad_norm": 103840.65625, |
|
"learning_rate": 1.0087613614934987e-06, |
|
"loss": 2.1134, |
|
"step": 250500 |
|
}, |
|
{ |
|
"epoch": 1.8756349397612868, |
|
"grad_norm": 97787.40625, |
|
"learning_rate": 9.509567815634446e-07, |
|
"loss": 2.1302, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 1.8793712531782016, |
|
"grad_norm": 119866.546875, |
|
"learning_rate": 8.948417106524554e-07, |
|
"loss": 2.1391, |
|
"step": 251500 |
|
}, |
|
{ |
|
"epoch": 1.8831075665951162, |
|
"grad_norm": 101651.5234375, |
|
"learning_rate": 8.404180816609819e-07, |
|
"loss": 2.1628, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 1.886843880012031, |
|
"grad_norm": 103326.453125, |
|
"learning_rate": 7.876877692272799e-07, |
|
"loss": 2.1341, |
|
"step": 252500 |
|
}, |
|
{ |
|
"epoch": 1.8905801934289457, |
|
"grad_norm": 100837.203125, |
|
"learning_rate": 7.366525896627841e-07, |
|
"loss": 2.125, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 1.8943165068458603, |
|
"grad_norm": 101825.6015625, |
|
"learning_rate": 6.873143008895633e-07, |
|
"loss": 2.116, |
|
"step": 253500 |
|
}, |
|
{ |
|
"epoch": 1.8980528202627749, |
|
"grad_norm": 84607.0234375, |
|
"learning_rate": 6.396746023798139e-07, |
|
"loss": 2.1599, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 1.9017891336796895, |
|
"grad_norm": 85267.1328125, |
|
"learning_rate": 5.937351350972287e-07, |
|
"loss": 2.1605, |
|
"step": 254500 |
|
}, |
|
{ |
|
"epoch": 1.905525447096604, |
|
"grad_norm": 112646.515625, |
|
"learning_rate": 5.49497481440564e-07, |
|
"loss": 2.1479, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 1.909261760513519, |
|
"grad_norm": 101483.7578125, |
|
"learning_rate": 5.069631651890783e-07, |
|
"loss": 2.1306, |
|
"step": 255500 |
|
}, |
|
{ |
|
"epoch": 1.9129980739304335, |
|
"grad_norm": 104185.2265625, |
|
"learning_rate": 4.661336514500625e-07, |
|
"loss": 2.1222, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 1.9167343873473484, |
|
"grad_norm": 83210.1484375, |
|
"learning_rate": 4.270103466083697e-07, |
|
"loss": 2.1573, |
|
"step": 256500 |
|
}, |
|
{ |
|
"epoch": 1.920470700764263, |
|
"grad_norm": 109216.5703125, |
|
"learning_rate": 3.895945982779925e-07, |
|
"loss": 2.166, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 1.9242070141811776, |
|
"grad_norm": 100591.0546875, |
|
"learning_rate": 3.5388769525560585e-07, |
|
"loss": 2.1291, |
|
"step": 257500 |
|
}, |
|
{ |
|
"epoch": 1.9279433275980922, |
|
"grad_norm": 98919.7421875, |
|
"learning_rate": 3.19890867476208e-07, |
|
"loss": 2.1449, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 1.9316796410150068, |
|
"grad_norm": 90432.4765625, |
|
"learning_rate": 2.876052859707379e-07, |
|
"loss": 2.1195, |
|
"step": 258500 |
|
}, |
|
{ |
|
"epoch": 1.9354159544319216, |
|
"grad_norm": 94623.453125, |
|
"learning_rate": 2.570320628257461e-07, |
|
"loss": 2.1409, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 1.9391522678488362, |
|
"grad_norm": 112643.9765625, |
|
"learning_rate": 2.2817225114507567e-07, |
|
"loss": 2.1521, |
|
"step": 259500 |
|
}, |
|
{ |
|
"epoch": 1.942888581265751, |
|
"grad_norm": 83827.5078125, |
|
"learning_rate": 2.010268450136077e-07, |
|
"loss": 2.1273, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 1.9466248946826656, |
|
"grad_norm": 97442.9453125, |
|
"learning_rate": 1.7559677946300533e-07, |
|
"loss": 2.1454, |
|
"step": 260500 |
|
}, |
|
{ |
|
"epoch": 1.9503612080995802, |
|
"grad_norm": 92114.421875, |
|
"learning_rate": 1.518829304395175e-07, |
|
"loss": 2.127, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 1.9540975215164949, |
|
"grad_norm": 95378.5078125, |
|
"learning_rate": 1.2988611477378066e-07, |
|
"loss": 2.1213, |
|
"step": 261500 |
|
}, |
|
{ |
|
"epoch": 1.9578338349334095, |
|
"grad_norm": 87441.7890625, |
|
"learning_rate": 1.0960709015272464e-07, |
|
"loss": 2.1427, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 1.961570148350324, |
|
"grad_norm": 87605.1640625, |
|
"learning_rate": 9.104655509343252e-08, |
|
"loss": 2.1541, |
|
"step": 262500 |
|
}, |
|
{ |
|
"epoch": 1.9653064617672389, |
|
"grad_norm": 101794.53125, |
|
"learning_rate": 7.420514891911535e-08, |
|
"loss": 2.152, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 1.9690427751841535, |
|
"grad_norm": 94773.6328125, |
|
"learning_rate": 5.908345173705754e-08, |
|
"loss": 2.1471, |
|
"step": 263500 |
|
}, |
|
{ |
|
"epoch": 1.9727790886010683, |
|
"grad_norm": 105178.625, |
|
"learning_rate": 4.5681984418666224e-08, |
|
"loss": 2.1364, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 1.976515402017983, |
|
"grad_norm": 97559.484375, |
|
"learning_rate": 3.400120858149669e-08, |
|
"loss": 2.17, |
|
"step": 264500 |
|
}, |
|
{ |
|
"epoch": 1.9802517154348975, |
|
"grad_norm": 120999.3125, |
|
"learning_rate": 2.404152657338732e-08, |
|
"loss": 2.1488, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 1.9839880288518121, |
|
"grad_norm": 117999.5859375, |
|
"learning_rate": 1.5803281458570684e-08, |
|
"loss": 2.1481, |
|
"step": 265500 |
|
}, |
|
{ |
|
"epoch": 1.9877243422687267, |
|
"grad_norm": 94607.5078125, |
|
"learning_rate": 9.286757005871893e-09, |
|
"loss": 2.1104, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 1.9914606556856416, |
|
"grad_norm": 94297.2109375, |
|
"learning_rate": 4.492177678938614e-09, |
|
"loss": 2.1349, |
|
"step": 266500 |
|
}, |
|
{ |
|
"epoch": 1.9951969691025562, |
|
"grad_norm": 95649.546875, |
|
"learning_rate": 1.4197086284917227e-09, |
|
"loss": 2.1224, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 1.998933282519471, |
|
"grad_norm": 116585.421875, |
|
"learning_rate": 6.945568664651347e-11, |
|
"loss": 2.1757, |
|
"step": 267500 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 267642, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.913150854495206e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|