{ "best_metric": 0.24187567830085754, "best_model_checkpoint": "./results\\checkpoint-150000", "epoch": 5.318552300921432, "eval_steps": 10000, "global_step": 200000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013296281030195855, "grad_norm": 1.8945389986038208, "learning_rate": 1.9986783320923308e-05, "loss": 2.1575, "step": 500 }, { "epoch": 0.02659256206039171, "grad_norm": 1.8132811784744263, "learning_rate": 1.997348686309967e-05, "loss": 0.6486, "step": 1000 }, { "epoch": 0.039888843090587565, "grad_norm": 1.9245736598968506, "learning_rate": 1.9960190405276038e-05, "loss": 0.6164, "step": 1500 }, { "epoch": 0.05318512412078342, "grad_norm": 1.4257850646972656, "learning_rate": 1.99468939474524e-05, "loss": 0.6067, "step": 2000 }, { "epoch": 0.06648140515097926, "grad_norm": 1.434894323348999, "learning_rate": 1.9933597489628764e-05, "loss": 0.5671, "step": 2500 }, { "epoch": 0.07977768618117513, "grad_norm": 1.8151600360870361, "learning_rate": 1.992030103180513e-05, "loss": 0.5535, "step": 3000 }, { "epoch": 0.09307396721137098, "grad_norm": 1.9667155742645264, "learning_rate": 1.990700457398149e-05, "loss": 0.5628, "step": 3500 }, { "epoch": 0.10637024824156684, "grad_norm": 2.0493931770324707, "learning_rate": 1.9893708116157857e-05, "loss": 0.5465, "step": 4000 }, { "epoch": 0.11966652927176269, "grad_norm": 2.201901435852051, "learning_rate": 1.988041165833422e-05, "loss": 0.5472, "step": 4500 }, { "epoch": 0.13296281030195853, "grad_norm": 1.8538507223129272, "learning_rate": 1.9867115200510584e-05, "loss": 0.5451, "step": 5000 }, { "epoch": 0.1462590913321544, "grad_norm": 1.35111403465271, "learning_rate": 1.985381874268695e-05, "loss": 0.5333, "step": 5500 }, { "epoch": 0.15955537236235026, "grad_norm": 1.2387117147445679, "learning_rate": 1.9840522284863314e-05, "loss": 0.5096, "step": 6000 }, { "epoch": 0.1728516533925461, "grad_norm": 1.2727868556976318, "learning_rate": 1.9827225827039677e-05, "loss": 0.5256, "step": 6500 }, { "epoch": 0.18614793442274197, "grad_norm": 1.7770028114318848, "learning_rate": 1.9813929369216044e-05, "loss": 0.5064, "step": 7000 }, { "epoch": 0.1994442154529378, "grad_norm": 2.1460671424865723, "learning_rate": 1.9800632911392407e-05, "loss": 0.5066, "step": 7500 }, { "epoch": 0.21274049648313367, "grad_norm": 1.315503716468811, "learning_rate": 1.978733645356877e-05, "loss": 0.4974, "step": 8000 }, { "epoch": 0.2260367775133295, "grad_norm": 1.6402277946472168, "learning_rate": 1.9774039995745137e-05, "loss": 0.5019, "step": 8500 }, { "epoch": 0.23933305854352538, "grad_norm": 1.4794352054595947, "learning_rate": 1.97607435379215e-05, "loss": 0.5072, "step": 9000 }, { "epoch": 0.25262933957372125, "grad_norm": 1.432265043258667, "learning_rate": 1.9747447080097863e-05, "loss": 0.4844, "step": 9500 }, { "epoch": 0.26592562060391706, "grad_norm": 1.7145007848739624, "learning_rate": 1.973415062227423e-05, "loss": 0.488, "step": 10000 }, { "epoch": 0.26592562060391706, "eval_loss": 0.4845176041126251, "eval_runtime": 45.8707, "eval_samples_per_second": 66.251, "eval_steps_per_second": 16.568, "step": 10000 }, { "epoch": 0.2792219016341129, "grad_norm": 1.404540777206421, "learning_rate": 1.972085416445059e-05, "loss": 0.4931, "step": 10500 }, { "epoch": 0.2925181826643088, "grad_norm": 1.3520866632461548, "learning_rate": 1.9707557706626957e-05, "loss": 0.4868, "step": 11000 }, { "epoch": 0.30581446369450466, "grad_norm": 1.4363296031951904, "learning_rate": 
1.969426124880332e-05, "loss": 0.4798, "step": 11500 }, { "epoch": 0.3191107447247005, "grad_norm": 1.4917619228363037, "learning_rate": 1.9680964790979683e-05, "loss": 0.4736, "step": 12000 }, { "epoch": 0.33240702575489633, "grad_norm": 1.7679712772369385, "learning_rate": 1.966766833315605e-05, "loss": 0.4778, "step": 12500 }, { "epoch": 0.3457033067850922, "grad_norm": 1.4026507139205933, "learning_rate": 1.9654371875332413e-05, "loss": 0.4761, "step": 13000 }, { "epoch": 0.35899958781528807, "grad_norm": 1.8736493587493896, "learning_rate": 1.9641075417508776e-05, "loss": 0.4871, "step": 13500 }, { "epoch": 0.37229586884548393, "grad_norm": 1.2222760915756226, "learning_rate": 1.9627778959685143e-05, "loss": 0.4745, "step": 14000 }, { "epoch": 0.38559214987567975, "grad_norm": 1.4624443054199219, "learning_rate": 1.9614509094777154e-05, "loss": 0.4573, "step": 14500 }, { "epoch": 0.3988884309058756, "grad_norm": 1.8538755178451538, "learning_rate": 1.9601212636953517e-05, "loss": 0.4633, "step": 15000 }, { "epoch": 0.4121847119360715, "grad_norm": 1.47249174118042, "learning_rate": 1.9587916179129884e-05, "loss": 0.4764, "step": 15500 }, { "epoch": 0.42548099296626735, "grad_norm": 1.7755292654037476, "learning_rate": 1.9574619721306244e-05, "loss": 0.4699, "step": 16000 }, { "epoch": 0.4387772739964632, "grad_norm": 1.4009391069412231, "learning_rate": 1.956132326348261e-05, "loss": 0.4666, "step": 16500 }, { "epoch": 0.452073555026659, "grad_norm": 1.520727276802063, "learning_rate": 1.9548026805658973e-05, "loss": 0.4697, "step": 17000 }, { "epoch": 0.4653698360568549, "grad_norm": 1.7645255327224731, "learning_rate": 1.9534730347835337e-05, "loss": 0.46, "step": 17500 }, { "epoch": 0.47866611708705076, "grad_norm": 1.7914655208587646, "learning_rate": 1.9521433890011703e-05, "loss": 0.4601, "step": 18000 }, { "epoch": 0.4919623981172466, "grad_norm": 1.4558111429214478, "learning_rate": 1.9508164025103714e-05, "loss": 0.4601, "step": 18500 }, { "epoch": 0.5052586791474425, "grad_norm": 1.1980383396148682, "learning_rate": 1.9494894160195725e-05, "loss": 0.4563, "step": 19000 }, { "epoch": 0.5185549601776384, "grad_norm": 1.8653793334960938, "learning_rate": 1.9481597702372092e-05, "loss": 0.4434, "step": 19500 }, { "epoch": 0.5318512412078341, "grad_norm": 1.3913460969924927, "learning_rate": 1.946830124454845e-05, "loss": 0.45, "step": 20000 }, { "epoch": 0.5318512412078341, "eval_loss": 0.4461110234260559, "eval_runtime": 46.6164, "eval_samples_per_second": 65.192, "eval_steps_per_second": 16.303, "step": 20000 }, { "epoch": 0.54514752223803, "grad_norm": 1.4441711902618408, "learning_rate": 1.9455004786724818e-05, "loss": 0.4536, "step": 20500 }, { "epoch": 0.5584438032682258, "grad_norm": 1.2844038009643555, "learning_rate": 1.9441708328901185e-05, "loss": 0.447, "step": 21000 }, { "epoch": 0.5717400842984217, "grad_norm": 2.10949444770813, "learning_rate": 1.9428438463993192e-05, "loss": 0.4484, "step": 21500 }, { "epoch": 0.5850363653286176, "grad_norm": 0.925681471824646, "learning_rate": 1.941514200616956e-05, "loss": 0.4374, "step": 22000 }, { "epoch": 0.5983326463588134, "grad_norm": 1.49970543384552, "learning_rate": 1.9401845548345922e-05, "loss": 0.4533, "step": 22500 }, { "epoch": 0.6116289273890093, "grad_norm": 1.2436000108718872, "learning_rate": 1.9388549090522286e-05, "loss": 0.4455, "step": 23000 }, { "epoch": 0.6249252084192052, "grad_norm": 1.1918786764144897, "learning_rate": 1.9375252632698652e-05, "loss": 0.4389, "step": 23500 }, { "epoch": 
0.638221489449401, "grad_norm": 1.2042317390441895, "learning_rate": 1.9361956174875015e-05, "loss": 0.4446, "step": 24000 }, { "epoch": 0.6515177704795968, "grad_norm": 1.3460853099822998, "learning_rate": 1.934865971705138e-05, "loss": 0.4464, "step": 24500 }, { "epoch": 0.6648140515097927, "grad_norm": 1.1541383266448975, "learning_rate": 1.9335363259227745e-05, "loss": 0.4404, "step": 25000 }, { "epoch": 0.6781103325399885, "grad_norm": 1.2001688480377197, "learning_rate": 1.9322093394319753e-05, "loss": 0.4409, "step": 25500 }, { "epoch": 0.6914066135701844, "grad_norm": 1.38569974899292, "learning_rate": 1.930879693649612e-05, "loss": 0.4412, "step": 26000 }, { "epoch": 0.7047028946003803, "grad_norm": 1.6519745588302612, "learning_rate": 1.9295500478672483e-05, "loss": 0.4293, "step": 26500 }, { "epoch": 0.7179991756305761, "grad_norm": 1.1891002655029297, "learning_rate": 1.9282230613764494e-05, "loss": 0.4478, "step": 27000 }, { "epoch": 0.731295456660772, "grad_norm": 1.2890375852584839, "learning_rate": 1.926893415594086e-05, "loss": 0.4422, "step": 27500 }, { "epoch": 0.7445917376909679, "grad_norm": 1.2893614768981934, "learning_rate": 1.9255637698117224e-05, "loss": 0.4379, "step": 28000 }, { "epoch": 0.7578880187211637, "grad_norm": 1.2751696109771729, "learning_rate": 1.9242341240293587e-05, "loss": 0.4259, "step": 28500 }, { "epoch": 0.7711842997513595, "grad_norm": 1.263238549232483, "learning_rate": 1.9229044782469953e-05, "loss": 0.4285, "step": 29000 }, { "epoch": 0.7844805807815554, "grad_norm": 1.6520916223526, "learning_rate": 1.921577491756196e-05, "loss": 0.4278, "step": 29500 }, { "epoch": 0.7977768618117512, "grad_norm": 1.4022353887557983, "learning_rate": 1.9202478459738328e-05, "loss": 0.4277, "step": 30000 }, { "epoch": 0.7977768618117512, "eval_loss": 0.4259246587753296, "eval_runtime": 45.6826, "eval_samples_per_second": 66.524, "eval_steps_per_second": 16.637, "step": 30000 }, { "epoch": 0.8110731428419471, "grad_norm": 1.3312253952026367, "learning_rate": 1.918920859483034e-05, "loss": 0.4262, "step": 30500 }, { "epoch": 0.824369423872143, "grad_norm": 1.5275497436523438, "learning_rate": 1.91759121370067e-05, "loss": 0.4281, "step": 31000 }, { "epoch": 0.8376657049023388, "grad_norm": 1.7679028511047363, "learning_rate": 1.916261567918307e-05, "loss": 0.44, "step": 31500 }, { "epoch": 0.8509619859325347, "grad_norm": 1.1467547416687012, "learning_rate": 1.914931922135943e-05, "loss": 0.4153, "step": 32000 }, { "epoch": 0.8642582669627306, "grad_norm": 1.2685679197311401, "learning_rate": 1.9136022763535795e-05, "loss": 0.4232, "step": 32500 }, { "epoch": 0.8775545479929264, "grad_norm": 0.9204725623130798, "learning_rate": 1.912272630571216e-05, "loss": 0.425, "step": 33000 }, { "epoch": 0.8908508290231222, "grad_norm": 0.7971585988998413, "learning_rate": 1.9109429847888525e-05, "loss": 0.431, "step": 33500 }, { "epoch": 0.904147110053318, "grad_norm": 1.6358304023742676, "learning_rate": 1.9096133390064888e-05, "loss": 0.4361, "step": 34000 }, { "epoch": 0.9174433910835139, "grad_norm": 1.7240855693817139, "learning_rate": 1.9082836932241255e-05, "loss": 0.432, "step": 34500 }, { "epoch": 0.9307396721137098, "grad_norm": 1.1937910318374634, "learning_rate": 1.9069567067333262e-05, "loss": 0.4324, "step": 35000 }, { "epoch": 0.9440359531439056, "grad_norm": 1.1422160863876343, "learning_rate": 1.905627060950963e-05, "loss": 0.4286, "step": 35500 }, { "epoch": 0.9573322341741015, "grad_norm": 1.0084590911865234, "learning_rate": 1.9042974151685992e-05, 
"loss": 0.4401, "step": 36000 }, { "epoch": 0.9706285152042974, "grad_norm": 1.7879573106765747, "learning_rate": 1.9029677693862355e-05, "loss": 0.4259, "step": 36500 }, { "epoch": 0.9839247962344932, "grad_norm": 1.1148507595062256, "learning_rate": 1.9016381236038722e-05, "loss": 0.3995, "step": 37000 }, { "epoch": 0.9972210772646891, "grad_norm": 1.2119601964950562, "learning_rate": 1.9003084778215085e-05, "loss": 0.4228, "step": 37500 }, { "epoch": 1.010517358294885, "grad_norm": 1.2811840772628784, "learning_rate": 1.898978832039145e-05, "loss": 0.3945, "step": 38000 }, { "epoch": 1.0238136393250807, "grad_norm": 1.199591040611267, "learning_rate": 1.8976491862567815e-05, "loss": 0.3724, "step": 38500 }, { "epoch": 1.0371099203552767, "grad_norm": 1.4783821105957031, "learning_rate": 1.8963221997659823e-05, "loss": 0.3869, "step": 39000 }, { "epoch": 1.0504062013854725, "grad_norm": 1.1598395109176636, "learning_rate": 1.894992553983619e-05, "loss": 0.3888, "step": 39500 }, { "epoch": 1.0637024824156684, "grad_norm": 1.0254014730453491, "learning_rate": 1.8936629082012556e-05, "loss": 0.3741, "step": 40000 }, { "epoch": 1.0637024824156684, "eval_loss": 0.4110487699508667, "eval_runtime": 45.9661, "eval_samples_per_second": 66.114, "eval_steps_per_second": 16.534, "step": 40000 }, { "epoch": 1.0769987634458642, "grad_norm": 1.4140042066574097, "learning_rate": 1.8923332624188916e-05, "loss": 0.3838, "step": 40500 }, { "epoch": 1.09029504447606, "grad_norm": 1.4062950611114502, "learning_rate": 1.8910089352196578e-05, "loss": 0.368, "step": 41000 }, { "epoch": 1.103591325506256, "grad_norm": 1.2314609289169312, "learning_rate": 1.889681948728859e-05, "loss": 0.4033, "step": 41500 }, { "epoch": 1.1168876065364517, "grad_norm": 1.4966250658035278, "learning_rate": 1.8883523029464952e-05, "loss": 0.3878, "step": 42000 }, { "epoch": 1.1301838875666477, "grad_norm": 1.1920274496078491, "learning_rate": 1.887022657164132e-05, "loss": 0.375, "step": 42500 }, { "epoch": 1.1434801685968434, "grad_norm": 1.0075514316558838, "learning_rate": 1.8856930113817678e-05, "loss": 0.3821, "step": 43000 }, { "epoch": 1.1567764496270394, "grad_norm": 1.2424709796905518, "learning_rate": 1.8843633655994045e-05, "loss": 0.3793, "step": 43500 }, { "epoch": 1.1700727306572352, "grad_norm": 0.7054141163825989, "learning_rate": 1.8830337198170408e-05, "loss": 0.3821, "step": 44000 }, { "epoch": 1.183369011687431, "grad_norm": 0.7828935384750366, "learning_rate": 1.881704074034677e-05, "loss": 0.3737, "step": 44500 }, { "epoch": 1.196665292717627, "grad_norm": 1.3097379207611084, "learning_rate": 1.8803744282523138e-05, "loss": 0.3747, "step": 45000 }, { "epoch": 1.2099615737478226, "grad_norm": 1.2207602262496948, "learning_rate": 1.87904478246995e-05, "loss": 0.3799, "step": 45500 }, { "epoch": 1.2232578547780186, "grad_norm": 1.5833189487457275, "learning_rate": 1.8777151366875865e-05, "loss": 0.387, "step": 46000 }, { "epoch": 1.2365541358082144, "grad_norm": 1.3836820125579834, "learning_rate": 1.876385490905223e-05, "loss": 0.3853, "step": 46500 }, { "epoch": 1.2498504168384104, "grad_norm": 0.9267215728759766, "learning_rate": 1.8750558451228595e-05, "loss": 0.3763, "step": 47000 }, { "epoch": 1.2631466978686061, "grad_norm": 1.4826302528381348, "learning_rate": 1.8737288586320605e-05, "loss": 0.3753, "step": 47500 }, { "epoch": 1.2764429788988019, "grad_norm": 1.7470266819000244, "learning_rate": 1.8723992128496972e-05, "loss": 0.389, "step": 48000 }, { "epoch": 1.2897392599289978, "grad_norm": 
0.9160776734352112, "learning_rate": 1.8710695670673332e-05, "loss": 0.3732, "step": 48500 }, { "epoch": 1.3030355409591938, "grad_norm": 1.5541558265686035, "learning_rate": 1.86973992128497e-05, "loss": 0.3815, "step": 49000 }, { "epoch": 1.3163318219893896, "grad_norm": 1.3254436254501343, "learning_rate": 1.8684102755026062e-05, "loss": 0.3786, "step": 49500 }, { "epoch": 1.3296281030195853, "grad_norm": 1.5244851112365723, "learning_rate": 1.8670806297202425e-05, "loss": 0.3886, "step": 50000 }, { "epoch": 1.3296281030195853, "eval_loss": 0.4009094834327698, "eval_runtime": 45.5454, "eval_samples_per_second": 66.725, "eval_steps_per_second": 16.687, "step": 50000 }, { "epoch": 1.3429376803308115, "grad_norm": 0.867463231086731, "learning_rate": 1.8657509839378792e-05, "loss": 0.3681, "step": 50500 }, { "epoch": 1.3562339613610073, "grad_norm": 1.2311478853225708, "learning_rate": 1.8644213381555155e-05, "loss": 0.347, "step": 51000 }, { "epoch": 1.3695302423912032, "grad_norm": 1.4698890447616577, "learning_rate": 1.8630916923731518e-05, "loss": 0.3666, "step": 51500 }, { "epoch": 1.382826523421399, "grad_norm": 1.3057785034179688, "learning_rate": 1.8617620465907885e-05, "loss": 0.3716, "step": 52000 }, { "epoch": 1.3961228044515948, "grad_norm": 0.9626977443695068, "learning_rate": 1.8604324008084248e-05, "loss": 0.3633, "step": 52500 }, { "epoch": 1.4094190854817907, "grad_norm": 1.055158257484436, "learning_rate": 1.859102755026061e-05, "loss": 0.364, "step": 53000 }, { "epoch": 1.4227153665119867, "grad_norm": 1.7065026760101318, "learning_rate": 1.8577731092436978e-05, "loss": 0.365, "step": 53500 }, { "epoch": 1.4360116475421825, "grad_norm": 1.1983647346496582, "learning_rate": 1.856443463461334e-05, "loss": 0.3584, "step": 54000 }, { "epoch": 1.4493079285723782, "grad_norm": 1.465380072593689, "learning_rate": 1.8551138176789705e-05, "loss": 0.3698, "step": 54500 }, { "epoch": 1.4626042096025742, "grad_norm": 0.9600203037261963, "learning_rate": 1.853784171896607e-05, "loss": 0.3686, "step": 55000 }, { "epoch": 1.47590049063277, "grad_norm": 0.7194857597351074, "learning_rate": 1.852454526114243e-05, "loss": 0.3691, "step": 55500 }, { "epoch": 1.489196771662966, "grad_norm": 1.2525266408920288, "learning_rate": 1.8511248803318798e-05, "loss": 0.3716, "step": 56000 }, { "epoch": 1.5024930526931617, "grad_norm": 1.4390616416931152, "learning_rate": 1.849795234549516e-05, "loss": 0.3642, "step": 56500 }, { "epoch": 1.5157893337233577, "grad_norm": 0.9960741996765137, "learning_rate": 1.8484655887671524e-05, "loss": 0.3543, "step": 57000 }, { "epoch": 1.5290856147535534, "grad_norm": 1.5157722234725952, "learning_rate": 1.847135942984789e-05, "loss": 0.3684, "step": 57500 }, { "epoch": 1.5423818957837492, "grad_norm": 1.2276800870895386, "learning_rate": 1.8458062972024254e-05, "loss": 0.3682, "step": 58000 }, { "epoch": 1.5556781768139452, "grad_norm": 0.7182838916778564, "learning_rate": 1.8444793107116265e-05, "loss": 0.3571, "step": 58500 }, { "epoch": 1.5689744578441411, "grad_norm": 1.4738359451293945, "learning_rate": 1.8431496649292632e-05, "loss": 0.3495, "step": 59000 }, { "epoch": 1.582270738874337, "grad_norm": 1.1519092321395874, "learning_rate": 1.8418200191468995e-05, "loss": 0.3656, "step": 59500 }, { "epoch": 1.5955670199045326, "grad_norm": 0.9778446555137634, "learning_rate": 1.8404903733645358e-05, "loss": 0.3528, "step": 60000 }, { "epoch": 1.5955670199045326, "eval_loss": 0.33645519614219666, "eval_runtime": 45.1628, "eval_samples_per_second": 67.29, 
"eval_steps_per_second": 16.828, "step": 60000 }, { "epoch": 1.6088633009347286, "grad_norm": 1.5486047267913818, "learning_rate": 1.8391607275821725e-05, "loss": 0.3587, "step": 60500 }, { "epoch": 1.6221595819649244, "grad_norm": 0.8840271234512329, "learning_rate": 1.8378337410913732e-05, "loss": 0.3584, "step": 61000 }, { "epoch": 1.6354558629951201, "grad_norm": 0.9995741248130798, "learning_rate": 1.83650409530901e-05, "loss": 0.3737, "step": 61500 }, { "epoch": 1.6487521440253161, "grad_norm": 1.7882977724075317, "learning_rate": 1.835177108818211e-05, "loss": 0.3655, "step": 62000 }, { "epoch": 1.662048425055512, "grad_norm": 0.7783369421958923, "learning_rate": 1.8338474630358473e-05, "loss": 0.3673, "step": 62500 }, { "epoch": 1.6753447060857078, "grad_norm": 1.585405945777893, "learning_rate": 1.832517817253484e-05, "loss": 0.3595, "step": 63000 }, { "epoch": 1.6886409871159036, "grad_norm": 1.3467520475387573, "learning_rate": 1.8311881714711203e-05, "loss": 0.3627, "step": 63500 }, { "epoch": 1.7019372681460996, "grad_norm": 1.1446176767349243, "learning_rate": 1.8298585256887566e-05, "loss": 0.3574, "step": 64000 }, { "epoch": 1.7152335491762953, "grad_norm": 0.9003855586051941, "learning_rate": 1.828531539197958e-05, "loss": 0.3665, "step": 64500 }, { "epoch": 1.728529830206491, "grad_norm": 0.8468024730682373, "learning_rate": 1.827201893415594e-05, "loss": 0.3574, "step": 65000 }, { "epoch": 1.741826111236687, "grad_norm": 1.0654727220535278, "learning_rate": 1.8258722476332307e-05, "loss": 0.3651, "step": 65500 }, { "epoch": 1.755122392266883, "grad_norm": 0.6998704671859741, "learning_rate": 1.824542601850867e-05, "loss": 0.36, "step": 66000 }, { "epoch": 1.7684186732970788, "grad_norm": 0.8846042156219482, "learning_rate": 1.8232129560685034e-05, "loss": 0.362, "step": 66500 }, { "epoch": 1.7817149543272746, "grad_norm": 1.2595306634902954, "learning_rate": 1.82188331028614e-05, "loss": 0.3607, "step": 67000 }, { "epoch": 1.7950112353574705, "grad_norm": 1.0262569189071655, "learning_rate": 1.8205536645037764e-05, "loss": 0.3528, "step": 67500 }, { "epoch": 1.8083075163876665, "grad_norm": 1.034749150276184, "learning_rate": 1.8192240187214127e-05, "loss": 0.3659, "step": 68000 }, { "epoch": 1.8216037974178623, "grad_norm": 1.6457266807556152, "learning_rate": 1.817897032230614e-05, "loss": 0.3618, "step": 68500 }, { "epoch": 1.834900078448058, "grad_norm": 1.0784075260162354, "learning_rate": 1.81656738644825e-05, "loss": 0.3678, "step": 69000 }, { "epoch": 1.848196359478254, "grad_norm": 1.7093465328216553, "learning_rate": 1.8152377406658868e-05, "loss": 0.3586, "step": 69500 }, { "epoch": 1.8614926405084498, "grad_norm": 1.0873297452926636, "learning_rate": 1.8139080948835234e-05, "loss": 0.3486, "step": 70000 }, { "epoch": 1.8614926405084498, "eval_loss": 0.33657437562942505, "eval_runtime": 44.8327, "eval_samples_per_second": 67.785, "eval_steps_per_second": 16.952, "step": 70000 }, { "epoch": 1.8747889215386455, "grad_norm": 1.2218577861785889, "learning_rate": 1.8125784491011594e-05, "loss": 0.3538, "step": 70500 }, { "epoch": 1.8880852025688415, "grad_norm": 1.7675495147705078, "learning_rate": 1.8112541219019256e-05, "loss": 0.36, "step": 71000 }, { "epoch": 1.9013814835990375, "grad_norm": 1.546845555305481, "learning_rate": 1.809924476119562e-05, "loss": 0.3681, "step": 71500 }, { "epoch": 1.9146777646292332, "grad_norm": 1.0978270769119263, "learning_rate": 1.8085948303371982e-05, "loss": 0.349, "step": 72000 }, { "epoch": 1.927974045659429, "grad_norm": 
1.0180225372314453, "learning_rate": 1.807265184554835e-05, "loss": 0.3607, "step": 72500 }, { "epoch": 1.941270326689625, "grad_norm": 1.5221401453018188, "learning_rate": 1.8059355387724712e-05, "loss": 0.3603, "step": 73000 }, { "epoch": 1.9545666077198207, "grad_norm": 1.8798828125, "learning_rate": 1.8046058929901076e-05, "loss": 0.3609, "step": 73500 }, { "epoch": 1.9678628887500165, "grad_norm": 1.5229233503341675, "learning_rate": 1.8032762472077442e-05, "loss": 0.3721, "step": 74000 }, { "epoch": 1.9811591697802124, "grad_norm": 0.7534819841384888, "learning_rate": 1.8019466014253802e-05, "loss": 0.3606, "step": 74500 }, { "epoch": 1.9944554508104084, "grad_norm": 1.0966280698776245, "learning_rate": 1.800616955643017e-05, "loss": 0.3581, "step": 75000 }, { "epoch": 2.0077517318406044, "grad_norm": 1.165394902229309, "learning_rate": 1.7992873098606532e-05, "loss": 0.3426, "step": 75500 }, { "epoch": 2.0210480128708, "grad_norm": 1.4377708435058594, "learning_rate": 1.7979576640782895e-05, "loss": 0.3315, "step": 76000 }, { "epoch": 2.034344293900996, "grad_norm": 1.065831184387207, "learning_rate": 1.7966280182959262e-05, "loss": 0.3348, "step": 76500 }, { "epoch": 2.047640574931192, "grad_norm": 1.0377391576766968, "learning_rate": 1.7953010318051273e-05, "loss": 0.3297, "step": 77000 }, { "epoch": 2.0609368559613874, "grad_norm": 1.28931725025177, "learning_rate": 1.7939713860227636e-05, "loss": 0.3353, "step": 77500 }, { "epoch": 2.0742331369915834, "grad_norm": 0.9061357378959656, "learning_rate": 1.7926417402404003e-05, "loss": 0.3373, "step": 78000 }, { "epoch": 2.0875294180217794, "grad_norm": 1.162048101425171, "learning_rate": 1.7913120944580366e-05, "loss": 0.3287, "step": 78500 }, { "epoch": 2.1008256990519754, "grad_norm": 1.1295771598815918, "learning_rate": 1.7899851079672377e-05, "loss": 0.3306, "step": 79000 }, { "epoch": 2.114121980082171, "grad_norm": 1.3394831418991089, "learning_rate": 1.788655462184874e-05, "loss": 0.3425, "step": 79500 }, { "epoch": 2.127418261112367, "grad_norm": 0.7900285124778748, "learning_rate": 1.7873258164025103e-05, "loss": 0.3339, "step": 80000 }, { "epoch": 2.127418261112367, "eval_loss": 0.33293381333351135, "eval_runtime": 45.161, "eval_samples_per_second": 67.293, "eval_steps_per_second": 16.829, "step": 80000 }, { "epoch": 2.140714542142563, "grad_norm": 1.2849562168121338, "learning_rate": 1.785996170620147e-05, "loss": 0.3299, "step": 80500 }, { "epoch": 2.1540108231727584, "grad_norm": 1.2442893981933594, "learning_rate": 1.7846665248377833e-05, "loss": 0.3234, "step": 81000 }, { "epoch": 2.1673071042029544, "grad_norm": 0.9078606963157654, "learning_rate": 1.7833368790554197e-05, "loss": 0.3405, "step": 81500 }, { "epoch": 2.1806033852331503, "grad_norm": 1.1512479782104492, "learning_rate": 1.782009892564621e-05, "loss": 0.332, "step": 82000 }, { "epoch": 2.1938996662633463, "grad_norm": 1.0047634840011597, "learning_rate": 1.7806802467822574e-05, "loss": 0.3294, "step": 82500 }, { "epoch": 2.207195947293542, "grad_norm": 0.8341502547264099, "learning_rate": 1.7793506009998937e-05, "loss": 0.3428, "step": 83000 }, { "epoch": 2.220492228323738, "grad_norm": 1.1106518507003784, "learning_rate": 1.7780209552175304e-05, "loss": 0.3312, "step": 83500 }, { "epoch": 2.233788509353934, "grad_norm": 1.0450632572174072, "learning_rate": 1.7766913094351664e-05, "loss": 0.3228, "step": 84000 }, { "epoch": 2.24708479038413, "grad_norm": 1.2051053047180176, "learning_rate": 1.775361663652803e-05, "loss": 0.3231, "step": 84500 }, { 
"epoch": 2.2603810714143253, "grad_norm": 1.2320146560668945, "learning_rate": 1.7740320178704394e-05, "loss": 0.3275, "step": 85000 }, { "epoch": 2.2736773524445213, "grad_norm": 1.878662347793579, "learning_rate": 1.7727023720880757e-05, "loss": 0.3394, "step": 85500 }, { "epoch": 2.2869736334747173, "grad_norm": 0.7427929043769836, "learning_rate": 1.771375385597277e-05, "loss": 0.3334, "step": 86000 }, { "epoch": 2.300269914504913, "grad_norm": 0.6553477644920349, "learning_rate": 1.7700483991064782e-05, "loss": 0.3426, "step": 86500 }, { "epoch": 2.313566195535109, "grad_norm": 1.1945719718933105, "learning_rate": 1.7687187533241145e-05, "loss": 0.3329, "step": 87000 }, { "epoch": 2.3268624765653048, "grad_norm": 1.410997986793518, "learning_rate": 1.7673891075417512e-05, "loss": 0.3309, "step": 87500 }, { "epoch": 2.3401587575955007, "grad_norm": 1.0178992748260498, "learning_rate": 1.7660594617593872e-05, "loss": 0.3247, "step": 88000 }, { "epoch": 2.3534550386256963, "grad_norm": 1.4672740697860718, "learning_rate": 1.764729815977024e-05, "loss": 0.3373, "step": 88500 }, { "epoch": 2.3667513196558922, "grad_norm": 1.1154576539993286, "learning_rate": 1.763402829486225e-05, "loss": 0.3331, "step": 89000 }, { "epoch": 2.3800476006860882, "grad_norm": 1.5529305934906006, "learning_rate": 1.7620731837038613e-05, "loss": 0.3325, "step": 89500 }, { "epoch": 2.3933438817162838, "grad_norm": 1.4075485467910767, "learning_rate": 1.760743537921498e-05, "loss": 0.3252, "step": 90000 }, { "epoch": 2.3933438817162838, "eval_loss": 0.3309895098209381, "eval_runtime": 45.3672, "eval_samples_per_second": 66.987, "eval_steps_per_second": 16.752, "step": 90000 }, { "epoch": 2.4066401627464797, "grad_norm": 1.5770002603530884, "learning_rate": 1.7594138921391343e-05, "loss": 0.3341, "step": 90500 }, { "epoch": 2.4199364437766757, "grad_norm": 1.350996494293213, "learning_rate": 1.7580842463567706e-05, "loss": 0.3316, "step": 91000 }, { "epoch": 2.4332327248068717, "grad_norm": 1.3606607913970947, "learning_rate": 1.7567599191575368e-05, "loss": 0.3437, "step": 91500 }, { "epoch": 2.4465290058370672, "grad_norm": 1.3645395040512085, "learning_rate": 1.7554302733751728e-05, "loss": 0.3287, "step": 92000 }, { "epoch": 2.459825286867263, "grad_norm": 1.6436042785644531, "learning_rate": 1.7541006275928094e-05, "loss": 0.3443, "step": 92500 }, { "epoch": 2.473121567897459, "grad_norm": 0.837202250957489, "learning_rate": 1.7527709818104458e-05, "loss": 0.3221, "step": 93000 }, { "epoch": 2.4864178489276547, "grad_norm": 1.0937107801437378, "learning_rate": 1.751441336028082e-05, "loss": 0.3406, "step": 93500 }, { "epoch": 2.4997141299578507, "grad_norm": 1.4137901067733765, "learning_rate": 1.7501116902457187e-05, "loss": 0.3243, "step": 94000 }, { "epoch": 2.5130104109880467, "grad_norm": 0.8641353249549866, "learning_rate": 1.748782044463355e-05, "loss": 0.328, "step": 94500 }, { "epoch": 2.5263066920182427, "grad_norm": 1.4282252788543701, "learning_rate": 1.7474523986809914e-05, "loss": 0.3362, "step": 95000 }, { "epoch": 2.5396029730484386, "grad_norm": 1.3759071826934814, "learning_rate": 1.746122752898628e-05, "loss": 0.3339, "step": 95500 }, { "epoch": 2.552899254078634, "grad_norm": 1.239725947380066, "learning_rate": 1.7447931071162644e-05, "loss": 0.3419, "step": 96000 }, { "epoch": 2.56619553510883, "grad_norm": 0.919045627117157, "learning_rate": 1.7434634613339007e-05, "loss": 0.3369, "step": 96500 }, { "epoch": 2.5794918161390257, "grad_norm": 1.8451613187789917, "learning_rate": 
1.7421338155515374e-05, "loss": 0.3417, "step": 97000 }, { "epoch": 2.5927880971692217, "grad_norm": 1.0559651851654053, "learning_rate": 1.740806829060738e-05, "loss": 0.3291, "step": 97500 }, { "epoch": 2.6060843781994176, "grad_norm": 1.215611219406128, "learning_rate": 1.7394771832783748e-05, "loss": 0.3252, "step": 98000 }, { "epoch": 2.6193806592296136, "grad_norm": 0.8735470175743103, "learning_rate": 1.738147537496011e-05, "loss": 0.3338, "step": 98500 }, { "epoch": 2.6326769402598096, "grad_norm": 1.5552194118499756, "learning_rate": 1.7368178917136474e-05, "loss": 0.3336, "step": 99000 }, { "epoch": 2.645973221290005, "grad_norm": 1.0891499519348145, "learning_rate": 1.735490905222849e-05, "loss": 0.3302, "step": 99500 }, { "epoch": 2.659269502320201, "grad_norm": 1.1763145923614502, "learning_rate": 1.73416391873205e-05, "loss": 0.33, "step": 100000 }, { "epoch": 2.659269502320201, "eval_loss": 0.3290364444255829, "eval_runtime": 45.2861, "eval_samples_per_second": 67.107, "eval_steps_per_second": 16.782, "step": 100000 }, { "epoch": 2.6725657833503966, "grad_norm": 1.2883906364440918, "learning_rate": 1.7328342729496863e-05, "loss": 0.3306, "step": 100500 }, { "epoch": 2.6858620643805926, "grad_norm": 1.4006438255310059, "learning_rate": 1.7315072864588877e-05, "loss": 0.3342, "step": 101000 }, { "epoch": 2.6991583454107886, "grad_norm": 1.019209861755371, "learning_rate": 1.7301776406765237e-05, "loss": 0.3321, "step": 101500 }, { "epoch": 2.7124546264409846, "grad_norm": 1.1669857501983643, "learning_rate": 1.7288479948941604e-05, "loss": 0.334, "step": 102000 }, { "epoch": 2.7257509074711805, "grad_norm": 0.8057425022125244, "learning_rate": 1.7275183491117967e-05, "loss": 0.3347, "step": 102500 }, { "epoch": 2.739047188501376, "grad_norm": 0.6847462058067322, "learning_rate": 1.726188703329433e-05, "loss": 0.3299, "step": 103000 }, { "epoch": 2.752343469531572, "grad_norm": 1.1315343379974365, "learning_rate": 1.7248590575470697e-05, "loss": 0.3424, "step": 103500 }, { "epoch": 2.765639750561768, "grad_norm": 1.2325061559677124, "learning_rate": 1.723529411764706e-05, "loss": 0.3346, "step": 104000 }, { "epoch": 2.7789360315919636, "grad_norm": 1.2530683279037476, "learning_rate": 1.7221997659823423e-05, "loss": 0.3299, "step": 104500 }, { "epoch": 2.7922323126221595, "grad_norm": 1.4341391324996948, "learning_rate": 1.720870120199979e-05, "loss": 0.3331, "step": 105000 }, { "epoch": 2.8055285936523555, "grad_norm": 0.983672559261322, "learning_rate": 1.7195404744176153e-05, "loss": 0.3229, "step": 105500 }, { "epoch": 2.8188248746825515, "grad_norm": 1.336562156677246, "learning_rate": 1.7182108286352516e-05, "loss": 0.3336, "step": 106000 }, { "epoch": 2.832121155712747, "grad_norm": 1.2226226329803467, "learning_rate": 1.7168811828528883e-05, "loss": 0.3296, "step": 106500 }, { "epoch": 2.845417436742943, "grad_norm": 0.9698474407196045, "learning_rate": 1.7155515370705243e-05, "loss": 0.3362, "step": 107000 }, { "epoch": 2.858713717773139, "grad_norm": 1.006219744682312, "learning_rate": 1.7142245505797257e-05, "loss": 0.3435, "step": 107500 }, { "epoch": 2.8720099988033345, "grad_norm": 1.3786289691925049, "learning_rate": 1.712894904797362e-05, "loss": 0.335, "step": 108000 }, { "epoch": 2.8853062798335305, "grad_norm": 1.3363252878189087, "learning_rate": 1.7115652590149984e-05, "loss": 0.3295, "step": 108500 }, { "epoch": 2.8986025608637265, "grad_norm": 0.9451866149902344, "learning_rate": 1.710235613232635e-05, "loss": 0.3226, "step": 109000 }, { "epoch": 
2.9118988418939225, "grad_norm": 0.9065133929252625, "learning_rate": 1.708908626741836e-05, "loss": 0.3356, "step": 109500 }, { "epoch": 2.925195122924118, "grad_norm": 2.04927396774292, "learning_rate": 1.7075789809594725e-05, "loss": 0.3325, "step": 110000 }, { "epoch": 2.925195122924118, "eval_loss": 0.32618749141693115, "eval_runtime": 44.7607, "eval_samples_per_second": 67.894, "eval_steps_per_second": 16.979, "step": 110000 }, { "epoch": 2.938491403954314, "grad_norm": 1.1163157224655151, "learning_rate": 1.706249335177109e-05, "loss": 0.3257, "step": 110500 }, { "epoch": 2.95178768498451, "grad_norm": 0.929104208946228, "learning_rate": 1.7049196893947454e-05, "loss": 0.3384, "step": 111000 }, { "epoch": 2.9650839660147055, "grad_norm": 1.0319048166275024, "learning_rate": 1.7035927029039465e-05, "loss": 0.3337, "step": 111500 }, { "epoch": 2.9783802470449015, "grad_norm": 1.5630576610565186, "learning_rate": 1.702263057121583e-05, "loss": 0.3372, "step": 112000 }, { "epoch": 2.9916765280750974, "grad_norm": 1.2456904649734497, "learning_rate": 1.7009334113392192e-05, "loss": 0.339, "step": 112500 }, { "epoch": 3.0049728091052934, "grad_norm": 1.1409209966659546, "learning_rate": 1.699603765556856e-05, "loss": 0.3236, "step": 113000 }, { "epoch": 3.018269090135489, "grad_norm": 1.1287899017333984, "learning_rate": 1.698276779066057e-05, "loss": 0.2909, "step": 113500 }, { "epoch": 3.031565371165685, "grad_norm": 1.0493087768554688, "learning_rate": 1.6969471332836933e-05, "loss": 0.2988, "step": 114000 }, { "epoch": 3.044861652195881, "grad_norm": 0.8281829357147217, "learning_rate": 1.69561748750133e-05, "loss": 0.3001, "step": 114500 }, { "epoch": 3.058157933226077, "grad_norm": 1.5901657342910767, "learning_rate": 1.6942878417189663e-05, "loss": 0.2954, "step": 115000 }, { "epoch": 3.0714542142562724, "grad_norm": 1.0881614685058594, "learning_rate": 1.6929581959366026e-05, "loss": 0.2985, "step": 115500 }, { "epoch": 3.0847504952864684, "grad_norm": 1.8229660987854004, "learning_rate": 1.6916285501542392e-05, "loss": 0.2959, "step": 116000 }, { "epoch": 3.0980467763166644, "grad_norm": 0.8429426550865173, "learning_rate": 1.6902989043718752e-05, "loss": 0.2977, "step": 116500 }, { "epoch": 3.11134305734686, "grad_norm": 0.7262697219848633, "learning_rate": 1.688969258589512e-05, "loss": 0.2984, "step": 117000 }, { "epoch": 3.124639338377056, "grad_norm": 1.0413541793823242, "learning_rate": 1.6876396128071482e-05, "loss": 0.2959, "step": 117500 }, { "epoch": 3.137935619407252, "grad_norm": 1.0889323949813843, "learning_rate": 1.686315285607914e-05, "loss": 0.3, "step": 118000 }, { "epoch": 3.151231900437448, "grad_norm": 1.1839392185211182, "learning_rate": 1.6849856398255507e-05, "loss": 0.2975, "step": 118500 }, { "epoch": 3.1645281814676434, "grad_norm": 0.779194712638855, "learning_rate": 1.683655994043187e-05, "loss": 0.3028, "step": 119000 }, { "epoch": 3.1778244624978393, "grad_norm": 1.3120828866958618, "learning_rate": 1.6823263482608234e-05, "loss": 0.2929, "step": 119500 }, { "epoch": 3.1911207435280353, "grad_norm": 1.7091648578643799, "learning_rate": 1.68099670247846e-05, "loss": 0.2948, "step": 120000 }, { "epoch": 3.1911207435280353, "eval_loss": 0.3274182677268982, "eval_runtime": 59.8113, "eval_samples_per_second": 50.81, "eval_steps_per_second": 12.707, "step": 120000 }, { "epoch": 3.2044436171202912, "grad_norm": 1.2384920120239258, "learning_rate": 1.679667056696096e-05, "loss": 0.2823, "step": 120500 }, { "epoch": 3.217739898150487, "grad_norm": 
1.2225531339645386, "learning_rate": 1.6783400702052975e-05, "loss": 0.2925, "step": 121000 }, { "epoch": 3.231036179180683, "grad_norm": 1.3342504501342773, "learning_rate": 1.6770104244229338e-05, "loss": 0.2906, "step": 121500 }, { "epoch": 3.244332460210879, "grad_norm": 1.2133206129074097, "learning_rate": 1.67568077864057e-05, "loss": 0.2877, "step": 122000 }, { "epoch": 3.2576287412410747, "grad_norm": 1.1577247381210327, "learning_rate": 1.6743511328582068e-05, "loss": 0.2876, "step": 122500 }, { "epoch": 3.2709250222712707, "grad_norm": 1.1587375402450562, "learning_rate": 1.673021487075843e-05, "loss": 0.2937, "step": 123000 }, { "epoch": 3.2842213033014667, "grad_norm": 1.1512658596038818, "learning_rate": 1.6716918412934794e-05, "loss": 0.2971, "step": 123500 }, { "epoch": 3.297517584331662, "grad_norm": 0.8885172605514526, "learning_rate": 1.670362195511116e-05, "loss": 0.2825, "step": 124000 }, { "epoch": 3.310813865361858, "grad_norm": 1.435475468635559, "learning_rate": 1.6690325497287524e-05, "loss": 0.2984, "step": 124500 }, { "epoch": 3.324110146392054, "grad_norm": 1.7068617343902588, "learning_rate": 1.6677029039463887e-05, "loss": 0.2844, "step": 125000 }, { "epoch": 3.33740642742225, "grad_norm": 0.8843211531639099, "learning_rate": 1.6663732581640254e-05, "loss": 0.2961, "step": 125500 }, { "epoch": 3.3507027084524457, "grad_norm": 1.1262086629867554, "learning_rate": 1.6650436123816614e-05, "loss": 0.2896, "step": 126000 }, { "epoch": 3.3639989894826416, "grad_norm": 1.073869228363037, "learning_rate": 1.663713966599298e-05, "loss": 0.2923, "step": 126500 }, { "epoch": 3.3772952705128376, "grad_norm": 1.1707890033721924, "learning_rate": 1.6623843208169347e-05, "loss": 0.3003, "step": 127000 }, { "epoch": 3.3905915515430336, "grad_norm": 1.2858471870422363, "learning_rate": 1.6610546750345707e-05, "loss": 0.3041, "step": 127500 }, { "epoch": 3.403887832573229, "grad_norm": 0.9487111568450928, "learning_rate": 1.6597250292522074e-05, "loss": 0.2916, "step": 128000 }, { "epoch": 3.417184113603425, "grad_norm": 1.1401737928390503, "learning_rate": 1.6583953834698437e-05, "loss": 0.2905, "step": 128500 }, { "epoch": 3.430480394633621, "grad_norm": 0.862232506275177, "learning_rate": 1.65706573768748e-05, "loss": 0.292, "step": 129000 }, { "epoch": 3.443776675663817, "grad_norm": 2.11210560798645, "learning_rate": 1.6557360919051167e-05, "loss": 0.2873, "step": 129500 }, { "epoch": 3.4570729566940126, "grad_norm": 1.325928807258606, "learning_rate": 1.654406446122753e-05, "loss": 0.2874, "step": 130000 }, { "epoch": 3.4570729566940126, "eval_loss": 0.27055609226226807, "eval_runtime": 44.0558, "eval_samples_per_second": 68.981, "eval_steps_per_second": 17.251, "step": 130000 }, { "epoch": 3.4703692377242086, "grad_norm": 0.9289048314094543, "learning_rate": 1.6530768003403894e-05, "loss": 0.2955, "step": 130500 }, { "epoch": 3.4836655187544046, "grad_norm": 1.1079119443893433, "learning_rate": 1.651747154558026e-05, "loss": 0.3043, "step": 131000 }, { "epoch": 3.4969617997846, "grad_norm": 0.927392840385437, "learning_rate": 1.650420168067227e-05, "loss": 0.2993, "step": 131500 }, { "epoch": 3.510258080814796, "grad_norm": 1.21388578414917, "learning_rate": 1.6490931815764282e-05, "loss": 0.2922, "step": 132000 }, { "epoch": 3.523554361844992, "grad_norm": 0.8197039365768433, "learning_rate": 1.6477635357940645e-05, "loss": 0.2945, "step": 132500 }, { "epoch": 3.536850642875188, "grad_norm": 1.3974609375, "learning_rate": 1.646433890011701e-05, "loss": 0.2892, "step": 
133000 }, { "epoch": 3.5501469239053836, "grad_norm": 1.4183751344680786, "learning_rate": 1.6451042442293375e-05, "loss": 0.2879, "step": 133500 }, { "epoch": 3.5634432049355795, "grad_norm": 1.218549132347107, "learning_rate": 1.6437772577385386e-05, "loss": 0.3097, "step": 134000 }, { "epoch": 3.5767394859657755, "grad_norm": 1.6601920127868652, "learning_rate": 1.642447611956175e-05, "loss": 0.2867, "step": 134500 }, { "epoch": 3.590035766995971, "grad_norm": 1.3657351732254028, "learning_rate": 1.6411179661738116e-05, "loss": 0.3013, "step": 135000 }, { "epoch": 3.603332048026167, "grad_norm": 1.6279337406158447, "learning_rate": 1.639788320391448e-05, "loss": 0.3024, "step": 135500 }, { "epoch": 3.616628329056363, "grad_norm": 1.4741065502166748, "learning_rate": 1.6384586746090842e-05, "loss": 0.2994, "step": 136000 }, { "epoch": 3.629924610086559, "grad_norm": 1.1416784524917603, "learning_rate": 1.637129028826721e-05, "loss": 0.2996, "step": 136500 }, { "epoch": 3.6432208911167545, "grad_norm": 1.3834564685821533, "learning_rate": 1.635799383044357e-05, "loss": 0.2884, "step": 137000 }, { "epoch": 3.6565171721469505, "grad_norm": 1.2296993732452393, "learning_rate": 1.6344697372619936e-05, "loss": 0.2908, "step": 137500 }, { "epoch": 3.6698134531771465, "grad_norm": 1.3037883043289185, "learning_rate": 1.63314009147963e-05, "loss": 0.2896, "step": 138000 }, { "epoch": 3.683109734207342, "grad_norm": 1.5054666996002197, "learning_rate": 1.6318104456972662e-05, "loss": 0.2949, "step": 138500 }, { "epoch": 3.696406015237538, "grad_norm": 1.01513671875, "learning_rate": 1.630480799914903e-05, "loss": 0.2964, "step": 139000 }, { "epoch": 3.709702296267734, "grad_norm": 1.0762807130813599, "learning_rate": 1.6291511541325392e-05, "loss": 0.2938, "step": 139500 }, { "epoch": 3.72299857729793, "grad_norm": 1.9730701446533203, "learning_rate": 1.6278241676417403e-05, "loss": 0.3019, "step": 140000 }, { "epoch": 3.72299857729793, "eval_loss": 0.2732769846916199, "eval_runtime": 44.8357, "eval_samples_per_second": 67.781, "eval_steps_per_second": 16.951, "step": 140000 }, { "epoch": 3.7362948583281255, "grad_norm": 1.622164011001587, "learning_rate": 1.626494521859377e-05, "loss": 0.272, "step": 140500 }, { "epoch": 3.7495911393583214, "grad_norm": 1.4417494535446167, "learning_rate": 1.6251648760770133e-05, "loss": 0.2696, "step": 141000 }, { "epoch": 3.7628874203885174, "grad_norm": 0.9713847637176514, "learning_rate": 1.6238352302946496e-05, "loss": 0.276, "step": 141500 }, { "epoch": 3.776183701418713, "grad_norm": 0.9661402106285095, "learning_rate": 1.6225055845122863e-05, "loss": 0.2773, "step": 142000 }, { "epoch": 3.789479982448909, "grad_norm": 1.307726502418518, "learning_rate": 1.6211759387299223e-05, "loss": 0.2763, "step": 142500 }, { "epoch": 3.802776263479105, "grad_norm": 1.0191502571105957, "learning_rate": 1.619846292947559e-05, "loss": 0.2872, "step": 143000 }, { "epoch": 3.816072544509301, "grad_norm": 1.083509922027588, "learning_rate": 1.6185166471651952e-05, "loss": 0.2737, "step": 143500 }, { "epoch": 3.829368825539497, "grad_norm": 1.0461057424545288, "learning_rate": 1.6171870013828316e-05, "loss": 0.2745, "step": 144000 }, { "epoch": 3.8426651065696924, "grad_norm": 1.0616631507873535, "learning_rate": 1.6158573556004682e-05, "loss": 0.2753, "step": 144500 }, { "epoch": 3.8559613875998884, "grad_norm": 1.418392539024353, "learning_rate": 1.6145277098181046e-05, "loss": 0.2815, "step": 145000 }, { "epoch": 3.869257668630084, "grad_norm": 1.022754430770874, 
"learning_rate": 1.613198064035741e-05, "loss": 0.2721, "step": 145500 }, { "epoch": 3.88255394966028, "grad_norm": 1.5018330812454224, "learning_rate": 1.6118684182533776e-05, "loss": 0.2804, "step": 146000 }, { "epoch": 3.895850230690476, "grad_norm": 1.0998148918151855, "learning_rate": 1.610538772471014e-05, "loss": 0.2801, "step": 146500 }, { "epoch": 3.909146511720672, "grad_norm": 1.5807846784591675, "learning_rate": 1.6092091266886502e-05, "loss": 0.2821, "step": 147000 }, { "epoch": 3.922442792750868, "grad_norm": 1.443129539489746, "learning_rate": 1.607879480906287e-05, "loss": 0.2734, "step": 147500 }, { "epoch": 3.9357390737810634, "grad_norm": 1.2961505651474, "learning_rate": 1.6065498351239232e-05, "loss": 0.2694, "step": 148000 }, { "epoch": 3.9490353548112593, "grad_norm": 1.235811471939087, "learning_rate": 1.6052228486331243e-05, "loss": 0.2811, "step": 148500 }, { "epoch": 3.962331635841455, "grad_norm": 1.3737879991531372, "learning_rate": 1.6038932028507606e-05, "loss": 0.2796, "step": 149000 }, { "epoch": 3.975627916871651, "grad_norm": 1.3180122375488281, "learning_rate": 1.602563557068397e-05, "loss": 0.2752, "step": 149500 }, { "epoch": 3.988924197901847, "grad_norm": 1.293617606163025, "learning_rate": 1.6012339112860336e-05, "loss": 0.2853, "step": 150000 }, { "epoch": 3.988924197901847, "eval_loss": 0.24187567830085754, "eval_runtime": 44.4954, "eval_samples_per_second": 68.299, "eval_steps_per_second": 17.08, "step": 150000 }, { "epoch": 4.002220478932043, "grad_norm": 1.8393179178237915, "learning_rate": 1.59990426550367e-05, "loss": 0.2718, "step": 150500 }, { "epoch": 4.015516759962239, "grad_norm": 1.5193239450454712, "learning_rate": 1.5985746197213063e-05, "loss": 0.2665, "step": 151000 }, { "epoch": 4.028813040992435, "grad_norm": 1.0958846807479858, "learning_rate": 1.597244973938943e-05, "loss": 0.2725, "step": 151500 }, { "epoch": 4.04210932202263, "grad_norm": 1.564980387687683, "learning_rate": 1.595917987448144e-05, "loss": 0.2745, "step": 152000 }, { "epoch": 4.055405603052826, "grad_norm": 1.3031373023986816, "learning_rate": 1.5945883416657803e-05, "loss": 0.2646, "step": 152500 }, { "epoch": 4.068701884083022, "grad_norm": 1.3589409589767456, "learning_rate": 1.593258695883417e-05, "loss": 0.2777, "step": 153000 }, { "epoch": 4.081998165113218, "grad_norm": 1.092926263809204, "learning_rate": 1.591929050101053e-05, "loss": 0.2787, "step": 153500 }, { "epoch": 4.095294446143414, "grad_norm": 1.362304925918579, "learning_rate": 1.5906020636102544e-05, "loss": 0.273, "step": 154000 }, { "epoch": 4.10859072717361, "grad_norm": 1.3661013841629028, "learning_rate": 1.5892724178278907e-05, "loss": 0.2624, "step": 154500 }, { "epoch": 4.121887008203806, "grad_norm": 0.923414945602417, "learning_rate": 1.587942772045527e-05, "loss": 0.2708, "step": 155000 }, { "epoch": 4.135183289234002, "grad_norm": 1.3246159553527832, "learning_rate": 1.5866131262631637e-05, "loss": 0.2661, "step": 155500 }, { "epoch": 4.148479570264197, "grad_norm": 1.3546644449234009, "learning_rate": 1.5852834804808e-05, "loss": 0.268, "step": 156000 }, { "epoch": 4.161775851294393, "grad_norm": 1.048305630683899, "learning_rate": 1.5839538346984364e-05, "loss": 0.2711, "step": 156500 }, { "epoch": 4.175072132324589, "grad_norm": 1.0217567682266235, "learning_rate": 1.5826268482076378e-05, "loss": 0.2689, "step": 157000 }, { "epoch": 4.188368413354785, "grad_norm": 1.253959059715271, "learning_rate": 1.5812972024252738e-05, "loss": 0.2709, "step": 157500 }, { "epoch": 
4.201664694384981, "grad_norm": 1.3631120920181274, "learning_rate": 1.5799675566429105e-05, "loss": 0.2753, "step": 158000 }, { "epoch": 4.214960975415177, "grad_norm": 1.5026861429214478, "learning_rate": 1.578637910860547e-05, "loss": 0.2729, "step": 158500 }, { "epoch": 4.228257256445373, "grad_norm": 1.2928775548934937, "learning_rate": 1.577308265078183e-05, "loss": 0.2645, "step": 159000 }, { "epoch": 4.241553537475568, "grad_norm": 0.8542189598083496, "learning_rate": 1.5759812785873845e-05, "loss": 0.2779, "step": 159500 }, { "epoch": 4.254849818505764, "grad_norm": 1.4010766744613647, "learning_rate": 1.574651632805021e-05, "loss": 0.2709, "step": 160000 }, { "epoch": 4.254849818505764, "eval_loss": 0.24389813840389252, "eval_runtime": 44.8113, "eval_samples_per_second": 67.818, "eval_steps_per_second": 16.96, "step": 160000 }, { "epoch": 4.26814609953596, "grad_norm": 1.1164586544036865, "learning_rate": 1.5733219870226572e-05, "loss": 0.2675, "step": 160500 }, { "epoch": 4.281442380566156, "grad_norm": 1.361721158027649, "learning_rate": 1.571992341240294e-05, "loss": 0.2767, "step": 161000 }, { "epoch": 4.294738661596352, "grad_norm": 1.3751747608184814, "learning_rate": 1.5706626954579302e-05, "loss": 0.2672, "step": 161500 }, { "epoch": 4.308034942626548, "grad_norm": 1.032375693321228, "learning_rate": 1.5693357089671313e-05, "loss": 0.2786, "step": 162000 }, { "epoch": 4.321331223656744, "grad_norm": 1.288988471031189, "learning_rate": 1.568006063184768e-05, "loss": 0.2845, "step": 162500 }, { "epoch": 4.334627504686939, "grad_norm": 1.0330158472061157, "learning_rate": 1.566676417402404e-05, "loss": 0.2811, "step": 163000 }, { "epoch": 4.347923785717135, "grad_norm": 1.7785344123840332, "learning_rate": 1.5653467716200406e-05, "loss": 0.2791, "step": 163500 }, { "epoch": 4.361220066747331, "grad_norm": 1.7403333187103271, "learning_rate": 1.5640197851292417e-05, "loss": 0.2773, "step": 164000 }, { "epoch": 4.374516347777527, "grad_norm": 1.0584579706192017, "learning_rate": 1.562690139346878e-05, "loss": 0.2711, "step": 164500 }, { "epoch": 4.387812628807723, "grad_norm": 1.419774055480957, "learning_rate": 1.5613604935645147e-05, "loss": 0.2762, "step": 165000 }, { "epoch": 4.401108909837919, "grad_norm": 1.3535691499710083, "learning_rate": 1.560030847782151e-05, "loss": 0.271, "step": 165500 }, { "epoch": 4.414405190868115, "grad_norm": 1.4730337858200073, "learning_rate": 1.5587012019997873e-05, "loss": 0.2753, "step": 166000 }, { "epoch": 4.42770147189831, "grad_norm": 1.1973862648010254, "learning_rate": 1.557371556217424e-05, "loss": 0.2731, "step": 166500 }, { "epoch": 4.440997752928506, "grad_norm": 1.2967299222946167, "learning_rate": 1.5560472290181895e-05, "loss": 0.2815, "step": 167000 }, { "epoch": 4.454294033958702, "grad_norm": 1.1886343955993652, "learning_rate": 1.554717583235826e-05, "loss": 0.27, "step": 167500 }, { "epoch": 4.467590314988898, "grad_norm": 1.6000491380691528, "learning_rate": 1.5533879374534625e-05, "loss": 0.28, "step": 168000 }, { "epoch": 4.480886596019094, "grad_norm": 1.6364268064498901, "learning_rate": 1.5520582916710988e-05, "loss": 0.2741, "step": 168500 }, { "epoch": 4.4941828770492895, "grad_norm": 1.4403234720230103, "learning_rate": 1.5507286458887355e-05, "loss": 0.2768, "step": 169000 }, { "epoch": 4.5074791580794855, "grad_norm": 1.1721081733703613, "learning_rate": 1.5493990001063718e-05, "loss": 0.2812, "step": 169500 }, { "epoch": 4.520775439109681, "grad_norm": 1.213318943977356, "learning_rate": 
1.548072013615573e-05, "loss": 0.2772, "step": 170000 }, { "epoch": 4.520775439109681, "eval_loss": 0.2475610226392746, "eval_runtime": 44.8886, "eval_samples_per_second": 67.701, "eval_steps_per_second": 16.931, "step": 170000 }, { "epoch": 4.534071720139877, "grad_norm": 1.3363765478134155, "learning_rate": 1.5467423678332095e-05, "loss": 0.2748, "step": 170500 }, { "epoch": 4.547368001170073, "grad_norm": 1.1976338624954224, "learning_rate": 1.5454127220508455e-05, "loss": 0.2731, "step": 171000 }, { "epoch": 4.5606642822002685, "grad_norm": 1.1618109941482544, "learning_rate": 1.5440830762684822e-05, "loss": 0.2772, "step": 171500 }, { "epoch": 4.5739605632304645, "grad_norm": 1.5081138610839844, "learning_rate": 1.542753430486119e-05, "loss": 0.2711, "step": 172000 }, { "epoch": 4.5872568442606605, "grad_norm": 1.2148953676223755, "learning_rate": 1.541423784703755e-05, "loss": 0.2708, "step": 172500 }, { "epoch": 4.6005531252908565, "grad_norm": 0.9179415702819824, "learning_rate": 1.5400941389213915e-05, "loss": 0.2674, "step": 173000 }, { "epoch": 4.613849406321052, "grad_norm": 1.1413265466690063, "learning_rate": 1.538764493139028e-05, "loss": 0.2747, "step": 173500 }, { "epoch": 4.6271456873512475, "grad_norm": 1.0427721738815308, "learning_rate": 1.537434847356664e-05, "loss": 0.2802, "step": 174000 }, { "epoch": 4.6404419683814435, "grad_norm": 1.8311312198638916, "learning_rate": 1.5361052015743008e-05, "loss": 0.2746, "step": 174500 }, { "epoch": 4.6537382494116395, "grad_norm": 1.482964038848877, "learning_rate": 1.534775555791937e-05, "loss": 0.2661, "step": 175000 }, { "epoch": 4.6670345304418355, "grad_norm": 1.2513574361801147, "learning_rate": 1.5334485693011382e-05, "loss": 0.2654, "step": 175500 }, { "epoch": 4.6803308114720314, "grad_norm": 1.0544030666351318, "learning_rate": 1.532118923518775e-05, "loss": 0.2783, "step": 176000 }, { "epoch": 4.693627092502227, "grad_norm": 1.350472331047058, "learning_rate": 1.530789277736411e-05, "loss": 0.2846, "step": 176500 }, { "epoch": 4.7069233735324225, "grad_norm": 1.0775673389434814, "learning_rate": 1.5294596319540476e-05, "loss": 0.2759, "step": 177000 }, { "epoch": 4.7202196545626185, "grad_norm": 0.8350367546081543, "learning_rate": 1.5281299861716842e-05, "loss": 0.2778, "step": 177500 }, { "epoch": 4.7335159355928145, "grad_norm": 1.2634960412979126, "learning_rate": 1.5268003403893202e-05, "loss": 0.2754, "step": 178000 }, { "epoch": 4.7468122166230104, "grad_norm": 1.2727707624435425, "learning_rate": 1.5254706946069569e-05, "loss": 0.2811, "step": 178500 }, { "epoch": 4.760108497653206, "grad_norm": 1.2721821069717407, "learning_rate": 1.5241410488245934e-05, "loss": 0.2765, "step": 179000 }, { "epoch": 4.773404778683402, "grad_norm": 1.0992257595062256, "learning_rate": 1.522816721625359e-05, "loss": 0.2767, "step": 179500 }, { "epoch": 4.786701059713598, "grad_norm": 0.9015975594520569, "learning_rate": 1.5214870758429955e-05, "loss": 0.2713, "step": 180000 }, { "epoch": 4.786701059713598, "eval_loss": 0.24829423427581787, "eval_runtime": 44.7845, "eval_samples_per_second": 67.858, "eval_steps_per_second": 16.97, "step": 180000 }, { "epoch": 4.7999973407437935, "grad_norm": 1.1640844345092773, "learning_rate": 1.5201600893521966e-05, "loss": 0.2836, "step": 180500 }, { "epoch": 4.8132936217739895, "grad_norm": 1.0341435670852661, "learning_rate": 1.5188304435698331e-05, "loss": 0.2761, "step": 181000 }, { "epoch": 4.826589902804185, "grad_norm": 1.327324628829956, "learning_rate": 1.5175007977874696e-05, 
"loss": 0.2791, "step": 181500 }, { "epoch": 4.839886183834381, "grad_norm": 0.8103187084197998, "learning_rate": 1.5161738112966707e-05, "loss": 0.2879, "step": 182000 }, { "epoch": 4.853182464864577, "grad_norm": 0.7801828384399414, "learning_rate": 1.5148441655143072e-05, "loss": 0.2756, "step": 182500 }, { "epoch": 4.866478745894773, "grad_norm": 1.30203378200531, "learning_rate": 1.5135145197319437e-05, "loss": 0.277, "step": 183000 }, { "epoch": 4.879775026924969, "grad_norm": 1.3664675951004028, "learning_rate": 1.5121848739495799e-05, "loss": 0.2731, "step": 183500 }, { "epoch": 4.893071307955165, "grad_norm": 1.314031720161438, "learning_rate": 1.5108552281672163e-05, "loss": 0.2729, "step": 184000 }, { "epoch": 4.90636758898536, "grad_norm": 0.8956036567687988, "learning_rate": 1.5095255823848527e-05, "loss": 0.2788, "step": 184500 }, { "epoch": 4.919663870015556, "grad_norm": 1.520376205444336, "learning_rate": 1.5081959366024892e-05, "loss": 0.2845, "step": 185000 }, { "epoch": 4.932960151045752, "grad_norm": 1.3505889177322388, "learning_rate": 1.5068662908201257e-05, "loss": 0.2818, "step": 185500 }, { "epoch": 4.946256432075948, "grad_norm": 0.6950279474258423, "learning_rate": 1.505536645037762e-05, "loss": 0.2737, "step": 186000 }, { "epoch": 4.959552713106144, "grad_norm": 1.2404699325561523, "learning_rate": 1.5042069992553985e-05, "loss": 0.2813, "step": 186500 }, { "epoch": 4.97284899413634, "grad_norm": 1.1977089643478394, "learning_rate": 1.5028800127645997e-05, "loss": 0.2758, "step": 187000 }, { "epoch": 4.986145275166536, "grad_norm": 1.2810465097427368, "learning_rate": 1.501550366982236e-05, "loss": 0.287, "step": 187500 }, { "epoch": 4.999441556196732, "grad_norm": 2.558598756790161, "learning_rate": 1.5002207211998726e-05, "loss": 0.2701, "step": 188000 }, { "epoch": 5.012737837226927, "grad_norm": 1.2791069746017456, "learning_rate": 1.498891075417509e-05, "loss": 0.2406, "step": 188500 }, { "epoch": 5.026034118257123, "grad_norm": 1.0913165807724, "learning_rate": 1.4975614296351452e-05, "loss": 0.2307, "step": 189000 }, { "epoch": 5.039330399287319, "grad_norm": 1.1861522197723389, "learning_rate": 1.4962317838527817e-05, "loss": 0.2429, "step": 189500 }, { "epoch": 5.052626680317515, "grad_norm": 1.2686249017715454, "learning_rate": 1.494902138070418e-05, "loss": 0.2395, "step": 190000 }, { "epoch": 5.052626680317515, "eval_loss": 0.2490723580121994, "eval_runtime": 43.0318, "eval_samples_per_second": 70.622, "eval_steps_per_second": 17.661, "step": 190000 }, { "epoch": 5.065922961347711, "grad_norm": 1.4228299856185913, "learning_rate": 1.4935724922880545e-05, "loss": 0.2493, "step": 190500 }, { "epoch": 5.079219242377907, "grad_norm": 1.0055017471313477, "learning_rate": 1.492242846505691e-05, "loss": 0.2522, "step": 191000 }, { "epoch": 5.092515523408102, "grad_norm": 0.8438519239425659, "learning_rate": 1.4909132007233274e-05, "loss": 0.2546, "step": 191500 }, { "epoch": 5.105811804438298, "grad_norm": 1.2010571956634521, "learning_rate": 1.4895862142325286e-05, "loss": 0.2352, "step": 192000 }, { "epoch": 5.119108085468494, "grad_norm": 1.275994062423706, "learning_rate": 1.4882565684501651e-05, "loss": 0.2372, "step": 192500 }, { "epoch": 5.13240436649869, "grad_norm": 0.7371090054512024, "learning_rate": 1.4869269226678014e-05, "loss": 0.2487, "step": 193000 }, { "epoch": 5.145700647528886, "grad_norm": 1.3083887100219727, "learning_rate": 1.485597276885438e-05, "loss": 0.2485, "step": 193500 }, { "epoch": 5.158996928559082, "grad_norm": 
0.972006618976593, "learning_rate": 1.4842676311030744e-05, "loss": 0.2517, "step": 194000 }, { "epoch": 5.172293209589278, "grad_norm": 1.3523329496383667, "learning_rate": 1.4829379853207106e-05, "loss": 0.243, "step": 194500 }, { "epoch": 5.185589490619474, "grad_norm": 1.2454783916473389, "learning_rate": 1.4816109988299118e-05, "loss": 0.2456, "step": 195000 }, { "epoch": 5.198885771649669, "grad_norm": 1.2170870304107666, "learning_rate": 1.480284012339113e-05, "loss": 0.2524, "step": 195500 }, { "epoch": 5.212182052679865, "grad_norm": 1.7185641527175903, "learning_rate": 1.4789570258483142e-05, "loss": 0.2403, "step": 196000 }, { "epoch": 5.225478333710061, "grad_norm": 1.2505462169647217, "learning_rate": 1.4776273800659507e-05, "loss": 0.248, "step": 196500 }, { "epoch": 5.238774614740257, "grad_norm": 1.061320424079895, "learning_rate": 1.4762977342835868e-05, "loss": 0.2522, "step": 197000 }, { "epoch": 5.252070895770453, "grad_norm": 1.1784652471542358, "learning_rate": 1.4749680885012235e-05, "loss": 0.2471, "step": 197500 }, { "epoch": 5.265367176800649, "grad_norm": 1.6446605920791626, "learning_rate": 1.47363844271886e-05, "loss": 0.2459, "step": 198000 }, { "epoch": 5.278663457830845, "grad_norm": 1.7436884641647339, "learning_rate": 1.4723087969364962e-05, "loss": 0.2452, "step": 198500 }, { "epoch": 5.29195973886104, "grad_norm": 1.2458696365356445, "learning_rate": 1.4709791511541326e-05, "loss": 0.2535, "step": 199000 }, { "epoch": 5.305256019891236, "grad_norm": 1.1361490488052368, "learning_rate": 1.469649505371769e-05, "loss": 0.2444, "step": 199500 }, { "epoch": 5.318552300921432, "grad_norm": 1.2189501523971558, "learning_rate": 1.4683198595894055e-05, "loss": 0.2509, "step": 200000 }, { "epoch": 5.318552300921432, "eval_loss": 0.25082090497016907, "eval_runtime": 56.1856, "eval_samples_per_second": 54.089, "eval_steps_per_second": 13.527, "step": 200000 } ], "logging_steps": 500, "max_steps": 752080, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.334195659820237e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }
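
The object above is the trainer_state.json that the Hugging Face Trainer writes into each checkpoint directory: log_history holds a training-loss entry every logging_steps (500) steps and an eval_loss entry every eval_steps (10000) steps, while best_metric and best_model_checkpoint track the lowest evaluation loss seen so far (0.2419 at step 150000 here, out of max_steps 752080). A minimal sketch, in Python, of how the loss curves and best checkpoint can be recovered from such a file; the path results/checkpoint-200000/trainer_state.json is an assumed example location, not taken from the run itself:

import json

# Load the saved Trainer state (path is an assumption; point it at your checkpoint dir).
with open("results/checkpoint-200000/trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss" (logged every `logging_steps` steps);
# evaluation entries carry "eval_loss" (logged every `eval_steps` steps).
train_curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_curve = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

# Recompute the best evaluation point; it should agree with
# state["best_metric"] and state["best_model_checkpoint"].
best_step, best_eval = min(eval_curve, key=lambda p: p[1])
print(f"best eval_loss {best_eval:.4f} at step {best_step} "
      f"(checkpoint: {state['best_model_checkpoint']})")

For the state shown here, this prints a best eval_loss of 0.2419 at step 150000, matching best_model_checkpoint ./results\checkpoint-150000.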