diff --git "a/data/mos-mamba-6x130m-trainer-sft/checkpoint-54000/trainer_state.json" "b/data/mos-mamba-6x130m-trainer-sft/checkpoint-54000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/data/mos-mamba-6x130m-trainer-sft/checkpoint-54000/trainer_state.json" @@ -0,0 +1,25233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.10515697599694655, + "eval_steps": 500, + "global_step": 54000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 2.9210271110262932e-05, + "grad_norm": 5.397985935211182, + "learning_rate": 2.9999999936841446e-05, + "loss": 2.1156, + "step": 15 + }, + { + "epoch": 5.8420542220525864e-05, + "grad_norm": 3.999190092086792, + "learning_rate": 2.999999974736578e-05, + "loss": 2.5268, + "step": 30 + }, + { + "epoch": 8.76308133307888e-05, + "grad_norm": 5.580333709716797, + "learning_rate": 2.999999943157301e-05, + "loss": 2.3955, + "step": 45 + }, + { + "epoch": 0.00011684108444105173, + "grad_norm": 3.6752560138702393, + "learning_rate": 2.9999998989463132e-05, + "loss": 2.171, + "step": 60 + }, + { + "epoch": 0.00014605135555131467, + "grad_norm": 2.655461072921753, + "learning_rate": 2.9999998421036153e-05, + "loss": 2.188, + "step": 75 + }, + { + "epoch": 0.0001752616266615776, + "grad_norm": 2.572371006011963, + "learning_rate": 2.9999997726292083e-05, + "loss": 2.1977, + "step": 90 + }, + { + "epoch": 0.00020447189777184053, + "grad_norm": 8.544811248779297, + "learning_rate": 2.999999690523092e-05, + "loss": 2.1871, + "step": 105 + }, + { + "epoch": 0.00023368216888210345, + "grad_norm": 3.530529737472534, + "learning_rate": 2.999999595785267e-05, + "loss": 2.2238, + "step": 120 + }, + { + "epoch": 0.00026289243999236636, + "grad_norm": 3.671182155609131, + "learning_rate": 2.9999994884157345e-05, + "loss": 2.1338, + "step": 135 + }, + { + "epoch": 0.00029210271110262934, + "grad_norm": 3.9643137454986572, + "learning_rate": 2.9999993684144956e-05, + "loss": 2.0788, + "step": 150 + }, + { + "epoch": 0.00032131298221289227, + "grad_norm": 2.62164044380188, + "learning_rate": 2.9999992357815508e-05, + "loss": 2.117, + "step": 165 + }, + { + "epoch": 0.0003505232533231552, + "grad_norm": 4.189517021179199, + "learning_rate": 2.999999090516902e-05, + "loss": 2.262, + "step": 180 + }, + { + "epoch": 0.0003797335244334181, + "grad_norm": 2.880622625350952, + "learning_rate": 2.9999989326205494e-05, + "loss": 2.1102, + "step": 195 + }, + { + "epoch": 0.00040894379554368105, + "grad_norm": 5.852839469909668, + "learning_rate": 2.9999987620924948e-05, + "loss": 2.0684, + "step": 210 + }, + { + "epoch": 0.000438154066653944, + "grad_norm": 3.6339924335479736, + "learning_rate": 2.9999985789327394e-05, + "loss": 2.2518, + "step": 225 + }, + { + "epoch": 0.0004673643377642069, + "grad_norm": 2.743267297744751, + "learning_rate": 2.9999983831412858e-05, + "loss": 2.1236, + "step": 240 + }, + { + "epoch": 0.0004965746088744698, + "grad_norm": 3.9490363597869873, + "learning_rate": 2.9999981747181345e-05, + "loss": 2.0878, + "step": 255 + }, + { + "epoch": 0.0005257848799847327, + "grad_norm": 3.0342047214508057, + "learning_rate": 2.9999979536632872e-05, + "loss": 2.1415, + "step": 270 + }, + { + "epoch": 0.0005549951510949957, + "grad_norm": 4.045855522155762, + "learning_rate": 2.9999977199767467e-05, + "loss": 2.0558, + "step": 285 + }, + { + "epoch": 0.0005842054222052587, + "grad_norm": 4.325068950653076, + "learning_rate": 2.999997473658514e-05, + "loss": 2.0617, + "step": 300 + }, + { + "epoch": 0.0006134156933155216, + "grad_norm": 4.770871162414551, + "learning_rate": 2.999997214708592e-05, + "loss": 2.0922, + "step": 315 + }, + { + "epoch": 0.0006426259644257845, + "grad_norm": 4.1802754402160645, + "learning_rate": 2.999996943126982e-05, + "loss": 2.217, + "step": 330 + }, + { + "epoch": 0.0006718362355360475, + "grad_norm": 3.044260263442993, + "learning_rate": 2.999996658913687e-05, + "loss": 2.0508, + "step": 345 + }, + { + "epoch": 0.0007010465066463104, + "grad_norm": 4.1525187492370605, + "learning_rate": 2.9999963620687095e-05, + "loss": 2.1678, + "step": 360 + }, + { + "epoch": 0.0007302567777565733, + "grad_norm": 2.5676231384277344, + "learning_rate": 2.9999960525920515e-05, + "loss": 2.1693, + "step": 375 + }, + { + "epoch": 0.0007594670488668362, + "grad_norm": 3.880307674407959, + "learning_rate": 2.9999957304837156e-05, + "loss": 1.9008, + "step": 390 + }, + { + "epoch": 0.0007886773199770992, + "grad_norm": 3.3433754444122314, + "learning_rate": 2.999995395743705e-05, + "loss": 2.0734, + "step": 405 + }, + { + "epoch": 0.0008178875910873621, + "grad_norm": 3.451378583908081, + "learning_rate": 2.999995048372022e-05, + "loss": 1.9901, + "step": 420 + }, + { + "epoch": 0.000847097862197625, + "grad_norm": 3.4411580562591553, + "learning_rate": 2.9999946883686695e-05, + "loss": 1.9734, + "step": 435 + }, + { + "epoch": 0.000876308133307888, + "grad_norm": 3.514651298522949, + "learning_rate": 2.999994315733651e-05, + "loss": 2.1632, + "step": 450 + }, + { + "epoch": 0.0009055184044181509, + "grad_norm": 5.380824565887451, + "learning_rate": 2.999993930466969e-05, + "loss": 2.1534, + "step": 465 + }, + { + "epoch": 0.0009347286755284138, + "grad_norm": 3.768618106842041, + "learning_rate": 2.999993532568628e-05, + "loss": 2.2182, + "step": 480 + }, + { + "epoch": 0.0009639389466386767, + "grad_norm": 2.9490509033203125, + "learning_rate": 2.99999312203863e-05, + "loss": 1.9677, + "step": 495 + }, + { + "epoch": 0.0009931492177489396, + "grad_norm": 3.4881184101104736, + "learning_rate": 2.9999926988769788e-05, + "loss": 2.0786, + "step": 510 + }, + { + "epoch": 0.0010223594888592025, + "grad_norm": 3.788675546646118, + "learning_rate": 2.9999922630836784e-05, + "loss": 1.9696, + "step": 525 + }, + { + "epoch": 0.0010515697599694654, + "grad_norm": 3.070878744125366, + "learning_rate": 2.999991814658732e-05, + "loss": 1.9053, + "step": 540 + }, + { + "epoch": 0.0010807800310797286, + "grad_norm": 2.456150531768799, + "learning_rate": 2.9999913536021436e-05, + "loss": 1.9813, + "step": 555 + }, + { + "epoch": 0.0011099903021899915, + "grad_norm": 4.81670618057251, + "learning_rate": 2.999990879913917e-05, + "loss": 1.9317, + "step": 570 + }, + { + "epoch": 0.0011392005733002544, + "grad_norm": 4.014492988586426, + "learning_rate": 2.9999903935940567e-05, + "loss": 1.9709, + "step": 585 + }, + { + "epoch": 0.0011684108444105174, + "grad_norm": 4.765859603881836, + "learning_rate": 2.999989894642566e-05, + "loss": 1.9252, + "step": 600 + }, + { + "epoch": 0.0011976211155207803, + "grad_norm": 3.5599727630615234, + "learning_rate": 2.9999893830594492e-05, + "loss": 2.0175, + "step": 615 + }, + { + "epoch": 0.0012268313866310432, + "grad_norm": 4.441170692443848, + "learning_rate": 2.999988858844711e-05, + "loss": 2.1166, + "step": 630 + }, + { + "epoch": 0.0012560416577413061, + "grad_norm": 2.691765308380127, + "learning_rate": 2.9999883219983558e-05, + "loss": 1.991, + "step": 645 + }, + { + "epoch": 0.001285251928851569, + "grad_norm": 4.039743423461914, + "learning_rate": 2.9999877725203878e-05, + "loss": 1.9917, + "step": 660 + }, + { + "epoch": 0.001314462199961832, + "grad_norm": 3.0612881183624268, + "learning_rate": 2.9999872104108115e-05, + "loss": 2.0436, + "step": 675 + }, + { + "epoch": 0.001343672471072095, + "grad_norm": 3.3579518795013428, + "learning_rate": 2.9999866356696326e-05, + "loss": 1.9659, + "step": 690 + }, + { + "epoch": 0.0013728827421823579, + "grad_norm": 4.145946502685547, + "learning_rate": 2.9999860482968552e-05, + "loss": 2.0267, + "step": 705 + }, + { + "epoch": 0.0014020930132926208, + "grad_norm": 5.213028430938721, + "learning_rate": 2.999985448292484e-05, + "loss": 2.0953, + "step": 720 + }, + { + "epoch": 0.0014313032844028837, + "grad_norm": 2.440793752670288, + "learning_rate": 2.9999848356565246e-05, + "loss": 2.0218, + "step": 735 + }, + { + "epoch": 0.0014605135555131466, + "grad_norm": 2.6167280673980713, + "learning_rate": 2.9999842103889813e-05, + "loss": 2.0638, + "step": 750 + }, + { + "epoch": 0.0014897238266234096, + "grad_norm": 2.2680811882019043, + "learning_rate": 2.999983572489861e-05, + "loss": 1.973, + "step": 765 + }, + { + "epoch": 0.0015189340977336725, + "grad_norm": 2.905947208404541, + "learning_rate": 2.999982921959167e-05, + "loss": 1.9346, + "step": 780 + }, + { + "epoch": 0.0015481443688439354, + "grad_norm": 4.673079967498779, + "learning_rate": 2.999982258796907e-05, + "loss": 2.1666, + "step": 795 + }, + { + "epoch": 0.0015773546399541984, + "grad_norm": 2.5963408946990967, + "learning_rate": 2.9999815830030846e-05, + "loss": 2.1063, + "step": 810 + }, + { + "epoch": 0.0016065649110644613, + "grad_norm": 3.656632661819458, + "learning_rate": 2.9999808945777066e-05, + "loss": 2.1328, + "step": 825 + }, + { + "epoch": 0.0016357751821747242, + "grad_norm": 4.4109368324279785, + "learning_rate": 2.9999801935207786e-05, + "loss": 1.9602, + "step": 840 + }, + { + "epoch": 0.0016649854532849871, + "grad_norm": 2.10768985748291, + "learning_rate": 2.9999794798323065e-05, + "loss": 1.9766, + "step": 855 + }, + { + "epoch": 0.00169419572439525, + "grad_norm": 4.301672458648682, + "learning_rate": 2.999978753512296e-05, + "loss": 1.9386, + "step": 870 + }, + { + "epoch": 0.001723405995505513, + "grad_norm": 8.171302795410156, + "learning_rate": 2.9999780145607538e-05, + "loss": 2.1288, + "step": 885 + }, + { + "epoch": 0.001752616266615776, + "grad_norm": 3.4784069061279297, + "learning_rate": 2.999977262977685e-05, + "loss": 2.1346, + "step": 900 + }, + { + "epoch": 0.0017818265377260389, + "grad_norm": 2.9945054054260254, + "learning_rate": 2.9999764987630976e-05, + "loss": 2.0432, + "step": 915 + }, + { + "epoch": 0.0018110368088363018, + "grad_norm": 3.632039785385132, + "learning_rate": 2.9999757219169964e-05, + "loss": 2.0642, + "step": 930 + }, + { + "epoch": 0.0018402470799465647, + "grad_norm": 3.9847047328948975, + "learning_rate": 2.999974932439389e-05, + "loss": 2.016, + "step": 945 + }, + { + "epoch": 0.0018694573510568276, + "grad_norm": 1.89451003074646, + "learning_rate": 2.9999741303302816e-05, + "loss": 2.0355, + "step": 960 + }, + { + "epoch": 0.0018986676221670906, + "grad_norm": 6.945189952850342, + "learning_rate": 2.9999733155896814e-05, + "loss": 1.97, + "step": 975 + }, + { + "epoch": 0.0019278778932773535, + "grad_norm": 3.347123384475708, + "learning_rate": 2.9999724882175947e-05, + "loss": 2.0421, + "step": 990 + }, + { + "epoch": 0.0019570881643876164, + "grad_norm": 4.331577777862549, + "learning_rate": 2.9999716482140292e-05, + "loss": 2.133, + "step": 1005 + }, + { + "epoch": 0.001986298435497879, + "grad_norm": 4.221049785614014, + "learning_rate": 2.999970795578991e-05, + "loss": 1.9272, + "step": 1020 + }, + { + "epoch": 0.0020155087066081423, + "grad_norm": 2.820549488067627, + "learning_rate": 2.9999699303124876e-05, + "loss": 2.0116, + "step": 1035 + }, + { + "epoch": 0.002044718977718405, + "grad_norm": 2.2391343116760254, + "learning_rate": 2.9999690524145268e-05, + "loss": 1.9337, + "step": 1050 + }, + { + "epoch": 0.002073929248828668, + "grad_norm": 2.285027503967285, + "learning_rate": 2.9999681618851156e-05, + "loss": 1.9773, + "step": 1065 + }, + { + "epoch": 0.002103139519938931, + "grad_norm": 3.266909599304199, + "learning_rate": 2.9999672587242616e-05, + "loss": 1.9471, + "step": 1080 + }, + { + "epoch": 0.002132349791049194, + "grad_norm": 5.03222131729126, + "learning_rate": 2.9999663429319722e-05, + "loss": 2.1337, + "step": 1095 + }, + { + "epoch": 0.002161560062159457, + "grad_norm": 2.044246196746826, + "learning_rate": 2.9999654145082552e-05, + "loss": 2.06, + "step": 1110 + }, + { + "epoch": 0.00219077033326972, + "grad_norm": 3.1399905681610107, + "learning_rate": 2.9999644734531183e-05, + "loss": 1.9174, + "step": 1125 + }, + { + "epoch": 0.002219980604379983, + "grad_norm": 3.2445454597473145, + "learning_rate": 2.99996351976657e-05, + "loss": 1.9728, + "step": 1140 + }, + { + "epoch": 0.0022491908754902457, + "grad_norm": 2.3682138919830322, + "learning_rate": 2.999962553448618e-05, + "loss": 1.9779, + "step": 1155 + }, + { + "epoch": 0.002278401146600509, + "grad_norm": 2.283808708190918, + "learning_rate": 2.99996157449927e-05, + "loss": 1.964, + "step": 1170 + }, + { + "epoch": 0.0023076114177107716, + "grad_norm": 3.048382043838501, + "learning_rate": 2.9999605829185353e-05, + "loss": 2.0908, + "step": 1185 + }, + { + "epoch": 0.0023368216888210347, + "grad_norm": 6.139120101928711, + "learning_rate": 2.999959578706421e-05, + "loss": 1.9535, + "step": 1200 + }, + { + "epoch": 0.0023660319599312974, + "grad_norm": 2.104069232940674, + "learning_rate": 2.9999585618629363e-05, + "loss": 1.8775, + "step": 1215 + }, + { + "epoch": 0.0023952422310415606, + "grad_norm": 2.531449317932129, + "learning_rate": 2.9999575323880894e-05, + "loss": 2.0309, + "step": 1230 + }, + { + "epoch": 0.0024244525021518233, + "grad_norm": 2.841078996658325, + "learning_rate": 2.9999564902818894e-05, + "loss": 2.0546, + "step": 1245 + }, + { + "epoch": 0.0024536627732620864, + "grad_norm": 2.0627243518829346, + "learning_rate": 2.9999554355443445e-05, + "loss": 1.8833, + "step": 1260 + }, + { + "epoch": 0.002482873044372349, + "grad_norm": 3.4014151096343994, + "learning_rate": 2.9999543681754645e-05, + "loss": 1.8935, + "step": 1275 + }, + { + "epoch": 0.0025120833154826123, + "grad_norm": 3.507380962371826, + "learning_rate": 2.999953288175257e-05, + "loss": 1.9512, + "step": 1290 + }, + { + "epoch": 0.002541293586592875, + "grad_norm": 5.4273362159729, + "learning_rate": 2.9999521955437325e-05, + "loss": 1.9794, + "step": 1305 + }, + { + "epoch": 0.002570503857703138, + "grad_norm": 4.466185092926025, + "learning_rate": 2.9999510902808993e-05, + "loss": 1.8266, + "step": 1320 + }, + { + "epoch": 0.002599714128813401, + "grad_norm": 2.4276440143585205, + "learning_rate": 2.9999499723867672e-05, + "loss": 2.0149, + "step": 1335 + }, + { + "epoch": 0.002628924399923664, + "grad_norm": 3.589876413345337, + "learning_rate": 2.9999488418613454e-05, + "loss": 1.8865, + "step": 1350 + }, + { + "epoch": 0.0026581346710339267, + "grad_norm": 3.750687599182129, + "learning_rate": 2.9999476987046434e-05, + "loss": 2.13, + "step": 1365 + }, + { + "epoch": 0.00268734494214419, + "grad_norm": 3.638850212097168, + "learning_rate": 2.999946542916671e-05, + "loss": 1.981, + "step": 1380 + }, + { + "epoch": 0.0027165552132544526, + "grad_norm": 4.610542297363281, + "learning_rate": 2.9999453744974375e-05, + "loss": 1.8887, + "step": 1395 + }, + { + "epoch": 0.0027457654843647157, + "grad_norm": 2.3758933544158936, + "learning_rate": 2.9999441934469534e-05, + "loss": 1.7867, + "step": 1410 + }, + { + "epoch": 0.0027749757554749784, + "grad_norm": 1.9892805814743042, + "learning_rate": 2.999942999765228e-05, + "loss": 1.8632, + "step": 1425 + }, + { + "epoch": 0.0028041860265852416, + "grad_norm": 2.5921459197998047, + "learning_rate": 2.999941793452272e-05, + "loss": 1.9155, + "step": 1440 + }, + { + "epoch": 0.0028333962976955043, + "grad_norm": 6.098895072937012, + "learning_rate": 2.9999405745080948e-05, + "loss": 2.0112, + "step": 1455 + }, + { + "epoch": 0.0028626065688057674, + "grad_norm": 2.7144930362701416, + "learning_rate": 2.9999393429327073e-05, + "loss": 2.0019, + "step": 1470 + }, + { + "epoch": 0.00289181683991603, + "grad_norm": 2.3964602947235107, + "learning_rate": 2.9999380987261195e-05, + "loss": 1.9524, + "step": 1485 + }, + { + "epoch": 0.0029210271110262933, + "grad_norm": 3.2644741535186768, + "learning_rate": 2.9999368418883422e-05, + "loss": 1.881, + "step": 1500 + }, + { + "epoch": 0.002950237382136556, + "grad_norm": 3.0930943489074707, + "learning_rate": 2.9999355724193854e-05, + "loss": 2.0001, + "step": 1515 + }, + { + "epoch": 0.002979447653246819, + "grad_norm": 2.6339657306671143, + "learning_rate": 2.9999342903192608e-05, + "loss": 1.9344, + "step": 1530 + }, + { + "epoch": 0.003008657924357082, + "grad_norm": 3.7254459857940674, + "learning_rate": 2.999932995587978e-05, + "loss": 2.0673, + "step": 1545 + }, + { + "epoch": 0.003037868195467345, + "grad_norm": 2.7078473567962646, + "learning_rate": 2.999931688225549e-05, + "loss": 2.0223, + "step": 1560 + }, + { + "epoch": 0.0030670784665776077, + "grad_norm": 2.7792718410491943, + "learning_rate": 2.999930368231984e-05, + "loss": 1.902, + "step": 1575 + }, + { + "epoch": 0.003096288737687871, + "grad_norm": 4.607854843139648, + "learning_rate": 2.999929035607294e-05, + "loss": 1.9536, + "step": 1590 + }, + { + "epoch": 0.0031254990087981336, + "grad_norm": 2.4656715393066406, + "learning_rate": 2.999927690351491e-05, + "loss": 2.1792, + "step": 1605 + }, + { + "epoch": 0.0031547092799083967, + "grad_norm": 2.8231770992279053, + "learning_rate": 2.9999263324645863e-05, + "loss": 1.8908, + "step": 1620 + }, + { + "epoch": 0.0031839195510186594, + "grad_norm": 3.5741395950317383, + "learning_rate": 2.9999249619465904e-05, + "loss": 1.9505, + "step": 1635 + }, + { + "epoch": 0.0032131298221289226, + "grad_norm": 2.0066606998443604, + "learning_rate": 2.999923578797516e-05, + "loss": 2.0346, + "step": 1650 + }, + { + "epoch": 0.0032423400932391853, + "grad_norm": 2.5830793380737305, + "learning_rate": 2.999922183017374e-05, + "loss": 1.9546, + "step": 1665 + }, + { + "epoch": 0.0032715503643494484, + "grad_norm": 2.8811633586883545, + "learning_rate": 2.999920774606176e-05, + "loss": 1.8517, + "step": 1680 + }, + { + "epoch": 0.003300760635459711, + "grad_norm": 3.0571601390838623, + "learning_rate": 2.999919353563935e-05, + "loss": 1.8746, + "step": 1695 + }, + { + "epoch": 0.0033299709065699743, + "grad_norm": 2.927371025085449, + "learning_rate": 2.9999179198906614e-05, + "loss": 2.0249, + "step": 1710 + }, + { + "epoch": 0.003359181177680237, + "grad_norm": 2.425579071044922, + "learning_rate": 2.9999164735863685e-05, + "loss": 1.9149, + "step": 1725 + }, + { + "epoch": 0.0033883914487905, + "grad_norm": 3.0711071491241455, + "learning_rate": 2.9999150146510678e-05, + "loss": 1.8335, + "step": 1740 + }, + { + "epoch": 0.003417601719900763, + "grad_norm": 4.509474277496338, + "learning_rate": 2.9999135430847718e-05, + "loss": 2.0002, + "step": 1755 + }, + { + "epoch": 0.003446811991011026, + "grad_norm": 2.784403085708618, + "learning_rate": 2.9999120588874927e-05, + "loss": 1.985, + "step": 1770 + }, + { + "epoch": 0.0034760222621212887, + "grad_norm": 4.556781768798828, + "learning_rate": 2.9999105620592434e-05, + "loss": 1.8832, + "step": 1785 + }, + { + "epoch": 0.003505232533231552, + "grad_norm": 3.1926143169403076, + "learning_rate": 2.9999090526000364e-05, + "loss": 2.0136, + "step": 1800 + }, + { + "epoch": 0.0035344428043418146, + "grad_norm": 3.627634286880493, + "learning_rate": 2.9999075305098846e-05, + "loss": 2.0413, + "step": 1815 + }, + { + "epoch": 0.0035636530754520777, + "grad_norm": 4.250141620635986, + "learning_rate": 2.9999059957887998e-05, + "loss": 1.9076, + "step": 1830 + }, + { + "epoch": 0.0035928633465623404, + "grad_norm": 4.263555526733398, + "learning_rate": 2.999904448436796e-05, + "loss": 1.9782, + "step": 1845 + }, + { + "epoch": 0.0036220736176726036, + "grad_norm": 2.5871047973632812, + "learning_rate": 2.999902888453886e-05, + "loss": 1.9703, + "step": 1860 + }, + { + "epoch": 0.0036512838887828663, + "grad_norm": 2.2850048542022705, + "learning_rate": 2.9999013158400827e-05, + "loss": 1.9571, + "step": 1875 + }, + { + "epoch": 0.0036804941598931294, + "grad_norm": 2.419032096862793, + "learning_rate": 2.9998997305953993e-05, + "loss": 2.1583, + "step": 1890 + }, + { + "epoch": 0.003709704431003392, + "grad_norm": 3.190084934234619, + "learning_rate": 2.9998981327198497e-05, + "loss": 1.9732, + "step": 1905 + }, + { + "epoch": 0.0037389147021136553, + "grad_norm": 3.31540846824646, + "learning_rate": 2.9998965222134468e-05, + "loss": 1.8646, + "step": 1920 + }, + { + "epoch": 0.003768124973223918, + "grad_norm": 2.423227310180664, + "learning_rate": 2.9998948990762044e-05, + "loss": 1.9282, + "step": 1935 + }, + { + "epoch": 0.003797335244334181, + "grad_norm": 3.4635472297668457, + "learning_rate": 2.999893263308136e-05, + "loss": 1.9859, + "step": 1950 + }, + { + "epoch": 0.003826545515444444, + "grad_norm": 2.474353790283203, + "learning_rate": 2.9998916149092556e-05, + "loss": 2.0055, + "step": 1965 + }, + { + "epoch": 0.003855755786554707, + "grad_norm": 3.264435052871704, + "learning_rate": 2.9998899538795766e-05, + "loss": 1.9755, + "step": 1980 + }, + { + "epoch": 0.0038849660576649697, + "grad_norm": 2.028916358947754, + "learning_rate": 2.9998882802191137e-05, + "loss": 1.9914, + "step": 1995 + }, + { + "epoch": 0.003914176328775233, + "grad_norm": 1.9189203977584839, + "learning_rate": 2.9998865939278805e-05, + "loss": 1.8109, + "step": 2010 + }, + { + "epoch": 0.0039433865998854956, + "grad_norm": 3.143857955932617, + "learning_rate": 2.9998848950058913e-05, + "loss": 1.8233, + "step": 2025 + }, + { + "epoch": 0.003972596870995758, + "grad_norm": 2.662853479385376, + "learning_rate": 2.9998831834531608e-05, + "loss": 2.0015, + "step": 2040 + }, + { + "epoch": 0.004001807142106022, + "grad_norm": 2.0305709838867188, + "learning_rate": 2.9998814592697027e-05, + "loss": 1.8817, + "step": 2055 + }, + { + "epoch": 0.0040310174132162846, + "grad_norm": 4.386982440948486, + "learning_rate": 2.999879722455532e-05, + "loss": 1.8511, + "step": 2070 + }, + { + "epoch": 0.004060227684326547, + "grad_norm": 3.3638317584991455, + "learning_rate": 2.9998779730106633e-05, + "loss": 1.9738, + "step": 2085 + }, + { + "epoch": 0.00408943795543681, + "grad_norm": 3.6872494220733643, + "learning_rate": 2.9998762109351107e-05, + "loss": 1.8496, + "step": 2100 + }, + { + "epoch": 0.0041186482265470736, + "grad_norm": 3.2654178142547607, + "learning_rate": 2.9998744362288902e-05, + "loss": 1.9003, + "step": 2115 + }, + { + "epoch": 0.004147858497657336, + "grad_norm": 3.4839179515838623, + "learning_rate": 2.9998726488920162e-05, + "loss": 2.016, + "step": 2130 + }, + { + "epoch": 0.004177068768767599, + "grad_norm": 2.537320613861084, + "learning_rate": 2.9998708489245034e-05, + "loss": 2.0073, + "step": 2145 + }, + { + "epoch": 0.004206279039877862, + "grad_norm": 3.2071378231048584, + "learning_rate": 2.999869036326367e-05, + "loss": 1.9835, + "step": 2160 + }, + { + "epoch": 0.004235489310988125, + "grad_norm": 2.7338645458221436, + "learning_rate": 2.999867211097623e-05, + "loss": 1.8995, + "step": 2175 + }, + { + "epoch": 0.004264699582098388, + "grad_norm": 4.866488933563232, + "learning_rate": 2.999865373238286e-05, + "loss": 1.9656, + "step": 2190 + }, + { + "epoch": 0.004293909853208651, + "grad_norm": 2.67726731300354, + "learning_rate": 2.9998635227483715e-05, + "loss": 2.0401, + "step": 2205 + }, + { + "epoch": 0.004323120124318914, + "grad_norm": 3.1271047592163086, + "learning_rate": 2.9998616596278955e-05, + "loss": 2.0406, + "step": 2220 + }, + { + "epoch": 0.004352330395429177, + "grad_norm": 5.673630237579346, + "learning_rate": 2.9998597838768738e-05, + "loss": 1.9733, + "step": 2235 + }, + { + "epoch": 0.00438154066653944, + "grad_norm": 2.696476697921753, + "learning_rate": 2.9998578954953216e-05, + "loss": 2.16, + "step": 2250 + }, + { + "epoch": 0.004410750937649702, + "grad_norm": 5.409951686859131, + "learning_rate": 2.9998559944832553e-05, + "loss": 1.9401, + "step": 2265 + }, + { + "epoch": 0.004439961208759966, + "grad_norm": 2.183530330657959, + "learning_rate": 2.9998540808406903e-05, + "loss": 1.9405, + "step": 2280 + }, + { + "epoch": 0.004469171479870229, + "grad_norm": 3.3076109886169434, + "learning_rate": 2.9998521545676438e-05, + "loss": 1.9617, + "step": 2295 + }, + { + "epoch": 0.004498381750980491, + "grad_norm": 2.793837547302246, + "learning_rate": 2.999850215664131e-05, + "loss": 1.9463, + "step": 2310 + }, + { + "epoch": 0.004527592022090754, + "grad_norm": 2.6732029914855957, + "learning_rate": 2.9998482641301687e-05, + "loss": 1.9114, + "step": 2325 + }, + { + "epoch": 0.004556802293201018, + "grad_norm": 2.1354994773864746, + "learning_rate": 2.999846299965773e-05, + "loss": 2.0321, + "step": 2340 + }, + { + "epoch": 0.00458601256431128, + "grad_norm": 3.889298439025879, + "learning_rate": 2.9998443231709608e-05, + "loss": 2.0722, + "step": 2355 + }, + { + "epoch": 0.004615222835421543, + "grad_norm": 3.6255850791931152, + "learning_rate": 2.9998423337457486e-05, + "loss": 2.0355, + "step": 2370 + }, + { + "epoch": 0.004644433106531806, + "grad_norm": 2.4627747535705566, + "learning_rate": 2.9998403316901533e-05, + "loss": 2.0076, + "step": 2385 + }, + { + "epoch": 0.004673643377642069, + "grad_norm": 3.3934834003448486, + "learning_rate": 2.9998383170041916e-05, + "loss": 1.9877, + "step": 2400 + }, + { + "epoch": 0.004702853648752332, + "grad_norm": 2.2475860118865967, + "learning_rate": 2.99983628968788e-05, + "loss": 1.8463, + "step": 2415 + }, + { + "epoch": 0.004732063919862595, + "grad_norm": 2.8804898262023926, + "learning_rate": 2.9998342497412365e-05, + "loss": 2.0943, + "step": 2430 + }, + { + "epoch": 0.0047612741909728576, + "grad_norm": 2.2566545009613037, + "learning_rate": 2.999832197164278e-05, + "loss": 1.8834, + "step": 2445 + }, + { + "epoch": 0.004790484462083121, + "grad_norm": 2.948420524597168, + "learning_rate": 2.9998301319570216e-05, + "loss": 2.002, + "step": 2460 + }, + { + "epoch": 0.004819694733193384, + "grad_norm": 3.6838879585266113, + "learning_rate": 2.999828054119484e-05, + "loss": 1.9071, + "step": 2475 + }, + { + "epoch": 0.0048489050043036466, + "grad_norm": 2.2608137130737305, + "learning_rate": 2.9998259636516845e-05, + "loss": 1.9241, + "step": 2490 + }, + { + "epoch": 0.004878115275413909, + "grad_norm": 2.5766665935516357, + "learning_rate": 2.999823860553639e-05, + "loss": 1.9269, + "step": 2505 + }, + { + "epoch": 0.004907325546524173, + "grad_norm": 3.382236957550049, + "learning_rate": 2.9998217448253658e-05, + "loss": 2.0165, + "step": 2520 + }, + { + "epoch": 0.0049365358176344356, + "grad_norm": 2.600278854370117, + "learning_rate": 2.999819616466883e-05, + "loss": 1.8852, + "step": 2535 + }, + { + "epoch": 0.004965746088744698, + "grad_norm": 2.8558318614959717, + "learning_rate": 2.999817475478208e-05, + "loss": 1.9202, + "step": 2550 + }, + { + "epoch": 0.004994956359854961, + "grad_norm": 2.5992493629455566, + "learning_rate": 2.9998153218593594e-05, + "loss": 2.0815, + "step": 2565 + }, + { + "epoch": 0.0050241666309652246, + "grad_norm": 3.0813794136047363, + "learning_rate": 2.9998131556103545e-05, + "loss": 2.0847, + "step": 2580 + }, + { + "epoch": 0.005053376902075487, + "grad_norm": 3.12187123298645, + "learning_rate": 2.999810976731213e-05, + "loss": 1.8592, + "step": 2595 + }, + { + "epoch": 0.00508258717318575, + "grad_norm": 3.4060111045837402, + "learning_rate": 2.9998087852219514e-05, + "loss": 1.8904, + "step": 2610 + }, + { + "epoch": 0.005111797444296013, + "grad_norm": 1.8444907665252686, + "learning_rate": 2.9998065810825895e-05, + "loss": 2.0493, + "step": 2625 + }, + { + "epoch": 0.005141007715406276, + "grad_norm": 2.9746735095977783, + "learning_rate": 2.999804364313145e-05, + "loss": 2.0021, + "step": 2640 + }, + { + "epoch": 0.005170217986516539, + "grad_norm": 2.150517463684082, + "learning_rate": 2.9998021349136373e-05, + "loss": 1.8938, + "step": 2655 + }, + { + "epoch": 0.005199428257626802, + "grad_norm": 2.4695417881011963, + "learning_rate": 2.9997998928840854e-05, + "loss": 1.956, + "step": 2670 + }, + { + "epoch": 0.005228638528737064, + "grad_norm": 3.365466356277466, + "learning_rate": 2.999797638224507e-05, + "loss": 1.8762, + "step": 2685 + }, + { + "epoch": 0.005257848799847328, + "grad_norm": 2.206486940383911, + "learning_rate": 2.999795370934922e-05, + "loss": 2.0777, + "step": 2700 + }, + { + "epoch": 0.005287059070957591, + "grad_norm": 2.5338962078094482, + "learning_rate": 2.9997930910153492e-05, + "loss": 1.9524, + "step": 2715 + }, + { + "epoch": 0.005316269342067853, + "grad_norm": 2.7835092544555664, + "learning_rate": 2.999790798465808e-05, + "loss": 1.9849, + "step": 2730 + }, + { + "epoch": 0.005345479613178116, + "grad_norm": 3.5604777336120605, + "learning_rate": 2.999788493286317e-05, + "loss": 1.9226, + "step": 2745 + }, + { + "epoch": 0.00537468988428838, + "grad_norm": 2.7719836235046387, + "learning_rate": 2.9997861754768965e-05, + "loss": 2.0174, + "step": 2760 + }, + { + "epoch": 0.005403900155398642, + "grad_norm": 2.0660643577575684, + "learning_rate": 2.999783845037566e-05, + "loss": 2.0959, + "step": 2775 + }, + { + "epoch": 0.005433110426508905, + "grad_norm": 2.501246690750122, + "learning_rate": 2.9997815019683443e-05, + "loss": 1.9407, + "step": 2790 + }, + { + "epoch": 0.005462320697619168, + "grad_norm": 3.6545281410217285, + "learning_rate": 2.9997791462692518e-05, + "loss": 2.0155, + "step": 2805 + }, + { + "epoch": 0.005491530968729431, + "grad_norm": 2.2655766010284424, + "learning_rate": 2.9997767779403085e-05, + "loss": 1.7096, + "step": 2820 + }, + { + "epoch": 0.005520741239839694, + "grad_norm": 1.9551692008972168, + "learning_rate": 2.9997743969815337e-05, + "loss": 2.0597, + "step": 2835 + }, + { + "epoch": 0.005549951510949957, + "grad_norm": 4.887934684753418, + "learning_rate": 2.999772003392948e-05, + "loss": 2.0615, + "step": 2850 + }, + { + "epoch": 0.0055791617820602196, + "grad_norm": 3.462581157684326, + "learning_rate": 2.999769597174571e-05, + "loss": 1.9574, + "step": 2865 + }, + { + "epoch": 0.005608372053170483, + "grad_norm": 4.445065498352051, + "learning_rate": 2.9997671783264234e-05, + "loss": 1.9514, + "step": 2880 + }, + { + "epoch": 0.005637582324280746, + "grad_norm": 2.81374454498291, + "learning_rate": 2.9997647468485254e-05, + "loss": 1.9163, + "step": 2895 + }, + { + "epoch": 0.0056667925953910086, + "grad_norm": 3.4871456623077393, + "learning_rate": 2.999762302740898e-05, + "loss": 1.966, + "step": 2910 + }, + { + "epoch": 0.005696002866501271, + "grad_norm": 2.006155490875244, + "learning_rate": 2.9997598460035608e-05, + "loss": 2.009, + "step": 2925 + }, + { + "epoch": 0.005725213137611535, + "grad_norm": 2.104846477508545, + "learning_rate": 2.9997573766365353e-05, + "loss": 1.9507, + "step": 2940 + }, + { + "epoch": 0.0057544234087217976, + "grad_norm": 3.3199880123138428, + "learning_rate": 2.999754894639842e-05, + "loss": 1.9102, + "step": 2955 + }, + { + "epoch": 0.00578363367983206, + "grad_norm": 3.4421074390411377, + "learning_rate": 2.9997524000135015e-05, + "loss": 1.9148, + "step": 2970 + }, + { + "epoch": 0.005812843950942323, + "grad_norm": 3.9583024978637695, + "learning_rate": 2.9997498927575352e-05, + "loss": 1.9872, + "step": 2985 + }, + { + "epoch": 0.0058420542220525866, + "grad_norm": 2.90704607963562, + "learning_rate": 2.999747372871964e-05, + "loss": 1.8581, + "step": 3000 + }, + { + "epoch": 0.005871264493162849, + "grad_norm": 2.327897548675537, + "learning_rate": 2.99974484035681e-05, + "loss": 2.155, + "step": 3015 + }, + { + "epoch": 0.005900474764273112, + "grad_norm": 2.7539587020874023, + "learning_rate": 2.999742295212093e-05, + "loss": 2.0412, + "step": 3030 + }, + { + "epoch": 0.005929685035383375, + "grad_norm": 3.2530441284179688, + "learning_rate": 2.999739737437835e-05, + "loss": 1.8964, + "step": 3045 + }, + { + "epoch": 0.005958895306493638, + "grad_norm": 4.038791179656982, + "learning_rate": 2.9997371670340583e-05, + "loss": 1.8992, + "step": 3060 + }, + { + "epoch": 0.005988105577603901, + "grad_norm": 3.810509443283081, + "learning_rate": 2.999734584000784e-05, + "loss": 2.0157, + "step": 3075 + }, + { + "epoch": 0.006017315848714164, + "grad_norm": 2.6701083183288574, + "learning_rate": 2.9997319883380334e-05, + "loss": 1.9206, + "step": 3090 + }, + { + "epoch": 0.006046526119824426, + "grad_norm": 2.0933597087860107, + "learning_rate": 2.999729380045829e-05, + "loss": 1.8986, + "step": 3105 + }, + { + "epoch": 0.00607573639093469, + "grad_norm": 7.184755802154541, + "learning_rate": 2.9997267591241924e-05, + "loss": 2.0928, + "step": 3120 + }, + { + "epoch": 0.006104946662044953, + "grad_norm": 4.04668664932251, + "learning_rate": 2.9997241255731465e-05, + "loss": 2.1005, + "step": 3135 + }, + { + "epoch": 0.006134156933155215, + "grad_norm": 2.4879837036132812, + "learning_rate": 2.9997214793927122e-05, + "loss": 2.0177, + "step": 3150 + }, + { + "epoch": 0.006163367204265478, + "grad_norm": 2.1891894340515137, + "learning_rate": 2.9997188205829127e-05, + "loss": 1.827, + "step": 3165 + }, + { + "epoch": 0.006192577475375742, + "grad_norm": 2.6386845111846924, + "learning_rate": 2.9997161491437696e-05, + "loss": 2.0118, + "step": 3180 + }, + { + "epoch": 0.006221787746486004, + "grad_norm": 2.792214870452881, + "learning_rate": 2.9997134650753066e-05, + "loss": 1.7433, + "step": 3195 + }, + { + "epoch": 0.006250998017596267, + "grad_norm": 3.470515727996826, + "learning_rate": 2.999710768377545e-05, + "loss": 2.0895, + "step": 3210 + }, + { + "epoch": 0.00628020828870653, + "grad_norm": 3.3043479919433594, + "learning_rate": 2.9997080590505085e-05, + "loss": 2.0023, + "step": 3225 + }, + { + "epoch": 0.006309418559816793, + "grad_norm": 2.1124866008758545, + "learning_rate": 2.9997053370942195e-05, + "loss": 2.0457, + "step": 3240 + }, + { + "epoch": 0.006338628830927056, + "grad_norm": 3.1867144107818604, + "learning_rate": 2.9997026025087004e-05, + "loss": 1.9503, + "step": 3255 + }, + { + "epoch": 0.006367839102037319, + "grad_norm": 2.3057665824890137, + "learning_rate": 2.999699855293975e-05, + "loss": 1.8968, + "step": 3270 + }, + { + "epoch": 0.0063970493731475815, + "grad_norm": 2.416084051132202, + "learning_rate": 2.999697095450066e-05, + "loss": 1.8304, + "step": 3285 + }, + { + "epoch": 0.006426259644257845, + "grad_norm": 2.481602191925049, + "learning_rate": 2.9996943229769977e-05, + "loss": 2.1386, + "step": 3300 + }, + { + "epoch": 0.006455469915368108, + "grad_norm": 2.4862520694732666, + "learning_rate": 2.9996915378747918e-05, + "loss": 1.9672, + "step": 3315 + }, + { + "epoch": 0.0064846801864783706, + "grad_norm": 3.704164743423462, + "learning_rate": 2.999688740143473e-05, + "loss": 1.9671, + "step": 3330 + }, + { + "epoch": 0.006513890457588634, + "grad_norm": 4.340814113616943, + "learning_rate": 2.999685929783064e-05, + "loss": 1.889, + "step": 3345 + }, + { + "epoch": 0.006543100728698897, + "grad_norm": 2.8202598094940186, + "learning_rate": 2.999683106793589e-05, + "loss": 1.883, + "step": 3360 + }, + { + "epoch": 0.0065723109998091596, + "grad_norm": 2.3717639446258545, + "learning_rate": 2.9996802711750716e-05, + "loss": 1.8932, + "step": 3375 + }, + { + "epoch": 0.006601521270919422, + "grad_norm": 2.1720776557922363, + "learning_rate": 2.999677422927536e-05, + "loss": 1.8262, + "step": 3390 + }, + { + "epoch": 0.006630731542029686, + "grad_norm": 6.15664005279541, + "learning_rate": 2.9996745620510055e-05, + "loss": 1.9139, + "step": 3405 + }, + { + "epoch": 0.0066599418131399486, + "grad_norm": 3.080832004547119, + "learning_rate": 2.9996716885455047e-05, + "loss": 1.9838, + "step": 3420 + }, + { + "epoch": 0.006689152084250211, + "grad_norm": 2.1853530406951904, + "learning_rate": 2.9996688024110577e-05, + "loss": 1.9353, + "step": 3435 + }, + { + "epoch": 0.006718362355360474, + "grad_norm": 2.609475612640381, + "learning_rate": 2.9996659036476886e-05, + "loss": 1.9734, + "step": 3450 + }, + { + "epoch": 0.0067475726264707376, + "grad_norm": 3.808016061782837, + "learning_rate": 2.9996629922554225e-05, + "loss": 1.9872, + "step": 3465 + }, + { + "epoch": 0.006776782897581, + "grad_norm": 4.373661994934082, + "learning_rate": 2.999660068234283e-05, + "loss": 1.8084, + "step": 3480 + }, + { + "epoch": 0.006805993168691263, + "grad_norm": 4.299337387084961, + "learning_rate": 2.9996571315842954e-05, + "loss": 2.0801, + "step": 3495 + }, + { + "epoch": 0.006835203439801526, + "grad_norm": 4.49350118637085, + "learning_rate": 2.999654182305484e-05, + "loss": 1.8882, + "step": 3510 + }, + { + "epoch": 0.006864413710911789, + "grad_norm": 2.319342613220215, + "learning_rate": 2.999651220397874e-05, + "loss": 1.9627, + "step": 3525 + }, + { + "epoch": 0.006893623982022052, + "grad_norm": 3.2572271823883057, + "learning_rate": 2.99964824586149e-05, + "loss": 1.9482, + "step": 3540 + }, + { + "epoch": 0.006922834253132315, + "grad_norm": 1.7764968872070312, + "learning_rate": 2.9996452586963575e-05, + "loss": 1.8938, + "step": 3555 + }, + { + "epoch": 0.006952044524242577, + "grad_norm": 2.2628538608551025, + "learning_rate": 2.9996422589025007e-05, + "loss": 1.9644, + "step": 3570 + }, + { + "epoch": 0.006981254795352841, + "grad_norm": 3.8122570514678955, + "learning_rate": 2.999639246479946e-05, + "loss": 1.7568, + "step": 3585 + }, + { + "epoch": 0.007010465066463104, + "grad_norm": 3.868978977203369, + "learning_rate": 2.999636221428718e-05, + "loss": 2.077, + "step": 3600 + }, + { + "epoch": 0.007039675337573366, + "grad_norm": 4.426783561706543, + "learning_rate": 2.999633183748843e-05, + "loss": 1.8161, + "step": 3615 + }, + { + "epoch": 0.007068885608683629, + "grad_norm": 2.3112306594848633, + "learning_rate": 2.9996301334403456e-05, + "loss": 1.8738, + "step": 3630 + }, + { + "epoch": 0.007098095879793893, + "grad_norm": 1.7659751176834106, + "learning_rate": 2.9996270705032523e-05, + "loss": 1.9714, + "step": 3645 + }, + { + "epoch": 0.007127306150904155, + "grad_norm": 5.0493364334106445, + "learning_rate": 2.9996239949375882e-05, + "loss": 1.9451, + "step": 3660 + }, + { + "epoch": 0.007156516422014418, + "grad_norm": 1.6928982734680176, + "learning_rate": 2.9996209067433794e-05, + "loss": 1.7872, + "step": 3675 + }, + { + "epoch": 0.007185726693124681, + "grad_norm": 2.2954142093658447, + "learning_rate": 2.9996178059206525e-05, + "loss": 2.0124, + "step": 3690 + }, + { + "epoch": 0.007214936964234944, + "grad_norm": 4.649162292480469, + "learning_rate": 2.9996146924694327e-05, + "loss": 1.823, + "step": 3705 + }, + { + "epoch": 0.007244147235345207, + "grad_norm": 2.248623847961426, + "learning_rate": 2.9996115663897468e-05, + "loss": 1.9382, + "step": 3720 + }, + { + "epoch": 0.00727335750645547, + "grad_norm": 3.033177375793457, + "learning_rate": 2.999608427681621e-05, + "loss": 2.0324, + "step": 3735 + }, + { + "epoch": 0.0073025677775657325, + "grad_norm": 3.4453887939453125, + "learning_rate": 2.9996052763450817e-05, + "loss": 1.9293, + "step": 3750 + }, + { + "epoch": 0.007331778048675996, + "grad_norm": 3.873504877090454, + "learning_rate": 2.9996021123801556e-05, + "loss": 2.0132, + "step": 3765 + }, + { + "epoch": 0.007360988319786259, + "grad_norm": 3.7370665073394775, + "learning_rate": 2.999598935786869e-05, + "loss": 1.8681, + "step": 3780 + }, + { + "epoch": 0.0073901985908965215, + "grad_norm": 5.5719523429870605, + "learning_rate": 2.999595746565249e-05, + "loss": 1.8799, + "step": 3795 + }, + { + "epoch": 0.007419408862006784, + "grad_norm": 3.7253758907318115, + "learning_rate": 2.9995925447153226e-05, + "loss": 1.8633, + "step": 3810 + }, + { + "epoch": 0.007448619133117048, + "grad_norm": 2.792862892150879, + "learning_rate": 2.9995893302371158e-05, + "loss": 1.9642, + "step": 3825 + }, + { + "epoch": 0.0074778294042273106, + "grad_norm": 2.693080425262451, + "learning_rate": 2.999586103130657e-05, + "loss": 1.8963, + "step": 3840 + }, + { + "epoch": 0.007507039675337573, + "grad_norm": 2.7681262493133545, + "learning_rate": 2.9995828633959724e-05, + "loss": 2.0581, + "step": 3855 + }, + { + "epoch": 0.007536249946447836, + "grad_norm": 3.635828733444214, + "learning_rate": 2.9995796110330894e-05, + "loss": 1.7659, + "step": 3870 + }, + { + "epoch": 0.0075654602175580996, + "grad_norm": 2.46408748626709, + "learning_rate": 2.9995763460420358e-05, + "loss": 1.7966, + "step": 3885 + }, + { + "epoch": 0.007594670488668362, + "grad_norm": 2.2896721363067627, + "learning_rate": 2.999573068422839e-05, + "loss": 1.9918, + "step": 3900 + }, + { + "epoch": 0.007623880759778625, + "grad_norm": 2.3533968925476074, + "learning_rate": 2.9995697781755262e-05, + "loss": 1.8725, + "step": 3915 + }, + { + "epoch": 0.007653091030888888, + "grad_norm": 2.5543251037597656, + "learning_rate": 2.999566475300125e-05, + "loss": 1.8896, + "step": 3930 + }, + { + "epoch": 0.007682301301999151, + "grad_norm": 3.2201671600341797, + "learning_rate": 2.999563159796665e-05, + "loss": 1.8444, + "step": 3945 + }, + { + "epoch": 0.007711511573109414, + "grad_norm": 2.2435178756713867, + "learning_rate": 2.9995598316651713e-05, + "loss": 1.976, + "step": 3960 + }, + { + "epoch": 0.007740721844219677, + "grad_norm": 3.219825506210327, + "learning_rate": 2.999556490905674e-05, + "loss": 1.8561, + "step": 3975 + }, + { + "epoch": 0.007769932115329939, + "grad_norm": 2.4510769844055176, + "learning_rate": 2.9995531375182008e-05, + "loss": 2.068, + "step": 3990 + }, + { + "epoch": 0.007799142386440203, + "grad_norm": 4.242166519165039, + "learning_rate": 2.999549771502779e-05, + "loss": 1.8497, + "step": 4005 + }, + { + "epoch": 0.007828352657550466, + "grad_norm": 2.9145870208740234, + "learning_rate": 2.9995463928594383e-05, + "loss": 1.8896, + "step": 4020 + }, + { + "epoch": 0.007857562928660728, + "grad_norm": 2.33396315574646, + "learning_rate": 2.9995430015882064e-05, + "loss": 1.9502, + "step": 4035 + }, + { + "epoch": 0.007886773199770991, + "grad_norm": 3.1073617935180664, + "learning_rate": 2.9995395976891118e-05, + "loss": 1.9178, + "step": 4050 + }, + { + "epoch": 0.007915983470881254, + "grad_norm": 3.444310188293457, + "learning_rate": 2.9995361811621838e-05, + "loss": 1.7477, + "step": 4065 + }, + { + "epoch": 0.007945193741991517, + "grad_norm": 3.131169080734253, + "learning_rate": 2.9995327520074504e-05, + "loss": 1.8799, + "step": 4080 + }, + { + "epoch": 0.007974404013101781, + "grad_norm": 2.9107017517089844, + "learning_rate": 2.9995293102249408e-05, + "loss": 1.8785, + "step": 4095 + }, + { + "epoch": 0.008003614284212044, + "grad_norm": 2.795280694961548, + "learning_rate": 2.9995258558146834e-05, + "loss": 2.0386, + "step": 4110 + }, + { + "epoch": 0.008032824555322306, + "grad_norm": 5.588748455047607, + "learning_rate": 2.9995223887767087e-05, + "loss": 1.9852, + "step": 4125 + }, + { + "epoch": 0.008062034826432569, + "grad_norm": 2.659045934677124, + "learning_rate": 2.999518909111045e-05, + "loss": 1.8707, + "step": 4140 + }, + { + "epoch": 0.008091245097542832, + "grad_norm": 2.9934346675872803, + "learning_rate": 2.9995154168177214e-05, + "loss": 1.9004, + "step": 4155 + }, + { + "epoch": 0.008120455368653095, + "grad_norm": 2.6598739624023438, + "learning_rate": 2.9995119118967674e-05, + "loss": 1.8897, + "step": 4170 + }, + { + "epoch": 0.008149665639763357, + "grad_norm": 3.6846165657043457, + "learning_rate": 2.9995083943482126e-05, + "loss": 1.9932, + "step": 4185 + }, + { + "epoch": 0.00817887591087362, + "grad_norm": 3.6916184425354004, + "learning_rate": 2.9995048641720873e-05, + "loss": 1.7672, + "step": 4200 + }, + { + "epoch": 0.008208086181983884, + "grad_norm": 3.650599956512451, + "learning_rate": 2.9995013213684202e-05, + "loss": 1.9271, + "step": 4215 + }, + { + "epoch": 0.008237296453094147, + "grad_norm": 4.196152687072754, + "learning_rate": 2.999497765937242e-05, + "loss": 1.8244, + "step": 4230 + }, + { + "epoch": 0.00826650672420441, + "grad_norm": 3.1078038215637207, + "learning_rate": 2.9994941978785817e-05, + "loss": 1.893, + "step": 4245 + }, + { + "epoch": 0.008295716995314673, + "grad_norm": 4.401791095733643, + "learning_rate": 2.9994906171924703e-05, + "loss": 1.8844, + "step": 4260 + }, + { + "epoch": 0.008324927266424935, + "grad_norm": 2.6136245727539062, + "learning_rate": 2.999487023878937e-05, + "loss": 1.8981, + "step": 4275 + }, + { + "epoch": 0.008354137537535198, + "grad_norm": 3.334519624710083, + "learning_rate": 2.9994834179380134e-05, + "loss": 1.9983, + "step": 4290 + }, + { + "epoch": 0.00838334780864546, + "grad_norm": 1.8010269403457642, + "learning_rate": 2.9994797993697283e-05, + "loss": 1.8192, + "step": 4305 + }, + { + "epoch": 0.008412558079755723, + "grad_norm": 3.2746548652648926, + "learning_rate": 2.9994761681741135e-05, + "loss": 1.9696, + "step": 4320 + }, + { + "epoch": 0.008441768350865988, + "grad_norm": 2.172431468963623, + "learning_rate": 2.9994725243511982e-05, + "loss": 1.9068, + "step": 4335 + }, + { + "epoch": 0.00847097862197625, + "grad_norm": 3.5535871982574463, + "learning_rate": 2.999468867901015e-05, + "loss": 1.985, + "step": 4350 + }, + { + "epoch": 0.008500188893086513, + "grad_norm": 3.866422176361084, + "learning_rate": 2.9994651988235923e-05, + "loss": 1.8057, + "step": 4365 + }, + { + "epoch": 0.008529399164196776, + "grad_norm": 4.770716190338135, + "learning_rate": 2.999461517118963e-05, + "loss": 1.9949, + "step": 4380 + }, + { + "epoch": 0.008558609435307039, + "grad_norm": 2.9273180961608887, + "learning_rate": 2.999457822787157e-05, + "loss": 1.8686, + "step": 4395 + }, + { + "epoch": 0.008587819706417301, + "grad_norm": 3.192166328430176, + "learning_rate": 2.9994541158282063e-05, + "loss": 1.9539, + "step": 4410 + }, + { + "epoch": 0.008617029977527564, + "grad_norm": 3.507930040359497, + "learning_rate": 2.9994503962421417e-05, + "loss": 1.7732, + "step": 4425 + }, + { + "epoch": 0.008646240248637829, + "grad_norm": 2.666705369949341, + "learning_rate": 2.9994466640289938e-05, + "loss": 1.9615, + "step": 4440 + }, + { + "epoch": 0.008675450519748091, + "grad_norm": 2.640362501144409, + "learning_rate": 2.999442919188795e-05, + "loss": 1.89, + "step": 4455 + }, + { + "epoch": 0.008704660790858354, + "grad_norm": 2.524216890335083, + "learning_rate": 2.9994391617215765e-05, + "loss": 2.0473, + "step": 4470 + }, + { + "epoch": 0.008733871061968617, + "grad_norm": 2.6444146633148193, + "learning_rate": 2.9994353916273696e-05, + "loss": 1.8889, + "step": 4485 + }, + { + "epoch": 0.00876308133307888, + "grad_norm": 2.4632787704467773, + "learning_rate": 2.9994316089062068e-05, + "loss": 1.9616, + "step": 4500 + }, + { + "epoch": 0.008792291604189142, + "grad_norm": 5.182243347167969, + "learning_rate": 2.999427813558119e-05, + "loss": 1.8216, + "step": 4515 + }, + { + "epoch": 0.008821501875299405, + "grad_norm": 3.6622776985168457, + "learning_rate": 2.9994240055831395e-05, + "loss": 1.8692, + "step": 4530 + }, + { + "epoch": 0.008850712146409668, + "grad_norm": 2.899912118911743, + "learning_rate": 2.9994201849812988e-05, + "loss": 1.9363, + "step": 4545 + }, + { + "epoch": 0.008879922417519932, + "grad_norm": 1.8940974473953247, + "learning_rate": 2.99941635175263e-05, + "loss": 1.8404, + "step": 4560 + }, + { + "epoch": 0.008909132688630195, + "grad_norm": 3.581655263900757, + "learning_rate": 2.9994125058971657e-05, + "loss": 1.9623, + "step": 4575 + }, + { + "epoch": 0.008938342959740457, + "grad_norm": 2.6804749965667725, + "learning_rate": 2.9994086474149375e-05, + "loss": 2.0692, + "step": 4590 + }, + { + "epoch": 0.00896755323085072, + "grad_norm": 4.302793502807617, + "learning_rate": 2.999404776305978e-05, + "loss": 1.9728, + "step": 4605 + }, + { + "epoch": 0.008996763501960983, + "grad_norm": 2.143483877182007, + "learning_rate": 2.9994008925703202e-05, + "loss": 1.941, + "step": 4620 + }, + { + "epoch": 0.009025973773071246, + "grad_norm": 2.2785732746124268, + "learning_rate": 2.9993969962079964e-05, + "loss": 1.9828, + "step": 4635 + }, + { + "epoch": 0.009055184044181508, + "grad_norm": 3.918194055557251, + "learning_rate": 2.9993930872190398e-05, + "loss": 1.9211, + "step": 4650 + }, + { + "epoch": 0.009084394315291771, + "grad_norm": 2.7863261699676514, + "learning_rate": 2.999389165603483e-05, + "loss": 1.8564, + "step": 4665 + }, + { + "epoch": 0.009113604586402035, + "grad_norm": 2.655966281890869, + "learning_rate": 2.9993852313613596e-05, + "loss": 1.9327, + "step": 4680 + }, + { + "epoch": 0.009142814857512298, + "grad_norm": 2.987030506134033, + "learning_rate": 2.999381284492702e-05, + "loss": 1.8325, + "step": 4695 + }, + { + "epoch": 0.00917202512862256, + "grad_norm": 3.127544641494751, + "learning_rate": 2.9993773249975435e-05, + "loss": 1.7246, + "step": 4710 + }, + { + "epoch": 0.009201235399732824, + "grad_norm": 2.395202398300171, + "learning_rate": 2.999373352875918e-05, + "loss": 1.9827, + "step": 4725 + }, + { + "epoch": 0.009230445670843086, + "grad_norm": 4.163525104522705, + "learning_rate": 2.9993693681278582e-05, + "loss": 1.9441, + "step": 4740 + }, + { + "epoch": 0.009259655941953349, + "grad_norm": 2.84067964553833, + "learning_rate": 2.9993653707533985e-05, + "loss": 2.0252, + "step": 4755 + }, + { + "epoch": 0.009288866213063612, + "grad_norm": 1.949599266052246, + "learning_rate": 2.9993613607525717e-05, + "loss": 1.9497, + "step": 4770 + }, + { + "epoch": 0.009318076484173874, + "grad_norm": 4.164729595184326, + "learning_rate": 2.9993573381254124e-05, + "loss": 2.1013, + "step": 4785 + }, + { + "epoch": 0.009347286755284139, + "grad_norm": 1.7438952922821045, + "learning_rate": 2.9993533028719537e-05, + "loss": 1.9461, + "step": 4800 + }, + { + "epoch": 0.009376497026394402, + "grad_norm": 1.9006491899490356, + "learning_rate": 2.9993492549922302e-05, + "loss": 1.9273, + "step": 4815 + }, + { + "epoch": 0.009405707297504664, + "grad_norm": 3.693070411682129, + "learning_rate": 2.9993451944862762e-05, + "loss": 1.9791, + "step": 4830 + }, + { + "epoch": 0.009434917568614927, + "grad_norm": 2.9572765827178955, + "learning_rate": 2.9993411213541248e-05, + "loss": 1.7725, + "step": 4845 + }, + { + "epoch": 0.00946412783972519, + "grad_norm": 2.882349967956543, + "learning_rate": 2.999337035595811e-05, + "loss": 1.8094, + "step": 4860 + }, + { + "epoch": 0.009493338110835452, + "grad_norm": 3.9489054679870605, + "learning_rate": 2.9993329372113695e-05, + "loss": 1.8678, + "step": 4875 + }, + { + "epoch": 0.009522548381945715, + "grad_norm": 2.7020881175994873, + "learning_rate": 2.999328826200834e-05, + "loss": 2.0528, + "step": 4890 + }, + { + "epoch": 0.009551758653055978, + "grad_norm": 3.121814489364624, + "learning_rate": 2.99932470256424e-05, + "loss": 1.9087, + "step": 4905 + }, + { + "epoch": 0.009580968924166242, + "grad_norm": 2.713003158569336, + "learning_rate": 2.9993205663016218e-05, + "loss": 1.8721, + "step": 4920 + }, + { + "epoch": 0.009610179195276505, + "grad_norm": 3.5016112327575684, + "learning_rate": 2.9993164174130137e-05, + "loss": 1.8573, + "step": 4935 + }, + { + "epoch": 0.009639389466386768, + "grad_norm": 4.257192611694336, + "learning_rate": 2.9993122558984516e-05, + "loss": 1.9638, + "step": 4950 + }, + { + "epoch": 0.00966859973749703, + "grad_norm": 3.4689440727233887, + "learning_rate": 2.9993080817579702e-05, + "loss": 1.801, + "step": 4965 + }, + { + "epoch": 0.009697810008607293, + "grad_norm": 2.3255503177642822, + "learning_rate": 2.999303894991605e-05, + "loss": 1.6792, + "step": 4980 + }, + { + "epoch": 0.009727020279717556, + "grad_norm": 2.5599734783172607, + "learning_rate": 2.9992996955993898e-05, + "loss": 2.0037, + "step": 4995 + }, + { + "epoch": 0.009756230550827819, + "grad_norm": 2.528571605682373, + "learning_rate": 2.9992954835813616e-05, + "loss": 1.9778, + "step": 5010 + }, + { + "epoch": 0.009785440821938081, + "grad_norm": 3.466859817504883, + "learning_rate": 2.999291258937555e-05, + "loss": 1.9507, + "step": 5025 + }, + { + "epoch": 0.009814651093048346, + "grad_norm": 2.9515936374664307, + "learning_rate": 2.999287021668006e-05, + "loss": 1.9506, + "step": 5040 + }, + { + "epoch": 0.009843861364158608, + "grad_norm": 2.116895914077759, + "learning_rate": 2.99928277177275e-05, + "loss": 2.0403, + "step": 5055 + }, + { + "epoch": 0.009873071635268871, + "grad_norm": 2.0982749462127686, + "learning_rate": 2.999278509251823e-05, + "loss": 1.981, + "step": 5070 + }, + { + "epoch": 0.009902281906379134, + "grad_norm": 2.1464314460754395, + "learning_rate": 2.9992742341052612e-05, + "loss": 2.0126, + "step": 5085 + }, + { + "epoch": 0.009931492177489397, + "grad_norm": 5.098091125488281, + "learning_rate": 2.9992699463330995e-05, + "loss": 1.8511, + "step": 5100 + }, + { + "epoch": 0.00996070244859966, + "grad_norm": 3.8748703002929688, + "learning_rate": 2.999265645935375e-05, + "loss": 1.9058, + "step": 5115 + }, + { + "epoch": 0.009989912719709922, + "grad_norm": 3.9060122966766357, + "learning_rate": 2.999261332912124e-05, + "loss": 1.9298, + "step": 5130 + }, + { + "epoch": 0.010019122990820185, + "grad_norm": 3.6169393062591553, + "learning_rate": 2.999257007263382e-05, + "loss": 1.6655, + "step": 5145 + }, + { + "epoch": 0.010048333261930449, + "grad_norm": 2.0531461238861084, + "learning_rate": 2.999252668989186e-05, + "loss": 2.0188, + "step": 5160 + }, + { + "epoch": 0.010077543533040712, + "grad_norm": 2.554202079772949, + "learning_rate": 2.9992483180895725e-05, + "loss": 1.8039, + "step": 5175 + }, + { + "epoch": 0.010106753804150975, + "grad_norm": 3.2061095237731934, + "learning_rate": 2.9992439545645778e-05, + "loss": 1.8889, + "step": 5190 + }, + { + "epoch": 0.010135964075261237, + "grad_norm": 2.8294739723205566, + "learning_rate": 2.9992395784142395e-05, + "loss": 1.9291, + "step": 5205 + }, + { + "epoch": 0.0101651743463715, + "grad_norm": 2.8096394538879395, + "learning_rate": 2.9992351896385932e-05, + "loss": 1.924, + "step": 5220 + }, + { + "epoch": 0.010194384617481763, + "grad_norm": 3.7793962955474854, + "learning_rate": 2.999230788237677e-05, + "loss": 1.933, + "step": 5235 + }, + { + "epoch": 0.010223594888592025, + "grad_norm": 5.307252407073975, + "learning_rate": 2.999226374211527e-05, + "loss": 1.9296, + "step": 5250 + }, + { + "epoch": 0.010252805159702288, + "grad_norm": 3.207782506942749, + "learning_rate": 2.9992219475601806e-05, + "loss": 2.0676, + "step": 5265 + }, + { + "epoch": 0.010282015430812553, + "grad_norm": 2.7217934131622314, + "learning_rate": 2.9992175082836765e-05, + "loss": 1.8675, + "step": 5280 + }, + { + "epoch": 0.010311225701922815, + "grad_norm": 3.455260753631592, + "learning_rate": 2.9992130563820497e-05, + "loss": 1.924, + "step": 5295 + }, + { + "epoch": 0.010340435973033078, + "grad_norm": 2.638262987136841, + "learning_rate": 2.9992085918553393e-05, + "loss": 1.9263, + "step": 5310 + }, + { + "epoch": 0.01036964624414334, + "grad_norm": 3.1709418296813965, + "learning_rate": 2.9992041147035828e-05, + "loss": 1.8848, + "step": 5325 + }, + { + "epoch": 0.010398856515253603, + "grad_norm": 2.1629397869110107, + "learning_rate": 2.9991996249268175e-05, + "loss": 1.9271, + "step": 5340 + }, + { + "epoch": 0.010428066786363866, + "grad_norm": 4.2046284675598145, + "learning_rate": 2.999195122525081e-05, + "loss": 1.9208, + "step": 5355 + }, + { + "epoch": 0.010457277057474129, + "grad_norm": 4.966402053833008, + "learning_rate": 2.9991906074984116e-05, + "loss": 1.8059, + "step": 5370 + }, + { + "epoch": 0.010486487328584392, + "grad_norm": 3.0322844982147217, + "learning_rate": 2.9991860798468473e-05, + "loss": 1.9608, + "step": 5385 + }, + { + "epoch": 0.010515697599694656, + "grad_norm": 2.3690319061279297, + "learning_rate": 2.9991815395704266e-05, + "loss": 1.9424, + "step": 5400 + }, + { + "epoch": 0.010544907870804919, + "grad_norm": 2.81915545463562, + "learning_rate": 2.9991769866691865e-05, + "loss": 1.8246, + "step": 5415 + }, + { + "epoch": 0.010574118141915181, + "grad_norm": 3.064317464828491, + "learning_rate": 2.9991724211431667e-05, + "loss": 1.8012, + "step": 5430 + }, + { + "epoch": 0.010603328413025444, + "grad_norm": 4.302711009979248, + "learning_rate": 2.999167842992405e-05, + "loss": 1.9098, + "step": 5445 + }, + { + "epoch": 0.010632538684135707, + "grad_norm": 3.3137192726135254, + "learning_rate": 2.9991632522169398e-05, + "loss": 1.8922, + "step": 5460 + }, + { + "epoch": 0.01066174895524597, + "grad_norm": 3.0742363929748535, + "learning_rate": 2.9991586488168104e-05, + "loss": 2.0403, + "step": 5475 + }, + { + "epoch": 0.010690959226356232, + "grad_norm": 3.034343957901001, + "learning_rate": 2.9991540327920547e-05, + "loss": 1.8944, + "step": 5490 + }, + { + "epoch": 0.010720169497466495, + "grad_norm": 4.131673812866211, + "learning_rate": 2.9991494041427124e-05, + "loss": 1.8897, + "step": 5505 + }, + { + "epoch": 0.01074937976857676, + "grad_norm": 3.8295650482177734, + "learning_rate": 2.999144762868822e-05, + "loss": 1.8522, + "step": 5520 + }, + { + "epoch": 0.010778590039687022, + "grad_norm": 4.337125301361084, + "learning_rate": 2.999140108970423e-05, + "loss": 1.8039, + "step": 5535 + }, + { + "epoch": 0.010807800310797285, + "grad_norm": 4.047338008880615, + "learning_rate": 2.999135442447554e-05, + "loss": 1.964, + "step": 5550 + }, + { + "epoch": 0.010837010581907548, + "grad_norm": 3.5203754901885986, + "learning_rate": 2.9991307633002546e-05, + "loss": 1.9198, + "step": 5565 + }, + { + "epoch": 0.01086622085301781, + "grad_norm": 3.9507977962493896, + "learning_rate": 2.9991260715285642e-05, + "loss": 1.9079, + "step": 5580 + }, + { + "epoch": 0.010895431124128073, + "grad_norm": 4.072928428649902, + "learning_rate": 2.9991213671325223e-05, + "loss": 2.032, + "step": 5595 + }, + { + "epoch": 0.010924641395238336, + "grad_norm": 2.8438544273376465, + "learning_rate": 2.9991166501121685e-05, + "loss": 1.9995, + "step": 5610 + }, + { + "epoch": 0.0109538516663486, + "grad_norm": 3.2885472774505615, + "learning_rate": 2.9991119204675425e-05, + "loss": 1.8653, + "step": 5625 + }, + { + "epoch": 0.010983061937458863, + "grad_norm": 3.6044580936431885, + "learning_rate": 2.9991071781986843e-05, + "loss": 1.9103, + "step": 5640 + }, + { + "epoch": 0.011012272208569126, + "grad_norm": 4.155179977416992, + "learning_rate": 2.9991024233056335e-05, + "loss": 1.8887, + "step": 5655 + }, + { + "epoch": 0.011041482479679388, + "grad_norm": 4.087657451629639, + "learning_rate": 2.9990976557884308e-05, + "loss": 1.967, + "step": 5670 + }, + { + "epoch": 0.011070692750789651, + "grad_norm": 4.724533557891846, + "learning_rate": 2.999092875647116e-05, + "loss": 1.8383, + "step": 5685 + }, + { + "epoch": 0.011099903021899914, + "grad_norm": 7.630526065826416, + "learning_rate": 2.9990880828817287e-05, + "loss": 1.9142, + "step": 5700 + }, + { + "epoch": 0.011129113293010176, + "grad_norm": 4.646370887756348, + "learning_rate": 2.99908327749231e-05, + "loss": 1.8862, + "step": 5715 + }, + { + "epoch": 0.011158323564120439, + "grad_norm": 1.9703487157821655, + "learning_rate": 2.9990784594789e-05, + "loss": 1.8747, + "step": 5730 + }, + { + "epoch": 0.011187533835230704, + "grad_norm": 2.7092244625091553, + "learning_rate": 2.99907362884154e-05, + "loss": 1.9593, + "step": 5745 + }, + { + "epoch": 0.011216744106340966, + "grad_norm": 2.3020477294921875, + "learning_rate": 2.9990687855802695e-05, + "loss": 1.9351, + "step": 5760 + }, + { + "epoch": 0.011245954377451229, + "grad_norm": 3.7924842834472656, + "learning_rate": 2.9990639296951303e-05, + "loss": 1.9888, + "step": 5775 + }, + { + "epoch": 0.011275164648561492, + "grad_norm": 2.8485357761383057, + "learning_rate": 2.9990590611861625e-05, + "loss": 1.7921, + "step": 5790 + }, + { + "epoch": 0.011304374919671754, + "grad_norm": 2.7510149478912354, + "learning_rate": 2.999054180053408e-05, + "loss": 1.9655, + "step": 5805 + }, + { + "epoch": 0.011333585190782017, + "grad_norm": 4.522819519042969, + "learning_rate": 2.999049286296907e-05, + "loss": 1.9384, + "step": 5820 + }, + { + "epoch": 0.01136279546189228, + "grad_norm": 2.325582265853882, + "learning_rate": 2.9990443799167018e-05, + "loss": 1.8342, + "step": 5835 + }, + { + "epoch": 0.011392005733002543, + "grad_norm": 3.583799123764038, + "learning_rate": 2.999039460912832e-05, + "loss": 1.9671, + "step": 5850 + }, + { + "epoch": 0.011421216004112807, + "grad_norm": 2.342571973800659, + "learning_rate": 2.999034529285341e-05, + "loss": 1.8169, + "step": 5865 + }, + { + "epoch": 0.01145042627522307, + "grad_norm": 3.772407054901123, + "learning_rate": 2.9990295850342694e-05, + "loss": 1.8066, + "step": 5880 + }, + { + "epoch": 0.011479636546333332, + "grad_norm": 2.550743341445923, + "learning_rate": 2.9990246281596583e-05, + "loss": 1.9611, + "step": 5895 + }, + { + "epoch": 0.011508846817443595, + "grad_norm": 1.9916815757751465, + "learning_rate": 2.9990196586615502e-05, + "loss": 1.8308, + "step": 5910 + }, + { + "epoch": 0.011538057088553858, + "grad_norm": 3.8781256675720215, + "learning_rate": 2.9990146765399868e-05, + "loss": 1.804, + "step": 5925 + }, + { + "epoch": 0.01156726735966412, + "grad_norm": 1.9027587175369263, + "learning_rate": 2.99900968179501e-05, + "loss": 1.8449, + "step": 5940 + }, + { + "epoch": 0.011596477630774383, + "grad_norm": 2.9147605895996094, + "learning_rate": 2.9990046744266612e-05, + "loss": 1.9283, + "step": 5955 + }, + { + "epoch": 0.011625687901884646, + "grad_norm": 2.0023555755615234, + "learning_rate": 2.9989996544349842e-05, + "loss": 1.9464, + "step": 5970 + }, + { + "epoch": 0.01165489817299491, + "grad_norm": 2.819077253341675, + "learning_rate": 2.9989946218200195e-05, + "loss": 1.7682, + "step": 5985 + }, + { + "epoch": 0.011684108444105173, + "grad_norm": 4.221833229064941, + "learning_rate": 2.998989576581811e-05, + "loss": 1.6921, + "step": 6000 + }, + { + "epoch": 0.011713318715215436, + "grad_norm": 2.9984993934631348, + "learning_rate": 2.9989845187204e-05, + "loss": 1.8921, + "step": 6015 + }, + { + "epoch": 0.011742528986325699, + "grad_norm": 2.866452932357788, + "learning_rate": 2.9989794482358293e-05, + "loss": 1.9759, + "step": 6030 + }, + { + "epoch": 0.011771739257435961, + "grad_norm": 2.398813486099243, + "learning_rate": 2.9989743651281424e-05, + "loss": 1.8948, + "step": 6045 + }, + { + "epoch": 0.011800949528546224, + "grad_norm": 2.6533145904541016, + "learning_rate": 2.998969269397381e-05, + "loss": 1.8532, + "step": 6060 + }, + { + "epoch": 0.011830159799656487, + "grad_norm": 3.8135955333709717, + "learning_rate": 2.998964161043589e-05, + "loss": 1.9564, + "step": 6075 + }, + { + "epoch": 0.01185937007076675, + "grad_norm": 2.5285484790802, + "learning_rate": 2.9989590400668086e-05, + "loss": 1.8229, + "step": 6090 + }, + { + "epoch": 0.011888580341877014, + "grad_norm": 2.0157272815704346, + "learning_rate": 2.9989539064670838e-05, + "loss": 1.9952, + "step": 6105 + }, + { + "epoch": 0.011917790612987277, + "grad_norm": 1.8600102663040161, + "learning_rate": 2.998948760244457e-05, + "loss": 2.0176, + "step": 6120 + }, + { + "epoch": 0.01194700088409754, + "grad_norm": 2.4452428817749023, + "learning_rate": 2.9989436013989718e-05, + "loss": 1.8965, + "step": 6135 + }, + { + "epoch": 0.011976211155207802, + "grad_norm": 3.593918561935425, + "learning_rate": 2.998938429930672e-05, + "loss": 1.9629, + "step": 6150 + }, + { + "epoch": 0.012005421426318065, + "grad_norm": 2.161616086959839, + "learning_rate": 2.9989332458396005e-05, + "loss": 2.0643, + "step": 6165 + }, + { + "epoch": 0.012034631697428327, + "grad_norm": 4.1473493576049805, + "learning_rate": 2.9989280491258015e-05, + "loss": 1.8227, + "step": 6180 + }, + { + "epoch": 0.01206384196853859, + "grad_norm": 3.4432153701782227, + "learning_rate": 2.9989228397893186e-05, + "loss": 1.9787, + "step": 6195 + }, + { + "epoch": 0.012093052239648853, + "grad_norm": 3.591747283935547, + "learning_rate": 2.9989176178301955e-05, + "loss": 1.8248, + "step": 6210 + }, + { + "epoch": 0.012122262510759117, + "grad_norm": 2.385715961456299, + "learning_rate": 2.9989123832484767e-05, + "loss": 2.0869, + "step": 6225 + }, + { + "epoch": 0.01215147278186938, + "grad_norm": 3.128617763519287, + "learning_rate": 2.9989071360442058e-05, + "loss": 1.8622, + "step": 6240 + }, + { + "epoch": 0.012180683052979643, + "grad_norm": 5.122420787811279, + "learning_rate": 2.998901876217427e-05, + "loss": 1.828, + "step": 6255 + }, + { + "epoch": 0.012209893324089905, + "grad_norm": 2.451355218887329, + "learning_rate": 2.9988966037681844e-05, + "loss": 1.8766, + "step": 6270 + }, + { + "epoch": 0.012239103595200168, + "grad_norm": 2.8492448329925537, + "learning_rate": 2.9988913186965232e-05, + "loss": 1.9675, + "step": 6285 + }, + { + "epoch": 0.01226831386631043, + "grad_norm": 4.332848072052002, + "learning_rate": 2.998886021002487e-05, + "loss": 2.0243, + "step": 6300 + }, + { + "epoch": 0.012297524137420694, + "grad_norm": 2.7769408226013184, + "learning_rate": 2.9988807106861208e-05, + "loss": 1.9015, + "step": 6315 + }, + { + "epoch": 0.012326734408530956, + "grad_norm": 2.2205896377563477, + "learning_rate": 2.9988753877474696e-05, + "loss": 1.8927, + "step": 6330 + }, + { + "epoch": 0.01235594467964122, + "grad_norm": 4.445504665374756, + "learning_rate": 2.9988700521865777e-05, + "loss": 1.9646, + "step": 6345 + }, + { + "epoch": 0.012385154950751483, + "grad_norm": 2.951155662536621, + "learning_rate": 2.9988647040034905e-05, + "loss": 1.8487, + "step": 6360 + }, + { + "epoch": 0.012414365221861746, + "grad_norm": 4.908761501312256, + "learning_rate": 2.998859343198253e-05, + "loss": 1.8704, + "step": 6375 + }, + { + "epoch": 0.012443575492972009, + "grad_norm": 3.0668880939483643, + "learning_rate": 2.9988539697709098e-05, + "loss": 1.8632, + "step": 6390 + }, + { + "epoch": 0.012472785764082272, + "grad_norm": 1.8523426055908203, + "learning_rate": 2.9988485837215068e-05, + "loss": 1.8434, + "step": 6405 + }, + { + "epoch": 0.012501996035192534, + "grad_norm": 4.285421848297119, + "learning_rate": 2.9988431850500887e-05, + "loss": 1.8438, + "step": 6420 + }, + { + "epoch": 0.012531206306302797, + "grad_norm": 1.9119669198989868, + "learning_rate": 2.9988377737567013e-05, + "loss": 1.8469, + "step": 6435 + }, + { + "epoch": 0.01256041657741306, + "grad_norm": 2.430860757827759, + "learning_rate": 2.9988323498413907e-05, + "loss": 2.1108, + "step": 6450 + }, + { + "epoch": 0.012589626848523324, + "grad_norm": 2.2205920219421387, + "learning_rate": 2.9988269133042016e-05, + "loss": 1.8481, + "step": 6465 + }, + { + "epoch": 0.012618837119633587, + "grad_norm": 2.0588815212249756, + "learning_rate": 2.9988214641451804e-05, + "loss": 1.8007, + "step": 6480 + }, + { + "epoch": 0.01264804739074385, + "grad_norm": 2.527836322784424, + "learning_rate": 2.998816002364373e-05, + "loss": 1.9974, + "step": 6495 + }, + { + "epoch": 0.012677257661854112, + "grad_norm": 3.0486202239990234, + "learning_rate": 2.9988105279618253e-05, + "loss": 1.8459, + "step": 6510 + }, + { + "epoch": 0.012706467932964375, + "grad_norm": 2.6629745960235596, + "learning_rate": 2.998805040937583e-05, + "loss": 1.9373, + "step": 6525 + }, + { + "epoch": 0.012735678204074638, + "grad_norm": 3.2922544479370117, + "learning_rate": 2.9987995412916928e-05, + "loss": 1.8839, + "step": 6540 + }, + { + "epoch": 0.0127648884751849, + "grad_norm": 4.026648044586182, + "learning_rate": 2.998794029024201e-05, + "loss": 2.1104, + "step": 6555 + }, + { + "epoch": 0.012794098746295163, + "grad_norm": 2.473214864730835, + "learning_rate": 2.998788504135154e-05, + "loss": 1.664, + "step": 6570 + }, + { + "epoch": 0.012823309017405428, + "grad_norm": 2.8834683895111084, + "learning_rate": 2.998782966624598e-05, + "loss": 1.9737, + "step": 6585 + }, + { + "epoch": 0.01285251928851569, + "grad_norm": 3.123595952987671, + "learning_rate": 2.99877741649258e-05, + "loss": 2.1071, + "step": 6600 + }, + { + "epoch": 0.012881729559625953, + "grad_norm": 2.3172380924224854, + "learning_rate": 2.998771853739146e-05, + "loss": 1.8585, + "step": 6615 + }, + { + "epoch": 0.012910939830736216, + "grad_norm": 3.4954440593719482, + "learning_rate": 2.998766278364344e-05, + "loss": 1.8458, + "step": 6630 + }, + { + "epoch": 0.012940150101846478, + "grad_norm": 2.48248553276062, + "learning_rate": 2.9987606903682203e-05, + "loss": 1.9398, + "step": 6645 + }, + { + "epoch": 0.012969360372956741, + "grad_norm": 2.7575950622558594, + "learning_rate": 2.998755089750822e-05, + "loss": 1.8692, + "step": 6660 + }, + { + "epoch": 0.012998570644067004, + "grad_norm": 2.821286678314209, + "learning_rate": 2.9987494765121962e-05, + "loss": 2.1104, + "step": 6675 + }, + { + "epoch": 0.013027780915177268, + "grad_norm": 3.881669521331787, + "learning_rate": 2.99874385065239e-05, + "loss": 1.8453, + "step": 6690 + }, + { + "epoch": 0.013056991186287531, + "grad_norm": 2.573246955871582, + "learning_rate": 2.9987382121714516e-05, + "loss": 1.9538, + "step": 6705 + }, + { + "epoch": 0.013086201457397794, + "grad_norm": 4.5084052085876465, + "learning_rate": 2.9987325610694277e-05, + "loss": 1.9216, + "step": 6720 + }, + { + "epoch": 0.013115411728508056, + "grad_norm": 3.882384777069092, + "learning_rate": 2.9987268973463662e-05, + "loss": 1.7694, + "step": 6735 + }, + { + "epoch": 0.013144621999618319, + "grad_norm": 2.769113302230835, + "learning_rate": 2.9987212210023147e-05, + "loss": 1.9591, + "step": 6750 + }, + { + "epoch": 0.013173832270728582, + "grad_norm": 3.9864275455474854, + "learning_rate": 2.9987155320373207e-05, + "loss": 1.9079, + "step": 6765 + }, + { + "epoch": 0.013203042541838845, + "grad_norm": 1.9832093715667725, + "learning_rate": 2.998709830451433e-05, + "loss": 1.8263, + "step": 6780 + }, + { + "epoch": 0.013232252812949107, + "grad_norm": 5.0461106300354, + "learning_rate": 2.9987041162446985e-05, + "loss": 1.8265, + "step": 6795 + }, + { + "epoch": 0.013261463084059372, + "grad_norm": 2.5139777660369873, + "learning_rate": 2.998698389417166e-05, + "loss": 1.8644, + "step": 6810 + }, + { + "epoch": 0.013290673355169634, + "grad_norm": 2.5377390384674072, + "learning_rate": 2.998692649968884e-05, + "loss": 1.9956, + "step": 6825 + }, + { + "epoch": 0.013319883626279897, + "grad_norm": 2.68589186668396, + "learning_rate": 2.9986868978998998e-05, + "loss": 1.9298, + "step": 6840 + }, + { + "epoch": 0.01334909389739016, + "grad_norm": 5.362221717834473, + "learning_rate": 2.9986811332102624e-05, + "loss": 1.8319, + "step": 6855 + }, + { + "epoch": 0.013378304168500423, + "grad_norm": 3.440687417984009, + "learning_rate": 2.9986753559000207e-05, + "loss": 1.8854, + "step": 6870 + }, + { + "epoch": 0.013407514439610685, + "grad_norm": 2.2173657417297363, + "learning_rate": 2.9986695659692233e-05, + "loss": 1.8672, + "step": 6885 + }, + { + "epoch": 0.013436724710720948, + "grad_norm": 2.1564152240753174, + "learning_rate": 2.998663763417918e-05, + "loss": 1.9548, + "step": 6900 + }, + { + "epoch": 0.01346593498183121, + "grad_norm": 2.339550018310547, + "learning_rate": 2.9986579482461552e-05, + "loss": 1.7549, + "step": 6915 + }, + { + "epoch": 0.013495145252941475, + "grad_norm": 8.277188301086426, + "learning_rate": 2.9986521204539824e-05, + "loss": 1.8958, + "step": 6930 + }, + { + "epoch": 0.013524355524051738, + "grad_norm": 3.044663190841675, + "learning_rate": 2.9986462800414498e-05, + "loss": 1.9019, + "step": 6945 + }, + { + "epoch": 0.013553565795162, + "grad_norm": 2.622844696044922, + "learning_rate": 2.9986404270086056e-05, + "loss": 1.8349, + "step": 6960 + }, + { + "epoch": 0.013582776066272263, + "grad_norm": 2.686244010925293, + "learning_rate": 2.9986345613554998e-05, + "loss": 1.9178, + "step": 6975 + }, + { + "epoch": 0.013611986337382526, + "grad_norm": 3.487210512161255, + "learning_rate": 2.9986286830821817e-05, + "loss": 2.0353, + "step": 6990 + }, + { + "epoch": 0.013641196608492789, + "grad_norm": 3.297060966491699, + "learning_rate": 2.9986227921887005e-05, + "loss": 2.0344, + "step": 7005 + }, + { + "epoch": 0.013670406879603051, + "grad_norm": 6.4708571434021, + "learning_rate": 2.9986168886751064e-05, + "loss": 1.946, + "step": 7020 + }, + { + "epoch": 0.013699617150713314, + "grad_norm": 3.04331636428833, + "learning_rate": 2.9986109725414485e-05, + "loss": 1.9229, + "step": 7035 + }, + { + "epoch": 0.013728827421823579, + "grad_norm": 3.3614730834960938, + "learning_rate": 2.9986050437877762e-05, + "loss": 1.8943, + "step": 7050 + }, + { + "epoch": 0.013758037692933841, + "grad_norm": 2.632763624191284, + "learning_rate": 2.998599102414141e-05, + "loss": 1.907, + "step": 7065 + }, + { + "epoch": 0.013787247964044104, + "grad_norm": 2.441969871520996, + "learning_rate": 2.998593148420592e-05, + "loss": 1.9382, + "step": 7080 + }, + { + "epoch": 0.013816458235154367, + "grad_norm": 3.3013856410980225, + "learning_rate": 2.9985871818071784e-05, + "loss": 1.7866, + "step": 7095 + }, + { + "epoch": 0.01384566850626463, + "grad_norm": 3.12922739982605, + "learning_rate": 2.9985812025739518e-05, + "loss": 1.9155, + "step": 7110 + }, + { + "epoch": 0.013874878777374892, + "grad_norm": 3.3649489879608154, + "learning_rate": 2.998575210720962e-05, + "loss": 1.7256, + "step": 7125 + }, + { + "epoch": 0.013904089048485155, + "grad_norm": 2.381049871444702, + "learning_rate": 2.9985692062482603e-05, + "loss": 1.9324, + "step": 7140 + }, + { + "epoch": 0.013933299319595418, + "grad_norm": 2.5328924655914307, + "learning_rate": 2.998563189155896e-05, + "loss": 2.0635, + "step": 7155 + }, + { + "epoch": 0.013962509590705682, + "grad_norm": 3.5615475177764893, + "learning_rate": 2.99855715944392e-05, + "loss": 1.8377, + "step": 7170 + }, + { + "epoch": 0.013991719861815945, + "grad_norm": 3.7230849266052246, + "learning_rate": 2.998551117112384e-05, + "loss": 1.839, + "step": 7185 + }, + { + "epoch": 0.014020930132926207, + "grad_norm": 2.7863929271698, + "learning_rate": 2.998545062161338e-05, + "loss": 1.7265, + "step": 7200 + }, + { + "epoch": 0.01405014040403647, + "grad_norm": 2.238398790359497, + "learning_rate": 2.9985389945908332e-05, + "loss": 1.9747, + "step": 7215 + }, + { + "epoch": 0.014079350675146733, + "grad_norm": 2.3368685245513916, + "learning_rate": 2.998532914400921e-05, + "loss": 2.0277, + "step": 7230 + }, + { + "epoch": 0.014108560946256996, + "grad_norm": 3.667415142059326, + "learning_rate": 2.9985268215916523e-05, + "loss": 1.8947, + "step": 7245 + }, + { + "epoch": 0.014137771217367258, + "grad_norm": 5.902375221252441, + "learning_rate": 2.9985207161630784e-05, + "loss": 2.023, + "step": 7260 + }, + { + "epoch": 0.014166981488477521, + "grad_norm": 3.5179100036621094, + "learning_rate": 2.998514598115251e-05, + "loss": 1.9654, + "step": 7275 + }, + { + "epoch": 0.014196191759587785, + "grad_norm": 4.09523344039917, + "learning_rate": 2.9985084674482207e-05, + "loss": 1.9822, + "step": 7290 + }, + { + "epoch": 0.014225402030698048, + "grad_norm": 3.9758706092834473, + "learning_rate": 2.9985023241620405e-05, + "loss": 1.7241, + "step": 7305 + }, + { + "epoch": 0.01425461230180831, + "grad_norm": 4.463382720947266, + "learning_rate": 2.9984961682567614e-05, + "loss": 1.9208, + "step": 7320 + }, + { + "epoch": 0.014283822572918574, + "grad_norm": 2.2591640949249268, + "learning_rate": 2.9984899997324357e-05, + "loss": 1.9372, + "step": 7335 + }, + { + "epoch": 0.014313032844028836, + "grad_norm": 2.958768129348755, + "learning_rate": 2.998483818589114e-05, + "loss": 1.9092, + "step": 7350 + }, + { + "epoch": 0.014342243115139099, + "grad_norm": 2.4651241302490234, + "learning_rate": 2.99847762482685e-05, + "loss": 1.7909, + "step": 7365 + }, + { + "epoch": 0.014371453386249362, + "grad_norm": 4.785153388977051, + "learning_rate": 2.9984714184456948e-05, + "loss": 2.0846, + "step": 7380 + }, + { + "epoch": 0.014400663657359624, + "grad_norm": 5.212136745452881, + "learning_rate": 2.9984651994457013e-05, + "loss": 1.7623, + "step": 7395 + }, + { + "epoch": 0.014429873928469889, + "grad_norm": 3.5508365631103516, + "learning_rate": 2.9984589678269216e-05, + "loss": 1.8922, + "step": 7410 + }, + { + "epoch": 0.014459084199580152, + "grad_norm": 4.388883113861084, + "learning_rate": 2.998452723589408e-05, + "loss": 1.9701, + "step": 7425 + }, + { + "epoch": 0.014488294470690414, + "grad_norm": 2.4577128887176514, + "learning_rate": 2.9984464667332135e-05, + "loss": 1.8807, + "step": 7440 + }, + { + "epoch": 0.014517504741800677, + "grad_norm": 3.046642541885376, + "learning_rate": 2.99844019725839e-05, + "loss": 2.0671, + "step": 7455 + }, + { + "epoch": 0.01454671501291094, + "grad_norm": 2.639798879623413, + "learning_rate": 2.9984339151649913e-05, + "loss": 1.7981, + "step": 7470 + }, + { + "epoch": 0.014575925284021202, + "grad_norm": 4.231945991516113, + "learning_rate": 2.9984276204530702e-05, + "loss": 1.822, + "step": 7485 + }, + { + "epoch": 0.014605135555131465, + "grad_norm": 2.3419246673583984, + "learning_rate": 2.9984213131226788e-05, + "loss": 1.8931, + "step": 7500 + }, + { + "epoch": 0.014634345826241728, + "grad_norm": 3.034775733947754, + "learning_rate": 2.998414993173871e-05, + "loss": 1.8925, + "step": 7515 + }, + { + "epoch": 0.014663556097351992, + "grad_norm": 2.818657875061035, + "learning_rate": 2.9984086606066997e-05, + "loss": 1.8714, + "step": 7530 + }, + { + "epoch": 0.014692766368462255, + "grad_norm": 2.0940043926239014, + "learning_rate": 2.9984023154212183e-05, + "loss": 1.9476, + "step": 7545 + }, + { + "epoch": 0.014721976639572518, + "grad_norm": 1.9247641563415527, + "learning_rate": 2.9983959576174807e-05, + "loss": 1.8433, + "step": 7560 + }, + { + "epoch": 0.01475118691068278, + "grad_norm": 4.496128559112549, + "learning_rate": 2.9983895871955397e-05, + "loss": 1.8137, + "step": 7575 + }, + { + "epoch": 0.014780397181793043, + "grad_norm": 1.846083402633667, + "learning_rate": 2.998383204155449e-05, + "loss": 1.9474, + "step": 7590 + }, + { + "epoch": 0.014809607452903306, + "grad_norm": 2.063615083694458, + "learning_rate": 2.9983768084972626e-05, + "loss": 1.9017, + "step": 7605 + }, + { + "epoch": 0.014838817724013569, + "grad_norm": 3.480355978012085, + "learning_rate": 2.9983704002210346e-05, + "loss": 1.8554, + "step": 7620 + }, + { + "epoch": 0.014868027995123831, + "grad_norm": 3.3823037147521973, + "learning_rate": 2.9983639793268187e-05, + "loss": 2.067, + "step": 7635 + }, + { + "epoch": 0.014897238266234096, + "grad_norm": 4.111497402191162, + "learning_rate": 2.998357545814669e-05, + "loss": 1.8451, + "step": 7650 + }, + { + "epoch": 0.014926448537344358, + "grad_norm": 2.51603102684021, + "learning_rate": 2.9983510996846397e-05, + "loss": 1.835, + "step": 7665 + }, + { + "epoch": 0.014955658808454621, + "grad_norm": 2.576188087463379, + "learning_rate": 2.9983446409367846e-05, + "loss": 2.0444, + "step": 7680 + }, + { + "epoch": 0.014984869079564884, + "grad_norm": 2.695582151412964, + "learning_rate": 2.9983381695711595e-05, + "loss": 1.9372, + "step": 7695 + }, + { + "epoch": 0.015014079350675147, + "grad_norm": 2.580967664718628, + "learning_rate": 2.9983316855878172e-05, + "loss": 1.7984, + "step": 7710 + }, + { + "epoch": 0.01504328962178541, + "grad_norm": 2.779932737350464, + "learning_rate": 2.9983251889868133e-05, + "loss": 2.0286, + "step": 7725 + }, + { + "epoch": 0.015072499892895672, + "grad_norm": 4.4765849113464355, + "learning_rate": 2.998318679768202e-05, + "loss": 1.873, + "step": 7740 + }, + { + "epoch": 0.015101710164005935, + "grad_norm": 5.089080810546875, + "learning_rate": 2.9983121579320387e-05, + "loss": 1.9452, + "step": 7755 + }, + { + "epoch": 0.015130920435116199, + "grad_norm": 3.860607862472534, + "learning_rate": 2.9983056234783774e-05, + "loss": 1.9469, + "step": 7770 + }, + { + "epoch": 0.015160130706226462, + "grad_norm": 3.014214277267456, + "learning_rate": 2.998299076407274e-05, + "loss": 2.0488, + "step": 7785 + }, + { + "epoch": 0.015189340977336725, + "grad_norm": 3.1203925609588623, + "learning_rate": 2.998292516718784e-05, + "loss": 1.8334, + "step": 7800 + }, + { + "epoch": 0.015218551248446987, + "grad_norm": 1.6761817932128906, + "learning_rate": 2.998285944412961e-05, + "loss": 2.0263, + "step": 7815 + }, + { + "epoch": 0.01524776151955725, + "grad_norm": 2.3997247219085693, + "learning_rate": 2.9982793594898623e-05, + "loss": 1.9195, + "step": 7830 + }, + { + "epoch": 0.015276971790667513, + "grad_norm": 2.0805671215057373, + "learning_rate": 2.998272761949542e-05, + "loss": 1.8461, + "step": 7845 + }, + { + "epoch": 0.015306182061777775, + "grad_norm": 3.0058650970458984, + "learning_rate": 2.998266151792056e-05, + "loss": 2.0221, + "step": 7860 + }, + { + "epoch": 0.01533539233288804, + "grad_norm": 4.376441955566406, + "learning_rate": 2.99825952901746e-05, + "loss": 1.8942, + "step": 7875 + }, + { + "epoch": 0.015364602603998303, + "grad_norm": 2.509711503982544, + "learning_rate": 2.9982528936258096e-05, + "loss": 1.7979, + "step": 7890 + }, + { + "epoch": 0.015393812875108565, + "grad_norm": 2.763103723526001, + "learning_rate": 2.9982462456171605e-05, + "loss": 1.9112, + "step": 7905 + }, + { + "epoch": 0.015423023146218828, + "grad_norm": 2.519554615020752, + "learning_rate": 2.9982395849915698e-05, + "loss": 1.9375, + "step": 7920 + }, + { + "epoch": 0.01545223341732909, + "grad_norm": 2.679543972015381, + "learning_rate": 2.9982329117490926e-05, + "loss": 1.8226, + "step": 7935 + }, + { + "epoch": 0.015481443688439353, + "grad_norm": 3.619253158569336, + "learning_rate": 2.9982262258897855e-05, + "loss": 1.8639, + "step": 7950 + }, + { + "epoch": 0.015510653959549616, + "grad_norm": 1.7691304683685303, + "learning_rate": 2.9982195274137042e-05, + "loss": 1.7112, + "step": 7965 + }, + { + "epoch": 0.015539864230659879, + "grad_norm": 3.720994234085083, + "learning_rate": 2.9982128163209058e-05, + "loss": 1.8901, + "step": 7980 + }, + { + "epoch": 0.015569074501770143, + "grad_norm": 3.499058723449707, + "learning_rate": 2.9982060926114467e-05, + "loss": 1.9527, + "step": 7995 + }, + { + "epoch": 0.015598284772880406, + "grad_norm": 2.401658773422241, + "learning_rate": 2.9981993562853833e-05, + "loss": 1.9037, + "step": 8010 + }, + { + "epoch": 0.015627495043990667, + "grad_norm": 3.7763659954071045, + "learning_rate": 2.9981926073427724e-05, + "loss": 2.0314, + "step": 8025 + }, + { + "epoch": 0.01565670531510093, + "grad_norm": 3.2521162033081055, + "learning_rate": 2.9981858457836707e-05, + "loss": 1.7982, + "step": 8040 + }, + { + "epoch": 0.015685915586211196, + "grad_norm": 3.2938101291656494, + "learning_rate": 2.9981790716081353e-05, + "loss": 1.9225, + "step": 8055 + }, + { + "epoch": 0.015715125857321457, + "grad_norm": 2.9045004844665527, + "learning_rate": 2.9981722848162233e-05, + "loss": 2.04, + "step": 8070 + }, + { + "epoch": 0.01574433612843172, + "grad_norm": 2.2528388500213623, + "learning_rate": 2.9981654854079918e-05, + "loss": 1.792, + "step": 8085 + }, + { + "epoch": 0.015773546399541982, + "grad_norm": 2.9156363010406494, + "learning_rate": 2.998158673383498e-05, + "loss": 1.9291, + "step": 8100 + }, + { + "epoch": 0.015802756670652247, + "grad_norm": 2.375291347503662, + "learning_rate": 2.9981518487427996e-05, + "loss": 1.8216, + "step": 8115 + }, + { + "epoch": 0.015831966941762508, + "grad_norm": 4.628626823425293, + "learning_rate": 2.9981450114859532e-05, + "loss": 1.7895, + "step": 8130 + }, + { + "epoch": 0.015861177212872772, + "grad_norm": 2.200885534286499, + "learning_rate": 2.9981381616130172e-05, + "loss": 1.9302, + "step": 8145 + }, + { + "epoch": 0.015890387483983033, + "grad_norm": 3.7578535079956055, + "learning_rate": 2.998131299124049e-05, + "loss": 1.9202, + "step": 8160 + }, + { + "epoch": 0.015919597755093298, + "grad_norm": 2.235351800918579, + "learning_rate": 2.9981244240191063e-05, + "loss": 1.9151, + "step": 8175 + }, + { + "epoch": 0.015948808026203562, + "grad_norm": 1.905808925628662, + "learning_rate": 2.9981175362982473e-05, + "loss": 2.0942, + "step": 8190 + }, + { + "epoch": 0.015978018297313823, + "grad_norm": 2.797210454940796, + "learning_rate": 2.99811063596153e-05, + "loss": 2.0168, + "step": 8205 + }, + { + "epoch": 0.016007228568424087, + "grad_norm": 2.7250447273254395, + "learning_rate": 2.9981037230090125e-05, + "loss": 1.8033, + "step": 8220 + }, + { + "epoch": 0.01603643883953435, + "grad_norm": 2.6646409034729004, + "learning_rate": 2.9980967974407525e-05, + "loss": 1.8914, + "step": 8235 + }, + { + "epoch": 0.016065649110644613, + "grad_norm": 4.621725082397461, + "learning_rate": 2.9980898592568086e-05, + "loss": 1.9805, + "step": 8250 + }, + { + "epoch": 0.016094859381754874, + "grad_norm": 7.544195175170898, + "learning_rate": 2.9980829084572393e-05, + "loss": 1.8876, + "step": 8265 + }, + { + "epoch": 0.016124069652865138, + "grad_norm": 3.1491341590881348, + "learning_rate": 2.9980759450421032e-05, + "loss": 1.8096, + "step": 8280 + }, + { + "epoch": 0.016153279923975403, + "grad_norm": 3.049743175506592, + "learning_rate": 2.998068969011459e-05, + "loss": 1.9014, + "step": 8295 + }, + { + "epoch": 0.016182490195085664, + "grad_norm": 3.4776172637939453, + "learning_rate": 2.998061980365365e-05, + "loss": 1.9011, + "step": 8310 + }, + { + "epoch": 0.016211700466195928, + "grad_norm": 3.4769961833953857, + "learning_rate": 2.9980549791038804e-05, + "loss": 2.2367, + "step": 8325 + }, + { + "epoch": 0.01624091073730619, + "grad_norm": 2.252976894378662, + "learning_rate": 2.9980479652270645e-05, + "loss": 1.9017, + "step": 8340 + }, + { + "epoch": 0.016270121008416454, + "grad_norm": 2.3866465091705322, + "learning_rate": 2.998040938734976e-05, + "loss": 1.8849, + "step": 8355 + }, + { + "epoch": 0.016299331279526715, + "grad_norm": 4.388668060302734, + "learning_rate": 2.998033899627674e-05, + "loss": 1.8601, + "step": 8370 + }, + { + "epoch": 0.01632854155063698, + "grad_norm": 3.2678322792053223, + "learning_rate": 2.9980268479052173e-05, + "loss": 1.8799, + "step": 8385 + }, + { + "epoch": 0.01635775182174724, + "grad_norm": 3.3072402477264404, + "learning_rate": 2.9980197835676665e-05, + "loss": 1.8995, + "step": 8400 + }, + { + "epoch": 0.016386962092857504, + "grad_norm": 2.1432266235351562, + "learning_rate": 2.99801270661508e-05, + "loss": 1.9894, + "step": 8415 + }, + { + "epoch": 0.01641617236396777, + "grad_norm": 4.359271049499512, + "learning_rate": 2.998005617047518e-05, + "loss": 1.8252, + "step": 8430 + }, + { + "epoch": 0.01644538263507803, + "grad_norm": 4.683668613433838, + "learning_rate": 2.99799851486504e-05, + "loss": 1.8865, + "step": 8445 + }, + { + "epoch": 0.016474592906188294, + "grad_norm": 2.4480679035186768, + "learning_rate": 2.997991400067706e-05, + "loss": 1.8999, + "step": 8460 + }, + { + "epoch": 0.016503803177298555, + "grad_norm": 4.169287204742432, + "learning_rate": 2.9979842726555753e-05, + "loss": 1.8878, + "step": 8475 + }, + { + "epoch": 0.01653301344840882, + "grad_norm": 2.4626035690307617, + "learning_rate": 2.9979771326287084e-05, + "loss": 2.0727, + "step": 8490 + }, + { + "epoch": 0.01656222371951908, + "grad_norm": 3.7514021396636963, + "learning_rate": 2.9979699799871658e-05, + "loss": 1.9559, + "step": 8505 + }, + { + "epoch": 0.016591433990629345, + "grad_norm": 3.0994958877563477, + "learning_rate": 2.9979628147310068e-05, + "loss": 1.8535, + "step": 8520 + }, + { + "epoch": 0.01662064426173961, + "grad_norm": 5.931899070739746, + "learning_rate": 2.9979556368602924e-05, + "loss": 1.9244, + "step": 8535 + }, + { + "epoch": 0.01664985453284987, + "grad_norm": 2.016737699508667, + "learning_rate": 2.9979484463750833e-05, + "loss": 1.9107, + "step": 8550 + }, + { + "epoch": 0.016679064803960135, + "grad_norm": 2.3796162605285645, + "learning_rate": 2.9979412432754394e-05, + "loss": 2.0904, + "step": 8565 + }, + { + "epoch": 0.016708275075070396, + "grad_norm": 1.870884656906128, + "learning_rate": 2.9979340275614217e-05, + "loss": 1.926, + "step": 8580 + }, + { + "epoch": 0.01673748534618066, + "grad_norm": 2.833564281463623, + "learning_rate": 2.997926799233091e-05, + "loss": 1.871, + "step": 8595 + }, + { + "epoch": 0.01676669561729092, + "grad_norm": 3.7916762828826904, + "learning_rate": 2.9979195582905075e-05, + "loss": 1.8166, + "step": 8610 + }, + { + "epoch": 0.016795905888401186, + "grad_norm": 2.3426475524902344, + "learning_rate": 2.997912304733733e-05, + "loss": 1.8514, + "step": 8625 + }, + { + "epoch": 0.016825116159511447, + "grad_norm": 4.586437225341797, + "learning_rate": 2.9979050385628286e-05, + "loss": 1.8544, + "step": 8640 + }, + { + "epoch": 0.01685432643062171, + "grad_norm": 4.281703472137451, + "learning_rate": 2.997897759777855e-05, + "loss": 1.8978, + "step": 8655 + }, + { + "epoch": 0.016883536701731976, + "grad_norm": 2.1048526763916016, + "learning_rate": 2.9978904683788735e-05, + "loss": 1.9104, + "step": 8670 + }, + { + "epoch": 0.016912746972842237, + "grad_norm": 4.526645660400391, + "learning_rate": 2.9978831643659462e-05, + "loss": 1.9814, + "step": 8685 + }, + { + "epoch": 0.0169419572439525, + "grad_norm": 1.9656120538711548, + "learning_rate": 2.9978758477391334e-05, + "loss": 1.878, + "step": 8700 + }, + { + "epoch": 0.016971167515062762, + "grad_norm": 2.5091657638549805, + "learning_rate": 2.997868518498498e-05, + "loss": 1.8586, + "step": 8715 + }, + { + "epoch": 0.017000377786173027, + "grad_norm": 3.0821568965911865, + "learning_rate": 2.997861176644101e-05, + "loss": 2.1301, + "step": 8730 + }, + { + "epoch": 0.017029588057283288, + "grad_norm": 2.465061902999878, + "learning_rate": 2.997853822176004e-05, + "loss": 1.9435, + "step": 8745 + }, + { + "epoch": 0.017058798328393552, + "grad_norm": 4.7108306884765625, + "learning_rate": 2.9978464550942697e-05, + "loss": 1.8857, + "step": 8760 + }, + { + "epoch": 0.017088008599503816, + "grad_norm": 2.838949203491211, + "learning_rate": 2.9978390753989597e-05, + "loss": 1.9367, + "step": 8775 + }, + { + "epoch": 0.017117218870614077, + "grad_norm": 6.954312324523926, + "learning_rate": 2.9978316830901358e-05, + "loss": 1.6954, + "step": 8790 + }, + { + "epoch": 0.017146429141724342, + "grad_norm": 3.075137138366699, + "learning_rate": 2.997824278167861e-05, + "loss": 1.8787, + "step": 8805 + }, + { + "epoch": 0.017175639412834603, + "grad_norm": 2.7041006088256836, + "learning_rate": 2.9978168606321975e-05, + "loss": 1.8062, + "step": 8820 + }, + { + "epoch": 0.017204849683944867, + "grad_norm": 2.9021966457366943, + "learning_rate": 2.997809430483207e-05, + "loss": 1.9083, + "step": 8835 + }, + { + "epoch": 0.017234059955055128, + "grad_norm": 3.350419521331787, + "learning_rate": 2.9978019877209528e-05, + "loss": 2.0403, + "step": 8850 + }, + { + "epoch": 0.017263270226165393, + "grad_norm": 4.803377628326416, + "learning_rate": 2.9977945323454977e-05, + "loss": 1.8116, + "step": 8865 + }, + { + "epoch": 0.017292480497275657, + "grad_norm": 4.362671375274658, + "learning_rate": 2.997787064356904e-05, + "loss": 1.8089, + "step": 8880 + }, + { + "epoch": 0.017321690768385918, + "grad_norm": 3.1585206985473633, + "learning_rate": 2.9977795837552347e-05, + "loss": 2.1531, + "step": 8895 + }, + { + "epoch": 0.017350901039496183, + "grad_norm": 2.9386544227600098, + "learning_rate": 2.997772090540553e-05, + "loss": 1.8995, + "step": 8910 + }, + { + "epoch": 0.017380111310606444, + "grad_norm": 4.238663673400879, + "learning_rate": 2.9977645847129216e-05, + "loss": 1.9374, + "step": 8925 + }, + { + "epoch": 0.017409321581716708, + "grad_norm": 3.94399356842041, + "learning_rate": 2.9977570662724047e-05, + "loss": 1.9782, + "step": 8940 + }, + { + "epoch": 0.01743853185282697, + "grad_norm": 3.2048075199127197, + "learning_rate": 2.9977495352190643e-05, + "loss": 1.8984, + "step": 8955 + }, + { + "epoch": 0.017467742123937233, + "grad_norm": 2.4807546138763428, + "learning_rate": 2.9977419915529646e-05, + "loss": 1.8613, + "step": 8970 + }, + { + "epoch": 0.017496952395047494, + "grad_norm": 2.518021583557129, + "learning_rate": 2.9977344352741686e-05, + "loss": 2.0321, + "step": 8985 + }, + { + "epoch": 0.01752616266615776, + "grad_norm": 4.450172424316406, + "learning_rate": 2.9977268663827403e-05, + "loss": 1.9419, + "step": 9000 + }, + { + "epoch": 0.017555372937268023, + "grad_norm": 3.8519856929779053, + "learning_rate": 2.9977192848787437e-05, + "loss": 1.9083, + "step": 9015 + }, + { + "epoch": 0.017584583208378284, + "grad_norm": 5.37404727935791, + "learning_rate": 2.9977116907622422e-05, + "loss": 1.9904, + "step": 9030 + }, + { + "epoch": 0.01761379347948855, + "grad_norm": 2.9285478591918945, + "learning_rate": 2.9977040840333e-05, + "loss": 1.7262, + "step": 9045 + }, + { + "epoch": 0.01764300375059881, + "grad_norm": 3.8375890254974365, + "learning_rate": 2.9976964646919814e-05, + "loss": 1.945, + "step": 9060 + }, + { + "epoch": 0.017672214021709074, + "grad_norm": 2.987417459487915, + "learning_rate": 2.9976888327383497e-05, + "loss": 2.1196, + "step": 9075 + }, + { + "epoch": 0.017701424292819335, + "grad_norm": 3.6940131187438965, + "learning_rate": 2.99768118817247e-05, + "loss": 2.002, + "step": 9090 + }, + { + "epoch": 0.0177306345639296, + "grad_norm": 1.8791909217834473, + "learning_rate": 2.997673530994406e-05, + "loss": 1.9082, + "step": 9105 + }, + { + "epoch": 0.017759844835039864, + "grad_norm": 5.242600440979004, + "learning_rate": 2.997665861204223e-05, + "loss": 1.8396, + "step": 9120 + }, + { + "epoch": 0.017789055106150125, + "grad_norm": 3.7789740562438965, + "learning_rate": 2.997658178801985e-05, + "loss": 1.9959, + "step": 9135 + }, + { + "epoch": 0.01781826537726039, + "grad_norm": 2.6715869903564453, + "learning_rate": 2.9976504837877566e-05, + "loss": 1.8346, + "step": 9150 + }, + { + "epoch": 0.01784747564837065, + "grad_norm": 3.229962110519409, + "learning_rate": 2.997642776161603e-05, + "loss": 2.0002, + "step": 9165 + }, + { + "epoch": 0.017876685919480915, + "grad_norm": 2.602320671081543, + "learning_rate": 2.997635055923589e-05, + "loss": 2.0765, + "step": 9180 + }, + { + "epoch": 0.017905896190591176, + "grad_norm": 3.277393102645874, + "learning_rate": 2.9976273230737795e-05, + "loss": 1.8345, + "step": 9195 + }, + { + "epoch": 0.01793510646170144, + "grad_norm": 2.1936373710632324, + "learning_rate": 2.9976195776122397e-05, + "loss": 1.9265, + "step": 9210 + }, + { + "epoch": 0.0179643167328117, + "grad_norm": 4.018658638000488, + "learning_rate": 2.997611819539035e-05, + "loss": 1.9237, + "step": 9225 + }, + { + "epoch": 0.017993527003921966, + "grad_norm": 3.5876455307006836, + "learning_rate": 2.9976040488542304e-05, + "loss": 1.8761, + "step": 9240 + }, + { + "epoch": 0.01802273727503223, + "grad_norm": 2.9479000568389893, + "learning_rate": 2.9975962655578915e-05, + "loss": 1.9062, + "step": 9255 + }, + { + "epoch": 0.01805194754614249, + "grad_norm": 3.185248613357544, + "learning_rate": 2.9975884696500835e-05, + "loss": 1.9958, + "step": 9270 + }, + { + "epoch": 0.018081157817252756, + "grad_norm": 2.5612637996673584, + "learning_rate": 2.9975806611308725e-05, + "loss": 2.0438, + "step": 9285 + }, + { + "epoch": 0.018110368088363017, + "grad_norm": 3.9929094314575195, + "learning_rate": 2.9975728400003244e-05, + "loss": 1.8404, + "step": 9300 + }, + { + "epoch": 0.01813957835947328, + "grad_norm": 2.15783953666687, + "learning_rate": 2.9975650062585043e-05, + "loss": 1.8677, + "step": 9315 + }, + { + "epoch": 0.018168788630583542, + "grad_norm": 3.7899887561798096, + "learning_rate": 2.997557159905479e-05, + "loss": 1.9269, + "step": 9330 + }, + { + "epoch": 0.018197998901693806, + "grad_norm": 4.0455145835876465, + "learning_rate": 2.9975493009413144e-05, + "loss": 1.8552, + "step": 9345 + }, + { + "epoch": 0.01822720917280407, + "grad_norm": 2.837963581085205, + "learning_rate": 2.9975414293660766e-05, + "loss": 2.1462, + "step": 9360 + }, + { + "epoch": 0.018256419443914332, + "grad_norm": 3.8562536239624023, + "learning_rate": 2.9975335451798317e-05, + "loss": 1.8625, + "step": 9375 + }, + { + "epoch": 0.018285629715024596, + "grad_norm": 2.4235100746154785, + "learning_rate": 2.9975256483826453e-05, + "loss": 1.8512, + "step": 9390 + }, + { + "epoch": 0.018314839986134857, + "grad_norm": 5.756357192993164, + "learning_rate": 2.997517738974586e-05, + "loss": 1.84, + "step": 9405 + }, + { + "epoch": 0.01834405025724512, + "grad_norm": 2.3832759857177734, + "learning_rate": 2.9975098169557187e-05, + "loss": 1.8721, + "step": 9420 + }, + { + "epoch": 0.018373260528355383, + "grad_norm": 2.0849883556365967, + "learning_rate": 2.9975018823261106e-05, + "loss": 1.914, + "step": 9435 + }, + { + "epoch": 0.018402470799465647, + "grad_norm": 2.3563778400421143, + "learning_rate": 2.997493935085829e-05, + "loss": 1.7283, + "step": 9450 + }, + { + "epoch": 0.018431681070575908, + "grad_norm": 2.018721580505371, + "learning_rate": 2.9974859752349396e-05, + "loss": 1.7748, + "step": 9465 + }, + { + "epoch": 0.018460891341686173, + "grad_norm": 2.725719451904297, + "learning_rate": 2.9974780027735103e-05, + "loss": 1.9287, + "step": 9480 + }, + { + "epoch": 0.018490101612796437, + "grad_norm": 3.2846550941467285, + "learning_rate": 2.9974700177016082e-05, + "loss": 1.8214, + "step": 9495 + }, + { + "epoch": 0.018519311883906698, + "grad_norm": 4.153242111206055, + "learning_rate": 2.997462020019301e-05, + "loss": 1.9377, + "step": 9510 + }, + { + "epoch": 0.018548522155016962, + "grad_norm": 2.386509656906128, + "learning_rate": 2.997454009726655e-05, + "loss": 1.7513, + "step": 9525 + }, + { + "epoch": 0.018577732426127223, + "grad_norm": 3.9288957118988037, + "learning_rate": 2.9974459868237384e-05, + "loss": 1.8623, + "step": 9540 + }, + { + "epoch": 0.018606942697237488, + "grad_norm": 2.2002370357513428, + "learning_rate": 2.9974379513106184e-05, + "loss": 2.0153, + "step": 9555 + }, + { + "epoch": 0.01863615296834775, + "grad_norm": 2.4271867275238037, + "learning_rate": 2.9974299031873625e-05, + "loss": 1.8772, + "step": 9570 + }, + { + "epoch": 0.018665363239458013, + "grad_norm": 2.355729341506958, + "learning_rate": 2.9974218424540395e-05, + "loss": 1.9947, + "step": 9585 + }, + { + "epoch": 0.018694573510568278, + "grad_norm": 4.210724830627441, + "learning_rate": 2.9974137691107164e-05, + "loss": 1.8113, + "step": 9600 + }, + { + "epoch": 0.01872378378167854, + "grad_norm": 3.092832088470459, + "learning_rate": 2.997405683157461e-05, + "loss": 1.8457, + "step": 9615 + }, + { + "epoch": 0.018752994052788803, + "grad_norm": 3.154505491256714, + "learning_rate": 2.997397584594342e-05, + "loss": 1.9223, + "step": 9630 + }, + { + "epoch": 0.018782204323899064, + "grad_norm": 2.661449909210205, + "learning_rate": 2.997389473421427e-05, + "loss": 1.8362, + "step": 9645 + }, + { + "epoch": 0.01881141459500933, + "grad_norm": 4.499241352081299, + "learning_rate": 2.997381349638785e-05, + "loss": 1.974, + "step": 9660 + }, + { + "epoch": 0.01884062486611959, + "grad_norm": 8.712298393249512, + "learning_rate": 2.9973732132464838e-05, + "loss": 1.793, + "step": 9675 + }, + { + "epoch": 0.018869835137229854, + "grad_norm": 3.771261215209961, + "learning_rate": 2.9973650642445926e-05, + "loss": 1.8158, + "step": 9690 + }, + { + "epoch": 0.018899045408340115, + "grad_norm": 2.3334624767303467, + "learning_rate": 2.997356902633179e-05, + "loss": 1.9791, + "step": 9705 + }, + { + "epoch": 0.01892825567945038, + "grad_norm": 3.4070332050323486, + "learning_rate": 2.997348728412313e-05, + "loss": 1.8621, + "step": 9720 + }, + { + "epoch": 0.018957465950560644, + "grad_norm": 2.4055604934692383, + "learning_rate": 2.997340541582062e-05, + "loss": 1.8424, + "step": 9735 + }, + { + "epoch": 0.018986676221670905, + "grad_norm": 3.209122896194458, + "learning_rate": 2.9973323421424962e-05, + "loss": 1.8791, + "step": 9750 + }, + { + "epoch": 0.01901588649278117, + "grad_norm": 2.720518112182617, + "learning_rate": 2.9973241300936842e-05, + "loss": 1.9752, + "step": 9765 + }, + { + "epoch": 0.01904509676389143, + "grad_norm": 1.9057116508483887, + "learning_rate": 2.9973159054356948e-05, + "loss": 1.9658, + "step": 9780 + }, + { + "epoch": 0.019074307035001695, + "grad_norm": 3.4243197441101074, + "learning_rate": 2.9973076681685977e-05, + "loss": 1.9394, + "step": 9795 + }, + { + "epoch": 0.019103517306111956, + "grad_norm": 3.557957410812378, + "learning_rate": 2.997299418292462e-05, + "loss": 1.8854, + "step": 9810 + }, + { + "epoch": 0.01913272757722222, + "grad_norm": 2.576314926147461, + "learning_rate": 2.9972911558073575e-05, + "loss": 1.8975, + "step": 9825 + }, + { + "epoch": 0.019161937848332485, + "grad_norm": 2.4958183765411377, + "learning_rate": 2.9972828807133537e-05, + "loss": 1.765, + "step": 9840 + }, + { + "epoch": 0.019191148119442746, + "grad_norm": 3.1889865398406982, + "learning_rate": 2.99727459301052e-05, + "loss": 1.9785, + "step": 9855 + }, + { + "epoch": 0.01922035839055301, + "grad_norm": 4.607937335968018, + "learning_rate": 2.9972662926989267e-05, + "loss": 1.7931, + "step": 9870 + }, + { + "epoch": 0.01924956866166327, + "grad_norm": 3.380537271499634, + "learning_rate": 2.997257979778643e-05, + "loss": 1.8366, + "step": 9885 + }, + { + "epoch": 0.019278778932773535, + "grad_norm": 1.7773466110229492, + "learning_rate": 2.9972496542497393e-05, + "loss": 1.841, + "step": 9900 + }, + { + "epoch": 0.019307989203883796, + "grad_norm": 3.43685245513916, + "learning_rate": 2.9972413161122858e-05, + "loss": 1.8255, + "step": 9915 + }, + { + "epoch": 0.01933719947499406, + "grad_norm": 4.238219261169434, + "learning_rate": 2.9972329653663525e-05, + "loss": 1.8403, + "step": 9930 + }, + { + "epoch": 0.019366409746104322, + "grad_norm": 3.8438355922698975, + "learning_rate": 2.99722460201201e-05, + "loss": 1.8005, + "step": 9945 + }, + { + "epoch": 0.019395620017214586, + "grad_norm": 4.286600589752197, + "learning_rate": 2.997216226049328e-05, + "loss": 1.8684, + "step": 9960 + }, + { + "epoch": 0.01942483028832485, + "grad_norm": 1.7342430353164673, + "learning_rate": 2.997207837478378e-05, + "loss": 1.9657, + "step": 9975 + }, + { + "epoch": 0.01945404055943511, + "grad_norm": 3.3315911293029785, + "learning_rate": 2.9971994362992304e-05, + "loss": 1.9156, + "step": 9990 + }, + { + "epoch": 0.019483250830545376, + "grad_norm": 4.346848011016846, + "learning_rate": 2.9971910225119556e-05, + "loss": 1.9114, + "step": 10005 + }, + { + "epoch": 0.019512461101655637, + "grad_norm": 2.8670308589935303, + "learning_rate": 2.9971825961166248e-05, + "loss": 1.8471, + "step": 10020 + }, + { + "epoch": 0.0195416713727659, + "grad_norm": 2.1465935707092285, + "learning_rate": 2.9971741571133085e-05, + "loss": 1.8608, + "step": 10035 + }, + { + "epoch": 0.019570881643876162, + "grad_norm": 3.3292319774627686, + "learning_rate": 2.9971657055020782e-05, + "loss": 1.877, + "step": 10050 + }, + { + "epoch": 0.019600091914986427, + "grad_norm": 2.3058934211730957, + "learning_rate": 2.9971572412830045e-05, + "loss": 1.9125, + "step": 10065 + }, + { + "epoch": 0.01962930218609669, + "grad_norm": 4.684175968170166, + "learning_rate": 2.9971487644561597e-05, + "loss": 1.9237, + "step": 10080 + }, + { + "epoch": 0.019658512457206952, + "grad_norm": 3.6633639335632324, + "learning_rate": 2.9971402750216144e-05, + "loss": 1.856, + "step": 10095 + }, + { + "epoch": 0.019687722728317217, + "grad_norm": 3.837944269180298, + "learning_rate": 2.9971317729794404e-05, + "loss": 2.0146, + "step": 10110 + }, + { + "epoch": 0.019716932999427478, + "grad_norm": 4.053643226623535, + "learning_rate": 2.997123258329709e-05, + "loss": 1.9125, + "step": 10125 + }, + { + "epoch": 0.019746143270537742, + "grad_norm": 1.9680250883102417, + "learning_rate": 2.9971147310724923e-05, + "loss": 1.9431, + "step": 10140 + }, + { + "epoch": 0.019775353541648003, + "grad_norm": 1.941953420639038, + "learning_rate": 2.9971061912078615e-05, + "loss": 1.8638, + "step": 10155 + }, + { + "epoch": 0.019804563812758268, + "grad_norm": 3.1855714321136475, + "learning_rate": 2.997097638735889e-05, + "loss": 1.6231, + "step": 10170 + }, + { + "epoch": 0.019833774083868532, + "grad_norm": 4.022531509399414, + "learning_rate": 2.997089073656647e-05, + "loss": 1.8637, + "step": 10185 + }, + { + "epoch": 0.019862984354978793, + "grad_norm": 4.320540904998779, + "learning_rate": 2.997080495970207e-05, + "loss": 1.8383, + "step": 10200 + }, + { + "epoch": 0.019892194626089058, + "grad_norm": 3.0604958534240723, + "learning_rate": 2.997071905676642e-05, + "loss": 1.8843, + "step": 10215 + }, + { + "epoch": 0.01992140489719932, + "grad_norm": 3.2216265201568604, + "learning_rate": 2.9970633027760235e-05, + "loss": 2.004, + "step": 10230 + }, + { + "epoch": 0.019950615168309583, + "grad_norm": 2.4354753494262695, + "learning_rate": 2.997054687268425e-05, + "loss": 1.9255, + "step": 10245 + }, + { + "epoch": 0.019979825439419844, + "grad_norm": 2.3556060791015625, + "learning_rate": 2.9970460591539175e-05, + "loss": 2.0436, + "step": 10260 + }, + { + "epoch": 0.02000903571053011, + "grad_norm": 3.6977896690368652, + "learning_rate": 2.9970374184325753e-05, + "loss": 1.9181, + "step": 10275 + }, + { + "epoch": 0.02003824598164037, + "grad_norm": 1.909177541732788, + "learning_rate": 2.99702876510447e-05, + "loss": 1.7653, + "step": 10290 + }, + { + "epoch": 0.020067456252750634, + "grad_norm": 2.486943006515503, + "learning_rate": 2.997020099169675e-05, + "loss": 1.9137, + "step": 10305 + }, + { + "epoch": 0.020096666523860898, + "grad_norm": 3.0627150535583496, + "learning_rate": 2.9970114206282634e-05, + "loss": 1.8442, + "step": 10320 + }, + { + "epoch": 0.02012587679497116, + "grad_norm": 2.7819271087646484, + "learning_rate": 2.997002729480308e-05, + "loss": 2.1325, + "step": 10335 + }, + { + "epoch": 0.020155087066081424, + "grad_norm": 2.097712755203247, + "learning_rate": 2.9969940257258823e-05, + "loss": 1.8952, + "step": 10350 + }, + { + "epoch": 0.020184297337191685, + "grad_norm": 2.317915439605713, + "learning_rate": 2.9969853093650592e-05, + "loss": 1.8896, + "step": 10365 + }, + { + "epoch": 0.02021350760830195, + "grad_norm": 3.3799221515655518, + "learning_rate": 2.996976580397912e-05, + "loss": 1.9032, + "step": 10380 + }, + { + "epoch": 0.02024271787941221, + "grad_norm": 4.226128578186035, + "learning_rate": 2.996967838824515e-05, + "loss": 1.9812, + "step": 10395 + }, + { + "epoch": 0.020271928150522475, + "grad_norm": 2.872182607650757, + "learning_rate": 2.996959084644941e-05, + "loss": 1.7335, + "step": 10410 + }, + { + "epoch": 0.02030113842163274, + "grad_norm": 1.931477427482605, + "learning_rate": 2.9969503178592638e-05, + "loss": 1.8178, + "step": 10425 + }, + { + "epoch": 0.020330348692743, + "grad_norm": 1.9570348262786865, + "learning_rate": 2.9969415384675577e-05, + "loss": 1.9652, + "step": 10440 + }, + { + "epoch": 0.020359558963853264, + "grad_norm": 2.1548566818237305, + "learning_rate": 2.996932746469896e-05, + "loss": 1.9105, + "step": 10455 + }, + { + "epoch": 0.020388769234963525, + "grad_norm": 2.138561248779297, + "learning_rate": 2.9969239418663538e-05, + "loss": 2.2595, + "step": 10470 + }, + { + "epoch": 0.02041797950607379, + "grad_norm": 3.682020425796509, + "learning_rate": 2.9969151246570038e-05, + "loss": 1.8828, + "step": 10485 + }, + { + "epoch": 0.02044718977718405, + "grad_norm": 1.9086365699768066, + "learning_rate": 2.9969062948419213e-05, + "loss": 1.8974, + "step": 10500 + }, + { + "epoch": 0.020476400048294315, + "grad_norm": 3.4526236057281494, + "learning_rate": 2.9968974524211807e-05, + "loss": 1.8972, + "step": 10515 + }, + { + "epoch": 0.020505610319404576, + "grad_norm": 5.137340068817139, + "learning_rate": 2.996888597394856e-05, + "loss": 1.7885, + "step": 10530 + }, + { + "epoch": 0.02053482059051484, + "grad_norm": 2.410789966583252, + "learning_rate": 2.9968797297630215e-05, + "loss": 1.9339, + "step": 10545 + }, + { + "epoch": 0.020564030861625105, + "grad_norm": 2.756986618041992, + "learning_rate": 2.9968708495257527e-05, + "loss": 1.9682, + "step": 10560 + }, + { + "epoch": 0.020593241132735366, + "grad_norm": 4.116410255432129, + "learning_rate": 2.9968619566831238e-05, + "loss": 1.8814, + "step": 10575 + }, + { + "epoch": 0.02062245140384563, + "grad_norm": 3.892730712890625, + "learning_rate": 2.9968530512352098e-05, + "loss": 1.9744, + "step": 10590 + }, + { + "epoch": 0.02065166167495589, + "grad_norm": 3.5503933429718018, + "learning_rate": 2.9968441331820856e-05, + "loss": 2.0472, + "step": 10605 + }, + { + "epoch": 0.020680871946066156, + "grad_norm": 4.085498809814453, + "learning_rate": 2.9968352025238263e-05, + "loss": 1.9428, + "step": 10620 + }, + { + "epoch": 0.020710082217176417, + "grad_norm": 3.806868553161621, + "learning_rate": 2.996826259260508e-05, + "loss": 1.8212, + "step": 10635 + }, + { + "epoch": 0.02073929248828668, + "grad_norm": 2.24172306060791, + "learning_rate": 2.9968173033922045e-05, + "loss": 1.7338, + "step": 10650 + }, + { + "epoch": 0.020768502759396946, + "grad_norm": 2.516962766647339, + "learning_rate": 2.996808334918992e-05, + "loss": 1.9185, + "step": 10665 + }, + { + "epoch": 0.020797713030507207, + "grad_norm": 3.8772926330566406, + "learning_rate": 2.9967993538409465e-05, + "loss": 1.8053, + "step": 10680 + }, + { + "epoch": 0.02082692330161747, + "grad_norm": 5.146918296813965, + "learning_rate": 2.9967903601581427e-05, + "loss": 1.8488, + "step": 10695 + }, + { + "epoch": 0.020856133572727732, + "grad_norm": 6.216543197631836, + "learning_rate": 2.9967813538706568e-05, + "loss": 1.9446, + "step": 10710 + }, + { + "epoch": 0.020885343843837997, + "grad_norm": 3.0393502712249756, + "learning_rate": 2.9967723349785648e-05, + "loss": 1.8881, + "step": 10725 + }, + { + "epoch": 0.020914554114948258, + "grad_norm": 2.7038638591766357, + "learning_rate": 2.996763303481942e-05, + "loss": 1.9454, + "step": 10740 + }, + { + "epoch": 0.020943764386058522, + "grad_norm": 2.4057178497314453, + "learning_rate": 2.9967542593808655e-05, + "loss": 2.0256, + "step": 10755 + }, + { + "epoch": 0.020972974657168783, + "grad_norm": 2.2479588985443115, + "learning_rate": 2.9967452026754104e-05, + "loss": 1.8835, + "step": 10770 + }, + { + "epoch": 0.021002184928279048, + "grad_norm": 2.2106525897979736, + "learning_rate": 2.996736133365654e-05, + "loss": 1.8594, + "step": 10785 + }, + { + "epoch": 0.021031395199389312, + "grad_norm": 2.273165225982666, + "learning_rate": 2.9967270514516718e-05, + "loss": 1.772, + "step": 10800 + }, + { + "epoch": 0.021060605470499573, + "grad_norm": 4.13364315032959, + "learning_rate": 2.9967179569335407e-05, + "loss": 1.8364, + "step": 10815 + }, + { + "epoch": 0.021089815741609837, + "grad_norm": 4.664285182952881, + "learning_rate": 2.9967088498113368e-05, + "loss": 1.7126, + "step": 10830 + }, + { + "epoch": 0.0211190260127201, + "grad_norm": 3.9353389739990234, + "learning_rate": 2.9966997300851376e-05, + "loss": 1.8366, + "step": 10845 + }, + { + "epoch": 0.021148236283830363, + "grad_norm": 3.433561086654663, + "learning_rate": 2.996690597755019e-05, + "loss": 1.7921, + "step": 10860 + }, + { + "epoch": 0.021177446554940624, + "grad_norm": 2.0325567722320557, + "learning_rate": 2.996681452821059e-05, + "loss": 1.88, + "step": 10875 + }, + { + "epoch": 0.021206656826050888, + "grad_norm": 3.6523592472076416, + "learning_rate": 2.9966722952833335e-05, + "loss": 1.8464, + "step": 10890 + }, + { + "epoch": 0.021235867097161153, + "grad_norm": 4.08983039855957, + "learning_rate": 2.99666312514192e-05, + "loss": 2.0643, + "step": 10905 + }, + { + "epoch": 0.021265077368271414, + "grad_norm": 4.714212894439697, + "learning_rate": 2.9966539423968964e-05, + "loss": 1.9458, + "step": 10920 + }, + { + "epoch": 0.021294287639381678, + "grad_norm": 3.2425897121429443, + "learning_rate": 2.996644747048339e-05, + "loss": 1.9802, + "step": 10935 + }, + { + "epoch": 0.02132349791049194, + "grad_norm": 5.025219440460205, + "learning_rate": 2.9966355390963258e-05, + "loss": 1.8679, + "step": 10950 + }, + { + "epoch": 0.021352708181602204, + "grad_norm": 3.669241428375244, + "learning_rate": 2.9966263185409343e-05, + "loss": 1.9634, + "step": 10965 + }, + { + "epoch": 0.021381918452712464, + "grad_norm": 3.5736284255981445, + "learning_rate": 2.996617085382242e-05, + "loss": 1.9348, + "step": 10980 + }, + { + "epoch": 0.02141112872382273, + "grad_norm": 2.8263094425201416, + "learning_rate": 2.996607839620327e-05, + "loss": 1.7897, + "step": 10995 + }, + { + "epoch": 0.02144033899493299, + "grad_norm": 4.033946990966797, + "learning_rate": 2.996598581255267e-05, + "loss": 1.9587, + "step": 11010 + }, + { + "epoch": 0.021469549266043254, + "grad_norm": 3.7127420902252197, + "learning_rate": 2.996589310287139e-05, + "loss": 1.874, + "step": 11025 + }, + { + "epoch": 0.02149875953715352, + "grad_norm": 4.083348751068115, + "learning_rate": 2.9965800267160223e-05, + "loss": 1.7831, + "step": 11040 + }, + { + "epoch": 0.02152796980826378, + "grad_norm": 4.666345596313477, + "learning_rate": 2.996570730541995e-05, + "loss": 1.8516, + "step": 11055 + }, + { + "epoch": 0.021557180079374044, + "grad_norm": 2.3497631549835205, + "learning_rate": 2.996561421765135e-05, + "loss": 1.9213, + "step": 11070 + }, + { + "epoch": 0.021586390350484305, + "grad_norm": 4.355146408081055, + "learning_rate": 2.996552100385521e-05, + "loss": 1.8072, + "step": 11085 + }, + { + "epoch": 0.02161560062159457, + "grad_norm": 3.6673879623413086, + "learning_rate": 2.996542766403231e-05, + "loss": 1.8541, + "step": 11100 + }, + { + "epoch": 0.02164481089270483, + "grad_norm": 2.2446086406707764, + "learning_rate": 2.996533419818344e-05, + "loss": 1.9764, + "step": 11115 + }, + { + "epoch": 0.021674021163815095, + "grad_norm": 3.8175010681152344, + "learning_rate": 2.996524060630938e-05, + "loss": 1.9664, + "step": 11130 + }, + { + "epoch": 0.02170323143492536, + "grad_norm": 4.360842704772949, + "learning_rate": 2.996514688841093e-05, + "loss": 1.9145, + "step": 11145 + }, + { + "epoch": 0.02173244170603562, + "grad_norm": 3.110860586166382, + "learning_rate": 2.996505304448887e-05, + "loss": 1.8939, + "step": 11160 + }, + { + "epoch": 0.021761651977145885, + "grad_norm": 2.5407495498657227, + "learning_rate": 2.9964959074544e-05, + "loss": 1.8649, + "step": 11175 + }, + { + "epoch": 0.021790862248256146, + "grad_norm": 2.933225393295288, + "learning_rate": 2.9964864978577103e-05, + "loss": 1.8987, + "step": 11190 + }, + { + "epoch": 0.02182007251936641, + "grad_norm": 3.003664493560791, + "learning_rate": 2.996477075658897e-05, + "loss": 1.936, + "step": 11205 + }, + { + "epoch": 0.02184928279047667, + "grad_norm": 3.119703531265259, + "learning_rate": 2.99646764085804e-05, + "loss": 1.7183, + "step": 11220 + }, + { + "epoch": 0.021878493061586936, + "grad_norm": 2.6627697944641113, + "learning_rate": 2.9964581934552182e-05, + "loss": 1.9218, + "step": 11235 + }, + { + "epoch": 0.0219077033326972, + "grad_norm": 4.310539245605469, + "learning_rate": 2.9964487334505114e-05, + "loss": 2.0666, + "step": 11250 + }, + { + "epoch": 0.02193691360380746, + "grad_norm": 2.611443519592285, + "learning_rate": 2.9964392608439997e-05, + "loss": 1.8154, + "step": 11265 + }, + { + "epoch": 0.021966123874917726, + "grad_norm": 3.391406774520874, + "learning_rate": 2.996429775635763e-05, + "loss": 1.9308, + "step": 11280 + }, + { + "epoch": 0.021995334146027987, + "grad_norm": 3.1492297649383545, + "learning_rate": 2.9964202778258797e-05, + "loss": 1.939, + "step": 11295 + }, + { + "epoch": 0.02202454441713825, + "grad_norm": 3.680859088897705, + "learning_rate": 2.9964107674144313e-05, + "loss": 1.9048, + "step": 11310 + }, + { + "epoch": 0.022053754688248512, + "grad_norm": 2.0522656440734863, + "learning_rate": 2.9964012444014972e-05, + "loss": 1.9477, + "step": 11325 + }, + { + "epoch": 0.022082964959358777, + "grad_norm": 3.265316963195801, + "learning_rate": 2.996391708787158e-05, + "loss": 1.8116, + "step": 11340 + }, + { + "epoch": 0.022112175230469037, + "grad_norm": 2.0570802688598633, + "learning_rate": 2.9963821605714934e-05, + "loss": 1.8493, + "step": 11355 + }, + { + "epoch": 0.022141385501579302, + "grad_norm": 4.1805877685546875, + "learning_rate": 2.9963725997545844e-05, + "loss": 1.8909, + "step": 11370 + }, + { + "epoch": 0.022170595772689566, + "grad_norm": 2.6846070289611816, + "learning_rate": 2.9963630263365116e-05, + "loss": 1.8424, + "step": 11385 + }, + { + "epoch": 0.022199806043799827, + "grad_norm": 4.480174541473389, + "learning_rate": 2.996353440317355e-05, + "loss": 1.919, + "step": 11400 + }, + { + "epoch": 0.022229016314910092, + "grad_norm": 2.179137706756592, + "learning_rate": 2.996343841697195e-05, + "loss": 1.8023, + "step": 11415 + }, + { + "epoch": 0.022258226586020353, + "grad_norm": 4.341340065002441, + "learning_rate": 2.996334230476114e-05, + "loss": 1.8263, + "step": 11430 + }, + { + "epoch": 0.022287436857130617, + "grad_norm": 1.6666501760482788, + "learning_rate": 2.9963246066541913e-05, + "loss": 1.8854, + "step": 11445 + }, + { + "epoch": 0.022316647128240878, + "grad_norm": 2.9793460369110107, + "learning_rate": 2.9963149702315093e-05, + "loss": 1.9214, + "step": 11460 + }, + { + "epoch": 0.022345857399351143, + "grad_norm": 4.338296413421631, + "learning_rate": 2.996305321208148e-05, + "loss": 1.9517, + "step": 11475 + }, + { + "epoch": 0.022375067670461407, + "grad_norm": 4.449549674987793, + "learning_rate": 2.99629565958419e-05, + "loss": 2.0761, + "step": 11490 + }, + { + "epoch": 0.022404277941571668, + "grad_norm": 2.6661598682403564, + "learning_rate": 2.9962859853597146e-05, + "loss": 1.9029, + "step": 11505 + }, + { + "epoch": 0.022433488212681933, + "grad_norm": 2.5066778659820557, + "learning_rate": 2.996276298534805e-05, + "loss": 1.7934, + "step": 11520 + }, + { + "epoch": 0.022462698483792193, + "grad_norm": 2.2089765071868896, + "learning_rate": 2.9962665991095424e-05, + "loss": 2.0755, + "step": 11535 + }, + { + "epoch": 0.022491908754902458, + "grad_norm": 3.543388605117798, + "learning_rate": 2.9962568870840078e-05, + "loss": 1.8682, + "step": 11550 + }, + { + "epoch": 0.02252111902601272, + "grad_norm": 3.1596784591674805, + "learning_rate": 2.9962471624582838e-05, + "loss": 2.0225, + "step": 11565 + }, + { + "epoch": 0.022550329297122983, + "grad_norm": 3.338447332382202, + "learning_rate": 2.9962374252324524e-05, + "loss": 1.7248, + "step": 11580 + }, + { + "epoch": 0.022579539568233244, + "grad_norm": 2.6090617179870605, + "learning_rate": 2.996227675406595e-05, + "loss": 2.1125, + "step": 11595 + }, + { + "epoch": 0.02260874983934351, + "grad_norm": 3.618283271789551, + "learning_rate": 2.9962179129807936e-05, + "loss": 2.0012, + "step": 11610 + }, + { + "epoch": 0.022637960110453773, + "grad_norm": 2.161893129348755, + "learning_rate": 2.996208137955131e-05, + "loss": 1.8541, + "step": 11625 + }, + { + "epoch": 0.022667170381564034, + "grad_norm": 4.789167881011963, + "learning_rate": 2.996198350329689e-05, + "loss": 1.7935, + "step": 11640 + }, + { + "epoch": 0.0226963806526743, + "grad_norm": 2.7407031059265137, + "learning_rate": 2.9961885501045505e-05, + "loss": 1.8944, + "step": 11655 + }, + { + "epoch": 0.02272559092378456, + "grad_norm": 2.0403008460998535, + "learning_rate": 2.9961787372797977e-05, + "loss": 1.9245, + "step": 11670 + }, + { + "epoch": 0.022754801194894824, + "grad_norm": 2.2842493057250977, + "learning_rate": 2.996168911855513e-05, + "loss": 1.8867, + "step": 11685 + }, + { + "epoch": 0.022784011466005085, + "grad_norm": 3.4035165309906006, + "learning_rate": 2.99615907383178e-05, + "loss": 1.8029, + "step": 11700 + }, + { + "epoch": 0.02281322173711535, + "grad_norm": 4.04712438583374, + "learning_rate": 2.996149223208681e-05, + "loss": 1.8933, + "step": 11715 + }, + { + "epoch": 0.022842432008225614, + "grad_norm": 3.2550394535064697, + "learning_rate": 2.996139359986299e-05, + "loss": 1.8503, + "step": 11730 + }, + { + "epoch": 0.022871642279335875, + "grad_norm": 2.3266172409057617, + "learning_rate": 2.9961294841647164e-05, + "loss": 1.939, + "step": 11745 + }, + { + "epoch": 0.02290085255044614, + "grad_norm": 1.7769925594329834, + "learning_rate": 2.9961195957440172e-05, + "loss": 2.0939, + "step": 11760 + }, + { + "epoch": 0.0229300628215564, + "grad_norm": 3.4839985370635986, + "learning_rate": 2.9961096947242846e-05, + "loss": 1.8933, + "step": 11775 + }, + { + "epoch": 0.022959273092666665, + "grad_norm": 4.051612377166748, + "learning_rate": 2.9960997811056017e-05, + "loss": 1.8464, + "step": 11790 + }, + { + "epoch": 0.022988483363776926, + "grad_norm": 5.2628703117370605, + "learning_rate": 2.9960898548880525e-05, + "loss": 1.9296, + "step": 11805 + }, + { + "epoch": 0.02301769363488719, + "grad_norm": 4.6693434715271, + "learning_rate": 2.99607991607172e-05, + "loss": 2.0126, + "step": 11820 + }, + { + "epoch": 0.02304690390599745, + "grad_norm": 2.2805821895599365, + "learning_rate": 2.996069964656688e-05, + "loss": 1.7084, + "step": 11835 + }, + { + "epoch": 0.023076114177107716, + "grad_norm": 5.130448818206787, + "learning_rate": 2.996060000643041e-05, + "loss": 1.8437, + "step": 11850 + }, + { + "epoch": 0.02310532444821798, + "grad_norm": 3.6057186126708984, + "learning_rate": 2.9960500240308616e-05, + "loss": 2.0847, + "step": 11865 + }, + { + "epoch": 0.02313453471932824, + "grad_norm": 3.6711442470550537, + "learning_rate": 2.9960400348202348e-05, + "loss": 1.9817, + "step": 11880 + }, + { + "epoch": 0.023163744990438506, + "grad_norm": 2.514784097671509, + "learning_rate": 2.9960300330112445e-05, + "loss": 1.7633, + "step": 11895 + }, + { + "epoch": 0.023192955261548766, + "grad_norm": 2.1372082233428955, + "learning_rate": 2.996020018603975e-05, + "loss": 1.7941, + "step": 11910 + }, + { + "epoch": 0.02322216553265903, + "grad_norm": 2.8542985916137695, + "learning_rate": 2.9960099915985104e-05, + "loss": 1.7778, + "step": 11925 + }, + { + "epoch": 0.023251375803769292, + "grad_norm": 2.1213855743408203, + "learning_rate": 2.9959999519949354e-05, + "loss": 1.8966, + "step": 11940 + }, + { + "epoch": 0.023280586074879556, + "grad_norm": 2.751647710800171, + "learning_rate": 2.995989899793334e-05, + "loss": 1.7919, + "step": 11955 + }, + { + "epoch": 0.02330979634598982, + "grad_norm": 2.2502388954162598, + "learning_rate": 2.9959798349937915e-05, + "loss": 1.8062, + "step": 11970 + }, + { + "epoch": 0.023339006617100082, + "grad_norm": 2.5316834449768066, + "learning_rate": 2.995969757596392e-05, + "loss": 1.9685, + "step": 11985 + }, + { + "epoch": 0.023368216888210346, + "grad_norm": 2.8897788524627686, + "learning_rate": 2.995959667601221e-05, + "loss": 1.9659, + "step": 12000 + }, + { + "epoch": 0.023397427159320607, + "grad_norm": 3.8147826194763184, + "learning_rate": 2.9959495650083634e-05, + "loss": 1.7452, + "step": 12015 + }, + { + "epoch": 0.02342663743043087, + "grad_norm": 1.9649070501327515, + "learning_rate": 2.9959394498179043e-05, + "loss": 1.7987, + "step": 12030 + }, + { + "epoch": 0.023455847701541133, + "grad_norm": 2.2376818656921387, + "learning_rate": 2.9959293220299287e-05, + "loss": 1.87, + "step": 12045 + }, + { + "epoch": 0.023485057972651397, + "grad_norm": 2.3533129692077637, + "learning_rate": 2.9959191816445217e-05, + "loss": 2.006, + "step": 12060 + }, + { + "epoch": 0.023514268243761658, + "grad_norm": 2.5106008052825928, + "learning_rate": 2.9959090286617686e-05, + "loss": 1.8053, + "step": 12075 + }, + { + "epoch": 0.023543478514871923, + "grad_norm": 2.843824863433838, + "learning_rate": 2.9958988630817555e-05, + "loss": 2.0062, + "step": 12090 + }, + { + "epoch": 0.023572688785982187, + "grad_norm": 3.826493501663208, + "learning_rate": 2.9958886849045678e-05, + "loss": 1.8213, + "step": 12105 + }, + { + "epoch": 0.023601899057092448, + "grad_norm": 2.669509172439575, + "learning_rate": 2.9958784941302908e-05, + "loss": 2.0031, + "step": 12120 + }, + { + "epoch": 0.023631109328202712, + "grad_norm": 5.227283477783203, + "learning_rate": 2.995868290759011e-05, + "loss": 1.9724, + "step": 12135 + }, + { + "epoch": 0.023660319599312973, + "grad_norm": 2.67545485496521, + "learning_rate": 2.9958580747908134e-05, + "loss": 1.7393, + "step": 12150 + }, + { + "epoch": 0.023689529870423238, + "grad_norm": 4.294065952301025, + "learning_rate": 2.9958478462257847e-05, + "loss": 2.0262, + "step": 12165 + }, + { + "epoch": 0.0237187401415335, + "grad_norm": 4.024529933929443, + "learning_rate": 2.9958376050640114e-05, + "loss": 1.8878, + "step": 12180 + }, + { + "epoch": 0.023747950412643763, + "grad_norm": 3.88948130607605, + "learning_rate": 2.9958273513055785e-05, + "loss": 1.9365, + "step": 12195 + }, + { + "epoch": 0.023777160683754028, + "grad_norm": 2.3247134685516357, + "learning_rate": 2.9958170849505736e-05, + "loss": 1.9275, + "step": 12210 + }, + { + "epoch": 0.02380637095486429, + "grad_norm": 2.8689277172088623, + "learning_rate": 2.9958068059990827e-05, + "loss": 1.9015, + "step": 12225 + }, + { + "epoch": 0.023835581225974553, + "grad_norm": 2.650768756866455, + "learning_rate": 2.995796514451192e-05, + "loss": 2.0453, + "step": 12240 + }, + { + "epoch": 0.023864791497084814, + "grad_norm": 3.2005455493927, + "learning_rate": 2.9957862103069886e-05, + "loss": 1.8641, + "step": 12255 + }, + { + "epoch": 0.02389400176819508, + "grad_norm": 3.2274022102355957, + "learning_rate": 2.9957758935665592e-05, + "loss": 2.0379, + "step": 12270 + }, + { + "epoch": 0.02392321203930534, + "grad_norm": 2.933011054992676, + "learning_rate": 2.9957655642299903e-05, + "loss": 2.0304, + "step": 12285 + }, + { + "epoch": 0.023952422310415604, + "grad_norm": 2.051677942276001, + "learning_rate": 2.9957552222973696e-05, + "loss": 1.9656, + "step": 12300 + }, + { + "epoch": 0.02398163258152587, + "grad_norm": 2.488805055618286, + "learning_rate": 2.995744867768784e-05, + "loss": 1.8997, + "step": 12315 + }, + { + "epoch": 0.02401084285263613, + "grad_norm": 2.1613032817840576, + "learning_rate": 2.99573450064432e-05, + "loss": 1.9733, + "step": 12330 + }, + { + "epoch": 0.024040053123746394, + "grad_norm": 3.817368745803833, + "learning_rate": 2.9957241209240656e-05, + "loss": 1.74, + "step": 12345 + }, + { + "epoch": 0.024069263394856655, + "grad_norm": 2.048835515975952, + "learning_rate": 2.995713728608108e-05, + "loss": 2.0201, + "step": 12360 + }, + { + "epoch": 0.02409847366596692, + "grad_norm": 3.2480578422546387, + "learning_rate": 2.995703323696535e-05, + "loss": 1.8652, + "step": 12375 + }, + { + "epoch": 0.02412768393707718, + "grad_norm": 2.2918214797973633, + "learning_rate": 2.9956929061894334e-05, + "loss": 1.9089, + "step": 12390 + }, + { + "epoch": 0.024156894208187445, + "grad_norm": 2.4332637786865234, + "learning_rate": 2.995682476086892e-05, + "loss": 1.8168, + "step": 12405 + }, + { + "epoch": 0.024186104479297706, + "grad_norm": 3.9398860931396484, + "learning_rate": 2.9956720333889978e-05, + "loss": 1.8114, + "step": 12420 + }, + { + "epoch": 0.02421531475040797, + "grad_norm": 1.3877410888671875, + "learning_rate": 2.995661578095839e-05, + "loss": 1.7609, + "step": 12435 + }, + { + "epoch": 0.024244525021518235, + "grad_norm": 3.1245439052581787, + "learning_rate": 2.9956511102075043e-05, + "loss": 1.9191, + "step": 12450 + }, + { + "epoch": 0.024273735292628495, + "grad_norm": 4.245307445526123, + "learning_rate": 2.9956406297240805e-05, + "loss": 1.9551, + "step": 12465 + }, + { + "epoch": 0.02430294556373876, + "grad_norm": 3.033841371536255, + "learning_rate": 2.995630136645657e-05, + "loss": 2.063, + "step": 12480 + }, + { + "epoch": 0.02433215583484902, + "grad_norm": 2.9336037635803223, + "learning_rate": 2.9956196309723217e-05, + "loss": 1.8383, + "step": 12495 + }, + { + "epoch": 0.024361366105959285, + "grad_norm": 2.946824312210083, + "learning_rate": 2.9956091127041628e-05, + "loss": 2.012, + "step": 12510 + }, + { + "epoch": 0.024390576377069546, + "grad_norm": 1.9165891408920288, + "learning_rate": 2.9955985818412695e-05, + "loss": 1.8422, + "step": 12525 + }, + { + "epoch": 0.02441978664817981, + "grad_norm": 2.4142305850982666, + "learning_rate": 2.9955880383837304e-05, + "loss": 2.027, + "step": 12540 + }, + { + "epoch": 0.024448996919290075, + "grad_norm": 3.1490895748138428, + "learning_rate": 2.9955774823316337e-05, + "loss": 1.8485, + "step": 12555 + }, + { + "epoch": 0.024478207190400336, + "grad_norm": 3.187546968460083, + "learning_rate": 2.995566913685069e-05, + "loss": 1.8955, + "step": 12570 + }, + { + "epoch": 0.0245074174615106, + "grad_norm": 4.743402481079102, + "learning_rate": 2.9955563324441246e-05, + "loss": 1.8438, + "step": 12585 + }, + { + "epoch": 0.02453662773262086, + "grad_norm": 4.372682094573975, + "learning_rate": 2.9955457386088904e-05, + "loss": 1.6814, + "step": 12600 + }, + { + "epoch": 0.024565838003731126, + "grad_norm": 4.228031158447266, + "learning_rate": 2.995535132179455e-05, + "loss": 1.8358, + "step": 12615 + }, + { + "epoch": 0.024595048274841387, + "grad_norm": 1.7856305837631226, + "learning_rate": 2.9955245131559078e-05, + "loss": 1.9314, + "step": 12630 + }, + { + "epoch": 0.02462425854595165, + "grad_norm": 2.232226610183716, + "learning_rate": 2.9955138815383383e-05, + "loss": 1.7662, + "step": 12645 + }, + { + "epoch": 0.024653468817061912, + "grad_norm": 2.812988758087158, + "learning_rate": 2.9955032373268366e-05, + "loss": 1.8475, + "step": 12660 + }, + { + "epoch": 0.024682679088172177, + "grad_norm": 4.58977746963501, + "learning_rate": 2.995492580521491e-05, + "loss": 1.8952, + "step": 12675 + }, + { + "epoch": 0.02471188935928244, + "grad_norm": 2.619033098220825, + "learning_rate": 2.995481911122393e-05, + "loss": 1.7343, + "step": 12690 + }, + { + "epoch": 0.024741099630392702, + "grad_norm": 2.20595645904541, + "learning_rate": 2.9954712291296303e-05, + "loss": 1.8573, + "step": 12705 + }, + { + "epoch": 0.024770309901502967, + "grad_norm": 4.261920928955078, + "learning_rate": 2.9954605345432948e-05, + "loss": 1.9771, + "step": 12720 + }, + { + "epoch": 0.024799520172613228, + "grad_norm": 4.171009063720703, + "learning_rate": 2.995449827363476e-05, + "loss": 1.8328, + "step": 12735 + }, + { + "epoch": 0.024828730443723492, + "grad_norm": 1.9221203327178955, + "learning_rate": 2.9954391075902634e-05, + "loss": 2.0122, + "step": 12750 + }, + { + "epoch": 0.024857940714833753, + "grad_norm": 4.801123142242432, + "learning_rate": 2.9954283752237478e-05, + "loss": 1.9288, + "step": 12765 + }, + { + "epoch": 0.024887150985944018, + "grad_norm": 5.146918296813965, + "learning_rate": 2.99541763026402e-05, + "loss": 1.8986, + "step": 12780 + }, + { + "epoch": 0.024916361257054282, + "grad_norm": 3.8306210041046143, + "learning_rate": 2.9954068727111694e-05, + "loss": 1.9432, + "step": 12795 + }, + { + "epoch": 0.024945571528164543, + "grad_norm": 3.569969892501831, + "learning_rate": 2.9953961025652875e-05, + "loss": 1.802, + "step": 12810 + }, + { + "epoch": 0.024974781799274808, + "grad_norm": 2.11238956451416, + "learning_rate": 2.995385319826465e-05, + "loss": 1.7489, + "step": 12825 + }, + { + "epoch": 0.02500399207038507, + "grad_norm": 2.0586655139923096, + "learning_rate": 2.995374524494792e-05, + "loss": 1.8308, + "step": 12840 + }, + { + "epoch": 0.025033202341495333, + "grad_norm": 2.133302688598633, + "learning_rate": 2.9953637165703597e-05, + "loss": 1.8642, + "step": 12855 + }, + { + "epoch": 0.025062412612605594, + "grad_norm": 6.014092922210693, + "learning_rate": 2.9953528960532594e-05, + "loss": 1.838, + "step": 12870 + }, + { + "epoch": 0.02509162288371586, + "grad_norm": 4.038995742797852, + "learning_rate": 2.9953420629435823e-05, + "loss": 1.9485, + "step": 12885 + }, + { + "epoch": 0.02512083315482612, + "grad_norm": 2.016037940979004, + "learning_rate": 2.995331217241419e-05, + "loss": 1.8532, + "step": 12900 + }, + { + "epoch": 0.025150043425936384, + "grad_norm": 2.268634796142578, + "learning_rate": 2.9953203589468617e-05, + "loss": 1.7899, + "step": 12915 + }, + { + "epoch": 0.025179253697046648, + "grad_norm": 4.676908016204834, + "learning_rate": 2.995309488060001e-05, + "loss": 1.875, + "step": 12930 + }, + { + "epoch": 0.02520846396815691, + "grad_norm": 2.0081729888916016, + "learning_rate": 2.9952986045809284e-05, + "loss": 1.9863, + "step": 12945 + }, + { + "epoch": 0.025237674239267174, + "grad_norm": 2.9928226470947266, + "learning_rate": 2.9952877085097364e-05, + "loss": 1.8073, + "step": 12960 + }, + { + "epoch": 0.025266884510377435, + "grad_norm": 3.514173746109009, + "learning_rate": 2.9952767998465164e-05, + "loss": 1.849, + "step": 12975 + }, + { + "epoch": 0.0252960947814877, + "grad_norm": 1.9361449480056763, + "learning_rate": 2.99526587859136e-05, + "loss": 1.8925, + "step": 12990 + }, + { + "epoch": 0.02532530505259796, + "grad_norm": 4.396752834320068, + "learning_rate": 2.9952549447443595e-05, + "loss": 1.8844, + "step": 13005 + }, + { + "epoch": 0.025354515323708225, + "grad_norm": 2.9122262001037598, + "learning_rate": 2.9952439983056066e-05, + "loss": 1.9564, + "step": 13020 + }, + { + "epoch": 0.02538372559481849, + "grad_norm": 2.6632208824157715, + "learning_rate": 2.9952330392751935e-05, + "loss": 1.8514, + "step": 13035 + }, + { + "epoch": 0.02541293586592875, + "grad_norm": 2.678126573562622, + "learning_rate": 2.995222067653213e-05, + "loss": 1.8146, + "step": 13050 + }, + { + "epoch": 0.025442146137039014, + "grad_norm": 3.901380777359009, + "learning_rate": 2.9952110834397572e-05, + "loss": 2.0268, + "step": 13065 + }, + { + "epoch": 0.025471356408149275, + "grad_norm": 2.7066125869750977, + "learning_rate": 2.9952000866349185e-05, + "loss": 1.8731, + "step": 13080 + }, + { + "epoch": 0.02550056667925954, + "grad_norm": 4.8974456787109375, + "learning_rate": 2.9951890772387897e-05, + "loss": 1.908, + "step": 13095 + }, + { + "epoch": 0.0255297769503698, + "grad_norm": 3.9215996265411377, + "learning_rate": 2.995178055251463e-05, + "loss": 2.0428, + "step": 13110 + }, + { + "epoch": 0.025558987221480065, + "grad_norm": 2.519742250442505, + "learning_rate": 2.9951670206730318e-05, + "loss": 1.7794, + "step": 13125 + }, + { + "epoch": 0.025588197492590326, + "grad_norm": 2.1564877033233643, + "learning_rate": 2.995155973503589e-05, + "loss": 1.6714, + "step": 13140 + }, + { + "epoch": 0.02561740776370059, + "grad_norm": 4.537586688995361, + "learning_rate": 2.9951449137432275e-05, + "loss": 2.0812, + "step": 13155 + }, + { + "epoch": 0.025646618034810855, + "grad_norm": 3.659740686416626, + "learning_rate": 2.9951338413920403e-05, + "loss": 1.995, + "step": 13170 + }, + { + "epoch": 0.025675828305921116, + "grad_norm": 2.057332992553711, + "learning_rate": 2.9951227564501207e-05, + "loss": 1.8206, + "step": 13185 + }, + { + "epoch": 0.02570503857703138, + "grad_norm": 3.4197821617126465, + "learning_rate": 2.995111658917562e-05, + "loss": 1.9727, + "step": 13200 + }, + { + "epoch": 0.02573424884814164, + "grad_norm": 2.212815761566162, + "learning_rate": 2.995100548794458e-05, + "loss": 1.959, + "step": 13215 + }, + { + "epoch": 0.025763459119251906, + "grad_norm": 2.1396286487579346, + "learning_rate": 2.9950894260809015e-05, + "loss": 1.9048, + "step": 13230 + }, + { + "epoch": 0.025792669390362167, + "grad_norm": 2.823305606842041, + "learning_rate": 2.995078290776987e-05, + "loss": 1.9381, + "step": 13245 + }, + { + "epoch": 0.02582187966147243, + "grad_norm": 3.0127432346343994, + "learning_rate": 2.9950671428828083e-05, + "loss": 1.959, + "step": 13260 + }, + { + "epoch": 0.025851089932582696, + "grad_norm": 2.1536638736724854, + "learning_rate": 2.9950559823984583e-05, + "loss": 1.7713, + "step": 13275 + }, + { + "epoch": 0.025880300203692957, + "grad_norm": 2.784940242767334, + "learning_rate": 2.9950448093240318e-05, + "loss": 1.8586, + "step": 13290 + }, + { + "epoch": 0.02590951047480322, + "grad_norm": 2.103855848312378, + "learning_rate": 2.9950336236596226e-05, + "loss": 1.8167, + "step": 13305 + }, + { + "epoch": 0.025938720745913482, + "grad_norm": 2.465585947036743, + "learning_rate": 2.9950224254053254e-05, + "loss": 1.9863, + "step": 13320 + }, + { + "epoch": 0.025967931017023747, + "grad_norm": 3.2060279846191406, + "learning_rate": 2.9950112145612335e-05, + "loss": 1.838, + "step": 13335 + }, + { + "epoch": 0.025997141288134008, + "grad_norm": 3.0637335777282715, + "learning_rate": 2.9949999911274427e-05, + "loss": 1.9325, + "step": 13350 + }, + { + "epoch": 0.026026351559244272, + "grad_norm": 2.8039963245391846, + "learning_rate": 2.994988755104046e-05, + "loss": 1.8671, + "step": 13365 + }, + { + "epoch": 0.026055561830354537, + "grad_norm": 4.239367485046387, + "learning_rate": 2.9949775064911388e-05, + "loss": 1.9305, + "step": 13380 + }, + { + "epoch": 0.026084772101464797, + "grad_norm": 4.794821262359619, + "learning_rate": 2.994966245288816e-05, + "loss": 1.8946, + "step": 13395 + }, + { + "epoch": 0.026113982372575062, + "grad_norm": 3.5099828243255615, + "learning_rate": 2.994954971497172e-05, + "loss": 1.919, + "step": 13410 + }, + { + "epoch": 0.026143192643685323, + "grad_norm": 4.3362603187561035, + "learning_rate": 2.994943685116302e-05, + "loss": 1.9266, + "step": 13425 + }, + { + "epoch": 0.026172402914795587, + "grad_norm": 4.010773658752441, + "learning_rate": 2.994932386146301e-05, + "loss": 1.8712, + "step": 13440 + }, + { + "epoch": 0.02620161318590585, + "grad_norm": 2.9033546447753906, + "learning_rate": 2.9949210745872638e-05, + "loss": 1.8179, + "step": 13455 + }, + { + "epoch": 0.026230823457016113, + "grad_norm": 2.215955972671509, + "learning_rate": 2.9949097504392866e-05, + "loss": 1.8573, + "step": 13470 + }, + { + "epoch": 0.026260033728126374, + "grad_norm": 4.64263391494751, + "learning_rate": 2.994898413702464e-05, + "loss": 1.8504, + "step": 13485 + }, + { + "epoch": 0.026289243999236638, + "grad_norm": 2.8851144313812256, + "learning_rate": 2.9948870643768915e-05, + "loss": 1.7891, + "step": 13500 + }, + { + "epoch": 0.026318454270346903, + "grad_norm": 5.705179214477539, + "learning_rate": 2.9948757024626645e-05, + "loss": 1.8502, + "step": 13515 + }, + { + "epoch": 0.026347664541457164, + "grad_norm": 4.427610397338867, + "learning_rate": 2.994864327959879e-05, + "loss": 1.746, + "step": 13530 + }, + { + "epoch": 0.026376874812567428, + "grad_norm": 2.9682793617248535, + "learning_rate": 2.994852940868631e-05, + "loss": 1.8766, + "step": 13545 + }, + { + "epoch": 0.02640608508367769, + "grad_norm": 2.8406543731689453, + "learning_rate": 2.9948415411890164e-05, + "loss": 1.8637, + "step": 13560 + }, + { + "epoch": 0.026435295354787954, + "grad_norm": 2.9661149978637695, + "learning_rate": 2.9948301289211308e-05, + "loss": 1.7703, + "step": 13575 + }, + { + "epoch": 0.026464505625898214, + "grad_norm": 2.961155652999878, + "learning_rate": 2.99481870406507e-05, + "loss": 1.8123, + "step": 13590 + }, + { + "epoch": 0.02649371589700848, + "grad_norm": 3.7241668701171875, + "learning_rate": 2.9948072666209308e-05, + "loss": 2.018, + "step": 13605 + }, + { + "epoch": 0.026522926168118743, + "grad_norm": 2.8102498054504395, + "learning_rate": 2.9947958165888096e-05, + "loss": 1.8577, + "step": 13620 + }, + { + "epoch": 0.026552136439229004, + "grad_norm": 2.061007022857666, + "learning_rate": 2.9947843539688027e-05, + "loss": 1.9684, + "step": 13635 + }, + { + "epoch": 0.02658134671033927, + "grad_norm": 4.699859619140625, + "learning_rate": 2.994772878761006e-05, + "loss": 1.9438, + "step": 13650 + }, + { + "epoch": 0.02661055698144953, + "grad_norm": 5.8805952072143555, + "learning_rate": 2.994761390965517e-05, + "loss": 1.8862, + "step": 13665 + }, + { + "epoch": 0.026639767252559794, + "grad_norm": 3.6178531646728516, + "learning_rate": 2.994749890582432e-05, + "loss": 1.9754, + "step": 13680 + }, + { + "epoch": 0.026668977523670055, + "grad_norm": 2.891448497772217, + "learning_rate": 2.9947383776118482e-05, + "loss": 1.8838, + "step": 13695 + }, + { + "epoch": 0.02669818779478032, + "grad_norm": 2.5380797386169434, + "learning_rate": 2.994726852053862e-05, + "loss": 2.0006, + "step": 13710 + }, + { + "epoch": 0.02672739806589058, + "grad_norm": 3.083801031112671, + "learning_rate": 2.994715313908571e-05, + "loss": 1.9287, + "step": 13725 + }, + { + "epoch": 0.026756608337000845, + "grad_norm": 3.9220306873321533, + "learning_rate": 2.9947037631760717e-05, + "loss": 2.0063, + "step": 13740 + }, + { + "epoch": 0.02678581860811111, + "grad_norm": 2.41329288482666, + "learning_rate": 2.994692199856462e-05, + "loss": 1.7779, + "step": 13755 + }, + { + "epoch": 0.02681502887922137, + "grad_norm": 3.137281656265259, + "learning_rate": 2.9946806239498392e-05, + "loss": 1.7686, + "step": 13770 + }, + { + "epoch": 0.026844239150331635, + "grad_norm": 3.8897507190704346, + "learning_rate": 2.994669035456301e-05, + "loss": 1.9879, + "step": 13785 + }, + { + "epoch": 0.026873449421441896, + "grad_norm": 2.888145685195923, + "learning_rate": 2.994657434375944e-05, + "loss": 2.0012, + "step": 13800 + }, + { + "epoch": 0.02690265969255216, + "grad_norm": 2.683145523071289, + "learning_rate": 2.9946458207088667e-05, + "loss": 1.8579, + "step": 13815 + }, + { + "epoch": 0.02693186996366242, + "grad_norm": 2.5023186206817627, + "learning_rate": 2.9946341944551668e-05, + "loss": 1.8899, + "step": 13830 + }, + { + "epoch": 0.026961080234772686, + "grad_norm": 4.522122383117676, + "learning_rate": 2.994622555614942e-05, + "loss": 1.8373, + "step": 13845 + }, + { + "epoch": 0.02699029050588295, + "grad_norm": 1.9197810888290405, + "learning_rate": 2.9946109041882902e-05, + "loss": 1.874, + "step": 13860 + }, + { + "epoch": 0.02701950077699321, + "grad_norm": 2.0907135009765625, + "learning_rate": 2.9945992401753103e-05, + "loss": 1.9878, + "step": 13875 + }, + { + "epoch": 0.027048711048103476, + "grad_norm": 3.0691592693328857, + "learning_rate": 2.9945875635761e-05, + "loss": 1.8859, + "step": 13890 + }, + { + "epoch": 0.027077921319213737, + "grad_norm": 2.0707552433013916, + "learning_rate": 2.9945758743907573e-05, + "loss": 1.7612, + "step": 13905 + }, + { + "epoch": 0.027107131590324, + "grad_norm": 2.2770462036132812, + "learning_rate": 2.994564172619381e-05, + "loss": 1.8028, + "step": 13920 + }, + { + "epoch": 0.027136341861434262, + "grad_norm": 2.681814193725586, + "learning_rate": 2.9945524582620695e-05, + "loss": 1.7967, + "step": 13935 + }, + { + "epoch": 0.027165552132544526, + "grad_norm": 3.0529186725616455, + "learning_rate": 2.994540731318922e-05, + "loss": 1.7972, + "step": 13950 + }, + { + "epoch": 0.027194762403654787, + "grad_norm": 3.369091033935547, + "learning_rate": 2.9945289917900368e-05, + "loss": 1.8092, + "step": 13965 + }, + { + "epoch": 0.027223972674765052, + "grad_norm": 2.190134048461914, + "learning_rate": 2.9945172396755124e-05, + "loss": 2.0228, + "step": 13980 + }, + { + "epoch": 0.027253182945875316, + "grad_norm": 2.805100202560425, + "learning_rate": 2.9945054749754483e-05, + "loss": 1.9312, + "step": 13995 + }, + { + "epoch": 0.027282393216985577, + "grad_norm": 2.195697546005249, + "learning_rate": 2.9944936976899433e-05, + "loss": 1.9791, + "step": 14010 + }, + { + "epoch": 0.027311603488095842, + "grad_norm": 1.723713755607605, + "learning_rate": 2.9944819078190967e-05, + "loss": 1.8542, + "step": 14025 + }, + { + "epoch": 0.027340813759206103, + "grad_norm": 2.633101463317871, + "learning_rate": 2.9944701053630075e-05, + "loss": 1.8127, + "step": 14040 + }, + { + "epoch": 0.027370024030316367, + "grad_norm": 1.9390171766281128, + "learning_rate": 2.9944582903217756e-05, + "loss": 1.9183, + "step": 14055 + }, + { + "epoch": 0.027399234301426628, + "grad_norm": 3.9491968154907227, + "learning_rate": 2.9944464626955003e-05, + "loss": 2.0849, + "step": 14070 + }, + { + "epoch": 0.027428444572536893, + "grad_norm": 2.4679179191589355, + "learning_rate": 2.9944346224842812e-05, + "loss": 1.9285, + "step": 14085 + }, + { + "epoch": 0.027457654843647157, + "grad_norm": 2.999509334564209, + "learning_rate": 2.994422769688218e-05, + "loss": 1.9523, + "step": 14100 + }, + { + "epoch": 0.027486865114757418, + "grad_norm": 3.8798091411590576, + "learning_rate": 2.9944109043074104e-05, + "loss": 1.9014, + "step": 14115 + }, + { + "epoch": 0.027516075385867683, + "grad_norm": 2.5288240909576416, + "learning_rate": 2.9943990263419582e-05, + "loss": 2.1135, + "step": 14130 + }, + { + "epoch": 0.027545285656977943, + "grad_norm": 2.2120304107666016, + "learning_rate": 2.994387135791962e-05, + "loss": 1.7418, + "step": 14145 + }, + { + "epoch": 0.027574495928088208, + "grad_norm": 2.805328607559204, + "learning_rate": 2.994375232657521e-05, + "loss": 1.8776, + "step": 14160 + }, + { + "epoch": 0.02760370619919847, + "grad_norm": 2.8841097354888916, + "learning_rate": 2.9943633169387365e-05, + "loss": 1.9106, + "step": 14175 + }, + { + "epoch": 0.027632916470308733, + "grad_norm": 1.8887025117874146, + "learning_rate": 2.994351388635708e-05, + "loss": 1.8916, + "step": 14190 + }, + { + "epoch": 0.027662126741418994, + "grad_norm": 2.8623757362365723, + "learning_rate": 2.9943394477485363e-05, + "loss": 1.8735, + "step": 14205 + }, + { + "epoch": 0.02769133701252926, + "grad_norm": 3.1046249866485596, + "learning_rate": 2.994327494277322e-05, + "loss": 1.9124, + "step": 14220 + }, + { + "epoch": 0.027720547283639523, + "grad_norm": 2.653933525085449, + "learning_rate": 2.9943155282221663e-05, + "loss": 1.9387, + "step": 14235 + }, + { + "epoch": 0.027749757554749784, + "grad_norm": 2.975820779800415, + "learning_rate": 2.9943035495831688e-05, + "loss": 1.8232, + "step": 14250 + }, + { + "epoch": 0.02777896782586005, + "grad_norm": 5.906015396118164, + "learning_rate": 2.9942915583604307e-05, + "loss": 1.9167, + "step": 14265 + }, + { + "epoch": 0.02780817809697031, + "grad_norm": 2.592456102371216, + "learning_rate": 2.994279554554054e-05, + "loss": 1.7433, + "step": 14280 + }, + { + "epoch": 0.027837388368080574, + "grad_norm": 5.042680263519287, + "learning_rate": 2.994267538164138e-05, + "loss": 1.7878, + "step": 14295 + }, + { + "epoch": 0.027866598639190835, + "grad_norm": 4.092184066772461, + "learning_rate": 2.9942555091907853e-05, + "loss": 1.6955, + "step": 14310 + }, + { + "epoch": 0.0278958089103011, + "grad_norm": 4.623755931854248, + "learning_rate": 2.994243467634097e-05, + "loss": 1.7866, + "step": 14325 + }, + { + "epoch": 0.027925019181411364, + "grad_norm": 3.042306661605835, + "learning_rate": 2.994231413494174e-05, + "loss": 1.8891, + "step": 14340 + }, + { + "epoch": 0.027954229452521625, + "grad_norm": 2.784275531768799, + "learning_rate": 2.9942193467711184e-05, + "loss": 2.0112, + "step": 14355 + }, + { + "epoch": 0.02798343972363189, + "grad_norm": 1.9308695793151855, + "learning_rate": 2.9942072674650317e-05, + "loss": 1.7964, + "step": 14370 + }, + { + "epoch": 0.02801264999474215, + "grad_norm": 3.7377004623413086, + "learning_rate": 2.994195175576015e-05, + "loss": 1.8661, + "step": 14385 + }, + { + "epoch": 0.028041860265852415, + "grad_norm": 2.484870195388794, + "learning_rate": 2.994183071104171e-05, + "loss": 1.7358, + "step": 14400 + }, + { + "epoch": 0.028071070536962676, + "grad_norm": 2.6344974040985107, + "learning_rate": 2.9941709540496013e-05, + "loss": 1.9183, + "step": 14415 + }, + { + "epoch": 0.02810028080807294, + "grad_norm": 2.168701410293579, + "learning_rate": 2.9941588244124072e-05, + "loss": 1.999, + "step": 14430 + }, + { + "epoch": 0.028129491079183205, + "grad_norm": 2.986727476119995, + "learning_rate": 2.994146682192692e-05, + "loss": 1.8344, + "step": 14445 + }, + { + "epoch": 0.028158701350293466, + "grad_norm": 3.3715713024139404, + "learning_rate": 2.9941345273905573e-05, + "loss": 2.0468, + "step": 14460 + }, + { + "epoch": 0.02818791162140373, + "grad_norm": 2.2077038288116455, + "learning_rate": 2.9941223600061054e-05, + "loss": 2.0255, + "step": 14475 + }, + { + "epoch": 0.02821712189251399, + "grad_norm": 3.4651224613189697, + "learning_rate": 2.994110180039439e-05, + "loss": 1.8604, + "step": 14490 + }, + { + "epoch": 0.028246332163624256, + "grad_norm": 2.0584287643432617, + "learning_rate": 2.994097987490661e-05, + "loss": 2.0636, + "step": 14505 + }, + { + "epoch": 0.028275542434734516, + "grad_norm": 3.1285014152526855, + "learning_rate": 2.9940857823598736e-05, + "loss": 1.7656, + "step": 14520 + }, + { + "epoch": 0.02830475270584478, + "grad_norm": 2.541280746459961, + "learning_rate": 2.9940735646471793e-05, + "loss": 1.8682, + "step": 14535 + }, + { + "epoch": 0.028333962976955042, + "grad_norm": 2.4748847484588623, + "learning_rate": 2.9940613343526817e-05, + "loss": 2.0047, + "step": 14550 + }, + { + "epoch": 0.028363173248065306, + "grad_norm": 3.2105560302734375, + "learning_rate": 2.9940490914764834e-05, + "loss": 1.7709, + "step": 14565 + }, + { + "epoch": 0.02839238351917557, + "grad_norm": 3.491591215133667, + "learning_rate": 2.9940368360186878e-05, + "loss": 1.786, + "step": 14580 + }, + { + "epoch": 0.028421593790285832, + "grad_norm": 3.315342903137207, + "learning_rate": 2.9940245679793978e-05, + "loss": 1.917, + "step": 14595 + }, + { + "epoch": 0.028450804061396096, + "grad_norm": 1.7594997882843018, + "learning_rate": 2.9940122873587164e-05, + "loss": 1.877, + "step": 14610 + }, + { + "epoch": 0.028480014332506357, + "grad_norm": 2.442725896835327, + "learning_rate": 2.9939999941567474e-05, + "loss": 1.9577, + "step": 14625 + }, + { + "epoch": 0.02850922460361662, + "grad_norm": 3.146977663040161, + "learning_rate": 2.993987688373595e-05, + "loss": 1.8722, + "step": 14640 + }, + { + "epoch": 0.028538434874726883, + "grad_norm": 2.6678929328918457, + "learning_rate": 2.9939753700093618e-05, + "loss": 1.659, + "step": 14655 + }, + { + "epoch": 0.028567645145837147, + "grad_norm": 2.4769906997680664, + "learning_rate": 2.9939630390641518e-05, + "loss": 1.8257, + "step": 14670 + }, + { + "epoch": 0.02859685541694741, + "grad_norm": 2.9314770698547363, + "learning_rate": 2.993950695538069e-05, + "loss": 1.8004, + "step": 14685 + }, + { + "epoch": 0.028626065688057672, + "grad_norm": 3.2279980182647705, + "learning_rate": 2.993938339431217e-05, + "loss": 1.9438, + "step": 14700 + }, + { + "epoch": 0.028655275959167937, + "grad_norm": 2.8929495811462402, + "learning_rate": 2.9939259707437002e-05, + "loss": 1.7995, + "step": 14715 + }, + { + "epoch": 0.028684486230278198, + "grad_norm": 4.861998558044434, + "learning_rate": 2.9939135894756232e-05, + "loss": 1.8188, + "step": 14730 + }, + { + "epoch": 0.028713696501388462, + "grad_norm": 5.37394905090332, + "learning_rate": 2.9939011956270893e-05, + "loss": 2.061, + "step": 14745 + }, + { + "epoch": 0.028742906772498723, + "grad_norm": 2.2253520488739014, + "learning_rate": 2.9938887891982035e-05, + "loss": 1.963, + "step": 14760 + }, + { + "epoch": 0.028772117043608988, + "grad_norm": 3.424954414367676, + "learning_rate": 2.99387637018907e-05, + "loss": 2.0077, + "step": 14775 + }, + { + "epoch": 0.02880132731471925, + "grad_norm": 2.8398706912994385, + "learning_rate": 2.9938639385997934e-05, + "loss": 2.0516, + "step": 14790 + }, + { + "epoch": 0.028830537585829513, + "grad_norm": 2.371492385864258, + "learning_rate": 2.9938514944304788e-05, + "loss": 1.8057, + "step": 14805 + }, + { + "epoch": 0.028859747856939778, + "grad_norm": 1.870301365852356, + "learning_rate": 2.9938390376812304e-05, + "loss": 1.8335, + "step": 14820 + }, + { + "epoch": 0.02888895812805004, + "grad_norm": 3.1508800983428955, + "learning_rate": 2.9938265683521533e-05, + "loss": 2.0272, + "step": 14835 + }, + { + "epoch": 0.028918168399160303, + "grad_norm": 2.8456640243530273, + "learning_rate": 2.9938140864433528e-05, + "loss": 1.889, + "step": 14850 + }, + { + "epoch": 0.028947378670270564, + "grad_norm": 2.3040804862976074, + "learning_rate": 2.9938015919549337e-05, + "loss": 1.9274, + "step": 14865 + }, + { + "epoch": 0.02897658894138083, + "grad_norm": 3.075559139251709, + "learning_rate": 2.9937890848870012e-05, + "loss": 1.9239, + "step": 14880 + }, + { + "epoch": 0.02900579921249109, + "grad_norm": 2.4840190410614014, + "learning_rate": 2.9937765652396608e-05, + "loss": 1.9836, + "step": 14895 + }, + { + "epoch": 0.029035009483601354, + "grad_norm": 2.915515422821045, + "learning_rate": 2.9937640330130182e-05, + "loss": 2.0196, + "step": 14910 + }, + { + "epoch": 0.02906421975471162, + "grad_norm": 2.684401035308838, + "learning_rate": 2.993751488207178e-05, + "loss": 1.9699, + "step": 14925 + }, + { + "epoch": 0.02909343002582188, + "grad_norm": 1.7906841039657593, + "learning_rate": 2.9937389308222468e-05, + "loss": 1.9435, + "step": 14940 + }, + { + "epoch": 0.029122640296932144, + "grad_norm": 2.7629384994506836, + "learning_rate": 2.9937263608583297e-05, + "loss": 1.9266, + "step": 14955 + }, + { + "epoch": 0.029151850568042405, + "grad_norm": 3.65447735786438, + "learning_rate": 2.9937137783155326e-05, + "loss": 1.8818, + "step": 14970 + }, + { + "epoch": 0.02918106083915267, + "grad_norm": 2.684885025024414, + "learning_rate": 2.993701183193962e-05, + "loss": 1.744, + "step": 14985 + }, + { + "epoch": 0.02921027111026293, + "grad_norm": 2.995678424835205, + "learning_rate": 2.9936885754937237e-05, + "loss": 1.868, + "step": 15000 + }, + { + "epoch": 0.029239481381373195, + "grad_norm": 1.9314979314804077, + "learning_rate": 2.993675955214924e-05, + "loss": 1.8982, + "step": 15015 + }, + { + "epoch": 0.029268691652483456, + "grad_norm": 3.8655660152435303, + "learning_rate": 2.993663322357669e-05, + "loss": 2.103, + "step": 15030 + }, + { + "epoch": 0.02929790192359372, + "grad_norm": 3.2133545875549316, + "learning_rate": 2.993650676922065e-05, + "loss": 1.867, + "step": 15045 + }, + { + "epoch": 0.029327112194703985, + "grad_norm": 1.7590382099151611, + "learning_rate": 2.9936380189082184e-05, + "loss": 1.9596, + "step": 15060 + }, + { + "epoch": 0.029356322465814245, + "grad_norm": 3.5091662406921387, + "learning_rate": 2.993625348316236e-05, + "loss": 2.0665, + "step": 15075 + }, + { + "epoch": 0.02938553273692451, + "grad_norm": 2.493689775466919, + "learning_rate": 2.9936126651462246e-05, + "loss": 1.8168, + "step": 15090 + }, + { + "epoch": 0.02941474300803477, + "grad_norm": 2.158205509185791, + "learning_rate": 2.993599969398291e-05, + "loss": 1.9542, + "step": 15105 + }, + { + "epoch": 0.029443953279145035, + "grad_norm": 1.6943566799163818, + "learning_rate": 2.9935872610725415e-05, + "loss": 1.9762, + "step": 15120 + }, + { + "epoch": 0.029473163550255296, + "grad_norm": 4.148080825805664, + "learning_rate": 2.993574540169084e-05, + "loss": 1.8338, + "step": 15135 + }, + { + "epoch": 0.02950237382136556, + "grad_norm": 3.9066367149353027, + "learning_rate": 2.993561806688025e-05, + "loss": 1.875, + "step": 15150 + }, + { + "epoch": 0.029531584092475825, + "grad_norm": 4.32364559173584, + "learning_rate": 2.9935490606294726e-05, + "loss": 1.7329, + "step": 15165 + }, + { + "epoch": 0.029560794363586086, + "grad_norm": 3.791557788848877, + "learning_rate": 2.9935363019935327e-05, + "loss": 1.8515, + "step": 15180 + }, + { + "epoch": 0.02959000463469635, + "grad_norm": 2.274827241897583, + "learning_rate": 2.9935235307803137e-05, + "loss": 1.7744, + "step": 15195 + }, + { + "epoch": 0.02961921490580661, + "grad_norm": 3.765348196029663, + "learning_rate": 2.9935107469899235e-05, + "loss": 1.7427, + "step": 15210 + }, + { + "epoch": 0.029648425176916876, + "grad_norm": 1.8818484544754028, + "learning_rate": 2.9934979506224687e-05, + "loss": 1.912, + "step": 15225 + }, + { + "epoch": 0.029677635448027137, + "grad_norm": 3.2676870822906494, + "learning_rate": 2.993485141678058e-05, + "loss": 2.022, + "step": 15240 + }, + { + "epoch": 0.0297068457191374, + "grad_norm": 3.329846143722534, + "learning_rate": 2.9934723201567986e-05, + "loss": 1.9317, + "step": 15255 + }, + { + "epoch": 0.029736055990247662, + "grad_norm": 4.478529453277588, + "learning_rate": 2.993459486058799e-05, + "loss": 1.8172, + "step": 15270 + }, + { + "epoch": 0.029765266261357927, + "grad_norm": 5.912947654724121, + "learning_rate": 2.9934466393841667e-05, + "loss": 1.8828, + "step": 15285 + }, + { + "epoch": 0.02979447653246819, + "grad_norm": 3.079585552215576, + "learning_rate": 2.9934337801330102e-05, + "loss": 1.946, + "step": 15300 + }, + { + "epoch": 0.029823686803578452, + "grad_norm": 2.770911455154419, + "learning_rate": 2.993420908305438e-05, + "loss": 1.884, + "step": 15315 + }, + { + "epoch": 0.029852897074688717, + "grad_norm": 2.8034188747406006, + "learning_rate": 2.993408023901558e-05, + "loss": 1.7856, + "step": 15330 + }, + { + "epoch": 0.029882107345798978, + "grad_norm": 4.039456367492676, + "learning_rate": 2.9933951269214793e-05, + "loss": 1.9302, + "step": 15345 + }, + { + "epoch": 0.029911317616909242, + "grad_norm": 3.329083204269409, + "learning_rate": 2.99338221736531e-05, + "loss": 1.9541, + "step": 15360 + }, + { + "epoch": 0.029940527888019503, + "grad_norm": 2.9838755130767822, + "learning_rate": 2.9933692952331593e-05, + "loss": 1.8633, + "step": 15375 + }, + { + "epoch": 0.029969738159129768, + "grad_norm": 3.2724435329437256, + "learning_rate": 2.9933563605251356e-05, + "loss": 1.9572, + "step": 15390 + }, + { + "epoch": 0.029998948430240032, + "grad_norm": 1.9584968090057373, + "learning_rate": 2.993343413241348e-05, + "loss": 1.7946, + "step": 15405 + }, + { + "epoch": 0.030028158701350293, + "grad_norm": 3.7978711128234863, + "learning_rate": 2.9933304533819053e-05, + "loss": 1.7179, + "step": 15420 + }, + { + "epoch": 0.030057368972460557, + "grad_norm": 3.708948850631714, + "learning_rate": 2.993317480946917e-05, + "loss": 1.7777, + "step": 15435 + }, + { + "epoch": 0.03008657924357082, + "grad_norm": 2.204556941986084, + "learning_rate": 2.993304495936492e-05, + "loss": 2.0804, + "step": 15450 + }, + { + "epoch": 0.030115789514681083, + "grad_norm": 4.719995021820068, + "learning_rate": 2.9932914983507398e-05, + "loss": 1.8505, + "step": 15465 + }, + { + "epoch": 0.030144999785791344, + "grad_norm": 2.958409547805786, + "learning_rate": 2.9932784881897703e-05, + "loss": 1.8833, + "step": 15480 + }, + { + "epoch": 0.03017421005690161, + "grad_norm": 2.398984909057617, + "learning_rate": 2.993265465453692e-05, + "loss": 1.6463, + "step": 15495 + }, + { + "epoch": 0.03020342032801187, + "grad_norm": 2.104973077774048, + "learning_rate": 2.9932524301426155e-05, + "loss": 1.7249, + "step": 15510 + }, + { + "epoch": 0.030232630599122134, + "grad_norm": 5.241917610168457, + "learning_rate": 2.99323938225665e-05, + "loss": 1.7852, + "step": 15525 + }, + { + "epoch": 0.030261840870232398, + "grad_norm": 2.9521865844726562, + "learning_rate": 2.9932263217959064e-05, + "loss": 1.8941, + "step": 15540 + }, + { + "epoch": 0.03029105114134266, + "grad_norm": 3.14367413520813, + "learning_rate": 2.9932132487604936e-05, + "loss": 1.9962, + "step": 15555 + }, + { + "epoch": 0.030320261412452924, + "grad_norm": 1.763987421989441, + "learning_rate": 2.9932001631505217e-05, + "loss": 1.9057, + "step": 15570 + }, + { + "epoch": 0.030349471683563185, + "grad_norm": 4.6951446533203125, + "learning_rate": 2.993187064966101e-05, + "loss": 1.8781, + "step": 15585 + }, + { + "epoch": 0.03037868195467345, + "grad_norm": 2.424650192260742, + "learning_rate": 2.993173954207343e-05, + "loss": 1.8808, + "step": 15600 + }, + { + "epoch": 0.03040789222578371, + "grad_norm": 2.743579387664795, + "learning_rate": 2.9931608308743562e-05, + "loss": 1.899, + "step": 15615 + }, + { + "epoch": 0.030437102496893974, + "grad_norm": 4.443767070770264, + "learning_rate": 2.9931476949672524e-05, + "loss": 1.8727, + "step": 15630 + }, + { + "epoch": 0.03046631276800424, + "grad_norm": 2.6599271297454834, + "learning_rate": 2.9931345464861418e-05, + "loss": 1.8466, + "step": 15645 + }, + { + "epoch": 0.0304955230391145, + "grad_norm": 2.598816394805908, + "learning_rate": 2.993121385431135e-05, + "loss": 2.0863, + "step": 15660 + }, + { + "epoch": 0.030524733310224764, + "grad_norm": 3.126720666885376, + "learning_rate": 2.9931082118023432e-05, + "loss": 1.7731, + "step": 15675 + }, + { + "epoch": 0.030553943581335025, + "grad_norm": 3.7353076934814453, + "learning_rate": 2.9930950255998773e-05, + "loss": 2.1104, + "step": 15690 + }, + { + "epoch": 0.03058315385244529, + "grad_norm": 4.932044982910156, + "learning_rate": 2.9930818268238483e-05, + "loss": 1.8693, + "step": 15705 + }, + { + "epoch": 0.03061236412355555, + "grad_norm": 2.6772072315216064, + "learning_rate": 2.9930686154743666e-05, + "loss": 1.8159, + "step": 15720 + }, + { + "epoch": 0.030641574394665815, + "grad_norm": 3.73004412651062, + "learning_rate": 2.9930553915515445e-05, + "loss": 1.6705, + "step": 15735 + }, + { + "epoch": 0.03067078466577608, + "grad_norm": 2.2986795902252197, + "learning_rate": 2.993042155055493e-05, + "loss": 2.05, + "step": 15750 + }, + { + "epoch": 0.03069999493688634, + "grad_norm": 2.9536030292510986, + "learning_rate": 2.9930289059863234e-05, + "loss": 1.9297, + "step": 15765 + }, + { + "epoch": 0.030729205207996605, + "grad_norm": 1.8823219537734985, + "learning_rate": 2.9930156443441477e-05, + "loss": 1.9812, + "step": 15780 + }, + { + "epoch": 0.030758415479106866, + "grad_norm": 2.553921699523926, + "learning_rate": 2.993002370129077e-05, + "loss": 2.0723, + "step": 15795 + }, + { + "epoch": 0.03078762575021713, + "grad_norm": 2.391080617904663, + "learning_rate": 2.9929890833412233e-05, + "loss": 1.8342, + "step": 15810 + }, + { + "epoch": 0.03081683602132739, + "grad_norm": 1.898431420326233, + "learning_rate": 2.9929757839806985e-05, + "loss": 1.8282, + "step": 15825 + }, + { + "epoch": 0.030846046292437656, + "grad_norm": 3.6443545818328857, + "learning_rate": 2.9929624720476153e-05, + "loss": 1.7823, + "step": 15840 + }, + { + "epoch": 0.030875256563547917, + "grad_norm": 2.1495354175567627, + "learning_rate": 2.9929491475420844e-05, + "loss": 1.9565, + "step": 15855 + }, + { + "epoch": 0.03090446683465818, + "grad_norm": 4.317218780517578, + "learning_rate": 2.992935810464219e-05, + "loss": 1.9516, + "step": 15870 + }, + { + "epoch": 0.030933677105768446, + "grad_norm": 2.2863664627075195, + "learning_rate": 2.992922460814131e-05, + "loss": 1.9399, + "step": 15885 + }, + { + "epoch": 0.030962887376878707, + "grad_norm": 3.2142059803009033, + "learning_rate": 2.9929090985919334e-05, + "loss": 2.0122, + "step": 15900 + }, + { + "epoch": 0.03099209764798897, + "grad_norm": 3.0333306789398193, + "learning_rate": 2.992895723797738e-05, + "loss": 1.7733, + "step": 15915 + }, + { + "epoch": 0.031021307919099232, + "grad_norm": 2.6775388717651367, + "learning_rate": 2.9928823364316575e-05, + "loss": 1.8205, + "step": 15930 + }, + { + "epoch": 0.031050518190209497, + "grad_norm": 2.1693944931030273, + "learning_rate": 2.9928689364938057e-05, + "loss": 1.895, + "step": 15945 + }, + { + "epoch": 0.031079728461319758, + "grad_norm": 2.754948377609253, + "learning_rate": 2.992855523984294e-05, + "loss": 1.8623, + "step": 15960 + }, + { + "epoch": 0.031108938732430022, + "grad_norm": 2.659349203109741, + "learning_rate": 2.9928420989032357e-05, + "loss": 1.9873, + "step": 15975 + }, + { + "epoch": 0.031138149003540287, + "grad_norm": 3.5560247898101807, + "learning_rate": 2.9928286612507445e-05, + "loss": 1.9364, + "step": 15990 + }, + { + "epoch": 0.031167359274650547, + "grad_norm": 4.824398994445801, + "learning_rate": 2.9928152110269335e-05, + "loss": 1.8708, + "step": 16005 + }, + { + "epoch": 0.031196569545760812, + "grad_norm": 2.5657832622528076, + "learning_rate": 2.992801748231915e-05, + "loss": 2.0086, + "step": 16020 + }, + { + "epoch": 0.031225779816871073, + "grad_norm": 5.117823123931885, + "learning_rate": 2.9927882728658036e-05, + "loss": 1.7608, + "step": 16035 + }, + { + "epoch": 0.031254990087981334, + "grad_norm": 1.9517539739608765, + "learning_rate": 2.992774784928712e-05, + "loss": 1.9367, + "step": 16050 + }, + { + "epoch": 0.0312842003590916, + "grad_norm": 3.202143669128418, + "learning_rate": 2.9927612844207537e-05, + "loss": 1.8725, + "step": 16065 + }, + { + "epoch": 0.03131341063020186, + "grad_norm": 2.5217974185943604, + "learning_rate": 2.992747771342043e-05, + "loss": 1.9029, + "step": 16080 + }, + { + "epoch": 0.031342620901312124, + "grad_norm": 1.6550703048706055, + "learning_rate": 2.992734245692693e-05, + "loss": 1.9016, + "step": 16095 + }, + { + "epoch": 0.03137183117242239, + "grad_norm": 2.7231898307800293, + "learning_rate": 2.9927207074728187e-05, + "loss": 1.9735, + "step": 16110 + }, + { + "epoch": 0.03140104144353265, + "grad_norm": 3.855612277984619, + "learning_rate": 2.9927071566825328e-05, + "loss": 1.7641, + "step": 16125 + }, + { + "epoch": 0.031430251714642914, + "grad_norm": 3.016885757446289, + "learning_rate": 2.99269359332195e-05, + "loss": 1.9789, + "step": 16140 + }, + { + "epoch": 0.031459461985753175, + "grad_norm": 2.0709967613220215, + "learning_rate": 2.9926800173911845e-05, + "loss": 1.922, + "step": 16155 + }, + { + "epoch": 0.03148867225686344, + "grad_norm": 5.23029088973999, + "learning_rate": 2.992666428890351e-05, + "loss": 1.9681, + "step": 16170 + }, + { + "epoch": 0.031517882527973703, + "grad_norm": 2.7961294651031494, + "learning_rate": 2.9926528278195634e-05, + "loss": 1.7172, + "step": 16185 + }, + { + "epoch": 0.031547092799083964, + "grad_norm": 2.5741660594940186, + "learning_rate": 2.992639214178936e-05, + "loss": 1.7638, + "step": 16200 + }, + { + "epoch": 0.031576303070194225, + "grad_norm": 1.9811028242111206, + "learning_rate": 2.9926255879685846e-05, + "loss": 2.0028, + "step": 16215 + }, + { + "epoch": 0.03160551334130449, + "grad_norm": 4.054990768432617, + "learning_rate": 2.992611949188623e-05, + "loss": 1.9305, + "step": 16230 + }, + { + "epoch": 0.031634723612414754, + "grad_norm": 2.189107894897461, + "learning_rate": 2.992598297839166e-05, + "loss": 1.7656, + "step": 16245 + }, + { + "epoch": 0.031663933883525015, + "grad_norm": 2.2795944213867188, + "learning_rate": 2.9925846339203285e-05, + "loss": 1.8474, + "step": 16260 + }, + { + "epoch": 0.03169314415463528, + "grad_norm": 5.294840335845947, + "learning_rate": 2.9925709574322262e-05, + "loss": 1.8983, + "step": 16275 + }, + { + "epoch": 0.031722354425745544, + "grad_norm": 3.4388887882232666, + "learning_rate": 2.9925572683749742e-05, + "loss": 1.7128, + "step": 16290 + }, + { + "epoch": 0.031751564696855805, + "grad_norm": 4.72010612487793, + "learning_rate": 2.992543566748687e-05, + "loss": 1.9095, + "step": 16305 + }, + { + "epoch": 0.031780774967966066, + "grad_norm": 3.0153849124908447, + "learning_rate": 2.9925298525534807e-05, + "loss": 1.8908, + "step": 16320 + }, + { + "epoch": 0.031809985239076334, + "grad_norm": 2.4210598468780518, + "learning_rate": 2.992516125789471e-05, + "loss": 2.0611, + "step": 16335 + }, + { + "epoch": 0.031839195510186595, + "grad_norm": 2.9562766551971436, + "learning_rate": 2.9925023864567727e-05, + "loss": 1.7784, + "step": 16350 + }, + { + "epoch": 0.031868405781296856, + "grad_norm": 2.944288969039917, + "learning_rate": 2.992488634555502e-05, + "loss": 1.8543, + "step": 16365 + }, + { + "epoch": 0.031897616052407124, + "grad_norm": 3.5835225582122803, + "learning_rate": 2.9924748700857747e-05, + "loss": 1.9654, + "step": 16380 + }, + { + "epoch": 0.031926826323517385, + "grad_norm": 3.5023677349090576, + "learning_rate": 2.9924610930477062e-05, + "loss": 1.7496, + "step": 16395 + }, + { + "epoch": 0.031956036594627646, + "grad_norm": 2.08420467376709, + "learning_rate": 2.9924473034414136e-05, + "loss": 1.8234, + "step": 16410 + }, + { + "epoch": 0.03198524686573791, + "grad_norm": 4.309360504150391, + "learning_rate": 2.992433501267012e-05, + "loss": 1.9149, + "step": 16425 + }, + { + "epoch": 0.032014457136848175, + "grad_norm": 3.0357537269592285, + "learning_rate": 2.9924196865246175e-05, + "loss": 1.9551, + "step": 16440 + }, + { + "epoch": 0.032043667407958436, + "grad_norm": 4.165322780609131, + "learning_rate": 2.9924058592143473e-05, + "loss": 1.7488, + "step": 16455 + }, + { + "epoch": 0.0320728776790687, + "grad_norm": 3.4814274311065674, + "learning_rate": 2.9923920193363176e-05, + "loss": 1.9659, + "step": 16470 + }, + { + "epoch": 0.032102087950178965, + "grad_norm": 2.5652506351470947, + "learning_rate": 2.992378166890645e-05, + "loss": 1.7986, + "step": 16485 + }, + { + "epoch": 0.032131298221289226, + "grad_norm": 1.9026885032653809, + "learning_rate": 2.9923643018774455e-05, + "loss": 1.8294, + "step": 16500 + }, + { + "epoch": 0.03216050849239949, + "grad_norm": 3.861070156097412, + "learning_rate": 2.9923504242968365e-05, + "loss": 1.7451, + "step": 16515 + }, + { + "epoch": 0.03218971876350975, + "grad_norm": 1.9591658115386963, + "learning_rate": 2.992336534148935e-05, + "loss": 1.805, + "step": 16530 + }, + { + "epoch": 0.032218929034620016, + "grad_norm": 4.189550399780273, + "learning_rate": 2.992322631433857e-05, + "loss": 1.8049, + "step": 16545 + }, + { + "epoch": 0.032248139305730276, + "grad_norm": 3.1660592555999756, + "learning_rate": 2.9923087161517205e-05, + "loss": 1.9415, + "step": 16560 + }, + { + "epoch": 0.03227734957684054, + "grad_norm": 2.801609754562378, + "learning_rate": 2.9922947883026426e-05, + "loss": 1.8096, + "step": 16575 + }, + { + "epoch": 0.032306559847950805, + "grad_norm": 2.699336290359497, + "learning_rate": 2.9922808478867403e-05, + "loss": 1.723, + "step": 16590 + }, + { + "epoch": 0.032335770119061066, + "grad_norm": 2.2953224182128906, + "learning_rate": 2.992266894904131e-05, + "loss": 2.0549, + "step": 16605 + }, + { + "epoch": 0.03236498039017133, + "grad_norm": 3.8196887969970703, + "learning_rate": 2.9922529293549327e-05, + "loss": 1.8312, + "step": 16620 + }, + { + "epoch": 0.03239419066128159, + "grad_norm": 2.974578380584717, + "learning_rate": 2.9922389512392622e-05, + "loss": 1.9242, + "step": 16635 + }, + { + "epoch": 0.032423400932391856, + "grad_norm": 4.62038516998291, + "learning_rate": 2.9922249605572376e-05, + "loss": 1.9786, + "step": 16650 + }, + { + "epoch": 0.03245261120350212, + "grad_norm": 1.5149112939834595, + "learning_rate": 2.992210957308977e-05, + "loss": 1.8984, + "step": 16665 + }, + { + "epoch": 0.03248182147461238, + "grad_norm": 2.2014825344085693, + "learning_rate": 2.992196941494598e-05, + "loss": 1.7994, + "step": 16680 + }, + { + "epoch": 0.032511031745722646, + "grad_norm": 4.065785884857178, + "learning_rate": 2.9921829131142186e-05, + "loss": 1.8157, + "step": 16695 + }, + { + "epoch": 0.03254024201683291, + "grad_norm": 4.191883087158203, + "learning_rate": 2.992168872167957e-05, + "loss": 1.8684, + "step": 16710 + }, + { + "epoch": 0.03256945228794317, + "grad_norm": 3.4441282749176025, + "learning_rate": 2.9921548186559314e-05, + "loss": 1.8299, + "step": 16725 + }, + { + "epoch": 0.03259866255905343, + "grad_norm": 4.0237345695495605, + "learning_rate": 2.9921407525782604e-05, + "loss": 1.9262, + "step": 16740 + }, + { + "epoch": 0.0326278728301637, + "grad_norm": 2.171717405319214, + "learning_rate": 2.992126673935062e-05, + "loss": 1.7679, + "step": 16755 + }, + { + "epoch": 0.03265708310127396, + "grad_norm": 4.097175121307373, + "learning_rate": 2.992112582726455e-05, + "loss": 1.9499, + "step": 16770 + }, + { + "epoch": 0.03268629337238422, + "grad_norm": 4.390431880950928, + "learning_rate": 2.9920984789525583e-05, + "loss": 1.8797, + "step": 16785 + }, + { + "epoch": 0.03271550364349448, + "grad_norm": 3.068178176879883, + "learning_rate": 2.9920843626134907e-05, + "loss": 2.0427, + "step": 16800 + }, + { + "epoch": 0.03274471391460475, + "grad_norm": 3.378275156021118, + "learning_rate": 2.9920702337093707e-05, + "loss": 1.8853, + "step": 16815 + }, + { + "epoch": 0.03277392418571501, + "grad_norm": 2.2076075077056885, + "learning_rate": 2.992056092240317e-05, + "loss": 1.9406, + "step": 16830 + }, + { + "epoch": 0.03280313445682527, + "grad_norm": 3.0358242988586426, + "learning_rate": 2.992041938206449e-05, + "loss": 2.0143, + "step": 16845 + }, + { + "epoch": 0.03283234472793554, + "grad_norm": 4.094939231872559, + "learning_rate": 2.9920277716078868e-05, + "loss": 1.9757, + "step": 16860 + }, + { + "epoch": 0.0328615549990458, + "grad_norm": 1.8015364408493042, + "learning_rate": 2.9920135924447484e-05, + "loss": 2.0698, + "step": 16875 + }, + { + "epoch": 0.03289076527015606, + "grad_norm": 3.4187324047088623, + "learning_rate": 2.9919994007171535e-05, + "loss": 1.9332, + "step": 16890 + }, + { + "epoch": 0.03291997554126632, + "grad_norm": 3.91485595703125, + "learning_rate": 2.991985196425222e-05, + "loss": 1.9433, + "step": 16905 + }, + { + "epoch": 0.03294918581237659, + "grad_norm": 3.0797996520996094, + "learning_rate": 2.9919709795690732e-05, + "loss": 1.7478, + "step": 16920 + }, + { + "epoch": 0.03297839608348685, + "grad_norm": 4.396271705627441, + "learning_rate": 2.9919567501488273e-05, + "loss": 1.9191, + "step": 16935 + }, + { + "epoch": 0.03300760635459711, + "grad_norm": 2.7897489070892334, + "learning_rate": 2.9919425081646036e-05, + "loss": 1.9495, + "step": 16950 + }, + { + "epoch": 0.03303681662570738, + "grad_norm": 3.063068151473999, + "learning_rate": 2.991928253616522e-05, + "loss": 2.1288, + "step": 16965 + }, + { + "epoch": 0.03306602689681764, + "grad_norm": 5.236184120178223, + "learning_rate": 2.991913986504703e-05, + "loss": 1.8698, + "step": 16980 + }, + { + "epoch": 0.0330952371679279, + "grad_norm": 2.758821725845337, + "learning_rate": 2.9918997068292666e-05, + "loss": 1.8664, + "step": 16995 + }, + { + "epoch": 0.03312444743903816, + "grad_norm": 3.3068835735321045, + "learning_rate": 2.9918854145903326e-05, + "loss": 2.0247, + "step": 17010 + }, + { + "epoch": 0.03315365771014843, + "grad_norm": 5.055883884429932, + "learning_rate": 2.991871109788022e-05, + "loss": 1.7333, + "step": 17025 + }, + { + "epoch": 0.03318286798125869, + "grad_norm": 2.9909791946411133, + "learning_rate": 2.9918567924224545e-05, + "loss": 1.7868, + "step": 17040 + }, + { + "epoch": 0.03321207825236895, + "grad_norm": 4.3141913414001465, + "learning_rate": 2.9918424624937514e-05, + "loss": 1.8976, + "step": 17055 + }, + { + "epoch": 0.03324128852347922, + "grad_norm": 3.3365187644958496, + "learning_rate": 2.991828120002033e-05, + "loss": 1.8246, + "step": 17070 + }, + { + "epoch": 0.03327049879458948, + "grad_norm": 1.7029449939727783, + "learning_rate": 2.991813764947421e-05, + "loss": 1.7464, + "step": 17085 + }, + { + "epoch": 0.03329970906569974, + "grad_norm": 2.34187650680542, + "learning_rate": 2.9917993973300343e-05, + "loss": 2.0614, + "step": 17100 + }, + { + "epoch": 0.03332891933681, + "grad_norm": 2.3299100399017334, + "learning_rate": 2.9917850171499957e-05, + "loss": 1.9139, + "step": 17115 + }, + { + "epoch": 0.03335812960792027, + "grad_norm": 2.58627986907959, + "learning_rate": 2.9917706244074254e-05, + "loss": 2.0092, + "step": 17130 + }, + { + "epoch": 0.03338733987903053, + "grad_norm": 3.580124855041504, + "learning_rate": 2.991756219102445e-05, + "loss": 1.8548, + "step": 17145 + }, + { + "epoch": 0.03341655015014079, + "grad_norm": 1.6242703199386597, + "learning_rate": 2.9917418012351755e-05, + "loss": 1.8455, + "step": 17160 + }, + { + "epoch": 0.03344576042125106, + "grad_norm": 3.0739760398864746, + "learning_rate": 2.991727370805739e-05, + "loss": 1.846, + "step": 17175 + }, + { + "epoch": 0.03347497069236132, + "grad_norm": 2.6895573139190674, + "learning_rate": 2.991712927814256e-05, + "loss": 1.8201, + "step": 17190 + }, + { + "epoch": 0.03350418096347158, + "grad_norm": 3.353482961654663, + "learning_rate": 2.9916984722608488e-05, + "loss": 1.9598, + "step": 17205 + }, + { + "epoch": 0.03353339123458184, + "grad_norm": 3.334764242172241, + "learning_rate": 2.991684004145639e-05, + "loss": 1.8388, + "step": 17220 + }, + { + "epoch": 0.03356260150569211, + "grad_norm": 2.8904106616973877, + "learning_rate": 2.9916695234687484e-05, + "loss": 1.9402, + "step": 17235 + }, + { + "epoch": 0.03359181177680237, + "grad_norm": 2.01975154876709, + "learning_rate": 2.991655030230299e-05, + "loss": 1.9462, + "step": 17250 + }, + { + "epoch": 0.03362102204791263, + "grad_norm": 3.490748882293701, + "learning_rate": 2.9916405244304123e-05, + "loss": 1.9171, + "step": 17265 + }, + { + "epoch": 0.033650232319022894, + "grad_norm": 3.2388625144958496, + "learning_rate": 2.9916260060692114e-05, + "loss": 1.8305, + "step": 17280 + }, + { + "epoch": 0.03367944259013316, + "grad_norm": 3.952791690826416, + "learning_rate": 2.991611475146818e-05, + "loss": 1.9577, + "step": 17295 + }, + { + "epoch": 0.03370865286124342, + "grad_norm": 2.0647149085998535, + "learning_rate": 2.9915969316633548e-05, + "loss": 1.7789, + "step": 17310 + }, + { + "epoch": 0.03373786313235368, + "grad_norm": 2.9091007709503174, + "learning_rate": 2.9915823756189438e-05, + "loss": 2.0568, + "step": 17325 + }, + { + "epoch": 0.03376707340346395, + "grad_norm": 3.5968873500823975, + "learning_rate": 2.9915678070137078e-05, + "loss": 1.9264, + "step": 17340 + }, + { + "epoch": 0.03379628367457421, + "grad_norm": 2.7411301136016846, + "learning_rate": 2.9915532258477697e-05, + "loss": 1.832, + "step": 17355 + }, + { + "epoch": 0.03382549394568447, + "grad_norm": 3.8037030696868896, + "learning_rate": 2.991538632121252e-05, + "loss": 1.9751, + "step": 17370 + }, + { + "epoch": 0.033854704216794734, + "grad_norm": 2.2467167377471924, + "learning_rate": 2.9915240258342776e-05, + "loss": 1.8501, + "step": 17385 + }, + { + "epoch": 0.033883914487905, + "grad_norm": 2.01926589012146, + "learning_rate": 2.9915094069869696e-05, + "loss": 1.8398, + "step": 17400 + }, + { + "epoch": 0.03391312475901526, + "grad_norm": 3.427839994430542, + "learning_rate": 2.9914947755794515e-05, + "loss": 2.0322, + "step": 17415 + }, + { + "epoch": 0.033942335030125524, + "grad_norm": 2.9915285110473633, + "learning_rate": 2.991480131611846e-05, + "loss": 1.7704, + "step": 17430 + }, + { + "epoch": 0.03397154530123579, + "grad_norm": 2.0821945667266846, + "learning_rate": 2.9914654750842765e-05, + "loss": 1.9759, + "step": 17445 + }, + { + "epoch": 0.03400075557234605, + "grad_norm": 3.167320966720581, + "learning_rate": 2.9914508059968664e-05, + "loss": 1.8321, + "step": 17460 + }, + { + "epoch": 0.034029965843456314, + "grad_norm": 3.4027440547943115, + "learning_rate": 2.991436124349739e-05, + "loss": 1.8643, + "step": 17475 + }, + { + "epoch": 0.034059176114566575, + "grad_norm": 1.9895273447036743, + "learning_rate": 2.9914214301430183e-05, + "loss": 2.018, + "step": 17490 + }, + { + "epoch": 0.03408838638567684, + "grad_norm": 3.4797520637512207, + "learning_rate": 2.9914067233768285e-05, + "loss": 1.8666, + "step": 17505 + }, + { + "epoch": 0.034117596656787104, + "grad_norm": 2.578434944152832, + "learning_rate": 2.9913920040512925e-05, + "loss": 1.8808, + "step": 17520 + }, + { + "epoch": 0.034146806927897365, + "grad_norm": 3.499577522277832, + "learning_rate": 2.991377272166535e-05, + "loss": 2.112, + "step": 17535 + }, + { + "epoch": 0.03417601719900763, + "grad_norm": 2.6072142124176025, + "learning_rate": 2.9913625277226795e-05, + "loss": 1.7954, + "step": 17550 + }, + { + "epoch": 0.034205227470117894, + "grad_norm": 4.382633686065674, + "learning_rate": 2.9913477707198505e-05, + "loss": 1.8943, + "step": 17565 + }, + { + "epoch": 0.034234437741228155, + "grad_norm": 2.883723497390747, + "learning_rate": 2.9913330011581718e-05, + "loss": 1.8615, + "step": 17580 + }, + { + "epoch": 0.034263648012338416, + "grad_norm": 2.309401273727417, + "learning_rate": 2.991318219037769e-05, + "loss": 1.9813, + "step": 17595 + }, + { + "epoch": 0.034292858283448684, + "grad_norm": 3.3916573524475098, + "learning_rate": 2.991303424358765e-05, + "loss": 2.0303, + "step": 17610 + }, + { + "epoch": 0.034322068554558945, + "grad_norm": 2.0413196086883545, + "learning_rate": 2.9912886171212855e-05, + "loss": 1.7976, + "step": 17625 + }, + { + "epoch": 0.034351278825669206, + "grad_norm": 2.3423221111297607, + "learning_rate": 2.991273797325454e-05, + "loss": 2.0449, + "step": 17640 + }, + { + "epoch": 0.034380489096779474, + "grad_norm": 2.9180712699890137, + "learning_rate": 2.991258964971397e-05, + "loss": 1.937, + "step": 17655 + }, + { + "epoch": 0.034409699367889734, + "grad_norm": 2.8222568035125732, + "learning_rate": 2.9912441200592385e-05, + "loss": 1.8245, + "step": 17670 + }, + { + "epoch": 0.034438909638999995, + "grad_norm": 2.49934720993042, + "learning_rate": 2.991229262589103e-05, + "loss": 1.7475, + "step": 17685 + }, + { + "epoch": 0.034468119910110256, + "grad_norm": 2.517383098602295, + "learning_rate": 2.9912143925611166e-05, + "loss": 2.0156, + "step": 17700 + }, + { + "epoch": 0.034497330181220524, + "grad_norm": 1.9787654876708984, + "learning_rate": 2.9911995099754037e-05, + "loss": 1.7771, + "step": 17715 + }, + { + "epoch": 0.034526540452330785, + "grad_norm": 3.3665497303009033, + "learning_rate": 2.9911846148320903e-05, + "loss": 1.7394, + "step": 17730 + }, + { + "epoch": 0.034555750723441046, + "grad_norm": 2.7270398139953613, + "learning_rate": 2.9911697071313017e-05, + "loss": 1.8666, + "step": 17745 + }, + { + "epoch": 0.034584960994551314, + "grad_norm": 2.7560067176818848, + "learning_rate": 2.9911547868731626e-05, + "loss": 1.7811, + "step": 17760 + }, + { + "epoch": 0.034614171265661575, + "grad_norm": 3.9651834964752197, + "learning_rate": 2.9911398540577996e-05, + "loss": 1.8758, + "step": 17775 + }, + { + "epoch": 0.034643381536771836, + "grad_norm": 2.7909586429595947, + "learning_rate": 2.9911249086853386e-05, + "loss": 2.0029, + "step": 17790 + }, + { + "epoch": 0.0346725918078821, + "grad_norm": 3.109741449356079, + "learning_rate": 2.9911099507559045e-05, + "loss": 1.9692, + "step": 17805 + }, + { + "epoch": 0.034701802078992365, + "grad_norm": 1.7844855785369873, + "learning_rate": 2.9910949802696244e-05, + "loss": 1.7244, + "step": 17820 + }, + { + "epoch": 0.034731012350102626, + "grad_norm": 1.8406388759613037, + "learning_rate": 2.9910799972266232e-05, + "loss": 1.8689, + "step": 17835 + }, + { + "epoch": 0.03476022262121289, + "grad_norm": 2.9145405292510986, + "learning_rate": 2.9910650016270278e-05, + "loss": 1.8982, + "step": 17850 + }, + { + "epoch": 0.03478943289232315, + "grad_norm": 1.9398448467254639, + "learning_rate": 2.991049993470964e-05, + "loss": 1.5994, + "step": 17865 + }, + { + "epoch": 0.034818643163433416, + "grad_norm": 2.3878304958343506, + "learning_rate": 2.991034972758559e-05, + "loss": 1.8854, + "step": 17880 + }, + { + "epoch": 0.03484785343454368, + "grad_norm": 1.8284587860107422, + "learning_rate": 2.9910199394899385e-05, + "loss": 1.8261, + "step": 17895 + }, + { + "epoch": 0.03487706370565394, + "grad_norm": 3.306748151779175, + "learning_rate": 2.9910048936652294e-05, + "loss": 1.8727, + "step": 17910 + }, + { + "epoch": 0.034906273976764206, + "grad_norm": 3.8391776084899902, + "learning_rate": 2.9909898352845585e-05, + "loss": 1.8347, + "step": 17925 + }, + { + "epoch": 0.03493548424787447, + "grad_norm": 2.275381565093994, + "learning_rate": 2.9909747643480526e-05, + "loss": 1.8268, + "step": 17940 + }, + { + "epoch": 0.03496469451898473, + "grad_norm": 2.871506929397583, + "learning_rate": 2.9909596808558385e-05, + "loss": 1.9375, + "step": 17955 + }, + { + "epoch": 0.03499390479009499, + "grad_norm": 3.8660874366760254, + "learning_rate": 2.990944584808043e-05, + "loss": 1.678, + "step": 17970 + }, + { + "epoch": 0.03502311506120526, + "grad_norm": 4.991106033325195, + "learning_rate": 2.9909294762047935e-05, + "loss": 1.9136, + "step": 17985 + }, + { + "epoch": 0.03505232533231552, + "grad_norm": 3.8985953330993652, + "learning_rate": 2.990914355046217e-05, + "loss": 1.7425, + "step": 18000 + }, + { + "epoch": 0.03508153560342578, + "grad_norm": 2.2547903060913086, + "learning_rate": 2.9908992213324413e-05, + "loss": 1.9613, + "step": 18015 + }, + { + "epoch": 0.035110745874536047, + "grad_norm": 4.33575963973999, + "learning_rate": 2.9908840750635936e-05, + "loss": 2.0328, + "step": 18030 + }, + { + "epoch": 0.03513995614564631, + "grad_norm": 3.8632030487060547, + "learning_rate": 2.9908689162398012e-05, + "loss": 1.7863, + "step": 18045 + }, + { + "epoch": 0.03516916641675657, + "grad_norm": 4.932112216949463, + "learning_rate": 2.9908537448611927e-05, + "loss": 1.7159, + "step": 18060 + }, + { + "epoch": 0.03519837668786683, + "grad_norm": 2.3018031120300293, + "learning_rate": 2.9908385609278943e-05, + "loss": 1.9104, + "step": 18075 + }, + { + "epoch": 0.0352275869589771, + "grad_norm": 2.7413060665130615, + "learning_rate": 2.990823364440035e-05, + "loss": 1.6733, + "step": 18090 + }, + { + "epoch": 0.03525679723008736, + "grad_norm": 3.928980588912964, + "learning_rate": 2.9908081553977424e-05, + "loss": 1.9619, + "step": 18105 + }, + { + "epoch": 0.03528600750119762, + "grad_norm": 1.6751407384872437, + "learning_rate": 2.9907929338011447e-05, + "loss": 1.8283, + "step": 18120 + }, + { + "epoch": 0.03531521777230789, + "grad_norm": 4.515170574188232, + "learning_rate": 2.99077769965037e-05, + "loss": 1.7808, + "step": 18135 + }, + { + "epoch": 0.03534442804341815, + "grad_norm": 3.367410182952881, + "learning_rate": 2.9907624529455468e-05, + "loss": 1.9427, + "step": 18150 + }, + { + "epoch": 0.03537363831452841, + "grad_norm": 2.9150290489196777, + "learning_rate": 2.990747193686803e-05, + "loss": 2.021, + "step": 18165 + }, + { + "epoch": 0.03540284858563867, + "grad_norm": 3.456367015838623, + "learning_rate": 2.9907319218742677e-05, + "loss": 1.8711, + "step": 18180 + }, + { + "epoch": 0.03543205885674894, + "grad_norm": 2.504117727279663, + "learning_rate": 2.990716637508069e-05, + "loss": 1.9605, + "step": 18195 + }, + { + "epoch": 0.0354612691278592, + "grad_norm": 3.5525166988372803, + "learning_rate": 2.990701340588336e-05, + "loss": 1.6192, + "step": 18210 + }, + { + "epoch": 0.03549047939896946, + "grad_norm": 4.048015594482422, + "learning_rate": 2.9906860311151973e-05, + "loss": 1.9809, + "step": 18225 + }, + { + "epoch": 0.03551968967007973, + "grad_norm": 3.0211918354034424, + "learning_rate": 2.990670709088782e-05, + "loss": 1.8898, + "step": 18240 + }, + { + "epoch": 0.03554889994118999, + "grad_norm": 2.4371862411499023, + "learning_rate": 2.9906553745092184e-05, + "loss": 1.8836, + "step": 18255 + }, + { + "epoch": 0.03557811021230025, + "grad_norm": 3.461329460144043, + "learning_rate": 2.990640027376637e-05, + "loss": 1.7893, + "step": 18270 + }, + { + "epoch": 0.03560732048341051, + "grad_norm": 2.6082050800323486, + "learning_rate": 2.990624667691166e-05, + "loss": 1.94, + "step": 18285 + }, + { + "epoch": 0.03563653075452078, + "grad_norm": 2.8006784915924072, + "learning_rate": 2.990609295452935e-05, + "loss": 1.7607, + "step": 18300 + }, + { + "epoch": 0.03566574102563104, + "grad_norm": 4.304664611816406, + "learning_rate": 2.990593910662073e-05, + "loss": 1.8973, + "step": 18315 + }, + { + "epoch": 0.0356949512967413, + "grad_norm": 2.063380479812622, + "learning_rate": 2.9905785133187108e-05, + "loss": 1.7456, + "step": 18330 + }, + { + "epoch": 0.03572416156785156, + "grad_norm": 4.626766681671143, + "learning_rate": 2.9905631034229772e-05, + "loss": 1.8402, + "step": 18345 + }, + { + "epoch": 0.03575337183896183, + "grad_norm": 4.0876288414001465, + "learning_rate": 2.9905476809750017e-05, + "loss": 1.8239, + "step": 18360 + }, + { + "epoch": 0.03578258211007209, + "grad_norm": 2.5533339977264404, + "learning_rate": 2.9905322459749148e-05, + "loss": 1.9771, + "step": 18375 + }, + { + "epoch": 0.03581179238118235, + "grad_norm": 2.268286943435669, + "learning_rate": 2.990516798422846e-05, + "loss": 1.8886, + "step": 18390 + }, + { + "epoch": 0.03584100265229262, + "grad_norm": 2.343240261077881, + "learning_rate": 2.9905013383189257e-05, + "loss": 1.8676, + "step": 18405 + }, + { + "epoch": 0.03587021292340288, + "grad_norm": 3.544220209121704, + "learning_rate": 2.990485865663284e-05, + "loss": 1.7922, + "step": 18420 + }, + { + "epoch": 0.03589942319451314, + "grad_norm": 3.608947277069092, + "learning_rate": 2.9904703804560516e-05, + "loss": 1.7506, + "step": 18435 + }, + { + "epoch": 0.0359286334656234, + "grad_norm": 2.520986557006836, + "learning_rate": 2.990454882697358e-05, + "loss": 1.9715, + "step": 18450 + }, + { + "epoch": 0.03595784373673367, + "grad_norm": 2.598273277282715, + "learning_rate": 2.9904393723873342e-05, + "loss": 2.0018, + "step": 18465 + }, + { + "epoch": 0.03598705400784393, + "grad_norm": 3.4648282527923584, + "learning_rate": 2.990423849526111e-05, + "loss": 1.8609, + "step": 18480 + }, + { + "epoch": 0.03601626427895419, + "grad_norm": 4.387876510620117, + "learning_rate": 2.9904083141138194e-05, + "loss": 1.9949, + "step": 18495 + }, + { + "epoch": 0.03604547455006446, + "grad_norm": 2.121781349182129, + "learning_rate": 2.9903927661505888e-05, + "loss": 1.91, + "step": 18510 + }, + { + "epoch": 0.03607468482117472, + "grad_norm": 2.5395514965057373, + "learning_rate": 2.9903772056365516e-05, + "loss": 1.7663, + "step": 18525 + }, + { + "epoch": 0.03610389509228498, + "grad_norm": 4.20078706741333, + "learning_rate": 2.9903616325718385e-05, + "loss": 1.8051, + "step": 18540 + }, + { + "epoch": 0.03613310536339524, + "grad_norm": 3.0787668228149414, + "learning_rate": 2.9903460469565802e-05, + "loss": 1.7457, + "step": 18555 + }, + { + "epoch": 0.03616231563450551, + "grad_norm": 2.493914842605591, + "learning_rate": 2.9903304487909084e-05, + "loss": 1.841, + "step": 18570 + }, + { + "epoch": 0.03619152590561577, + "grad_norm": 4.367134094238281, + "learning_rate": 2.9903148380749543e-05, + "loss": 1.8202, + "step": 18585 + }, + { + "epoch": 0.03622073617672603, + "grad_norm": 4.713270664215088, + "learning_rate": 2.990299214808849e-05, + "loss": 1.7427, + "step": 18600 + }, + { + "epoch": 0.0362499464478363, + "grad_norm": 2.3630452156066895, + "learning_rate": 2.9902835789927246e-05, + "loss": 1.8379, + "step": 18615 + }, + { + "epoch": 0.03627915671894656, + "grad_norm": 4.240983009338379, + "learning_rate": 2.9902679306267127e-05, + "loss": 1.8876, + "step": 18630 + }, + { + "epoch": 0.03630836699005682, + "grad_norm": 2.1189780235290527, + "learning_rate": 2.990252269710945e-05, + "loss": 1.7273, + "step": 18645 + }, + { + "epoch": 0.036337577261167084, + "grad_norm": 4.207765579223633, + "learning_rate": 2.9902365962455533e-05, + "loss": 1.7961, + "step": 18660 + }, + { + "epoch": 0.03636678753227735, + "grad_norm": 4.526651859283447, + "learning_rate": 2.9902209102306694e-05, + "loss": 1.8292, + "step": 18675 + }, + { + "epoch": 0.03639599780338761, + "grad_norm": 4.371520042419434, + "learning_rate": 2.990205211666426e-05, + "loss": 1.9646, + "step": 18690 + }, + { + "epoch": 0.036425208074497874, + "grad_norm": 3.766552209854126, + "learning_rate": 2.990189500552955e-05, + "loss": 1.6872, + "step": 18705 + }, + { + "epoch": 0.03645441834560814, + "grad_norm": 3.610987901687622, + "learning_rate": 2.9901737768903882e-05, + "loss": 1.8095, + "step": 18720 + }, + { + "epoch": 0.0364836286167184, + "grad_norm": 2.77374529838562, + "learning_rate": 2.9901580406788583e-05, + "loss": 1.7853, + "step": 18735 + }, + { + "epoch": 0.036512838887828664, + "grad_norm": 1.9032909870147705, + "learning_rate": 2.9901422919184984e-05, + "loss": 2.066, + "step": 18750 + }, + { + "epoch": 0.036542049158938925, + "grad_norm": 6.839906215667725, + "learning_rate": 2.9901265306094406e-05, + "loss": 2.0159, + "step": 18765 + }, + { + "epoch": 0.03657125943004919, + "grad_norm": 3.8178064823150635, + "learning_rate": 2.9901107567518177e-05, + "loss": 1.9257, + "step": 18780 + }, + { + "epoch": 0.036600469701159453, + "grad_norm": 2.0359580516815186, + "learning_rate": 2.990094970345762e-05, + "loss": 1.8261, + "step": 18795 + }, + { + "epoch": 0.036629679972269714, + "grad_norm": 3.5759871006011963, + "learning_rate": 2.9900791713914078e-05, + "loss": 1.9477, + "step": 18810 + }, + { + "epoch": 0.036658890243379975, + "grad_norm": 2.471161127090454, + "learning_rate": 2.990063359888887e-05, + "loss": 1.9519, + "step": 18825 + }, + { + "epoch": 0.03668810051449024, + "grad_norm": 4.123219966888428, + "learning_rate": 2.9900475358383327e-05, + "loss": 1.9215, + "step": 18840 + }, + { + "epoch": 0.036717310785600504, + "grad_norm": 2.517751693725586, + "learning_rate": 2.9900316992398793e-05, + "loss": 1.9601, + "step": 18855 + }, + { + "epoch": 0.036746521056710765, + "grad_norm": 3.0052502155303955, + "learning_rate": 2.9900158500936587e-05, + "loss": 1.8937, + "step": 18870 + }, + { + "epoch": 0.03677573132782103, + "grad_norm": 3.820679187774658, + "learning_rate": 2.9899999883998057e-05, + "loss": 1.7623, + "step": 18885 + }, + { + "epoch": 0.036804941598931294, + "grad_norm": 4.087255954742432, + "learning_rate": 2.989984114158453e-05, + "loss": 1.7651, + "step": 18900 + }, + { + "epoch": 0.036834151870041555, + "grad_norm": 2.3399763107299805, + "learning_rate": 2.989968227369734e-05, + "loss": 1.8389, + "step": 18915 + }, + { + "epoch": 0.036863362141151816, + "grad_norm": 4.5868072509765625, + "learning_rate": 2.9899523280337836e-05, + "loss": 1.9238, + "step": 18930 + }, + { + "epoch": 0.036892572412262084, + "grad_norm": 2.5856025218963623, + "learning_rate": 2.989936416150735e-05, + "loss": 1.8044, + "step": 18945 + }, + { + "epoch": 0.036921782683372345, + "grad_norm": 2.635301351547241, + "learning_rate": 2.9899204917207222e-05, + "loss": 1.9433, + "step": 18960 + }, + { + "epoch": 0.036950992954482606, + "grad_norm": 2.7841100692749023, + "learning_rate": 2.9899045547438792e-05, + "loss": 1.8401, + "step": 18975 + }, + { + "epoch": 0.036980203225592874, + "grad_norm": 3.0509424209594727, + "learning_rate": 2.9898886052203407e-05, + "loss": 1.8828, + "step": 18990 + }, + { + "epoch": 0.037009413496703135, + "grad_norm": 2.99225115776062, + "learning_rate": 2.9898726431502402e-05, + "loss": 1.8158, + "step": 19005 + }, + { + "epoch": 0.037038623767813396, + "grad_norm": 2.5677740573883057, + "learning_rate": 2.9898566685337135e-05, + "loss": 1.8706, + "step": 19020 + }, + { + "epoch": 0.03706783403892366, + "grad_norm": 3.502387762069702, + "learning_rate": 2.9898406813708934e-05, + "loss": 1.9128, + "step": 19035 + }, + { + "epoch": 0.037097044310033925, + "grad_norm": 3.5594568252563477, + "learning_rate": 2.9898246816619162e-05, + "loss": 1.732, + "step": 19050 + }, + { + "epoch": 0.037126254581144186, + "grad_norm": 2.2263100147247314, + "learning_rate": 2.989808669406915e-05, + "loss": 1.9151, + "step": 19065 + }, + { + "epoch": 0.03715546485225445, + "grad_norm": 3.859412670135498, + "learning_rate": 2.989792644606026e-05, + "loss": 1.9925, + "step": 19080 + }, + { + "epoch": 0.037184675123364715, + "grad_norm": 2.2860965728759766, + "learning_rate": 2.9897766072593834e-05, + "loss": 1.9347, + "step": 19095 + }, + { + "epoch": 0.037213885394474976, + "grad_norm": 3.5833935737609863, + "learning_rate": 2.9897605573671224e-05, + "loss": 1.7977, + "step": 19110 + }, + { + "epoch": 0.03724309566558524, + "grad_norm": 3.612175226211548, + "learning_rate": 2.989744494929378e-05, + "loss": 1.9048, + "step": 19125 + }, + { + "epoch": 0.0372723059366955, + "grad_norm": 2.6882388591766357, + "learning_rate": 2.989728419946286e-05, + "loss": 1.814, + "step": 19140 + }, + { + "epoch": 0.037301516207805765, + "grad_norm": 2.753767728805542, + "learning_rate": 2.989712332417982e-05, + "loss": 1.8261, + "step": 19155 + }, + { + "epoch": 0.037330726478916026, + "grad_norm": 3.5201592445373535, + "learning_rate": 2.9896962323446004e-05, + "loss": 1.9855, + "step": 19170 + }, + { + "epoch": 0.03735993675002629, + "grad_norm": 2.892357587814331, + "learning_rate": 2.9896801197262773e-05, + "loss": 1.8325, + "step": 19185 + }, + { + "epoch": 0.037389147021136555, + "grad_norm": 3.37709903717041, + "learning_rate": 2.9896639945631483e-05, + "loss": 1.9563, + "step": 19200 + }, + { + "epoch": 0.037418357292246816, + "grad_norm": 1.9762299060821533, + "learning_rate": 2.9896478568553492e-05, + "loss": 1.8228, + "step": 19215 + }, + { + "epoch": 0.03744756756335708, + "grad_norm": 5.543561935424805, + "learning_rate": 2.9896317066030162e-05, + "loss": 1.7656, + "step": 19230 + }, + { + "epoch": 0.03747677783446734, + "grad_norm": 1.8155004978179932, + "learning_rate": 2.9896155438062852e-05, + "loss": 1.8033, + "step": 19245 + }, + { + "epoch": 0.037505988105577606, + "grad_norm": 3.613931655883789, + "learning_rate": 2.989599368465292e-05, + "loss": 1.9275, + "step": 19260 + }, + { + "epoch": 0.03753519837668787, + "grad_norm": 2.2457938194274902, + "learning_rate": 2.989583180580173e-05, + "loss": 1.9459, + "step": 19275 + }, + { + "epoch": 0.03756440864779813, + "grad_norm": 2.4014956951141357, + "learning_rate": 2.9895669801510646e-05, + "loss": 1.7973, + "step": 19290 + }, + { + "epoch": 0.037593618918908396, + "grad_norm": 2.1578032970428467, + "learning_rate": 2.9895507671781032e-05, + "loss": 1.8551, + "step": 19305 + }, + { + "epoch": 0.03762282919001866, + "grad_norm": 2.952676296234131, + "learning_rate": 2.9895345416614254e-05, + "loss": 1.7808, + "step": 19320 + }, + { + "epoch": 0.03765203946112892, + "grad_norm": 3.352534770965576, + "learning_rate": 2.989518303601167e-05, + "loss": 1.8889, + "step": 19335 + }, + { + "epoch": 0.03768124973223918, + "grad_norm": 5.104668617248535, + "learning_rate": 2.9895020529974667e-05, + "loss": 1.9522, + "step": 19350 + }, + { + "epoch": 0.03771046000334945, + "grad_norm": 3.0823004245758057, + "learning_rate": 2.9894857898504595e-05, + "loss": 1.916, + "step": 19365 + }, + { + "epoch": 0.03773967027445971, + "grad_norm": 2.544391393661499, + "learning_rate": 2.9894695141602824e-05, + "loss": 1.9019, + "step": 19380 + }, + { + "epoch": 0.03776888054556997, + "grad_norm": 2.4823343753814697, + "learning_rate": 2.989453225927074e-05, + "loss": 1.7927, + "step": 19395 + }, + { + "epoch": 0.03779809081668023, + "grad_norm": 1.8794801235198975, + "learning_rate": 2.98943692515097e-05, + "loss": 2.0112, + "step": 19410 + }, + { + "epoch": 0.0378273010877905, + "grad_norm": 2.5527946949005127, + "learning_rate": 2.9894206118321083e-05, + "loss": 1.8095, + "step": 19425 + }, + { + "epoch": 0.03785651135890076, + "grad_norm": 3.4643588066101074, + "learning_rate": 2.9894042859706265e-05, + "loss": 1.7435, + "step": 19440 + }, + { + "epoch": 0.03788572163001102, + "grad_norm": 3.913656234741211, + "learning_rate": 2.9893879475666613e-05, + "loss": 2.0963, + "step": 19455 + }, + { + "epoch": 0.03791493190112129, + "grad_norm": 5.583499431610107, + "learning_rate": 2.9893715966203502e-05, + "loss": 1.8735, + "step": 19470 + }, + { + "epoch": 0.03794414217223155, + "grad_norm": 1.9320791959762573, + "learning_rate": 2.989355233131832e-05, + "loss": 1.9904, + "step": 19485 + }, + { + "epoch": 0.03797335244334181, + "grad_norm": 3.6746532917022705, + "learning_rate": 2.9893388571012443e-05, + "loss": 1.8231, + "step": 19500 + }, + { + "epoch": 0.03800256271445207, + "grad_norm": 2.002924680709839, + "learning_rate": 2.989322468528724e-05, + "loss": 1.989, + "step": 19515 + }, + { + "epoch": 0.03803177298556234, + "grad_norm": 3.4807581901550293, + "learning_rate": 2.98930606741441e-05, + "loss": 1.849, + "step": 19530 + }, + { + "epoch": 0.0380609832566726, + "grad_norm": 4.430756568908691, + "learning_rate": 2.98928965375844e-05, + "loss": 1.9161, + "step": 19545 + }, + { + "epoch": 0.03809019352778286, + "grad_norm": 3.5762476921081543, + "learning_rate": 2.9892732275609525e-05, + "loss": 1.8861, + "step": 19560 + }, + { + "epoch": 0.03811940379889313, + "grad_norm": 2.6605224609375, + "learning_rate": 2.9892567888220855e-05, + "loss": 1.9023, + "step": 19575 + }, + { + "epoch": 0.03814861407000339, + "grad_norm": 2.929426431655884, + "learning_rate": 2.9892403375419778e-05, + "loss": 1.7741, + "step": 19590 + }, + { + "epoch": 0.03817782434111365, + "grad_norm": 3.075709342956543, + "learning_rate": 2.9892238737207677e-05, + "loss": 1.9575, + "step": 19605 + }, + { + "epoch": 0.03820703461222391, + "grad_norm": 3.5462255477905273, + "learning_rate": 2.9892073973585942e-05, + "loss": 1.8336, + "step": 19620 + }, + { + "epoch": 0.03823624488333418, + "grad_norm": 2.098604202270508, + "learning_rate": 2.9891909084555954e-05, + "loss": 1.8161, + "step": 19635 + }, + { + "epoch": 0.03826545515444444, + "grad_norm": 3.255275249481201, + "learning_rate": 2.9891744070119106e-05, + "loss": 1.9291, + "step": 19650 + }, + { + "epoch": 0.0382946654255547, + "grad_norm": 3.4731807708740234, + "learning_rate": 2.9891578930276787e-05, + "loss": 1.9403, + "step": 19665 + }, + { + "epoch": 0.03832387569666497, + "grad_norm": 2.0823373794555664, + "learning_rate": 2.9891413665030387e-05, + "loss": 1.8713, + "step": 19680 + }, + { + "epoch": 0.03835308596777523, + "grad_norm": 2.6273398399353027, + "learning_rate": 2.98912482743813e-05, + "loss": 1.8764, + "step": 19695 + }, + { + "epoch": 0.03838229623888549, + "grad_norm": 3.4412078857421875, + "learning_rate": 2.9891082758330915e-05, + "loss": 1.8471, + "step": 19710 + }, + { + "epoch": 0.03841150650999575, + "grad_norm": 2.652684211730957, + "learning_rate": 2.9890917116880625e-05, + "loss": 2.0607, + "step": 19725 + }, + { + "epoch": 0.03844071678110602, + "grad_norm": 2.8258442878723145, + "learning_rate": 2.989075135003183e-05, + "loss": 1.8497, + "step": 19740 + }, + { + "epoch": 0.03846992705221628, + "grad_norm": 3.239922285079956, + "learning_rate": 2.9890585457785923e-05, + "loss": 1.7763, + "step": 19755 + }, + { + "epoch": 0.03849913732332654, + "grad_norm": 2.4102368354797363, + "learning_rate": 2.9890419440144303e-05, + "loss": 1.9937, + "step": 19770 + }, + { + "epoch": 0.03852834759443681, + "grad_norm": 3.4451348781585693, + "learning_rate": 2.989025329710837e-05, + "loss": 1.8446, + "step": 19785 + }, + { + "epoch": 0.03855755786554707, + "grad_norm": 2.9538424015045166, + "learning_rate": 2.9890087028679517e-05, + "loss": 1.8478, + "step": 19800 + }, + { + "epoch": 0.03858676813665733, + "grad_norm": 2.6566977500915527, + "learning_rate": 2.9889920634859144e-05, + "loss": 1.7005, + "step": 19815 + }, + { + "epoch": 0.03861597840776759, + "grad_norm": 1.9971015453338623, + "learning_rate": 2.988975411564866e-05, + "loss": 1.8428, + "step": 19830 + }, + { + "epoch": 0.03864518867887786, + "grad_norm": 2.648861885070801, + "learning_rate": 2.9889587471049456e-05, + "loss": 2.0656, + "step": 19845 + }, + { + "epoch": 0.03867439894998812, + "grad_norm": 4.70402193069458, + "learning_rate": 2.9889420701062947e-05, + "loss": 1.8098, + "step": 19860 + }, + { + "epoch": 0.03870360922109838, + "grad_norm": 4.031128883361816, + "learning_rate": 2.988925380569053e-05, + "loss": 1.9847, + "step": 19875 + }, + { + "epoch": 0.038732819492208644, + "grad_norm": 3.7528605461120605, + "learning_rate": 2.988908678493361e-05, + "loss": 1.8671, + "step": 19890 + }, + { + "epoch": 0.03876202976331891, + "grad_norm": 2.24607253074646, + "learning_rate": 2.9888919638793604e-05, + "loss": 1.8053, + "step": 19905 + }, + { + "epoch": 0.03879124003442917, + "grad_norm": 2.6132078170776367, + "learning_rate": 2.9888752367271903e-05, + "loss": 2.0429, + "step": 19920 + }, + { + "epoch": 0.03882045030553943, + "grad_norm": 2.334711790084839, + "learning_rate": 2.988858497036993e-05, + "loss": 1.7526, + "step": 19935 + }, + { + "epoch": 0.0388496605766497, + "grad_norm": 3.065218210220337, + "learning_rate": 2.988841744808909e-05, + "loss": 1.9287, + "step": 19950 + }, + { + "epoch": 0.03887887084775996, + "grad_norm": 4.216036796569824, + "learning_rate": 2.9888249800430787e-05, + "loss": 1.7889, + "step": 19965 + }, + { + "epoch": 0.03890808111887022, + "grad_norm": 4.185488224029541, + "learning_rate": 2.988808202739644e-05, + "loss": 1.865, + "step": 19980 + }, + { + "epoch": 0.038937291389980484, + "grad_norm": 2.3708608150482178, + "learning_rate": 2.9887914128987465e-05, + "loss": 1.852, + "step": 19995 + }, + { + "epoch": 0.03896650166109075, + "grad_norm": 4.136464595794678, + "learning_rate": 2.9887746105205264e-05, + "loss": 1.8378, + "step": 20010 + }, + { + "epoch": 0.03899571193220101, + "grad_norm": 3.3061976432800293, + "learning_rate": 2.9887577956051263e-05, + "loss": 1.8702, + "step": 20025 + }, + { + "epoch": 0.039024922203311274, + "grad_norm": 3.5822839736938477, + "learning_rate": 2.9887409681526876e-05, + "loss": 1.87, + "step": 20040 + }, + { + "epoch": 0.03905413247442154, + "grad_norm": 4.132532119750977, + "learning_rate": 2.9887241281633518e-05, + "loss": 2.051, + "step": 20055 + }, + { + "epoch": 0.0390833427455318, + "grad_norm": 3.9095962047576904, + "learning_rate": 2.9887072756372606e-05, + "loss": 1.8271, + "step": 20070 + }, + { + "epoch": 0.039112553016642064, + "grad_norm": 3.028393507003784, + "learning_rate": 2.988690410574556e-05, + "loss": 1.825, + "step": 20085 + }, + { + "epoch": 0.039141763287752325, + "grad_norm": 1.7903603315353394, + "learning_rate": 2.98867353297538e-05, + "loss": 1.9565, + "step": 20100 + }, + { + "epoch": 0.03917097355886259, + "grad_norm": 2.168895959854126, + "learning_rate": 2.988656642839875e-05, + "loss": 1.8483, + "step": 20115 + }, + { + "epoch": 0.039200183829972854, + "grad_norm": 7.789179801940918, + "learning_rate": 2.988639740168183e-05, + "loss": 2.0627, + "step": 20130 + }, + { + "epoch": 0.039229394101083115, + "grad_norm": 2.033658981323242, + "learning_rate": 2.9886228249604464e-05, + "loss": 1.8905, + "step": 20145 + }, + { + "epoch": 0.03925860437219338, + "grad_norm": 3.6176931858062744, + "learning_rate": 2.9886058972168076e-05, + "loss": 1.8981, + "step": 20160 + }, + { + "epoch": 0.039287814643303644, + "grad_norm": 3.776540517807007, + "learning_rate": 2.9885889569374088e-05, + "loss": 1.7417, + "step": 20175 + }, + { + "epoch": 0.039317024914413905, + "grad_norm": 2.138796329498291, + "learning_rate": 2.9885720041223934e-05, + "loss": 1.8845, + "step": 20190 + }, + { + "epoch": 0.039346235185524166, + "grad_norm": 5.0401482582092285, + "learning_rate": 2.988555038771904e-05, + "loss": 1.8966, + "step": 20205 + }, + { + "epoch": 0.039375445456634434, + "grad_norm": 2.2886552810668945, + "learning_rate": 2.9885380608860827e-05, + "loss": 1.8898, + "step": 20220 + }, + { + "epoch": 0.039404655727744695, + "grad_norm": 2.286959648132324, + "learning_rate": 2.9885210704650734e-05, + "loss": 1.9597, + "step": 20235 + }, + { + "epoch": 0.039433865998854956, + "grad_norm": 2.538752794265747, + "learning_rate": 2.988504067509019e-05, + "loss": 1.8451, + "step": 20250 + }, + { + "epoch": 0.039463076269965223, + "grad_norm": 3.4922502040863037, + "learning_rate": 2.988487052018062e-05, + "loss": 1.8717, + "step": 20265 + }, + { + "epoch": 0.039492286541075484, + "grad_norm": 3.5919086933135986, + "learning_rate": 2.9884700239923467e-05, + "loss": 1.8182, + "step": 20280 + }, + { + "epoch": 0.039521496812185745, + "grad_norm": 1.9253063201904297, + "learning_rate": 2.988452983432016e-05, + "loss": 1.8067, + "step": 20295 + }, + { + "epoch": 0.039550707083296006, + "grad_norm": 3.168278217315674, + "learning_rate": 2.9884359303372127e-05, + "loss": 1.9619, + "step": 20310 + }, + { + "epoch": 0.039579917354406274, + "grad_norm": 2.574296236038208, + "learning_rate": 2.9884188647080816e-05, + "loss": 1.9037, + "step": 20325 + }, + { + "epoch": 0.039609127625516535, + "grad_norm": 3.87908673286438, + "learning_rate": 2.9884017865447657e-05, + "loss": 1.8106, + "step": 20340 + }, + { + "epoch": 0.039638337896626796, + "grad_norm": 3.060088872909546, + "learning_rate": 2.9883846958474093e-05, + "loss": 1.8878, + "step": 20355 + }, + { + "epoch": 0.039667548167737064, + "grad_norm": 2.6092071533203125, + "learning_rate": 2.988367592616156e-05, + "loss": 2.0189, + "step": 20370 + }, + { + "epoch": 0.039696758438847325, + "grad_norm": 2.688831329345703, + "learning_rate": 2.9883504768511496e-05, + "loss": 1.9439, + "step": 20385 + }, + { + "epoch": 0.039725968709957586, + "grad_norm": 4.126415252685547, + "learning_rate": 2.988333348552535e-05, + "loss": 1.7334, + "step": 20400 + }, + { + "epoch": 0.03975517898106785, + "grad_norm": 2.5536105632781982, + "learning_rate": 2.988316207720455e-05, + "loss": 1.8741, + "step": 20415 + }, + { + "epoch": 0.039784389252178115, + "grad_norm": 4.146584510803223, + "learning_rate": 2.9882990543550557e-05, + "loss": 1.839, + "step": 20430 + }, + { + "epoch": 0.039813599523288376, + "grad_norm": 4.531203746795654, + "learning_rate": 2.9882818884564805e-05, + "loss": 1.801, + "step": 20445 + }, + { + "epoch": 0.03984280979439864, + "grad_norm": 4.3206562995910645, + "learning_rate": 2.988264710024874e-05, + "loss": 1.9127, + "step": 20460 + }, + { + "epoch": 0.0398720200655089, + "grad_norm": 2.258627414703369, + "learning_rate": 2.9882475190603815e-05, + "loss": 1.9329, + "step": 20475 + }, + { + "epoch": 0.039901230336619166, + "grad_norm": 2.874171018600464, + "learning_rate": 2.988230315563147e-05, + "loss": 1.8632, + "step": 20490 + }, + { + "epoch": 0.03993044060772943, + "grad_norm": 2.2248058319091797, + "learning_rate": 2.988213099533316e-05, + "loss": 1.7653, + "step": 20505 + }, + { + "epoch": 0.03995965087883969, + "grad_norm": 2.2211756706237793, + "learning_rate": 2.988195870971033e-05, + "loss": 2.0817, + "step": 20520 + }, + { + "epoch": 0.039988861149949956, + "grad_norm": 2.266099214553833, + "learning_rate": 2.9881786298764432e-05, + "loss": 2.0993, + "step": 20535 + }, + { + "epoch": 0.04001807142106022, + "grad_norm": 3.2420692443847656, + "learning_rate": 2.988161376249692e-05, + "loss": 1.8448, + "step": 20550 + }, + { + "epoch": 0.04004728169217048, + "grad_norm": 2.1219029426574707, + "learning_rate": 2.9881441100909244e-05, + "loss": 1.9547, + "step": 20565 + }, + { + "epoch": 0.04007649196328074, + "grad_norm": 4.585850238800049, + "learning_rate": 2.988126831400286e-05, + "loss": 1.9619, + "step": 20580 + }, + { + "epoch": 0.04010570223439101, + "grad_norm": 3.4670188426971436, + "learning_rate": 2.9881095401779224e-05, + "loss": 2.0, + "step": 20595 + }, + { + "epoch": 0.04013491250550127, + "grad_norm": 2.0285186767578125, + "learning_rate": 2.9880922364239787e-05, + "loss": 1.9434, + "step": 20610 + }, + { + "epoch": 0.04016412277661153, + "grad_norm": 1.873279094696045, + "learning_rate": 2.9880749201386014e-05, + "loss": 1.8546, + "step": 20625 + }, + { + "epoch": 0.040193333047721796, + "grad_norm": 2.1878137588500977, + "learning_rate": 2.9880575913219354e-05, + "loss": 1.8624, + "step": 20640 + }, + { + "epoch": 0.04022254331883206, + "grad_norm": 3.9366180896759033, + "learning_rate": 2.988040249974128e-05, + "loss": 1.9143, + "step": 20655 + }, + { + "epoch": 0.04025175358994232, + "grad_norm": 2.3610141277313232, + "learning_rate": 2.9880228960953236e-05, + "loss": 1.9616, + "step": 20670 + }, + { + "epoch": 0.04028096386105258, + "grad_norm": 2.8658604621887207, + "learning_rate": 2.9880055296856695e-05, + "loss": 1.7601, + "step": 20685 + }, + { + "epoch": 0.04031017413216285, + "grad_norm": 4.454057216644287, + "learning_rate": 2.9879881507453112e-05, + "loss": 1.7578, + "step": 20700 + }, + { + "epoch": 0.04033938440327311, + "grad_norm": 2.244455575942993, + "learning_rate": 2.9879707592743957e-05, + "loss": 1.8334, + "step": 20715 + }, + { + "epoch": 0.04036859467438337, + "grad_norm": 1.61302649974823, + "learning_rate": 2.987953355273069e-05, + "loss": 1.9746, + "step": 20730 + }, + { + "epoch": 0.04039780494549364, + "grad_norm": 2.8323678970336914, + "learning_rate": 2.987935938741478e-05, + "loss": 2.1251, + "step": 20745 + }, + { + "epoch": 0.0404270152166039, + "grad_norm": 2.418241262435913, + "learning_rate": 2.987918509679769e-05, + "loss": 2.0798, + "step": 20760 + }, + { + "epoch": 0.04045622548771416, + "grad_norm": 3.465550661087036, + "learning_rate": 2.987901068088089e-05, + "loss": 1.8706, + "step": 20775 + }, + { + "epoch": 0.04048543575882442, + "grad_norm": 4.148263931274414, + "learning_rate": 2.987883613966585e-05, + "loss": 1.7925, + "step": 20790 + }, + { + "epoch": 0.04051464602993469, + "grad_norm": 2.4039666652679443, + "learning_rate": 2.9878661473154037e-05, + "loss": 1.8672, + "step": 20805 + }, + { + "epoch": 0.04054385630104495, + "grad_norm": 2.12880277633667, + "learning_rate": 2.9878486681346923e-05, + "loss": 1.8978, + "step": 20820 + }, + { + "epoch": 0.04057306657215521, + "grad_norm": 1.9612979888916016, + "learning_rate": 2.987831176424598e-05, + "loss": 1.9175, + "step": 20835 + }, + { + "epoch": 0.04060227684326548, + "grad_norm": 2.161982297897339, + "learning_rate": 2.9878136721852682e-05, + "loss": 1.7752, + "step": 20850 + }, + { + "epoch": 0.04063148711437574, + "grad_norm": 4.4879961013793945, + "learning_rate": 2.9877961554168498e-05, + "loss": 2.0857, + "step": 20865 + }, + { + "epoch": 0.040660697385486, + "grad_norm": 4.1571364402771, + "learning_rate": 2.9877786261194914e-05, + "loss": 1.949, + "step": 20880 + }, + { + "epoch": 0.04068990765659626, + "grad_norm": 3.3120033740997314, + "learning_rate": 2.9877610842933397e-05, + "loss": 1.8585, + "step": 20895 + }, + { + "epoch": 0.04071911792770653, + "grad_norm": 3.193117618560791, + "learning_rate": 2.9877435299385424e-05, + "loss": 1.731, + "step": 20910 + }, + { + "epoch": 0.04074832819881679, + "grad_norm": 2.375343084335327, + "learning_rate": 2.987725963055248e-05, + "loss": 1.8269, + "step": 20925 + }, + { + "epoch": 0.04077753846992705, + "grad_norm": 2.3607242107391357, + "learning_rate": 2.9877083836436036e-05, + "loss": 1.8305, + "step": 20940 + }, + { + "epoch": 0.04080674874103731, + "grad_norm": 3.0205342769622803, + "learning_rate": 2.987690791703758e-05, + "loss": 1.8631, + "step": 20955 + }, + { + "epoch": 0.04083595901214758, + "grad_norm": 3.530947685241699, + "learning_rate": 2.9876731872358585e-05, + "loss": 1.8431, + "step": 20970 + }, + { + "epoch": 0.04086516928325784, + "grad_norm": 2.8419220447540283, + "learning_rate": 2.987655570240054e-05, + "loss": 1.8519, + "step": 20985 + }, + { + "epoch": 0.0408943795543681, + "grad_norm": 2.253532886505127, + "learning_rate": 2.9876379407164933e-05, + "loss": 1.8688, + "step": 21000 + }, + { + "epoch": 0.04092358982547837, + "grad_norm": 1.9279251098632812, + "learning_rate": 2.987620298665324e-05, + "loss": 1.9111, + "step": 21015 + }, + { + "epoch": 0.04095280009658863, + "grad_norm": 3.0645790100097656, + "learning_rate": 2.987602644086695e-05, + "loss": 1.8359, + "step": 21030 + }, + { + "epoch": 0.04098201036769889, + "grad_norm": 2.9384896755218506, + "learning_rate": 2.9875849769807544e-05, + "loss": 1.747, + "step": 21045 + }, + { + "epoch": 0.04101122063880915, + "grad_norm": 3.7138500213623047, + "learning_rate": 2.987567297347652e-05, + "loss": 1.8528, + "step": 21060 + }, + { + "epoch": 0.04104043090991942, + "grad_norm": 4.440821170806885, + "learning_rate": 2.987549605187536e-05, + "loss": 1.9611, + "step": 21075 + }, + { + "epoch": 0.04106964118102968, + "grad_norm": 3.0378854274749756, + "learning_rate": 2.9875319005005552e-05, + "loss": 2.012, + "step": 21090 + }, + { + "epoch": 0.04109885145213994, + "grad_norm": 4.378201961517334, + "learning_rate": 2.9875141832868598e-05, + "loss": 1.7366, + "step": 21105 + }, + { + "epoch": 0.04112806172325021, + "grad_norm": 4.794327735900879, + "learning_rate": 2.9874964535465978e-05, + "loss": 1.8728, + "step": 21120 + }, + { + "epoch": 0.04115727199436047, + "grad_norm": 2.5137903690338135, + "learning_rate": 2.987478711279919e-05, + "loss": 1.9813, + "step": 21135 + }, + { + "epoch": 0.04118648226547073, + "grad_norm": 2.2024412155151367, + "learning_rate": 2.987460956486973e-05, + "loss": 1.7721, + "step": 21150 + }, + { + "epoch": 0.04121569253658099, + "grad_norm": 3.0043609142303467, + "learning_rate": 2.987443189167909e-05, + "loss": 1.9015, + "step": 21165 + }, + { + "epoch": 0.04124490280769126, + "grad_norm": 4.627270698547363, + "learning_rate": 2.9874254093228763e-05, + "loss": 1.9487, + "step": 21180 + }, + { + "epoch": 0.04127411307880152, + "grad_norm": 3.208395481109619, + "learning_rate": 2.987407616952025e-05, + "loss": 1.6989, + "step": 21195 + }, + { + "epoch": 0.04130332334991178, + "grad_norm": 4.102930545806885, + "learning_rate": 2.9873898120555055e-05, + "loss": 2.0639, + "step": 21210 + }, + { + "epoch": 0.04133253362102205, + "grad_norm": 3.846593141555786, + "learning_rate": 2.987371994633467e-05, + "loss": 1.7067, + "step": 21225 + }, + { + "epoch": 0.04136174389213231, + "grad_norm": 3.6651105880737305, + "learning_rate": 2.9873541646860597e-05, + "loss": 1.8983, + "step": 21240 + }, + { + "epoch": 0.04139095416324257, + "grad_norm": 3.715604543685913, + "learning_rate": 2.987336322213434e-05, + "loss": 1.6676, + "step": 21255 + }, + { + "epoch": 0.041420164434352834, + "grad_norm": 3.0780601501464844, + "learning_rate": 2.9873184672157395e-05, + "loss": 1.9342, + "step": 21270 + }, + { + "epoch": 0.0414493747054631, + "grad_norm": 3.385103225708008, + "learning_rate": 2.9873005996931274e-05, + "loss": 1.9494, + "step": 21285 + }, + { + "epoch": 0.04147858497657336, + "grad_norm": 2.9652836322784424, + "learning_rate": 2.9872827196457475e-05, + "loss": 1.8491, + "step": 21300 + }, + { + "epoch": 0.041507795247683624, + "grad_norm": 2.346210479736328, + "learning_rate": 2.9872648270737507e-05, + "loss": 1.7948, + "step": 21315 + }, + { + "epoch": 0.04153700551879389, + "grad_norm": 2.0421793460845947, + "learning_rate": 2.9872469219772877e-05, + "loss": 1.7642, + "step": 21330 + }, + { + "epoch": 0.04156621578990415, + "grad_norm": 3.2347426414489746, + "learning_rate": 2.9872290043565094e-05, + "loss": 1.9741, + "step": 21345 + }, + { + "epoch": 0.041595426061014414, + "grad_norm": 2.8749160766601562, + "learning_rate": 2.987211074211566e-05, + "loss": 1.8364, + "step": 21360 + }, + { + "epoch": 0.041624636332124675, + "grad_norm": 3.484539031982422, + "learning_rate": 2.9871931315426094e-05, + "loss": 1.7097, + "step": 21375 + }, + { + "epoch": 0.04165384660323494, + "grad_norm": 3.105286121368408, + "learning_rate": 2.98717517634979e-05, + "loss": 1.8239, + "step": 21390 + }, + { + "epoch": 0.0416830568743452, + "grad_norm": 3.804901361465454, + "learning_rate": 2.9871572086332594e-05, + "loss": 1.7356, + "step": 21405 + }, + { + "epoch": 0.041712267145455464, + "grad_norm": 2.4536283016204834, + "learning_rate": 2.9871392283931686e-05, + "loss": 1.8601, + "step": 21420 + }, + { + "epoch": 0.04174147741656573, + "grad_norm": 2.8864688873291016, + "learning_rate": 2.9871212356296697e-05, + "loss": 1.802, + "step": 21435 + }, + { + "epoch": 0.04177068768767599, + "grad_norm": 3.4735238552093506, + "learning_rate": 2.9871032303429133e-05, + "loss": 1.9557, + "step": 21450 + }, + { + "epoch": 0.041799897958786254, + "grad_norm": 4.21823263168335, + "learning_rate": 2.9870852125330513e-05, + "loss": 1.8513, + "step": 21465 + }, + { + "epoch": 0.041829108229896515, + "grad_norm": 1.6568303108215332, + "learning_rate": 2.9870671822002357e-05, + "loss": 1.8443, + "step": 21480 + }, + { + "epoch": 0.04185831850100678, + "grad_norm": 3.8471972942352295, + "learning_rate": 2.9870491393446184e-05, + "loss": 1.8831, + "step": 21495 + }, + { + "epoch": 0.041887528772117044, + "grad_norm": 4.5181803703308105, + "learning_rate": 2.987031083966351e-05, + "loss": 2.0743, + "step": 21510 + }, + { + "epoch": 0.041916739043227305, + "grad_norm": 2.7604262828826904, + "learning_rate": 2.987013016065586e-05, + "loss": 1.7647, + "step": 21525 + }, + { + "epoch": 0.041945949314337566, + "grad_norm": 1.6995900869369507, + "learning_rate": 2.986994935642475e-05, + "loss": 1.9221, + "step": 21540 + }, + { + "epoch": 0.041975159585447834, + "grad_norm": 3.400073766708374, + "learning_rate": 2.9869768426971706e-05, + "loss": 1.7217, + "step": 21555 + }, + { + "epoch": 0.042004369856558095, + "grad_norm": 2.7783281803131104, + "learning_rate": 2.986958737229825e-05, + "loss": 1.8149, + "step": 21570 + }, + { + "epoch": 0.042033580127668356, + "grad_norm": 1.9334073066711426, + "learning_rate": 2.9869406192405904e-05, + "loss": 1.7657, + "step": 21585 + }, + { + "epoch": 0.042062790398778624, + "grad_norm": 3.244271993637085, + "learning_rate": 2.9869224887296205e-05, + "loss": 1.8122, + "step": 21600 + }, + { + "epoch": 0.042092000669888885, + "grad_norm": 3.9582252502441406, + "learning_rate": 2.9869043456970662e-05, + "loss": 1.8296, + "step": 21615 + }, + { + "epoch": 0.042121210940999146, + "grad_norm": 3.9118635654449463, + "learning_rate": 2.986886190143082e-05, + "loss": 1.9162, + "step": 21630 + }, + { + "epoch": 0.04215042121210941, + "grad_norm": 3.3247733116149902, + "learning_rate": 2.9868680220678198e-05, + "loss": 1.9345, + "step": 21645 + }, + { + "epoch": 0.042179631483219675, + "grad_norm": 4.189952373504639, + "learning_rate": 2.9868498414714332e-05, + "loss": 1.7394, + "step": 21660 + }, + { + "epoch": 0.042208841754329936, + "grad_norm": 4.380448818206787, + "learning_rate": 2.986831648354075e-05, + "loss": 1.8411, + "step": 21675 + }, + { + "epoch": 0.0422380520254402, + "grad_norm": 3.3280911445617676, + "learning_rate": 2.986813442715898e-05, + "loss": 1.7338, + "step": 21690 + }, + { + "epoch": 0.042267262296550465, + "grad_norm": 2.147976875305176, + "learning_rate": 2.9867952245570557e-05, + "loss": 1.7418, + "step": 21705 + }, + { + "epoch": 0.042296472567660726, + "grad_norm": 4.927828788757324, + "learning_rate": 2.9867769938777025e-05, + "loss": 1.9729, + "step": 21720 + }, + { + "epoch": 0.04232568283877099, + "grad_norm": 2.1555793285369873, + "learning_rate": 2.9867587506779903e-05, + "loss": 1.9733, + "step": 21735 + }, + { + "epoch": 0.04235489310988125, + "grad_norm": 4.826751232147217, + "learning_rate": 2.986740494958074e-05, + "loss": 1.9141, + "step": 21750 + }, + { + "epoch": 0.042384103380991515, + "grad_norm": 2.62713360786438, + "learning_rate": 2.986722226718107e-05, + "loss": 1.7623, + "step": 21765 + }, + { + "epoch": 0.042413313652101776, + "grad_norm": 2.9087536334991455, + "learning_rate": 2.9867039459582422e-05, + "loss": 2.0298, + "step": 21780 + }, + { + "epoch": 0.04244252392321204, + "grad_norm": 3.2233824729919434, + "learning_rate": 2.986685652678635e-05, + "loss": 1.8063, + "step": 21795 + }, + { + "epoch": 0.042471734194322305, + "grad_norm": 3.8226282596588135, + "learning_rate": 2.9866673468794392e-05, + "loss": 1.8173, + "step": 21810 + }, + { + "epoch": 0.042500944465432566, + "grad_norm": 2.362210273742676, + "learning_rate": 2.986649028560808e-05, + "loss": 2.0107, + "step": 21825 + }, + { + "epoch": 0.04253015473654283, + "grad_norm": 1.4686235189437866, + "learning_rate": 2.9866306977228964e-05, + "loss": 1.8339, + "step": 21840 + }, + { + "epoch": 0.04255936500765309, + "grad_norm": 3.2174501419067383, + "learning_rate": 2.9866123543658585e-05, + "loss": 1.9372, + "step": 21855 + }, + { + "epoch": 0.042588575278763356, + "grad_norm": 4.215010643005371, + "learning_rate": 2.9865939984898494e-05, + "loss": 1.7492, + "step": 21870 + }, + { + "epoch": 0.04261778554987362, + "grad_norm": 6.015155792236328, + "learning_rate": 2.9865756300950224e-05, + "loss": 1.8146, + "step": 21885 + }, + { + "epoch": 0.04264699582098388, + "grad_norm": 3.102923631668091, + "learning_rate": 2.9865572491815336e-05, + "loss": 1.9206, + "step": 21900 + }, + { + "epoch": 0.042676206092094146, + "grad_norm": 3.5606796741485596, + "learning_rate": 2.986538855749537e-05, + "loss": 1.7442, + "step": 21915 + }, + { + "epoch": 0.04270541636320441, + "grad_norm": 3.877696990966797, + "learning_rate": 2.9865204497991874e-05, + "loss": 1.8356, + "step": 21930 + }, + { + "epoch": 0.04273462663431467, + "grad_norm": 2.7707462310791016, + "learning_rate": 2.98650203133064e-05, + "loss": 2.1114, + "step": 21945 + }, + { + "epoch": 0.04276383690542493, + "grad_norm": 3.8471269607543945, + "learning_rate": 2.9864836003440496e-05, + "loss": 1.974, + "step": 21960 + }, + { + "epoch": 0.0427930471765352, + "grad_norm": 4.265331268310547, + "learning_rate": 2.9864651568395728e-05, + "loss": 1.7851, + "step": 21975 + }, + { + "epoch": 0.04282225744764546, + "grad_norm": 3.578641414642334, + "learning_rate": 2.986446700817363e-05, + "loss": 1.7996, + "step": 21990 + }, + { + "epoch": 0.04285146771875572, + "grad_norm": 3.289726495742798, + "learning_rate": 2.9864282322775768e-05, + "loss": 1.7165, + "step": 22005 + }, + { + "epoch": 0.04288067798986598, + "grad_norm": 1.883353590965271, + "learning_rate": 2.986409751220369e-05, + "loss": 1.8633, + "step": 22020 + }, + { + "epoch": 0.04290988826097625, + "grad_norm": 1.7199418544769287, + "learning_rate": 2.9863912576458955e-05, + "loss": 1.7696, + "step": 22035 + }, + { + "epoch": 0.04293909853208651, + "grad_norm": 3.445349931716919, + "learning_rate": 2.986372751554313e-05, + "loss": 1.8349, + "step": 22050 + }, + { + "epoch": 0.04296830880319677, + "grad_norm": 2.827427387237549, + "learning_rate": 2.986354232945776e-05, + "loss": 1.8647, + "step": 22065 + }, + { + "epoch": 0.04299751907430704, + "grad_norm": 2.4225332736968994, + "learning_rate": 2.986335701820441e-05, + "loss": 2.0951, + "step": 22080 + }, + { + "epoch": 0.0430267293454173, + "grad_norm": 2.9156410694122314, + "learning_rate": 2.986317158178464e-05, + "loss": 1.7989, + "step": 22095 + }, + { + "epoch": 0.04305593961652756, + "grad_norm": 2.3038766384124756, + "learning_rate": 2.986298602020001e-05, + "loss": 1.8867, + "step": 22110 + }, + { + "epoch": 0.04308514988763782, + "grad_norm": 3.085129976272583, + "learning_rate": 2.986280033345209e-05, + "loss": 1.6969, + "step": 22125 + }, + { + "epoch": 0.04311436015874809, + "grad_norm": 4.071490287780762, + "learning_rate": 2.986261452154243e-05, + "loss": 1.7865, + "step": 22140 + }, + { + "epoch": 0.04314357042985835, + "grad_norm": 2.2542295455932617, + "learning_rate": 2.986242858447261e-05, + "loss": 1.8687, + "step": 22155 + }, + { + "epoch": 0.04317278070096861, + "grad_norm": 3.3869330883026123, + "learning_rate": 2.9862242522244183e-05, + "loss": 1.8037, + "step": 22170 + }, + { + "epoch": 0.04320199097207888, + "grad_norm": 3.7993576526641846, + "learning_rate": 2.9862056334858727e-05, + "loss": 1.8819, + "step": 22185 + }, + { + "epoch": 0.04323120124318914, + "grad_norm": 3.019289255142212, + "learning_rate": 2.9861870022317798e-05, + "loss": 1.8932, + "step": 22200 + }, + { + "epoch": 0.0432604115142994, + "grad_norm": 2.831664562225342, + "learning_rate": 2.9861683584622976e-05, + "loss": 1.8813, + "step": 22215 + }, + { + "epoch": 0.04328962178540966, + "grad_norm": 4.9506754875183105, + "learning_rate": 2.9861497021775825e-05, + "loss": 1.7917, + "step": 22230 + }, + { + "epoch": 0.04331883205651993, + "grad_norm": 2.384033203125, + "learning_rate": 2.986131033377792e-05, + "loss": 2.0226, + "step": 22245 + }, + { + "epoch": 0.04334804232763019, + "grad_norm": 1.8661621809005737, + "learning_rate": 2.9861123520630828e-05, + "loss": 1.817, + "step": 22260 + }, + { + "epoch": 0.04337725259874045, + "grad_norm": 3.5283803939819336, + "learning_rate": 2.9860936582336123e-05, + "loss": 1.806, + "step": 22275 + }, + { + "epoch": 0.04340646286985072, + "grad_norm": 4.240048408508301, + "learning_rate": 2.9860749518895386e-05, + "loss": 1.7773, + "step": 22290 + }, + { + "epoch": 0.04343567314096098, + "grad_norm": 3.9326212406158447, + "learning_rate": 2.986056233031018e-05, + "loss": 1.8494, + "step": 22305 + }, + { + "epoch": 0.04346488341207124, + "grad_norm": 4.390437602996826, + "learning_rate": 2.986037501658209e-05, + "loss": 1.7608, + "step": 22320 + }, + { + "epoch": 0.0434940936831815, + "grad_norm": 2.5305089950561523, + "learning_rate": 2.98601875777127e-05, + "loss": 1.7295, + "step": 22335 + }, + { + "epoch": 0.04352330395429177, + "grad_norm": 2.4276509284973145, + "learning_rate": 2.9860000013703576e-05, + "loss": 2.0082, + "step": 22350 + }, + { + "epoch": 0.04355251422540203, + "grad_norm": 3.4239320755004883, + "learning_rate": 2.9859812324556298e-05, + "loss": 1.6979, + "step": 22365 + }, + { + "epoch": 0.04358172449651229, + "grad_norm": 2.544837474822998, + "learning_rate": 2.985962451027245e-05, + "loss": 1.7844, + "step": 22380 + }, + { + "epoch": 0.04361093476762256, + "grad_norm": 5.094317436218262, + "learning_rate": 2.985943657085362e-05, + "loss": 1.8792, + "step": 22395 + }, + { + "epoch": 0.04364014503873282, + "grad_norm": 3.235121250152588, + "learning_rate": 2.985924850630138e-05, + "loss": 1.7795, + "step": 22410 + }, + { + "epoch": 0.04366935530984308, + "grad_norm": 4.88664436340332, + "learning_rate": 2.9859060316617325e-05, + "loss": 1.9035, + "step": 22425 + }, + { + "epoch": 0.04369856558095334, + "grad_norm": 2.401301622390747, + "learning_rate": 2.9858872001803025e-05, + "loss": 1.9182, + "step": 22440 + }, + { + "epoch": 0.04372777585206361, + "grad_norm": 2.24088978767395, + "learning_rate": 2.9858683561860077e-05, + "loss": 1.688, + "step": 22455 + }, + { + "epoch": 0.04375698612317387, + "grad_norm": 2.521176338195801, + "learning_rate": 2.9858494996790065e-05, + "loss": 1.962, + "step": 22470 + }, + { + "epoch": 0.04378619639428413, + "grad_norm": 2.897905111312866, + "learning_rate": 2.9858306306594578e-05, + "loss": 1.7388, + "step": 22485 + }, + { + "epoch": 0.0438154066653944, + "grad_norm": 2.3301055431365967, + "learning_rate": 2.9858117491275204e-05, + "loss": 1.8834, + "step": 22500 + }, + { + "epoch": 0.04384461693650466, + "grad_norm": 2.078205108642578, + "learning_rate": 2.9857928550833533e-05, + "loss": 1.8959, + "step": 22515 + }, + { + "epoch": 0.04387382720761492, + "grad_norm": 2.8938355445861816, + "learning_rate": 2.9857739485271153e-05, + "loss": 1.8529, + "step": 22530 + }, + { + "epoch": 0.04390303747872518, + "grad_norm": 2.0363752841949463, + "learning_rate": 2.9857550294589663e-05, + "loss": 1.9974, + "step": 22545 + }, + { + "epoch": 0.04393224774983545, + "grad_norm": 3.58372163772583, + "learning_rate": 2.9857360978790647e-05, + "loss": 1.7921, + "step": 22560 + }, + { + "epoch": 0.04396145802094571, + "grad_norm": 3.4086432456970215, + "learning_rate": 2.985717153787571e-05, + "loss": 1.8401, + "step": 22575 + }, + { + "epoch": 0.04399066829205597, + "grad_norm": 2.3264150619506836, + "learning_rate": 2.985698197184644e-05, + "loss": 1.8789, + "step": 22590 + }, + { + "epoch": 0.044019878563166234, + "grad_norm": 2.727571964263916, + "learning_rate": 2.9856792280704435e-05, + "loss": 1.7859, + "step": 22605 + }, + { + "epoch": 0.0440490888342765, + "grad_norm": 3.833146095275879, + "learning_rate": 2.9856602464451293e-05, + "loss": 1.9976, + "step": 22620 + }, + { + "epoch": 0.04407829910538676, + "grad_norm": 1.8543720245361328, + "learning_rate": 2.9856412523088612e-05, + "loss": 1.8875, + "step": 22635 + }, + { + "epoch": 0.044107509376497024, + "grad_norm": 2.6003103256225586, + "learning_rate": 2.9856222456617993e-05, + "loss": 2.0006, + "step": 22650 + }, + { + "epoch": 0.04413671964760729, + "grad_norm": 3.0136611461639404, + "learning_rate": 2.9856032265041035e-05, + "loss": 1.8309, + "step": 22665 + }, + { + "epoch": 0.04416592991871755, + "grad_norm": 1.948042631149292, + "learning_rate": 2.9855841948359337e-05, + "loss": 1.8721, + "step": 22680 + }, + { + "epoch": 0.044195140189827814, + "grad_norm": 3.1908979415893555, + "learning_rate": 2.9855651506574507e-05, + "loss": 1.8373, + "step": 22695 + }, + { + "epoch": 0.044224350460938075, + "grad_norm": 3.519826889038086, + "learning_rate": 2.985546093968815e-05, + "loss": 1.8992, + "step": 22710 + }, + { + "epoch": 0.04425356073204834, + "grad_norm": 4.259973526000977, + "learning_rate": 2.985527024770186e-05, + "loss": 1.9466, + "step": 22725 + }, + { + "epoch": 0.044282771003158604, + "grad_norm": 2.69942569732666, + "learning_rate": 2.9855079430617253e-05, + "loss": 1.6805, + "step": 22740 + }, + { + "epoch": 0.044311981274268865, + "grad_norm": 3.0210318565368652, + "learning_rate": 2.9854888488435933e-05, + "loss": 1.8744, + "step": 22755 + }, + { + "epoch": 0.04434119154537913, + "grad_norm": 2.448091745376587, + "learning_rate": 2.9854697421159505e-05, + "loss": 1.8449, + "step": 22770 + }, + { + "epoch": 0.044370401816489394, + "grad_norm": 3.8297231197357178, + "learning_rate": 2.9854506228789586e-05, + "loss": 1.8071, + "step": 22785 + }, + { + "epoch": 0.044399612087599655, + "grad_norm": 4.065975189208984, + "learning_rate": 2.9854314911327777e-05, + "loss": 1.9335, + "step": 22800 + }, + { + "epoch": 0.044428822358709916, + "grad_norm": 2.0296108722686768, + "learning_rate": 2.9854123468775693e-05, + "loss": 1.748, + "step": 22815 + }, + { + "epoch": 0.044458032629820184, + "grad_norm": 1.7504347562789917, + "learning_rate": 2.985393190113495e-05, + "loss": 1.9084, + "step": 22830 + }, + { + "epoch": 0.044487242900930445, + "grad_norm": 2.8802502155303955, + "learning_rate": 2.9853740208407152e-05, + "loss": 1.8649, + "step": 22845 + }, + { + "epoch": 0.044516453172040706, + "grad_norm": 3.5376250743865967, + "learning_rate": 2.985354839059392e-05, + "loss": 1.7617, + "step": 22860 + }, + { + "epoch": 0.044545663443150973, + "grad_norm": 4.846216201782227, + "learning_rate": 2.985335644769687e-05, + "loss": 1.7531, + "step": 22875 + }, + { + "epoch": 0.044574873714261234, + "grad_norm": 1.9456549882888794, + "learning_rate": 2.9853164379717615e-05, + "loss": 1.9782, + "step": 22890 + }, + { + "epoch": 0.044604083985371495, + "grad_norm": 3.7463254928588867, + "learning_rate": 2.9852972186657774e-05, + "loss": 1.9347, + "step": 22905 + }, + { + "epoch": 0.044633294256481756, + "grad_norm": 1.9641201496124268, + "learning_rate": 2.9852779868518967e-05, + "loss": 1.8366, + "step": 22920 + }, + { + "epoch": 0.044662504527592024, + "grad_norm": 3.9136605262756348, + "learning_rate": 2.9852587425302812e-05, + "loss": 2.0271, + "step": 22935 + }, + { + "epoch": 0.044691714798702285, + "grad_norm": 2.161766767501831, + "learning_rate": 2.9852394857010923e-05, + "loss": 1.8681, + "step": 22950 + }, + { + "epoch": 0.044720925069812546, + "grad_norm": 3.1569862365722656, + "learning_rate": 2.9852202163644937e-05, + "loss": 1.7996, + "step": 22965 + }, + { + "epoch": 0.044750135340922814, + "grad_norm": 2.0586421489715576, + "learning_rate": 2.9852009345206458e-05, + "loss": 1.7727, + "step": 22980 + }, + { + "epoch": 0.044779345612033075, + "grad_norm": 5.425686359405518, + "learning_rate": 2.9851816401697127e-05, + "loss": 1.9209, + "step": 22995 + }, + { + "epoch": 0.044808555883143336, + "grad_norm": 5.385043621063232, + "learning_rate": 2.985162333311856e-05, + "loss": 1.8473, + "step": 23010 + }, + { + "epoch": 0.0448377661542536, + "grad_norm": 3.4577736854553223, + "learning_rate": 2.985143013947238e-05, + "loss": 1.848, + "step": 23025 + }, + { + "epoch": 0.044866976425363865, + "grad_norm": 3.0630152225494385, + "learning_rate": 2.985123682076022e-05, + "loss": 1.7272, + "step": 23040 + }, + { + "epoch": 0.044896186696474126, + "grad_norm": 3.8338427543640137, + "learning_rate": 2.985104337698371e-05, + "loss": 1.9042, + "step": 23055 + }, + { + "epoch": 0.04492539696758439, + "grad_norm": 3.1702969074249268, + "learning_rate": 2.985084980814447e-05, + "loss": 1.9389, + "step": 23070 + }, + { + "epoch": 0.04495460723869465, + "grad_norm": 2.758162021636963, + "learning_rate": 2.985065611424414e-05, + "loss": 1.8825, + "step": 23085 + }, + { + "epoch": 0.044983817509804916, + "grad_norm": 2.1373486518859863, + "learning_rate": 2.985046229528434e-05, + "loss": 1.9451, + "step": 23100 + }, + { + "epoch": 0.04501302778091518, + "grad_norm": 2.1302478313446045, + "learning_rate": 2.985026835126671e-05, + "loss": 2.0208, + "step": 23115 + }, + { + "epoch": 0.04504223805202544, + "grad_norm": 3.8777012825012207, + "learning_rate": 2.985007428219289e-05, + "loss": 2.0138, + "step": 23130 + }, + { + "epoch": 0.045071448323135706, + "grad_norm": 3.428769111633301, + "learning_rate": 2.9849880088064497e-05, + "loss": 1.9868, + "step": 23145 + }, + { + "epoch": 0.04510065859424597, + "grad_norm": 5.036014556884766, + "learning_rate": 2.9849685768883172e-05, + "loss": 1.7348, + "step": 23160 + }, + { + "epoch": 0.04512986886535623, + "grad_norm": 3.206932306289673, + "learning_rate": 2.9849491324650563e-05, + "loss": 1.9991, + "step": 23175 + }, + { + "epoch": 0.04515907913646649, + "grad_norm": 2.5311954021453857, + "learning_rate": 2.9849296755368297e-05, + "loss": 1.7501, + "step": 23190 + }, + { + "epoch": 0.04518828940757676, + "grad_norm": 2.1587252616882324, + "learning_rate": 2.984910206103801e-05, + "loss": 1.7981, + "step": 23205 + }, + { + "epoch": 0.04521749967868702, + "grad_norm": 2.198859453201294, + "learning_rate": 2.984890724166135e-05, + "loss": 1.9349, + "step": 23220 + }, + { + "epoch": 0.04524670994979728, + "grad_norm": 3.513099193572998, + "learning_rate": 2.9848712297239955e-05, + "loss": 1.7639, + "step": 23235 + }, + { + "epoch": 0.045275920220907546, + "grad_norm": 4.66519021987915, + "learning_rate": 2.984851722777546e-05, + "loss": 1.6865, + "step": 23250 + }, + { + "epoch": 0.04530513049201781, + "grad_norm": 2.0970771312713623, + "learning_rate": 2.9848322033269523e-05, + "loss": 1.8861, + "step": 23265 + }, + { + "epoch": 0.04533434076312807, + "grad_norm": 4.110560417175293, + "learning_rate": 2.984812671372377e-05, + "loss": 1.8482, + "step": 23280 + }, + { + "epoch": 0.04536355103423833, + "grad_norm": 3.2896182537078857, + "learning_rate": 2.9847931269139854e-05, + "loss": 1.9021, + "step": 23295 + }, + { + "epoch": 0.0453927613053486, + "grad_norm": 1.929777979850769, + "learning_rate": 2.9847735699519423e-05, + "loss": 1.9236, + "step": 23310 + }, + { + "epoch": 0.04542197157645886, + "grad_norm": 2.989177703857422, + "learning_rate": 2.9847540004864115e-05, + "loss": 1.8256, + "step": 23325 + }, + { + "epoch": 0.04545118184756912, + "grad_norm": 2.735318899154663, + "learning_rate": 2.984734418517559e-05, + "loss": 1.8279, + "step": 23340 + }, + { + "epoch": 0.04548039211867939, + "grad_norm": 2.597491502761841, + "learning_rate": 2.9847148240455495e-05, + "loss": 1.8952, + "step": 23355 + }, + { + "epoch": 0.04550960238978965, + "grad_norm": 3.5635156631469727, + "learning_rate": 2.9846952170705473e-05, + "loss": 1.7302, + "step": 23370 + }, + { + "epoch": 0.04553881266089991, + "grad_norm": 6.853522777557373, + "learning_rate": 2.9846755975927174e-05, + "loss": 2.0572, + "step": 23385 + }, + { + "epoch": 0.04556802293201017, + "grad_norm": 2.006582498550415, + "learning_rate": 2.984655965612226e-05, + "loss": 1.939, + "step": 23400 + }, + { + "epoch": 0.04559723320312044, + "grad_norm": 3.6737654209136963, + "learning_rate": 2.984636321129238e-05, + "loss": 1.8714, + "step": 23415 + }, + { + "epoch": 0.0456264434742307, + "grad_norm": 2.5802881717681885, + "learning_rate": 2.9846166641439185e-05, + "loss": 1.8687, + "step": 23430 + }, + { + "epoch": 0.04565565374534096, + "grad_norm": 3.748178005218506, + "learning_rate": 2.984596994656433e-05, + "loss": 1.9146, + "step": 23445 + }, + { + "epoch": 0.04568486401645123, + "grad_norm": 2.5193021297454834, + "learning_rate": 2.9845773126669475e-05, + "loss": 1.8845, + "step": 23460 + }, + { + "epoch": 0.04571407428756149, + "grad_norm": 2.3217403888702393, + "learning_rate": 2.984557618175628e-05, + "loss": 1.9149, + "step": 23475 + }, + { + "epoch": 0.04574328455867175, + "grad_norm": 3.9094045162200928, + "learning_rate": 2.98453791118264e-05, + "loss": 1.8615, + "step": 23490 + }, + { + "epoch": 0.04577249482978201, + "grad_norm": 2.190436840057373, + "learning_rate": 2.9845181916881495e-05, + "loss": 1.8181, + "step": 23505 + }, + { + "epoch": 0.04580170510089228, + "grad_norm": 3.33785343170166, + "learning_rate": 2.984498459692322e-05, + "loss": 1.9345, + "step": 23520 + }, + { + "epoch": 0.04583091537200254, + "grad_norm": 4.065570831298828, + "learning_rate": 2.9844787151953242e-05, + "loss": 1.7801, + "step": 23535 + }, + { + "epoch": 0.0458601256431128, + "grad_norm": 3.4585118293762207, + "learning_rate": 2.984458958197323e-05, + "loss": 1.8181, + "step": 23550 + }, + { + "epoch": 0.04588933591422307, + "grad_norm": 2.1615355014801025, + "learning_rate": 2.984439188698484e-05, + "loss": 1.8265, + "step": 23565 + }, + { + "epoch": 0.04591854618533333, + "grad_norm": 3.9168038368225098, + "learning_rate": 2.9844194066989737e-05, + "loss": 1.8508, + "step": 23580 + }, + { + "epoch": 0.04594775645644359, + "grad_norm": 2.436638593673706, + "learning_rate": 2.9843996121989587e-05, + "loss": 1.9247, + "step": 23595 + }, + { + "epoch": 0.04597696672755385, + "grad_norm": 3.637157678604126, + "learning_rate": 2.9843798051986053e-05, + "loss": 1.9791, + "step": 23610 + }, + { + "epoch": 0.04600617699866412, + "grad_norm": 2.9777755737304688, + "learning_rate": 2.9843599856980815e-05, + "loss": 1.8227, + "step": 23625 + }, + { + "epoch": 0.04603538726977438, + "grad_norm": 5.862057209014893, + "learning_rate": 2.9843401536975533e-05, + "loss": 1.9745, + "step": 23640 + }, + { + "epoch": 0.04606459754088464, + "grad_norm": 2.937347173690796, + "learning_rate": 2.9843203091971878e-05, + "loss": 1.7575, + "step": 23655 + }, + { + "epoch": 0.0460938078119949, + "grad_norm": 3.7695438861846924, + "learning_rate": 2.984300452197152e-05, + "loss": 1.9655, + "step": 23670 + }, + { + "epoch": 0.04612301808310517, + "grad_norm": 3.1122677326202393, + "learning_rate": 2.9842805826976137e-05, + "loss": 2.0459, + "step": 23685 + }, + { + "epoch": 0.04615222835421543, + "grad_norm": 4.503921031951904, + "learning_rate": 2.98426070069874e-05, + "loss": 1.8319, + "step": 23700 + }, + { + "epoch": 0.04618143862532569, + "grad_norm": 2.247570514678955, + "learning_rate": 2.9842408062006982e-05, + "loss": 1.765, + "step": 23715 + }, + { + "epoch": 0.04621064889643596, + "grad_norm": 3.512427806854248, + "learning_rate": 2.9842208992036554e-05, + "loss": 1.8118, + "step": 23730 + }, + { + "epoch": 0.04623985916754622, + "grad_norm": 3.174893379211426, + "learning_rate": 2.98420097970778e-05, + "loss": 1.8791, + "step": 23745 + }, + { + "epoch": 0.04626906943865648, + "grad_norm": 5.151320934295654, + "learning_rate": 2.9841810477132392e-05, + "loss": 1.817, + "step": 23760 + }, + { + "epoch": 0.04629827970976674, + "grad_norm": 2.6584534645080566, + "learning_rate": 2.984161103220201e-05, + "loss": 1.8441, + "step": 23775 + }, + { + "epoch": 0.04632748998087701, + "grad_norm": 1.967596173286438, + "learning_rate": 2.9841411462288335e-05, + "loss": 2.0349, + "step": 23790 + }, + { + "epoch": 0.04635670025198727, + "grad_norm": 3.7256250381469727, + "learning_rate": 2.9841211767393048e-05, + "loss": 1.8131, + "step": 23805 + }, + { + "epoch": 0.04638591052309753, + "grad_norm": 4.081748962402344, + "learning_rate": 2.9841011947517826e-05, + "loss": 1.8664, + "step": 23820 + }, + { + "epoch": 0.0464151207942078, + "grad_norm": 2.527451276779175, + "learning_rate": 2.984081200266436e-05, + "loss": 1.8916, + "step": 23835 + }, + { + "epoch": 0.04644433106531806, + "grad_norm": 2.159346103668213, + "learning_rate": 2.9840611932834326e-05, + "loss": 1.9254, + "step": 23850 + }, + { + "epoch": 0.04647354133642832, + "grad_norm": 2.636519432067871, + "learning_rate": 2.9840411738029412e-05, + "loss": 1.8062, + "step": 23865 + }, + { + "epoch": 0.046502751607538584, + "grad_norm": 2.3750388622283936, + "learning_rate": 2.9840211418251303e-05, + "loss": 1.7347, + "step": 23880 + }, + { + "epoch": 0.04653196187864885, + "grad_norm": 2.181845188140869, + "learning_rate": 2.9840010973501685e-05, + "loss": 1.9207, + "step": 23895 + }, + { + "epoch": 0.04656117214975911, + "grad_norm": 3.098555564880371, + "learning_rate": 2.9839810403782252e-05, + "loss": 1.9566, + "step": 23910 + }, + { + "epoch": 0.046590382420869374, + "grad_norm": 5.142632484436035, + "learning_rate": 2.9839609709094685e-05, + "loss": 1.8871, + "step": 23925 + }, + { + "epoch": 0.04661959269197964, + "grad_norm": 1.7162665128707886, + "learning_rate": 2.9839408889440678e-05, + "loss": 1.8784, + "step": 23940 + }, + { + "epoch": 0.0466488029630899, + "grad_norm": 3.743401050567627, + "learning_rate": 2.9839207944821925e-05, + "loss": 1.9179, + "step": 23955 + }, + { + "epoch": 0.046678013234200164, + "grad_norm": 2.952817440032959, + "learning_rate": 2.983900687524011e-05, + "loss": 1.9182, + "step": 23970 + }, + { + "epoch": 0.046707223505310425, + "grad_norm": 2.3484249114990234, + "learning_rate": 2.983880568069693e-05, + "loss": 1.8336, + "step": 23985 + }, + { + "epoch": 0.04673643377642069, + "grad_norm": 1.931921362876892, + "learning_rate": 2.9838604361194087e-05, + "loss": 1.771, + "step": 24000 + }, + { + "epoch": 0.04676564404753095, + "grad_norm": 5.197054386138916, + "learning_rate": 2.9838402916733263e-05, + "loss": 1.8558, + "step": 24015 + }, + { + "epoch": 0.046794854318641214, + "grad_norm": 2.862427234649658, + "learning_rate": 2.9838201347316164e-05, + "loss": 1.6956, + "step": 24030 + }, + { + "epoch": 0.04682406458975148, + "grad_norm": 2.6003293991088867, + "learning_rate": 2.9837999652944487e-05, + "loss": 2.0473, + "step": 24045 + }, + { + "epoch": 0.04685327486086174, + "grad_norm": 2.068455934524536, + "learning_rate": 2.9837797833619926e-05, + "loss": 1.9233, + "step": 24060 + }, + { + "epoch": 0.046882485131972004, + "grad_norm": 3.4722557067871094, + "learning_rate": 2.983759588934418e-05, + "loss": 2.0052, + "step": 24075 + }, + { + "epoch": 0.046911695403082265, + "grad_norm": 3.198732852935791, + "learning_rate": 2.9837393820118954e-05, + "loss": 1.906, + "step": 24090 + }, + { + "epoch": 0.04694090567419253, + "grad_norm": 2.5050504207611084, + "learning_rate": 2.983719162594595e-05, + "loss": 1.833, + "step": 24105 + }, + { + "epoch": 0.046970115945302794, + "grad_norm": 2.1438305377960205, + "learning_rate": 2.9836989306826866e-05, + "loss": 1.7439, + "step": 24120 + }, + { + "epoch": 0.046999326216413055, + "grad_norm": 1.5980552434921265, + "learning_rate": 2.983678686276341e-05, + "loss": 1.8018, + "step": 24135 + }, + { + "epoch": 0.047028536487523316, + "grad_norm": 2.566882610321045, + "learning_rate": 2.9836584293757282e-05, + "loss": 1.8807, + "step": 24150 + }, + { + "epoch": 0.047057746758633584, + "grad_norm": 4.125818252563477, + "learning_rate": 2.9836381599810196e-05, + "loss": 1.8198, + "step": 24165 + }, + { + "epoch": 0.047086957029743845, + "grad_norm": 3.1217780113220215, + "learning_rate": 2.983617878092385e-05, + "loss": 2.0202, + "step": 24180 + }, + { + "epoch": 0.047116167300854106, + "grad_norm": 3.001835823059082, + "learning_rate": 2.9835975837099956e-05, + "loss": 1.851, + "step": 24195 + }, + { + "epoch": 0.047145377571964374, + "grad_norm": 4.060051918029785, + "learning_rate": 2.9835772768340225e-05, + "loss": 1.8918, + "step": 24210 + }, + { + "epoch": 0.047174587843074635, + "grad_norm": 4.199372291564941, + "learning_rate": 2.9835569574646363e-05, + "loss": 1.848, + "step": 24225 + }, + { + "epoch": 0.047203798114184896, + "grad_norm": 3.1068427562713623, + "learning_rate": 2.9835366256020085e-05, + "loss": 1.793, + "step": 24240 + }, + { + "epoch": 0.04723300838529516, + "grad_norm": 3.30556321144104, + "learning_rate": 2.9835162812463098e-05, + "loss": 1.8914, + "step": 24255 + }, + { + "epoch": 0.047262218656405425, + "grad_norm": 3.898911476135254, + "learning_rate": 2.9834959243977123e-05, + "loss": 1.929, + "step": 24270 + }, + { + "epoch": 0.047291428927515686, + "grad_norm": 2.1169817447662354, + "learning_rate": 2.9834755550563865e-05, + "loss": 1.9513, + "step": 24285 + }, + { + "epoch": 0.04732063919862595, + "grad_norm": 3.0540499687194824, + "learning_rate": 2.9834551732225044e-05, + "loss": 1.8283, + "step": 24300 + }, + { + "epoch": 0.047349849469736215, + "grad_norm": 2.495234727859497, + "learning_rate": 2.9834347788962383e-05, + "loss": 1.8268, + "step": 24315 + }, + { + "epoch": 0.047379059740846476, + "grad_norm": 4.254831790924072, + "learning_rate": 2.9834143720777588e-05, + "loss": 1.8933, + "step": 24330 + }, + { + "epoch": 0.04740827001195674, + "grad_norm": 5.430483818054199, + "learning_rate": 2.9833939527672384e-05, + "loss": 1.8128, + "step": 24345 + }, + { + "epoch": 0.047437480283067, + "grad_norm": 3.332515239715576, + "learning_rate": 2.983373520964849e-05, + "loss": 1.9089, + "step": 24360 + }, + { + "epoch": 0.047466690554177265, + "grad_norm": 2.1588125228881836, + "learning_rate": 2.983353076670762e-05, + "loss": 1.7578, + "step": 24375 + }, + { + "epoch": 0.047495900825287526, + "grad_norm": 2.6773619651794434, + "learning_rate": 2.9833326198851503e-05, + "loss": 1.9486, + "step": 24390 + }, + { + "epoch": 0.04752511109639779, + "grad_norm": 2.855078935623169, + "learning_rate": 2.9833121506081862e-05, + "loss": 1.9339, + "step": 24405 + }, + { + "epoch": 0.047554321367508055, + "grad_norm": 2.539210557937622, + "learning_rate": 2.983291668840042e-05, + "loss": 1.8479, + "step": 24420 + }, + { + "epoch": 0.047583531638618316, + "grad_norm": 3.0605132579803467, + "learning_rate": 2.9832711745808895e-05, + "loss": 1.8924, + "step": 24435 + }, + { + "epoch": 0.04761274190972858, + "grad_norm": 2.142998218536377, + "learning_rate": 2.9832506678309025e-05, + "loss": 2.0331, + "step": 24450 + }, + { + "epoch": 0.04764195218083884, + "grad_norm": 3.082610845565796, + "learning_rate": 2.983230148590253e-05, + "loss": 1.8299, + "step": 24465 + }, + { + "epoch": 0.047671162451949106, + "grad_norm": 4.011190414428711, + "learning_rate": 2.9832096168591128e-05, + "loss": 1.8591, + "step": 24480 + }, + { + "epoch": 0.04770037272305937, + "grad_norm": 2.326756238937378, + "learning_rate": 2.983189072637657e-05, + "loss": 1.8687, + "step": 24495 + }, + { + "epoch": 0.04772958299416963, + "grad_norm": 2.1641993522644043, + "learning_rate": 2.9831685159260568e-05, + "loss": 1.9402, + "step": 24510 + }, + { + "epoch": 0.047758793265279896, + "grad_norm": 2.614429473876953, + "learning_rate": 2.983147946724486e-05, + "loss": 1.8911, + "step": 24525 + }, + { + "epoch": 0.04778800353639016, + "grad_norm": 2.163760185241699, + "learning_rate": 2.983127365033118e-05, + "loss": 1.7623, + "step": 24540 + }, + { + "epoch": 0.04781721380750042, + "grad_norm": 1.837815761566162, + "learning_rate": 2.9831067708521257e-05, + "loss": 2.0103, + "step": 24555 + }, + { + "epoch": 0.04784642407861068, + "grad_norm": 4.629458904266357, + "learning_rate": 2.9830861641816826e-05, + "loss": 1.8473, + "step": 24570 + }, + { + "epoch": 0.04787563434972095, + "grad_norm": 2.741942882537842, + "learning_rate": 2.9830655450219623e-05, + "loss": 1.8219, + "step": 24585 + }, + { + "epoch": 0.04790484462083121, + "grad_norm": 2.6273906230926514, + "learning_rate": 2.9830449133731387e-05, + "loss": 1.7688, + "step": 24600 + }, + { + "epoch": 0.04793405489194147, + "grad_norm": 2.2749216556549072, + "learning_rate": 2.983024269235385e-05, + "loss": 1.9809, + "step": 24615 + }, + { + "epoch": 0.04796326516305174, + "grad_norm": 4.1970367431640625, + "learning_rate": 2.9830036126088754e-05, + "loss": 1.7998, + "step": 24630 + }, + { + "epoch": 0.047992475434162, + "grad_norm": 2.87906813621521, + "learning_rate": 2.982982943493784e-05, + "loss": 1.8496, + "step": 24645 + }, + { + "epoch": 0.04802168570527226, + "grad_norm": 1.9954204559326172, + "learning_rate": 2.9829622618902848e-05, + "loss": 1.8782, + "step": 24660 + }, + { + "epoch": 0.04805089597638252, + "grad_norm": 3.3936777114868164, + "learning_rate": 2.982941567798551e-05, + "loss": 1.7, + "step": 24675 + }, + { + "epoch": 0.04808010624749279, + "grad_norm": 5.30580997467041, + "learning_rate": 2.9829208612187585e-05, + "loss": 1.8712, + "step": 24690 + }, + { + "epoch": 0.04810931651860305, + "grad_norm": 1.797080636024475, + "learning_rate": 2.982900142151081e-05, + "loss": 1.8711, + "step": 24705 + }, + { + "epoch": 0.04813852678971331, + "grad_norm": 1.7338413000106812, + "learning_rate": 2.9828794105956922e-05, + "loss": 1.9125, + "step": 24720 + }, + { + "epoch": 0.04816773706082357, + "grad_norm": 6.833633899688721, + "learning_rate": 2.9828586665527677e-05, + "loss": 1.8622, + "step": 24735 + }, + { + "epoch": 0.04819694733193384, + "grad_norm": 2.8855645656585693, + "learning_rate": 2.9828379100224814e-05, + "loss": 1.7596, + "step": 24750 + }, + { + "epoch": 0.0482261576030441, + "grad_norm": 3.7425286769866943, + "learning_rate": 2.982817141005009e-05, + "loss": 1.8452, + "step": 24765 + }, + { + "epoch": 0.04825536787415436, + "grad_norm": 4.187405109405518, + "learning_rate": 2.9827963595005248e-05, + "loss": 1.8669, + "step": 24780 + }, + { + "epoch": 0.04828457814526463, + "grad_norm": 4.476016998291016, + "learning_rate": 2.982775565509204e-05, + "loss": 1.9221, + "step": 24795 + }, + { + "epoch": 0.04831378841637489, + "grad_norm": 3.6991500854492188, + "learning_rate": 2.9827547590312213e-05, + "loss": 1.5434, + "step": 24810 + }, + { + "epoch": 0.04834299868748515, + "grad_norm": 3.0707576274871826, + "learning_rate": 2.9827339400667524e-05, + "loss": 1.88, + "step": 24825 + }, + { + "epoch": 0.04837220895859541, + "grad_norm": 2.604163646697998, + "learning_rate": 2.9827131086159723e-05, + "loss": 1.9946, + "step": 24840 + }, + { + "epoch": 0.04840141922970568, + "grad_norm": 4.7181549072265625, + "learning_rate": 2.9826922646790568e-05, + "loss": 1.8807, + "step": 24855 + }, + { + "epoch": 0.04843062950081594, + "grad_norm": 3.983146905899048, + "learning_rate": 2.9826714082561808e-05, + "loss": 1.8593, + "step": 24870 + }, + { + "epoch": 0.0484598397719262, + "grad_norm": 4.0342912673950195, + "learning_rate": 2.982650539347521e-05, + "loss": 1.9208, + "step": 24885 + }, + { + "epoch": 0.04848905004303647, + "grad_norm": 3.092644691467285, + "learning_rate": 2.982629657953252e-05, + "loss": 1.9611, + "step": 24900 + }, + { + "epoch": 0.04851826031414673, + "grad_norm": 2.244311809539795, + "learning_rate": 2.98260876407355e-05, + "loss": 1.7965, + "step": 24915 + }, + { + "epoch": 0.04854747058525699, + "grad_norm": 4.740140914916992, + "learning_rate": 2.9825878577085917e-05, + "loss": 2.0418, + "step": 24930 + }, + { + "epoch": 0.04857668085636725, + "grad_norm": 1.6131473779678345, + "learning_rate": 2.9825669388585523e-05, + "loss": 1.7781, + "step": 24945 + }, + { + "epoch": 0.04860589112747752, + "grad_norm": 3.3024797439575195, + "learning_rate": 2.9825460075236077e-05, + "loss": 1.9176, + "step": 24960 + }, + { + "epoch": 0.04863510139858778, + "grad_norm": 3.6958138942718506, + "learning_rate": 2.9825250637039348e-05, + "loss": 1.8613, + "step": 24975 + }, + { + "epoch": 0.04866431166969804, + "grad_norm": 2.277597427368164, + "learning_rate": 2.9825041073997102e-05, + "loss": 1.9942, + "step": 24990 + }, + { + "epoch": 0.04869352194080831, + "grad_norm": 3.3070085048675537, + "learning_rate": 2.9824831386111103e-05, + "loss": 1.5052, + "step": 25005 + }, + { + "epoch": 0.04872273221191857, + "grad_norm": 2.769148826599121, + "learning_rate": 2.9824621573383107e-05, + "loss": 1.8361, + "step": 25020 + }, + { + "epoch": 0.04875194248302883, + "grad_norm": 2.3901126384735107, + "learning_rate": 2.982441163581489e-05, + "loss": 1.8346, + "step": 25035 + }, + { + "epoch": 0.04878115275413909, + "grad_norm": 3.8897323608398438, + "learning_rate": 2.9824201573408218e-05, + "loss": 1.8186, + "step": 25050 + }, + { + "epoch": 0.04881036302524936, + "grad_norm": 1.947713017463684, + "learning_rate": 2.982399138616486e-05, + "loss": 1.8812, + "step": 25065 + }, + { + "epoch": 0.04883957329635962, + "grad_norm": 1.6136304140090942, + "learning_rate": 2.9823781074086582e-05, + "loss": 2.0161, + "step": 25080 + }, + { + "epoch": 0.04886878356746988, + "grad_norm": 2.6880128383636475, + "learning_rate": 2.9823570637175166e-05, + "loss": 1.8861, + "step": 25095 + }, + { + "epoch": 0.04889799383858015, + "grad_norm": 4.511691570281982, + "learning_rate": 2.982336007543237e-05, + "loss": 1.8334, + "step": 25110 + }, + { + "epoch": 0.04892720410969041, + "grad_norm": 2.540619134902954, + "learning_rate": 2.9823149388859975e-05, + "loss": 1.8554, + "step": 25125 + }, + { + "epoch": 0.04895641438080067, + "grad_norm": 2.650416135787964, + "learning_rate": 2.982293857745976e-05, + "loss": 2.0556, + "step": 25140 + }, + { + "epoch": 0.04898562465191093, + "grad_norm": 4.075965881347656, + "learning_rate": 2.9822727641233488e-05, + "loss": 1.7369, + "step": 25155 + }, + { + "epoch": 0.0490148349230212, + "grad_norm": 4.21481466293335, + "learning_rate": 2.9822516580182944e-05, + "loss": 1.9624, + "step": 25170 + }, + { + "epoch": 0.04904404519413146, + "grad_norm": 4.617081642150879, + "learning_rate": 2.98223053943099e-05, + "loss": 1.7276, + "step": 25185 + }, + { + "epoch": 0.04907325546524172, + "grad_norm": 3.5099408626556396, + "learning_rate": 2.9822094083616145e-05, + "loss": 1.8783, + "step": 25200 + }, + { + "epoch": 0.049102465736351984, + "grad_norm": 4.118253231048584, + "learning_rate": 2.9821882648103445e-05, + "loss": 1.7848, + "step": 25215 + }, + { + "epoch": 0.04913167600746225, + "grad_norm": 3.617659091949463, + "learning_rate": 2.982167108777359e-05, + "loss": 1.8307, + "step": 25230 + }, + { + "epoch": 0.04916088627857251, + "grad_norm": 3.3717000484466553, + "learning_rate": 2.9821459402628357e-05, + "loss": 1.6414, + "step": 25245 + }, + { + "epoch": 0.049190096549682774, + "grad_norm": 3.3341469764709473, + "learning_rate": 2.9821247592669526e-05, + "loss": 1.9641, + "step": 25260 + }, + { + "epoch": 0.04921930682079304, + "grad_norm": 3.8818039894104004, + "learning_rate": 2.9821035657898886e-05, + "loss": 1.8227, + "step": 25275 + }, + { + "epoch": 0.0492485170919033, + "grad_norm": 2.4647955894470215, + "learning_rate": 2.9820823598318226e-05, + "loss": 1.9441, + "step": 25290 + }, + { + "epoch": 0.049277727363013564, + "grad_norm": 5.145657062530518, + "learning_rate": 2.9820611413929318e-05, + "loss": 1.7515, + "step": 25305 + }, + { + "epoch": 0.049306937634123825, + "grad_norm": 2.595554828643799, + "learning_rate": 2.9820399104733964e-05, + "loss": 1.8185, + "step": 25320 + }, + { + "epoch": 0.04933614790523409, + "grad_norm": 3.0854387283325195, + "learning_rate": 2.9820186670733944e-05, + "loss": 2.0462, + "step": 25335 + }, + { + "epoch": 0.049365358176344354, + "grad_norm": 4.086148262023926, + "learning_rate": 2.9819974111931045e-05, + "loss": 1.923, + "step": 25350 + }, + { + "epoch": 0.049394568447454615, + "grad_norm": 2.3999664783477783, + "learning_rate": 2.9819761428327057e-05, + "loss": 1.9243, + "step": 25365 + }, + { + "epoch": 0.04942377871856488, + "grad_norm": 1.8367825746536255, + "learning_rate": 2.981954861992378e-05, + "loss": 1.9104, + "step": 25380 + }, + { + "epoch": 0.049452988989675144, + "grad_norm": 2.3233165740966797, + "learning_rate": 2.9819335686722997e-05, + "loss": 1.7446, + "step": 25395 + }, + { + "epoch": 0.049482199260785405, + "grad_norm": 3.7498881816864014, + "learning_rate": 2.98191226287265e-05, + "loss": 2.0172, + "step": 25410 + }, + { + "epoch": 0.049511409531895666, + "grad_norm": 2.636087656021118, + "learning_rate": 2.9818909445936092e-05, + "loss": 1.9958, + "step": 25425 + }, + { + "epoch": 0.049540619803005934, + "grad_norm": 2.6167049407958984, + "learning_rate": 2.9818696138353564e-05, + "loss": 1.9746, + "step": 25440 + }, + { + "epoch": 0.049569830074116195, + "grad_norm": 2.699380874633789, + "learning_rate": 2.9818482705980708e-05, + "loss": 2.0158, + "step": 25455 + }, + { + "epoch": 0.049599040345226456, + "grad_norm": 2.8639230728149414, + "learning_rate": 2.9818269148819326e-05, + "loss": 1.8795, + "step": 25470 + }, + { + "epoch": 0.04962825061633672, + "grad_norm": 3.6716597080230713, + "learning_rate": 2.9818055466871217e-05, + "loss": 2.0066, + "step": 25485 + }, + { + "epoch": 0.049657460887446984, + "grad_norm": 3.0852763652801514, + "learning_rate": 2.981784166013818e-05, + "loss": 1.8392, + "step": 25500 + }, + { + "epoch": 0.049686671158557245, + "grad_norm": 2.0497000217437744, + "learning_rate": 2.981762772862201e-05, + "loss": 2.0587, + "step": 25515 + }, + { + "epoch": 0.049715881429667506, + "grad_norm": 4.302377700805664, + "learning_rate": 2.9817413672324517e-05, + "loss": 1.9415, + "step": 25530 + }, + { + "epoch": 0.049745091700777774, + "grad_norm": 2.478428840637207, + "learning_rate": 2.9817199491247495e-05, + "loss": 2.0082, + "step": 25545 + }, + { + "epoch": 0.049774301971888035, + "grad_norm": 3.375516891479492, + "learning_rate": 2.9816985185392752e-05, + "loss": 2.061, + "step": 25560 + }, + { + "epoch": 0.049803512242998296, + "grad_norm": 2.3733456134796143, + "learning_rate": 2.9816770754762094e-05, + "loss": 1.8752, + "step": 25575 + }, + { + "epoch": 0.049832722514108564, + "grad_norm": 2.8646862506866455, + "learning_rate": 2.9816556199357334e-05, + "loss": 1.9048, + "step": 25590 + }, + { + "epoch": 0.049861932785218825, + "grad_norm": 3.711494207382202, + "learning_rate": 2.981634151918026e-05, + "loss": 1.8555, + "step": 25605 + }, + { + "epoch": 0.049891143056329086, + "grad_norm": 4.652657985687256, + "learning_rate": 2.9816126714232694e-05, + "loss": 1.9591, + "step": 25620 + }, + { + "epoch": 0.04992035332743935, + "grad_norm": 3.5854547023773193, + "learning_rate": 2.981591178451644e-05, + "loss": 1.8043, + "step": 25635 + }, + { + "epoch": 0.049949563598549615, + "grad_norm": 2.532128095626831, + "learning_rate": 2.981569673003331e-05, + "loss": 1.7456, + "step": 25650 + }, + { + "epoch": 0.049978773869659876, + "grad_norm": 1.8968348503112793, + "learning_rate": 2.9815481550785116e-05, + "loss": 1.8971, + "step": 25665 + }, + { + "epoch": 0.05000798414077014, + "grad_norm": 4.662414073944092, + "learning_rate": 2.9815266246773663e-05, + "loss": 2.1576, + "step": 25680 + }, + { + "epoch": 0.050037194411880405, + "grad_norm": 2.1178083419799805, + "learning_rate": 2.9815050818000773e-05, + "loss": 1.9055, + "step": 25695 + }, + { + "epoch": 0.050066404682990666, + "grad_norm": 3.8748810291290283, + "learning_rate": 2.9814835264468254e-05, + "loss": 1.8177, + "step": 25710 + }, + { + "epoch": 0.05009561495410093, + "grad_norm": 3.424405097961426, + "learning_rate": 2.9814619586177926e-05, + "loss": 1.8804, + "step": 25725 + }, + { + "epoch": 0.05012482522521119, + "grad_norm": 4.104612827301025, + "learning_rate": 2.98144037831316e-05, + "loss": 1.973, + "step": 25740 + }, + { + "epoch": 0.050154035496321456, + "grad_norm": 2.1848320960998535, + "learning_rate": 2.98141878553311e-05, + "loss": 1.8529, + "step": 25755 + }, + { + "epoch": 0.05018324576743172, + "grad_norm": 4.191700458526611, + "learning_rate": 2.981397180277824e-05, + "loss": 1.8094, + "step": 25770 + }, + { + "epoch": 0.05021245603854198, + "grad_norm": 3.3454208374023438, + "learning_rate": 2.981375562547484e-05, + "loss": 1.9105, + "step": 25785 + }, + { + "epoch": 0.05024166630965224, + "grad_norm": 2.848618507385254, + "learning_rate": 2.9813539323422717e-05, + "loss": 1.9054, + "step": 25800 + }, + { + "epoch": 0.05027087658076251, + "grad_norm": 1.7352811098098755, + "learning_rate": 2.98133228966237e-05, + "loss": 1.91, + "step": 25815 + }, + { + "epoch": 0.05030008685187277, + "grad_norm": 2.177297592163086, + "learning_rate": 2.9813106345079604e-05, + "loss": 1.8276, + "step": 25830 + }, + { + "epoch": 0.05032929712298303, + "grad_norm": 2.1822316646575928, + "learning_rate": 2.981288966879226e-05, + "loss": 1.7961, + "step": 25845 + }, + { + "epoch": 0.050358507394093296, + "grad_norm": 2.70729398727417, + "learning_rate": 2.9812672867763482e-05, + "loss": 1.7497, + "step": 25860 + }, + { + "epoch": 0.05038771766520356, + "grad_norm": 2.869450807571411, + "learning_rate": 2.981245594199511e-05, + "loss": 1.8309, + "step": 25875 + }, + { + "epoch": 0.05041692793631382, + "grad_norm": 2.233219623565674, + "learning_rate": 2.981223889148896e-05, + "loss": 1.8577, + "step": 25890 + }, + { + "epoch": 0.05044613820742408, + "grad_norm": 5.402493000030518, + "learning_rate": 2.981202171624686e-05, + "loss": 1.9151, + "step": 25905 + }, + { + "epoch": 0.05047534847853435, + "grad_norm": 4.127412796020508, + "learning_rate": 2.9811804416270648e-05, + "loss": 1.8644, + "step": 25920 + }, + { + "epoch": 0.05050455874964461, + "grad_norm": 2.9654555320739746, + "learning_rate": 2.9811586991562145e-05, + "loss": 1.99, + "step": 25935 + }, + { + "epoch": 0.05053376902075487, + "grad_norm": 2.6316134929656982, + "learning_rate": 2.981136944212318e-05, + "loss": 1.7434, + "step": 25950 + }, + { + "epoch": 0.05056297929186514, + "grad_norm": 2.6688010692596436, + "learning_rate": 2.9811151767955597e-05, + "loss": 1.8075, + "step": 25965 + }, + { + "epoch": 0.0505921895629754, + "grad_norm": 1.7729136943817139, + "learning_rate": 2.981093396906122e-05, + "loss": 1.7828, + "step": 25980 + }, + { + "epoch": 0.05062139983408566, + "grad_norm": 3.017512321472168, + "learning_rate": 2.9810716045441884e-05, + "loss": 1.9602, + "step": 25995 + }, + { + "epoch": 0.05065061010519592, + "grad_norm": 3.954099416732788, + "learning_rate": 2.9810497997099427e-05, + "loss": 1.9158, + "step": 26010 + }, + { + "epoch": 0.05067982037630619, + "grad_norm": 4.384187698364258, + "learning_rate": 2.981027982403568e-05, + "loss": 2.0172, + "step": 26025 + }, + { + "epoch": 0.05070903064741645, + "grad_norm": 2.7591800689697266, + "learning_rate": 2.9810061526252488e-05, + "loss": 1.8372, + "step": 26040 + }, + { + "epoch": 0.05073824091852671, + "grad_norm": 2.6615748405456543, + "learning_rate": 2.980984310375168e-05, + "loss": 1.8127, + "step": 26055 + }, + { + "epoch": 0.05076745118963698, + "grad_norm": 3.1193888187408447, + "learning_rate": 2.9809624556535106e-05, + "loss": 1.8826, + "step": 26070 + }, + { + "epoch": 0.05079666146074724, + "grad_norm": 2.338146448135376, + "learning_rate": 2.9809405884604594e-05, + "loss": 1.9003, + "step": 26085 + }, + { + "epoch": 0.0508258717318575, + "grad_norm": 2.339162588119507, + "learning_rate": 2.9809187087961993e-05, + "loss": 1.8444, + "step": 26100 + }, + { + "epoch": 0.05085508200296776, + "grad_norm": 6.189070224761963, + "learning_rate": 2.980896816660915e-05, + "loss": 1.8682, + "step": 26115 + }, + { + "epoch": 0.05088429227407803, + "grad_norm": 3.7556257247924805, + "learning_rate": 2.9808749120547898e-05, + "loss": 1.8692, + "step": 26130 + }, + { + "epoch": 0.05091350254518829, + "grad_norm": 2.4181950092315674, + "learning_rate": 2.980852994978009e-05, + "loss": 1.8497, + "step": 26145 + }, + { + "epoch": 0.05094271281629855, + "grad_norm": 2.1751251220703125, + "learning_rate": 2.9808310654307566e-05, + "loss": 1.7236, + "step": 26160 + }, + { + "epoch": 0.05097192308740882, + "grad_norm": 2.304203748703003, + "learning_rate": 2.9808091234132177e-05, + "loss": 1.8394, + "step": 26175 + }, + { + "epoch": 0.05100113335851908, + "grad_norm": 2.03320050239563, + "learning_rate": 2.980787168925577e-05, + "loss": 2.0082, + "step": 26190 + }, + { + "epoch": 0.05103034362962934, + "grad_norm": 3.1527299880981445, + "learning_rate": 2.9807652019680195e-05, + "loss": 1.8783, + "step": 26205 + }, + { + "epoch": 0.0510595539007396, + "grad_norm": 3.638120412826538, + "learning_rate": 2.9807432225407295e-05, + "loss": 1.8912, + "step": 26220 + }, + { + "epoch": 0.05108876417184987, + "grad_norm": 1.9177019596099854, + "learning_rate": 2.9807212306438927e-05, + "loss": 1.7949, + "step": 26235 + }, + { + "epoch": 0.05111797444296013, + "grad_norm": 4.763120174407959, + "learning_rate": 2.9806992262776945e-05, + "loss": 1.9464, + "step": 26250 + }, + { + "epoch": 0.05114718471407039, + "grad_norm": 4.4592132568359375, + "learning_rate": 2.980677209442319e-05, + "loss": 1.834, + "step": 26265 + }, + { + "epoch": 0.05117639498518065, + "grad_norm": 2.6289279460906982, + "learning_rate": 2.980655180137953e-05, + "loss": 1.7644, + "step": 26280 + }, + { + "epoch": 0.05120560525629092, + "grad_norm": 2.2794735431671143, + "learning_rate": 2.9806331383647816e-05, + "loss": 1.9125, + "step": 26295 + }, + { + "epoch": 0.05123481552740118, + "grad_norm": 3.5884125232696533, + "learning_rate": 2.9806110841229904e-05, + "loss": 1.925, + "step": 26310 + }, + { + "epoch": 0.05126402579851144, + "grad_norm": 2.8750176429748535, + "learning_rate": 2.9805890174127648e-05, + "loss": 1.6463, + "step": 26325 + }, + { + "epoch": 0.05129323606962171, + "grad_norm": 3.1167027950286865, + "learning_rate": 2.980566938234291e-05, + "loss": 1.8897, + "step": 26340 + }, + { + "epoch": 0.05132244634073197, + "grad_norm": 2.8106181621551514, + "learning_rate": 2.9805448465877546e-05, + "loss": 1.855, + "step": 26355 + }, + { + "epoch": 0.05135165661184223, + "grad_norm": 3.8905303478240967, + "learning_rate": 2.980522742473342e-05, + "loss": 1.9127, + "step": 26370 + }, + { + "epoch": 0.05138086688295249, + "grad_norm": 3.031163215637207, + "learning_rate": 2.980500625891239e-05, + "loss": 1.9577, + "step": 26385 + }, + { + "epoch": 0.05141007715406276, + "grad_norm": 1.991543173789978, + "learning_rate": 2.980478496841632e-05, + "loss": 1.6416, + "step": 26400 + }, + { + "epoch": 0.05143928742517302, + "grad_norm": 3.90432071685791, + "learning_rate": 2.9804563553247076e-05, + "loss": 1.7318, + "step": 26415 + }, + { + "epoch": 0.05146849769628328, + "grad_norm": 3.582280158996582, + "learning_rate": 2.980434201340652e-05, + "loss": 1.7349, + "step": 26430 + }, + { + "epoch": 0.05149770796739355, + "grad_norm": 3.1633496284484863, + "learning_rate": 2.980412034889651e-05, + "loss": 1.8023, + "step": 26445 + }, + { + "epoch": 0.05152691823850381, + "grad_norm": 3.7939155101776123, + "learning_rate": 2.9803898559718927e-05, + "loss": 1.6626, + "step": 26460 + }, + { + "epoch": 0.05155612850961407, + "grad_norm": 3.093492031097412, + "learning_rate": 2.9803676645875634e-05, + "loss": 1.7914, + "step": 26475 + }, + { + "epoch": 0.051585338780724334, + "grad_norm": 2.4469592571258545, + "learning_rate": 2.9803454607368493e-05, + "loss": 1.7484, + "step": 26490 + }, + { + "epoch": 0.0516145490518346, + "grad_norm": 2.613276243209839, + "learning_rate": 2.9803232444199382e-05, + "loss": 1.7984, + "step": 26505 + }, + { + "epoch": 0.05164375932294486, + "grad_norm": 3.1793646812438965, + "learning_rate": 2.9803010156370166e-05, + "loss": 1.7114, + "step": 26520 + }, + { + "epoch": 0.051672969594055124, + "grad_norm": 1.9021662473678589, + "learning_rate": 2.980278774388272e-05, + "loss": 1.6837, + "step": 26535 + }, + { + "epoch": 0.05170217986516539, + "grad_norm": 2.415710687637329, + "learning_rate": 2.9802565206738922e-05, + "loss": 1.9818, + "step": 26550 + }, + { + "epoch": 0.05173139013627565, + "grad_norm": 3.8227622509002686, + "learning_rate": 2.9802342544940635e-05, + "loss": 1.9691, + "step": 26565 + }, + { + "epoch": 0.051760600407385914, + "grad_norm": 1.6945210695266724, + "learning_rate": 2.980211975848974e-05, + "loss": 1.9788, + "step": 26580 + }, + { + "epoch": 0.051789810678496175, + "grad_norm": 2.1171348094940186, + "learning_rate": 2.980189684738811e-05, + "loss": 1.8498, + "step": 26595 + }, + { + "epoch": 0.05181902094960644, + "grad_norm": 3.8129332065582275, + "learning_rate": 2.9801673811637628e-05, + "loss": 1.8109, + "step": 26610 + }, + { + "epoch": 0.0518482312207167, + "grad_norm": 3.1358275413513184, + "learning_rate": 2.9801450651240173e-05, + "loss": 1.7809, + "step": 26625 + }, + { + "epoch": 0.051877441491826964, + "grad_norm": 3.9287755489349365, + "learning_rate": 2.9801227366197614e-05, + "loss": 1.7791, + "step": 26640 + }, + { + "epoch": 0.05190665176293723, + "grad_norm": 3.269742488861084, + "learning_rate": 2.980100395651184e-05, + "loss": 2.0053, + "step": 26655 + }, + { + "epoch": 0.05193586203404749, + "grad_norm": 3.23358154296875, + "learning_rate": 2.980078042218473e-05, + "loss": 1.7863, + "step": 26670 + }, + { + "epoch": 0.051965072305157754, + "grad_norm": 3.6313607692718506, + "learning_rate": 2.980055676321817e-05, + "loss": 1.8125, + "step": 26685 + }, + { + "epoch": 0.051994282576268015, + "grad_norm": 2.303229331970215, + "learning_rate": 2.9800332979614035e-05, + "loss": 1.8336, + "step": 26700 + }, + { + "epoch": 0.05202349284737828, + "grad_norm": 4.923130035400391, + "learning_rate": 2.9800109071374216e-05, + "loss": 1.9392, + "step": 26715 + }, + { + "epoch": 0.052052703118488544, + "grad_norm": 6.160820484161377, + "learning_rate": 2.9799885038500597e-05, + "loss": 1.8991, + "step": 26730 + }, + { + "epoch": 0.052081913389598805, + "grad_norm": 1.9204509258270264, + "learning_rate": 2.9799660880995065e-05, + "loss": 1.8511, + "step": 26745 + }, + { + "epoch": 0.05211112366070907, + "grad_norm": 4.1540656089782715, + "learning_rate": 2.9799436598859507e-05, + "loss": 1.7353, + "step": 26760 + }, + { + "epoch": 0.052140333931819334, + "grad_norm": 2.5721335411071777, + "learning_rate": 2.979921219209581e-05, + "loss": 1.6949, + "step": 26775 + }, + { + "epoch": 0.052169544202929595, + "grad_norm": 2.5524466037750244, + "learning_rate": 2.9798987660705867e-05, + "loss": 1.9033, + "step": 26790 + }, + { + "epoch": 0.052198754474039856, + "grad_norm": 2.8870813846588135, + "learning_rate": 2.979876300469157e-05, + "loss": 1.6899, + "step": 26805 + }, + { + "epoch": 0.052227964745150124, + "grad_norm": 2.5837631225585938, + "learning_rate": 2.9798538224054804e-05, + "loss": 1.7715, + "step": 26820 + }, + { + "epoch": 0.052257175016260385, + "grad_norm": 2.5015158653259277, + "learning_rate": 2.979831331879747e-05, + "loss": 1.9121, + "step": 26835 + }, + { + "epoch": 0.052286385287370646, + "grad_norm": 3.3133704662323, + "learning_rate": 2.9798088288921457e-05, + "loss": 1.87, + "step": 26850 + }, + { + "epoch": 0.05231559555848091, + "grad_norm": 2.159090042114258, + "learning_rate": 2.979786313442866e-05, + "loss": 1.9665, + "step": 26865 + }, + { + "epoch": 0.052344805829591175, + "grad_norm": 3.6745707988739014, + "learning_rate": 2.9797637855320977e-05, + "loss": 1.8956, + "step": 26880 + }, + { + "epoch": 0.052374016100701436, + "grad_norm": 2.593538761138916, + "learning_rate": 2.9797412451600305e-05, + "loss": 1.9206, + "step": 26895 + }, + { + "epoch": 0.0524032263718117, + "grad_norm": 2.1524672508239746, + "learning_rate": 2.979718692326854e-05, + "loss": 2.0598, + "step": 26910 + }, + { + "epoch": 0.052432436642921965, + "grad_norm": 5.403210639953613, + "learning_rate": 2.9796961270327583e-05, + "loss": 1.9446, + "step": 26925 + }, + { + "epoch": 0.052461646914032226, + "grad_norm": 2.859320878982544, + "learning_rate": 2.9796735492779338e-05, + "loss": 1.6969, + "step": 26940 + }, + { + "epoch": 0.05249085718514249, + "grad_norm": 2.0825371742248535, + "learning_rate": 2.9796509590625696e-05, + "loss": 1.8951, + "step": 26955 + }, + { + "epoch": 0.05252006745625275, + "grad_norm": 2.3604981899261475, + "learning_rate": 2.979628356386857e-05, + "loss": 1.8861, + "step": 26970 + }, + { + "epoch": 0.052549277727363015, + "grad_norm": 3.510629415512085, + "learning_rate": 2.9796057412509856e-05, + "loss": 1.9885, + "step": 26985 + }, + { + "epoch": 0.052578487998473276, + "grad_norm": 1.72383713722229, + "learning_rate": 2.9795831136551467e-05, + "loss": 1.832, + "step": 27000 + }, + { + "epoch": 0.05260769826958354, + "grad_norm": 2.767523765563965, + "learning_rate": 2.9795604735995297e-05, + "loss": 1.8956, + "step": 27015 + }, + { + "epoch": 0.052636908540693805, + "grad_norm": 2.731154203414917, + "learning_rate": 2.979537821084326e-05, + "loss": 1.7157, + "step": 27030 + }, + { + "epoch": 0.052666118811804066, + "grad_norm": 3.17053484916687, + "learning_rate": 2.9795151561097265e-05, + "loss": 1.9691, + "step": 27045 + }, + { + "epoch": 0.05269532908291433, + "grad_norm": 3.9647130966186523, + "learning_rate": 2.979492478675922e-05, + "loss": 1.9736, + "step": 27060 + }, + { + "epoch": 0.05272453935402459, + "grad_norm": 2.4074623584747314, + "learning_rate": 2.9794697887831027e-05, + "loss": 1.8325, + "step": 27075 + }, + { + "epoch": 0.052753749625134856, + "grad_norm": 4.785901069641113, + "learning_rate": 2.9794470864314603e-05, + "loss": 1.9717, + "step": 27090 + }, + { + "epoch": 0.05278295989624512, + "grad_norm": 5.4298577308654785, + "learning_rate": 2.979424371621186e-05, + "loss": 1.8316, + "step": 27105 + }, + { + "epoch": 0.05281217016735538, + "grad_norm": 2.509413003921509, + "learning_rate": 2.9794016443524713e-05, + "loss": 1.8792, + "step": 27120 + }, + { + "epoch": 0.052841380438465646, + "grad_norm": 1.903182029724121, + "learning_rate": 2.979378904625507e-05, + "loss": 1.8049, + "step": 27135 + }, + { + "epoch": 0.05287059070957591, + "grad_norm": 3.3434927463531494, + "learning_rate": 2.9793561524404846e-05, + "loss": 1.7794, + "step": 27150 + }, + { + "epoch": 0.05289980098068617, + "grad_norm": 5.064967632293701, + "learning_rate": 2.9793333877975964e-05, + "loss": 1.8726, + "step": 27165 + }, + { + "epoch": 0.05292901125179643, + "grad_norm": 1.8450191020965576, + "learning_rate": 2.9793106106970335e-05, + "loss": 1.8586, + "step": 27180 + }, + { + "epoch": 0.0529582215229067, + "grad_norm": 2.540570020675659, + "learning_rate": 2.979287821138988e-05, + "loss": 1.8988, + "step": 27195 + }, + { + "epoch": 0.05298743179401696, + "grad_norm": 2.0893425941467285, + "learning_rate": 2.9792650191236516e-05, + "loss": 1.7794, + "step": 27210 + }, + { + "epoch": 0.05301664206512722, + "grad_norm": 2.7562851905822754, + "learning_rate": 2.979242204651216e-05, + "loss": 1.7025, + "step": 27225 + }, + { + "epoch": 0.05304585233623749, + "grad_norm": 4.634995937347412, + "learning_rate": 2.9792193777218743e-05, + "loss": 1.8236, + "step": 27240 + }, + { + "epoch": 0.05307506260734775, + "grad_norm": 3.1855075359344482, + "learning_rate": 2.9791965383358184e-05, + "loss": 1.8439, + "step": 27255 + }, + { + "epoch": 0.05310427287845801, + "grad_norm": 3.3286306858062744, + "learning_rate": 2.9791736864932403e-05, + "loss": 1.9314, + "step": 27270 + }, + { + "epoch": 0.05313348314956827, + "grad_norm": 3.1028332710266113, + "learning_rate": 2.979150822194332e-05, + "loss": 2.0257, + "step": 27285 + }, + { + "epoch": 0.05316269342067854, + "grad_norm": 4.1933393478393555, + "learning_rate": 2.979127945439287e-05, + "loss": 1.9908, + "step": 27300 + }, + { + "epoch": 0.0531919036917888, + "grad_norm": 4.206679344177246, + "learning_rate": 2.9791050562282974e-05, + "loss": 1.8144, + "step": 27315 + }, + { + "epoch": 0.05322111396289906, + "grad_norm": 2.231621265411377, + "learning_rate": 2.9790821545615562e-05, + "loss": 1.824, + "step": 27330 + }, + { + "epoch": 0.05325032423400932, + "grad_norm": 3.625483512878418, + "learning_rate": 2.9790592404392557e-05, + "loss": 2.0087, + "step": 27345 + }, + { + "epoch": 0.05327953450511959, + "grad_norm": 4.063029766082764, + "learning_rate": 2.9790363138615902e-05, + "loss": 1.8927, + "step": 27360 + }, + { + "epoch": 0.05330874477622985, + "grad_norm": 4.166107654571533, + "learning_rate": 2.979013374828751e-05, + "loss": 1.9004, + "step": 27375 + }, + { + "epoch": 0.05333795504734011, + "grad_norm": 2.737416982650757, + "learning_rate": 2.9789904233409326e-05, + "loss": 1.9678, + "step": 27390 + }, + { + "epoch": 0.05336716531845038, + "grad_norm": 2.131272315979004, + "learning_rate": 2.9789674593983277e-05, + "loss": 1.895, + "step": 27405 + }, + { + "epoch": 0.05339637558956064, + "grad_norm": 2.983872890472412, + "learning_rate": 2.9789444830011302e-05, + "loss": 1.7807, + "step": 27420 + }, + { + "epoch": 0.0534255858606709, + "grad_norm": 5.785390377044678, + "learning_rate": 2.978921494149533e-05, + "loss": 1.9559, + "step": 27435 + }, + { + "epoch": 0.05345479613178116, + "grad_norm": 2.7100813388824463, + "learning_rate": 2.9788984928437298e-05, + "loss": 1.9147, + "step": 27450 + }, + { + "epoch": 0.05348400640289143, + "grad_norm": 2.0759334564208984, + "learning_rate": 2.978875479083914e-05, + "loss": 1.7345, + "step": 27465 + }, + { + "epoch": 0.05351321667400169, + "grad_norm": 4.645893573760986, + "learning_rate": 2.9788524528702804e-05, + "loss": 1.7246, + "step": 27480 + }, + { + "epoch": 0.05354242694511195, + "grad_norm": 3.1474320888519287, + "learning_rate": 2.9788294142030225e-05, + "loss": 1.8679, + "step": 27495 + }, + { + "epoch": 0.05357163721622222, + "grad_norm": 2.8445558547973633, + "learning_rate": 2.9788063630823335e-05, + "loss": 1.748, + "step": 27510 + }, + { + "epoch": 0.05360084748733248, + "grad_norm": 3.176330089569092, + "learning_rate": 2.978783299508408e-05, + "loss": 1.9698, + "step": 27525 + }, + { + "epoch": 0.05363005775844274, + "grad_norm": 2.875300884246826, + "learning_rate": 2.9787602234814407e-05, + "loss": 1.7617, + "step": 27540 + }, + { + "epoch": 0.053659268029553, + "grad_norm": 4.004800796508789, + "learning_rate": 2.978737135001626e-05, + "loss": 1.8204, + "step": 27555 + }, + { + "epoch": 0.05368847830066327, + "grad_norm": 2.3588531017303467, + "learning_rate": 2.9787140340691574e-05, + "loss": 1.8028, + "step": 27570 + }, + { + "epoch": 0.05371768857177353, + "grad_norm": 2.293210744857788, + "learning_rate": 2.9786909206842297e-05, + "loss": 1.9525, + "step": 27585 + }, + { + "epoch": 0.05374689884288379, + "grad_norm": 3.4703333377838135, + "learning_rate": 2.9786677948470382e-05, + "loss": 1.8144, + "step": 27600 + }, + { + "epoch": 0.05377610911399406, + "grad_norm": 2.1387853622436523, + "learning_rate": 2.9786446565577772e-05, + "loss": 1.7853, + "step": 27615 + }, + { + "epoch": 0.05380531938510432, + "grad_norm": 1.9415435791015625, + "learning_rate": 2.9786215058166417e-05, + "loss": 1.8912, + "step": 27630 + }, + { + "epoch": 0.05383452965621458, + "grad_norm": 3.315534830093384, + "learning_rate": 2.978598342623826e-05, + "loss": 1.7991, + "step": 27645 + }, + { + "epoch": 0.05386373992732484, + "grad_norm": 3.2139084339141846, + "learning_rate": 2.9785751669795265e-05, + "loss": 1.8284, + "step": 27660 + }, + { + "epoch": 0.05389295019843511, + "grad_norm": 2.0407660007476807, + "learning_rate": 2.9785519788839368e-05, + "loss": 1.8263, + "step": 27675 + }, + { + "epoch": 0.05392216046954537, + "grad_norm": 3.6719486713409424, + "learning_rate": 2.9785287783372538e-05, + "loss": 1.7552, + "step": 27690 + }, + { + "epoch": 0.05395137074065563, + "grad_norm": 2.3214800357818604, + "learning_rate": 2.978505565339671e-05, + "loss": 1.7752, + "step": 27705 + }, + { + "epoch": 0.0539805810117659, + "grad_norm": 3.5290887355804443, + "learning_rate": 2.9784823398913856e-05, + "loss": 1.6684, + "step": 27720 + }, + { + "epoch": 0.05400979128287616, + "grad_norm": 2.344564437866211, + "learning_rate": 2.978459101992592e-05, + "loss": 1.7649, + "step": 27735 + }, + { + "epoch": 0.05403900155398642, + "grad_norm": 3.5880258083343506, + "learning_rate": 2.9784358516434867e-05, + "loss": 1.8194, + "step": 27750 + }, + { + "epoch": 0.05406821182509668, + "grad_norm": 4.46945858001709, + "learning_rate": 2.978412588844265e-05, + "loss": 1.9552, + "step": 27765 + }, + { + "epoch": 0.05409742209620695, + "grad_norm": 2.9063570499420166, + "learning_rate": 2.978389313595123e-05, + "loss": 1.9078, + "step": 27780 + }, + { + "epoch": 0.05412663236731721, + "grad_norm": 4.740022659301758, + "learning_rate": 2.9783660258962568e-05, + "loss": 1.8773, + "step": 27795 + }, + { + "epoch": 0.05415584263842747, + "grad_norm": 5.9590654373168945, + "learning_rate": 2.9783427257478623e-05, + "loss": 2.0891, + "step": 27810 + }, + { + "epoch": 0.05418505290953774, + "grad_norm": 3.4905142784118652, + "learning_rate": 2.978319413150136e-05, + "loss": 1.912, + "step": 27825 + }, + { + "epoch": 0.054214263180648, + "grad_norm": 4.312283039093018, + "learning_rate": 2.978296088103273e-05, + "loss": 1.7969, + "step": 27840 + }, + { + "epoch": 0.05424347345175826, + "grad_norm": 3.419679880142212, + "learning_rate": 2.978272750607472e-05, + "loss": 2.0149, + "step": 27855 + }, + { + "epoch": 0.054272683722868524, + "grad_norm": 1.4746593236923218, + "learning_rate": 2.9782494006629275e-05, + "loss": 1.7991, + "step": 27870 + }, + { + "epoch": 0.05430189399397879, + "grad_norm": 2.6810593605041504, + "learning_rate": 2.9782260382698374e-05, + "loss": 1.8557, + "step": 27885 + }, + { + "epoch": 0.05433110426508905, + "grad_norm": 4.144235134124756, + "learning_rate": 2.9782026634283975e-05, + "loss": 2.0213, + "step": 27900 + }, + { + "epoch": 0.054360314536199314, + "grad_norm": 3.5831942558288574, + "learning_rate": 2.9781792761388055e-05, + "loss": 1.7606, + "step": 27915 + }, + { + "epoch": 0.054389524807309575, + "grad_norm": 3.201408863067627, + "learning_rate": 2.9781558764012573e-05, + "loss": 1.7619, + "step": 27930 + }, + { + "epoch": 0.05441873507841984, + "grad_norm": 2.8114073276519775, + "learning_rate": 2.978132464215951e-05, + "loss": 1.9014, + "step": 27945 + }, + { + "epoch": 0.054447945349530104, + "grad_norm": 2.400465726852417, + "learning_rate": 2.9781090395830834e-05, + "loss": 1.9041, + "step": 27960 + }, + { + "epoch": 0.054477155620640365, + "grad_norm": 2.1496269702911377, + "learning_rate": 2.9780856025028513e-05, + "loss": 1.7252, + "step": 27975 + }, + { + "epoch": 0.05450636589175063, + "grad_norm": 3.477867603302002, + "learning_rate": 2.978062152975453e-05, + "loss": 1.7466, + "step": 27990 + }, + { + "epoch": 0.054535576162860894, + "grad_norm": 2.1709723472595215, + "learning_rate": 2.978038691001085e-05, + "loss": 1.6808, + "step": 28005 + }, + { + "epoch": 0.054564786433971155, + "grad_norm": 2.13655686378479, + "learning_rate": 2.978015216579945e-05, + "loss": 1.8089, + "step": 28020 + }, + { + "epoch": 0.054593996705081416, + "grad_norm": 3.563598394393921, + "learning_rate": 2.9779917297122318e-05, + "loss": 1.8368, + "step": 28035 + }, + { + "epoch": 0.054623206976191684, + "grad_norm": 3.1836514472961426, + "learning_rate": 2.977968230398142e-05, + "loss": 1.8042, + "step": 28050 + }, + { + "epoch": 0.054652417247301945, + "grad_norm": 2.4931259155273438, + "learning_rate": 2.9779447186378738e-05, + "loss": 1.8061, + "step": 28065 + }, + { + "epoch": 0.054681627518412206, + "grad_norm": 3.2399775981903076, + "learning_rate": 2.977921194431625e-05, + "loss": 1.8723, + "step": 28080 + }, + { + "epoch": 0.05471083778952247, + "grad_norm": 3.281590461730957, + "learning_rate": 2.977897657779594e-05, + "loss": 1.8882, + "step": 28095 + }, + { + "epoch": 0.054740048060632734, + "grad_norm": 2.607039451599121, + "learning_rate": 2.9778741086819795e-05, + "loss": 1.8796, + "step": 28110 + }, + { + "epoch": 0.054769258331742995, + "grad_norm": 3.4543254375457764, + "learning_rate": 2.977850547138979e-05, + "loss": 1.8826, + "step": 28125 + }, + { + "epoch": 0.054798468602853256, + "grad_norm": 3.8168885707855225, + "learning_rate": 2.9778269731507914e-05, + "loss": 1.9827, + "step": 28140 + }, + { + "epoch": 0.054827678873963524, + "grad_norm": 2.7308406829833984, + "learning_rate": 2.977803386717615e-05, + "loss": 1.768, + "step": 28155 + }, + { + "epoch": 0.054856889145073785, + "grad_norm": 4.276648998260498, + "learning_rate": 2.9777797878396477e-05, + "loss": 1.8696, + "step": 28170 + }, + { + "epoch": 0.054886099416184046, + "grad_norm": 3.1874961853027344, + "learning_rate": 2.97775617651709e-05, + "loss": 1.881, + "step": 28185 + }, + { + "epoch": 0.054915309687294314, + "grad_norm": 2.4808239936828613, + "learning_rate": 2.977732552750139e-05, + "loss": 1.7825, + "step": 28200 + }, + { + "epoch": 0.054944519958404575, + "grad_norm": 3.0970373153686523, + "learning_rate": 2.9777089165389942e-05, + "loss": 1.7825, + "step": 28215 + }, + { + "epoch": 0.054973730229514836, + "grad_norm": 1.757534384727478, + "learning_rate": 2.9776852678838555e-05, + "loss": 2.0145, + "step": 28230 + }, + { + "epoch": 0.0550029405006251, + "grad_norm": 2.351555347442627, + "learning_rate": 2.977661606784921e-05, + "loss": 2.0268, + "step": 28245 + }, + { + "epoch": 0.055032150771735365, + "grad_norm": 2.4561007022857666, + "learning_rate": 2.9776379332423902e-05, + "loss": 1.7745, + "step": 28260 + }, + { + "epoch": 0.055061361042845626, + "grad_norm": 3.8144009113311768, + "learning_rate": 2.9776142472564624e-05, + "loss": 1.8408, + "step": 28275 + }, + { + "epoch": 0.05509057131395589, + "grad_norm": 4.969006061553955, + "learning_rate": 2.9775905488273373e-05, + "loss": 1.9223, + "step": 28290 + }, + { + "epoch": 0.055119781585066155, + "grad_norm": 5.71866512298584, + "learning_rate": 2.9775668379552146e-05, + "loss": 1.842, + "step": 28305 + }, + { + "epoch": 0.055148991856176416, + "grad_norm": 1.9934380054473877, + "learning_rate": 2.9775431146402937e-05, + "loss": 1.976, + "step": 28320 + }, + { + "epoch": 0.05517820212728668, + "grad_norm": 1.7895939350128174, + "learning_rate": 2.9775193788827743e-05, + "loss": 2.0921, + "step": 28335 + }, + { + "epoch": 0.05520741239839694, + "grad_norm": 4.20900821685791, + "learning_rate": 2.9774956306828566e-05, + "loss": 1.9333, + "step": 28350 + }, + { + "epoch": 0.055236622669507206, + "grad_norm": 1.7822163105010986, + "learning_rate": 2.97747187004074e-05, + "loss": 2.0872, + "step": 28365 + }, + { + "epoch": 0.05526583294061747, + "grad_norm": 1.7469080686569214, + "learning_rate": 2.9774480969566254e-05, + "loss": 1.8781, + "step": 28380 + }, + { + "epoch": 0.05529504321172773, + "grad_norm": 3.0533454418182373, + "learning_rate": 2.977424311430712e-05, + "loss": 1.7184, + "step": 28395 + }, + { + "epoch": 0.05532425348283799, + "grad_norm": 2.729780912399292, + "learning_rate": 2.977400513463201e-05, + "loss": 1.7794, + "step": 28410 + }, + { + "epoch": 0.05535346375394826, + "grad_norm": 3.2360620498657227, + "learning_rate": 2.9773767030542926e-05, + "loss": 1.7711, + "step": 28425 + }, + { + "epoch": 0.05538267402505852, + "grad_norm": 2.3599355220794678, + "learning_rate": 2.9773528802041873e-05, + "loss": 1.8758, + "step": 28440 + }, + { + "epoch": 0.05541188429616878, + "grad_norm": 3.3827106952667236, + "learning_rate": 2.9773290449130856e-05, + "loss": 1.901, + "step": 28455 + }, + { + "epoch": 0.055441094567279046, + "grad_norm": 2.9506921768188477, + "learning_rate": 2.977305197181188e-05, + "loss": 1.9286, + "step": 28470 + }, + { + "epoch": 0.05547030483838931, + "grad_norm": 3.381622791290283, + "learning_rate": 2.9772813370086956e-05, + "loss": 2.0169, + "step": 28485 + }, + { + "epoch": 0.05549951510949957, + "grad_norm": 3.6618142127990723, + "learning_rate": 2.9772574643958095e-05, + "loss": 1.8318, + "step": 28500 + }, + { + "epoch": 0.05552872538060983, + "grad_norm": 2.906064033508301, + "learning_rate": 2.9772335793427304e-05, + "loss": 1.8778, + "step": 28515 + }, + { + "epoch": 0.0555579356517201, + "grad_norm": 1.9280356168746948, + "learning_rate": 2.9772096818496592e-05, + "loss": 1.9095, + "step": 28530 + }, + { + "epoch": 0.05558714592283036, + "grad_norm": 2.451441764831543, + "learning_rate": 2.977185771916798e-05, + "loss": 1.7975, + "step": 28545 + }, + { + "epoch": 0.05561635619394062, + "grad_norm": 3.4363293647766113, + "learning_rate": 2.9771618495443473e-05, + "loss": 1.9995, + "step": 28560 + }, + { + "epoch": 0.05564556646505089, + "grad_norm": 3.402430772781372, + "learning_rate": 2.9771379147325095e-05, + "loss": 1.879, + "step": 28575 + }, + { + "epoch": 0.05567477673616115, + "grad_norm": 3.7891762256622314, + "learning_rate": 2.977113967481485e-05, + "loss": 1.8275, + "step": 28590 + }, + { + "epoch": 0.05570398700727141, + "grad_norm": 2.45809268951416, + "learning_rate": 2.977090007791476e-05, + "loss": 1.8131, + "step": 28605 + }, + { + "epoch": 0.05573319727838167, + "grad_norm": 3.5447278022766113, + "learning_rate": 2.9770660356626848e-05, + "loss": 1.8373, + "step": 28620 + }, + { + "epoch": 0.05576240754949194, + "grad_norm": 3.397735595703125, + "learning_rate": 2.9770420510953124e-05, + "loss": 1.7907, + "step": 28635 + }, + { + "epoch": 0.0557916178206022, + "grad_norm": 2.185011386871338, + "learning_rate": 2.9770180540895613e-05, + "loss": 1.8909, + "step": 28650 + }, + { + "epoch": 0.05582082809171246, + "grad_norm": 3.66780161857605, + "learning_rate": 2.9769940446456332e-05, + "loss": 1.8898, + "step": 28665 + }, + { + "epoch": 0.05585003836282273, + "grad_norm": 3.8541507720947266, + "learning_rate": 2.9769700227637307e-05, + "loss": 1.8156, + "step": 28680 + }, + { + "epoch": 0.05587924863393299, + "grad_norm": 2.504997491836548, + "learning_rate": 2.9769459884440563e-05, + "loss": 1.8584, + "step": 28695 + }, + { + "epoch": 0.05590845890504325, + "grad_norm": 2.811286687850952, + "learning_rate": 2.9769219416868114e-05, + "loss": 1.8762, + "step": 28710 + }, + { + "epoch": 0.05593766917615351, + "grad_norm": 3.2774808406829834, + "learning_rate": 2.976897882492199e-05, + "loss": 1.8637, + "step": 28725 + }, + { + "epoch": 0.05596687944726378, + "grad_norm": 3.6415061950683594, + "learning_rate": 2.9768738108604222e-05, + "loss": 2.033, + "step": 28740 + }, + { + "epoch": 0.05599608971837404, + "grad_norm": 2.5112550258636475, + "learning_rate": 2.9768497267916833e-05, + "loss": 1.9643, + "step": 28755 + }, + { + "epoch": 0.0560252999894843, + "grad_norm": 3.3538918495178223, + "learning_rate": 2.9768256302861852e-05, + "loss": 1.7964, + "step": 28770 + }, + { + "epoch": 0.05605451026059457, + "grad_norm": 1.7511709928512573, + "learning_rate": 2.9768015213441306e-05, + "loss": 1.9891, + "step": 28785 + }, + { + "epoch": 0.05608372053170483, + "grad_norm": 4.007176399230957, + "learning_rate": 2.9767773999657225e-05, + "loss": 2.0263, + "step": 28800 + }, + { + "epoch": 0.05611293080281509, + "grad_norm": 2.968200206756592, + "learning_rate": 2.9767532661511644e-05, + "loss": 1.7171, + "step": 28815 + }, + { + "epoch": 0.05614214107392535, + "grad_norm": 3.2194066047668457, + "learning_rate": 2.9767291199006594e-05, + "loss": 1.8825, + "step": 28830 + }, + { + "epoch": 0.05617135134503562, + "grad_norm": 3.994147539138794, + "learning_rate": 2.976704961214411e-05, + "loss": 1.6859, + "step": 28845 + }, + { + "epoch": 0.05620056161614588, + "grad_norm": 2.1359071731567383, + "learning_rate": 2.976680790092622e-05, + "loss": 1.7796, + "step": 28860 + }, + { + "epoch": 0.05622977188725614, + "grad_norm": 3.074885606765747, + "learning_rate": 2.976656606535497e-05, + "loss": 1.9507, + "step": 28875 + }, + { + "epoch": 0.05625898215836641, + "grad_norm": 3.184913396835327, + "learning_rate": 2.9766324105432385e-05, + "loss": 1.9536, + "step": 28890 + }, + { + "epoch": 0.05628819242947667, + "grad_norm": 3.911243438720703, + "learning_rate": 2.976608202116051e-05, + "loss": 1.8609, + "step": 28905 + }, + { + "epoch": 0.05631740270058693, + "grad_norm": 4.364305019378662, + "learning_rate": 2.9765839812541378e-05, + "loss": 1.8573, + "step": 28920 + }, + { + "epoch": 0.05634661297169719, + "grad_norm": 3.0088484287261963, + "learning_rate": 2.9765597479577034e-05, + "loss": 2.0879, + "step": 28935 + }, + { + "epoch": 0.05637582324280746, + "grad_norm": 1.9867300987243652, + "learning_rate": 2.9765355022269518e-05, + "loss": 1.7799, + "step": 28950 + }, + { + "epoch": 0.05640503351391772, + "grad_norm": 2.1443519592285156, + "learning_rate": 2.9765112440620874e-05, + "loss": 1.7201, + "step": 28965 + }, + { + "epoch": 0.05643424378502798, + "grad_norm": 2.226407051086426, + "learning_rate": 2.9764869734633134e-05, + "loss": 1.7974, + "step": 28980 + }, + { + "epoch": 0.05646345405613824, + "grad_norm": 4.08579158782959, + "learning_rate": 2.9764626904308354e-05, + "loss": 1.8633, + "step": 28995 + }, + { + "epoch": 0.05649266432724851, + "grad_norm": 4.314965724945068, + "learning_rate": 2.9764383949648576e-05, + "loss": 1.9242, + "step": 29010 + }, + { + "epoch": 0.05652187459835877, + "grad_norm": 4.151242733001709, + "learning_rate": 2.976414087065584e-05, + "loss": 1.942, + "step": 29025 + }, + { + "epoch": 0.05655108486946903, + "grad_norm": 1.9677321910858154, + "learning_rate": 2.97638976673322e-05, + "loss": 1.8371, + "step": 29040 + }, + { + "epoch": 0.0565802951405793, + "grad_norm": 2.814545154571533, + "learning_rate": 2.97636543396797e-05, + "loss": 1.9929, + "step": 29055 + }, + { + "epoch": 0.05660950541168956, + "grad_norm": 3.4638845920562744, + "learning_rate": 2.976341088770039e-05, + "loss": 1.8635, + "step": 29070 + }, + { + "epoch": 0.05663871568279982, + "grad_norm": 2.0748291015625, + "learning_rate": 2.976316731139632e-05, + "loss": 1.6827, + "step": 29085 + }, + { + "epoch": 0.056667925953910084, + "grad_norm": 2.4861667156219482, + "learning_rate": 2.9762923610769545e-05, + "loss": 1.8022, + "step": 29100 + }, + { + "epoch": 0.05669713622502035, + "grad_norm": 2.12156343460083, + "learning_rate": 2.9762679785822113e-05, + "loss": 1.8912, + "step": 29115 + }, + { + "epoch": 0.05672634649613061, + "grad_norm": 2.3051788806915283, + "learning_rate": 2.9762435836556075e-05, + "loss": 1.786, + "step": 29130 + }, + { + "epoch": 0.056755556767240874, + "grad_norm": 3.0045323371887207, + "learning_rate": 2.9762191762973492e-05, + "loss": 1.9105, + "step": 29145 + }, + { + "epoch": 0.05678476703835114, + "grad_norm": 2.9411308765411377, + "learning_rate": 2.9761947565076413e-05, + "loss": 1.7949, + "step": 29160 + }, + { + "epoch": 0.0568139773094614, + "grad_norm": 3.0440449714660645, + "learning_rate": 2.97617032428669e-05, + "loss": 2.0024, + "step": 29175 + }, + { + "epoch": 0.056843187580571664, + "grad_norm": 3.836582660675049, + "learning_rate": 2.976145879634701e-05, + "loss": 1.9192, + "step": 29190 + }, + { + "epoch": 0.056872397851681925, + "grad_norm": 3.63226580619812, + "learning_rate": 2.9761214225518792e-05, + "loss": 1.6728, + "step": 29205 + }, + { + "epoch": 0.05690160812279219, + "grad_norm": 2.588270902633667, + "learning_rate": 2.9760969530384317e-05, + "loss": 1.8828, + "step": 29220 + }, + { + "epoch": 0.05693081839390245, + "grad_norm": 3.4929933547973633, + "learning_rate": 2.9760724710945642e-05, + "loss": 2.0755, + "step": 29235 + }, + { + "epoch": 0.056960028665012714, + "grad_norm": 2.916142463684082, + "learning_rate": 2.976047976720483e-05, + "loss": 1.8955, + "step": 29250 + }, + { + "epoch": 0.05698923893612298, + "grad_norm": 3.606926918029785, + "learning_rate": 2.976023469916394e-05, + "loss": 1.9196, + "step": 29265 + }, + { + "epoch": 0.05701844920723324, + "grad_norm": 4.033385276794434, + "learning_rate": 2.9759989506825033e-05, + "loss": 1.9555, + "step": 29280 + }, + { + "epoch": 0.057047659478343504, + "grad_norm": 2.1779022216796875, + "learning_rate": 2.9759744190190185e-05, + "loss": 1.7735, + "step": 29295 + }, + { + "epoch": 0.057076869749453765, + "grad_norm": 4.592677116394043, + "learning_rate": 2.9759498749261452e-05, + "loss": 1.8971, + "step": 29310 + }, + { + "epoch": 0.05710608002056403, + "grad_norm": 4.797886371612549, + "learning_rate": 2.9759253184040906e-05, + "loss": 1.8494, + "step": 29325 + }, + { + "epoch": 0.057135290291674294, + "grad_norm": 2.316049337387085, + "learning_rate": 2.9759007494530615e-05, + "loss": 1.7288, + "step": 29340 + }, + { + "epoch": 0.057164500562784555, + "grad_norm": 2.9396653175354004, + "learning_rate": 2.975876168073264e-05, + "loss": 1.8751, + "step": 29355 + }, + { + "epoch": 0.05719371083389482, + "grad_norm": 4.224997043609619, + "learning_rate": 2.9758515742649063e-05, + "loss": 1.78, + "step": 29370 + }, + { + "epoch": 0.057222921105005084, + "grad_norm": 2.5549795627593994, + "learning_rate": 2.9758269680281946e-05, + "loss": 1.7375, + "step": 29385 + }, + { + "epoch": 0.057252131376115345, + "grad_norm": 3.9945852756500244, + "learning_rate": 2.9758023493633365e-05, + "loss": 1.7078, + "step": 29400 + }, + { + "epoch": 0.057281341647225606, + "grad_norm": 3.5278730392456055, + "learning_rate": 2.975777718270539e-05, + "loss": 1.7572, + "step": 29415 + }, + { + "epoch": 0.057310551918335874, + "grad_norm": 3.9674766063690186, + "learning_rate": 2.9757530747500104e-05, + "loss": 1.9125, + "step": 29430 + }, + { + "epoch": 0.057339762189446135, + "grad_norm": 3.498309373855591, + "learning_rate": 2.9757284188019573e-05, + "loss": 1.8242, + "step": 29445 + }, + { + "epoch": 0.057368972460556396, + "grad_norm": 3.2913856506347656, + "learning_rate": 2.9757037504265874e-05, + "loss": 1.8559, + "step": 29460 + }, + { + "epoch": 0.05739818273166666, + "grad_norm": 2.72259521484375, + "learning_rate": 2.9756790696241088e-05, + "loss": 1.7855, + "step": 29475 + }, + { + "epoch": 0.057427393002776925, + "grad_norm": 7.3963704109191895, + "learning_rate": 2.9756543763947292e-05, + "loss": 1.9272, + "step": 29490 + }, + { + "epoch": 0.057456603273887186, + "grad_norm": 2.901502847671509, + "learning_rate": 2.9756296707386566e-05, + "loss": 1.7294, + "step": 29505 + }, + { + "epoch": 0.05748581354499745, + "grad_norm": 1.9630255699157715, + "learning_rate": 2.9756049526560995e-05, + "loss": 1.8477, + "step": 29520 + }, + { + "epoch": 0.057515023816107715, + "grad_norm": 2.449836492538452, + "learning_rate": 2.975580222147265e-05, + "loss": 1.9282, + "step": 29535 + }, + { + "epoch": 0.057544234087217976, + "grad_norm": 2.7461347579956055, + "learning_rate": 2.9755554792123617e-05, + "loss": 1.8883, + "step": 29550 + }, + { + "epoch": 0.05757344435832824, + "grad_norm": 3.3567450046539307, + "learning_rate": 2.9755307238515986e-05, + "loss": 1.9717, + "step": 29565 + }, + { + "epoch": 0.0576026546294385, + "grad_norm": 3.6341872215270996, + "learning_rate": 2.975505956065184e-05, + "loss": 1.9378, + "step": 29580 + }, + { + "epoch": 0.057631864900548765, + "grad_norm": 4.170873165130615, + "learning_rate": 2.9754811758533253e-05, + "loss": 1.9077, + "step": 29595 + }, + { + "epoch": 0.057661075171659026, + "grad_norm": 1.9185329675674438, + "learning_rate": 2.975456383216233e-05, + "loss": 1.7987, + "step": 29610 + }, + { + "epoch": 0.05769028544276929, + "grad_norm": 2.3139781951904297, + "learning_rate": 2.9754315781541144e-05, + "loss": 1.9518, + "step": 29625 + }, + { + "epoch": 0.057719495713879555, + "grad_norm": 3.581907033920288, + "learning_rate": 2.9754067606671794e-05, + "loss": 1.8729, + "step": 29640 + }, + { + "epoch": 0.057748705984989816, + "grad_norm": 2.490278720855713, + "learning_rate": 2.975381930755636e-05, + "loss": 1.7576, + "step": 29655 + }, + { + "epoch": 0.05777791625610008, + "grad_norm": 3.5740153789520264, + "learning_rate": 2.9753570884196942e-05, + "loss": 1.9749, + "step": 29670 + }, + { + "epoch": 0.05780712652721034, + "grad_norm": 3.012131929397583, + "learning_rate": 2.975332233659563e-05, + "loss": 1.8704, + "step": 29685 + }, + { + "epoch": 0.057836336798320606, + "grad_norm": 4.225703239440918, + "learning_rate": 2.9753073664754514e-05, + "loss": 1.6134, + "step": 29700 + }, + { + "epoch": 0.05786554706943087, + "grad_norm": 3.840487480163574, + "learning_rate": 2.9752824868675693e-05, + "loss": 1.822, + "step": 29715 + }, + { + "epoch": 0.05789475734054113, + "grad_norm": 2.8694260120391846, + "learning_rate": 2.975257594836125e-05, + "loss": 1.939, + "step": 29730 + }, + { + "epoch": 0.057923967611651396, + "grad_norm": 3.4660465717315674, + "learning_rate": 2.97523269038133e-05, + "loss": 1.9517, + "step": 29745 + }, + { + "epoch": 0.05795317788276166, + "grad_norm": 3.125666379928589, + "learning_rate": 2.9752077735033924e-05, + "loss": 1.7034, + "step": 29760 + }, + { + "epoch": 0.05798238815387192, + "grad_norm": 4.895527362823486, + "learning_rate": 2.975182844202523e-05, + "loss": 1.8021, + "step": 29775 + }, + { + "epoch": 0.05801159842498218, + "grad_norm": 3.4364778995513916, + "learning_rate": 2.9751579024789314e-05, + "loss": 1.9357, + "step": 29790 + }, + { + "epoch": 0.05804080869609245, + "grad_norm": 3.209791421890259, + "learning_rate": 2.9751329483328276e-05, + "loss": 1.8969, + "step": 29805 + }, + { + "epoch": 0.05807001896720271, + "grad_norm": 2.851810932159424, + "learning_rate": 2.9751079817644217e-05, + "loss": 1.8274, + "step": 29820 + }, + { + "epoch": 0.05809922923831297, + "grad_norm": 1.9028266668319702, + "learning_rate": 2.975083002773924e-05, + "loss": 1.7498, + "step": 29835 + }, + { + "epoch": 0.05812843950942324, + "grad_norm": 4.442659378051758, + "learning_rate": 2.9750580113615448e-05, + "loss": 1.906, + "step": 29850 + }, + { + "epoch": 0.0581576497805335, + "grad_norm": 4.513755798339844, + "learning_rate": 2.9750330075274948e-05, + "loss": 1.7855, + "step": 29865 + }, + { + "epoch": 0.05818686005164376, + "grad_norm": 2.4421234130859375, + "learning_rate": 2.975007991271984e-05, + "loss": 1.8355, + "step": 29880 + }, + { + "epoch": 0.05821607032275402, + "grad_norm": 2.0383033752441406, + "learning_rate": 2.974982962595224e-05, + "loss": 1.8456, + "step": 29895 + }, + { + "epoch": 0.05824528059386429, + "grad_norm": 2.9908933639526367, + "learning_rate": 2.9749579214974245e-05, + "loss": 1.7223, + "step": 29910 + }, + { + "epoch": 0.05827449086497455, + "grad_norm": 3.1746137142181396, + "learning_rate": 2.9749328679787976e-05, + "loss": 1.8949, + "step": 29925 + }, + { + "epoch": 0.05830370113608481, + "grad_norm": 3.351712942123413, + "learning_rate": 2.9749078020395526e-05, + "loss": 1.8423, + "step": 29940 + }, + { + "epoch": 0.05833291140719508, + "grad_norm": 4.363383769989014, + "learning_rate": 2.9748827236799024e-05, + "loss": 2.0718, + "step": 29955 + }, + { + "epoch": 0.05836212167830534, + "grad_norm": 3.5311388969421387, + "learning_rate": 2.974857632900057e-05, + "loss": 1.8393, + "step": 29970 + }, + { + "epoch": 0.0583913319494156, + "grad_norm": 2.739178419113159, + "learning_rate": 2.974832529700228e-05, + "loss": 1.901, + "step": 29985 + }, + { + "epoch": 0.05842054222052586, + "grad_norm": 5.376321315765381, + "learning_rate": 2.974807414080627e-05, + "loss": 1.9742, + "step": 30000 + }, + { + "epoch": 0.05844975249163613, + "grad_norm": 4.261213302612305, + "learning_rate": 2.9747822860414652e-05, + "loss": 1.747, + "step": 30015 + }, + { + "epoch": 0.05847896276274639, + "grad_norm": 2.775156259536743, + "learning_rate": 2.9747571455829544e-05, + "loss": 1.8235, + "step": 30030 + }, + { + "epoch": 0.05850817303385665, + "grad_norm": 4.271034240722656, + "learning_rate": 2.9747319927053066e-05, + "loss": 1.834, + "step": 30045 + }, + { + "epoch": 0.05853738330496691, + "grad_norm": 3.628793239593506, + "learning_rate": 2.9747068274087327e-05, + "loss": 2.0451, + "step": 30060 + }, + { + "epoch": 0.05856659357607718, + "grad_norm": 2.388171434402466, + "learning_rate": 2.9746816496934452e-05, + "loss": 1.9012, + "step": 30075 + }, + { + "epoch": 0.05859580384718744, + "grad_norm": 5.260224342346191, + "learning_rate": 2.9746564595596566e-05, + "loss": 1.8394, + "step": 30090 + }, + { + "epoch": 0.0586250141182977, + "grad_norm": 3.47929310798645, + "learning_rate": 2.9746312570075786e-05, + "loss": 1.7858, + "step": 30105 + }, + { + "epoch": 0.05865422438940797, + "grad_norm": 2.082864761352539, + "learning_rate": 2.974606042037423e-05, + "loss": 1.8976, + "step": 30120 + }, + { + "epoch": 0.05868343466051823, + "grad_norm": 2.003638982772827, + "learning_rate": 2.9745808146494027e-05, + "loss": 1.7789, + "step": 30135 + }, + { + "epoch": 0.05871264493162849, + "grad_norm": 3.8583061695098877, + "learning_rate": 2.9745555748437298e-05, + "loss": 1.9207, + "step": 30150 + }, + { + "epoch": 0.05874185520273875, + "grad_norm": 4.0564703941345215, + "learning_rate": 2.9745303226206172e-05, + "loss": 1.9422, + "step": 30165 + }, + { + "epoch": 0.05877106547384902, + "grad_norm": 1.928350806236267, + "learning_rate": 2.974505057980277e-05, + "loss": 1.8624, + "step": 30180 + }, + { + "epoch": 0.05880027574495928, + "grad_norm": 2.1937801837921143, + "learning_rate": 2.974479780922923e-05, + "loss": 1.9617, + "step": 30195 + }, + { + "epoch": 0.05882948601606954, + "grad_norm": 4.145959854125977, + "learning_rate": 2.974454491448767e-05, + "loss": 1.8212, + "step": 30210 + }, + { + "epoch": 0.05885869628717981, + "grad_norm": 4.780306816101074, + "learning_rate": 2.9744291895580222e-05, + "loss": 1.7573, + "step": 30225 + }, + { + "epoch": 0.05888790655829007, + "grad_norm": 3.6125333309173584, + "learning_rate": 2.9744038752509016e-05, + "loss": 1.9406, + "step": 30240 + }, + { + "epoch": 0.05891711682940033, + "grad_norm": 2.673916816711426, + "learning_rate": 2.974378548527619e-05, + "loss": 1.8403, + "step": 30255 + }, + { + "epoch": 0.05894632710051059, + "grad_norm": 2.3269338607788086, + "learning_rate": 2.9743532093883877e-05, + "loss": 2.1496, + "step": 30270 + }, + { + "epoch": 0.05897553737162086, + "grad_norm": 4.606401443481445, + "learning_rate": 2.9743278578334205e-05, + "loss": 1.7679, + "step": 30285 + }, + { + "epoch": 0.05900474764273112, + "grad_norm": 4.217224597930908, + "learning_rate": 2.9743024938629303e-05, + "loss": 1.8091, + "step": 30300 + }, + { + "epoch": 0.05903395791384138, + "grad_norm": 2.7839462757110596, + "learning_rate": 2.974277117477132e-05, + "loss": 1.9216, + "step": 30315 + }, + { + "epoch": 0.05906316818495165, + "grad_norm": 2.616410970687866, + "learning_rate": 2.9742517286762386e-05, + "loss": 1.7535, + "step": 30330 + }, + { + "epoch": 0.05909237845606191, + "grad_norm": 2.646209239959717, + "learning_rate": 2.974226327460464e-05, + "loss": 1.7763, + "step": 30345 + }, + { + "epoch": 0.05912158872717217, + "grad_norm": 4.532891750335693, + "learning_rate": 2.9742009138300225e-05, + "loss": 1.9677, + "step": 30360 + }, + { + "epoch": 0.05915079899828243, + "grad_norm": 3.891831874847412, + "learning_rate": 2.9741754877851277e-05, + "loss": 1.8411, + "step": 30375 + }, + { + "epoch": 0.0591800092693927, + "grad_norm": 2.234100580215454, + "learning_rate": 2.9741500493259933e-05, + "loss": 1.77, + "step": 30390 + }, + { + "epoch": 0.05920921954050296, + "grad_norm": 3.4290971755981445, + "learning_rate": 2.9741245984528342e-05, + "loss": 1.9234, + "step": 30405 + }, + { + "epoch": 0.05923842981161322, + "grad_norm": 2.175567150115967, + "learning_rate": 2.974099135165865e-05, + "loss": 2.1054, + "step": 30420 + }, + { + "epoch": 0.05926764008272349, + "grad_norm": 2.412052631378174, + "learning_rate": 2.9740736594652992e-05, + "loss": 1.8607, + "step": 30435 + }, + { + "epoch": 0.05929685035383375, + "grad_norm": 2.27789044380188, + "learning_rate": 2.974048171351352e-05, + "loss": 1.9491, + "step": 30450 + }, + { + "epoch": 0.05932606062494401, + "grad_norm": 3.9195199012756348, + "learning_rate": 2.9740226708242375e-05, + "loss": 1.8516, + "step": 30465 + }, + { + "epoch": 0.059355270896054274, + "grad_norm": 2.3743503093719482, + "learning_rate": 2.973997157884171e-05, + "loss": 2.1744, + "step": 30480 + }, + { + "epoch": 0.05938448116716454, + "grad_norm": 4.110390663146973, + "learning_rate": 2.9739716325313676e-05, + "loss": 1.8803, + "step": 30495 + }, + { + "epoch": 0.0594136914382748, + "grad_norm": 3.738240957260132, + "learning_rate": 2.973946094766041e-05, + "loss": 1.7978, + "step": 30510 + }, + { + "epoch": 0.059442901709385064, + "grad_norm": 2.840871572494507, + "learning_rate": 2.9739205445884075e-05, + "loss": 1.8413, + "step": 30525 + }, + { + "epoch": 0.059472111980495325, + "grad_norm": 2.931602954864502, + "learning_rate": 2.9738949819986814e-05, + "loss": 1.9035, + "step": 30540 + }, + { + "epoch": 0.05950132225160559, + "grad_norm": 2.814918041229248, + "learning_rate": 2.9738694069970787e-05, + "loss": 1.9169, + "step": 30555 + }, + { + "epoch": 0.059530532522715854, + "grad_norm": 4.824680805206299, + "learning_rate": 2.9738438195838145e-05, + "loss": 1.8696, + "step": 30570 + }, + { + "epoch": 0.059559742793826115, + "grad_norm": 2.1868393421173096, + "learning_rate": 2.9738182197591038e-05, + "loss": 1.8737, + "step": 30585 + }, + { + "epoch": 0.05958895306493638, + "grad_norm": 4.172234535217285, + "learning_rate": 2.973792607523163e-05, + "loss": 1.8148, + "step": 30600 + }, + { + "epoch": 0.059618163336046644, + "grad_norm": 4.052221775054932, + "learning_rate": 2.9737669828762074e-05, + "loss": 1.8824, + "step": 30615 + }, + { + "epoch": 0.059647373607156905, + "grad_norm": 3.6041007041931152, + "learning_rate": 2.9737413458184522e-05, + "loss": 1.8255, + "step": 30630 + }, + { + "epoch": 0.059676583878267166, + "grad_norm": 4.06122350692749, + "learning_rate": 2.9737156963501143e-05, + "loss": 1.951, + "step": 30645 + }, + { + "epoch": 0.059705794149377434, + "grad_norm": 2.3008453845977783, + "learning_rate": 2.973690034471409e-05, + "loss": 1.758, + "step": 30660 + }, + { + "epoch": 0.059735004420487695, + "grad_norm": 3.7912003993988037, + "learning_rate": 2.9736643601825532e-05, + "loss": 1.8688, + "step": 30675 + }, + { + "epoch": 0.059764214691597956, + "grad_norm": 3.7779648303985596, + "learning_rate": 2.9736386734837616e-05, + "loss": 1.7613, + "step": 30690 + }, + { + "epoch": 0.05979342496270822, + "grad_norm": 4.632726669311523, + "learning_rate": 2.9736129743752523e-05, + "loss": 1.6212, + "step": 30705 + }, + { + "epoch": 0.059822635233818484, + "grad_norm": 4.628555774688721, + "learning_rate": 2.9735872628572404e-05, + "loss": 1.8643, + "step": 30720 + }, + { + "epoch": 0.059851845504928745, + "grad_norm": 3.8072681427001953, + "learning_rate": 2.9735615389299434e-05, + "loss": 1.7805, + "step": 30735 + }, + { + "epoch": 0.059881055776039006, + "grad_norm": 3.9826717376708984, + "learning_rate": 2.973535802593577e-05, + "loss": 1.8594, + "step": 30750 + }, + { + "epoch": 0.059910266047149274, + "grad_norm": 3.04962420463562, + "learning_rate": 2.9735100538483587e-05, + "loss": 1.9104, + "step": 30765 + }, + { + "epoch": 0.059939476318259535, + "grad_norm": 2.8500261306762695, + "learning_rate": 2.9734842926945048e-05, + "loss": 1.717, + "step": 30780 + }, + { + "epoch": 0.059968686589369796, + "grad_norm": 1.5368775129318237, + "learning_rate": 2.9734585191322324e-05, + "loss": 1.8905, + "step": 30795 + }, + { + "epoch": 0.059997896860480064, + "grad_norm": 3.886584997177124, + "learning_rate": 2.9734327331617588e-05, + "loss": 1.804, + "step": 30810 + }, + { + "epoch": 0.060027107131590325, + "grad_norm": 3.792663812637329, + "learning_rate": 2.9734069347833005e-05, + "loss": 1.9667, + "step": 30825 + }, + { + "epoch": 0.060056317402700586, + "grad_norm": 3.7504067420959473, + "learning_rate": 2.9733811239970756e-05, + "loss": 1.9053, + "step": 30840 + }, + { + "epoch": 0.06008552767381085, + "grad_norm": 2.617370367050171, + "learning_rate": 2.973355300803301e-05, + "loss": 1.8853, + "step": 30855 + }, + { + "epoch": 0.060114737944921115, + "grad_norm": 3.2594456672668457, + "learning_rate": 2.973329465202194e-05, + "loss": 1.7045, + "step": 30870 + }, + { + "epoch": 0.060143948216031376, + "grad_norm": 1.8166978359222412, + "learning_rate": 2.9733036171939723e-05, + "loss": 1.8431, + "step": 30885 + }, + { + "epoch": 0.06017315848714164, + "grad_norm": 2.9280359745025635, + "learning_rate": 2.973277756778854e-05, + "loss": 1.9088, + "step": 30900 + }, + { + "epoch": 0.060202368758251905, + "grad_norm": 4.446992874145508, + "learning_rate": 2.973251883957056e-05, + "loss": 1.7744, + "step": 30915 + }, + { + "epoch": 0.060231579029362166, + "grad_norm": 4.279955863952637, + "learning_rate": 2.9732259987287967e-05, + "loss": 1.8309, + "step": 30930 + }, + { + "epoch": 0.06026078930047243, + "grad_norm": 2.3727381229400635, + "learning_rate": 2.9732001010942946e-05, + "loss": 1.9247, + "step": 30945 + }, + { + "epoch": 0.06028999957158269, + "grad_norm": 2.6014645099639893, + "learning_rate": 2.9731741910537668e-05, + "loss": 1.7653, + "step": 30960 + }, + { + "epoch": 0.060319209842692956, + "grad_norm": 2.387563467025757, + "learning_rate": 2.973148268607432e-05, + "loss": 1.7814, + "step": 30975 + }, + { + "epoch": 0.06034842011380322, + "grad_norm": 3.4683475494384766, + "learning_rate": 2.9731223337555088e-05, + "loss": 1.8522, + "step": 30990 + }, + { + "epoch": 0.06037763038491348, + "grad_norm": 2.352069854736328, + "learning_rate": 2.973096386498215e-05, + "loss": 2.0015, + "step": 31005 + }, + { + "epoch": 0.06040684065602374, + "grad_norm": 3.2257015705108643, + "learning_rate": 2.9730704268357694e-05, + "loss": 1.8176, + "step": 31020 + }, + { + "epoch": 0.06043605092713401, + "grad_norm": 2.4939286708831787, + "learning_rate": 2.97304445476839e-05, + "loss": 1.7388, + "step": 31035 + }, + { + "epoch": 0.06046526119824427, + "grad_norm": 3.216533660888672, + "learning_rate": 2.9730184702962968e-05, + "loss": 1.859, + "step": 31050 + }, + { + "epoch": 0.06049447146935453, + "grad_norm": 3.4212570190429688, + "learning_rate": 2.9729924734197077e-05, + "loss": 1.7748, + "step": 31065 + }, + { + "epoch": 0.060523681740464796, + "grad_norm": 3.6867172718048096, + "learning_rate": 2.9729664641388417e-05, + "loss": 2.0345, + "step": 31080 + }, + { + "epoch": 0.06055289201157506, + "grad_norm": 3.243898630142212, + "learning_rate": 2.9729404424539183e-05, + "loss": 1.9114, + "step": 31095 + }, + { + "epoch": 0.06058210228268532, + "grad_norm": 2.8866801261901855, + "learning_rate": 2.9729144083651554e-05, + "loss": 1.9833, + "step": 31110 + }, + { + "epoch": 0.06061131255379558, + "grad_norm": 3.0051145553588867, + "learning_rate": 2.9728883618727738e-05, + "loss": 2.026, + "step": 31125 + }, + { + "epoch": 0.06064052282490585, + "grad_norm": 3.0416595935821533, + "learning_rate": 2.972862302976992e-05, + "loss": 1.8541, + "step": 31140 + }, + { + "epoch": 0.06066973309601611, + "grad_norm": 2.835458517074585, + "learning_rate": 2.9728362316780294e-05, + "loss": 1.8636, + "step": 31155 + }, + { + "epoch": 0.06069894336712637, + "grad_norm": 2.40006947517395, + "learning_rate": 2.972810147976106e-05, + "loss": 1.9535, + "step": 31170 + }, + { + "epoch": 0.06072815363823664, + "grad_norm": 2.7326056957244873, + "learning_rate": 2.9727840518714415e-05, + "loss": 1.7814, + "step": 31185 + }, + { + "epoch": 0.0607573639093469, + "grad_norm": 4.403411388397217, + "learning_rate": 2.972757943364255e-05, + "loss": 1.7441, + "step": 31200 + }, + { + "epoch": 0.06078657418045716, + "grad_norm": 2.259716272354126, + "learning_rate": 2.9727318224547667e-05, + "loss": 1.8447, + "step": 31215 + }, + { + "epoch": 0.06081578445156742, + "grad_norm": 2.397702217102051, + "learning_rate": 2.9727056891431966e-05, + "loss": 1.7677, + "step": 31230 + }, + { + "epoch": 0.06084499472267769, + "grad_norm": 3.840811014175415, + "learning_rate": 2.9726795434297646e-05, + "loss": 1.7271, + "step": 31245 + }, + { + "epoch": 0.06087420499378795, + "grad_norm": 2.695345163345337, + "learning_rate": 2.9726533853146913e-05, + "loss": 1.9575, + "step": 31260 + }, + { + "epoch": 0.06090341526489821, + "grad_norm": 2.330676555633545, + "learning_rate": 2.972627214798197e-05, + "loss": 1.9488, + "step": 31275 + }, + { + "epoch": 0.06093262553600848, + "grad_norm": 3.987534999847412, + "learning_rate": 2.9726010318805014e-05, + "loss": 1.8628, + "step": 31290 + }, + { + "epoch": 0.06096183580711874, + "grad_norm": 3.307753086090088, + "learning_rate": 2.9725748365618252e-05, + "loss": 1.9931, + "step": 31305 + }, + { + "epoch": 0.060991046078229, + "grad_norm": 4.703588485717773, + "learning_rate": 2.9725486288423894e-05, + "loss": 1.8416, + "step": 31320 + }, + { + "epoch": 0.06102025634933926, + "grad_norm": 2.01497220993042, + "learning_rate": 2.9725224087224146e-05, + "loss": 1.897, + "step": 31335 + }, + { + "epoch": 0.06104946662044953, + "grad_norm": 2.9679431915283203, + "learning_rate": 2.9724961762021215e-05, + "loss": 1.951, + "step": 31350 + }, + { + "epoch": 0.06107867689155979, + "grad_norm": 2.5453622341156006, + "learning_rate": 2.972469931281731e-05, + "loss": 1.8823, + "step": 31365 + }, + { + "epoch": 0.06110788716267005, + "grad_norm": 4.135504245758057, + "learning_rate": 2.9724436739614643e-05, + "loss": 1.9061, + "step": 31380 + }, + { + "epoch": 0.06113709743378032, + "grad_norm": 1.99344003200531, + "learning_rate": 2.9724174042415417e-05, + "loss": 1.986, + "step": 31395 + }, + { + "epoch": 0.06116630770489058, + "grad_norm": 4.097934722900391, + "learning_rate": 2.9723911221221857e-05, + "loss": 1.8222, + "step": 31410 + }, + { + "epoch": 0.06119551797600084, + "grad_norm": 4.836818695068359, + "learning_rate": 2.9723648276036165e-05, + "loss": 1.9038, + "step": 31425 + }, + { + "epoch": 0.0612247282471111, + "grad_norm": 2.4198572635650635, + "learning_rate": 2.972338520686056e-05, + "loss": 1.9228, + "step": 31440 + }, + { + "epoch": 0.06125393851822137, + "grad_norm": 4.546353816986084, + "learning_rate": 2.9723122013697265e-05, + "loss": 1.6837, + "step": 31455 + }, + { + "epoch": 0.06128314878933163, + "grad_norm": 3.9125819206237793, + "learning_rate": 2.972285869654848e-05, + "loss": 1.9057, + "step": 31470 + }, + { + "epoch": 0.06131235906044189, + "grad_norm": 2.7409276962280273, + "learning_rate": 2.9722595255416435e-05, + "loss": 1.6805, + "step": 31485 + }, + { + "epoch": 0.06134156933155216, + "grad_norm": 3.4497761726379395, + "learning_rate": 2.9722331690303344e-05, + "loss": 1.7006, + "step": 31500 + }, + { + "epoch": 0.06137077960266242, + "grad_norm": 3.8101327419281006, + "learning_rate": 2.9722068001211427e-05, + "loss": 1.8445, + "step": 31515 + }, + { + "epoch": 0.06139998987377268, + "grad_norm": 4.296467304229736, + "learning_rate": 2.9721804188142906e-05, + "loss": 1.9453, + "step": 31530 + }, + { + "epoch": 0.06142920014488294, + "grad_norm": 4.05743408203125, + "learning_rate": 2.9721540251100003e-05, + "loss": 1.6627, + "step": 31545 + }, + { + "epoch": 0.06145841041599321, + "grad_norm": 1.896415114402771, + "learning_rate": 2.9721276190084934e-05, + "loss": 2.046, + "step": 31560 + }, + { + "epoch": 0.06148762068710347, + "grad_norm": 3.3542966842651367, + "learning_rate": 2.9721012005099933e-05, + "loss": 1.7947, + "step": 31575 + }, + { + "epoch": 0.06151683095821373, + "grad_norm": 2.018230676651001, + "learning_rate": 2.9720747696147214e-05, + "loss": 1.8102, + "step": 31590 + }, + { + "epoch": 0.06154604122932399, + "grad_norm": 2.8734261989593506, + "learning_rate": 2.9720483263229012e-05, + "loss": 1.9102, + "step": 31605 + }, + { + "epoch": 0.06157525150043426, + "grad_norm": 2.8697266578674316, + "learning_rate": 2.972021870634755e-05, + "loss": 1.9179, + "step": 31620 + }, + { + "epoch": 0.06160446177154452, + "grad_norm": 2.5872962474823, + "learning_rate": 2.9719954025505054e-05, + "loss": 1.8555, + "step": 31635 + }, + { + "epoch": 0.06163367204265478, + "grad_norm": 3.8212106227874756, + "learning_rate": 2.9719689220703758e-05, + "loss": 1.7516, + "step": 31650 + }, + { + "epoch": 0.06166288231376505, + "grad_norm": 2.7884206771850586, + "learning_rate": 2.9719424291945885e-05, + "loss": 1.8213, + "step": 31665 + }, + { + "epoch": 0.06169209258487531, + "grad_norm": 2.5221076011657715, + "learning_rate": 2.971915923923367e-05, + "loss": 1.6586, + "step": 31680 + }, + { + "epoch": 0.06172130285598557, + "grad_norm": 2.4005963802337646, + "learning_rate": 2.971889406256935e-05, + "loss": 2.0582, + "step": 31695 + }, + { + "epoch": 0.061750513127095834, + "grad_norm": 2.02565860748291, + "learning_rate": 2.9718628761955146e-05, + "loss": 1.8941, + "step": 31710 + }, + { + "epoch": 0.0617797233982061, + "grad_norm": 3.2867822647094727, + "learning_rate": 2.9718363337393303e-05, + "loss": 1.7149, + "step": 31725 + }, + { + "epoch": 0.06180893366931636, + "grad_norm": 2.4237873554229736, + "learning_rate": 2.9718097788886054e-05, + "loss": 1.8685, + "step": 31740 + }, + { + "epoch": 0.061838143940426624, + "grad_norm": 2.000887155532837, + "learning_rate": 2.9717832116435632e-05, + "loss": 1.7489, + "step": 31755 + }, + { + "epoch": 0.06186735421153689, + "grad_norm": 2.2382023334503174, + "learning_rate": 2.9717566320044276e-05, + "loss": 1.8845, + "step": 31770 + }, + { + "epoch": 0.06189656448264715, + "grad_norm": 3.9158990383148193, + "learning_rate": 2.9717300399714222e-05, + "loss": 1.689, + "step": 31785 + }, + { + "epoch": 0.061925774753757414, + "grad_norm": 4.004560470581055, + "learning_rate": 2.9717034355447713e-05, + "loss": 1.9279, + "step": 31800 + }, + { + "epoch": 0.061954985024867675, + "grad_norm": 2.9310123920440674, + "learning_rate": 2.9716768187246986e-05, + "loss": 1.771, + "step": 31815 + }, + { + "epoch": 0.06198419529597794, + "grad_norm": 2.5769150257110596, + "learning_rate": 2.9716501895114287e-05, + "loss": 1.9365, + "step": 31830 + }, + { + "epoch": 0.0620134055670882, + "grad_norm": 3.767500162124634, + "learning_rate": 2.9716235479051858e-05, + "loss": 1.8358, + "step": 31845 + }, + { + "epoch": 0.062042615838198464, + "grad_norm": 4.410051345825195, + "learning_rate": 2.9715968939061932e-05, + "loss": 1.8382, + "step": 31860 + }, + { + "epoch": 0.06207182610930873, + "grad_norm": 2.5900702476501465, + "learning_rate": 2.9715702275146767e-05, + "loss": 1.8717, + "step": 31875 + }, + { + "epoch": 0.06210103638041899, + "grad_norm": 3.7013213634490967, + "learning_rate": 2.9715435487308608e-05, + "loss": 1.8794, + "step": 31890 + }, + { + "epoch": 0.062130246651529254, + "grad_norm": 2.48606276512146, + "learning_rate": 2.9715168575549688e-05, + "loss": 1.9964, + "step": 31905 + }, + { + "epoch": 0.062159456922639515, + "grad_norm": 2.358415365219116, + "learning_rate": 2.9714901539872267e-05, + "loss": 1.902, + "step": 31920 + }, + { + "epoch": 0.06218866719374978, + "grad_norm": 3.140083074569702, + "learning_rate": 2.9714634380278594e-05, + "loss": 1.8696, + "step": 31935 + }, + { + "epoch": 0.062217877464860044, + "grad_norm": 3.2135777473449707, + "learning_rate": 2.9714367096770913e-05, + "loss": 2.0002, + "step": 31950 + }, + { + "epoch": 0.062247087735970305, + "grad_norm": 3.2304131984710693, + "learning_rate": 2.9714099689351477e-05, + "loss": 1.8721, + "step": 31965 + }, + { + "epoch": 0.06227629800708057, + "grad_norm": 2.206209659576416, + "learning_rate": 2.9713832158022535e-05, + "loss": 1.9788, + "step": 31980 + }, + { + "epoch": 0.062305508278190834, + "grad_norm": 3.055828332901001, + "learning_rate": 2.9713564502786348e-05, + "loss": 1.7976, + "step": 31995 + }, + { + "epoch": 0.062334718549301095, + "grad_norm": 3.891845226287842, + "learning_rate": 2.9713296723645165e-05, + "loss": 1.7772, + "step": 32010 + }, + { + "epoch": 0.062363928820411356, + "grad_norm": 1.8532415628433228, + "learning_rate": 2.9713028820601238e-05, + "loss": 1.9817, + "step": 32025 + }, + { + "epoch": 0.062393139091521624, + "grad_norm": 1.650211215019226, + "learning_rate": 2.9712760793656826e-05, + "loss": 2.1243, + "step": 32040 + }, + { + "epoch": 0.062422349362631885, + "grad_norm": 2.663621187210083, + "learning_rate": 2.971249264281419e-05, + "loss": 2.0554, + "step": 32055 + }, + { + "epoch": 0.062451559633742146, + "grad_norm": 3.2437026500701904, + "learning_rate": 2.9712224368075578e-05, + "loss": 1.765, + "step": 32070 + }, + { + "epoch": 0.06248076990485241, + "grad_norm": 4.013802528381348, + "learning_rate": 2.971195596944326e-05, + "loss": 1.8195, + "step": 32085 + }, + { + "epoch": 0.06250998017596267, + "grad_norm": 4.0295610427856445, + "learning_rate": 2.971168744691949e-05, + "loss": 2.036, + "step": 32100 + }, + { + "epoch": 0.06253919044707294, + "grad_norm": 2.830064296722412, + "learning_rate": 2.9711418800506533e-05, + "loss": 1.8499, + "step": 32115 + }, + { + "epoch": 0.0625684007181832, + "grad_norm": 3.896538019180298, + "learning_rate": 2.9711150030206643e-05, + "loss": 2.0089, + "step": 32130 + }, + { + "epoch": 0.06259761098929346, + "grad_norm": 3.6802754402160645, + "learning_rate": 2.97108811360221e-05, + "loss": 1.9039, + "step": 32145 + }, + { + "epoch": 0.06262682126040373, + "grad_norm": 2.8547956943511963, + "learning_rate": 2.9710612117955148e-05, + "loss": 1.9065, + "step": 32160 + }, + { + "epoch": 0.06265603153151399, + "grad_norm": 2.0819807052612305, + "learning_rate": 2.9710342976008066e-05, + "loss": 1.7786, + "step": 32175 + }, + { + "epoch": 0.06268524180262425, + "grad_norm": 2.173609972000122, + "learning_rate": 2.9710073710183118e-05, + "loss": 1.8011, + "step": 32190 + }, + { + "epoch": 0.06271445207373451, + "grad_norm": 2.2233803272247314, + "learning_rate": 2.970980432048257e-05, + "loss": 1.8308, + "step": 32205 + }, + { + "epoch": 0.06274366234484478, + "grad_norm": 2.227229356765747, + "learning_rate": 2.9709534806908692e-05, + "loss": 1.9227, + "step": 32220 + }, + { + "epoch": 0.06277287261595504, + "grad_norm": 3.8464114665985107, + "learning_rate": 2.970926516946375e-05, + "loss": 1.9808, + "step": 32235 + }, + { + "epoch": 0.0628020828870653, + "grad_norm": 4.511869430541992, + "learning_rate": 2.9708995408150018e-05, + "loss": 1.9856, + "step": 32250 + }, + { + "epoch": 0.06283129315817557, + "grad_norm": 2.0209896564483643, + "learning_rate": 2.9708725522969767e-05, + "loss": 1.8766, + "step": 32265 + }, + { + "epoch": 0.06286050342928583, + "grad_norm": 4.369567394256592, + "learning_rate": 2.9708455513925273e-05, + "loss": 1.8103, + "step": 32280 + }, + { + "epoch": 0.06288971370039609, + "grad_norm": 4.502676486968994, + "learning_rate": 2.97081853810188e-05, + "loss": 1.8489, + "step": 32295 + }, + { + "epoch": 0.06291892397150635, + "grad_norm": 2.288998603820801, + "learning_rate": 2.9707915124252637e-05, + "loss": 1.9542, + "step": 32310 + }, + { + "epoch": 0.06294813424261662, + "grad_norm": 2.6359355449676514, + "learning_rate": 2.9707644743629046e-05, + "loss": 1.7883, + "step": 32325 + }, + { + "epoch": 0.06297734451372689, + "grad_norm": 4.356678485870361, + "learning_rate": 2.970737423915031e-05, + "loss": 1.8937, + "step": 32340 + }, + { + "epoch": 0.06300655478483715, + "grad_norm": 3.2190353870391846, + "learning_rate": 2.970710361081871e-05, + "loss": 1.9758, + "step": 32355 + }, + { + "epoch": 0.06303576505594741, + "grad_norm": 2.522303819656372, + "learning_rate": 2.970683285863652e-05, + "loss": 1.7315, + "step": 32370 + }, + { + "epoch": 0.06306497532705767, + "grad_norm": 2.500927448272705, + "learning_rate": 2.9706561982606023e-05, + "loss": 1.9004, + "step": 32385 + }, + { + "epoch": 0.06309418559816793, + "grad_norm": 3.656970262527466, + "learning_rate": 2.97062909827295e-05, + "loss": 1.9246, + "step": 32400 + }, + { + "epoch": 0.06312339586927819, + "grad_norm": 4.1837158203125, + "learning_rate": 2.970601985900923e-05, + "loss": 1.6453, + "step": 32415 + }, + { + "epoch": 0.06315260614038845, + "grad_norm": 5.141902446746826, + "learning_rate": 2.9705748611447498e-05, + "loss": 1.7215, + "step": 32430 + }, + { + "epoch": 0.06318181641149873, + "grad_norm": 5.384919166564941, + "learning_rate": 2.9705477240046595e-05, + "loss": 2.0638, + "step": 32445 + }, + { + "epoch": 0.06321102668260899, + "grad_norm": 2.812014579772949, + "learning_rate": 2.9705205744808795e-05, + "loss": 1.8133, + "step": 32460 + }, + { + "epoch": 0.06324023695371925, + "grad_norm": 4.618744850158691, + "learning_rate": 2.970493412573639e-05, + "loss": 1.7221, + "step": 32475 + }, + { + "epoch": 0.06326944722482951, + "grad_norm": 2.0948140621185303, + "learning_rate": 2.9704662382831665e-05, + "loss": 1.7292, + "step": 32490 + }, + { + "epoch": 0.06329865749593977, + "grad_norm": 2.2021284103393555, + "learning_rate": 2.970439051609691e-05, + "loss": 1.6685, + "step": 32505 + }, + { + "epoch": 0.06332786776705003, + "grad_norm": 4.773664474487305, + "learning_rate": 2.9704118525534414e-05, + "loss": 1.8922, + "step": 32520 + }, + { + "epoch": 0.06335707803816029, + "grad_norm": 2.716444253921509, + "learning_rate": 2.970384641114647e-05, + "loss": 1.816, + "step": 32535 + }, + { + "epoch": 0.06338628830927057, + "grad_norm": 4.086103916168213, + "learning_rate": 2.9703574172935366e-05, + "loss": 1.8084, + "step": 32550 + }, + { + "epoch": 0.06341549858038083, + "grad_norm": 2.4854936599731445, + "learning_rate": 2.9703301810903397e-05, + "loss": 1.7871, + "step": 32565 + }, + { + "epoch": 0.06344470885149109, + "grad_norm": 1.8130111694335938, + "learning_rate": 2.9703029325052857e-05, + "loss": 1.8795, + "step": 32580 + }, + { + "epoch": 0.06347391912260135, + "grad_norm": 3.8686211109161377, + "learning_rate": 2.970275671538604e-05, + "loss": 1.9867, + "step": 32595 + }, + { + "epoch": 0.06350312939371161, + "grad_norm": 2.5253164768218994, + "learning_rate": 2.9702483981905233e-05, + "loss": 1.875, + "step": 32610 + }, + { + "epoch": 0.06353233966482187, + "grad_norm": 3.0097036361694336, + "learning_rate": 2.9702211124612745e-05, + "loss": 1.9521, + "step": 32625 + }, + { + "epoch": 0.06356154993593213, + "grad_norm": 3.3433878421783447, + "learning_rate": 2.9701938143510873e-05, + "loss": 1.806, + "step": 32640 + }, + { + "epoch": 0.06359076020704241, + "grad_norm": 1.4688777923583984, + "learning_rate": 2.970166503860191e-05, + "loss": 1.8248, + "step": 32655 + }, + { + "epoch": 0.06361997047815267, + "grad_norm": 2.365694522857666, + "learning_rate": 2.9701391809888156e-05, + "loss": 1.7152, + "step": 32670 + }, + { + "epoch": 0.06364918074926293, + "grad_norm": 2.6821203231811523, + "learning_rate": 2.9701118457371915e-05, + "loss": 1.858, + "step": 32685 + }, + { + "epoch": 0.06367839102037319, + "grad_norm": 1.842124342918396, + "learning_rate": 2.9700844981055486e-05, + "loss": 1.7689, + "step": 32700 + }, + { + "epoch": 0.06370760129148345, + "grad_norm": 2.2430474758148193, + "learning_rate": 2.9700571380941178e-05, + "loss": 2.0056, + "step": 32715 + }, + { + "epoch": 0.06373681156259371, + "grad_norm": 3.1308460235595703, + "learning_rate": 2.970029765703129e-05, + "loss": 1.9163, + "step": 32730 + }, + { + "epoch": 0.06376602183370397, + "grad_norm": 2.6456706523895264, + "learning_rate": 2.9700023809328122e-05, + "loss": 1.8002, + "step": 32745 + }, + { + "epoch": 0.06379523210481425, + "grad_norm": 3.659024953842163, + "learning_rate": 2.969974983783399e-05, + "loss": 1.9222, + "step": 32760 + }, + { + "epoch": 0.06382444237592451, + "grad_norm": 2.3271381855010986, + "learning_rate": 2.96994757425512e-05, + "loss": 1.7778, + "step": 32775 + }, + { + "epoch": 0.06385365264703477, + "grad_norm": 3.411533832550049, + "learning_rate": 2.9699201523482057e-05, + "loss": 1.778, + "step": 32790 + }, + { + "epoch": 0.06388286291814503, + "grad_norm": 2.649052143096924, + "learning_rate": 2.9698927180628866e-05, + "loss": 1.8199, + "step": 32805 + }, + { + "epoch": 0.06391207318925529, + "grad_norm": 2.8163363933563232, + "learning_rate": 2.9698652713993947e-05, + "loss": 1.8579, + "step": 32820 + }, + { + "epoch": 0.06394128346036555, + "grad_norm": 3.319955348968506, + "learning_rate": 2.9698378123579603e-05, + "loss": 1.9385, + "step": 32835 + }, + { + "epoch": 0.06397049373147581, + "grad_norm": 2.217761754989624, + "learning_rate": 2.969810340938815e-05, + "loss": 2.0007, + "step": 32850 + }, + { + "epoch": 0.06399970400258609, + "grad_norm": 4.801873683929443, + "learning_rate": 2.96978285714219e-05, + "loss": 1.9876, + "step": 32865 + }, + { + "epoch": 0.06402891427369635, + "grad_norm": 3.5146069526672363, + "learning_rate": 2.969755360968317e-05, + "loss": 1.8267, + "step": 32880 + }, + { + "epoch": 0.06405812454480661, + "grad_norm": 3.407545328140259, + "learning_rate": 2.9697278524174275e-05, + "loss": 1.743, + "step": 32895 + }, + { + "epoch": 0.06408733481591687, + "grad_norm": 1.410946011543274, + "learning_rate": 2.969700331489753e-05, + "loss": 1.9281, + "step": 32910 + }, + { + "epoch": 0.06411654508702713, + "grad_norm": 2.602207899093628, + "learning_rate": 2.9696727981855253e-05, + "loss": 1.8532, + "step": 32925 + }, + { + "epoch": 0.0641457553581374, + "grad_norm": 4.358025550842285, + "learning_rate": 2.969645252504976e-05, + "loss": 1.7457, + "step": 32940 + }, + { + "epoch": 0.06417496562924765, + "grad_norm": 3.8209457397460938, + "learning_rate": 2.9696176944483373e-05, + "loss": 2.0736, + "step": 32955 + }, + { + "epoch": 0.06420417590035793, + "grad_norm": 3.317567825317383, + "learning_rate": 2.9695901240158415e-05, + "loss": 1.9322, + "step": 32970 + }, + { + "epoch": 0.06423338617146819, + "grad_norm": 4.668582439422607, + "learning_rate": 2.9695625412077208e-05, + "loss": 2.0152, + "step": 32985 + }, + { + "epoch": 0.06426259644257845, + "grad_norm": 2.6789300441741943, + "learning_rate": 2.969534946024207e-05, + "loss": 1.7129, + "step": 33000 + }, + { + "epoch": 0.06429180671368871, + "grad_norm": 2.073744297027588, + "learning_rate": 2.9695073384655326e-05, + "loss": 1.6952, + "step": 33015 + }, + { + "epoch": 0.06432101698479897, + "grad_norm": 3.3017473220825195, + "learning_rate": 2.9694797185319304e-05, + "loss": 1.8158, + "step": 33030 + }, + { + "epoch": 0.06435022725590923, + "grad_norm": 4.441155433654785, + "learning_rate": 2.969452086223633e-05, + "loss": 1.9204, + "step": 33045 + }, + { + "epoch": 0.0643794375270195, + "grad_norm": 2.9764716625213623, + "learning_rate": 2.9694244415408727e-05, + "loss": 1.8952, + "step": 33060 + }, + { + "epoch": 0.06440864779812977, + "grad_norm": 4.1646575927734375, + "learning_rate": 2.9693967844838827e-05, + "loss": 1.7692, + "step": 33075 + }, + { + "epoch": 0.06443785806924003, + "grad_norm": 4.056210994720459, + "learning_rate": 2.9693691150528955e-05, + "loss": 1.855, + "step": 33090 + }, + { + "epoch": 0.06446706834035029, + "grad_norm": 2.6098761558532715, + "learning_rate": 2.9693414332481443e-05, + "loss": 1.8284, + "step": 33105 + }, + { + "epoch": 0.06449627861146055, + "grad_norm": 4.3388166427612305, + "learning_rate": 2.9693137390698622e-05, + "loss": 1.8155, + "step": 33120 + }, + { + "epoch": 0.06452548888257081, + "grad_norm": 3.3735921382904053, + "learning_rate": 2.9692860325182825e-05, + "loss": 1.7765, + "step": 33135 + }, + { + "epoch": 0.06455469915368107, + "grad_norm": 3.9341137409210205, + "learning_rate": 2.9692583135936385e-05, + "loss": 1.7622, + "step": 33150 + }, + { + "epoch": 0.06458390942479134, + "grad_norm": 5.620511531829834, + "learning_rate": 2.9692305822961637e-05, + "loss": 1.7926, + "step": 33165 + }, + { + "epoch": 0.06461311969590161, + "grad_norm": 1.8949609994888306, + "learning_rate": 2.9692028386260915e-05, + "loss": 1.9158, + "step": 33180 + }, + { + "epoch": 0.06464232996701187, + "grad_norm": 2.183438539505005, + "learning_rate": 2.9691750825836557e-05, + "loss": 1.9586, + "step": 33195 + }, + { + "epoch": 0.06467154023812213, + "grad_norm": 2.05364727973938, + "learning_rate": 2.9691473141690894e-05, + "loss": 1.8952, + "step": 33210 + }, + { + "epoch": 0.0647007505092324, + "grad_norm": 2.479680299758911, + "learning_rate": 2.9691195333826277e-05, + "loss": 1.7908, + "step": 33225 + }, + { + "epoch": 0.06472996078034265, + "grad_norm": 2.0384159088134766, + "learning_rate": 2.9690917402245034e-05, + "loss": 1.8231, + "step": 33240 + }, + { + "epoch": 0.06475917105145292, + "grad_norm": 2.3003928661346436, + "learning_rate": 2.9690639346949503e-05, + "loss": 1.6675, + "step": 33255 + }, + { + "epoch": 0.06478838132256318, + "grad_norm": 2.9308221340179443, + "learning_rate": 2.9690361167942042e-05, + "loss": 1.9136, + "step": 33270 + }, + { + "epoch": 0.06481759159367345, + "grad_norm": 2.3866639137268066, + "learning_rate": 2.9690082865224975e-05, + "loss": 1.8992, + "step": 33285 + }, + { + "epoch": 0.06484680186478371, + "grad_norm": 4.399453163146973, + "learning_rate": 2.968980443880066e-05, + "loss": 1.9369, + "step": 33300 + }, + { + "epoch": 0.06487601213589397, + "grad_norm": 3.06842041015625, + "learning_rate": 2.9689525888671436e-05, + "loss": 1.9624, + "step": 33315 + }, + { + "epoch": 0.06490522240700423, + "grad_norm": 1.9636744260787964, + "learning_rate": 2.9689247214839647e-05, + "loss": 1.7785, + "step": 33330 + }, + { + "epoch": 0.0649344326781145, + "grad_norm": 3.882155418395996, + "learning_rate": 2.968896841730764e-05, + "loss": 1.8328, + "step": 33345 + }, + { + "epoch": 0.06496364294922476, + "grad_norm": 3.773200511932373, + "learning_rate": 2.9688689496077764e-05, + "loss": 1.9322, + "step": 33360 + }, + { + "epoch": 0.06499285322033502, + "grad_norm": 2.331101417541504, + "learning_rate": 2.968841045115237e-05, + "loss": 1.7726, + "step": 33375 + }, + { + "epoch": 0.06502206349144529, + "grad_norm": 3.3869986534118652, + "learning_rate": 2.9688131282533802e-05, + "loss": 1.9489, + "step": 33390 + }, + { + "epoch": 0.06505127376255555, + "grad_norm": 2.627638578414917, + "learning_rate": 2.968785199022442e-05, + "loss": 1.7626, + "step": 33405 + }, + { + "epoch": 0.06508048403366581, + "grad_norm": 2.031043291091919, + "learning_rate": 2.9687572574226566e-05, + "loss": 1.8948, + "step": 33420 + }, + { + "epoch": 0.06510969430477608, + "grad_norm": 1.6995463371276855, + "learning_rate": 2.96872930345426e-05, + "loss": 1.8642, + "step": 33435 + }, + { + "epoch": 0.06513890457588634, + "grad_norm": 3.292022943496704, + "learning_rate": 2.9687013371174873e-05, + "loss": 1.8344, + "step": 33450 + }, + { + "epoch": 0.0651681148469966, + "grad_norm": 2.9283857345581055, + "learning_rate": 2.968673358412574e-05, + "loss": 2.0218, + "step": 33465 + }, + { + "epoch": 0.06519732511810686, + "grad_norm": 3.2172300815582275, + "learning_rate": 2.968645367339756e-05, + "loss": 1.8049, + "step": 33480 + }, + { + "epoch": 0.06522653538921712, + "grad_norm": 2.1529486179351807, + "learning_rate": 2.9686173638992687e-05, + "loss": 1.8755, + "step": 33495 + }, + { + "epoch": 0.0652557456603274, + "grad_norm": 2.5349905490875244, + "learning_rate": 2.9685893480913477e-05, + "loss": 2.0173, + "step": 33510 + }, + { + "epoch": 0.06528495593143765, + "grad_norm": 3.646724224090576, + "learning_rate": 2.9685613199162296e-05, + "loss": 1.9269, + "step": 33525 + }, + { + "epoch": 0.06531416620254792, + "grad_norm": 3.3442795276641846, + "learning_rate": 2.96853327937415e-05, + "loss": 1.7363, + "step": 33540 + }, + { + "epoch": 0.06534337647365818, + "grad_norm": 3.301544189453125, + "learning_rate": 2.9685052264653452e-05, + "loss": 1.9225, + "step": 33555 + }, + { + "epoch": 0.06537258674476844, + "grad_norm": 1.919858455657959, + "learning_rate": 2.9684771611900512e-05, + "loss": 1.7969, + "step": 33570 + }, + { + "epoch": 0.0654017970158787, + "grad_norm": 3.6086807250976562, + "learning_rate": 2.9684490835485045e-05, + "loss": 1.8075, + "step": 33585 + }, + { + "epoch": 0.06543100728698896, + "grad_norm": 1.659013032913208, + "learning_rate": 2.9684209935409418e-05, + "loss": 1.7631, + "step": 33600 + }, + { + "epoch": 0.06546021755809923, + "grad_norm": 4.616360187530518, + "learning_rate": 2.968392891167599e-05, + "loss": 1.817, + "step": 33615 + }, + { + "epoch": 0.0654894278292095, + "grad_norm": 2.684124708175659, + "learning_rate": 2.9683647764287136e-05, + "loss": 1.8655, + "step": 33630 + }, + { + "epoch": 0.06551863810031976, + "grad_norm": 2.5763068199157715, + "learning_rate": 2.9683366493245213e-05, + "loss": 1.851, + "step": 33645 + }, + { + "epoch": 0.06554784837143002, + "grad_norm": 4.1813859939575195, + "learning_rate": 2.96830850985526e-05, + "loss": 1.8571, + "step": 33660 + }, + { + "epoch": 0.06557705864254028, + "grad_norm": 3.5396945476531982, + "learning_rate": 2.968280358021166e-05, + "loss": 1.8414, + "step": 33675 + }, + { + "epoch": 0.06560626891365054, + "grad_norm": 3.497781276702881, + "learning_rate": 2.968252193822477e-05, + "loss": 2.0295, + "step": 33690 + }, + { + "epoch": 0.0656354791847608, + "grad_norm": 2.3261985778808594, + "learning_rate": 2.9682240172594294e-05, + "loss": 1.8092, + "step": 33705 + }, + { + "epoch": 0.06566468945587108, + "grad_norm": 2.558701753616333, + "learning_rate": 2.968195828332261e-05, + "loss": 1.9174, + "step": 33720 + }, + { + "epoch": 0.06569389972698134, + "grad_norm": 3.210392713546753, + "learning_rate": 2.9681676270412092e-05, + "loss": 1.8506, + "step": 33735 + }, + { + "epoch": 0.0657231099980916, + "grad_norm": 2.7672812938690186, + "learning_rate": 2.968139413386511e-05, + "loss": 1.9361, + "step": 33750 + }, + { + "epoch": 0.06575232026920186, + "grad_norm": 4.212212562561035, + "learning_rate": 2.9681111873684046e-05, + "loss": 1.7308, + "step": 33765 + }, + { + "epoch": 0.06578153054031212, + "grad_norm": 1.9889777898788452, + "learning_rate": 2.9680829489871274e-05, + "loss": 1.8549, + "step": 33780 + }, + { + "epoch": 0.06581074081142238, + "grad_norm": 1.7036499977111816, + "learning_rate": 2.9680546982429166e-05, + "loss": 1.877, + "step": 33795 + }, + { + "epoch": 0.06583995108253264, + "grad_norm": 2.7053897380828857, + "learning_rate": 2.9680264351360115e-05, + "loss": 1.8077, + "step": 33810 + }, + { + "epoch": 0.06586916135364292, + "grad_norm": 2.8420169353485107, + "learning_rate": 2.967998159666649e-05, + "loss": 1.8568, + "step": 33825 + }, + { + "epoch": 0.06589837162475318, + "grad_norm": 2.4401204586029053, + "learning_rate": 2.9679698718350673e-05, + "loss": 1.8119, + "step": 33840 + }, + { + "epoch": 0.06592758189586344, + "grad_norm": 3.5529043674468994, + "learning_rate": 2.9679415716415053e-05, + "loss": 1.755, + "step": 33855 + }, + { + "epoch": 0.0659567921669737, + "grad_norm": 2.672156572341919, + "learning_rate": 2.9679132590862004e-05, + "loss": 1.9222, + "step": 33870 + }, + { + "epoch": 0.06598600243808396, + "grad_norm": 1.9536501169204712, + "learning_rate": 2.967884934169392e-05, + "loss": 1.9098, + "step": 33885 + }, + { + "epoch": 0.06601521270919422, + "grad_norm": 2.567133903503418, + "learning_rate": 2.9678565968913177e-05, + "loss": 1.924, + "step": 33900 + }, + { + "epoch": 0.06604442298030448, + "grad_norm": 2.739780902862549, + "learning_rate": 2.967828247252217e-05, + "loss": 1.9591, + "step": 33915 + }, + { + "epoch": 0.06607363325141476, + "grad_norm": 3.8824591636657715, + "learning_rate": 2.9677998852523277e-05, + "loss": 1.8305, + "step": 33930 + }, + { + "epoch": 0.06610284352252502, + "grad_norm": 3.137244939804077, + "learning_rate": 2.967771510891889e-05, + "loss": 1.9673, + "step": 33945 + }, + { + "epoch": 0.06613205379363528, + "grad_norm": 2.5533084869384766, + "learning_rate": 2.9677431241711405e-05, + "loss": 1.7838, + "step": 33960 + }, + { + "epoch": 0.06616126406474554, + "grad_norm": 3.2840499877929688, + "learning_rate": 2.9677147250903203e-05, + "loss": 1.7223, + "step": 33975 + }, + { + "epoch": 0.0661904743358558, + "grad_norm": 3.5756568908691406, + "learning_rate": 2.9676863136496685e-05, + "loss": 2.0505, + "step": 33990 + }, + { + "epoch": 0.06621968460696606, + "grad_norm": 2.9339518547058105, + "learning_rate": 2.967657889849423e-05, + "loss": 1.8248, + "step": 34005 + }, + { + "epoch": 0.06624889487807632, + "grad_norm": 2.587409496307373, + "learning_rate": 2.9676294536898247e-05, + "loss": 2.1007, + "step": 34020 + }, + { + "epoch": 0.0662781051491866, + "grad_norm": 2.164442777633667, + "learning_rate": 2.9676010051711123e-05, + "loss": 1.6494, + "step": 34035 + }, + { + "epoch": 0.06630731542029686, + "grad_norm": 2.022341012954712, + "learning_rate": 2.9675725442935252e-05, + "loss": 2.01, + "step": 34050 + }, + { + "epoch": 0.06633652569140712, + "grad_norm": 2.1963977813720703, + "learning_rate": 2.9675440710573036e-05, + "loss": 1.7275, + "step": 34065 + }, + { + "epoch": 0.06636573596251738, + "grad_norm": 2.7345070838928223, + "learning_rate": 2.967515585462687e-05, + "loss": 1.6542, + "step": 34080 + }, + { + "epoch": 0.06639494623362764, + "grad_norm": 2.9531948566436768, + "learning_rate": 2.9674870875099144e-05, + "loss": 1.9653, + "step": 34095 + }, + { + "epoch": 0.0664241565047379, + "grad_norm": 2.839843988418579, + "learning_rate": 2.9674585771992277e-05, + "loss": 1.8574, + "step": 34110 + }, + { + "epoch": 0.06645336677584816, + "grad_norm": 2.976874351501465, + "learning_rate": 2.967430054530865e-05, + "loss": 2.0622, + "step": 34125 + }, + { + "epoch": 0.06648257704695844, + "grad_norm": 2.1137330532073975, + "learning_rate": 2.967401519505068e-05, + "loss": 1.8188, + "step": 34140 + }, + { + "epoch": 0.0665117873180687, + "grad_norm": 2.7060563564300537, + "learning_rate": 2.9673729721220765e-05, + "loss": 1.9227, + "step": 34155 + }, + { + "epoch": 0.06654099758917896, + "grad_norm": 2.3645975589752197, + "learning_rate": 2.9673444123821306e-05, + "loss": 1.8725, + "step": 34170 + }, + { + "epoch": 0.06657020786028922, + "grad_norm": 2.382066011428833, + "learning_rate": 2.967315840285471e-05, + "loss": 1.7719, + "step": 34185 + }, + { + "epoch": 0.06659941813139948, + "grad_norm": 1.7729606628417969, + "learning_rate": 2.9672872558323385e-05, + "loss": 1.8138, + "step": 34200 + }, + { + "epoch": 0.06662862840250974, + "grad_norm": 2.7769312858581543, + "learning_rate": 2.9672586590229735e-05, + "loss": 1.8957, + "step": 34215 + }, + { + "epoch": 0.06665783867362, + "grad_norm": 4.402248382568359, + "learning_rate": 2.9672300498576173e-05, + "loss": 1.9404, + "step": 34230 + }, + { + "epoch": 0.06668704894473028, + "grad_norm": 4.229050159454346, + "learning_rate": 2.96720142833651e-05, + "loss": 1.8112, + "step": 34245 + }, + { + "epoch": 0.06671625921584054, + "grad_norm": 4.353719711303711, + "learning_rate": 2.9671727944598935e-05, + "loss": 1.8216, + "step": 34260 + }, + { + "epoch": 0.0667454694869508, + "grad_norm": 3.652221918106079, + "learning_rate": 2.9671441482280083e-05, + "loss": 1.9569, + "step": 34275 + }, + { + "epoch": 0.06677467975806106, + "grad_norm": 2.7157819271087646, + "learning_rate": 2.9671154896410962e-05, + "loss": 1.7512, + "step": 34290 + }, + { + "epoch": 0.06680389002917132, + "grad_norm": 2.719609498977661, + "learning_rate": 2.9670868186993982e-05, + "loss": 1.9252, + "step": 34305 + }, + { + "epoch": 0.06683310030028158, + "grad_norm": 2.834977865219116, + "learning_rate": 2.967058135403155e-05, + "loss": 1.7615, + "step": 34320 + }, + { + "epoch": 0.06686231057139184, + "grad_norm": 2.27976393699646, + "learning_rate": 2.9670294397526097e-05, + "loss": 1.7578, + "step": 34335 + }, + { + "epoch": 0.06689152084250212, + "grad_norm": 1.7607027292251587, + "learning_rate": 2.967000731748003e-05, + "loss": 1.8367, + "step": 34350 + }, + { + "epoch": 0.06692073111361238, + "grad_norm": 3.453352451324463, + "learning_rate": 2.9669720113895763e-05, + "loss": 1.8645, + "step": 34365 + }, + { + "epoch": 0.06694994138472264, + "grad_norm": 2.4427194595336914, + "learning_rate": 2.9669432786775727e-05, + "loss": 1.8592, + "step": 34380 + }, + { + "epoch": 0.0669791516558329, + "grad_norm": 3.492604970932007, + "learning_rate": 2.9669145336122335e-05, + "loss": 1.8394, + "step": 34395 + }, + { + "epoch": 0.06700836192694316, + "grad_norm": 2.3918869495391846, + "learning_rate": 2.9668857761938e-05, + "loss": 1.7539, + "step": 34410 + }, + { + "epoch": 0.06703757219805342, + "grad_norm": 1.826912760734558, + "learning_rate": 2.9668570064225156e-05, + "loss": 1.8747, + "step": 34425 + }, + { + "epoch": 0.06706678246916369, + "grad_norm": 4.5764241218566895, + "learning_rate": 2.966828224298622e-05, + "loss": 1.8474, + "step": 34440 + }, + { + "epoch": 0.06709599274027396, + "grad_norm": 3.9237124919891357, + "learning_rate": 2.9667994298223612e-05, + "loss": 1.8965, + "step": 34455 + }, + { + "epoch": 0.06712520301138422, + "grad_norm": 3.064443826675415, + "learning_rate": 2.9667706229939765e-05, + "loss": 1.8549, + "step": 34470 + }, + { + "epoch": 0.06715441328249448, + "grad_norm": 2.0611674785614014, + "learning_rate": 2.96674180381371e-05, + "loss": 1.8735, + "step": 34485 + }, + { + "epoch": 0.06718362355360474, + "grad_norm": 3.887948513031006, + "learning_rate": 2.9667129722818044e-05, + "loss": 1.973, + "step": 34500 + }, + { + "epoch": 0.067212833824715, + "grad_norm": 2.5983119010925293, + "learning_rate": 2.966684128398503e-05, + "loss": 1.8455, + "step": 34515 + }, + { + "epoch": 0.06724204409582527, + "grad_norm": 3.6295909881591797, + "learning_rate": 2.9666552721640474e-05, + "loss": 1.738, + "step": 34530 + }, + { + "epoch": 0.06727125436693553, + "grad_norm": 2.456125020980835, + "learning_rate": 2.966626403578682e-05, + "loss": 1.8651, + "step": 34545 + }, + { + "epoch": 0.06730046463804579, + "grad_norm": 1.6152453422546387, + "learning_rate": 2.966597522642649e-05, + "loss": 1.8728, + "step": 34560 + }, + { + "epoch": 0.06732967490915606, + "grad_norm": 2.789376974105835, + "learning_rate": 2.966568629356193e-05, + "loss": 1.8996, + "step": 34575 + }, + { + "epoch": 0.06735888518026632, + "grad_norm": 3.9699041843414307, + "learning_rate": 2.9665397237195555e-05, + "loss": 1.7645, + "step": 34590 + }, + { + "epoch": 0.06738809545137658, + "grad_norm": 2.7433955669403076, + "learning_rate": 2.966510805732981e-05, + "loss": 1.885, + "step": 34605 + }, + { + "epoch": 0.06741730572248684, + "grad_norm": 3.254692316055298, + "learning_rate": 2.9664818753967123e-05, + "loss": 1.7664, + "step": 34620 + }, + { + "epoch": 0.0674465159935971, + "grad_norm": 2.647752285003662, + "learning_rate": 2.966452932710994e-05, + "loss": 1.8037, + "step": 34635 + }, + { + "epoch": 0.06747572626470737, + "grad_norm": 3.770737648010254, + "learning_rate": 2.966423977676069e-05, + "loss": 1.7053, + "step": 34650 + }, + { + "epoch": 0.06750493653581763, + "grad_norm": 2.101461410522461, + "learning_rate": 2.9663950102921814e-05, + "loss": 1.8039, + "step": 34665 + }, + { + "epoch": 0.0675341468069279, + "grad_norm": 4.246973514556885, + "learning_rate": 2.9663660305595754e-05, + "loss": 1.776, + "step": 34680 + }, + { + "epoch": 0.06756335707803816, + "grad_norm": 6.2981157302856445, + "learning_rate": 2.9663370384784946e-05, + "loss": 1.7988, + "step": 34695 + }, + { + "epoch": 0.06759256734914842, + "grad_norm": 2.8231287002563477, + "learning_rate": 2.9663080340491838e-05, + "loss": 1.9157, + "step": 34710 + }, + { + "epoch": 0.06762177762025869, + "grad_norm": 2.1489720344543457, + "learning_rate": 2.9662790172718867e-05, + "loss": 1.7745, + "step": 34725 + }, + { + "epoch": 0.06765098789136895, + "grad_norm": 2.714576482772827, + "learning_rate": 2.9662499881468475e-05, + "loss": 1.8596, + "step": 34740 + }, + { + "epoch": 0.06768019816247921, + "grad_norm": 2.1026928424835205, + "learning_rate": 2.966220946674311e-05, + "loss": 1.9096, + "step": 34755 + }, + { + "epoch": 0.06770940843358947, + "grad_norm": 4.316662311553955, + "learning_rate": 2.9661918928545215e-05, + "loss": 1.8735, + "step": 34770 + }, + { + "epoch": 0.06773861870469974, + "grad_norm": 3.814514636993408, + "learning_rate": 2.966162826687724e-05, + "loss": 1.9701, + "step": 34785 + }, + { + "epoch": 0.06776782897581, + "grad_norm": 4.129803657531738, + "learning_rate": 2.966133748174164e-05, + "loss": 1.9273, + "step": 34800 + }, + { + "epoch": 0.06779703924692027, + "grad_norm": 1.9344481229782104, + "learning_rate": 2.966104657314084e-05, + "loss": 1.8738, + "step": 34815 + }, + { + "epoch": 0.06782624951803053, + "grad_norm": 5.156272888183594, + "learning_rate": 2.9660755541077314e-05, + "loss": 1.9722, + "step": 34830 + }, + { + "epoch": 0.06785545978914079, + "grad_norm": 2.4508838653564453, + "learning_rate": 2.9660464385553504e-05, + "loss": 1.8354, + "step": 34845 + }, + { + "epoch": 0.06788467006025105, + "grad_norm": 4.646978855133057, + "learning_rate": 2.9660173106571856e-05, + "loss": 1.6924, + "step": 34860 + }, + { + "epoch": 0.06791388033136131, + "grad_norm": 4.116873741149902, + "learning_rate": 2.9659881704134835e-05, + "loss": 1.9238, + "step": 34875 + }, + { + "epoch": 0.06794309060247158, + "grad_norm": 2.2530601024627686, + "learning_rate": 2.965959017824488e-05, + "loss": 1.744, + "step": 34890 + }, + { + "epoch": 0.06797230087358185, + "grad_norm": 3.1118083000183105, + "learning_rate": 2.9659298528904462e-05, + "loss": 1.7291, + "step": 34905 + }, + { + "epoch": 0.0680015111446921, + "grad_norm": 4.10203218460083, + "learning_rate": 2.9659006756116024e-05, + "loss": 1.9676, + "step": 34920 + }, + { + "epoch": 0.06803072141580237, + "grad_norm": 2.487515926361084, + "learning_rate": 2.9658714859882033e-05, + "loss": 1.755, + "step": 34935 + }, + { + "epoch": 0.06805993168691263, + "grad_norm": 4.722667694091797, + "learning_rate": 2.965842284020494e-05, + "loss": 1.6132, + "step": 34950 + }, + { + "epoch": 0.06808914195802289, + "grad_norm": 3.8074474334716797, + "learning_rate": 2.9658130697087206e-05, + "loss": 1.8949, + "step": 34965 + }, + { + "epoch": 0.06811835222913315, + "grad_norm": 3.2412679195404053, + "learning_rate": 2.9657838430531295e-05, + "loss": 1.8062, + "step": 34980 + }, + { + "epoch": 0.06814756250024342, + "grad_norm": 3.2179923057556152, + "learning_rate": 2.9657546040539662e-05, + "loss": 1.7753, + "step": 34995 + }, + { + "epoch": 0.06817677277135369, + "grad_norm": 3.395028591156006, + "learning_rate": 2.9657253527114772e-05, + "loss": 1.7966, + "step": 35010 + }, + { + "epoch": 0.06820598304246395, + "grad_norm": 2.3021628856658936, + "learning_rate": 2.9656960890259093e-05, + "loss": 1.9369, + "step": 35025 + }, + { + "epoch": 0.06823519331357421, + "grad_norm": 2.2561123371124268, + "learning_rate": 2.9656668129975077e-05, + "loss": 1.874, + "step": 35040 + }, + { + "epoch": 0.06826440358468447, + "grad_norm": 3.339489459991455, + "learning_rate": 2.9656375246265205e-05, + "loss": 1.9124, + "step": 35055 + }, + { + "epoch": 0.06829361385579473, + "grad_norm": 3.698767900466919, + "learning_rate": 2.965608223913193e-05, + "loss": 1.9178, + "step": 35070 + }, + { + "epoch": 0.06832282412690499, + "grad_norm": 4.208549499511719, + "learning_rate": 2.965578910857773e-05, + "loss": 2.035, + "step": 35085 + }, + { + "epoch": 0.06835203439801527, + "grad_norm": 3.827305555343628, + "learning_rate": 2.9655495854605067e-05, + "loss": 1.8416, + "step": 35100 + }, + { + "epoch": 0.06838124466912553, + "grad_norm": 3.3755781650543213, + "learning_rate": 2.9655202477216413e-05, + "loss": 1.8389, + "step": 35115 + }, + { + "epoch": 0.06841045494023579, + "grad_norm": 3.1055173873901367, + "learning_rate": 2.9654908976414233e-05, + "loss": 1.7763, + "step": 35130 + }, + { + "epoch": 0.06843966521134605, + "grad_norm": 3.2858526706695557, + "learning_rate": 2.9654615352201006e-05, + "loss": 1.9048, + "step": 35145 + }, + { + "epoch": 0.06846887548245631, + "grad_norm": 2.145545721054077, + "learning_rate": 2.9654321604579208e-05, + "loss": 1.813, + "step": 35160 + }, + { + "epoch": 0.06849808575356657, + "grad_norm": 2.673372268676758, + "learning_rate": 2.9654027733551296e-05, + "loss": 1.8894, + "step": 35175 + }, + { + "epoch": 0.06852729602467683, + "grad_norm": 3.6063785552978516, + "learning_rate": 2.965373373911976e-05, + "loss": 1.7333, + "step": 35190 + }, + { + "epoch": 0.0685565062957871, + "grad_norm": 3.307016611099243, + "learning_rate": 2.9653439621287072e-05, + "loss": 1.9798, + "step": 35205 + }, + { + "epoch": 0.06858571656689737, + "grad_norm": 1.9707258939743042, + "learning_rate": 2.9653145380055713e-05, + "loss": 1.7571, + "step": 35220 + }, + { + "epoch": 0.06861492683800763, + "grad_norm": 3.5209414958953857, + "learning_rate": 2.965285101542815e-05, + "loss": 1.7073, + "step": 35235 + }, + { + "epoch": 0.06864413710911789, + "grad_norm": 3.5485973358154297, + "learning_rate": 2.9652556527406868e-05, + "loss": 1.7296, + "step": 35250 + }, + { + "epoch": 0.06867334738022815, + "grad_norm": 4.829267978668213, + "learning_rate": 2.9652261915994345e-05, + "loss": 1.7223, + "step": 35265 + }, + { + "epoch": 0.06870255765133841, + "grad_norm": 4.529970169067383, + "learning_rate": 2.9651967181193066e-05, + "loss": 1.8972, + "step": 35280 + }, + { + "epoch": 0.06873176792244867, + "grad_norm": 2.448244571685791, + "learning_rate": 2.9651672323005514e-05, + "loss": 1.6871, + "step": 35295 + }, + { + "epoch": 0.06876097819355895, + "grad_norm": 2.37665057182312, + "learning_rate": 2.9651377341434166e-05, + "loss": 1.8861, + "step": 35310 + }, + { + "epoch": 0.06879018846466921, + "grad_norm": 2.69604229927063, + "learning_rate": 2.9651082236481508e-05, + "loss": 1.7115, + "step": 35325 + }, + { + "epoch": 0.06881939873577947, + "grad_norm": 2.311614513397217, + "learning_rate": 2.9650787008150024e-05, + "loss": 1.8526, + "step": 35340 + }, + { + "epoch": 0.06884860900688973, + "grad_norm": 4.728626251220703, + "learning_rate": 2.9650491656442205e-05, + "loss": 1.9195, + "step": 35355 + }, + { + "epoch": 0.06887781927799999, + "grad_norm": 1.917162537574768, + "learning_rate": 2.9650196181360537e-05, + "loss": 1.9401, + "step": 35370 + }, + { + "epoch": 0.06890702954911025, + "grad_norm": 2.5558958053588867, + "learning_rate": 2.96499005829075e-05, + "loss": 1.8347, + "step": 35385 + }, + { + "epoch": 0.06893623982022051, + "grad_norm": 5.177891254425049, + "learning_rate": 2.96496048610856e-05, + "loss": 1.9398, + "step": 35400 + }, + { + "epoch": 0.06896545009133079, + "grad_norm": 4.181756019592285, + "learning_rate": 2.9649309015897306e-05, + "loss": 1.8944, + "step": 35415 + }, + { + "epoch": 0.06899466036244105, + "grad_norm": 2.468771457672119, + "learning_rate": 2.9649013047345123e-05, + "loss": 1.8136, + "step": 35430 + }, + { + "epoch": 0.06902387063355131, + "grad_norm": 4.117799758911133, + "learning_rate": 2.9648716955431545e-05, + "loss": 1.882, + "step": 35445 + }, + { + "epoch": 0.06905308090466157, + "grad_norm": 1.6718223094940186, + "learning_rate": 2.964842074015906e-05, + "loss": 1.908, + "step": 35460 + }, + { + "epoch": 0.06908229117577183, + "grad_norm": 2.493669033050537, + "learning_rate": 2.9648124401530162e-05, + "loss": 1.8696, + "step": 35475 + }, + { + "epoch": 0.06911150144688209, + "grad_norm": 2.0989701747894287, + "learning_rate": 2.964782793954735e-05, + "loss": 1.9898, + "step": 35490 + }, + { + "epoch": 0.06914071171799235, + "grad_norm": 3.0798256397247314, + "learning_rate": 2.9647531354213117e-05, + "loss": 1.7473, + "step": 35505 + }, + { + "epoch": 0.06916992198910263, + "grad_norm": 2.311537265777588, + "learning_rate": 2.9647234645529965e-05, + "loss": 1.9823, + "step": 35520 + }, + { + "epoch": 0.06919913226021289, + "grad_norm": 2.0401742458343506, + "learning_rate": 2.964693781350039e-05, + "loss": 1.8064, + "step": 35535 + }, + { + "epoch": 0.06922834253132315, + "grad_norm": 3.3467860221862793, + "learning_rate": 2.9646640858126888e-05, + "loss": 1.9125, + "step": 35550 + }, + { + "epoch": 0.06925755280243341, + "grad_norm": 2.1629061698913574, + "learning_rate": 2.9646343779411965e-05, + "loss": 1.9639, + "step": 35565 + }, + { + "epoch": 0.06928676307354367, + "grad_norm": 2.8887295722961426, + "learning_rate": 2.9646046577358124e-05, + "loss": 1.839, + "step": 35580 + }, + { + "epoch": 0.06931597334465393, + "grad_norm": 2.5036697387695312, + "learning_rate": 2.964574925196786e-05, + "loss": 1.8771, + "step": 35595 + }, + { + "epoch": 0.0693451836157642, + "grad_norm": 3.0015201568603516, + "learning_rate": 2.9645451803243684e-05, + "loss": 1.7982, + "step": 35610 + }, + { + "epoch": 0.06937439388687446, + "grad_norm": 1.9739888906478882, + "learning_rate": 2.96451542311881e-05, + "loss": 1.8182, + "step": 35625 + }, + { + "epoch": 0.06940360415798473, + "grad_norm": 3.399923801422119, + "learning_rate": 2.964485653580361e-05, + "loss": 1.9865, + "step": 35640 + }, + { + "epoch": 0.06943281442909499, + "grad_norm": 2.0819296836853027, + "learning_rate": 2.9644558717092726e-05, + "loss": 1.7867, + "step": 35655 + }, + { + "epoch": 0.06946202470020525, + "grad_norm": 4.023995876312256, + "learning_rate": 2.964426077505795e-05, + "loss": 1.888, + "step": 35670 + }, + { + "epoch": 0.06949123497131551, + "grad_norm": 3.412802219390869, + "learning_rate": 2.9643962709701797e-05, + "loss": 1.7254, + "step": 35685 + }, + { + "epoch": 0.06952044524242577, + "grad_norm": 5.086963653564453, + "learning_rate": 2.964366452102677e-05, + "loss": 1.8761, + "step": 35700 + }, + { + "epoch": 0.06954965551353604, + "grad_norm": 3.394970417022705, + "learning_rate": 2.964336620903539e-05, + "loss": 1.7277, + "step": 35715 + }, + { + "epoch": 0.0695788657846463, + "grad_norm": 3.2881181240081787, + "learning_rate": 2.964306777373016e-05, + "loss": 1.6754, + "step": 35730 + }, + { + "epoch": 0.06960807605575657, + "grad_norm": 2.9079060554504395, + "learning_rate": 2.96427692151136e-05, + "loss": 1.7099, + "step": 35745 + }, + { + "epoch": 0.06963728632686683, + "grad_norm": 2.8861443996429443, + "learning_rate": 2.964247053318822e-05, + "loss": 2.0041, + "step": 35760 + }, + { + "epoch": 0.06966649659797709, + "grad_norm": 1.9946422576904297, + "learning_rate": 2.9642171727956537e-05, + "loss": 1.6527, + "step": 35775 + }, + { + "epoch": 0.06969570686908735, + "grad_norm": 4.953859806060791, + "learning_rate": 2.9641872799421064e-05, + "loss": 1.8033, + "step": 35790 + }, + { + "epoch": 0.06972491714019761, + "grad_norm": 3.2027766704559326, + "learning_rate": 2.9641573747584324e-05, + "loss": 2.0676, + "step": 35805 + }, + { + "epoch": 0.06975412741130788, + "grad_norm": 3.439577102661133, + "learning_rate": 2.9641274572448833e-05, + "loss": 1.8123, + "step": 35820 + }, + { + "epoch": 0.06978333768241814, + "grad_norm": 3.8689444065093994, + "learning_rate": 2.9640975274017104e-05, + "loss": 1.888, + "step": 35835 + }, + { + "epoch": 0.06981254795352841, + "grad_norm": 2.1769585609436035, + "learning_rate": 2.9640675852291664e-05, + "loss": 1.8283, + "step": 35850 + }, + { + "epoch": 0.06984175822463867, + "grad_norm": 4.714607238769531, + "learning_rate": 2.964037630727504e-05, + "loss": 1.8005, + "step": 35865 + }, + { + "epoch": 0.06987096849574893, + "grad_norm": 3.4551823139190674, + "learning_rate": 2.9640076638969745e-05, + "loss": 1.9842, + "step": 35880 + }, + { + "epoch": 0.0699001787668592, + "grad_norm": 2.2861387729644775, + "learning_rate": 2.96397768473783e-05, + "loss": 1.8149, + "step": 35895 + }, + { + "epoch": 0.06992938903796946, + "grad_norm": 2.4420621395111084, + "learning_rate": 2.963947693250324e-05, + "loss": 2.0058, + "step": 35910 + }, + { + "epoch": 0.06995859930907972, + "grad_norm": 2.770094871520996, + "learning_rate": 2.9639176894347083e-05, + "loss": 2.0128, + "step": 35925 + }, + { + "epoch": 0.06998780958018998, + "grad_norm": 3.4321415424346924, + "learning_rate": 2.9638876732912364e-05, + "loss": 1.7902, + "step": 35940 + }, + { + "epoch": 0.07001701985130025, + "grad_norm": 3.649888753890991, + "learning_rate": 2.96385764482016e-05, + "loss": 1.7774, + "step": 35955 + }, + { + "epoch": 0.07004623012241051, + "grad_norm": 3.11183762550354, + "learning_rate": 2.963827604021733e-05, + "loss": 1.8819, + "step": 35970 + }, + { + "epoch": 0.07007544039352077, + "grad_norm": 3.1074795722961426, + "learning_rate": 2.9637975508962076e-05, + "loss": 1.7814, + "step": 35985 + }, + { + "epoch": 0.07010465066463104, + "grad_norm": 2.314819097518921, + "learning_rate": 2.9637674854438368e-05, + "loss": 2.0389, + "step": 36000 + }, + { + "epoch": 0.0701338609357413, + "grad_norm": 3.201878309249878, + "learning_rate": 2.9637374076648744e-05, + "loss": 1.8632, + "step": 36015 + }, + { + "epoch": 0.07016307120685156, + "grad_norm": 3.392383337020874, + "learning_rate": 2.9637073175595738e-05, + "loss": 1.7767, + "step": 36030 + }, + { + "epoch": 0.07019228147796182, + "grad_norm": 3.2556650638580322, + "learning_rate": 2.9636772151281874e-05, + "loss": 1.748, + "step": 36045 + }, + { + "epoch": 0.07022149174907209, + "grad_norm": 4.08565092086792, + "learning_rate": 2.9636471003709697e-05, + "loss": 1.9722, + "step": 36060 + }, + { + "epoch": 0.07025070202018235, + "grad_norm": 3.140951156616211, + "learning_rate": 2.9636169732881737e-05, + "loss": 1.9794, + "step": 36075 + }, + { + "epoch": 0.07027991229129261, + "grad_norm": 3.052402973175049, + "learning_rate": 2.9635868338800532e-05, + "loss": 1.7071, + "step": 36090 + }, + { + "epoch": 0.07030912256240288, + "grad_norm": 3.39620304107666, + "learning_rate": 2.9635566821468624e-05, + "loss": 2.0134, + "step": 36105 + }, + { + "epoch": 0.07033833283351314, + "grad_norm": 3.394157648086548, + "learning_rate": 2.963526518088855e-05, + "loss": 1.9427, + "step": 36120 + }, + { + "epoch": 0.0703675431046234, + "grad_norm": 2.371034622192383, + "learning_rate": 2.9634963417062847e-05, + "loss": 1.8057, + "step": 36135 + }, + { + "epoch": 0.07039675337573366, + "grad_norm": 2.449495792388916, + "learning_rate": 2.963466152999406e-05, + "loss": 2.0568, + "step": 36150 + }, + { + "epoch": 0.07042596364684393, + "grad_norm": 2.6000113487243652, + "learning_rate": 2.9634359519684732e-05, + "loss": 1.6321, + "step": 36165 + }, + { + "epoch": 0.0704551739179542, + "grad_norm": 4.883264064788818, + "learning_rate": 2.9634057386137405e-05, + "loss": 1.8741, + "step": 36180 + }, + { + "epoch": 0.07048438418906446, + "grad_norm": 3.775376319885254, + "learning_rate": 2.9633755129354614e-05, + "loss": 1.8035, + "step": 36195 + }, + { + "epoch": 0.07051359446017472, + "grad_norm": 2.50616455078125, + "learning_rate": 2.963345274933892e-05, + "loss": 1.7234, + "step": 36210 + }, + { + "epoch": 0.07054280473128498, + "grad_norm": 2.0914623737335205, + "learning_rate": 2.9633150246092862e-05, + "loss": 1.8678, + "step": 36225 + }, + { + "epoch": 0.07057201500239524, + "grad_norm": 2.3975253105163574, + "learning_rate": 2.9632847619618987e-05, + "loss": 2.0389, + "step": 36240 + }, + { + "epoch": 0.0706012252735055, + "grad_norm": 4.599456310272217, + "learning_rate": 2.9632544869919844e-05, + "loss": 1.8546, + "step": 36255 + }, + { + "epoch": 0.07063043554461577, + "grad_norm": 2.8365259170532227, + "learning_rate": 2.9632241996997982e-05, + "loss": 1.8831, + "step": 36270 + }, + { + "epoch": 0.07065964581572604, + "grad_norm": 5.333098888397217, + "learning_rate": 2.963193900085595e-05, + "loss": 1.9734, + "step": 36285 + }, + { + "epoch": 0.0706888560868363, + "grad_norm": 3.1167783737182617, + "learning_rate": 2.9631635881496307e-05, + "loss": 1.8843, + "step": 36300 + }, + { + "epoch": 0.07071806635794656, + "grad_norm": 2.7622556686401367, + "learning_rate": 2.9631332638921597e-05, + "loss": 1.7392, + "step": 36315 + }, + { + "epoch": 0.07074727662905682, + "grad_norm": 2.8044185638427734, + "learning_rate": 2.9631029273134377e-05, + "loss": 1.8026, + "step": 36330 + }, + { + "epoch": 0.07077648690016708, + "grad_norm": 3.532073974609375, + "learning_rate": 2.9630725784137206e-05, + "loss": 1.9251, + "step": 36345 + }, + { + "epoch": 0.07080569717127734, + "grad_norm": 2.9674692153930664, + "learning_rate": 2.963042217193263e-05, + "loss": 1.7559, + "step": 36360 + }, + { + "epoch": 0.07083490744238762, + "grad_norm": 3.634361982345581, + "learning_rate": 2.963011843652321e-05, + "loss": 1.8846, + "step": 36375 + }, + { + "epoch": 0.07086411771349788, + "grad_norm": 3.8142337799072266, + "learning_rate": 2.9629814577911512e-05, + "loss": 1.9241, + "step": 36390 + }, + { + "epoch": 0.07089332798460814, + "grad_norm": 3.4953882694244385, + "learning_rate": 2.9629510596100083e-05, + "loss": 1.6802, + "step": 36405 + }, + { + "epoch": 0.0709225382557184, + "grad_norm": 3.115720272064209, + "learning_rate": 2.962920649109149e-05, + "loss": 1.8446, + "step": 36420 + }, + { + "epoch": 0.07095174852682866, + "grad_norm": 2.615313768386841, + "learning_rate": 2.962890226288829e-05, + "loss": 1.8841, + "step": 36435 + }, + { + "epoch": 0.07098095879793892, + "grad_norm": 2.0485146045684814, + "learning_rate": 2.9628597911493044e-05, + "loss": 1.8237, + "step": 36450 + }, + { + "epoch": 0.07101016906904918, + "grad_norm": 2.433716297149658, + "learning_rate": 2.9628293436908322e-05, + "loss": 1.9638, + "step": 36465 + }, + { + "epoch": 0.07103937934015946, + "grad_norm": 2.5944697856903076, + "learning_rate": 2.962798883913668e-05, + "loss": 1.7643, + "step": 36480 + }, + { + "epoch": 0.07106858961126972, + "grad_norm": 2.829939603805542, + "learning_rate": 2.962768411818069e-05, + "loss": 1.7773, + "step": 36495 + }, + { + "epoch": 0.07109779988237998, + "grad_norm": 4.522650718688965, + "learning_rate": 2.9627379274042914e-05, + "loss": 1.8784, + "step": 36510 + }, + { + "epoch": 0.07112701015349024, + "grad_norm": 2.9910271167755127, + "learning_rate": 2.962707430672592e-05, + "loss": 1.7878, + "step": 36525 + }, + { + "epoch": 0.0711562204246005, + "grad_norm": 3.928048849105835, + "learning_rate": 2.9626769216232272e-05, + "loss": 1.9815, + "step": 36540 + }, + { + "epoch": 0.07118543069571076, + "grad_norm": 2.1179707050323486, + "learning_rate": 2.9626464002564545e-05, + "loss": 1.823, + "step": 36555 + }, + { + "epoch": 0.07121464096682102, + "grad_norm": 3.4463741779327393, + "learning_rate": 2.9626158665725314e-05, + "loss": 2.0457, + "step": 36570 + }, + { + "epoch": 0.07124385123793128, + "grad_norm": 4.101587295532227, + "learning_rate": 2.962585320571714e-05, + "loss": 1.8559, + "step": 36585 + }, + { + "epoch": 0.07127306150904156, + "grad_norm": 3.634690046310425, + "learning_rate": 2.9625547622542594e-05, + "loss": 1.8351, + "step": 36600 + }, + { + "epoch": 0.07130227178015182, + "grad_norm": 2.704759359359741, + "learning_rate": 2.9625241916204257e-05, + "loss": 1.7875, + "step": 36615 + }, + { + "epoch": 0.07133148205126208, + "grad_norm": 2.140449285507202, + "learning_rate": 2.9624936086704705e-05, + "loss": 1.8588, + "step": 36630 + }, + { + "epoch": 0.07136069232237234, + "grad_norm": 4.091612339019775, + "learning_rate": 2.9624630134046504e-05, + "loss": 1.7989, + "step": 36645 + }, + { + "epoch": 0.0713899025934826, + "grad_norm": 2.931297779083252, + "learning_rate": 2.962432405823224e-05, + "loss": 1.8749, + "step": 36660 + }, + { + "epoch": 0.07141911286459286, + "grad_norm": 2.6341919898986816, + "learning_rate": 2.9624017859264485e-05, + "loss": 1.8023, + "step": 36675 + }, + { + "epoch": 0.07144832313570312, + "grad_norm": 4.892661094665527, + "learning_rate": 2.962371153714582e-05, + "loss": 1.7806, + "step": 36690 + }, + { + "epoch": 0.0714775334068134, + "grad_norm": 3.581634998321533, + "learning_rate": 2.962340509187882e-05, + "loss": 1.9455, + "step": 36705 + }, + { + "epoch": 0.07150674367792366, + "grad_norm": 3.0839149951934814, + "learning_rate": 2.9623098523466072e-05, + "loss": 1.8815, + "step": 36720 + }, + { + "epoch": 0.07153595394903392, + "grad_norm": 3.6118085384368896, + "learning_rate": 2.9622791831910152e-05, + "loss": 1.7464, + "step": 36735 + }, + { + "epoch": 0.07156516422014418, + "grad_norm": 2.926898956298828, + "learning_rate": 2.9622485017213646e-05, + "loss": 1.794, + "step": 36750 + }, + { + "epoch": 0.07159437449125444, + "grad_norm": 2.8011724948883057, + "learning_rate": 2.962217807937914e-05, + "loss": 1.9756, + "step": 36765 + }, + { + "epoch": 0.0716235847623647, + "grad_norm": 2.957306146621704, + "learning_rate": 2.9621871018409213e-05, + "loss": 1.799, + "step": 36780 + }, + { + "epoch": 0.07165279503347496, + "grad_norm": 2.094928503036499, + "learning_rate": 2.9621563834306453e-05, + "loss": 1.9149, + "step": 36795 + }, + { + "epoch": 0.07168200530458524, + "grad_norm": 3.5579111576080322, + "learning_rate": 2.962125652707345e-05, + "loss": 1.8073, + "step": 36810 + }, + { + "epoch": 0.0717112155756955, + "grad_norm": 2.3005943298339844, + "learning_rate": 2.962094909671279e-05, + "loss": 1.894, + "step": 36825 + }, + { + "epoch": 0.07174042584680576, + "grad_norm": 1.7549917697906494, + "learning_rate": 2.962064154322706e-05, + "loss": 1.9434, + "step": 36840 + }, + { + "epoch": 0.07176963611791602, + "grad_norm": 2.696244955062866, + "learning_rate": 2.962033386661885e-05, + "loss": 1.8287, + "step": 36855 + }, + { + "epoch": 0.07179884638902628, + "grad_norm": 4.194484233856201, + "learning_rate": 2.9620026066890756e-05, + "loss": 1.8381, + "step": 36870 + }, + { + "epoch": 0.07182805666013654, + "grad_norm": 4.518919944763184, + "learning_rate": 2.961971814404536e-05, + "loss": 1.9311, + "step": 36885 + }, + { + "epoch": 0.0718572669312468, + "grad_norm": 2.8230600357055664, + "learning_rate": 2.9619410098085267e-05, + "loss": 1.9481, + "step": 36900 + }, + { + "epoch": 0.07188647720235708, + "grad_norm": 2.9653265476226807, + "learning_rate": 2.961910192901306e-05, + "loss": 1.9819, + "step": 36915 + }, + { + "epoch": 0.07191568747346734, + "grad_norm": 4.166685104370117, + "learning_rate": 2.9618793636831343e-05, + "loss": 1.902, + "step": 36930 + }, + { + "epoch": 0.0719448977445776, + "grad_norm": 3.7438416481018066, + "learning_rate": 2.9618485221542707e-05, + "loss": 1.9503, + "step": 36945 + }, + { + "epoch": 0.07197410801568786, + "grad_norm": 3.582324266433716, + "learning_rate": 2.9618176683149753e-05, + "loss": 1.9552, + "step": 36960 + }, + { + "epoch": 0.07200331828679812, + "grad_norm": 3.1160881519317627, + "learning_rate": 2.9617868021655077e-05, + "loss": 2.0002, + "step": 36975 + }, + { + "epoch": 0.07203252855790838, + "grad_norm": 4.370865345001221, + "learning_rate": 2.961755923706128e-05, + "loss": 1.8732, + "step": 36990 + }, + { + "epoch": 0.07206173882901865, + "grad_norm": 3.8529598712921143, + "learning_rate": 2.961725032937096e-05, + "loss": 1.7087, + "step": 37005 + }, + { + "epoch": 0.07209094910012892, + "grad_norm": 3.0339152812957764, + "learning_rate": 2.9616941298586717e-05, + "loss": 1.8224, + "step": 37020 + }, + { + "epoch": 0.07212015937123918, + "grad_norm": 2.1726677417755127, + "learning_rate": 2.9616632144711155e-05, + "loss": 1.8681, + "step": 37035 + }, + { + "epoch": 0.07214936964234944, + "grad_norm": 2.9488229751586914, + "learning_rate": 2.961632286774688e-05, + "loss": 1.827, + "step": 37050 + }, + { + "epoch": 0.0721785799134597, + "grad_norm": 2.599917411804199, + "learning_rate": 2.9616013467696494e-05, + "loss": 2.0164, + "step": 37065 + }, + { + "epoch": 0.07220779018456996, + "grad_norm": 2.8067574501037598, + "learning_rate": 2.9615703944562604e-05, + "loss": 1.8203, + "step": 37080 + }, + { + "epoch": 0.07223700045568023, + "grad_norm": 3.9336917400360107, + "learning_rate": 2.9615394298347814e-05, + "loss": 1.7452, + "step": 37095 + }, + { + "epoch": 0.07226621072679049, + "grad_norm": 3.1062943935394287, + "learning_rate": 2.9615084529054732e-05, + "loss": 1.9874, + "step": 37110 + }, + { + "epoch": 0.07229542099790076, + "grad_norm": 2.639976739883423, + "learning_rate": 2.961477463668597e-05, + "loss": 1.8517, + "step": 37125 + }, + { + "epoch": 0.07232463126901102, + "grad_norm": 4.7184529304504395, + "learning_rate": 2.9614464621244135e-05, + "loss": 1.7864, + "step": 37140 + }, + { + "epoch": 0.07235384154012128, + "grad_norm": 3.362306594848633, + "learning_rate": 2.9614154482731838e-05, + "loss": 1.7878, + "step": 37155 + }, + { + "epoch": 0.07238305181123154, + "grad_norm": 3.6693410873413086, + "learning_rate": 2.9613844221151688e-05, + "loss": 1.8032, + "step": 37170 + }, + { + "epoch": 0.0724122620823418, + "grad_norm": 3.523045539855957, + "learning_rate": 2.9613533836506304e-05, + "loss": 1.7877, + "step": 37185 + }, + { + "epoch": 0.07244147235345207, + "grad_norm": 3.0142440795898438, + "learning_rate": 2.961322332879829e-05, + "loss": 1.8242, + "step": 37200 + }, + { + "epoch": 0.07247068262456233, + "grad_norm": 2.044818878173828, + "learning_rate": 2.9612912698030275e-05, + "loss": 1.7869, + "step": 37215 + }, + { + "epoch": 0.0724998928956726, + "grad_norm": 2.8730952739715576, + "learning_rate": 2.9612601944204866e-05, + "loss": 2.0202, + "step": 37230 + }, + { + "epoch": 0.07252910316678286, + "grad_norm": 2.93044114112854, + "learning_rate": 2.9612291067324675e-05, + "loss": 1.9302, + "step": 37245 + }, + { + "epoch": 0.07255831343789312, + "grad_norm": 3.5285401344299316, + "learning_rate": 2.961198006739233e-05, + "loss": 2.0005, + "step": 37260 + }, + { + "epoch": 0.07258752370900338, + "grad_norm": 3.873157262802124, + "learning_rate": 2.9611668944410446e-05, + "loss": 1.869, + "step": 37275 + }, + { + "epoch": 0.07261673398011365, + "grad_norm": 4.3793840408325195, + "learning_rate": 2.961135769838164e-05, + "loss": 1.8881, + "step": 37290 + }, + { + "epoch": 0.0726459442512239, + "grad_norm": 1.939467191696167, + "learning_rate": 2.9611046329308538e-05, + "loss": 1.8512, + "step": 37305 + }, + { + "epoch": 0.07267515452233417, + "grad_norm": 2.9882404804229736, + "learning_rate": 2.961073483719376e-05, + "loss": 1.672, + "step": 37320 + }, + { + "epoch": 0.07270436479344444, + "grad_norm": 1.6897094249725342, + "learning_rate": 2.961042322203993e-05, + "loss": 1.9174, + "step": 37335 + }, + { + "epoch": 0.0727335750645547, + "grad_norm": 4.178640842437744, + "learning_rate": 2.9610111483849668e-05, + "loss": 1.9587, + "step": 37350 + }, + { + "epoch": 0.07276278533566496, + "grad_norm": 4.1040825843811035, + "learning_rate": 2.9609799622625604e-05, + "loss": 1.7908, + "step": 37365 + }, + { + "epoch": 0.07279199560677523, + "grad_norm": 2.250300168991089, + "learning_rate": 2.960948763837036e-05, + "loss": 1.8558, + "step": 37380 + }, + { + "epoch": 0.07282120587788549, + "grad_norm": 3.2331349849700928, + "learning_rate": 2.960917553108657e-05, + "loss": 1.8444, + "step": 37395 + }, + { + "epoch": 0.07285041614899575, + "grad_norm": 3.2942092418670654, + "learning_rate": 2.9608863300776855e-05, + "loss": 1.8456, + "step": 37410 + }, + { + "epoch": 0.07287962642010601, + "grad_norm": 2.708117961883545, + "learning_rate": 2.960855094744385e-05, + "loss": 1.8107, + "step": 37425 + }, + { + "epoch": 0.07290883669121628, + "grad_norm": 5.859858512878418, + "learning_rate": 2.9608238471090187e-05, + "loss": 1.8856, + "step": 37440 + }, + { + "epoch": 0.07293804696232654, + "grad_norm": 2.5912370681762695, + "learning_rate": 2.9607925871718486e-05, + "loss": 1.802, + "step": 37455 + }, + { + "epoch": 0.0729672572334368, + "grad_norm": 2.456089973449707, + "learning_rate": 2.960761314933139e-05, + "loss": 1.9207, + "step": 37470 + }, + { + "epoch": 0.07299646750454707, + "grad_norm": 3.9020590782165527, + "learning_rate": 2.9607300303931526e-05, + "loss": 1.762, + "step": 37485 + }, + { + "epoch": 0.07302567777565733, + "grad_norm": 2.20559024810791, + "learning_rate": 2.960698733552153e-05, + "loss": 1.9261, + "step": 37500 + }, + { + "epoch": 0.07305488804676759, + "grad_norm": 2.871424913406372, + "learning_rate": 2.9606674244104048e-05, + "loss": 1.8707, + "step": 37515 + }, + { + "epoch": 0.07308409831787785, + "grad_norm": 2.1989009380340576, + "learning_rate": 2.96063610296817e-05, + "loss": 2.0706, + "step": 37530 + }, + { + "epoch": 0.07311330858898812, + "grad_norm": 2.3892788887023926, + "learning_rate": 2.9606047692257134e-05, + "loss": 1.8737, + "step": 37545 + }, + { + "epoch": 0.07314251886009838, + "grad_norm": 1.9396522045135498, + "learning_rate": 2.9605734231832992e-05, + "loss": 1.8575, + "step": 37560 + }, + { + "epoch": 0.07317172913120865, + "grad_norm": 4.139869689941406, + "learning_rate": 2.96054206484119e-05, + "loss": 1.7875, + "step": 37575 + }, + { + "epoch": 0.07320093940231891, + "grad_norm": 4.462748050689697, + "learning_rate": 2.960510694199651e-05, + "loss": 1.9463, + "step": 37590 + }, + { + "epoch": 0.07323014967342917, + "grad_norm": 2.580022096633911, + "learning_rate": 2.9604793112589458e-05, + "loss": 1.9219, + "step": 37605 + }, + { + "epoch": 0.07325935994453943, + "grad_norm": 2.900674343109131, + "learning_rate": 2.960447916019339e-05, + "loss": 1.8848, + "step": 37620 + }, + { + "epoch": 0.07328857021564969, + "grad_norm": 4.659104347229004, + "learning_rate": 2.960416508481095e-05, + "loss": 1.7897, + "step": 37635 + }, + { + "epoch": 0.07331778048675995, + "grad_norm": 3.549299955368042, + "learning_rate": 2.960385088644478e-05, + "loss": 1.9, + "step": 37650 + }, + { + "epoch": 0.07334699075787023, + "grad_norm": 2.934943675994873, + "learning_rate": 2.9603536565097526e-05, + "loss": 1.8177, + "step": 37665 + }, + { + "epoch": 0.07337620102898049, + "grad_norm": 2.9049174785614014, + "learning_rate": 2.960322212077184e-05, + "loss": 1.7554, + "step": 37680 + }, + { + "epoch": 0.07340541130009075, + "grad_norm": 2.8519704341888428, + "learning_rate": 2.9602907553470366e-05, + "loss": 1.8584, + "step": 37695 + }, + { + "epoch": 0.07343462157120101, + "grad_norm": 4.067868709564209, + "learning_rate": 2.9602592863195754e-05, + "loss": 1.8013, + "step": 37710 + }, + { + "epoch": 0.07346383184231127, + "grad_norm": 2.65765118598938, + "learning_rate": 2.9602278049950654e-05, + "loss": 1.9897, + "step": 37725 + }, + { + "epoch": 0.07349304211342153, + "grad_norm": 2.998044967651367, + "learning_rate": 2.9601963113737718e-05, + "loss": 1.8279, + "step": 37740 + }, + { + "epoch": 0.07352225238453179, + "grad_norm": 2.2030725479125977, + "learning_rate": 2.9601648054559594e-05, + "loss": 1.939, + "step": 37755 + }, + { + "epoch": 0.07355146265564207, + "grad_norm": 5.181911945343018, + "learning_rate": 2.960133287241894e-05, + "loss": 1.7382, + "step": 37770 + }, + { + "epoch": 0.07358067292675233, + "grad_norm": 3.682950496673584, + "learning_rate": 2.960101756731841e-05, + "loss": 1.8002, + "step": 37785 + }, + { + "epoch": 0.07360988319786259, + "grad_norm": 3.4098355770111084, + "learning_rate": 2.960070213926065e-05, + "loss": 1.8048, + "step": 37800 + }, + { + "epoch": 0.07363909346897285, + "grad_norm": 2.7876944541931152, + "learning_rate": 2.9600386588248333e-05, + "loss": 1.7966, + "step": 37815 + }, + { + "epoch": 0.07366830374008311, + "grad_norm": 3.52449107170105, + "learning_rate": 2.9600070914284103e-05, + "loss": 1.861, + "step": 37830 + }, + { + "epoch": 0.07369751401119337, + "grad_norm": 3.2730326652526855, + "learning_rate": 2.959975511737062e-05, + "loss": 1.9195, + "step": 37845 + }, + { + "epoch": 0.07372672428230363, + "grad_norm": 3.7927465438842773, + "learning_rate": 2.9599439197510553e-05, + "loss": 1.9699, + "step": 37860 + }, + { + "epoch": 0.07375593455341391, + "grad_norm": 2.2197682857513428, + "learning_rate": 2.9599123154706545e-05, + "loss": 1.9252, + "step": 37875 + }, + { + "epoch": 0.07378514482452417, + "grad_norm": 2.5548315048217773, + "learning_rate": 2.9598806988961275e-05, + "loss": 1.933, + "step": 37890 + }, + { + "epoch": 0.07381435509563443, + "grad_norm": 2.335618734359741, + "learning_rate": 2.9598490700277395e-05, + "loss": 1.8436, + "step": 37905 + }, + { + "epoch": 0.07384356536674469, + "grad_norm": 3.7140378952026367, + "learning_rate": 2.9598174288657573e-05, + "loss": 1.7947, + "step": 37920 + }, + { + "epoch": 0.07387277563785495, + "grad_norm": 3.628742218017578, + "learning_rate": 2.9597857754104474e-05, + "loss": 1.8114, + "step": 37935 + }, + { + "epoch": 0.07390198590896521, + "grad_norm": 2.147961139678955, + "learning_rate": 2.9597541096620758e-05, + "loss": 1.799, + "step": 37950 + }, + { + "epoch": 0.07393119618007547, + "grad_norm": 3.9110617637634277, + "learning_rate": 2.9597224316209096e-05, + "loss": 1.9354, + "step": 37965 + }, + { + "epoch": 0.07396040645118575, + "grad_norm": 3.213088274002075, + "learning_rate": 2.9596907412872156e-05, + "loss": 1.9373, + "step": 37980 + }, + { + "epoch": 0.07398961672229601, + "grad_norm": 2.0284922122955322, + "learning_rate": 2.9596590386612607e-05, + "loss": 1.8561, + "step": 37995 + }, + { + "epoch": 0.07401882699340627, + "grad_norm": 1.9195611476898193, + "learning_rate": 2.9596273237433116e-05, + "loss": 1.7875, + "step": 38010 + }, + { + "epoch": 0.07404803726451653, + "grad_norm": 1.8990719318389893, + "learning_rate": 2.9595955965336358e-05, + "loss": 1.886, + "step": 38025 + }, + { + "epoch": 0.07407724753562679, + "grad_norm": 2.7792749404907227, + "learning_rate": 2.9595638570324995e-05, + "loss": 1.9444, + "step": 38040 + }, + { + "epoch": 0.07410645780673705, + "grad_norm": 3.936563730239868, + "learning_rate": 2.9595321052401712e-05, + "loss": 1.8684, + "step": 38055 + }, + { + "epoch": 0.07413566807784731, + "grad_norm": 2.845834255218506, + "learning_rate": 2.9595003411569178e-05, + "loss": 1.8743, + "step": 38070 + }, + { + "epoch": 0.07416487834895759, + "grad_norm": 1.712980031967163, + "learning_rate": 2.9594685647830065e-05, + "loss": 1.8693, + "step": 38085 + }, + { + "epoch": 0.07419408862006785, + "grad_norm": 4.104766845703125, + "learning_rate": 2.959436776118705e-05, + "loss": 1.9156, + "step": 38100 + }, + { + "epoch": 0.07422329889117811, + "grad_norm": 4.8171281814575195, + "learning_rate": 2.9594049751642816e-05, + "loss": 1.8934, + "step": 38115 + }, + { + "epoch": 0.07425250916228837, + "grad_norm": 3.2426438331604004, + "learning_rate": 2.9593731619200037e-05, + "loss": 1.8768, + "step": 38130 + }, + { + "epoch": 0.07428171943339863, + "grad_norm": 4.016167640686035, + "learning_rate": 2.9593413363861386e-05, + "loss": 1.8933, + "step": 38145 + }, + { + "epoch": 0.0743109297045089, + "grad_norm": 1.8209270238876343, + "learning_rate": 2.9593094985629553e-05, + "loss": 1.704, + "step": 38160 + }, + { + "epoch": 0.07434013997561915, + "grad_norm": 2.429961681365967, + "learning_rate": 2.9592776484507213e-05, + "loss": 1.9408, + "step": 38175 + }, + { + "epoch": 0.07436935024672943, + "grad_norm": 4.900204181671143, + "learning_rate": 2.9592457860497048e-05, + "loss": 1.8678, + "step": 38190 + }, + { + "epoch": 0.07439856051783969, + "grad_norm": 2.6684536933898926, + "learning_rate": 2.959213911360175e-05, + "loss": 1.8831, + "step": 38205 + }, + { + "epoch": 0.07442777078894995, + "grad_norm": 3.2485227584838867, + "learning_rate": 2.959182024382399e-05, + "loss": 1.9345, + "step": 38220 + }, + { + "epoch": 0.07445698106006021, + "grad_norm": 2.1203527450561523, + "learning_rate": 2.959150125116646e-05, + "loss": 1.7432, + "step": 38235 + }, + { + "epoch": 0.07448619133117047, + "grad_norm": 4.162999629974365, + "learning_rate": 2.9591182135631848e-05, + "loss": 1.8684, + "step": 38250 + }, + { + "epoch": 0.07451540160228073, + "grad_norm": 2.8070929050445557, + "learning_rate": 2.959086289722284e-05, + "loss": 1.7381, + "step": 38265 + }, + { + "epoch": 0.074544611873391, + "grad_norm": 2.4631731510162354, + "learning_rate": 2.959054353594212e-05, + "loss": 1.5987, + "step": 38280 + }, + { + "epoch": 0.07457382214450127, + "grad_norm": 3.986906051635742, + "learning_rate": 2.9590224051792377e-05, + "loss": 1.9416, + "step": 38295 + }, + { + "epoch": 0.07460303241561153, + "grad_norm": 2.761138439178467, + "learning_rate": 2.9589904444776313e-05, + "loss": 1.819, + "step": 38310 + }, + { + "epoch": 0.07463224268672179, + "grad_norm": 5.958553791046143, + "learning_rate": 2.958958471489661e-05, + "loss": 2.0435, + "step": 38325 + }, + { + "epoch": 0.07466145295783205, + "grad_norm": 4.8247199058532715, + "learning_rate": 2.958926486215596e-05, + "loss": 2.0231, + "step": 38340 + }, + { + "epoch": 0.07469066322894231, + "grad_norm": 2.023098945617676, + "learning_rate": 2.958894488655706e-05, + "loss": 1.6504, + "step": 38355 + }, + { + "epoch": 0.07471987350005257, + "grad_norm": 2.826117992401123, + "learning_rate": 2.9588624788102604e-05, + "loss": 1.6953, + "step": 38370 + }, + { + "epoch": 0.07474908377116284, + "grad_norm": 2.362399101257324, + "learning_rate": 2.9588304566795282e-05, + "loss": 1.8232, + "step": 38385 + }, + { + "epoch": 0.07477829404227311, + "grad_norm": 2.825855255126953, + "learning_rate": 2.9587984222637804e-05, + "loss": 1.8065, + "step": 38400 + }, + { + "epoch": 0.07480750431338337, + "grad_norm": 2.7027783393859863, + "learning_rate": 2.9587663755632854e-05, + "loss": 1.849, + "step": 38415 + }, + { + "epoch": 0.07483671458449363, + "grad_norm": 3.6305863857269287, + "learning_rate": 2.9587343165783136e-05, + "loss": 1.8779, + "step": 38430 + }, + { + "epoch": 0.0748659248556039, + "grad_norm": 2.5080883502960205, + "learning_rate": 2.958702245309135e-05, + "loss": 2.1624, + "step": 38445 + }, + { + "epoch": 0.07489513512671415, + "grad_norm": 2.4447906017303467, + "learning_rate": 2.9586701617560197e-05, + "loss": 1.837, + "step": 38460 + }, + { + "epoch": 0.07492434539782442, + "grad_norm": 2.823728084564209, + "learning_rate": 2.9586380659192373e-05, + "loss": 1.8959, + "step": 38475 + }, + { + "epoch": 0.07495355566893468, + "grad_norm": 3.1042325496673584, + "learning_rate": 2.9586059577990593e-05, + "loss": 1.8348, + "step": 38490 + }, + { + "epoch": 0.07498276594004495, + "grad_norm": 3.0661540031433105, + "learning_rate": 2.958573837395755e-05, + "loss": 2.1726, + "step": 38505 + }, + { + "epoch": 0.07501197621115521, + "grad_norm": 2.4136931896209717, + "learning_rate": 2.9585417047095956e-05, + "loss": 1.8825, + "step": 38520 + }, + { + "epoch": 0.07504118648226547, + "grad_norm": 3.421844244003296, + "learning_rate": 2.9585095597408508e-05, + "loss": 1.85, + "step": 38535 + }, + { + "epoch": 0.07507039675337573, + "grad_norm": 3.8951923847198486, + "learning_rate": 2.9584774024897924e-05, + "loss": 1.9928, + "step": 38550 + }, + { + "epoch": 0.075099607024486, + "grad_norm": 4.008039951324463, + "learning_rate": 2.9584452329566906e-05, + "loss": 1.8238, + "step": 38565 + }, + { + "epoch": 0.07512881729559626, + "grad_norm": 2.5100114345550537, + "learning_rate": 2.9584130511418164e-05, + "loss": 1.854, + "step": 38580 + }, + { + "epoch": 0.07515802756670652, + "grad_norm": 3.6023712158203125, + "learning_rate": 2.9583808570454404e-05, + "loss": 2.0781, + "step": 38595 + }, + { + "epoch": 0.07518723783781679, + "grad_norm": 2.3269500732421875, + "learning_rate": 2.9583486506678342e-05, + "loss": 1.822, + "step": 38610 + }, + { + "epoch": 0.07521644810892705, + "grad_norm": 4.363159656524658, + "learning_rate": 2.9583164320092693e-05, + "loss": 2.0055, + "step": 38625 + }, + { + "epoch": 0.07524565838003731, + "grad_norm": 4.24274206161499, + "learning_rate": 2.9582842010700163e-05, + "loss": 1.9181, + "step": 38640 + }, + { + "epoch": 0.07527486865114758, + "grad_norm": 2.5762596130371094, + "learning_rate": 2.9582519578503468e-05, + "loss": 1.8822, + "step": 38655 + }, + { + "epoch": 0.07530407892225784, + "grad_norm": 2.1492116451263428, + "learning_rate": 2.958219702350533e-05, + "loss": 1.9506, + "step": 38670 + }, + { + "epoch": 0.0753332891933681, + "grad_norm": 3.317279815673828, + "learning_rate": 2.9581874345708455e-05, + "loss": 1.8698, + "step": 38685 + }, + { + "epoch": 0.07536249946447836, + "grad_norm": 2.714073419570923, + "learning_rate": 2.9581551545115565e-05, + "loss": 2.0611, + "step": 38700 + }, + { + "epoch": 0.07539170973558862, + "grad_norm": 2.718071699142456, + "learning_rate": 2.958122862172938e-05, + "loss": 1.9292, + "step": 38715 + }, + { + "epoch": 0.0754209200066989, + "grad_norm": 2.4822440147399902, + "learning_rate": 2.958090557555262e-05, + "loss": 1.9971, + "step": 38730 + }, + { + "epoch": 0.07545013027780915, + "grad_norm": 3.417344808578491, + "learning_rate": 2.9580582406588005e-05, + "loss": 1.8292, + "step": 38745 + }, + { + "epoch": 0.07547934054891942, + "grad_norm": 2.5632808208465576, + "learning_rate": 2.958025911483825e-05, + "loss": 1.816, + "step": 38760 + }, + { + "epoch": 0.07550855082002968, + "grad_norm": 2.580145835876465, + "learning_rate": 2.9579935700306085e-05, + "loss": 1.9879, + "step": 38775 + }, + { + "epoch": 0.07553776109113994, + "grad_norm": 3.973757266998291, + "learning_rate": 2.9579612162994228e-05, + "loss": 1.9594, + "step": 38790 + }, + { + "epoch": 0.0755669713622502, + "grad_norm": 3.92153000831604, + "learning_rate": 2.957928850290541e-05, + "loss": 1.773, + "step": 38805 + }, + { + "epoch": 0.07559618163336046, + "grad_norm": 2.885376214981079, + "learning_rate": 2.9578964720042353e-05, + "loss": 2.2071, + "step": 38820 + }, + { + "epoch": 0.07562539190447073, + "grad_norm": 2.7979674339294434, + "learning_rate": 2.9578640814407777e-05, + "loss": 1.9249, + "step": 38835 + }, + { + "epoch": 0.075654602175581, + "grad_norm": 2.843914270401001, + "learning_rate": 2.9578316786004425e-05, + "loss": 1.9859, + "step": 38850 + }, + { + "epoch": 0.07568381244669126, + "grad_norm": 4.267323017120361, + "learning_rate": 2.9577992634835014e-05, + "loss": 2.2496, + "step": 38865 + }, + { + "epoch": 0.07571302271780152, + "grad_norm": 1.9718761444091797, + "learning_rate": 2.9577668360902276e-05, + "loss": 1.8447, + "step": 38880 + }, + { + "epoch": 0.07574223298891178, + "grad_norm": 4.806777000427246, + "learning_rate": 2.9577343964208944e-05, + "loss": 1.7591, + "step": 38895 + }, + { + "epoch": 0.07577144326002204, + "grad_norm": 2.606508255004883, + "learning_rate": 2.9577019444757746e-05, + "loss": 1.7231, + "step": 38910 + }, + { + "epoch": 0.0758006535311323, + "grad_norm": 3.9986696243286133, + "learning_rate": 2.9576694802551416e-05, + "loss": 1.8346, + "step": 38925 + }, + { + "epoch": 0.07582986380224258, + "grad_norm": 2.828104019165039, + "learning_rate": 2.9576370037592693e-05, + "loss": 1.9536, + "step": 38940 + }, + { + "epoch": 0.07585907407335284, + "grad_norm": 4.390168190002441, + "learning_rate": 2.9576045149884305e-05, + "loss": 1.8409, + "step": 38955 + }, + { + "epoch": 0.0758882843444631, + "grad_norm": 2.9613535404205322, + "learning_rate": 2.957572013942899e-05, + "loss": 1.8244, + "step": 38970 + }, + { + "epoch": 0.07591749461557336, + "grad_norm": 2.586947441101074, + "learning_rate": 2.9575395006229488e-05, + "loss": 1.775, + "step": 38985 + }, + { + "epoch": 0.07594670488668362, + "grad_norm": 1.6005862951278687, + "learning_rate": 2.9575069750288533e-05, + "loss": 1.7216, + "step": 39000 + }, + { + "epoch": 0.07597591515779388, + "grad_norm": 3.558748960494995, + "learning_rate": 2.9574744371608865e-05, + "loss": 1.8911, + "step": 39015 + }, + { + "epoch": 0.07600512542890414, + "grad_norm": 3.7911698818206787, + "learning_rate": 2.957441887019323e-05, + "loss": 1.935, + "step": 39030 + }, + { + "epoch": 0.07603433570001442, + "grad_norm": 3.4796767234802246, + "learning_rate": 2.957409324604436e-05, + "loss": 1.7889, + "step": 39045 + }, + { + "epoch": 0.07606354597112468, + "grad_norm": 4.014665126800537, + "learning_rate": 2.9573767499165e-05, + "loss": 1.8464, + "step": 39060 + }, + { + "epoch": 0.07609275624223494, + "grad_norm": 3.5164144039154053, + "learning_rate": 2.9573441629557895e-05, + "loss": 1.8483, + "step": 39075 + }, + { + "epoch": 0.0761219665133452, + "grad_norm": 2.4140758514404297, + "learning_rate": 2.957311563722579e-05, + "loss": 1.7548, + "step": 39090 + }, + { + "epoch": 0.07615117678445546, + "grad_norm": 2.7079226970672607, + "learning_rate": 2.957278952217143e-05, + "loss": 1.9792, + "step": 39105 + }, + { + "epoch": 0.07618038705556572, + "grad_norm": 3.3955190181732178, + "learning_rate": 2.9572463284397554e-05, + "loss": 1.8887, + "step": 39120 + }, + { + "epoch": 0.07620959732667598, + "grad_norm": 2.3643722534179688, + "learning_rate": 2.9572136923906916e-05, + "loss": 1.8438, + "step": 39135 + }, + { + "epoch": 0.07623880759778626, + "grad_norm": 3.363495111465454, + "learning_rate": 2.957181044070227e-05, + "loss": 1.9439, + "step": 39150 + }, + { + "epoch": 0.07626801786889652, + "grad_norm": 2.454704523086548, + "learning_rate": 2.957148383478635e-05, + "loss": 1.8084, + "step": 39165 + }, + { + "epoch": 0.07629722814000678, + "grad_norm": 3.183285713195801, + "learning_rate": 2.957115710616192e-05, + "loss": 2.0987, + "step": 39180 + }, + { + "epoch": 0.07632643841111704, + "grad_norm": 4.382565975189209, + "learning_rate": 2.9570830254831726e-05, + "loss": 1.7397, + "step": 39195 + }, + { + "epoch": 0.0763556486822273, + "grad_norm": 4.5598320960998535, + "learning_rate": 2.9570503280798523e-05, + "loss": 1.8252, + "step": 39210 + }, + { + "epoch": 0.07638485895333756, + "grad_norm": 4.986062049865723, + "learning_rate": 2.957017618406506e-05, + "loss": 1.7812, + "step": 39225 + }, + { + "epoch": 0.07641406922444782, + "grad_norm": 3.9942119121551514, + "learning_rate": 2.9569848964634098e-05, + "loss": 1.8359, + "step": 39240 + }, + { + "epoch": 0.0764432794955581, + "grad_norm": 3.4912500381469727, + "learning_rate": 2.9569521622508385e-05, + "loss": 1.8588, + "step": 39255 + }, + { + "epoch": 0.07647248976666836, + "grad_norm": 2.060900926589966, + "learning_rate": 2.9569194157690682e-05, + "loss": 1.9333, + "step": 39270 + }, + { + "epoch": 0.07650170003777862, + "grad_norm": 3.9621095657348633, + "learning_rate": 2.9568866570183746e-05, + "loss": 1.9239, + "step": 39285 + }, + { + "epoch": 0.07653091030888888, + "grad_norm": 3.5381596088409424, + "learning_rate": 2.9568538859990336e-05, + "loss": 1.9451, + "step": 39300 + }, + { + "epoch": 0.07656012057999914, + "grad_norm": 3.013927936553955, + "learning_rate": 2.9568211027113212e-05, + "loss": 1.8415, + "step": 39315 + }, + { + "epoch": 0.0765893308511094, + "grad_norm": 3.284257650375366, + "learning_rate": 2.9567883071555136e-05, + "loss": 1.9289, + "step": 39330 + }, + { + "epoch": 0.07661854112221966, + "grad_norm": 2.15523099899292, + "learning_rate": 2.9567554993318863e-05, + "loss": 1.9155, + "step": 39345 + }, + { + "epoch": 0.07664775139332994, + "grad_norm": 3.1030516624450684, + "learning_rate": 2.9567226792407164e-05, + "loss": 1.9226, + "step": 39360 + }, + { + "epoch": 0.0766769616644402, + "grad_norm": 2.698444128036499, + "learning_rate": 2.9566898468822797e-05, + "loss": 1.8306, + "step": 39375 + }, + { + "epoch": 0.07670617193555046, + "grad_norm": 2.724808931350708, + "learning_rate": 2.956657002256853e-05, + "loss": 1.8251, + "step": 39390 + }, + { + "epoch": 0.07673538220666072, + "grad_norm": 2.9143025875091553, + "learning_rate": 2.9566241453647123e-05, + "loss": 1.755, + "step": 39405 + }, + { + "epoch": 0.07676459247777098, + "grad_norm": 3.780363082885742, + "learning_rate": 2.9565912762061357e-05, + "loss": 1.8865, + "step": 39420 + }, + { + "epoch": 0.07679380274888124, + "grad_norm": 2.4257946014404297, + "learning_rate": 2.9565583947813985e-05, + "loss": 1.8871, + "step": 39435 + }, + { + "epoch": 0.0768230130199915, + "grad_norm": 2.0739023685455322, + "learning_rate": 2.9565255010907785e-05, + "loss": 1.9244, + "step": 39450 + }, + { + "epoch": 0.07685222329110178, + "grad_norm": 4.436620712280273, + "learning_rate": 2.9564925951345526e-05, + "loss": 1.9081, + "step": 39465 + }, + { + "epoch": 0.07688143356221204, + "grad_norm": 2.5859375, + "learning_rate": 2.9564596769129976e-05, + "loss": 1.8228, + "step": 39480 + }, + { + "epoch": 0.0769106438333223, + "grad_norm": 2.353349208831787, + "learning_rate": 2.9564267464263906e-05, + "loss": 1.7626, + "step": 39495 + }, + { + "epoch": 0.07693985410443256, + "grad_norm": 2.6317977905273438, + "learning_rate": 2.956393803675009e-05, + "loss": 1.8967, + "step": 39510 + }, + { + "epoch": 0.07696906437554282, + "grad_norm": 2.7638471126556396, + "learning_rate": 2.9563608486591305e-05, + "loss": 2.1839, + "step": 39525 + }, + { + "epoch": 0.07699827464665308, + "grad_norm": 3.3399016857147217, + "learning_rate": 2.9563278813790325e-05, + "loss": 1.6782, + "step": 39540 + }, + { + "epoch": 0.07702748491776334, + "grad_norm": 4.528406620025635, + "learning_rate": 2.9562949018349927e-05, + "loss": 1.8966, + "step": 39555 + }, + { + "epoch": 0.07705669518887362, + "grad_norm": 3.12764573097229, + "learning_rate": 2.9562619100272884e-05, + "loss": 1.7098, + "step": 39570 + }, + { + "epoch": 0.07708590545998388, + "grad_norm": 3.5218427181243896, + "learning_rate": 2.956228905956198e-05, + "loss": 1.8546, + "step": 39585 + }, + { + "epoch": 0.07711511573109414, + "grad_norm": 3.075268268585205, + "learning_rate": 2.956195889621999e-05, + "loss": 1.5858, + "step": 39600 + }, + { + "epoch": 0.0771443260022044, + "grad_norm": 2.4391558170318604, + "learning_rate": 2.9561628610249696e-05, + "loss": 1.7186, + "step": 39615 + }, + { + "epoch": 0.07717353627331466, + "grad_norm": 2.846036195755005, + "learning_rate": 2.956129820165388e-05, + "loss": 1.768, + "step": 39630 + }, + { + "epoch": 0.07720274654442492, + "grad_norm": 4.640497207641602, + "learning_rate": 2.9560967670435326e-05, + "loss": 1.8488, + "step": 39645 + }, + { + "epoch": 0.07723195681553519, + "grad_norm": 4.146282196044922, + "learning_rate": 2.9560637016596815e-05, + "loss": 2.0399, + "step": 39660 + }, + { + "epoch": 0.07726116708664546, + "grad_norm": 2.2617909908294678, + "learning_rate": 2.9560306240141127e-05, + "loss": 1.7802, + "step": 39675 + }, + { + "epoch": 0.07729037735775572, + "grad_norm": 2.9520163536071777, + "learning_rate": 2.9559975341071057e-05, + "loss": 1.7356, + "step": 39690 + }, + { + "epoch": 0.07731958762886598, + "grad_norm": 2.114037036895752, + "learning_rate": 2.955964431938939e-05, + "loss": 1.8138, + "step": 39705 + }, + { + "epoch": 0.07734879789997624, + "grad_norm": 3.0344507694244385, + "learning_rate": 2.9559313175098904e-05, + "loss": 1.9566, + "step": 39720 + }, + { + "epoch": 0.0773780081710865, + "grad_norm": 2.6870083808898926, + "learning_rate": 2.9558981908202395e-05, + "loss": 1.8728, + "step": 39735 + }, + { + "epoch": 0.07740721844219677, + "grad_norm": 4.387387752532959, + "learning_rate": 2.9558650518702654e-05, + "loss": 1.8694, + "step": 39750 + }, + { + "epoch": 0.07743642871330703, + "grad_norm": 3.3471992015838623, + "learning_rate": 2.955831900660247e-05, + "loss": 1.8195, + "step": 39765 + }, + { + "epoch": 0.07746563898441729, + "grad_norm": 3.68962025642395, + "learning_rate": 2.9557987371904634e-05, + "loss": 1.8546, + "step": 39780 + }, + { + "epoch": 0.07749484925552756, + "grad_norm": 4.748502254486084, + "learning_rate": 2.9557655614611935e-05, + "loss": 2.0023, + "step": 39795 + }, + { + "epoch": 0.07752405952663782, + "grad_norm": 3.4492971897125244, + "learning_rate": 2.955732373472717e-05, + "loss": 1.8199, + "step": 39810 + }, + { + "epoch": 0.07755326979774808, + "grad_norm": 2.27078914642334, + "learning_rate": 2.955699173225314e-05, + "loss": 1.8082, + "step": 39825 + }, + { + "epoch": 0.07758248006885834, + "grad_norm": 3.9737517833709717, + "learning_rate": 2.9556659607192633e-05, + "loss": 1.7823, + "step": 39840 + }, + { + "epoch": 0.0776116903399686, + "grad_norm": 2.3084990978240967, + "learning_rate": 2.9556327359548445e-05, + "loss": 1.865, + "step": 39855 + }, + { + "epoch": 0.07764090061107887, + "grad_norm": 3.0408246517181396, + "learning_rate": 2.9555994989323383e-05, + "loss": 1.8234, + "step": 39870 + }, + { + "epoch": 0.07767011088218913, + "grad_norm": 3.016524314880371, + "learning_rate": 2.9555662496520236e-05, + "loss": 1.8594, + "step": 39885 + }, + { + "epoch": 0.0776993211532994, + "grad_norm": 2.70461106300354, + "learning_rate": 2.9555329881141806e-05, + "loss": 1.9029, + "step": 39900 + }, + { + "epoch": 0.07772853142440966, + "grad_norm": 2.4621024131774902, + "learning_rate": 2.9554997143190902e-05, + "loss": 1.8281, + "step": 39915 + }, + { + "epoch": 0.07775774169551992, + "grad_norm": 1.4638293981552124, + "learning_rate": 2.9554664282670316e-05, + "loss": 1.9086, + "step": 39930 + }, + { + "epoch": 0.07778695196663019, + "grad_norm": 1.8144150972366333, + "learning_rate": 2.955433129958286e-05, + "loss": 1.8863, + "step": 39945 + }, + { + "epoch": 0.07781616223774045, + "grad_norm": 3.1351184844970703, + "learning_rate": 2.955399819393132e-05, + "loss": 2.0427, + "step": 39960 + }, + { + "epoch": 0.07784537250885071, + "grad_norm": 2.8836092948913574, + "learning_rate": 2.9553664965718526e-05, + "loss": 1.8601, + "step": 39975 + }, + { + "epoch": 0.07787458277996097, + "grad_norm": 3.3076083660125732, + "learning_rate": 2.9553331614947274e-05, + "loss": 1.7664, + "step": 39990 + }, + { + "epoch": 0.07790379305107124, + "grad_norm": 1.984757900238037, + "learning_rate": 2.955299814162036e-05, + "loss": 2.0087, + "step": 40005 + }, + { + "epoch": 0.0779330033221815, + "grad_norm": 3.184436082839966, + "learning_rate": 2.9552664545740608e-05, + "loss": 1.8379, + "step": 40020 + }, + { + "epoch": 0.07796221359329177, + "grad_norm": 1.9847835302352905, + "learning_rate": 2.9552330827310822e-05, + "loss": 1.8154, + "step": 40035 + }, + { + "epoch": 0.07799142386440203, + "grad_norm": 3.718916893005371, + "learning_rate": 2.9551996986333807e-05, + "loss": 2.0364, + "step": 40050 + }, + { + "epoch": 0.07802063413551229, + "grad_norm": 6.052098274230957, + "learning_rate": 2.9551663022812382e-05, + "loss": 1.7957, + "step": 40065 + }, + { + "epoch": 0.07804984440662255, + "grad_norm": 2.4147698879241943, + "learning_rate": 2.9551328936749355e-05, + "loss": 1.979, + "step": 40080 + }, + { + "epoch": 0.07807905467773281, + "grad_norm": 2.1994168758392334, + "learning_rate": 2.9550994728147542e-05, + "loss": 1.8055, + "step": 40095 + }, + { + "epoch": 0.07810826494884308, + "grad_norm": 2.1785078048706055, + "learning_rate": 2.9550660397009755e-05, + "loss": 1.9657, + "step": 40110 + }, + { + "epoch": 0.07813747521995335, + "grad_norm": 4.680526256561279, + "learning_rate": 2.955032594333881e-05, + "loss": 1.8007, + "step": 40125 + }, + { + "epoch": 0.0781666854910636, + "grad_norm": 2.373208999633789, + "learning_rate": 2.9549991367137522e-05, + "loss": 1.8067, + "step": 40140 + }, + { + "epoch": 0.07819589576217387, + "grad_norm": 2.384096145629883, + "learning_rate": 2.9549656668408714e-05, + "loss": 1.9754, + "step": 40155 + }, + { + "epoch": 0.07822510603328413, + "grad_norm": 3.645989179611206, + "learning_rate": 2.9549321847155197e-05, + "loss": 1.9702, + "step": 40170 + }, + { + "epoch": 0.07825431630439439, + "grad_norm": 2.4934167861938477, + "learning_rate": 2.9548986903379794e-05, + "loss": 1.7859, + "step": 40185 + }, + { + "epoch": 0.07828352657550465, + "grad_norm": 3.5801844596862793, + "learning_rate": 2.954865183708533e-05, + "loss": 2.0304, + "step": 40200 + }, + { + "epoch": 0.07831273684661492, + "grad_norm": 1.9151461124420166, + "learning_rate": 2.954831664827462e-05, + "loss": 1.8535, + "step": 40215 + }, + { + "epoch": 0.07834194711772519, + "grad_norm": 2.5540692806243896, + "learning_rate": 2.9547981336950493e-05, + "loss": 1.797, + "step": 40230 + }, + { + "epoch": 0.07837115738883545, + "grad_norm": 5.4373955726623535, + "learning_rate": 2.9547645903115762e-05, + "loss": 1.9314, + "step": 40245 + }, + { + "epoch": 0.07840036765994571, + "grad_norm": 2.6249446868896484, + "learning_rate": 2.9547310346773267e-05, + "loss": 1.9209, + "step": 40260 + }, + { + "epoch": 0.07842957793105597, + "grad_norm": 2.437994956970215, + "learning_rate": 2.9546974667925824e-05, + "loss": 1.8113, + "step": 40275 + }, + { + "epoch": 0.07845878820216623, + "grad_norm": 2.461296558380127, + "learning_rate": 2.9546638866576257e-05, + "loss": 1.8157, + "step": 40290 + }, + { + "epoch": 0.07848799847327649, + "grad_norm": 3.721189022064209, + "learning_rate": 2.95463029427274e-05, + "loss": 1.7759, + "step": 40305 + }, + { + "epoch": 0.07851720874438677, + "grad_norm": 3.0771353244781494, + "learning_rate": 2.954596689638208e-05, + "loss": 1.9319, + "step": 40320 + }, + { + "epoch": 0.07854641901549703, + "grad_norm": 3.901102066040039, + "learning_rate": 2.9545630727543127e-05, + "loss": 1.8157, + "step": 40335 + }, + { + "epoch": 0.07857562928660729, + "grad_norm": 3.15122127532959, + "learning_rate": 2.9545294436213376e-05, + "loss": 1.7357, + "step": 40350 + }, + { + "epoch": 0.07860483955771755, + "grad_norm": 2.7092130184173584, + "learning_rate": 2.9544958022395647e-05, + "loss": 1.7302, + "step": 40365 + }, + { + "epoch": 0.07863404982882781, + "grad_norm": 2.950127363204956, + "learning_rate": 2.9544621486092787e-05, + "loss": 1.9859, + "step": 40380 + }, + { + "epoch": 0.07866326009993807, + "grad_norm": 3.0624284744262695, + "learning_rate": 2.954428482730762e-05, + "loss": 1.8898, + "step": 40395 + }, + { + "epoch": 0.07869247037104833, + "grad_norm": 2.806520700454712, + "learning_rate": 2.9543948046042988e-05, + "loss": 1.7211, + "step": 40410 + }, + { + "epoch": 0.0787216806421586, + "grad_norm": 2.7486681938171387, + "learning_rate": 2.9543611142301724e-05, + "loss": 1.7469, + "step": 40425 + }, + { + "epoch": 0.07875089091326887, + "grad_norm": 3.4893131256103516, + "learning_rate": 2.954327411608666e-05, + "loss": 1.7237, + "step": 40440 + }, + { + "epoch": 0.07878010118437913, + "grad_norm": 2.313070774078369, + "learning_rate": 2.9542936967400645e-05, + "loss": 1.8785, + "step": 40455 + }, + { + "epoch": 0.07880931145548939, + "grad_norm": 3.2802863121032715, + "learning_rate": 2.954259969624651e-05, + "loss": 1.9601, + "step": 40470 + }, + { + "epoch": 0.07883852172659965, + "grad_norm": 2.6983420848846436, + "learning_rate": 2.9542262302627097e-05, + "loss": 1.8998, + "step": 40485 + }, + { + "epoch": 0.07886773199770991, + "grad_norm": 1.7535884380340576, + "learning_rate": 2.954192478654525e-05, + "loss": 1.8958, + "step": 40500 + }, + { + "epoch": 0.07889694226882017, + "grad_norm": 2.297440528869629, + "learning_rate": 2.9541587148003808e-05, + "loss": 1.8084, + "step": 40515 + }, + { + "epoch": 0.07892615253993045, + "grad_norm": 3.2494289875030518, + "learning_rate": 2.9541249387005618e-05, + "loss": 1.7015, + "step": 40530 + }, + { + "epoch": 0.07895536281104071, + "grad_norm": 3.295280933380127, + "learning_rate": 2.954091150355352e-05, + "loss": 1.8645, + "step": 40545 + }, + { + "epoch": 0.07898457308215097, + "grad_norm": 2.3075287342071533, + "learning_rate": 2.9540573497650358e-05, + "loss": 1.8123, + "step": 40560 + }, + { + "epoch": 0.07901378335326123, + "grad_norm": 2.7343220710754395, + "learning_rate": 2.9540235369298984e-05, + "loss": 2.0116, + "step": 40575 + }, + { + "epoch": 0.07904299362437149, + "grad_norm": 3.2301554679870605, + "learning_rate": 2.9539897118502244e-05, + "loss": 1.777, + "step": 40590 + }, + { + "epoch": 0.07907220389548175, + "grad_norm": 2.2123513221740723, + "learning_rate": 2.953955874526298e-05, + "loss": 1.9181, + "step": 40605 + }, + { + "epoch": 0.07910141416659201, + "grad_norm": 5.533998966217041, + "learning_rate": 2.9539220249584056e-05, + "loss": 1.722, + "step": 40620 + }, + { + "epoch": 0.07913062443770229, + "grad_norm": 4.452574729919434, + "learning_rate": 2.9538881631468304e-05, + "loss": 1.7739, + "step": 40635 + }, + { + "epoch": 0.07915983470881255, + "grad_norm": 3.7982211112976074, + "learning_rate": 2.953854289091859e-05, + "loss": 1.8986, + "step": 40650 + }, + { + "epoch": 0.07918904497992281, + "grad_norm": 2.014842987060547, + "learning_rate": 2.9538204027937764e-05, + "loss": 1.9852, + "step": 40665 + }, + { + "epoch": 0.07921825525103307, + "grad_norm": 2.1987805366516113, + "learning_rate": 2.9537865042528672e-05, + "loss": 1.9075, + "step": 40680 + }, + { + "epoch": 0.07924746552214333, + "grad_norm": 4.476477146148682, + "learning_rate": 2.9537525934694177e-05, + "loss": 1.9831, + "step": 40695 + }, + { + "epoch": 0.07927667579325359, + "grad_norm": 2.2541050910949707, + "learning_rate": 2.953718670443713e-05, + "loss": 1.9745, + "step": 40710 + }, + { + "epoch": 0.07930588606436385, + "grad_norm": 3.9762489795684814, + "learning_rate": 2.953684735176039e-05, + "loss": 1.7637, + "step": 40725 + }, + { + "epoch": 0.07933509633547413, + "grad_norm": 3.4468305110931396, + "learning_rate": 2.9536507876666815e-05, + "loss": 1.721, + "step": 40740 + }, + { + "epoch": 0.07936430660658439, + "grad_norm": 4.6930623054504395, + "learning_rate": 2.953616827915926e-05, + "loss": 1.8222, + "step": 40755 + }, + { + "epoch": 0.07939351687769465, + "grad_norm": 2.1669819355010986, + "learning_rate": 2.953582855924059e-05, + "loss": 1.9004, + "step": 40770 + }, + { + "epoch": 0.07942272714880491, + "grad_norm": 2.0153393745422363, + "learning_rate": 2.953548871691366e-05, + "loss": 1.7301, + "step": 40785 + }, + { + "epoch": 0.07945193741991517, + "grad_norm": 3.3808844089508057, + "learning_rate": 2.9535148752181336e-05, + "loss": 1.8399, + "step": 40800 + }, + { + "epoch": 0.07948114769102543, + "grad_norm": 2.579909563064575, + "learning_rate": 2.9534808665046484e-05, + "loss": 1.838, + "step": 40815 + }, + { + "epoch": 0.0795103579621357, + "grad_norm": 4.192935943603516, + "learning_rate": 2.953446845551196e-05, + "loss": 1.9925, + "step": 40830 + }, + { + "epoch": 0.07953956823324596, + "grad_norm": 3.621619701385498, + "learning_rate": 2.9534128123580632e-05, + "loss": 1.8791, + "step": 40845 + }, + { + "epoch": 0.07956877850435623, + "grad_norm": 2.2754971981048584, + "learning_rate": 2.953378766925537e-05, + "loss": 2.041, + "step": 40860 + }, + { + "epoch": 0.07959798877546649, + "grad_norm": 2.975817918777466, + "learning_rate": 2.9533447092539037e-05, + "loss": 1.6106, + "step": 40875 + }, + { + "epoch": 0.07962719904657675, + "grad_norm": 3.3760697841644287, + "learning_rate": 2.95331063934345e-05, + "loss": 1.9524, + "step": 40890 + }, + { + "epoch": 0.07965640931768701, + "grad_norm": 5.000794887542725, + "learning_rate": 2.953276557194464e-05, + "loss": 1.9093, + "step": 40905 + }, + { + "epoch": 0.07968561958879727, + "grad_norm": 1.6077380180358887, + "learning_rate": 2.9532424628072302e-05, + "loss": 1.9355, + "step": 40920 + }, + { + "epoch": 0.07971482985990754, + "grad_norm": 1.9960527420043945, + "learning_rate": 2.953208356182038e-05, + "loss": 1.8322, + "step": 40935 + }, + { + "epoch": 0.0797440401310178, + "grad_norm": 1.9623360633850098, + "learning_rate": 2.9531742373191738e-05, + "loss": 1.7081, + "step": 40950 + }, + { + "epoch": 0.07977325040212807, + "grad_norm": 3.023564338684082, + "learning_rate": 2.9531401062189253e-05, + "loss": 1.8457, + "step": 40965 + }, + { + "epoch": 0.07980246067323833, + "grad_norm": 2.2501392364501953, + "learning_rate": 2.953105962881579e-05, + "loss": 1.9475, + "step": 40980 + }, + { + "epoch": 0.07983167094434859, + "grad_norm": 2.9284751415252686, + "learning_rate": 2.9530718073074235e-05, + "loss": 1.9101, + "step": 40995 + }, + { + "epoch": 0.07986088121545885, + "grad_norm": 5.609287738800049, + "learning_rate": 2.9530376394967457e-05, + "loss": 1.9273, + "step": 41010 + }, + { + "epoch": 0.07989009148656911, + "grad_norm": 3.8887929916381836, + "learning_rate": 2.9530034594498334e-05, + "loss": 1.8836, + "step": 41025 + }, + { + "epoch": 0.07991930175767938, + "grad_norm": 3.847130537033081, + "learning_rate": 2.952969267166975e-05, + "loss": 2.0747, + "step": 41040 + }, + { + "epoch": 0.07994851202878964, + "grad_norm": 1.7652339935302734, + "learning_rate": 2.952935062648458e-05, + "loss": 1.7563, + "step": 41055 + }, + { + "epoch": 0.07997772229989991, + "grad_norm": 4.982182025909424, + "learning_rate": 2.9529008458945703e-05, + "loss": 1.8016, + "step": 41070 + }, + { + "epoch": 0.08000693257101017, + "grad_norm": 4.783311367034912, + "learning_rate": 2.9528666169056e-05, + "loss": 1.9055, + "step": 41085 + }, + { + "epoch": 0.08003614284212043, + "grad_norm": 2.197756767272949, + "learning_rate": 2.952832375681836e-05, + "loss": 1.7831, + "step": 41100 + }, + { + "epoch": 0.0800653531132307, + "grad_norm": 2.6935906410217285, + "learning_rate": 2.9527981222235662e-05, + "loss": 1.8409, + "step": 41115 + }, + { + "epoch": 0.08009456338434096, + "grad_norm": 2.441845178604126, + "learning_rate": 2.9527638565310787e-05, + "loss": 1.7458, + "step": 41130 + }, + { + "epoch": 0.08012377365545122, + "grad_norm": 2.773716688156128, + "learning_rate": 2.952729578604663e-05, + "loss": 1.8769, + "step": 41145 + }, + { + "epoch": 0.08015298392656148, + "grad_norm": 2.964289665222168, + "learning_rate": 2.9526952884446066e-05, + "loss": 1.7942, + "step": 41160 + }, + { + "epoch": 0.08018219419767175, + "grad_norm": 3.813652992248535, + "learning_rate": 2.9526609860511993e-05, + "loss": 1.8336, + "step": 41175 + }, + { + "epoch": 0.08021140446878201, + "grad_norm": 2.442913293838501, + "learning_rate": 2.9526266714247295e-05, + "loss": 1.9877, + "step": 41190 + }, + { + "epoch": 0.08024061473989227, + "grad_norm": 2.4335250854492188, + "learning_rate": 2.9525923445654863e-05, + "loss": 1.7597, + "step": 41205 + }, + { + "epoch": 0.08026982501100254, + "grad_norm": 3.397404670715332, + "learning_rate": 2.9525580054737582e-05, + "loss": 1.9251, + "step": 41220 + }, + { + "epoch": 0.0802990352821128, + "grad_norm": 2.41749906539917, + "learning_rate": 2.9525236541498353e-05, + "loss": 2.0381, + "step": 41235 + }, + { + "epoch": 0.08032824555322306, + "grad_norm": 4.3670654296875, + "learning_rate": 2.952489290594006e-05, + "loss": 1.796, + "step": 41250 + }, + { + "epoch": 0.08035745582433332, + "grad_norm": 4.269131183624268, + "learning_rate": 2.9524549148065607e-05, + "loss": 1.8409, + "step": 41265 + }, + { + "epoch": 0.08038666609544359, + "grad_norm": 3.3020429611206055, + "learning_rate": 2.9524205267877874e-05, + "loss": 1.956, + "step": 41280 + }, + { + "epoch": 0.08041587636655385, + "grad_norm": 2.023874282836914, + "learning_rate": 2.9523861265379773e-05, + "loss": 1.8352, + "step": 41295 + }, + { + "epoch": 0.08044508663766411, + "grad_norm": 1.9308613538742065, + "learning_rate": 2.952351714057419e-05, + "loss": 1.888, + "step": 41310 + }, + { + "epoch": 0.08047429690877438, + "grad_norm": 5.0439558029174805, + "learning_rate": 2.9523172893464033e-05, + "loss": 1.7923, + "step": 41325 + }, + { + "epoch": 0.08050350717988464, + "grad_norm": 2.352971076965332, + "learning_rate": 2.9522828524052184e-05, + "loss": 1.8482, + "step": 41340 + }, + { + "epoch": 0.0805327174509949, + "grad_norm": 3.005178213119507, + "learning_rate": 2.952248403234156e-05, + "loss": 1.9192, + "step": 41355 + }, + { + "epoch": 0.08056192772210516, + "grad_norm": 2.740795612335205, + "learning_rate": 2.9522139418335058e-05, + "loss": 1.9969, + "step": 41370 + }, + { + "epoch": 0.08059113799321543, + "grad_norm": 2.5653722286224365, + "learning_rate": 2.952179468203557e-05, + "loss": 1.9732, + "step": 41385 + }, + { + "epoch": 0.0806203482643257, + "grad_norm": 2.194342613220215, + "learning_rate": 2.9521449823446013e-05, + "loss": 1.929, + "step": 41400 + }, + { + "epoch": 0.08064955853543596, + "grad_norm": 2.7607100009918213, + "learning_rate": 2.952110484256928e-05, + "loss": 1.8187, + "step": 41415 + }, + { + "epoch": 0.08067876880654622, + "grad_norm": 3.4654762744903564, + "learning_rate": 2.9520759739408285e-05, + "loss": 1.8563, + "step": 41430 + }, + { + "epoch": 0.08070797907765648, + "grad_norm": 2.130572557449341, + "learning_rate": 2.9520414513965926e-05, + "loss": 1.8695, + "step": 41445 + }, + { + "epoch": 0.08073718934876674, + "grad_norm": 3.304382085800171, + "learning_rate": 2.9520069166245114e-05, + "loss": 1.8876, + "step": 41460 + }, + { + "epoch": 0.080766399619877, + "grad_norm": 3.911742925643921, + "learning_rate": 2.951972369624876e-05, + "loss": 1.7005, + "step": 41475 + }, + { + "epoch": 0.08079560989098727, + "grad_norm": 4.597911357879639, + "learning_rate": 2.9519378103979773e-05, + "loss": 2.0433, + "step": 41490 + }, + { + "epoch": 0.08082482016209754, + "grad_norm": 5.262660980224609, + "learning_rate": 2.9519032389441057e-05, + "loss": 1.8291, + "step": 41505 + }, + { + "epoch": 0.0808540304332078, + "grad_norm": 2.321345090866089, + "learning_rate": 2.9518686552635527e-05, + "loss": 1.9505, + "step": 41520 + }, + { + "epoch": 0.08088324070431806, + "grad_norm": 4.096843242645264, + "learning_rate": 2.9518340593566098e-05, + "loss": 1.7984, + "step": 41535 + }, + { + "epoch": 0.08091245097542832, + "grad_norm": 4.057320594787598, + "learning_rate": 2.9517994512235678e-05, + "loss": 1.6034, + "step": 41550 + }, + { + "epoch": 0.08094166124653858, + "grad_norm": 2.995281219482422, + "learning_rate": 2.9517648308647186e-05, + "loss": 1.7187, + "step": 41565 + }, + { + "epoch": 0.08097087151764884, + "grad_norm": 2.0676705837249756, + "learning_rate": 2.951730198280354e-05, + "loss": 1.8877, + "step": 41580 + }, + { + "epoch": 0.08100008178875912, + "grad_norm": 3.508944272994995, + "learning_rate": 2.9516955534707643e-05, + "loss": 1.932, + "step": 41595 + }, + { + "epoch": 0.08102929205986938, + "grad_norm": 5.34934139251709, + "learning_rate": 2.9516608964362426e-05, + "loss": 1.8071, + "step": 41610 + }, + { + "epoch": 0.08105850233097964, + "grad_norm": 2.5603349208831787, + "learning_rate": 2.9516262271770802e-05, + "loss": 1.8635, + "step": 41625 + }, + { + "epoch": 0.0810877126020899, + "grad_norm": 3.6904408931732178, + "learning_rate": 2.95159154569357e-05, + "loss": 1.9512, + "step": 41640 + }, + { + "epoch": 0.08111692287320016, + "grad_norm": 3.564453125, + "learning_rate": 2.9515568519860022e-05, + "loss": 1.6514, + "step": 41655 + }, + { + "epoch": 0.08114613314431042, + "grad_norm": 2.5232253074645996, + "learning_rate": 2.9515221460546703e-05, + "loss": 1.7861, + "step": 41670 + }, + { + "epoch": 0.08117534341542068, + "grad_norm": 1.795364499092102, + "learning_rate": 2.9514874278998666e-05, + "loss": 1.8497, + "step": 41685 + }, + { + "epoch": 0.08120455368653096, + "grad_norm": 3.8312599658966064, + "learning_rate": 2.9514526975218825e-05, + "loss": 1.7274, + "step": 41700 + }, + { + "epoch": 0.08123376395764122, + "grad_norm": 4.170705795288086, + "learning_rate": 2.9514179549210114e-05, + "loss": 1.8489, + "step": 41715 + }, + { + "epoch": 0.08126297422875148, + "grad_norm": 2.639073371887207, + "learning_rate": 2.951383200097546e-05, + "loss": 1.8641, + "step": 41730 + }, + { + "epoch": 0.08129218449986174, + "grad_norm": 2.9022207260131836, + "learning_rate": 2.951348433051778e-05, + "loss": 1.6874, + "step": 41745 + }, + { + "epoch": 0.081321394770972, + "grad_norm": 4.153652191162109, + "learning_rate": 2.9513136537840008e-05, + "loss": 1.8162, + "step": 41760 + }, + { + "epoch": 0.08135060504208226, + "grad_norm": 1.798093318939209, + "learning_rate": 2.9512788622945075e-05, + "loss": 1.8429, + "step": 41775 + }, + { + "epoch": 0.08137981531319252, + "grad_norm": 3.2090342044830322, + "learning_rate": 2.9512440585835906e-05, + "loss": 1.714, + "step": 41790 + }, + { + "epoch": 0.0814090255843028, + "grad_norm": 2.163233518600464, + "learning_rate": 2.951209242651543e-05, + "loss": 1.809, + "step": 41805 + }, + { + "epoch": 0.08143823585541306, + "grad_norm": 4.145123481750488, + "learning_rate": 2.951174414498659e-05, + "loss": 1.7756, + "step": 41820 + }, + { + "epoch": 0.08146744612652332, + "grad_norm": 4.807456016540527, + "learning_rate": 2.9511395741252308e-05, + "loss": 2.0115, + "step": 41835 + }, + { + "epoch": 0.08149665639763358, + "grad_norm": 2.5362160205841064, + "learning_rate": 2.951104721531552e-05, + "loss": 1.8761, + "step": 41850 + }, + { + "epoch": 0.08152586666874384, + "grad_norm": 3.089874505996704, + "learning_rate": 2.9510698567179163e-05, + "loss": 1.9963, + "step": 41865 + }, + { + "epoch": 0.0815550769398541, + "grad_norm": 2.398677349090576, + "learning_rate": 2.9510349796846176e-05, + "loss": 1.9228, + "step": 41880 + }, + { + "epoch": 0.08158428721096436, + "grad_norm": 3.9991390705108643, + "learning_rate": 2.9510000904319487e-05, + "loss": 1.9077, + "step": 41895 + }, + { + "epoch": 0.08161349748207462, + "grad_norm": 5.66585111618042, + "learning_rate": 2.9509651889602044e-05, + "loss": 1.9959, + "step": 41910 + }, + { + "epoch": 0.0816427077531849, + "grad_norm": 4.229529857635498, + "learning_rate": 2.950930275269678e-05, + "loss": 1.8397, + "step": 41925 + }, + { + "epoch": 0.08167191802429516, + "grad_norm": 3.0767483711242676, + "learning_rate": 2.9508953493606637e-05, + "loss": 1.7965, + "step": 41940 + }, + { + "epoch": 0.08170112829540542, + "grad_norm": 8.055323600769043, + "learning_rate": 2.9508604112334558e-05, + "loss": 1.9566, + "step": 41955 + }, + { + "epoch": 0.08173033856651568, + "grad_norm": 2.262690544128418, + "learning_rate": 2.950825460888348e-05, + "loss": 1.7102, + "step": 41970 + }, + { + "epoch": 0.08175954883762594, + "grad_norm": 3.084411859512329, + "learning_rate": 2.9507904983256347e-05, + "loss": 1.8877, + "step": 41985 + }, + { + "epoch": 0.0817887591087362, + "grad_norm": 3.5004804134368896, + "learning_rate": 2.9507555235456113e-05, + "loss": 1.8649, + "step": 42000 + }, + { + "epoch": 0.08181796937984646, + "grad_norm": 3.7755775451660156, + "learning_rate": 2.9507205365485712e-05, + "loss": 1.7779, + "step": 42015 + }, + { + "epoch": 0.08184717965095674, + "grad_norm": 2.6499814987182617, + "learning_rate": 2.9506855373348095e-05, + "loss": 1.8773, + "step": 42030 + }, + { + "epoch": 0.081876389922067, + "grad_norm": 3.725449562072754, + "learning_rate": 2.950650525904621e-05, + "loss": 1.6785, + "step": 42045 + }, + { + "epoch": 0.08190560019317726, + "grad_norm": 2.3505465984344482, + "learning_rate": 2.9506155022583e-05, + "loss": 1.9197, + "step": 42060 + }, + { + "epoch": 0.08193481046428752, + "grad_norm": 3.0493154525756836, + "learning_rate": 2.950580466396142e-05, + "loss": 1.828, + "step": 42075 + }, + { + "epoch": 0.08196402073539778, + "grad_norm": 1.90041184425354, + "learning_rate": 2.9505454183184422e-05, + "loss": 1.8616, + "step": 42090 + }, + { + "epoch": 0.08199323100650804, + "grad_norm": 3.4464054107666016, + "learning_rate": 2.9505103580254948e-05, + "loss": 1.8181, + "step": 42105 + }, + { + "epoch": 0.0820224412776183, + "grad_norm": 2.4218642711639404, + "learning_rate": 2.9504752855175964e-05, + "loss": 1.7809, + "step": 42120 + }, + { + "epoch": 0.08205165154872858, + "grad_norm": 3.8021080493927, + "learning_rate": 2.950440200795041e-05, + "loss": 1.8681, + "step": 42135 + }, + { + "epoch": 0.08208086181983884, + "grad_norm": 4.094058036804199, + "learning_rate": 2.950405103858125e-05, + "loss": 1.8401, + "step": 42150 + }, + { + "epoch": 0.0821100720909491, + "grad_norm": 3.918071985244751, + "learning_rate": 2.9503699947071435e-05, + "loss": 1.8717, + "step": 42165 + }, + { + "epoch": 0.08213928236205936, + "grad_norm": 2.4167909622192383, + "learning_rate": 2.9503348733423925e-05, + "loss": 1.8206, + "step": 42180 + }, + { + "epoch": 0.08216849263316962, + "grad_norm": 4.92215633392334, + "learning_rate": 2.9502997397641674e-05, + "loss": 1.9056, + "step": 42195 + }, + { + "epoch": 0.08219770290427988, + "grad_norm": 4.555650234222412, + "learning_rate": 2.9502645939727643e-05, + "loss": 1.6728, + "step": 42210 + }, + { + "epoch": 0.08222691317539015, + "grad_norm": 3.3694517612457275, + "learning_rate": 2.9502294359684786e-05, + "loss": 1.7773, + "step": 42225 + }, + { + "epoch": 0.08225612344650042, + "grad_norm": 2.3496294021606445, + "learning_rate": 2.9501942657516076e-05, + "loss": 1.9216, + "step": 42240 + }, + { + "epoch": 0.08228533371761068, + "grad_norm": 2.9559128284454346, + "learning_rate": 2.9501590833224462e-05, + "loss": 1.9598, + "step": 42255 + }, + { + "epoch": 0.08231454398872094, + "grad_norm": 2.4877378940582275, + "learning_rate": 2.9501238886812913e-05, + "loss": 1.9056, + "step": 42270 + }, + { + "epoch": 0.0823437542598312, + "grad_norm": 3.095447063446045, + "learning_rate": 2.950088681828439e-05, + "loss": 1.8015, + "step": 42285 + }, + { + "epoch": 0.08237296453094146, + "grad_norm": 3.4131603240966797, + "learning_rate": 2.9500534627641862e-05, + "loss": 1.9394, + "step": 42300 + }, + { + "epoch": 0.08240217480205173, + "grad_norm": 2.7747130393981934, + "learning_rate": 2.9500182314888296e-05, + "loss": 2.0229, + "step": 42315 + }, + { + "epoch": 0.08243138507316199, + "grad_norm": 3.355043888092041, + "learning_rate": 2.9499829880026647e-05, + "loss": 1.8285, + "step": 42330 + }, + { + "epoch": 0.08246059534427226, + "grad_norm": 3.1953892707824707, + "learning_rate": 2.94994773230599e-05, + "loss": 1.8574, + "step": 42345 + }, + { + "epoch": 0.08248980561538252, + "grad_norm": 2.5266549587249756, + "learning_rate": 2.949912464399101e-05, + "loss": 1.8056, + "step": 42360 + }, + { + "epoch": 0.08251901588649278, + "grad_norm": 1.8549035787582397, + "learning_rate": 2.949877184282295e-05, + "loss": 1.9169, + "step": 42375 + }, + { + "epoch": 0.08254822615760304, + "grad_norm": 4.0360612869262695, + "learning_rate": 2.9498418919558698e-05, + "loss": 1.8708, + "step": 42390 + }, + { + "epoch": 0.0825774364287133, + "grad_norm": 2.234935998916626, + "learning_rate": 2.9498065874201223e-05, + "loss": 1.6797, + "step": 42405 + }, + { + "epoch": 0.08260664669982357, + "grad_norm": 3.7219271659851074, + "learning_rate": 2.949771270675349e-05, + "loss": 1.8735, + "step": 42420 + }, + { + "epoch": 0.08263585697093383, + "grad_norm": 3.9249229431152344, + "learning_rate": 2.9497359417218483e-05, + "loss": 1.9485, + "step": 42435 + }, + { + "epoch": 0.0826650672420441, + "grad_norm": 2.7230405807495117, + "learning_rate": 2.9497006005599174e-05, + "loss": 1.8244, + "step": 42450 + }, + { + "epoch": 0.08269427751315436, + "grad_norm": 3.115924119949341, + "learning_rate": 2.9496652471898535e-05, + "loss": 1.858, + "step": 42465 + }, + { + "epoch": 0.08272348778426462, + "grad_norm": 2.483771324157715, + "learning_rate": 2.949629881611955e-05, + "loss": 1.8566, + "step": 42480 + }, + { + "epoch": 0.08275269805537488, + "grad_norm": 3.040868043899536, + "learning_rate": 2.9495945038265194e-05, + "loss": 1.6987, + "step": 42495 + }, + { + "epoch": 0.08278190832648515, + "grad_norm": 2.728682279586792, + "learning_rate": 2.9495591138338442e-05, + "loss": 1.8991, + "step": 42510 + }, + { + "epoch": 0.0828111185975954, + "grad_norm": 2.2937636375427246, + "learning_rate": 2.9495237116342282e-05, + "loss": 1.801, + "step": 42525 + }, + { + "epoch": 0.08284032886870567, + "grad_norm": 2.9612514972686768, + "learning_rate": 2.949488297227969e-05, + "loss": 1.9554, + "step": 42540 + }, + { + "epoch": 0.08286953913981594, + "grad_norm": 2.841144323348999, + "learning_rate": 2.9494528706153652e-05, + "loss": 1.6266, + "step": 42555 + }, + { + "epoch": 0.0828987494109262, + "grad_norm": 2.340595245361328, + "learning_rate": 2.9494174317967144e-05, + "loss": 1.6548, + "step": 42570 + }, + { + "epoch": 0.08292795968203646, + "grad_norm": 3.0034701824188232, + "learning_rate": 2.949381980772316e-05, + "loss": 1.7248, + "step": 42585 + }, + { + "epoch": 0.08295716995314673, + "grad_norm": 3.8806467056274414, + "learning_rate": 2.949346517542468e-05, + "loss": 1.8022, + "step": 42600 + }, + { + "epoch": 0.08298638022425699, + "grad_norm": 2.9137628078460693, + "learning_rate": 2.9493110421074692e-05, + "loss": 1.8976, + "step": 42615 + }, + { + "epoch": 0.08301559049536725, + "grad_norm": 2.0429863929748535, + "learning_rate": 2.949275554467618e-05, + "loss": 1.6848, + "step": 42630 + }, + { + "epoch": 0.08304480076647751, + "grad_norm": 7.566515922546387, + "learning_rate": 2.949240054623214e-05, + "loss": 1.902, + "step": 42645 + }, + { + "epoch": 0.08307401103758778, + "grad_norm": 2.313962697982788, + "learning_rate": 2.9492045425745554e-05, + "loss": 1.8932, + "step": 42660 + }, + { + "epoch": 0.08310322130869804, + "grad_norm": 3.407179117202759, + "learning_rate": 2.9491690183219413e-05, + "loss": 1.7459, + "step": 42675 + }, + { + "epoch": 0.0831324315798083, + "grad_norm": 3.9030697345733643, + "learning_rate": 2.9491334818656716e-05, + "loss": 1.9114, + "step": 42690 + }, + { + "epoch": 0.08316164185091857, + "grad_norm": 2.450484275817871, + "learning_rate": 2.9490979332060444e-05, + "loss": 1.9702, + "step": 42705 + }, + { + "epoch": 0.08319085212202883, + "grad_norm": 4.048968315124512, + "learning_rate": 2.9490623723433603e-05, + "loss": 1.9229, + "step": 42720 + }, + { + "epoch": 0.08322006239313909, + "grad_norm": 4.658755302429199, + "learning_rate": 2.949026799277918e-05, + "loss": 1.824, + "step": 42735 + }, + { + "epoch": 0.08324927266424935, + "grad_norm": 3.0579545497894287, + "learning_rate": 2.9489912140100167e-05, + "loss": 1.7174, + "step": 42750 + }, + { + "epoch": 0.08327848293535962, + "grad_norm": 1.9854892492294312, + "learning_rate": 2.948955616539957e-05, + "loss": 1.7176, + "step": 42765 + }, + { + "epoch": 0.08330769320646988, + "grad_norm": 4.234609127044678, + "learning_rate": 2.9489200068680388e-05, + "loss": 1.9884, + "step": 42780 + }, + { + "epoch": 0.08333690347758015, + "grad_norm": 3.243767023086548, + "learning_rate": 2.9488843849945608e-05, + "loss": 1.8489, + "step": 42795 + }, + { + "epoch": 0.0833661137486904, + "grad_norm": 2.477585792541504, + "learning_rate": 2.9488487509198235e-05, + "loss": 1.8609, + "step": 42810 + }, + { + "epoch": 0.08339532401980067, + "grad_norm": 3.6641616821289062, + "learning_rate": 2.9488131046441276e-05, + "loss": 1.9305, + "step": 42825 + }, + { + "epoch": 0.08342453429091093, + "grad_norm": 2.413062334060669, + "learning_rate": 2.9487774461677725e-05, + "loss": 1.833, + "step": 42840 + }, + { + "epoch": 0.08345374456202119, + "grad_norm": 4.575281620025635, + "learning_rate": 2.9487417754910587e-05, + "loss": 1.8558, + "step": 42855 + }, + { + "epoch": 0.08348295483313146, + "grad_norm": 2.5189337730407715, + "learning_rate": 2.9487060926142868e-05, + "loss": 1.8171, + "step": 42870 + }, + { + "epoch": 0.08351216510424173, + "grad_norm": 2.8013970851898193, + "learning_rate": 2.948670397537757e-05, + "loss": 2.0007, + "step": 42885 + }, + { + "epoch": 0.08354137537535199, + "grad_norm": 4.385171890258789, + "learning_rate": 2.94863469026177e-05, + "loss": 1.791, + "step": 42900 + }, + { + "epoch": 0.08357058564646225, + "grad_norm": 3.104492425918579, + "learning_rate": 2.9485989707866266e-05, + "loss": 1.8535, + "step": 42915 + }, + { + "epoch": 0.08359979591757251, + "grad_norm": 1.912163496017456, + "learning_rate": 2.9485632391126274e-05, + "loss": 1.7938, + "step": 42930 + }, + { + "epoch": 0.08362900618868277, + "grad_norm": 3.2798397541046143, + "learning_rate": 2.9485274952400738e-05, + "loss": 1.8626, + "step": 42945 + }, + { + "epoch": 0.08365821645979303, + "grad_norm": 3.1956722736358643, + "learning_rate": 2.9484917391692663e-05, + "loss": 1.9577, + "step": 42960 + }, + { + "epoch": 0.08368742673090329, + "grad_norm": 4.458518028259277, + "learning_rate": 2.9484559709005063e-05, + "loss": 1.9054, + "step": 42975 + }, + { + "epoch": 0.08371663700201357, + "grad_norm": 2.3765296936035156, + "learning_rate": 2.9484201904340944e-05, + "loss": 2.0728, + "step": 42990 + }, + { + "epoch": 0.08374584727312383, + "grad_norm": 2.685070514678955, + "learning_rate": 2.9483843977703322e-05, + "loss": 2.0102, + "step": 43005 + }, + { + "epoch": 0.08377505754423409, + "grad_norm": 2.6258368492126465, + "learning_rate": 2.948348592909522e-05, + "loss": 1.8041, + "step": 43020 + }, + { + "epoch": 0.08380426781534435, + "grad_norm": 2.5244250297546387, + "learning_rate": 2.948312775851964e-05, + "loss": 1.8507, + "step": 43035 + }, + { + "epoch": 0.08383347808645461, + "grad_norm": 2.486104965209961, + "learning_rate": 2.948276946597961e-05, + "loss": 1.8822, + "step": 43050 + }, + { + "epoch": 0.08386268835756487, + "grad_norm": 3.720233917236328, + "learning_rate": 2.9482411051478137e-05, + "loss": 1.6757, + "step": 43065 + }, + { + "epoch": 0.08389189862867513, + "grad_norm": 2.1805083751678467, + "learning_rate": 2.9482052515018247e-05, + "loss": 1.9184, + "step": 43080 + }, + { + "epoch": 0.08392110889978541, + "grad_norm": 6.519444465637207, + "learning_rate": 2.9481693856602953e-05, + "loss": 1.8735, + "step": 43095 + }, + { + "epoch": 0.08395031917089567, + "grad_norm": 3.030322790145874, + "learning_rate": 2.9481335076235277e-05, + "loss": 2.0388, + "step": 43110 + }, + { + "epoch": 0.08397952944200593, + "grad_norm": 3.333339214324951, + "learning_rate": 2.9480976173918245e-05, + "loss": 1.7558, + "step": 43125 + }, + { + "epoch": 0.08400873971311619, + "grad_norm": 2.7111990451812744, + "learning_rate": 2.9480617149654872e-05, + "loss": 1.7693, + "step": 43140 + }, + { + "epoch": 0.08403794998422645, + "grad_norm": 3.3287692070007324, + "learning_rate": 2.948025800344819e-05, + "loss": 1.8074, + "step": 43155 + }, + { + "epoch": 0.08406716025533671, + "grad_norm": 2.8257551193237305, + "learning_rate": 2.9479898735301217e-05, + "loss": 2.0247, + "step": 43170 + }, + { + "epoch": 0.08409637052644697, + "grad_norm": 2.685481071472168, + "learning_rate": 2.947953934521698e-05, + "loss": 1.8955, + "step": 43185 + }, + { + "epoch": 0.08412558079755725, + "grad_norm": 2.8256003856658936, + "learning_rate": 2.9479179833198504e-05, + "loss": 1.6823, + "step": 43200 + }, + { + "epoch": 0.08415479106866751, + "grad_norm": 2.5234475135803223, + "learning_rate": 2.947882019924882e-05, + "loss": 1.8303, + "step": 43215 + }, + { + "epoch": 0.08418400133977777, + "grad_norm": 2.968050003051758, + "learning_rate": 2.9478460443370956e-05, + "loss": 1.8881, + "step": 43230 + }, + { + "epoch": 0.08421321161088803, + "grad_norm": 2.8748507499694824, + "learning_rate": 2.947810056556794e-05, + "loss": 1.918, + "step": 43245 + }, + { + "epoch": 0.08424242188199829, + "grad_norm": 3.759028434753418, + "learning_rate": 2.94777405658428e-05, + "loss": 1.7358, + "step": 43260 + }, + { + "epoch": 0.08427163215310855, + "grad_norm": 2.4832756519317627, + "learning_rate": 2.947738044419857e-05, + "loss": 1.8534, + "step": 43275 + }, + { + "epoch": 0.08430084242421881, + "grad_norm": 2.4439778327941895, + "learning_rate": 2.9477020200638287e-05, + "loss": 1.6968, + "step": 43290 + }, + { + "epoch": 0.08433005269532909, + "grad_norm": 3.8299081325531006, + "learning_rate": 2.947665983516498e-05, + "loss": 1.9434, + "step": 43305 + }, + { + "epoch": 0.08435926296643935, + "grad_norm": 2.444437026977539, + "learning_rate": 2.947629934778168e-05, + "loss": 1.7078, + "step": 43320 + }, + { + "epoch": 0.08438847323754961, + "grad_norm": 4.316204071044922, + "learning_rate": 2.9475938738491432e-05, + "loss": 2.0225, + "step": 43335 + }, + { + "epoch": 0.08441768350865987, + "grad_norm": 2.716829538345337, + "learning_rate": 2.947557800729727e-05, + "loss": 1.8284, + "step": 43350 + }, + { + "epoch": 0.08444689377977013, + "grad_norm": 1.8645912408828735, + "learning_rate": 2.9475217154202225e-05, + "loss": 1.8862, + "step": 43365 + }, + { + "epoch": 0.0844761040508804, + "grad_norm": 2.9769399166107178, + "learning_rate": 2.9474856179209343e-05, + "loss": 1.9656, + "step": 43380 + }, + { + "epoch": 0.08450531432199065, + "grad_norm": 2.7728824615478516, + "learning_rate": 2.947449508232166e-05, + "loss": 1.8666, + "step": 43395 + }, + { + "epoch": 0.08453452459310093, + "grad_norm": 2.3458614349365234, + "learning_rate": 2.9474133863542216e-05, + "loss": 1.9181, + "step": 43410 + }, + { + "epoch": 0.08456373486421119, + "grad_norm": 2.8927040100097656, + "learning_rate": 2.9473772522874057e-05, + "loss": 1.8441, + "step": 43425 + }, + { + "epoch": 0.08459294513532145, + "grad_norm": 3.4922873973846436, + "learning_rate": 2.9473411060320226e-05, + "loss": 1.9061, + "step": 43440 + }, + { + "epoch": 0.08462215540643171, + "grad_norm": 2.4876444339752197, + "learning_rate": 2.9473049475883762e-05, + "loss": 1.8024, + "step": 43455 + }, + { + "epoch": 0.08465136567754197, + "grad_norm": 2.6545233726501465, + "learning_rate": 2.9472687769567716e-05, + "loss": 1.764, + "step": 43470 + }, + { + "epoch": 0.08468057594865223, + "grad_norm": 3.1122703552246094, + "learning_rate": 2.947232594137513e-05, + "loss": 1.921, + "step": 43485 + }, + { + "epoch": 0.0847097862197625, + "grad_norm": 3.626193046569824, + "learning_rate": 2.947196399130905e-05, + "loss": 1.7188, + "step": 43500 + }, + { + "epoch": 0.08473899649087277, + "grad_norm": 3.0047879219055176, + "learning_rate": 2.9471601919372522e-05, + "loss": 1.9129, + "step": 43515 + }, + { + "epoch": 0.08476820676198303, + "grad_norm": 4.2390456199646, + "learning_rate": 2.9471239725568606e-05, + "loss": 1.7775, + "step": 43530 + }, + { + "epoch": 0.08479741703309329, + "grad_norm": 2.4242374897003174, + "learning_rate": 2.947087740990034e-05, + "loss": 1.9588, + "step": 43545 + }, + { + "epoch": 0.08482662730420355, + "grad_norm": 3.0172243118286133, + "learning_rate": 2.9470514972370784e-05, + "loss": 1.7774, + "step": 43560 + }, + { + "epoch": 0.08485583757531381, + "grad_norm": 2.9357850551605225, + "learning_rate": 2.9470152412982985e-05, + "loss": 1.8818, + "step": 43575 + }, + { + "epoch": 0.08488504784642407, + "grad_norm": 3.8699593544006348, + "learning_rate": 2.9469789731739996e-05, + "loss": 1.8835, + "step": 43590 + }, + { + "epoch": 0.08491425811753434, + "grad_norm": 6.237668037414551, + "learning_rate": 2.9469426928644877e-05, + "loss": 1.8208, + "step": 43605 + }, + { + "epoch": 0.08494346838864461, + "grad_norm": 3.9943366050720215, + "learning_rate": 2.946906400370067e-05, + "loss": 1.7787, + "step": 43620 + }, + { + "epoch": 0.08497267865975487, + "grad_norm": 3.0211308002471924, + "learning_rate": 2.946870095691045e-05, + "loss": 1.9552, + "step": 43635 + }, + { + "epoch": 0.08500188893086513, + "grad_norm": 3.7002623081207275, + "learning_rate": 2.9468337788277255e-05, + "loss": 1.7297, + "step": 43650 + }, + { + "epoch": 0.0850310992019754, + "grad_norm": 3.6715025901794434, + "learning_rate": 2.946797449780416e-05, + "loss": 1.9472, + "step": 43665 + }, + { + "epoch": 0.08506030947308565, + "grad_norm": 3.456092596054077, + "learning_rate": 2.9467611085494212e-05, + "loss": 1.8751, + "step": 43680 + }, + { + "epoch": 0.08508951974419592, + "grad_norm": 5.426120758056641, + "learning_rate": 2.946724755135048e-05, + "loss": 1.8391, + "step": 43695 + }, + { + "epoch": 0.08511873001530618, + "grad_norm": 2.2696099281311035, + "learning_rate": 2.946688389537602e-05, + "loss": 1.7744, + "step": 43710 + }, + { + "epoch": 0.08514794028641645, + "grad_norm": 4.38014030456543, + "learning_rate": 2.9466520117573897e-05, + "loss": 2.0601, + "step": 43725 + }, + { + "epoch": 0.08517715055752671, + "grad_norm": 2.5647072792053223, + "learning_rate": 2.946615621794717e-05, + "loss": 1.8444, + "step": 43740 + }, + { + "epoch": 0.08520636082863697, + "grad_norm": 2.123666286468506, + "learning_rate": 2.946579219649891e-05, + "loss": 1.923, + "step": 43755 + }, + { + "epoch": 0.08523557109974723, + "grad_norm": 2.335630178451538, + "learning_rate": 2.946542805323218e-05, + "loss": 1.8607, + "step": 43770 + }, + { + "epoch": 0.0852647813708575, + "grad_norm": 2.300090789794922, + "learning_rate": 2.946506378815004e-05, + "loss": 1.8034, + "step": 43785 + }, + { + "epoch": 0.08529399164196776, + "grad_norm": 2.441096067428589, + "learning_rate": 2.946469940125557e-05, + "loss": 1.899, + "step": 43800 + }, + { + "epoch": 0.08532320191307802, + "grad_norm": 3.2876861095428467, + "learning_rate": 2.946433489255183e-05, + "loss": 1.8932, + "step": 43815 + }, + { + "epoch": 0.08535241218418829, + "grad_norm": 3.2221438884735107, + "learning_rate": 2.946397026204189e-05, + "loss": 1.8407, + "step": 43830 + }, + { + "epoch": 0.08538162245529855, + "grad_norm": 1.7918709516525269, + "learning_rate": 2.9463605509728823e-05, + "loss": 1.8096, + "step": 43845 + }, + { + "epoch": 0.08541083272640881, + "grad_norm": 3.6342146396636963, + "learning_rate": 2.9463240635615704e-05, + "loss": 1.8982, + "step": 43860 + }, + { + "epoch": 0.08544004299751908, + "grad_norm": 3.7016751766204834, + "learning_rate": 2.9462875639705598e-05, + "loss": 1.7444, + "step": 43875 + }, + { + "epoch": 0.08546925326862934, + "grad_norm": 4.3460259437561035, + "learning_rate": 2.946251052200158e-05, + "loss": 1.8327, + "step": 43890 + }, + { + "epoch": 0.0854984635397396, + "grad_norm": 2.2607274055480957, + "learning_rate": 2.946214528250673e-05, + "loss": 1.9111, + "step": 43905 + }, + { + "epoch": 0.08552767381084986, + "grad_norm": 2.9234113693237305, + "learning_rate": 2.9461779921224118e-05, + "loss": 1.8753, + "step": 43920 + }, + { + "epoch": 0.08555688408196013, + "grad_norm": 4.067141532897949, + "learning_rate": 2.9461414438156825e-05, + "loss": 1.6669, + "step": 43935 + }, + { + "epoch": 0.0855860943530704, + "grad_norm": 2.427942991256714, + "learning_rate": 2.9461048833307926e-05, + "loss": 1.7331, + "step": 43950 + }, + { + "epoch": 0.08561530462418065, + "grad_norm": 2.4281039237976074, + "learning_rate": 2.94606831066805e-05, + "loss": 1.9844, + "step": 43965 + }, + { + "epoch": 0.08564451489529092, + "grad_norm": 4.457733154296875, + "learning_rate": 2.946031725827763e-05, + "loss": 2.0372, + "step": 43980 + }, + { + "epoch": 0.08567372516640118, + "grad_norm": 4.074513912200928, + "learning_rate": 2.94599512881024e-05, + "loss": 1.8984, + "step": 43995 + }, + { + "epoch": 0.08570293543751144, + "grad_norm": 4.201878547668457, + "learning_rate": 2.9459585196157876e-05, + "loss": 1.9715, + "step": 44010 + }, + { + "epoch": 0.0857321457086217, + "grad_norm": 4.125231742858887, + "learning_rate": 2.9459218982447156e-05, + "loss": 1.709, + "step": 44025 + }, + { + "epoch": 0.08576135597973196, + "grad_norm": 4.726757526397705, + "learning_rate": 2.945885264697332e-05, + "loss": 1.784, + "step": 44040 + }, + { + "epoch": 0.08579056625084223, + "grad_norm": 2.9422481060028076, + "learning_rate": 2.945848618973945e-05, + "loss": 1.9584, + "step": 44055 + }, + { + "epoch": 0.0858197765219525, + "grad_norm": 3.583988666534424, + "learning_rate": 2.9458119610748635e-05, + "loss": 1.9566, + "step": 44070 + }, + { + "epoch": 0.08584898679306276, + "grad_norm": 3.0787510871887207, + "learning_rate": 2.9457752910003965e-05, + "loss": 1.8864, + "step": 44085 + }, + { + "epoch": 0.08587819706417302, + "grad_norm": 2.8716557025909424, + "learning_rate": 2.9457386087508517e-05, + "loss": 1.8846, + "step": 44100 + }, + { + "epoch": 0.08590740733528328, + "grad_norm": 2.74885630607605, + "learning_rate": 2.9457019143265392e-05, + "loss": 1.8493, + "step": 44115 + }, + { + "epoch": 0.08593661760639354, + "grad_norm": 2.7693309783935547, + "learning_rate": 2.945665207727768e-05, + "loss": 1.9983, + "step": 44130 + }, + { + "epoch": 0.0859658278775038, + "grad_norm": 4.332008361816406, + "learning_rate": 2.945628488954846e-05, + "loss": 1.6724, + "step": 44145 + }, + { + "epoch": 0.08599503814861408, + "grad_norm": 3.0237069129943848, + "learning_rate": 2.9455917580080834e-05, + "loss": 1.9695, + "step": 44160 + }, + { + "epoch": 0.08602424841972434, + "grad_norm": 3.4789276123046875, + "learning_rate": 2.9455550148877895e-05, + "loss": 1.9119, + "step": 44175 + }, + { + "epoch": 0.0860534586908346, + "grad_norm": 4.246069431304932, + "learning_rate": 2.9455182595942735e-05, + "loss": 1.8591, + "step": 44190 + }, + { + "epoch": 0.08608266896194486, + "grad_norm": 3.916719436645508, + "learning_rate": 2.9454814921278444e-05, + "loss": 1.8525, + "step": 44205 + }, + { + "epoch": 0.08611187923305512, + "grad_norm": 3.07059383392334, + "learning_rate": 2.945444712488813e-05, + "loss": 1.814, + "step": 44220 + }, + { + "epoch": 0.08614108950416538, + "grad_norm": 1.955853819847107, + "learning_rate": 2.945407920677488e-05, + "loss": 1.8526, + "step": 44235 + }, + { + "epoch": 0.08617029977527564, + "grad_norm": 3.5619003772735596, + "learning_rate": 2.9453711166941797e-05, + "loss": 1.9248, + "step": 44250 + }, + { + "epoch": 0.08619951004638592, + "grad_norm": 3.539492607116699, + "learning_rate": 2.9453343005391984e-05, + "loss": 1.9684, + "step": 44265 + }, + { + "epoch": 0.08622872031749618, + "grad_norm": 3.498070240020752, + "learning_rate": 2.9452974722128533e-05, + "loss": 1.892, + "step": 44280 + }, + { + "epoch": 0.08625793058860644, + "grad_norm": 1.820780634880066, + "learning_rate": 2.9452606317154546e-05, + "loss": 1.8162, + "step": 44295 + }, + { + "epoch": 0.0862871408597167, + "grad_norm": 4.180066108703613, + "learning_rate": 2.9452237790473132e-05, + "loss": 1.8408, + "step": 44310 + }, + { + "epoch": 0.08631635113082696, + "grad_norm": 3.168524980545044, + "learning_rate": 2.945186914208739e-05, + "loss": 1.8724, + "step": 44325 + }, + { + "epoch": 0.08634556140193722, + "grad_norm": 3.31315541267395, + "learning_rate": 2.9451500372000425e-05, + "loss": 1.9657, + "step": 44340 + }, + { + "epoch": 0.08637477167304748, + "grad_norm": 3.3687546253204346, + "learning_rate": 2.9451131480215343e-05, + "loss": 2.011, + "step": 44355 + }, + { + "epoch": 0.08640398194415776, + "grad_norm": 2.4594132900238037, + "learning_rate": 2.945076246673525e-05, + "loss": 1.8296, + "step": 44370 + }, + { + "epoch": 0.08643319221526802, + "grad_norm": 2.2705142498016357, + "learning_rate": 2.9450393331563254e-05, + "loss": 1.7826, + "step": 44385 + }, + { + "epoch": 0.08646240248637828, + "grad_norm": 4.375278949737549, + "learning_rate": 2.9450024074702465e-05, + "loss": 1.7716, + "step": 44400 + }, + { + "epoch": 0.08649161275748854, + "grad_norm": 2.814610481262207, + "learning_rate": 2.944965469615599e-05, + "loss": 1.7126, + "step": 44415 + }, + { + "epoch": 0.0865208230285988, + "grad_norm": 2.329692840576172, + "learning_rate": 2.9449285195926942e-05, + "loss": 1.6665, + "step": 44430 + }, + { + "epoch": 0.08655003329970906, + "grad_norm": 2.890446901321411, + "learning_rate": 2.944891557401843e-05, + "loss": 1.6763, + "step": 44445 + }, + { + "epoch": 0.08657924357081932, + "grad_norm": 1.720476746559143, + "learning_rate": 2.9448545830433565e-05, + "loss": 1.8273, + "step": 44460 + }, + { + "epoch": 0.0866084538419296, + "grad_norm": 2.7548534870147705, + "learning_rate": 2.9448175965175465e-05, + "loss": 1.7968, + "step": 44475 + }, + { + "epoch": 0.08663766411303986, + "grad_norm": 2.2770633697509766, + "learning_rate": 2.9447805978247244e-05, + "loss": 1.8821, + "step": 44490 + }, + { + "epoch": 0.08666687438415012, + "grad_norm": 1.8714686632156372, + "learning_rate": 2.944743586965202e-05, + "loss": 1.8877, + "step": 44505 + }, + { + "epoch": 0.08669608465526038, + "grad_norm": 3.634554624557495, + "learning_rate": 2.9447065639392902e-05, + "loss": 1.8628, + "step": 44520 + }, + { + "epoch": 0.08672529492637064, + "grad_norm": 3.813159465789795, + "learning_rate": 2.9446695287473015e-05, + "loss": 1.9144, + "step": 44535 + }, + { + "epoch": 0.0867545051974809, + "grad_norm": 3.10882568359375, + "learning_rate": 2.9446324813895475e-05, + "loss": 1.8602, + "step": 44550 + }, + { + "epoch": 0.08678371546859116, + "grad_norm": 2.8320236206054688, + "learning_rate": 2.94459542186634e-05, + "loss": 1.8681, + "step": 44565 + }, + { + "epoch": 0.08681292573970144, + "grad_norm": 2.379601001739502, + "learning_rate": 2.9445583501779914e-05, + "loss": 2.0004, + "step": 44580 + }, + { + "epoch": 0.0868421360108117, + "grad_norm": 4.256722927093506, + "learning_rate": 2.9445212663248136e-05, + "loss": 1.8432, + "step": 44595 + }, + { + "epoch": 0.08687134628192196, + "grad_norm": 3.5287508964538574, + "learning_rate": 2.944484170307119e-05, + "loss": 2.0324, + "step": 44610 + }, + { + "epoch": 0.08690055655303222, + "grad_norm": 5.067081451416016, + "learning_rate": 2.9444470621252206e-05, + "loss": 1.7134, + "step": 44625 + }, + { + "epoch": 0.08692976682414248, + "grad_norm": 2.34912109375, + "learning_rate": 2.94440994177943e-05, + "loss": 1.9109, + "step": 44640 + }, + { + "epoch": 0.08695897709525274, + "grad_norm": 2.16899037361145, + "learning_rate": 2.9443728092700598e-05, + "loss": 1.939, + "step": 44655 + }, + { + "epoch": 0.086988187366363, + "grad_norm": 2.583625555038452, + "learning_rate": 2.9443356645974234e-05, + "loss": 1.9217, + "step": 44670 + }, + { + "epoch": 0.08701739763747328, + "grad_norm": 3.3169472217559814, + "learning_rate": 2.9442985077618333e-05, + "loss": 1.9824, + "step": 44685 + }, + { + "epoch": 0.08704660790858354, + "grad_norm": 3.411571979522705, + "learning_rate": 2.944261338763602e-05, + "loss": 1.865, + "step": 44700 + }, + { + "epoch": 0.0870758181796938, + "grad_norm": 2.273318290710449, + "learning_rate": 2.9442241576030427e-05, + "loss": 1.7771, + "step": 44715 + }, + { + "epoch": 0.08710502845080406, + "grad_norm": 1.9476948976516724, + "learning_rate": 2.944186964280469e-05, + "loss": 1.8675, + "step": 44730 + }, + { + "epoch": 0.08713423872191432, + "grad_norm": 2.1292243003845215, + "learning_rate": 2.944149758796194e-05, + "loss": 1.7585, + "step": 44745 + }, + { + "epoch": 0.08716344899302458, + "grad_norm": 3.4270260334014893, + "learning_rate": 2.9441125411505303e-05, + "loss": 1.7854, + "step": 44760 + }, + { + "epoch": 0.08719265926413484, + "grad_norm": 2.7164952754974365, + "learning_rate": 2.944075311343792e-05, + "loss": 1.8049, + "step": 44775 + }, + { + "epoch": 0.08722186953524512, + "grad_norm": 4.517385005950928, + "learning_rate": 2.9440380693762927e-05, + "loss": 1.7206, + "step": 44790 + }, + { + "epoch": 0.08725107980635538, + "grad_norm": 4.334033489227295, + "learning_rate": 2.9440008152483452e-05, + "loss": 1.7694, + "step": 44805 + }, + { + "epoch": 0.08728029007746564, + "grad_norm": 3.0128273963928223, + "learning_rate": 2.943963548960264e-05, + "loss": 1.8354, + "step": 44820 + }, + { + "epoch": 0.0873095003485759, + "grad_norm": 6.147031784057617, + "learning_rate": 2.9439262705123626e-05, + "loss": 1.7589, + "step": 44835 + }, + { + "epoch": 0.08733871061968616, + "grad_norm": 3.1377012729644775, + "learning_rate": 2.943888979904955e-05, + "loss": 1.8435, + "step": 44850 + }, + { + "epoch": 0.08736792089079642, + "grad_norm": 3.672729969024658, + "learning_rate": 2.9438516771383554e-05, + "loss": 1.765, + "step": 44865 + }, + { + "epoch": 0.08739713116190669, + "grad_norm": 2.4337987899780273, + "learning_rate": 2.9438143622128772e-05, + "loss": 1.8002, + "step": 44880 + }, + { + "epoch": 0.08742634143301696, + "grad_norm": 2.0574724674224854, + "learning_rate": 2.9437770351288357e-05, + "loss": 1.6424, + "step": 44895 + }, + { + "epoch": 0.08745555170412722, + "grad_norm": 2.1482346057891846, + "learning_rate": 2.9437396958865446e-05, + "loss": 1.92, + "step": 44910 + }, + { + "epoch": 0.08748476197523748, + "grad_norm": 2.3287675380706787, + "learning_rate": 2.9437023444863187e-05, + "loss": 1.9602, + "step": 44925 + }, + { + "epoch": 0.08751397224634774, + "grad_norm": 3.350952625274658, + "learning_rate": 2.943664980928472e-05, + "loss": 1.8386, + "step": 44940 + }, + { + "epoch": 0.087543182517458, + "grad_norm": 3.5424532890319824, + "learning_rate": 2.9436276052133196e-05, + "loss": 1.878, + "step": 44955 + }, + { + "epoch": 0.08757239278856827, + "grad_norm": 2.377460241317749, + "learning_rate": 2.943590217341176e-05, + "loss": 1.9263, + "step": 44970 + }, + { + "epoch": 0.08760160305967853, + "grad_norm": 3.12561297416687, + "learning_rate": 2.943552817312356e-05, + "loss": 1.9618, + "step": 44985 + }, + { + "epoch": 0.0876308133307888, + "grad_norm": 2.070358991622925, + "learning_rate": 2.9435154051271748e-05, + "loss": 1.7616, + "step": 45000 + }, + { + "epoch": 0.08766002360189906, + "grad_norm": 1.9374375343322754, + "learning_rate": 2.9434779807859477e-05, + "loss": 1.9208, + "step": 45015 + }, + { + "epoch": 0.08768923387300932, + "grad_norm": 5.294185161590576, + "learning_rate": 2.943440544288989e-05, + "loss": 1.8057, + "step": 45030 + }, + { + "epoch": 0.08771844414411958, + "grad_norm": 3.713681697845459, + "learning_rate": 2.943403095636615e-05, + "loss": 1.8673, + "step": 45045 + }, + { + "epoch": 0.08774765441522984, + "grad_norm": 2.3561294078826904, + "learning_rate": 2.94336563482914e-05, + "loss": 1.8507, + "step": 45060 + }, + { + "epoch": 0.0877768646863401, + "grad_norm": 2.0783450603485107, + "learning_rate": 2.9433281618668805e-05, + "loss": 1.8737, + "step": 45075 + }, + { + "epoch": 0.08780607495745037, + "grad_norm": 3.277926445007324, + "learning_rate": 2.943290676750151e-05, + "loss": 1.9591, + "step": 45090 + }, + { + "epoch": 0.08783528522856063, + "grad_norm": 3.0757181644439697, + "learning_rate": 2.9432531794792683e-05, + "loss": 1.9447, + "step": 45105 + }, + { + "epoch": 0.0878644954996709, + "grad_norm": 1.8041589260101318, + "learning_rate": 2.943215670054547e-05, + "loss": 1.8033, + "step": 45120 + }, + { + "epoch": 0.08789370577078116, + "grad_norm": 2.865537405014038, + "learning_rate": 2.9431781484763037e-05, + "loss": 1.9961, + "step": 45135 + }, + { + "epoch": 0.08792291604189142, + "grad_norm": 2.375731945037842, + "learning_rate": 2.9431406147448545e-05, + "loss": 1.8439, + "step": 45150 + }, + { + "epoch": 0.08795212631300169, + "grad_norm": 3.0966978073120117, + "learning_rate": 2.9431030688605154e-05, + "loss": 2.1031, + "step": 45165 + }, + { + "epoch": 0.08798133658411195, + "grad_norm": 2.7011566162109375, + "learning_rate": 2.9430655108236018e-05, + "loss": 1.7007, + "step": 45180 + }, + { + "epoch": 0.08801054685522221, + "grad_norm": 2.7523937225341797, + "learning_rate": 2.943027940634431e-05, + "loss": 1.8057, + "step": 45195 + }, + { + "epoch": 0.08803975712633247, + "grad_norm": 3.8090713024139404, + "learning_rate": 2.9429903582933186e-05, + "loss": 2.0009, + "step": 45210 + }, + { + "epoch": 0.08806896739744274, + "grad_norm": 2.986457586288452, + "learning_rate": 2.9429527638005816e-05, + "loss": 1.7333, + "step": 45225 + }, + { + "epoch": 0.088098177668553, + "grad_norm": 2.3911290168762207, + "learning_rate": 2.942915157156537e-05, + "loss": 1.72, + "step": 45240 + }, + { + "epoch": 0.08812738793966327, + "grad_norm": 3.72906756401062, + "learning_rate": 2.9428775383614998e-05, + "loss": 2.0707, + "step": 45255 + }, + { + "epoch": 0.08815659821077353, + "grad_norm": 3.646193504333496, + "learning_rate": 2.942839907415789e-05, + "loss": 1.808, + "step": 45270 + }, + { + "epoch": 0.08818580848188379, + "grad_norm": 2.3747620582580566, + "learning_rate": 2.9428022643197196e-05, + "loss": 1.8695, + "step": 45285 + }, + { + "epoch": 0.08821501875299405, + "grad_norm": 4.011888027191162, + "learning_rate": 2.94276460907361e-05, + "loss": 1.7226, + "step": 45300 + }, + { + "epoch": 0.08824422902410431, + "grad_norm": 5.516124248504639, + "learning_rate": 2.9427269416777765e-05, + "loss": 1.8723, + "step": 45315 + }, + { + "epoch": 0.08827343929521458, + "grad_norm": 3.8080711364746094, + "learning_rate": 2.9426892621325364e-05, + "loss": 2.1226, + "step": 45330 + }, + { + "epoch": 0.08830264956632485, + "grad_norm": 3.3641180992126465, + "learning_rate": 2.9426515704382073e-05, + "loss": 1.9913, + "step": 45345 + }, + { + "epoch": 0.0883318598374351, + "grad_norm": 2.9782888889312744, + "learning_rate": 2.9426138665951062e-05, + "loss": 1.9224, + "step": 45360 + }, + { + "epoch": 0.08836107010854537, + "grad_norm": 3.1421496868133545, + "learning_rate": 2.9425761506035506e-05, + "loss": 1.7136, + "step": 45375 + }, + { + "epoch": 0.08839028037965563, + "grad_norm": 3.040264844894409, + "learning_rate": 2.942538422463859e-05, + "loss": 1.7175, + "step": 45390 + }, + { + "epoch": 0.08841949065076589, + "grad_norm": 2.179347038269043, + "learning_rate": 2.942500682176348e-05, + "loss": 1.7239, + "step": 45405 + }, + { + "epoch": 0.08844870092187615, + "grad_norm": 2.941560983657837, + "learning_rate": 2.9424629297413365e-05, + "loss": 1.7729, + "step": 45420 + }, + { + "epoch": 0.08847791119298642, + "grad_norm": 3.767063617706299, + "learning_rate": 2.942425165159141e-05, + "loss": 1.8561, + "step": 45435 + }, + { + "epoch": 0.08850712146409669, + "grad_norm": 2.905930757522583, + "learning_rate": 2.942387388430081e-05, + "loss": 1.9278, + "step": 45450 + }, + { + "epoch": 0.08853633173520695, + "grad_norm": 4.0752458572387695, + "learning_rate": 2.9423495995544735e-05, + "loss": 1.9129, + "step": 45465 + }, + { + "epoch": 0.08856554200631721, + "grad_norm": 3.448967218399048, + "learning_rate": 2.942311798532637e-05, + "loss": 2.0275, + "step": 45480 + }, + { + "epoch": 0.08859475227742747, + "grad_norm": 3.87823748588562, + "learning_rate": 2.9422739853648903e-05, + "loss": 1.81, + "step": 45495 + }, + { + "epoch": 0.08862396254853773, + "grad_norm": 2.705256938934326, + "learning_rate": 2.9422361600515515e-05, + "loss": 1.8793, + "step": 45510 + }, + { + "epoch": 0.08865317281964799, + "grad_norm": 3.3810877799987793, + "learning_rate": 2.942198322592939e-05, + "loss": 1.9076, + "step": 45525 + }, + { + "epoch": 0.08868238309075827, + "grad_norm": 1.9039440155029297, + "learning_rate": 2.9421604729893716e-05, + "loss": 2.1173, + "step": 45540 + }, + { + "epoch": 0.08871159336186853, + "grad_norm": 2.4718310832977295, + "learning_rate": 2.942122611241168e-05, + "loss": 1.8903, + "step": 45555 + }, + { + "epoch": 0.08874080363297879, + "grad_norm": 1.8678919076919556, + "learning_rate": 2.942084737348647e-05, + "loss": 1.9504, + "step": 45570 + }, + { + "epoch": 0.08877001390408905, + "grad_norm": 4.566815376281738, + "learning_rate": 2.942046851312128e-05, + "loss": 1.8323, + "step": 45585 + }, + { + "epoch": 0.08879922417519931, + "grad_norm": 2.7292160987854004, + "learning_rate": 2.9420089531319295e-05, + "loss": 1.896, + "step": 45600 + }, + { + "epoch": 0.08882843444630957, + "grad_norm": 4.215520858764648, + "learning_rate": 2.9419710428083705e-05, + "loss": 1.9013, + "step": 45615 + }, + { + "epoch": 0.08885764471741983, + "grad_norm": 4.495998859405518, + "learning_rate": 2.941933120341771e-05, + "loss": 2.0898, + "step": 45630 + }, + { + "epoch": 0.0888868549885301, + "grad_norm": 4.300086975097656, + "learning_rate": 2.94189518573245e-05, + "loss": 2.0319, + "step": 45645 + }, + { + "epoch": 0.08891606525964037, + "grad_norm": 2.787299156188965, + "learning_rate": 2.9418572389807263e-05, + "loss": 1.9491, + "step": 45660 + }, + { + "epoch": 0.08894527553075063, + "grad_norm": 3.497532844543457, + "learning_rate": 2.94181928008692e-05, + "loss": 1.852, + "step": 45675 + }, + { + "epoch": 0.08897448580186089, + "grad_norm": 2.902589797973633, + "learning_rate": 2.941781309051351e-05, + "loss": 1.8578, + "step": 45690 + }, + { + "epoch": 0.08900369607297115, + "grad_norm": 4.419853210449219, + "learning_rate": 2.9417433258743386e-05, + "loss": 1.8007, + "step": 45705 + }, + { + "epoch": 0.08903290634408141, + "grad_norm": 1.7236632108688354, + "learning_rate": 2.941705330556203e-05, + "loss": 2.0065, + "step": 45720 + }, + { + "epoch": 0.08906211661519167, + "grad_norm": 3.044177532196045, + "learning_rate": 2.9416673230972642e-05, + "loss": 1.7847, + "step": 45735 + }, + { + "epoch": 0.08909132688630195, + "grad_norm": 4.658002853393555, + "learning_rate": 2.941629303497842e-05, + "loss": 1.6592, + "step": 45750 + }, + { + "epoch": 0.08912053715741221, + "grad_norm": 5.669498443603516, + "learning_rate": 2.9415912717582572e-05, + "loss": 1.7555, + "step": 45765 + }, + { + "epoch": 0.08914974742852247, + "grad_norm": 3.3243215084075928, + "learning_rate": 2.9415532278788288e-05, + "loss": 1.854, + "step": 45780 + }, + { + "epoch": 0.08917895769963273, + "grad_norm": 2.8647665977478027, + "learning_rate": 2.9415151718598783e-05, + "loss": 1.9471, + "step": 45795 + }, + { + "epoch": 0.08920816797074299, + "grad_norm": 2.4509527683258057, + "learning_rate": 2.9414771037017256e-05, + "loss": 1.6894, + "step": 45810 + }, + { + "epoch": 0.08923737824185325, + "grad_norm": 2.953076124191284, + "learning_rate": 2.9414390234046916e-05, + "loss": 1.5692, + "step": 45825 + }, + { + "epoch": 0.08926658851296351, + "grad_norm": 3.253838300704956, + "learning_rate": 2.9414009309690968e-05, + "loss": 1.85, + "step": 45840 + }, + { + "epoch": 0.08929579878407379, + "grad_norm": 2.290224075317383, + "learning_rate": 2.9413628263952616e-05, + "loss": 2.0317, + "step": 45855 + }, + { + "epoch": 0.08932500905518405, + "grad_norm": 4.230685234069824, + "learning_rate": 2.941324709683508e-05, + "loss": 1.7835, + "step": 45870 + }, + { + "epoch": 0.08935421932629431, + "grad_norm": 2.5834014415740967, + "learning_rate": 2.941286580834156e-05, + "loss": 1.8562, + "step": 45885 + }, + { + "epoch": 0.08938342959740457, + "grad_norm": 3.2415804862976074, + "learning_rate": 2.941248439847527e-05, + "loss": 1.7909, + "step": 45900 + }, + { + "epoch": 0.08941263986851483, + "grad_norm": 4.316317081451416, + "learning_rate": 2.941210286723942e-05, + "loss": 1.7632, + "step": 45915 + }, + { + "epoch": 0.08944185013962509, + "grad_norm": 2.109940528869629, + "learning_rate": 2.941172121463723e-05, + "loss": 1.6742, + "step": 45930 + }, + { + "epoch": 0.08947106041073535, + "grad_norm": 3.1508097648620605, + "learning_rate": 2.9411339440671907e-05, + "loss": 2.0397, + "step": 45945 + }, + { + "epoch": 0.08950027068184563, + "grad_norm": 3.212796211242676, + "learning_rate": 2.9410957545346665e-05, + "loss": 1.8796, + "step": 45960 + }, + { + "epoch": 0.08952948095295589, + "grad_norm": 3.6629080772399902, + "learning_rate": 2.9410575528664725e-05, + "loss": 1.9981, + "step": 45975 + }, + { + "epoch": 0.08955869122406615, + "grad_norm": 3.9494054317474365, + "learning_rate": 2.94101933906293e-05, + "loss": 1.8718, + "step": 45990 + }, + { + "epoch": 0.08958790149517641, + "grad_norm": 4.630847930908203, + "learning_rate": 2.9409811131243612e-05, + "loss": 1.9543, + "step": 46005 + }, + { + "epoch": 0.08961711176628667, + "grad_norm": 3.8639118671417236, + "learning_rate": 2.9409428750510873e-05, + "loss": 2.0264, + "step": 46020 + }, + { + "epoch": 0.08964632203739693, + "grad_norm": 4.643235683441162, + "learning_rate": 2.9409046248434314e-05, + "loss": 1.9919, + "step": 46035 + }, + { + "epoch": 0.0896755323085072, + "grad_norm": 2.777958393096924, + "learning_rate": 2.9408663625017145e-05, + "loss": 1.9507, + "step": 46050 + }, + { + "epoch": 0.08970474257961747, + "grad_norm": 3.0993664264678955, + "learning_rate": 2.9408280880262595e-05, + "loss": 1.6426, + "step": 46065 + }, + { + "epoch": 0.08973395285072773, + "grad_norm": 3.035386323928833, + "learning_rate": 2.9407898014173888e-05, + "loss": 2.0447, + "step": 46080 + }, + { + "epoch": 0.08976316312183799, + "grad_norm": 4.014139175415039, + "learning_rate": 2.9407515026754244e-05, + "loss": 1.6994, + "step": 46095 + }, + { + "epoch": 0.08979237339294825, + "grad_norm": 2.161834716796875, + "learning_rate": 2.9407131918006888e-05, + "loss": 1.9796, + "step": 46110 + }, + { + "epoch": 0.08982158366405851, + "grad_norm": 2.8618903160095215, + "learning_rate": 2.940674868793505e-05, + "loss": 1.714, + "step": 46125 + }, + { + "epoch": 0.08985079393516877, + "grad_norm": 3.7389118671417236, + "learning_rate": 2.9406365336541953e-05, + "loss": 1.7062, + "step": 46140 + }, + { + "epoch": 0.08988000420627903, + "grad_norm": 2.665452241897583, + "learning_rate": 2.9405981863830828e-05, + "loss": 1.8467, + "step": 46155 + }, + { + "epoch": 0.0899092144773893, + "grad_norm": 3.7498791217803955, + "learning_rate": 2.9405598269804903e-05, + "loss": 1.7833, + "step": 46170 + }, + { + "epoch": 0.08993842474849957, + "grad_norm": 4.2607035636901855, + "learning_rate": 2.9405214554467404e-05, + "loss": 1.9136, + "step": 46185 + }, + { + "epoch": 0.08996763501960983, + "grad_norm": 3.001507043838501, + "learning_rate": 2.9404830717821577e-05, + "loss": 1.917, + "step": 46200 + }, + { + "epoch": 0.08999684529072009, + "grad_norm": 2.4021408557891846, + "learning_rate": 2.9404446759870636e-05, + "loss": 1.9636, + "step": 46215 + }, + { + "epoch": 0.09002605556183035, + "grad_norm": 3.0223772525787354, + "learning_rate": 2.940406268061783e-05, + "loss": 2.0063, + "step": 46230 + }, + { + "epoch": 0.09005526583294061, + "grad_norm": 3.279000759124756, + "learning_rate": 2.9403678480066383e-05, + "loss": 1.7461, + "step": 46245 + }, + { + "epoch": 0.09008447610405088, + "grad_norm": 4.059992790222168, + "learning_rate": 2.9403294158219534e-05, + "loss": 1.8883, + "step": 46260 + }, + { + "epoch": 0.09011368637516114, + "grad_norm": 1.6985243558883667, + "learning_rate": 2.940290971508052e-05, + "loss": 1.8436, + "step": 46275 + }, + { + "epoch": 0.09014289664627141, + "grad_norm": 4.409860610961914, + "learning_rate": 2.940252515065258e-05, + "loss": 1.6792, + "step": 46290 + }, + { + "epoch": 0.09017210691738167, + "grad_norm": 3.3904550075531006, + "learning_rate": 2.9402140464938943e-05, + "loss": 1.9071, + "step": 46305 + }, + { + "epoch": 0.09020131718849193, + "grad_norm": 2.881351947784424, + "learning_rate": 2.9401755657942858e-05, + "loss": 1.808, + "step": 46320 + }, + { + "epoch": 0.0902305274596022, + "grad_norm": 3.654620409011841, + "learning_rate": 2.9401370729667565e-05, + "loss": 1.9737, + "step": 46335 + }, + { + "epoch": 0.09025973773071246, + "grad_norm": 4.317416667938232, + "learning_rate": 2.9400985680116304e-05, + "loss": 1.696, + "step": 46350 + }, + { + "epoch": 0.09028894800182272, + "grad_norm": 2.8537566661834717, + "learning_rate": 2.940060050929232e-05, + "loss": 1.8523, + "step": 46365 + }, + { + "epoch": 0.09031815827293298, + "grad_norm": 3.021620273590088, + "learning_rate": 2.940021521719885e-05, + "loss": 1.8763, + "step": 46380 + }, + { + "epoch": 0.09034736854404325, + "grad_norm": 2.3264734745025635, + "learning_rate": 2.939982980383914e-05, + "loss": 2.0591, + "step": 46395 + }, + { + "epoch": 0.09037657881515351, + "grad_norm": 3.1548166275024414, + "learning_rate": 2.939944426921644e-05, + "loss": 1.9082, + "step": 46410 + }, + { + "epoch": 0.09040578908626377, + "grad_norm": 2.1883738040924072, + "learning_rate": 2.9399058613333992e-05, + "loss": 1.8559, + "step": 46425 + }, + { + "epoch": 0.09043499935737404, + "grad_norm": 3.480323076248169, + "learning_rate": 2.9398672836195053e-05, + "loss": 1.6578, + "step": 46440 + }, + { + "epoch": 0.0904642096284843, + "grad_norm": 4.378427028656006, + "learning_rate": 2.939828693780286e-05, + "loss": 1.7655, + "step": 46455 + }, + { + "epoch": 0.09049341989959456, + "grad_norm": 2.4116623401641846, + "learning_rate": 2.9397900918160667e-05, + "loss": 1.7197, + "step": 46470 + }, + { + "epoch": 0.09052263017070482, + "grad_norm": 2.80350923538208, + "learning_rate": 2.939751477727173e-05, + "loss": 1.9612, + "step": 46485 + }, + { + "epoch": 0.09055184044181509, + "grad_norm": 2.7664198875427246, + "learning_rate": 2.9397128515139292e-05, + "loss": 1.7675, + "step": 46500 + }, + { + "epoch": 0.09058105071292535, + "grad_norm": 2.6225290298461914, + "learning_rate": 2.9396742131766615e-05, + "loss": 1.65, + "step": 46515 + }, + { + "epoch": 0.09061026098403561, + "grad_norm": 4.222823143005371, + "learning_rate": 2.9396355627156942e-05, + "loss": 1.6833, + "step": 46530 + }, + { + "epoch": 0.09063947125514588, + "grad_norm": 4.5323967933654785, + "learning_rate": 2.9395969001313538e-05, + "loss": 2.0888, + "step": 46545 + }, + { + "epoch": 0.09066868152625614, + "grad_norm": 5.949542045593262, + "learning_rate": 2.939558225423965e-05, + "loss": 2.0995, + "step": 46560 + }, + { + "epoch": 0.0906978917973664, + "grad_norm": 1.957553505897522, + "learning_rate": 2.939519538593854e-05, + "loss": 1.7008, + "step": 46575 + }, + { + "epoch": 0.09072710206847666, + "grad_norm": 3.7566747665405273, + "learning_rate": 2.9394808396413463e-05, + "loss": 1.9004, + "step": 46590 + }, + { + "epoch": 0.09075631233958693, + "grad_norm": 4.103549957275391, + "learning_rate": 2.9394421285667684e-05, + "loss": 1.7224, + "step": 46605 + }, + { + "epoch": 0.0907855226106972, + "grad_norm": 4.131819725036621, + "learning_rate": 2.9394034053704457e-05, + "loss": 2.0474, + "step": 46620 + }, + { + "epoch": 0.09081473288180746, + "grad_norm": 4.420338153839111, + "learning_rate": 2.9393646700527047e-05, + "loss": 1.9015, + "step": 46635 + }, + { + "epoch": 0.09084394315291772, + "grad_norm": 4.487269401550293, + "learning_rate": 2.939325922613871e-05, + "loss": 2.0197, + "step": 46650 + }, + { + "epoch": 0.09087315342402798, + "grad_norm": 5.222062587738037, + "learning_rate": 2.939287163054272e-05, + "loss": 1.9145, + "step": 46665 + }, + { + "epoch": 0.09090236369513824, + "grad_norm": 5.2663655281066895, + "learning_rate": 2.9392483913742325e-05, + "loss": 1.7939, + "step": 46680 + }, + { + "epoch": 0.0909315739662485, + "grad_norm": 6.097779273986816, + "learning_rate": 2.9392096075740804e-05, + "loss": 1.975, + "step": 46695 + }, + { + "epoch": 0.09096078423735877, + "grad_norm": 2.3450088500976562, + "learning_rate": 2.9391708116541417e-05, + "loss": 1.9715, + "step": 46710 + }, + { + "epoch": 0.09098999450846904, + "grad_norm": 3.173430919647217, + "learning_rate": 2.939132003614743e-05, + "loss": 1.8709, + "step": 46725 + }, + { + "epoch": 0.0910192047795793, + "grad_norm": 3.0776472091674805, + "learning_rate": 2.9390931834562113e-05, + "loss": 1.7442, + "step": 46740 + }, + { + "epoch": 0.09104841505068956, + "grad_norm": 2.8173866271972656, + "learning_rate": 2.9390543511788735e-05, + "loss": 1.9549, + "step": 46755 + }, + { + "epoch": 0.09107762532179982, + "grad_norm": 4.186956405639648, + "learning_rate": 2.939015506783057e-05, + "loss": 1.7209, + "step": 46770 + }, + { + "epoch": 0.09110683559291008, + "grad_norm": 1.6793681383132935, + "learning_rate": 2.938976650269088e-05, + "loss": 1.6313, + "step": 46785 + }, + { + "epoch": 0.09113604586402034, + "grad_norm": 2.971320152282715, + "learning_rate": 2.9389377816372947e-05, + "loss": 1.8833, + "step": 46800 + }, + { + "epoch": 0.09116525613513062, + "grad_norm": 2.4654321670532227, + "learning_rate": 2.938898900888004e-05, + "loss": 1.8134, + "step": 46815 + }, + { + "epoch": 0.09119446640624088, + "grad_norm": 2.388918399810791, + "learning_rate": 2.9388600080215428e-05, + "loss": 1.9447, + "step": 46830 + }, + { + "epoch": 0.09122367667735114, + "grad_norm": 3.474928617477417, + "learning_rate": 2.9388211030382395e-05, + "loss": 1.7331, + "step": 46845 + }, + { + "epoch": 0.0912528869484614, + "grad_norm": 2.335491180419922, + "learning_rate": 2.938782185938421e-05, + "loss": 1.8248, + "step": 46860 + }, + { + "epoch": 0.09128209721957166, + "grad_norm": 2.1897199153900146, + "learning_rate": 2.9387432567224155e-05, + "loss": 1.752, + "step": 46875 + }, + { + "epoch": 0.09131130749068192, + "grad_norm": 2.5771682262420654, + "learning_rate": 2.9387043153905506e-05, + "loss": 1.964, + "step": 46890 + }, + { + "epoch": 0.09134051776179218, + "grad_norm": 2.8829903602600098, + "learning_rate": 2.9386653619431548e-05, + "loss": 2.0032, + "step": 46905 + }, + { + "epoch": 0.09136972803290246, + "grad_norm": 2.8059322834014893, + "learning_rate": 2.938626396380555e-05, + "loss": 1.8491, + "step": 46920 + }, + { + "epoch": 0.09139893830401272, + "grad_norm": 3.0749945640563965, + "learning_rate": 2.9385874187030802e-05, + "loss": 1.7575, + "step": 46935 + }, + { + "epoch": 0.09142814857512298, + "grad_norm": 2.8172197341918945, + "learning_rate": 2.938548428911058e-05, + "loss": 1.7225, + "step": 46950 + }, + { + "epoch": 0.09145735884623324, + "grad_norm": 5.23187780380249, + "learning_rate": 2.9385094270048182e-05, + "loss": 1.7931, + "step": 46965 + }, + { + "epoch": 0.0914865691173435, + "grad_norm": 2.241788387298584, + "learning_rate": 2.9384704129846876e-05, + "loss": 1.9205, + "step": 46980 + }, + { + "epoch": 0.09151577938845376, + "grad_norm": 3.4979000091552734, + "learning_rate": 2.938431386850995e-05, + "loss": 1.8775, + "step": 46995 + }, + { + "epoch": 0.09154498965956402, + "grad_norm": 3.5703864097595215, + "learning_rate": 2.93839234860407e-05, + "loss": 1.7483, + "step": 47010 + }, + { + "epoch": 0.0915741999306743, + "grad_norm": 3.68125319480896, + "learning_rate": 2.9383532982442404e-05, + "loss": 1.9416, + "step": 47025 + }, + { + "epoch": 0.09160341020178456, + "grad_norm": 2.311479091644287, + "learning_rate": 2.9383142357718356e-05, + "loss": 1.7815, + "step": 47040 + }, + { + "epoch": 0.09163262047289482, + "grad_norm": 2.455263137817383, + "learning_rate": 2.9382751611871843e-05, + "loss": 1.8873, + "step": 47055 + }, + { + "epoch": 0.09166183074400508, + "grad_norm": 3.0800442695617676, + "learning_rate": 2.9382360744906153e-05, + "loss": 2.0687, + "step": 47070 + }, + { + "epoch": 0.09169104101511534, + "grad_norm": 1.9920344352722168, + "learning_rate": 2.938196975682458e-05, + "loss": 1.8431, + "step": 47085 + }, + { + "epoch": 0.0917202512862256, + "grad_norm": 2.7366907596588135, + "learning_rate": 2.938157864763042e-05, + "loss": 1.7235, + "step": 47100 + }, + { + "epoch": 0.09174946155733586, + "grad_norm": 2.735663414001465, + "learning_rate": 2.9381187417326964e-05, + "loss": 1.874, + "step": 47115 + }, + { + "epoch": 0.09177867182844614, + "grad_norm": 3.411982774734497, + "learning_rate": 2.93807960659175e-05, + "loss": 1.7868, + "step": 47130 + }, + { + "epoch": 0.0918078820995564, + "grad_norm": 4.17244291305542, + "learning_rate": 2.9380404593405338e-05, + "loss": 1.9122, + "step": 47145 + }, + { + "epoch": 0.09183709237066666, + "grad_norm": 2.1301894187927246, + "learning_rate": 2.9380012999793763e-05, + "loss": 1.9863, + "step": 47160 + }, + { + "epoch": 0.09186630264177692, + "grad_norm": 2.0585176944732666, + "learning_rate": 2.9379621285086072e-05, + "loss": 1.9591, + "step": 47175 + }, + { + "epoch": 0.09189551291288718, + "grad_norm": 3.732590913772583, + "learning_rate": 2.9379229449285575e-05, + "loss": 1.7843, + "step": 47190 + }, + { + "epoch": 0.09192472318399744, + "grad_norm": 3.9266607761383057, + "learning_rate": 2.937883749239556e-05, + "loss": 1.685, + "step": 47205 + }, + { + "epoch": 0.0919539334551077, + "grad_norm": 2.804568290710449, + "learning_rate": 2.9378445414419333e-05, + "loss": 1.9214, + "step": 47220 + }, + { + "epoch": 0.09198314372621796, + "grad_norm": 3.9198246002197266, + "learning_rate": 2.9378053215360194e-05, + "loss": 1.8938, + "step": 47235 + }, + { + "epoch": 0.09201235399732824, + "grad_norm": 3.0435566902160645, + "learning_rate": 2.937766089522145e-05, + "loss": 1.8995, + "step": 47250 + }, + { + "epoch": 0.0920415642684385, + "grad_norm": 4.04058313369751, + "learning_rate": 2.9377268454006397e-05, + "loss": 1.7821, + "step": 47265 + }, + { + "epoch": 0.09207077453954876, + "grad_norm": 3.9782166481018066, + "learning_rate": 2.9376875891718348e-05, + "loss": 1.7486, + "step": 47280 + }, + { + "epoch": 0.09209998481065902, + "grad_norm": 3.0122504234313965, + "learning_rate": 2.93764832083606e-05, + "loss": 1.8464, + "step": 47295 + }, + { + "epoch": 0.09212919508176928, + "grad_norm": 2.877807140350342, + "learning_rate": 2.9376090403936474e-05, + "loss": 1.7075, + "step": 47310 + }, + { + "epoch": 0.09215840535287954, + "grad_norm": 3.1419224739074707, + "learning_rate": 2.937569747844926e-05, + "loss": 1.9716, + "step": 47325 + }, + { + "epoch": 0.0921876156239898, + "grad_norm": 2.612062931060791, + "learning_rate": 2.9375304431902277e-05, + "loss": 1.8906, + "step": 47340 + }, + { + "epoch": 0.09221682589510008, + "grad_norm": 2.703277587890625, + "learning_rate": 2.9374911264298836e-05, + "loss": 1.9261, + "step": 47355 + }, + { + "epoch": 0.09224603616621034, + "grad_norm": 2.6118664741516113, + "learning_rate": 2.9374517975642247e-05, + "loss": 1.8008, + "step": 47370 + }, + { + "epoch": 0.0922752464373206, + "grad_norm": 4.062849521636963, + "learning_rate": 2.9374124565935817e-05, + "loss": 1.8587, + "step": 47385 + }, + { + "epoch": 0.09230445670843086, + "grad_norm": 2.748095989227295, + "learning_rate": 2.9373731035182863e-05, + "loss": 1.9159, + "step": 47400 + }, + { + "epoch": 0.09233366697954112, + "grad_norm": 2.4311137199401855, + "learning_rate": 2.93733373833867e-05, + "loss": 1.8601, + "step": 47415 + }, + { + "epoch": 0.09236287725065138, + "grad_norm": 2.542649269104004, + "learning_rate": 2.937294361055064e-05, + "loss": 1.9995, + "step": 47430 + }, + { + "epoch": 0.09239208752176165, + "grad_norm": 2.3988935947418213, + "learning_rate": 2.9372549716678e-05, + "loss": 1.946, + "step": 47445 + }, + { + "epoch": 0.09242129779287192, + "grad_norm": 3.805635690689087, + "learning_rate": 2.93721557017721e-05, + "loss": 1.8609, + "step": 47460 + }, + { + "epoch": 0.09245050806398218, + "grad_norm": 3.613706111907959, + "learning_rate": 2.9371761565836253e-05, + "loss": 1.7101, + "step": 47475 + }, + { + "epoch": 0.09247971833509244, + "grad_norm": 2.3323137760162354, + "learning_rate": 2.937136730887378e-05, + "loss": 1.8609, + "step": 47490 + }, + { + "epoch": 0.0925089286062027, + "grad_norm": 3.184736967086792, + "learning_rate": 2.9370972930888003e-05, + "loss": 1.8663, + "step": 47505 + }, + { + "epoch": 0.09253813887731296, + "grad_norm": 2.841794967651367, + "learning_rate": 2.937057843188224e-05, + "loss": 1.8648, + "step": 47520 + }, + { + "epoch": 0.09256734914842323, + "grad_norm": 3.5229077339172363, + "learning_rate": 2.937018381185982e-05, + "loss": 1.9722, + "step": 47535 + }, + { + "epoch": 0.09259655941953349, + "grad_norm": 2.2388434410095215, + "learning_rate": 2.9369789070824058e-05, + "loss": 1.8857, + "step": 47550 + }, + { + "epoch": 0.09262576969064376, + "grad_norm": 2.3352901935577393, + "learning_rate": 2.9369394208778277e-05, + "loss": 1.8446, + "step": 47565 + }, + { + "epoch": 0.09265497996175402, + "grad_norm": 2.5115973949432373, + "learning_rate": 2.9368999225725814e-05, + "loss": 1.8825, + "step": 47580 + }, + { + "epoch": 0.09268419023286428, + "grad_norm": 5.338953495025635, + "learning_rate": 2.9368604121669984e-05, + "loss": 1.8563, + "step": 47595 + }, + { + "epoch": 0.09271340050397454, + "grad_norm": 3.1112778186798096, + "learning_rate": 2.936820889661412e-05, + "loss": 1.893, + "step": 47610 + }, + { + "epoch": 0.0927426107750848, + "grad_norm": 3.126549243927002, + "learning_rate": 2.9367813550561542e-05, + "loss": 1.897, + "step": 47625 + }, + { + "epoch": 0.09277182104619507, + "grad_norm": 1.950778603553772, + "learning_rate": 2.9367418083515587e-05, + "loss": 1.8451, + "step": 47640 + }, + { + "epoch": 0.09280103131730533, + "grad_norm": 2.479407548904419, + "learning_rate": 2.936702249547959e-05, + "loss": 2.0399, + "step": 47655 + }, + { + "epoch": 0.0928302415884156, + "grad_norm": 6.256388187408447, + "learning_rate": 2.9366626786456866e-05, + "loss": 1.8861, + "step": 47670 + }, + { + "epoch": 0.09285945185952586, + "grad_norm": 2.9249043464660645, + "learning_rate": 2.9366230956450765e-05, + "loss": 1.9422, + "step": 47685 + }, + { + "epoch": 0.09288866213063612, + "grad_norm": 3.4006667137145996, + "learning_rate": 2.936583500546461e-05, + "loss": 1.6933, + "step": 47700 + }, + { + "epoch": 0.09291787240174638, + "grad_norm": 2.45302677154541, + "learning_rate": 2.936543893350174e-05, + "loss": 1.8965, + "step": 47715 + }, + { + "epoch": 0.09294708267285665, + "grad_norm": 2.511388063430786, + "learning_rate": 2.9365042740565486e-05, + "loss": 1.991, + "step": 47730 + }, + { + "epoch": 0.0929762929439669, + "grad_norm": 2.6051487922668457, + "learning_rate": 2.9364646426659185e-05, + "loss": 2.043, + "step": 47745 + }, + { + "epoch": 0.09300550321507717, + "grad_norm": 2.067706346511841, + "learning_rate": 2.936424999178618e-05, + "loss": 1.7748, + "step": 47760 + }, + { + "epoch": 0.09303471348618744, + "grad_norm": 3.630814552307129, + "learning_rate": 2.9363853435949806e-05, + "loss": 1.873, + "step": 47775 + }, + { + "epoch": 0.0930639237572977, + "grad_norm": 4.217411994934082, + "learning_rate": 2.9363456759153398e-05, + "loss": 1.8956, + "step": 47790 + }, + { + "epoch": 0.09309313402840796, + "grad_norm": 1.8592040538787842, + "learning_rate": 2.9363059961400303e-05, + "loss": 1.917, + "step": 47805 + }, + { + "epoch": 0.09312234429951823, + "grad_norm": 2.441096305847168, + "learning_rate": 2.936266304269386e-05, + "loss": 1.7576, + "step": 47820 + }, + { + "epoch": 0.09315155457062849, + "grad_norm": 3.1484501361846924, + "learning_rate": 2.9362266003037408e-05, + "loss": 1.8034, + "step": 47835 + }, + { + "epoch": 0.09318076484173875, + "grad_norm": 3.104492425918579, + "learning_rate": 2.93618688424343e-05, + "loss": 1.9117, + "step": 47850 + }, + { + "epoch": 0.09320997511284901, + "grad_norm": 4.626640796661377, + "learning_rate": 2.9361471560887868e-05, + "loss": 1.8304, + "step": 47865 + }, + { + "epoch": 0.09323918538395928, + "grad_norm": 3.3960869312286377, + "learning_rate": 2.936107415840147e-05, + "loss": 1.9412, + "step": 47880 + }, + { + "epoch": 0.09326839565506954, + "grad_norm": 2.6509194374084473, + "learning_rate": 2.9360676634978445e-05, + "loss": 1.8504, + "step": 47895 + }, + { + "epoch": 0.0932976059261798, + "grad_norm": 4.512158393859863, + "learning_rate": 2.9360278990622142e-05, + "loss": 1.9797, + "step": 47910 + }, + { + "epoch": 0.09332681619729007, + "grad_norm": 3.2379143238067627, + "learning_rate": 2.935988122533591e-05, + "loss": 1.8505, + "step": 47925 + }, + { + "epoch": 0.09335602646840033, + "grad_norm": 2.5431196689605713, + "learning_rate": 2.9359483339123097e-05, + "loss": 2.036, + "step": 47940 + }, + { + "epoch": 0.09338523673951059, + "grad_norm": 5.5352396965026855, + "learning_rate": 2.9359085331987056e-05, + "loss": 1.9166, + "step": 47955 + }, + { + "epoch": 0.09341444701062085, + "grad_norm": 3.4632835388183594, + "learning_rate": 2.935868720393114e-05, + "loss": 1.8466, + "step": 47970 + }, + { + "epoch": 0.09344365728173112, + "grad_norm": 5.365049362182617, + "learning_rate": 2.9358288954958697e-05, + "loss": 2.0837, + "step": 47985 + }, + { + "epoch": 0.09347286755284138, + "grad_norm": 4.2348761558532715, + "learning_rate": 2.9357890585073082e-05, + "loss": 1.7106, + "step": 48000 + }, + { + "epoch": 0.09350207782395165, + "grad_norm": 2.3832342624664307, + "learning_rate": 2.9357492094277653e-05, + "loss": 1.8389, + "step": 48015 + }, + { + "epoch": 0.0935312880950619, + "grad_norm": 3.986356735229492, + "learning_rate": 2.9357093482575766e-05, + "loss": 1.8841, + "step": 48030 + }, + { + "epoch": 0.09356049836617217, + "grad_norm": 2.661665678024292, + "learning_rate": 2.935669474997077e-05, + "loss": 1.8138, + "step": 48045 + }, + { + "epoch": 0.09358970863728243, + "grad_norm": 3.051889419555664, + "learning_rate": 2.9356295896466036e-05, + "loss": 1.8381, + "step": 48060 + }, + { + "epoch": 0.09361891890839269, + "grad_norm": 2.316612482070923, + "learning_rate": 2.935589692206491e-05, + "loss": 1.8114, + "step": 48075 + }, + { + "epoch": 0.09364812917950296, + "grad_norm": 2.154189109802246, + "learning_rate": 2.935549782677076e-05, + "loss": 1.9186, + "step": 48090 + }, + { + "epoch": 0.09367733945061323, + "grad_norm": 2.2305965423583984, + "learning_rate": 2.9355098610586936e-05, + "loss": 1.8252, + "step": 48105 + }, + { + "epoch": 0.09370654972172349, + "grad_norm": 2.1520867347717285, + "learning_rate": 2.9354699273516818e-05, + "loss": 1.9562, + "step": 48120 + }, + { + "epoch": 0.09373575999283375, + "grad_norm": 6.05635929107666, + "learning_rate": 2.935429981556375e-05, + "loss": 1.8604, + "step": 48135 + }, + { + "epoch": 0.09376497026394401, + "grad_norm": 2.818211793899536, + "learning_rate": 2.935390023673111e-05, + "loss": 1.8975, + "step": 48150 + }, + { + "epoch": 0.09379418053505427, + "grad_norm": 3.0314574241638184, + "learning_rate": 2.9353500537022256e-05, + "loss": 1.7525, + "step": 48165 + }, + { + "epoch": 0.09382339080616453, + "grad_norm": 3.3957324028015137, + "learning_rate": 2.9353100716440554e-05, + "loss": 1.8434, + "step": 48180 + }, + { + "epoch": 0.0938526010772748, + "grad_norm": 3.6811084747314453, + "learning_rate": 2.9352700774989374e-05, + "loss": 2.0105, + "step": 48195 + }, + { + "epoch": 0.09388181134838507, + "grad_norm": 3.504967451095581, + "learning_rate": 2.935230071267208e-05, + "loss": 1.9383, + "step": 48210 + }, + { + "epoch": 0.09391102161949533, + "grad_norm": 2.807276964187622, + "learning_rate": 2.935190052949204e-05, + "loss": 1.7538, + "step": 48225 + }, + { + "epoch": 0.09394023189060559, + "grad_norm": 3.5521178245544434, + "learning_rate": 2.9351500225452637e-05, + "loss": 1.8844, + "step": 48240 + }, + { + "epoch": 0.09396944216171585, + "grad_norm": 2.884357213973999, + "learning_rate": 2.9351099800557223e-05, + "loss": 1.7718, + "step": 48255 + }, + { + "epoch": 0.09399865243282611, + "grad_norm": 5.653458595275879, + "learning_rate": 2.9350699254809185e-05, + "loss": 1.8595, + "step": 48270 + }, + { + "epoch": 0.09402786270393637, + "grad_norm": 2.2456629276275635, + "learning_rate": 2.9350298588211887e-05, + "loss": 1.6842, + "step": 48285 + }, + { + "epoch": 0.09405707297504663, + "grad_norm": 3.6718297004699707, + "learning_rate": 2.934989780076871e-05, + "loss": 1.8907, + "step": 48300 + }, + { + "epoch": 0.09408628324615691, + "grad_norm": 3.220428705215454, + "learning_rate": 2.9349496892483017e-05, + "loss": 1.8176, + "step": 48315 + }, + { + "epoch": 0.09411549351726717, + "grad_norm": 2.8433873653411865, + "learning_rate": 2.93490958633582e-05, + "loss": 1.8805, + "step": 48330 + }, + { + "epoch": 0.09414470378837743, + "grad_norm": 3.2564454078674316, + "learning_rate": 2.934869471339763e-05, + "loss": 1.8994, + "step": 48345 + }, + { + "epoch": 0.09417391405948769, + "grad_norm": 3.8655519485473633, + "learning_rate": 2.9348293442604675e-05, + "loss": 1.8639, + "step": 48360 + }, + { + "epoch": 0.09420312433059795, + "grad_norm": 3.8386640548706055, + "learning_rate": 2.9347892050982732e-05, + "loss": 1.7577, + "step": 48375 + }, + { + "epoch": 0.09423233460170821, + "grad_norm": 3.6252169609069824, + "learning_rate": 2.9347490538535165e-05, + "loss": 1.8352, + "step": 48390 + }, + { + "epoch": 0.09426154487281847, + "grad_norm": 3.1127262115478516, + "learning_rate": 2.9347088905265367e-05, + "loss": 1.9405, + "step": 48405 + }, + { + "epoch": 0.09429075514392875, + "grad_norm": 2.3375282287597656, + "learning_rate": 2.934668715117671e-05, + "loss": 1.8079, + "step": 48420 + }, + { + "epoch": 0.09431996541503901, + "grad_norm": 2.0516679286956787, + "learning_rate": 2.9346285276272592e-05, + "loss": 1.9539, + "step": 48435 + }, + { + "epoch": 0.09434917568614927, + "grad_norm": 3.3882431983947754, + "learning_rate": 2.9345883280556384e-05, + "loss": 1.819, + "step": 48450 + }, + { + "epoch": 0.09437838595725953, + "grad_norm": 2.644202709197998, + "learning_rate": 2.934548116403147e-05, + "loss": 1.9176, + "step": 48465 + }, + { + "epoch": 0.09440759622836979, + "grad_norm": 3.2326552867889404, + "learning_rate": 2.934507892670125e-05, + "loss": 1.8493, + "step": 48480 + }, + { + "epoch": 0.09443680649948005, + "grad_norm": 2.4110164642333984, + "learning_rate": 2.9344676568569095e-05, + "loss": 1.7326, + "step": 48495 + }, + { + "epoch": 0.09446601677059031, + "grad_norm": 3.2485642433166504, + "learning_rate": 2.9344274089638405e-05, + "loss": 1.7972, + "step": 48510 + }, + { + "epoch": 0.09449522704170059, + "grad_norm": 2.770838499069214, + "learning_rate": 2.9343871489912563e-05, + "loss": 1.9041, + "step": 48525 + }, + { + "epoch": 0.09452443731281085, + "grad_norm": 2.4787659645080566, + "learning_rate": 2.9343468769394965e-05, + "loss": 1.9034, + "step": 48540 + }, + { + "epoch": 0.09455364758392111, + "grad_norm": 4.980709075927734, + "learning_rate": 2.9343065928088997e-05, + "loss": 1.7995, + "step": 48555 + }, + { + "epoch": 0.09458285785503137, + "grad_norm": 3.064199686050415, + "learning_rate": 2.9342662965998053e-05, + "loss": 1.7465, + "step": 48570 + }, + { + "epoch": 0.09461206812614163, + "grad_norm": 4.2010955810546875, + "learning_rate": 2.9342259883125533e-05, + "loss": 1.8556, + "step": 48585 + }, + { + "epoch": 0.0946412783972519, + "grad_norm": 3.320122718811035, + "learning_rate": 2.9341856679474818e-05, + "loss": 1.8096, + "step": 48600 + }, + { + "epoch": 0.09467048866836215, + "grad_norm": 2.7381668090820312, + "learning_rate": 2.9341453355049315e-05, + "loss": 1.8962, + "step": 48615 + }, + { + "epoch": 0.09469969893947243, + "grad_norm": 3.2660324573516846, + "learning_rate": 2.934104990985241e-05, + "loss": 2.032, + "step": 48630 + }, + { + "epoch": 0.09472890921058269, + "grad_norm": 3.2627756595611572, + "learning_rate": 2.9340646343887514e-05, + "loss": 1.7129, + "step": 48645 + }, + { + "epoch": 0.09475811948169295, + "grad_norm": 2.6300570964813232, + "learning_rate": 2.9340242657158016e-05, + "loss": 1.7445, + "step": 48660 + }, + { + "epoch": 0.09478732975280321, + "grad_norm": 2.7047741413116455, + "learning_rate": 2.9339838849667315e-05, + "loss": 1.7302, + "step": 48675 + }, + { + "epoch": 0.09481654002391347, + "grad_norm": 2.4090497493743896, + "learning_rate": 2.933943492141882e-05, + "loss": 1.9483, + "step": 48690 + }, + { + "epoch": 0.09484575029502373, + "grad_norm": 2.5661919116973877, + "learning_rate": 2.9339030872415923e-05, + "loss": 1.8992, + "step": 48705 + }, + { + "epoch": 0.094874960566134, + "grad_norm": 2.628004789352417, + "learning_rate": 2.9338626702662028e-05, + "loss": 1.9041, + "step": 48720 + }, + { + "epoch": 0.09490417083724427, + "grad_norm": 4.652515888214111, + "learning_rate": 2.9338222412160547e-05, + "loss": 1.9847, + "step": 48735 + }, + { + "epoch": 0.09493338110835453, + "grad_norm": 3.3412258625030518, + "learning_rate": 2.933781800091487e-05, + "loss": 1.8348, + "step": 48750 + }, + { + "epoch": 0.09496259137946479, + "grad_norm": 1.8659213781356812, + "learning_rate": 2.9337413468928417e-05, + "loss": 1.663, + "step": 48765 + }, + { + "epoch": 0.09499180165057505, + "grad_norm": 3.2646689414978027, + "learning_rate": 2.933700881620459e-05, + "loss": 1.842, + "step": 48780 + }, + { + "epoch": 0.09502101192168531, + "grad_norm": 4.736114978790283, + "learning_rate": 2.9336604042746796e-05, + "loss": 2.0885, + "step": 48795 + }, + { + "epoch": 0.09505022219279557, + "grad_norm": 4.1501078605651855, + "learning_rate": 2.9336199148558438e-05, + "loss": 1.8239, + "step": 48810 + }, + { + "epoch": 0.09507943246390584, + "grad_norm": 4.221755027770996, + "learning_rate": 2.9335794133642932e-05, + "loss": 2.0309, + "step": 48825 + }, + { + "epoch": 0.09510864273501611, + "grad_norm": 2.461284875869751, + "learning_rate": 2.933538899800369e-05, + "loss": 1.962, + "step": 48840 + }, + { + "epoch": 0.09513785300612637, + "grad_norm": 4.0798187255859375, + "learning_rate": 2.9334983741644118e-05, + "loss": 1.9001, + "step": 48855 + }, + { + "epoch": 0.09516706327723663, + "grad_norm": 3.028874158859253, + "learning_rate": 2.933457836456763e-05, + "loss": 1.7871, + "step": 48870 + }, + { + "epoch": 0.0951962735483469, + "grad_norm": 2.044832229614258, + "learning_rate": 2.9334172866777646e-05, + "loss": 1.6503, + "step": 48885 + }, + { + "epoch": 0.09522548381945715, + "grad_norm": 3.0452120304107666, + "learning_rate": 2.9333767248277574e-05, + "loss": 1.8384, + "step": 48900 + }, + { + "epoch": 0.09525469409056742, + "grad_norm": 5.384150981903076, + "learning_rate": 2.9333361509070833e-05, + "loss": 1.8761, + "step": 48915 + }, + { + "epoch": 0.09528390436167768, + "grad_norm": 2.2106215953826904, + "learning_rate": 2.933295564916084e-05, + "loss": 1.8296, + "step": 48930 + }, + { + "epoch": 0.09531311463278795, + "grad_norm": 3.9018208980560303, + "learning_rate": 2.933254966855101e-05, + "loss": 1.8843, + "step": 48945 + }, + { + "epoch": 0.09534232490389821, + "grad_norm": 2.1903138160705566, + "learning_rate": 2.933214356724476e-05, + "loss": 1.9112, + "step": 48960 + }, + { + "epoch": 0.09537153517500847, + "grad_norm": 2.6728515625, + "learning_rate": 2.933173734524552e-05, + "loss": 1.9539, + "step": 48975 + }, + { + "epoch": 0.09540074544611873, + "grad_norm": 4.163172721862793, + "learning_rate": 2.9331331002556697e-05, + "loss": 1.9025, + "step": 48990 + }, + { + "epoch": 0.095429955717229, + "grad_norm": 3.705101490020752, + "learning_rate": 2.9330924539181726e-05, + "loss": 1.8348, + "step": 49005 + }, + { + "epoch": 0.09545916598833926, + "grad_norm": 4.595695495605469, + "learning_rate": 2.9330517955124024e-05, + "loss": 1.9086, + "step": 49020 + }, + { + "epoch": 0.09548837625944952, + "grad_norm": 2.3054087162017822, + "learning_rate": 2.933011125038701e-05, + "loss": 1.9129, + "step": 49035 + }, + { + "epoch": 0.09551758653055979, + "grad_norm": 2.9813685417175293, + "learning_rate": 2.9329704424974118e-05, + "loss": 1.7802, + "step": 49050 + }, + { + "epoch": 0.09554679680167005, + "grad_norm": 5.957952499389648, + "learning_rate": 2.932929747888877e-05, + "loss": 1.789, + "step": 49065 + }, + { + "epoch": 0.09557600707278031, + "grad_norm": 3.371269702911377, + "learning_rate": 2.932889041213439e-05, + "loss": 1.772, + "step": 49080 + }, + { + "epoch": 0.09560521734389058, + "grad_norm": 5.457712173461914, + "learning_rate": 2.9328483224714408e-05, + "loss": 1.8218, + "step": 49095 + }, + { + "epoch": 0.09563442761500084, + "grad_norm": 5.165330410003662, + "learning_rate": 2.9328075916632255e-05, + "loss": 1.8362, + "step": 49110 + }, + { + "epoch": 0.0956636378861111, + "grad_norm": 1.8119124174118042, + "learning_rate": 2.9327668487891357e-05, + "loss": 1.8338, + "step": 49125 + }, + { + "epoch": 0.09569284815722136, + "grad_norm": 4.478886127471924, + "learning_rate": 2.9327260938495152e-05, + "loss": 1.7219, + "step": 49140 + }, + { + "epoch": 0.09572205842833163, + "grad_norm": 3.2933688163757324, + "learning_rate": 2.9326853268447066e-05, + "loss": 1.8621, + "step": 49155 + }, + { + "epoch": 0.0957512686994419, + "grad_norm": 4.14644718170166, + "learning_rate": 2.932644547775053e-05, + "loss": 2.0011, + "step": 49170 + }, + { + "epoch": 0.09578047897055215, + "grad_norm": 2.830348014831543, + "learning_rate": 2.9326037566408985e-05, + "loss": 1.9063, + "step": 49185 + }, + { + "epoch": 0.09580968924166242, + "grad_norm": 3.782336473464966, + "learning_rate": 2.9325629534425865e-05, + "loss": 1.7197, + "step": 49200 + }, + { + "epoch": 0.09583889951277268, + "grad_norm": 3.7808749675750732, + "learning_rate": 2.9325221381804603e-05, + "loss": 1.7621, + "step": 49215 + }, + { + "epoch": 0.09586810978388294, + "grad_norm": 2.5332424640655518, + "learning_rate": 2.9324813108548634e-05, + "loss": 1.9891, + "step": 49230 + }, + { + "epoch": 0.0958973200549932, + "grad_norm": 3.981290102005005, + "learning_rate": 2.93244047146614e-05, + "loss": 1.9297, + "step": 49245 + }, + { + "epoch": 0.09592653032610347, + "grad_norm": 5.40943717956543, + "learning_rate": 2.9323996200146338e-05, + "loss": 1.7935, + "step": 49260 + }, + { + "epoch": 0.09595574059721373, + "grad_norm": 2.819638967514038, + "learning_rate": 2.932358756500689e-05, + "loss": 1.7276, + "step": 49275 + }, + { + "epoch": 0.095984950868324, + "grad_norm": 2.1289079189300537, + "learning_rate": 2.93231788092465e-05, + "loss": 1.9824, + "step": 49290 + }, + { + "epoch": 0.09601416113943426, + "grad_norm": 1.9851287603378296, + "learning_rate": 2.93227699328686e-05, + "loss": 1.8725, + "step": 49305 + }, + { + "epoch": 0.09604337141054452, + "grad_norm": 4.321681499481201, + "learning_rate": 2.9322360935876647e-05, + "loss": 1.8184, + "step": 49320 + }, + { + "epoch": 0.09607258168165478, + "grad_norm": 3.4782915115356445, + "learning_rate": 2.932195181827408e-05, + "loss": 2.057, + "step": 49335 + }, + { + "epoch": 0.09610179195276504, + "grad_norm": 3.1813442707061768, + "learning_rate": 2.9321542580064332e-05, + "loss": 1.8996, + "step": 49350 + }, + { + "epoch": 0.0961310022238753, + "grad_norm": 3.8128514289855957, + "learning_rate": 2.932113322125087e-05, + "loss": 1.8302, + "step": 49365 + }, + { + "epoch": 0.09616021249498558, + "grad_norm": 4.271757125854492, + "learning_rate": 2.9320723741837122e-05, + "loss": 1.778, + "step": 49380 + }, + { + "epoch": 0.09618942276609584, + "grad_norm": 1.6343133449554443, + "learning_rate": 2.9320314141826556e-05, + "loss": 1.7966, + "step": 49395 + }, + { + "epoch": 0.0962186330372061, + "grad_norm": 2.3712074756622314, + "learning_rate": 2.9319904421222605e-05, + "loss": 1.6004, + "step": 49410 + }, + { + "epoch": 0.09624784330831636, + "grad_norm": 2.9529919624328613, + "learning_rate": 2.931949458002872e-05, + "loss": 1.918, + "step": 49425 + }, + { + "epoch": 0.09627705357942662, + "grad_norm": 3.2947447299957275, + "learning_rate": 2.9319084618248364e-05, + "loss": 1.8999, + "step": 49440 + }, + { + "epoch": 0.09630626385053688, + "grad_norm": 4.216789245605469, + "learning_rate": 2.9318674535884976e-05, + "loss": 1.867, + "step": 49455 + }, + { + "epoch": 0.09633547412164714, + "grad_norm": 3.108816385269165, + "learning_rate": 2.9318264332942023e-05, + "loss": 1.9415, + "step": 49470 + }, + { + "epoch": 0.09636468439275742, + "grad_norm": 2.3301572799682617, + "learning_rate": 2.9317854009422947e-05, + "loss": 1.8643, + "step": 49485 + }, + { + "epoch": 0.09639389466386768, + "grad_norm": 2.4246294498443604, + "learning_rate": 2.9317443565331215e-05, + "loss": 2.1021, + "step": 49500 + }, + { + "epoch": 0.09642310493497794, + "grad_norm": 2.9938971996307373, + "learning_rate": 2.931703300067027e-05, + "loss": 1.7982, + "step": 49515 + }, + { + "epoch": 0.0964523152060882, + "grad_norm": 4.092379093170166, + "learning_rate": 2.931662231544358e-05, + "loss": 1.8046, + "step": 49530 + }, + { + "epoch": 0.09648152547719846, + "grad_norm": 3.6921536922454834, + "learning_rate": 2.93162115096546e-05, + "loss": 1.8048, + "step": 49545 + }, + { + "epoch": 0.09651073574830872, + "grad_norm": 2.1144022941589355, + "learning_rate": 2.9315800583306788e-05, + "loss": 1.7977, + "step": 49560 + }, + { + "epoch": 0.09653994601941898, + "grad_norm": 3.873920202255249, + "learning_rate": 2.9315389536403608e-05, + "loss": 1.8269, + "step": 49575 + }, + { + "epoch": 0.09656915629052926, + "grad_norm": 2.7853615283966064, + "learning_rate": 2.9314978368948517e-05, + "loss": 1.8349, + "step": 49590 + }, + { + "epoch": 0.09659836656163952, + "grad_norm": 2.341984272003174, + "learning_rate": 2.9314567080944978e-05, + "loss": 1.9739, + "step": 49605 + }, + { + "epoch": 0.09662757683274978, + "grad_norm": 5.124273300170898, + "learning_rate": 2.9314155672396463e-05, + "loss": 1.894, + "step": 49620 + }, + { + "epoch": 0.09665678710386004, + "grad_norm": 2.3911478519439697, + "learning_rate": 2.9313744143306425e-05, + "loss": 1.763, + "step": 49635 + }, + { + "epoch": 0.0966859973749703, + "grad_norm": 3.236575126647949, + "learning_rate": 2.9313332493678334e-05, + "loss": 1.7819, + "step": 49650 + }, + { + "epoch": 0.09671520764608056, + "grad_norm": 3.3634097576141357, + "learning_rate": 2.931292072351566e-05, + "loss": 1.8155, + "step": 49665 + }, + { + "epoch": 0.09674441791719082, + "grad_norm": 2.849099636077881, + "learning_rate": 2.9312508832821865e-05, + "loss": 1.8128, + "step": 49680 + }, + { + "epoch": 0.0967736281883011, + "grad_norm": 2.615405321121216, + "learning_rate": 2.931209682160042e-05, + "loss": 1.7717, + "step": 49695 + }, + { + "epoch": 0.09680283845941136, + "grad_norm": 2.432985305786133, + "learning_rate": 2.9311684689854795e-05, + "loss": 1.9783, + "step": 49710 + }, + { + "epoch": 0.09683204873052162, + "grad_norm": 3.7597007751464844, + "learning_rate": 2.931127243758846e-05, + "loss": 1.7927, + "step": 49725 + }, + { + "epoch": 0.09686125900163188, + "grad_norm": 3.092031240463257, + "learning_rate": 2.9310860064804893e-05, + "loss": 1.8321, + "step": 49740 + }, + { + "epoch": 0.09689046927274214, + "grad_norm": 3.606152296066284, + "learning_rate": 2.931044757150755e-05, + "loss": 1.6903, + "step": 49755 + }, + { + "epoch": 0.0969196795438524, + "grad_norm": 3.0055344104766846, + "learning_rate": 2.9310034957699925e-05, + "loss": 1.9332, + "step": 49770 + }, + { + "epoch": 0.09694888981496266, + "grad_norm": 3.8841702938079834, + "learning_rate": 2.930962222338548e-05, + "loss": 2.0835, + "step": 49785 + }, + { + "epoch": 0.09697810008607294, + "grad_norm": 5.191905975341797, + "learning_rate": 2.930920936856769e-05, + "loss": 1.9355, + "step": 49800 + }, + { + "epoch": 0.0970073103571832, + "grad_norm": 3.7676405906677246, + "learning_rate": 2.9308796393250037e-05, + "loss": 1.889, + "step": 49815 + }, + { + "epoch": 0.09703652062829346, + "grad_norm": 2.2673709392547607, + "learning_rate": 2.9308383297436e-05, + "loss": 1.8505, + "step": 49830 + }, + { + "epoch": 0.09706573089940372, + "grad_norm": 2.153475522994995, + "learning_rate": 2.9307970081129052e-05, + "loss": 1.9271, + "step": 49845 + }, + { + "epoch": 0.09709494117051398, + "grad_norm": 3.575103998184204, + "learning_rate": 2.930755674433268e-05, + "loss": 1.7848, + "step": 49860 + }, + { + "epoch": 0.09712415144162424, + "grad_norm": 3.662231206893921, + "learning_rate": 2.9307143287050355e-05, + "loss": 1.9642, + "step": 49875 + }, + { + "epoch": 0.0971533617127345, + "grad_norm": 3.604665517807007, + "learning_rate": 2.9306729709285567e-05, + "loss": 1.9541, + "step": 49890 + }, + { + "epoch": 0.09718257198384478, + "grad_norm": 5.528378009796143, + "learning_rate": 2.930631601104179e-05, + "loss": 1.8596, + "step": 49905 + }, + { + "epoch": 0.09721178225495504, + "grad_norm": 4.253903865814209, + "learning_rate": 2.9305902192322523e-05, + "loss": 1.9061, + "step": 49920 + }, + { + "epoch": 0.0972409925260653, + "grad_norm": 3.9055936336517334, + "learning_rate": 2.9305488253131236e-05, + "loss": 1.5918, + "step": 49935 + }, + { + "epoch": 0.09727020279717556, + "grad_norm": 3.4722824096679688, + "learning_rate": 2.9305074193471423e-05, + "loss": 1.766, + "step": 49950 + }, + { + "epoch": 0.09729941306828582, + "grad_norm": 3.583777904510498, + "learning_rate": 2.9304660013346567e-05, + "loss": 1.9215, + "step": 49965 + }, + { + "epoch": 0.09732862333939608, + "grad_norm": 2.2608377933502197, + "learning_rate": 2.9304245712760156e-05, + "loss": 1.8978, + "step": 49980 + }, + { + "epoch": 0.09735783361050634, + "grad_norm": 3.6289658546447754, + "learning_rate": 2.930383129171568e-05, + "loss": 1.8614, + "step": 49995 + }, + { + "epoch": 0.09738704388161662, + "grad_norm": 2.0757150650024414, + "learning_rate": 2.9303416750216633e-05, + "loss": 1.6696, + "step": 50010 + }, + { + "epoch": 0.09741625415272688, + "grad_norm": 2.5153489112854004, + "learning_rate": 2.93030020882665e-05, + "loss": 1.9128, + "step": 50025 + }, + { + "epoch": 0.09744546442383714, + "grad_norm": 2.9471731185913086, + "learning_rate": 2.9302587305868775e-05, + "loss": 1.9189, + "step": 50040 + }, + { + "epoch": 0.0974746746949474, + "grad_norm": 1.6924009323120117, + "learning_rate": 2.930217240302695e-05, + "loss": 1.7534, + "step": 50055 + }, + { + "epoch": 0.09750388496605766, + "grad_norm": 4.795204162597656, + "learning_rate": 2.9301757379744518e-05, + "loss": 1.8246, + "step": 50070 + }, + { + "epoch": 0.09753309523716792, + "grad_norm": 4.580473899841309, + "learning_rate": 2.9301342236024977e-05, + "loss": 1.9236, + "step": 50085 + }, + { + "epoch": 0.09756230550827819, + "grad_norm": 2.0105273723602295, + "learning_rate": 2.930092697187182e-05, + "loss": 1.7976, + "step": 50100 + }, + { + "epoch": 0.09759151577938846, + "grad_norm": 2.0396523475646973, + "learning_rate": 2.930051158728855e-05, + "loss": 1.768, + "step": 50115 + }, + { + "epoch": 0.09762072605049872, + "grad_norm": 1.9610315561294556, + "learning_rate": 2.930009608227866e-05, + "loss": 1.7016, + "step": 50130 + }, + { + "epoch": 0.09764993632160898, + "grad_norm": 5.699824333190918, + "learning_rate": 2.9299680456845643e-05, + "loss": 1.8118, + "step": 50145 + }, + { + "epoch": 0.09767914659271924, + "grad_norm": 4.904826641082764, + "learning_rate": 2.929926471099301e-05, + "loss": 1.8966, + "step": 50160 + }, + { + "epoch": 0.0977083568638295, + "grad_norm": 4.166045188903809, + "learning_rate": 2.9298848844724263e-05, + "loss": 1.8054, + "step": 50175 + }, + { + "epoch": 0.09773756713493977, + "grad_norm": 3.8375942707061768, + "learning_rate": 2.929843285804289e-05, + "loss": 1.9218, + "step": 50190 + }, + { + "epoch": 0.09776677740605003, + "grad_norm": 2.3785958290100098, + "learning_rate": 2.9298016750952408e-05, + "loss": 1.8998, + "step": 50205 + }, + { + "epoch": 0.0977959876771603, + "grad_norm": 2.632284164428711, + "learning_rate": 2.9297600523456314e-05, + "loss": 1.6993, + "step": 50220 + }, + { + "epoch": 0.09782519794827056, + "grad_norm": 3.581476926803589, + "learning_rate": 2.9297184175558116e-05, + "loss": 1.6674, + "step": 50235 + }, + { + "epoch": 0.09785440821938082, + "grad_norm": 2.3087403774261475, + "learning_rate": 2.929676770726132e-05, + "loss": 1.8005, + "step": 50250 + }, + { + "epoch": 0.09788361849049108, + "grad_norm": 2.212217330932617, + "learning_rate": 2.9296351118569433e-05, + "loss": 1.8502, + "step": 50265 + }, + { + "epoch": 0.09791282876160134, + "grad_norm": 2.2795591354370117, + "learning_rate": 2.9295934409485957e-05, + "loss": 1.8022, + "step": 50280 + }, + { + "epoch": 0.0979420390327116, + "grad_norm": 1.9416465759277344, + "learning_rate": 2.9295517580014414e-05, + "loss": 2.1082, + "step": 50295 + }, + { + "epoch": 0.09797124930382187, + "grad_norm": 3.3864188194274902, + "learning_rate": 2.9295100630158303e-05, + "loss": 1.8254, + "step": 50310 + }, + { + "epoch": 0.09800045957493214, + "grad_norm": 2.7044012546539307, + "learning_rate": 2.9294683559921138e-05, + "loss": 1.8115, + "step": 50325 + }, + { + "epoch": 0.0980296698460424, + "grad_norm": 3.7825982570648193, + "learning_rate": 2.9294266369306435e-05, + "loss": 1.8268, + "step": 50340 + }, + { + "epoch": 0.09805888011715266, + "grad_norm": 2.6438560485839844, + "learning_rate": 2.92938490583177e-05, + "loss": 1.9648, + "step": 50355 + }, + { + "epoch": 0.09808809038826292, + "grad_norm": 2.6116576194763184, + "learning_rate": 2.9293431626958458e-05, + "loss": 1.9675, + "step": 50370 + }, + { + "epoch": 0.09811730065937319, + "grad_norm": 2.0167150497436523, + "learning_rate": 2.9293014075232217e-05, + "loss": 1.8153, + "step": 50385 + }, + { + "epoch": 0.09814651093048345, + "grad_norm": 3.124234437942505, + "learning_rate": 2.9292596403142492e-05, + "loss": 1.9441, + "step": 50400 + }, + { + "epoch": 0.09817572120159371, + "grad_norm": 4.874459743499756, + "learning_rate": 2.92921786106928e-05, + "loss": 1.8517, + "step": 50415 + }, + { + "epoch": 0.09820493147270397, + "grad_norm": 3.4582738876342773, + "learning_rate": 2.9291760697886668e-05, + "loss": 1.9337, + "step": 50430 + }, + { + "epoch": 0.09823414174381424, + "grad_norm": 3.4939401149749756, + "learning_rate": 2.9291342664727605e-05, + "loss": 1.9163, + "step": 50445 + }, + { + "epoch": 0.0982633520149245, + "grad_norm": 4.0186381340026855, + "learning_rate": 2.9290924511219132e-05, + "loss": 2.0225, + "step": 50460 + }, + { + "epoch": 0.09829256228603477, + "grad_norm": 2.9652369022369385, + "learning_rate": 2.929050623736478e-05, + "loss": 1.9003, + "step": 50475 + }, + { + "epoch": 0.09832177255714503, + "grad_norm": 3.2894749641418457, + "learning_rate": 2.9290087843168065e-05, + "loss": 1.9578, + "step": 50490 + }, + { + "epoch": 0.09835098282825529, + "grad_norm": 4.110862731933594, + "learning_rate": 2.9289669328632504e-05, + "loss": 1.7971, + "step": 50505 + }, + { + "epoch": 0.09838019309936555, + "grad_norm": 3.563746690750122, + "learning_rate": 2.9289250693761635e-05, + "loss": 1.8875, + "step": 50520 + }, + { + "epoch": 0.09840940337047581, + "grad_norm": 2.2554609775543213, + "learning_rate": 2.9288831938558975e-05, + "loss": 1.828, + "step": 50535 + }, + { + "epoch": 0.09843861364158608, + "grad_norm": 4.824254035949707, + "learning_rate": 2.9288413063028048e-05, + "loss": 1.9239, + "step": 50550 + }, + { + "epoch": 0.09846782391269635, + "grad_norm": 1.684004783630371, + "learning_rate": 2.9287994067172388e-05, + "loss": 1.8381, + "step": 50565 + }, + { + "epoch": 0.0984970341838066, + "grad_norm": 5.158596038818359, + "learning_rate": 2.928757495099552e-05, + "loss": 1.8882, + "step": 50580 + }, + { + "epoch": 0.09852624445491687, + "grad_norm": 2.6243045330047607, + "learning_rate": 2.928715571450097e-05, + "loss": 1.9741, + "step": 50595 + }, + { + "epoch": 0.09855545472602713, + "grad_norm": 2.581686019897461, + "learning_rate": 2.9286736357692277e-05, + "loss": 1.7908, + "step": 50610 + }, + { + "epoch": 0.09858466499713739, + "grad_norm": 4.69744348526001, + "learning_rate": 2.9286316880572967e-05, + "loss": 1.9603, + "step": 50625 + }, + { + "epoch": 0.09861387526824765, + "grad_norm": 4.348971366882324, + "learning_rate": 2.9285897283146572e-05, + "loss": 1.8657, + "step": 50640 + }, + { + "epoch": 0.09864308553935792, + "grad_norm": 2.9074223041534424, + "learning_rate": 2.928547756541663e-05, + "loss": 1.9288, + "step": 50655 + }, + { + "epoch": 0.09867229581046819, + "grad_norm": 3.6681530475616455, + "learning_rate": 2.9285057727386668e-05, + "loss": 1.8544, + "step": 50670 + }, + { + "epoch": 0.09870150608157845, + "grad_norm": 2.248783826828003, + "learning_rate": 2.928463776906023e-05, + "loss": 1.9141, + "step": 50685 + }, + { + "epoch": 0.09873071635268871, + "grad_norm": 3.9001669883728027, + "learning_rate": 2.9284217690440845e-05, + "loss": 1.8755, + "step": 50700 + }, + { + "epoch": 0.09875992662379897, + "grad_norm": 3.1399364471435547, + "learning_rate": 2.928379749153205e-05, + "loss": 1.7657, + "step": 50715 + }, + { + "epoch": 0.09878913689490923, + "grad_norm": 3.4785237312316895, + "learning_rate": 2.9283377172337397e-05, + "loss": 1.7498, + "step": 50730 + }, + { + "epoch": 0.09881834716601949, + "grad_norm": 5.273774147033691, + "learning_rate": 2.9282956732860408e-05, + "loss": 1.8231, + "step": 50745 + }, + { + "epoch": 0.09884755743712977, + "grad_norm": 4.573613166809082, + "learning_rate": 2.9282536173104635e-05, + "loss": 1.8316, + "step": 50760 + }, + { + "epoch": 0.09887676770824003, + "grad_norm": 2.501286745071411, + "learning_rate": 2.9282115493073616e-05, + "loss": 1.9897, + "step": 50775 + }, + { + "epoch": 0.09890597797935029, + "grad_norm": 2.594648599624634, + "learning_rate": 2.928169469277089e-05, + "loss": 1.8894, + "step": 50790 + }, + { + "epoch": 0.09893518825046055, + "grad_norm": 3.582399606704712, + "learning_rate": 2.928127377220001e-05, + "loss": 1.8343, + "step": 50805 + }, + { + "epoch": 0.09896439852157081, + "grad_norm": 2.9742801189422607, + "learning_rate": 2.9280852731364515e-05, + "loss": 1.8572, + "step": 50820 + }, + { + "epoch": 0.09899360879268107, + "grad_norm": 2.9038615226745605, + "learning_rate": 2.9280431570267947e-05, + "loss": 1.72, + "step": 50835 + }, + { + "epoch": 0.09902281906379133, + "grad_norm": 1.726894736289978, + "learning_rate": 2.9280010288913853e-05, + "loss": 1.9789, + "step": 50850 + }, + { + "epoch": 0.0990520293349016, + "grad_norm": 2.48158860206604, + "learning_rate": 2.9279588887305793e-05, + "loss": 1.752, + "step": 50865 + }, + { + "epoch": 0.09908123960601187, + "grad_norm": 2.734722852706909, + "learning_rate": 2.9279167365447302e-05, + "loss": 2.0492, + "step": 50880 + }, + { + "epoch": 0.09911044987712213, + "grad_norm": 3.3718225955963135, + "learning_rate": 2.927874572334193e-05, + "loss": 1.7142, + "step": 50895 + }, + { + "epoch": 0.09913966014823239, + "grad_norm": 3.647507667541504, + "learning_rate": 2.9278323960993238e-05, + "loss": 1.8559, + "step": 50910 + }, + { + "epoch": 0.09916887041934265, + "grad_norm": 2.6375949382781982, + "learning_rate": 2.927790207840477e-05, + "loss": 1.7471, + "step": 50925 + }, + { + "epoch": 0.09919808069045291, + "grad_norm": 2.8976383209228516, + "learning_rate": 2.9277480075580083e-05, + "loss": 1.8442, + "step": 50940 + }, + { + "epoch": 0.09922729096156317, + "grad_norm": 4.061347484588623, + "learning_rate": 2.927705795252273e-05, + "loss": 1.717, + "step": 50955 + }, + { + "epoch": 0.09925650123267345, + "grad_norm": 2.345612049102783, + "learning_rate": 2.9276635709236254e-05, + "loss": 1.7043, + "step": 50970 + }, + { + "epoch": 0.09928571150378371, + "grad_norm": 3.33445143699646, + "learning_rate": 2.9276213345724227e-05, + "loss": 1.7703, + "step": 50985 + }, + { + "epoch": 0.09931492177489397, + "grad_norm": 3.6107683181762695, + "learning_rate": 2.9275790861990202e-05, + "loss": 1.8683, + "step": 51000 + }, + { + "epoch": 0.09934413204600423, + "grad_norm": 3.7334389686584473, + "learning_rate": 2.9275368258037732e-05, + "loss": 1.8768, + "step": 51015 + }, + { + "epoch": 0.09937334231711449, + "grad_norm": 2.4035027027130127, + "learning_rate": 2.9274945533870375e-05, + "loss": 1.9563, + "step": 51030 + }, + { + "epoch": 0.09940255258822475, + "grad_norm": 5.561787128448486, + "learning_rate": 2.9274522689491696e-05, + "loss": 1.9885, + "step": 51045 + }, + { + "epoch": 0.09943176285933501, + "grad_norm": 4.420948505401611, + "learning_rate": 2.9274099724905256e-05, + "loss": 2.0977, + "step": 51060 + }, + { + "epoch": 0.09946097313044529, + "grad_norm": 2.3784122467041016, + "learning_rate": 2.9273676640114614e-05, + "loss": 1.8659, + "step": 51075 + }, + { + "epoch": 0.09949018340155555, + "grad_norm": 4.577541351318359, + "learning_rate": 2.927325343512333e-05, + "loss": 1.8968, + "step": 51090 + }, + { + "epoch": 0.09951939367266581, + "grad_norm": 3.8092751502990723, + "learning_rate": 2.9272830109934974e-05, + "loss": 1.8309, + "step": 51105 + }, + { + "epoch": 0.09954860394377607, + "grad_norm": 2.1280651092529297, + "learning_rate": 2.927240666455311e-05, + "loss": 1.8919, + "step": 51120 + }, + { + "epoch": 0.09957781421488633, + "grad_norm": 2.514244794845581, + "learning_rate": 2.92719830989813e-05, + "loss": 1.8846, + "step": 51135 + }, + { + "epoch": 0.09960702448599659, + "grad_norm": 4.485027313232422, + "learning_rate": 2.9271559413223113e-05, + "loss": 1.859, + "step": 51150 + }, + { + "epoch": 0.09963623475710685, + "grad_norm": 2.2585437297821045, + "learning_rate": 2.927113560728212e-05, + "loss": 2.0863, + "step": 51165 + }, + { + "epoch": 0.09966544502821713, + "grad_norm": 3.707125425338745, + "learning_rate": 2.9270711681161884e-05, + "loss": 1.8858, + "step": 51180 + }, + { + "epoch": 0.09969465529932739, + "grad_norm": 3.3982746601104736, + "learning_rate": 2.9270287634865977e-05, + "loss": 1.603, + "step": 51195 + }, + { + "epoch": 0.09972386557043765, + "grad_norm": 4.028176784515381, + "learning_rate": 2.926986346839797e-05, + "loss": 1.8977, + "step": 51210 + }, + { + "epoch": 0.09975307584154791, + "grad_norm": 4.433598041534424, + "learning_rate": 2.926943918176144e-05, + "loss": 1.835, + "step": 51225 + }, + { + "epoch": 0.09978228611265817, + "grad_norm": 2.535775661468506, + "learning_rate": 2.926901477495995e-05, + "loss": 1.7966, + "step": 51240 + }, + { + "epoch": 0.09981149638376843, + "grad_norm": 2.6152336597442627, + "learning_rate": 2.9268590247997087e-05, + "loss": 1.7176, + "step": 51255 + }, + { + "epoch": 0.0998407066548787, + "grad_norm": 3.727165699005127, + "learning_rate": 2.9268165600876414e-05, + "loss": 1.7679, + "step": 51270 + }, + { + "epoch": 0.09986991692598897, + "grad_norm": 3.6433348655700684, + "learning_rate": 2.9267740833601512e-05, + "loss": 1.7259, + "step": 51285 + }, + { + "epoch": 0.09989912719709923, + "grad_norm": 2.462620496749878, + "learning_rate": 2.9267315946175956e-05, + "loss": 1.8875, + "step": 51300 + }, + { + "epoch": 0.09992833746820949, + "grad_norm": 3.0979127883911133, + "learning_rate": 2.9266890938603326e-05, + "loss": 1.8023, + "step": 51315 + }, + { + "epoch": 0.09995754773931975, + "grad_norm": 2.2129147052764893, + "learning_rate": 2.9266465810887205e-05, + "loss": 1.8864, + "step": 51330 + }, + { + "epoch": 0.09998675801043001, + "grad_norm": 3.626903772354126, + "learning_rate": 2.926604056303117e-05, + "loss": 1.7717, + "step": 51345 + }, + { + "epoch": 0.10001596828154027, + "grad_norm": 3.7731549739837646, + "learning_rate": 2.9265615195038797e-05, + "loss": 1.8323, + "step": 51360 + }, + { + "epoch": 0.10004517855265053, + "grad_norm": 2.074625015258789, + "learning_rate": 2.9265189706913673e-05, + "loss": 1.8121, + "step": 51375 + }, + { + "epoch": 0.10007438882376081, + "grad_norm": 2.714456796646118, + "learning_rate": 2.926476409865938e-05, + "loss": 1.8879, + "step": 51390 + }, + { + "epoch": 0.10010359909487107, + "grad_norm": 4.711696147918701, + "learning_rate": 2.9264338370279502e-05, + "loss": 1.9525, + "step": 51405 + }, + { + "epoch": 0.10013280936598133, + "grad_norm": 2.2080190181732178, + "learning_rate": 2.9263912521777624e-05, + "loss": 1.7958, + "step": 51420 + }, + { + "epoch": 0.10016201963709159, + "grad_norm": 2.6654586791992188, + "learning_rate": 2.9263486553157333e-05, + "loss": 2.0238, + "step": 51435 + }, + { + "epoch": 0.10019122990820185, + "grad_norm": 2.1102378368377686, + "learning_rate": 2.9263060464422217e-05, + "loss": 1.8453, + "step": 51450 + }, + { + "epoch": 0.10022044017931211, + "grad_norm": 2.4648003578186035, + "learning_rate": 2.926263425557586e-05, + "loss": 1.8386, + "step": 51465 + }, + { + "epoch": 0.10024965045042238, + "grad_norm": 4.718409538269043, + "learning_rate": 2.9262207926621855e-05, + "loss": 1.9976, + "step": 51480 + }, + { + "epoch": 0.10027886072153264, + "grad_norm": 2.6654043197631836, + "learning_rate": 2.9261781477563792e-05, + "loss": 1.818, + "step": 51495 + }, + { + "epoch": 0.10030807099264291, + "grad_norm": 2.2968661785125732, + "learning_rate": 2.926135490840526e-05, + "loss": 1.9363, + "step": 51510 + }, + { + "epoch": 0.10033728126375317, + "grad_norm": 5.633299350738525, + "learning_rate": 2.9260928219149856e-05, + "loss": 1.7987, + "step": 51525 + }, + { + "epoch": 0.10036649153486343, + "grad_norm": 3.4443323612213135, + "learning_rate": 2.926050140980117e-05, + "loss": 1.9957, + "step": 51540 + }, + { + "epoch": 0.1003957018059737, + "grad_norm": 2.6234657764434814, + "learning_rate": 2.926007448036279e-05, + "loss": 1.8814, + "step": 51555 + }, + { + "epoch": 0.10042491207708396, + "grad_norm": 3.8197031021118164, + "learning_rate": 2.925964743083832e-05, + "loss": 1.8781, + "step": 51570 + }, + { + "epoch": 0.10045412234819422, + "grad_norm": 4.93427848815918, + "learning_rate": 2.9259220261231355e-05, + "loss": 1.9082, + "step": 51585 + }, + { + "epoch": 0.10048333261930448, + "grad_norm": 2.1275949478149414, + "learning_rate": 2.925879297154549e-05, + "loss": 1.8595, + "step": 51600 + }, + { + "epoch": 0.10051254289041475, + "grad_norm": 3.616004467010498, + "learning_rate": 2.925836556178432e-05, + "loss": 1.8563, + "step": 51615 + }, + { + "epoch": 0.10054175316152501, + "grad_norm": 3.3207528591156006, + "learning_rate": 2.9257938031951455e-05, + "loss": 1.9716, + "step": 51630 + }, + { + "epoch": 0.10057096343263527, + "grad_norm": 2.2283315658569336, + "learning_rate": 2.9257510382050484e-05, + "loss": 1.8513, + "step": 51645 + }, + { + "epoch": 0.10060017370374554, + "grad_norm": 2.2449660301208496, + "learning_rate": 2.9257082612085014e-05, + "loss": 1.9802, + "step": 51660 + }, + { + "epoch": 0.1006293839748558, + "grad_norm": 2.182696580886841, + "learning_rate": 2.9256654722058648e-05, + "loss": 1.7613, + "step": 51675 + }, + { + "epoch": 0.10065859424596606, + "grad_norm": 2.3022265434265137, + "learning_rate": 2.9256226711974984e-05, + "loss": 2.0325, + "step": 51690 + }, + { + "epoch": 0.10068780451707632, + "grad_norm": 2.959179162979126, + "learning_rate": 2.9255798581837634e-05, + "loss": 1.8477, + "step": 51705 + }, + { + "epoch": 0.10071701478818659, + "grad_norm": 2.473968505859375, + "learning_rate": 2.9255370331650196e-05, + "loss": 1.7675, + "step": 51720 + }, + { + "epoch": 0.10074622505929685, + "grad_norm": 2.5270917415618896, + "learning_rate": 2.9254941961416282e-05, + "loss": 1.7278, + "step": 51735 + }, + { + "epoch": 0.10077543533040711, + "grad_norm": 3.788010358810425, + "learning_rate": 2.9254513471139493e-05, + "loss": 1.9634, + "step": 51750 + }, + { + "epoch": 0.10080464560151738, + "grad_norm": 2.746396780014038, + "learning_rate": 2.9254084860823444e-05, + "loss": 1.7762, + "step": 51765 + }, + { + "epoch": 0.10083385587262764, + "grad_norm": 3.6009750366210938, + "learning_rate": 2.925365613047174e-05, + "loss": 1.876, + "step": 51780 + }, + { + "epoch": 0.1008630661437379, + "grad_norm": 4.298994541168213, + "learning_rate": 2.9253227280087992e-05, + "loss": 1.8371, + "step": 51795 + }, + { + "epoch": 0.10089227641484816, + "grad_norm": 3.8493106365203857, + "learning_rate": 2.9252798309675818e-05, + "loss": 1.7269, + "step": 51810 + }, + { + "epoch": 0.10092148668595843, + "grad_norm": 2.5960609912872314, + "learning_rate": 2.9252369219238818e-05, + "loss": 1.7847, + "step": 51825 + }, + { + "epoch": 0.1009506969570687, + "grad_norm": 4.169469356536865, + "learning_rate": 2.9251940008780618e-05, + "loss": 1.8292, + "step": 51840 + }, + { + "epoch": 0.10097990722817896, + "grad_norm": 2.5621516704559326, + "learning_rate": 2.9251510678304824e-05, + "loss": 1.947, + "step": 51855 + }, + { + "epoch": 0.10100911749928922, + "grad_norm": 3.860788583755493, + "learning_rate": 2.9251081227815054e-05, + "loss": 1.7945, + "step": 51870 + }, + { + "epoch": 0.10103832777039948, + "grad_norm": 2.9201455116271973, + "learning_rate": 2.9250651657314925e-05, + "loss": 1.821, + "step": 51885 + }, + { + "epoch": 0.10106753804150974, + "grad_norm": 3.1246635913848877, + "learning_rate": 2.925022196680806e-05, + "loss": 1.7569, + "step": 51900 + }, + { + "epoch": 0.10109674831262, + "grad_norm": 3.1227333545684814, + "learning_rate": 2.924979215629806e-05, + "loss": 2.0489, + "step": 51915 + }, + { + "epoch": 0.10112595858373027, + "grad_norm": 2.620610237121582, + "learning_rate": 2.924936222578856e-05, + "loss": 1.6126, + "step": 51930 + }, + { + "epoch": 0.10115516885484054, + "grad_norm": 3.166445255279541, + "learning_rate": 2.924893217528318e-05, + "loss": 1.6744, + "step": 51945 + }, + { + "epoch": 0.1011843791259508, + "grad_norm": 3.7178285121917725, + "learning_rate": 2.9248502004785538e-05, + "loss": 1.9204, + "step": 51960 + }, + { + "epoch": 0.10121358939706106, + "grad_norm": 2.7688398361206055, + "learning_rate": 2.9248071714299255e-05, + "loss": 1.9956, + "step": 51975 + }, + { + "epoch": 0.10124279966817132, + "grad_norm": 3.1003968715667725, + "learning_rate": 2.924764130382796e-05, + "loss": 2.0411, + "step": 51990 + }, + { + "epoch": 0.10127200993928158, + "grad_norm": 2.189988136291504, + "learning_rate": 2.9247210773375268e-05, + "loss": 1.8253, + "step": 52005 + }, + { + "epoch": 0.10130122021039184, + "grad_norm": 1.7499350309371948, + "learning_rate": 2.924678012294481e-05, + "loss": 1.8106, + "step": 52020 + }, + { + "epoch": 0.10133043048150212, + "grad_norm": 3.2424755096435547, + "learning_rate": 2.9246349352540217e-05, + "loss": 1.7018, + "step": 52035 + }, + { + "epoch": 0.10135964075261238, + "grad_norm": 2.846792697906494, + "learning_rate": 2.924591846216511e-05, + "loss": 1.8952, + "step": 52050 + }, + { + "epoch": 0.10138885102372264, + "grad_norm": 2.178586006164551, + "learning_rate": 2.9245487451823123e-05, + "loss": 1.6687, + "step": 52065 + }, + { + "epoch": 0.1014180612948329, + "grad_norm": 4.501686096191406, + "learning_rate": 2.924505632151788e-05, + "loss": 1.8369, + "step": 52080 + }, + { + "epoch": 0.10144727156594316, + "grad_norm": 2.375622272491455, + "learning_rate": 2.9244625071253014e-05, + "loss": 1.7362, + "step": 52095 + }, + { + "epoch": 0.10147648183705342, + "grad_norm": 2.0563292503356934, + "learning_rate": 2.924419370103216e-05, + "loss": 1.8708, + "step": 52110 + }, + { + "epoch": 0.10150569210816368, + "grad_norm": 2.451967477798462, + "learning_rate": 2.924376221085894e-05, + "loss": 1.8092, + "step": 52125 + }, + { + "epoch": 0.10153490237927396, + "grad_norm": 3.121628999710083, + "learning_rate": 2.9243330600737003e-05, + "loss": 1.843, + "step": 52140 + }, + { + "epoch": 0.10156411265038422, + "grad_norm": 2.7133121490478516, + "learning_rate": 2.924289887066997e-05, + "loss": 2.1271, + "step": 52155 + }, + { + "epoch": 0.10159332292149448, + "grad_norm": 2.930771589279175, + "learning_rate": 2.9242467020661488e-05, + "loss": 1.8519, + "step": 52170 + }, + { + "epoch": 0.10162253319260474, + "grad_norm": 2.4638404846191406, + "learning_rate": 2.9242035050715184e-05, + "loss": 1.8286, + "step": 52185 + }, + { + "epoch": 0.101651743463715, + "grad_norm": 2.1258838176727295, + "learning_rate": 2.92416029608347e-05, + "loss": 1.7845, + "step": 52200 + }, + { + "epoch": 0.10168095373482526, + "grad_norm": 2.9611170291900635, + "learning_rate": 2.924117075102368e-05, + "loss": 1.9441, + "step": 52215 + }, + { + "epoch": 0.10171016400593552, + "grad_norm": 3.221245288848877, + "learning_rate": 2.9240738421285748e-05, + "loss": 2.0082, + "step": 52230 + }, + { + "epoch": 0.1017393742770458, + "grad_norm": 2.655762195587158, + "learning_rate": 2.924030597162456e-05, + "loss": 2.1104, + "step": 52245 + }, + { + "epoch": 0.10176858454815606, + "grad_norm": 2.8793299198150635, + "learning_rate": 2.9239873402043755e-05, + "loss": 1.728, + "step": 52260 + }, + { + "epoch": 0.10179779481926632, + "grad_norm": 5.0151214599609375, + "learning_rate": 2.9239440712546974e-05, + "loss": 1.9032, + "step": 52275 + }, + { + "epoch": 0.10182700509037658, + "grad_norm": 3.8040266036987305, + "learning_rate": 2.9239007903137857e-05, + "loss": 1.8819, + "step": 52290 + }, + { + "epoch": 0.10185621536148684, + "grad_norm": 2.1644256114959717, + "learning_rate": 2.9238574973820055e-05, + "loss": 1.8921, + "step": 52305 + }, + { + "epoch": 0.1018854256325971, + "grad_norm": 2.887611150741577, + "learning_rate": 2.9238141924597207e-05, + "loss": 1.7561, + "step": 52320 + }, + { + "epoch": 0.10191463590370736, + "grad_norm": 3.016183853149414, + "learning_rate": 2.9237708755472963e-05, + "loss": 2.0133, + "step": 52335 + }, + { + "epoch": 0.10194384617481764, + "grad_norm": 2.413327693939209, + "learning_rate": 2.923727546645097e-05, + "loss": 1.8857, + "step": 52350 + }, + { + "epoch": 0.1019730564459279, + "grad_norm": 2.3333163261413574, + "learning_rate": 2.923684205753488e-05, + "loss": 1.8865, + "step": 52365 + }, + { + "epoch": 0.10200226671703816, + "grad_norm": 2.833000898361206, + "learning_rate": 2.9236408528728346e-05, + "loss": 1.7953, + "step": 52380 + }, + { + "epoch": 0.10203147698814842, + "grad_norm": 4.4297776222229, + "learning_rate": 2.9235974880035002e-05, + "loss": 1.7317, + "step": 52395 + }, + { + "epoch": 0.10206068725925868, + "grad_norm": 5.2759175300598145, + "learning_rate": 2.9235541111458518e-05, + "loss": 1.9133, + "step": 52410 + }, + { + "epoch": 0.10208989753036894, + "grad_norm": 2.9812684059143066, + "learning_rate": 2.923510722300254e-05, + "loss": 1.8737, + "step": 52425 + }, + { + "epoch": 0.1021191078014792, + "grad_norm": 2.062561511993408, + "learning_rate": 2.9234673214670725e-05, + "loss": 1.9559, + "step": 52440 + }, + { + "epoch": 0.10214831807258948, + "grad_norm": 3.152745008468628, + "learning_rate": 2.9234239086466725e-05, + "loss": 1.9828, + "step": 52455 + }, + { + "epoch": 0.10217752834369974, + "grad_norm": 2.9057366847991943, + "learning_rate": 2.923380483839419e-05, + "loss": 1.7809, + "step": 52470 + }, + { + "epoch": 0.10220673861481, + "grad_norm": 2.111750602722168, + "learning_rate": 2.9233370470456785e-05, + "loss": 1.7501, + "step": 52485 + }, + { + "epoch": 0.10223594888592026, + "grad_norm": 2.3556952476501465, + "learning_rate": 2.9232935982658164e-05, + "loss": 1.8154, + "step": 52500 + }, + { + "epoch": 0.10226515915703052, + "grad_norm": 3.024723768234253, + "learning_rate": 2.923250137500199e-05, + "loss": 1.8, + "step": 52515 + }, + { + "epoch": 0.10229436942814078, + "grad_norm": 2.7702646255493164, + "learning_rate": 2.923206664749192e-05, + "loss": 1.8854, + "step": 52530 + }, + { + "epoch": 0.10232357969925104, + "grad_norm": 3.769913911819458, + "learning_rate": 2.923163180013161e-05, + "loss": 1.6611, + "step": 52545 + }, + { + "epoch": 0.1023527899703613, + "grad_norm": 4.062292575836182, + "learning_rate": 2.923119683292473e-05, + "loss": 1.7646, + "step": 52560 + }, + { + "epoch": 0.10238200024147158, + "grad_norm": 2.6236956119537354, + "learning_rate": 2.9230761745874945e-05, + "loss": 1.74, + "step": 52575 + }, + { + "epoch": 0.10241121051258184, + "grad_norm": 3.377202033996582, + "learning_rate": 2.923032653898591e-05, + "loss": 1.7617, + "step": 52590 + }, + { + "epoch": 0.1024404207836921, + "grad_norm": 2.714411735534668, + "learning_rate": 2.922989121226129e-05, + "loss": 1.9378, + "step": 52605 + }, + { + "epoch": 0.10246963105480236, + "grad_norm": 2.279560089111328, + "learning_rate": 2.9229455765704763e-05, + "loss": 1.8215, + "step": 52620 + }, + { + "epoch": 0.10249884132591262, + "grad_norm": 2.452533721923828, + "learning_rate": 2.9229020199319984e-05, + "loss": 1.8956, + "step": 52635 + }, + { + "epoch": 0.10252805159702288, + "grad_norm": 2.515977382659912, + "learning_rate": 2.9228584513110617e-05, + "loss": 1.838, + "step": 52650 + }, + { + "epoch": 0.10255726186813315, + "grad_norm": 2.400944232940674, + "learning_rate": 2.922814870708035e-05, + "loss": 1.8145, + "step": 52665 + }, + { + "epoch": 0.10258647213924342, + "grad_norm": 4.391109943389893, + "learning_rate": 2.9227712781232837e-05, + "loss": 1.981, + "step": 52680 + }, + { + "epoch": 0.10261568241035368, + "grad_norm": 3.477064847946167, + "learning_rate": 2.922727673557175e-05, + "loss": 1.9165, + "step": 52695 + }, + { + "epoch": 0.10264489268146394, + "grad_norm": 2.854994058609009, + "learning_rate": 2.922684057010077e-05, + "loss": 2.0644, + "step": 52710 + }, + { + "epoch": 0.1026741029525742, + "grad_norm": 2.474790573120117, + "learning_rate": 2.9226404284823566e-05, + "loss": 2.003, + "step": 52725 + }, + { + "epoch": 0.10270331322368446, + "grad_norm": 1.8789151906967163, + "learning_rate": 2.9225967879743805e-05, + "loss": 1.9084, + "step": 52740 + }, + { + "epoch": 0.10273252349479473, + "grad_norm": 3.587696075439453, + "learning_rate": 2.922553135486517e-05, + "loss": 1.7805, + "step": 52755 + }, + { + "epoch": 0.10276173376590499, + "grad_norm": 3.5017142295837402, + "learning_rate": 2.9225094710191335e-05, + "loss": 1.8052, + "step": 52770 + }, + { + "epoch": 0.10279094403701526, + "grad_norm": 3.3561439514160156, + "learning_rate": 2.9224657945725974e-05, + "loss": 1.5558, + "step": 52785 + }, + { + "epoch": 0.10282015430812552, + "grad_norm": 2.1130895614624023, + "learning_rate": 2.9224221061472774e-05, + "loss": 2.0765, + "step": 52800 + }, + { + "epoch": 0.10284936457923578, + "grad_norm": 3.7945330142974854, + "learning_rate": 2.9223784057435403e-05, + "loss": 1.8974, + "step": 52815 + }, + { + "epoch": 0.10287857485034604, + "grad_norm": 2.3494579792022705, + "learning_rate": 2.9223346933617547e-05, + "loss": 1.8936, + "step": 52830 + }, + { + "epoch": 0.1029077851214563, + "grad_norm": 3.628025770187378, + "learning_rate": 2.9222909690022887e-05, + "loss": 1.8944, + "step": 52845 + }, + { + "epoch": 0.10293699539256657, + "grad_norm": 2.961174488067627, + "learning_rate": 2.9222472326655102e-05, + "loss": 1.7924, + "step": 52860 + }, + { + "epoch": 0.10296620566367683, + "grad_norm": 2.788062810897827, + "learning_rate": 2.9222034843517878e-05, + "loss": 1.8915, + "step": 52875 + }, + { + "epoch": 0.1029954159347871, + "grad_norm": 2.6212539672851562, + "learning_rate": 2.92215972406149e-05, + "loss": 1.8854, + "step": 52890 + }, + { + "epoch": 0.10302462620589736, + "grad_norm": 3.915565013885498, + "learning_rate": 2.9221159517949848e-05, + "loss": 1.8489, + "step": 52905 + }, + { + "epoch": 0.10305383647700762, + "grad_norm": 2.287111759185791, + "learning_rate": 2.9220721675526418e-05, + "loss": 1.8905, + "step": 52920 + }, + { + "epoch": 0.10308304674811788, + "grad_norm": 1.9591848850250244, + "learning_rate": 2.9220283713348287e-05, + "loss": 1.8771, + "step": 52935 + }, + { + "epoch": 0.10311225701922815, + "grad_norm": 2.181339979171753, + "learning_rate": 2.9219845631419146e-05, + "loss": 1.8998, + "step": 52950 + }, + { + "epoch": 0.1031414672903384, + "grad_norm": 4.41172981262207, + "learning_rate": 2.9219407429742683e-05, + "loss": 1.7746, + "step": 52965 + }, + { + "epoch": 0.10317067756144867, + "grad_norm": 3.7191624641418457, + "learning_rate": 2.9218969108322593e-05, + "loss": 1.7355, + "step": 52980 + }, + { + "epoch": 0.10319988783255894, + "grad_norm": 2.4057443141937256, + "learning_rate": 2.9218530667162567e-05, + "loss": 1.8293, + "step": 52995 + }, + { + "epoch": 0.1032290981036692, + "grad_norm": 3.978999137878418, + "learning_rate": 2.921809210626629e-05, + "loss": 1.9741, + "step": 53010 + }, + { + "epoch": 0.10325830837477946, + "grad_norm": 1.9080361127853394, + "learning_rate": 2.9217653425637462e-05, + "loss": 1.7423, + "step": 53025 + }, + { + "epoch": 0.10328751864588973, + "grad_norm": 2.817746877670288, + "learning_rate": 2.9217214625279776e-05, + "loss": 1.701, + "step": 53040 + }, + { + "epoch": 0.10331672891699999, + "grad_norm": 2.3751299381256104, + "learning_rate": 2.9216775705196924e-05, + "loss": 1.8182, + "step": 53055 + }, + { + "epoch": 0.10334593918811025, + "grad_norm": 2.12813401222229, + "learning_rate": 2.9216336665392602e-05, + "loss": 1.7919, + "step": 53070 + }, + { + "epoch": 0.10337514945922051, + "grad_norm": 2.3823862075805664, + "learning_rate": 2.921589750587051e-05, + "loss": 1.8145, + "step": 53085 + }, + { + "epoch": 0.10340435973033078, + "grad_norm": 3.9402434825897217, + "learning_rate": 2.921545822663435e-05, + "loss": 1.8797, + "step": 53100 + }, + { + "epoch": 0.10343357000144104, + "grad_norm": 2.0574233531951904, + "learning_rate": 2.9215018827687816e-05, + "loss": 1.8935, + "step": 53115 + }, + { + "epoch": 0.1034627802725513, + "grad_norm": 2.729658842086792, + "learning_rate": 2.9214579309034603e-05, + "loss": 2.0222, + "step": 53130 + }, + { + "epoch": 0.10349199054366157, + "grad_norm": 2.9515881538391113, + "learning_rate": 2.9214139670678423e-05, + "loss": 1.9446, + "step": 53145 + }, + { + "epoch": 0.10352120081477183, + "grad_norm": 3.3878188133239746, + "learning_rate": 2.9213699912622976e-05, + "loss": 1.7399, + "step": 53160 + }, + { + "epoch": 0.10355041108588209, + "grad_norm": 3.7901418209075928, + "learning_rate": 2.921326003487196e-05, + "loss": 1.8246, + "step": 53175 + }, + { + "epoch": 0.10357962135699235, + "grad_norm": 3.6533632278442383, + "learning_rate": 2.9212820037429082e-05, + "loss": 1.835, + "step": 53190 + }, + { + "epoch": 0.10360883162810262, + "grad_norm": 3.312243938446045, + "learning_rate": 2.921237992029805e-05, + "loss": 1.9275, + "step": 53205 + }, + { + "epoch": 0.10363804189921288, + "grad_norm": 3.8666319847106934, + "learning_rate": 2.9211939683482568e-05, + "loss": 1.9619, + "step": 53220 + }, + { + "epoch": 0.10366725217032315, + "grad_norm": 3.0532631874084473, + "learning_rate": 2.921149932698634e-05, + "loss": 1.6835, + "step": 53235 + }, + { + "epoch": 0.1036964624414334, + "grad_norm": 3.0404837131500244, + "learning_rate": 2.9211058850813076e-05, + "loss": 2.1241, + "step": 53250 + }, + { + "epoch": 0.10372567271254367, + "grad_norm": 3.4185240268707275, + "learning_rate": 2.9210618254966492e-05, + "loss": 2.0408, + "step": 53265 + }, + { + "epoch": 0.10375488298365393, + "grad_norm": 3.211634874343872, + "learning_rate": 2.9210177539450294e-05, + "loss": 2.0227, + "step": 53280 + }, + { + "epoch": 0.10378409325476419, + "grad_norm": 2.959613561630249, + "learning_rate": 2.9209736704268188e-05, + "loss": 1.7733, + "step": 53295 + }, + { + "epoch": 0.10381330352587446, + "grad_norm": 2.7243733406066895, + "learning_rate": 2.920929574942389e-05, + "loss": 1.8423, + "step": 53310 + }, + { + "epoch": 0.10384251379698473, + "grad_norm": 1.9866245985031128, + "learning_rate": 2.9208854674921116e-05, + "loss": 1.6459, + "step": 53325 + }, + { + "epoch": 0.10387172406809499, + "grad_norm": 3.77958345413208, + "learning_rate": 2.9208413480763577e-05, + "loss": 1.9329, + "step": 53340 + }, + { + "epoch": 0.10390093433920525, + "grad_norm": 2.4954938888549805, + "learning_rate": 2.9207972166954994e-05, + "loss": 1.6866, + "step": 53355 + }, + { + "epoch": 0.10393014461031551, + "grad_norm": 2.948174476623535, + "learning_rate": 2.9207530733499073e-05, + "loss": 1.676, + "step": 53370 + }, + { + "epoch": 0.10395935488142577, + "grad_norm": 1.7526262998580933, + "learning_rate": 2.920708918039954e-05, + "loss": 1.6741, + "step": 53385 + }, + { + "epoch": 0.10398856515253603, + "grad_norm": 1.9519858360290527, + "learning_rate": 2.920664750766011e-05, + "loss": 1.9324, + "step": 53400 + }, + { + "epoch": 0.1040177754236463, + "grad_norm": 3.7122256755828857, + "learning_rate": 2.920620571528451e-05, + "loss": 2.0231, + "step": 53415 + }, + { + "epoch": 0.10404698569475657, + "grad_norm": 2.3948066234588623, + "learning_rate": 2.920576380327644e-05, + "loss": 1.8915, + "step": 53430 + }, + { + "epoch": 0.10407619596586683, + "grad_norm": 2.3851559162139893, + "learning_rate": 2.920532177163965e-05, + "loss": 1.941, + "step": 53445 + }, + { + "epoch": 0.10410540623697709, + "grad_norm": 2.0305466651916504, + "learning_rate": 2.920487962037784e-05, + "loss": 1.8089, + "step": 53460 + }, + { + "epoch": 0.10413461650808735, + "grad_norm": 3.6030194759368896, + "learning_rate": 2.920443734949474e-05, + "loss": 1.9752, + "step": 53475 + }, + { + "epoch": 0.10416382677919761, + "grad_norm": 4.3383636474609375, + "learning_rate": 2.9203994958994074e-05, + "loss": 1.6396, + "step": 53490 + }, + { + "epoch": 0.10419303705030787, + "grad_norm": 4.8360915184021, + "learning_rate": 2.9203552448879576e-05, + "loss": 1.6672, + "step": 53505 + }, + { + "epoch": 0.10422224732141815, + "grad_norm": 3.100064754486084, + "learning_rate": 2.9203109819154958e-05, + "loss": 1.9135, + "step": 53520 + }, + { + "epoch": 0.10425145759252841, + "grad_norm": 2.019653081893921, + "learning_rate": 2.920266706982396e-05, + "loss": 1.9831, + "step": 53535 + }, + { + "epoch": 0.10428066786363867, + "grad_norm": 2.8222882747650146, + "learning_rate": 2.92022242008903e-05, + "loss": 1.8984, + "step": 53550 + }, + { + "epoch": 0.10430987813474893, + "grad_norm": 1.9325177669525146, + "learning_rate": 2.9201781212357716e-05, + "loss": 1.8352, + "step": 53565 + }, + { + "epoch": 0.10433908840585919, + "grad_norm": 4.413449764251709, + "learning_rate": 2.9201338104229937e-05, + "loss": 1.8048, + "step": 53580 + }, + { + "epoch": 0.10436829867696945, + "grad_norm": 3.0887482166290283, + "learning_rate": 2.9200894876510687e-05, + "loss": 1.8392, + "step": 53595 + }, + { + "epoch": 0.10439750894807971, + "grad_norm": 1.9377580881118774, + "learning_rate": 2.9200451529203707e-05, + "loss": 1.8473, + "step": 53610 + }, + { + "epoch": 0.10442671921918997, + "grad_norm": 2.265601396560669, + "learning_rate": 2.9200008062312728e-05, + "loss": 1.8195, + "step": 53625 + }, + { + "epoch": 0.10445592949030025, + "grad_norm": 3.035935401916504, + "learning_rate": 2.919956447584148e-05, + "loss": 1.8027, + "step": 53640 + }, + { + "epoch": 0.10448513976141051, + "grad_norm": 2.5451693534851074, + "learning_rate": 2.9199120769793705e-05, + "loss": 1.6678, + "step": 53655 + }, + { + "epoch": 0.10451435003252077, + "grad_norm": 3.594522476196289, + "learning_rate": 2.9198676944173135e-05, + "loss": 1.701, + "step": 53670 + }, + { + "epoch": 0.10454356030363103, + "grad_norm": 2.1554315090179443, + "learning_rate": 2.9198232998983514e-05, + "loss": 1.8878, + "step": 53685 + }, + { + "epoch": 0.10457277057474129, + "grad_norm": 3.5895397663116455, + "learning_rate": 2.9197788934228576e-05, + "loss": 1.7508, + "step": 53700 + }, + { + "epoch": 0.10460198084585155, + "grad_norm": 7.4402337074279785, + "learning_rate": 2.9197344749912054e-05, + "loss": 1.6983, + "step": 53715 + }, + { + "epoch": 0.10463119111696181, + "grad_norm": 2.151456117630005, + "learning_rate": 2.9196900446037702e-05, + "loss": 1.7552, + "step": 53730 + }, + { + "epoch": 0.10466040138807209, + "grad_norm": 4.100891590118408, + "learning_rate": 2.919645602260925e-05, + "loss": 1.7669, + "step": 53745 + }, + { + "epoch": 0.10468961165918235, + "grad_norm": 3.0575950145721436, + "learning_rate": 2.9196011479630448e-05, + "loss": 2.0725, + "step": 53760 + }, + { + "epoch": 0.10471882193029261, + "grad_norm": 2.82108211517334, + "learning_rate": 2.9195566817105036e-05, + "loss": 1.9274, + "step": 53775 + }, + { + "epoch": 0.10474803220140287, + "grad_norm": 2.9936230182647705, + "learning_rate": 2.919512203503676e-05, + "loss": 1.75, + "step": 53790 + }, + { + "epoch": 0.10477724247251313, + "grad_norm": 3.8622894287109375, + "learning_rate": 2.9194677133429364e-05, + "loss": 1.962, + "step": 53805 + }, + { + "epoch": 0.1048064527436234, + "grad_norm": 2.2222511768341064, + "learning_rate": 2.9194232112286594e-05, + "loss": 1.806, + "step": 53820 + }, + { + "epoch": 0.10483566301473365, + "grad_norm": 2.2523059844970703, + "learning_rate": 2.91937869716122e-05, + "loss": 1.7863, + "step": 53835 + }, + { + "epoch": 0.10486487328584393, + "grad_norm": 2.7013745307922363, + "learning_rate": 2.919334171140993e-05, + "loss": 1.9081, + "step": 53850 + }, + { + "epoch": 0.10489408355695419, + "grad_norm": 2.1757166385650635, + "learning_rate": 2.9192896331683532e-05, + "loss": 1.7756, + "step": 53865 + }, + { + "epoch": 0.10492329382806445, + "grad_norm": 3.7949211597442627, + "learning_rate": 2.9192450832436762e-05, + "loss": 1.6572, + "step": 53880 + }, + { + "epoch": 0.10495250409917471, + "grad_norm": 4.856202125549316, + "learning_rate": 2.9192005213673363e-05, + "loss": 1.7781, + "step": 53895 + }, + { + "epoch": 0.10498171437028497, + "grad_norm": 3.3601744174957275, + "learning_rate": 2.9191559475397094e-05, + "loss": 1.8988, + "step": 53910 + }, + { + "epoch": 0.10501092464139523, + "grad_norm": 2.9697728157043457, + "learning_rate": 2.919111361761171e-05, + "loss": 1.8731, + "step": 53925 + }, + { + "epoch": 0.1050401349125055, + "grad_norm": 4.680106163024902, + "learning_rate": 2.9190667640320958e-05, + "loss": 1.8678, + "step": 53940 + }, + { + "epoch": 0.10506934518361577, + "grad_norm": 3.960859537124634, + "learning_rate": 2.91902215435286e-05, + "loss": 1.7715, + "step": 53955 + }, + { + "epoch": 0.10509855545472603, + "grad_norm": 2.9306797981262207, + "learning_rate": 2.9189775327238387e-05, + "loss": 1.9109, + "step": 53970 + }, + { + "epoch": 0.10512776572583629, + "grad_norm": 3.277047872543335, + "learning_rate": 2.9189328991454083e-05, + "loss": 1.7692, + "step": 53985 + }, + { + "epoch": 0.10515697599694655, + "grad_norm": 2.453782081604004, + "learning_rate": 2.9188882536179443e-05, + "loss": 1.8473, + "step": 54000 + } + ], + "logging_steps": 15, + "max_steps": 513518, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.558444682759168e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}