{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 25254, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011879306248515087, "grad_norm": 1.3375307411067108, "learning_rate": 6.596306068601583e-07, "loss": 0.5847, "step": 10 }, { "epoch": 0.0023758612497030173, "grad_norm": 1.1330480272402368, "learning_rate": 1.3192612137203166e-06, "loss": 0.5711, "step": 20 }, { "epoch": 0.003563791874554526, "grad_norm": 0.8884513815319889, "learning_rate": 1.9788918205804753e-06, "loss": 0.5244, "step": 30 }, { "epoch": 0.004751722499406035, "grad_norm": 0.5721846434384168, "learning_rate": 2.6385224274406333e-06, "loss": 0.4198, "step": 40 }, { "epoch": 0.005939653124257543, "grad_norm": 0.331579930310183, "learning_rate": 3.2981530343007917e-06, "loss": 0.3617, "step": 50 }, { "epoch": 0.007127583749109052, "grad_norm": 0.2245534528862174, "learning_rate": 3.9577836411609505e-06, "loss": 0.3351, "step": 60 }, { "epoch": 0.00831551437396056, "grad_norm": 0.20771338934645084, "learning_rate": 4.617414248021108e-06, "loss": 0.3136, "step": 70 }, { "epoch": 0.00950344499881207, "grad_norm": 0.17570226539236866, "learning_rate": 5.2770448548812665e-06, "loss": 0.31, "step": 80 }, { "epoch": 0.010691375623663579, "grad_norm": 0.1803744108486992, "learning_rate": 5.936675461741425e-06, "loss": 0.2947, "step": 90 }, { "epoch": 0.011879306248515086, "grad_norm": 0.18693912387865996, "learning_rate": 6.596306068601583e-06, "loss": 0.2869, "step": 100 }, { "epoch": 0.013067236873366595, "grad_norm": 0.17839634757940323, "learning_rate": 7.255936675461741e-06, "loss": 0.2782, "step": 110 }, { "epoch": 0.014255167498218105, "grad_norm": 0.17641841442766706, "learning_rate": 7.915567282321901e-06, "loss": 0.2753, "step": 120 }, { "epoch": 0.015443098123069612, "grad_norm": 0.1905886526952489, "learning_rate": 8.575197889182058e-06, "loss": 0.2667, "step": 130 }, { "epoch": 0.01663102874792112, "grad_norm": 0.18023805114249247, "learning_rate": 9.234828496042216e-06, "loss": 0.2591, "step": 140 }, { "epoch": 0.01781895937277263, "grad_norm": 0.20247344180603036, "learning_rate": 9.894459102902375e-06, "loss": 0.2581, "step": 150 }, { "epoch": 0.01900688999762414, "grad_norm": 0.2166707933241936, "learning_rate": 1.0554089709762533e-05, "loss": 0.2502, "step": 160 }, { "epoch": 0.020194820622475646, "grad_norm": 0.20510648619157942, "learning_rate": 1.1213720316622692e-05, "loss": 0.2468, "step": 170 }, { "epoch": 0.021382751247327157, "grad_norm": 0.20177463452015965, "learning_rate": 1.187335092348285e-05, "loss": 0.2457, "step": 180 }, { "epoch": 0.022570681872178665, "grad_norm": 0.18974302843810567, "learning_rate": 1.2532981530343008e-05, "loss": 0.2449, "step": 190 }, { "epoch": 0.023758612497030172, "grad_norm": 0.2131710550627266, "learning_rate": 1.3192612137203167e-05, "loss": 0.2396, "step": 200 }, { "epoch": 0.024946543121881683, "grad_norm": 0.21392576686344883, "learning_rate": 1.3852242744063327e-05, "loss": 0.2346, "step": 210 }, { "epoch": 0.02613447374673319, "grad_norm": 0.2352662871203368, "learning_rate": 1.4511873350923482e-05, "loss": 0.2341, "step": 220 }, { "epoch": 0.0273224043715847, "grad_norm": 0.2475045667140486, "learning_rate": 1.5171503957783642e-05, "loss": 0.2323, "step": 230 }, { "epoch": 0.02851033499643621, "grad_norm": 0.20366248140372556, "learning_rate": 1.5831134564643802e-05, "loss": 0.2318, "step": 240 }, { "epoch": 0.029698265621287717, "grad_norm": 0.2228382255586845, "learning_rate": 1.6490765171503957e-05, "loss": 0.2283, "step": 250 }, { "epoch": 0.030886196246139225, "grad_norm": 0.19696926882599325, "learning_rate": 1.7150395778364116e-05, "loss": 0.2263, "step": 260 }, { "epoch": 0.03207412687099073, "grad_norm": 0.2048755612510685, "learning_rate": 1.7810026385224277e-05, "loss": 0.2287, "step": 270 }, { "epoch": 0.03326205749584224, "grad_norm": 0.20503375503526874, "learning_rate": 1.8469656992084432e-05, "loss": 0.2265, "step": 280 }, { "epoch": 0.034449988120693754, "grad_norm": 0.2211942169439577, "learning_rate": 1.912928759894459e-05, "loss": 0.223, "step": 290 }, { "epoch": 0.03563791874554526, "grad_norm": 0.1964796736070463, "learning_rate": 1.978891820580475e-05, "loss": 0.2194, "step": 300 }, { "epoch": 0.03682584937039677, "grad_norm": 0.20266283034258564, "learning_rate": 2.0448548812664908e-05, "loss": 0.218, "step": 310 }, { "epoch": 0.03801377999524828, "grad_norm": 0.23195254013994365, "learning_rate": 2.1108179419525066e-05, "loss": 0.2192, "step": 320 }, { "epoch": 0.039201710620099785, "grad_norm": 0.20427336264795565, "learning_rate": 2.1767810026385225e-05, "loss": 0.2193, "step": 330 }, { "epoch": 0.04038964124495129, "grad_norm": 0.22674712302067587, "learning_rate": 2.2427440633245383e-05, "loss": 0.2098, "step": 340 }, { "epoch": 0.04157757186980281, "grad_norm": 0.20452300971517717, "learning_rate": 2.308707124010554e-05, "loss": 0.2129, "step": 350 }, { "epoch": 0.042765502494654314, "grad_norm": 0.19289212369873515, "learning_rate": 2.37467018469657e-05, "loss": 0.211, "step": 360 }, { "epoch": 0.04395343311950582, "grad_norm": 0.23186214839770652, "learning_rate": 2.4406332453825858e-05, "loss": 0.2164, "step": 370 }, { "epoch": 0.04514136374435733, "grad_norm": 0.20807038084196836, "learning_rate": 2.5065963060686017e-05, "loss": 0.2132, "step": 380 }, { "epoch": 0.04632929436920884, "grad_norm": 0.19021386964910378, "learning_rate": 2.572559366754618e-05, "loss": 0.2091, "step": 390 }, { "epoch": 0.047517224994060345, "grad_norm": 0.21295671440455782, "learning_rate": 2.6385224274406334e-05, "loss": 0.2094, "step": 400 }, { "epoch": 0.04870515561891185, "grad_norm": 0.183677073007422, "learning_rate": 2.7044854881266492e-05, "loss": 0.2084, "step": 410 }, { "epoch": 0.04989308624376337, "grad_norm": 0.19873926783011642, "learning_rate": 2.7704485488126654e-05, "loss": 0.2107, "step": 420 }, { "epoch": 0.051081016868614874, "grad_norm": 0.18654629589246152, "learning_rate": 2.836411609498681e-05, "loss": 0.2062, "step": 430 }, { "epoch": 0.05226894749346638, "grad_norm": 0.19120313890953805, "learning_rate": 2.9023746701846964e-05, "loss": 0.2093, "step": 440 }, { "epoch": 0.05345687811831789, "grad_norm": 0.21646763522431314, "learning_rate": 2.968337730870713e-05, "loss": 0.21, "step": 450 }, { "epoch": 0.0546448087431694, "grad_norm": 0.21940692956554064, "learning_rate": 3.0343007915567284e-05, "loss": 0.2054, "step": 460 }, { "epoch": 0.055832739368020905, "grad_norm": 0.16267633205554966, "learning_rate": 3.100263852242744e-05, "loss": 0.2025, "step": 470 }, { "epoch": 0.05702066999287242, "grad_norm": 0.1903395566284162, "learning_rate": 3.1662269129287604e-05, "loss": 0.2001, "step": 480 }, { "epoch": 0.05820860061772393, "grad_norm": 0.19589782473013528, "learning_rate": 3.232189973614776e-05, "loss": 0.2048, "step": 490 }, { "epoch": 0.059396531242575434, "grad_norm": 0.19244560967816657, "learning_rate": 3.2981530343007914e-05, "loss": 0.203, "step": 500 }, { "epoch": 0.06058446186742694, "grad_norm": 0.18703489795625536, "learning_rate": 3.3641160949868076e-05, "loss": 0.2003, "step": 510 }, { "epoch": 0.06177239249227845, "grad_norm": 0.21029051663533255, "learning_rate": 3.430079155672823e-05, "loss": 0.1997, "step": 520 }, { "epoch": 0.06296032311712996, "grad_norm": 0.19162858346491501, "learning_rate": 3.496042216358839e-05, "loss": 0.2014, "step": 530 }, { "epoch": 0.06414825374198146, "grad_norm": 0.16967272112654969, "learning_rate": 3.5620052770448555e-05, "loss": 0.2037, "step": 540 }, { "epoch": 0.06533618436683297, "grad_norm": 0.18670219282671494, "learning_rate": 3.627968337730871e-05, "loss": 0.2028, "step": 550 }, { "epoch": 0.06652411499168448, "grad_norm": 0.18060418167183204, "learning_rate": 3.6939313984168865e-05, "loss": 0.2013, "step": 560 }, { "epoch": 0.06771204561653599, "grad_norm": 0.1800151447302433, "learning_rate": 3.759894459102902e-05, "loss": 0.1974, "step": 570 }, { "epoch": 0.06889997624138751, "grad_norm": 0.1920643440047871, "learning_rate": 3.825857519788918e-05, "loss": 0.2017, "step": 580 }, { "epoch": 0.07008790686623902, "grad_norm": 0.1737888894856264, "learning_rate": 3.8918205804749344e-05, "loss": 0.1955, "step": 590 }, { "epoch": 0.07127583749109052, "grad_norm": 0.17515683556893008, "learning_rate": 3.95778364116095e-05, "loss": 0.2019, "step": 600 }, { "epoch": 0.07246376811594203, "grad_norm": 0.18115340986259615, "learning_rate": 4.023746701846966e-05, "loss": 0.1957, "step": 610 }, { "epoch": 0.07365169874079354, "grad_norm": 0.18984654742600848, "learning_rate": 4.0897097625329815e-05, "loss": 0.1967, "step": 620 }, { "epoch": 0.07483962936564505, "grad_norm": 0.16169514557317077, "learning_rate": 4.155672823218997e-05, "loss": 0.1937, "step": 630 }, { "epoch": 0.07602755999049655, "grad_norm": 0.17919327702230167, "learning_rate": 4.221635883905013e-05, "loss": 0.196, "step": 640 }, { "epoch": 0.07721549061534806, "grad_norm": 0.1855051643117777, "learning_rate": 4.2875989445910294e-05, "loss": 0.2008, "step": 650 }, { "epoch": 0.07840342124019957, "grad_norm": 0.1806291687429354, "learning_rate": 4.353562005277045e-05, "loss": 0.1957, "step": 660 }, { "epoch": 0.07959135186505108, "grad_norm": 0.17857667946683528, "learning_rate": 4.419525065963061e-05, "loss": 0.1941, "step": 670 }, { "epoch": 0.08077928248990258, "grad_norm": 0.1735486633718123, "learning_rate": 4.4854881266490766e-05, "loss": 0.1934, "step": 680 }, { "epoch": 0.08196721311475409, "grad_norm": 0.15996160949468444, "learning_rate": 4.551451187335092e-05, "loss": 0.1984, "step": 690 }, { "epoch": 0.08315514373960561, "grad_norm": 0.1744229067016569, "learning_rate": 4.617414248021108e-05, "loss": 0.1948, "step": 700 }, { "epoch": 0.08434307436445712, "grad_norm": 0.17248148817847028, "learning_rate": 4.6833773087071245e-05, "loss": 0.1896, "step": 710 }, { "epoch": 0.08553100498930863, "grad_norm": 0.17350376972289455, "learning_rate": 4.74934036939314e-05, "loss": 0.1957, "step": 720 }, { "epoch": 0.08671893561416014, "grad_norm": 0.1653912269385569, "learning_rate": 4.815303430079156e-05, "loss": 0.1974, "step": 730 }, { "epoch": 0.08790686623901164, "grad_norm": 0.1820339770197377, "learning_rate": 4.8812664907651717e-05, "loss": 0.1954, "step": 740 }, { "epoch": 0.08909479686386315, "grad_norm": 0.17520524706935645, "learning_rate": 4.947229551451187e-05, "loss": 0.1913, "step": 750 }, { "epoch": 0.09028272748871466, "grad_norm": 0.17169118849678508, "learning_rate": 4.999999917760701e-05, "loss": 0.1938, "step": 760 }, { "epoch": 0.09147065811356617, "grad_norm": 0.1568264122179646, "learning_rate": 4.999997039385799e-05, "loss": 0.1933, "step": 770 }, { "epoch": 0.09265858873841767, "grad_norm": 0.18851843775408814, "learning_rate": 4.999990049051348e-05, "loss": 0.1921, "step": 780 }, { "epoch": 0.09384651936326918, "grad_norm": 0.16228564316594588, "learning_rate": 4.999978946768849e-05, "loss": 0.1909, "step": 790 }, { "epoch": 0.09503444998812069, "grad_norm": 0.17374094554645406, "learning_rate": 4.99996373255656e-05, "loss": 0.1889, "step": 800 }, { "epoch": 0.0962223806129722, "grad_norm": 0.16865736395120826, "learning_rate": 4.9999444064395065e-05, "loss": 0.1966, "step": 810 }, { "epoch": 0.0974103112378237, "grad_norm": 0.17643536598002654, "learning_rate": 4.999920968449475e-05, "loss": 0.1912, "step": 820 }, { "epoch": 0.09859824186267523, "grad_norm": 0.1525215280631803, "learning_rate": 4.999893418625017e-05, "loss": 0.1897, "step": 830 }, { "epoch": 0.09978617248752673, "grad_norm": 0.155429587243708, "learning_rate": 4.999861757011446e-05, "loss": 0.1858, "step": 840 }, { "epoch": 0.10097410311237824, "grad_norm": 0.17467549788797296, "learning_rate": 4.999825983660837e-05, "loss": 0.1954, "step": 850 }, { "epoch": 0.10216203373722975, "grad_norm": 0.14778200200324496, "learning_rate": 4.9997860986320314e-05, "loss": 0.1882, "step": 860 }, { "epoch": 0.10334996436208126, "grad_norm": 0.15219332727618373, "learning_rate": 4.99974210199063e-05, "loss": 0.1932, "step": 870 }, { "epoch": 0.10453789498693276, "grad_norm": 0.14871274302449108, "learning_rate": 4.9996939938089994e-05, "loss": 0.1838, "step": 880 }, { "epoch": 0.10572582561178427, "grad_norm": 0.17242379229363572, "learning_rate": 4.999641774166265e-05, "loss": 0.1897, "step": 890 }, { "epoch": 0.10691375623663578, "grad_norm": 0.1506562834735722, "learning_rate": 4.99958544314832e-05, "loss": 0.1919, "step": 900 }, { "epoch": 0.10810168686148729, "grad_norm": 0.16948494342932197, "learning_rate": 4.9995250008478145e-05, "loss": 0.1874, "step": 910 }, { "epoch": 0.1092896174863388, "grad_norm": 0.1422833793216466, "learning_rate": 4.9994604473641635e-05, "loss": 0.1873, "step": 920 }, { "epoch": 0.1104775481111903, "grad_norm": 0.13618794348360194, "learning_rate": 4.9993917828035444e-05, "loss": 0.189, "step": 930 }, { "epoch": 0.11166547873604181, "grad_norm": 0.1538622970732332, "learning_rate": 4.999319007278896e-05, "loss": 0.1871, "step": 940 }, { "epoch": 0.11285340936089332, "grad_norm": 0.16347928962626837, "learning_rate": 4.999242120909916e-05, "loss": 0.1878, "step": 950 }, { "epoch": 0.11404133998574484, "grad_norm": 0.13395103439814854, "learning_rate": 4.999161123823069e-05, "loss": 0.1858, "step": 960 }, { "epoch": 0.11522927061059635, "grad_norm": 0.14291064576765222, "learning_rate": 4.999076016151576e-05, "loss": 0.1867, "step": 970 }, { "epoch": 0.11641720123544785, "grad_norm": 0.15025512102573274, "learning_rate": 4.998986798035421e-05, "loss": 0.1878, "step": 980 }, { "epoch": 0.11760513186029936, "grad_norm": 0.14909323278678724, "learning_rate": 4.9988934696213505e-05, "loss": 0.1878, "step": 990 }, { "epoch": 0.11879306248515087, "grad_norm": 0.15796936574032197, "learning_rate": 4.998796031062868e-05, "loss": 0.1894, "step": 1000 }, { "epoch": 0.11998099311000238, "grad_norm": 0.14907273880096458, "learning_rate": 4.998694482520239e-05, "loss": 0.1859, "step": 1010 }, { "epoch": 0.12116892373485388, "grad_norm": 0.15211037072395478, "learning_rate": 4.998588824160489e-05, "loss": 0.1811, "step": 1020 }, { "epoch": 0.12235685435970539, "grad_norm": 0.16324883375765722, "learning_rate": 4.998479056157405e-05, "loss": 0.1855, "step": 1030 }, { "epoch": 0.1235447849845569, "grad_norm": 0.1623017028011017, "learning_rate": 4.9983651786915305e-05, "loss": 0.1841, "step": 1040 }, { "epoch": 0.1247327156094084, "grad_norm": 0.1489999938411819, "learning_rate": 4.99824719195017e-05, "loss": 0.1799, "step": 1050 }, { "epoch": 0.12592064623425991, "grad_norm": 0.14200383761546015, "learning_rate": 4.9981250961273865e-05, "loss": 0.1848, "step": 1060 }, { "epoch": 0.12710857685911142, "grad_norm": 0.15854847810355827, "learning_rate": 4.997998891424e-05, "loss": 0.1828, "step": 1070 }, { "epoch": 0.12829650748396293, "grad_norm": 0.14131922856266793, "learning_rate": 4.997868578047592e-05, "loss": 0.1868, "step": 1080 }, { "epoch": 0.12948443810881444, "grad_norm": 0.1507242976514963, "learning_rate": 4.9977341562125e-05, "loss": 0.1828, "step": 1090 }, { "epoch": 0.13067236873366594, "grad_norm": 0.14264586866396559, "learning_rate": 4.997595626139818e-05, "loss": 0.1837, "step": 1100 }, { "epoch": 0.13186029935851745, "grad_norm": 0.14701586382118145, "learning_rate": 4.997452988057399e-05, "loss": 0.1822, "step": 1110 }, { "epoch": 0.13304822998336896, "grad_norm": 0.149759876251617, "learning_rate": 4.9973062421998516e-05, "loss": 0.1774, "step": 1120 }, { "epoch": 0.13423616060822047, "grad_norm": 0.16427138676420056, "learning_rate": 4.997155388808542e-05, "loss": 0.1831, "step": 1130 }, { "epoch": 0.13542409123307197, "grad_norm": 0.15511928066665362, "learning_rate": 4.997000428131592e-05, "loss": 0.1811, "step": 1140 }, { "epoch": 0.1366120218579235, "grad_norm": 0.14304348656922528, "learning_rate": 4.9968413604238776e-05, "loss": 0.1804, "step": 1150 }, { "epoch": 0.13779995248277502, "grad_norm": 0.14596716864784876, "learning_rate": 4.996678185947031e-05, "loss": 0.1773, "step": 1160 }, { "epoch": 0.13898788310762653, "grad_norm": 0.17224241329867696, "learning_rate": 4.996510904969441e-05, "loss": 0.178, "step": 1170 }, { "epoch": 0.14017581373247803, "grad_norm": 0.1561228191855426, "learning_rate": 4.996339517766247e-05, "loss": 0.1799, "step": 1180 }, { "epoch": 0.14136374435732954, "grad_norm": 0.14928024494605732, "learning_rate": 4.9961640246193456e-05, "loss": 0.1805, "step": 1190 }, { "epoch": 0.14255167498218105, "grad_norm": 0.15269109753484178, "learning_rate": 4.995984425817385e-05, "loss": 0.1788, "step": 1200 }, { "epoch": 0.14373960560703256, "grad_norm": 0.14508537705636773, "learning_rate": 4.995800721655766e-05, "loss": 0.1804, "step": 1210 }, { "epoch": 0.14492753623188406, "grad_norm": 0.1508823519403891, "learning_rate": 4.995612912436643e-05, "loss": 0.1847, "step": 1220 }, { "epoch": 0.14611546685673557, "grad_norm": 0.13978679116401313, "learning_rate": 4.995420998468923e-05, "loss": 0.1801, "step": 1230 }, { "epoch": 0.14730339748158708, "grad_norm": 0.13099058264475974, "learning_rate": 4.995224980068263e-05, "loss": 0.1794, "step": 1240 }, { "epoch": 0.14849132810643859, "grad_norm": 0.13505806415353072, "learning_rate": 4.995024857557069e-05, "loss": 0.176, "step": 1250 }, { "epoch": 0.1496792587312901, "grad_norm": 0.13825288442826847, "learning_rate": 4.994820631264503e-05, "loss": 0.1762, "step": 1260 }, { "epoch": 0.1508671893561416, "grad_norm": 0.14477001704419554, "learning_rate": 4.9946123015264715e-05, "loss": 0.1795, "step": 1270 }, { "epoch": 0.1520551199809931, "grad_norm": 0.14128439734851037, "learning_rate": 4.994399868685633e-05, "loss": 0.1798, "step": 1280 }, { "epoch": 0.15324305060584462, "grad_norm": 0.14439708873004722, "learning_rate": 4.994183333091393e-05, "loss": 0.1741, "step": 1290 }, { "epoch": 0.15443098123069612, "grad_norm": 0.14580118897058822, "learning_rate": 4.9939626950999075e-05, "loss": 0.1775, "step": 1300 }, { "epoch": 0.15561891185554763, "grad_norm": 0.14241776682057933, "learning_rate": 4.9937379550740785e-05, "loss": 0.1791, "step": 1310 }, { "epoch": 0.15680684248039914, "grad_norm": 0.13022421288089261, "learning_rate": 4.993509113383554e-05, "loss": 0.1757, "step": 1320 }, { "epoch": 0.15799477310525065, "grad_norm": 0.14307613399058647, "learning_rate": 4.993276170404731e-05, "loss": 0.1809, "step": 1330 }, { "epoch": 0.15918270373010215, "grad_norm": 0.12998787992073887, "learning_rate": 4.99303912652075e-05, "loss": 0.1799, "step": 1340 }, { "epoch": 0.16037063435495366, "grad_norm": 0.13346779289342067, "learning_rate": 4.992797982121498e-05, "loss": 0.1798, "step": 1350 }, { "epoch": 0.16155856497980517, "grad_norm": 0.13214388131658458, "learning_rate": 4.992552737603605e-05, "loss": 0.1821, "step": 1360 }, { "epoch": 0.16274649560465668, "grad_norm": 0.14581322508315855, "learning_rate": 4.992303393370447e-05, "loss": 0.1778, "step": 1370 }, { "epoch": 0.16393442622950818, "grad_norm": 0.1463105222934338, "learning_rate": 4.992049949832139e-05, "loss": 0.1816, "step": 1380 }, { "epoch": 0.1651223568543597, "grad_norm": 0.13236040975365665, "learning_rate": 4.991792407405546e-05, "loss": 0.1747, "step": 1390 }, { "epoch": 0.16631028747921123, "grad_norm": 0.13056007381192647, "learning_rate": 4.991530766514265e-05, "loss": 0.1775, "step": 1400 }, { "epoch": 0.16749821810406273, "grad_norm": 0.14719604785273918, "learning_rate": 4.991265027588643e-05, "loss": 0.1764, "step": 1410 }, { "epoch": 0.16868614872891424, "grad_norm": 0.12405723712531191, "learning_rate": 4.9909951910657625e-05, "loss": 0.173, "step": 1420 }, { "epoch": 0.16987407935376575, "grad_norm": 0.12709652593106827, "learning_rate": 4.990721257389446e-05, "loss": 0.1766, "step": 1430 }, { "epoch": 0.17106200997861726, "grad_norm": 0.13362029149865604, "learning_rate": 4.990443227010256e-05, "loss": 0.1771, "step": 1440 }, { "epoch": 0.17224994060346877, "grad_norm": 0.14023450260443857, "learning_rate": 4.990161100385494e-05, "loss": 0.1751, "step": 1450 }, { "epoch": 0.17343787122832027, "grad_norm": 0.13330000656405214, "learning_rate": 4.989874877979197e-05, "loss": 0.1751, "step": 1460 }, { "epoch": 0.17462580185317178, "grad_norm": 0.1318553193058698, "learning_rate": 4.9895845602621394e-05, "loss": 0.1787, "step": 1470 }, { "epoch": 0.1758137324780233, "grad_norm": 0.14265251710969418, "learning_rate": 4.989290147711831e-05, "loss": 0.1753, "step": 1480 }, { "epoch": 0.1770016631028748, "grad_norm": 0.13610515240027976, "learning_rate": 4.9889916408125196e-05, "loss": 0.1768, "step": 1490 }, { "epoch": 0.1781895937277263, "grad_norm": 0.1407870523752269, "learning_rate": 4.988689040055183e-05, "loss": 0.177, "step": 1500 }, { "epoch": 0.1793775243525778, "grad_norm": 0.14989612847651768, "learning_rate": 4.988382345937536e-05, "loss": 0.1762, "step": 1510 }, { "epoch": 0.18056545497742932, "grad_norm": 0.13898354997853513, "learning_rate": 4.988071558964023e-05, "loss": 0.1773, "step": 1520 }, { "epoch": 0.18175338560228083, "grad_norm": 0.1264460602767895, "learning_rate": 4.987756679645823e-05, "loss": 0.1728, "step": 1530 }, { "epoch": 0.18294131622713233, "grad_norm": 0.12901083788404266, "learning_rate": 4.987437708500845e-05, "loss": 0.1756, "step": 1540 }, { "epoch": 0.18412924685198384, "grad_norm": 0.1436210962293996, "learning_rate": 4.9871146460537285e-05, "loss": 0.1691, "step": 1550 }, { "epoch": 0.18531717747683535, "grad_norm": 0.1354243553476289, "learning_rate": 4.986787492835843e-05, "loss": 0.1755, "step": 1560 }, { "epoch": 0.18650510810168686, "grad_norm": 0.13925203924167526, "learning_rate": 4.986456249385283e-05, "loss": 0.1751, "step": 1570 }, { "epoch": 0.18769303872653836, "grad_norm": 0.13167095607951632, "learning_rate": 4.986120916246873e-05, "loss": 0.1753, "step": 1580 }, { "epoch": 0.18888096935138987, "grad_norm": 0.13450548059811876, "learning_rate": 4.9857814939721676e-05, "loss": 0.1768, "step": 1590 }, { "epoch": 0.19006889997624138, "grad_norm": 0.12883413410087297, "learning_rate": 4.985437983119441e-05, "loss": 0.1748, "step": 1600 }, { "epoch": 0.1912568306010929, "grad_norm": 0.13223313138590803, "learning_rate": 4.9850903842536946e-05, "loss": 0.1728, "step": 1610 }, { "epoch": 0.1924447612259444, "grad_norm": 0.141947492702088, "learning_rate": 4.984738697946656e-05, "loss": 0.1756, "step": 1620 }, { "epoch": 0.1936326918507959, "grad_norm": 0.1340388951728084, "learning_rate": 4.984382924776772e-05, "loss": 0.1761, "step": 1630 }, { "epoch": 0.1948206224756474, "grad_norm": 0.14591919961702615, "learning_rate": 4.9840230653292144e-05, "loss": 0.1724, "step": 1640 }, { "epoch": 0.19600855310049894, "grad_norm": 0.12101074065557327, "learning_rate": 4.983659120195875e-05, "loss": 0.1722, "step": 1650 }, { "epoch": 0.19719648372535045, "grad_norm": 0.12726558722857728, "learning_rate": 4.9832910899753636e-05, "loss": 0.1729, "step": 1660 }, { "epoch": 0.19838441435020196, "grad_norm": 0.13100731747151956, "learning_rate": 4.9829189752730144e-05, "loss": 0.1724, "step": 1670 }, { "epoch": 0.19957234497505347, "grad_norm": 0.1254496556486626, "learning_rate": 4.982542776700874e-05, "loss": 0.171, "step": 1680 }, { "epoch": 0.20076027559990497, "grad_norm": 0.12691871963480564, "learning_rate": 4.982162494877708e-05, "loss": 0.173, "step": 1690 }, { "epoch": 0.20194820622475648, "grad_norm": 0.14717200249041432, "learning_rate": 4.9817781304290004e-05, "loss": 0.1753, "step": 1700 }, { "epoch": 0.203136136849608, "grad_norm": 0.13199835988033198, "learning_rate": 4.981389683986947e-05, "loss": 0.1723, "step": 1710 }, { "epoch": 0.2043240674744595, "grad_norm": 0.12126489946507459, "learning_rate": 4.98099715619046e-05, "loss": 0.1717, "step": 1720 }, { "epoch": 0.205511998099311, "grad_norm": 0.13194254453610865, "learning_rate": 4.980600547685163e-05, "loss": 0.1703, "step": 1730 }, { "epoch": 0.2066999287241625, "grad_norm": 0.12464646544232248, "learning_rate": 4.980199859123392e-05, "loss": 0.1678, "step": 1740 }, { "epoch": 0.20788785934901402, "grad_norm": 0.14172461498025843, "learning_rate": 4.9797950911641933e-05, "loss": 0.1715, "step": 1750 }, { "epoch": 0.20907578997386553, "grad_norm": 0.1247486511538755, "learning_rate": 4.9793862444733254e-05, "loss": 0.1729, "step": 1760 }, { "epoch": 0.21026372059871704, "grad_norm": 0.1238504904208088, "learning_rate": 4.978973319723251e-05, "loss": 0.1742, "step": 1770 }, { "epoch": 0.21145165122356854, "grad_norm": 0.13112475440152166, "learning_rate": 4.978556317593144e-05, "loss": 0.1735, "step": 1780 }, { "epoch": 0.21263958184842005, "grad_norm": 0.13160145201914836, "learning_rate": 4.978135238768884e-05, "loss": 0.1701, "step": 1790 }, { "epoch": 0.21382751247327156, "grad_norm": 0.1282385329114809, "learning_rate": 4.9777100839430543e-05, "loss": 0.1706, "step": 1800 }, { "epoch": 0.21501544309812307, "grad_norm": 0.13441296568208452, "learning_rate": 4.9772808538149455e-05, "loss": 0.17, "step": 1810 }, { "epoch": 0.21620337372297457, "grad_norm": 0.123657014159621, "learning_rate": 4.9768475490905474e-05, "loss": 0.1739, "step": 1820 }, { "epoch": 0.21739130434782608, "grad_norm": 0.1647303509571939, "learning_rate": 4.976410170482554e-05, "loss": 0.1717, "step": 1830 }, { "epoch": 0.2185792349726776, "grad_norm": 0.12418772932798722, "learning_rate": 4.9759687187103596e-05, "loss": 0.1691, "step": 1840 }, { "epoch": 0.2197671655975291, "grad_norm": 0.13580130495092976, "learning_rate": 4.9755231945000574e-05, "loss": 0.1717, "step": 1850 }, { "epoch": 0.2209550962223806, "grad_norm": 0.12041270252314899, "learning_rate": 4.97507359858444e-05, "loss": 0.1728, "step": 1860 }, { "epoch": 0.2221430268472321, "grad_norm": 0.12698793321412596, "learning_rate": 4.974619931702995e-05, "loss": 0.1726, "step": 1870 }, { "epoch": 0.22333095747208362, "grad_norm": 0.13079159846254593, "learning_rate": 4.974162194601908e-05, "loss": 0.1741, "step": 1880 }, { "epoch": 0.22451888809693513, "grad_norm": 0.12356744807908791, "learning_rate": 4.97370038803406e-05, "loss": 0.1709, "step": 1890 }, { "epoch": 0.22570681872178663, "grad_norm": 0.13152628855989856, "learning_rate": 4.973234512759021e-05, "loss": 0.1689, "step": 1900 }, { "epoch": 0.22689474934663817, "grad_norm": 0.1370260822328974, "learning_rate": 4.972764569543058e-05, "loss": 0.1725, "step": 1910 }, { "epoch": 0.22808267997148968, "grad_norm": 0.13960511102235468, "learning_rate": 4.972290559159126e-05, "loss": 0.1678, "step": 1920 }, { "epoch": 0.22927061059634118, "grad_norm": 0.12774920942086532, "learning_rate": 4.971812482386872e-05, "loss": 0.1732, "step": 1930 }, { "epoch": 0.2304585412211927, "grad_norm": 0.125715257215186, "learning_rate": 4.9713303400126274e-05, "loss": 0.1706, "step": 1940 }, { "epoch": 0.2316464718460442, "grad_norm": 0.12416122527569026, "learning_rate": 4.970844132829415e-05, "loss": 0.1699, "step": 1950 }, { "epoch": 0.2328344024708957, "grad_norm": 0.12187321375876654, "learning_rate": 4.97035386163694e-05, "loss": 0.169, "step": 1960 }, { "epoch": 0.23402233309574721, "grad_norm": 0.13451551025828778, "learning_rate": 4.969859527241596e-05, "loss": 0.1706, "step": 1970 }, { "epoch": 0.23521026372059872, "grad_norm": 0.12547497414994147, "learning_rate": 4.969361130456455e-05, "loss": 0.1687, "step": 1980 }, { "epoch": 0.23639819434545023, "grad_norm": 0.1353956633230934, "learning_rate": 4.968858672101274e-05, "loss": 0.1755, "step": 1990 }, { "epoch": 0.23758612497030174, "grad_norm": 0.13594232354732386, "learning_rate": 4.968352153002488e-05, "loss": 0.1697, "step": 2000 }, { "epoch": 0.23877405559515325, "grad_norm": 0.13259449803880705, "learning_rate": 4.967841573993214e-05, "loss": 0.1664, "step": 2010 }, { "epoch": 0.23996198622000475, "grad_norm": 0.11592865242636971, "learning_rate": 4.9673269359132435e-05, "loss": 0.1712, "step": 2020 }, { "epoch": 0.24114991684485626, "grad_norm": 0.1395904491383818, "learning_rate": 4.966808239609048e-05, "loss": 0.1732, "step": 2030 }, { "epoch": 0.24233784746970777, "grad_norm": 0.12437281972487449, "learning_rate": 4.9662854859337696e-05, "loss": 0.1701, "step": 2040 }, { "epoch": 0.24352577809455928, "grad_norm": 0.14262316805334, "learning_rate": 4.965758675747226e-05, "loss": 0.1711, "step": 2050 }, { "epoch": 0.24471370871941078, "grad_norm": 0.13071913834856103, "learning_rate": 4.9652278099159097e-05, "loss": 0.1685, "step": 2060 }, { "epoch": 0.2459016393442623, "grad_norm": 0.12183368203672362, "learning_rate": 4.96469288931298e-05, "loss": 0.1681, "step": 2070 }, { "epoch": 0.2470895699691138, "grad_norm": 0.11371553412800556, "learning_rate": 4.964153914818266e-05, "loss": 0.1677, "step": 2080 }, { "epoch": 0.2482775005939653, "grad_norm": 0.11834026934778352, "learning_rate": 4.963610887318265e-05, "loss": 0.1707, "step": 2090 }, { "epoch": 0.2494654312188168, "grad_norm": 0.12809818771564455, "learning_rate": 4.963063807706142e-05, "loss": 0.1683, "step": 2100 }, { "epoch": 0.2506533618436683, "grad_norm": 0.1337499377397155, "learning_rate": 4.962512676881725e-05, "loss": 0.1693, "step": 2110 }, { "epoch": 0.25184129246851983, "grad_norm": 0.12101663392193407, "learning_rate": 4.961957495751508e-05, "loss": 0.1637, "step": 2120 }, { "epoch": 0.25302922309337134, "grad_norm": 0.12817896168780796, "learning_rate": 4.961398265228642e-05, "loss": 0.1701, "step": 2130 }, { "epoch": 0.25421715371822284, "grad_norm": 0.1429938557695763, "learning_rate": 4.960834986232943e-05, "loss": 0.1634, "step": 2140 }, { "epoch": 0.25540508434307435, "grad_norm": 0.11988575894881831, "learning_rate": 4.960267659690885e-05, "loss": 0.1671, "step": 2150 }, { "epoch": 0.25659301496792586, "grad_norm": 0.11793766414136837, "learning_rate": 4.959696286535598e-05, "loss": 0.1669, "step": 2160 }, { "epoch": 0.25778094559277737, "grad_norm": 0.11822161352414833, "learning_rate": 4.959120867706867e-05, "loss": 0.1692, "step": 2170 }, { "epoch": 0.2589688762176289, "grad_norm": 0.12898836490182855, "learning_rate": 4.958541404151135e-05, "loss": 0.1671, "step": 2180 }, { "epoch": 0.2601568068424804, "grad_norm": 0.1261286465104795, "learning_rate": 4.957957896821494e-05, "loss": 0.1718, "step": 2190 }, { "epoch": 0.2613447374673319, "grad_norm": 0.12820970058436473, "learning_rate": 4.957370346677688e-05, "loss": 0.1641, "step": 2200 }, { "epoch": 0.2625326680921834, "grad_norm": 0.12020422204966766, "learning_rate": 4.9567787546861135e-05, "loss": 0.1675, "step": 2210 }, { "epoch": 0.2637205987170349, "grad_norm": 0.16927165143640283, "learning_rate": 4.95618312181981e-05, "loss": 0.1668, "step": 2220 }, { "epoch": 0.2649085293418864, "grad_norm": 0.11835855848760432, "learning_rate": 4.9555834490584675e-05, "loss": 0.1674, "step": 2230 }, { "epoch": 0.2660964599667379, "grad_norm": 0.12830980714353143, "learning_rate": 4.954979737388419e-05, "loss": 0.1664, "step": 2240 }, { "epoch": 0.2672843905915894, "grad_norm": 0.12906825354653398, "learning_rate": 4.954371987802641e-05, "loss": 0.1647, "step": 2250 }, { "epoch": 0.26847232121644093, "grad_norm": 0.12019262369148721, "learning_rate": 4.95376020130075e-05, "loss": 0.1648, "step": 2260 }, { "epoch": 0.26966025184129244, "grad_norm": 0.12465622613186758, "learning_rate": 4.9531443788890054e-05, "loss": 0.1706, "step": 2270 }, { "epoch": 0.27084818246614395, "grad_norm": 0.1359357557613979, "learning_rate": 4.952524521580302e-05, "loss": 0.1714, "step": 2280 }, { "epoch": 0.2720361130909955, "grad_norm": 0.1176453098734514, "learning_rate": 4.951900630394173e-05, "loss": 0.1687, "step": 2290 }, { "epoch": 0.273224043715847, "grad_norm": 0.13135679485367857, "learning_rate": 4.9512727063567856e-05, "loss": 0.1676, "step": 2300 }, { "epoch": 0.27441197434069853, "grad_norm": 0.12035260975831834, "learning_rate": 4.95064075050094e-05, "loss": 0.1698, "step": 2310 }, { "epoch": 0.27559990496555004, "grad_norm": 0.11998505530699825, "learning_rate": 4.950004763866069e-05, "loss": 0.1686, "step": 2320 }, { "epoch": 0.27678783559040154, "grad_norm": 0.12120271899707427, "learning_rate": 4.949364747498233e-05, "loss": 0.1659, "step": 2330 }, { "epoch": 0.27797576621525305, "grad_norm": 0.12193527736416142, "learning_rate": 4.9487207024501236e-05, "loss": 0.1636, "step": 2340 }, { "epoch": 0.27916369684010456, "grad_norm": 0.1209862342343259, "learning_rate": 4.948072629781056e-05, "loss": 0.1642, "step": 2350 }, { "epoch": 0.28035162746495607, "grad_norm": 0.12574122752480124, "learning_rate": 4.947420530556969e-05, "loss": 0.1658, "step": 2360 }, { "epoch": 0.2815395580898076, "grad_norm": 0.1140681237640739, "learning_rate": 4.9467644058504295e-05, "loss": 0.1635, "step": 2370 }, { "epoch": 0.2827274887146591, "grad_norm": 0.11938603661009778, "learning_rate": 4.9461042567406203e-05, "loss": 0.1666, "step": 2380 }, { "epoch": 0.2839154193395106, "grad_norm": 0.11880588026675264, "learning_rate": 4.945440084313345e-05, "loss": 0.1676, "step": 2390 }, { "epoch": 0.2851033499643621, "grad_norm": 0.11661442625877443, "learning_rate": 4.944771889661026e-05, "loss": 0.1697, "step": 2400 }, { "epoch": 0.2862912805892136, "grad_norm": 0.11680732191166936, "learning_rate": 4.9440996738826994e-05, "loss": 0.1658, "step": 2410 }, { "epoch": 0.2874792112140651, "grad_norm": 0.1165662942994535, "learning_rate": 4.943423438084017e-05, "loss": 0.164, "step": 2420 }, { "epoch": 0.2886671418389166, "grad_norm": 0.11244365288571985, "learning_rate": 4.942743183377241e-05, "loss": 0.1676, "step": 2430 }, { "epoch": 0.2898550724637681, "grad_norm": 0.144760725751456, "learning_rate": 4.942058910881246e-05, "loss": 0.1678, "step": 2440 }, { "epoch": 0.29104300308861963, "grad_norm": 0.11499765779333442, "learning_rate": 4.941370621721511e-05, "loss": 0.1662, "step": 2450 }, { "epoch": 0.29223093371347114, "grad_norm": 0.13418908618905104, "learning_rate": 4.940678317030127e-05, "loss": 0.1669, "step": 2460 }, { "epoch": 0.29341886433832265, "grad_norm": 0.11759569072780972, "learning_rate": 4.9399819979457854e-05, "loss": 0.1704, "step": 2470 }, { "epoch": 0.29460679496317416, "grad_norm": 0.11850129542623435, "learning_rate": 4.9392816656137826e-05, "loss": 0.1635, "step": 2480 }, { "epoch": 0.29579472558802566, "grad_norm": 0.11832578571866408, "learning_rate": 4.938577321186014e-05, "loss": 0.1679, "step": 2490 }, { "epoch": 0.29698265621287717, "grad_norm": 0.12983884242314456, "learning_rate": 4.937868965820977e-05, "loss": 0.1672, "step": 2500 }, { "epoch": 0.2981705868377287, "grad_norm": 0.12470316824671664, "learning_rate": 4.937156600683764e-05, "loss": 0.1669, "step": 2510 }, { "epoch": 0.2993585174625802, "grad_norm": 0.1237666940295697, "learning_rate": 4.936440226946063e-05, "loss": 0.1609, "step": 2520 }, { "epoch": 0.3005464480874317, "grad_norm": 0.11553136138087797, "learning_rate": 4.935719845786154e-05, "loss": 0.1676, "step": 2530 }, { "epoch": 0.3017343787122832, "grad_norm": 0.11910835650388192, "learning_rate": 4.934995458388911e-05, "loss": 0.1626, "step": 2540 }, { "epoch": 0.3029223093371347, "grad_norm": 0.11695135928448529, "learning_rate": 4.934267065945797e-05, "loss": 0.1607, "step": 2550 }, { "epoch": 0.3041102399619862, "grad_norm": 0.11843574033623183, "learning_rate": 4.933534669654859e-05, "loss": 0.1674, "step": 2560 }, { "epoch": 0.3052981705868377, "grad_norm": 0.11566167132139951, "learning_rate": 4.932798270720734e-05, "loss": 0.1674, "step": 2570 }, { "epoch": 0.30648610121168923, "grad_norm": 0.11094547861531937, "learning_rate": 4.9320578703546396e-05, "loss": 0.1638, "step": 2580 }, { "epoch": 0.30767403183654074, "grad_norm": 0.12745526570042726, "learning_rate": 4.9313134697743756e-05, "loss": 0.1617, "step": 2590 }, { "epoch": 0.30886196246139225, "grad_norm": 0.11073297065473978, "learning_rate": 4.930565070204323e-05, "loss": 0.1653, "step": 2600 }, { "epoch": 0.31004989308624376, "grad_norm": 0.12289834586339737, "learning_rate": 4.9298126728754364e-05, "loss": 0.1685, "step": 2610 }, { "epoch": 0.31123782371109526, "grad_norm": 0.1195335137368077, "learning_rate": 4.9290562790252495e-05, "loss": 0.1642, "step": 2620 }, { "epoch": 0.31242575433594677, "grad_norm": 0.1179990014694022, "learning_rate": 4.928295889897869e-05, "loss": 0.1663, "step": 2630 }, { "epoch": 0.3136136849607983, "grad_norm": 0.10668951372352378, "learning_rate": 4.9275315067439706e-05, "loss": 0.1665, "step": 2640 }, { "epoch": 0.3148016155856498, "grad_norm": 0.1179204349635216, "learning_rate": 4.926763130820801e-05, "loss": 0.1646, "step": 2650 }, { "epoch": 0.3159895462105013, "grad_norm": 0.1177085768334525, "learning_rate": 4.925990763392175e-05, "loss": 0.1617, "step": 2660 }, { "epoch": 0.3171774768353528, "grad_norm": 0.12166252667054489, "learning_rate": 4.925214405728471e-05, "loss": 0.1642, "step": 2670 }, { "epoch": 0.3183654074602043, "grad_norm": 0.11187062293231036, "learning_rate": 4.9244340591066314e-05, "loss": 0.1608, "step": 2680 }, { "epoch": 0.3195533380850558, "grad_norm": 0.1169187530211736, "learning_rate": 4.923649724810158e-05, "loss": 0.1649, "step": 2690 }, { "epoch": 0.3207412687099073, "grad_norm": 0.11855986220577831, "learning_rate": 4.9228614041291145e-05, "loss": 0.1667, "step": 2700 }, { "epoch": 0.32192919933475883, "grad_norm": 0.12845385228572642, "learning_rate": 4.9220690983601174e-05, "loss": 0.1638, "step": 2710 }, { "epoch": 0.32311712995961034, "grad_norm": 0.11924282845827594, "learning_rate": 4.921272808806342e-05, "loss": 0.1654, "step": 2720 }, { "epoch": 0.32430506058446185, "grad_norm": 0.11520285725362303, "learning_rate": 4.920472536777512e-05, "loss": 0.1603, "step": 2730 }, { "epoch": 0.32549299120931335, "grad_norm": 0.11786868438389068, "learning_rate": 4.919668283589905e-05, "loss": 0.1659, "step": 2740 }, { "epoch": 0.32668092183416486, "grad_norm": 0.1227571231906244, "learning_rate": 4.9188600505663455e-05, "loss": 0.1649, "step": 2750 }, { "epoch": 0.32786885245901637, "grad_norm": 0.11891699367075527, "learning_rate": 4.9180478390362026e-05, "loss": 0.1658, "step": 2760 }, { "epoch": 0.3290567830838679, "grad_norm": 0.12425229703991726, "learning_rate": 4.917231650335391e-05, "loss": 0.1685, "step": 2770 }, { "epoch": 0.3302447137087194, "grad_norm": 0.13809551038832812, "learning_rate": 4.9164114858063645e-05, "loss": 0.1646, "step": 2780 }, { "epoch": 0.33143264433357095, "grad_norm": 0.11449975498148422, "learning_rate": 4.9155873467981205e-05, "loss": 0.1639, "step": 2790 }, { "epoch": 0.33262057495842245, "grad_norm": 0.1153812562783787, "learning_rate": 4.9147592346661896e-05, "loss": 0.1638, "step": 2800 }, { "epoch": 0.33380850558327396, "grad_norm": 0.11735516139035142, "learning_rate": 4.91392715077264e-05, "loss": 0.1602, "step": 2810 }, { "epoch": 0.33499643620812547, "grad_norm": 0.11464488991100508, "learning_rate": 4.91309109648607e-05, "loss": 0.165, "step": 2820 }, { "epoch": 0.336184366832977, "grad_norm": 0.12183893719346385, "learning_rate": 4.912251073181611e-05, "loss": 0.1638, "step": 2830 }, { "epoch": 0.3373722974578285, "grad_norm": 0.11143463444639552, "learning_rate": 4.91140708224092e-05, "loss": 0.1638, "step": 2840 }, { "epoch": 0.33856022808268, "grad_norm": 0.1198959779813086, "learning_rate": 4.9105591250521834e-05, "loss": 0.1675, "step": 2850 }, { "epoch": 0.3397481587075315, "grad_norm": 0.1184324944784931, "learning_rate": 4.909707203010107e-05, "loss": 0.1659, "step": 2860 }, { "epoch": 0.340936089332383, "grad_norm": 0.11723179934254115, "learning_rate": 4.908851317515921e-05, "loss": 0.159, "step": 2870 }, { "epoch": 0.3421240199572345, "grad_norm": 0.11832528620855987, "learning_rate": 4.907991469977373e-05, "loss": 0.1608, "step": 2880 }, { "epoch": 0.343311950582086, "grad_norm": 0.11511710162159076, "learning_rate": 4.90712766180873e-05, "loss": 0.1642, "step": 2890 }, { "epoch": 0.34449988120693753, "grad_norm": 0.11319390546521237, "learning_rate": 4.906259894430769e-05, "loss": 0.1653, "step": 2900 }, { "epoch": 0.34568781183178904, "grad_norm": 0.12621554849051025, "learning_rate": 4.905388169270782e-05, "loss": 0.161, "step": 2910 }, { "epoch": 0.34687574245664055, "grad_norm": 0.11090975880561871, "learning_rate": 4.904512487762572e-05, "loss": 0.1606, "step": 2920 }, { "epoch": 0.34806367308149205, "grad_norm": 0.11136408910235625, "learning_rate": 4.903632851346445e-05, "loss": 0.1597, "step": 2930 }, { "epoch": 0.34925160370634356, "grad_norm": 0.1126317197026116, "learning_rate": 4.902749261469216e-05, "loss": 0.162, "step": 2940 }, { "epoch": 0.35043953433119507, "grad_norm": 0.11619357558544803, "learning_rate": 4.9018617195842e-05, "loss": 0.1654, "step": 2950 }, { "epoch": 0.3516274649560466, "grad_norm": 0.10966302453496642, "learning_rate": 4.9009702271512134e-05, "loss": 0.1618, "step": 2960 }, { "epoch": 0.3528153955808981, "grad_norm": 0.12281724719249981, "learning_rate": 4.900074785636572e-05, "loss": 0.1691, "step": 2970 }, { "epoch": 0.3540033262057496, "grad_norm": 0.12245458405259024, "learning_rate": 4.899175396513083e-05, "loss": 0.1627, "step": 2980 }, { "epoch": 0.3551912568306011, "grad_norm": 0.12270643564143918, "learning_rate": 4.89827206126005e-05, "loss": 0.164, "step": 2990 }, { "epoch": 0.3563791874554526, "grad_norm": 0.11324708570578644, "learning_rate": 4.897364781363266e-05, "loss": 0.164, "step": 3000 }, { "epoch": 0.3575671180803041, "grad_norm": 0.10739493626630099, "learning_rate": 4.896453558315011e-05, "loss": 0.1634, "step": 3010 }, { "epoch": 0.3587550487051556, "grad_norm": 0.11752165813455126, "learning_rate": 4.895538393614053e-05, "loss": 0.1663, "step": 3020 }, { "epoch": 0.35994297933000713, "grad_norm": 0.11407430511988358, "learning_rate": 4.894619288765642e-05, "loss": 0.1653, "step": 3030 }, { "epoch": 0.36113090995485864, "grad_norm": 0.11174275382448715, "learning_rate": 4.8936962452815064e-05, "loss": 0.1614, "step": 3040 }, { "epoch": 0.36231884057971014, "grad_norm": 0.1321690032445373, "learning_rate": 4.8927692646798574e-05, "loss": 0.1643, "step": 3050 }, { "epoch": 0.36350677120456165, "grad_norm": 0.10871079111384568, "learning_rate": 4.8918383484853784e-05, "loss": 0.1612, "step": 3060 }, { "epoch": 0.36469470182941316, "grad_norm": 0.11687822370948737, "learning_rate": 4.890903498229228e-05, "loss": 0.1615, "step": 3070 }, { "epoch": 0.36588263245426467, "grad_norm": 0.10772429542959276, "learning_rate": 4.889964715449033e-05, "loss": 0.1623, "step": 3080 }, { "epoch": 0.3670705630791162, "grad_norm": 0.10654696006546763, "learning_rate": 4.889022001688891e-05, "loss": 0.1674, "step": 3090 }, { "epoch": 0.3682584937039677, "grad_norm": 0.11394898817732017, "learning_rate": 4.888075358499364e-05, "loss": 0.1625, "step": 3100 }, { "epoch": 0.3694464243288192, "grad_norm": 0.11842148094307195, "learning_rate": 4.887124787437478e-05, "loss": 0.1671, "step": 3110 }, { "epoch": 0.3706343549536707, "grad_norm": 0.11846036332756636, "learning_rate": 4.8861702900667174e-05, "loss": 0.1642, "step": 3120 }, { "epoch": 0.3718222855785222, "grad_norm": 0.11253903213073212, "learning_rate": 4.8852118679570267e-05, "loss": 0.1631, "step": 3130 }, { "epoch": 0.3730102162033737, "grad_norm": 0.12372170621579044, "learning_rate": 4.884249522684805e-05, "loss": 0.165, "step": 3140 }, { "epoch": 0.3741981468282252, "grad_norm": 0.11530345601562467, "learning_rate": 4.883283255832904e-05, "loss": 0.165, "step": 3150 }, { "epoch": 0.3753860774530767, "grad_norm": 0.11039841045022361, "learning_rate": 4.882313068990625e-05, "loss": 0.1598, "step": 3160 }, { "epoch": 0.37657400807792824, "grad_norm": 0.11325732486392462, "learning_rate": 4.8813389637537175e-05, "loss": 0.1607, "step": 3170 }, { "epoch": 0.37776193870277974, "grad_norm": 0.11287388015833345, "learning_rate": 4.880360941724378e-05, "loss": 0.1623, "step": 3180 }, { "epoch": 0.37894986932763125, "grad_norm": 0.11480417818557334, "learning_rate": 4.879379004511241e-05, "loss": 0.1627, "step": 3190 }, { "epoch": 0.38013779995248276, "grad_norm": 0.11050537620963752, "learning_rate": 4.878393153729383e-05, "loss": 0.1648, "step": 3200 }, { "epoch": 0.38132573057733427, "grad_norm": 0.12561906031825926, "learning_rate": 4.877403391000318e-05, "loss": 0.1595, "step": 3210 }, { "epoch": 0.3825136612021858, "grad_norm": 0.11410001759326929, "learning_rate": 4.876409717951994e-05, "loss": 0.1639, "step": 3220 }, { "epoch": 0.3837015918270373, "grad_norm": 0.11276914379222647, "learning_rate": 4.87541213621879e-05, "loss": 0.1613, "step": 3230 }, { "epoch": 0.3848895224518888, "grad_norm": 0.11644978046901808, "learning_rate": 4.8744106474415134e-05, "loss": 0.1595, "step": 3240 }, { "epoch": 0.3860774530767403, "grad_norm": 0.1147135057784953, "learning_rate": 4.8734052532674e-05, "loss": 0.1613, "step": 3250 }, { "epoch": 0.3872653837015918, "grad_norm": 0.12011900853430277, "learning_rate": 4.872395955350106e-05, "loss": 0.1594, "step": 3260 }, { "epoch": 0.3884533143264433, "grad_norm": 0.10951731938292678, "learning_rate": 4.871382755349712e-05, "loss": 0.1609, "step": 3270 }, { "epoch": 0.3896412449512948, "grad_norm": 0.12355116177662707, "learning_rate": 4.870365654932716e-05, "loss": 0.1613, "step": 3280 }, { "epoch": 0.3908291755761463, "grad_norm": 0.1191613334847591, "learning_rate": 4.869344655772027e-05, "loss": 0.161, "step": 3290 }, { "epoch": 0.3920171062009979, "grad_norm": 0.11064369140030765, "learning_rate": 4.868319759546972e-05, "loss": 0.1627, "step": 3300 }, { "epoch": 0.3932050368258494, "grad_norm": 0.10997971449022427, "learning_rate": 4.8672909679432846e-05, "loss": 0.1561, "step": 3310 }, { "epoch": 0.3943929674507009, "grad_norm": 0.1192981165342182, "learning_rate": 4.866258282653108e-05, "loss": 0.1578, "step": 3320 }, { "epoch": 0.3955808980755524, "grad_norm": 0.1077331898742313, "learning_rate": 4.865221705374987e-05, "loss": 0.161, "step": 3330 }, { "epoch": 0.3967688287004039, "grad_norm": 0.11098272517169397, "learning_rate": 4.8641812378138695e-05, "loss": 0.1637, "step": 3340 }, { "epoch": 0.3979567593252554, "grad_norm": 0.11359482825533, "learning_rate": 4.863136881681103e-05, "loss": 0.1634, "step": 3350 }, { "epoch": 0.39914468995010693, "grad_norm": 0.11466885252217508, "learning_rate": 4.862088638694428e-05, "loss": 0.1596, "step": 3360 }, { "epoch": 0.40033262057495844, "grad_norm": 0.11354561908991577, "learning_rate": 4.8610365105779794e-05, "loss": 0.1605, "step": 3370 }, { "epoch": 0.40152055119980995, "grad_norm": 0.11472234443676237, "learning_rate": 4.859980499062283e-05, "loss": 0.1585, "step": 3380 }, { "epoch": 0.40270848182466146, "grad_norm": 0.11197049806166733, "learning_rate": 4.858920605884253e-05, "loss": 0.164, "step": 3390 }, { "epoch": 0.40389641244951296, "grad_norm": 0.1128838500124921, "learning_rate": 4.857856832787185e-05, "loss": 0.1648, "step": 3400 }, { "epoch": 0.4050843430743645, "grad_norm": 0.10444549744290164, "learning_rate": 4.856789181520759e-05, "loss": 0.1634, "step": 3410 }, { "epoch": 0.406272273699216, "grad_norm": 0.10466449113871898, "learning_rate": 4.855717653841031e-05, "loss": 0.1616, "step": 3420 }, { "epoch": 0.4074602043240675, "grad_norm": 0.11575366612800406, "learning_rate": 4.854642251510435e-05, "loss": 0.1618, "step": 3430 }, { "epoch": 0.408648134948919, "grad_norm": 0.11583270953167271, "learning_rate": 4.8535629762977784e-05, "loss": 0.1637, "step": 3440 }, { "epoch": 0.4098360655737705, "grad_norm": 0.11256085886327093, "learning_rate": 4.852479829978237e-05, "loss": 0.1604, "step": 3450 }, { "epoch": 0.411023996198622, "grad_norm": 0.11186861087311903, "learning_rate": 4.851392814333354e-05, "loss": 0.1642, "step": 3460 }, { "epoch": 0.4122119268234735, "grad_norm": 0.10762723333107645, "learning_rate": 4.850301931151039e-05, "loss": 0.1595, "step": 3470 }, { "epoch": 0.413399857448325, "grad_norm": 0.11281332125145285, "learning_rate": 4.849207182225559e-05, "loss": 0.1625, "step": 3480 }, { "epoch": 0.41458778807317653, "grad_norm": 0.12167051129595687, "learning_rate": 4.848108569357544e-05, "loss": 0.164, "step": 3490 }, { "epoch": 0.41577571869802804, "grad_norm": 0.10273906090501332, "learning_rate": 4.8470060943539733e-05, "loss": 0.1586, "step": 3500 }, { "epoch": 0.41696364932287955, "grad_norm": 0.10622489322786707, "learning_rate": 4.845899759028184e-05, "loss": 0.1621, "step": 3510 }, { "epoch": 0.41815157994773106, "grad_norm": 0.11635582035435428, "learning_rate": 4.844789565199862e-05, "loss": 0.1618, "step": 3520 }, { "epoch": 0.41933951057258256, "grad_norm": 0.1047510028800329, "learning_rate": 4.8436755146950364e-05, "loss": 0.1602, "step": 3530 }, { "epoch": 0.42052744119743407, "grad_norm": 0.10603895092820655, "learning_rate": 4.842557609346082e-05, "loss": 0.1607, "step": 3540 }, { "epoch": 0.4217153718222856, "grad_norm": 0.10437036916879769, "learning_rate": 4.841435850991714e-05, "loss": 0.1638, "step": 3550 }, { "epoch": 0.4229033024471371, "grad_norm": 0.10557382763380849, "learning_rate": 4.840310241476984e-05, "loss": 0.1596, "step": 3560 }, { "epoch": 0.4240912330719886, "grad_norm": 0.11476779169831051, "learning_rate": 4.83918078265328e-05, "loss": 0.1591, "step": 3570 }, { "epoch": 0.4252791636968401, "grad_norm": 0.10848374335216221, "learning_rate": 4.838047476378318e-05, "loss": 0.1583, "step": 3580 }, { "epoch": 0.4264670943216916, "grad_norm": 0.11174054541122111, "learning_rate": 4.836910324516145e-05, "loss": 0.1583, "step": 3590 }, { "epoch": 0.4276550249465431, "grad_norm": 0.11533663430411428, "learning_rate": 4.835769328937131e-05, "loss": 0.1614, "step": 3600 }, { "epoch": 0.4288429555713946, "grad_norm": 0.12341802552272693, "learning_rate": 4.834624491517971e-05, "loss": 0.1581, "step": 3610 }, { "epoch": 0.43003088619624613, "grad_norm": 0.11954203211681229, "learning_rate": 4.833475814141677e-05, "loss": 0.1633, "step": 3620 }, { "epoch": 0.43121881682109764, "grad_norm": 0.11248448567841887, "learning_rate": 4.832323298697576e-05, "loss": 0.1558, "step": 3630 }, { "epoch": 0.43240674744594915, "grad_norm": 0.1022643022937346, "learning_rate": 4.8311669470813095e-05, "loss": 0.1613, "step": 3640 }, { "epoch": 0.43359467807080065, "grad_norm": 0.11617880153741245, "learning_rate": 4.830006761194828e-05, "loss": 0.1592, "step": 3650 }, { "epoch": 0.43478260869565216, "grad_norm": 0.109408796544848, "learning_rate": 4.828842742946391e-05, "loss": 0.1598, "step": 3660 }, { "epoch": 0.43597053932050367, "grad_norm": 0.11805566006311585, "learning_rate": 4.827674894250556e-05, "loss": 0.1581, "step": 3670 }, { "epoch": 0.4371584699453552, "grad_norm": 0.11133604336378554, "learning_rate": 4.826503217028185e-05, "loss": 0.1574, "step": 3680 }, { "epoch": 0.4383464005702067, "grad_norm": 0.11610059244434347, "learning_rate": 4.825327713206437e-05, "loss": 0.1624, "step": 3690 }, { "epoch": 0.4395343311950582, "grad_norm": 0.10749364519012104, "learning_rate": 4.824148384718763e-05, "loss": 0.1582, "step": 3700 }, { "epoch": 0.4407222618199097, "grad_norm": 0.11250381894725146, "learning_rate": 4.822965233504905e-05, "loss": 0.159, "step": 3710 }, { "epoch": 0.4419101924447612, "grad_norm": 0.10571932997790255, "learning_rate": 4.821778261510895e-05, "loss": 0.1596, "step": 3720 }, { "epoch": 0.4430981230696127, "grad_norm": 0.11297751859241782, "learning_rate": 4.8205874706890466e-05, "loss": 0.1574, "step": 3730 }, { "epoch": 0.4442860536944642, "grad_norm": 0.11579560100556308, "learning_rate": 4.819392862997956e-05, "loss": 0.1636, "step": 3740 }, { "epoch": 0.44547398431931573, "grad_norm": 0.11973604085918033, "learning_rate": 4.818194440402496e-05, "loss": 0.1604, "step": 3750 }, { "epoch": 0.44666191494416724, "grad_norm": 0.10975125412318944, "learning_rate": 4.816992204873816e-05, "loss": 0.1598, "step": 3760 }, { "epoch": 0.44784984556901875, "grad_norm": 0.10549821652089367, "learning_rate": 4.815786158389336e-05, "loss": 0.1582, "step": 3770 }, { "epoch": 0.44903777619387025, "grad_norm": 0.11043040459360363, "learning_rate": 4.814576302932744e-05, "loss": 0.1601, "step": 3780 }, { "epoch": 0.45022570681872176, "grad_norm": 0.11261633874191385, "learning_rate": 4.8133626404939924e-05, "loss": 0.1554, "step": 3790 }, { "epoch": 0.45141363744357327, "grad_norm": 0.11269206315822486, "learning_rate": 4.8121451730692954e-05, "loss": 0.1591, "step": 3800 }, { "epoch": 0.45260156806842483, "grad_norm": 0.10002425408069793, "learning_rate": 4.810923902661128e-05, "loss": 0.1619, "step": 3810 }, { "epoch": 0.45378949869327634, "grad_norm": 0.11107603222878472, "learning_rate": 4.8096988312782174e-05, "loss": 0.1584, "step": 3820 }, { "epoch": 0.45497742931812785, "grad_norm": 0.1029481043753534, "learning_rate": 4.8084699609355436e-05, "loss": 0.1559, "step": 3830 }, { "epoch": 0.45616535994297935, "grad_norm": 0.11108456387845299, "learning_rate": 4.807237293654334e-05, "loss": 0.1619, "step": 3840 }, { "epoch": 0.45735329056783086, "grad_norm": 0.11375942333307101, "learning_rate": 4.806000831462063e-05, "loss": 0.157, "step": 3850 }, { "epoch": 0.45854122119268237, "grad_norm": 0.10797499559032005, "learning_rate": 4.804760576392448e-05, "loss": 0.1561, "step": 3860 }, { "epoch": 0.4597291518175339, "grad_norm": 0.10233135993205879, "learning_rate": 4.803516530485439e-05, "loss": 0.1608, "step": 3870 }, { "epoch": 0.4609170824423854, "grad_norm": 0.10089163445967854, "learning_rate": 4.802268695787228e-05, "loss": 0.1568, "step": 3880 }, { "epoch": 0.4621050130672369, "grad_norm": 0.10755863252628718, "learning_rate": 4.801017074350235e-05, "loss": 0.1589, "step": 3890 }, { "epoch": 0.4632929436920884, "grad_norm": 0.11022605806812014, "learning_rate": 4.7997616682331084e-05, "loss": 0.1585, "step": 3900 }, { "epoch": 0.4644808743169399, "grad_norm": 0.11147844682370243, "learning_rate": 4.7985024795007236e-05, "loss": 0.1598, "step": 3910 }, { "epoch": 0.4656688049417914, "grad_norm": 0.10769532079996631, "learning_rate": 4.797239510224175e-05, "loss": 0.1593, "step": 3920 }, { "epoch": 0.4668567355666429, "grad_norm": 0.11087644020091982, "learning_rate": 4.795972762480777e-05, "loss": 0.163, "step": 3930 }, { "epoch": 0.46804466619149443, "grad_norm": 0.10973835708302715, "learning_rate": 4.7947022383540585e-05, "loss": 0.1592, "step": 3940 }, { "epoch": 0.46923259681634594, "grad_norm": 0.13094235188402167, "learning_rate": 4.793427939933759e-05, "loss": 0.1643, "step": 3950 }, { "epoch": 0.47042052744119744, "grad_norm": 0.12321762726523605, "learning_rate": 4.792149869315827e-05, "loss": 0.162, "step": 3960 }, { "epoch": 0.47160845806604895, "grad_norm": 0.10559952177664443, "learning_rate": 4.7908680286024144e-05, "loss": 0.1559, "step": 3970 }, { "epoch": 0.47279638869090046, "grad_norm": 0.10673544179062881, "learning_rate": 4.789582419901875e-05, "loss": 0.1566, "step": 3980 }, { "epoch": 0.47398431931575197, "grad_norm": 0.10661741084626987, "learning_rate": 4.788293045328759e-05, "loss": 0.1575, "step": 3990 }, { "epoch": 0.4751722499406035, "grad_norm": 0.10479608612970076, "learning_rate": 4.786999907003812e-05, "loss": 0.1599, "step": 4000 }, { "epoch": 0.476360180565455, "grad_norm": 0.11099548587083514, "learning_rate": 4.785703007053969e-05, "loss": 0.1597, "step": 4010 }, { "epoch": 0.4775481111903065, "grad_norm": 0.1123104176877015, "learning_rate": 4.7844023476123536e-05, "loss": 0.1544, "step": 4020 }, { "epoch": 0.478736041815158, "grad_norm": 0.09752345541026995, "learning_rate": 4.783097930818271e-05, "loss": 0.158, "step": 4030 }, { "epoch": 0.4799239724400095, "grad_norm": 0.10345454631400362, "learning_rate": 4.781789758817207e-05, "loss": 0.1567, "step": 4040 }, { "epoch": 0.481111903064861, "grad_norm": 0.10729796677818869, "learning_rate": 4.780477833760825e-05, "loss": 0.1605, "step": 4050 }, { "epoch": 0.4822998336897125, "grad_norm": 0.1012543145571688, "learning_rate": 4.779162157806961e-05, "loss": 0.1605, "step": 4060 }, { "epoch": 0.48348776431456403, "grad_norm": 0.1161783450703404, "learning_rate": 4.7778427331196195e-05, "loss": 0.1593, "step": 4070 }, { "epoch": 0.48467569493941554, "grad_norm": 0.11590855935869485, "learning_rate": 4.7765195618689705e-05, "loss": 0.1593, "step": 4080 }, { "epoch": 0.48586362556426704, "grad_norm": 0.10453598650616769, "learning_rate": 4.7751926462313493e-05, "loss": 0.1602, "step": 4090 }, { "epoch": 0.48705155618911855, "grad_norm": 0.11607454897657873, "learning_rate": 4.773861988389246e-05, "loss": 0.1563, "step": 4100 }, { "epoch": 0.48823948681397006, "grad_norm": 0.10519909713949187, "learning_rate": 4.7725275905313104e-05, "loss": 0.1553, "step": 4110 }, { "epoch": 0.48942741743882157, "grad_norm": 0.11250896909455539, "learning_rate": 4.771189454852338e-05, "loss": 0.159, "step": 4120 }, { "epoch": 0.4906153480636731, "grad_norm": 0.1065115005421147, "learning_rate": 4.769847583553276e-05, "loss": 0.1583, "step": 4130 }, { "epoch": 0.4918032786885246, "grad_norm": 0.10460033396516155, "learning_rate": 4.768501978841217e-05, "loss": 0.1562, "step": 4140 }, { "epoch": 0.4929912093133761, "grad_norm": 0.10355950237701753, "learning_rate": 4.76715264292939e-05, "loss": 0.1572, "step": 4150 }, { "epoch": 0.4941791399382276, "grad_norm": 0.11560340850930247, "learning_rate": 4.7657995780371654e-05, "loss": 0.1573, "step": 4160 }, { "epoch": 0.4953670705630791, "grad_norm": 0.10196496691640149, "learning_rate": 4.7644427863900444e-05, "loss": 0.1616, "step": 4170 }, { "epoch": 0.4965550011879306, "grad_norm": 0.1060231677737427, "learning_rate": 4.7630822702196586e-05, "loss": 0.1562, "step": 4180 }, { "epoch": 0.4977429318127821, "grad_norm": 0.11251688500389763, "learning_rate": 4.7617180317637654e-05, "loss": 0.1572, "step": 4190 }, { "epoch": 0.4989308624376336, "grad_norm": 0.10718579519199224, "learning_rate": 4.760350073266245e-05, "loss": 0.1579, "step": 4200 }, { "epoch": 0.5001187930624852, "grad_norm": 0.11212336844251224, "learning_rate": 4.7589783969770965e-05, "loss": 0.1615, "step": 4210 }, { "epoch": 0.5013067236873366, "grad_norm": 0.10556343871540011, "learning_rate": 4.7576030051524325e-05, "loss": 0.1586, "step": 4220 }, { "epoch": 0.5024946543121882, "grad_norm": 0.11016139562040811, "learning_rate": 4.756223900054479e-05, "loss": 0.1577, "step": 4230 }, { "epoch": 0.5036825849370397, "grad_norm": 0.10745735387618335, "learning_rate": 4.754841083951568e-05, "loss": 0.1546, "step": 4240 }, { "epoch": 0.5048705155618912, "grad_norm": 0.10377336042555621, "learning_rate": 4.753454559118135e-05, "loss": 0.1599, "step": 4250 }, { "epoch": 0.5060584461867427, "grad_norm": 0.10211302412418251, "learning_rate": 4.752064327834718e-05, "loss": 0.1569, "step": 4260 }, { "epoch": 0.5072463768115942, "grad_norm": 0.11289258762020478, "learning_rate": 4.750670392387948e-05, "loss": 0.1561, "step": 4270 }, { "epoch": 0.5084343074364457, "grad_norm": 0.10072433442939775, "learning_rate": 4.749272755070552e-05, "loss": 0.155, "step": 4280 }, { "epoch": 0.5096222380612973, "grad_norm": 0.11674142601521431, "learning_rate": 4.747871418181341e-05, "loss": 0.1578, "step": 4290 }, { "epoch": 0.5108101686861487, "grad_norm": 0.10873260470671811, "learning_rate": 4.746466384025217e-05, "loss": 0.16, "step": 4300 }, { "epoch": 0.5119980993110003, "grad_norm": 0.10291741698146538, "learning_rate": 4.7450576549131585e-05, "loss": 0.1551, "step": 4310 }, { "epoch": 0.5131860299358517, "grad_norm": 0.10134798960345162, "learning_rate": 4.7436452331622236e-05, "loss": 0.1545, "step": 4320 }, { "epoch": 0.5143739605607033, "grad_norm": 0.11510662063051261, "learning_rate": 4.7422291210955436e-05, "loss": 0.154, "step": 4330 }, { "epoch": 0.5155618911855547, "grad_norm": 0.10842928018698075, "learning_rate": 4.740809321042319e-05, "loss": 0.1554, "step": 4340 }, { "epoch": 0.5167498218104063, "grad_norm": 0.11474704402911515, "learning_rate": 4.7393858353378173e-05, "loss": 0.1564, "step": 4350 }, { "epoch": 0.5179377524352577, "grad_norm": 0.11141459010437124, "learning_rate": 4.7379586663233673e-05, "loss": 0.1613, "step": 4360 }, { "epoch": 0.5191256830601093, "grad_norm": 0.10552169718897175, "learning_rate": 4.736527816346356e-05, "loss": 0.1574, "step": 4370 }, { "epoch": 0.5203136136849608, "grad_norm": 0.11328830755996376, "learning_rate": 4.735093287760225e-05, "loss": 0.1516, "step": 4380 }, { "epoch": 0.5215015443098123, "grad_norm": 0.11029660659304971, "learning_rate": 4.733655082924467e-05, "loss": 0.1581, "step": 4390 }, { "epoch": 0.5226894749346638, "grad_norm": 0.10939366163687551, "learning_rate": 4.732213204204622e-05, "loss": 0.1559, "step": 4400 }, { "epoch": 0.5238774055595153, "grad_norm": 0.11109435776105434, "learning_rate": 4.730767653972269e-05, "loss": 0.1567, "step": 4410 }, { "epoch": 0.5250653361843668, "grad_norm": 0.10627018625029279, "learning_rate": 4.7293184346050296e-05, "loss": 0.1563, "step": 4420 }, { "epoch": 0.5262532668092184, "grad_norm": 0.10628934859291067, "learning_rate": 4.7278655484865595e-05, "loss": 0.1587, "step": 4430 }, { "epoch": 0.5274411974340698, "grad_norm": 0.11042766133445335, "learning_rate": 4.726408998006545e-05, "loss": 0.1581, "step": 4440 }, { "epoch": 0.5286291280589214, "grad_norm": 0.10851879006596378, "learning_rate": 4.7249487855606994e-05, "loss": 0.156, "step": 4450 }, { "epoch": 0.5298170586837728, "grad_norm": 0.10578537392658523, "learning_rate": 4.723484913550759e-05, "loss": 0.1554, "step": 4460 }, { "epoch": 0.5310049893086244, "grad_norm": 0.11070749978438492, "learning_rate": 4.722017384384481e-05, "loss": 0.1575, "step": 4470 }, { "epoch": 0.5321929199334758, "grad_norm": 0.10746661330620033, "learning_rate": 4.720546200475635e-05, "loss": 0.1541, "step": 4480 }, { "epoch": 0.5333808505583274, "grad_norm": 0.10894575224517972, "learning_rate": 4.7190713642440044e-05, "loss": 0.1536, "step": 4490 }, { "epoch": 0.5345687811831789, "grad_norm": 0.10946679371304778, "learning_rate": 4.717592878115378e-05, "loss": 0.1587, "step": 4500 }, { "epoch": 0.5357567118080304, "grad_norm": 0.10449891659110647, "learning_rate": 4.716110744521548e-05, "loss": 0.1548, "step": 4510 }, { "epoch": 0.5369446424328819, "grad_norm": 0.1063724726860897, "learning_rate": 4.714624965900309e-05, "loss": 0.1604, "step": 4520 }, { "epoch": 0.5381325730577334, "grad_norm": 0.11255501010423644, "learning_rate": 4.7131355446954476e-05, "loss": 0.1548, "step": 4530 }, { "epoch": 0.5393205036825849, "grad_norm": 0.10174867020823623, "learning_rate": 4.711642483356742e-05, "loss": 0.151, "step": 4540 }, { "epoch": 0.5405084343074364, "grad_norm": 0.10779561113785395, "learning_rate": 4.710145784339958e-05, "loss": 0.1566, "step": 4550 }, { "epoch": 0.5416963649322879, "grad_norm": 0.10330321354665718, "learning_rate": 4.708645450106846e-05, "loss": 0.1529, "step": 4560 }, { "epoch": 0.5428842955571395, "grad_norm": 0.11395867082285413, "learning_rate": 4.707141483125133e-05, "loss": 0.1588, "step": 4570 }, { "epoch": 0.544072226181991, "grad_norm": 0.099920275801884, "learning_rate": 4.705633885868524e-05, "loss": 0.1542, "step": 4580 }, { "epoch": 0.5452601568068425, "grad_norm": 0.10416805582295353, "learning_rate": 4.704122660816692e-05, "loss": 0.1561, "step": 4590 }, { "epoch": 0.546448087431694, "grad_norm": 0.10475749527435091, "learning_rate": 4.7026078104552794e-05, "loss": 0.154, "step": 4600 }, { "epoch": 0.5476360180565455, "grad_norm": 0.10558137441108725, "learning_rate": 4.7010893372758914e-05, "loss": 0.1574, "step": 4610 }, { "epoch": 0.5488239486813971, "grad_norm": 0.10587451388623116, "learning_rate": 4.699567243776088e-05, "loss": 0.1568, "step": 4620 }, { "epoch": 0.5500118793062485, "grad_norm": 0.10862164731240488, "learning_rate": 4.6980415324593904e-05, "loss": 0.1564, "step": 4630 }, { "epoch": 0.5511998099311001, "grad_norm": 0.10965179742654597, "learning_rate": 4.696512205835265e-05, "loss": 0.1579, "step": 4640 }, { "epoch": 0.5523877405559515, "grad_norm": 0.10904919062805346, "learning_rate": 4.694979266419127e-05, "loss": 0.1518, "step": 4650 }, { "epoch": 0.5535756711808031, "grad_norm": 0.11180998336150468, "learning_rate": 4.693442716732333e-05, "loss": 0.1598, "step": 4660 }, { "epoch": 0.5547636018056545, "grad_norm": 0.10877909079318093, "learning_rate": 4.6919025593021784e-05, "loss": 0.1574, "step": 4670 }, { "epoch": 0.5559515324305061, "grad_norm": 0.10158185159623848, "learning_rate": 4.690358796661891e-05, "loss": 0.1535, "step": 4680 }, { "epoch": 0.5571394630553576, "grad_norm": 0.1039393745574548, "learning_rate": 4.688811431350632e-05, "loss": 0.1543, "step": 4690 }, { "epoch": 0.5583273936802091, "grad_norm": 0.11207410097220717, "learning_rate": 4.6872604659134836e-05, "loss": 0.16, "step": 4700 }, { "epoch": 0.5595153243050606, "grad_norm": 0.1099914893082099, "learning_rate": 4.6857059029014534e-05, "loss": 0.1568, "step": 4710 }, { "epoch": 0.5607032549299121, "grad_norm": 0.10500538436346325, "learning_rate": 4.684147744871463e-05, "loss": 0.156, "step": 4720 }, { "epoch": 0.5618911855547636, "grad_norm": 0.09617591580836773, "learning_rate": 4.6825859943863493e-05, "loss": 0.1554, "step": 4730 }, { "epoch": 0.5630791161796151, "grad_norm": 0.10703834512301233, "learning_rate": 4.681020654014858e-05, "loss": 0.1566, "step": 4740 }, { "epoch": 0.5642670468044666, "grad_norm": 0.10722259044535741, "learning_rate": 4.679451726331637e-05, "loss": 0.1552, "step": 4750 }, { "epoch": 0.5654549774293182, "grad_norm": 0.10933028762895931, "learning_rate": 4.677879213917237e-05, "loss": 0.1543, "step": 4760 }, { "epoch": 0.5666429080541696, "grad_norm": 0.10250918640323159, "learning_rate": 4.676303119358104e-05, "loss": 0.1575, "step": 4770 }, { "epoch": 0.5678308386790212, "grad_norm": 0.10651517643976699, "learning_rate": 4.6747234452465774e-05, "loss": 0.1617, "step": 4780 }, { "epoch": 0.5690187693038726, "grad_norm": 0.10328757388752115, "learning_rate": 4.6731401941808806e-05, "loss": 0.1561, "step": 4790 }, { "epoch": 0.5702066999287242, "grad_norm": 0.10790710601619892, "learning_rate": 4.6715533687651224e-05, "loss": 0.1513, "step": 4800 }, { "epoch": 0.5713946305535756, "grad_norm": 0.0999860660319598, "learning_rate": 4.669962971609292e-05, "loss": 0.1551, "step": 4810 }, { "epoch": 0.5725825611784272, "grad_norm": 0.10466572392343053, "learning_rate": 4.6683690053292515e-05, "loss": 0.1514, "step": 4820 }, { "epoch": 0.5737704918032787, "grad_norm": 0.11327570296813576, "learning_rate": 4.666771472546734e-05, "loss": 0.1573, "step": 4830 }, { "epoch": 0.5749584224281302, "grad_norm": 0.10058162139621304, "learning_rate": 4.6651703758893376e-05, "loss": 0.1573, "step": 4840 }, { "epoch": 0.5761463530529817, "grad_norm": 0.10292532517381033, "learning_rate": 4.663565717990525e-05, "loss": 0.1553, "step": 4850 }, { "epoch": 0.5773342836778332, "grad_norm": 0.1012390090020939, "learning_rate": 4.661957501489614e-05, "loss": 0.1521, "step": 4860 }, { "epoch": 0.5785222143026847, "grad_norm": 0.11070464878754993, "learning_rate": 4.6603457290317755e-05, "loss": 0.1563, "step": 4870 }, { "epoch": 0.5797101449275363, "grad_norm": 0.10528031860951549, "learning_rate": 4.658730403268031e-05, "loss": 0.1556, "step": 4880 }, { "epoch": 0.5808980755523877, "grad_norm": 0.10684878970373035, "learning_rate": 4.6571115268552446e-05, "loss": 0.1554, "step": 4890 }, { "epoch": 0.5820860061772393, "grad_norm": 0.10483609084372218, "learning_rate": 4.655489102456122e-05, "loss": 0.1546, "step": 4900 }, { "epoch": 0.5832739368020907, "grad_norm": 0.10263855958047292, "learning_rate": 4.6538631327392024e-05, "loss": 0.1523, "step": 4910 }, { "epoch": 0.5844618674269423, "grad_norm": 0.1145647754957686, "learning_rate": 4.6522336203788584e-05, "loss": 0.1616, "step": 4920 }, { "epoch": 0.5856497980517937, "grad_norm": 0.10088978434611914, "learning_rate": 4.6506005680552896e-05, "loss": 0.1565, "step": 4930 }, { "epoch": 0.5868377286766453, "grad_norm": 0.10005372032211046, "learning_rate": 4.648963978454516e-05, "loss": 0.158, "step": 4940 }, { "epoch": 0.5880256593014968, "grad_norm": 0.10048743956944088, "learning_rate": 4.6473238542683775e-05, "loss": 0.1556, "step": 4950 }, { "epoch": 0.5892135899263483, "grad_norm": 0.10826172504104899, "learning_rate": 4.645680198194526e-05, "loss": 0.1501, "step": 4960 }, { "epoch": 0.5904015205511998, "grad_norm": 0.1036290473991301, "learning_rate": 4.6440330129364244e-05, "loss": 0.1576, "step": 4970 }, { "epoch": 0.5915894511760513, "grad_norm": 0.1039717595484876, "learning_rate": 4.642382301203341e-05, "loss": 0.1519, "step": 4980 }, { "epoch": 0.5927773818009028, "grad_norm": 0.11154631887116098, "learning_rate": 4.6407280657103404e-05, "loss": 0.155, "step": 4990 }, { "epoch": 0.5939653124257543, "grad_norm": 0.10513663437202782, "learning_rate": 4.639070309178286e-05, "loss": 0.1553, "step": 5000 }, { "epoch": 0.5951532430506058, "grad_norm": 0.10634653301194791, "learning_rate": 4.637409034333834e-05, "loss": 0.155, "step": 5010 }, { "epoch": 0.5963411736754574, "grad_norm": 0.10434229323866724, "learning_rate": 4.635744243909424e-05, "loss": 0.1605, "step": 5020 }, { "epoch": 0.5975291043003088, "grad_norm": 0.10355286728138327, "learning_rate": 4.6340759406432805e-05, "loss": 0.1535, "step": 5030 }, { "epoch": 0.5987170349251604, "grad_norm": 0.11273308400353166, "learning_rate": 4.632404127279404e-05, "loss": 0.1557, "step": 5040 }, { "epoch": 0.5999049655500118, "grad_norm": 0.10287923588662996, "learning_rate": 4.63072880656757e-05, "loss": 0.155, "step": 5050 }, { "epoch": 0.6010928961748634, "grad_norm": 0.10312423519645827, "learning_rate": 4.629049981263323e-05, "loss": 0.1564, "step": 5060 }, { "epoch": 0.6022808267997148, "grad_norm": 0.09810559916461416, "learning_rate": 4.6273676541279686e-05, "loss": 0.1546, "step": 5070 }, { "epoch": 0.6034687574245664, "grad_norm": 0.10033138006675021, "learning_rate": 4.6256818279285773e-05, "loss": 0.1552, "step": 5080 }, { "epoch": 0.604656688049418, "grad_norm": 0.10573134989403979, "learning_rate": 4.62399250543797e-05, "loss": 0.1535, "step": 5090 }, { "epoch": 0.6058446186742694, "grad_norm": 0.10593663651309553, "learning_rate": 4.62229968943472e-05, "loss": 0.1544, "step": 5100 }, { "epoch": 0.607032549299121, "grad_norm": 0.10312176657344115, "learning_rate": 4.620603382703149e-05, "loss": 0.156, "step": 5110 }, { "epoch": 0.6082204799239724, "grad_norm": 0.10050065746031575, "learning_rate": 4.618903588033318e-05, "loss": 0.1566, "step": 5120 }, { "epoch": 0.609408410548824, "grad_norm": 0.10185633440481608, "learning_rate": 4.6172003082210225e-05, "loss": 0.1535, "step": 5130 }, { "epoch": 0.6105963411736754, "grad_norm": 0.10143985475598066, "learning_rate": 4.6154935460677954e-05, "loss": 0.1569, "step": 5140 }, { "epoch": 0.611784271798527, "grad_norm": 0.09936654220897524, "learning_rate": 4.613783304380893e-05, "loss": 0.1498, "step": 5150 }, { "epoch": 0.6129722024233785, "grad_norm": 0.09958088728045703, "learning_rate": 4.612069585973299e-05, "loss": 0.1537, "step": 5160 }, { "epoch": 0.61416013304823, "grad_norm": 0.10116203974765962, "learning_rate": 4.61035239366371e-05, "loss": 0.1523, "step": 5170 }, { "epoch": 0.6153480636730815, "grad_norm": 0.10333584000723317, "learning_rate": 4.6086317302765414e-05, "loss": 0.153, "step": 5180 }, { "epoch": 0.616535994297933, "grad_norm": 0.10096213326937269, "learning_rate": 4.6069075986419165e-05, "loss": 0.154, "step": 5190 }, { "epoch": 0.6177239249227845, "grad_norm": 0.10464620251203928, "learning_rate": 4.60518000159566e-05, "loss": 0.1552, "step": 5200 }, { "epoch": 0.6189118555476361, "grad_norm": 0.10439156119636706, "learning_rate": 4.603448941979301e-05, "loss": 0.1501, "step": 5210 }, { "epoch": 0.6200997861724875, "grad_norm": 0.10379537649269562, "learning_rate": 4.601714422640061e-05, "loss": 0.1521, "step": 5220 }, { "epoch": 0.6212877167973391, "grad_norm": 0.10827264026016292, "learning_rate": 4.5999764464308524e-05, "loss": 0.157, "step": 5230 }, { "epoch": 0.6224756474221905, "grad_norm": 0.10290614354660811, "learning_rate": 4.598235016210274e-05, "loss": 0.1537, "step": 5240 }, { "epoch": 0.6236635780470421, "grad_norm": 0.10508758915776062, "learning_rate": 4.596490134842606e-05, "loss": 0.1561, "step": 5250 }, { "epoch": 0.6248515086718935, "grad_norm": 0.10912257742985176, "learning_rate": 4.594741805197804e-05, "loss": 0.15, "step": 5260 }, { "epoch": 0.6260394392967451, "grad_norm": 0.10125525242528792, "learning_rate": 4.592990030151495e-05, "loss": 0.156, "step": 5270 }, { "epoch": 0.6272273699215966, "grad_norm": 0.10256927210022763, "learning_rate": 4.5912348125849745e-05, "loss": 0.1509, "step": 5280 }, { "epoch": 0.6284153005464481, "grad_norm": 0.10004276126919943, "learning_rate": 4.5894761553852e-05, "loss": 0.1529, "step": 5290 }, { "epoch": 0.6296032311712996, "grad_norm": 0.0965297348826321, "learning_rate": 4.587714061444784e-05, "loss": 0.1549, "step": 5300 }, { "epoch": 0.6307911617961511, "grad_norm": 0.10303274976177496, "learning_rate": 4.5859485336619956e-05, "loss": 0.1537, "step": 5310 }, { "epoch": 0.6319790924210026, "grad_norm": 0.09834653274864798, "learning_rate": 4.5841795749407486e-05, "loss": 0.1539, "step": 5320 }, { "epoch": 0.6331670230458541, "grad_norm": 0.10367033740243481, "learning_rate": 4.5824071881906006e-05, "loss": 0.154, "step": 5330 }, { "epoch": 0.6343549536707056, "grad_norm": 0.0982753913147257, "learning_rate": 4.580631376326749e-05, "loss": 0.1564, "step": 5340 }, { "epoch": 0.6355428842955572, "grad_norm": 0.09876535735305413, "learning_rate": 4.578852142270024e-05, "loss": 0.1542, "step": 5350 }, { "epoch": 0.6367308149204086, "grad_norm": 0.09951537842687022, "learning_rate": 4.577069488946883e-05, "loss": 0.1491, "step": 5360 }, { "epoch": 0.6379187455452602, "grad_norm": 0.10326581267622643, "learning_rate": 4.5752834192894115e-05, "loss": 0.1558, "step": 5370 }, { "epoch": 0.6391066761701116, "grad_norm": 0.10387317012577876, "learning_rate": 4.5734939362353093e-05, "loss": 0.1561, "step": 5380 }, { "epoch": 0.6402946067949632, "grad_norm": 0.1104767739419294, "learning_rate": 4.571701042727893e-05, "loss": 0.1567, "step": 5390 }, { "epoch": 0.6414825374198146, "grad_norm": 0.10821556224999712, "learning_rate": 4.5699047417160887e-05, "loss": 0.1526, "step": 5400 }, { "epoch": 0.6426704680446662, "grad_norm": 0.10482666876879644, "learning_rate": 4.5681050361544266e-05, "loss": 0.1515, "step": 5410 }, { "epoch": 0.6438583986695177, "grad_norm": 0.10388043904088386, "learning_rate": 4.566301929003036e-05, "loss": 0.1549, "step": 5420 }, { "epoch": 0.6450463292943692, "grad_norm": 0.09183039030116447, "learning_rate": 4.564495423227644e-05, "loss": 0.1514, "step": 5430 }, { "epoch": 0.6462342599192207, "grad_norm": 0.10438981347974711, "learning_rate": 4.562685521799564e-05, "loss": 0.152, "step": 5440 }, { "epoch": 0.6474221905440722, "grad_norm": 0.10298376929314082, "learning_rate": 4.5608722276956954e-05, "loss": 0.1527, "step": 5450 }, { "epoch": 0.6486101211689237, "grad_norm": 0.09871439763689163, "learning_rate": 4.5590555438985207e-05, "loss": 0.1552, "step": 5460 }, { "epoch": 0.6497980517937753, "grad_norm": 0.09839536623918414, "learning_rate": 4.557235473396093e-05, "loss": 0.1529, "step": 5470 }, { "epoch": 0.6509859824186267, "grad_norm": 0.10532127098390479, "learning_rate": 4.55541201918204e-05, "loss": 0.1501, "step": 5480 }, { "epoch": 0.6521739130434783, "grad_norm": 0.10136469827283576, "learning_rate": 4.5535851842555536e-05, "loss": 0.1512, "step": 5490 }, { "epoch": 0.6533618436683297, "grad_norm": 0.09785407138441989, "learning_rate": 4.5517549716213845e-05, "loss": 0.1491, "step": 5500 }, { "epoch": 0.6545497742931813, "grad_norm": 0.100985223268451, "learning_rate": 4.5499213842898426e-05, "loss": 0.1498, "step": 5510 }, { "epoch": 0.6557377049180327, "grad_norm": 0.09811763146931897, "learning_rate": 4.548084425276783e-05, "loss": 0.1585, "step": 5520 }, { "epoch": 0.6569256355428843, "grad_norm": 0.0953646028870514, "learning_rate": 4.546244097603612e-05, "loss": 0.1571, "step": 5530 }, { "epoch": 0.6581135661677358, "grad_norm": 0.09954835990316238, "learning_rate": 4.5444004042972744e-05, "loss": 0.1568, "step": 5540 }, { "epoch": 0.6593014967925873, "grad_norm": 0.10361458175112044, "learning_rate": 4.5425533483902494e-05, "loss": 0.1536, "step": 5550 }, { "epoch": 0.6604894274174388, "grad_norm": 0.100769974418551, "learning_rate": 4.5407029329205494e-05, "loss": 0.1579, "step": 5560 }, { "epoch": 0.6616773580422903, "grad_norm": 0.10805570996742671, "learning_rate": 4.538849160931711e-05, "loss": 0.155, "step": 5570 }, { "epoch": 0.6628652886671419, "grad_norm": 0.0963934362932124, "learning_rate": 4.536992035472793e-05, "loss": 0.1479, "step": 5580 }, { "epoch": 0.6640532192919933, "grad_norm": 0.0923292322204585, "learning_rate": 4.5351315595983666e-05, "loss": 0.1524, "step": 5590 }, { "epoch": 0.6652411499168449, "grad_norm": 0.09814435015256213, "learning_rate": 4.533267736368518e-05, "loss": 0.152, "step": 5600 }, { "epoch": 0.6664290805416964, "grad_norm": 0.09696480462923773, "learning_rate": 4.5314005688488356e-05, "loss": 0.1559, "step": 5610 }, { "epoch": 0.6676170111665479, "grad_norm": 0.09977989896913397, "learning_rate": 4.529530060110412e-05, "loss": 0.1564, "step": 5620 }, { "epoch": 0.6688049417913994, "grad_norm": 0.1008704851944186, "learning_rate": 4.5276562132298315e-05, "loss": 0.151, "step": 5630 }, { "epoch": 0.6699928724162509, "grad_norm": 0.11422976892213564, "learning_rate": 4.5257790312891726e-05, "loss": 0.1496, "step": 5640 }, { "epoch": 0.6711808030411024, "grad_norm": 0.10334706650489314, "learning_rate": 4.523898517375995e-05, "loss": 0.1559, "step": 5650 }, { "epoch": 0.672368733665954, "grad_norm": 0.10356728235010884, "learning_rate": 4.522014674583344e-05, "loss": 0.1507, "step": 5660 }, { "epoch": 0.6735566642908054, "grad_norm": 0.09824737645312963, "learning_rate": 4.520127506009735e-05, "loss": 0.1512, "step": 5670 }, { "epoch": 0.674744594915657, "grad_norm": 0.09759034664375965, "learning_rate": 4.518237014759157e-05, "loss": 0.1561, "step": 5680 }, { "epoch": 0.6759325255405084, "grad_norm": 0.10182525717360143, "learning_rate": 4.516343203941063e-05, "loss": 0.1557, "step": 5690 }, { "epoch": 0.67712045616536, "grad_norm": 0.09846418758469144, "learning_rate": 4.514446076670367e-05, "loss": 0.1539, "step": 5700 }, { "epoch": 0.6783083867902114, "grad_norm": 0.10223837918670088, "learning_rate": 4.5125456360674354e-05, "loss": 0.1539, "step": 5710 }, { "epoch": 0.679496317415063, "grad_norm": 0.10042421798894954, "learning_rate": 4.5106418852580866e-05, "loss": 0.1528, "step": 5720 }, { "epoch": 0.6806842480399145, "grad_norm": 0.09994182443599618, "learning_rate": 4.508734827373582e-05, "loss": 0.1529, "step": 5730 }, { "epoch": 0.681872178664766, "grad_norm": 0.09562824849114311, "learning_rate": 4.506824465550625e-05, "loss": 0.1539, "step": 5740 }, { "epoch": 0.6830601092896175, "grad_norm": 0.10056086276945943, "learning_rate": 4.504910802931351e-05, "loss": 0.1552, "step": 5750 }, { "epoch": 0.684248039914469, "grad_norm": 0.09536277990050356, "learning_rate": 4.502993842663324e-05, "loss": 0.1496, "step": 5760 }, { "epoch": 0.6854359705393205, "grad_norm": 0.10046445476773044, "learning_rate": 4.501073587899534e-05, "loss": 0.1524, "step": 5770 }, { "epoch": 0.686623901164172, "grad_norm": 0.0974728096639592, "learning_rate": 4.4991500417983876e-05, "loss": 0.1506, "step": 5780 }, { "epoch": 0.6878118317890235, "grad_norm": 0.10634031311037614, "learning_rate": 4.4972232075237075e-05, "loss": 0.1499, "step": 5790 }, { "epoch": 0.6889997624138751, "grad_norm": 0.10520286685145457, "learning_rate": 4.495293088244722e-05, "loss": 0.1523, "step": 5800 }, { "epoch": 0.6901876930387265, "grad_norm": 0.10119302369333899, "learning_rate": 4.493359687136064e-05, "loss": 0.1506, "step": 5810 }, { "epoch": 0.6913756236635781, "grad_norm": 0.09757047760969241, "learning_rate": 4.4914230073777654e-05, "loss": 0.1555, "step": 5820 }, { "epoch": 0.6925635542884295, "grad_norm": 0.10597952581907688, "learning_rate": 4.489483052155248e-05, "loss": 0.1516, "step": 5830 }, { "epoch": 0.6937514849132811, "grad_norm": 0.10494171704286394, "learning_rate": 4.4875398246593234e-05, "loss": 0.1472, "step": 5840 }, { "epoch": 0.6949394155381325, "grad_norm": 0.10214248804868191, "learning_rate": 4.4855933280861836e-05, "loss": 0.1499, "step": 5850 }, { "epoch": 0.6961273461629841, "grad_norm": 0.09904591056256853, "learning_rate": 4.4836435656373995e-05, "loss": 0.1506, "step": 5860 }, { "epoch": 0.6973152767878356, "grad_norm": 0.0927989099036332, "learning_rate": 4.481690540519913e-05, "loss": 0.1523, "step": 5870 }, { "epoch": 0.6985032074126871, "grad_norm": 0.10239284185229314, "learning_rate": 4.479734255946031e-05, "loss": 0.153, "step": 5880 }, { "epoch": 0.6996911380375386, "grad_norm": 0.10547579314426755, "learning_rate": 4.477774715133422e-05, "loss": 0.1551, "step": 5890 }, { "epoch": 0.7008790686623901, "grad_norm": 0.10786577074662307, "learning_rate": 4.475811921305112e-05, "loss": 0.1527, "step": 5900 }, { "epoch": 0.7020669992872416, "grad_norm": 0.09876353135956947, "learning_rate": 4.4738458776894754e-05, "loss": 0.1536, "step": 5910 }, { "epoch": 0.7032549299120932, "grad_norm": 0.10178700040320791, "learning_rate": 4.471876587520234e-05, "loss": 0.1522, "step": 5920 }, { "epoch": 0.7044428605369446, "grad_norm": 0.09841926143321733, "learning_rate": 4.469904054036448e-05, "loss": 0.1498, "step": 5930 }, { "epoch": 0.7056307911617962, "grad_norm": 0.09878806149770626, "learning_rate": 4.467928280482511e-05, "loss": 0.1488, "step": 5940 }, { "epoch": 0.7068187217866476, "grad_norm": 0.10130911522699079, "learning_rate": 4.4659492701081486e-05, "loss": 0.1517, "step": 5950 }, { "epoch": 0.7080066524114992, "grad_norm": 0.0988937177290149, "learning_rate": 4.4639670261684083e-05, "loss": 0.1532, "step": 5960 }, { "epoch": 0.7091945830363506, "grad_norm": 0.10081104751497807, "learning_rate": 4.461981551923657e-05, "loss": 0.1505, "step": 5970 }, { "epoch": 0.7103825136612022, "grad_norm": 0.09503042370569685, "learning_rate": 4.459992850639574e-05, "loss": 0.1467, "step": 5980 }, { "epoch": 0.7115704442860536, "grad_norm": 0.10448946081762106, "learning_rate": 4.458000925587148e-05, "loss": 0.1519, "step": 5990 }, { "epoch": 0.7127583749109052, "grad_norm": 0.09848725605804205, "learning_rate": 4.456005780042668e-05, "loss": 0.1559, "step": 6000 }, { "epoch": 0.7139463055357567, "grad_norm": 0.10164718165465038, "learning_rate": 4.45400741728772e-05, "loss": 0.1497, "step": 6010 }, { "epoch": 0.7151342361606082, "grad_norm": 0.0970334517432489, "learning_rate": 4.4520058406091846e-05, "loss": 0.1548, "step": 6020 }, { "epoch": 0.7163221667854597, "grad_norm": 0.10793956418073702, "learning_rate": 4.450001053299226e-05, "loss": 0.1552, "step": 6030 }, { "epoch": 0.7175100974103112, "grad_norm": 0.09962776649716415, "learning_rate": 4.44799305865529e-05, "loss": 0.1495, "step": 6040 }, { "epoch": 0.7186980280351627, "grad_norm": 0.09486840946202821, "learning_rate": 4.445981859980097e-05, "loss": 0.1471, "step": 6050 }, { "epoch": 0.7198859586600143, "grad_norm": 0.10255436938134074, "learning_rate": 4.4439674605816385e-05, "loss": 0.1535, "step": 6060 }, { "epoch": 0.7210738892848657, "grad_norm": 0.10072270076045324, "learning_rate": 4.4419498637731694e-05, "loss": 0.15, "step": 6070 }, { "epoch": 0.7222618199097173, "grad_norm": 0.09809360948146152, "learning_rate": 4.439929072873204e-05, "loss": 0.1477, "step": 6080 }, { "epoch": 0.7234497505345688, "grad_norm": 0.09349644960693054, "learning_rate": 4.437905091205512e-05, "loss": 0.1485, "step": 6090 }, { "epoch": 0.7246376811594203, "grad_norm": 0.0966294115790837, "learning_rate": 4.4358779220991074e-05, "loss": 0.1469, "step": 6100 }, { "epoch": 0.7258256117842719, "grad_norm": 0.09863795893119731, "learning_rate": 4.433847568888251e-05, "loss": 0.1525, "step": 6110 }, { "epoch": 0.7270135424091233, "grad_norm": 0.09731086484814003, "learning_rate": 4.431814034912438e-05, "loss": 0.1491, "step": 6120 }, { "epoch": 0.7282014730339749, "grad_norm": 0.09742628624983998, "learning_rate": 4.429777323516397e-05, "loss": 0.1499, "step": 6130 }, { "epoch": 0.7293894036588263, "grad_norm": 0.09591974295604039, "learning_rate": 4.42773743805008e-05, "loss": 0.1542, "step": 6140 }, { "epoch": 0.7305773342836779, "grad_norm": 0.09680644050190962, "learning_rate": 4.4256943818686635e-05, "loss": 0.1474, "step": 6150 }, { "epoch": 0.7317652649085293, "grad_norm": 0.10920322833804263, "learning_rate": 4.4236481583325364e-05, "loss": 0.1531, "step": 6160 }, { "epoch": 0.7329531955333809, "grad_norm": 0.10653208548886707, "learning_rate": 4.421598770807297e-05, "loss": 0.1511, "step": 6170 }, { "epoch": 0.7341411261582323, "grad_norm": 0.09716312757063887, "learning_rate": 4.419546222663752e-05, "loss": 0.1497, "step": 6180 }, { "epoch": 0.7353290567830839, "grad_norm": 0.09395036088174011, "learning_rate": 4.417490517277899e-05, "loss": 0.1497, "step": 6190 }, { "epoch": 0.7365169874079354, "grad_norm": 0.09152752067774668, "learning_rate": 4.415431658030936e-05, "loss": 0.1536, "step": 6200 }, { "epoch": 0.7377049180327869, "grad_norm": 0.09520427190536024, "learning_rate": 4.413369648309246e-05, "loss": 0.1489, "step": 6210 }, { "epoch": 0.7388928486576384, "grad_norm": 0.1003373110509424, "learning_rate": 4.41130449150439e-05, "loss": 0.1531, "step": 6220 }, { "epoch": 0.7400807792824899, "grad_norm": 0.10589404281922715, "learning_rate": 4.4092361910131106e-05, "loss": 0.1477, "step": 6230 }, { "epoch": 0.7412687099073414, "grad_norm": 0.10859108399499486, "learning_rate": 4.4071647502373197e-05, "loss": 0.1556, "step": 6240 }, { "epoch": 0.742456640532193, "grad_norm": 0.09406038798915271, "learning_rate": 4.405090172584092e-05, "loss": 0.1514, "step": 6250 }, { "epoch": 0.7436445711570444, "grad_norm": 0.10180978960778604, "learning_rate": 4.4030124614656644e-05, "loss": 0.1536, "step": 6260 }, { "epoch": 0.744832501781896, "grad_norm": 0.10044426299075783, "learning_rate": 4.4009316202994254e-05, "loss": 0.1499, "step": 6270 }, { "epoch": 0.7460204324067474, "grad_norm": 0.10183538287597141, "learning_rate": 4.398847652507914e-05, "loss": 0.1504, "step": 6280 }, { "epoch": 0.747208363031599, "grad_norm": 0.10031393403076265, "learning_rate": 4.3967605615188106e-05, "loss": 0.1497, "step": 6290 }, { "epoch": 0.7483962936564504, "grad_norm": 0.10659458469466178, "learning_rate": 4.394670350764932e-05, "loss": 0.1512, "step": 6300 }, { "epoch": 0.749584224281302, "grad_norm": 0.09338181334919889, "learning_rate": 4.392577023684229e-05, "loss": 0.1527, "step": 6310 }, { "epoch": 0.7507721549061535, "grad_norm": 0.09577559692334943, "learning_rate": 4.3904805837197737e-05, "loss": 0.1479, "step": 6320 }, { "epoch": 0.751960085531005, "grad_norm": 0.09799474533209679, "learning_rate": 4.388381034319762e-05, "loss": 0.1489, "step": 6330 }, { "epoch": 0.7531480161558565, "grad_norm": 0.09848950906140154, "learning_rate": 4.386278378937503e-05, "loss": 0.1491, "step": 6340 }, { "epoch": 0.754335946780708, "grad_norm": 0.09584832511500556, "learning_rate": 4.3841726210314136e-05, "loss": 0.1496, "step": 6350 }, { "epoch": 0.7555238774055595, "grad_norm": 0.0966253685425035, "learning_rate": 4.382063764065016e-05, "loss": 0.1514, "step": 6360 }, { "epoch": 0.756711808030411, "grad_norm": 0.09521753824737926, "learning_rate": 4.379951811506926e-05, "loss": 0.1519, "step": 6370 }, { "epoch": 0.7578997386552625, "grad_norm": 0.10553146150785055, "learning_rate": 4.377836766830855e-05, "loss": 0.1514, "step": 6380 }, { "epoch": 0.7590876692801141, "grad_norm": 0.10182219102148278, "learning_rate": 4.375718633515597e-05, "loss": 0.1474, "step": 6390 }, { "epoch": 0.7602755999049655, "grad_norm": 0.09545167370415254, "learning_rate": 4.3735974150450275e-05, "loss": 0.1508, "step": 6400 }, { "epoch": 0.7614635305298171, "grad_norm": 0.0969922740217759, "learning_rate": 4.371473114908098e-05, "loss": 0.1521, "step": 6410 }, { "epoch": 0.7626514611546685, "grad_norm": 0.10134188473044431, "learning_rate": 4.369345736598826e-05, "loss": 0.147, "step": 6420 }, { "epoch": 0.7638393917795201, "grad_norm": 0.095374864388944, "learning_rate": 4.3672152836162926e-05, "loss": 0.1514, "step": 6430 }, { "epoch": 0.7650273224043715, "grad_norm": 0.10041633324390857, "learning_rate": 4.3650817594646374e-05, "loss": 0.1501, "step": 6440 }, { "epoch": 0.7662152530292231, "grad_norm": 0.09969705821410041, "learning_rate": 4.3629451676530506e-05, "loss": 0.1479, "step": 6450 }, { "epoch": 0.7674031836540746, "grad_norm": 0.08827971752404666, "learning_rate": 4.360805511695768e-05, "loss": 0.1465, "step": 6460 }, { "epoch": 0.7685911142789261, "grad_norm": 0.09211529067365862, "learning_rate": 4.3586627951120646e-05, "loss": 0.1511, "step": 6470 }, { "epoch": 0.7697790449037776, "grad_norm": 0.09873576138870536, "learning_rate": 4.35651702142625e-05, "loss": 0.152, "step": 6480 }, { "epoch": 0.7709669755286291, "grad_norm": 0.0968307146700879, "learning_rate": 4.354368194167664e-05, "loss": 0.1521, "step": 6490 }, { "epoch": 0.7721549061534806, "grad_norm": 0.10228895500420028, "learning_rate": 4.352216316870664e-05, "loss": 0.1503, "step": 6500 }, { "epoch": 0.7733428367783322, "grad_norm": 0.09159952225179341, "learning_rate": 4.35006139307463e-05, "loss": 0.1497, "step": 6510 }, { "epoch": 0.7745307674031836, "grad_norm": 0.09648100547094494, "learning_rate": 4.347903426323949e-05, "loss": 0.1478, "step": 6520 }, { "epoch": 0.7757186980280352, "grad_norm": 0.10082179155411948, "learning_rate": 4.3457424201680145e-05, "loss": 0.1487, "step": 6530 }, { "epoch": 0.7769066286528866, "grad_norm": 0.09787155652177461, "learning_rate": 4.343578378161217e-05, "loss": 0.1473, "step": 6540 }, { "epoch": 0.7780945592777382, "grad_norm": 0.09857958870861769, "learning_rate": 4.3414113038629436e-05, "loss": 0.1439, "step": 6550 }, { "epoch": 0.7792824899025896, "grad_norm": 0.09662071192801293, "learning_rate": 4.339241200837567e-05, "loss": 0.1517, "step": 6560 }, { "epoch": 0.7804704205274412, "grad_norm": 0.09855452570650076, "learning_rate": 4.337068072654441e-05, "loss": 0.1508, "step": 6570 }, { "epoch": 0.7816583511522927, "grad_norm": 0.09979944031375682, "learning_rate": 4.334891922887897e-05, "loss": 0.1513, "step": 6580 }, { "epoch": 0.7828462817771442, "grad_norm": 0.09510875817114685, "learning_rate": 4.332712755117234e-05, "loss": 0.1489, "step": 6590 }, { "epoch": 0.7840342124019958, "grad_norm": 0.11021271438170328, "learning_rate": 4.330530572926718e-05, "loss": 0.1472, "step": 6600 }, { "epoch": 0.7852221430268472, "grad_norm": 0.09650299735821763, "learning_rate": 4.3283453799055684e-05, "loss": 0.1495, "step": 6610 }, { "epoch": 0.7864100736516988, "grad_norm": 0.10285292597699291, "learning_rate": 4.3261571796479615e-05, "loss": 0.1498, "step": 6620 }, { "epoch": 0.7875980042765502, "grad_norm": 0.09073531191150339, "learning_rate": 4.323965975753018e-05, "loss": 0.1473, "step": 6630 }, { "epoch": 0.7887859349014018, "grad_norm": 0.10162548449727758, "learning_rate": 4.321771771824798e-05, "loss": 0.1533, "step": 6640 }, { "epoch": 0.7899738655262533, "grad_norm": 0.09955253587670077, "learning_rate": 4.319574571472298e-05, "loss": 0.1462, "step": 6650 }, { "epoch": 0.7911617961511048, "grad_norm": 0.10177391476222575, "learning_rate": 4.317374378309441e-05, "loss": 0.1511, "step": 6660 }, { "epoch": 0.7923497267759563, "grad_norm": 0.09724925637569751, "learning_rate": 4.3151711959550736e-05, "loss": 0.1492, "step": 6670 }, { "epoch": 0.7935376574008078, "grad_norm": 0.09658357225772787, "learning_rate": 4.3129650280329584e-05, "loss": 0.1522, "step": 6680 }, { "epoch": 0.7947255880256593, "grad_norm": 0.0937155545803499, "learning_rate": 4.3107558781717693e-05, "loss": 0.1484, "step": 6690 }, { "epoch": 0.7959135186505109, "grad_norm": 0.09473875928536421, "learning_rate": 4.3085437500050854e-05, "loss": 0.1524, "step": 6700 }, { "epoch": 0.7971014492753623, "grad_norm": 0.09176954463816528, "learning_rate": 4.306328647171383e-05, "loss": 0.1509, "step": 6710 }, { "epoch": 0.7982893799002139, "grad_norm": 0.09847549111673104, "learning_rate": 4.3041105733140305e-05, "loss": 0.1481, "step": 6720 }, { "epoch": 0.7994773105250653, "grad_norm": 0.10003211135165646, "learning_rate": 4.301889532081285e-05, "loss": 0.1486, "step": 6730 }, { "epoch": 0.8006652411499169, "grad_norm": 0.09536121552305779, "learning_rate": 4.299665527126285e-05, "loss": 0.144, "step": 6740 }, { "epoch": 0.8018531717747683, "grad_norm": 0.10468817300071087, "learning_rate": 4.29743856210704e-05, "loss": 0.1524, "step": 6750 }, { "epoch": 0.8030411023996199, "grad_norm": 0.09660987984506914, "learning_rate": 4.295208640686432e-05, "loss": 0.1486, "step": 6760 }, { "epoch": 0.8042290330244714, "grad_norm": 0.09354174047364024, "learning_rate": 4.292975766532204e-05, "loss": 0.1507, "step": 6770 }, { "epoch": 0.8054169636493229, "grad_norm": 0.10106861846779551, "learning_rate": 4.290739943316954e-05, "loss": 0.1465, "step": 6780 }, { "epoch": 0.8066048942741744, "grad_norm": 0.09713559001802353, "learning_rate": 4.2885011747181326e-05, "loss": 0.1494, "step": 6790 }, { "epoch": 0.8077928248990259, "grad_norm": 0.09293465260058027, "learning_rate": 4.2862594644180356e-05, "loss": 0.1491, "step": 6800 }, { "epoch": 0.8089807555238774, "grad_norm": 0.09893995586860842, "learning_rate": 4.2840148161037966e-05, "loss": 0.1476, "step": 6810 }, { "epoch": 0.810168686148729, "grad_norm": 0.09480449307341818, "learning_rate": 4.2817672334673795e-05, "loss": 0.1467, "step": 6820 }, { "epoch": 0.8113566167735804, "grad_norm": 0.09758301489188395, "learning_rate": 4.279516720205577e-05, "loss": 0.1467, "step": 6830 }, { "epoch": 0.812544547398432, "grad_norm": 0.09238916253212692, "learning_rate": 4.277263280020002e-05, "loss": 0.148, "step": 6840 }, { "epoch": 0.8137324780232834, "grad_norm": 0.10484134851435764, "learning_rate": 4.275006916617079e-05, "loss": 0.151, "step": 6850 }, { "epoch": 0.814920408648135, "grad_norm": 0.10024440602043365, "learning_rate": 4.272747633708044e-05, "loss": 0.1467, "step": 6860 }, { "epoch": 0.8161083392729864, "grad_norm": 0.10215891467499766, "learning_rate": 4.2704854350089335e-05, "loss": 0.1481, "step": 6870 }, { "epoch": 0.817296269897838, "grad_norm": 0.09285969182540618, "learning_rate": 4.268220324240579e-05, "loss": 0.149, "step": 6880 }, { "epoch": 0.8184842005226894, "grad_norm": 0.09781937972325928, "learning_rate": 4.265952305128602e-05, "loss": 0.1498, "step": 6890 }, { "epoch": 0.819672131147541, "grad_norm": 0.09820446968519186, "learning_rate": 4.2636813814034096e-05, "loss": 0.1513, "step": 6900 }, { "epoch": 0.8208600617723925, "grad_norm": 0.09351464445039782, "learning_rate": 4.261407556800183e-05, "loss": 0.1498, "step": 6910 }, { "epoch": 0.822047992397244, "grad_norm": 0.09130357979407278, "learning_rate": 4.2591308350588775e-05, "loss": 0.1479, "step": 6920 }, { "epoch": 0.8232359230220955, "grad_norm": 0.10316105934015204, "learning_rate": 4.2568512199242136e-05, "loss": 0.1449, "step": 6930 }, { "epoch": 0.824423853646947, "grad_norm": 0.10242165523300387, "learning_rate": 4.254568715145668e-05, "loss": 0.1512, "step": 6940 }, { "epoch": 0.8256117842717985, "grad_norm": 0.09141061710142113, "learning_rate": 4.252283324477473e-05, "loss": 0.1491, "step": 6950 }, { "epoch": 0.82679971489665, "grad_norm": 0.10403712279713874, "learning_rate": 4.2499950516786056e-05, "loss": 0.1509, "step": 6960 }, { "epoch": 0.8279876455215015, "grad_norm": 0.09485764361617496, "learning_rate": 4.247703900512786e-05, "loss": 0.1463, "step": 6970 }, { "epoch": 0.8291755761463531, "grad_norm": 0.09579363966169936, "learning_rate": 4.2454098747484674e-05, "loss": 0.1491, "step": 6980 }, { "epoch": 0.8303635067712045, "grad_norm": 0.08912464286710549, "learning_rate": 4.2431129781588275e-05, "loss": 0.1436, "step": 6990 }, { "epoch": 0.8315514373960561, "grad_norm": 0.10024511123805416, "learning_rate": 4.240813214521772e-05, "loss": 0.1473, "step": 7000 }, { "epoch": 0.8327393680209075, "grad_norm": 0.09295920927368924, "learning_rate": 4.2385105876199194e-05, "loss": 0.1471, "step": 7010 }, { "epoch": 0.8339272986457591, "grad_norm": 0.09607007697836045, "learning_rate": 4.2362051012405954e-05, "loss": 0.1524, "step": 7020 }, { "epoch": 0.8351152292706105, "grad_norm": 0.09292698243823291, "learning_rate": 4.233896759175834e-05, "loss": 0.1503, "step": 7030 }, { "epoch": 0.8363031598954621, "grad_norm": 0.10197407893023963, "learning_rate": 4.231585565222361e-05, "loss": 0.1438, "step": 7040 }, { "epoch": 0.8374910905203136, "grad_norm": 0.09794601933033288, "learning_rate": 4.2292715231815974e-05, "loss": 0.1491, "step": 7050 }, { "epoch": 0.8386790211451651, "grad_norm": 0.09352783187373297, "learning_rate": 4.226954636859644e-05, "loss": 0.1451, "step": 7060 }, { "epoch": 0.8398669517700166, "grad_norm": 0.09253634575189193, "learning_rate": 4.224634910067285e-05, "loss": 0.1494, "step": 7070 }, { "epoch": 0.8410548823948681, "grad_norm": 0.10100449904033496, "learning_rate": 4.222312346619973e-05, "loss": 0.1493, "step": 7080 }, { "epoch": 0.8422428130197196, "grad_norm": 0.09274552601932051, "learning_rate": 4.219986950337826e-05, "loss": 0.1438, "step": 7090 }, { "epoch": 0.8434307436445712, "grad_norm": 0.09566412857330853, "learning_rate": 4.2176587250456255e-05, "loss": 0.1467, "step": 7100 }, { "epoch": 0.8446186742694227, "grad_norm": 0.08993924813170384, "learning_rate": 4.215327674572802e-05, "loss": 0.1427, "step": 7110 }, { "epoch": 0.8458066048942742, "grad_norm": 0.09717776399344373, "learning_rate": 4.212993802753433e-05, "loss": 0.1467, "step": 7120 }, { "epoch": 0.8469945355191257, "grad_norm": 0.09438122632659697, "learning_rate": 4.210657113426239e-05, "loss": 0.1496, "step": 7130 }, { "epoch": 0.8481824661439772, "grad_norm": 0.09987074050116358, "learning_rate": 4.2083176104345736e-05, "loss": 0.1494, "step": 7140 }, { "epoch": 0.8493703967688288, "grad_norm": 0.09892021080301508, "learning_rate": 4.205975297626416e-05, "loss": 0.1462, "step": 7150 }, { "epoch": 0.8505583273936802, "grad_norm": 0.09804478987587412, "learning_rate": 4.203630178854371e-05, "loss": 0.1484, "step": 7160 }, { "epoch": 0.8517462580185318, "grad_norm": 0.08839405471690985, "learning_rate": 4.201282257975656e-05, "loss": 0.147, "step": 7170 }, { "epoch": 0.8529341886433832, "grad_norm": 0.09359429030117486, "learning_rate": 4.198931538852098e-05, "loss": 0.1473, "step": 7180 }, { "epoch": 0.8541221192682348, "grad_norm": 0.09309500711728226, "learning_rate": 4.1965780253501256e-05, "loss": 0.1481, "step": 7190 }, { "epoch": 0.8553100498930862, "grad_norm": 0.09333781815889221, "learning_rate": 4.194221721340765e-05, "loss": 0.149, "step": 7200 }, { "epoch": 0.8564979805179378, "grad_norm": 0.09078800873098491, "learning_rate": 4.191862630699631e-05, "loss": 0.1443, "step": 7210 }, { "epoch": 0.8576859111427892, "grad_norm": 0.09063521757587002, "learning_rate": 4.189500757306923e-05, "loss": 0.1501, "step": 7220 }, { "epoch": 0.8588738417676408, "grad_norm": 0.0913009493713216, "learning_rate": 4.1871361050474165e-05, "loss": 0.1456, "step": 7230 }, { "epoch": 0.8600617723924923, "grad_norm": 0.09723596341024514, "learning_rate": 4.1847686778104575e-05, "loss": 0.147, "step": 7240 }, { "epoch": 0.8612497030173438, "grad_norm": 0.09529119077091336, "learning_rate": 4.1823984794899575e-05, "loss": 0.1474, "step": 7250 }, { "epoch": 0.8624376336421953, "grad_norm": 0.10089585422435772, "learning_rate": 4.1800255139843836e-05, "loss": 0.1493, "step": 7260 }, { "epoch": 0.8636255642670468, "grad_norm": 0.09944891354670841, "learning_rate": 4.177649785196757e-05, "loss": 0.146, "step": 7270 }, { "epoch": 0.8648134948918983, "grad_norm": 0.0938247621700637, "learning_rate": 4.175271297034642e-05, "loss": 0.1496, "step": 7280 }, { "epoch": 0.8660014255167499, "grad_norm": 0.09505490516915684, "learning_rate": 4.1728900534101415e-05, "loss": 0.1477, "step": 7290 }, { "epoch": 0.8671893561416013, "grad_norm": 0.09496728821842224, "learning_rate": 4.170506058239893e-05, "loss": 0.1455, "step": 7300 }, { "epoch": 0.8683772867664529, "grad_norm": 0.09594244521793001, "learning_rate": 4.1681193154450546e-05, "loss": 0.1457, "step": 7310 }, { "epoch": 0.8695652173913043, "grad_norm": 0.09722775402122648, "learning_rate": 4.1657298289513096e-05, "loss": 0.1459, "step": 7320 }, { "epoch": 0.8707531480161559, "grad_norm": 0.0963547817710499, "learning_rate": 4.163337602688851e-05, "loss": 0.1465, "step": 7330 }, { "epoch": 0.8719410786410073, "grad_norm": 0.09184234239748792, "learning_rate": 4.160942640592378e-05, "loss": 0.1484, "step": 7340 }, { "epoch": 0.8731290092658589, "grad_norm": 0.09390936384237122, "learning_rate": 4.158544946601091e-05, "loss": 0.1477, "step": 7350 }, { "epoch": 0.8743169398907104, "grad_norm": 0.09352760960960388, "learning_rate": 4.15614452465868e-05, "loss": 0.1466, "step": 7360 }, { "epoch": 0.8755048705155619, "grad_norm": 0.112145830563964, "learning_rate": 4.153741378713329e-05, "loss": 0.1469, "step": 7370 }, { "epoch": 0.8766928011404134, "grad_norm": 0.10220629436339439, "learning_rate": 4.151335512717696e-05, "loss": 0.1496, "step": 7380 }, { "epoch": 0.8778807317652649, "grad_norm": 0.09739549412064757, "learning_rate": 4.148926930628915e-05, "loss": 0.1493, "step": 7390 }, { "epoch": 0.8790686623901164, "grad_norm": 0.09256273385595576, "learning_rate": 4.146515636408589e-05, "loss": 0.1458, "step": 7400 }, { "epoch": 0.880256593014968, "grad_norm": 0.093215980097129, "learning_rate": 4.144101634022779e-05, "loss": 0.1498, "step": 7410 }, { "epoch": 0.8814445236398194, "grad_norm": 0.09065581241788219, "learning_rate": 4.1416849274420013e-05, "loss": 0.145, "step": 7420 }, { "epoch": 0.882632454264671, "grad_norm": 0.09626792439819937, "learning_rate": 4.1392655206412223e-05, "loss": 0.1506, "step": 7430 }, { "epoch": 0.8838203848895224, "grad_norm": 0.09060561803750047, "learning_rate": 4.136843417599846e-05, "loss": 0.1484, "step": 7440 }, { "epoch": 0.885008315514374, "grad_norm": 0.08880801827345766, "learning_rate": 4.1344186223017146e-05, "loss": 0.1445, "step": 7450 }, { "epoch": 0.8861962461392254, "grad_norm": 0.09124098321584735, "learning_rate": 4.131991138735096e-05, "loss": 0.1417, "step": 7460 }, { "epoch": 0.887384176764077, "grad_norm": 0.0963571404426181, "learning_rate": 4.1295609708926817e-05, "loss": 0.1458, "step": 7470 }, { "epoch": 0.8885721073889284, "grad_norm": 0.0976717869254983, "learning_rate": 4.1271281227715755e-05, "loss": 0.1445, "step": 7480 }, { "epoch": 0.88976003801378, "grad_norm": 0.09444718069017198, "learning_rate": 4.124692598373292e-05, "loss": 0.142, "step": 7490 }, { "epoch": 0.8909479686386315, "grad_norm": 0.09062663464305061, "learning_rate": 4.122254401703748e-05, "loss": 0.1477, "step": 7500 }, { "epoch": 0.892135899263483, "grad_norm": 0.09650444487531518, "learning_rate": 4.1198135367732534e-05, "loss": 0.1424, "step": 7510 }, { "epoch": 0.8933238298883345, "grad_norm": 0.09625053655177908, "learning_rate": 4.117370007596508e-05, "loss": 0.1466, "step": 7520 }, { "epoch": 0.894511760513186, "grad_norm": 0.09390336315825515, "learning_rate": 4.114923818192595e-05, "loss": 0.1488, "step": 7530 }, { "epoch": 0.8956996911380375, "grad_norm": 0.09558617458141211, "learning_rate": 4.112474972584972e-05, "loss": 0.1496, "step": 7540 }, { "epoch": 0.896887621762889, "grad_norm": 0.09519007677469421, "learning_rate": 4.110023474801465e-05, "loss": 0.1455, "step": 7550 }, { "epoch": 0.8980755523877405, "grad_norm": 0.09839479168221206, "learning_rate": 4.107569328874261e-05, "loss": 0.1456, "step": 7560 }, { "epoch": 0.8992634830125921, "grad_norm": 0.09591381235226544, "learning_rate": 4.105112538839907e-05, "loss": 0.1459, "step": 7570 }, { "epoch": 0.9004514136374435, "grad_norm": 0.0896548611559472, "learning_rate": 4.1026531087392945e-05, "loss": 0.1489, "step": 7580 }, { "epoch": 0.9016393442622951, "grad_norm": 0.0920756569278124, "learning_rate": 4.1001910426176596e-05, "loss": 0.1454, "step": 7590 }, { "epoch": 0.9028272748871465, "grad_norm": 0.09001865442710655, "learning_rate": 4.097726344524574e-05, "loss": 0.1438, "step": 7600 }, { "epoch": 0.9040152055119981, "grad_norm": 0.0936265251770909, "learning_rate": 4.095259018513937e-05, "loss": 0.1432, "step": 7610 }, { "epoch": 0.9052031361368497, "grad_norm": 0.09714099399685253, "learning_rate": 4.0927890686439726e-05, "loss": 0.1432, "step": 7620 }, { "epoch": 0.9063910667617011, "grad_norm": 0.09647415972323525, "learning_rate": 4.090316498977218e-05, "loss": 0.1461, "step": 7630 }, { "epoch": 0.9075789973865527, "grad_norm": 0.09406179955973318, "learning_rate": 4.087841313580521e-05, "loss": 0.1482, "step": 7640 }, { "epoch": 0.9087669280114041, "grad_norm": 0.10124693821788253, "learning_rate": 4.085363516525033e-05, "loss": 0.148, "step": 7650 }, { "epoch": 0.9099548586362557, "grad_norm": 0.08932393117136253, "learning_rate": 4.082883111886197e-05, "loss": 0.1456, "step": 7660 }, { "epoch": 0.9111427892611071, "grad_norm": 0.0906626720268479, "learning_rate": 4.0804001037437474e-05, "loss": 0.1491, "step": 7670 }, { "epoch": 0.9123307198859587, "grad_norm": 0.09676175551585736, "learning_rate": 4.0779144961817015e-05, "loss": 0.1458, "step": 7680 }, { "epoch": 0.9135186505108102, "grad_norm": 0.08921370854163818, "learning_rate": 4.075426293288351e-05, "loss": 0.1417, "step": 7690 }, { "epoch": 0.9147065811356617, "grad_norm": 0.0909304332872762, "learning_rate": 4.072935499156257e-05, "loss": 0.1418, "step": 7700 }, { "epoch": 0.9158945117605132, "grad_norm": 0.09583737251340603, "learning_rate": 4.070442117882242e-05, "loss": 0.1477, "step": 7710 }, { "epoch": 0.9170824423853647, "grad_norm": 0.08804452798870997, "learning_rate": 4.0679461535673837e-05, "loss": 0.1436, "step": 7720 }, { "epoch": 0.9182703730102162, "grad_norm": 0.08999259891272403, "learning_rate": 4.0654476103170086e-05, "loss": 0.147, "step": 7730 }, { "epoch": 0.9194583036350678, "grad_norm": 0.09757831555915465, "learning_rate": 4.0629464922406854e-05, "loss": 0.1473, "step": 7740 }, { "epoch": 0.9206462342599192, "grad_norm": 0.09728480737083739, "learning_rate": 4.060442803452218e-05, "loss": 0.1484, "step": 7750 }, { "epoch": 0.9218341648847708, "grad_norm": 0.09054459310223108, "learning_rate": 4.057936548069637e-05, "loss": 0.1451, "step": 7760 }, { "epoch": 0.9230220955096222, "grad_norm": 0.09435224884475957, "learning_rate": 4.055427730215197e-05, "loss": 0.1475, "step": 7770 }, { "epoch": 0.9242100261344738, "grad_norm": 0.0973378543541739, "learning_rate": 4.0529163540153646e-05, "loss": 0.1475, "step": 7780 }, { "epoch": 0.9253979567593252, "grad_norm": 0.10268301130910194, "learning_rate": 4.0504024236008156e-05, "loss": 0.1472, "step": 7790 }, { "epoch": 0.9265858873841768, "grad_norm": 0.09197260064772046, "learning_rate": 4.047885943106428e-05, "loss": 0.1452, "step": 7800 }, { "epoch": 0.9277738180090283, "grad_norm": 0.0918995269256754, "learning_rate": 4.0453669166712724e-05, "loss": 0.1425, "step": 7810 }, { "epoch": 0.9289617486338798, "grad_norm": 0.10101501301260922, "learning_rate": 4.0428453484386076e-05, "loss": 0.1447, "step": 7820 }, { "epoch": 0.9301496792587313, "grad_norm": 0.09033121654993352, "learning_rate": 4.040321242555874e-05, "loss": 0.1456, "step": 7830 }, { "epoch": 0.9313376098835828, "grad_norm": 0.0923330666502994, "learning_rate": 4.037794603174684e-05, "loss": 0.1423, "step": 7840 }, { "epoch": 0.9325255405084343, "grad_norm": 0.09582349604688331, "learning_rate": 4.035265434450818e-05, "loss": 0.1438, "step": 7850 }, { "epoch": 0.9337134711332858, "grad_norm": 0.09682924408366331, "learning_rate": 4.032733740544217e-05, "loss": 0.1431, "step": 7860 }, { "epoch": 0.9349014017581373, "grad_norm": 0.09062705624110846, "learning_rate": 4.030199525618976e-05, "loss": 0.1456, "step": 7870 }, { "epoch": 0.9360893323829889, "grad_norm": 0.09792964292193473, "learning_rate": 4.027662793843334e-05, "loss": 0.1499, "step": 7880 }, { "epoch": 0.9372772630078403, "grad_norm": 0.10085794947647446, "learning_rate": 4.0251235493896733e-05, "loss": 0.1502, "step": 7890 }, { "epoch": 0.9384651936326919, "grad_norm": 0.08623494729521514, "learning_rate": 4.0225817964345056e-05, "loss": 0.1437, "step": 7900 }, { "epoch": 0.9396531242575433, "grad_norm": 0.08894964425261413, "learning_rate": 4.02003753915847e-05, "loss": 0.1448, "step": 7910 }, { "epoch": 0.9408410548823949, "grad_norm": 0.09347969138702553, "learning_rate": 4.017490781746325e-05, "loss": 0.146, "step": 7920 }, { "epoch": 0.9420289855072463, "grad_norm": 0.09500134546045241, "learning_rate": 4.0149415283869404e-05, "loss": 0.1437, "step": 7930 }, { "epoch": 0.9432169161320979, "grad_norm": 0.09322678022226807, "learning_rate": 4.012389783273293e-05, "loss": 0.1482, "step": 7940 }, { "epoch": 0.9444048467569494, "grad_norm": 0.08910972007526972, "learning_rate": 4.009835550602456e-05, "loss": 0.1452, "step": 7950 }, { "epoch": 0.9455927773818009, "grad_norm": 0.09611881557589651, "learning_rate": 4.0072788345755956e-05, "loss": 0.1421, "step": 7960 }, { "epoch": 0.9467807080066524, "grad_norm": 0.10581358000076017, "learning_rate": 4.0047196393979616e-05, "loss": 0.1441, "step": 7970 }, { "epoch": 0.9479686386315039, "grad_norm": 0.09391595930554066, "learning_rate": 4.0021579692788816e-05, "loss": 0.1448, "step": 7980 }, { "epoch": 0.9491565692563554, "grad_norm": 0.09308613272047339, "learning_rate": 3.999593828431754e-05, "loss": 0.147, "step": 7990 }, { "epoch": 0.950344499881207, "grad_norm": 0.09407725523617397, "learning_rate": 3.997027221074041e-05, "loss": 0.146, "step": 8000 }, { "epoch": 0.9515324305060584, "grad_norm": 0.08983470428425382, "learning_rate": 3.994458151427263e-05, "loss": 0.1447, "step": 8010 }, { "epoch": 0.95272036113091, "grad_norm": 0.09496406570014611, "learning_rate": 3.991886623716988e-05, "loss": 0.1457, "step": 8020 }, { "epoch": 0.9539082917557614, "grad_norm": 0.09365970725335483, "learning_rate": 3.989312642172828e-05, "loss": 0.144, "step": 8030 }, { "epoch": 0.955096222380613, "grad_norm": 0.08961623351961047, "learning_rate": 3.9867362110284327e-05, "loss": 0.1474, "step": 8040 }, { "epoch": 0.9562841530054644, "grad_norm": 0.09078722598536862, "learning_rate": 3.9841573345214784e-05, "loss": 0.1448, "step": 8050 }, { "epoch": 0.957472083630316, "grad_norm": 0.09226344125428644, "learning_rate": 3.9815760168936645e-05, "loss": 0.1459, "step": 8060 }, { "epoch": 0.9586600142551674, "grad_norm": 0.09193661559935834, "learning_rate": 3.9789922623907064e-05, "loss": 0.1461, "step": 8070 }, { "epoch": 0.959847944880019, "grad_norm": 0.09019198483110191, "learning_rate": 3.976406075262324e-05, "loss": 0.1449, "step": 8080 }, { "epoch": 0.9610358755048705, "grad_norm": 0.09183441350477109, "learning_rate": 3.973817459762244e-05, "loss": 0.1448, "step": 8090 }, { "epoch": 0.962223806129722, "grad_norm": 0.0935146806475464, "learning_rate": 3.9712264201481834e-05, "loss": 0.1452, "step": 8100 }, { "epoch": 0.9634117367545735, "grad_norm": 0.09672399441703464, "learning_rate": 3.9686329606818475e-05, "loss": 0.1463, "step": 8110 }, { "epoch": 0.964599667379425, "grad_norm": 0.09513892963971958, "learning_rate": 3.966037085628921e-05, "loss": 0.1437, "step": 8120 }, { "epoch": 0.9657875980042766, "grad_norm": 0.09384351382420242, "learning_rate": 3.9634387992590625e-05, "loss": 0.1479, "step": 8130 }, { "epoch": 0.9669755286291281, "grad_norm": 0.09629765719892774, "learning_rate": 3.960838105845897e-05, "loss": 0.1452, "step": 8140 }, { "epoch": 0.9681634592539796, "grad_norm": 0.08785642380547937, "learning_rate": 3.958235009667006e-05, "loss": 0.1457, "step": 8150 }, { "epoch": 0.9693513898788311, "grad_norm": 0.0909358128485166, "learning_rate": 3.955629515003927e-05, "loss": 0.1459, "step": 8160 }, { "epoch": 0.9705393205036826, "grad_norm": 0.09207913015511901, "learning_rate": 3.95302162614214e-05, "loss": 0.1467, "step": 8170 }, { "epoch": 0.9717272511285341, "grad_norm": 0.09641967568176801, "learning_rate": 3.950411347371064e-05, "loss": 0.1431, "step": 8180 }, { "epoch": 0.9729151817533856, "grad_norm": 0.09016412250555218, "learning_rate": 3.947798682984046e-05, "loss": 0.1439, "step": 8190 }, { "epoch": 0.9741031123782371, "grad_norm": 0.09194551609436193, "learning_rate": 3.945183637278364e-05, "loss": 0.1473, "step": 8200 }, { "epoch": 0.9752910430030887, "grad_norm": 0.09077590143054295, "learning_rate": 3.942566214555203e-05, "loss": 0.143, "step": 8210 }, { "epoch": 0.9764789736279401, "grad_norm": 0.09005877967329294, "learning_rate": 3.9399464191196643e-05, "loss": 0.1411, "step": 8220 }, { "epoch": 0.9776669042527917, "grad_norm": 0.09027685834126463, "learning_rate": 3.9373242552807514e-05, "loss": 0.1436, "step": 8230 }, { "epoch": 0.9788548348776431, "grad_norm": 0.08730728137831195, "learning_rate": 3.9346997273513606e-05, "loss": 0.1467, "step": 8240 }, { "epoch": 0.9800427655024947, "grad_norm": 0.09411724805548273, "learning_rate": 3.9320728396482795e-05, "loss": 0.1413, "step": 8250 }, { "epoch": 0.9812306961273461, "grad_norm": 0.09312228111679173, "learning_rate": 3.9294435964921736e-05, "loss": 0.1403, "step": 8260 }, { "epoch": 0.9824186267521977, "grad_norm": 0.09258851786359996, "learning_rate": 3.926812002207586e-05, "loss": 0.1468, "step": 8270 }, { "epoch": 0.9836065573770492, "grad_norm": 0.09302984307448785, "learning_rate": 3.924178061122926e-05, "loss": 0.1465, "step": 8280 }, { "epoch": 0.9847944880019007, "grad_norm": 0.09936327905543893, "learning_rate": 3.921541777570461e-05, "loss": 0.1456, "step": 8290 }, { "epoch": 0.9859824186267522, "grad_norm": 0.09186611627499243, "learning_rate": 3.918903155886314e-05, "loss": 0.1447, "step": 8300 }, { "epoch": 0.9871703492516037, "grad_norm": 0.09737268264078248, "learning_rate": 3.916262200410451e-05, "loss": 0.1463, "step": 8310 }, { "epoch": 0.9883582798764552, "grad_norm": 0.09116167159594729, "learning_rate": 3.9136189154866784e-05, "loss": 0.1431, "step": 8320 }, { "epoch": 0.9895462105013068, "grad_norm": 0.09819003341649268, "learning_rate": 3.9109733054626354e-05, "loss": 0.1441, "step": 8330 }, { "epoch": 0.9907341411261582, "grad_norm": 0.09258435755592904, "learning_rate": 3.908325374689781e-05, "loss": 0.1428, "step": 8340 }, { "epoch": 0.9919220717510098, "grad_norm": 0.09060058590753481, "learning_rate": 3.905675127523396e-05, "loss": 0.143, "step": 8350 }, { "epoch": 0.9931100023758612, "grad_norm": 0.0924372912088493, "learning_rate": 3.903022568322567e-05, "loss": 0.1441, "step": 8360 }, { "epoch": 0.9942979330007128, "grad_norm": 0.09030133209971872, "learning_rate": 3.900367701450189e-05, "loss": 0.1455, "step": 8370 }, { "epoch": 0.9954858636255642, "grad_norm": 0.09509821651932694, "learning_rate": 3.897710531272947e-05, "loss": 0.1414, "step": 8380 }, { "epoch": 0.9966737942504158, "grad_norm": 0.09350612078129233, "learning_rate": 3.895051062161316e-05, "loss": 0.1433, "step": 8390 }, { "epoch": 0.9978617248752673, "grad_norm": 0.09527285909925803, "learning_rate": 3.892389298489555e-05, "loss": 0.1429, "step": 8400 }, { "epoch": 0.9990496555001188, "grad_norm": 0.08680471102015792, "learning_rate": 3.8897252446356934e-05, "loss": 0.141, "step": 8410 }, { "epoch": 1.0002375861249704, "grad_norm": 0.09184548106070618, "learning_rate": 3.88705890498153e-05, "loss": 0.14, "step": 8420 }, { "epoch": 1.0014255167498218, "grad_norm": 0.08761263753655299, "learning_rate": 3.8843902839126215e-05, "loss": 0.1254, "step": 8430 }, { "epoch": 1.0026134473746733, "grad_norm": 0.09142716088379588, "learning_rate": 3.881719385818279e-05, "loss": 0.125, "step": 8440 }, { "epoch": 1.0038013779995247, "grad_norm": 0.09715952314824622, "learning_rate": 3.8790462150915566e-05, "loss": 0.1222, "step": 8450 }, { "epoch": 1.0049893086243764, "grad_norm": 0.09093491676801584, "learning_rate": 3.876370776129248e-05, "loss": 0.1233, "step": 8460 }, { "epoch": 1.0061772392492279, "grad_norm": 0.0893632646051428, "learning_rate": 3.873693073331877e-05, "loss": 0.1244, "step": 8470 }, { "epoch": 1.0073651698740793, "grad_norm": 0.09309025743982981, "learning_rate": 3.8710131111036904e-05, "loss": 0.1259, "step": 8480 }, { "epoch": 1.0085531004989308, "grad_norm": 0.0928802609855727, "learning_rate": 3.868330893852653e-05, "loss": 0.1209, "step": 8490 }, { "epoch": 1.0097410311237824, "grad_norm": 0.09219604082104503, "learning_rate": 3.8656464259904375e-05, "loss": 0.1256, "step": 8500 }, { "epoch": 1.010928961748634, "grad_norm": 0.0900064429020332, "learning_rate": 3.862959711932418e-05, "loss": 0.1232, "step": 8510 }, { "epoch": 1.0121168923734853, "grad_norm": 0.09216244972984075, "learning_rate": 3.860270756097663e-05, "loss": 0.1242, "step": 8520 }, { "epoch": 1.0133048229983368, "grad_norm": 0.10160994124867355, "learning_rate": 3.8575795629089294e-05, "loss": 0.1224, "step": 8530 }, { "epoch": 1.0144927536231885, "grad_norm": 0.0940469799903498, "learning_rate": 3.8548861367926534e-05, "loss": 0.1232, "step": 8540 }, { "epoch": 1.01568068424804, "grad_norm": 0.0925567595244161, "learning_rate": 3.8521904821789447e-05, "loss": 0.1217, "step": 8550 }, { "epoch": 1.0168686148728914, "grad_norm": 0.09080114418888581, "learning_rate": 3.849492603501577e-05, "loss": 0.1215, "step": 8560 }, { "epoch": 1.0180565454977428, "grad_norm": 0.0919045630350627, "learning_rate": 3.8467925051979816e-05, "loss": 0.1193, "step": 8570 }, { "epoch": 1.0192444761225945, "grad_norm": 0.08929494379867599, "learning_rate": 3.8440901917092434e-05, "loss": 0.1255, "step": 8580 }, { "epoch": 1.020432406747446, "grad_norm": 0.08959475884717845, "learning_rate": 3.841385667480089e-05, "loss": 0.122, "step": 8590 }, { "epoch": 1.0216203373722974, "grad_norm": 0.09097704150826598, "learning_rate": 3.83867893695888e-05, "loss": 0.1254, "step": 8600 }, { "epoch": 1.0228082679971489, "grad_norm": 0.08880813873771468, "learning_rate": 3.83597000459761e-05, "loss": 0.1217, "step": 8610 }, { "epoch": 1.0239961986220005, "grad_norm": 0.09062269829725074, "learning_rate": 3.83325887485189e-05, "loss": 0.1226, "step": 8620 }, { "epoch": 1.025184129246852, "grad_norm": 0.09495776792733522, "learning_rate": 3.830545552180951e-05, "loss": 0.1211, "step": 8630 }, { "epoch": 1.0263720598717034, "grad_norm": 0.09031071091420967, "learning_rate": 3.827830041047624e-05, "loss": 0.1253, "step": 8640 }, { "epoch": 1.027559990496555, "grad_norm": 0.0963573526699911, "learning_rate": 3.8251123459183466e-05, "loss": 0.1239, "step": 8650 }, { "epoch": 1.0287479211214066, "grad_norm": 0.0893261628792356, "learning_rate": 3.822392471263142e-05, "loss": 0.1267, "step": 8660 }, { "epoch": 1.029935851746258, "grad_norm": 0.08994832741065324, "learning_rate": 3.8196704215556235e-05, "loss": 0.1237, "step": 8670 }, { "epoch": 1.0311237823711095, "grad_norm": 0.0903129655957455, "learning_rate": 3.8169462012729784e-05, "loss": 0.1207, "step": 8680 }, { "epoch": 1.0323117129959611, "grad_norm": 0.09026569469880734, "learning_rate": 3.814219814895965e-05, "loss": 0.1231, "step": 8690 }, { "epoch": 1.0334996436208126, "grad_norm": 0.09325819939936558, "learning_rate": 3.811491266908906e-05, "loss": 0.1231, "step": 8700 }, { "epoch": 1.034687574245664, "grad_norm": 0.09024209362876745, "learning_rate": 3.808760561799678e-05, "loss": 0.124, "step": 8710 }, { "epoch": 1.0358755048705155, "grad_norm": 0.09755161944305694, "learning_rate": 3.806027704059706e-05, "loss": 0.1231, "step": 8720 }, { "epoch": 1.0370634354953672, "grad_norm": 0.091106074051233, "learning_rate": 3.803292698183955e-05, "loss": 0.1251, "step": 8730 }, { "epoch": 1.0382513661202186, "grad_norm": 0.09017469631426388, "learning_rate": 3.800555548670924e-05, "loss": 0.1219, "step": 8740 }, { "epoch": 1.03943929674507, "grad_norm": 0.08989697433973447, "learning_rate": 3.797816260022639e-05, "loss": 0.1243, "step": 8750 }, { "epoch": 1.0406272273699215, "grad_norm": 0.09600775510944254, "learning_rate": 3.7950748367446396e-05, "loss": 0.1228, "step": 8760 }, { "epoch": 1.0418151579947732, "grad_norm": 0.09570585337166992, "learning_rate": 3.7923312833459837e-05, "loss": 0.1233, "step": 8770 }, { "epoch": 1.0430030886196247, "grad_norm": 0.10540435893206922, "learning_rate": 3.7895856043392255e-05, "loss": 0.1243, "step": 8780 }, { "epoch": 1.044191019244476, "grad_norm": 0.08858044671281895, "learning_rate": 3.78683780424042e-05, "loss": 0.1203, "step": 8790 }, { "epoch": 1.0453789498693276, "grad_norm": 0.09146626934783599, "learning_rate": 3.784087887569111e-05, "loss": 0.1208, "step": 8800 }, { "epoch": 1.0465668804941792, "grad_norm": 0.09426394194175677, "learning_rate": 3.78133585884832e-05, "loss": 0.1231, "step": 8810 }, { "epoch": 1.0477548111190307, "grad_norm": 0.09171134261936235, "learning_rate": 3.7785817226045467e-05, "loss": 0.1248, "step": 8820 }, { "epoch": 1.0489427417438821, "grad_norm": 0.08655640254989366, "learning_rate": 3.775825483367754e-05, "loss": 0.126, "step": 8830 }, { "epoch": 1.0501306723687336, "grad_norm": 0.09578808965248974, "learning_rate": 3.773067145671366e-05, "loss": 0.1228, "step": 8840 }, { "epoch": 1.0513186029935853, "grad_norm": 0.0909385634412012, "learning_rate": 3.770306714052256e-05, "loss": 0.1241, "step": 8850 }, { "epoch": 1.0525065336184367, "grad_norm": 0.09254696333383722, "learning_rate": 3.7675441930507435e-05, "loss": 0.121, "step": 8860 }, { "epoch": 1.0536944642432882, "grad_norm": 0.08957294925818811, "learning_rate": 3.7647795872105836e-05, "loss": 0.1217, "step": 8870 }, { "epoch": 1.0548823948681396, "grad_norm": 0.09382186086765036, "learning_rate": 3.7620129010789596e-05, "loss": 0.1214, "step": 8880 }, { "epoch": 1.0560703254929913, "grad_norm": 0.09650702920100285, "learning_rate": 3.7592441392064793e-05, "loss": 0.1233, "step": 8890 }, { "epoch": 1.0572582561178427, "grad_norm": 0.08756725567924217, "learning_rate": 3.7564733061471614e-05, "loss": 0.1234, "step": 8900 }, { "epoch": 1.0584461867426942, "grad_norm": 0.09096769298380533, "learning_rate": 3.753700406458434e-05, "loss": 0.1229, "step": 8910 }, { "epoch": 1.0596341173675456, "grad_norm": 0.09256005851949851, "learning_rate": 3.7509254447011214e-05, "loss": 0.1233, "step": 8920 }, { "epoch": 1.0608220479923973, "grad_norm": 0.09340500385077853, "learning_rate": 3.748148425439442e-05, "loss": 0.1215, "step": 8930 }, { "epoch": 1.0620099786172488, "grad_norm": 0.0923168099366343, "learning_rate": 3.7453693532409983e-05, "loss": 0.1233, "step": 8940 }, { "epoch": 1.0631979092421002, "grad_norm": 0.10073019352302921, "learning_rate": 3.742588232676767e-05, "loss": 0.1281, "step": 8950 }, { "epoch": 1.0643858398669517, "grad_norm": 0.0935530124803195, "learning_rate": 3.7398050683210975e-05, "loss": 0.12, "step": 8960 }, { "epoch": 1.0655737704918034, "grad_norm": 0.08891525402367767, "learning_rate": 3.737019864751698e-05, "loss": 0.1219, "step": 8970 }, { "epoch": 1.0667617011166548, "grad_norm": 0.0974024584579301, "learning_rate": 3.7342326265496305e-05, "loss": 0.1269, "step": 8980 }, { "epoch": 1.0679496317415063, "grad_norm": 0.09146682070207333, "learning_rate": 3.731443358299307e-05, "loss": 0.1233, "step": 8990 }, { "epoch": 1.0691375623663577, "grad_norm": 0.09081190825200626, "learning_rate": 3.728652064588475e-05, "loss": 0.1236, "step": 9000 }, { "epoch": 1.0703254929912094, "grad_norm": 0.08970746942164767, "learning_rate": 3.7258587500082154e-05, "loss": 0.1246, "step": 9010 }, { "epoch": 1.0715134236160608, "grad_norm": 0.09654731948303738, "learning_rate": 3.72306341915293e-05, "loss": 0.1228, "step": 9020 }, { "epoch": 1.0727013542409123, "grad_norm": 0.08883323220192282, "learning_rate": 3.7202660766203425e-05, "loss": 0.1251, "step": 9030 }, { "epoch": 1.0738892848657637, "grad_norm": 0.0956296515081509, "learning_rate": 3.717466727011479e-05, "loss": 0.1224, "step": 9040 }, { "epoch": 1.0750772154906154, "grad_norm": 0.09005225460272669, "learning_rate": 3.7146653749306724e-05, "loss": 0.1224, "step": 9050 }, { "epoch": 1.0762651461154669, "grad_norm": 0.09634844795459742, "learning_rate": 3.711862024985546e-05, "loss": 0.1242, "step": 9060 }, { "epoch": 1.0774530767403183, "grad_norm": 0.09155332801507778, "learning_rate": 3.7090566817870085e-05, "loss": 0.1238, "step": 9070 }, { "epoch": 1.07864100736517, "grad_norm": 0.09078132312713884, "learning_rate": 3.7062493499492504e-05, "loss": 0.1249, "step": 9080 }, { "epoch": 1.0798289379900214, "grad_norm": 0.09503296130588743, "learning_rate": 3.70344003408973e-05, "loss": 0.1244, "step": 9090 }, { "epoch": 1.081016868614873, "grad_norm": 0.09370442840775109, "learning_rate": 3.70062873882917e-05, "loss": 0.1246, "step": 9100 }, { "epoch": 1.0822047992397243, "grad_norm": 0.08973298195255432, "learning_rate": 3.6978154687915496e-05, "loss": 0.1254, "step": 9110 }, { "epoch": 1.0833927298645758, "grad_norm": 0.09145110880074346, "learning_rate": 3.695000228604095e-05, "loss": 0.1244, "step": 9120 }, { "epoch": 1.0845806604894275, "grad_norm": 0.1005326197761607, "learning_rate": 3.692183022897273e-05, "loss": 0.127, "step": 9130 }, { "epoch": 1.085768591114279, "grad_norm": 0.08996398033792795, "learning_rate": 3.689363856304783e-05, "loss": 0.1212, "step": 9140 }, { "epoch": 1.0869565217391304, "grad_norm": 0.09125458941696586, "learning_rate": 3.6865427334635506e-05, "loss": 0.1244, "step": 9150 }, { "epoch": 1.088144452363982, "grad_norm": 0.08981972000260369, "learning_rate": 3.683719659013719e-05, "loss": 0.1237, "step": 9160 }, { "epoch": 1.0893323829888335, "grad_norm": 0.09368408701308531, "learning_rate": 3.6808946375986404e-05, "loss": 0.1219, "step": 9170 }, { "epoch": 1.090520313613685, "grad_norm": 0.09161695894273823, "learning_rate": 3.678067673864869e-05, "loss": 0.12, "step": 9180 }, { "epoch": 1.0917082442385364, "grad_norm": 0.099277236356516, "learning_rate": 3.6752387724621564e-05, "loss": 0.1283, "step": 9190 }, { "epoch": 1.092896174863388, "grad_norm": 0.09547067978066773, "learning_rate": 3.672407938043438e-05, "loss": 0.1227, "step": 9200 }, { "epoch": 1.0940841054882395, "grad_norm": 0.10076524045800156, "learning_rate": 3.669575175264829e-05, "loss": 0.1221, "step": 9210 }, { "epoch": 1.095272036113091, "grad_norm": 0.09678033498839739, "learning_rate": 3.66674048878562e-05, "loss": 0.1252, "step": 9220 }, { "epoch": 1.0964599667379424, "grad_norm": 0.08619524997648742, "learning_rate": 3.6639038832682607e-05, "loss": 0.1204, "step": 9230 }, { "epoch": 1.0976478973627941, "grad_norm": 0.09638850751457168, "learning_rate": 3.661065363378361e-05, "loss": 0.124, "step": 9240 }, { "epoch": 1.0988358279876456, "grad_norm": 0.09351587376469017, "learning_rate": 3.6582249337846766e-05, "loss": 0.1227, "step": 9250 }, { "epoch": 1.100023758612497, "grad_norm": 0.09720177876995593, "learning_rate": 3.6553825991591064e-05, "loss": 0.1259, "step": 9260 }, { "epoch": 1.1012116892373485, "grad_norm": 0.08993816757340518, "learning_rate": 3.652538364176682e-05, "loss": 0.1201, "step": 9270 }, { "epoch": 1.1023996198622001, "grad_norm": 0.08992694569440267, "learning_rate": 3.64969223351556e-05, "loss": 0.1267, "step": 9280 }, { "epoch": 1.1035875504870516, "grad_norm": 0.08832107330163436, "learning_rate": 3.6468442118570165e-05, "loss": 0.1223, "step": 9290 }, { "epoch": 1.104775481111903, "grad_norm": 0.09404567807597368, "learning_rate": 3.643994303885437e-05, "loss": 0.1248, "step": 9300 }, { "epoch": 1.1059634117367545, "grad_norm": 0.0906877126212463, "learning_rate": 3.6411425142883084e-05, "loss": 0.1226, "step": 9310 }, { "epoch": 1.1071513423616062, "grad_norm": 0.09104948375246719, "learning_rate": 3.638288847756216e-05, "loss": 0.1217, "step": 9320 }, { "epoch": 1.1083392729864576, "grad_norm": 0.0939910843852219, "learning_rate": 3.6354333089828266e-05, "loss": 0.1229, "step": 9330 }, { "epoch": 1.109527203611309, "grad_norm": 0.09346553829322472, "learning_rate": 3.632575902664893e-05, "loss": 0.1234, "step": 9340 }, { "epoch": 1.1107151342361605, "grad_norm": 0.09057434676249931, "learning_rate": 3.629716633502233e-05, "loss": 0.1247, "step": 9350 }, { "epoch": 1.1119030648610122, "grad_norm": 0.09410224428011067, "learning_rate": 3.626855506197735e-05, "loss": 0.1229, "step": 9360 }, { "epoch": 1.1130909954858637, "grad_norm": 0.09698567002425222, "learning_rate": 3.623992525457338e-05, "loss": 0.1263, "step": 9370 }, { "epoch": 1.114278926110715, "grad_norm": 0.08775166902211826, "learning_rate": 3.621127695990034e-05, "loss": 0.123, "step": 9380 }, { "epoch": 1.1154668567355666, "grad_norm": 0.10642672818020185, "learning_rate": 3.618261022507853e-05, "loss": 0.1236, "step": 9390 }, { "epoch": 1.1166547873604182, "grad_norm": 0.09238998309399579, "learning_rate": 3.615392509725858e-05, "loss": 0.1252, "step": 9400 }, { "epoch": 1.1178427179852697, "grad_norm": 0.09454819936469375, "learning_rate": 3.6125221623621396e-05, "loss": 0.1243, "step": 9410 }, { "epoch": 1.1190306486101211, "grad_norm": 0.09524254693376145, "learning_rate": 3.609649985137802e-05, "loss": 0.1224, "step": 9420 }, { "epoch": 1.1202185792349726, "grad_norm": 0.09056672944027683, "learning_rate": 3.606775982776964e-05, "loss": 0.1235, "step": 9430 }, { "epoch": 1.1214065098598243, "grad_norm": 0.09299509294133763, "learning_rate": 3.6039001600067414e-05, "loss": 0.1194, "step": 9440 }, { "epoch": 1.1225944404846757, "grad_norm": 0.09209938682929372, "learning_rate": 3.601022521557248e-05, "loss": 0.1229, "step": 9450 }, { "epoch": 1.1237823711095272, "grad_norm": 0.08815408889932083, "learning_rate": 3.5981430721615824e-05, "loss": 0.1257, "step": 9460 }, { "epoch": 1.1249703017343786, "grad_norm": 0.09546690937370418, "learning_rate": 3.5952618165558215e-05, "loss": 0.1223, "step": 9470 }, { "epoch": 1.1261582323592303, "grad_norm": 0.09529533139784171, "learning_rate": 3.592378759479014e-05, "loss": 0.1242, "step": 9480 }, { "epoch": 1.1273461629840817, "grad_norm": 0.09399750448222878, "learning_rate": 3.589493905673171e-05, "loss": 0.1217, "step": 9490 }, { "epoch": 1.1285340936089332, "grad_norm": 0.09240303524674581, "learning_rate": 3.58660725988326e-05, "loss": 0.1246, "step": 9500 }, { "epoch": 1.1297220242337849, "grad_norm": 0.09527358281616312, "learning_rate": 3.583718826857192e-05, "loss": 0.1228, "step": 9510 }, { "epoch": 1.1309099548586363, "grad_norm": 0.09168899448462929, "learning_rate": 3.580828611345823e-05, "loss": 0.1229, "step": 9520 }, { "epoch": 1.1320978854834878, "grad_norm": 0.10158980720295487, "learning_rate": 3.577936618102938e-05, "loss": 0.1257, "step": 9530 }, { "epoch": 1.1332858161083392, "grad_norm": 0.09722225401517444, "learning_rate": 3.575042851885245e-05, "loss": 0.1236, "step": 9540 }, { "epoch": 1.1344737467331907, "grad_norm": 0.08954533822353475, "learning_rate": 3.572147317452372e-05, "loss": 0.1215, "step": 9550 }, { "epoch": 1.1356616773580424, "grad_norm": 0.09614577138114505, "learning_rate": 3.5692500195668505e-05, "loss": 0.1252, "step": 9560 }, { "epoch": 1.1368496079828938, "grad_norm": 0.09019890493852852, "learning_rate": 3.566350962994116e-05, "loss": 0.1253, "step": 9570 }, { "epoch": 1.1380375386077453, "grad_norm": 0.09033803017728789, "learning_rate": 3.563450152502495e-05, "loss": 0.1213, "step": 9580 }, { "epoch": 1.139225469232597, "grad_norm": 0.09041482666719639, "learning_rate": 3.560547592863199e-05, "loss": 0.1203, "step": 9590 }, { "epoch": 1.1404133998574484, "grad_norm": 0.08682425922466168, "learning_rate": 3.557643288850318e-05, "loss": 0.1236, "step": 9600 }, { "epoch": 1.1416013304822998, "grad_norm": 0.09127760240791419, "learning_rate": 3.5547372452408084e-05, "loss": 0.1244, "step": 9610 }, { "epoch": 1.1427892611071513, "grad_norm": 0.09808531638708518, "learning_rate": 3.551829466814491e-05, "loss": 0.121, "step": 9620 }, { "epoch": 1.1439771917320027, "grad_norm": 0.09045819715045189, "learning_rate": 3.548919958354037e-05, "loss": 0.1221, "step": 9630 }, { "epoch": 1.1451651223568544, "grad_norm": 0.0961750599604424, "learning_rate": 3.5460087246449644e-05, "loss": 0.1194, "step": 9640 }, { "epoch": 1.1463530529817059, "grad_norm": 0.09211239883259079, "learning_rate": 3.5430957704756304e-05, "loss": 0.1213, "step": 9650 }, { "epoch": 1.1475409836065573, "grad_norm": 0.09246119454808378, "learning_rate": 3.5401811006372196e-05, "loss": 0.1232, "step": 9660 }, { "epoch": 1.148728914231409, "grad_norm": 0.0929747459971951, "learning_rate": 3.53726471992374e-05, "loss": 0.122, "step": 9670 }, { "epoch": 1.1499168448562604, "grad_norm": 0.09201856162177256, "learning_rate": 3.5343466331320114e-05, "loss": 0.1248, "step": 9680 }, { "epoch": 1.151104775481112, "grad_norm": 0.089717350380086, "learning_rate": 3.5314268450616636e-05, "loss": 0.1226, "step": 9690 }, { "epoch": 1.1522927061059633, "grad_norm": 0.09373012556618195, "learning_rate": 3.528505360515121e-05, "loss": 0.1232, "step": 9700 }, { "epoch": 1.1534806367308148, "grad_norm": 0.08729646004978567, "learning_rate": 3.5255821842976005e-05, "loss": 0.1239, "step": 9710 }, { "epoch": 1.1546685673556665, "grad_norm": 0.09031502679344239, "learning_rate": 3.522657321217101e-05, "loss": 0.1219, "step": 9720 }, { "epoch": 1.155856497980518, "grad_norm": 0.0930124728247515, "learning_rate": 3.519730776084395e-05, "loss": 0.1232, "step": 9730 }, { "epoch": 1.1570444286053694, "grad_norm": 0.09199537789717985, "learning_rate": 3.516802553713023e-05, "loss": 0.1218, "step": 9740 }, { "epoch": 1.158232359230221, "grad_norm": 0.09717347717213397, "learning_rate": 3.513872658919282e-05, "loss": 0.121, "step": 9750 }, { "epoch": 1.1594202898550725, "grad_norm": 0.08964070130968972, "learning_rate": 3.5109410965222226e-05, "loss": 0.1228, "step": 9760 }, { "epoch": 1.160608220479924, "grad_norm": 0.09092041761405396, "learning_rate": 3.5080078713436366e-05, "loss": 0.126, "step": 9770 }, { "epoch": 1.1617961511047754, "grad_norm": 0.08940403258264368, "learning_rate": 3.505072988208051e-05, "loss": 0.1212, "step": 9780 }, { "epoch": 1.162984081729627, "grad_norm": 0.0912061059917945, "learning_rate": 3.5021364519427205e-05, "loss": 0.1253, "step": 9790 }, { "epoch": 1.1641720123544785, "grad_norm": 0.09423261313541892, "learning_rate": 3.499198267377616e-05, "loss": 0.1235, "step": 9800 }, { "epoch": 1.16535994297933, "grad_norm": 0.09253833927202909, "learning_rate": 3.4962584393454244e-05, "loss": 0.1226, "step": 9810 }, { "epoch": 1.1665478736041814, "grad_norm": 0.09916770374701597, "learning_rate": 3.493316972681531e-05, "loss": 0.1227, "step": 9820 }, { "epoch": 1.1677358042290331, "grad_norm": 0.09611683059299217, "learning_rate": 3.49037387222402e-05, "loss": 0.122, "step": 9830 }, { "epoch": 1.1689237348538846, "grad_norm": 0.09569421299478786, "learning_rate": 3.4874291428136586e-05, "loss": 0.1217, "step": 9840 }, { "epoch": 1.170111665478736, "grad_norm": 0.09372157952226472, "learning_rate": 3.4844827892938985e-05, "loss": 0.1201, "step": 9850 }, { "epoch": 1.1712995961035875, "grad_norm": 0.09031261237191125, "learning_rate": 3.4815348165108585e-05, "loss": 0.1187, "step": 9860 }, { "epoch": 1.1724875267284391, "grad_norm": 0.0894890920244584, "learning_rate": 3.4785852293133224e-05, "loss": 0.1238, "step": 9870 }, { "epoch": 1.1736754573532906, "grad_norm": 0.09106710046785663, "learning_rate": 3.4756340325527305e-05, "loss": 0.1241, "step": 9880 }, { "epoch": 1.174863387978142, "grad_norm": 0.0906029791710071, "learning_rate": 3.4726812310831676e-05, "loss": 0.1254, "step": 9890 }, { "epoch": 1.1760513186029935, "grad_norm": 0.09316236369804001, "learning_rate": 3.469726829761361e-05, "loss": 0.1228, "step": 9900 }, { "epoch": 1.1772392492278452, "grad_norm": 0.08882569762714383, "learning_rate": 3.4667708334466655e-05, "loss": 0.1216, "step": 9910 }, { "epoch": 1.1784271798526966, "grad_norm": 0.0931316951648619, "learning_rate": 3.4638132470010645e-05, "loss": 0.1224, "step": 9920 }, { "epoch": 1.179615110477548, "grad_norm": 0.09499858021620851, "learning_rate": 3.4608540752891525e-05, "loss": 0.1212, "step": 9930 }, { "epoch": 1.1808030411023995, "grad_norm": 0.0917408530638084, "learning_rate": 3.4578933231781336e-05, "loss": 0.1217, "step": 9940 }, { "epoch": 1.1819909717272512, "grad_norm": 0.0850723397992959, "learning_rate": 3.4549309955378107e-05, "loss": 0.1223, "step": 9950 }, { "epoch": 1.1831789023521027, "grad_norm": 0.0943192678288103, "learning_rate": 3.4519670972405776e-05, "loss": 0.1254, "step": 9960 }, { "epoch": 1.184366832976954, "grad_norm": 0.09436260463292305, "learning_rate": 3.4490016331614124e-05, "loss": 0.1225, "step": 9970 }, { "epoch": 1.1855547636018056, "grad_norm": 0.09178344663013545, "learning_rate": 3.446034608177869e-05, "loss": 0.1227, "step": 9980 }, { "epoch": 1.1867426942266572, "grad_norm": 0.0989726282708995, "learning_rate": 3.443066027170068e-05, "loss": 0.1224, "step": 9990 }, { "epoch": 1.1879306248515087, "grad_norm": 0.09472734962982736, "learning_rate": 3.4400958950206876e-05, "loss": 0.1239, "step": 10000 }, { "epoch": 1.1891185554763601, "grad_norm": 0.09474470674997743, "learning_rate": 3.437124216614959e-05, "loss": 0.1245, "step": 10010 }, { "epoch": 1.1903064861012118, "grad_norm": 0.09325066448728793, "learning_rate": 3.4341509968406576e-05, "loss": 0.1238, "step": 10020 }, { "epoch": 1.1914944167260633, "grad_norm": 0.090551479439998, "learning_rate": 3.431176240588092e-05, "loss": 0.123, "step": 10030 }, { "epoch": 1.1926823473509147, "grad_norm": 0.08907088080869321, "learning_rate": 3.4281999527501e-05, "loss": 0.1236, "step": 10040 }, { "epoch": 1.1938702779757662, "grad_norm": 0.08983263237757466, "learning_rate": 3.425222138222037e-05, "loss": 0.1218, "step": 10050 }, { "epoch": 1.1950582086006176, "grad_norm": 0.09010603985812038, "learning_rate": 3.42224280190177e-05, "loss": 0.1219, "step": 10060 }, { "epoch": 1.1962461392254693, "grad_norm": 0.09471893806414104, "learning_rate": 3.419261948689669e-05, "loss": 0.1231, "step": 10070 }, { "epoch": 1.1974340698503207, "grad_norm": 0.09418481437856063, "learning_rate": 3.4162795834885984e-05, "loss": 0.124, "step": 10080 }, { "epoch": 1.1986220004751722, "grad_norm": 0.09168021557656164, "learning_rate": 3.413295711203911e-05, "loss": 0.1198, "step": 10090 }, { "epoch": 1.1998099311000239, "grad_norm": 0.08874613336250081, "learning_rate": 3.4103103367434365e-05, "loss": 0.1212, "step": 10100 }, { "epoch": 1.2009978617248753, "grad_norm": 0.09615023450453491, "learning_rate": 3.407323465017477e-05, "loss": 0.1183, "step": 10110 }, { "epoch": 1.2021857923497268, "grad_norm": 0.09483607571837216, "learning_rate": 3.4043351009387964e-05, "loss": 0.1198, "step": 10120 }, { "epoch": 1.2033737229745782, "grad_norm": 0.09259804944747915, "learning_rate": 3.401345249422613e-05, "loss": 0.124, "step": 10130 }, { "epoch": 1.2045616535994297, "grad_norm": 0.08991549137494541, "learning_rate": 3.398353915386593e-05, "loss": 0.1238, "step": 10140 }, { "epoch": 1.2057495842242814, "grad_norm": 0.0954769045441978, "learning_rate": 3.3953611037508384e-05, "loss": 0.1204, "step": 10150 }, { "epoch": 1.2069375148491328, "grad_norm": 0.09396073085352308, "learning_rate": 3.3923668194378856e-05, "loss": 0.1228, "step": 10160 }, { "epoch": 1.2081254454739843, "grad_norm": 0.08905069034332287, "learning_rate": 3.389371067372688e-05, "loss": 0.1197, "step": 10170 }, { "epoch": 1.209313376098836, "grad_norm": 0.09457430058105507, "learning_rate": 3.3863738524826184e-05, "loss": 0.1203, "step": 10180 }, { "epoch": 1.2105013067236874, "grad_norm": 0.08711657586107342, "learning_rate": 3.3833751796974514e-05, "loss": 0.1219, "step": 10190 }, { "epoch": 1.2116892373485388, "grad_norm": 0.09436689717129246, "learning_rate": 3.380375053949362e-05, "loss": 0.1192, "step": 10200 }, { "epoch": 1.2128771679733903, "grad_norm": 0.0949241200690866, "learning_rate": 3.377373480172915e-05, "loss": 0.1243, "step": 10210 }, { "epoch": 1.2140650985982417, "grad_norm": 0.08831490540414921, "learning_rate": 3.374370463305056e-05, "loss": 0.1209, "step": 10220 }, { "epoch": 1.2152530292230934, "grad_norm": 0.09009989876537705, "learning_rate": 3.3713660082851036e-05, "loss": 0.1202, "step": 10230 }, { "epoch": 1.2164409598479449, "grad_norm": 0.08737213449196486, "learning_rate": 3.3683601200547424e-05, "loss": 0.1239, "step": 10240 }, { "epoch": 1.2176288904727963, "grad_norm": 0.09075341149652894, "learning_rate": 3.365352803558016e-05, "loss": 0.1207, "step": 10250 }, { "epoch": 1.218816821097648, "grad_norm": 0.0939654166540414, "learning_rate": 3.3623440637413154e-05, "loss": 0.12, "step": 10260 }, { "epoch": 1.2200047517224994, "grad_norm": 0.09455430681936758, "learning_rate": 3.359333905553372e-05, "loss": 0.1226, "step": 10270 }, { "epoch": 1.221192682347351, "grad_norm": 0.08839714349649284, "learning_rate": 3.356322333945252e-05, "loss": 0.1206, "step": 10280 }, { "epoch": 1.2223806129722024, "grad_norm": 0.09057961128417627, "learning_rate": 3.3533093538703454e-05, "loss": 0.1224, "step": 10290 }, { "epoch": 1.223568543597054, "grad_norm": 0.0961834874597904, "learning_rate": 3.350294970284359e-05, "loss": 0.1214, "step": 10300 }, { "epoch": 1.2247564742219055, "grad_norm": 0.08747893790826583, "learning_rate": 3.347279188145308e-05, "loss": 0.1228, "step": 10310 }, { "epoch": 1.225944404846757, "grad_norm": 0.09156949923800402, "learning_rate": 3.344262012413507e-05, "loss": 0.119, "step": 10320 }, { "epoch": 1.2271323354716084, "grad_norm": 0.09308216421311342, "learning_rate": 3.341243448051565e-05, "loss": 0.1269, "step": 10330 }, { "epoch": 1.22832026609646, "grad_norm": 0.08975496587485778, "learning_rate": 3.338223500024373e-05, "loss": 0.1228, "step": 10340 }, { "epoch": 1.2295081967213115, "grad_norm": 0.09377300033009188, "learning_rate": 3.3352021732991e-05, "loss": 0.1187, "step": 10350 }, { "epoch": 1.230696127346163, "grad_norm": 0.08736152737626084, "learning_rate": 3.3321794728451784e-05, "loss": 0.1226, "step": 10360 }, { "epoch": 1.2318840579710144, "grad_norm": 0.08978351472383594, "learning_rate": 3.329155403634305e-05, "loss": 0.1186, "step": 10370 }, { "epoch": 1.233071988595866, "grad_norm": 0.09571039554699264, "learning_rate": 3.326129970640425e-05, "loss": 0.1271, "step": 10380 }, { "epoch": 1.2342599192207175, "grad_norm": 0.09349339839748168, "learning_rate": 3.323103178839729e-05, "loss": 0.1238, "step": 10390 }, { "epoch": 1.235447849845569, "grad_norm": 0.0920152999087287, "learning_rate": 3.320075033210639e-05, "loss": 0.1186, "step": 10400 }, { "epoch": 1.2366357804704204, "grad_norm": 0.08610843088070953, "learning_rate": 3.3170455387338064e-05, "loss": 0.1224, "step": 10410 }, { "epoch": 1.2378237110952721, "grad_norm": 0.08813979433710294, "learning_rate": 3.314014700392101e-05, "loss": 0.1224, "step": 10420 }, { "epoch": 1.2390116417201236, "grad_norm": 0.09091918253251201, "learning_rate": 3.310982523170601e-05, "loss": 0.1262, "step": 10430 }, { "epoch": 1.240199572344975, "grad_norm": 0.09571414204266204, "learning_rate": 3.307949012056592e-05, "loss": 0.1229, "step": 10440 }, { "epoch": 1.2413875029698265, "grad_norm": 0.08941148613529262, "learning_rate": 3.304914172039547e-05, "loss": 0.1195, "step": 10450 }, { "epoch": 1.2425754335946781, "grad_norm": 0.09456381253408537, "learning_rate": 3.301878008111128e-05, "loss": 0.1221, "step": 10460 }, { "epoch": 1.2437633642195296, "grad_norm": 0.09490566857746063, "learning_rate": 3.298840525265175e-05, "loss": 0.1217, "step": 10470 }, { "epoch": 1.244951294844381, "grad_norm": 0.08863333722689429, "learning_rate": 3.295801728497696e-05, "loss": 0.1219, "step": 10480 }, { "epoch": 1.2461392254692325, "grad_norm": 0.08860345308014132, "learning_rate": 3.2927616228068605e-05, "loss": 0.1221, "step": 10490 }, { "epoch": 1.2473271560940842, "grad_norm": 0.09428459073492103, "learning_rate": 3.289720213192991e-05, "loss": 0.1245, "step": 10500 }, { "epoch": 1.2485150867189356, "grad_norm": 0.08780102577225256, "learning_rate": 3.286677504658556e-05, "loss": 0.1211, "step": 10510 }, { "epoch": 1.249703017343787, "grad_norm": 0.09057449243527917, "learning_rate": 3.283633502208158e-05, "loss": 0.1214, "step": 10520 }, { "epoch": 1.2508909479686388, "grad_norm": 0.09192764855897853, "learning_rate": 3.2805882108485284e-05, "loss": 0.1234, "step": 10530 }, { "epoch": 1.2520788785934902, "grad_norm": 0.08654810819046659, "learning_rate": 3.2775416355885204e-05, "loss": 0.1239, "step": 10540 }, { "epoch": 1.2532668092183417, "grad_norm": 0.09386343816644606, "learning_rate": 3.2744937814390974e-05, "loss": 0.1191, "step": 10550 }, { "epoch": 1.2544547398431931, "grad_norm": 0.09241919001822536, "learning_rate": 3.2714446534133256e-05, "loss": 0.1232, "step": 10560 }, { "epoch": 1.2556426704680446, "grad_norm": 0.08964067833910465, "learning_rate": 3.268394256526368e-05, "loss": 0.1224, "step": 10570 }, { "epoch": 1.2568306010928962, "grad_norm": 0.08954181280464248, "learning_rate": 3.265342595795475e-05, "loss": 0.1191, "step": 10580 }, { "epoch": 1.2580185317177477, "grad_norm": 0.09075503452676863, "learning_rate": 3.262289676239973e-05, "loss": 0.1207, "step": 10590 }, { "epoch": 1.2592064623425991, "grad_norm": 0.09669573172375395, "learning_rate": 3.259235502881261e-05, "loss": 0.1218, "step": 10600 }, { "epoch": 1.2603943929674508, "grad_norm": 0.090644651157699, "learning_rate": 3.256180080742801e-05, "loss": 0.1217, "step": 10610 }, { "epoch": 1.2615823235923023, "grad_norm": 0.09264379815761134, "learning_rate": 3.253123414850107e-05, "loss": 0.1251, "step": 10620 }, { "epoch": 1.2627702542171537, "grad_norm": 0.0865724442080419, "learning_rate": 3.2500655102307386e-05, "loss": 0.1177, "step": 10630 }, { "epoch": 1.2639581848420052, "grad_norm": 0.09533662546090818, "learning_rate": 3.247006371914295e-05, "loss": 0.124, "step": 10640 }, { "epoch": 1.2651461154668566, "grad_norm": 0.08856819508094185, "learning_rate": 3.243946004932404e-05, "loss": 0.1188, "step": 10650 }, { "epoch": 1.2663340460917083, "grad_norm": 0.09216974076956319, "learning_rate": 3.2408844143187126e-05, "loss": 0.1234, "step": 10660 }, { "epoch": 1.2675219767165598, "grad_norm": 0.09219047874288781, "learning_rate": 3.237821605108881e-05, "loss": 0.1179, "step": 10670 }, { "epoch": 1.2687099073414112, "grad_norm": 0.09195467720140503, "learning_rate": 3.234757582340575e-05, "loss": 0.1214, "step": 10680 }, { "epoch": 1.2698978379662629, "grad_norm": 0.0903232999235718, "learning_rate": 3.231692351053456e-05, "loss": 0.1214, "step": 10690 }, { "epoch": 1.2710857685911143, "grad_norm": 0.0892285731838692, "learning_rate": 3.2286259162891724e-05, "loss": 0.1229, "step": 10700 }, { "epoch": 1.2722736992159658, "grad_norm": 0.09337580079881677, "learning_rate": 3.2255582830913525e-05, "loss": 0.1194, "step": 10710 }, { "epoch": 1.2734616298408172, "grad_norm": 0.08686210663391465, "learning_rate": 3.222489456505595e-05, "loss": 0.1183, "step": 10720 }, { "epoch": 1.2746495604656687, "grad_norm": 0.09376737936769888, "learning_rate": 3.219419441579463e-05, "loss": 0.1222, "step": 10730 }, { "epoch": 1.2758374910905204, "grad_norm": 0.09426616689499498, "learning_rate": 3.216348243362472e-05, "loss": 0.1234, "step": 10740 }, { "epoch": 1.2770254217153718, "grad_norm": 0.09346336780841889, "learning_rate": 3.213275866906088e-05, "loss": 0.1192, "step": 10750 }, { "epoch": 1.2782133523402233, "grad_norm": 0.08996388205839717, "learning_rate": 3.21020231726371e-05, "loss": 0.1263, "step": 10760 }, { "epoch": 1.279401282965075, "grad_norm": 0.08853304436406388, "learning_rate": 3.207127599490668e-05, "loss": 0.1242, "step": 10770 }, { "epoch": 1.2805892135899264, "grad_norm": 0.08637741571406114, "learning_rate": 3.204051718644216e-05, "loss": 0.123, "step": 10780 }, { "epoch": 1.2817771442147778, "grad_norm": 0.0932987560728318, "learning_rate": 3.2009746797835185e-05, "loss": 0.1186, "step": 10790 }, { "epoch": 1.2829650748396293, "grad_norm": 0.09414801097521111, "learning_rate": 3.1978964879696437e-05, "loss": 0.1207, "step": 10800 }, { "epoch": 1.2841530054644807, "grad_norm": 0.08606700276114809, "learning_rate": 3.194817148265559e-05, "loss": 0.1215, "step": 10810 }, { "epoch": 1.2853409360893324, "grad_norm": 0.08957823866030454, "learning_rate": 3.191736665736118e-05, "loss": 0.1229, "step": 10820 }, { "epoch": 1.2865288667141839, "grad_norm": 0.09566475081216802, "learning_rate": 3.188655045448056e-05, "loss": 0.1218, "step": 10830 }, { "epoch": 1.2877167973390353, "grad_norm": 0.0933161656946844, "learning_rate": 3.185572292469976e-05, "loss": 0.1209, "step": 10840 }, { "epoch": 1.288904727963887, "grad_norm": 0.09183141314820754, "learning_rate": 3.182488411872348e-05, "loss": 0.1254, "step": 10850 }, { "epoch": 1.2900926585887385, "grad_norm": 0.08768983636374292, "learning_rate": 3.1794034087274935e-05, "loss": 0.1206, "step": 10860 }, { "epoch": 1.29128058921359, "grad_norm": 0.09453467472508678, "learning_rate": 3.1763172881095826e-05, "loss": 0.1219, "step": 10870 }, { "epoch": 1.2924685198384414, "grad_norm": 0.09518082969900468, "learning_rate": 3.1732300550946226e-05, "loss": 0.1213, "step": 10880 }, { "epoch": 1.2936564504632928, "grad_norm": 0.09463374265649513, "learning_rate": 3.17014171476045e-05, "loss": 0.1197, "step": 10890 }, { "epoch": 1.2948443810881445, "grad_norm": 0.09156744449858546, "learning_rate": 3.167052272186722e-05, "loss": 0.1217, "step": 10900 }, { "epoch": 1.296032311712996, "grad_norm": 0.09044365650083432, "learning_rate": 3.163961732454913e-05, "loss": 0.1215, "step": 10910 }, { "epoch": 1.2972202423378474, "grad_norm": 0.09023173195399714, "learning_rate": 3.160870100648296e-05, "loss": 0.1216, "step": 10920 }, { "epoch": 1.298408172962699, "grad_norm": 0.09124966949768881, "learning_rate": 3.1577773818519434e-05, "loss": 0.125, "step": 10930 }, { "epoch": 1.2995961035875505, "grad_norm": 0.08887180506895287, "learning_rate": 3.154683581152716e-05, "loss": 0.1241, "step": 10940 }, { "epoch": 1.300784034212402, "grad_norm": 0.09176120411967026, "learning_rate": 3.151588703639252e-05, "loss": 0.1229, "step": 10950 }, { "epoch": 1.3019719648372536, "grad_norm": 0.094666519163694, "learning_rate": 3.1484927544019624e-05, "loss": 0.1203, "step": 10960 }, { "epoch": 1.303159895462105, "grad_norm": 0.09217965579368032, "learning_rate": 3.145395738533021e-05, "loss": 0.1209, "step": 10970 }, { "epoch": 1.3043478260869565, "grad_norm": 0.0946535283088645, "learning_rate": 3.142297661126355e-05, "loss": 0.1262, "step": 10980 }, { "epoch": 1.305535756711808, "grad_norm": 0.08830069202648722, "learning_rate": 3.1391985272776375e-05, "loss": 0.1225, "step": 10990 }, { "epoch": 1.3067236873366594, "grad_norm": 0.09118077501774689, "learning_rate": 3.13609834208428e-05, "loss": 0.1208, "step": 11000 }, { "epoch": 1.3079116179615111, "grad_norm": 0.09154490894399005, "learning_rate": 3.1329971106454234e-05, "loss": 0.1212, "step": 11010 }, { "epoch": 1.3090995485863626, "grad_norm": 0.09262140674092212, "learning_rate": 3.129894838061929e-05, "loss": 0.1219, "step": 11020 }, { "epoch": 1.310287479211214, "grad_norm": 0.09579888994844041, "learning_rate": 3.12679152943637e-05, "loss": 0.1232, "step": 11030 }, { "epoch": 1.3114754098360657, "grad_norm": 0.09097003995873548, "learning_rate": 3.123687189873025e-05, "loss": 0.1219, "step": 11040 }, { "epoch": 1.3126633404609171, "grad_norm": 0.09974223918724895, "learning_rate": 3.1205818244778666e-05, "loss": 0.1237, "step": 11050 }, { "epoch": 1.3138512710857686, "grad_norm": 0.09109458925351054, "learning_rate": 3.117475438358557e-05, "loss": 0.1257, "step": 11060 }, { "epoch": 1.31503920171062, "grad_norm": 0.0914001387585773, "learning_rate": 3.1143680366244344e-05, "loss": 0.1212, "step": 11070 }, { "epoch": 1.3162271323354715, "grad_norm": 0.0858824345032307, "learning_rate": 3.11125962438651e-05, "loss": 0.1191, "step": 11080 }, { "epoch": 1.3174150629603232, "grad_norm": 0.08862006358282304, "learning_rate": 3.108150206757455e-05, "loss": 0.1199, "step": 11090 }, { "epoch": 1.3186029935851746, "grad_norm": 0.09023614664803503, "learning_rate": 3.105039788851596e-05, "loss": 0.1243, "step": 11100 }, { "epoch": 1.319790924210026, "grad_norm": 0.08905872171076704, "learning_rate": 3.101928375784904e-05, "loss": 0.1201, "step": 11110 }, { "epoch": 1.3209788548348778, "grad_norm": 0.09147880240436172, "learning_rate": 3.0988159726749875e-05, "loss": 0.122, "step": 11120 }, { "epoch": 1.3221667854597292, "grad_norm": 0.09106419585948405, "learning_rate": 3.095702584641082e-05, "loss": 0.1227, "step": 11130 }, { "epoch": 1.3233547160845807, "grad_norm": 0.09003075919786994, "learning_rate": 3.092588216804043e-05, "loss": 0.1211, "step": 11140 }, { "epoch": 1.3245426467094321, "grad_norm": 0.09104751245091836, "learning_rate": 3.08947287428634e-05, "loss": 0.1239, "step": 11150 }, { "epoch": 1.3257305773342836, "grad_norm": 0.08655875382424383, "learning_rate": 3.0863565622120436e-05, "loss": 0.1221, "step": 11160 }, { "epoch": 1.3269185079591352, "grad_norm": 0.09071987537710541, "learning_rate": 3.08323928570682e-05, "loss": 0.122, "step": 11170 }, { "epoch": 1.3281064385839867, "grad_norm": 0.08704940483355417, "learning_rate": 3.08012104989792e-05, "loss": 0.1167, "step": 11180 }, { "epoch": 1.3292943692088381, "grad_norm": 0.09257044567679111, "learning_rate": 3.077001859914174e-05, "loss": 0.1236, "step": 11190 }, { "epoch": 1.3304822998336898, "grad_norm": 0.08928137037483373, "learning_rate": 3.0738817208859816e-05, "loss": 0.121, "step": 11200 }, { "epoch": 1.3316702304585413, "grad_norm": 0.09266478711840741, "learning_rate": 3.070760637945303e-05, "loss": 0.1193, "step": 11210 }, { "epoch": 1.3328581610833927, "grad_norm": 0.08680049011012635, "learning_rate": 3.0676386162256505e-05, "loss": 0.1198, "step": 11220 }, { "epoch": 1.3340460917082442, "grad_norm": 0.09328165171325849, "learning_rate": 3.0645156608620815e-05, "loss": 0.1229, "step": 11230 }, { "epoch": 1.3352340223330956, "grad_norm": 0.09220348569812543, "learning_rate": 3.061391776991188e-05, "loss": 0.1199, "step": 11240 }, { "epoch": 1.3364219529579473, "grad_norm": 0.08866424760938028, "learning_rate": 3.0582669697510906e-05, "loss": 0.1197, "step": 11250 }, { "epoch": 1.3376098835827988, "grad_norm": 0.09668942640498793, "learning_rate": 3.055141244281427e-05, "loss": 0.1187, "step": 11260 }, { "epoch": 1.3387978142076502, "grad_norm": 0.09077520175056634, "learning_rate": 3.0520146057233466e-05, "loss": 0.1205, "step": 11270 }, { "epoch": 1.3399857448325019, "grad_norm": 0.09076910104869507, "learning_rate": 3.0488870592194994e-05, "loss": 0.1213, "step": 11280 }, { "epoch": 1.3411736754573533, "grad_norm": 0.08880359571469834, "learning_rate": 3.04575860991403e-05, "loss": 0.1215, "step": 11290 }, { "epoch": 1.3423616060822048, "grad_norm": 0.08813968722558266, "learning_rate": 3.042629262952566e-05, "loss": 0.1218, "step": 11300 }, { "epoch": 1.3435495367070562, "grad_norm": 0.09525655004898499, "learning_rate": 3.0394990234822147e-05, "loss": 0.119, "step": 11310 }, { "epoch": 1.3447374673319077, "grad_norm": 0.09038385434701146, "learning_rate": 3.036367896651548e-05, "loss": 0.118, "step": 11320 }, { "epoch": 1.3459253979567594, "grad_norm": 0.08968087928288042, "learning_rate": 3.033235887610598e-05, "loss": 0.1192, "step": 11330 }, { "epoch": 1.3471133285816108, "grad_norm": 0.09231716670065253, "learning_rate": 3.0301030015108512e-05, "loss": 0.1179, "step": 11340 }, { "epoch": 1.3483012592064623, "grad_norm": 0.09422501564841132, "learning_rate": 3.026969243505232e-05, "loss": 0.1175, "step": 11350 }, { "epoch": 1.349489189831314, "grad_norm": 0.08863243385381814, "learning_rate": 3.023834618748102e-05, "loss": 0.1236, "step": 11360 }, { "epoch": 1.3506771204561654, "grad_norm": 0.09297693873558442, "learning_rate": 3.020699132395246e-05, "loss": 0.119, "step": 11370 }, { "epoch": 1.3518650510810168, "grad_norm": 0.09281973555843419, "learning_rate": 3.017562789603869e-05, "loss": 0.1205, "step": 11380 }, { "epoch": 1.3530529817058683, "grad_norm": 0.09039728760703357, "learning_rate": 3.0144255955325824e-05, "loss": 0.1221, "step": 11390 }, { "epoch": 1.3542409123307197, "grad_norm": 0.09269347172715944, "learning_rate": 3.0112875553413973e-05, "loss": 0.1215, "step": 11400 }, { "epoch": 1.3554288429555714, "grad_norm": 0.09074215548451814, "learning_rate": 3.0081486741917194e-05, "loss": 0.118, "step": 11410 }, { "epoch": 1.3566167735804229, "grad_norm": 0.09145743750935115, "learning_rate": 3.0050089572463337e-05, "loss": 0.1198, "step": 11420 }, { "epoch": 1.3578047042052743, "grad_norm": 0.08987061595599309, "learning_rate": 3.001868409669404e-05, "loss": 0.1204, "step": 11430 }, { "epoch": 1.358992634830126, "grad_norm": 0.09035399806638171, "learning_rate": 2.9987270366264563e-05, "loss": 0.1171, "step": 11440 }, { "epoch": 1.3601805654549775, "grad_norm": 0.091946495460671, "learning_rate": 2.995584843284378e-05, "loss": 0.12, "step": 11450 }, { "epoch": 1.361368496079829, "grad_norm": 0.09733454015573198, "learning_rate": 2.9924418348114035e-05, "loss": 0.1211, "step": 11460 }, { "epoch": 1.3625564267046806, "grad_norm": 0.0880211133646279, "learning_rate": 2.989298016377107e-05, "loss": 0.1193, "step": 11470 }, { "epoch": 1.363744357329532, "grad_norm": 0.08854167466520659, "learning_rate": 2.986153393152398e-05, "loss": 0.121, "step": 11480 }, { "epoch": 1.3649322879543835, "grad_norm": 0.08759377452354525, "learning_rate": 2.9830079703095077e-05, "loss": 0.1215, "step": 11490 }, { "epoch": 1.366120218579235, "grad_norm": 0.09223072646853409, "learning_rate": 2.9798617530219826e-05, "loss": 0.1188, "step": 11500 }, { "epoch": 1.3673081492040864, "grad_norm": 0.09173951312123389, "learning_rate": 2.976714746464676e-05, "loss": 0.1219, "step": 11510 }, { "epoch": 1.368496079828938, "grad_norm": 0.0922207929460813, "learning_rate": 2.9735669558137398e-05, "loss": 0.1196, "step": 11520 }, { "epoch": 1.3696840104537895, "grad_norm": 0.09446608507835662, "learning_rate": 2.970418386246615e-05, "loss": 0.1226, "step": 11530 }, { "epoch": 1.370871941078641, "grad_norm": 0.08816547775474973, "learning_rate": 2.9672690429420236e-05, "loss": 0.122, "step": 11540 }, { "epoch": 1.3720598717034926, "grad_norm": 0.09033150339216577, "learning_rate": 2.964118931079961e-05, "loss": 0.1176, "step": 11550 }, { "epoch": 1.373247802328344, "grad_norm": 0.08790830614595775, "learning_rate": 2.9609680558416863e-05, "loss": 0.1194, "step": 11560 }, { "epoch": 1.3744357329531955, "grad_norm": 0.0932072734184521, "learning_rate": 2.9578164224097154e-05, "loss": 0.1209, "step": 11570 }, { "epoch": 1.375623663578047, "grad_norm": 0.0922176870423602, "learning_rate": 2.9546640359678086e-05, "loss": 0.1192, "step": 11580 }, { "epoch": 1.3768115942028984, "grad_norm": 0.09446024946917643, "learning_rate": 2.951510901700966e-05, "loss": 0.1186, "step": 11590 }, { "epoch": 1.3779995248277501, "grad_norm": 0.09339371769730778, "learning_rate": 2.9483570247954202e-05, "loss": 0.1213, "step": 11600 }, { "epoch": 1.3791874554526016, "grad_norm": 0.08899705985001156, "learning_rate": 2.945202410438621e-05, "loss": 0.1199, "step": 11610 }, { "epoch": 1.380375386077453, "grad_norm": 0.08931612813741215, "learning_rate": 2.9420470638192338e-05, "loss": 0.1193, "step": 11620 }, { "epoch": 1.3815633167023047, "grad_norm": 0.09403177709301577, "learning_rate": 2.9388909901271273e-05, "loss": 0.1221, "step": 11630 }, { "epoch": 1.3827512473271562, "grad_norm": 0.09379322952382421, "learning_rate": 2.9357341945533673e-05, "loss": 0.1153, "step": 11640 }, { "epoch": 1.3839391779520076, "grad_norm": 0.09078437299229096, "learning_rate": 2.9325766822902063e-05, "loss": 0.1209, "step": 11650 }, { "epoch": 1.385127108576859, "grad_norm": 0.0934427492468726, "learning_rate": 2.9294184585310745e-05, "loss": 0.1195, "step": 11660 }, { "epoch": 1.3863150392017105, "grad_norm": 0.0892019193936632, "learning_rate": 2.9262595284705747e-05, "loss": 0.12, "step": 11670 }, { "epoch": 1.3875029698265622, "grad_norm": 0.08830247959958917, "learning_rate": 2.9230998973044693e-05, "loss": 0.1194, "step": 11680 }, { "epoch": 1.3886909004514136, "grad_norm": 0.08765273283846839, "learning_rate": 2.919939570229675e-05, "loss": 0.1192, "step": 11690 }, { "epoch": 1.389878831076265, "grad_norm": 0.08947411744059208, "learning_rate": 2.9167785524442527e-05, "loss": 0.1206, "step": 11700 }, { "epoch": 1.3910667617011168, "grad_norm": 0.09239182309641851, "learning_rate": 2.913616849147399e-05, "loss": 0.1244, "step": 11710 }, { "epoch": 1.3922546923259682, "grad_norm": 0.0918371949928003, "learning_rate": 2.9104544655394395e-05, "loss": 0.1188, "step": 11720 }, { "epoch": 1.3934426229508197, "grad_norm": 0.09684401246590299, "learning_rate": 2.9072914068218166e-05, "loss": 0.1236, "step": 11730 }, { "epoch": 1.3946305535756711, "grad_norm": 0.08850484955753087, "learning_rate": 2.9041276781970856e-05, "loss": 0.1142, "step": 11740 }, { "epoch": 1.3958184842005226, "grad_norm": 0.0908328298763111, "learning_rate": 2.9009632848689007e-05, "loss": 0.122, "step": 11750 }, { "epoch": 1.3970064148253742, "grad_norm": 0.09286975627209194, "learning_rate": 2.8977982320420123e-05, "loss": 0.1194, "step": 11760 }, { "epoch": 1.3981943454502257, "grad_norm": 0.08875898887796851, "learning_rate": 2.8946325249222534e-05, "loss": 0.1159, "step": 11770 }, { "epoch": 1.3993822760750771, "grad_norm": 0.09177586012279018, "learning_rate": 2.891466168716535e-05, "loss": 0.1201, "step": 11780 }, { "epoch": 1.4005702066999288, "grad_norm": 0.08912849041352588, "learning_rate": 2.8882991686328337e-05, "loss": 0.1202, "step": 11790 }, { "epoch": 1.4017581373247803, "grad_norm": 0.08871320977457059, "learning_rate": 2.8851315298801864e-05, "loss": 0.1185, "step": 11800 }, { "epoch": 1.4029460679496317, "grad_norm": 0.09279819652759044, "learning_rate": 2.8819632576686805e-05, "loss": 0.1189, "step": 11810 }, { "epoch": 1.4041339985744832, "grad_norm": 0.09329623782347997, "learning_rate": 2.878794357209445e-05, "loss": 0.1188, "step": 11820 }, { "epoch": 1.4053219291993346, "grad_norm": 0.09381221560380082, "learning_rate": 2.8756248337146424e-05, "loss": 0.1189, "step": 11830 }, { "epoch": 1.4065098598241863, "grad_norm": 0.08955414338417599, "learning_rate": 2.87245469239746e-05, "loss": 0.1208, "step": 11840 }, { "epoch": 1.4076977904490378, "grad_norm": 0.08703206267962739, "learning_rate": 2.8692839384721003e-05, "loss": 0.12, "step": 11850 }, { "epoch": 1.4088857210738892, "grad_norm": 0.08763505344605575, "learning_rate": 2.866112577153775e-05, "loss": 0.1175, "step": 11860 }, { "epoch": 1.4100736516987409, "grad_norm": 0.09065148812516685, "learning_rate": 2.862940613658693e-05, "loss": 0.1228, "step": 11870 }, { "epoch": 1.4112615823235923, "grad_norm": 0.09041164595267707, "learning_rate": 2.8597680532040562e-05, "loss": 0.1216, "step": 11880 }, { "epoch": 1.4124495129484438, "grad_norm": 0.08771153340890063, "learning_rate": 2.8565949010080452e-05, "loss": 0.1227, "step": 11890 }, { "epoch": 1.4136374435732952, "grad_norm": 0.09070704569939414, "learning_rate": 2.8534211622898175e-05, "loss": 0.1184, "step": 11900 }, { "epoch": 1.4148253741981467, "grad_norm": 0.09510944103242208, "learning_rate": 2.8502468422694922e-05, "loss": 0.1196, "step": 11910 }, { "epoch": 1.4160133048229984, "grad_norm": 0.08794130940561314, "learning_rate": 2.847071946168145e-05, "loss": 0.1186, "step": 11920 }, { "epoch": 1.4172012354478498, "grad_norm": 0.08670642130581338, "learning_rate": 2.843896479207802e-05, "loss": 0.1185, "step": 11930 }, { "epoch": 1.4183891660727013, "grad_norm": 0.08689527714527813, "learning_rate": 2.840720446611424e-05, "loss": 0.1203, "step": 11940 }, { "epoch": 1.419577096697553, "grad_norm": 0.09493977943105936, "learning_rate": 2.8375438536029053e-05, "loss": 0.1193, "step": 11950 }, { "epoch": 1.4207650273224044, "grad_norm": 0.0917217254956896, "learning_rate": 2.83436670540706e-05, "loss": 0.117, "step": 11960 }, { "epoch": 1.4219529579472558, "grad_norm": 0.08841713614636518, "learning_rate": 2.8311890072496173e-05, "loss": 0.1202, "step": 11970 }, { "epoch": 1.4231408885721075, "grad_norm": 0.09504598674908932, "learning_rate": 2.828010764357209e-05, "loss": 0.1166, "step": 11980 }, { "epoch": 1.424328819196959, "grad_norm": 0.09141045646266704, "learning_rate": 2.8248319819573644e-05, "loss": 0.1221, "step": 11990 }, { "epoch": 1.4255167498218104, "grad_norm": 0.09872531951788706, "learning_rate": 2.821652665278499e-05, "loss": 0.1223, "step": 12000 }, { "epoch": 1.4267046804466619, "grad_norm": 0.09200553025688753, "learning_rate": 2.818472819549908e-05, "loss": 0.1202, "step": 12010 }, { "epoch": 1.4278926110715133, "grad_norm": 0.09093707165522986, "learning_rate": 2.8152924500017564e-05, "loss": 0.1182, "step": 12020 }, { "epoch": 1.429080541696365, "grad_norm": 0.08933007603448515, "learning_rate": 2.8121115618650705e-05, "loss": 0.1185, "step": 12030 }, { "epoch": 1.4302684723212165, "grad_norm": 0.08920060531197896, "learning_rate": 2.80893016037173e-05, "loss": 0.1167, "step": 12040 }, { "epoch": 1.431456402946068, "grad_norm": 0.08899527099986478, "learning_rate": 2.8057482507544593e-05, "loss": 0.1205, "step": 12050 }, { "epoch": 1.4326443335709196, "grad_norm": 0.08848766616977262, "learning_rate": 2.8025658382468163e-05, "loss": 0.121, "step": 12060 }, { "epoch": 1.433832264195771, "grad_norm": 0.09388963569326432, "learning_rate": 2.7993829280831902e-05, "loss": 0.1171, "step": 12070 }, { "epoch": 1.4350201948206225, "grad_norm": 0.09299434157302407, "learning_rate": 2.7961995254987845e-05, "loss": 0.1199, "step": 12080 }, { "epoch": 1.436208125445474, "grad_norm": 0.0906442120536184, "learning_rate": 2.793015635729616e-05, "loss": 0.1222, "step": 12090 }, { "epoch": 1.4373960560703254, "grad_norm": 0.08981091431825519, "learning_rate": 2.789831264012499e-05, "loss": 0.1176, "step": 12100 }, { "epoch": 1.438583986695177, "grad_norm": 0.09347167961253912, "learning_rate": 2.7866464155850454e-05, "loss": 0.118, "step": 12110 }, { "epoch": 1.4397719173200285, "grad_norm": 0.09025218012040383, "learning_rate": 2.783461095685647e-05, "loss": 0.1189, "step": 12120 }, { "epoch": 1.44095984794488, "grad_norm": 0.0895247188897672, "learning_rate": 2.7802753095534727e-05, "loss": 0.1201, "step": 12130 }, { "epoch": 1.4421477785697316, "grad_norm": 0.0888096018991233, "learning_rate": 2.7770890624284585e-05, "loss": 0.1189, "step": 12140 }, { "epoch": 1.443335709194583, "grad_norm": 0.08787176908462684, "learning_rate": 2.773902359551298e-05, "loss": 0.1199, "step": 12150 }, { "epoch": 1.4445236398194345, "grad_norm": 0.08946345965691639, "learning_rate": 2.7707152061634356e-05, "loss": 0.1189, "step": 12160 }, { "epoch": 1.445711570444286, "grad_norm": 0.09976930935849956, "learning_rate": 2.7675276075070555e-05, "loss": 0.1208, "step": 12170 }, { "epoch": 1.4468995010691375, "grad_norm": 0.0917448134721162, "learning_rate": 2.764339568825074e-05, "loss": 0.1187, "step": 12180 }, { "epoch": 1.4480874316939891, "grad_norm": 0.09014333994658424, "learning_rate": 2.761151095361132e-05, "loss": 0.1171, "step": 12190 }, { "epoch": 1.4492753623188406, "grad_norm": 0.09215326387661174, "learning_rate": 2.757962192359585e-05, "loss": 0.1182, "step": 12200 }, { "epoch": 1.450463292943692, "grad_norm": 0.0930611885508255, "learning_rate": 2.7547728650654962e-05, "loss": 0.1206, "step": 12210 }, { "epoch": 1.4516512235685437, "grad_norm": 0.0935403867267884, "learning_rate": 2.751583118724625e-05, "loss": 0.1215, "step": 12220 }, { "epoch": 1.4528391541933952, "grad_norm": 0.08640335619260975, "learning_rate": 2.7483929585834213e-05, "loss": 0.1215, "step": 12230 }, { "epoch": 1.4540270848182466, "grad_norm": 0.09687613595738248, "learning_rate": 2.7452023898890138e-05, "loss": 0.1206, "step": 12240 }, { "epoch": 1.455215015443098, "grad_norm": 0.08819357276500892, "learning_rate": 2.7420114178892064e-05, "loss": 0.1202, "step": 12250 }, { "epoch": 1.4564029460679495, "grad_norm": 0.08574018851288662, "learning_rate": 2.738820047832463e-05, "loss": 0.1179, "step": 12260 }, { "epoch": 1.4575908766928012, "grad_norm": 0.09294312686405384, "learning_rate": 2.7356282849679043e-05, "loss": 0.1178, "step": 12270 }, { "epoch": 1.4587788073176526, "grad_norm": 0.09100374466990117, "learning_rate": 2.732436134545296e-05, "loss": 0.1183, "step": 12280 }, { "epoch": 1.459966737942504, "grad_norm": 0.09052316713813946, "learning_rate": 2.729243601815042e-05, "loss": 0.1211, "step": 12290 }, { "epoch": 1.4611546685673558, "grad_norm": 0.09046662771331901, "learning_rate": 2.726050692028174e-05, "loss": 0.1189, "step": 12300 }, { "epoch": 1.4623425991922072, "grad_norm": 0.09033257205208381, "learning_rate": 2.7228574104363462e-05, "loss": 0.1176, "step": 12310 }, { "epoch": 1.4635305298170587, "grad_norm": 0.08669816466062308, "learning_rate": 2.719663762291821e-05, "loss": 0.119, "step": 12320 }, { "epoch": 1.4647184604419101, "grad_norm": 0.08474958041495452, "learning_rate": 2.7164697528474663e-05, "loss": 0.1165, "step": 12330 }, { "epoch": 1.4659063910667616, "grad_norm": 0.09476309715927864, "learning_rate": 2.7132753873567434e-05, "loss": 0.1209, "step": 12340 }, { "epoch": 1.4670943216916132, "grad_norm": 0.09207950814415805, "learning_rate": 2.7100806710736994e-05, "loss": 0.119, "step": 12350 }, { "epoch": 1.4682822523164647, "grad_norm": 0.09066961066395475, "learning_rate": 2.7068856092529565e-05, "loss": 0.1148, "step": 12360 }, { "epoch": 1.4694701829413161, "grad_norm": 0.08778889072759417, "learning_rate": 2.703690207149709e-05, "loss": 0.1178, "step": 12370 }, { "epoch": 1.4706581135661678, "grad_norm": 0.090358118627708, "learning_rate": 2.7004944700197075e-05, "loss": 0.1184, "step": 12380 }, { "epoch": 1.4718460441910193, "grad_norm": 0.0890555546213061, "learning_rate": 2.6972984031192554e-05, "loss": 0.1196, "step": 12390 }, { "epoch": 1.4730339748158707, "grad_norm": 0.09494639099799426, "learning_rate": 2.6941020117051984e-05, "loss": 0.1241, "step": 12400 }, { "epoch": 1.4742219054407222, "grad_norm": 0.09074791416666984, "learning_rate": 2.6909053010349155e-05, "loss": 0.1179, "step": 12410 }, { "epoch": 1.4754098360655736, "grad_norm": 0.08953764284026072, "learning_rate": 2.6877082763663104e-05, "loss": 0.12, "step": 12420 }, { "epoch": 1.4765977666904253, "grad_norm": 0.08726211596297748, "learning_rate": 2.6845109429578037e-05, "loss": 0.12, "step": 12430 }, { "epoch": 1.4777856973152768, "grad_norm": 0.09037471471388721, "learning_rate": 2.6813133060683242e-05, "loss": 0.1179, "step": 12440 }, { "epoch": 1.4789736279401282, "grad_norm": 0.08825583110020613, "learning_rate": 2.6781153709572997e-05, "loss": 0.117, "step": 12450 }, { "epoch": 1.4801615585649799, "grad_norm": 0.08953629955946228, "learning_rate": 2.674917142884648e-05, "loss": 0.1149, "step": 12460 }, { "epoch": 1.4813494891898313, "grad_norm": 0.0938599619183914, "learning_rate": 2.6717186271107698e-05, "loss": 0.1166, "step": 12470 }, { "epoch": 1.4825374198146828, "grad_norm": 0.0924076095582924, "learning_rate": 2.6685198288965372e-05, "loss": 0.1166, "step": 12480 }, { "epoch": 1.4837253504395345, "grad_norm": 0.08786113009879881, "learning_rate": 2.665320753503289e-05, "loss": 0.1184, "step": 12490 }, { "epoch": 1.484913281064386, "grad_norm": 0.08765808711482556, "learning_rate": 2.662121406192819e-05, "loss": 0.1172, "step": 12500 }, { "epoch": 1.4861012116892374, "grad_norm": 0.08983349696944765, "learning_rate": 2.6589217922273667e-05, "loss": 0.1223, "step": 12510 }, { "epoch": 1.4872891423140888, "grad_norm": 0.09103656744769699, "learning_rate": 2.6557219168696135e-05, "loss": 0.1177, "step": 12520 }, { "epoch": 1.4884770729389403, "grad_norm": 0.09707949916325652, "learning_rate": 2.6525217853826668e-05, "loss": 0.1218, "step": 12530 }, { "epoch": 1.489665003563792, "grad_norm": 0.09472194709958429, "learning_rate": 2.649321403030059e-05, "loss": 0.118, "step": 12540 }, { "epoch": 1.4908529341886434, "grad_norm": 0.09221370605824777, "learning_rate": 2.646120775075732e-05, "loss": 0.1176, "step": 12550 }, { "epoch": 1.4920408648134948, "grad_norm": 0.09104784830722205, "learning_rate": 2.6429199067840348e-05, "loss": 0.1208, "step": 12560 }, { "epoch": 1.4932287954383465, "grad_norm": 0.09015293727675928, "learning_rate": 2.639718803419709e-05, "loss": 0.1174, "step": 12570 }, { "epoch": 1.494416726063198, "grad_norm": 0.08992452939299583, "learning_rate": 2.6365174702478838e-05, "loss": 0.1161, "step": 12580 }, { "epoch": 1.4956046566880494, "grad_norm": 0.08657218438895999, "learning_rate": 2.6333159125340668e-05, "loss": 0.1183, "step": 12590 }, { "epoch": 1.4967925873129009, "grad_norm": 0.09192478540083546, "learning_rate": 2.630114135544134e-05, "loss": 0.1194, "step": 12600 }, { "epoch": 1.4979805179377523, "grad_norm": 0.08941429340217999, "learning_rate": 2.6269121445443236e-05, "loss": 0.1207, "step": 12610 }, { "epoch": 1.499168448562604, "grad_norm": 0.0902654471713569, "learning_rate": 2.623709944801223e-05, "loss": 0.1178, "step": 12620 }, { "epoch": 1.5003563791874555, "grad_norm": 0.08916884013416723, "learning_rate": 2.6205075415817672e-05, "loss": 0.1199, "step": 12630 }, { "epoch": 1.501544309812307, "grad_norm": 0.09195682011590359, "learning_rate": 2.6173049401532217e-05, "loss": 0.121, "step": 12640 }, { "epoch": 1.5027322404371586, "grad_norm": 0.08937105793922134, "learning_rate": 2.6141021457831804e-05, "loss": 0.1189, "step": 12650 }, { "epoch": 1.5039201710620098, "grad_norm": 0.09631727427452344, "learning_rate": 2.6108991637395534e-05, "loss": 0.1228, "step": 12660 }, { "epoch": 1.5051081016868615, "grad_norm": 0.08853117515777988, "learning_rate": 2.6076959992905613e-05, "loss": 0.1184, "step": 12670 }, { "epoch": 1.5062960323117132, "grad_norm": 0.09260983169295994, "learning_rate": 2.604492657704722e-05, "loss": 0.1189, "step": 12680 }, { "epoch": 1.5074839629365644, "grad_norm": 0.09229223771097265, "learning_rate": 2.6012891442508458e-05, "loss": 0.1196, "step": 12690 }, { "epoch": 1.508671893561416, "grad_norm": 0.09295248995830255, "learning_rate": 2.5980854641980272e-05, "loss": 0.1207, "step": 12700 }, { "epoch": 1.5098598241862675, "grad_norm": 0.09573285890972805, "learning_rate": 2.5948816228156336e-05, "loss": 0.1196, "step": 12710 }, { "epoch": 1.511047754811119, "grad_norm": 0.08942684074859877, "learning_rate": 2.5916776253732965e-05, "loss": 0.1186, "step": 12720 }, { "epoch": 1.5122356854359706, "grad_norm": 0.09803795008514543, "learning_rate": 2.588473477140907e-05, "loss": 0.1158, "step": 12730 }, { "epoch": 1.513423616060822, "grad_norm": 0.0923986769861945, "learning_rate": 2.5852691833886013e-05, "loss": 0.1187, "step": 12740 }, { "epoch": 1.5146115466856735, "grad_norm": 0.0842234420794639, "learning_rate": 2.5820647493867568e-05, "loss": 0.1166, "step": 12750 }, { "epoch": 1.5157994773105252, "grad_norm": 0.0897198600986419, "learning_rate": 2.5788601804059793e-05, "loss": 0.1183, "step": 12760 }, { "epoch": 1.5169874079353765, "grad_norm": 0.09154710864264791, "learning_rate": 2.5756554817171e-05, "loss": 0.1184, "step": 12770 }, { "epoch": 1.5181753385602281, "grad_norm": 0.09541779082456156, "learning_rate": 2.5724506585911616e-05, "loss": 0.1162, "step": 12780 }, { "epoch": 1.5193632691850796, "grad_norm": 0.09035907278446835, "learning_rate": 2.569245716299411e-05, "loss": 0.1194, "step": 12790 }, { "epoch": 1.520551199809931, "grad_norm": 0.09202226887895743, "learning_rate": 2.566040660113292e-05, "loss": 0.1191, "step": 12800 }, { "epoch": 1.5217391304347827, "grad_norm": 0.09073110864647548, "learning_rate": 2.5628354953044343e-05, "loss": 0.1208, "step": 12810 }, { "epoch": 1.5229270610596342, "grad_norm": 0.09062295422917896, "learning_rate": 2.5596302271446487e-05, "loss": 0.1191, "step": 12820 }, { "epoch": 1.5241149916844856, "grad_norm": 0.0910224992902642, "learning_rate": 2.556424860905915e-05, "loss": 0.1172, "step": 12830 }, { "epoch": 1.5253029223093373, "grad_norm": 0.08780104764281604, "learning_rate": 2.5532194018603726e-05, "loss": 0.1188, "step": 12840 }, { "epoch": 1.5264908529341885, "grad_norm": 0.09305276509092088, "learning_rate": 2.550013855280316e-05, "loss": 0.1161, "step": 12850 }, { "epoch": 1.5276787835590402, "grad_norm": 0.09008148717900204, "learning_rate": 2.546808226438181e-05, "loss": 0.116, "step": 12860 }, { "epoch": 1.5288667141838916, "grad_norm": 0.08873296622541854, "learning_rate": 2.5436025206065423e-05, "loss": 0.1155, "step": 12870 }, { "epoch": 1.530054644808743, "grad_norm": 0.08988650851933706, "learning_rate": 2.5403967430580978e-05, "loss": 0.1208, "step": 12880 }, { "epoch": 1.5312425754335948, "grad_norm": 0.09115045327855012, "learning_rate": 2.5371908990656655e-05, "loss": 0.1187, "step": 12890 }, { "epoch": 1.5324305060584462, "grad_norm": 0.09082358256315998, "learning_rate": 2.533984993902172e-05, "loss": 0.1175, "step": 12900 }, { "epoch": 1.5336184366832977, "grad_norm": 0.08805531495504713, "learning_rate": 2.530779032840644e-05, "loss": 0.1213, "step": 12910 }, { "epoch": 1.5348063673081493, "grad_norm": 0.09327965545486394, "learning_rate": 2.5275730211542008e-05, "loss": 0.1175, "step": 12920 }, { "epoch": 1.5359942979330006, "grad_norm": 0.08910884534299598, "learning_rate": 2.524366964116045e-05, "loss": 0.1166, "step": 12930 }, { "epoch": 1.5371822285578522, "grad_norm": 0.09468046059907977, "learning_rate": 2.5211608669994536e-05, "loss": 0.1172, "step": 12940 }, { "epoch": 1.5383701591827037, "grad_norm": 0.09550918446178427, "learning_rate": 2.517954735077769e-05, "loss": 0.1177, "step": 12950 }, { "epoch": 1.5395580898075552, "grad_norm": 0.09274549563350033, "learning_rate": 2.5147485736243913e-05, "loss": 0.1162, "step": 12960 }, { "epoch": 1.5407460204324068, "grad_norm": 0.09162981130043667, "learning_rate": 2.511542387912771e-05, "loss": 0.1143, "step": 12970 }, { "epoch": 1.5419339510572583, "grad_norm": 0.08543481166141248, "learning_rate": 2.5083361832163942e-05, "loss": 0.1172, "step": 12980 }, { "epoch": 1.5431218816821097, "grad_norm": 0.09415848793018937, "learning_rate": 2.505129964808782e-05, "loss": 0.1185, "step": 12990 }, { "epoch": 1.5443098123069614, "grad_norm": 0.09231870794890515, "learning_rate": 2.501923737963477e-05, "loss": 0.1161, "step": 13000 }, { "epoch": 1.5454977429318126, "grad_norm": 0.08524822236670197, "learning_rate": 2.498717507954035e-05, "loss": 0.1156, "step": 13010 }, { "epoch": 1.5466856735566643, "grad_norm": 0.0932992079637765, "learning_rate": 2.495511280054018e-05, "loss": 0.1166, "step": 13020 }, { "epoch": 1.5478736041815158, "grad_norm": 0.09549887901466167, "learning_rate": 2.4923050595369825e-05, "loss": 0.1222, "step": 13030 }, { "epoch": 1.5490615348063672, "grad_norm": 0.08671988100148088, "learning_rate": 2.4890988516764765e-05, "loss": 0.1214, "step": 13040 }, { "epoch": 1.5502494654312189, "grad_norm": 0.09387725598335737, "learning_rate": 2.4858926617460243e-05, "loss": 0.1184, "step": 13050 }, { "epoch": 1.5514373960560703, "grad_norm": 0.09675079926561155, "learning_rate": 2.4826864950191216e-05, "loss": 0.1201, "step": 13060 }, { "epoch": 1.5526253266809218, "grad_norm": 0.09523572571322565, "learning_rate": 2.479480356769225e-05, "loss": 0.121, "step": 13070 }, { "epoch": 1.5538132573057735, "grad_norm": 0.09356049580711867, "learning_rate": 2.476274252269745e-05, "loss": 0.1183, "step": 13080 }, { "epoch": 1.5550011879306247, "grad_norm": 0.09293022829393234, "learning_rate": 2.4730681867940375e-05, "loss": 0.1204, "step": 13090 }, { "epoch": 1.5561891185554764, "grad_norm": 0.08138968239662998, "learning_rate": 2.4698621656153932e-05, "loss": 0.1174, "step": 13100 }, { "epoch": 1.5573770491803278, "grad_norm": 0.09390645437547693, "learning_rate": 2.4666561940070303e-05, "loss": 0.1167, "step": 13110 }, { "epoch": 1.5585649798051793, "grad_norm": 0.08539320202806028, "learning_rate": 2.4634502772420845e-05, "loss": 0.1175, "step": 13120 }, { "epoch": 1.559752910430031, "grad_norm": 0.08863939219608627, "learning_rate": 2.4602444205936014e-05, "loss": 0.1166, "step": 13130 }, { "epoch": 1.5609408410548824, "grad_norm": 0.08563219855014056, "learning_rate": 2.4570386293345317e-05, "loss": 0.1147, "step": 13140 }, { "epoch": 1.5621287716797339, "grad_norm": 0.09370263753685214, "learning_rate": 2.4538329087377118e-05, "loss": 0.119, "step": 13150 }, { "epoch": 1.5633167023045855, "grad_norm": 0.09202914366940217, "learning_rate": 2.4506272640758666e-05, "loss": 0.1166, "step": 13160 }, { "epoch": 1.5645046329294368, "grad_norm": 0.08925225789586315, "learning_rate": 2.4474217006215948e-05, "loss": 0.1164, "step": 13170 }, { "epoch": 1.5656925635542884, "grad_norm": 0.09087940233490341, "learning_rate": 2.4442162236473605e-05, "loss": 0.1164, "step": 13180 }, { "epoch": 1.56688049417914, "grad_norm": 0.09356857900509057, "learning_rate": 2.4410108384254884e-05, "loss": 0.117, "step": 13190 }, { "epoch": 1.5680684248039913, "grad_norm": 0.0886513238415608, "learning_rate": 2.4378055502281498e-05, "loss": 0.1185, "step": 13200 }, { "epoch": 1.569256355428843, "grad_norm": 0.08712963738551363, "learning_rate": 2.434600364327357e-05, "loss": 0.1192, "step": 13210 }, { "epoch": 1.5704442860536945, "grad_norm": 0.09576998812297088, "learning_rate": 2.4313952859949536e-05, "loss": 0.1169, "step": 13220 }, { "epoch": 1.571632216678546, "grad_norm": 0.09100998632162673, "learning_rate": 2.4281903205026073e-05, "loss": 0.1156, "step": 13230 }, { "epoch": 1.5728201473033976, "grad_norm": 0.09012517303759199, "learning_rate": 2.4249854731218004e-05, "loss": 0.1161, "step": 13240 }, { "epoch": 1.574008077928249, "grad_norm": 0.09363599702963535, "learning_rate": 2.4217807491238194e-05, "loss": 0.1147, "step": 13250 }, { "epoch": 1.5751960085531005, "grad_norm": 0.08714975856009675, "learning_rate": 2.418576153779749e-05, "loss": 0.1186, "step": 13260 }, { "epoch": 1.5763839391779522, "grad_norm": 0.09790411699313713, "learning_rate": 2.4153716923604606e-05, "loss": 0.1188, "step": 13270 }, { "epoch": 1.5775718698028034, "grad_norm": 0.08666175333325278, "learning_rate": 2.4121673701366095e-05, "loss": 0.1175, "step": 13280 }, { "epoch": 1.578759800427655, "grad_norm": 0.09191854770685054, "learning_rate": 2.408963192378618e-05, "loss": 0.1186, "step": 13290 }, { "epoch": 1.5799477310525065, "grad_norm": 0.0885344401660881, "learning_rate": 2.4057591643566716e-05, "loss": 0.1159, "step": 13300 }, { "epoch": 1.581135661677358, "grad_norm": 0.09194870756919082, "learning_rate": 2.4025552913407104e-05, "loss": 0.1166, "step": 13310 }, { "epoch": 1.5823235923022096, "grad_norm": 0.09205360384296943, "learning_rate": 2.3993515786004184e-05, "loss": 0.1194, "step": 13320 }, { "epoch": 1.583511522927061, "grad_norm": 0.08905836894856561, "learning_rate": 2.3961480314052183e-05, "loss": 0.1144, "step": 13330 }, { "epoch": 1.5846994535519126, "grad_norm": 0.08889882145308087, "learning_rate": 2.3929446550242575e-05, "loss": 0.1194, "step": 13340 }, { "epoch": 1.5858873841767642, "grad_norm": 0.08938060948100202, "learning_rate": 2.3897414547264048e-05, "loss": 0.1154, "step": 13350 }, { "epoch": 1.5870753148016155, "grad_norm": 0.0882486047541412, "learning_rate": 2.3865384357802363e-05, "loss": 0.1153, "step": 13360 }, { "epoch": 1.5882632454264671, "grad_norm": 0.08757736021393303, "learning_rate": 2.3833356034540354e-05, "loss": 0.1173, "step": 13370 }, { "epoch": 1.5894511760513186, "grad_norm": 0.08878853676327562, "learning_rate": 2.3801329630157735e-05, "loss": 0.1171, "step": 13380 }, { "epoch": 1.59063910667617, "grad_norm": 0.09002594926435659, "learning_rate": 2.3769305197331077e-05, "loss": 0.1175, "step": 13390 }, { "epoch": 1.5918270373010217, "grad_norm": 0.08937163957443661, "learning_rate": 2.373728278873371e-05, "loss": 0.1158, "step": 13400 }, { "epoch": 1.5930149679258732, "grad_norm": 0.09237578291449984, "learning_rate": 2.3705262457035642e-05, "loss": 0.1176, "step": 13410 }, { "epoch": 1.5942028985507246, "grad_norm": 0.08821121837755773, "learning_rate": 2.3673244254903455e-05, "loss": 0.1187, "step": 13420 }, { "epoch": 1.5953908291755763, "grad_norm": 0.08950718470829591, "learning_rate": 2.3641228235000238e-05, "loss": 0.1176, "step": 13430 }, { "epoch": 1.5965787598004275, "grad_norm": 0.0914391990691104, "learning_rate": 2.360921444998548e-05, "loss": 0.1179, "step": 13440 }, { "epoch": 1.5977666904252792, "grad_norm": 0.08660053977181562, "learning_rate": 2.3577202952514994e-05, "loss": 0.1164, "step": 13450 }, { "epoch": 1.5989546210501306, "grad_norm": 0.08751835613610263, "learning_rate": 2.354519379524084e-05, "loss": 0.1175, "step": 13460 }, { "epoch": 1.600142551674982, "grad_norm": 0.0882955560801595, "learning_rate": 2.351318703081123e-05, "loss": 0.117, "step": 13470 }, { "epoch": 1.6013304822998338, "grad_norm": 0.08909806771019886, "learning_rate": 2.348118271187043e-05, "loss": 0.1151, "step": 13480 }, { "epoch": 1.6025184129246852, "grad_norm": 0.09332016471942695, "learning_rate": 2.344918089105869e-05, "loss": 0.1197, "step": 13490 }, { "epoch": 1.6037063435495367, "grad_norm": 0.09501044001331196, "learning_rate": 2.3417181621012144e-05, "loss": 0.1176, "step": 13500 }, { "epoch": 1.6048942741743883, "grad_norm": 0.09060635788515024, "learning_rate": 2.3385184954362725e-05, "loss": 0.1165, "step": 13510 }, { "epoch": 1.6060822047992396, "grad_norm": 0.08870453933202166, "learning_rate": 2.3353190943738128e-05, "loss": 0.1174, "step": 13520 }, { "epoch": 1.6072701354240913, "grad_norm": 0.08683327741456072, "learning_rate": 2.332119964176162e-05, "loss": 0.1138, "step": 13530 }, { "epoch": 1.6084580660489427, "grad_norm": 0.09331220799089616, "learning_rate": 2.328921110105205e-05, "loss": 0.1163, "step": 13540 }, { "epoch": 1.6096459966737942, "grad_norm": 0.09012860584999757, "learning_rate": 2.3257225374223706e-05, "loss": 0.1173, "step": 13550 }, { "epoch": 1.6108339272986458, "grad_norm": 0.09506713545762634, "learning_rate": 2.3225242513886267e-05, "loss": 0.1167, "step": 13560 }, { "epoch": 1.6120218579234973, "grad_norm": 0.092888885663396, "learning_rate": 2.319326257264468e-05, "loss": 0.119, "step": 13570 }, { "epoch": 1.6132097885483487, "grad_norm": 0.08686831010194984, "learning_rate": 2.3161285603099097e-05, "loss": 0.1151, "step": 13580 }, { "epoch": 1.6143977191732004, "grad_norm": 0.09158592399365412, "learning_rate": 2.3129311657844783e-05, "loss": 0.1125, "step": 13590 }, { "epoch": 1.6155856497980516, "grad_norm": 0.0949409158219062, "learning_rate": 2.309734078947202e-05, "loss": 0.1188, "step": 13600 }, { "epoch": 1.6167735804229033, "grad_norm": 0.08509807892358066, "learning_rate": 2.3065373050566055e-05, "loss": 0.1178, "step": 13610 }, { "epoch": 1.6179615110477548, "grad_norm": 0.08953684735709797, "learning_rate": 2.303340849370696e-05, "loss": 0.1143, "step": 13620 }, { "epoch": 1.6191494416726062, "grad_norm": 0.08594829339220712, "learning_rate": 2.3001447171469584e-05, "loss": 0.1178, "step": 13630 }, { "epoch": 1.620337372297458, "grad_norm": 0.08671421206110616, "learning_rate": 2.2969489136423456e-05, "loss": 0.1166, "step": 13640 }, { "epoch": 1.6215253029223093, "grad_norm": 0.09622302113428233, "learning_rate": 2.2937534441132687e-05, "loss": 0.1146, "step": 13650 }, { "epoch": 1.6227132335471608, "grad_norm": 0.09796893383552414, "learning_rate": 2.2905583138155922e-05, "loss": 0.1173, "step": 13660 }, { "epoch": 1.6239011641720125, "grad_norm": 0.08710989157663646, "learning_rate": 2.28736352800462e-05, "loss": 0.1156, "step": 13670 }, { "epoch": 1.6250890947968637, "grad_norm": 0.09742571387152417, "learning_rate": 2.2841690919350907e-05, "loss": 0.1144, "step": 13680 }, { "epoch": 1.6262770254217154, "grad_norm": 0.08579068572092559, "learning_rate": 2.2809750108611656e-05, "loss": 0.1141, "step": 13690 }, { "epoch": 1.627464956046567, "grad_norm": 0.08895244617846193, "learning_rate": 2.2777812900364263e-05, "loss": 0.1143, "step": 13700 }, { "epoch": 1.6286528866714183, "grad_norm": 0.08533493835957433, "learning_rate": 2.2745879347138582e-05, "loss": 0.1166, "step": 13710 }, { "epoch": 1.62984081729627, "grad_norm": 0.08495178947168135, "learning_rate": 2.2713949501458477e-05, "loss": 0.1144, "step": 13720 }, { "epoch": 1.6310287479211214, "grad_norm": 0.09409085611077705, "learning_rate": 2.268202341584168e-05, "loss": 0.1158, "step": 13730 }, { "epoch": 1.6322166785459729, "grad_norm": 0.09032840698123001, "learning_rate": 2.2650101142799775e-05, "loss": 0.1207, "step": 13740 }, { "epoch": 1.6334046091708245, "grad_norm": 0.08999497283278513, "learning_rate": 2.2618182734838074e-05, "loss": 0.117, "step": 13750 }, { "epoch": 1.634592539795676, "grad_norm": 0.09043126595876382, "learning_rate": 2.2586268244455504e-05, "loss": 0.117, "step": 13760 }, { "epoch": 1.6357804704205274, "grad_norm": 0.09024880712760538, "learning_rate": 2.2554357724144575e-05, "loss": 0.1164, "step": 13770 }, { "epoch": 1.636968401045379, "grad_norm": 0.0917826250246221, "learning_rate": 2.252245122639125e-05, "loss": 0.1146, "step": 13780 }, { "epoch": 1.6381563316702303, "grad_norm": 0.0883243528719016, "learning_rate": 2.2490548803674878e-05, "loss": 0.1186, "step": 13790 }, { "epoch": 1.639344262295082, "grad_norm": 0.09138082371728348, "learning_rate": 2.2458650508468127e-05, "loss": 0.1171, "step": 13800 }, { "epoch": 1.6405321929199335, "grad_norm": 0.0881071409099892, "learning_rate": 2.242675639323684e-05, "loss": 0.1196, "step": 13810 }, { "epoch": 1.641720123544785, "grad_norm": 0.0951766959537864, "learning_rate": 2.239486651044002e-05, "loss": 0.1158, "step": 13820 }, { "epoch": 1.6429080541696366, "grad_norm": 0.09709830968130559, "learning_rate": 2.236298091252968e-05, "loss": 0.1147, "step": 13830 }, { "epoch": 1.644095984794488, "grad_norm": 0.09270473028401724, "learning_rate": 2.233109965195079e-05, "loss": 0.1171, "step": 13840 }, { "epoch": 1.6452839154193395, "grad_norm": 0.08787572075855843, "learning_rate": 2.229922278114122e-05, "loss": 0.1154, "step": 13850 }, { "epoch": 1.6464718460441912, "grad_norm": 0.091374347396926, "learning_rate": 2.226735035253158e-05, "loss": 0.1163, "step": 13860 }, { "epoch": 1.6476597766690424, "grad_norm": 0.09110469115897811, "learning_rate": 2.2235482418545178e-05, "loss": 0.1163, "step": 13870 }, { "epoch": 1.648847707293894, "grad_norm": 0.09076047354516131, "learning_rate": 2.2203619031597945e-05, "loss": 0.1158, "step": 13880 }, { "epoch": 1.6500356379187455, "grad_norm": 0.09366623597158938, "learning_rate": 2.217176024409833e-05, "loss": 0.1187, "step": 13890 }, { "epoch": 1.651223568543597, "grad_norm": 0.09228204019467187, "learning_rate": 2.2139906108447218e-05, "loss": 0.1153, "step": 13900 }, { "epoch": 1.6524114991684486, "grad_norm": 0.09046246471727355, "learning_rate": 2.2108056677037825e-05, "loss": 0.1171, "step": 13910 }, { "epoch": 1.6535994297933, "grad_norm": 0.08888380779813483, "learning_rate": 2.207621200225565e-05, "loss": 0.1176, "step": 13920 }, { "epoch": 1.6547873604181516, "grad_norm": 0.09033122013026054, "learning_rate": 2.2044372136478348e-05, "loss": 0.1148, "step": 13930 }, { "epoch": 1.6559752910430032, "grad_norm": 0.09205972895581255, "learning_rate": 2.2012537132075697e-05, "loss": 0.1118, "step": 13940 }, { "epoch": 1.6571632216678545, "grad_norm": 0.09324847190685011, "learning_rate": 2.1980707041409452e-05, "loss": 0.1149, "step": 13950 }, { "epoch": 1.6583511522927061, "grad_norm": 0.08890361341038673, "learning_rate": 2.1948881916833287e-05, "loss": 0.1166, "step": 13960 }, { "epoch": 1.6595390829175576, "grad_norm": 0.08702890957749948, "learning_rate": 2.1917061810692724e-05, "loss": 0.1192, "step": 13970 }, { "epoch": 1.660727013542409, "grad_norm": 0.09208352905743492, "learning_rate": 2.1885246775325006e-05, "loss": 0.1145, "step": 13980 }, { "epoch": 1.6619149441672607, "grad_norm": 0.08575412173413191, "learning_rate": 2.1853436863059066e-05, "loss": 0.1146, "step": 13990 }, { "epoch": 1.6631028747921122, "grad_norm": 0.08670068938926483, "learning_rate": 2.1821632126215386e-05, "loss": 0.1135, "step": 14000 }, { "epoch": 1.6642908054169636, "grad_norm": 0.08772429975187647, "learning_rate": 2.178983261710595e-05, "loss": 0.1182, "step": 14010 }, { "epoch": 1.6654787360418153, "grad_norm": 0.09500758476811284, "learning_rate": 2.175803838803412e-05, "loss": 0.1152, "step": 14020 }, { "epoch": 1.6666666666666665, "grad_norm": 0.08987833648463162, "learning_rate": 2.1726249491294613e-05, "loss": 0.1192, "step": 14030 }, { "epoch": 1.6678545972915182, "grad_norm": 0.09003104647500025, "learning_rate": 2.169446597917336e-05, "loss": 0.1185, "step": 14040 }, { "epoch": 1.6690425279163696, "grad_norm": 0.09199443503146203, "learning_rate": 2.1662687903947408e-05, "loss": 0.1166, "step": 14050 }, { "epoch": 1.670230458541221, "grad_norm": 0.0960280770009361, "learning_rate": 2.163091531788489e-05, "loss": 0.1188, "step": 14060 }, { "epoch": 1.6714183891660728, "grad_norm": 0.09062384446930247, "learning_rate": 2.1599148273244896e-05, "loss": 0.119, "step": 14070 }, { "epoch": 1.6726063197909242, "grad_norm": 0.09406017562508627, "learning_rate": 2.1567386822277425e-05, "loss": 0.1163, "step": 14080 }, { "epoch": 1.6737942504157757, "grad_norm": 0.0924903695813593, "learning_rate": 2.1535631017223252e-05, "loss": 0.1162, "step": 14090 }, { "epoch": 1.6749821810406273, "grad_norm": 0.09766957402144329, "learning_rate": 2.1503880910313872e-05, "loss": 0.1183, "step": 14100 }, { "epoch": 1.6761701116654786, "grad_norm": 0.09021605219338888, "learning_rate": 2.1472136553771414e-05, "loss": 0.1169, "step": 14110 }, { "epoch": 1.6773580422903303, "grad_norm": 0.08120367450808272, "learning_rate": 2.144039799980853e-05, "loss": 0.1142, "step": 14120 }, { "epoch": 1.6785459729151817, "grad_norm": 0.08726771603607683, "learning_rate": 2.140866530062836e-05, "loss": 0.1161, "step": 14130 }, { "epoch": 1.6797339035400332, "grad_norm": 0.08772715753162898, "learning_rate": 2.1376938508424384e-05, "loss": 0.1195, "step": 14140 }, { "epoch": 1.6809218341648848, "grad_norm": 0.08665587402785634, "learning_rate": 2.1345217675380386e-05, "loss": 0.1182, "step": 14150 }, { "epoch": 1.6821097647897363, "grad_norm": 0.08669998220469956, "learning_rate": 2.1313502853670332e-05, "loss": 0.1188, "step": 14160 }, { "epoch": 1.6832976954145877, "grad_norm": 0.09417140835021913, "learning_rate": 2.1281794095458303e-05, "loss": 0.1166, "step": 14170 }, { "epoch": 1.6844856260394394, "grad_norm": 0.08769142287432652, "learning_rate": 2.1250091452898438e-05, "loss": 0.115, "step": 14180 }, { "epoch": 1.6856735566642906, "grad_norm": 0.09013929704128044, "learning_rate": 2.1218394978134776e-05, "loss": 0.1143, "step": 14190 }, { "epoch": 1.6868614872891423, "grad_norm": 0.09030972492354984, "learning_rate": 2.118670472330123e-05, "loss": 0.1154, "step": 14200 }, { "epoch": 1.688049417913994, "grad_norm": 0.08965437282769072, "learning_rate": 2.1155020740521476e-05, "loss": 0.1159, "step": 14210 }, { "epoch": 1.6892373485388452, "grad_norm": 0.08865704918462891, "learning_rate": 2.1123343081908885e-05, "loss": 0.1154, "step": 14220 }, { "epoch": 1.690425279163697, "grad_norm": 0.09008105805484744, "learning_rate": 2.1091671799566414e-05, "loss": 0.1176, "step": 14230 }, { "epoch": 1.6916132097885483, "grad_norm": 0.08509558064590031, "learning_rate": 2.106000694558654e-05, "loss": 0.1142, "step": 14240 }, { "epoch": 1.6928011404133998, "grad_norm": 0.08892981289344977, "learning_rate": 2.1028348572051166e-05, "loss": 0.1149, "step": 14250 }, { "epoch": 1.6939890710382515, "grad_norm": 0.08795966491738208, "learning_rate": 2.0996696731031518e-05, "loss": 0.1124, "step": 14260 }, { "epoch": 1.695177001663103, "grad_norm": 0.0897145226138221, "learning_rate": 2.0965051474588118e-05, "loss": 0.1161, "step": 14270 }, { "epoch": 1.6963649322879544, "grad_norm": 0.09112911393296862, "learning_rate": 2.0933412854770622e-05, "loss": 0.1165, "step": 14280 }, { "epoch": 1.697552862912806, "grad_norm": 0.08926965694259696, "learning_rate": 2.0901780923617783e-05, "loss": 0.1119, "step": 14290 }, { "epoch": 1.6987407935376573, "grad_norm": 0.09159471388723055, "learning_rate": 2.0870155733157352e-05, "loss": 0.1151, "step": 14300 }, { "epoch": 1.699928724162509, "grad_norm": 0.0909727555583453, "learning_rate": 2.0838537335405984e-05, "loss": 0.1146, "step": 14310 }, { "epoch": 1.7011166547873604, "grad_norm": 0.08563581473538279, "learning_rate": 2.0806925782369175e-05, "loss": 0.114, "step": 14320 }, { "epoch": 1.7023045854122119, "grad_norm": 0.0930851494035586, "learning_rate": 2.0775321126041167e-05, "loss": 0.1157, "step": 14330 }, { "epoch": 1.7034925160370635, "grad_norm": 0.08912560730168699, "learning_rate": 2.0743723418404835e-05, "loss": 0.1185, "step": 14340 }, { "epoch": 1.704680446661915, "grad_norm": 0.0889547814669299, "learning_rate": 2.071213271143164e-05, "loss": 0.1172, "step": 14350 }, { "epoch": 1.7058683772867664, "grad_norm": 0.09220320187051387, "learning_rate": 2.0680549057081537e-05, "loss": 0.116, "step": 14360 }, { "epoch": 1.707056307911618, "grad_norm": 0.08658535774493838, "learning_rate": 2.0648972507302867e-05, "loss": 0.1133, "step": 14370 }, { "epoch": 1.7082442385364693, "grad_norm": 0.09153841366580182, "learning_rate": 2.0617403114032287e-05, "loss": 0.1209, "step": 14380 }, { "epoch": 1.709432169161321, "grad_norm": 0.0879021282142619, "learning_rate": 2.0585840929194684e-05, "loss": 0.1126, "step": 14390 }, { "epoch": 1.7106200997861725, "grad_norm": 0.0908797185922007, "learning_rate": 2.0554286004703082e-05, "loss": 0.1123, "step": 14400 }, { "epoch": 1.711808030411024, "grad_norm": 0.09146233970106141, "learning_rate": 2.0522738392458594e-05, "loss": 0.1163, "step": 14410 }, { "epoch": 1.7129959610358756, "grad_norm": 0.09171456487483887, "learning_rate": 2.0491198144350277e-05, "loss": 0.1156, "step": 14420 }, { "epoch": 1.714183891660727, "grad_norm": 0.08616689144536578, "learning_rate": 2.0459665312255075e-05, "loss": 0.1184, "step": 14430 }, { "epoch": 1.7153718222855785, "grad_norm": 0.0920573201640267, "learning_rate": 2.0428139948037742e-05, "loss": 0.1181, "step": 14440 }, { "epoch": 1.7165597529104302, "grad_norm": 0.09312773304660277, "learning_rate": 2.0396622103550748e-05, "loss": 0.1136, "step": 14450 }, { "epoch": 1.7177476835352814, "grad_norm": 0.08679607202234789, "learning_rate": 2.03651118306342e-05, "loss": 0.1129, "step": 14460 }, { "epoch": 1.718935614160133, "grad_norm": 0.09047693167725847, "learning_rate": 2.0333609181115742e-05, "loss": 0.1168, "step": 14470 }, { "epoch": 1.7201235447849845, "grad_norm": 0.08957290166396618, "learning_rate": 2.0302114206810492e-05, "loss": 0.115, "step": 14480 }, { "epoch": 1.721311475409836, "grad_norm": 0.09062878642664418, "learning_rate": 2.027062695952091e-05, "loss": 0.1144, "step": 14490 }, { "epoch": 1.7224994060346877, "grad_norm": 0.09455267905951706, "learning_rate": 2.023914749103681e-05, "loss": 0.1156, "step": 14500 }, { "epoch": 1.723687336659539, "grad_norm": 0.08955157671065217, "learning_rate": 2.0207675853135147e-05, "loss": 0.1142, "step": 14510 }, { "epoch": 1.7248752672843906, "grad_norm": 0.09371031875637113, "learning_rate": 2.0176212097580033e-05, "loss": 0.1131, "step": 14520 }, { "epoch": 1.7260631979092422, "grad_norm": 0.09455655015919659, "learning_rate": 2.0144756276122604e-05, "loss": 0.1162, "step": 14530 }, { "epoch": 1.7272511285340935, "grad_norm": 0.08565268781680861, "learning_rate": 2.0113308440500942e-05, "loss": 0.1158, "step": 14540 }, { "epoch": 1.7284390591589451, "grad_norm": 0.08602110114480159, "learning_rate": 2.0081868642440007e-05, "loss": 0.1144, "step": 14550 }, { "epoch": 1.7296269897837966, "grad_norm": 0.09275524953342124, "learning_rate": 2.0050436933651528e-05, "loss": 0.1166, "step": 14560 }, { "epoch": 1.730814920408648, "grad_norm": 0.09041495058087023, "learning_rate": 2.0019013365833935e-05, "loss": 0.1148, "step": 14570 }, { "epoch": 1.7320028510334997, "grad_norm": 0.08930885217517144, "learning_rate": 1.998759799067226e-05, "loss": 0.1153, "step": 14580 }, { "epoch": 1.7331907816583512, "grad_norm": 0.0832994765535002, "learning_rate": 1.995619085983806e-05, "loss": 0.1099, "step": 14590 }, { "epoch": 1.7343787122832026, "grad_norm": 0.0886786624265946, "learning_rate": 1.992479202498935e-05, "loss": 0.1166, "step": 14600 }, { "epoch": 1.7355666429080543, "grad_norm": 0.08970454647913621, "learning_rate": 1.9893401537770486e-05, "loss": 0.1158, "step": 14610 }, { "epoch": 1.7367545735329055, "grad_norm": 0.08982165812540865, "learning_rate": 1.9862019449812092e-05, "loss": 0.1166, "step": 14620 }, { "epoch": 1.7379425041577572, "grad_norm": 0.08999613055175412, "learning_rate": 1.9830645812730972e-05, "loss": 0.1149, "step": 14630 }, { "epoch": 1.7391304347826086, "grad_norm": 0.08864754008792053, "learning_rate": 1.979928067813003e-05, "loss": 0.1157, "step": 14640 }, { "epoch": 1.74031836540746, "grad_norm": 0.08959425359142488, "learning_rate": 1.976792409759822e-05, "loss": 0.1131, "step": 14650 }, { "epoch": 1.7415062960323118, "grad_norm": 0.09053107264115648, "learning_rate": 1.9736576122710386e-05, "loss": 0.1155, "step": 14660 }, { "epoch": 1.7426942266571632, "grad_norm": 0.0918225633661447, "learning_rate": 1.9705236805027226e-05, "loss": 0.118, "step": 14670 }, { "epoch": 1.7438821572820147, "grad_norm": 0.08662849713997695, "learning_rate": 1.9673906196095203e-05, "loss": 0.114, "step": 14680 }, { "epoch": 1.7450700879068664, "grad_norm": 0.08928486425412752, "learning_rate": 1.9642584347446468e-05, "loss": 0.1135, "step": 14690 }, { "epoch": 1.7462580185317176, "grad_norm": 0.09010828496077375, "learning_rate": 1.9611271310598746e-05, "loss": 0.115, "step": 14700 }, { "epoch": 1.7474459491565693, "grad_norm": 0.08739159675407605, "learning_rate": 1.9579967137055273e-05, "loss": 0.1143, "step": 14710 }, { "epoch": 1.748633879781421, "grad_norm": 0.09223060737890848, "learning_rate": 1.9548671878304708e-05, "loss": 0.1138, "step": 14720 }, { "epoch": 1.7498218104062722, "grad_norm": 0.0873602571892572, "learning_rate": 1.9517385585821037e-05, "loss": 0.1102, "step": 14730 }, { "epoch": 1.7510097410311238, "grad_norm": 0.08579983679279285, "learning_rate": 1.9486108311063527e-05, "loss": 0.1118, "step": 14740 }, { "epoch": 1.7521976716559753, "grad_norm": 0.09277723499431699, "learning_rate": 1.945484010547659e-05, "loss": 0.1155, "step": 14750 }, { "epoch": 1.7533856022808267, "grad_norm": 0.08585095408880485, "learning_rate": 1.9423581020489723e-05, "loss": 0.1126, "step": 14760 }, { "epoch": 1.7545735329056784, "grad_norm": 0.08746341266468963, "learning_rate": 1.939233110751742e-05, "loss": 0.1128, "step": 14770 }, { "epoch": 1.7557614635305299, "grad_norm": 0.08897467326762268, "learning_rate": 1.9361090417959097e-05, "loss": 0.1157, "step": 14780 }, { "epoch": 1.7569493941553813, "grad_norm": 0.08677340502241601, "learning_rate": 1.9329859003198997e-05, "loss": 0.1136, "step": 14790 }, { "epoch": 1.758137324780233, "grad_norm": 0.09616440166448031, "learning_rate": 1.9298636914606102e-05, "loss": 0.1158, "step": 14800 }, { "epoch": 1.7593252554050842, "grad_norm": 0.08457678328446307, "learning_rate": 1.9267424203534062e-05, "loss": 0.1131, "step": 14810 }, { "epoch": 1.760513186029936, "grad_norm": 0.09010160124281295, "learning_rate": 1.923622092132109e-05, "loss": 0.1153, "step": 14820 }, { "epoch": 1.7617011166547873, "grad_norm": 0.0919424647163785, "learning_rate": 1.920502711928992e-05, "loss": 0.116, "step": 14830 }, { "epoch": 1.7628890472796388, "grad_norm": 0.08935121839317646, "learning_rate": 1.9173842848747665e-05, "loss": 0.114, "step": 14840 }, { "epoch": 1.7640769779044905, "grad_norm": 0.08846910592736404, "learning_rate": 1.9142668160985762e-05, "loss": 0.1127, "step": 14850 }, { "epoch": 1.765264908529342, "grad_norm": 0.0910723255163988, "learning_rate": 1.9111503107279904e-05, "loss": 0.1104, "step": 14860 }, { "epoch": 1.7664528391541934, "grad_norm": 0.0927984823975911, "learning_rate": 1.9080347738889916e-05, "loss": 0.1159, "step": 14870 }, { "epoch": 1.767640769779045, "grad_norm": 0.093676238873656, "learning_rate": 1.904920210705972e-05, "loss": 0.1154, "step": 14880 }, { "epoch": 1.7688287004038963, "grad_norm": 0.09123233743727406, "learning_rate": 1.9018066263017195e-05, "loss": 0.1149, "step": 14890 }, { "epoch": 1.770016631028748, "grad_norm": 0.08784146048863165, "learning_rate": 1.898694025797414e-05, "loss": 0.1123, "step": 14900 }, { "epoch": 1.7712045616535994, "grad_norm": 0.08732488095696447, "learning_rate": 1.8955824143126164e-05, "loss": 0.1152, "step": 14910 }, { "epoch": 1.7723924922784509, "grad_norm": 0.08811836260454232, "learning_rate": 1.89247179696526e-05, "loss": 0.116, "step": 14920 }, { "epoch": 1.7735804229033025, "grad_norm": 0.08657766586276756, "learning_rate": 1.8893621788716456e-05, "loss": 0.1136, "step": 14930 }, { "epoch": 1.774768353528154, "grad_norm": 0.0911291741238905, "learning_rate": 1.886253565146429e-05, "loss": 0.1151, "step": 14940 }, { "epoch": 1.7759562841530054, "grad_norm": 0.08877960124657092, "learning_rate": 1.883145960902612e-05, "loss": 0.1161, "step": 14950 }, { "epoch": 1.7771442147778571, "grad_norm": 0.08362452405743087, "learning_rate": 1.880039371251539e-05, "loss": 0.1165, "step": 14960 }, { "epoch": 1.7783321454027083, "grad_norm": 0.08808929433646093, "learning_rate": 1.8769338013028828e-05, "loss": 0.1145, "step": 14970 }, { "epoch": 1.77952007602756, "grad_norm": 0.0925308015001537, "learning_rate": 1.873829256164643e-05, "loss": 0.1119, "step": 14980 }, { "epoch": 1.7807080066524115, "grad_norm": 0.08818451467489818, "learning_rate": 1.8707257409431307e-05, "loss": 0.114, "step": 14990 }, { "epoch": 1.781895937277263, "grad_norm": 0.08762619321370985, "learning_rate": 1.8676232607429627e-05, "loss": 0.1141, "step": 15000 }, { "epoch": 1.7830838679021146, "grad_norm": 0.08872551128643816, "learning_rate": 1.8645218206670545e-05, "loss": 0.1138, "step": 15010 }, { "epoch": 1.784271798526966, "grad_norm": 0.09008193236751572, "learning_rate": 1.861421425816611e-05, "loss": 0.1114, "step": 15020 }, { "epoch": 1.7854597291518175, "grad_norm": 0.09268726159889899, "learning_rate": 1.8583220812911174e-05, "loss": 0.1196, "step": 15030 }, { "epoch": 1.7866476597766692, "grad_norm": 0.08847684363363575, "learning_rate": 1.8552237921883314e-05, "loss": 0.1124, "step": 15040 }, { "epoch": 1.7878355904015204, "grad_norm": 0.08946826558916127, "learning_rate": 1.8521265636042752e-05, "loss": 0.1145, "step": 15050 }, { "epoch": 1.789023521026372, "grad_norm": 0.08928739566628813, "learning_rate": 1.849030400633224e-05, "loss": 0.1167, "step": 15060 }, { "epoch": 1.7902114516512235, "grad_norm": 0.08262372729405541, "learning_rate": 1.845935308367706e-05, "loss": 0.1109, "step": 15070 }, { "epoch": 1.791399382276075, "grad_norm": 0.08770634101034165, "learning_rate": 1.8428412918984833e-05, "loss": 0.1136, "step": 15080 }, { "epoch": 1.7925873129009267, "grad_norm": 0.08694340160353438, "learning_rate": 1.8397483563145503e-05, "loss": 0.1145, "step": 15090 }, { "epoch": 1.793775243525778, "grad_norm": 0.0994626947867343, "learning_rate": 1.836656506703123e-05, "loss": 0.1127, "step": 15100 }, { "epoch": 1.7949631741506296, "grad_norm": 0.09099560910655682, "learning_rate": 1.8335657481496316e-05, "loss": 0.1107, "step": 15110 }, { "epoch": 1.7961511047754812, "grad_norm": 0.09176418275070572, "learning_rate": 1.830476085737713e-05, "loss": 0.114, "step": 15120 }, { "epoch": 1.7973390354003325, "grad_norm": 0.09034852968726191, "learning_rate": 1.8273875245491988e-05, "loss": 0.1128, "step": 15130 }, { "epoch": 1.7985269660251841, "grad_norm": 0.08997350798949753, "learning_rate": 1.824300069664111e-05, "loss": 0.1115, "step": 15140 }, { "epoch": 1.7997148966500356, "grad_norm": 0.09066004742609747, "learning_rate": 1.8212137261606498e-05, "loss": 0.1183, "step": 15150 }, { "epoch": 1.800902827274887, "grad_norm": 0.0899364779712256, "learning_rate": 1.818128499115192e-05, "loss": 0.1144, "step": 15160 }, { "epoch": 1.8020907578997387, "grad_norm": 0.09027112408677838, "learning_rate": 1.8150443936022736e-05, "loss": 0.1138, "step": 15170 }, { "epoch": 1.8032786885245902, "grad_norm": 0.09172301426893295, "learning_rate": 1.8119614146945874e-05, "loss": 0.1109, "step": 15180 }, { "epoch": 1.8044666191494416, "grad_norm": 0.08636038199572688, "learning_rate": 1.8088795674629733e-05, "loss": 0.1124, "step": 15190 }, { "epoch": 1.8056545497742933, "grad_norm": 0.08635268520982738, "learning_rate": 1.805798856976409e-05, "loss": 0.1137, "step": 15200 }, { "epoch": 1.8068424803991445, "grad_norm": 0.09361472292379971, "learning_rate": 1.802719288302005e-05, "loss": 0.1144, "step": 15210 }, { "epoch": 1.8080304110239962, "grad_norm": 0.08972567406758898, "learning_rate": 1.799640866504991e-05, "loss": 0.1118, "step": 15220 }, { "epoch": 1.8092183416488479, "grad_norm": 0.08798245693121295, "learning_rate": 1.7965635966487112e-05, "loss": 0.1152, "step": 15230 }, { "epoch": 1.810406272273699, "grad_norm": 0.0887426702717753, "learning_rate": 1.7934874837946156e-05, "loss": 0.1128, "step": 15240 }, { "epoch": 1.8115942028985508, "grad_norm": 0.0864930479221224, "learning_rate": 1.7904125330022504e-05, "loss": 0.1126, "step": 15250 }, { "epoch": 1.8127821335234022, "grad_norm": 0.08644634136291682, "learning_rate": 1.7873387493292515e-05, "loss": 0.1142, "step": 15260 }, { "epoch": 1.8139700641482537, "grad_norm": 0.08737996538139121, "learning_rate": 1.7842661378313342e-05, "loss": 0.1127, "step": 15270 }, { "epoch": 1.8151579947731054, "grad_norm": 0.08567752524741667, "learning_rate": 1.781194703562286e-05, "loss": 0.118, "step": 15280 }, { "epoch": 1.8163459253979568, "grad_norm": 0.09523761597734633, "learning_rate": 1.7781244515739594e-05, "loss": 0.1141, "step": 15290 }, { "epoch": 1.8175338560228083, "grad_norm": 0.09459460225970034, "learning_rate": 1.7750553869162585e-05, "loss": 0.1127, "step": 15300 }, { "epoch": 1.81872178664766, "grad_norm": 0.09004921417547664, "learning_rate": 1.77198751463714e-05, "loss": 0.1118, "step": 15310 }, { "epoch": 1.8199097172725112, "grad_norm": 0.09359116003899301, "learning_rate": 1.7689208397825956e-05, "loss": 0.1151, "step": 15320 }, { "epoch": 1.8210976478973628, "grad_norm": 0.09202203932150485, "learning_rate": 1.7658553673966483e-05, "loss": 0.1156, "step": 15330 }, { "epoch": 1.8222855785222143, "grad_norm": 0.08636911475222162, "learning_rate": 1.7627911025213428e-05, "loss": 0.1143, "step": 15340 }, { "epoch": 1.8234735091470657, "grad_norm": 0.08703217948414511, "learning_rate": 1.75972805019674e-05, "loss": 0.1118, "step": 15350 }, { "epoch": 1.8246614397719174, "grad_norm": 0.08853159694705455, "learning_rate": 1.756666215460903e-05, "loss": 0.1109, "step": 15360 }, { "epoch": 1.8258493703967689, "grad_norm": 0.09121189694101142, "learning_rate": 1.7536056033498953e-05, "loss": 0.1089, "step": 15370 }, { "epoch": 1.8270373010216203, "grad_norm": 0.08838460116289293, "learning_rate": 1.750546218897768e-05, "loss": 0.1163, "step": 15380 }, { "epoch": 1.828225231646472, "grad_norm": 0.08835559869802082, "learning_rate": 1.7474880671365513e-05, "loss": 0.1132, "step": 15390 }, { "epoch": 1.8294131622713232, "grad_norm": 0.09609804285270343, "learning_rate": 1.744431153096252e-05, "loss": 0.1124, "step": 15400 }, { "epoch": 1.830601092896175, "grad_norm": 0.09144493029748925, "learning_rate": 1.7413754818048384e-05, "loss": 0.1133, "step": 15410 }, { "epoch": 1.8317890235210263, "grad_norm": 0.08614529304754234, "learning_rate": 1.7383210582882342e-05, "loss": 0.112, "step": 15420 }, { "epoch": 1.8329769541458778, "grad_norm": 0.09396351982515125, "learning_rate": 1.7352678875703126e-05, "loss": 0.1153, "step": 15430 }, { "epoch": 1.8341648847707295, "grad_norm": 0.09137052550510825, "learning_rate": 1.732215974672884e-05, "loss": 0.1128, "step": 15440 }, { "epoch": 1.835352815395581, "grad_norm": 0.08797453315632556, "learning_rate": 1.729165324615693e-05, "loss": 0.1133, "step": 15450 }, { "epoch": 1.8365407460204324, "grad_norm": 0.08817668197478476, "learning_rate": 1.726115942416405e-05, "loss": 0.1139, "step": 15460 }, { "epoch": 1.837728676645284, "grad_norm": 0.09558267914655894, "learning_rate": 1.7230678330905998e-05, "loss": 0.1122, "step": 15470 }, { "epoch": 1.8389166072701353, "grad_norm": 0.0928081596774017, "learning_rate": 1.7200210016517633e-05, "loss": 0.1168, "step": 15480 }, { "epoch": 1.840104537894987, "grad_norm": 0.09087521631936382, "learning_rate": 1.7169754531112826e-05, "loss": 0.1168, "step": 15490 }, { "epoch": 1.8412924685198384, "grad_norm": 0.09186096540409625, "learning_rate": 1.7139311924784318e-05, "loss": 0.1125, "step": 15500 }, { "epoch": 1.8424803991446899, "grad_norm": 0.09162896831598626, "learning_rate": 1.7108882247603674e-05, "loss": 0.1143, "step": 15510 }, { "epoch": 1.8436683297695415, "grad_norm": 0.08815513205784194, "learning_rate": 1.7078465549621197e-05, "loss": 0.1141, "step": 15520 }, { "epoch": 1.844856260394393, "grad_norm": 0.08301998338546733, "learning_rate": 1.7048061880865822e-05, "loss": 0.1138, "step": 15530 }, { "epoch": 1.8460441910192444, "grad_norm": 0.08995798482965538, "learning_rate": 1.7017671291345095e-05, "loss": 0.1176, "step": 15540 }, { "epoch": 1.8472321216440961, "grad_norm": 0.09565109419421765, "learning_rate": 1.698729383104501e-05, "loss": 0.1138, "step": 15550 }, { "epoch": 1.8484200522689473, "grad_norm": 0.08784160619229844, "learning_rate": 1.695692954992999e-05, "loss": 0.1112, "step": 15560 }, { "epoch": 1.849607982893799, "grad_norm": 0.08975129076285439, "learning_rate": 1.6926578497942766e-05, "loss": 0.1151, "step": 15570 }, { "epoch": 1.8507959135186505, "grad_norm": 0.08947421272028988, "learning_rate": 1.689624072500431e-05, "loss": 0.1125, "step": 15580 }, { "epoch": 1.851983844143502, "grad_norm": 0.08828518283732849, "learning_rate": 1.6865916281013776e-05, "loss": 0.1145, "step": 15590 }, { "epoch": 1.8531717747683536, "grad_norm": 0.0904774404793168, "learning_rate": 1.6835605215848368e-05, "loss": 0.1131, "step": 15600 }, { "epoch": 1.854359705393205, "grad_norm": 0.08866585849222795, "learning_rate": 1.6805307579363294e-05, "loss": 0.1089, "step": 15610 }, { "epoch": 1.8555476360180565, "grad_norm": 0.08843953384282376, "learning_rate": 1.677502342139168e-05, "loss": 0.111, "step": 15620 }, { "epoch": 1.8567355666429082, "grad_norm": 0.09411466667995426, "learning_rate": 1.6744752791744462e-05, "loss": 0.1124, "step": 15630 }, { "epoch": 1.8579234972677594, "grad_norm": 0.09010288738611956, "learning_rate": 1.6714495740210363e-05, "loss": 0.1156, "step": 15640 }, { "epoch": 1.859111427892611, "grad_norm": 0.08940105417824468, "learning_rate": 1.6684252316555742e-05, "loss": 0.1126, "step": 15650 }, { "epoch": 1.8602993585174625, "grad_norm": 0.09085975904630779, "learning_rate": 1.665402257052455e-05, "loss": 0.1127, "step": 15660 }, { "epoch": 1.861487289142314, "grad_norm": 0.09050363431477876, "learning_rate": 1.6623806551838243e-05, "loss": 0.1151, "step": 15670 }, { "epoch": 1.8626752197671657, "grad_norm": 0.08718326501092706, "learning_rate": 1.6593604310195704e-05, "loss": 0.1125, "step": 15680 }, { "epoch": 1.863863150392017, "grad_norm": 0.0914385611183382, "learning_rate": 1.6563415895273143e-05, "loss": 0.1145, "step": 15690 }, { "epoch": 1.8650510810168686, "grad_norm": 0.09149478342451242, "learning_rate": 1.6533241356724038e-05, "loss": 0.1132, "step": 15700 }, { "epoch": 1.8662390116417202, "grad_norm": 0.08951629424634343, "learning_rate": 1.650308074417904e-05, "loss": 0.114, "step": 15710 }, { "epoch": 1.8674269422665715, "grad_norm": 0.08995373002947624, "learning_rate": 1.6472934107245886e-05, "loss": 0.1138, "step": 15720 }, { "epoch": 1.8686148728914231, "grad_norm": 0.08934384245335375, "learning_rate": 1.644280149550936e-05, "loss": 0.1133, "step": 15730 }, { "epoch": 1.8698028035162748, "grad_norm": 0.09232930902013163, "learning_rate": 1.6412682958531134e-05, "loss": 0.1165, "step": 15740 }, { "epoch": 1.870990734141126, "grad_norm": 0.08997570283592156, "learning_rate": 1.6382578545849754e-05, "loss": 0.1129, "step": 15750 }, { "epoch": 1.8721786647659777, "grad_norm": 0.08868109303704588, "learning_rate": 1.6352488306980522e-05, "loss": 0.1122, "step": 15760 }, { "epoch": 1.8733665953908292, "grad_norm": 0.09017848012380678, "learning_rate": 1.6322412291415442e-05, "loss": 0.1124, "step": 15770 }, { "epoch": 1.8745545260156806, "grad_norm": 0.08935752853353607, "learning_rate": 1.629235054862312e-05, "loss": 0.1144, "step": 15780 }, { "epoch": 1.8757424566405323, "grad_norm": 0.09147784376878043, "learning_rate": 1.6262303128048678e-05, "loss": 0.113, "step": 15790 }, { "epoch": 1.8769303872653837, "grad_norm": 0.09153794441780241, "learning_rate": 1.6232270079113685e-05, "loss": 0.1132, "step": 15800 }, { "epoch": 1.8781183178902352, "grad_norm": 0.08942895574045563, "learning_rate": 1.6202251451216062e-05, "loss": 0.1133, "step": 15810 }, { "epoch": 1.8793062485150869, "grad_norm": 0.09305805290224196, "learning_rate": 1.6172247293730043e-05, "loss": 0.1177, "step": 15820 }, { "epoch": 1.880494179139938, "grad_norm": 0.08861821153134787, "learning_rate": 1.6142257656006027e-05, "loss": 0.111, "step": 15830 }, { "epoch": 1.8816821097647898, "grad_norm": 0.08599133995061267, "learning_rate": 1.6112282587370554e-05, "loss": 0.1149, "step": 15840 }, { "epoch": 1.8828700403896412, "grad_norm": 0.08471066390484676, "learning_rate": 1.608232213712617e-05, "loss": 0.1143, "step": 15850 }, { "epoch": 1.8840579710144927, "grad_norm": 0.09296050505944824, "learning_rate": 1.60523763545514e-05, "loss": 0.113, "step": 15860 }, { "epoch": 1.8852459016393444, "grad_norm": 0.09094489923074821, "learning_rate": 1.602244528890066e-05, "loss": 0.1159, "step": 15870 }, { "epoch": 1.8864338322641958, "grad_norm": 0.09079328626257285, "learning_rate": 1.5992528989404125e-05, "loss": 0.1126, "step": 15880 }, { "epoch": 1.8876217628890473, "grad_norm": 0.08917607438021033, "learning_rate": 1.596262750526771e-05, "loss": 0.1114, "step": 15890 }, { "epoch": 1.888809693513899, "grad_norm": 0.08869392341350395, "learning_rate": 1.5932740885672937e-05, "loss": 0.1155, "step": 15900 }, { "epoch": 1.8899976241387502, "grad_norm": 0.08707767956448893, "learning_rate": 1.5902869179776897e-05, "loss": 0.1143, "step": 15910 }, { "epoch": 1.8911855547636018, "grad_norm": 0.0891961334551608, "learning_rate": 1.5873012436712154e-05, "loss": 0.1128, "step": 15920 }, { "epoch": 1.8923734853884533, "grad_norm": 0.08971298042392051, "learning_rate": 1.5843170705586653e-05, "loss": 0.1104, "step": 15930 }, { "epoch": 1.8935614160133047, "grad_norm": 0.09017856789720155, "learning_rate": 1.5813344035483637e-05, "loss": 0.1152, "step": 15940 }, { "epoch": 1.8947493466381564, "grad_norm": 0.08926047544502289, "learning_rate": 1.57835324754616e-05, "loss": 0.1135, "step": 15950 }, { "epoch": 1.8959372772630079, "grad_norm": 0.08724690873225287, "learning_rate": 1.5753736074554156e-05, "loss": 0.1134, "step": 15960 }, { "epoch": 1.8971252078878593, "grad_norm": 0.08802672669888162, "learning_rate": 1.5723954881770024e-05, "loss": 0.1115, "step": 15970 }, { "epoch": 1.898313138512711, "grad_norm": 0.08840064678331808, "learning_rate": 1.5694188946092875e-05, "loss": 0.1127, "step": 15980 }, { "epoch": 1.8995010691375622, "grad_norm": 0.0951076214118213, "learning_rate": 1.5664438316481293e-05, "loss": 0.1124, "step": 15990 }, { "epoch": 1.900688999762414, "grad_norm": 0.08697362287294386, "learning_rate": 1.5634703041868692e-05, "loss": 0.1138, "step": 16000 }, { "epoch": 1.9018769303872654, "grad_norm": 0.09149322995381802, "learning_rate": 1.5604983171163235e-05, "loss": 0.1137, "step": 16010 }, { "epoch": 1.9030648610121168, "grad_norm": 0.08763828917985457, "learning_rate": 1.5575278753247734e-05, "loss": 0.1135, "step": 16020 }, { "epoch": 1.9042527916369685, "grad_norm": 0.09168021151809126, "learning_rate": 1.5545589836979597e-05, "loss": 0.1147, "step": 16030 }, { "epoch": 1.90544072226182, "grad_norm": 0.100591179133007, "learning_rate": 1.551591647119073e-05, "loss": 0.1128, "step": 16040 }, { "epoch": 1.9066286528866714, "grad_norm": 0.09210231891819662, "learning_rate": 1.548625870468745e-05, "loss": 0.1121, "step": 16050 }, { "epoch": 1.907816583511523, "grad_norm": 0.09047184718691875, "learning_rate": 1.5456616586250457e-05, "loss": 0.114, "step": 16060 }, { "epoch": 1.9090045141363743, "grad_norm": 0.08965188508836376, "learning_rate": 1.5426990164634662e-05, "loss": 0.1099, "step": 16070 }, { "epoch": 1.910192444761226, "grad_norm": 0.08943395041991764, "learning_rate": 1.5397379488569195e-05, "loss": 0.1142, "step": 16080 }, { "epoch": 1.9113803753860774, "grad_norm": 0.08586819113130548, "learning_rate": 1.536778460675727e-05, "loss": 0.1137, "step": 16090 }, { "epoch": 1.9125683060109289, "grad_norm": 0.08617478147326357, "learning_rate": 1.533820556787612e-05, "loss": 0.1133, "step": 16100 }, { "epoch": 1.9137562366357805, "grad_norm": 0.09244127137633319, "learning_rate": 1.530864242057694e-05, "loss": 0.1132, "step": 16110 }, { "epoch": 1.914944167260632, "grad_norm": 0.09225247383817628, "learning_rate": 1.5279095213484768e-05, "loss": 0.1124, "step": 16120 }, { "epoch": 1.9161320978854834, "grad_norm": 0.09344427407023205, "learning_rate": 1.5249563995198426e-05, "loss": 0.1098, "step": 16130 }, { "epoch": 1.9173200285103351, "grad_norm": 0.08741619640792364, "learning_rate": 1.5220048814290438e-05, "loss": 0.1152, "step": 16140 }, { "epoch": 1.9185079591351863, "grad_norm": 0.09051342706326738, "learning_rate": 1.5190549719306974e-05, "loss": 0.1118, "step": 16150 }, { "epoch": 1.919695889760038, "grad_norm": 0.09361467228815833, "learning_rate": 1.5161066758767706e-05, "loss": 0.1133, "step": 16160 }, { "epoch": 1.9208838203848895, "grad_norm": 0.085762648551318, "learning_rate": 1.5131599981165795e-05, "loss": 0.1127, "step": 16170 }, { "epoch": 1.922071751009741, "grad_norm": 0.09223709400210856, "learning_rate": 1.5102149434967778e-05, "loss": 0.1144, "step": 16180 }, { "epoch": 1.9232596816345926, "grad_norm": 0.0934467580769674, "learning_rate": 1.5072715168613477e-05, "loss": 0.1135, "step": 16190 }, { "epoch": 1.924447612259444, "grad_norm": 0.09604291186176254, "learning_rate": 1.5043297230515985e-05, "loss": 0.1072, "step": 16200 }, { "epoch": 1.9256355428842955, "grad_norm": 0.0928531891871139, "learning_rate": 1.501389566906149e-05, "loss": 0.1145, "step": 16210 }, { "epoch": 1.9268234735091472, "grad_norm": 0.09028775047316785, "learning_rate": 1.4984510532609263e-05, "loss": 0.1159, "step": 16220 }, { "epoch": 1.9280114041339986, "grad_norm": 0.08952538211500047, "learning_rate": 1.4955141869491562e-05, "loss": 0.1127, "step": 16230 }, { "epoch": 1.92919933475885, "grad_norm": 0.09159842458044233, "learning_rate": 1.4925789728013536e-05, "loss": 0.1109, "step": 16240 }, { "epoch": 1.9303872653837018, "grad_norm": 0.08832423266377182, "learning_rate": 1.4896454156453185e-05, "loss": 0.1142, "step": 16250 }, { "epoch": 1.931575196008553, "grad_norm": 0.09090402003242964, "learning_rate": 1.486713520306123e-05, "loss": 0.1104, "step": 16260 }, { "epoch": 1.9327631266334047, "grad_norm": 0.09425493606729404, "learning_rate": 1.4837832916061074e-05, "loss": 0.1145, "step": 16270 }, { "epoch": 1.9339510572582561, "grad_norm": 0.08929019923337338, "learning_rate": 1.4808547343648682e-05, "loss": 0.115, "step": 16280 }, { "epoch": 1.9351389878831076, "grad_norm": 0.08925593307684501, "learning_rate": 1.4779278533992574e-05, "loss": 0.111, "step": 16290 }, { "epoch": 1.9363269185079592, "grad_norm": 0.08640331114404909, "learning_rate": 1.475002653523366e-05, "loss": 0.1133, "step": 16300 }, { "epoch": 1.9375148491328107, "grad_norm": 0.08882607985712844, "learning_rate": 1.4720791395485211e-05, "loss": 0.1119, "step": 16310 }, { "epoch": 1.9387027797576621, "grad_norm": 0.0909162019802, "learning_rate": 1.4691573162832766e-05, "loss": 0.1107, "step": 16320 }, { "epoch": 1.9398907103825138, "grad_norm": 0.08882337917177055, "learning_rate": 1.4662371885334062e-05, "loss": 0.1112, "step": 16330 }, { "epoch": 1.941078641007365, "grad_norm": 0.08641157082172284, "learning_rate": 1.4633187611018945e-05, "loss": 0.1115, "step": 16340 }, { "epoch": 1.9422665716322167, "grad_norm": 0.09069215777551591, "learning_rate": 1.4604020387889295e-05, "loss": 0.1126, "step": 16350 }, { "epoch": 1.9434545022570682, "grad_norm": 0.0918124537016585, "learning_rate": 1.4574870263918939e-05, "loss": 0.1147, "step": 16360 }, { "epoch": 1.9446424328819196, "grad_norm": 0.09202904410406412, "learning_rate": 1.454573728705359e-05, "loss": 0.1152, "step": 16370 }, { "epoch": 1.9458303635067713, "grad_norm": 0.09092847837752108, "learning_rate": 1.4516621505210748e-05, "loss": 0.1107, "step": 16380 }, { "epoch": 1.9470182941316228, "grad_norm": 0.09168457758575163, "learning_rate": 1.4487522966279648e-05, "loss": 0.1147, "step": 16390 }, { "epoch": 1.9482062247564742, "grad_norm": 0.09030590896440953, "learning_rate": 1.4458441718121149e-05, "loss": 0.1131, "step": 16400 }, { "epoch": 1.9493941553813259, "grad_norm": 0.08598786712942044, "learning_rate": 1.4429377808567673e-05, "loss": 0.1097, "step": 16410 }, { "epoch": 1.950582086006177, "grad_norm": 0.09147080260548499, "learning_rate": 1.4400331285423122e-05, "loss": 0.109, "step": 16420 }, { "epoch": 1.9517700166310288, "grad_norm": 0.08703279382640655, "learning_rate": 1.4371302196462793e-05, "loss": 0.1128, "step": 16430 }, { "epoch": 1.9529579472558802, "grad_norm": 0.08733295929493576, "learning_rate": 1.4342290589433354e-05, "loss": 0.1147, "step": 16440 }, { "epoch": 1.9541458778807317, "grad_norm": 0.09087994235461208, "learning_rate": 1.4313296512052655e-05, "loss": 0.1105, "step": 16450 }, { "epoch": 1.9553338085055834, "grad_norm": 0.09319550511298358, "learning_rate": 1.4284320012009739e-05, "loss": 0.1111, "step": 16460 }, { "epoch": 1.9565217391304348, "grad_norm": 0.08849462975750824, "learning_rate": 1.4255361136964742e-05, "loss": 0.108, "step": 16470 }, { "epoch": 1.9577096697552863, "grad_norm": 0.08861531971902528, "learning_rate": 1.4226419934548829e-05, "loss": 0.1141, "step": 16480 }, { "epoch": 1.958897600380138, "grad_norm": 0.08763344283231074, "learning_rate": 1.4197496452364063e-05, "loss": 0.1116, "step": 16490 }, { "epoch": 1.9600855310049892, "grad_norm": 0.09324789986602144, "learning_rate": 1.4168590737983376e-05, "loss": 0.1149, "step": 16500 }, { "epoch": 1.9612734616298408, "grad_norm": 0.09196795544950531, "learning_rate": 1.4139702838950481e-05, "loss": 0.1123, "step": 16510 }, { "epoch": 1.9624613922546923, "grad_norm": 0.08640020874003493, "learning_rate": 1.4110832802779767e-05, "loss": 0.1132, "step": 16520 }, { "epoch": 1.9636493228795437, "grad_norm": 0.08964604577942674, "learning_rate": 1.408198067695628e-05, "loss": 0.113, "step": 16530 }, { "epoch": 1.9648372535043954, "grad_norm": 0.09887307142300221, "learning_rate": 1.405314650893558e-05, "loss": 0.1098, "step": 16540 }, { "epoch": 1.9660251841292469, "grad_norm": 0.08651789928881999, "learning_rate": 1.4024330346143694e-05, "loss": 0.1082, "step": 16550 }, { "epoch": 1.9672131147540983, "grad_norm": 0.08731215873458109, "learning_rate": 1.3995532235977036e-05, "loss": 0.1126, "step": 16560 }, { "epoch": 1.96840104537895, "grad_norm": 0.08629497658028003, "learning_rate": 1.3966752225802316e-05, "loss": 0.1082, "step": 16570 }, { "epoch": 1.9695889760038012, "grad_norm": 0.09675843578740842, "learning_rate": 1.3937990362956505e-05, "loss": 0.1145, "step": 16580 }, { "epoch": 1.970776906628653, "grad_norm": 0.08815743094429375, "learning_rate": 1.39092466947467e-05, "loss": 0.1101, "step": 16590 }, { "epoch": 1.9719648372535044, "grad_norm": 0.09159992016676166, "learning_rate": 1.3880521268450075e-05, "loss": 0.1097, "step": 16600 }, { "epoch": 1.9731527678783558, "grad_norm": 0.0838501459499305, "learning_rate": 1.38518141313138e-05, "loss": 0.1113, "step": 16610 }, { "epoch": 1.9743406985032075, "grad_norm": 0.09079125090369276, "learning_rate": 1.3823125330554967e-05, "loss": 0.1107, "step": 16620 }, { "epoch": 1.975528629128059, "grad_norm": 0.09404009572585598, "learning_rate": 1.379445491336051e-05, "loss": 0.1121, "step": 16630 }, { "epoch": 1.9767165597529104, "grad_norm": 0.08891391785888408, "learning_rate": 1.3765802926887119e-05, "loss": 0.1132, "step": 16640 }, { "epoch": 1.977904490377762, "grad_norm": 0.0882785493338732, "learning_rate": 1.3737169418261176e-05, "loss": 0.1104, "step": 16650 }, { "epoch": 1.9790924210026133, "grad_norm": 0.09036623516724818, "learning_rate": 1.3708554434578658e-05, "loss": 0.1124, "step": 16660 }, { "epoch": 1.980280351627465, "grad_norm": 0.09569584693157905, "learning_rate": 1.3679958022905104e-05, "loss": 0.1097, "step": 16670 }, { "epoch": 1.9814682822523164, "grad_norm": 0.08787533911258574, "learning_rate": 1.3651380230275471e-05, "loss": 0.1128, "step": 16680 }, { "epoch": 1.9826562128771679, "grad_norm": 0.09455305850134012, "learning_rate": 1.362282110369411e-05, "loss": 0.1094, "step": 16690 }, { "epoch": 1.9838441435020195, "grad_norm": 0.0876687339231918, "learning_rate": 1.3594280690134664e-05, "loss": 0.1104, "step": 16700 }, { "epoch": 1.985032074126871, "grad_norm": 0.08929173378882785, "learning_rate": 1.3565759036539988e-05, "loss": 0.1116, "step": 16710 }, { "epoch": 1.9862200047517224, "grad_norm": 0.08794746398271228, "learning_rate": 1.3537256189822113e-05, "loss": 0.112, "step": 16720 }, { "epoch": 1.9874079353765741, "grad_norm": 0.09199988594765929, "learning_rate": 1.3508772196862104e-05, "loss": 0.1123, "step": 16730 }, { "epoch": 1.9885958660014256, "grad_norm": 0.09287578744478828, "learning_rate": 1.3480307104510031e-05, "loss": 0.1118, "step": 16740 }, { "epoch": 1.989783796626277, "grad_norm": 0.08826792678853126, "learning_rate": 1.3451860959584869e-05, "loss": 0.1114, "step": 16750 }, { "epoch": 1.9909717272511287, "grad_norm": 0.08863215839351793, "learning_rate": 1.3423433808874436e-05, "loss": 0.1089, "step": 16760 }, { "epoch": 1.99215965787598, "grad_norm": 0.08624496122126764, "learning_rate": 1.3395025699135299e-05, "loss": 0.1135, "step": 16770 }, { "epoch": 1.9933475885008316, "grad_norm": 0.08634948183601741, "learning_rate": 1.336663667709272e-05, "loss": 0.1088, "step": 16780 }, { "epoch": 1.994535519125683, "grad_norm": 0.08594250501044007, "learning_rate": 1.333826678944055e-05, "loss": 0.1091, "step": 16790 }, { "epoch": 1.9957234497505345, "grad_norm": 0.0884708693437515, "learning_rate": 1.3309916082841179e-05, "loss": 0.1118, "step": 16800 }, { "epoch": 1.9969113803753862, "grad_norm": 0.09023834833429958, "learning_rate": 1.3281584603925451e-05, "loss": 0.1101, "step": 16810 }, { "epoch": 1.9980993110002376, "grad_norm": 0.08936372183620735, "learning_rate": 1.3253272399292577e-05, "loss": 0.1142, "step": 16820 }, { "epoch": 1.999287241625089, "grad_norm": 0.08995020498937097, "learning_rate": 1.3224979515510066e-05, "loss": 0.1095, "step": 16830 }, { "epoch": 2.0004751722499408, "grad_norm": 0.0924873852666803, "learning_rate": 1.3196705999113656e-05, "loss": 0.1009, "step": 16840 }, { "epoch": 2.001663102874792, "grad_norm": 0.09227922345908839, "learning_rate": 1.316845189660721e-05, "loss": 0.089, "step": 16850 }, { "epoch": 2.0028510334996437, "grad_norm": 0.08700579386449803, "learning_rate": 1.3140217254462688e-05, "loss": 0.0899, "step": 16860 }, { "epoch": 2.0040389641244953, "grad_norm": 0.08654311999345311, "learning_rate": 1.3112002119120025e-05, "loss": 0.0851, "step": 16870 }, { "epoch": 2.0052268947493466, "grad_norm": 0.09228912216922175, "learning_rate": 1.3083806536987075e-05, "loss": 0.0891, "step": 16880 }, { "epoch": 2.0064148253741982, "grad_norm": 0.09128894477678795, "learning_rate": 1.3055630554439527e-05, "loss": 0.087, "step": 16890 }, { "epoch": 2.0076027559990495, "grad_norm": 0.09260905140215228, "learning_rate": 1.3027474217820825e-05, "loss": 0.0879, "step": 16900 }, { "epoch": 2.008790686623901, "grad_norm": 0.0875833473137307, "learning_rate": 1.2999337573442133e-05, "loss": 0.0888, "step": 16910 }, { "epoch": 2.009978617248753, "grad_norm": 0.0924142694752428, "learning_rate": 1.2971220667582196e-05, "loss": 0.087, "step": 16920 }, { "epoch": 2.011166547873604, "grad_norm": 0.09279453914398536, "learning_rate": 1.294312354648729e-05, "loss": 0.088, "step": 16930 }, { "epoch": 2.0123544784984557, "grad_norm": 0.09478695016334969, "learning_rate": 1.291504625637117e-05, "loss": 0.0865, "step": 16940 }, { "epoch": 2.0135424091233074, "grad_norm": 0.09227871299220451, "learning_rate": 1.2886988843414962e-05, "loss": 0.0863, "step": 16950 }, { "epoch": 2.0147303397481586, "grad_norm": 0.09101475320911041, "learning_rate": 1.2858951353767096e-05, "loss": 0.0868, "step": 16960 }, { "epoch": 2.0159182703730103, "grad_norm": 0.09765536749551743, "learning_rate": 1.2830933833543237e-05, "loss": 0.0869, "step": 16970 }, { "epoch": 2.0171062009978615, "grad_norm": 0.0899957528669951, "learning_rate": 1.2802936328826204e-05, "loss": 0.0856, "step": 16980 }, { "epoch": 2.018294131622713, "grad_norm": 0.09116615034921327, "learning_rate": 1.2774958885665883e-05, "loss": 0.0876, "step": 16990 }, { "epoch": 2.019482062247565, "grad_norm": 0.09121539396049767, "learning_rate": 1.27470015500792e-05, "loss": 0.0871, "step": 17000 }, { "epoch": 2.020669992872416, "grad_norm": 0.09218692764651705, "learning_rate": 1.2719064368049962e-05, "loss": 0.0847, "step": 17010 }, { "epoch": 2.021857923497268, "grad_norm": 0.09158139491868442, "learning_rate": 1.269114738552886e-05, "loss": 0.087, "step": 17020 }, { "epoch": 2.0230458541221195, "grad_norm": 0.09200787716308687, "learning_rate": 1.266325064843334e-05, "loss": 0.0853, "step": 17030 }, { "epoch": 2.0242337847469707, "grad_norm": 0.0957110697829761, "learning_rate": 1.2635374202647554e-05, "loss": 0.0881, "step": 17040 }, { "epoch": 2.0254217153718224, "grad_norm": 0.09553750694484266, "learning_rate": 1.26075180940223e-05, "loss": 0.0859, "step": 17050 }, { "epoch": 2.0266096459966736, "grad_norm": 0.08809674268074627, "learning_rate": 1.2579682368374901e-05, "loss": 0.0881, "step": 17060 }, { "epoch": 2.0277975766215253, "grad_norm": 0.09220827232916895, "learning_rate": 1.2551867071489163e-05, "loss": 0.084, "step": 17070 }, { "epoch": 2.028985507246377, "grad_norm": 0.09312587210951115, "learning_rate": 1.2524072249115288e-05, "loss": 0.0874, "step": 17080 }, { "epoch": 2.030173437871228, "grad_norm": 0.09613147005096723, "learning_rate": 1.2496297946969804e-05, "loss": 0.0874, "step": 17090 }, { "epoch": 2.03136136849608, "grad_norm": 0.09650315889083438, "learning_rate": 1.2468544210735492e-05, "loss": 0.0852, "step": 17100 }, { "epoch": 2.0325492991209315, "grad_norm": 0.09590243313927344, "learning_rate": 1.2440811086061297e-05, "loss": 0.0884, "step": 17110 }, { "epoch": 2.0337372297457827, "grad_norm": 0.09214954429891148, "learning_rate": 1.2413098618562272e-05, "loss": 0.0843, "step": 17120 }, { "epoch": 2.0349251603706344, "grad_norm": 0.09482580369697916, "learning_rate": 1.2385406853819475e-05, "loss": 0.0887, "step": 17130 }, { "epoch": 2.0361130909954857, "grad_norm": 0.09287548803663785, "learning_rate": 1.2357735837379949e-05, "loss": 0.0847, "step": 17140 }, { "epoch": 2.0373010216203373, "grad_norm": 0.09534994695872205, "learning_rate": 1.2330085614756582e-05, "loss": 0.0855, "step": 17150 }, { "epoch": 2.038488952245189, "grad_norm": 0.08907553628440773, "learning_rate": 1.2302456231428059e-05, "loss": 0.0851, "step": 17160 }, { "epoch": 2.0396768828700402, "grad_norm": 0.09726961049337671, "learning_rate": 1.2274847732838801e-05, "loss": 0.0886, "step": 17170 }, { "epoch": 2.040864813494892, "grad_norm": 0.09576614677426365, "learning_rate": 1.2247260164398864e-05, "loss": 0.0901, "step": 17180 }, { "epoch": 2.0420527441197436, "grad_norm": 0.09277976698494574, "learning_rate": 1.2219693571483904e-05, "loss": 0.0893, "step": 17190 }, { "epoch": 2.043240674744595, "grad_norm": 0.0952064125108923, "learning_rate": 1.2192147999435052e-05, "loss": 0.0877, "step": 17200 }, { "epoch": 2.0444286053694465, "grad_norm": 0.0968848945109287, "learning_rate": 1.2164623493558875e-05, "loss": 0.0894, "step": 17210 }, { "epoch": 2.0456165359942977, "grad_norm": 0.09548685757596373, "learning_rate": 1.2137120099127289e-05, "loss": 0.0876, "step": 17220 }, { "epoch": 2.0468044666191494, "grad_norm": 0.09487665114085837, "learning_rate": 1.210963786137747e-05, "loss": 0.0852, "step": 17230 }, { "epoch": 2.047992397244001, "grad_norm": 0.09329978559292507, "learning_rate": 1.208217682551184e-05, "loss": 0.0842, "step": 17240 }, { "epoch": 2.0491803278688523, "grad_norm": 0.0955256388918495, "learning_rate": 1.2054737036697905e-05, "loss": 0.0848, "step": 17250 }, { "epoch": 2.050368258493704, "grad_norm": 0.09634349321815967, "learning_rate": 1.202731854006824e-05, "loss": 0.0855, "step": 17260 }, { "epoch": 2.0515561891185556, "grad_norm": 0.09360719911849032, "learning_rate": 1.1999921380720394e-05, "loss": 0.0847, "step": 17270 }, { "epoch": 2.052744119743407, "grad_norm": 0.09424852949803941, "learning_rate": 1.1972545603716832e-05, "loss": 0.0835, "step": 17280 }, { "epoch": 2.0539320503682585, "grad_norm": 0.09722559439732775, "learning_rate": 1.1945191254084833e-05, "loss": 0.0885, "step": 17290 }, { "epoch": 2.05511998099311, "grad_norm": 0.0938868638134765, "learning_rate": 1.1917858376816454e-05, "loss": 0.0852, "step": 17300 }, { "epoch": 2.0563079116179614, "grad_norm": 0.0959190639519432, "learning_rate": 1.1890547016868415e-05, "loss": 0.0837, "step": 17310 }, { "epoch": 2.057495842242813, "grad_norm": 0.09500057660181253, "learning_rate": 1.1863257219162041e-05, "loss": 0.0869, "step": 17320 }, { "epoch": 2.0586837728676644, "grad_norm": 0.09104462492964494, "learning_rate": 1.1835989028583228e-05, "loss": 0.0881, "step": 17330 }, { "epoch": 2.059871703492516, "grad_norm": 0.09645860319981896, "learning_rate": 1.1808742489982291e-05, "loss": 0.0881, "step": 17340 }, { "epoch": 2.0610596341173677, "grad_norm": 0.0925569618072811, "learning_rate": 1.1781517648173954e-05, "loss": 0.0843, "step": 17350 }, { "epoch": 2.062247564742219, "grad_norm": 0.09627022773951124, "learning_rate": 1.175431454793725e-05, "loss": 0.0894, "step": 17360 }, { "epoch": 2.0634354953670706, "grad_norm": 0.09426651003792402, "learning_rate": 1.1727133234015439e-05, "loss": 0.088, "step": 17370 }, { "epoch": 2.0646234259919223, "grad_norm": 0.09762692910354487, "learning_rate": 1.1699973751115977e-05, "loss": 0.0912, "step": 17380 }, { "epoch": 2.0658113566167735, "grad_norm": 0.09251376611617049, "learning_rate": 1.167283614391039e-05, "loss": 0.0876, "step": 17390 }, { "epoch": 2.066999287241625, "grad_norm": 0.09962394377604816, "learning_rate": 1.1645720457034224e-05, "loss": 0.0877, "step": 17400 }, { "epoch": 2.0681872178664764, "grad_norm": 0.09486868919054431, "learning_rate": 1.1618626735086974e-05, "loss": 0.0851, "step": 17410 }, { "epoch": 2.069375148491328, "grad_norm": 0.09569455399967267, "learning_rate": 1.1591555022632013e-05, "loss": 0.085, "step": 17420 }, { "epoch": 2.0705630791161798, "grad_norm": 0.09543266780063876, "learning_rate": 1.1564505364196506e-05, "loss": 0.0869, "step": 17430 }, { "epoch": 2.071751009741031, "grad_norm": 0.09407071466735613, "learning_rate": 1.1537477804271346e-05, "loss": 0.0863, "step": 17440 }, { "epoch": 2.0729389403658827, "grad_norm": 0.09397124831580884, "learning_rate": 1.1510472387311078e-05, "loss": 0.0868, "step": 17450 }, { "epoch": 2.0741268709907343, "grad_norm": 0.09623566989779721, "learning_rate": 1.1483489157733817e-05, "loss": 0.0856, "step": 17460 }, { "epoch": 2.0753148016155856, "grad_norm": 0.09150068869489729, "learning_rate": 1.1456528159921217e-05, "loss": 0.0854, "step": 17470 }, { "epoch": 2.0765027322404372, "grad_norm": 0.09175290547289268, "learning_rate": 1.1429589438218336e-05, "loss": 0.0873, "step": 17480 }, { "epoch": 2.0776906628652885, "grad_norm": 0.0974641238656291, "learning_rate": 1.1402673036933592e-05, "loss": 0.0845, "step": 17490 }, { "epoch": 2.07887859349014, "grad_norm": 0.09747023275483338, "learning_rate": 1.1375779000338708e-05, "loss": 0.0845, "step": 17500 }, { "epoch": 2.080066524114992, "grad_norm": 0.09366568835518688, "learning_rate": 1.1348907372668593e-05, "loss": 0.0846, "step": 17510 }, { "epoch": 2.081254454739843, "grad_norm": 0.09374707882505007, "learning_rate": 1.1322058198121347e-05, "loss": 0.0854, "step": 17520 }, { "epoch": 2.0824423853646947, "grad_norm": 0.0948857839151772, "learning_rate": 1.1295231520858093e-05, "loss": 0.0871, "step": 17530 }, { "epoch": 2.0836303159895464, "grad_norm": 0.09617543048701425, "learning_rate": 1.1268427385002972e-05, "loss": 0.0874, "step": 17540 }, { "epoch": 2.0848182466143976, "grad_norm": 0.09444084114113172, "learning_rate": 1.1241645834643045e-05, "loss": 0.084, "step": 17550 }, { "epoch": 2.0860061772392493, "grad_norm": 0.10001947131768317, "learning_rate": 1.1214886913828212e-05, "loss": 0.0863, "step": 17560 }, { "epoch": 2.0871941078641005, "grad_norm": 0.0966641913342431, "learning_rate": 1.118815066657119e-05, "loss": 0.0839, "step": 17570 }, { "epoch": 2.088382038488952, "grad_norm": 0.10050458754324579, "learning_rate": 1.116143713684736e-05, "loss": 0.0862, "step": 17580 }, { "epoch": 2.089569969113804, "grad_norm": 0.09665733239742841, "learning_rate": 1.1134746368594765e-05, "loss": 0.0836, "step": 17590 }, { "epoch": 2.090757899738655, "grad_norm": 0.09345744993784293, "learning_rate": 1.1108078405713992e-05, "loss": 0.0857, "step": 17600 }, { "epoch": 2.091945830363507, "grad_norm": 0.09514825935918304, "learning_rate": 1.1081433292068133e-05, "loss": 0.0854, "step": 17610 }, { "epoch": 2.0931337609883585, "grad_norm": 0.09752283458722545, "learning_rate": 1.105481107148269e-05, "loss": 0.0837, "step": 17620 }, { "epoch": 2.0943216916132097, "grad_norm": 0.09322921566536005, "learning_rate": 1.1028211787745512e-05, "loss": 0.0867, "step": 17630 }, { "epoch": 2.0955096222380614, "grad_norm": 0.09312635952297278, "learning_rate": 1.100163548460672e-05, "loss": 0.0876, "step": 17640 }, { "epoch": 2.0966975528629126, "grad_norm": 0.09700681553607864, "learning_rate": 1.0975082205778637e-05, "loss": 0.0854, "step": 17650 }, { "epoch": 2.0978854834877643, "grad_norm": 0.10161311928769923, "learning_rate": 1.0948551994935726e-05, "loss": 0.0854, "step": 17660 }, { "epoch": 2.099073414112616, "grad_norm": 0.09540178251483282, "learning_rate": 1.09220448957145e-05, "loss": 0.0846, "step": 17670 }, { "epoch": 2.100261344737467, "grad_norm": 0.0944843391467141, "learning_rate": 1.0895560951713455e-05, "loss": 0.0876, "step": 17680 }, { "epoch": 2.101449275362319, "grad_norm": 0.09583705820888383, "learning_rate": 1.0869100206493004e-05, "loss": 0.0843, "step": 17690 }, { "epoch": 2.1026372059871705, "grad_norm": 0.09440072904605387, "learning_rate": 1.0842662703575398e-05, "loss": 0.0861, "step": 17700 }, { "epoch": 2.1038251366120218, "grad_norm": 0.09407656072566277, "learning_rate": 1.0816248486444684e-05, "loss": 0.0894, "step": 17710 }, { "epoch": 2.1050130672368734, "grad_norm": 0.09387105763643643, "learning_rate": 1.0789857598546575e-05, "loss": 0.0885, "step": 17720 }, { "epoch": 2.106200997861725, "grad_norm": 0.09306776161579523, "learning_rate": 1.0763490083288433e-05, "loss": 0.087, "step": 17730 }, { "epoch": 2.1073889284865763, "grad_norm": 0.09319536630864116, "learning_rate": 1.0737145984039174e-05, "loss": 0.084, "step": 17740 }, { "epoch": 2.108576859111428, "grad_norm": 0.09779199040880647, "learning_rate": 1.071082534412919e-05, "loss": 0.0854, "step": 17750 }, { "epoch": 2.1097647897362792, "grad_norm": 0.0986096218830203, "learning_rate": 1.0684528206850303e-05, "loss": 0.0832, "step": 17760 }, { "epoch": 2.110952720361131, "grad_norm": 0.0936638195263505, "learning_rate": 1.0658254615455664e-05, "loss": 0.0868, "step": 17770 }, { "epoch": 2.1121406509859826, "grad_norm": 0.09380585142126585, "learning_rate": 1.0632004613159707e-05, "loss": 0.087, "step": 17780 }, { "epoch": 2.113328581610834, "grad_norm": 0.0957357408255024, "learning_rate": 1.060577824313805e-05, "loss": 0.0848, "step": 17790 }, { "epoch": 2.1145165122356855, "grad_norm": 0.09777367550165274, "learning_rate": 1.0579575548527474e-05, "loss": 0.0851, "step": 17800 }, { "epoch": 2.1157044428605367, "grad_norm": 0.09407461318092931, "learning_rate": 1.0553396572425783e-05, "loss": 0.0844, "step": 17810 }, { "epoch": 2.1168923734853884, "grad_norm": 0.09619895869269052, "learning_rate": 1.052724135789179e-05, "loss": 0.0864, "step": 17820 }, { "epoch": 2.11808030411024, "grad_norm": 0.0929594043006304, "learning_rate": 1.0501109947945217e-05, "loss": 0.0848, "step": 17830 }, { "epoch": 2.1192682347350913, "grad_norm": 0.09035417519264159, "learning_rate": 1.0475002385566629e-05, "loss": 0.0831, "step": 17840 }, { "epoch": 2.120456165359943, "grad_norm": 0.0967945122083799, "learning_rate": 1.044891871369739e-05, "loss": 0.0849, "step": 17850 }, { "epoch": 2.1216440959847946, "grad_norm": 0.09699581357239914, "learning_rate": 1.0422858975239539e-05, "loss": 0.0853, "step": 17860 }, { "epoch": 2.122832026609646, "grad_norm": 0.09861966913129763, "learning_rate": 1.0396823213055767e-05, "loss": 0.0878, "step": 17870 }, { "epoch": 2.1240199572344975, "grad_norm": 0.09958604889676635, "learning_rate": 1.0370811469969327e-05, "loss": 0.0845, "step": 17880 }, { "epoch": 2.125207887859349, "grad_norm": 0.09161672675982284, "learning_rate": 1.0344823788763949e-05, "loss": 0.0816, "step": 17890 }, { "epoch": 2.1263958184842005, "grad_norm": 0.09672069422314154, "learning_rate": 1.0318860212183823e-05, "loss": 0.0833, "step": 17900 }, { "epoch": 2.127583749109052, "grad_norm": 0.09502414831530245, "learning_rate": 1.029292078293346e-05, "loss": 0.0863, "step": 17910 }, { "epoch": 2.1287716797339034, "grad_norm": 0.0966711202721373, "learning_rate": 1.0267005543677678e-05, "loss": 0.0875, "step": 17920 }, { "epoch": 2.129959610358755, "grad_norm": 0.09837218574210117, "learning_rate": 1.0241114537041458e-05, "loss": 0.0873, "step": 17930 }, { "epoch": 2.1311475409836067, "grad_norm": 0.0956664191156803, "learning_rate": 1.021524780560999e-05, "loss": 0.0876, "step": 17940 }, { "epoch": 2.132335471608458, "grad_norm": 0.09335226658343682, "learning_rate": 1.0189405391928497e-05, "loss": 0.0873, "step": 17950 }, { "epoch": 2.1335234022333096, "grad_norm": 0.09353876158980563, "learning_rate": 1.0163587338502214e-05, "loss": 0.0855, "step": 17960 }, { "epoch": 2.1347113328581613, "grad_norm": 0.09145925847194085, "learning_rate": 1.0137793687796307e-05, "loss": 0.0833, "step": 17970 }, { "epoch": 2.1358992634830125, "grad_norm": 0.093739857865411, "learning_rate": 1.0112024482235799e-05, "loss": 0.0835, "step": 17980 }, { "epoch": 2.137087194107864, "grad_norm": 0.09433239598438985, "learning_rate": 1.0086279764205525e-05, "loss": 0.0869, "step": 17990 }, { "epoch": 2.1382751247327154, "grad_norm": 0.09676950756862072, "learning_rate": 1.0060559576050027e-05, "loss": 0.085, "step": 18000 }, { "epoch": 2.139463055357567, "grad_norm": 0.09595132054741819, "learning_rate": 1.00348639600735e-05, "loss": 0.0858, "step": 18010 }, { "epoch": 2.1406509859824188, "grad_norm": 0.09881355501608764, "learning_rate": 1.0009192958539731e-05, "loss": 0.0888, "step": 18020 }, { "epoch": 2.14183891660727, "grad_norm": 0.09693957378048619, "learning_rate": 9.983546613672007e-06, "loss": 0.0847, "step": 18030 }, { "epoch": 2.1430268472321217, "grad_norm": 0.09249676336665646, "learning_rate": 9.957924967653092e-06, "loss": 0.0833, "step": 18040 }, { "epoch": 2.1442147778569733, "grad_norm": 0.0997058973222426, "learning_rate": 9.932328062625093e-06, "loss": 0.0872, "step": 18050 }, { "epoch": 2.1454027084818246, "grad_norm": 0.09570421168704561, "learning_rate": 9.906755940689433e-06, "loss": 0.0832, "step": 18060 }, { "epoch": 2.1465906391066762, "grad_norm": 0.09538260861611315, "learning_rate": 9.881208643906777e-06, "loss": 0.0863, "step": 18070 }, { "epoch": 2.1477785697315275, "grad_norm": 0.09534372857062422, "learning_rate": 9.855686214296958e-06, "loss": 0.0853, "step": 18080 }, { "epoch": 2.148966500356379, "grad_norm": 0.09308354458893545, "learning_rate": 9.830188693838899e-06, "loss": 0.0856, "step": 18090 }, { "epoch": 2.150154430981231, "grad_norm": 0.09383919301025292, "learning_rate": 9.804716124470561e-06, "loss": 0.0843, "step": 18100 }, { "epoch": 2.151342361606082, "grad_norm": 0.09891912136040464, "learning_rate": 9.779268548088866e-06, "loss": 0.0898, "step": 18110 }, { "epoch": 2.1525302922309337, "grad_norm": 0.09746990803403588, "learning_rate": 9.753846006549613e-06, "loss": 0.085, "step": 18120 }, { "epoch": 2.1537182228557854, "grad_norm": 0.09317786127269032, "learning_rate": 9.728448541667456e-06, "loss": 0.0834, "step": 18130 }, { "epoch": 2.1549061534806366, "grad_norm": 0.09853950587249884, "learning_rate": 9.703076195215776e-06, "loss": 0.0892, "step": 18140 }, { "epoch": 2.1560940841054883, "grad_norm": 0.09502275583576038, "learning_rate": 9.677729008926645e-06, "loss": 0.0854, "step": 18150 }, { "epoch": 2.15728201473034, "grad_norm": 0.09839223481339945, "learning_rate": 9.652407024490757e-06, "loss": 0.0842, "step": 18160 }, { "epoch": 2.158469945355191, "grad_norm": 0.09199558532383337, "learning_rate": 9.627110283557339e-06, "loss": 0.0857, "step": 18170 }, { "epoch": 2.159657875980043, "grad_norm": 0.09312907460130032, "learning_rate": 9.60183882773413e-06, "loss": 0.0869, "step": 18180 }, { "epoch": 2.160845806604894, "grad_norm": 0.10093881366492927, "learning_rate": 9.576592698587253e-06, "loss": 0.0849, "step": 18190 }, { "epoch": 2.162033737229746, "grad_norm": 0.0949352119978816, "learning_rate": 9.551371937641178e-06, "loss": 0.0848, "step": 18200 }, { "epoch": 2.1632216678545975, "grad_norm": 0.09933790290259069, "learning_rate": 9.526176586378663e-06, "loss": 0.0852, "step": 18210 }, { "epoch": 2.1644095984794487, "grad_norm": 0.09728090166845405, "learning_rate": 9.501006686240643e-06, "loss": 0.0865, "step": 18220 }, { "epoch": 2.1655975291043004, "grad_norm": 0.09311232511929206, "learning_rate": 9.475862278626236e-06, "loss": 0.0837, "step": 18230 }, { "epoch": 2.1667854597291516, "grad_norm": 0.09751152286668704, "learning_rate": 9.450743404892603e-06, "loss": 0.0847, "step": 18240 }, { "epoch": 2.1679733903540033, "grad_norm": 0.09313801837861571, "learning_rate": 9.425650106354894e-06, "loss": 0.0827, "step": 18250 }, { "epoch": 2.169161320978855, "grad_norm": 0.09876569431939043, "learning_rate": 9.400582424286205e-06, "loss": 0.0852, "step": 18260 }, { "epoch": 2.170349251603706, "grad_norm": 0.09346655391969864, "learning_rate": 9.375540399917529e-06, "loss": 0.0855, "step": 18270 }, { "epoch": 2.171537182228558, "grad_norm": 0.09561721316777257, "learning_rate": 9.350524074437613e-06, "loss": 0.0866, "step": 18280 }, { "epoch": 2.1727251128534095, "grad_norm": 0.09594358209056668, "learning_rate": 9.32553348899296e-06, "loss": 0.0837, "step": 18290 }, { "epoch": 2.1739130434782608, "grad_norm": 0.09738436612763852, "learning_rate": 9.300568684687727e-06, "loss": 0.0852, "step": 18300 }, { "epoch": 2.1751009741031124, "grad_norm": 0.09845164950490008, "learning_rate": 9.275629702583658e-06, "loss": 0.0861, "step": 18310 }, { "epoch": 2.176288904727964, "grad_norm": 0.09296714200460708, "learning_rate": 9.250716583700057e-06, "loss": 0.0865, "step": 18320 }, { "epoch": 2.1774768353528153, "grad_norm": 0.09573645882045387, "learning_rate": 9.225829369013652e-06, "loss": 0.0861, "step": 18330 }, { "epoch": 2.178664765977667, "grad_norm": 0.09690356188505489, "learning_rate": 9.200968099458588e-06, "loss": 0.085, "step": 18340 }, { "epoch": 2.1798526966025182, "grad_norm": 0.0953679925816962, "learning_rate": 9.176132815926322e-06, "loss": 0.0839, "step": 18350 }, { "epoch": 2.18104062722737, "grad_norm": 0.09664668816815628, "learning_rate": 9.151323559265568e-06, "loss": 0.085, "step": 18360 }, { "epoch": 2.1822285578522216, "grad_norm": 0.09966362852926167, "learning_rate": 9.126540370282253e-06, "loss": 0.0879, "step": 18370 }, { "epoch": 2.183416488477073, "grad_norm": 0.10060961489182525, "learning_rate": 9.10178328973941e-06, "loss": 0.0876, "step": 18380 }, { "epoch": 2.1846044191019245, "grad_norm": 0.09150465395939951, "learning_rate": 9.077052358357128e-06, "loss": 0.0861, "step": 18390 }, { "epoch": 2.185792349726776, "grad_norm": 0.09671469013560681, "learning_rate": 9.052347616812492e-06, "loss": 0.0865, "step": 18400 }, { "epoch": 2.1869802803516274, "grad_norm": 0.09908135570722151, "learning_rate": 9.02766910573951e-06, "loss": 0.0832, "step": 18410 }, { "epoch": 2.188168210976479, "grad_norm": 0.09291755370628982, "learning_rate": 9.003016865729047e-06, "loss": 0.0858, "step": 18420 }, { "epoch": 2.1893561416013303, "grad_norm": 0.09595516007567617, "learning_rate": 8.978390937328755e-06, "loss": 0.086, "step": 18430 }, { "epoch": 2.190544072226182, "grad_norm": 0.09551540762193261, "learning_rate": 8.953791361043007e-06, "loss": 0.0851, "step": 18440 }, { "epoch": 2.1917320028510336, "grad_norm": 0.0957881729160017, "learning_rate": 8.929218177332834e-06, "loss": 0.0851, "step": 18450 }, { "epoch": 2.192919933475885, "grad_norm": 0.09732007464013469, "learning_rate": 8.904671426615874e-06, "loss": 0.0849, "step": 18460 }, { "epoch": 2.1941078641007365, "grad_norm": 0.09274639327143186, "learning_rate": 8.880151149266261e-06, "loss": 0.0868, "step": 18470 }, { "epoch": 2.1952957947255882, "grad_norm": 0.09605610501769044, "learning_rate": 8.855657385614602e-06, "loss": 0.0863, "step": 18480 }, { "epoch": 2.1964837253504395, "grad_norm": 0.09350754834327166, "learning_rate": 8.83119017594789e-06, "loss": 0.0835, "step": 18490 }, { "epoch": 2.197671655975291, "grad_norm": 0.09900973736555614, "learning_rate": 8.806749560509434e-06, "loss": 0.084, "step": 18500 }, { "epoch": 2.1988595866001424, "grad_norm": 0.09681795811392058, "learning_rate": 8.782335579498827e-06, "loss": 0.0822, "step": 18510 }, { "epoch": 2.200047517224994, "grad_norm": 0.10093958069490636, "learning_rate": 8.75794827307183e-06, "loss": 0.0848, "step": 18520 }, { "epoch": 2.2012354478498457, "grad_norm": 0.09687746187248512, "learning_rate": 8.733587681340336e-06, "loss": 0.084, "step": 18530 }, { "epoch": 2.202423378474697, "grad_norm": 0.10005441538431208, "learning_rate": 8.709253844372303e-06, "loss": 0.0849, "step": 18540 }, { "epoch": 2.2036113090995486, "grad_norm": 0.09580220894081976, "learning_rate": 8.684946802191677e-06, "loss": 0.082, "step": 18550 }, { "epoch": 2.2047992397244003, "grad_norm": 0.09560990678432808, "learning_rate": 8.660666594778333e-06, "loss": 0.0824, "step": 18560 }, { "epoch": 2.2059871703492515, "grad_norm": 0.09794370006460607, "learning_rate": 8.636413262068016e-06, "loss": 0.0846, "step": 18570 }, { "epoch": 2.207175100974103, "grad_norm": 0.09699865506770555, "learning_rate": 8.612186843952258e-06, "loss": 0.0862, "step": 18580 }, { "epoch": 2.2083630315989544, "grad_norm": 0.0968008917555889, "learning_rate": 8.58798738027832e-06, "loss": 0.0836, "step": 18590 }, { "epoch": 2.209550962223806, "grad_norm": 0.09840474381324901, "learning_rate": 8.563814910849149e-06, "loss": 0.0862, "step": 18600 }, { "epoch": 2.2107388928486578, "grad_norm": 0.12039085192090919, "learning_rate": 8.539669475423279e-06, "loss": 0.0856, "step": 18610 }, { "epoch": 2.211926823473509, "grad_norm": 0.09778736592340043, "learning_rate": 8.515551113714767e-06, "loss": 0.0839, "step": 18620 }, { "epoch": 2.2131147540983607, "grad_norm": 0.0965552487448092, "learning_rate": 8.491459865393162e-06, "loss": 0.0857, "step": 18630 }, { "epoch": 2.2143026847232123, "grad_norm": 0.09840341144707128, "learning_rate": 8.46739577008339e-06, "loss": 0.0854, "step": 18640 }, { "epoch": 2.2154906153480636, "grad_norm": 0.09869281698203951, "learning_rate": 8.443358867365749e-06, "loss": 0.0845, "step": 18650 }, { "epoch": 2.2166785459729152, "grad_norm": 0.09399301907746442, "learning_rate": 8.419349196775794e-06, "loss": 0.0829, "step": 18660 }, { "epoch": 2.2178664765977665, "grad_norm": 0.09819257897428699, "learning_rate": 8.395366797804283e-06, "loss": 0.0848, "step": 18670 }, { "epoch": 2.219054407222618, "grad_norm": 0.09670354064742004, "learning_rate": 8.371411709897125e-06, "loss": 0.0846, "step": 18680 }, { "epoch": 2.22024233784747, "grad_norm": 0.09803072493335382, "learning_rate": 8.347483972455302e-06, "loss": 0.0849, "step": 18690 }, { "epoch": 2.221430268472321, "grad_norm": 0.09687642472674386, "learning_rate": 8.323583624834829e-06, "loss": 0.0869, "step": 18700 }, { "epoch": 2.2226181990971727, "grad_norm": 0.09608694867442416, "learning_rate": 8.299710706346653e-06, "loss": 0.0826, "step": 18710 }, { "epoch": 2.2238061297220244, "grad_norm": 0.09544104870538314, "learning_rate": 8.275865256256607e-06, "loss": 0.0841, "step": 18720 }, { "epoch": 2.2249940603468756, "grad_norm": 0.0993188357756491, "learning_rate": 8.25204731378535e-06, "loss": 0.082, "step": 18730 }, { "epoch": 2.2261819909717273, "grad_norm": 0.09905779368401214, "learning_rate": 8.228256918108295e-06, "loss": 0.0879, "step": 18740 }, { "epoch": 2.227369921596579, "grad_norm": 0.0979812729887852, "learning_rate": 8.204494108355546e-06, "loss": 0.0854, "step": 18750 }, { "epoch": 2.22855785222143, "grad_norm": 0.09246692792470836, "learning_rate": 8.180758923611834e-06, "loss": 0.0861, "step": 18760 }, { "epoch": 2.229745782846282, "grad_norm": 0.09608508031017099, "learning_rate": 8.157051402916452e-06, "loss": 0.0811, "step": 18770 }, { "epoch": 2.230933713471133, "grad_norm": 0.09521125272748876, "learning_rate": 8.133371585263183e-06, "loss": 0.0839, "step": 18780 }, { "epoch": 2.232121644095985, "grad_norm": 0.09295994808994613, "learning_rate": 8.109719509600278e-06, "loss": 0.0805, "step": 18790 }, { "epoch": 2.2333095747208365, "grad_norm": 0.10118117615069187, "learning_rate": 8.086095214830317e-06, "loss": 0.0838, "step": 18800 }, { "epoch": 2.2344975053456877, "grad_norm": 0.09689510799533171, "learning_rate": 8.06249873981021e-06, "loss": 0.0867, "step": 18810 }, { "epoch": 2.2356854359705394, "grad_norm": 0.09663625754261257, "learning_rate": 8.038930123351098e-06, "loss": 0.0829, "step": 18820 }, { "epoch": 2.2368733665953906, "grad_norm": 0.09450172488673801, "learning_rate": 8.015389404218305e-06, "loss": 0.0862, "step": 18830 }, { "epoch": 2.2380612972202423, "grad_norm": 0.09924356949375222, "learning_rate": 7.991876621131278e-06, "loss": 0.0835, "step": 18840 }, { "epoch": 2.239249227845094, "grad_norm": 0.10236512883992474, "learning_rate": 7.968391812763507e-06, "loss": 0.0847, "step": 18850 }, { "epoch": 2.240437158469945, "grad_norm": 0.09206828156852952, "learning_rate": 7.944935017742469e-06, "loss": 0.0865, "step": 18860 }, { "epoch": 2.241625089094797, "grad_norm": 0.09676615199188786, "learning_rate": 7.921506274649564e-06, "loss": 0.0832, "step": 18870 }, { "epoch": 2.2428130197196485, "grad_norm": 0.09510617569446017, "learning_rate": 7.898105622020058e-06, "loss": 0.0865, "step": 18880 }, { "epoch": 2.2440009503444998, "grad_norm": 0.0955828369039875, "learning_rate": 7.87473309834301e-06, "loss": 0.0826, "step": 18890 }, { "epoch": 2.2451888809693514, "grad_norm": 0.09197247014314291, "learning_rate": 7.851388742061214e-06, "loss": 0.0833, "step": 18900 }, { "epoch": 2.246376811594203, "grad_norm": 0.09611560656293774, "learning_rate": 7.828072591571136e-06, "loss": 0.0844, "step": 18910 }, { "epoch": 2.2475647422190543, "grad_norm": 0.09821614373343983, "learning_rate": 7.80478468522284e-06, "loss": 0.0889, "step": 18920 }, { "epoch": 2.248752672843906, "grad_norm": 0.09493799945989287, "learning_rate": 7.781525061319961e-06, "loss": 0.0843, "step": 18930 }, { "epoch": 2.2499406034687572, "grad_norm": 0.0951669324435755, "learning_rate": 7.758293758119583e-06, "loss": 0.0854, "step": 18940 }, { "epoch": 2.251128534093609, "grad_norm": 0.09627800433883851, "learning_rate": 7.73509081383223e-06, "loss": 0.0822, "step": 18950 }, { "epoch": 2.2523164647184606, "grad_norm": 0.10265826560432761, "learning_rate": 7.711916266621772e-06, "loss": 0.0856, "step": 18960 }, { "epoch": 2.253504395343312, "grad_norm": 0.09638019899236502, "learning_rate": 7.688770154605366e-06, "loss": 0.0834, "step": 18970 }, { "epoch": 2.2546923259681635, "grad_norm": 0.09383384351755149, "learning_rate": 7.665652515853421e-06, "loss": 0.0846, "step": 18980 }, { "epoch": 2.2558802565930147, "grad_norm": 0.09451254730522626, "learning_rate": 7.642563388389495e-06, "loss": 0.083, "step": 18990 }, { "epoch": 2.2570681872178664, "grad_norm": 0.09380008239164012, "learning_rate": 7.61950281019026e-06, "loss": 0.0825, "step": 19000 }, { "epoch": 2.258256117842718, "grad_norm": 0.09726626291503598, "learning_rate": 7.596470819185419e-06, "loss": 0.0836, "step": 19010 }, { "epoch": 2.2594440484675697, "grad_norm": 0.09989996748966869, "learning_rate": 7.57346745325766e-06, "loss": 0.0839, "step": 19020 }, { "epoch": 2.260631979092421, "grad_norm": 0.09873835977713705, "learning_rate": 7.550492750242605e-06, "loss": 0.0863, "step": 19030 }, { "epoch": 2.2618199097172726, "grad_norm": 0.09545727616514499, "learning_rate": 7.5275467479287155e-06, "loss": 0.0869, "step": 19040 }, { "epoch": 2.263007840342124, "grad_norm": 0.09652614577505929, "learning_rate": 7.504629484057243e-06, "loss": 0.0856, "step": 19050 }, { "epoch": 2.2641957709669756, "grad_norm": 0.10178739617521862, "learning_rate": 7.481740996322182e-06, "loss": 0.085, "step": 19060 }, { "epoch": 2.2653837015918272, "grad_norm": 0.09977524247214259, "learning_rate": 7.4588813223701895e-06, "loss": 0.0835, "step": 19070 }, { "epoch": 2.2665716322166785, "grad_norm": 0.09981472806580309, "learning_rate": 7.436050499800532e-06, "loss": 0.0866, "step": 19080 }, { "epoch": 2.26775956284153, "grad_norm": 0.10080937656896953, "learning_rate": 7.413248566165018e-06, "loss": 0.0883, "step": 19090 }, { "epoch": 2.2689474934663814, "grad_norm": 0.09445325688717356, "learning_rate": 7.390475558967949e-06, "loss": 0.0834, "step": 19100 }, { "epoch": 2.270135424091233, "grad_norm": 0.09470736608806561, "learning_rate": 7.3677315156660346e-06, "loss": 0.0832, "step": 19110 }, { "epoch": 2.2713233547160847, "grad_norm": 0.09550072692057851, "learning_rate": 7.345016473668365e-06, "loss": 0.0841, "step": 19120 }, { "epoch": 2.272511285340936, "grad_norm": 0.09527855290960452, "learning_rate": 7.3223304703363135e-06, "loss": 0.0825, "step": 19130 }, { "epoch": 2.2736992159657876, "grad_norm": 0.09923981229922248, "learning_rate": 7.299673542983496e-06, "loss": 0.085, "step": 19140 }, { "epoch": 2.2748871465906393, "grad_norm": 0.09481191123379298, "learning_rate": 7.277045728875701e-06, "loss": 0.082, "step": 19150 }, { "epoch": 2.2760750772154905, "grad_norm": 0.09695617871199816, "learning_rate": 7.2544470652308295e-06, "loss": 0.0835, "step": 19160 }, { "epoch": 2.277263007840342, "grad_norm": 0.09979581408451321, "learning_rate": 7.231877589218858e-06, "loss": 0.0857, "step": 19170 }, { "epoch": 2.278450938465194, "grad_norm": 0.09709324375198412, "learning_rate": 7.2093373379617305e-06, "loss": 0.0876, "step": 19180 }, { "epoch": 2.279638869090045, "grad_norm": 0.09745005184487636, "learning_rate": 7.186826348533329e-06, "loss": 0.0864, "step": 19190 }, { "epoch": 2.2808267997148968, "grad_norm": 0.09863767644771262, "learning_rate": 7.164344657959413e-06, "loss": 0.0842, "step": 19200 }, { "epoch": 2.282014730339748, "grad_norm": 0.10476975487328334, "learning_rate": 7.141892303217546e-06, "loss": 0.0835, "step": 19210 }, { "epoch": 2.2832026609645997, "grad_norm": 0.098886110400792, "learning_rate": 7.119469321237041e-06, "loss": 0.0841, "step": 19220 }, { "epoch": 2.2843905915894513, "grad_norm": 0.10178195980389097, "learning_rate": 7.097075748898896e-06, "loss": 0.0846, "step": 19230 }, { "epoch": 2.2855785222143026, "grad_norm": 0.09917101299475406, "learning_rate": 7.074711623035745e-06, "loss": 0.0856, "step": 19240 }, { "epoch": 2.2867664528391543, "grad_norm": 0.09835230301153136, "learning_rate": 7.052376980431777e-06, "loss": 0.0867, "step": 19250 }, { "epoch": 2.2879543834640055, "grad_norm": 0.09541708773301771, "learning_rate": 7.030071857822703e-06, "loss": 0.0827, "step": 19260 }, { "epoch": 2.289142314088857, "grad_norm": 0.09994150553801714, "learning_rate": 7.0077962918956715e-06, "loss": 0.0812, "step": 19270 }, { "epoch": 2.290330244713709, "grad_norm": 0.09650206594132485, "learning_rate": 6.9855503192892105e-06, "loss": 0.0839, "step": 19280 }, { "epoch": 2.29151817533856, "grad_norm": 0.09536660566948152, "learning_rate": 6.963333976593183e-06, "loss": 0.0839, "step": 19290 }, { "epoch": 2.2927061059634117, "grad_norm": 0.09323543912781072, "learning_rate": 6.941147300348702e-06, "loss": 0.0833, "step": 19300 }, { "epoch": 2.2938940365882634, "grad_norm": 0.09268401063654245, "learning_rate": 6.9189903270481135e-06, "loss": 0.0836, "step": 19310 }, { "epoch": 2.2950819672131146, "grad_norm": 0.09320896972786878, "learning_rate": 6.8968630931348864e-06, "loss": 0.0835, "step": 19320 }, { "epoch": 2.2962698978379663, "grad_norm": 0.0952098553022097, "learning_rate": 6.874765635003577e-06, "loss": 0.0854, "step": 19330 }, { "epoch": 2.297457828462818, "grad_norm": 0.09516052825250032, "learning_rate": 6.852697988999774e-06, "loss": 0.0831, "step": 19340 }, { "epoch": 2.298645759087669, "grad_norm": 0.09822904497846759, "learning_rate": 6.830660191420013e-06, "loss": 0.0846, "step": 19350 }, { "epoch": 2.299833689712521, "grad_norm": 0.09647556709743745, "learning_rate": 6.808652278511771e-06, "loss": 0.0844, "step": 19360 }, { "epoch": 2.301021620337372, "grad_norm": 0.09360207323437751, "learning_rate": 6.786674286473338e-06, "loss": 0.0861, "step": 19370 }, { "epoch": 2.302209550962224, "grad_norm": 0.0948297683277249, "learning_rate": 6.764726251453807e-06, "loss": 0.0846, "step": 19380 }, { "epoch": 2.3033974815870755, "grad_norm": 0.09424260920279937, "learning_rate": 6.742808209552992e-06, "loss": 0.0852, "step": 19390 }, { "epoch": 2.3045854122119267, "grad_norm": 0.10008535383657508, "learning_rate": 6.720920196821373e-06, "loss": 0.0828, "step": 19400 }, { "epoch": 2.3057733428367784, "grad_norm": 0.09549450766257012, "learning_rate": 6.6990622492600464e-06, "loss": 0.0845, "step": 19410 }, { "epoch": 2.3069612734616296, "grad_norm": 0.10020507834597701, "learning_rate": 6.677234402820651e-06, "loss": 0.0848, "step": 19420 }, { "epoch": 2.3081492040864813, "grad_norm": 0.09558928193791398, "learning_rate": 6.65543669340532e-06, "loss": 0.0823, "step": 19430 }, { "epoch": 2.309337134711333, "grad_norm": 0.0932008441165792, "learning_rate": 6.633669156866604e-06, "loss": 0.083, "step": 19440 }, { "epoch": 2.310525065336184, "grad_norm": 0.09664593626168191, "learning_rate": 6.611931829007456e-06, "loss": 0.0847, "step": 19450 }, { "epoch": 2.311712995961036, "grad_norm": 0.09465684800677526, "learning_rate": 6.590224745581114e-06, "loss": 0.0862, "step": 19460 }, { "epoch": 2.3129009265858875, "grad_norm": 0.09611162991692172, "learning_rate": 6.5685479422910815e-06, "loss": 0.0809, "step": 19470 }, { "epoch": 2.3140888572107388, "grad_norm": 0.0989598351148143, "learning_rate": 6.546901454791055e-06, "loss": 0.083, "step": 19480 }, { "epoch": 2.3152767878355904, "grad_norm": 0.095479578687509, "learning_rate": 6.5252853186848595e-06, "loss": 0.0832, "step": 19490 }, { "epoch": 2.316464718460442, "grad_norm": 0.10226350284430413, "learning_rate": 6.503699569526428e-06, "loss": 0.087, "step": 19500 }, { "epoch": 2.3176526490852933, "grad_norm": 0.09671428848684659, "learning_rate": 6.482144242819682e-06, "loss": 0.0864, "step": 19510 }, { "epoch": 2.318840579710145, "grad_norm": 0.09861233457348843, "learning_rate": 6.460619374018518e-06, "loss": 0.0855, "step": 19520 }, { "epoch": 2.3200285103349962, "grad_norm": 0.10054875846223188, "learning_rate": 6.43912499852673e-06, "loss": 0.0833, "step": 19530 }, { "epoch": 2.321216440959848, "grad_norm": 0.0973790513464369, "learning_rate": 6.417661151697965e-06, "loss": 0.0869, "step": 19540 }, { "epoch": 2.3224043715846996, "grad_norm": 0.09724463241020631, "learning_rate": 6.396227868835653e-06, "loss": 0.0826, "step": 19550 }, { "epoch": 2.323592302209551, "grad_norm": 0.09866417093946421, "learning_rate": 6.3748251851929504e-06, "loss": 0.087, "step": 19560 }, { "epoch": 2.3247802328344025, "grad_norm": 0.09136753737828171, "learning_rate": 6.353453135972687e-06, "loss": 0.0833, "step": 19570 }, { "epoch": 2.325968163459254, "grad_norm": 0.10002398491147996, "learning_rate": 6.3321117563273e-06, "loss": 0.0841, "step": 19580 }, { "epoch": 2.3271560940841054, "grad_norm": 0.10046559616452534, "learning_rate": 6.310801081358797e-06, "loss": 0.084, "step": 19590 }, { "epoch": 2.328344024708957, "grad_norm": 0.09511330205113173, "learning_rate": 6.289521146118671e-06, "loss": 0.0855, "step": 19600 }, { "epoch": 2.3295319553338087, "grad_norm": 0.09789187022689982, "learning_rate": 6.268271985607854e-06, "loss": 0.085, "step": 19610 }, { "epoch": 2.33071988595866, "grad_norm": 0.09718499052451804, "learning_rate": 6.2470536347766615e-06, "loss": 0.0823, "step": 19620 }, { "epoch": 2.3319078165835117, "grad_norm": 0.09752617929653107, "learning_rate": 6.225866128524729e-06, "loss": 0.085, "step": 19630 }, { "epoch": 2.333095747208363, "grad_norm": 0.09984356078722285, "learning_rate": 6.204709501700978e-06, "loss": 0.0856, "step": 19640 }, { "epoch": 2.3342836778332146, "grad_norm": 0.09878091590524465, "learning_rate": 6.183583789103525e-06, "loss": 0.0859, "step": 19650 }, { "epoch": 2.3354716084580662, "grad_norm": 0.09710521794934796, "learning_rate": 6.162489025479637e-06, "loss": 0.084, "step": 19660 }, { "epoch": 2.3366595390829175, "grad_norm": 0.09547199477630566, "learning_rate": 6.141425245525681e-06, "loss": 0.0826, "step": 19670 }, { "epoch": 2.337847469707769, "grad_norm": 0.09490549894238057, "learning_rate": 6.120392483887055e-06, "loss": 0.0834, "step": 19680 }, { "epoch": 2.3390354003326204, "grad_norm": 0.09868657970186381, "learning_rate": 6.0993907751581644e-06, "loss": 0.0849, "step": 19690 }, { "epoch": 2.340223330957472, "grad_norm": 0.09840819687210271, "learning_rate": 6.078420153882308e-06, "loss": 0.0856, "step": 19700 }, { "epoch": 2.3414112615823237, "grad_norm": 0.10093403181806868, "learning_rate": 6.0574806545516695e-06, "loss": 0.0829, "step": 19710 }, { "epoch": 2.342599192207175, "grad_norm": 0.09703794842261335, "learning_rate": 6.036572311607239e-06, "loss": 0.0831, "step": 19720 }, { "epoch": 2.3437871228320266, "grad_norm": 0.09722538641774758, "learning_rate": 6.015695159438759e-06, "loss": 0.0826, "step": 19730 }, { "epoch": 2.3449750534568783, "grad_norm": 0.10115973864105321, "learning_rate": 5.994849232384672e-06, "loss": 0.0832, "step": 19740 }, { "epoch": 2.3461629840817295, "grad_norm": 0.0964534435020742, "learning_rate": 5.974034564732065e-06, "loss": 0.083, "step": 19750 }, { "epoch": 2.347350914706581, "grad_norm": 0.096177016817494, "learning_rate": 5.953251190716605e-06, "loss": 0.0832, "step": 19760 }, { "epoch": 2.348538845331433, "grad_norm": 0.09782244105447468, "learning_rate": 5.932499144522483e-06, "loss": 0.0839, "step": 19770 }, { "epoch": 2.349726775956284, "grad_norm": 0.09871976840658897, "learning_rate": 5.911778460282386e-06, "loss": 0.0839, "step": 19780 }, { "epoch": 2.3509147065811358, "grad_norm": 0.0986400624030474, "learning_rate": 5.891089172077388e-06, "loss": 0.0833, "step": 19790 }, { "epoch": 2.352102637205987, "grad_norm": 0.09661627425788377, "learning_rate": 5.870431313936941e-06, "loss": 0.0849, "step": 19800 }, { "epoch": 2.3532905678308387, "grad_norm": 0.09603092855666229, "learning_rate": 5.849804919838795e-06, "loss": 0.0815, "step": 19810 }, { "epoch": 2.3544784984556903, "grad_norm": 0.09629816353057805, "learning_rate": 5.829210023708939e-06, "loss": 0.0831, "step": 19820 }, { "epoch": 2.3556664290805416, "grad_norm": 0.09740898060026477, "learning_rate": 5.808646659421582e-06, "loss": 0.0832, "step": 19830 }, { "epoch": 2.3568543597053933, "grad_norm": 0.09567103172702376, "learning_rate": 5.788114860799043e-06, "loss": 0.0822, "step": 19840 }, { "epoch": 2.3580422903302445, "grad_norm": 0.09616344022147193, "learning_rate": 5.767614661611739e-06, "loss": 0.0832, "step": 19850 }, { "epoch": 2.359230220955096, "grad_norm": 0.10056517876426033, "learning_rate": 5.747146095578099e-06, "loss": 0.0837, "step": 19860 }, { "epoch": 2.360418151579948, "grad_norm": 0.09306765333084611, "learning_rate": 5.726709196364532e-06, "loss": 0.0803, "step": 19870 }, { "epoch": 2.361606082204799, "grad_norm": 0.09573736233074821, "learning_rate": 5.706303997585358e-06, "loss": 0.0814, "step": 19880 }, { "epoch": 2.3627940128296507, "grad_norm": 0.09666188311284783, "learning_rate": 5.685930532802758e-06, "loss": 0.0832, "step": 19890 }, { "epoch": 2.3639819434545024, "grad_norm": 0.09601606425838725, "learning_rate": 5.665588835526714e-06, "loss": 0.0833, "step": 19900 }, { "epoch": 2.3651698740793536, "grad_norm": 0.10175141683079335, "learning_rate": 5.64527893921496e-06, "loss": 0.0858, "step": 19910 }, { "epoch": 2.3663578047042053, "grad_norm": 0.0923261383305043, "learning_rate": 5.625000877272932e-06, "loss": 0.0813, "step": 19920 }, { "epoch": 2.367545735329057, "grad_norm": 0.0977772547812404, "learning_rate": 5.604754683053693e-06, "loss": 0.0837, "step": 19930 }, { "epoch": 2.368733665953908, "grad_norm": 0.0958442341095941, "learning_rate": 5.584540389857895e-06, "loss": 0.0831, "step": 19940 }, { "epoch": 2.36992159657876, "grad_norm": 0.09931407336992859, "learning_rate": 5.564358030933719e-06, "loss": 0.0846, "step": 19950 }, { "epoch": 2.371109527203611, "grad_norm": 0.09661914417083615, "learning_rate": 5.544207639476817e-06, "loss": 0.0842, "step": 19960 }, { "epoch": 2.372297457828463, "grad_norm": 0.09459162623700525, "learning_rate": 5.524089248630279e-06, "loss": 0.0823, "step": 19970 }, { "epoch": 2.3734853884533145, "grad_norm": 0.09731713967086458, "learning_rate": 5.504002891484539e-06, "loss": 0.0839, "step": 19980 }, { "epoch": 2.3746733190781657, "grad_norm": 0.09988862677230154, "learning_rate": 5.483948601077354e-06, "loss": 0.0837, "step": 19990 }, { "epoch": 2.3758612497030174, "grad_norm": 0.09407881148051271, "learning_rate": 5.4639264103937325e-06, "loss": 0.0821, "step": 20000 }, { "epoch": 2.3770491803278686, "grad_norm": 0.0971582714110854, "learning_rate": 5.443936352365886e-06, "loss": 0.0819, "step": 20010 }, { "epoch": 2.3782371109527203, "grad_norm": 0.0942162261244562, "learning_rate": 5.423978459873186e-06, "loss": 0.0848, "step": 20020 }, { "epoch": 2.379425041577572, "grad_norm": 0.1032220681629773, "learning_rate": 5.404052765742096e-06, "loss": 0.0828, "step": 20030 }, { "epoch": 2.3806129722024236, "grad_norm": 0.09570785933708546, "learning_rate": 5.384159302746095e-06, "loss": 0.0881, "step": 20040 }, { "epoch": 2.381800902827275, "grad_norm": 0.0976046810873024, "learning_rate": 5.364298103605669e-06, "loss": 0.082, "step": 20050 }, { "epoch": 2.3829888334521265, "grad_norm": 0.09655106649364527, "learning_rate": 5.344469200988251e-06, "loss": 0.0834, "step": 20060 }, { "epoch": 2.3841767640769778, "grad_norm": 0.09583999531306471, "learning_rate": 5.324672627508129e-06, "loss": 0.0836, "step": 20070 }, { "epoch": 2.3853646947018294, "grad_norm": 0.09198858974222063, "learning_rate": 5.3049084157264285e-06, "loss": 0.083, "step": 20080 }, { "epoch": 2.386552625326681, "grad_norm": 0.09731945759469417, "learning_rate": 5.285176598151043e-06, "loss": 0.084, "step": 20090 }, { "epoch": 2.3877405559515323, "grad_norm": 0.10079539194957199, "learning_rate": 5.2654772072365744e-06, "loss": 0.0838, "step": 20100 }, { "epoch": 2.388928486576384, "grad_norm": 0.09566658045625726, "learning_rate": 5.2458102753843166e-06, "loss": 0.0825, "step": 20110 }, { "epoch": 2.3901164172012352, "grad_norm": 0.0971917761813903, "learning_rate": 5.226175834942154e-06, "loss": 0.0823, "step": 20120 }, { "epoch": 2.391304347826087, "grad_norm": 0.09722745423504789, "learning_rate": 5.206573918204533e-06, "loss": 0.0851, "step": 20130 }, { "epoch": 2.3924922784509386, "grad_norm": 0.0922914657754315, "learning_rate": 5.1870045574124084e-06, "loss": 0.085, "step": 20140 }, { "epoch": 2.39368020907579, "grad_norm": 0.10050949316007048, "learning_rate": 5.1674677847531776e-06, "loss": 0.0809, "step": 20150 }, { "epoch": 2.3948681397006415, "grad_norm": 0.10148142188546233, "learning_rate": 5.1479636323606614e-06, "loss": 0.0844, "step": 20160 }, { "epoch": 2.396056070325493, "grad_norm": 0.09759208923122287, "learning_rate": 5.128492132315005e-06, "loss": 0.0819, "step": 20170 }, { "epoch": 2.3972440009503444, "grad_norm": 0.1012910093453832, "learning_rate": 5.109053316642654e-06, "loss": 0.0823, "step": 20180 }, { "epoch": 2.398431931575196, "grad_norm": 0.09348127179810431, "learning_rate": 5.089647217316296e-06, "loss": 0.0855, "step": 20190 }, { "epoch": 2.3996198622000477, "grad_norm": 0.0924759571652859, "learning_rate": 5.070273866254807e-06, "loss": 0.0841, "step": 20200 }, { "epoch": 2.400807792824899, "grad_norm": 0.09813591361586473, "learning_rate": 5.0509332953231985e-06, "loss": 0.0863, "step": 20210 }, { "epoch": 2.4019957234497507, "grad_norm": 0.09847351043341299, "learning_rate": 5.0316255363325686e-06, "loss": 0.0823, "step": 20220 }, { "epoch": 2.403183654074602, "grad_norm": 0.09984195267327771, "learning_rate": 5.012350621040041e-06, "loss": 0.0842, "step": 20230 }, { "epoch": 2.4043715846994536, "grad_norm": 0.09884465033746422, "learning_rate": 4.993108581148717e-06, "loss": 0.085, "step": 20240 }, { "epoch": 2.4055595153243052, "grad_norm": 0.10052439848333017, "learning_rate": 4.973899448307645e-06, "loss": 0.0813, "step": 20250 }, { "epoch": 2.4067474459491565, "grad_norm": 0.09634922176523485, "learning_rate": 4.954723254111724e-06, "loss": 0.0853, "step": 20260 }, { "epoch": 2.407935376574008, "grad_norm": 0.09849491651186201, "learning_rate": 4.935580030101686e-06, "loss": 0.0835, "step": 20270 }, { "epoch": 2.4091233071988594, "grad_norm": 0.09667867787041406, "learning_rate": 4.9164698077640344e-06, "loss": 0.0834, "step": 20280 }, { "epoch": 2.410311237823711, "grad_norm": 0.09672963214548853, "learning_rate": 4.897392618530983e-06, "loss": 0.0822, "step": 20290 }, { "epoch": 2.4114991684485627, "grad_norm": 0.0982491336679762, "learning_rate": 4.878348493780435e-06, "loss": 0.083, "step": 20300 }, { "epoch": 2.412687099073414, "grad_norm": 0.10005938538293571, "learning_rate": 4.859337464835889e-06, "loss": 0.0838, "step": 20310 }, { "epoch": 2.4138750296982656, "grad_norm": 0.09813091449195295, "learning_rate": 4.840359562966415e-06, "loss": 0.0845, "step": 20320 }, { "epoch": 2.4150629603231173, "grad_norm": 0.09549459800890772, "learning_rate": 4.821414819386597e-06, "loss": 0.086, "step": 20330 }, { "epoch": 2.4162508909479685, "grad_norm": 0.10047515649876128, "learning_rate": 4.802503265256469e-06, "loss": 0.0837, "step": 20340 }, { "epoch": 2.41743882157282, "grad_norm": 0.09914183362753298, "learning_rate": 4.783624931681513e-06, "loss": 0.0821, "step": 20350 }, { "epoch": 2.418626752197672, "grad_norm": 0.09648306550499322, "learning_rate": 4.764779849712514e-06, "loss": 0.0825, "step": 20360 }, { "epoch": 2.419814682822523, "grad_norm": 0.09604704555495751, "learning_rate": 4.745968050345609e-06, "loss": 0.0841, "step": 20370 }, { "epoch": 2.4210026134473748, "grad_norm": 0.09892531480448721, "learning_rate": 4.72718956452217e-06, "loss": 0.0845, "step": 20380 }, { "epoch": 2.422190544072226, "grad_norm": 0.09943794135479157, "learning_rate": 4.7084444231287935e-06, "loss": 0.0851, "step": 20390 }, { "epoch": 2.4233784746970777, "grad_norm": 0.09979561689821388, "learning_rate": 4.6897326569972145e-06, "loss": 0.0811, "step": 20400 }, { "epoch": 2.4245664053219294, "grad_norm": 0.09842473061882394, "learning_rate": 4.671054296904284e-06, "loss": 0.0853, "step": 20410 }, { "epoch": 2.4257543359467806, "grad_norm": 0.09286777286579571, "learning_rate": 4.652409373571898e-06, "loss": 0.0846, "step": 20420 }, { "epoch": 2.4269422665716323, "grad_norm": 0.09722406861697167, "learning_rate": 4.6337979176669575e-06, "loss": 0.0817, "step": 20430 }, { "epoch": 2.4281301971964835, "grad_norm": 0.10026049344224129, "learning_rate": 4.615219959801332e-06, "loss": 0.0828, "step": 20440 }, { "epoch": 2.429318127821335, "grad_norm": 0.09301260953957417, "learning_rate": 4.596675530531777e-06, "loss": 0.0831, "step": 20450 }, { "epoch": 2.430506058446187, "grad_norm": 0.09931757765973875, "learning_rate": 4.578164660359904e-06, "loss": 0.0849, "step": 20460 }, { "epoch": 2.431693989071038, "grad_norm": 0.097509802061949, "learning_rate": 4.55968737973213e-06, "loss": 0.083, "step": 20470 }, { "epoch": 2.4328819196958897, "grad_norm": 0.10207899057592079, "learning_rate": 4.541243719039617e-06, "loss": 0.085, "step": 20480 }, { "epoch": 2.4340698503207414, "grad_norm": 0.09667966471142944, "learning_rate": 4.522833708618246e-06, "loss": 0.0814, "step": 20490 }, { "epoch": 2.4352577809455926, "grad_norm": 0.0956837426745059, "learning_rate": 4.504457378748536e-06, "loss": 0.083, "step": 20500 }, { "epoch": 2.4364457115704443, "grad_norm": 0.09500639932642808, "learning_rate": 4.486114759655613e-06, "loss": 0.0853, "step": 20510 }, { "epoch": 2.437633642195296, "grad_norm": 0.09418797055315452, "learning_rate": 4.467805881509157e-06, "loss": 0.0807, "step": 20520 }, { "epoch": 2.438821572820147, "grad_norm": 0.0975260280339202, "learning_rate": 4.449530774423347e-06, "loss": 0.0844, "step": 20530 }, { "epoch": 2.440009503444999, "grad_norm": 0.0982855287706997, "learning_rate": 4.43128946845682e-06, "loss": 0.0823, "step": 20540 }, { "epoch": 2.44119743406985, "grad_norm": 0.09852220042191705, "learning_rate": 4.413081993612617e-06, "loss": 0.0822, "step": 20550 }, { "epoch": 2.442385364694702, "grad_norm": 0.09617394020317145, "learning_rate": 4.394908379838133e-06, "loss": 0.0799, "step": 20560 }, { "epoch": 2.4435732953195535, "grad_norm": 0.09544716447893033, "learning_rate": 4.376768657025062e-06, "loss": 0.0823, "step": 20570 }, { "epoch": 2.4447612259444047, "grad_norm": 0.10068592096883983, "learning_rate": 4.3586628550093766e-06, "loss": 0.0839, "step": 20580 }, { "epoch": 2.4459491565692564, "grad_norm": 0.09678458699697279, "learning_rate": 4.340591003571237e-06, "loss": 0.0825, "step": 20590 }, { "epoch": 2.447137087194108, "grad_norm": 0.09636477512760103, "learning_rate": 4.322553132434965e-06, "loss": 0.0821, "step": 20600 }, { "epoch": 2.4483250178189593, "grad_norm": 0.10354760200135356, "learning_rate": 4.304549271268993e-06, "loss": 0.0842, "step": 20610 }, { "epoch": 2.449512948443811, "grad_norm": 0.098460137490856, "learning_rate": 4.28657944968581e-06, "loss": 0.0839, "step": 20620 }, { "epoch": 2.4507008790686626, "grad_norm": 0.09756106383463836, "learning_rate": 4.268643697241939e-06, "loss": 0.0812, "step": 20630 }, { "epoch": 2.451888809693514, "grad_norm": 0.0975749682135694, "learning_rate": 4.250742043437839e-06, "loss": 0.083, "step": 20640 }, { "epoch": 2.4530767403183655, "grad_norm": 0.09555770436805873, "learning_rate": 4.2328745177179e-06, "loss": 0.0858, "step": 20650 }, { "epoch": 2.4542646709432168, "grad_norm": 0.09974990784968729, "learning_rate": 4.21504114947037e-06, "loss": 0.0841, "step": 20660 }, { "epoch": 2.4554526015680684, "grad_norm": 0.09362413396457193, "learning_rate": 4.197241968027321e-06, "loss": 0.0851, "step": 20670 }, { "epoch": 2.45664053219292, "grad_norm": 0.09580800712396464, "learning_rate": 4.1794770026645945e-06, "loss": 0.0838, "step": 20680 }, { "epoch": 2.4578284628177713, "grad_norm": 0.09793863158042893, "learning_rate": 4.16174628260175e-06, "loss": 0.085, "step": 20690 }, { "epoch": 2.459016393442623, "grad_norm": 0.10204794230922827, "learning_rate": 4.144049837002026e-06, "loss": 0.0818, "step": 20700 }, { "epoch": 2.4602043240674742, "grad_norm": 0.1010246830621527, "learning_rate": 4.126387694972278e-06, "loss": 0.0824, "step": 20710 }, { "epoch": 2.461392254692326, "grad_norm": 0.100742541427019, "learning_rate": 4.108759885562955e-06, "loss": 0.0821, "step": 20720 }, { "epoch": 2.4625801853171776, "grad_norm": 0.0936094526589514, "learning_rate": 4.091166437768023e-06, "loss": 0.0831, "step": 20730 }, { "epoch": 2.463768115942029, "grad_norm": 0.10090223426664904, "learning_rate": 4.0736073805249355e-06, "loss": 0.0821, "step": 20740 }, { "epoch": 2.4649560465668805, "grad_norm": 0.09768011713922016, "learning_rate": 4.056082742714582e-06, "loss": 0.0851, "step": 20750 }, { "epoch": 2.466143977191732, "grad_norm": 0.10124934018737423, "learning_rate": 4.038592553161224e-06, "loss": 0.0835, "step": 20760 }, { "epoch": 2.4673319078165834, "grad_norm": 0.099532632504447, "learning_rate": 4.021136840632495e-06, "loss": 0.0858, "step": 20770 }, { "epoch": 2.468519838441435, "grad_norm": 0.10374582988960472, "learning_rate": 4.003715633839294e-06, "loss": 0.0842, "step": 20780 }, { "epoch": 2.4697077690662868, "grad_norm": 0.0976985554634864, "learning_rate": 3.986328961435773e-06, "loss": 0.0834, "step": 20790 }, { "epoch": 2.470895699691138, "grad_norm": 0.10053583919171322, "learning_rate": 3.968976852019285e-06, "loss": 0.082, "step": 20800 }, { "epoch": 2.4720836303159897, "grad_norm": 0.10182035181288693, "learning_rate": 3.951659334130329e-06, "loss": 0.082, "step": 20810 }, { "epoch": 2.473271560940841, "grad_norm": 0.09725169866543548, "learning_rate": 3.934376436252518e-06, "loss": 0.082, "step": 20820 }, { "epoch": 2.4744594915656926, "grad_norm": 0.09912771098352777, "learning_rate": 3.9171281868125135e-06, "loss": 0.0824, "step": 20830 }, { "epoch": 2.4756474221905442, "grad_norm": 0.10009734218193182, "learning_rate": 3.8999146141799915e-06, "loss": 0.0818, "step": 20840 }, { "epoch": 2.4768353528153955, "grad_norm": 0.0996005275253805, "learning_rate": 3.88273574666759e-06, "loss": 0.0851, "step": 20850 }, { "epoch": 2.478023283440247, "grad_norm": 0.09757969050149766, "learning_rate": 3.865591612530866e-06, "loss": 0.0828, "step": 20860 }, { "epoch": 2.4792112140650984, "grad_norm": 0.09574152282876225, "learning_rate": 3.848482239968249e-06, "loss": 0.0822, "step": 20870 }, { "epoch": 2.48039914468995, "grad_norm": 0.09415669291206438, "learning_rate": 3.831407657120989e-06, "loss": 0.083, "step": 20880 }, { "epoch": 2.4815870753148017, "grad_norm": 0.09394101755653708, "learning_rate": 3.8143678920731174e-06, "loss": 0.0828, "step": 20890 }, { "epoch": 2.482775005939653, "grad_norm": 0.09410789805174297, "learning_rate": 3.797362972851393e-06, "loss": 0.0809, "step": 20900 }, { "epoch": 2.4839629365645046, "grad_norm": 0.09997348319780282, "learning_rate": 3.7803929274252746e-06, "loss": 0.0801, "step": 20910 }, { "epoch": 2.4851508671893563, "grad_norm": 0.09223843085410775, "learning_rate": 3.7634577837068476e-06, "loss": 0.0806, "step": 20920 }, { "epoch": 2.4863387978142075, "grad_norm": 0.09961086356956765, "learning_rate": 3.7465575695507933e-06, "loss": 0.0855, "step": 20930 }, { "epoch": 2.487526728439059, "grad_norm": 0.09446545857321349, "learning_rate": 3.729692312754343e-06, "loss": 0.0844, "step": 20940 }, { "epoch": 2.488714659063911, "grad_norm": 0.09681695016748544, "learning_rate": 3.712862041057227e-06, "loss": 0.086, "step": 20950 }, { "epoch": 2.489902589688762, "grad_norm": 0.09925126711329697, "learning_rate": 3.6960667821416494e-06, "loss": 0.0832, "step": 20960 }, { "epoch": 2.4910905203136138, "grad_norm": 0.09561334075511256, "learning_rate": 3.6793065636322036e-06, "loss": 0.0825, "step": 20970 }, { "epoch": 2.492278450938465, "grad_norm": 0.09380874698307973, "learning_rate": 3.6625814130958614e-06, "loss": 0.0833, "step": 20980 }, { "epoch": 2.4934663815633167, "grad_norm": 0.09927949213374476, "learning_rate": 3.645891358041911e-06, "loss": 0.0817, "step": 20990 }, { "epoch": 2.4946543121881684, "grad_norm": 0.0973711321083331, "learning_rate": 3.629236425921917e-06, "loss": 0.0829, "step": 21000 }, { "epoch": 2.4958422428130196, "grad_norm": 0.09505284332984275, "learning_rate": 3.6126166441296767e-06, "loss": 0.0823, "step": 21010 }, { "epoch": 2.4970301734378713, "grad_norm": 0.10022206944283212, "learning_rate": 3.596032040001168e-06, "loss": 0.083, "step": 21020 }, { "epoch": 2.4982181040627225, "grad_norm": 0.0962498589126744, "learning_rate": 3.579482640814513e-06, "loss": 0.0838, "step": 21030 }, { "epoch": 2.499406034687574, "grad_norm": 0.09980586254336456, "learning_rate": 3.5629684737899238e-06, "loss": 0.0818, "step": 21040 }, { "epoch": 2.500593965312426, "grad_norm": 0.10078417345344469, "learning_rate": 3.546489566089678e-06, "loss": 0.0839, "step": 21050 }, { "epoch": 2.5017818959372775, "grad_norm": 0.1013112696512672, "learning_rate": 3.530045944818039e-06, "loss": 0.0826, "step": 21060 }, { "epoch": 2.5029698265621287, "grad_norm": 0.09918416878988946, "learning_rate": 3.513637637021244e-06, "loss": 0.084, "step": 21070 }, { "epoch": 2.5041577571869804, "grad_norm": 0.09883006097562448, "learning_rate": 3.4972646696874473e-06, "loss": 0.0824, "step": 21080 }, { "epoch": 2.5053456878118316, "grad_norm": 0.10138389360577794, "learning_rate": 3.480927069746659e-06, "loss": 0.0807, "step": 21090 }, { "epoch": 2.5065336184366833, "grad_norm": 0.09813387256325283, "learning_rate": 3.4646248640707508e-06, "loss": 0.0883, "step": 21100 }, { "epoch": 2.507721549061535, "grad_norm": 0.09350422302116387, "learning_rate": 3.448358079473346e-06, "loss": 0.0833, "step": 21110 }, { "epoch": 2.5089094796863862, "grad_norm": 0.0972025971110306, "learning_rate": 3.4321267427098263e-06, "loss": 0.083, "step": 21120 }, { "epoch": 2.510097410311238, "grad_norm": 0.0981793490811542, "learning_rate": 3.415930880477258e-06, "loss": 0.0827, "step": 21130 }, { "epoch": 2.511285340936089, "grad_norm": 0.10049017677184374, "learning_rate": 3.399770519414364e-06, "loss": 0.0845, "step": 21140 }, { "epoch": 2.512473271560941, "grad_norm": 0.09724234837281237, "learning_rate": 3.383645686101483e-06, "loss": 0.0848, "step": 21150 }, { "epoch": 2.5136612021857925, "grad_norm": 0.09975572481590793, "learning_rate": 3.367556407060507e-06, "loss": 0.0835, "step": 21160 }, { "epoch": 2.5148491328106437, "grad_norm": 0.09725097705970917, "learning_rate": 3.3515027087548555e-06, "loss": 0.0808, "step": 21170 }, { "epoch": 2.5160370634354954, "grad_norm": 0.1022842610179056, "learning_rate": 3.335484617589421e-06, "loss": 0.0816, "step": 21180 }, { "epoch": 2.5172249940603466, "grad_norm": 0.09934490646250788, "learning_rate": 3.3195021599105312e-06, "loss": 0.0815, "step": 21190 }, { "epoch": 2.5184129246851983, "grad_norm": 0.10027681172263776, "learning_rate": 3.3035553620059086e-06, "loss": 0.084, "step": 21200 }, { "epoch": 2.51960085531005, "grad_norm": 0.10071425915112929, "learning_rate": 3.2876442501046156e-06, "loss": 0.0846, "step": 21210 }, { "epoch": 2.5207887859349016, "grad_norm": 0.10258318752027061, "learning_rate": 3.2717688503770216e-06, "loss": 0.0849, "step": 21220 }, { "epoch": 2.521976716559753, "grad_norm": 0.09457283645468388, "learning_rate": 3.255929188934759e-06, "loss": 0.0826, "step": 21230 }, { "epoch": 2.5231646471846045, "grad_norm": 0.09677119002923974, "learning_rate": 3.2401252918306815e-06, "loss": 0.0823, "step": 21240 }, { "epoch": 2.5243525778094558, "grad_norm": 0.0992749469298588, "learning_rate": 3.22435718505881e-06, "loss": 0.085, "step": 21250 }, { "epoch": 2.5255405084343074, "grad_norm": 0.09772825197732296, "learning_rate": 3.2086248945543072e-06, "loss": 0.0806, "step": 21260 }, { "epoch": 2.526728439059159, "grad_norm": 0.09801211653349576, "learning_rate": 3.1929284461934132e-06, "loss": 0.0832, "step": 21270 }, { "epoch": 2.5279163696840103, "grad_norm": 0.09754289542005998, "learning_rate": 3.1772678657934237e-06, "loss": 0.0861, "step": 21280 }, { "epoch": 2.529104300308862, "grad_norm": 0.09669096741423627, "learning_rate": 3.1616431791126473e-06, "loss": 0.0829, "step": 21290 }, { "epoch": 2.5302922309337132, "grad_norm": 0.10154860190734044, "learning_rate": 3.1460544118503382e-06, "loss": 0.0845, "step": 21300 }, { "epoch": 2.531480161558565, "grad_norm": 0.10229582957022705, "learning_rate": 3.130501589646684e-06, "loss": 0.0826, "step": 21310 }, { "epoch": 2.5326680921834166, "grad_norm": 0.09731915435561073, "learning_rate": 3.114984738082738e-06, "loss": 0.0803, "step": 21320 }, { "epoch": 2.5338560228082683, "grad_norm": 0.09908274607625009, "learning_rate": 3.0995038826804047e-06, "loss": 0.0822, "step": 21330 }, { "epoch": 2.5350439534331195, "grad_norm": 0.09899302015390174, "learning_rate": 3.0840590489023687e-06, "loss": 0.0828, "step": 21340 }, { "epoch": 2.536231884057971, "grad_norm": 0.09627628597879721, "learning_rate": 3.0686502621520756e-06, "loss": 0.0828, "step": 21350 }, { "epoch": 2.5374198146828224, "grad_norm": 0.09650837207744188, "learning_rate": 3.0532775477736808e-06, "loss": 0.0838, "step": 21360 }, { "epoch": 2.538607745307674, "grad_norm": 0.09871845770622241, "learning_rate": 3.037940931051997e-06, "loss": 0.0868, "step": 21370 }, { "epoch": 2.5397956759325258, "grad_norm": 0.09934893997662077, "learning_rate": 3.0226404372124817e-06, "loss": 0.0819, "step": 21380 }, { "epoch": 2.540983606557377, "grad_norm": 0.09759119681221214, "learning_rate": 3.0073760914211714e-06, "loss": 0.0852, "step": 21390 }, { "epoch": 2.5421715371822287, "grad_norm": 0.10026136249232451, "learning_rate": 2.992147918784638e-06, "loss": 0.0816, "step": 21400 }, { "epoch": 2.54335946780708, "grad_norm": 0.0980970583476325, "learning_rate": 2.9769559443499655e-06, "loss": 0.0818, "step": 21410 }, { "epoch": 2.5445473984319316, "grad_norm": 0.09884363857018365, "learning_rate": 2.961800193104694e-06, "loss": 0.0829, "step": 21420 }, { "epoch": 2.5457353290567832, "grad_norm": 0.10201946570951494, "learning_rate": 2.946680689976794e-06, "loss": 0.087, "step": 21430 }, { "epoch": 2.5469232596816345, "grad_norm": 0.09557829619623755, "learning_rate": 2.931597459834609e-06, "loss": 0.0837, "step": 21440 }, { "epoch": 2.548111190306486, "grad_norm": 0.09933142987250666, "learning_rate": 2.9165505274868165e-06, "loss": 0.0837, "step": 21450 }, { "epoch": 2.5492991209313374, "grad_norm": 0.1012588139760825, "learning_rate": 2.901539917682397e-06, "loss": 0.0835, "step": 21460 }, { "epoch": 2.550487051556189, "grad_norm": 0.09556908871601993, "learning_rate": 2.8865656551105844e-06, "loss": 0.082, "step": 21470 }, { "epoch": 2.5516749821810407, "grad_norm": 0.10102613385483174, "learning_rate": 2.8716277644008445e-06, "loss": 0.0835, "step": 21480 }, { "epoch": 2.5528629128058924, "grad_norm": 0.09412970464951861, "learning_rate": 2.8567262701228e-06, "loss": 0.0811, "step": 21490 }, { "epoch": 2.5540508434307436, "grad_norm": 0.09517006363476718, "learning_rate": 2.8418611967862165e-06, "loss": 0.0804, "step": 21500 }, { "epoch": 2.5552387740555953, "grad_norm": 0.09780141055644505, "learning_rate": 2.827032568840954e-06, "loss": 0.0815, "step": 21510 }, { "epoch": 2.5564267046804465, "grad_norm": 0.09799429120918036, "learning_rate": 2.8122404106769307e-06, "loss": 0.0816, "step": 21520 }, { "epoch": 2.557614635305298, "grad_norm": 0.10080297930984551, "learning_rate": 2.7974847466240768e-06, "loss": 0.0811, "step": 21530 }, { "epoch": 2.55880256593015, "grad_norm": 0.0937966652365748, "learning_rate": 2.7827656009523e-06, "loss": 0.0813, "step": 21540 }, { "epoch": 2.559990496555001, "grad_norm": 0.09702856232294414, "learning_rate": 2.7680829978714405e-06, "loss": 0.0811, "step": 21550 }, { "epoch": 2.561178427179853, "grad_norm": 0.09388755981403524, "learning_rate": 2.7534369615312306e-06, "loss": 0.0826, "step": 21560 }, { "epoch": 2.562366357804704, "grad_norm": 0.09792034766633306, "learning_rate": 2.7388275160212696e-06, "loss": 0.0789, "step": 21570 }, { "epoch": 2.5635542884295557, "grad_norm": 0.09890715551619719, "learning_rate": 2.7242546853709638e-06, "loss": 0.0811, "step": 21580 }, { "epoch": 2.5647422190544074, "grad_norm": 0.10181426791780414, "learning_rate": 2.709718493549496e-06, "loss": 0.0811, "step": 21590 }, { "epoch": 2.5659301496792586, "grad_norm": 0.0970009982907639, "learning_rate": 2.69521896446579e-06, "loss": 0.0816, "step": 21600 }, { "epoch": 2.5671180803041103, "grad_norm": 0.10200329972405696, "learning_rate": 2.6807561219684557e-06, "loss": 0.0826, "step": 21610 }, { "epoch": 2.5683060109289615, "grad_norm": 0.09763321596005048, "learning_rate": 2.666329989845784e-06, "loss": 0.082, "step": 21620 }, { "epoch": 2.569493941553813, "grad_norm": 0.0925161562939458, "learning_rate": 2.6519405918256635e-06, "loss": 0.0826, "step": 21630 }, { "epoch": 2.570681872178665, "grad_norm": 0.09917707985443674, "learning_rate": 2.6375879515755737e-06, "loss": 0.0867, "step": 21640 }, { "epoch": 2.5718698028035165, "grad_norm": 0.09852309237188679, "learning_rate": 2.6232720927025362e-06, "loss": 0.0868, "step": 21650 }, { "epoch": 2.5730577334283677, "grad_norm": 0.09783766520937687, "learning_rate": 2.608993038753063e-06, "loss": 0.0842, "step": 21660 }, { "epoch": 2.5742456640532194, "grad_norm": 0.0985978348501161, "learning_rate": 2.594750813213148e-06, "loss": 0.0826, "step": 21670 }, { "epoch": 2.5754335946780706, "grad_norm": 0.09790588396326819, "learning_rate": 2.5805454395081947e-06, "loss": 0.0829, "step": 21680 }, { "epoch": 2.5766215253029223, "grad_norm": 0.09737621733157388, "learning_rate": 2.5663769410030042e-06, "loss": 0.0824, "step": 21690 }, { "epoch": 2.577809455927774, "grad_norm": 0.09844774538287004, "learning_rate": 2.5522453410017135e-06, "loss": 0.0825, "step": 21700 }, { "epoch": 2.5789973865526252, "grad_norm": 0.09598438721180629, "learning_rate": 2.5381506627477874e-06, "loss": 0.082, "step": 21710 }, { "epoch": 2.580185317177477, "grad_norm": 0.09543807332633224, "learning_rate": 2.524092929423946e-06, "loss": 0.0795, "step": 21720 }, { "epoch": 2.581373247802328, "grad_norm": 0.09928452751697031, "learning_rate": 2.5100721641521523e-06, "loss": 0.0833, "step": 21730 }, { "epoch": 2.58256117842718, "grad_norm": 0.10165865240805044, "learning_rate": 2.496088389993559e-06, "loss": 0.0825, "step": 21740 }, { "epoch": 2.5837491090520315, "grad_norm": 0.09914058043321018, "learning_rate": 2.482141629948473e-06, "loss": 0.0833, "step": 21750 }, { "epoch": 2.5849370396768827, "grad_norm": 0.09597969753641722, "learning_rate": 2.4682319069563394e-06, "loss": 0.0824, "step": 21760 }, { "epoch": 2.5861249703017344, "grad_norm": 0.09718518920424304, "learning_rate": 2.454359243895665e-06, "loss": 0.0843, "step": 21770 }, { "epoch": 2.5873129009265856, "grad_norm": 0.09215844317888065, "learning_rate": 2.44052366358401e-06, "loss": 0.0817, "step": 21780 }, { "epoch": 2.5885008315514373, "grad_norm": 0.0948976820416827, "learning_rate": 2.4267251887779395e-06, "loss": 0.0842, "step": 21790 }, { "epoch": 2.589688762176289, "grad_norm": 0.09612646215635681, "learning_rate": 2.4129638421729857e-06, "loss": 0.0814, "step": 21800 }, { "epoch": 2.5908766928011406, "grad_norm": 0.10126570735761727, "learning_rate": 2.3992396464036225e-06, "loss": 0.0808, "step": 21810 }, { "epoch": 2.592064623425992, "grad_norm": 0.10055079248792194, "learning_rate": 2.385552624043205e-06, "loss": 0.0818, "step": 21820 }, { "epoch": 2.5932525540508435, "grad_norm": 0.09813713298154851, "learning_rate": 2.371902797603964e-06, "loss": 0.0824, "step": 21830 }, { "epoch": 2.5944404846756948, "grad_norm": 0.09570463024335667, "learning_rate": 2.3582901895369213e-06, "loss": 0.0831, "step": 21840 }, { "epoch": 2.5956284153005464, "grad_norm": 0.10158874237174487, "learning_rate": 2.3447148222319138e-06, "loss": 0.0829, "step": 21850 }, { "epoch": 2.596816345925398, "grad_norm": 0.09739845874362503, "learning_rate": 2.331176718017511e-06, "loss": 0.0822, "step": 21860 }, { "epoch": 2.5980042765502493, "grad_norm": 0.10465968682704715, "learning_rate": 2.3176758991609928e-06, "loss": 0.0842, "step": 21870 }, { "epoch": 2.599192207175101, "grad_norm": 0.0960078939477718, "learning_rate": 2.3042123878683126e-06, "loss": 0.0851, "step": 21880 }, { "epoch": 2.6003801377999523, "grad_norm": 0.10246579350735688, "learning_rate": 2.2907862062840596e-06, "loss": 0.0821, "step": 21890 }, { "epoch": 2.601568068424804, "grad_norm": 0.09886982395668978, "learning_rate": 2.2773973764914356e-06, "loss": 0.0791, "step": 21900 }, { "epoch": 2.6027559990496556, "grad_norm": 0.09839585356338668, "learning_rate": 2.2640459205121894e-06, "loss": 0.0828, "step": 21910 }, { "epoch": 2.6039439296745073, "grad_norm": 0.14269151988429826, "learning_rate": 2.2507318603066097e-06, "loss": 0.0815, "step": 21920 }, { "epoch": 2.6051318602993585, "grad_norm": 0.09557168283081569, "learning_rate": 2.237455217773468e-06, "loss": 0.0792, "step": 21930 }, { "epoch": 2.60631979092421, "grad_norm": 0.09983577749508181, "learning_rate": 2.2242160147499995e-06, "loss": 0.0824, "step": 21940 }, { "epoch": 2.6075077215490614, "grad_norm": 0.09501497174032927, "learning_rate": 2.2110142730118607e-06, "loss": 0.0815, "step": 21950 }, { "epoch": 2.608695652173913, "grad_norm": 0.09615944550740832, "learning_rate": 2.1978500142730867e-06, "loss": 0.0836, "step": 21960 }, { "epoch": 2.6098835827987648, "grad_norm": 0.10013479741278296, "learning_rate": 2.184723260186061e-06, "loss": 0.0817, "step": 21970 }, { "epoch": 2.611071513423616, "grad_norm": 0.09891117905738456, "learning_rate": 2.1716340323414825e-06, "loss": 0.0845, "step": 21980 }, { "epoch": 2.6122594440484677, "grad_norm": 0.10127722045063019, "learning_rate": 2.1585823522683264e-06, "loss": 0.082, "step": 21990 }, { "epoch": 2.613447374673319, "grad_norm": 0.09789871281332463, "learning_rate": 2.1455682414338123e-06, "loss": 0.0802, "step": 22000 }, { "epoch": 2.6146353052981706, "grad_norm": 0.09315163751951823, "learning_rate": 2.13259172124336e-06, "loss": 0.081, "step": 22010 }, { "epoch": 2.6158232359230222, "grad_norm": 0.1041520499985791, "learning_rate": 2.119652813040571e-06, "loss": 0.082, "step": 22020 }, { "epoch": 2.6170111665478735, "grad_norm": 0.09814316673411277, "learning_rate": 2.1067515381071713e-06, "loss": 0.0826, "step": 22030 }, { "epoch": 2.618199097172725, "grad_norm": 0.09971280811141664, "learning_rate": 2.093887917663001e-06, "loss": 0.0831, "step": 22040 }, { "epoch": 2.6193870277975764, "grad_norm": 0.09915085715966777, "learning_rate": 2.0810619728659552e-06, "loss": 0.0824, "step": 22050 }, { "epoch": 2.620574958422428, "grad_norm": 0.09734860808453234, "learning_rate": 2.068273724811967e-06, "loss": 0.0833, "step": 22060 }, { "epoch": 2.6217628890472797, "grad_norm": 0.09662032012695579, "learning_rate": 2.0555231945349667e-06, "loss": 0.082, "step": 22070 }, { "epoch": 2.6229508196721314, "grad_norm": 0.09437569290598163, "learning_rate": 2.0428104030068367e-06, "loss": 0.0822, "step": 22080 }, { "epoch": 2.6241387502969826, "grad_norm": 0.10123779879383578, "learning_rate": 2.0301353711374065e-06, "loss": 0.0819, "step": 22090 }, { "epoch": 2.6253266809218343, "grad_norm": 0.09968473046854572, "learning_rate": 2.017498119774383e-06, "loss": 0.0839, "step": 22100 }, { "epoch": 2.6265146115466855, "grad_norm": 0.10018938451498186, "learning_rate": 2.0048986697033366e-06, "loss": 0.0836, "step": 22110 }, { "epoch": 2.627702542171537, "grad_norm": 0.09703591737027158, "learning_rate": 1.9923370416476655e-06, "loss": 0.0844, "step": 22120 }, { "epoch": 2.628890472796389, "grad_norm": 0.095250481023444, "learning_rate": 1.979813256268551e-06, "loss": 0.082, "step": 22130 }, { "epoch": 2.63007840342124, "grad_norm": 0.09701206313295553, "learning_rate": 1.967327334164951e-06, "loss": 0.0845, "step": 22140 }, { "epoch": 2.631266334046092, "grad_norm": 0.09874986255722035, "learning_rate": 1.9548792958735185e-06, "loss": 0.0845, "step": 22150 }, { "epoch": 2.632454264670943, "grad_norm": 0.09891877985652502, "learning_rate": 1.9424691618686152e-06, "loss": 0.0821, "step": 22160 }, { "epoch": 2.6336421952957947, "grad_norm": 0.09705195680165957, "learning_rate": 1.9300969525622464e-06, "loss": 0.0805, "step": 22170 }, { "epoch": 2.6348301259206464, "grad_norm": 0.09931685256203783, "learning_rate": 1.917762688304059e-06, "loss": 0.0836, "step": 22180 }, { "epoch": 2.6360180565454976, "grad_norm": 0.09404462086604085, "learning_rate": 1.9054663893812635e-06, "loss": 0.0825, "step": 22190 }, { "epoch": 2.6372059871703493, "grad_norm": 0.09827789801791523, "learning_rate": 1.893208076018646e-06, "loss": 0.0818, "step": 22200 }, { "epoch": 2.6383939177952005, "grad_norm": 0.09638963927405252, "learning_rate": 1.8809877683784977e-06, "loss": 0.0813, "step": 22210 }, { "epoch": 2.639581848420052, "grad_norm": 0.098278167890346, "learning_rate": 1.8688054865606098e-06, "loss": 0.0852, "step": 22220 }, { "epoch": 2.640769779044904, "grad_norm": 0.09331589451993229, "learning_rate": 1.856661250602229e-06, "loss": 0.0808, "step": 22230 }, { "epoch": 2.6419577096697555, "grad_norm": 0.0984969449272301, "learning_rate": 1.8445550804780192e-06, "loss": 0.0824, "step": 22240 }, { "epoch": 2.6431456402946067, "grad_norm": 0.10110262716379108, "learning_rate": 1.8324869961000385e-06, "loss": 0.0838, "step": 22250 }, { "epoch": 2.6443335709194584, "grad_norm": 0.10025872838235429, "learning_rate": 1.8204570173176977e-06, "loss": 0.0825, "step": 22260 }, { "epoch": 2.6455215015443097, "grad_norm": 0.09508207520215241, "learning_rate": 1.808465163917733e-06, "loss": 0.0832, "step": 22270 }, { "epoch": 2.6467094321691613, "grad_norm": 0.09767869854901429, "learning_rate": 1.796511455624178e-06, "loss": 0.081, "step": 22280 }, { "epoch": 2.647897362794013, "grad_norm": 0.09598340627838889, "learning_rate": 1.7845959120983247e-06, "loss": 0.0817, "step": 22290 }, { "epoch": 2.6490852934188642, "grad_norm": 0.09985534309947645, "learning_rate": 1.7727185529386848e-06, "loss": 0.0809, "step": 22300 }, { "epoch": 2.650273224043716, "grad_norm": 0.09473960864064537, "learning_rate": 1.76087939768097e-06, "loss": 0.0816, "step": 22310 }, { "epoch": 2.651461154668567, "grad_norm": 0.0966016538412682, "learning_rate": 1.7490784657980541e-06, "loss": 0.0819, "step": 22320 }, { "epoch": 2.652649085293419, "grad_norm": 0.09556423367136488, "learning_rate": 1.7373157766999437e-06, "loss": 0.0813, "step": 22330 }, { "epoch": 2.6538370159182705, "grad_norm": 0.09562712065540871, "learning_rate": 1.7255913497337435e-06, "loss": 0.0832, "step": 22340 }, { "epoch": 2.655024946543122, "grad_norm": 0.09922620613435351, "learning_rate": 1.713905204183619e-06, "loss": 0.0811, "step": 22350 }, { "epoch": 2.6562128771679734, "grad_norm": 0.10116207436174574, "learning_rate": 1.7022573592707759e-06, "loss": 0.084, "step": 22360 }, { "epoch": 2.657400807792825, "grad_norm": 0.09407948228416342, "learning_rate": 1.6906478341534303e-06, "loss": 0.0828, "step": 22370 }, { "epoch": 2.6585887384176763, "grad_norm": 0.09913990752008611, "learning_rate": 1.679076647926761e-06, "loss": 0.0788, "step": 22380 }, { "epoch": 2.659776669042528, "grad_norm": 0.09591600163484032, "learning_rate": 1.667543819622891e-06, "loss": 0.0838, "step": 22390 }, { "epoch": 2.6609645996673796, "grad_norm": 0.09782780338600039, "learning_rate": 1.656049368210852e-06, "loss": 0.0832, "step": 22400 }, { "epoch": 2.662152530292231, "grad_norm": 0.10115949437957791, "learning_rate": 1.6445933125965458e-06, "loss": 0.0827, "step": 22410 }, { "epoch": 2.6633404609170825, "grad_norm": 0.09792873936961863, "learning_rate": 1.6331756716227442e-06, "loss": 0.0825, "step": 22420 }, { "epoch": 2.6645283915419338, "grad_norm": 0.09425790112908036, "learning_rate": 1.6217964640690113e-06, "loss": 0.0817, "step": 22430 }, { "epoch": 2.6657163221667854, "grad_norm": 0.09451313764212345, "learning_rate": 1.6104557086517092e-06, "loss": 0.0796, "step": 22440 }, { "epoch": 2.666904252791637, "grad_norm": 0.10022792585239157, "learning_rate": 1.599153424023947e-06, "loss": 0.0821, "step": 22450 }, { "epoch": 2.6680921834164883, "grad_norm": 0.09521875497351336, "learning_rate": 1.5878896287755657e-06, "loss": 0.0824, "step": 22460 }, { "epoch": 2.66928011404134, "grad_norm": 0.09593139683810464, "learning_rate": 1.5766643414330956e-06, "loss": 0.0861, "step": 22470 }, { "epoch": 2.6704680446661913, "grad_norm": 0.09569622498148286, "learning_rate": 1.565477580459726e-06, "loss": 0.0831, "step": 22480 }, { "epoch": 2.671655975291043, "grad_norm": 0.09711059411964296, "learning_rate": 1.5543293642552831e-06, "loss": 0.081, "step": 22490 }, { "epoch": 2.6728439059158946, "grad_norm": 0.09648010416842487, "learning_rate": 1.5432197111561937e-06, "loss": 0.0835, "step": 22500 }, { "epoch": 2.6740318365407463, "grad_norm": 0.09837765115325198, "learning_rate": 1.5321486394354601e-06, "loss": 0.0817, "step": 22510 }, { "epoch": 2.6752197671655975, "grad_norm": 0.0951817203411527, "learning_rate": 1.521116167302622e-06, "loss": 0.0808, "step": 22520 }, { "epoch": 2.676407697790449, "grad_norm": 0.09519109378003303, "learning_rate": 1.5101223129037333e-06, "loss": 0.0808, "step": 22530 }, { "epoch": 2.6775956284153004, "grad_norm": 0.09804761886630001, "learning_rate": 1.499167094321327e-06, "loss": 0.0833, "step": 22540 }, { "epoch": 2.678783559040152, "grad_norm": 0.10038742670492523, "learning_rate": 1.4882505295743894e-06, "loss": 0.0811, "step": 22550 }, { "epoch": 2.6799714896650038, "grad_norm": 0.09951597107749217, "learning_rate": 1.4773726366183377e-06, "loss": 0.0803, "step": 22560 }, { "epoch": 2.681159420289855, "grad_norm": 0.09922892116356963, "learning_rate": 1.4665334333449714e-06, "loss": 0.0835, "step": 22570 }, { "epoch": 2.6823473509147067, "grad_norm": 0.09935767101407508, "learning_rate": 1.4557329375824574e-06, "loss": 0.081, "step": 22580 }, { "epoch": 2.683535281539558, "grad_norm": 0.09501062281683544, "learning_rate": 1.4449711670952966e-06, "loss": 0.0823, "step": 22590 }, { "epoch": 2.6847232121644096, "grad_norm": 0.09961989195361715, "learning_rate": 1.4342481395842933e-06, "loss": 0.0852, "step": 22600 }, { "epoch": 2.6859111427892612, "grad_norm": 0.098877930458093, "learning_rate": 1.423563872686537e-06, "loss": 0.0829, "step": 22610 }, { "epoch": 2.6870990734141125, "grad_norm": 0.10301493653750428, "learning_rate": 1.4129183839753535e-06, "loss": 0.0853, "step": 22620 }, { "epoch": 2.688287004038964, "grad_norm": 0.09916629941315273, "learning_rate": 1.4023116909602952e-06, "loss": 0.083, "step": 22630 }, { "epoch": 2.6894749346638154, "grad_norm": 0.09833061511451036, "learning_rate": 1.3917438110870984e-06, "loss": 0.0834, "step": 22640 }, { "epoch": 2.690662865288667, "grad_norm": 0.09662967917783653, "learning_rate": 1.3812147617376563e-06, "loss": 0.0819, "step": 22650 }, { "epoch": 2.6918507959135187, "grad_norm": 0.09792908783220589, "learning_rate": 1.370724560230005e-06, "loss": 0.0824, "step": 22660 }, { "epoch": 2.6930387265383704, "grad_norm": 0.09345310514730532, "learning_rate": 1.3602732238182786e-06, "loss": 0.0859, "step": 22670 }, { "epoch": 2.6942266571632216, "grad_norm": 0.09775196548071804, "learning_rate": 1.3498607696926818e-06, "loss": 0.0819, "step": 22680 }, { "epoch": 2.6954145877880733, "grad_norm": 0.10024996809067718, "learning_rate": 1.3394872149794736e-06, "loss": 0.0834, "step": 22690 }, { "epoch": 2.6966025184129245, "grad_norm": 0.10080542150203362, "learning_rate": 1.3291525767409303e-06, "loss": 0.0814, "step": 22700 }, { "epoch": 2.697790449037776, "grad_norm": 0.09614327745614372, "learning_rate": 1.3188568719753185e-06, "loss": 0.0807, "step": 22710 }, { "epoch": 2.698978379662628, "grad_norm": 0.1011749870119934, "learning_rate": 1.308600117616865e-06, "loss": 0.0826, "step": 22720 }, { "epoch": 2.700166310287479, "grad_norm": 0.09829147612179857, "learning_rate": 1.2983823305357329e-06, "loss": 0.0827, "step": 22730 }, { "epoch": 2.701354240912331, "grad_norm": 0.09895577783952303, "learning_rate": 1.2882035275379928e-06, "loss": 0.0827, "step": 22740 }, { "epoch": 2.702542171537182, "grad_norm": 0.1007259188860891, "learning_rate": 1.278063725365597e-06, "loss": 0.0817, "step": 22750 }, { "epoch": 2.7037301021620337, "grad_norm": 0.09474975654030852, "learning_rate": 1.2679629406963462e-06, "loss": 0.0817, "step": 22760 }, { "epoch": 2.7049180327868854, "grad_norm": 0.09905648260295093, "learning_rate": 1.2579011901438704e-06, "loss": 0.0804, "step": 22770 }, { "epoch": 2.7061059634117366, "grad_norm": 0.09998473495326414, "learning_rate": 1.2478784902575896e-06, "loss": 0.0836, "step": 22780 }, { "epoch": 2.7072938940365883, "grad_norm": 0.09645853574594415, "learning_rate": 1.2378948575227006e-06, "loss": 0.0798, "step": 22790 }, { "epoch": 2.7084818246614395, "grad_norm": 0.10090875585864276, "learning_rate": 1.2279503083601373e-06, "loss": 0.0831, "step": 22800 }, { "epoch": 2.709669755286291, "grad_norm": 0.09383572763482503, "learning_rate": 1.2180448591265548e-06, "loss": 0.0832, "step": 22810 }, { "epoch": 2.710857685911143, "grad_norm": 0.09857140487883734, "learning_rate": 1.2081785261142952e-06, "loss": 0.0812, "step": 22820 }, { "epoch": 2.7120456165359945, "grad_norm": 0.09999088021692984, "learning_rate": 1.1983513255513612e-06, "loss": 0.0823, "step": 22830 }, { "epoch": 2.7132335471608457, "grad_norm": 0.10165190621870976, "learning_rate": 1.1885632736013923e-06, "loss": 0.0825, "step": 22840 }, { "epoch": 2.7144214777856974, "grad_norm": 0.09874324816750697, "learning_rate": 1.1788143863636413e-06, "loss": 0.0826, "step": 22850 }, { "epoch": 2.7156094084105487, "grad_norm": 0.09833781873240172, "learning_rate": 1.1691046798729372e-06, "loss": 0.0799, "step": 22860 }, { "epoch": 2.7167973390354003, "grad_norm": 0.09622168977850035, "learning_rate": 1.159434170099663e-06, "loss": 0.0812, "step": 22870 }, { "epoch": 2.717985269660252, "grad_norm": 0.09849900078146939, "learning_rate": 1.1498028729497344e-06, "loss": 0.0834, "step": 22880 }, { "epoch": 2.7191732002851032, "grad_norm": 0.09752777757874835, "learning_rate": 1.1402108042645825e-06, "loss": 0.0794, "step": 22890 }, { "epoch": 2.720361130909955, "grad_norm": 0.09872589876882687, "learning_rate": 1.1306579798210953e-06, "loss": 0.0827, "step": 22900 }, { "epoch": 2.721549061534806, "grad_norm": 0.09867371183509518, "learning_rate": 1.121144415331629e-06, "loss": 0.0819, "step": 22910 }, { "epoch": 2.722736992159658, "grad_norm": 0.09897120223046395, "learning_rate": 1.1116701264439534e-06, "loss": 0.0813, "step": 22920 }, { "epoch": 2.7239249227845095, "grad_norm": 0.09434827731517001, "learning_rate": 1.1022351287412442e-06, "loss": 0.0821, "step": 22930 }, { "epoch": 2.725112853409361, "grad_norm": 0.09391035404316103, "learning_rate": 1.0928394377420576e-06, "loss": 0.0844, "step": 22940 }, { "epoch": 2.7263007840342124, "grad_norm": 0.09624223135429173, "learning_rate": 1.08348306890029e-06, "loss": 0.0793, "step": 22950 }, { "epoch": 2.727488714659064, "grad_norm": 0.09264255338310012, "learning_rate": 1.0741660376051593e-06, "loss": 0.0798, "step": 22960 }, { "epoch": 2.7286766452839153, "grad_norm": 0.09822166813016883, "learning_rate": 1.0648883591811903e-06, "loss": 0.0832, "step": 22970 }, { "epoch": 2.729864575908767, "grad_norm": 0.09959271510430978, "learning_rate": 1.0556500488881737e-06, "loss": 0.0771, "step": 22980 }, { "epoch": 2.7310525065336186, "grad_norm": 0.09675793413473777, "learning_rate": 1.046451121921152e-06, "loss": 0.0819, "step": 22990 }, { "epoch": 2.73224043715847, "grad_norm": 0.10029237880384449, "learning_rate": 1.0372915934103916e-06, "loss": 0.0827, "step": 23000 }, { "epoch": 2.7334283677833215, "grad_norm": 0.1022162694938258, "learning_rate": 1.0281714784213526e-06, "loss": 0.0831, "step": 23010 }, { "epoch": 2.7346162984081728, "grad_norm": 0.1001591007100596, "learning_rate": 1.0190907919546634e-06, "loss": 0.0812, "step": 23020 }, { "epoch": 2.7358042290330244, "grad_norm": 0.09776409512721462, "learning_rate": 1.010049548946121e-06, "loss": 0.0813, "step": 23030 }, { "epoch": 2.736992159657876, "grad_norm": 0.09803414797360438, "learning_rate": 1.0010477642666244e-06, "loss": 0.081, "step": 23040 }, { "epoch": 2.7381800902827274, "grad_norm": 0.09821627202403387, "learning_rate": 9.920854527221857e-07, "loss": 0.0814, "step": 23050 }, { "epoch": 2.739368020907579, "grad_norm": 0.09687977568090735, "learning_rate": 9.831626290538853e-07, "loss": 0.0828, "step": 23060 }, { "epoch": 2.7405559515324303, "grad_norm": 0.10122434332132936, "learning_rate": 9.742793079378505e-07, "loss": 0.0831, "step": 23070 }, { "epoch": 2.741743882157282, "grad_norm": 0.09936009875266502, "learning_rate": 9.654355039852514e-07, "loss": 0.0823, "step": 23080 }, { "epoch": 2.7429318127821336, "grad_norm": 0.09809779004401559, "learning_rate": 9.566312317422471e-07, "loss": 0.0829, "step": 23090 }, { "epoch": 2.7441197434069853, "grad_norm": 0.10252261404923702, "learning_rate": 9.478665056899789e-07, "loss": 0.0812, "step": 23100 }, { "epoch": 2.7453076740318365, "grad_norm": 0.09716680734694264, "learning_rate": 9.391413402445454e-07, "loss": 0.0826, "step": 23110 }, { "epoch": 2.746495604656688, "grad_norm": 0.09903138755615673, "learning_rate": 9.3045574975697e-07, "loss": 0.0834, "step": 23120 }, { "epoch": 2.7476835352815394, "grad_norm": 0.09823799424718613, "learning_rate": 9.218097485131915e-07, "loss": 0.0827, "step": 23130 }, { "epoch": 2.748871465906391, "grad_norm": 0.09923278625431843, "learning_rate": 9.132033507340293e-07, "loss": 0.0816, "step": 23140 }, { "epoch": 2.7500593965312428, "grad_norm": 0.10368830889188282, "learning_rate": 9.046365705751625e-07, "loss": 0.0813, "step": 23150 }, { "epoch": 2.751247327156094, "grad_norm": 0.10328758406738488, "learning_rate": 8.961094221271088e-07, "loss": 0.0828, "step": 23160 }, { "epoch": 2.7524352577809457, "grad_norm": 0.09800541109378266, "learning_rate": 8.876219194152046e-07, "loss": 0.084, "step": 23170 }, { "epoch": 2.753623188405797, "grad_norm": 0.09817545412788485, "learning_rate": 8.791740763995721e-07, "loss": 0.0818, "step": 23180 }, { "epoch": 2.7548111190306486, "grad_norm": 0.0932601415128728, "learning_rate": 8.707659069751018e-07, "loss": 0.0836, "step": 23190 }, { "epoch": 2.7559990496555002, "grad_norm": 0.09871705147320409, "learning_rate": 8.623974249714311e-07, "loss": 0.0806, "step": 23200 }, { "epoch": 2.7571869802803515, "grad_norm": 0.0978310151484371, "learning_rate": 8.540686441529166e-07, "loss": 0.0811, "step": 23210 }, { "epoch": 2.758374910905203, "grad_norm": 0.10726829470206542, "learning_rate": 8.457795782186278e-07, "loss": 0.0821, "step": 23220 }, { "epoch": 2.7595628415300544, "grad_norm": 0.09977372104985986, "learning_rate": 8.37530240802295e-07, "loss": 0.0809, "step": 23230 }, { "epoch": 2.760750772154906, "grad_norm": 0.09502931717676452, "learning_rate": 8.293206454723145e-07, "loss": 0.0796, "step": 23240 }, { "epoch": 2.7619387027797577, "grad_norm": 0.10167092483227941, "learning_rate": 8.211508057317074e-07, "loss": 0.0824, "step": 23250 }, { "epoch": 2.7631266334046094, "grad_norm": 0.09660866573258636, "learning_rate": 8.130207350181135e-07, "loss": 0.0836, "step": 23260 }, { "epoch": 2.7643145640294606, "grad_norm": 0.10158870863496154, "learning_rate": 8.049304467037583e-07, "loss": 0.0813, "step": 23270 }, { "epoch": 2.7655024946543123, "grad_norm": 0.10261820565211699, "learning_rate": 7.968799540954308e-07, "loss": 0.0815, "step": 23280 }, { "epoch": 2.7666904252791635, "grad_norm": 0.0963878185223938, "learning_rate": 7.888692704344669e-07, "loss": 0.0829, "step": 23290 }, { "epoch": 2.767878355904015, "grad_norm": 0.09697367516219227, "learning_rate": 7.80898408896727e-07, "loss": 0.0805, "step": 23300 }, { "epoch": 2.769066286528867, "grad_norm": 0.09366227163408783, "learning_rate": 7.729673825925654e-07, "loss": 0.0795, "step": 23310 }, { "epoch": 2.770254217153718, "grad_norm": 0.10143099565366419, "learning_rate": 7.650762045668281e-07, "loss": 0.0812, "step": 23320 }, { "epoch": 2.77144214777857, "grad_norm": 0.09427572140483254, "learning_rate": 7.572248877988075e-07, "loss": 0.0845, "step": 23330 }, { "epoch": 2.772630078403421, "grad_norm": 0.10398969197164676, "learning_rate": 7.494134452022406e-07, "loss": 0.0827, "step": 23340 }, { "epoch": 2.7738180090282727, "grad_norm": 0.09920216220816212, "learning_rate": 7.416418896252719e-07, "loss": 0.0836, "step": 23350 }, { "epoch": 2.7750059396531244, "grad_norm": 0.09554548254607402, "learning_rate": 7.339102338504516e-07, "loss": 0.0832, "step": 23360 }, { "epoch": 2.776193870277976, "grad_norm": 0.09697129492667891, "learning_rate": 7.262184905946962e-07, "loss": 0.0807, "step": 23370 }, { "epoch": 2.7773818009028273, "grad_norm": 0.0947899496795266, "learning_rate": 7.185666725092716e-07, "loss": 0.0838, "step": 23380 }, { "epoch": 2.778569731527679, "grad_norm": 0.10201408933901629, "learning_rate": 7.109547921797827e-07, "loss": 0.0844, "step": 23390 }, { "epoch": 2.77975766215253, "grad_norm": 0.09658820438339603, "learning_rate": 7.033828621261396e-07, "loss": 0.0857, "step": 23400 }, { "epoch": 2.780945592777382, "grad_norm": 0.09481714820404097, "learning_rate": 6.958508948025494e-07, "loss": 0.0839, "step": 23410 }, { "epoch": 2.7821335234022335, "grad_norm": 0.10103045819002777, "learning_rate": 6.883589025974801e-07, "loss": 0.0836, "step": 23420 }, { "epoch": 2.7833214540270848, "grad_norm": 0.09541600423496459, "learning_rate": 6.809068978336553e-07, "loss": 0.0818, "step": 23430 }, { "epoch": 2.7845093846519364, "grad_norm": 0.0969365328736408, "learning_rate": 6.734948927680257e-07, "loss": 0.0831, "step": 23440 }, { "epoch": 2.7856973152767877, "grad_norm": 0.10068270884435301, "learning_rate": 6.661228995917534e-07, "loss": 0.0814, "step": 23450 }, { "epoch": 2.7868852459016393, "grad_norm": 0.09578280380591477, "learning_rate": 6.587909304301893e-07, "loss": 0.083, "step": 23460 }, { "epoch": 2.788073176526491, "grad_norm": 0.09172025086257789, "learning_rate": 6.514989973428476e-07, "loss": 0.0813, "step": 23470 }, { "epoch": 2.7892611071513422, "grad_norm": 0.09791876982261628, "learning_rate": 6.442471123233984e-07, "loss": 0.0786, "step": 23480 }, { "epoch": 2.790449037776194, "grad_norm": 0.09682365882189653, "learning_rate": 6.370352872996338e-07, "loss": 0.081, "step": 23490 }, { "epoch": 2.791636968401045, "grad_norm": 0.1028795667499414, "learning_rate": 6.298635341334708e-07, "loss": 0.0844, "step": 23500 }, { "epoch": 2.792824899025897, "grad_norm": 0.1048640608199929, "learning_rate": 6.22731864620904e-07, "loss": 0.0828, "step": 23510 }, { "epoch": 2.7940128296507485, "grad_norm": 0.09629162883331369, "learning_rate": 6.15640290491995e-07, "loss": 0.0821, "step": 23520 }, { "epoch": 2.7952007602756, "grad_norm": 0.09469071844792042, "learning_rate": 6.085888234108716e-07, "loss": 0.0797, "step": 23530 }, { "epoch": 2.7963886909004514, "grad_norm": 0.09574123426346512, "learning_rate": 6.015774749756814e-07, "loss": 0.0791, "step": 23540 }, { "epoch": 2.797576621525303, "grad_norm": 0.09298421652829625, "learning_rate": 5.946062567185967e-07, "loss": 0.0827, "step": 23550 }, { "epoch": 2.7987645521501543, "grad_norm": 0.09659073030722025, "learning_rate": 5.876751801057706e-07, "loss": 0.0818, "step": 23560 }, { "epoch": 2.799952482775006, "grad_norm": 0.09474765650781287, "learning_rate": 5.807842565373451e-07, "loss": 0.0841, "step": 23570 }, { "epoch": 2.8011404133998576, "grad_norm": 0.09706287759613581, "learning_rate": 5.739334973474119e-07, "loss": 0.0808, "step": 23580 }, { "epoch": 2.802328344024709, "grad_norm": 0.09896089671040914, "learning_rate": 5.671229138039968e-07, "loss": 0.0822, "step": 23590 }, { "epoch": 2.8035162746495605, "grad_norm": 0.09953661448120742, "learning_rate": 5.603525171090584e-07, "loss": 0.081, "step": 23600 }, { "epoch": 2.8047042052744118, "grad_norm": 0.10197384757411805, "learning_rate": 5.536223183984446e-07, "loss": 0.0841, "step": 23610 }, { "epoch": 2.8058921358992635, "grad_norm": 0.09760732950536372, "learning_rate": 5.469323287418948e-07, "loss": 0.0828, "step": 23620 }, { "epoch": 2.807080066524115, "grad_norm": 0.0996811992864147, "learning_rate": 5.402825591430016e-07, "loss": 0.0815, "step": 23630 }, { "epoch": 2.8082679971489664, "grad_norm": 0.09958902095929931, "learning_rate": 5.33673020539216e-07, "loss": 0.0812, "step": 23640 }, { "epoch": 2.809455927773818, "grad_norm": 0.0970144285749179, "learning_rate": 5.271037238018139e-07, "loss": 0.08, "step": 23650 }, { "epoch": 2.8106438583986693, "grad_norm": 0.0942989090465548, "learning_rate": 5.205746797358773e-07, "loss": 0.0783, "step": 23660 }, { "epoch": 2.811831789023521, "grad_norm": 0.10331064355073083, "learning_rate": 5.140858990802882e-07, "loss": 0.0822, "step": 23670 }, { "epoch": 2.8130197196483726, "grad_norm": 0.09637487056391521, "learning_rate": 5.076373925076955e-07, "loss": 0.0835, "step": 23680 }, { "epoch": 2.8142076502732243, "grad_norm": 0.09806322365503041, "learning_rate": 5.012291706245204e-07, "loss": 0.081, "step": 23690 }, { "epoch": 2.8153955808980755, "grad_norm": 0.09693310973364277, "learning_rate": 4.948612439709066e-07, "loss": 0.0819, "step": 23700 }, { "epoch": 2.816583511522927, "grad_norm": 0.10446047845853525, "learning_rate": 4.885336230207344e-07, "loss": 0.082, "step": 23710 }, { "epoch": 2.8177714421477784, "grad_norm": 0.09864170613397646, "learning_rate": 4.822463181815812e-07, "loss": 0.0817, "step": 23720 }, { "epoch": 2.81895937277263, "grad_norm": 0.09737754471298454, "learning_rate": 4.7599933979471655e-07, "loss": 0.0833, "step": 23730 }, { "epoch": 2.8201473033974818, "grad_norm": 0.09897701558126082, "learning_rate": 4.697926981350853e-07, "loss": 0.0825, "step": 23740 }, { "epoch": 2.821335234022333, "grad_norm": 0.09981193810984444, "learning_rate": 4.636264034112797e-07, "loss": 0.0835, "step": 23750 }, { "epoch": 2.8225231646471847, "grad_norm": 0.0966703456218848, "learning_rate": 4.5750046576553684e-07, "loss": 0.0833, "step": 23760 }, { "epoch": 2.823711095272036, "grad_norm": 0.09577081600227534, "learning_rate": 4.5141489527370796e-07, "loss": 0.0831, "step": 23770 }, { "epoch": 2.8248990258968876, "grad_norm": 0.09927757942836057, "learning_rate": 4.4536970194525573e-07, "loss": 0.079, "step": 23780 }, { "epoch": 2.8260869565217392, "grad_norm": 0.09953037888961043, "learning_rate": 4.393648957232266e-07, "loss": 0.0828, "step": 23790 }, { "epoch": 2.8272748871465905, "grad_norm": 0.09812102638080136, "learning_rate": 4.334004864842395e-07, "loss": 0.0805, "step": 23800 }, { "epoch": 2.828462817771442, "grad_norm": 0.09749847636492717, "learning_rate": 4.274764840384721e-07, "loss": 0.0823, "step": 23810 }, { "epoch": 2.8296507483962934, "grad_norm": 0.09818109329325155, "learning_rate": 4.215928981296358e-07, "loss": 0.0826, "step": 23820 }, { "epoch": 2.830838679021145, "grad_norm": 0.10161964018841209, "learning_rate": 4.157497384349729e-07, "loss": 0.0844, "step": 23830 }, { "epoch": 2.8320266096459967, "grad_norm": 0.09608166330970355, "learning_rate": 4.09947014565229e-07, "loss": 0.0818, "step": 23840 }, { "epoch": 2.8332145402708484, "grad_norm": 0.0953071698114558, "learning_rate": 4.041847360646389e-07, "loss": 0.08, "step": 23850 }, { "epoch": 2.8344024708956996, "grad_norm": 0.10324478416395637, "learning_rate": 3.984629124109185e-07, "loss": 0.0821, "step": 23860 }, { "epoch": 2.8355904015205513, "grad_norm": 0.09951679239648377, "learning_rate": 3.927815530152396e-07, "loss": 0.0827, "step": 23870 }, { "epoch": 2.8367783321454025, "grad_norm": 0.10254679141248157, "learning_rate": 3.8714066722222444e-07, "loss": 0.0821, "step": 23880 }, { "epoch": 2.837966262770254, "grad_norm": 0.09943009422990827, "learning_rate": 3.8154026430992085e-07, "loss": 0.0816, "step": 23890 }, { "epoch": 2.839154193395106, "grad_norm": 0.0945966599474562, "learning_rate": 3.75980353489791e-07, "loss": 0.0819, "step": 23900 }, { "epoch": 2.840342124019957, "grad_norm": 0.09679499089357767, "learning_rate": 3.704609439066975e-07, "loss": 0.0816, "step": 23910 }, { "epoch": 2.841530054644809, "grad_norm": 0.09936930228804187, "learning_rate": 3.6498204463888406e-07, "loss": 0.0789, "step": 23920 }, { "epoch": 2.84271798526966, "grad_norm": 0.09814010456641371, "learning_rate": 3.5954366469797e-07, "loss": 0.082, "step": 23930 }, { "epoch": 2.8439059158945117, "grad_norm": 0.09523461031114874, "learning_rate": 3.541458130289277e-07, "loss": 0.0807, "step": 23940 }, { "epoch": 2.8450938465193634, "grad_norm": 0.09955475946802822, "learning_rate": 3.4878849851005814e-07, "loss": 0.082, "step": 23950 }, { "epoch": 2.846281777144215, "grad_norm": 0.0978583124694583, "learning_rate": 3.434717299530016e-07, "loss": 0.0822, "step": 23960 }, { "epoch": 2.8474697077690663, "grad_norm": 0.09641123549074869, "learning_rate": 3.381955161027017e-07, "loss": 0.0802, "step": 23970 }, { "epoch": 2.848657638393918, "grad_norm": 0.10002532330140354, "learning_rate": 3.329598656374028e-07, "loss": 0.0816, "step": 23980 }, { "epoch": 2.849845569018769, "grad_norm": 0.0981760692574214, "learning_rate": 3.277647871686246e-07, "loss": 0.0812, "step": 23990 }, { "epoch": 2.851033499643621, "grad_norm": 0.09842742303107185, "learning_rate": 3.2261028924115975e-07, "loss": 0.0825, "step": 24000 }, { "epoch": 2.8522214302684725, "grad_norm": 0.09795498990984108, "learning_rate": 3.174963803330544e-07, "loss": 0.084, "step": 24010 }, { "epoch": 2.8534093608933238, "grad_norm": 0.09813834792218952, "learning_rate": 3.1242306885558846e-07, "loss": 0.084, "step": 24020 }, { "epoch": 2.8545972915181754, "grad_norm": 0.10479873083630942, "learning_rate": 3.0739036315327876e-07, "loss": 0.0826, "step": 24030 }, { "epoch": 2.8557852221430267, "grad_norm": 0.09849680150375095, "learning_rate": 3.0239827150384536e-07, "loss": 0.0838, "step": 24040 }, { "epoch": 2.8569731527678783, "grad_norm": 0.0986616545972339, "learning_rate": 2.9744680211820916e-07, "loss": 0.0791, "step": 24050 }, { "epoch": 2.85816108339273, "grad_norm": 0.09745615257304263, "learning_rate": 2.925359631404723e-07, "loss": 0.0843, "step": 24060 }, { "epoch": 2.8593490140175812, "grad_norm": 0.09940427245144731, "learning_rate": 2.8766576264792066e-07, "loss": 0.0841, "step": 24070 }, { "epoch": 2.860536944642433, "grad_norm": 0.09581839872368444, "learning_rate": 2.8283620865098836e-07, "loss": 0.0823, "step": 24080 }, { "epoch": 2.861724875267284, "grad_norm": 0.10213556514447397, "learning_rate": 2.780473090932545e-07, "loss": 0.0818, "step": 24090 }, { "epoch": 2.862912805892136, "grad_norm": 0.0979323593121551, "learning_rate": 2.73299071851435e-07, "loss": 0.0803, "step": 24100 }, { "epoch": 2.8641007365169875, "grad_norm": 0.09506873290450223, "learning_rate": 2.6859150473536044e-07, "loss": 0.0823, "step": 24110 }, { "epoch": 2.865288667141839, "grad_norm": 0.099569938035543, "learning_rate": 2.639246154879732e-07, "loss": 0.0789, "step": 24120 }, { "epoch": 2.8664765977666904, "grad_norm": 0.09978836354230032, "learning_rate": 2.592984117853053e-07, "loss": 0.0794, "step": 24130 }, { "epoch": 2.867664528391542, "grad_norm": 0.10057761145678726, "learning_rate": 2.547129012364702e-07, "loss": 0.0806, "step": 24140 }, { "epoch": 2.8688524590163933, "grad_norm": 0.09626451722482361, "learning_rate": 2.501680913836485e-07, "loss": 0.0785, "step": 24150 }, { "epoch": 2.870040389641245, "grad_norm": 0.11582788350543531, "learning_rate": 2.456639897020829e-07, "loss": 0.0807, "step": 24160 }, { "epoch": 2.8712283202660966, "grad_norm": 0.0965535691352476, "learning_rate": 2.4120060360005305e-07, "loss": 0.0797, "step": 24170 }, { "epoch": 2.872416250890948, "grad_norm": 0.09524470647140695, "learning_rate": 2.3677794041887523e-07, "loss": 0.0803, "step": 24180 }, { "epoch": 2.8736041815157995, "grad_norm": 0.10124788163872935, "learning_rate": 2.3239600743287792e-07, "loss": 0.0834, "step": 24190 }, { "epoch": 2.874792112140651, "grad_norm": 0.09570944942631229, "learning_rate": 2.280548118494069e-07, "loss": 0.0828, "step": 24200 }, { "epoch": 2.8759800427655025, "grad_norm": 0.09866051475304073, "learning_rate": 2.2375436080879774e-07, "loss": 0.0842, "step": 24210 }, { "epoch": 2.877167973390354, "grad_norm": 0.09927275547171249, "learning_rate": 2.1949466138437292e-07, "loss": 0.0824, "step": 24220 }, { "epoch": 2.8783559040152054, "grad_norm": 0.10222381441691444, "learning_rate": 2.1527572058241963e-07, "loss": 0.0806, "step": 24230 }, { "epoch": 2.879543834640057, "grad_norm": 0.09621366987871849, "learning_rate": 2.1109754534219815e-07, "loss": 0.0812, "step": 24240 }, { "epoch": 2.8807317652649083, "grad_norm": 0.09879471397219769, "learning_rate": 2.0696014253590857e-07, "loss": 0.0832, "step": 24250 }, { "epoch": 2.88191969588976, "grad_norm": 0.09847037987597558, "learning_rate": 2.028635189686906e-07, "loss": 0.081, "step": 24260 }, { "epoch": 2.8831076265146116, "grad_norm": 0.10525327321155575, "learning_rate": 1.988076813786155e-07, "loss": 0.0821, "step": 24270 }, { "epoch": 2.8842955571394633, "grad_norm": 0.09724358515140893, "learning_rate": 1.9479263643666644e-07, "loss": 0.0843, "step": 24280 }, { "epoch": 2.8854834877643145, "grad_norm": 0.10039742658968419, "learning_rate": 1.908183907467276e-07, "loss": 0.0836, "step": 24290 }, { "epoch": 2.886671418389166, "grad_norm": 0.09638174831004093, "learning_rate": 1.8688495084558944e-07, "loss": 0.0818, "step": 24300 }, { "epoch": 2.8878593490140174, "grad_norm": 0.09907730264537767, "learning_rate": 1.8299232320291292e-07, "loss": 0.0826, "step": 24310 }, { "epoch": 2.889047279638869, "grad_norm": 0.09741229053066516, "learning_rate": 1.7914051422124044e-07, "loss": 0.0833, "step": 24320 }, { "epoch": 2.8902352102637208, "grad_norm": 0.09478729954642712, "learning_rate": 1.753295302359681e-07, "loss": 0.0828, "step": 24330 }, { "epoch": 2.891423140888572, "grad_norm": 0.09655311946995504, "learning_rate": 1.7155937751534845e-07, "loss": 0.0826, "step": 24340 }, { "epoch": 2.8926110715134237, "grad_norm": 0.09879558946094806, "learning_rate": 1.6783006226047671e-07, "loss": 0.0797, "step": 24350 }, { "epoch": 2.893799002138275, "grad_norm": 0.09485288244192205, "learning_rate": 1.6414159060527956e-07, "loss": 0.0835, "step": 24360 }, { "epoch": 2.8949869327631266, "grad_norm": 0.09605818288069813, "learning_rate": 1.6049396861650133e-07, "loss": 0.0811, "step": 24370 }, { "epoch": 2.8961748633879782, "grad_norm": 0.0976286292932288, "learning_rate": 1.568872022936957e-07, "loss": 0.0817, "step": 24380 }, { "epoch": 2.89736279401283, "grad_norm": 0.09937171675370127, "learning_rate": 1.5332129756922276e-07, "loss": 0.0839, "step": 24390 }, { "epoch": 2.898550724637681, "grad_norm": 0.10224665001191029, "learning_rate": 1.4979626030823258e-07, "loss": 0.0811, "step": 24400 }, { "epoch": 2.899738655262533, "grad_norm": 0.10026564844758061, "learning_rate": 1.463120963086567e-07, "loss": 0.0819, "step": 24410 }, { "epoch": 2.900926585887384, "grad_norm": 0.09675723831690596, "learning_rate": 1.4286881130119722e-07, "loss": 0.0831, "step": 24420 }, { "epoch": 2.9021145165122357, "grad_norm": 0.09718712500459749, "learning_rate": 1.3946641094931823e-07, "loss": 0.0806, "step": 24430 }, { "epoch": 2.9033024471370874, "grad_norm": 0.09728586282686906, "learning_rate": 1.3610490084924322e-07, "loss": 0.0836, "step": 24440 }, { "epoch": 2.9044903777619386, "grad_norm": 0.09656145237647457, "learning_rate": 1.327842865299356e-07, "loss": 0.0786, "step": 24450 }, { "epoch": 2.9056783083867903, "grad_norm": 0.1045061460489505, "learning_rate": 1.2950457345309307e-07, "loss": 0.0796, "step": 24460 }, { "epoch": 2.9068662390116415, "grad_norm": 0.09799193883875394, "learning_rate": 1.262657670131423e-07, "loss": 0.0782, "step": 24470 }, { "epoch": 2.908054169636493, "grad_norm": 0.09727622137756099, "learning_rate": 1.2306787253722195e-07, "loss": 0.0811, "step": 24480 }, { "epoch": 2.909242100261345, "grad_norm": 0.10300251258269337, "learning_rate": 1.199108952851885e-07, "loss": 0.0816, "step": 24490 }, { "epoch": 2.910430030886196, "grad_norm": 0.0950155024919391, "learning_rate": 1.1679484044959111e-07, "loss": 0.0812, "step": 24500 }, { "epoch": 2.911617961511048, "grad_norm": 0.09997124677925304, "learning_rate": 1.1371971315567442e-07, "loss": 0.0837, "step": 24510 }, { "epoch": 2.912805892135899, "grad_norm": 0.09991409943067786, "learning_rate": 1.1068551846135922e-07, "loss": 0.0814, "step": 24520 }, { "epoch": 2.9139938227607507, "grad_norm": 0.09594557908771265, "learning_rate": 1.0769226135725064e-07, "loss": 0.0827, "step": 24530 }, { "epoch": 2.9151817533856024, "grad_norm": 0.09920131782477445, "learning_rate": 1.0473994676661326e-07, "loss": 0.08, "step": 24540 }, { "epoch": 2.916369684010454, "grad_norm": 0.10107636588976102, "learning_rate": 1.0182857954537106e-07, "loss": 0.0808, "step": 24550 }, { "epoch": 2.9175576146353053, "grad_norm": 0.09429993407111538, "learning_rate": 9.895816448210194e-08, "loss": 0.0804, "step": 24560 }, { "epoch": 2.918745545260157, "grad_norm": 0.0978286542691409, "learning_rate": 9.612870629802373e-08, "loss": 0.079, "step": 24570 }, { "epoch": 2.919933475885008, "grad_norm": 0.09728623963838344, "learning_rate": 9.334020964698598e-08, "loss": 0.0796, "step": 24580 }, { "epoch": 2.92112140650986, "grad_norm": 0.0927988567995289, "learning_rate": 9.059267911547265e-08, "loss": 0.0805, "step": 24590 }, { "epoch": 2.9223093371347115, "grad_norm": 0.09493213510041153, "learning_rate": 8.788611922257717e-08, "loss": 0.0831, "step": 24600 }, { "epoch": 2.9234972677595628, "grad_norm": 0.09794881402598142, "learning_rate": 8.522053442001354e-08, "loss": 0.0788, "step": 24610 }, { "epoch": 2.9246851983844144, "grad_norm": 0.09824628659801862, "learning_rate": 8.259592909209968e-08, "loss": 0.0829, "step": 24620 }, { "epoch": 2.9258731290092657, "grad_norm": 0.09559019579553085, "learning_rate": 8.001230755574629e-08, "loss": 0.0827, "step": 24630 }, { "epoch": 2.9270610596341173, "grad_norm": 0.09157703457342291, "learning_rate": 7.746967406045969e-08, "loss": 0.0819, "step": 24640 }, { "epoch": 2.928248990258969, "grad_norm": 0.0958657901253652, "learning_rate": 7.496803278832509e-08, "loss": 0.0817, "step": 24650 }, { "epoch": 2.9294369208838202, "grad_norm": 0.10018387251607058, "learning_rate": 7.250738785400668e-08, "loss": 0.0802, "step": 24660 }, { "epoch": 2.930624851508672, "grad_norm": 0.09862392910016525, "learning_rate": 7.008774330473922e-08, "loss": 0.0823, "step": 24670 }, { "epoch": 2.931812782133523, "grad_norm": 0.09903081044732176, "learning_rate": 6.770910312031975e-08, "loss": 0.0807, "step": 24680 }, { "epoch": 2.933000712758375, "grad_norm": 0.0978640599651094, "learning_rate": 6.537147121310205e-08, "loss": 0.0806, "step": 24690 }, { "epoch": 2.9341886433832265, "grad_norm": 0.09561030834008562, "learning_rate": 6.307485142798831e-08, "loss": 0.0814, "step": 24700 }, { "epoch": 2.935376574008078, "grad_norm": 0.09686559555441182, "learning_rate": 6.08192475424263e-08, "loss": 0.0825, "step": 24710 }, { "epoch": 2.9365645046329294, "grad_norm": 0.09984260967120921, "learning_rate": 5.860466326640113e-08, "loss": 0.0805, "step": 24720 }, { "epoch": 2.937752435257781, "grad_norm": 0.09083728080666499, "learning_rate": 5.643110224243242e-08, "loss": 0.079, "step": 24730 }, { "epoch": 2.9389403658826323, "grad_norm": 0.09566432407565448, "learning_rate": 5.4298568045557664e-08, "loss": 0.083, "step": 24740 }, { "epoch": 2.940128296507484, "grad_norm": 0.09704967490712954, "learning_rate": 5.220706418334331e-08, "loss": 0.0822, "step": 24750 }, { "epoch": 2.9413162271323356, "grad_norm": 0.10140306341347502, "learning_rate": 5.0156594095862596e-08, "loss": 0.0849, "step": 24760 }, { "epoch": 2.942504157757187, "grad_norm": 0.09992590165426171, "learning_rate": 4.8147161155698284e-08, "loss": 0.0814, "step": 24770 }, { "epoch": 2.9436920883820386, "grad_norm": 0.09731455277577356, "learning_rate": 4.6178768667939906e-08, "loss": 0.082, "step": 24780 }, { "epoch": 2.94488001900689, "grad_norm": 0.09628952181259037, "learning_rate": 4.42514198701699e-08, "loss": 0.0797, "step": 24790 }, { "epoch": 2.9460679496317415, "grad_norm": 0.09955331185392563, "learning_rate": 4.236511793246356e-08, "loss": 0.0813, "step": 24800 }, { "epoch": 2.947255880256593, "grad_norm": 0.09598784113958019, "learning_rate": 4.051986595738633e-08, "loss": 0.0824, "step": 24810 }, { "epoch": 2.9484438108814444, "grad_norm": 0.09800460928719792, "learning_rate": 3.8715666979977085e-08, "loss": 0.083, "step": 24820 }, { "epoch": 2.949631741506296, "grad_norm": 0.09746266771729263, "learning_rate": 3.695252396776483e-08, "loss": 0.079, "step": 24830 }, { "epoch": 2.9508196721311473, "grad_norm": 0.10230419931638465, "learning_rate": 3.52304398207326e-08, "loss": 0.083, "step": 24840 }, { "epoch": 2.952007602755999, "grad_norm": 0.10078284741392542, "learning_rate": 3.354941737134798e-08, "loss": 0.0828, "step": 24850 }, { "epoch": 2.9531955333808506, "grad_norm": 0.09885411238869703, "learning_rate": 3.190945938452705e-08, "loss": 0.0824, "step": 24860 }, { "epoch": 2.9543834640057023, "grad_norm": 0.09733701424993019, "learning_rate": 3.031056855765102e-08, "loss": 0.0815, "step": 24870 }, { "epoch": 2.9555713946305535, "grad_norm": 0.10097465173339439, "learning_rate": 2.8752747520555124e-08, "loss": 0.0806, "step": 24880 }, { "epoch": 2.956759325255405, "grad_norm": 0.09569591227526092, "learning_rate": 2.7235998835520303e-08, "loss": 0.0766, "step": 24890 }, { "epoch": 2.9579472558802564, "grad_norm": 0.0958816683358014, "learning_rate": 2.5760324997270434e-08, "loss": 0.0795, "step": 24900 }, { "epoch": 2.959135186505108, "grad_norm": 0.09944597324797678, "learning_rate": 2.4325728432975094e-08, "loss": 0.0822, "step": 24910 }, { "epoch": 2.9603231171299598, "grad_norm": 0.09852867772854465, "learning_rate": 2.2932211502238453e-08, "loss": 0.0789, "step": 24920 }, { "epoch": 2.961511047754811, "grad_norm": 0.10060630382577843, "learning_rate": 2.1579776497096525e-08, "loss": 0.0811, "step": 24930 }, { "epoch": 2.9626989783796627, "grad_norm": 0.1012825289740897, "learning_rate": 2.0268425642017142e-08, "loss": 0.0807, "step": 24940 }, { "epoch": 2.963886909004514, "grad_norm": 0.09521986003129242, "learning_rate": 1.899816109388608e-08, "loss": 0.0817, "step": 24950 }, { "epoch": 2.9650748396293656, "grad_norm": 0.09917385078253332, "learning_rate": 1.7768984942023725e-08, "loss": 0.0809, "step": 24960 }, { "epoch": 2.9662627702542173, "grad_norm": 0.09832359519211938, "learning_rate": 1.6580899208157307e-08, "loss": 0.0846, "step": 24970 }, { "epoch": 2.967450700879069, "grad_norm": 0.0981456448796273, "learning_rate": 1.5433905846432008e-08, "loss": 0.0804, "step": 24980 }, { "epoch": 2.96863863150392, "grad_norm": 0.09775099953156026, "learning_rate": 1.432800674341095e-08, "loss": 0.0814, "step": 24990 }, { "epoch": 2.969826562128772, "grad_norm": 0.09694007198985743, "learning_rate": 1.3263203718055783e-08, "loss": 0.0815, "step": 25000 }, { "epoch": 2.971014492753623, "grad_norm": 0.09550910307512682, "learning_rate": 1.223949852174333e-08, "loss": 0.0807, "step": 25010 }, { "epoch": 2.9722024233784747, "grad_norm": 0.0990026014616539, "learning_rate": 1.1256892838248934e-08, "loss": 0.0823, "step": 25020 }, { "epoch": 2.9733903540033264, "grad_norm": 0.1025691100851051, "learning_rate": 1.0315388283746452e-08, "loss": 0.0821, "step": 25030 }, { "epoch": 2.9745782846281776, "grad_norm": 0.10088553463455913, "learning_rate": 9.414986406808268e-09, "loss": 0.0805, "step": 25040 }, { "epoch": 2.9757662152530293, "grad_norm": 0.09490462596293817, "learning_rate": 8.555688688408059e-09, "loss": 0.0811, "step": 25050 }, { "epoch": 2.9769541458778805, "grad_norm": 0.09991096471090515, "learning_rate": 7.737496541901368e-09, "loss": 0.0828, "step": 25060 }, { "epoch": 2.978142076502732, "grad_norm": 0.09505909768324797, "learning_rate": 6.960411313039484e-09, "loss": 0.0824, "step": 25070 }, { "epoch": 2.979330007127584, "grad_norm": 0.09863938454721016, "learning_rate": 6.224434279963887e-09, "loss": 0.0808, "step": 25080 }, { "epoch": 2.980517937752435, "grad_norm": 0.09834368719243018, "learning_rate": 5.529566653197926e-09, "loss": 0.0834, "step": 25090 }, { "epoch": 2.981705868377287, "grad_norm": 0.10478166578189264, "learning_rate": 4.875809575649593e-09, "loss": 0.0801, "step": 25100 }, { "epoch": 2.982893799002138, "grad_norm": 0.09691230991850248, "learning_rate": 4.263164122608743e-09, "loss": 0.0793, "step": 25110 }, { "epoch": 2.9840817296269897, "grad_norm": 0.10109646642288618, "learning_rate": 3.691631301744325e-09, "loss": 0.0786, "step": 25120 }, { "epoch": 2.9852696602518414, "grad_norm": 0.0969184261735935, "learning_rate": 3.1612120531099298e-09, "loss": 0.0813, "step": 25130 }, { "epoch": 2.986457590876693, "grad_norm": 0.09642400791597407, "learning_rate": 2.6719072491271368e-09, "loss": 0.0827, "step": 25140 }, { "epoch": 2.9876455215015443, "grad_norm": 0.09593755363970177, "learning_rate": 2.2237176946021675e-09, "loss": 0.0832, "step": 25150 }, { "epoch": 2.988833452126396, "grad_norm": 0.09506706084731334, "learning_rate": 1.8166441267064572e-09, "loss": 0.0835, "step": 25160 }, { "epoch": 2.990021382751247, "grad_norm": 0.09872324230467913, "learning_rate": 1.4506872149905315e-09, "loss": 0.0827, "step": 25170 }, { "epoch": 2.991209313376099, "grad_norm": 0.09655768554339623, "learning_rate": 1.1258475613729059e-09, "loss": 0.0808, "step": 25180 }, { "epoch": 2.9923972440009505, "grad_norm": 0.09943967185986503, "learning_rate": 8.421257001511862e-10, "loss": 0.0841, "step": 25190 }, { "epoch": 2.9935851746258018, "grad_norm": 0.10170756366801488, "learning_rate": 5.995220979798655e-10, "loss": 0.0841, "step": 25200 }, { "epoch": 2.9947731052506534, "grad_norm": 0.0995287231743133, "learning_rate": 3.980371538953032e-10, "loss": 0.0807, "step": 25210 }, { "epoch": 2.9959610358755047, "grad_norm": 0.09504416513489544, "learning_rate": 2.37671199293521e-10, "loss": 0.0797, "step": 25220 }, { "epoch": 2.9971489665003563, "grad_norm": 0.09555712793622373, "learning_rate": 1.1842449794685627e-10, "loss": 0.0825, "step": 25230 }, { "epoch": 2.998336897125208, "grad_norm": 0.0972984600541247, "learning_rate": 4.0297245984532995e-11, "loss": 0.0803, "step": 25240 }, { "epoch": 2.9995248277500592, "grad_norm": 0.09820029887026188, "learning_rate": 3.289571912090672e-12, "loss": 0.0855, "step": 25250 }, { "epoch": 3.0, "step": 25254, "total_flos": 7030779601223680.0, "train_loss": 0.12248244438147218, "train_runtime": 55247.1944, "train_samples_per_second": 80.449, "train_steps_per_second": 0.457 } ], "logging_steps": 10, "max_steps": 25254, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7030779601223680.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }