{ "best_metric": null, "best_model_checkpoint": null, "epoch": 146.4307504575961, "eval_steps": 500, "global_step": 30000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09762050030506407, "grad_norm": 1.6181267499923706, "learning_rate": 4e-05, "loss": 2.5822, "step": 20 }, { "epoch": 0.19524100061012814, "grad_norm": 1.536891222000122, "learning_rate": 8e-05, "loss": 2.2602, "step": 40 }, { "epoch": 0.2928615009151922, "grad_norm": 1.1685199737548828, "learning_rate": 0.00012, "loss": 1.7209, "step": 60 }, { "epoch": 0.3904820012202563, "grad_norm": 1.5937044620513916, "learning_rate": 0.00016, "loss": 1.6235, "step": 80 }, { "epoch": 0.4881025015253203, "grad_norm": 2.1078925132751465, "learning_rate": 0.0002, "loss": 1.5259, "step": 100 }, { "epoch": 0.5857230018303844, "grad_norm": 2.369110345840454, "learning_rate": 0.0001998688524590164, "loss": 1.5145, "step": 120 }, { "epoch": 0.6833435021354485, "grad_norm": 2.531956434249878, "learning_rate": 0.0001997377049180328, "loss": 1.4408, "step": 140 }, { "epoch": 0.7809640024405126, "grad_norm": 2.254030466079712, "learning_rate": 0.00019960655737704918, "loss": 1.3881, "step": 160 }, { "epoch": 0.8785845027455765, "grad_norm": 1.8673783540725708, "learning_rate": 0.0001994754098360656, "loss": 1.3656, "step": 180 }, { "epoch": 0.9762050030506406, "grad_norm": 2.6715195178985596, "learning_rate": 0.00019934426229508198, "loss": 1.2812, "step": 200 }, { "epoch": 1.0738255033557047, "grad_norm": 1.7909507751464844, "learning_rate": 0.00019921311475409837, "loss": 1.3006, "step": 220 }, { "epoch": 1.1714460036607688, "grad_norm": 2.6789026260375977, "learning_rate": 0.00019908196721311476, "loss": 1.2214, "step": 240 }, { "epoch": 1.2690665039658329, "grad_norm": 2.88253116607666, "learning_rate": 0.00019895081967213115, "loss": 1.2806, "step": 260 }, { "epoch": 1.366687004270897, "grad_norm": 1.9062846899032593, "learning_rate": 0.00019881967213114757, "loss": 1.2342, "step": 280 }, { "epoch": 1.4643075045759608, "grad_norm": 2.622042179107666, "learning_rate": 0.00019868852459016393, "loss": 1.1657, "step": 300 }, { "epoch": 1.561928004881025, "grad_norm": 2.6899402141571045, "learning_rate": 0.00019855737704918035, "loss": 1.1497, "step": 320 }, { "epoch": 1.659548505186089, "grad_norm": 3.025324821472168, "learning_rate": 0.00019842622950819674, "loss": 1.0834, "step": 340 }, { "epoch": 1.757169005491153, "grad_norm": 2.729680061340332, "learning_rate": 0.00019829508196721313, "loss": 1.1031, "step": 360 }, { "epoch": 1.8547895057962172, "grad_norm": 3.1579582691192627, "learning_rate": 0.00019816393442622951, "loss": 1.1726, "step": 380 }, { "epoch": 1.9524100061012812, "grad_norm": 2.698084592819214, "learning_rate": 0.0001980327868852459, "loss": 1.1241, "step": 400 }, { "epoch": 2.0500305064063453, "grad_norm": 2.4257960319519043, "learning_rate": 0.00019790163934426232, "loss": 1.0619, "step": 420 }, { "epoch": 2.1476510067114094, "grad_norm": 3.122441291809082, "learning_rate": 0.00019777049180327868, "loss": 1.0413, "step": 440 }, { "epoch": 2.2452715070164735, "grad_norm": 3.2882649898529053, "learning_rate": 0.0001976393442622951, "loss": 0.9949, "step": 460 }, { "epoch": 2.3428920073215376, "grad_norm": 2.3513290882110596, "learning_rate": 0.0001975081967213115, "loss": 1.0627, "step": 480 }, { "epoch": 2.4405125076266017, "grad_norm": 2.9709153175354004, "learning_rate": 0.00019737704918032788, "loss": 1.0612, "step": 500 }, { "epoch": 2.5381330079316657, "grad_norm": 4.2656569480896, "learning_rate": 0.00019724590163934427, "loss": 0.9615, "step": 520 }, { "epoch": 2.63575350823673, "grad_norm": 2.7959163188934326, "learning_rate": 0.00019711475409836066, "loss": 0.9915, "step": 540 }, { "epoch": 2.733374008541794, "grad_norm": 2.7856175899505615, "learning_rate": 0.00019698360655737707, "loss": 1.0299, "step": 560 }, { "epoch": 2.830994508846858, "grad_norm": 3.0926871299743652, "learning_rate": 0.00019685245901639344, "loss": 0.9642, "step": 580 }, { "epoch": 2.9286150091519216, "grad_norm": 4.457515239715576, "learning_rate": 0.00019672131147540985, "loss": 0.9922, "step": 600 }, { "epoch": 3.026235509456986, "grad_norm": 3.0343308448791504, "learning_rate": 0.00019659016393442624, "loss": 0.9777, "step": 620 }, { "epoch": 3.1238560097620502, "grad_norm": 3.12493896484375, "learning_rate": 0.00019645901639344263, "loss": 0.8765, "step": 640 }, { "epoch": 3.221476510067114, "grad_norm": 3.1015610694885254, "learning_rate": 0.00019632786885245902, "loss": 0.8882, "step": 660 }, { "epoch": 3.319097010372178, "grad_norm": 3.334591865539551, "learning_rate": 0.0001961967213114754, "loss": 0.8958, "step": 680 }, { "epoch": 3.416717510677242, "grad_norm": 3.3690192699432373, "learning_rate": 0.00019606557377049183, "loss": 0.954, "step": 700 }, { "epoch": 3.514338010982306, "grad_norm": 3.070910930633545, "learning_rate": 0.00019593442622950822, "loss": 0.8964, "step": 720 }, { "epoch": 3.61195851128737, "grad_norm": 3.369615077972412, "learning_rate": 0.0001958032786885246, "loss": 0.8758, "step": 740 }, { "epoch": 3.7095790115924343, "grad_norm": 2.3834476470947266, "learning_rate": 0.000195672131147541, "loss": 0.9307, "step": 760 }, { "epoch": 3.8071995118974984, "grad_norm": 2.924922466278076, "learning_rate": 0.00019554098360655738, "loss": 0.8911, "step": 780 }, { "epoch": 3.9048200122025625, "grad_norm": 3.321655035018921, "learning_rate": 0.00019540983606557377, "loss": 0.86, "step": 800 }, { "epoch": 4.002440512507627, "grad_norm": 4.9272074699401855, "learning_rate": 0.00019527868852459016, "loss": 0.9365, "step": 820 }, { "epoch": 4.100061012812691, "grad_norm": 3.4161181449890137, "learning_rate": 0.00019514754098360658, "loss": 0.804, "step": 840 }, { "epoch": 4.197681513117755, "grad_norm": 2.86344051361084, "learning_rate": 0.00019501639344262297, "loss": 0.8029, "step": 860 }, { "epoch": 4.295302013422819, "grad_norm": 3.9372735023498535, "learning_rate": 0.00019488524590163936, "loss": 0.7969, "step": 880 }, { "epoch": 4.392922513727883, "grad_norm": 3.2105510234832764, "learning_rate": 0.00019475409836065575, "loss": 0.7838, "step": 900 }, { "epoch": 4.490543014032947, "grad_norm": 3.1704697608947754, "learning_rate": 0.00019462295081967214, "loss": 0.8246, "step": 920 }, { "epoch": 4.588163514338011, "grad_norm": 2.9047462940216064, "learning_rate": 0.00019449180327868855, "loss": 0.79, "step": 940 }, { "epoch": 4.685784014643075, "grad_norm": 3.0593173503875732, "learning_rate": 0.00019436065573770491, "loss": 0.8105, "step": 960 }, { "epoch": 4.783404514948139, "grad_norm": 3.4904775619506836, "learning_rate": 0.00019422950819672133, "loss": 0.864, "step": 980 }, { "epoch": 4.881025015253203, "grad_norm": 3.080754280090332, "learning_rate": 0.00019409836065573772, "loss": 0.8163, "step": 1000 }, { "epoch": 4.978645515558267, "grad_norm": 3.663107395172119, "learning_rate": 0.0001939672131147541, "loss": 0.8192, "step": 1020 }, { "epoch": 5.0762660158633315, "grad_norm": 3.986875534057617, "learning_rate": 0.0001938360655737705, "loss": 0.7943, "step": 1040 }, { "epoch": 5.173886516168396, "grad_norm": 3.140963554382324, "learning_rate": 0.0001937049180327869, "loss": 0.7274, "step": 1060 }, { "epoch": 5.27150701647346, "grad_norm": 4.659877300262451, "learning_rate": 0.0001935737704918033, "loss": 0.7494, "step": 1080 }, { "epoch": 5.369127516778524, "grad_norm": 4.54330587387085, "learning_rate": 0.00019344262295081967, "loss": 0.6986, "step": 1100 }, { "epoch": 5.466748017083588, "grad_norm": 3.893850088119507, "learning_rate": 0.00019331147540983608, "loss": 0.7555, "step": 1120 }, { "epoch": 5.564368517388652, "grad_norm": 3.24857759475708, "learning_rate": 0.00019318032786885247, "loss": 0.721, "step": 1140 }, { "epoch": 5.661989017693716, "grad_norm": 4.82593297958374, "learning_rate": 0.00019304918032786886, "loss": 0.7523, "step": 1160 }, { "epoch": 5.75960951799878, "grad_norm": 3.055907726287842, "learning_rate": 0.00019291803278688525, "loss": 0.7719, "step": 1180 }, { "epoch": 5.857230018303844, "grad_norm": 3.192716121673584, "learning_rate": 0.00019278688524590164, "loss": 0.7732, "step": 1200 }, { "epoch": 5.954850518608908, "grad_norm": 3.196037769317627, "learning_rate": 0.00019265573770491806, "loss": 0.745, "step": 1220 }, { "epoch": 6.052471018913972, "grad_norm": 3.7806878089904785, "learning_rate": 0.00019252459016393442, "loss": 0.713, "step": 1240 }, { "epoch": 6.150091519219036, "grad_norm": 3.6102254390716553, "learning_rate": 0.00019239344262295084, "loss": 0.6063, "step": 1260 }, { "epoch": 6.2477120195241005, "grad_norm": 4.120365142822266, "learning_rate": 0.00019226229508196723, "loss": 0.6771, "step": 1280 }, { "epoch": 6.345332519829164, "grad_norm": 3.118666887283325, "learning_rate": 0.00019213114754098362, "loss": 0.7173, "step": 1300 }, { "epoch": 6.442953020134228, "grad_norm": 3.8055977821350098, "learning_rate": 0.000192, "loss": 0.6609, "step": 1320 }, { "epoch": 6.540573520439292, "grad_norm": 3.4207704067230225, "learning_rate": 0.0001918688524590164, "loss": 0.7084, "step": 1340 }, { "epoch": 6.638194020744356, "grad_norm": 3.812415838241577, "learning_rate": 0.0001917377049180328, "loss": 0.744, "step": 1360 }, { "epoch": 6.73581452104942, "grad_norm": 4.302225112915039, "learning_rate": 0.00019160655737704917, "loss": 0.667, "step": 1380 }, { "epoch": 6.833435021354484, "grad_norm": 3.3958733081817627, "learning_rate": 0.0001914754098360656, "loss": 0.6769, "step": 1400 }, { "epoch": 6.931055521659548, "grad_norm": 3.6200642585754395, "learning_rate": 0.00019134426229508198, "loss": 0.7494, "step": 1420 }, { "epoch": 7.028676021964612, "grad_norm": 3.2580292224884033, "learning_rate": 0.00019121311475409837, "loss": 0.6252, "step": 1440 }, { "epoch": 7.126296522269676, "grad_norm": 3.581437826156616, "learning_rate": 0.00019108196721311476, "loss": 0.6072, "step": 1460 }, { "epoch": 7.22391702257474, "grad_norm": 3.7347512245178223, "learning_rate": 0.00019095081967213115, "loss": 0.5715, "step": 1480 }, { "epoch": 7.3215375228798045, "grad_norm": 3.564328193664551, "learning_rate": 0.00019081967213114756, "loss": 0.5937, "step": 1500 }, { "epoch": 7.419158023184869, "grad_norm": 3.8675971031188965, "learning_rate": 0.00019068852459016395, "loss": 0.6441, "step": 1520 }, { "epoch": 7.516778523489933, "grad_norm": 3.7394795417785645, "learning_rate": 0.00019055737704918034, "loss": 0.6345, "step": 1540 }, { "epoch": 7.614399023794997, "grad_norm": 3.7391791343688965, "learning_rate": 0.00019042622950819673, "loss": 0.6334, "step": 1560 }, { "epoch": 7.712019524100061, "grad_norm": 3.2816712856292725, "learning_rate": 0.00019029508196721312, "loss": 0.6448, "step": 1580 }, { "epoch": 7.809640024405125, "grad_norm": 3.741111993789673, "learning_rate": 0.0001901639344262295, "loss": 0.6605, "step": 1600 }, { "epoch": 7.907260524710189, "grad_norm": 3.5533151626586914, "learning_rate": 0.0001900327868852459, "loss": 0.6434, "step": 1620 }, { "epoch": 8.004881025015253, "grad_norm": 3.243546724319458, "learning_rate": 0.00018990163934426232, "loss": 0.6923, "step": 1640 }, { "epoch": 8.102501525320317, "grad_norm": 3.860666513442993, "learning_rate": 0.0001897704918032787, "loss": 0.5619, "step": 1660 }, { "epoch": 8.200122025625381, "grad_norm": 3.4826905727386475, "learning_rate": 0.0001896393442622951, "loss": 0.5446, "step": 1680 }, { "epoch": 8.297742525930445, "grad_norm": 5.119688034057617, "learning_rate": 0.00018950819672131148, "loss": 0.5525, "step": 1700 }, { "epoch": 8.39536302623551, "grad_norm": 3.46353816986084, "learning_rate": 0.00018937704918032787, "loss": 0.5346, "step": 1720 }, { "epoch": 8.492983526540574, "grad_norm": 4.458425045013428, "learning_rate": 0.0001892459016393443, "loss": 0.6127, "step": 1740 }, { "epoch": 8.590604026845638, "grad_norm": 3.592191457748413, "learning_rate": 0.00018911475409836065, "loss": 0.5721, "step": 1760 }, { "epoch": 8.688224527150702, "grad_norm": 4.131028652191162, "learning_rate": 0.00018898360655737707, "loss": 0.5799, "step": 1780 }, { "epoch": 8.785845027455766, "grad_norm": 3.6534175872802734, "learning_rate": 0.00018885245901639346, "loss": 0.6198, "step": 1800 }, { "epoch": 8.88346552776083, "grad_norm": 3.0888559818267822, "learning_rate": 0.00018872131147540985, "loss": 0.6309, "step": 1820 }, { "epoch": 8.981086028065894, "grad_norm": 2.7711477279663086, "learning_rate": 0.00018859016393442624, "loss": 0.601, "step": 1840 }, { "epoch": 9.078706528370958, "grad_norm": 2.7783164978027344, "learning_rate": 0.00018845901639344263, "loss": 0.5696, "step": 1860 }, { "epoch": 9.176327028676022, "grad_norm": 3.6661510467529297, "learning_rate": 0.00018832786885245904, "loss": 0.5214, "step": 1880 }, { "epoch": 9.273947528981086, "grad_norm": 3.4614779949188232, "learning_rate": 0.0001881967213114754, "loss": 0.5181, "step": 1900 }, { "epoch": 9.37156802928615, "grad_norm": 3.470071315765381, "learning_rate": 0.00018806557377049182, "loss": 0.5196, "step": 1920 }, { "epoch": 9.469188529591214, "grad_norm": 3.686056613922119, "learning_rate": 0.0001879344262295082, "loss": 0.5432, "step": 1940 }, { "epoch": 9.566809029896278, "grad_norm": 3.4668374061584473, "learning_rate": 0.0001878032786885246, "loss": 0.5502, "step": 1960 }, { "epoch": 9.664429530201343, "grad_norm": 4.547699928283691, "learning_rate": 0.000187672131147541, "loss": 0.5482, "step": 1980 }, { "epoch": 9.762050030506407, "grad_norm": 3.784132957458496, "learning_rate": 0.00018754098360655738, "loss": 0.516, "step": 2000 }, { "epoch": 9.85967053081147, "grad_norm": 4.160193920135498, "learning_rate": 0.0001874098360655738, "loss": 0.5658, "step": 2020 }, { "epoch": 9.957291031116535, "grad_norm": 4.115555286407471, "learning_rate": 0.00018727868852459016, "loss": 0.5775, "step": 2040 }, { "epoch": 10.054911531421599, "grad_norm": 3.361625909805298, "learning_rate": 0.00018714754098360657, "loss": 0.5105, "step": 2060 }, { "epoch": 10.152532031726663, "grad_norm": 3.663980484008789, "learning_rate": 0.00018701639344262296, "loss": 0.4556, "step": 2080 }, { "epoch": 10.250152532031727, "grad_norm": 3.6515650749206543, "learning_rate": 0.00018688524590163935, "loss": 0.4937, "step": 2100 }, { "epoch": 10.347773032336791, "grad_norm": 3.2449493408203125, "learning_rate": 0.00018675409836065574, "loss": 0.481, "step": 2120 }, { "epoch": 10.445393532641855, "grad_norm": 4.262176513671875, "learning_rate": 0.00018662295081967213, "loss": 0.4559, "step": 2140 }, { "epoch": 10.54301403294692, "grad_norm": 3.885936737060547, "learning_rate": 0.00018649180327868855, "loss": 0.5263, "step": 2160 }, { "epoch": 10.640634533251983, "grad_norm": 3.676922559738159, "learning_rate": 0.00018636065573770494, "loss": 0.4942, "step": 2180 }, { "epoch": 10.738255033557047, "grad_norm": 4.31233024597168, "learning_rate": 0.00018622950819672133, "loss": 0.5339, "step": 2200 }, { "epoch": 10.835875533862112, "grad_norm": 3.410269260406494, "learning_rate": 0.00018609836065573772, "loss": 0.5365, "step": 2220 }, { "epoch": 10.933496034167176, "grad_norm": 3.114283800125122, "learning_rate": 0.0001859672131147541, "loss": 0.5252, "step": 2240 }, { "epoch": 11.03111653447224, "grad_norm": 3.4641740322113037, "learning_rate": 0.0001858360655737705, "loss": 0.4993, "step": 2260 }, { "epoch": 11.128737034777304, "grad_norm": 5.0364203453063965, "learning_rate": 0.00018570491803278688, "loss": 0.4097, "step": 2280 }, { "epoch": 11.226357535082368, "grad_norm": 3.811703681945801, "learning_rate": 0.0001855737704918033, "loss": 0.4265, "step": 2300 }, { "epoch": 11.323978035387432, "grad_norm": 3.528463125228882, "learning_rate": 0.0001854426229508197, "loss": 0.4542, "step": 2320 }, { "epoch": 11.421598535692496, "grad_norm": 3.013249158859253, "learning_rate": 0.00018531147540983608, "loss": 0.4824, "step": 2340 }, { "epoch": 11.51921903599756, "grad_norm": 3.7065834999084473, "learning_rate": 0.00018518032786885247, "loss": 0.4724, "step": 2360 }, { "epoch": 11.616839536302624, "grad_norm": 3.271639347076416, "learning_rate": 0.00018504918032786886, "loss": 0.4731, "step": 2380 }, { "epoch": 11.714460036607688, "grad_norm": 3.738567590713501, "learning_rate": 0.00018491803278688527, "loss": 0.5024, "step": 2400 }, { "epoch": 11.812080536912752, "grad_norm": 3.5277130603790283, "learning_rate": 0.00018478688524590164, "loss": 0.4552, "step": 2420 }, { "epoch": 11.909701037217816, "grad_norm": 3.909186840057373, "learning_rate": 0.00018465573770491805, "loss": 0.4454, "step": 2440 }, { "epoch": 12.00732153752288, "grad_norm": 3.629971981048584, "learning_rate": 0.00018452459016393444, "loss": 0.4928, "step": 2460 }, { "epoch": 12.104942037827945, "grad_norm": 4.8608245849609375, "learning_rate": 0.0001843934426229508, "loss": 0.395, "step": 2480 }, { "epoch": 12.202562538133009, "grad_norm": 3.377361536026001, "learning_rate": 0.00018426229508196722, "loss": 0.4124, "step": 2500 }, { "epoch": 12.300183038438073, "grad_norm": 3.1160948276519775, "learning_rate": 0.0001841311475409836, "loss": 0.4317, "step": 2520 }, { "epoch": 12.397803538743137, "grad_norm": 4.747882843017578, "learning_rate": 0.00018400000000000003, "loss": 0.4054, "step": 2540 }, { "epoch": 12.495424039048201, "grad_norm": 3.0192835330963135, "learning_rate": 0.0001838688524590164, "loss": 0.4384, "step": 2560 }, { "epoch": 12.593044539353265, "grad_norm": 3.397606372833252, "learning_rate": 0.0001837377049180328, "loss": 0.4095, "step": 2580 }, { "epoch": 12.690665039658327, "grad_norm": 3.571641445159912, "learning_rate": 0.0001836065573770492, "loss": 0.4526, "step": 2600 }, { "epoch": 12.788285539963393, "grad_norm": 3.9279720783233643, "learning_rate": 0.00018347540983606558, "loss": 0.4447, "step": 2620 }, { "epoch": 12.885906040268456, "grad_norm": 5.057145118713379, "learning_rate": 0.00018334426229508197, "loss": 0.4285, "step": 2640 }, { "epoch": 12.98352654057352, "grad_norm": 4.515413761138916, "learning_rate": 0.00018321311475409836, "loss": 0.4614, "step": 2660 }, { "epoch": 13.081147040878584, "grad_norm": 5.738150119781494, "learning_rate": 0.00018308196721311478, "loss": 0.3594, "step": 2680 }, { "epoch": 13.178767541183648, "grad_norm": 4.123675346374512, "learning_rate": 0.00018295081967213114, "loss": 0.3604, "step": 2700 }, { "epoch": 13.276388041488712, "grad_norm": 4.020540237426758, "learning_rate": 0.00018281967213114756, "loss": 0.3455, "step": 2720 }, { "epoch": 13.374008541793776, "grad_norm": 3.946470260620117, "learning_rate": 0.00018268852459016395, "loss": 0.3936, "step": 2740 }, { "epoch": 13.47162904209884, "grad_norm": 3.4082658290863037, "learning_rate": 0.00018255737704918034, "loss": 0.3853, "step": 2760 }, { "epoch": 13.569249542403904, "grad_norm": 3.8602256774902344, "learning_rate": 0.00018242622950819673, "loss": 0.4138, "step": 2780 }, { "epoch": 13.666870042708968, "grad_norm": 3.4819118976593018, "learning_rate": 0.00018229508196721312, "loss": 0.4235, "step": 2800 }, { "epoch": 13.764490543014032, "grad_norm": 4.620802402496338, "learning_rate": 0.00018216393442622953, "loss": 0.3917, "step": 2820 }, { "epoch": 13.862111043319096, "grad_norm": 4.1257500648498535, "learning_rate": 0.00018203278688524592, "loss": 0.4289, "step": 2840 }, { "epoch": 13.95973154362416, "grad_norm": 4.032405376434326, "learning_rate": 0.0001819016393442623, "loss": 0.4501, "step": 2860 }, { "epoch": 14.057352043929225, "grad_norm": 3.9458096027374268, "learning_rate": 0.0001817704918032787, "loss": 0.3863, "step": 2880 }, { "epoch": 14.154972544234289, "grad_norm": 5.01477575302124, "learning_rate": 0.0001816393442622951, "loss": 0.3421, "step": 2900 }, { "epoch": 14.252593044539353, "grad_norm": 3.396898031234741, "learning_rate": 0.00018150819672131148, "loss": 0.3438, "step": 2920 }, { "epoch": 14.350213544844417, "grad_norm": 4.596593856811523, "learning_rate": 0.00018137704918032787, "loss": 0.3435, "step": 2940 }, { "epoch": 14.44783404514948, "grad_norm": 3.6386988162994385, "learning_rate": 0.00018124590163934429, "loss": 0.3703, "step": 2960 }, { "epoch": 14.545454545454545, "grad_norm": 3.3389110565185547, "learning_rate": 0.00018111475409836067, "loss": 0.3727, "step": 2980 }, { "epoch": 14.643075045759609, "grad_norm": 4.675887107849121, "learning_rate": 0.00018098360655737704, "loss": 0.3972, "step": 3000 }, { "epoch": 14.740695546064673, "grad_norm": 3.3281068801879883, "learning_rate": 0.00018085245901639345, "loss": 0.38, "step": 3020 }, { "epoch": 14.838316046369737, "grad_norm": 3.9550278186798096, "learning_rate": 0.00018072131147540984, "loss": 0.3995, "step": 3040 }, { "epoch": 14.935936546674801, "grad_norm": 2.93839168548584, "learning_rate": 0.00018059016393442626, "loss": 0.3885, "step": 3060 }, { "epoch": 15.033557046979865, "grad_norm": 3.583588123321533, "learning_rate": 0.00018045901639344262, "loss": 0.3407, "step": 3080 }, { "epoch": 15.13117754728493, "grad_norm": 3.6306777000427246, "learning_rate": 0.00018032786885245904, "loss": 0.3016, "step": 3100 }, { "epoch": 15.228798047589994, "grad_norm": 3.278693437576294, "learning_rate": 0.00018019672131147543, "loss": 0.292, "step": 3120 }, { "epoch": 15.326418547895058, "grad_norm": 2.9507830142974854, "learning_rate": 0.0001800655737704918, "loss": 0.3204, "step": 3140 }, { "epoch": 15.424039048200122, "grad_norm": 2.957294464111328, "learning_rate": 0.0001799344262295082, "loss": 0.3446, "step": 3160 }, { "epoch": 15.521659548505186, "grad_norm": 3.4953787326812744, "learning_rate": 0.0001798032786885246, "loss": 0.3307, "step": 3180 }, { "epoch": 15.61928004881025, "grad_norm": 3.349458694458008, "learning_rate": 0.000179672131147541, "loss": 0.3448, "step": 3200 }, { "epoch": 15.716900549115314, "grad_norm": 4.927302837371826, "learning_rate": 0.00017954098360655737, "loss": 0.3661, "step": 3220 }, { "epoch": 15.814521049420378, "grad_norm": 4.214022636413574, "learning_rate": 0.0001794098360655738, "loss": 0.3595, "step": 3240 }, { "epoch": 15.912141549725442, "grad_norm": 3.7332723140716553, "learning_rate": 0.00017927868852459018, "loss": 0.3916, "step": 3260 }, { "epoch": 16.009762050030506, "grad_norm": 3.1130378246307373, "learning_rate": 0.00017914754098360657, "loss": 0.365, "step": 3280 }, { "epoch": 16.107382550335572, "grad_norm": 3.9815053939819336, "learning_rate": 0.00017901639344262296, "loss": 0.2466, "step": 3300 }, { "epoch": 16.205003050640634, "grad_norm": 4.437533855438232, "learning_rate": 0.00017888524590163935, "loss": 0.2917, "step": 3320 }, { "epoch": 16.3026235509457, "grad_norm": 3.869553565979004, "learning_rate": 0.00017875409836065576, "loss": 0.2982, "step": 3340 }, { "epoch": 16.400244051250763, "grad_norm": 3.1284611225128174, "learning_rate": 0.00017862295081967213, "loss": 0.3146, "step": 3360 }, { "epoch": 16.49786455155583, "grad_norm": 3.837712049484253, "learning_rate": 0.00017849180327868852, "loss": 0.3129, "step": 3380 }, { "epoch": 16.59548505186089, "grad_norm": 3.348344326019287, "learning_rate": 0.00017836065573770493, "loss": 0.3379, "step": 3400 }, { "epoch": 16.693105552165953, "grad_norm": 3.809512138366699, "learning_rate": 0.00017822950819672132, "loss": 0.3193, "step": 3420 }, { "epoch": 16.79072605247102, "grad_norm": 3.3745222091674805, "learning_rate": 0.0001780983606557377, "loss": 0.3308, "step": 3440 }, { "epoch": 16.888346552776085, "grad_norm": 4.550785541534424, "learning_rate": 0.0001779672131147541, "loss": 0.3609, "step": 3460 }, { "epoch": 16.985967053081147, "grad_norm": 4.031665802001953, "learning_rate": 0.00017783606557377052, "loss": 0.3648, "step": 3480 }, { "epoch": 17.08358755338621, "grad_norm": 3.0923168659210205, "learning_rate": 0.0001777049180327869, "loss": 0.2731, "step": 3500 }, { "epoch": 17.181208053691275, "grad_norm": 3.291416883468628, "learning_rate": 0.00017757377049180327, "loss": 0.2829, "step": 3520 }, { "epoch": 17.278828553996338, "grad_norm": 3.059995651245117, "learning_rate": 0.00017744262295081969, "loss": 0.3047, "step": 3540 }, { "epoch": 17.376449054301403, "grad_norm": 3.1089327335357666, "learning_rate": 0.00017731147540983607, "loss": 0.3092, "step": 3560 }, { "epoch": 17.474069554606466, "grad_norm": 3.9796876907348633, "learning_rate": 0.00017718032786885246, "loss": 0.2647, "step": 3580 }, { "epoch": 17.57169005491153, "grad_norm": 3.587038040161133, "learning_rate": 0.00017704918032786885, "loss": 0.308, "step": 3600 }, { "epoch": 17.669310555216594, "grad_norm": 3.7032790184020996, "learning_rate": 0.00017691803278688527, "loss": 0.3101, "step": 3620 }, { "epoch": 17.76693105552166, "grad_norm": 3.7440781593322754, "learning_rate": 0.00017678688524590166, "loss": 0.2922, "step": 3640 }, { "epoch": 17.864551555826722, "grad_norm": 3.4123542308807373, "learning_rate": 0.00017665573770491802, "loss": 0.3097, "step": 3660 }, { "epoch": 17.962172056131788, "grad_norm": 3.958204507827759, "learning_rate": 0.00017652459016393444, "loss": 0.2955, "step": 3680 }, { "epoch": 18.05979255643685, "grad_norm": 4.549073696136475, "learning_rate": 0.00017639344262295083, "loss": 0.2846, "step": 3700 }, { "epoch": 18.157413056741916, "grad_norm": 3.5509791374206543, "learning_rate": 0.00017626229508196724, "loss": 0.2651, "step": 3720 }, { "epoch": 18.25503355704698, "grad_norm": 4.044325828552246, "learning_rate": 0.0001761311475409836, "loss": 0.242, "step": 3740 }, { "epoch": 18.352654057352044, "grad_norm": 3.255535125732422, "learning_rate": 0.00017600000000000002, "loss": 0.2857, "step": 3760 }, { "epoch": 18.450274557657107, "grad_norm": 3.6761083602905273, "learning_rate": 0.0001758688524590164, "loss": 0.285, "step": 3780 }, { "epoch": 18.547895057962172, "grad_norm": 4.1334381103515625, "learning_rate": 0.00017573770491803277, "loss": 0.2648, "step": 3800 }, { "epoch": 18.645515558267235, "grad_norm": 3.5164451599121094, "learning_rate": 0.0001756065573770492, "loss": 0.2654, "step": 3820 }, { "epoch": 18.7431360585723, "grad_norm": 3.96533465385437, "learning_rate": 0.00017547540983606558, "loss": 0.2862, "step": 3840 }, { "epoch": 18.840756558877363, "grad_norm": 3.932554006576538, "learning_rate": 0.000175344262295082, "loss": 0.2803, "step": 3860 }, { "epoch": 18.93837705918243, "grad_norm": 4.888127326965332, "learning_rate": 0.00017521311475409836, "loss": 0.2902, "step": 3880 }, { "epoch": 19.03599755948749, "grad_norm": 3.475789785385132, "learning_rate": 0.00017508196721311475, "loss": 0.2815, "step": 3900 }, { "epoch": 19.133618059792557, "grad_norm": 2.57926607131958, "learning_rate": 0.00017495081967213116, "loss": 0.21, "step": 3920 }, { "epoch": 19.23123856009762, "grad_norm": 3.3455469608306885, "learning_rate": 0.00017481967213114753, "loss": 0.232, "step": 3940 }, { "epoch": 19.328859060402685, "grad_norm": 3.574331045150757, "learning_rate": 0.00017468852459016394, "loss": 0.2506, "step": 3960 }, { "epoch": 19.426479560707747, "grad_norm": 4.422223091125488, "learning_rate": 0.00017455737704918033, "loss": 0.2494, "step": 3980 }, { "epoch": 19.524100061012813, "grad_norm": 3.8312060832977295, "learning_rate": 0.00017442622950819675, "loss": 0.2758, "step": 4000 }, { "epoch": 19.621720561317876, "grad_norm": 3.464089870452881, "learning_rate": 0.0001742950819672131, "loss": 0.2628, "step": 4020 }, { "epoch": 19.71934106162294, "grad_norm": 4.096322536468506, "learning_rate": 0.0001741639344262295, "loss": 0.2651, "step": 4040 }, { "epoch": 19.816961561928004, "grad_norm": 5.72092866897583, "learning_rate": 0.00017403278688524592, "loss": 0.29, "step": 4060 }, { "epoch": 19.91458206223307, "grad_norm": 3.7189135551452637, "learning_rate": 0.0001739016393442623, "loss": 0.2715, "step": 4080 }, { "epoch": 20.012202562538132, "grad_norm": 3.4471986293792725, "learning_rate": 0.0001737704918032787, "loss": 0.2762, "step": 4100 }, { "epoch": 20.109823062843198, "grad_norm": 3.046600580215454, "learning_rate": 0.00017363934426229509, "loss": 0.2219, "step": 4120 }, { "epoch": 20.20744356314826, "grad_norm": 3.6281490325927734, "learning_rate": 0.0001735081967213115, "loss": 0.231, "step": 4140 }, { "epoch": 20.305064063453326, "grad_norm": 2.8515024185180664, "learning_rate": 0.00017337704918032786, "loss": 0.2212, "step": 4160 }, { "epoch": 20.40268456375839, "grad_norm": 2.9778804779052734, "learning_rate": 0.00017324590163934425, "loss": 0.2327, "step": 4180 }, { "epoch": 20.500305064063454, "grad_norm": 2.92470121383667, "learning_rate": 0.00017311475409836067, "loss": 0.2378, "step": 4200 }, { "epoch": 20.597925564368516, "grad_norm": 3.8426191806793213, "learning_rate": 0.00017298360655737706, "loss": 0.2652, "step": 4220 }, { "epoch": 20.695546064673582, "grad_norm": 3.8123021125793457, "learning_rate": 0.00017285245901639345, "loss": 0.2446, "step": 4240 }, { "epoch": 20.793166564978645, "grad_norm": 3.650644540786743, "learning_rate": 0.00017272131147540984, "loss": 0.2331, "step": 4260 }, { "epoch": 20.89078706528371, "grad_norm": 4.259769916534424, "learning_rate": 0.00017259016393442625, "loss": 0.2595, "step": 4280 }, { "epoch": 20.988407565588773, "grad_norm": 4.065052509307861, "learning_rate": 0.00017245901639344264, "loss": 0.2672, "step": 4300 }, { "epoch": 21.08602806589384, "grad_norm": 3.565068244934082, "learning_rate": 0.000172327868852459, "loss": 0.1997, "step": 4320 }, { "epoch": 21.1836485661989, "grad_norm": 3.036924362182617, "learning_rate": 0.00017219672131147542, "loss": 0.1936, "step": 4340 }, { "epoch": 21.281269066503967, "grad_norm": 3.2253048419952393, "learning_rate": 0.0001720655737704918, "loss": 0.2099, "step": 4360 }, { "epoch": 21.37888956680903, "grad_norm": 3.6241676807403564, "learning_rate": 0.0001719344262295082, "loss": 0.2232, "step": 4380 }, { "epoch": 21.476510067114095, "grad_norm": 3.3272531032562256, "learning_rate": 0.0001718032786885246, "loss": 0.2103, "step": 4400 }, { "epoch": 21.574130567419157, "grad_norm": 3.6076695919036865, "learning_rate": 0.00017167213114754098, "loss": 0.2323, "step": 4420 }, { "epoch": 21.671751067724223, "grad_norm": 3.642751455307007, "learning_rate": 0.0001715409836065574, "loss": 0.2385, "step": 4440 }, { "epoch": 21.769371568029285, "grad_norm": 3.1882801055908203, "learning_rate": 0.00017140983606557376, "loss": 0.2516, "step": 4460 }, { "epoch": 21.86699206833435, "grad_norm": 3.4758126735687256, "learning_rate": 0.00017127868852459018, "loss": 0.2365, "step": 4480 }, { "epoch": 21.964612568639414, "grad_norm": 3.498697519302368, "learning_rate": 0.00017114754098360656, "loss": 0.2436, "step": 4500 }, { "epoch": 22.06223306894448, "grad_norm": 3.926394462585449, "learning_rate": 0.00017101639344262298, "loss": 0.2101, "step": 4520 }, { "epoch": 22.15985356924954, "grad_norm": 2.982520580291748, "learning_rate": 0.00017088524590163934, "loss": 0.1655, "step": 4540 }, { "epoch": 22.257474069554608, "grad_norm": 3.843905448913574, "learning_rate": 0.00017075409836065573, "loss": 0.1896, "step": 4560 }, { "epoch": 22.35509456985967, "grad_norm": 2.8714027404785156, "learning_rate": 0.00017062295081967215, "loss": 0.2113, "step": 4580 }, { "epoch": 22.452715070164736, "grad_norm": 3.552220344543457, "learning_rate": 0.0001704918032786885, "loss": 0.2311, "step": 4600 }, { "epoch": 22.550335570469798, "grad_norm": 4.077561378479004, "learning_rate": 0.00017036065573770493, "loss": 0.2205, "step": 4620 }, { "epoch": 22.647956070774864, "grad_norm": 3.475970506668091, "learning_rate": 0.00017022950819672132, "loss": 0.2047, "step": 4640 }, { "epoch": 22.745576571079926, "grad_norm": 2.9817659854888916, "learning_rate": 0.00017009836065573773, "loss": 0.2312, "step": 4660 }, { "epoch": 22.843197071384992, "grad_norm": 2.8980283737182617, "learning_rate": 0.0001699672131147541, "loss": 0.2113, "step": 4680 }, { "epoch": 22.940817571690054, "grad_norm": 2.9917445182800293, "learning_rate": 0.00016983606557377049, "loss": 0.2334, "step": 4700 }, { "epoch": 23.03843807199512, "grad_norm": 2.9001080989837646, "learning_rate": 0.0001697049180327869, "loss": 0.2096, "step": 4720 }, { "epoch": 23.136058572300183, "grad_norm": 2.7678308486938477, "learning_rate": 0.0001695737704918033, "loss": 0.18, "step": 4740 }, { "epoch": 23.23367907260525, "grad_norm": 4.077008247375488, "learning_rate": 0.00016944262295081968, "loss": 0.175, "step": 4760 }, { "epoch": 23.33129957291031, "grad_norm": 3.806175708770752, "learning_rate": 0.00016931147540983607, "loss": 0.1962, "step": 4780 }, { "epoch": 23.428920073215377, "grad_norm": 3.203763246536255, "learning_rate": 0.00016918032786885249, "loss": 0.2072, "step": 4800 }, { "epoch": 23.52654057352044, "grad_norm": 3.717864513397217, "learning_rate": 0.00016904918032786885, "loss": 0.203, "step": 4820 }, { "epoch": 23.624161073825505, "grad_norm": 3.1602675914764404, "learning_rate": 0.00016891803278688524, "loss": 0.1925, "step": 4840 }, { "epoch": 23.721781574130567, "grad_norm": 4.2820000648498535, "learning_rate": 0.00016878688524590165, "loss": 0.2037, "step": 4860 }, { "epoch": 23.819402074435633, "grad_norm": 3.9633703231811523, "learning_rate": 0.00016865573770491804, "loss": 0.2143, "step": 4880 }, { "epoch": 23.917022574740695, "grad_norm": 2.801804542541504, "learning_rate": 0.00016852459016393443, "loss": 0.1938, "step": 4900 }, { "epoch": 24.01464307504576, "grad_norm": 2.719308614730835, "learning_rate": 0.00016839344262295082, "loss": 0.215, "step": 4920 }, { "epoch": 24.112263575350823, "grad_norm": 3.0437123775482178, "learning_rate": 0.0001682622950819672, "loss": 0.1806, "step": 4940 }, { "epoch": 24.20988407565589, "grad_norm": 3.314267158508301, "learning_rate": 0.00016813114754098363, "loss": 0.169, "step": 4960 }, { "epoch": 24.30750457596095, "grad_norm": 2.26737117767334, "learning_rate": 0.000168, "loss": 0.1704, "step": 4980 }, { "epoch": 24.405125076266017, "grad_norm": 4.0000715255737305, "learning_rate": 0.0001678688524590164, "loss": 0.176, "step": 5000 }, { "epoch": 24.50274557657108, "grad_norm": 3.358684778213501, "learning_rate": 0.0001677377049180328, "loss": 0.1756, "step": 5020 }, { "epoch": 24.600366076876146, "grad_norm": 3.2529423236846924, "learning_rate": 0.00016760655737704919, "loss": 0.201, "step": 5040 }, { "epoch": 24.697986577181208, "grad_norm": 4.20505952835083, "learning_rate": 0.00016747540983606558, "loss": 0.1863, "step": 5060 }, { "epoch": 24.795607077486274, "grad_norm": 3.8670005798339844, "learning_rate": 0.00016734426229508196, "loss": 0.1955, "step": 5080 }, { "epoch": 24.893227577791336, "grad_norm": 3.262305736541748, "learning_rate": 0.00016721311475409838, "loss": 0.2145, "step": 5100 }, { "epoch": 24.990848078096402, "grad_norm": 3.3101160526275635, "learning_rate": 0.00016708196721311474, "loss": 0.2064, "step": 5120 }, { "epoch": 25.088468578401464, "grad_norm": 2.9248740673065186, "learning_rate": 0.00016695081967213116, "loss": 0.1591, "step": 5140 }, { "epoch": 25.18608907870653, "grad_norm": 3.645301103591919, "learning_rate": 0.00016681967213114755, "loss": 0.1433, "step": 5160 }, { "epoch": 25.283709579011592, "grad_norm": 3.857302665710449, "learning_rate": 0.00016668852459016397, "loss": 0.1636, "step": 5180 }, { "epoch": 25.381330079316655, "grad_norm": 3.101661205291748, "learning_rate": 0.00016655737704918033, "loss": 0.1665, "step": 5200 }, { "epoch": 25.47895057962172, "grad_norm": 3.0728678703308105, "learning_rate": 0.00016642622950819672, "loss": 0.1769, "step": 5220 }, { "epoch": 25.576571079926783, "grad_norm": 3.7951607704162598, "learning_rate": 0.00016629508196721313, "loss": 0.1692, "step": 5240 }, { "epoch": 25.67419158023185, "grad_norm": 2.699662446975708, "learning_rate": 0.0001661639344262295, "loss": 0.1983, "step": 5260 }, { "epoch": 25.77181208053691, "grad_norm": 2.861830949783325, "learning_rate": 0.0001660327868852459, "loss": 0.1838, "step": 5280 }, { "epoch": 25.869432580841977, "grad_norm": 3.30417799949646, "learning_rate": 0.0001659016393442623, "loss": 0.1953, "step": 5300 }, { "epoch": 25.96705308114704, "grad_norm": 3.30916690826416, "learning_rate": 0.0001657704918032787, "loss": 0.2021, "step": 5320 }, { "epoch": 26.064673581452105, "grad_norm": 2.5547962188720703, "learning_rate": 0.00016563934426229508, "loss": 0.1494, "step": 5340 }, { "epoch": 26.162294081757167, "grad_norm": 2.986764907836914, "learning_rate": 0.00016550819672131147, "loss": 0.1569, "step": 5360 }, { "epoch": 26.259914582062233, "grad_norm": 3.37117862701416, "learning_rate": 0.00016537704918032789, "loss": 0.1651, "step": 5380 }, { "epoch": 26.357535082367296, "grad_norm": 2.9431986808776855, "learning_rate": 0.00016524590163934428, "loss": 0.1497, "step": 5400 }, { "epoch": 26.45515558267236, "grad_norm": 3.107166051864624, "learning_rate": 0.00016511475409836067, "loss": 0.1824, "step": 5420 }, { "epoch": 26.552776082977424, "grad_norm": 3.1725735664367676, "learning_rate": 0.00016498360655737705, "loss": 0.1624, "step": 5440 }, { "epoch": 26.65039658328249, "grad_norm": 4.063880443572998, "learning_rate": 0.00016485245901639344, "loss": 0.1613, "step": 5460 }, { "epoch": 26.748017083587552, "grad_norm": 3.5230929851531982, "learning_rate": 0.00016472131147540983, "loss": 0.1753, "step": 5480 }, { "epoch": 26.845637583892618, "grad_norm": 3.2048895359039307, "learning_rate": 0.00016459016393442622, "loss": 0.1797, "step": 5500 }, { "epoch": 26.94325808419768, "grad_norm": 4.232358455657959, "learning_rate": 0.00016445901639344264, "loss": 0.1771, "step": 5520 }, { "epoch": 27.040878584502746, "grad_norm": 3.4074463844299316, "learning_rate": 0.00016432786885245903, "loss": 0.1695, "step": 5540 }, { "epoch": 27.13849908480781, "grad_norm": 2.856752872467041, "learning_rate": 0.00016419672131147542, "loss": 0.137, "step": 5560 }, { "epoch": 27.236119585112874, "grad_norm": 3.1748337745666504, "learning_rate": 0.0001640655737704918, "loss": 0.1535, "step": 5580 }, { "epoch": 27.333740085417936, "grad_norm": 3.6794328689575195, "learning_rate": 0.0001639344262295082, "loss": 0.1598, "step": 5600 }, { "epoch": 27.431360585723002, "grad_norm": 3.439868927001953, "learning_rate": 0.0001638032786885246, "loss": 0.1465, "step": 5620 }, { "epoch": 27.528981086028065, "grad_norm": 2.997490644454956, "learning_rate": 0.00016367213114754098, "loss": 0.1686, "step": 5640 }, { "epoch": 27.62660158633313, "grad_norm": 4.262759208679199, "learning_rate": 0.0001635409836065574, "loss": 0.1589, "step": 5660 }, { "epoch": 27.724222086638193, "grad_norm": 2.9930195808410645, "learning_rate": 0.00016340983606557378, "loss": 0.1526, "step": 5680 }, { "epoch": 27.82184258694326, "grad_norm": 3.221529245376587, "learning_rate": 0.00016327868852459017, "loss": 0.1649, "step": 5700 }, { "epoch": 27.91946308724832, "grad_norm": 3.318105697631836, "learning_rate": 0.00016314754098360656, "loss": 0.1835, "step": 5720 }, { "epoch": 28.017083587553387, "grad_norm": 2.85650372505188, "learning_rate": 0.00016301639344262295, "loss": 0.164, "step": 5740 }, { "epoch": 28.11470408785845, "grad_norm": 2.4882328510284424, "learning_rate": 0.00016288524590163937, "loss": 0.1325, "step": 5760 }, { "epoch": 28.212324588163515, "grad_norm": 2.730262041091919, "learning_rate": 0.00016275409836065573, "loss": 0.1329, "step": 5780 }, { "epoch": 28.309945088468577, "grad_norm": 3.4027230739593506, "learning_rate": 0.00016262295081967214, "loss": 0.1471, "step": 5800 }, { "epoch": 28.407565588773643, "grad_norm": 3.170252799987793, "learning_rate": 0.00016249180327868853, "loss": 0.1459, "step": 5820 }, { "epoch": 28.505186089078705, "grad_norm": 2.631844997406006, "learning_rate": 0.00016236065573770492, "loss": 0.1492, "step": 5840 }, { "epoch": 28.60280658938377, "grad_norm": 3.2945570945739746, "learning_rate": 0.0001622295081967213, "loss": 0.1398, "step": 5860 }, { "epoch": 28.700427089688834, "grad_norm": 3.442495822906494, "learning_rate": 0.0001620983606557377, "loss": 0.1495, "step": 5880 }, { "epoch": 28.7980475899939, "grad_norm": 3.4794762134552, "learning_rate": 0.00016196721311475412, "loss": 0.1697, "step": 5900 }, { "epoch": 28.89566809029896, "grad_norm": 3.2846438884735107, "learning_rate": 0.00016183606557377048, "loss": 0.1603, "step": 5920 }, { "epoch": 28.993288590604028, "grad_norm": 2.8020057678222656, "learning_rate": 0.0001617049180327869, "loss": 0.1705, "step": 5940 }, { "epoch": 29.09090909090909, "grad_norm": 2.949068069458008, "learning_rate": 0.0001615737704918033, "loss": 0.1213, "step": 5960 }, { "epoch": 29.188529591214156, "grad_norm": 2.6021780967712402, "learning_rate": 0.00016144262295081968, "loss": 0.1247, "step": 5980 }, { "epoch": 29.286150091519218, "grad_norm": 2.752091884613037, "learning_rate": 0.00016131147540983607, "loss": 0.1226, "step": 6000 }, { "epoch": 29.383770591824284, "grad_norm": 3.845365285873413, "learning_rate": 0.00016118032786885245, "loss": 0.1452, "step": 6020 }, { "epoch": 29.481391092129346, "grad_norm": 3.541273832321167, "learning_rate": 0.00016104918032786887, "loss": 0.1392, "step": 6040 }, { "epoch": 29.579011592434412, "grad_norm": 3.554525375366211, "learning_rate": 0.00016091803278688526, "loss": 0.1466, "step": 6060 }, { "epoch": 29.676632092739474, "grad_norm": 3.1420257091522217, "learning_rate": 0.00016078688524590165, "loss": 0.1433, "step": 6080 }, { "epoch": 29.77425259304454, "grad_norm": 3.15902042388916, "learning_rate": 0.00016065573770491804, "loss": 0.1532, "step": 6100 }, { "epoch": 29.871873093349603, "grad_norm": 2.8925654888153076, "learning_rate": 0.00016052459016393443, "loss": 0.1499, "step": 6120 }, { "epoch": 29.96949359365467, "grad_norm": 3.0476958751678467, "learning_rate": 0.00016039344262295082, "loss": 0.1706, "step": 6140 }, { "epoch": 30.06711409395973, "grad_norm": 3.0503270626068115, "learning_rate": 0.0001602622950819672, "loss": 0.116, "step": 6160 }, { "epoch": 30.164734594264797, "grad_norm": 2.6872098445892334, "learning_rate": 0.00016013114754098362, "loss": 0.1177, "step": 6180 }, { "epoch": 30.26235509456986, "grad_norm": 3.1871418952941895, "learning_rate": 0.00016, "loss": 0.134, "step": 6200 }, { "epoch": 30.359975594874925, "grad_norm": 2.611163854598999, "learning_rate": 0.0001598688524590164, "loss": 0.1363, "step": 6220 }, { "epoch": 30.457596095179987, "grad_norm": 2.7208786010742188, "learning_rate": 0.0001597377049180328, "loss": 0.1346, "step": 6240 }, { "epoch": 30.555216595485053, "grad_norm": 3.424114227294922, "learning_rate": 0.00015960655737704918, "loss": 0.1336, "step": 6260 }, { "epoch": 30.652837095790115, "grad_norm": 2.9046945571899414, "learning_rate": 0.0001594754098360656, "loss": 0.1391, "step": 6280 }, { "epoch": 30.75045759609518, "grad_norm": 3.6124610900878906, "learning_rate": 0.00015934426229508196, "loss": 0.147, "step": 6300 }, { "epoch": 30.848078096400243, "grad_norm": 3.3340744972229004, "learning_rate": 0.00015921311475409838, "loss": 0.1471, "step": 6320 }, { "epoch": 30.94569859670531, "grad_norm": 3.6494932174682617, "learning_rate": 0.00015908196721311477, "loss": 0.1392, "step": 6340 }, { "epoch": 31.04331909701037, "grad_norm": 2.10213303565979, "learning_rate": 0.00015895081967213116, "loss": 0.1319, "step": 6360 }, { "epoch": 31.140939597315437, "grad_norm": 2.641988515853882, "learning_rate": 0.00015881967213114754, "loss": 0.1124, "step": 6380 }, { "epoch": 31.2385600976205, "grad_norm": 3.3964974880218506, "learning_rate": 0.00015868852459016393, "loss": 0.1214, "step": 6400 }, { "epoch": 31.336180597925566, "grad_norm": 3.438147783279419, "learning_rate": 0.00015855737704918035, "loss": 0.1302, "step": 6420 }, { "epoch": 31.433801098230628, "grad_norm": 3.079634189605713, "learning_rate": 0.0001584262295081967, "loss": 0.1297, "step": 6440 }, { "epoch": 31.531421598535694, "grad_norm": 4.332663536071777, "learning_rate": 0.00015829508196721313, "loss": 0.1328, "step": 6460 }, { "epoch": 31.629042098840756, "grad_norm": 3.123251438140869, "learning_rate": 0.00015816393442622952, "loss": 0.1201, "step": 6480 }, { "epoch": 31.726662599145822, "grad_norm": 2.725952386856079, "learning_rate": 0.0001580327868852459, "loss": 0.1378, "step": 6500 }, { "epoch": 31.824283099450884, "grad_norm": 3.1809983253479004, "learning_rate": 0.0001579016393442623, "loss": 0.1395, "step": 6520 }, { "epoch": 31.92190359975595, "grad_norm": 3.050304889678955, "learning_rate": 0.0001577704918032787, "loss": 0.1208, "step": 6540 }, { "epoch": 32.01952410006101, "grad_norm": 3.966635227203369, "learning_rate": 0.0001576393442622951, "loss": 0.1496, "step": 6560 }, { "epoch": 32.117144600366075, "grad_norm": 2.607820510864258, "learning_rate": 0.00015750819672131147, "loss": 0.1028, "step": 6580 }, { "epoch": 32.214765100671144, "grad_norm": 3.476501941680908, "learning_rate": 0.00015737704918032788, "loss": 0.1048, "step": 6600 }, { "epoch": 32.31238560097621, "grad_norm": 2.745434284210205, "learning_rate": 0.00015724590163934427, "loss": 0.1173, "step": 6620 }, { "epoch": 32.41000610128127, "grad_norm": 3.01653790473938, "learning_rate": 0.00015711475409836066, "loss": 0.1256, "step": 6640 }, { "epoch": 32.50762660158633, "grad_norm": 2.9087026119232178, "learning_rate": 0.00015698360655737705, "loss": 0.1343, "step": 6660 }, { "epoch": 32.6052471018914, "grad_norm": 2.6568126678466797, "learning_rate": 0.00015685245901639344, "loss": 0.1248, "step": 6680 }, { "epoch": 32.70286760219646, "grad_norm": 3.052931547164917, "learning_rate": 0.00015672131147540986, "loss": 0.1343, "step": 6700 }, { "epoch": 32.800488102501525, "grad_norm": 3.3897056579589844, "learning_rate": 0.00015659016393442622, "loss": 0.136, "step": 6720 }, { "epoch": 32.89810860280659, "grad_norm": 3.269697904586792, "learning_rate": 0.00015645901639344263, "loss": 0.1304, "step": 6740 }, { "epoch": 32.99572910311166, "grad_norm": 3.2755305767059326, "learning_rate": 0.00015632786885245902, "loss": 0.1368, "step": 6760 }, { "epoch": 33.09334960341672, "grad_norm": 1.8959623575210571, "learning_rate": 0.0001561967213114754, "loss": 0.1067, "step": 6780 }, { "epoch": 33.19097010372178, "grad_norm": 2.583085536956787, "learning_rate": 0.0001560655737704918, "loss": 0.1031, "step": 6800 }, { "epoch": 33.288590604026844, "grad_norm": 2.4109268188476562, "learning_rate": 0.0001559344262295082, "loss": 0.1035, "step": 6820 }, { "epoch": 33.38621110433191, "grad_norm": 3.176445722579956, "learning_rate": 0.0001558032786885246, "loss": 0.1153, "step": 6840 }, { "epoch": 33.483831604636975, "grad_norm": 3.5049378871917725, "learning_rate": 0.000155672131147541, "loss": 0.1195, "step": 6860 }, { "epoch": 33.58145210494204, "grad_norm": 2.7021915912628174, "learning_rate": 0.0001555409836065574, "loss": 0.1205, "step": 6880 }, { "epoch": 33.6790726052471, "grad_norm": 4.213137626647949, "learning_rate": 0.00015540983606557378, "loss": 0.1192, "step": 6900 }, { "epoch": 33.77669310555217, "grad_norm": 3.276128053665161, "learning_rate": 0.00015527868852459017, "loss": 0.1252, "step": 6920 }, { "epoch": 33.87431360585723, "grad_norm": 3.3642396926879883, "learning_rate": 0.00015514754098360656, "loss": 0.1234, "step": 6940 }, { "epoch": 33.971934106162294, "grad_norm": 3.0481181144714355, "learning_rate": 0.00015501639344262294, "loss": 0.1464, "step": 6960 }, { "epoch": 34.06955460646736, "grad_norm": 2.147581100463867, "learning_rate": 0.00015488524590163936, "loss": 0.1029, "step": 6980 }, { "epoch": 34.16717510677242, "grad_norm": 2.751429557800293, "learning_rate": 0.00015475409836065575, "loss": 0.103, "step": 7000 }, { "epoch": 34.26479560707749, "grad_norm": 2.6378426551818848, "learning_rate": 0.00015462295081967214, "loss": 0.1146, "step": 7020 }, { "epoch": 34.36241610738255, "grad_norm": 2.744255304336548, "learning_rate": 0.00015449180327868853, "loss": 0.106, "step": 7040 }, { "epoch": 34.46003660768761, "grad_norm": 3.518716812133789, "learning_rate": 0.00015436065573770492, "loss": 0.1146, "step": 7060 }, { "epoch": 34.557657107992675, "grad_norm": 4.025069236755371, "learning_rate": 0.00015422950819672133, "loss": 0.1173, "step": 7080 }, { "epoch": 34.655277608297745, "grad_norm": 2.4882397651672363, "learning_rate": 0.0001540983606557377, "loss": 0.1118, "step": 7100 }, { "epoch": 34.75289810860281, "grad_norm": 2.998798131942749, "learning_rate": 0.00015396721311475411, "loss": 0.1277, "step": 7120 }, { "epoch": 34.85051860890787, "grad_norm": 3.375034809112549, "learning_rate": 0.0001538360655737705, "loss": 0.1137, "step": 7140 }, { "epoch": 34.94813910921293, "grad_norm": 2.8172638416290283, "learning_rate": 0.0001537049180327869, "loss": 0.1083, "step": 7160 }, { "epoch": 35.045759609518, "grad_norm": 2.669013500213623, "learning_rate": 0.00015357377049180328, "loss": 0.1086, "step": 7180 }, { "epoch": 35.14338010982306, "grad_norm": 3.0433802604675293, "learning_rate": 0.00015344262295081967, "loss": 0.0921, "step": 7200 }, { "epoch": 35.241000610128125, "grad_norm": 3.2270560264587402, "learning_rate": 0.0001533114754098361, "loss": 0.0991, "step": 7220 }, { "epoch": 35.33862111043319, "grad_norm": 4.046402931213379, "learning_rate": 0.00015318032786885245, "loss": 0.0979, "step": 7240 }, { "epoch": 35.43624161073826, "grad_norm": 3.2767364978790283, "learning_rate": 0.00015304918032786887, "loss": 0.1098, "step": 7260 }, { "epoch": 35.53386211104332, "grad_norm": 3.503641366958618, "learning_rate": 0.00015291803278688526, "loss": 0.1093, "step": 7280 }, { "epoch": 35.63148261134838, "grad_norm": 2.7504665851593018, "learning_rate": 0.00015278688524590165, "loss": 0.1087, "step": 7300 }, { "epoch": 35.729103111653444, "grad_norm": 2.6193149089813232, "learning_rate": 0.00015265573770491803, "loss": 0.1207, "step": 7320 }, { "epoch": 35.82672361195851, "grad_norm": 3.204240322113037, "learning_rate": 0.00015252459016393442, "loss": 0.1189, "step": 7340 }, { "epoch": 35.924344112263576, "grad_norm": 3.0229642391204834, "learning_rate": 0.00015239344262295084, "loss": 0.1221, "step": 7360 }, { "epoch": 36.02196461256864, "grad_norm": 2.1176633834838867, "learning_rate": 0.0001522622950819672, "loss": 0.1065, "step": 7380 }, { "epoch": 36.1195851128737, "grad_norm": 2.1961264610290527, "learning_rate": 0.00015213114754098362, "loss": 0.0946, "step": 7400 }, { "epoch": 36.21720561317877, "grad_norm": 2.6912176609039307, "learning_rate": 0.000152, "loss": 0.1052, "step": 7420 }, { "epoch": 36.31482611348383, "grad_norm": 2.8034870624542236, "learning_rate": 0.0001518688524590164, "loss": 0.0935, "step": 7440 }, { "epoch": 36.412446613788894, "grad_norm": 2.85784649848938, "learning_rate": 0.0001517377049180328, "loss": 0.0984, "step": 7460 }, { "epoch": 36.51006711409396, "grad_norm": 3.637848377227783, "learning_rate": 0.00015160655737704918, "loss": 0.1134, "step": 7480 }, { "epoch": 36.607687614399026, "grad_norm": 3.2717106342315674, "learning_rate": 0.0001514754098360656, "loss": 0.1059, "step": 7500 }, { "epoch": 36.70530811470409, "grad_norm": 2.9539332389831543, "learning_rate": 0.00015134426229508198, "loss": 0.1033, "step": 7520 }, { "epoch": 36.80292861500915, "grad_norm": 3.410210132598877, "learning_rate": 0.00015121311475409837, "loss": 0.1107, "step": 7540 }, { "epoch": 36.90054911531421, "grad_norm": 3.643808126449585, "learning_rate": 0.00015108196721311476, "loss": 0.118, "step": 7560 }, { "epoch": 36.99816961561928, "grad_norm": 2.9359118938446045, "learning_rate": 0.00015095081967213115, "loss": 0.1167, "step": 7580 }, { "epoch": 37.095790115924345, "grad_norm": 2.369500160217285, "learning_rate": 0.00015081967213114754, "loss": 0.0853, "step": 7600 }, { "epoch": 37.19341061622941, "grad_norm": 2.8940887451171875, "learning_rate": 0.00015068852459016393, "loss": 0.0951, "step": 7620 }, { "epoch": 37.29103111653447, "grad_norm": 2.3819053173065186, "learning_rate": 0.00015055737704918035, "loss": 0.1047, "step": 7640 }, { "epoch": 37.38865161683954, "grad_norm": 2.7095139026641846, "learning_rate": 0.00015042622950819673, "loss": 0.1004, "step": 7660 }, { "epoch": 37.4862721171446, "grad_norm": 2.892263650894165, "learning_rate": 0.00015029508196721312, "loss": 0.1031, "step": 7680 }, { "epoch": 37.58389261744966, "grad_norm": 3.0117244720458984, "learning_rate": 0.00015016393442622951, "loss": 0.1036, "step": 7700 }, { "epoch": 37.681513117754726, "grad_norm": 2.048797130584717, "learning_rate": 0.0001500327868852459, "loss": 0.0975, "step": 7720 }, { "epoch": 37.779133618059795, "grad_norm": 3.051039218902588, "learning_rate": 0.00014990163934426232, "loss": 0.1087, "step": 7740 }, { "epoch": 37.87675411836486, "grad_norm": 2.2224767208099365, "learning_rate": 0.00014977049180327868, "loss": 0.1072, "step": 7760 }, { "epoch": 37.97437461866992, "grad_norm": 2.8031728267669678, "learning_rate": 0.0001496393442622951, "loss": 0.1017, "step": 7780 }, { "epoch": 38.07199511897498, "grad_norm": 2.108718156814575, "learning_rate": 0.0001495081967213115, "loss": 0.0921, "step": 7800 }, { "epoch": 38.16961561928005, "grad_norm": 2.253871202468872, "learning_rate": 0.00014937704918032788, "loss": 0.0898, "step": 7820 }, { "epoch": 38.267236119585114, "grad_norm": 2.5459353923797607, "learning_rate": 0.00014924590163934427, "loss": 0.0903, "step": 7840 }, { "epoch": 38.364856619890176, "grad_norm": 2.238715171813965, "learning_rate": 0.00014911475409836066, "loss": 0.0906, "step": 7860 }, { "epoch": 38.46247712019524, "grad_norm": 2.611955404281616, "learning_rate": 0.00014898360655737707, "loss": 0.0882, "step": 7880 }, { "epoch": 38.56009762050031, "grad_norm": 3.3407866954803467, "learning_rate": 0.00014885245901639343, "loss": 0.0994, "step": 7900 }, { "epoch": 38.65771812080537, "grad_norm": 2.6214687824249268, "learning_rate": 0.00014872131147540985, "loss": 0.0918, "step": 7920 }, { "epoch": 38.75533862111043, "grad_norm": 3.7567331790924072, "learning_rate": 0.00014859016393442624, "loss": 0.1039, "step": 7940 }, { "epoch": 38.852959121415495, "grad_norm": 3.809807538986206, "learning_rate": 0.00014845901639344263, "loss": 0.1081, "step": 7960 }, { "epoch": 38.950579621720564, "grad_norm": 3.56388258934021, "learning_rate": 0.00014832786885245902, "loss": 0.108, "step": 7980 }, { "epoch": 39.04820012202563, "grad_norm": 2.5101027488708496, "learning_rate": 0.0001481967213114754, "loss": 0.0941, "step": 8000 }, { "epoch": 39.14582062233069, "grad_norm": 2.393336296081543, "learning_rate": 0.00014806557377049182, "loss": 0.0786, "step": 8020 }, { "epoch": 39.24344112263575, "grad_norm": 3.982912063598633, "learning_rate": 0.0001479344262295082, "loss": 0.0848, "step": 8040 }, { "epoch": 39.34106162294082, "grad_norm": 2.0777554512023926, "learning_rate": 0.0001478032786885246, "loss": 0.0971, "step": 8060 }, { "epoch": 39.43868212324588, "grad_norm": 3.667409658432007, "learning_rate": 0.000147672131147541, "loss": 0.0943, "step": 8080 }, { "epoch": 39.536302623550945, "grad_norm": 3.234408378601074, "learning_rate": 0.00014754098360655738, "loss": 0.1012, "step": 8100 }, { "epoch": 39.63392312385601, "grad_norm": 2.9830737113952637, "learning_rate": 0.00014740983606557377, "loss": 0.0889, "step": 8120 }, { "epoch": 39.73154362416108, "grad_norm": 2.235419750213623, "learning_rate": 0.00014727868852459016, "loss": 0.0994, "step": 8140 }, { "epoch": 39.82916412446614, "grad_norm": 2.695769786834717, "learning_rate": 0.00014714754098360658, "loss": 0.0952, "step": 8160 }, { "epoch": 39.9267846247712, "grad_norm": 2.6697440147399902, "learning_rate": 0.00014701639344262297, "loss": 0.1067, "step": 8180 }, { "epoch": 40.024405125076264, "grad_norm": 2.091428756713867, "learning_rate": 0.00014688524590163936, "loss": 0.0915, "step": 8200 }, { "epoch": 40.12202562538133, "grad_norm": 2.332493305206299, "learning_rate": 0.00014675409836065575, "loss": 0.0786, "step": 8220 }, { "epoch": 40.219646125686396, "grad_norm": 2.596400260925293, "learning_rate": 0.00014662295081967214, "loss": 0.0855, "step": 8240 }, { "epoch": 40.31726662599146, "grad_norm": 2.1886463165283203, "learning_rate": 0.00014649180327868852, "loss": 0.0823, "step": 8260 }, { "epoch": 40.41488712629652, "grad_norm": 3.2993671894073486, "learning_rate": 0.00014636065573770491, "loss": 0.0892, "step": 8280 }, { "epoch": 40.51250762660159, "grad_norm": 2.4500246047973633, "learning_rate": 0.00014622950819672133, "loss": 0.087, "step": 8300 }, { "epoch": 40.61012812690665, "grad_norm": 2.4187991619110107, "learning_rate": 0.00014609836065573772, "loss": 0.0911, "step": 8320 }, { "epoch": 40.707748627211714, "grad_norm": 2.3600058555603027, "learning_rate": 0.0001459672131147541, "loss": 0.0916, "step": 8340 }, { "epoch": 40.80536912751678, "grad_norm": 2.7269983291625977, "learning_rate": 0.0001458360655737705, "loss": 0.0986, "step": 8360 }, { "epoch": 40.902989627821846, "grad_norm": 2.907344102859497, "learning_rate": 0.0001457049180327869, "loss": 0.1055, "step": 8380 }, { "epoch": 41.00061012812691, "grad_norm": 2.574118137359619, "learning_rate": 0.0001455737704918033, "loss": 0.099, "step": 8400 }, { "epoch": 41.09823062843197, "grad_norm": 2.291900634765625, "learning_rate": 0.00014544262295081967, "loss": 0.0676, "step": 8420 }, { "epoch": 41.19585112873703, "grad_norm": 2.241544246673584, "learning_rate": 0.00014531147540983608, "loss": 0.0795, "step": 8440 }, { "epoch": 41.2934716290421, "grad_norm": 2.6499416828155518, "learning_rate": 0.00014518032786885247, "loss": 0.083, "step": 8460 }, { "epoch": 41.391092129347165, "grad_norm": 2.436126947402954, "learning_rate": 0.00014504918032786886, "loss": 0.0927, "step": 8480 }, { "epoch": 41.48871262965223, "grad_norm": 3.5703392028808594, "learning_rate": 0.00014491803278688525, "loss": 0.0804, "step": 8500 }, { "epoch": 41.58633312995729, "grad_norm": 2.8533120155334473, "learning_rate": 0.00014478688524590164, "loss": 0.0871, "step": 8520 }, { "epoch": 41.68395363026236, "grad_norm": 2.9216787815093994, "learning_rate": 0.00014465573770491806, "loss": 0.091, "step": 8540 }, { "epoch": 41.78157413056742, "grad_norm": 2.6128993034362793, "learning_rate": 0.00014452459016393442, "loss": 0.099, "step": 8560 }, { "epoch": 41.87919463087248, "grad_norm": 3.376518726348877, "learning_rate": 0.00014439344262295084, "loss": 0.0926, "step": 8580 }, { "epoch": 41.976815131177545, "grad_norm": 2.5684266090393066, "learning_rate": 0.00014426229508196722, "loss": 0.1061, "step": 8600 }, { "epoch": 42.074435631482615, "grad_norm": 2.173293113708496, "learning_rate": 0.00014413114754098361, "loss": 0.0744, "step": 8620 }, { "epoch": 42.17205613178768, "grad_norm": 1.6150739192962646, "learning_rate": 0.000144, "loss": 0.0802, "step": 8640 }, { "epoch": 42.26967663209274, "grad_norm": 2.8161544799804688, "learning_rate": 0.0001438688524590164, "loss": 0.0825, "step": 8660 }, { "epoch": 42.3672971323978, "grad_norm": 2.475748062133789, "learning_rate": 0.0001437377049180328, "loss": 0.0792, "step": 8680 }, { "epoch": 42.464917632702864, "grad_norm": 2.540273427963257, "learning_rate": 0.00014360655737704917, "loss": 0.0895, "step": 8700 }, { "epoch": 42.56253813300793, "grad_norm": 3.0831921100616455, "learning_rate": 0.0001434754098360656, "loss": 0.0834, "step": 8720 }, { "epoch": 42.660158633312996, "grad_norm": 2.467629909515381, "learning_rate": 0.00014334426229508198, "loss": 0.0816, "step": 8740 }, { "epoch": 42.75777913361806, "grad_norm": 2.2306694984436035, "learning_rate": 0.00014321311475409837, "loss": 0.0933, "step": 8760 }, { "epoch": 42.85539963392312, "grad_norm": 2.662369728088379, "learning_rate": 0.00014308196721311476, "loss": 0.0906, "step": 8780 }, { "epoch": 42.95302013422819, "grad_norm": 3.169682741165161, "learning_rate": 0.00014295081967213115, "loss": 0.0888, "step": 8800 }, { "epoch": 43.05064063453325, "grad_norm": 3.4279258251190186, "learning_rate": 0.00014281967213114756, "loss": 0.0816, "step": 8820 }, { "epoch": 43.148261134838314, "grad_norm": 2.4985101222991943, "learning_rate": 0.00014268852459016395, "loss": 0.0789, "step": 8840 }, { "epoch": 43.24588163514338, "grad_norm": 3.5532596111297607, "learning_rate": 0.00014255737704918034, "loss": 0.0711, "step": 8860 }, { "epoch": 43.343502135448446, "grad_norm": 2.270078420639038, "learning_rate": 0.00014242622950819673, "loss": 0.0753, "step": 8880 }, { "epoch": 43.44112263575351, "grad_norm": 2.114807367324829, "learning_rate": 0.00014229508196721312, "loss": 0.0751, "step": 8900 }, { "epoch": 43.53874313605857, "grad_norm": 3.013507843017578, "learning_rate": 0.0001421639344262295, "loss": 0.0858, "step": 8920 }, { "epoch": 43.63636363636363, "grad_norm": 2.022995948791504, "learning_rate": 0.0001420327868852459, "loss": 0.0803, "step": 8940 }, { "epoch": 43.7339841366687, "grad_norm": 2.6093649864196777, "learning_rate": 0.00014190163934426231, "loss": 0.0828, "step": 8960 }, { "epoch": 43.831604636973765, "grad_norm": 3.3971455097198486, "learning_rate": 0.0001417704918032787, "loss": 0.0909, "step": 8980 }, { "epoch": 43.92922513727883, "grad_norm": 2.7203516960144043, "learning_rate": 0.0001416393442622951, "loss": 0.0884, "step": 9000 }, { "epoch": 44.02684563758389, "grad_norm": 1.8878543376922607, "learning_rate": 0.00014150819672131148, "loss": 0.0782, "step": 9020 }, { "epoch": 44.12446613788896, "grad_norm": 2.146353006362915, "learning_rate": 0.00014137704918032787, "loss": 0.0702, "step": 9040 }, { "epoch": 44.22208663819402, "grad_norm": 2.4737651348114014, "learning_rate": 0.00014124590163934426, "loss": 0.0762, "step": 9060 }, { "epoch": 44.31970713849908, "grad_norm": 2.3397083282470703, "learning_rate": 0.00014111475409836065, "loss": 0.072, "step": 9080 }, { "epoch": 44.417327638804146, "grad_norm": 1.8726561069488525, "learning_rate": 0.00014098360655737707, "loss": 0.068, "step": 9100 }, { "epoch": 44.514948139109215, "grad_norm": 2.164581775665283, "learning_rate": 0.00014085245901639346, "loss": 0.0767, "step": 9120 }, { "epoch": 44.61256863941428, "grad_norm": 3.2708849906921387, "learning_rate": 0.00014072131147540985, "loss": 0.0842, "step": 9140 }, { "epoch": 44.71018913971934, "grad_norm": 3.566901206970215, "learning_rate": 0.00014059016393442624, "loss": 0.0813, "step": 9160 }, { "epoch": 44.8078096400244, "grad_norm": 3.0178351402282715, "learning_rate": 0.00014045901639344262, "loss": 0.0837, "step": 9180 }, { "epoch": 44.90543014032947, "grad_norm": 3.19291353225708, "learning_rate": 0.00014032786885245904, "loss": 0.0886, "step": 9200 }, { "epoch": 45.003050640634534, "grad_norm": 2.640066146850586, "learning_rate": 0.0001401967213114754, "loss": 0.0842, "step": 9220 }, { "epoch": 45.100671140939596, "grad_norm": 2.125232458114624, "learning_rate": 0.00014006557377049182, "loss": 0.0657, "step": 9240 }, { "epoch": 45.19829164124466, "grad_norm": 2.5191051959991455, "learning_rate": 0.0001399344262295082, "loss": 0.0708, "step": 9260 }, { "epoch": 45.29591214154973, "grad_norm": 1.9102871417999268, "learning_rate": 0.0001398032786885246, "loss": 0.0665, "step": 9280 }, { "epoch": 45.39353264185479, "grad_norm": 2.4303011894226074, "learning_rate": 0.000139672131147541, "loss": 0.0737, "step": 9300 }, { "epoch": 45.49115314215985, "grad_norm": 2.9228925704956055, "learning_rate": 0.00013954098360655738, "loss": 0.0731, "step": 9320 }, { "epoch": 45.588773642464915, "grad_norm": 2.2215464115142822, "learning_rate": 0.0001394098360655738, "loss": 0.0778, "step": 9340 }, { "epoch": 45.686394142769984, "grad_norm": 3.2468485832214355, "learning_rate": 0.00013927868852459016, "loss": 0.0799, "step": 9360 }, { "epoch": 45.78401464307505, "grad_norm": 2.814979076385498, "learning_rate": 0.00013914754098360657, "loss": 0.0857, "step": 9380 }, { "epoch": 45.88163514338011, "grad_norm": 2.6760339736938477, "learning_rate": 0.00013901639344262296, "loss": 0.0762, "step": 9400 }, { "epoch": 45.97925564368517, "grad_norm": 2.8788065910339355, "learning_rate": 0.00013888524590163935, "loss": 0.0881, "step": 9420 }, { "epoch": 46.07687614399024, "grad_norm": 2.4430296421051025, "learning_rate": 0.00013875409836065574, "loss": 0.0656, "step": 9440 }, { "epoch": 46.1744966442953, "grad_norm": 2.3718602657318115, "learning_rate": 0.00013862295081967213, "loss": 0.0667, "step": 9460 }, { "epoch": 46.272117144600365, "grad_norm": 2.2495276927948, "learning_rate": 0.00013849180327868855, "loss": 0.0741, "step": 9480 }, { "epoch": 46.36973764490543, "grad_norm": 3.612171173095703, "learning_rate": 0.0001383606557377049, "loss": 0.0751, "step": 9500 }, { "epoch": 46.4673581452105, "grad_norm": 2.656705379486084, "learning_rate": 0.00013822950819672133, "loss": 0.0765, "step": 9520 }, { "epoch": 46.56497864551556, "grad_norm": 2.154595136642456, "learning_rate": 0.00013809836065573771, "loss": 0.0772, "step": 9540 }, { "epoch": 46.66259914582062, "grad_norm": 1.911001205444336, "learning_rate": 0.0001379672131147541, "loss": 0.0708, "step": 9560 }, { "epoch": 46.760219646125684, "grad_norm": 3.0681228637695312, "learning_rate": 0.0001378360655737705, "loss": 0.0803, "step": 9580 }, { "epoch": 46.85784014643075, "grad_norm": 2.6590416431427, "learning_rate": 0.00013770491803278688, "loss": 0.0784, "step": 9600 }, { "epoch": 46.955460646735816, "grad_norm": 2.7904417514801025, "learning_rate": 0.0001375737704918033, "loss": 0.0775, "step": 9620 }, { "epoch": 47.05308114704088, "grad_norm": 1.802498459815979, "learning_rate": 0.0001374426229508197, "loss": 0.0747, "step": 9640 }, { "epoch": 47.15070164734594, "grad_norm": 1.8101750612258911, "learning_rate": 0.00013731147540983608, "loss": 0.0648, "step": 9660 }, { "epoch": 47.24832214765101, "grad_norm": 3.6735594272613525, "learning_rate": 0.00013718032786885247, "loss": 0.0657, "step": 9680 }, { "epoch": 47.34594264795607, "grad_norm": 2.0004684925079346, "learning_rate": 0.00013704918032786886, "loss": 0.0628, "step": 9700 }, { "epoch": 47.443563148261134, "grad_norm": 2.4625582695007324, "learning_rate": 0.00013691803278688525, "loss": 0.0654, "step": 9720 }, { "epoch": 47.5411836485662, "grad_norm": 2.3431763648986816, "learning_rate": 0.00013678688524590164, "loss": 0.0741, "step": 9740 }, { "epoch": 47.638804148871266, "grad_norm": 3.0707414150238037, "learning_rate": 0.00013665573770491805, "loss": 0.0734, "step": 9760 }, { "epoch": 47.73642464917633, "grad_norm": 3.3095669746398926, "learning_rate": 0.00013652459016393444, "loss": 0.0793, "step": 9780 }, { "epoch": 47.83404514948139, "grad_norm": 2.388031482696533, "learning_rate": 0.00013639344262295083, "loss": 0.0789, "step": 9800 }, { "epoch": 47.93166564978645, "grad_norm": 2.9367451667785645, "learning_rate": 0.00013626229508196722, "loss": 0.0798, "step": 9820 }, { "epoch": 48.02928615009152, "grad_norm": 3.24287486076355, "learning_rate": 0.0001361311475409836, "loss": 0.0754, "step": 9840 }, { "epoch": 48.126906650396585, "grad_norm": 2.868478536605835, "learning_rate": 0.00013600000000000003, "loss": 0.056, "step": 9860 }, { "epoch": 48.22452715070165, "grad_norm": 1.8352900743484497, "learning_rate": 0.0001358688524590164, "loss": 0.0686, "step": 9880 }, { "epoch": 48.32214765100671, "grad_norm": 1.727157711982727, "learning_rate": 0.0001357377049180328, "loss": 0.0618, "step": 9900 }, { "epoch": 48.41976815131178, "grad_norm": 1.8096739053726196, "learning_rate": 0.0001356065573770492, "loss": 0.0694, "step": 9920 }, { "epoch": 48.51738865161684, "grad_norm": 2.4526126384735107, "learning_rate": 0.00013547540983606556, "loss": 0.0762, "step": 9940 }, { "epoch": 48.6150091519219, "grad_norm": 2.2709124088287354, "learning_rate": 0.00013534426229508197, "loss": 0.0764, "step": 9960 }, { "epoch": 48.712629652226966, "grad_norm": 3.1081132888793945, "learning_rate": 0.00013521311475409836, "loss": 0.0682, "step": 9980 }, { "epoch": 48.810250152532035, "grad_norm": 2.288539171218872, "learning_rate": 0.00013508196721311478, "loss": 0.0737, "step": 10000 }, { "epoch": 48.9078706528371, "grad_norm": 2.9011008739471436, "learning_rate": 0.00013495081967213114, "loss": 0.0756, "step": 10020 }, { "epoch": 49.00549115314216, "grad_norm": 2.3980066776275635, "learning_rate": 0.00013481967213114756, "loss": 0.0778, "step": 10040 }, { "epoch": 49.10311165344722, "grad_norm": 2.349588394165039, "learning_rate": 0.00013468852459016395, "loss": 0.0515, "step": 10060 }, { "epoch": 49.20073215375229, "grad_norm": 1.9128116369247437, "learning_rate": 0.00013455737704918034, "loss": 0.06, "step": 10080 }, { "epoch": 49.298352654057354, "grad_norm": 3.1744940280914307, "learning_rate": 0.00013442622950819673, "loss": 0.065, "step": 10100 }, { "epoch": 49.395973154362416, "grad_norm": 4.267724990844727, "learning_rate": 0.00013429508196721311, "loss": 0.0681, "step": 10120 }, { "epoch": 49.49359365466748, "grad_norm": 2.734342575073242, "learning_rate": 0.00013416393442622953, "loss": 0.0696, "step": 10140 }, { "epoch": 49.59121415497255, "grad_norm": 2.0303688049316406, "learning_rate": 0.0001340327868852459, "loss": 0.0666, "step": 10160 }, { "epoch": 49.68883465527761, "grad_norm": 2.434067964553833, "learning_rate": 0.0001339016393442623, "loss": 0.0742, "step": 10180 }, { "epoch": 49.78645515558267, "grad_norm": 2.633126735687256, "learning_rate": 0.0001337704918032787, "loss": 0.0741, "step": 10200 }, { "epoch": 49.884075655887735, "grad_norm": 3.312375545501709, "learning_rate": 0.0001336393442622951, "loss": 0.0773, "step": 10220 }, { "epoch": 49.981696156192804, "grad_norm": 2.251260280609131, "learning_rate": 0.00013350819672131148, "loss": 0.0795, "step": 10240 }, { "epoch": 50.079316656497866, "grad_norm": 2.0748329162597656, "learning_rate": 0.00013337704918032787, "loss": 0.061, "step": 10260 }, { "epoch": 50.17693715680293, "grad_norm": 2.7968759536743164, "learning_rate": 0.00013324590163934428, "loss": 0.0634, "step": 10280 }, { "epoch": 50.27455765710799, "grad_norm": 2.5612518787384033, "learning_rate": 0.00013311475409836067, "loss": 0.0632, "step": 10300 }, { "epoch": 50.37217815741306, "grad_norm": 3.2175679206848145, "learning_rate": 0.00013298360655737706, "loss": 0.0642, "step": 10320 }, { "epoch": 50.46979865771812, "grad_norm": 2.260554790496826, "learning_rate": 0.00013285245901639345, "loss": 0.0609, "step": 10340 }, { "epoch": 50.567419158023185, "grad_norm": 2.8259806632995605, "learning_rate": 0.00013272131147540984, "loss": 0.0735, "step": 10360 }, { "epoch": 50.66503965832825, "grad_norm": 2.393419027328491, "learning_rate": 0.00013259016393442623, "loss": 0.0672, "step": 10380 }, { "epoch": 50.76266015863331, "grad_norm": 2.3658840656280518, "learning_rate": 0.00013245901639344262, "loss": 0.0713, "step": 10400 }, { "epoch": 50.86028065893838, "grad_norm": 2.526512384414673, "learning_rate": 0.00013232786885245904, "loss": 0.0699, "step": 10420 }, { "epoch": 50.95790115924344, "grad_norm": 2.856234073638916, "learning_rate": 0.00013219672131147543, "loss": 0.0713, "step": 10440 }, { "epoch": 51.0555216595485, "grad_norm": 1.5620033740997314, "learning_rate": 0.0001320655737704918, "loss": 0.06, "step": 10460 }, { "epoch": 51.153142159853566, "grad_norm": 3.7794861793518066, "learning_rate": 0.0001319344262295082, "loss": 0.0581, "step": 10480 }, { "epoch": 51.250762660158635, "grad_norm": 2.4568800926208496, "learning_rate": 0.0001318032786885246, "loss": 0.059, "step": 10500 }, { "epoch": 51.3483831604637, "grad_norm": 2.32920503616333, "learning_rate": 0.000131672131147541, "loss": 0.0629, "step": 10520 }, { "epoch": 51.44600366076876, "grad_norm": 2.1552681922912598, "learning_rate": 0.00013154098360655737, "loss": 0.0646, "step": 10540 }, { "epoch": 51.54362416107382, "grad_norm": 2.146484375, "learning_rate": 0.0001314098360655738, "loss": 0.064, "step": 10560 }, { "epoch": 51.64124466137889, "grad_norm": 2.428022623062134, "learning_rate": 0.00013127868852459018, "loss": 0.0644, "step": 10580 }, { "epoch": 51.738865161683954, "grad_norm": 3.3938937187194824, "learning_rate": 0.00013114754098360654, "loss": 0.0736, "step": 10600 }, { "epoch": 51.836485661989016, "grad_norm": 2.2218217849731445, "learning_rate": 0.00013101639344262296, "loss": 0.0699, "step": 10620 }, { "epoch": 51.93410616229408, "grad_norm": 2.2436861991882324, "learning_rate": 0.00013088524590163935, "loss": 0.0718, "step": 10640 }, { "epoch": 52.03172666259915, "grad_norm": 1.7175501585006714, "learning_rate": 0.00013075409836065576, "loss": 0.0667, "step": 10660 }, { "epoch": 52.12934716290421, "grad_norm": 2.127976417541504, "learning_rate": 0.00013062295081967213, "loss": 0.0531, "step": 10680 }, { "epoch": 52.22696766320927, "grad_norm": 1.9709439277648926, "learning_rate": 0.00013049180327868854, "loss": 0.0553, "step": 10700 }, { "epoch": 52.324588163514335, "grad_norm": 2.644279718399048, "learning_rate": 0.00013036065573770493, "loss": 0.0624, "step": 10720 }, { "epoch": 52.422208663819404, "grad_norm": 1.570968747138977, "learning_rate": 0.00013022950819672132, "loss": 0.061, "step": 10740 }, { "epoch": 52.51982916412447, "grad_norm": 2.2433953285217285, "learning_rate": 0.0001300983606557377, "loss": 0.0642, "step": 10760 }, { "epoch": 52.61744966442953, "grad_norm": 2.019834280014038, "learning_rate": 0.0001299672131147541, "loss": 0.0681, "step": 10780 }, { "epoch": 52.71507016473459, "grad_norm": 2.366257429122925, "learning_rate": 0.00012983606557377052, "loss": 0.0697, "step": 10800 }, { "epoch": 52.81269066503966, "grad_norm": 2.3033928871154785, "learning_rate": 0.00012970491803278688, "loss": 0.0631, "step": 10820 }, { "epoch": 52.91031116534472, "grad_norm": 2.430032968521118, "learning_rate": 0.0001295737704918033, "loss": 0.0676, "step": 10840 }, { "epoch": 53.007931665649785, "grad_norm": 1.5559980869293213, "learning_rate": 0.00012944262295081968, "loss": 0.063, "step": 10860 }, { "epoch": 53.10555216595485, "grad_norm": 2.612412452697754, "learning_rate": 0.00012931147540983607, "loss": 0.0566, "step": 10880 }, { "epoch": 53.20317266625992, "grad_norm": 2.391444444656372, "learning_rate": 0.00012918032786885246, "loss": 0.0565, "step": 10900 }, { "epoch": 53.30079316656498, "grad_norm": 2.6666293144226074, "learning_rate": 0.00012904918032786885, "loss": 0.0598, "step": 10920 }, { "epoch": 53.39841366687004, "grad_norm": 2.613943576812744, "learning_rate": 0.00012891803278688527, "loss": 0.063, "step": 10940 }, { "epoch": 53.496034167175104, "grad_norm": 2.273446559906006, "learning_rate": 0.00012878688524590166, "loss": 0.0588, "step": 10960 }, { "epoch": 53.59365466748017, "grad_norm": 2.058150291442871, "learning_rate": 0.00012865573770491802, "loss": 0.0588, "step": 10980 }, { "epoch": 53.691275167785236, "grad_norm": 2.4559085369110107, "learning_rate": 0.00012852459016393444, "loss": 0.0648, "step": 11000 }, { "epoch": 53.7888956680903, "grad_norm": 2.1946640014648438, "learning_rate": 0.00012839344262295083, "loss": 0.0689, "step": 11020 }, { "epoch": 53.88651616839536, "grad_norm": 2.7223618030548096, "learning_rate": 0.00012826229508196722, "loss": 0.0637, "step": 11040 }, { "epoch": 53.98413666870043, "grad_norm": 1.8997637033462524, "learning_rate": 0.0001281311475409836, "loss": 0.0624, "step": 11060 }, { "epoch": 54.08175716900549, "grad_norm": 1.6322365999221802, "learning_rate": 0.00012800000000000002, "loss": 0.0529, "step": 11080 }, { "epoch": 54.179377669310554, "grad_norm": 1.6490808725357056, "learning_rate": 0.0001278688524590164, "loss": 0.0557, "step": 11100 }, { "epoch": 54.27699816961562, "grad_norm": 1.3738045692443848, "learning_rate": 0.00012773770491803277, "loss": 0.0553, "step": 11120 }, { "epoch": 54.374618669920686, "grad_norm": 1.990561604499817, "learning_rate": 0.0001276065573770492, "loss": 0.0555, "step": 11140 }, { "epoch": 54.47223917022575, "grad_norm": 1.8162143230438232, "learning_rate": 0.00012747540983606558, "loss": 0.0572, "step": 11160 }, { "epoch": 54.56985967053081, "grad_norm": 1.980643391609192, "learning_rate": 0.000127344262295082, "loss": 0.0689, "step": 11180 }, { "epoch": 54.66748017083587, "grad_norm": 2.3956120014190674, "learning_rate": 0.00012721311475409836, "loss": 0.0616, "step": 11200 }, { "epoch": 54.76510067114094, "grad_norm": 3.9318060874938965, "learning_rate": 0.00012708196721311477, "loss": 0.0594, "step": 11220 }, { "epoch": 54.862721171446005, "grad_norm": 2.353564739227295, "learning_rate": 0.00012695081967213116, "loss": 0.0589, "step": 11240 }, { "epoch": 54.96034167175107, "grad_norm": 2.640501022338867, "learning_rate": 0.00012681967213114753, "loss": 0.0617, "step": 11260 }, { "epoch": 55.05796217205613, "grad_norm": 2.2255806922912598, "learning_rate": 0.00012668852459016394, "loss": 0.0596, "step": 11280 }, { "epoch": 55.1555826723612, "grad_norm": 1.7652024030685425, "learning_rate": 0.00012655737704918033, "loss": 0.0573, "step": 11300 }, { "epoch": 55.25320317266626, "grad_norm": 1.6768336296081543, "learning_rate": 0.00012642622950819675, "loss": 0.0533, "step": 11320 }, { "epoch": 55.35082367297132, "grad_norm": 1.9633594751358032, "learning_rate": 0.0001262950819672131, "loss": 0.0501, "step": 11340 }, { "epoch": 55.448444173276386, "grad_norm": 3.0632686614990234, "learning_rate": 0.0001261639344262295, "loss": 0.0584, "step": 11360 }, { "epoch": 55.546064673581455, "grad_norm": 3.617532968521118, "learning_rate": 0.00012603278688524592, "loss": 0.0573, "step": 11380 }, { "epoch": 55.64368517388652, "grad_norm": 2.046466827392578, "learning_rate": 0.0001259016393442623, "loss": 0.0596, "step": 11400 }, { "epoch": 55.74130567419158, "grad_norm": 2.1905694007873535, "learning_rate": 0.0001257704918032787, "loss": 0.0632, "step": 11420 }, { "epoch": 55.83892617449664, "grad_norm": 3.6722793579101562, "learning_rate": 0.00012563934426229508, "loss": 0.0626, "step": 11440 }, { "epoch": 55.93654667480171, "grad_norm": 1.6656643152236938, "learning_rate": 0.0001255081967213115, "loss": 0.0637, "step": 11460 }, { "epoch": 56.034167175106774, "grad_norm": 1.9462730884552002, "learning_rate": 0.00012537704918032786, "loss": 0.0562, "step": 11480 }, { "epoch": 56.131787675411836, "grad_norm": 2.050899028778076, "learning_rate": 0.00012524590163934425, "loss": 0.0499, "step": 11500 }, { "epoch": 56.2294081757169, "grad_norm": 2.114248514175415, "learning_rate": 0.00012511475409836067, "loss": 0.0489, "step": 11520 }, { "epoch": 56.32702867602197, "grad_norm": 2.2343602180480957, "learning_rate": 0.00012498360655737706, "loss": 0.0547, "step": 11540 }, { "epoch": 56.42464917632703, "grad_norm": 2.2102224826812744, "learning_rate": 0.00012485245901639345, "loss": 0.0568, "step": 11560 }, { "epoch": 56.52226967663209, "grad_norm": 9.124307632446289, "learning_rate": 0.00012472131147540984, "loss": 0.0639, "step": 11580 }, { "epoch": 56.619890176937155, "grad_norm": 3.184844493865967, "learning_rate": 0.00012459016393442625, "loss": 0.0577, "step": 11600 }, { "epoch": 56.717510677242224, "grad_norm": 1.8462156057357788, "learning_rate": 0.00012445901639344262, "loss": 0.0549, "step": 11620 }, { "epoch": 56.815131177547286, "grad_norm": 1.667975902557373, "learning_rate": 0.000124327868852459, "loss": 0.0534, "step": 11640 }, { "epoch": 56.91275167785235, "grad_norm": 2.60188364982605, "learning_rate": 0.00012419672131147542, "loss": 0.0637, "step": 11660 }, { "epoch": 57.01037217815741, "grad_norm": 1.711301565170288, "learning_rate": 0.0001240655737704918, "loss": 0.0623, "step": 11680 }, { "epoch": 57.10799267846248, "grad_norm": 3.080031394958496, "learning_rate": 0.0001239344262295082, "loss": 0.0438, "step": 11700 }, { "epoch": 57.20561317876754, "grad_norm": 1.9816887378692627, "learning_rate": 0.0001238032786885246, "loss": 0.053, "step": 11720 }, { "epoch": 57.303233679072605, "grad_norm": 2.1627466678619385, "learning_rate": 0.000123672131147541, "loss": 0.0544, "step": 11740 }, { "epoch": 57.40085417937767, "grad_norm": 2.412473678588867, "learning_rate": 0.0001235409836065574, "loss": 0.0498, "step": 11760 }, { "epoch": 57.49847467968274, "grad_norm": 1.8432903289794922, "learning_rate": 0.00012340983606557376, "loss": 0.055, "step": 11780 }, { "epoch": 57.5960951799878, "grad_norm": 1.5726666450500488, "learning_rate": 0.00012327868852459017, "loss": 0.0555, "step": 11800 }, { "epoch": 57.69371568029286, "grad_norm": 1.6197164058685303, "learning_rate": 0.00012314754098360656, "loss": 0.0551, "step": 11820 }, { "epoch": 57.79133618059792, "grad_norm": 2.0963289737701416, "learning_rate": 0.00012301639344262295, "loss": 0.0606, "step": 11840 }, { "epoch": 57.88895668090299, "grad_norm": 1.7842280864715576, "learning_rate": 0.00012288524590163934, "loss": 0.0625, "step": 11860 }, { "epoch": 57.986577181208055, "grad_norm": 2.8082289695739746, "learning_rate": 0.00012275409836065573, "loss": 0.0572, "step": 11880 }, { "epoch": 58.08419768151312, "grad_norm": 2.0027847290039062, "learning_rate": 0.00012262295081967215, "loss": 0.0484, "step": 11900 }, { "epoch": 58.18181818181818, "grad_norm": 1.9375332593917847, "learning_rate": 0.0001224918032786885, "loss": 0.0503, "step": 11920 }, { "epoch": 58.27943868212325, "grad_norm": 1.6671733856201172, "learning_rate": 0.00012236065573770493, "loss": 0.0463, "step": 11940 }, { "epoch": 58.37705918242831, "grad_norm": 2.173567533493042, "learning_rate": 0.00012222950819672132, "loss": 0.0535, "step": 11960 }, { "epoch": 58.474679682733374, "grad_norm": 2.869158983230591, "learning_rate": 0.00012209836065573773, "loss": 0.0591, "step": 11980 }, { "epoch": 58.572300183038436, "grad_norm": 4.487265586853027, "learning_rate": 0.0001219672131147541, "loss": 0.054, "step": 12000 }, { "epoch": 58.669920683343506, "grad_norm": 2.45060133934021, "learning_rate": 0.0001218360655737705, "loss": 0.0506, "step": 12020 }, { "epoch": 58.76754118364857, "grad_norm": 1.8584073781967163, "learning_rate": 0.0001217049180327869, "loss": 0.0577, "step": 12040 }, { "epoch": 58.86516168395363, "grad_norm": 2.7494070529937744, "learning_rate": 0.00012157377049180328, "loss": 0.0611, "step": 12060 }, { "epoch": 58.96278218425869, "grad_norm": 2.600886106491089, "learning_rate": 0.00012144262295081968, "loss": 0.0595, "step": 12080 }, { "epoch": 59.060402684563755, "grad_norm": 2.2219817638397217, "learning_rate": 0.00012131147540983607, "loss": 0.0496, "step": 12100 }, { "epoch": 59.158023184868824, "grad_norm": 1.4466367959976196, "learning_rate": 0.00012118032786885247, "loss": 0.0424, "step": 12120 }, { "epoch": 59.25564368517389, "grad_norm": 1.896466612815857, "learning_rate": 0.00012104918032786885, "loss": 0.0496, "step": 12140 }, { "epoch": 59.35326418547895, "grad_norm": 1.5178048610687256, "learning_rate": 0.00012091803278688525, "loss": 0.0541, "step": 12160 }, { "epoch": 59.45088468578401, "grad_norm": 2.189962148666382, "learning_rate": 0.00012078688524590165, "loss": 0.0558, "step": 12180 }, { "epoch": 59.54850518608908, "grad_norm": 2.055428981781006, "learning_rate": 0.00012065573770491804, "loss": 0.0523, "step": 12200 }, { "epoch": 59.64612568639414, "grad_norm": 2.569758415222168, "learning_rate": 0.00012052459016393443, "loss": 0.0547, "step": 12220 }, { "epoch": 59.743746186699205, "grad_norm": 2.1483688354492188, "learning_rate": 0.00012039344262295082, "loss": 0.0579, "step": 12240 }, { "epoch": 59.84136668700427, "grad_norm": 2.11574649810791, "learning_rate": 0.00012026229508196722, "loss": 0.0576, "step": 12260 }, { "epoch": 59.93898718730934, "grad_norm": 2.026974678039551, "learning_rate": 0.0001201311475409836, "loss": 0.0556, "step": 12280 }, { "epoch": 60.0366076876144, "grad_norm": 1.9671833515167236, "learning_rate": 0.00012, "loss": 0.0575, "step": 12300 }, { "epoch": 60.13422818791946, "grad_norm": 2.3915090560913086, "learning_rate": 0.0001198688524590164, "loss": 0.0405, "step": 12320 }, { "epoch": 60.231848688224524, "grad_norm": 2.213895559310913, "learning_rate": 0.0001197377049180328, "loss": 0.0455, "step": 12340 }, { "epoch": 60.32946918852959, "grad_norm": 1.6794525384902954, "learning_rate": 0.00011960655737704917, "loss": 0.0551, "step": 12360 }, { "epoch": 60.427089688834656, "grad_norm": 3.1766440868377686, "learning_rate": 0.00011947540983606557, "loss": 0.0526, "step": 12380 }, { "epoch": 60.52471018913972, "grad_norm": 1.5646824836730957, "learning_rate": 0.00011934426229508198, "loss": 0.0523, "step": 12400 }, { "epoch": 60.62233068944478, "grad_norm": 1.8669993877410889, "learning_rate": 0.00011921311475409838, "loss": 0.0525, "step": 12420 }, { "epoch": 60.71995118974985, "grad_norm": 3.3208813667297363, "learning_rate": 0.00011908196721311476, "loss": 0.05, "step": 12440 }, { "epoch": 60.81757169005491, "grad_norm": 2.8685014247894287, "learning_rate": 0.00011895081967213116, "loss": 0.0558, "step": 12460 }, { "epoch": 60.915192190359974, "grad_norm": 2.627858877182007, "learning_rate": 0.00011881967213114755, "loss": 0.0554, "step": 12480 }, { "epoch": 61.01281269066504, "grad_norm": 1.5304837226867676, "learning_rate": 0.00011868852459016392, "loss": 0.0533, "step": 12500 }, { "epoch": 61.110433190970106, "grad_norm": 2.2873823642730713, "learning_rate": 0.00011855737704918033, "loss": 0.0451, "step": 12520 }, { "epoch": 61.20805369127517, "grad_norm": 2.5284764766693115, "learning_rate": 0.00011842622950819673, "loss": 0.0496, "step": 12540 }, { "epoch": 61.30567419158023, "grad_norm": 2.3583881855010986, "learning_rate": 0.00011829508196721313, "loss": 0.0502, "step": 12560 }, { "epoch": 61.40329469188529, "grad_norm": 1.6773650646209717, "learning_rate": 0.00011816393442622951, "loss": 0.0494, "step": 12580 }, { "epoch": 61.50091519219036, "grad_norm": 1.8725978136062622, "learning_rate": 0.00011803278688524591, "loss": 0.0496, "step": 12600 }, { "epoch": 61.598535692495425, "grad_norm": 1.988142728805542, "learning_rate": 0.0001179016393442623, "loss": 0.0533, "step": 12620 }, { "epoch": 61.69615619280049, "grad_norm": 2.436917304992676, "learning_rate": 0.0001177704918032787, "loss": 0.0521, "step": 12640 }, { "epoch": 61.79377669310555, "grad_norm": 2.867424726486206, "learning_rate": 0.00011763934426229508, "loss": 0.0512, "step": 12660 }, { "epoch": 61.89139719341062, "grad_norm": 3.100371837615967, "learning_rate": 0.00011750819672131148, "loss": 0.0563, "step": 12680 }, { "epoch": 61.98901769371568, "grad_norm": 1.4640679359436035, "learning_rate": 0.00011737704918032789, "loss": 0.0512, "step": 12700 }, { "epoch": 62.08663819402074, "grad_norm": 1.7745931148529053, "learning_rate": 0.00011724590163934426, "loss": 0.0451, "step": 12720 }, { "epoch": 62.184258694325806, "grad_norm": 2.6083261966705322, "learning_rate": 0.00011711475409836066, "loss": 0.0523, "step": 12740 }, { "epoch": 62.281879194630875, "grad_norm": 1.8145304918289185, "learning_rate": 0.00011698360655737705, "loss": 0.0469, "step": 12760 }, { "epoch": 62.37949969493594, "grad_norm": 1.8948346376419067, "learning_rate": 0.00011685245901639346, "loss": 0.049, "step": 12780 }, { "epoch": 62.477120195241, "grad_norm": 2.2340071201324463, "learning_rate": 0.00011672131147540983, "loss": 0.0478, "step": 12800 }, { "epoch": 62.57474069554606, "grad_norm": 1.9108351469039917, "learning_rate": 0.00011659016393442623, "loss": 0.0477, "step": 12820 }, { "epoch": 62.67236119585113, "grad_norm": 1.1131486892700195, "learning_rate": 0.00011645901639344264, "loss": 0.0541, "step": 12840 }, { "epoch": 62.769981696156194, "grad_norm": 2.0101990699768066, "learning_rate": 0.00011632786885245903, "loss": 0.0435, "step": 12860 }, { "epoch": 62.867602196461256, "grad_norm": 1.88633394241333, "learning_rate": 0.0001161967213114754, "loss": 0.051, "step": 12880 }, { "epoch": 62.96522269676632, "grad_norm": 3.823934555053711, "learning_rate": 0.0001160655737704918, "loss": 0.0558, "step": 12900 }, { "epoch": 63.06284319707139, "grad_norm": 2.754892110824585, "learning_rate": 0.00011593442622950821, "loss": 0.0423, "step": 12920 }, { "epoch": 63.16046369737645, "grad_norm": 1.8771121501922607, "learning_rate": 0.00011580327868852458, "loss": 0.0434, "step": 12940 }, { "epoch": 63.25808419768151, "grad_norm": 1.098620057106018, "learning_rate": 0.00011567213114754099, "loss": 0.0457, "step": 12960 }, { "epoch": 63.355704697986575, "grad_norm": 2.0839200019836426, "learning_rate": 0.00011554098360655739, "loss": 0.0516, "step": 12980 }, { "epoch": 63.453325198291644, "grad_norm": 1.4664433002471924, "learning_rate": 0.00011540983606557378, "loss": 0.0456, "step": 13000 }, { "epoch": 63.550945698596706, "grad_norm": 1.9635177850723267, "learning_rate": 0.00011527868852459016, "loss": 0.0537, "step": 13020 }, { "epoch": 63.64856619890177, "grad_norm": 1.3120068311691284, "learning_rate": 0.00011514754098360656, "loss": 0.0492, "step": 13040 }, { "epoch": 63.74618669920683, "grad_norm": 3.893848180770874, "learning_rate": 0.00011501639344262296, "loss": 0.0532, "step": 13060 }, { "epoch": 63.8438071995119, "grad_norm": 2.8461520671844482, "learning_rate": 0.00011488524590163936, "loss": 0.0525, "step": 13080 }, { "epoch": 63.94142769981696, "grad_norm": 2.197197914123535, "learning_rate": 0.00011475409836065574, "loss": 0.0507, "step": 13100 }, { "epoch": 64.03904820012202, "grad_norm": 2.491464138031006, "learning_rate": 0.00011462295081967214, "loss": 0.0486, "step": 13120 }, { "epoch": 64.1366687004271, "grad_norm": 1.8550955057144165, "learning_rate": 0.00011449180327868853, "loss": 0.0421, "step": 13140 }, { "epoch": 64.23428920073215, "grad_norm": 1.7782377004623413, "learning_rate": 0.00011436065573770491, "loss": 0.0448, "step": 13160 }, { "epoch": 64.33190970103722, "grad_norm": 2.042099714279175, "learning_rate": 0.00011422950819672131, "loss": 0.0449, "step": 13180 }, { "epoch": 64.42953020134229, "grad_norm": 2.6295580863952637, "learning_rate": 0.00011409836065573771, "loss": 0.0509, "step": 13200 }, { "epoch": 64.52715070164734, "grad_norm": 2.4631996154785156, "learning_rate": 0.00011396721311475412, "loss": 0.0447, "step": 13220 }, { "epoch": 64.62477120195241, "grad_norm": 1.8422377109527588, "learning_rate": 0.00011383606557377049, "loss": 0.0498, "step": 13240 }, { "epoch": 64.72239170225747, "grad_norm": 2.1493983268737793, "learning_rate": 0.00011370491803278688, "loss": 0.0536, "step": 13260 }, { "epoch": 64.82001220256254, "grad_norm": 2.355818033218384, "learning_rate": 0.00011357377049180329, "loss": 0.0529, "step": 13280 }, { "epoch": 64.9176327028676, "grad_norm": 2.2813539505004883, "learning_rate": 0.00011344262295081969, "loss": 0.0514, "step": 13300 }, { "epoch": 65.01525320317266, "grad_norm": 2.1642229557037354, "learning_rate": 0.00011331147540983606, "loss": 0.048, "step": 13320 }, { "epoch": 65.11287370347773, "grad_norm": 1.607167363166809, "learning_rate": 0.00011318032786885247, "loss": 0.0393, "step": 13340 }, { "epoch": 65.2104942037828, "grad_norm": 1.7760226726531982, "learning_rate": 0.00011304918032786887, "loss": 0.0445, "step": 13360 }, { "epoch": 65.30811470408786, "grad_norm": 2.2471165657043457, "learning_rate": 0.00011291803278688525, "loss": 0.0388, "step": 13380 }, { "epoch": 65.40573520439293, "grad_norm": 2.517448663711548, "learning_rate": 0.00011278688524590164, "loss": 0.0466, "step": 13400 }, { "epoch": 65.50335570469798, "grad_norm": 1.7835204601287842, "learning_rate": 0.00011265573770491804, "loss": 0.0457, "step": 13420 }, { "epoch": 65.60097620500305, "grad_norm": 2.7937209606170654, "learning_rate": 0.00011252459016393444, "loss": 0.0453, "step": 13440 }, { "epoch": 65.69859670530812, "grad_norm": 1.775601863861084, "learning_rate": 0.00011239344262295082, "loss": 0.0515, "step": 13460 }, { "epoch": 65.79621720561317, "grad_norm": 2.0977530479431152, "learning_rate": 0.00011226229508196722, "loss": 0.0494, "step": 13480 }, { "epoch": 65.89383770591824, "grad_norm": 2.0289318561553955, "learning_rate": 0.00011213114754098362, "loss": 0.0492, "step": 13500 }, { "epoch": 65.99145820622331, "grad_norm": 1.8023489713668823, "learning_rate": 0.00011200000000000001, "loss": 0.0491, "step": 13520 }, { "epoch": 66.08907870652837, "grad_norm": 1.4527980089187622, "learning_rate": 0.00011186885245901639, "loss": 0.0371, "step": 13540 }, { "epoch": 66.18669920683344, "grad_norm": 1.5598070621490479, "learning_rate": 0.00011173770491803279, "loss": 0.0421, "step": 13560 }, { "epoch": 66.2843197071385, "grad_norm": 1.312072992324829, "learning_rate": 0.0001116065573770492, "loss": 0.0457, "step": 13580 }, { "epoch": 66.38194020744356, "grad_norm": 2.0999739170074463, "learning_rate": 0.00011147540983606557, "loss": 0.0477, "step": 13600 }, { "epoch": 66.47956070774863, "grad_norm": 1.9168506860733032, "learning_rate": 0.00011134426229508197, "loss": 0.0428, "step": 13620 }, { "epoch": 66.57718120805369, "grad_norm": 2.1148712635040283, "learning_rate": 0.00011121311475409838, "loss": 0.0472, "step": 13640 }, { "epoch": 66.67480170835876, "grad_norm": 1.7266321182250977, "learning_rate": 0.00011108196721311476, "loss": 0.0493, "step": 13660 }, { "epoch": 66.77242220866383, "grad_norm": 1.0809521675109863, "learning_rate": 0.00011095081967213114, "loss": 0.048, "step": 13680 }, { "epoch": 66.87004270896888, "grad_norm": 2.773946523666382, "learning_rate": 0.00011081967213114754, "loss": 0.0481, "step": 13700 }, { "epoch": 66.96766320927395, "grad_norm": 2.187812089920044, "learning_rate": 0.00011068852459016395, "loss": 0.054, "step": 13720 }, { "epoch": 67.065283709579, "grad_norm": 1.8684860467910767, "learning_rate": 0.00011055737704918035, "loss": 0.0414, "step": 13740 }, { "epoch": 67.16290420988408, "grad_norm": 1.6898576021194458, "learning_rate": 0.00011042622950819672, "loss": 0.0405, "step": 13760 }, { "epoch": 67.26052471018915, "grad_norm": 2.749148368835449, "learning_rate": 0.00011029508196721311, "loss": 0.0404, "step": 13780 }, { "epoch": 67.3581452104942, "grad_norm": 3.262460231781006, "learning_rate": 0.00011016393442622952, "loss": 0.0476, "step": 13800 }, { "epoch": 67.45576571079927, "grad_norm": 1.7177296876907349, "learning_rate": 0.00011003278688524589, "loss": 0.0446, "step": 13820 }, { "epoch": 67.55338621110434, "grad_norm": 2.326493978500366, "learning_rate": 0.0001099016393442623, "loss": 0.0447, "step": 13840 }, { "epoch": 67.6510067114094, "grad_norm": 1.690735101699829, "learning_rate": 0.0001097704918032787, "loss": 0.0475, "step": 13860 }, { "epoch": 67.74862721171446, "grad_norm": 2.243262529373169, "learning_rate": 0.0001096393442622951, "loss": 0.0414, "step": 13880 }, { "epoch": 67.84624771201952, "grad_norm": 1.8218213319778442, "learning_rate": 0.00010950819672131148, "loss": 0.0475, "step": 13900 }, { "epoch": 67.94386821232459, "grad_norm": 2.037757635116577, "learning_rate": 0.00010937704918032787, "loss": 0.0517, "step": 13920 }, { "epoch": 68.04148871262966, "grad_norm": 1.589010238647461, "learning_rate": 0.00010924590163934427, "loss": 0.0444, "step": 13940 }, { "epoch": 68.13910921293471, "grad_norm": 1.6589601039886475, "learning_rate": 0.00010911475409836067, "loss": 0.0397, "step": 13960 }, { "epoch": 68.23672971323978, "grad_norm": 0.7578281760215759, "learning_rate": 0.00010898360655737705, "loss": 0.0424, "step": 13980 }, { "epoch": 68.33435021354484, "grad_norm": 2.5849769115448, "learning_rate": 0.00010885245901639345, "loss": 0.0411, "step": 14000 }, { "epoch": 68.4319707138499, "grad_norm": 2.4518473148345947, "learning_rate": 0.00010872131147540985, "loss": 0.0463, "step": 14020 }, { "epoch": 68.52959121415498, "grad_norm": 1.6705505847930908, "learning_rate": 0.00010859016393442623, "loss": 0.0418, "step": 14040 }, { "epoch": 68.62721171446003, "grad_norm": 2.3943698406219482, "learning_rate": 0.00010845901639344262, "loss": 0.0454, "step": 14060 }, { "epoch": 68.7248322147651, "grad_norm": 1.8809058666229248, "learning_rate": 0.00010832786885245902, "loss": 0.0417, "step": 14080 }, { "epoch": 68.82245271507017, "grad_norm": 2.2370426654815674, "learning_rate": 0.00010819672131147543, "loss": 0.0428, "step": 14100 }, { "epoch": 68.92007321537523, "grad_norm": 2.9675140380859375, "learning_rate": 0.0001080655737704918, "loss": 0.0455, "step": 14120 }, { "epoch": 69.0176937156803, "grad_norm": 2.193737506866455, "learning_rate": 0.0001079344262295082, "loss": 0.0484, "step": 14140 }, { "epoch": 69.11531421598535, "grad_norm": 1.3249075412750244, "learning_rate": 0.00010780327868852461, "loss": 0.0361, "step": 14160 }, { "epoch": 69.21293471629042, "grad_norm": 1.6315033435821533, "learning_rate": 0.00010767213114754098, "loss": 0.0392, "step": 14180 }, { "epoch": 69.31055521659549, "grad_norm": 3.6653645038604736, "learning_rate": 0.00010754098360655737, "loss": 0.0418, "step": 14200 }, { "epoch": 69.40817571690054, "grad_norm": 1.652737021446228, "learning_rate": 0.00010740983606557378, "loss": 0.0408, "step": 14220 }, { "epoch": 69.50579621720561, "grad_norm": 1.7608600854873657, "learning_rate": 0.00010727868852459018, "loss": 0.0481, "step": 14240 }, { "epoch": 69.60341671751068, "grad_norm": 1.6108567714691162, "learning_rate": 0.00010714754098360655, "loss": 0.0411, "step": 14260 }, { "epoch": 69.70103721781574, "grad_norm": 1.7640299797058105, "learning_rate": 0.00010701639344262296, "loss": 0.0433, "step": 14280 }, { "epoch": 69.79865771812081, "grad_norm": 3.2369632720947266, "learning_rate": 0.00010688524590163935, "loss": 0.0461, "step": 14300 }, { "epoch": 69.89627821842586, "grad_norm": 1.8078311681747437, "learning_rate": 0.00010675409836065575, "loss": 0.0503, "step": 14320 }, { "epoch": 69.99389871873093, "grad_norm": 2.2205686569213867, "learning_rate": 0.00010662295081967212, "loss": 0.0446, "step": 14340 }, { "epoch": 70.091519219036, "grad_norm": 1.8968815803527832, "learning_rate": 0.00010649180327868853, "loss": 0.0333, "step": 14360 }, { "epoch": 70.18913971934106, "grad_norm": 1.6357848644256592, "learning_rate": 0.00010636065573770493, "loss": 0.0393, "step": 14380 }, { "epoch": 70.28676021964613, "grad_norm": 4.227313995361328, "learning_rate": 0.0001062295081967213, "loss": 0.0367, "step": 14400 }, { "epoch": 70.3843807199512, "grad_norm": 1.2310267686843872, "learning_rate": 0.00010609836065573771, "loss": 0.043, "step": 14420 }, { "epoch": 70.48200122025625, "grad_norm": 1.8939145803451538, "learning_rate": 0.0001059672131147541, "loss": 0.0424, "step": 14440 }, { "epoch": 70.57962172056132, "grad_norm": 1.7214269638061523, "learning_rate": 0.0001058360655737705, "loss": 0.0455, "step": 14460 }, { "epoch": 70.67724222086638, "grad_norm": 1.5810576677322388, "learning_rate": 0.00010570491803278688, "loss": 0.0455, "step": 14480 }, { "epoch": 70.77486272117144, "grad_norm": 2.2829346656799316, "learning_rate": 0.00010557377049180328, "loss": 0.0446, "step": 14500 }, { "epoch": 70.87248322147651, "grad_norm": 1.7626519203186035, "learning_rate": 0.00010544262295081968, "loss": 0.0465, "step": 14520 }, { "epoch": 70.97010372178157, "grad_norm": 3.180558443069458, "learning_rate": 0.00010531147540983609, "loss": 0.0454, "step": 14540 }, { "epoch": 71.06772422208664, "grad_norm": 1.3041974306106567, "learning_rate": 0.00010518032786885246, "loss": 0.0396, "step": 14560 }, { "epoch": 71.16534472239171, "grad_norm": 1.648926854133606, "learning_rate": 0.00010504918032786885, "loss": 0.0364, "step": 14580 }, { "epoch": 71.26296522269676, "grad_norm": 1.6585657596588135, "learning_rate": 0.00010491803278688525, "loss": 0.0365, "step": 14600 }, { "epoch": 71.36058572300183, "grad_norm": 2.1018893718719482, "learning_rate": 0.00010478688524590163, "loss": 0.0373, "step": 14620 }, { "epoch": 71.45820622330689, "grad_norm": 2.348642110824585, "learning_rate": 0.00010465573770491803, "loss": 0.0403, "step": 14640 }, { "epoch": 71.55582672361196, "grad_norm": 1.8236652612686157, "learning_rate": 0.00010452459016393444, "loss": 0.0385, "step": 14660 }, { "epoch": 71.65344722391703, "grad_norm": 1.6800057888031006, "learning_rate": 0.00010439344262295083, "loss": 0.0392, "step": 14680 }, { "epoch": 71.75106772422208, "grad_norm": 2.6465699672698975, "learning_rate": 0.00010426229508196721, "loss": 0.0474, "step": 14700 }, { "epoch": 71.84868822452715, "grad_norm": 2.384202003479004, "learning_rate": 0.0001041311475409836, "loss": 0.0463, "step": 14720 }, { "epoch": 71.94630872483222, "grad_norm": 2.788309335708618, "learning_rate": 0.00010400000000000001, "loss": 0.0465, "step": 14740 }, { "epoch": 72.04392922513728, "grad_norm": 2.2746474742889404, "learning_rate": 0.00010386885245901641, "loss": 0.0423, "step": 14760 }, { "epoch": 72.14154972544235, "grad_norm": 2.319316864013672, "learning_rate": 0.00010373770491803279, "loss": 0.0352, "step": 14780 }, { "epoch": 72.2391702257474, "grad_norm": 1.8821275234222412, "learning_rate": 0.00010360655737704919, "loss": 0.0344, "step": 14800 }, { "epoch": 72.33679072605247, "grad_norm": 1.316780686378479, "learning_rate": 0.00010347540983606558, "loss": 0.0383, "step": 14820 }, { "epoch": 72.43441122635754, "grad_norm": 1.91647207736969, "learning_rate": 0.00010334426229508197, "loss": 0.0375, "step": 14840 }, { "epoch": 72.5320317266626, "grad_norm": 1.4434703588485718, "learning_rate": 0.00010321311475409836, "loss": 0.0352, "step": 14860 }, { "epoch": 72.62965222696766, "grad_norm": 1.4991461038589478, "learning_rate": 0.00010308196721311476, "loss": 0.0393, "step": 14880 }, { "epoch": 72.72727272727273, "grad_norm": 1.8177014589309692, "learning_rate": 0.00010295081967213116, "loss": 0.0424, "step": 14900 }, { "epoch": 72.82489322757779, "grad_norm": 2.2760417461395264, "learning_rate": 0.00010281967213114754, "loss": 0.0424, "step": 14920 }, { "epoch": 72.92251372788286, "grad_norm": 1.6984953880310059, "learning_rate": 0.00010268852459016394, "loss": 0.044, "step": 14940 }, { "epoch": 73.02013422818791, "grad_norm": 1.3592875003814697, "learning_rate": 0.00010255737704918033, "loss": 0.0412, "step": 14960 }, { "epoch": 73.11775472849298, "grad_norm": 1.0483120679855347, "learning_rate": 0.00010242622950819673, "loss": 0.0347, "step": 14980 }, { "epoch": 73.21537522879805, "grad_norm": 0.837219774723053, "learning_rate": 0.00010229508196721311, "loss": 0.0333, "step": 15000 }, { "epoch": 73.31299572910311, "grad_norm": 1.5951837301254272, "learning_rate": 0.00010216393442622951, "loss": 0.0405, "step": 15020 }, { "epoch": 73.41061622940818, "grad_norm": 1.8197298049926758, "learning_rate": 0.00010203278688524592, "loss": 0.0402, "step": 15040 }, { "epoch": 73.50823672971325, "grad_norm": 1.4663337469100952, "learning_rate": 0.00010190163934426229, "loss": 0.0399, "step": 15060 }, { "epoch": 73.6058572300183, "grad_norm": 1.5924322605133057, "learning_rate": 0.0001017704918032787, "loss": 0.0431, "step": 15080 }, { "epoch": 73.70347773032337, "grad_norm": 1.3720799684524536, "learning_rate": 0.00010163934426229508, "loss": 0.0396, "step": 15100 }, { "epoch": 73.80109823062843, "grad_norm": 1.6636115312576294, "learning_rate": 0.00010150819672131149, "loss": 0.041, "step": 15120 }, { "epoch": 73.8987187309335, "grad_norm": 1.7498186826705933, "learning_rate": 0.00010137704918032786, "loss": 0.0415, "step": 15140 }, { "epoch": 73.99633923123857, "grad_norm": 1.410224199295044, "learning_rate": 0.00010124590163934427, "loss": 0.0432, "step": 15160 }, { "epoch": 74.09395973154362, "grad_norm": 1.576262354850769, "learning_rate": 0.00010111475409836067, "loss": 0.0331, "step": 15180 }, { "epoch": 74.19158023184869, "grad_norm": 0.8504081964492798, "learning_rate": 0.00010098360655737706, "loss": 0.0365, "step": 15200 }, { "epoch": 74.28920073215376, "grad_norm": 1.2342356443405151, "learning_rate": 0.00010085245901639345, "loss": 0.0374, "step": 15220 }, { "epoch": 74.38682123245881, "grad_norm": 1.6233398914337158, "learning_rate": 0.00010072131147540984, "loss": 0.0435, "step": 15240 }, { "epoch": 74.48444173276388, "grad_norm": 3.0741922855377197, "learning_rate": 0.00010059016393442624, "loss": 0.0401, "step": 15260 }, { "epoch": 74.58206223306894, "grad_norm": 1.7206604480743408, "learning_rate": 0.00010045901639344261, "loss": 0.0387, "step": 15280 }, { "epoch": 74.67968273337401, "grad_norm": 1.7133204936981201, "learning_rate": 0.00010032786885245902, "loss": 0.0387, "step": 15300 }, { "epoch": 74.77730323367908, "grad_norm": 3.1250414848327637, "learning_rate": 0.00010019672131147542, "loss": 0.04, "step": 15320 }, { "epoch": 74.87492373398413, "grad_norm": 1.7084505558013916, "learning_rate": 0.00010006557377049181, "loss": 0.0379, "step": 15340 }, { "epoch": 74.9725442342892, "grad_norm": 2.0808680057525635, "learning_rate": 9.99344262295082e-05, "loss": 0.0419, "step": 15360 }, { "epoch": 75.07016473459427, "grad_norm": 1.148889422416687, "learning_rate": 9.980327868852459e-05, "loss": 0.0355, "step": 15380 }, { "epoch": 75.16778523489933, "grad_norm": 1.292641520500183, "learning_rate": 9.967213114754099e-05, "loss": 0.0354, "step": 15400 }, { "epoch": 75.2654057352044, "grad_norm": 2.2540032863616943, "learning_rate": 9.954098360655738e-05, "loss": 0.0384, "step": 15420 }, { "epoch": 75.36302623550945, "grad_norm": 1.2150137424468994, "learning_rate": 9.940983606557378e-05, "loss": 0.0358, "step": 15440 }, { "epoch": 75.46064673581452, "grad_norm": 1.647284984588623, "learning_rate": 9.927868852459017e-05, "loss": 0.0351, "step": 15460 }, { "epoch": 75.55826723611959, "grad_norm": 2.1576521396636963, "learning_rate": 9.914754098360656e-05, "loss": 0.0425, "step": 15480 }, { "epoch": 75.65588773642465, "grad_norm": 1.787636637687683, "learning_rate": 9.901639344262295e-05, "loss": 0.0384, "step": 15500 }, { "epoch": 75.75350823672972, "grad_norm": 2.0450475215911865, "learning_rate": 9.888524590163934e-05, "loss": 0.0373, "step": 15520 }, { "epoch": 75.85112873703477, "grad_norm": 1.457287073135376, "learning_rate": 9.875409836065574e-05, "loss": 0.0411, "step": 15540 }, { "epoch": 75.94874923733984, "grad_norm": 2.2569003105163574, "learning_rate": 9.862295081967213e-05, "loss": 0.0401, "step": 15560 }, { "epoch": 76.04636973764491, "grad_norm": 1.942240834236145, "learning_rate": 9.849180327868854e-05, "loss": 0.0381, "step": 15580 }, { "epoch": 76.14399023794996, "grad_norm": 1.9063150882720947, "learning_rate": 9.836065573770493e-05, "loss": 0.0354, "step": 15600 }, { "epoch": 76.24161073825503, "grad_norm": 1.0408899784088135, "learning_rate": 9.822950819672132e-05, "loss": 0.0338, "step": 15620 }, { "epoch": 76.3392312385601, "grad_norm": 1.3950903415679932, "learning_rate": 9.80983606557377e-05, "loss": 0.0361, "step": 15640 }, { "epoch": 76.43685173886516, "grad_norm": 1.3238831758499146, "learning_rate": 9.796721311475411e-05, "loss": 0.037, "step": 15660 }, { "epoch": 76.53447223917023, "grad_norm": 1.7356709241867065, "learning_rate": 9.78360655737705e-05, "loss": 0.0388, "step": 15680 }, { "epoch": 76.63209273947528, "grad_norm": 1.4678503274917603, "learning_rate": 9.770491803278689e-05, "loss": 0.0353, "step": 15700 }, { "epoch": 76.72971323978035, "grad_norm": 1.3024065494537354, "learning_rate": 9.757377049180329e-05, "loss": 0.041, "step": 15720 }, { "epoch": 76.82733374008542, "grad_norm": 1.2499933242797852, "learning_rate": 9.744262295081968e-05, "loss": 0.0341, "step": 15740 }, { "epoch": 76.92495424039048, "grad_norm": 1.7338874340057373, "learning_rate": 9.731147540983607e-05, "loss": 0.0432, "step": 15760 }, { "epoch": 77.02257474069555, "grad_norm": 2.667750120162964, "learning_rate": 9.718032786885246e-05, "loss": 0.0395, "step": 15780 }, { "epoch": 77.12019524100062, "grad_norm": 1.2692776918411255, "learning_rate": 9.704918032786886e-05, "loss": 0.0328, "step": 15800 }, { "epoch": 77.21781574130567, "grad_norm": 1.413238763809204, "learning_rate": 9.691803278688525e-05, "loss": 0.0336, "step": 15820 }, { "epoch": 77.31543624161074, "grad_norm": 1.9772465229034424, "learning_rate": 9.678688524590165e-05, "loss": 0.0355, "step": 15840 }, { "epoch": 77.4130567419158, "grad_norm": 1.7621837854385376, "learning_rate": 9.665573770491804e-05, "loss": 0.0376, "step": 15860 }, { "epoch": 77.51067724222086, "grad_norm": 1.381027102470398, "learning_rate": 9.652459016393443e-05, "loss": 0.0423, "step": 15880 }, { "epoch": 77.60829774252593, "grad_norm": 1.5768849849700928, "learning_rate": 9.639344262295082e-05, "loss": 0.0346, "step": 15900 }, { "epoch": 77.70591824283099, "grad_norm": 1.5931782722473145, "learning_rate": 9.626229508196721e-05, "loss": 0.0344, "step": 15920 }, { "epoch": 77.80353874313606, "grad_norm": 1.40415358543396, "learning_rate": 9.613114754098361e-05, "loss": 0.0377, "step": 15940 }, { "epoch": 77.90115924344113, "grad_norm": 1.4353222846984863, "learning_rate": 9.6e-05, "loss": 0.0347, "step": 15960 }, { "epoch": 77.99877974374618, "grad_norm": 1.4852492809295654, "learning_rate": 9.58688524590164e-05, "loss": 0.0386, "step": 15980 }, { "epoch": 78.09640024405125, "grad_norm": 1.287718415260315, "learning_rate": 9.57377049180328e-05, "loss": 0.0325, "step": 16000 }, { "epoch": 78.19402074435631, "grad_norm": 1.597639799118042, "learning_rate": 9.560655737704918e-05, "loss": 0.0304, "step": 16020 }, { "epoch": 78.29164124466138, "grad_norm": 2.2354135513305664, "learning_rate": 9.547540983606557e-05, "loss": 0.0343, "step": 16040 }, { "epoch": 78.38926174496645, "grad_norm": 1.701223373413086, "learning_rate": 9.534426229508198e-05, "loss": 0.035, "step": 16060 }, { "epoch": 78.4868822452715, "grad_norm": 2.1920886039733887, "learning_rate": 9.521311475409837e-05, "loss": 0.0362, "step": 16080 }, { "epoch": 78.58450274557657, "grad_norm": 1.5436536073684692, "learning_rate": 9.508196721311476e-05, "loss": 0.0388, "step": 16100 }, { "epoch": 78.68212324588164, "grad_norm": 0.7256617546081543, "learning_rate": 9.495081967213116e-05, "loss": 0.0364, "step": 16120 }, { "epoch": 78.7797437461867, "grad_norm": 1.3675026893615723, "learning_rate": 9.481967213114755e-05, "loss": 0.0383, "step": 16140 }, { "epoch": 78.87736424649177, "grad_norm": 2.07330322265625, "learning_rate": 9.468852459016394e-05, "loss": 0.0392, "step": 16160 }, { "epoch": 78.97498474679682, "grad_norm": 1.4074386358261108, "learning_rate": 9.455737704918033e-05, "loss": 0.0401, "step": 16180 }, { "epoch": 79.07260524710189, "grad_norm": 1.5689586400985718, "learning_rate": 9.442622950819673e-05, "loss": 0.0333, "step": 16200 }, { "epoch": 79.17022574740696, "grad_norm": 1.3256062269210815, "learning_rate": 9.429508196721312e-05, "loss": 0.0305, "step": 16220 }, { "epoch": 79.26784624771201, "grad_norm": 1.71085524559021, "learning_rate": 9.416393442622952e-05, "loss": 0.0311, "step": 16240 }, { "epoch": 79.36546674801708, "grad_norm": 1.7854918241500854, "learning_rate": 9.403278688524591e-05, "loss": 0.0334, "step": 16260 }, { "epoch": 79.46308724832215, "grad_norm": 1.381110668182373, "learning_rate": 9.39016393442623e-05, "loss": 0.0355, "step": 16280 }, { "epoch": 79.56070774862721, "grad_norm": 2.068474292755127, "learning_rate": 9.377049180327869e-05, "loss": 0.0358, "step": 16300 }, { "epoch": 79.65832824893228, "grad_norm": 1.4812254905700684, "learning_rate": 9.363934426229508e-05, "loss": 0.0382, "step": 16320 }, { "epoch": 79.75594874923733, "grad_norm": 2.683461904525757, "learning_rate": 9.350819672131148e-05, "loss": 0.0365, "step": 16340 }, { "epoch": 79.8535692495424, "grad_norm": 1.9132243394851685, "learning_rate": 9.337704918032787e-05, "loss": 0.0388, "step": 16360 }, { "epoch": 79.95118974984747, "grad_norm": 1.8553367853164673, "learning_rate": 9.324590163934427e-05, "loss": 0.0386, "step": 16380 }, { "epoch": 80.04881025015253, "grad_norm": 0.8551103472709656, "learning_rate": 9.311475409836066e-05, "loss": 0.0334, "step": 16400 }, { "epoch": 80.1464307504576, "grad_norm": 2.120316743850708, "learning_rate": 9.298360655737705e-05, "loss": 0.0324, "step": 16420 }, { "epoch": 80.24405125076267, "grad_norm": 1.456176519393921, "learning_rate": 9.285245901639344e-05, "loss": 0.0341, "step": 16440 }, { "epoch": 80.34167175106772, "grad_norm": 1.8826713562011719, "learning_rate": 9.272131147540985e-05, "loss": 0.034, "step": 16460 }, { "epoch": 80.43929225137279, "grad_norm": 1.0214563608169556, "learning_rate": 9.259016393442623e-05, "loss": 0.0318, "step": 16480 }, { "epoch": 80.53691275167785, "grad_norm": 1.6423603296279907, "learning_rate": 9.245901639344264e-05, "loss": 0.0344, "step": 16500 }, { "epoch": 80.63453325198292, "grad_norm": 1.6966345310211182, "learning_rate": 9.232786885245903e-05, "loss": 0.0367, "step": 16520 }, { "epoch": 80.73215375228799, "grad_norm": 1.1521140336990356, "learning_rate": 9.21967213114754e-05, "loss": 0.0364, "step": 16540 }, { "epoch": 80.82977425259304, "grad_norm": 1.5094974040985107, "learning_rate": 9.20655737704918e-05, "loss": 0.0355, "step": 16560 }, { "epoch": 80.92739475289811, "grad_norm": 1.4688690900802612, "learning_rate": 9.19344262295082e-05, "loss": 0.0328, "step": 16580 }, { "epoch": 81.02501525320318, "grad_norm": 1.916788935661316, "learning_rate": 9.18032786885246e-05, "loss": 0.0353, "step": 16600 }, { "epoch": 81.12263575350823, "grad_norm": 1.6212852001190186, "learning_rate": 9.167213114754099e-05, "loss": 0.031, "step": 16620 }, { "epoch": 81.2202562538133, "grad_norm": 1.1060786247253418, "learning_rate": 9.154098360655739e-05, "loss": 0.0293, "step": 16640 }, { "epoch": 81.31787675411836, "grad_norm": 1.4581480026245117, "learning_rate": 9.140983606557378e-05, "loss": 0.0349, "step": 16660 }, { "epoch": 81.41549725442343, "grad_norm": 1.1661124229431152, "learning_rate": 9.127868852459017e-05, "loss": 0.0315, "step": 16680 }, { "epoch": 81.5131177547285, "grad_norm": 1.6387383937835693, "learning_rate": 9.114754098360656e-05, "loss": 0.0341, "step": 16700 }, { "epoch": 81.61073825503355, "grad_norm": 1.5041887760162354, "learning_rate": 9.101639344262296e-05, "loss": 0.0332, "step": 16720 }, { "epoch": 81.70835875533862, "grad_norm": 1.3553998470306396, "learning_rate": 9.088524590163935e-05, "loss": 0.0358, "step": 16740 }, { "epoch": 81.80597925564369, "grad_norm": 1.5116016864776611, "learning_rate": 9.075409836065574e-05, "loss": 0.0344, "step": 16760 }, { "epoch": 81.90359975594875, "grad_norm": 1.9211640357971191, "learning_rate": 9.062295081967214e-05, "loss": 0.0371, "step": 16780 }, { "epoch": 82.00122025625382, "grad_norm": 3.2523958683013916, "learning_rate": 9.049180327868852e-05, "loss": 0.0333, "step": 16800 }, { "epoch": 82.09884075655887, "grad_norm": 1.5402885675430298, "learning_rate": 9.036065573770492e-05, "loss": 0.0314, "step": 16820 }, { "epoch": 82.19646125686394, "grad_norm": 1.5037944316864014, "learning_rate": 9.022950819672131e-05, "loss": 0.029, "step": 16840 }, { "epoch": 82.29408175716901, "grad_norm": 2.7046449184417725, "learning_rate": 9.009836065573771e-05, "loss": 0.031, "step": 16860 }, { "epoch": 82.39170225747407, "grad_norm": 2.5004217624664307, "learning_rate": 8.99672131147541e-05, "loss": 0.0318, "step": 16880 }, { "epoch": 82.48932275777914, "grad_norm": 2.3502180576324463, "learning_rate": 8.98360655737705e-05, "loss": 0.0335, "step": 16900 }, { "epoch": 82.5869432580842, "grad_norm": 1.3338574171066284, "learning_rate": 8.97049180327869e-05, "loss": 0.0346, "step": 16920 }, { "epoch": 82.68456375838926, "grad_norm": 1.4850441217422485, "learning_rate": 8.957377049180328e-05, "loss": 0.0355, "step": 16940 }, { "epoch": 82.78218425869433, "grad_norm": 1.3196766376495361, "learning_rate": 8.944262295081967e-05, "loss": 0.0354, "step": 16960 }, { "epoch": 82.87980475899938, "grad_norm": 1.2028127908706665, "learning_rate": 8.931147540983606e-05, "loss": 0.0377, "step": 16980 }, { "epoch": 82.97742525930445, "grad_norm": 1.5491008758544922, "learning_rate": 8.918032786885247e-05, "loss": 0.0325, "step": 17000 }, { "epoch": 83.07504575960952, "grad_norm": 2.5490734577178955, "learning_rate": 8.904918032786886e-05, "loss": 0.0289, "step": 17020 }, { "epoch": 83.17266625991458, "grad_norm": 4.350809097290039, "learning_rate": 8.891803278688526e-05, "loss": 0.0296, "step": 17040 }, { "epoch": 83.27028676021965, "grad_norm": 1.5075066089630127, "learning_rate": 8.878688524590163e-05, "loss": 0.0313, "step": 17060 }, { "epoch": 83.36790726052472, "grad_norm": 1.4814287424087524, "learning_rate": 8.865573770491804e-05, "loss": 0.0269, "step": 17080 }, { "epoch": 83.46552776082977, "grad_norm": 1.050759196281433, "learning_rate": 8.852459016393443e-05, "loss": 0.0326, "step": 17100 }, { "epoch": 83.56314826113484, "grad_norm": 1.5208618640899658, "learning_rate": 8.839344262295083e-05, "loss": 0.0329, "step": 17120 }, { "epoch": 83.6607687614399, "grad_norm": 1.6846823692321777, "learning_rate": 8.826229508196722e-05, "loss": 0.031, "step": 17140 }, { "epoch": 83.75838926174497, "grad_norm": 1.9100298881530762, "learning_rate": 8.813114754098362e-05, "loss": 0.0364, "step": 17160 }, { "epoch": 83.85600976205004, "grad_norm": 1.557652235031128, "learning_rate": 8.800000000000001e-05, "loss": 0.0367, "step": 17180 }, { "epoch": 83.95363026235509, "grad_norm": 1.104952335357666, "learning_rate": 8.786885245901639e-05, "loss": 0.0355, "step": 17200 }, { "epoch": 84.05125076266016, "grad_norm": 2.2244253158569336, "learning_rate": 8.773770491803279e-05, "loss": 0.0308, "step": 17220 }, { "epoch": 84.14887126296523, "grad_norm": 2.745600938796997, "learning_rate": 8.760655737704918e-05, "loss": 0.0284, "step": 17240 }, { "epoch": 84.24649176327028, "grad_norm": 1.7342115640640259, "learning_rate": 8.747540983606558e-05, "loss": 0.0305, "step": 17260 }, { "epoch": 84.34411226357535, "grad_norm": 0.972453773021698, "learning_rate": 8.734426229508197e-05, "loss": 0.0313, "step": 17280 }, { "epoch": 84.44173276388041, "grad_norm": 1.6197409629821777, "learning_rate": 8.721311475409837e-05, "loss": 0.0338, "step": 17300 }, { "epoch": 84.53935326418548, "grad_norm": 1.2944082021713257, "learning_rate": 8.708196721311475e-05, "loss": 0.0365, "step": 17320 }, { "epoch": 84.63697376449055, "grad_norm": 2.3329808712005615, "learning_rate": 8.695081967213115e-05, "loss": 0.0319, "step": 17340 }, { "epoch": 84.7345942647956, "grad_norm": 2.8675897121429443, "learning_rate": 8.681967213114754e-05, "loss": 0.0325, "step": 17360 }, { "epoch": 84.83221476510067, "grad_norm": 2.0623087882995605, "learning_rate": 8.668852459016393e-05, "loss": 0.0335, "step": 17380 }, { "epoch": 84.92983526540573, "grad_norm": 1.3979312181472778, "learning_rate": 8.655737704918033e-05, "loss": 0.0332, "step": 17400 }, { "epoch": 85.0274557657108, "grad_norm": 1.630370855331421, "learning_rate": 8.642622950819672e-05, "loss": 0.0328, "step": 17420 }, { "epoch": 85.12507626601587, "grad_norm": 1.1962217092514038, "learning_rate": 8.629508196721313e-05, "loss": 0.0317, "step": 17440 }, { "epoch": 85.22269676632092, "grad_norm": 1.3756200075149536, "learning_rate": 8.61639344262295e-05, "loss": 0.0247, "step": 17460 }, { "epoch": 85.32031726662599, "grad_norm": 1.2209444046020508, "learning_rate": 8.60327868852459e-05, "loss": 0.0306, "step": 17480 }, { "epoch": 85.41793776693106, "grad_norm": 2.080512046813965, "learning_rate": 8.59016393442623e-05, "loss": 0.029, "step": 17500 }, { "epoch": 85.51555826723612, "grad_norm": 1.3376110792160034, "learning_rate": 8.57704918032787e-05, "loss": 0.0302, "step": 17520 }, { "epoch": 85.61317876754119, "grad_norm": 1.5291906595230103, "learning_rate": 8.563934426229509e-05, "loss": 0.0342, "step": 17540 }, { "epoch": 85.71079926784624, "grad_norm": 0.9164462089538574, "learning_rate": 8.550819672131149e-05, "loss": 0.0314, "step": 17560 }, { "epoch": 85.80841976815131, "grad_norm": 1.3751301765441895, "learning_rate": 8.537704918032787e-05, "loss": 0.0342, "step": 17580 }, { "epoch": 85.90604026845638, "grad_norm": 1.4068491458892822, "learning_rate": 8.524590163934426e-05, "loss": 0.0353, "step": 17600 }, { "epoch": 86.00366076876143, "grad_norm": 1.103683590888977, "learning_rate": 8.511475409836066e-05, "loss": 0.031, "step": 17620 }, { "epoch": 86.1012812690665, "grad_norm": 1.2004616260528564, "learning_rate": 8.498360655737705e-05, "loss": 0.0249, "step": 17640 }, { "epoch": 86.19890176937157, "grad_norm": 1.8739843368530273, "learning_rate": 8.485245901639345e-05, "loss": 0.0271, "step": 17660 }, { "epoch": 86.29652226967663, "grad_norm": 0.8995428085327148, "learning_rate": 8.472131147540984e-05, "loss": 0.0286, "step": 17680 }, { "epoch": 86.3941427699817, "grad_norm": 2.4829764366149902, "learning_rate": 8.459016393442624e-05, "loss": 0.0291, "step": 17700 }, { "epoch": 86.49176327028675, "grad_norm": 1.098775863647461, "learning_rate": 8.445901639344262e-05, "loss": 0.033, "step": 17720 }, { "epoch": 86.58938377059182, "grad_norm": 1.3387798070907593, "learning_rate": 8.432786885245902e-05, "loss": 0.0297, "step": 17740 }, { "epoch": 86.68700427089689, "grad_norm": 2.0622024536132812, "learning_rate": 8.419672131147541e-05, "loss": 0.034, "step": 17760 }, { "epoch": 86.78462477120195, "grad_norm": 1.90251624584198, "learning_rate": 8.406557377049181e-05, "loss": 0.0314, "step": 17780 }, { "epoch": 86.88224527150702, "grad_norm": 0.5546866059303284, "learning_rate": 8.39344262295082e-05, "loss": 0.0315, "step": 17800 }, { "epoch": 86.97986577181209, "grad_norm": 1.1995351314544678, "learning_rate": 8.380327868852459e-05, "loss": 0.0357, "step": 17820 }, { "epoch": 87.07748627211714, "grad_norm": 1.374808430671692, "learning_rate": 8.367213114754098e-05, "loss": 0.0274, "step": 17840 }, { "epoch": 87.17510677242221, "grad_norm": 1.2104483842849731, "learning_rate": 8.354098360655737e-05, "loss": 0.0261, "step": 17860 }, { "epoch": 87.27272727272727, "grad_norm": 1.2082188129425049, "learning_rate": 8.340983606557377e-05, "loss": 0.0327, "step": 17880 }, { "epoch": 87.37034777303234, "grad_norm": 1.6042877435684204, "learning_rate": 8.327868852459016e-05, "loss": 0.0298, "step": 17900 }, { "epoch": 87.4679682733374, "grad_norm": 0.9819115400314331, "learning_rate": 8.314754098360657e-05, "loss": 0.0289, "step": 17920 }, { "epoch": 87.56558877364246, "grad_norm": 0.9918608665466309, "learning_rate": 8.301639344262296e-05, "loss": 0.0284, "step": 17940 }, { "epoch": 87.66320927394753, "grad_norm": 0.5345699191093445, "learning_rate": 8.288524590163935e-05, "loss": 0.0304, "step": 17960 }, { "epoch": 87.7608297742526, "grad_norm": 1.5290710926055908, "learning_rate": 8.275409836065573e-05, "loss": 0.0313, "step": 17980 }, { "epoch": 87.85845027455765, "grad_norm": 2.701918363571167, "learning_rate": 8.262295081967214e-05, "loss": 0.0319, "step": 18000 }, { "epoch": 87.95607077486272, "grad_norm": 1.9246459007263184, "learning_rate": 8.249180327868853e-05, "loss": 0.0332, "step": 18020 }, { "epoch": 88.05369127516778, "grad_norm": 1.6299734115600586, "learning_rate": 8.236065573770492e-05, "loss": 0.0278, "step": 18040 }, { "epoch": 88.15131177547285, "grad_norm": 1.8780626058578491, "learning_rate": 8.222950819672132e-05, "loss": 0.0303, "step": 18060 }, { "epoch": 88.24893227577792, "grad_norm": 1.2770717144012451, "learning_rate": 8.209836065573771e-05, "loss": 0.0286, "step": 18080 }, { "epoch": 88.34655277608297, "grad_norm": 2.3574554920196533, "learning_rate": 8.19672131147541e-05, "loss": 0.0245, "step": 18100 }, { "epoch": 88.44417327638804, "grad_norm": 1.6058595180511475, "learning_rate": 8.183606557377049e-05, "loss": 0.0322, "step": 18120 }, { "epoch": 88.54179377669311, "grad_norm": 2.50612211227417, "learning_rate": 8.170491803278689e-05, "loss": 0.0313, "step": 18140 }, { "epoch": 88.63941427699817, "grad_norm": 1.8343908786773682, "learning_rate": 8.157377049180328e-05, "loss": 0.0314, "step": 18160 }, { "epoch": 88.73703477730324, "grad_norm": 1.3419734239578247, "learning_rate": 8.144262295081968e-05, "loss": 0.0306, "step": 18180 }, { "epoch": 88.83465527760829, "grad_norm": 0.9935535192489624, "learning_rate": 8.131147540983607e-05, "loss": 0.0319, "step": 18200 }, { "epoch": 88.93227577791336, "grad_norm": 1.108636498451233, "learning_rate": 8.118032786885246e-05, "loss": 0.0316, "step": 18220 }, { "epoch": 89.02989627821843, "grad_norm": 0.9274991154670715, "learning_rate": 8.104918032786885e-05, "loss": 0.0299, "step": 18240 }, { "epoch": 89.12751677852349, "grad_norm": 2.000669002532959, "learning_rate": 8.091803278688524e-05, "loss": 0.0251, "step": 18260 }, { "epoch": 89.22513727882856, "grad_norm": 0.8616065382957458, "learning_rate": 8.078688524590164e-05, "loss": 0.0283, "step": 18280 }, { "epoch": 89.32275777913362, "grad_norm": 0.7986624836921692, "learning_rate": 8.065573770491803e-05, "loss": 0.0263, "step": 18300 }, { "epoch": 89.42037827943868, "grad_norm": 1.0566186904907227, "learning_rate": 8.052459016393444e-05, "loss": 0.0282, "step": 18320 }, { "epoch": 89.51799877974375, "grad_norm": 2.450927734375, "learning_rate": 8.039344262295082e-05, "loss": 0.0304, "step": 18340 }, { "epoch": 89.6156192800488, "grad_norm": 1.190073847770691, "learning_rate": 8.026229508196721e-05, "loss": 0.0338, "step": 18360 }, { "epoch": 89.71323978035387, "grad_norm": 0.7991436719894409, "learning_rate": 8.01311475409836e-05, "loss": 0.0279, "step": 18380 }, { "epoch": 89.81086028065894, "grad_norm": 1.010593295097351, "learning_rate": 8e-05, "loss": 0.0321, "step": 18400 }, { "epoch": 89.908480780964, "grad_norm": 1.585942029953003, "learning_rate": 7.98688524590164e-05, "loss": 0.0308, "step": 18420 }, { "epoch": 90.00610128126907, "grad_norm": 0.8515540957450867, "learning_rate": 7.97377049180328e-05, "loss": 0.0336, "step": 18440 }, { "epoch": 90.10372178157414, "grad_norm": 0.7114633917808533, "learning_rate": 7.960655737704919e-05, "loss": 0.0245, "step": 18460 }, { "epoch": 90.20134228187919, "grad_norm": 1.962902545928955, "learning_rate": 7.947540983606558e-05, "loss": 0.0277, "step": 18480 }, { "epoch": 90.29896278218426, "grad_norm": 1.3741369247436523, "learning_rate": 7.934426229508197e-05, "loss": 0.0266, "step": 18500 }, { "epoch": 90.39658328248932, "grad_norm": 1.7575631141662598, "learning_rate": 7.921311475409836e-05, "loss": 0.0251, "step": 18520 }, { "epoch": 90.49420378279439, "grad_norm": 1.7947065830230713, "learning_rate": 7.908196721311476e-05, "loss": 0.0268, "step": 18540 }, { "epoch": 90.59182428309946, "grad_norm": 1.5639177560806274, "learning_rate": 7.895081967213115e-05, "loss": 0.0276, "step": 18560 }, { "epoch": 90.68944478340451, "grad_norm": 1.936145544052124, "learning_rate": 7.881967213114755e-05, "loss": 0.0285, "step": 18580 }, { "epoch": 90.78706528370958, "grad_norm": 1.1903581619262695, "learning_rate": 7.868852459016394e-05, "loss": 0.0295, "step": 18600 }, { "epoch": 90.88468578401465, "grad_norm": 1.4474736452102661, "learning_rate": 7.855737704918033e-05, "loss": 0.0304, "step": 18620 }, { "epoch": 90.9823062843197, "grad_norm": 1.0843713283538818, "learning_rate": 7.842622950819672e-05, "loss": 0.0326, "step": 18640 }, { "epoch": 91.07992678462477, "grad_norm": 1.3280309438705444, "learning_rate": 7.829508196721311e-05, "loss": 0.0251, "step": 18660 }, { "epoch": 91.17754728492983, "grad_norm": 1.297050952911377, "learning_rate": 7.816393442622951e-05, "loss": 0.0223, "step": 18680 }, { "epoch": 91.2751677852349, "grad_norm": 0.6845425367355347, "learning_rate": 7.80327868852459e-05, "loss": 0.0248, "step": 18700 }, { "epoch": 91.37278828553997, "grad_norm": 0.9885107278823853, "learning_rate": 7.79016393442623e-05, "loss": 0.0272, "step": 18720 }, { "epoch": 91.47040878584502, "grad_norm": 1.4001471996307373, "learning_rate": 7.77704918032787e-05, "loss": 0.0279, "step": 18740 }, { "epoch": 91.5680292861501, "grad_norm": 1.0209612846374512, "learning_rate": 7.763934426229508e-05, "loss": 0.0282, "step": 18760 }, { "epoch": 91.66564978645516, "grad_norm": 0.8737853765487671, "learning_rate": 7.750819672131147e-05, "loss": 0.0278, "step": 18780 }, { "epoch": 91.76327028676022, "grad_norm": 2.5949928760528564, "learning_rate": 7.737704918032788e-05, "loss": 0.031, "step": 18800 }, { "epoch": 91.86089078706529, "grad_norm": 1.274043083190918, "learning_rate": 7.724590163934426e-05, "loss": 0.03, "step": 18820 }, { "epoch": 91.95851128737034, "grad_norm": 1.286889672279358, "learning_rate": 7.711475409836067e-05, "loss": 0.0322, "step": 18840 }, { "epoch": 92.05613178767541, "grad_norm": 1.144182801246643, "learning_rate": 7.698360655737706e-05, "loss": 0.0285, "step": 18860 }, { "epoch": 92.15375228798048, "grad_norm": 0.7334594130516052, "learning_rate": 7.685245901639345e-05, "loss": 0.0239, "step": 18880 }, { "epoch": 92.25137278828554, "grad_norm": 0.7854740619659424, "learning_rate": 7.672131147540984e-05, "loss": 0.0246, "step": 18900 }, { "epoch": 92.3489932885906, "grad_norm": 1.5289876461029053, "learning_rate": 7.659016393442622e-05, "loss": 0.0287, "step": 18920 }, { "epoch": 92.44661378889568, "grad_norm": 1.1184567213058472, "learning_rate": 7.645901639344263e-05, "loss": 0.0255, "step": 18940 }, { "epoch": 92.54423428920073, "grad_norm": 1.2199037075042725, "learning_rate": 7.632786885245902e-05, "loss": 0.0296, "step": 18960 }, { "epoch": 92.6418547895058, "grad_norm": 1.8938370943069458, "learning_rate": 7.619672131147542e-05, "loss": 0.0269, "step": 18980 }, { "epoch": 92.73947528981085, "grad_norm": 1.4412243366241455, "learning_rate": 7.606557377049181e-05, "loss": 0.0288, "step": 19000 }, { "epoch": 92.83709579011592, "grad_norm": 1.6484626531600952, "learning_rate": 7.59344262295082e-05, "loss": 0.0285, "step": 19020 }, { "epoch": 92.934716290421, "grad_norm": 1.5925848484039307, "learning_rate": 7.580327868852459e-05, "loss": 0.0308, "step": 19040 }, { "epoch": 93.03233679072605, "grad_norm": 3.0019991397857666, "learning_rate": 7.567213114754099e-05, "loss": 0.0326, "step": 19060 }, { "epoch": 93.12995729103112, "grad_norm": 1.6370418071746826, "learning_rate": 7.554098360655738e-05, "loss": 0.0242, "step": 19080 }, { "epoch": 93.22757779133617, "grad_norm": 1.029890537261963, "learning_rate": 7.540983606557377e-05, "loss": 0.0268, "step": 19100 }, { "epoch": 93.32519829164124, "grad_norm": 0.983168363571167, "learning_rate": 7.527868852459017e-05, "loss": 0.025, "step": 19120 }, { "epoch": 93.42281879194631, "grad_norm": 0.7974419593811035, "learning_rate": 7.514754098360656e-05, "loss": 0.0278, "step": 19140 }, { "epoch": 93.52043929225137, "grad_norm": 1.0815564393997192, "learning_rate": 7.501639344262295e-05, "loss": 0.0256, "step": 19160 }, { "epoch": 93.61805979255644, "grad_norm": 1.217862844467163, "learning_rate": 7.488524590163934e-05, "loss": 0.0278, "step": 19180 }, { "epoch": 93.7156802928615, "grad_norm": 1.0961949825286865, "learning_rate": 7.475409836065574e-05, "loss": 0.0259, "step": 19200 }, { "epoch": 93.81330079316656, "grad_norm": 0.7110977172851562, "learning_rate": 7.462295081967213e-05, "loss": 0.0282, "step": 19220 }, { "epoch": 93.91092129347163, "grad_norm": 1.6820802688598633, "learning_rate": 7.449180327868854e-05, "loss": 0.0277, "step": 19240 }, { "epoch": 94.00854179377669, "grad_norm": 1.1288400888442993, "learning_rate": 7.436065573770493e-05, "loss": 0.0288, "step": 19260 }, { "epoch": 94.10616229408176, "grad_norm": 1.01132071018219, "learning_rate": 7.422950819672131e-05, "loss": 0.0224, "step": 19280 }, { "epoch": 94.20378279438683, "grad_norm": 0.8945777416229248, "learning_rate": 7.40983606557377e-05, "loss": 0.0254, "step": 19300 }, { "epoch": 94.30140329469188, "grad_norm": 0.9037290811538696, "learning_rate": 7.39672131147541e-05, "loss": 0.026, "step": 19320 }, { "epoch": 94.39902379499695, "grad_norm": 1.0359045267105103, "learning_rate": 7.38360655737705e-05, "loss": 0.0236, "step": 19340 }, { "epoch": 94.49664429530202, "grad_norm": 1.0174087285995483, "learning_rate": 7.370491803278689e-05, "loss": 0.0259, "step": 19360 }, { "epoch": 94.59426479560707, "grad_norm": 1.7219215631484985, "learning_rate": 7.357377049180329e-05, "loss": 0.0256, "step": 19380 }, { "epoch": 94.69188529591214, "grad_norm": 1.2760013341903687, "learning_rate": 7.344262295081968e-05, "loss": 0.0268, "step": 19400 }, { "epoch": 94.7895057962172, "grad_norm": 1.4698799848556519, "learning_rate": 7.331147540983607e-05, "loss": 0.0301, "step": 19420 }, { "epoch": 94.88712629652227, "grad_norm": 1.4160206317901611, "learning_rate": 7.318032786885246e-05, "loss": 0.0288, "step": 19440 }, { "epoch": 94.98474679682734, "grad_norm": 1.3946982622146606, "learning_rate": 7.304918032786886e-05, "loss": 0.0295, "step": 19460 }, { "epoch": 95.0823672971324, "grad_norm": 1.3507988452911377, "learning_rate": 7.291803278688525e-05, "loss": 0.0216, "step": 19480 }, { "epoch": 95.17998779743746, "grad_norm": 1.090182900428772, "learning_rate": 7.278688524590165e-05, "loss": 0.0246, "step": 19500 }, { "epoch": 95.27760829774253, "grad_norm": 0.942014217376709, "learning_rate": 7.265573770491804e-05, "loss": 0.0257, "step": 19520 }, { "epoch": 95.37522879804759, "grad_norm": 1.1440125703811646, "learning_rate": 7.252459016393443e-05, "loss": 0.0247, "step": 19540 }, { "epoch": 95.47284929835266, "grad_norm": 1.1401231288909912, "learning_rate": 7.239344262295082e-05, "loss": 0.0249, "step": 19560 }, { "epoch": 95.57046979865771, "grad_norm": 1.6751883029937744, "learning_rate": 7.226229508196721e-05, "loss": 0.0302, "step": 19580 }, { "epoch": 95.66809029896278, "grad_norm": 0.9582850337028503, "learning_rate": 7.213114754098361e-05, "loss": 0.0252, "step": 19600 }, { "epoch": 95.76571079926785, "grad_norm": 0.8516545295715332, "learning_rate": 7.2e-05, "loss": 0.0246, "step": 19620 }, { "epoch": 95.8633312995729, "grad_norm": 1.0861225128173828, "learning_rate": 7.18688524590164e-05, "loss": 0.0288, "step": 19640 }, { "epoch": 95.96095179987798, "grad_norm": 1.4560004472732544, "learning_rate": 7.17377049180328e-05, "loss": 0.0297, "step": 19660 }, { "epoch": 96.05857230018304, "grad_norm": 1.5447992086410522, "learning_rate": 7.160655737704918e-05, "loss": 0.024, "step": 19680 }, { "epoch": 96.1561928004881, "grad_norm": 1.0061029195785522, "learning_rate": 7.147540983606557e-05, "loss": 0.0231, "step": 19700 }, { "epoch": 96.25381330079317, "grad_norm": 0.918874979019165, "learning_rate": 7.134426229508198e-05, "loss": 0.0213, "step": 19720 }, { "epoch": 96.35143380109822, "grad_norm": 1.3430179357528687, "learning_rate": 7.121311475409837e-05, "loss": 0.0278, "step": 19740 }, { "epoch": 96.4490543014033, "grad_norm": 1.0082734823226929, "learning_rate": 7.108196721311475e-05, "loss": 0.0267, "step": 19760 }, { "epoch": 96.54667480170836, "grad_norm": 1.480941653251648, "learning_rate": 7.095081967213116e-05, "loss": 0.0261, "step": 19780 }, { "epoch": 96.64429530201342, "grad_norm": 1.3514058589935303, "learning_rate": 7.081967213114755e-05, "loss": 0.0243, "step": 19800 }, { "epoch": 96.74191580231849, "grad_norm": 1.8436918258666992, "learning_rate": 7.068852459016394e-05, "loss": 0.0233, "step": 19820 }, { "epoch": 96.83953630262356, "grad_norm": 0.7598877549171448, "learning_rate": 7.055737704918033e-05, "loss": 0.0273, "step": 19840 }, { "epoch": 96.93715680292861, "grad_norm": 1.1681586503982544, "learning_rate": 7.042622950819673e-05, "loss": 0.0272, "step": 19860 }, { "epoch": 97.03477730323368, "grad_norm": 2.10929012298584, "learning_rate": 7.029508196721312e-05, "loss": 0.0249, "step": 19880 }, { "epoch": 97.13239780353874, "grad_norm": 1.3854628801345825, "learning_rate": 7.016393442622952e-05, "loss": 0.0222, "step": 19900 }, { "epoch": 97.2300183038438, "grad_norm": 0.7279977798461914, "learning_rate": 7.003278688524591e-05, "loss": 0.0234, "step": 19920 }, { "epoch": 97.32763880414888, "grad_norm": 0.9051561951637268, "learning_rate": 6.99016393442623e-05, "loss": 0.0241, "step": 19940 }, { "epoch": 97.42525930445393, "grad_norm": 0.7423291802406311, "learning_rate": 6.977049180327869e-05, "loss": 0.0234, "step": 19960 }, { "epoch": 97.522879804759, "grad_norm": 1.4373456239700317, "learning_rate": 6.963934426229508e-05, "loss": 0.0253, "step": 19980 }, { "epoch": 97.62050030506407, "grad_norm": 0.6892008781433105, "learning_rate": 6.950819672131148e-05, "loss": 0.0247, "step": 20000 }, { "epoch": 97.71812080536913, "grad_norm": 1.0047869682312012, "learning_rate": 6.937704918032787e-05, "loss": 0.0249, "step": 20020 }, { "epoch": 97.8157413056742, "grad_norm": 1.460539698600769, "learning_rate": 6.924590163934427e-05, "loss": 0.0276, "step": 20040 }, { "epoch": 97.91336180597925, "grad_norm": 0.9892900586128235, "learning_rate": 6.911475409836066e-05, "loss": 0.0274, "step": 20060 }, { "epoch": 98.01098230628432, "grad_norm": 1.0830744504928589, "learning_rate": 6.898360655737705e-05, "loss": 0.0264, "step": 20080 }, { "epoch": 98.10860280658939, "grad_norm": 1.9523261785507202, "learning_rate": 6.885245901639344e-05, "loss": 0.0239, "step": 20100 }, { "epoch": 98.20622330689444, "grad_norm": 1.0463730096817017, "learning_rate": 6.872131147540984e-05, "loss": 0.0246, "step": 20120 }, { "epoch": 98.30384380719951, "grad_norm": 0.9709805250167847, "learning_rate": 6.859016393442623e-05, "loss": 0.0219, "step": 20140 }, { "epoch": 98.40146430750458, "grad_norm": 1.2519688606262207, "learning_rate": 6.845901639344262e-05, "loss": 0.0265, "step": 20160 }, { "epoch": 98.49908480780964, "grad_norm": 0.8213618993759155, "learning_rate": 6.832786885245903e-05, "loss": 0.0233, "step": 20180 }, { "epoch": 98.59670530811471, "grad_norm": 1.1715772151947021, "learning_rate": 6.819672131147542e-05, "loss": 0.0218, "step": 20200 }, { "epoch": 98.69432580841976, "grad_norm": 0.843437671661377, "learning_rate": 6.80655737704918e-05, "loss": 0.0272, "step": 20220 }, { "epoch": 98.79194630872483, "grad_norm": 1.089414358139038, "learning_rate": 6.79344262295082e-05, "loss": 0.0277, "step": 20240 }, { "epoch": 98.8895668090299, "grad_norm": 1.1498339176177979, "learning_rate": 6.78032786885246e-05, "loss": 0.0262, "step": 20260 }, { "epoch": 98.98718730933496, "grad_norm": 0.7882099747657776, "learning_rate": 6.767213114754099e-05, "loss": 0.0261, "step": 20280 }, { "epoch": 99.08480780964003, "grad_norm": 2.353572368621826, "learning_rate": 6.754098360655739e-05, "loss": 0.0232, "step": 20300 }, { "epoch": 99.1824283099451, "grad_norm": 1.716091513633728, "learning_rate": 6.740983606557378e-05, "loss": 0.0232, "step": 20320 }, { "epoch": 99.28004881025015, "grad_norm": 1.1512521505355835, "learning_rate": 6.727868852459017e-05, "loss": 0.0216, "step": 20340 }, { "epoch": 99.37766931055522, "grad_norm": 1.532551884651184, "learning_rate": 6.714754098360656e-05, "loss": 0.0229, "step": 20360 }, { "epoch": 99.47528981086027, "grad_norm": 1.1673088073730469, "learning_rate": 6.701639344262295e-05, "loss": 0.0225, "step": 20380 }, { "epoch": 99.57291031116534, "grad_norm": 1.0088196992874146, "learning_rate": 6.688524590163935e-05, "loss": 0.0265, "step": 20400 }, { "epoch": 99.67053081147041, "grad_norm": 1.236024260520935, "learning_rate": 6.675409836065574e-05, "loss": 0.0245, "step": 20420 }, { "epoch": 99.76815131177547, "grad_norm": 1.9302829504013062, "learning_rate": 6.662295081967214e-05, "loss": 0.0243, "step": 20440 }, { "epoch": 99.86577181208054, "grad_norm": 0.8187095522880554, "learning_rate": 6.649180327868853e-05, "loss": 0.0241, "step": 20460 }, { "epoch": 99.96339231238561, "grad_norm": 1.3263179063796997, "learning_rate": 6.636065573770492e-05, "loss": 0.0274, "step": 20480 }, { "epoch": 100.06101281269066, "grad_norm": 1.1137028932571411, "learning_rate": 6.622950819672131e-05, "loss": 0.0221, "step": 20500 }, { "epoch": 100.15863331299573, "grad_norm": 1.171851396560669, "learning_rate": 6.609836065573771e-05, "loss": 0.0193, "step": 20520 }, { "epoch": 100.25625381330079, "grad_norm": 1.50115966796875, "learning_rate": 6.59672131147541e-05, "loss": 0.0217, "step": 20540 }, { "epoch": 100.35387431360586, "grad_norm": 0.9678937792778015, "learning_rate": 6.58360655737705e-05, "loss": 0.0241, "step": 20560 }, { "epoch": 100.45149481391093, "grad_norm": 1.0585274696350098, "learning_rate": 6.57049180327869e-05, "loss": 0.0257, "step": 20580 }, { "epoch": 100.54911531421598, "grad_norm": 0.9907383322715759, "learning_rate": 6.557377049180327e-05, "loss": 0.0238, "step": 20600 }, { "epoch": 100.64673581452105, "grad_norm": 1.7532027959823608, "learning_rate": 6.544262295081967e-05, "loss": 0.0245, "step": 20620 }, { "epoch": 100.74435631482612, "grad_norm": 1.7263871431350708, "learning_rate": 6.531147540983606e-05, "loss": 0.0274, "step": 20640 }, { "epoch": 100.84197681513118, "grad_norm": 0.698143482208252, "learning_rate": 6.518032786885247e-05, "loss": 0.0247, "step": 20660 }, { "epoch": 100.93959731543625, "grad_norm": 0.9223474860191345, "learning_rate": 6.504918032786886e-05, "loss": 0.0236, "step": 20680 }, { "epoch": 101.0372178157413, "grad_norm": 2.3332505226135254, "learning_rate": 6.491803278688526e-05, "loss": 0.0278, "step": 20700 }, { "epoch": 101.13483831604637, "grad_norm": 1.0863476991653442, "learning_rate": 6.478688524590165e-05, "loss": 0.0218, "step": 20720 }, { "epoch": 101.23245881635144, "grad_norm": 1.3872435092926025, "learning_rate": 6.465573770491804e-05, "loss": 0.0213, "step": 20740 }, { "epoch": 101.3300793166565, "grad_norm": 1.7096153497695923, "learning_rate": 6.452459016393443e-05, "loss": 0.0224, "step": 20760 }, { "epoch": 101.42769981696156, "grad_norm": 1.117743968963623, "learning_rate": 6.439344262295083e-05, "loss": 0.022, "step": 20780 }, { "epoch": 101.52532031726662, "grad_norm": 0.7065951824188232, "learning_rate": 6.426229508196722e-05, "loss": 0.023, "step": 20800 }, { "epoch": 101.62294081757169, "grad_norm": 1.945495843887329, "learning_rate": 6.413114754098361e-05, "loss": 0.024, "step": 20820 }, { "epoch": 101.72056131787676, "grad_norm": 2.141911268234253, "learning_rate": 6.400000000000001e-05, "loss": 0.0243, "step": 20840 }, { "epoch": 101.81818181818181, "grad_norm": 1.0459177494049072, "learning_rate": 6.386885245901639e-05, "loss": 0.0261, "step": 20860 }, { "epoch": 101.91580231848688, "grad_norm": 1.3487838506698608, "learning_rate": 6.373770491803279e-05, "loss": 0.0235, "step": 20880 }, { "epoch": 102.01342281879195, "grad_norm": 0.9669828414916992, "learning_rate": 6.360655737704918e-05, "loss": 0.0262, "step": 20900 }, { "epoch": 102.111043319097, "grad_norm": 0.8470537662506104, "learning_rate": 6.347540983606558e-05, "loss": 0.0209, "step": 20920 }, { "epoch": 102.20866381940208, "grad_norm": 0.879011332988739, "learning_rate": 6.334426229508197e-05, "loss": 0.0203, "step": 20940 }, { "epoch": 102.30628431970713, "grad_norm": 1.5658769607543945, "learning_rate": 6.321311475409837e-05, "loss": 0.0215, "step": 20960 }, { "epoch": 102.4039048200122, "grad_norm": 0.6140362620353699, "learning_rate": 6.308196721311475e-05, "loss": 0.0219, "step": 20980 }, { "epoch": 102.50152532031727, "grad_norm": 1.0868732929229736, "learning_rate": 6.295081967213115e-05, "loss": 0.0224, "step": 21000 }, { "epoch": 102.59914582062233, "grad_norm": 1.1535019874572754, "learning_rate": 6.281967213114754e-05, "loss": 0.0274, "step": 21020 }, { "epoch": 102.6967663209274, "grad_norm": 1.6501412391662598, "learning_rate": 6.268852459016393e-05, "loss": 0.0239, "step": 21040 }, { "epoch": 102.79438682123246, "grad_norm": 1.5148719549179077, "learning_rate": 6.255737704918033e-05, "loss": 0.0213, "step": 21060 }, { "epoch": 102.89200732153752, "grad_norm": 0.9506848454475403, "learning_rate": 6.242622950819672e-05, "loss": 0.0255, "step": 21080 }, { "epoch": 102.98962782184259, "grad_norm": 1.6493301391601562, "learning_rate": 6.229508196721313e-05, "loss": 0.0271, "step": 21100 }, { "epoch": 103.08724832214764, "grad_norm": 1.0388059616088867, "learning_rate": 6.21639344262295e-05, "loss": 0.0196, "step": 21120 }, { "epoch": 103.18486882245271, "grad_norm": 0.6513581871986389, "learning_rate": 6.20327868852459e-05, "loss": 0.0204, "step": 21140 }, { "epoch": 103.28248932275778, "grad_norm": 0.791163980960846, "learning_rate": 6.19016393442623e-05, "loss": 0.0221, "step": 21160 }, { "epoch": 103.38010982306284, "grad_norm": 1.2468329668045044, "learning_rate": 6.17704918032787e-05, "loss": 0.0198, "step": 21180 }, { "epoch": 103.47773032336791, "grad_norm": 0.9693405032157898, "learning_rate": 6.163934426229509e-05, "loss": 0.0239, "step": 21200 }, { "epoch": 103.57535082367298, "grad_norm": 0.8478600382804871, "learning_rate": 6.150819672131148e-05, "loss": 0.0236, "step": 21220 }, { "epoch": 103.67297132397803, "grad_norm": 1.0516799688339233, "learning_rate": 6.137704918032787e-05, "loss": 0.0236, "step": 21240 }, { "epoch": 103.7705918242831, "grad_norm": 1.1466089487075806, "learning_rate": 6.124590163934426e-05, "loss": 0.0232, "step": 21260 }, { "epoch": 103.86821232458816, "grad_norm": 2.5929698944091797, "learning_rate": 6.111475409836066e-05, "loss": 0.0213, "step": 21280 }, { "epoch": 103.96583282489323, "grad_norm": 0.7544173002243042, "learning_rate": 6.098360655737705e-05, "loss": 0.0242, "step": 21300 }, { "epoch": 104.0634533251983, "grad_norm": 1.013609766960144, "learning_rate": 6.085245901639345e-05, "loss": 0.0217, "step": 21320 }, { "epoch": 104.16107382550335, "grad_norm": 1.933367133140564, "learning_rate": 6.072131147540984e-05, "loss": 0.0205, "step": 21340 }, { "epoch": 104.25869432580842, "grad_norm": 0.9371503591537476, "learning_rate": 6.0590163934426236e-05, "loss": 0.0224, "step": 21360 }, { "epoch": 104.35631482611349, "grad_norm": 0.8268026113510132, "learning_rate": 6.0459016393442625e-05, "loss": 0.0217, "step": 21380 }, { "epoch": 104.45393532641855, "grad_norm": 2.129154682159424, "learning_rate": 6.032786885245902e-05, "loss": 0.019, "step": 21400 }, { "epoch": 104.55155582672361, "grad_norm": 0.7515645623207092, "learning_rate": 6.019672131147541e-05, "loss": 0.0236, "step": 21420 }, { "epoch": 104.64917632702867, "grad_norm": 0.7831740975379944, "learning_rate": 6.00655737704918e-05, "loss": 0.0205, "step": 21440 }, { "epoch": 104.74679682733374, "grad_norm": 1.9743825197219849, "learning_rate": 5.99344262295082e-05, "loss": 0.0232, "step": 21460 }, { "epoch": 104.84441732763881, "grad_norm": 1.7434648275375366, "learning_rate": 5.9803278688524586e-05, "loss": 0.0247, "step": 21480 }, { "epoch": 104.94203782794386, "grad_norm": 1.324744701385498, "learning_rate": 5.967213114754099e-05, "loss": 0.0248, "step": 21500 }, { "epoch": 105.03965832824893, "grad_norm": 0.7877782583236694, "learning_rate": 5.954098360655738e-05, "loss": 0.0218, "step": 21520 }, { "epoch": 105.137278828554, "grad_norm": 1.0976682901382446, "learning_rate": 5.9409836065573774e-05, "loss": 0.0212, "step": 21540 }, { "epoch": 105.23489932885906, "grad_norm": 1.2036750316619873, "learning_rate": 5.927868852459016e-05, "loss": 0.0208, "step": 21560 }, { "epoch": 105.33251982916413, "grad_norm": 1.7270026206970215, "learning_rate": 5.9147540983606566e-05, "loss": 0.0211, "step": 21580 }, { "epoch": 105.43014032946918, "grad_norm": 2.558453321456909, "learning_rate": 5.9016393442622956e-05, "loss": 0.0236, "step": 21600 }, { "epoch": 105.52776082977425, "grad_norm": 1.4443696737289429, "learning_rate": 5.888524590163935e-05, "loss": 0.0228, "step": 21620 }, { "epoch": 105.62538133007932, "grad_norm": 0.9000318050384521, "learning_rate": 5.875409836065574e-05, "loss": 0.0223, "step": 21640 }, { "epoch": 105.72300183038438, "grad_norm": 1.0460553169250488, "learning_rate": 5.862295081967213e-05, "loss": 0.0235, "step": 21660 }, { "epoch": 105.82062233068945, "grad_norm": 1.0731499195098877, "learning_rate": 5.849180327868853e-05, "loss": 0.0249, "step": 21680 }, { "epoch": 105.91824283099452, "grad_norm": 2.9348509311676025, "learning_rate": 5.8360655737704916e-05, "loss": 0.0233, "step": 21700 }, { "epoch": 106.01586333129957, "grad_norm": 0.761085033416748, "learning_rate": 5.822950819672132e-05, "loss": 0.0252, "step": 21720 }, { "epoch": 106.11348383160464, "grad_norm": 0.7206617593765259, "learning_rate": 5.80983606557377e-05, "loss": 0.021, "step": 21740 }, { "epoch": 106.2111043319097, "grad_norm": 0.8830762505531311, "learning_rate": 5.7967213114754104e-05, "loss": 0.0222, "step": 21760 }, { "epoch": 106.30872483221476, "grad_norm": 1.1715725660324097, "learning_rate": 5.7836065573770494e-05, "loss": 0.0211, "step": 21780 }, { "epoch": 106.40634533251983, "grad_norm": 0.6775236129760742, "learning_rate": 5.770491803278689e-05, "loss": 0.0207, "step": 21800 }, { "epoch": 106.50396583282489, "grad_norm": 0.787869930267334, "learning_rate": 5.757377049180328e-05, "loss": 0.0201, "step": 21820 }, { "epoch": 106.60158633312996, "grad_norm": 0.880957305431366, "learning_rate": 5.744262295081968e-05, "loss": 0.0227, "step": 21840 }, { "epoch": 106.69920683343503, "grad_norm": 1.0342841148376465, "learning_rate": 5.731147540983607e-05, "loss": 0.0204, "step": 21860 }, { "epoch": 106.79682733374008, "grad_norm": 1.2203606367111206, "learning_rate": 5.7180327868852454e-05, "loss": 0.0217, "step": 21880 }, { "epoch": 106.89444783404515, "grad_norm": 1.1879390478134155, "learning_rate": 5.704918032786886e-05, "loss": 0.0229, "step": 21900 }, { "epoch": 106.99206833435021, "grad_norm": 1.9363093376159668, "learning_rate": 5.6918032786885246e-05, "loss": 0.0227, "step": 21920 }, { "epoch": 107.08968883465528, "grad_norm": 1.5606794357299805, "learning_rate": 5.678688524590164e-05, "loss": 0.0196, "step": 21940 }, { "epoch": 107.18730933496035, "grad_norm": 1.0373190641403198, "learning_rate": 5.665573770491803e-05, "loss": 0.0218, "step": 21960 }, { "epoch": 107.2849298352654, "grad_norm": 0.7485927939414978, "learning_rate": 5.6524590163934435e-05, "loss": 0.019, "step": 21980 }, { "epoch": 107.38255033557047, "grad_norm": 1.0232642889022827, "learning_rate": 5.639344262295082e-05, "loss": 0.0209, "step": 22000 }, { "epoch": 107.48017083587554, "grad_norm": 0.7585027813911438, "learning_rate": 5.626229508196722e-05, "loss": 0.0228, "step": 22020 }, { "epoch": 107.5777913361806, "grad_norm": 1.87349271774292, "learning_rate": 5.613114754098361e-05, "loss": 0.0226, "step": 22040 }, { "epoch": 107.67541183648567, "grad_norm": 0.7697243094444275, "learning_rate": 5.6000000000000006e-05, "loss": 0.0201, "step": 22060 }, { "epoch": 107.77303233679072, "grad_norm": 0.8757901787757874, "learning_rate": 5.5868852459016395e-05, "loss": 0.0212, "step": 22080 }, { "epoch": 107.87065283709579, "grad_norm": 0.6697644591331482, "learning_rate": 5.5737704918032785e-05, "loss": 0.0214, "step": 22100 }, { "epoch": 107.96827333740086, "grad_norm": 0.9846328496932983, "learning_rate": 5.560655737704919e-05, "loss": 0.0224, "step": 22120 }, { "epoch": 108.06589383770591, "grad_norm": 0.975273609161377, "learning_rate": 5.547540983606557e-05, "loss": 0.0228, "step": 22140 }, { "epoch": 108.16351433801098, "grad_norm": 0.671629011631012, "learning_rate": 5.534426229508197e-05, "loss": 0.0183, "step": 22160 }, { "epoch": 108.26113483831605, "grad_norm": 0.7465048432350159, "learning_rate": 5.521311475409836e-05, "loss": 0.0196, "step": 22180 }, { "epoch": 108.35875533862111, "grad_norm": 0.9785251021385193, "learning_rate": 5.508196721311476e-05, "loss": 0.0234, "step": 22200 }, { "epoch": 108.45637583892618, "grad_norm": 1.2198201417922974, "learning_rate": 5.495081967213115e-05, "loss": 0.0215, "step": 22220 }, { "epoch": 108.55399633923123, "grad_norm": 1.261461853981018, "learning_rate": 5.481967213114755e-05, "loss": 0.0212, "step": 22240 }, { "epoch": 108.6516168395363, "grad_norm": 1.1784855127334595, "learning_rate": 5.4688524590163933e-05, "loss": 0.021, "step": 22260 }, { "epoch": 108.74923733984137, "grad_norm": 1.4498306512832642, "learning_rate": 5.4557377049180336e-05, "loss": 0.0215, "step": 22280 }, { "epoch": 108.84685784014643, "grad_norm": 1.6949442625045776, "learning_rate": 5.4426229508196726e-05, "loss": 0.0214, "step": 22300 }, { "epoch": 108.9444783404515, "grad_norm": 0.8981263637542725, "learning_rate": 5.4295081967213115e-05, "loss": 0.0229, "step": 22320 }, { "epoch": 109.04209884075657, "grad_norm": 0.6954344511032104, "learning_rate": 5.416393442622951e-05, "loss": 0.0203, "step": 22340 }, { "epoch": 109.13971934106162, "grad_norm": 1.6094328165054321, "learning_rate": 5.40327868852459e-05, "loss": 0.0186, "step": 22360 }, { "epoch": 109.23733984136669, "grad_norm": 0.983757734298706, "learning_rate": 5.3901639344262304e-05, "loss": 0.0189, "step": 22380 }, { "epoch": 109.33496034167175, "grad_norm": 0.5290912985801697, "learning_rate": 5.3770491803278686e-05, "loss": 0.0227, "step": 22400 }, { "epoch": 109.43258084197682, "grad_norm": 0.7315634489059448, "learning_rate": 5.363934426229509e-05, "loss": 0.019, "step": 22420 }, { "epoch": 109.53020134228188, "grad_norm": 2.0914411544799805, "learning_rate": 5.350819672131148e-05, "loss": 0.0201, "step": 22440 }, { "epoch": 109.62782184258694, "grad_norm": 3.1637024879455566, "learning_rate": 5.3377049180327875e-05, "loss": 0.0208, "step": 22460 }, { "epoch": 109.72544234289201, "grad_norm": 0.7364550828933716, "learning_rate": 5.3245901639344264e-05, "loss": 0.0224, "step": 22480 }, { "epoch": 109.82306284319706, "grad_norm": 0.9041069746017456, "learning_rate": 5.311475409836065e-05, "loss": 0.0202, "step": 22500 }, { "epoch": 109.92068334350213, "grad_norm": 1.2586231231689453, "learning_rate": 5.298360655737705e-05, "loss": 0.0251, "step": 22520 }, { "epoch": 110.0183038438072, "grad_norm": 0.5133557915687561, "learning_rate": 5.285245901639344e-05, "loss": 0.0206, "step": 22540 }, { "epoch": 110.11592434411226, "grad_norm": 0.9251378178596497, "learning_rate": 5.272131147540984e-05, "loss": 0.017, "step": 22560 }, { "epoch": 110.21354484441733, "grad_norm": 0.8834558129310608, "learning_rate": 5.259016393442623e-05, "loss": 0.0182, "step": 22580 }, { "epoch": 110.3111653447224, "grad_norm": 0.7891305685043335, "learning_rate": 5.245901639344263e-05, "loss": 0.0189, "step": 22600 }, { "epoch": 110.40878584502745, "grad_norm": 1.17208993434906, "learning_rate": 5.2327868852459017e-05, "loss": 0.0208, "step": 22620 }, { "epoch": 110.50640634533252, "grad_norm": 0.9067583084106445, "learning_rate": 5.219672131147541e-05, "loss": 0.0186, "step": 22640 }, { "epoch": 110.60402684563758, "grad_norm": 1.1163853406906128, "learning_rate": 5.20655737704918e-05, "loss": 0.0215, "step": 22660 }, { "epoch": 110.70164734594265, "grad_norm": 1.3030050992965698, "learning_rate": 5.1934426229508205e-05, "loss": 0.0243, "step": 22680 }, { "epoch": 110.79926784624772, "grad_norm": 0.6330472230911255, "learning_rate": 5.1803278688524594e-05, "loss": 0.0215, "step": 22700 }, { "epoch": 110.89688834655277, "grad_norm": 0.7558477520942688, "learning_rate": 5.1672131147540984e-05, "loss": 0.021, "step": 22720 }, { "epoch": 110.99450884685784, "grad_norm": 1.7759897708892822, "learning_rate": 5.154098360655738e-05, "loss": 0.021, "step": 22740 }, { "epoch": 111.09212934716291, "grad_norm": 3.0700387954711914, "learning_rate": 5.140983606557377e-05, "loss": 0.0175, "step": 22760 }, { "epoch": 111.18974984746797, "grad_norm": 0.5508354902267456, "learning_rate": 5.1278688524590165e-05, "loss": 0.0174, "step": 22780 }, { "epoch": 111.28737034777303, "grad_norm": 0.8367868661880493, "learning_rate": 5.1147540983606555e-05, "loss": 0.0188, "step": 22800 }, { "epoch": 111.38499084807809, "grad_norm": 0.8336710333824158, "learning_rate": 5.101639344262296e-05, "loss": 0.0178, "step": 22820 }, { "epoch": 111.48261134838316, "grad_norm": 0.8700584173202515, "learning_rate": 5.088524590163935e-05, "loss": 0.0215, "step": 22840 }, { "epoch": 111.58023184868823, "grad_norm": 1.4950515031814575, "learning_rate": 5.075409836065574e-05, "loss": 0.0198, "step": 22860 }, { "epoch": 111.67785234899328, "grad_norm": 1.4095662832260132, "learning_rate": 5.062295081967213e-05, "loss": 0.0199, "step": 22880 }, { "epoch": 111.77547284929835, "grad_norm": 0.6769533157348633, "learning_rate": 5.049180327868853e-05, "loss": 0.0219, "step": 22900 }, { "epoch": 111.87309334960342, "grad_norm": 0.6953234076499939, "learning_rate": 5.036065573770492e-05, "loss": 0.0214, "step": 22920 }, { "epoch": 111.97071384990848, "grad_norm": 1.0300958156585693, "learning_rate": 5.022950819672131e-05, "loss": 0.0216, "step": 22940 }, { "epoch": 112.06833435021355, "grad_norm": 1.323189377784729, "learning_rate": 5.009836065573771e-05, "loss": 0.0207, "step": 22960 }, { "epoch": 112.1659548505186, "grad_norm": 1.1096831560134888, "learning_rate": 4.99672131147541e-05, "loss": 0.0189, "step": 22980 }, { "epoch": 112.26357535082367, "grad_norm": 0.9971668124198914, "learning_rate": 4.9836065573770496e-05, "loss": 0.0194, "step": 23000 }, { "epoch": 112.36119585112874, "grad_norm": 0.7574054002761841, "learning_rate": 4.970491803278689e-05, "loss": 0.0191, "step": 23020 }, { "epoch": 112.4588163514338, "grad_norm": 0.9944175481796265, "learning_rate": 4.957377049180328e-05, "loss": 0.0183, "step": 23040 }, { "epoch": 112.55643685173887, "grad_norm": 1.417324423789978, "learning_rate": 4.944262295081967e-05, "loss": 0.0203, "step": 23060 }, { "epoch": 112.65405735204394, "grad_norm": 1.0666751861572266, "learning_rate": 4.931147540983607e-05, "loss": 0.0188, "step": 23080 }, { "epoch": 112.75167785234899, "grad_norm": 0.9484390020370483, "learning_rate": 4.918032786885246e-05, "loss": 0.0198, "step": 23100 }, { "epoch": 112.84929835265406, "grad_norm": 0.779443621635437, "learning_rate": 4.904918032786885e-05, "loss": 0.0222, "step": 23120 }, { "epoch": 112.94691885295912, "grad_norm": 0.7193658351898193, "learning_rate": 4.891803278688525e-05, "loss": 0.0193, "step": 23140 }, { "epoch": 113.04453935326418, "grad_norm": 1.551416039466858, "learning_rate": 4.8786885245901645e-05, "loss": 0.0185, "step": 23160 }, { "epoch": 113.14215985356925, "grad_norm": 0.720435380935669, "learning_rate": 4.8655737704918034e-05, "loss": 0.0173, "step": 23180 }, { "epoch": 113.23978035387431, "grad_norm": 1.4221702814102173, "learning_rate": 4.852459016393443e-05, "loss": 0.0166, "step": 23200 }, { "epoch": 113.33740085417938, "grad_norm": 0.6794512271881104, "learning_rate": 4.8393442622950826e-05, "loss": 0.0176, "step": 23220 }, { "epoch": 113.43502135448445, "grad_norm": 1.7901623249053955, "learning_rate": 4.8262295081967216e-05, "loss": 0.0178, "step": 23240 }, { "epoch": 113.5326418547895, "grad_norm": 0.8727052211761475, "learning_rate": 4.8131147540983605e-05, "loss": 0.0206, "step": 23260 }, { "epoch": 113.63026235509457, "grad_norm": 1.1418622732162476, "learning_rate": 4.8e-05, "loss": 0.0197, "step": 23280 }, { "epoch": 113.72788285539963, "grad_norm": 1.1590226888656616, "learning_rate": 4.78688524590164e-05, "loss": 0.0211, "step": 23300 }, { "epoch": 113.8255033557047, "grad_norm": 1.1312569379806519, "learning_rate": 4.773770491803279e-05, "loss": 0.0205, "step": 23320 }, { "epoch": 113.92312385600977, "grad_norm": 0.8305485248565674, "learning_rate": 4.760655737704918e-05, "loss": 0.0213, "step": 23340 }, { "epoch": 114.02074435631482, "grad_norm": 0.4991360902786255, "learning_rate": 4.747540983606558e-05, "loss": 0.0206, "step": 23360 }, { "epoch": 114.11836485661989, "grad_norm": 0.5438668131828308, "learning_rate": 4.734426229508197e-05, "loss": 0.0158, "step": 23380 }, { "epoch": 114.21598535692496, "grad_norm": 1.0973989963531494, "learning_rate": 4.7213114754098365e-05, "loss": 0.0175, "step": 23400 }, { "epoch": 114.31360585723002, "grad_norm": 0.8698163032531738, "learning_rate": 4.708196721311476e-05, "loss": 0.0172, "step": 23420 }, { "epoch": 114.41122635753509, "grad_norm": 0.7283509969711304, "learning_rate": 4.695081967213115e-05, "loss": 0.0188, "step": 23440 }, { "epoch": 114.50884685784014, "grad_norm": 2.213822364807129, "learning_rate": 4.681967213114754e-05, "loss": 0.0195, "step": 23460 }, { "epoch": 114.60646735814521, "grad_norm": 1.0742311477661133, "learning_rate": 4.6688524590163936e-05, "loss": 0.0196, "step": 23480 }, { "epoch": 114.70408785845028, "grad_norm": 1.0777767896652222, "learning_rate": 4.655737704918033e-05, "loss": 0.0199, "step": 23500 }, { "epoch": 114.80170835875533, "grad_norm": 1.4363033771514893, "learning_rate": 4.642622950819672e-05, "loss": 0.0212, "step": 23520 }, { "epoch": 114.8993288590604, "grad_norm": 0.7380108833312988, "learning_rate": 4.629508196721312e-05, "loss": 0.0183, "step": 23540 }, { "epoch": 114.99694935936547, "grad_norm": 0.7476310133934021, "learning_rate": 4.616393442622951e-05, "loss": 0.017, "step": 23560 }, { "epoch": 115.09456985967053, "grad_norm": 0.7546923160552979, "learning_rate": 4.60327868852459e-05, "loss": 0.017, "step": 23580 }, { "epoch": 115.1921903599756, "grad_norm": 1.3657608032226562, "learning_rate": 4.59016393442623e-05, "loss": 0.0178, "step": 23600 }, { "epoch": 115.28981086028065, "grad_norm": 0.949591338634491, "learning_rate": 4.5770491803278695e-05, "loss": 0.0172, "step": 23620 }, { "epoch": 115.38743136058572, "grad_norm": 0.6427412629127502, "learning_rate": 4.5639344262295084e-05, "loss": 0.0172, "step": 23640 }, { "epoch": 115.48505186089079, "grad_norm": 1.0137767791748047, "learning_rate": 4.550819672131148e-05, "loss": 0.0161, "step": 23660 }, { "epoch": 115.58267236119585, "grad_norm": 0.5349763035774231, "learning_rate": 4.537704918032787e-05, "loss": 0.02, "step": 23680 }, { "epoch": 115.68029286150092, "grad_norm": 2.2959110736846924, "learning_rate": 4.524590163934426e-05, "loss": 0.023, "step": 23700 }, { "epoch": 115.77791336180599, "grad_norm": 0.7627232074737549, "learning_rate": 4.5114754098360655e-05, "loss": 0.0188, "step": 23720 }, { "epoch": 115.87553386211104, "grad_norm": 0.5804703831672668, "learning_rate": 4.498360655737705e-05, "loss": 0.0198, "step": 23740 }, { "epoch": 115.97315436241611, "grad_norm": 0.5652183294296265, "learning_rate": 4.485245901639345e-05, "loss": 0.0193, "step": 23760 }, { "epoch": 116.07077486272117, "grad_norm": 1.606610655784607, "learning_rate": 4.472131147540984e-05, "loss": 0.0183, "step": 23780 }, { "epoch": 116.16839536302624, "grad_norm": 0.7952703237533569, "learning_rate": 4.459016393442623e-05, "loss": 0.0149, "step": 23800 }, { "epoch": 116.2660158633313, "grad_norm": 0.6120955944061279, "learning_rate": 4.445901639344263e-05, "loss": 0.0174, "step": 23820 }, { "epoch": 116.36363636363636, "grad_norm": 1.0184078216552734, "learning_rate": 4.432786885245902e-05, "loss": 0.0157, "step": 23840 }, { "epoch": 116.46125686394143, "grad_norm": 1.3023227453231812, "learning_rate": 4.4196721311475415e-05, "loss": 0.0173, "step": 23860 }, { "epoch": 116.5588773642465, "grad_norm": 2.9271557331085205, "learning_rate": 4.406557377049181e-05, "loss": 0.0192, "step": 23880 }, { "epoch": 116.65649786455155, "grad_norm": 0.6684398651123047, "learning_rate": 4.3934426229508194e-05, "loss": 0.0186, "step": 23900 }, { "epoch": 116.75411836485662, "grad_norm": 0.843756914138794, "learning_rate": 4.380327868852459e-05, "loss": 0.0179, "step": 23920 }, { "epoch": 116.85173886516168, "grad_norm": 0.8446621894836426, "learning_rate": 4.3672131147540986e-05, "loss": 0.0188, "step": 23940 }, { "epoch": 116.94935936546675, "grad_norm": 0.8657192587852478, "learning_rate": 4.3540983606557375e-05, "loss": 0.0226, "step": 23960 }, { "epoch": 117.04697986577182, "grad_norm": 0.9363495707511902, "learning_rate": 4.340983606557377e-05, "loss": 0.0176, "step": 23980 }, { "epoch": 117.14460036607687, "grad_norm": 0.8379567861557007, "learning_rate": 4.327868852459017e-05, "loss": 0.0157, "step": 24000 }, { "epoch": 117.24222086638194, "grad_norm": 0.9036583304405212, "learning_rate": 4.3147540983606564e-05, "loss": 0.0187, "step": 24020 }, { "epoch": 117.33984136668701, "grad_norm": 1.0149122476577759, "learning_rate": 4.301639344262295e-05, "loss": 0.0167, "step": 24040 }, { "epoch": 117.43746186699207, "grad_norm": 0.9107224345207214, "learning_rate": 4.288524590163935e-05, "loss": 0.0169, "step": 24060 }, { "epoch": 117.53508236729714, "grad_norm": 0.8786804676055908, "learning_rate": 4.2754098360655745e-05, "loss": 0.0179, "step": 24080 }, { "epoch": 117.63270286760219, "grad_norm": 0.6965748071670532, "learning_rate": 4.262295081967213e-05, "loss": 0.0192, "step": 24100 }, { "epoch": 117.73032336790726, "grad_norm": 1.9840143918991089, "learning_rate": 4.2491803278688524e-05, "loss": 0.0199, "step": 24120 }, { "epoch": 117.82794386821233, "grad_norm": 0.770664632320404, "learning_rate": 4.236065573770492e-05, "loss": 0.0224, "step": 24140 }, { "epoch": 117.92556436851739, "grad_norm": 0.6265982985496521, "learning_rate": 4.222950819672131e-05, "loss": 0.0201, "step": 24160 }, { "epoch": 118.02318486882245, "grad_norm": 0.6708711385726929, "learning_rate": 4.2098360655737706e-05, "loss": 0.0181, "step": 24180 }, { "epoch": 118.12080536912751, "grad_norm": 0.6563631892204285, "learning_rate": 4.19672131147541e-05, "loss": 0.0143, "step": 24200 }, { "epoch": 118.21842586943258, "grad_norm": 0.6583575010299683, "learning_rate": 4.183606557377049e-05, "loss": 0.0166, "step": 24220 }, { "epoch": 118.31604636973765, "grad_norm": 0.9662224650382996, "learning_rate": 4.170491803278689e-05, "loss": 0.0155, "step": 24240 }, { "epoch": 118.4136668700427, "grad_norm": 0.5282565355300903, "learning_rate": 4.1573770491803283e-05, "loss": 0.017, "step": 24260 }, { "epoch": 118.51128737034777, "grad_norm": 0.7242906093597412, "learning_rate": 4.144262295081967e-05, "loss": 0.017, "step": 24280 }, { "epoch": 118.60890787065284, "grad_norm": 1.0325088500976562, "learning_rate": 4.131147540983607e-05, "loss": 0.0184, "step": 24300 }, { "epoch": 118.7065283709579, "grad_norm": 0.7789930105209351, "learning_rate": 4.118032786885246e-05, "loss": 0.0191, "step": 24320 }, { "epoch": 118.80414887126297, "grad_norm": 2.7775182723999023, "learning_rate": 4.1049180327868854e-05, "loss": 0.0187, "step": 24340 }, { "epoch": 118.90176937156802, "grad_norm": 0.8794859647750854, "learning_rate": 4.0918032786885244e-05, "loss": 0.0194, "step": 24360 }, { "epoch": 118.99938987187309, "grad_norm": 1.1197643280029297, "learning_rate": 4.078688524590164e-05, "loss": 0.022, "step": 24380 }, { "epoch": 119.09701037217816, "grad_norm": 0.5833938121795654, "learning_rate": 4.0655737704918036e-05, "loss": 0.015, "step": 24400 }, { "epoch": 119.19463087248322, "grad_norm": 0.7687615156173706, "learning_rate": 4.0524590163934425e-05, "loss": 0.0155, "step": 24420 }, { "epoch": 119.29225137278829, "grad_norm": 0.7922354936599731, "learning_rate": 4.039344262295082e-05, "loss": 0.016, "step": 24440 }, { "epoch": 119.38987187309336, "grad_norm": 0.5952595472335815, "learning_rate": 4.026229508196722e-05, "loss": 0.0179, "step": 24460 }, { "epoch": 119.48749237339841, "grad_norm": 0.5200309157371521, "learning_rate": 4.013114754098361e-05, "loss": 0.0159, "step": 24480 }, { "epoch": 119.58511287370348, "grad_norm": 0.6342631578445435, "learning_rate": 4e-05, "loss": 0.0161, "step": 24500 }, { "epoch": 119.68273337400854, "grad_norm": 0.5420534610748291, "learning_rate": 3.98688524590164e-05, "loss": 0.0208, "step": 24520 }, { "epoch": 119.7803538743136, "grad_norm": 2.066472291946411, "learning_rate": 3.973770491803279e-05, "loss": 0.02, "step": 24540 }, { "epoch": 119.87797437461867, "grad_norm": 0.8765552043914795, "learning_rate": 3.960655737704918e-05, "loss": 0.0187, "step": 24560 }, { "epoch": 119.97559487492373, "grad_norm": 0.7948490977287292, "learning_rate": 3.9475409836065574e-05, "loss": 0.021, "step": 24580 }, { "epoch": 120.0732153752288, "grad_norm": 0.6576656699180603, "learning_rate": 3.934426229508197e-05, "loss": 0.018, "step": 24600 }, { "epoch": 120.17083587553387, "grad_norm": 1.538063883781433, "learning_rate": 3.921311475409836e-05, "loss": 0.0172, "step": 24620 }, { "epoch": 120.26845637583892, "grad_norm": 0.6647894382476807, "learning_rate": 3.9081967213114756e-05, "loss": 0.0142, "step": 24640 }, { "epoch": 120.36607687614399, "grad_norm": 0.9566786289215088, "learning_rate": 3.895081967213115e-05, "loss": 0.0177, "step": 24660 }, { "epoch": 120.46369737644905, "grad_norm": 0.9630607962608337, "learning_rate": 3.881967213114754e-05, "loss": 0.0167, "step": 24680 }, { "epoch": 120.56131787675412, "grad_norm": 0.8539407849311829, "learning_rate": 3.868852459016394e-05, "loss": 0.02, "step": 24700 }, { "epoch": 120.65893837705919, "grad_norm": 1.4023864269256592, "learning_rate": 3.8557377049180334e-05, "loss": 0.0161, "step": 24720 }, { "epoch": 120.75655887736424, "grad_norm": 1.2014553546905518, "learning_rate": 3.842622950819672e-05, "loss": 0.0176, "step": 24740 }, { "epoch": 120.85417937766931, "grad_norm": 0.828676164150238, "learning_rate": 3.829508196721311e-05, "loss": 0.0184, "step": 24760 }, { "epoch": 120.95179987797438, "grad_norm": 0.7894257307052612, "learning_rate": 3.816393442622951e-05, "loss": 0.0186, "step": 24780 }, { "epoch": 121.04942037827944, "grad_norm": 0.642427384853363, "learning_rate": 3.8032786885245905e-05, "loss": 0.0174, "step": 24800 }, { "epoch": 121.1470408785845, "grad_norm": 0.7018671631813049, "learning_rate": 3.7901639344262294e-05, "loss": 0.0147, "step": 24820 }, { "epoch": 121.24466137888956, "grad_norm": 0.7113062739372253, "learning_rate": 3.777049180327869e-05, "loss": 0.0163, "step": 24840 }, { "epoch": 121.34228187919463, "grad_norm": 0.395190566778183, "learning_rate": 3.7639344262295086e-05, "loss": 0.0162, "step": 24860 }, { "epoch": 121.4399023794997, "grad_norm": 0.6177274584770203, "learning_rate": 3.7508196721311476e-05, "loss": 0.0197, "step": 24880 }, { "epoch": 121.53752287980475, "grad_norm": 0.8992127776145935, "learning_rate": 3.737704918032787e-05, "loss": 0.0171, "step": 24900 }, { "epoch": 121.63514338010982, "grad_norm": 0.6104313135147095, "learning_rate": 3.724590163934427e-05, "loss": 0.0177, "step": 24920 }, { "epoch": 121.7327638804149, "grad_norm": 1.0897140502929688, "learning_rate": 3.711475409836066e-05, "loss": 0.0191, "step": 24940 }, { "epoch": 121.83038438071995, "grad_norm": 0.6623597145080566, "learning_rate": 3.698360655737705e-05, "loss": 0.0178, "step": 24960 }, { "epoch": 121.92800488102502, "grad_norm": 0.6479583382606506, "learning_rate": 3.685245901639344e-05, "loss": 0.0175, "step": 24980 }, { "epoch": 122.02562538133007, "grad_norm": 0.504220187664032, "learning_rate": 3.672131147540984e-05, "loss": 0.0179, "step": 25000 }, { "epoch": 122.12324588163514, "grad_norm": 0.5783191323280334, "learning_rate": 3.659016393442623e-05, "loss": 0.0159, "step": 25020 }, { "epoch": 122.22086638194021, "grad_norm": 0.6851247549057007, "learning_rate": 3.6459016393442625e-05, "loss": 0.0167, "step": 25040 }, { "epoch": 122.31848688224527, "grad_norm": 0.5190562009811401, "learning_rate": 3.632786885245902e-05, "loss": 0.0168, "step": 25060 }, { "epoch": 122.41610738255034, "grad_norm": 0.8032932281494141, "learning_rate": 3.619672131147541e-05, "loss": 0.0173, "step": 25080 }, { "epoch": 122.5137278828554, "grad_norm": 1.162681221961975, "learning_rate": 3.6065573770491806e-05, "loss": 0.0175, "step": 25100 }, { "epoch": 122.61134838316046, "grad_norm": 0.9898841977119446, "learning_rate": 3.59344262295082e-05, "loss": 0.0154, "step": 25120 }, { "epoch": 122.70896888346553, "grad_norm": 0.7703188061714172, "learning_rate": 3.580327868852459e-05, "loss": 0.0177, "step": 25140 }, { "epoch": 122.80658938377059, "grad_norm": 0.6557360291481018, "learning_rate": 3.567213114754099e-05, "loss": 0.0187, "step": 25160 }, { "epoch": 122.90420988407566, "grad_norm": 0.6278268694877625, "learning_rate": 3.554098360655738e-05, "loss": 0.0173, "step": 25180 }, { "epoch": 123.00183038438072, "grad_norm": 0.5595793128013611, "learning_rate": 3.5409836065573773e-05, "loss": 0.02, "step": 25200 }, { "epoch": 123.09945088468578, "grad_norm": 0.8069674968719482, "learning_rate": 3.527868852459016e-05, "loss": 0.0157, "step": 25220 }, { "epoch": 123.19707138499085, "grad_norm": 0.5641182661056519, "learning_rate": 3.514754098360656e-05, "loss": 0.0162, "step": 25240 }, { "epoch": 123.29469188529592, "grad_norm": 1.641262412071228, "learning_rate": 3.5016393442622955e-05, "loss": 0.0153, "step": 25260 }, { "epoch": 123.39231238560097, "grad_norm": 0.828906238079071, "learning_rate": 3.4885245901639344e-05, "loss": 0.0165, "step": 25280 }, { "epoch": 123.48993288590604, "grad_norm": 0.4439915418624878, "learning_rate": 3.475409836065574e-05, "loss": 0.0175, "step": 25300 }, { "epoch": 123.5875533862111, "grad_norm": 0.5250588059425354, "learning_rate": 3.462295081967214e-05, "loss": 0.016, "step": 25320 }, { "epoch": 123.68517388651617, "grad_norm": 1.8672527074813843, "learning_rate": 3.4491803278688526e-05, "loss": 0.0168, "step": 25340 }, { "epoch": 123.78279438682124, "grad_norm": 0.905852735042572, "learning_rate": 3.436065573770492e-05, "loss": 0.0166, "step": 25360 }, { "epoch": 123.88041488712629, "grad_norm": 1.0820852518081665, "learning_rate": 3.422950819672131e-05, "loss": 0.018, "step": 25380 }, { "epoch": 123.97803538743136, "grad_norm": 0.901567816734314, "learning_rate": 3.409836065573771e-05, "loss": 0.0175, "step": 25400 }, { "epoch": 124.07565588773643, "grad_norm": 0.5788626074790955, "learning_rate": 3.39672131147541e-05, "loss": 0.0153, "step": 25420 }, { "epoch": 124.17327638804149, "grad_norm": 2.4590864181518555, "learning_rate": 3.383606557377049e-05, "loss": 0.0156, "step": 25440 }, { "epoch": 124.27089688834656, "grad_norm": 0.5568335056304932, "learning_rate": 3.370491803278689e-05, "loss": 0.0145, "step": 25460 }, { "epoch": 124.36851738865161, "grad_norm": 0.9648571014404297, "learning_rate": 3.357377049180328e-05, "loss": 0.0139, "step": 25480 }, { "epoch": 124.46613788895668, "grad_norm": 1.7256869077682495, "learning_rate": 3.3442622950819675e-05, "loss": 0.0161, "step": 25500 }, { "epoch": 124.56375838926175, "grad_norm": 0.7551959753036499, "learning_rate": 3.331147540983607e-05, "loss": 0.0165, "step": 25520 }, { "epoch": 124.6613788895668, "grad_norm": 0.6856973767280579, "learning_rate": 3.318032786885246e-05, "loss": 0.0167, "step": 25540 }, { "epoch": 124.75899938987187, "grad_norm": 0.6650362610816956, "learning_rate": 3.3049180327868857e-05, "loss": 0.0164, "step": 25560 }, { "epoch": 124.85661989017694, "grad_norm": 1.0952746868133545, "learning_rate": 3.291803278688525e-05, "loss": 0.0181, "step": 25580 }, { "epoch": 124.954240390482, "grad_norm": 0.8695099353790283, "learning_rate": 3.2786885245901635e-05, "loss": 0.0167, "step": 25600 }, { "epoch": 125.05186089078707, "grad_norm": 0.5697212219238281, "learning_rate": 3.265573770491803e-05, "loss": 0.0166, "step": 25620 }, { "epoch": 125.14948139109212, "grad_norm": 0.6281394958496094, "learning_rate": 3.252459016393443e-05, "loss": 0.0138, "step": 25640 }, { "epoch": 125.2471018913972, "grad_norm": 0.7632110118865967, "learning_rate": 3.2393442622950824e-05, "loss": 0.0163, "step": 25660 }, { "epoch": 125.34472239170226, "grad_norm": 0.587164580821991, "learning_rate": 3.226229508196721e-05, "loss": 0.0169, "step": 25680 }, { "epoch": 125.44234289200732, "grad_norm": 0.8123992681503296, "learning_rate": 3.213114754098361e-05, "loss": 0.0156, "step": 25700 }, { "epoch": 125.53996339231239, "grad_norm": 0.7210849523544312, "learning_rate": 3.2000000000000005e-05, "loss": 0.0161, "step": 25720 }, { "epoch": 125.63758389261746, "grad_norm": 0.6011385917663574, "learning_rate": 3.1868852459016395e-05, "loss": 0.0178, "step": 25740 }, { "epoch": 125.73520439292251, "grad_norm": 0.8048945665359497, "learning_rate": 3.173770491803279e-05, "loss": 0.0172, "step": 25760 }, { "epoch": 125.83282489322758, "grad_norm": 0.5456706285476685, "learning_rate": 3.160655737704919e-05, "loss": 0.0188, "step": 25780 }, { "epoch": 125.93044539353264, "grad_norm": 1.419385313987732, "learning_rate": 3.1475409836065576e-05, "loss": 0.0187, "step": 25800 }, { "epoch": 126.0280658938377, "grad_norm": 0.8208538293838501, "learning_rate": 3.1344262295081966e-05, "loss": 0.0149, "step": 25820 }, { "epoch": 126.12568639414278, "grad_norm": 0.45135247707366943, "learning_rate": 3.121311475409836e-05, "loss": 0.0139, "step": 25840 }, { "epoch": 126.22330689444783, "grad_norm": 0.565280556678772, "learning_rate": 3.108196721311475e-05, "loss": 0.0123, "step": 25860 }, { "epoch": 126.3209273947529, "grad_norm": 0.742659866809845, "learning_rate": 3.095081967213115e-05, "loss": 0.0151, "step": 25880 }, { "epoch": 126.41854789505797, "grad_norm": 1.5381386280059814, "learning_rate": 3.0819672131147544e-05, "loss": 0.0157, "step": 25900 }, { "epoch": 126.51616839536302, "grad_norm": 0.626524031162262, "learning_rate": 3.068852459016393e-05, "loss": 0.0151, "step": 25920 }, { "epoch": 126.6137888956681, "grad_norm": 0.6463727355003357, "learning_rate": 3.055737704918033e-05, "loss": 0.0174, "step": 25940 }, { "epoch": 126.71140939597315, "grad_norm": 0.48679399490356445, "learning_rate": 3.0426229508196725e-05, "loss": 0.0154, "step": 25960 }, { "epoch": 126.80902989627822, "grad_norm": 0.9534430503845215, "learning_rate": 3.0295081967213118e-05, "loss": 0.0185, "step": 25980 }, { "epoch": 126.90665039658329, "grad_norm": 0.571997344493866, "learning_rate": 3.016393442622951e-05, "loss": 0.0174, "step": 26000 }, { "epoch": 127.00427089688834, "grad_norm": 0.8983253836631775, "learning_rate": 3.00327868852459e-05, "loss": 0.0183, "step": 26020 }, { "epoch": 127.10189139719341, "grad_norm": 0.37496012449264526, "learning_rate": 2.9901639344262293e-05, "loss": 0.0135, "step": 26040 }, { "epoch": 127.19951189749847, "grad_norm": 0.7320930361747742, "learning_rate": 2.977049180327869e-05, "loss": 0.0138, "step": 26060 }, { "epoch": 127.29713239780354, "grad_norm": 1.5510950088500977, "learning_rate": 2.963934426229508e-05, "loss": 0.0181, "step": 26080 }, { "epoch": 127.3947528981086, "grad_norm": 0.25900664925575256, "learning_rate": 2.9508196721311478e-05, "loss": 0.0145, "step": 26100 }, { "epoch": 127.49237339841366, "grad_norm": 0.7860931754112244, "learning_rate": 2.937704918032787e-05, "loss": 0.0149, "step": 26120 }, { "epoch": 127.58999389871873, "grad_norm": 1.5779728889465332, "learning_rate": 2.9245901639344263e-05, "loss": 0.0168, "step": 26140 }, { "epoch": 127.6876143990238, "grad_norm": 0.8237743377685547, "learning_rate": 2.911475409836066e-05, "loss": 0.0173, "step": 26160 }, { "epoch": 127.78523489932886, "grad_norm": 0.5260995626449585, "learning_rate": 2.8983606557377052e-05, "loss": 0.0148, "step": 26180 }, { "epoch": 127.88285539963393, "grad_norm": 1.158836007118225, "learning_rate": 2.8852459016393445e-05, "loss": 0.0179, "step": 26200 }, { "epoch": 127.98047589993898, "grad_norm": 1.7822519540786743, "learning_rate": 2.872131147540984e-05, "loss": 0.0153, "step": 26220 }, { "epoch": 128.07809640024405, "grad_norm": 0.6409000158309937, "learning_rate": 2.8590163934426227e-05, "loss": 0.0178, "step": 26240 }, { "epoch": 128.1757169005491, "grad_norm": 0.7198218107223511, "learning_rate": 2.8459016393442623e-05, "loss": 0.0143, "step": 26260 }, { "epoch": 128.2733374008542, "grad_norm": 0.9570964574813843, "learning_rate": 2.8327868852459016e-05, "loss": 0.0132, "step": 26280 }, { "epoch": 128.37095790115924, "grad_norm": 0.40788573026657104, "learning_rate": 2.819672131147541e-05, "loss": 0.0151, "step": 26300 }, { "epoch": 128.4685784014643, "grad_norm": 1.0642712116241455, "learning_rate": 2.8065573770491805e-05, "loss": 0.0154, "step": 26320 }, { "epoch": 128.56619890176938, "grad_norm": 0.5972766280174255, "learning_rate": 2.7934426229508198e-05, "loss": 0.015, "step": 26340 }, { "epoch": 128.66381940207444, "grad_norm": 0.5974799990653992, "learning_rate": 2.7803278688524594e-05, "loss": 0.0144, "step": 26360 }, { "epoch": 128.7614399023795, "grad_norm": 0.8131697773933411, "learning_rate": 2.7672131147540987e-05, "loss": 0.0166, "step": 26380 }, { "epoch": 128.85906040268458, "grad_norm": 0.8219912648200989, "learning_rate": 2.754098360655738e-05, "loss": 0.0186, "step": 26400 }, { "epoch": 128.95668090298963, "grad_norm": 0.7310410737991333, "learning_rate": 2.7409836065573775e-05, "loss": 0.0158, "step": 26420 }, { "epoch": 129.0543014032947, "grad_norm": 0.7448714375495911, "learning_rate": 2.7278688524590168e-05, "loss": 0.0161, "step": 26440 }, { "epoch": 129.15192190359974, "grad_norm": 1.0379338264465332, "learning_rate": 2.7147540983606558e-05, "loss": 0.0129, "step": 26460 }, { "epoch": 129.24954240390483, "grad_norm": 0.4505363404750824, "learning_rate": 2.701639344262295e-05, "loss": 0.0139, "step": 26480 }, { "epoch": 129.34716290420988, "grad_norm": 0.49264198541641235, "learning_rate": 2.6885245901639343e-05, "loss": 0.0141, "step": 26500 }, { "epoch": 129.44478340451494, "grad_norm": 0.38399410247802734, "learning_rate": 2.675409836065574e-05, "loss": 0.0148, "step": 26520 }, { "epoch": 129.54240390482002, "grad_norm": 0.8914321660995483, "learning_rate": 2.6622950819672132e-05, "loss": 0.0159, "step": 26540 }, { "epoch": 129.64002440512508, "grad_norm": 0.8293542265892029, "learning_rate": 2.6491803278688525e-05, "loss": 0.0157, "step": 26560 }, { "epoch": 129.73764490543013, "grad_norm": 0.5534564256668091, "learning_rate": 2.636065573770492e-05, "loss": 0.0158, "step": 26580 }, { "epoch": 129.8352654057352, "grad_norm": 0.7157993912696838, "learning_rate": 2.6229508196721314e-05, "loss": 0.016, "step": 26600 }, { "epoch": 129.93288590604027, "grad_norm": 0.7746397256851196, "learning_rate": 2.6098360655737706e-05, "loss": 0.0192, "step": 26620 }, { "epoch": 130.03050640634532, "grad_norm": 0.7727970480918884, "learning_rate": 2.5967213114754103e-05, "loss": 0.0151, "step": 26640 }, { "epoch": 130.1281269066504, "grad_norm": 0.514680802822113, "learning_rate": 2.5836065573770492e-05, "loss": 0.0159, "step": 26660 }, { "epoch": 130.22574740695546, "grad_norm": 0.87467360496521, "learning_rate": 2.5704918032786885e-05, "loss": 0.0137, "step": 26680 }, { "epoch": 130.32336790726052, "grad_norm": 0.7342318296432495, "learning_rate": 2.5573770491803277e-05, "loss": 0.0164, "step": 26700 }, { "epoch": 130.4209884075656, "grad_norm": 0.46169203519821167, "learning_rate": 2.5442622950819674e-05, "loss": 0.0148, "step": 26720 }, { "epoch": 130.51860890787066, "grad_norm": 0.5552070140838623, "learning_rate": 2.5311475409836066e-05, "loss": 0.0146, "step": 26740 }, { "epoch": 130.6162294081757, "grad_norm": 2.3732874393463135, "learning_rate": 2.518032786885246e-05, "loss": 0.0151, "step": 26760 }, { "epoch": 130.71384990848077, "grad_norm": 0.7399420142173767, "learning_rate": 2.5049180327868855e-05, "loss": 0.0136, "step": 26780 }, { "epoch": 130.81147040878585, "grad_norm": 0.7631209492683411, "learning_rate": 2.4918032786885248e-05, "loss": 0.0168, "step": 26800 }, { "epoch": 130.9090909090909, "grad_norm": 0.4778473675251007, "learning_rate": 2.478688524590164e-05, "loss": 0.0144, "step": 26820 }, { "epoch": 131.00671140939596, "grad_norm": 0.48981741070747375, "learning_rate": 2.4655737704918033e-05, "loss": 0.0174, "step": 26840 }, { "epoch": 131.10433190970105, "grad_norm": 0.550786018371582, "learning_rate": 2.4524590163934426e-05, "loss": 0.0144, "step": 26860 }, { "epoch": 131.2019524100061, "grad_norm": 1.1115200519561768, "learning_rate": 2.4393442622950822e-05, "loss": 0.0137, "step": 26880 }, { "epoch": 131.29957291031116, "grad_norm": 0.7832316160202026, "learning_rate": 2.4262295081967215e-05, "loss": 0.0138, "step": 26900 }, { "epoch": 131.39719341061624, "grad_norm": 0.7918095588684082, "learning_rate": 2.4131147540983608e-05, "loss": 0.0153, "step": 26920 }, { "epoch": 131.4948139109213, "grad_norm": 0.5915355682373047, "learning_rate": 2.4e-05, "loss": 0.0159, "step": 26940 }, { "epoch": 131.59243441122635, "grad_norm": 0.6909199357032776, "learning_rate": 2.3868852459016393e-05, "loss": 0.0159, "step": 26960 }, { "epoch": 131.69005491153143, "grad_norm": 1.0566037893295288, "learning_rate": 2.373770491803279e-05, "loss": 0.0147, "step": 26980 }, { "epoch": 131.7876754118365, "grad_norm": 1.6122446060180664, "learning_rate": 2.3606557377049182e-05, "loss": 0.0141, "step": 27000 }, { "epoch": 131.88529591214154, "grad_norm": 0.8080132007598877, "learning_rate": 2.3475409836065575e-05, "loss": 0.0155, "step": 27020 }, { "epoch": 131.98291641244663, "grad_norm": 0.45939984917640686, "learning_rate": 2.3344262295081968e-05, "loss": 0.0166, "step": 27040 }, { "epoch": 132.08053691275168, "grad_norm": 0.8284308314323425, "learning_rate": 2.321311475409836e-05, "loss": 0.015, "step": 27060 }, { "epoch": 132.17815741305674, "grad_norm": 0.6223374605178833, "learning_rate": 2.3081967213114757e-05, "loss": 0.0155, "step": 27080 }, { "epoch": 132.2757779133618, "grad_norm": 1.6535650491714478, "learning_rate": 2.295081967213115e-05, "loss": 0.015, "step": 27100 }, { "epoch": 132.37339841366688, "grad_norm": 0.6285653710365295, "learning_rate": 2.2819672131147542e-05, "loss": 0.0158, "step": 27120 }, { "epoch": 132.47101891397193, "grad_norm": 0.6470975279808044, "learning_rate": 2.2688524590163935e-05, "loss": 0.0131, "step": 27140 }, { "epoch": 132.568639414277, "grad_norm": 0.6603531241416931, "learning_rate": 2.2557377049180328e-05, "loss": 0.0155, "step": 27160 }, { "epoch": 132.66625991458207, "grad_norm": 0.9789283275604248, "learning_rate": 2.2426229508196724e-05, "loss": 0.014, "step": 27180 }, { "epoch": 132.76388041488713, "grad_norm": 0.7158600687980652, "learning_rate": 2.2295081967213117e-05, "loss": 0.0149, "step": 27200 }, { "epoch": 132.86150091519218, "grad_norm": 0.4593288004398346, "learning_rate": 2.216393442622951e-05, "loss": 0.0149, "step": 27220 }, { "epoch": 132.95912141549726, "grad_norm": 0.7383930087089539, "learning_rate": 2.2032786885245905e-05, "loss": 0.0158, "step": 27240 }, { "epoch": 133.05674191580232, "grad_norm": 0.8438706398010254, "learning_rate": 2.1901639344262295e-05, "loss": 0.0152, "step": 27260 }, { "epoch": 133.15436241610738, "grad_norm": 0.3977959156036377, "learning_rate": 2.1770491803278688e-05, "loss": 0.0135, "step": 27280 }, { "epoch": 133.25198291641246, "grad_norm": 0.5032092332839966, "learning_rate": 2.1639344262295084e-05, "loss": 0.0144, "step": 27300 }, { "epoch": 133.3496034167175, "grad_norm": 0.8900758028030396, "learning_rate": 2.1508196721311476e-05, "loss": 0.0142, "step": 27320 }, { "epoch": 133.44722391702257, "grad_norm": 0.6694475412368774, "learning_rate": 2.1377049180327873e-05, "loss": 0.0148, "step": 27340 }, { "epoch": 133.54484441732765, "grad_norm": 0.6150327920913696, "learning_rate": 2.1245901639344262e-05, "loss": 0.0137, "step": 27360 }, { "epoch": 133.6424649176327, "grad_norm": 0.3980708718299866, "learning_rate": 2.1114754098360655e-05, "loss": 0.0131, "step": 27380 }, { "epoch": 133.74008541793776, "grad_norm": 0.556053876876831, "learning_rate": 2.098360655737705e-05, "loss": 0.0173, "step": 27400 }, { "epoch": 133.83770591824282, "grad_norm": 0.7154746055603027, "learning_rate": 2.0852459016393444e-05, "loss": 0.0142, "step": 27420 }, { "epoch": 133.9353264185479, "grad_norm": 0.585117757320404, "learning_rate": 2.0721311475409836e-05, "loss": 0.0148, "step": 27440 }, { "epoch": 134.03294691885296, "grad_norm": 0.4688512682914734, "learning_rate": 2.059016393442623e-05, "loss": 0.0139, "step": 27460 }, { "epoch": 134.130567419158, "grad_norm": 0.3597017824649811, "learning_rate": 2.0459016393442622e-05, "loss": 0.0125, "step": 27480 }, { "epoch": 134.2281879194631, "grad_norm": 0.6201938986778259, "learning_rate": 2.0327868852459018e-05, "loss": 0.014, "step": 27500 }, { "epoch": 134.32580841976815, "grad_norm": 0.6969265341758728, "learning_rate": 2.019672131147541e-05, "loss": 0.0133, "step": 27520 }, { "epoch": 134.4234289200732, "grad_norm": 0.6457026600837708, "learning_rate": 2.0065573770491804e-05, "loss": 0.0152, "step": 27540 }, { "epoch": 134.5210494203783, "grad_norm": 0.7583892941474915, "learning_rate": 1.99344262295082e-05, "loss": 0.0148, "step": 27560 }, { "epoch": 134.61866992068335, "grad_norm": 0.41781967878341675, "learning_rate": 1.980327868852459e-05, "loss": 0.0145, "step": 27580 }, { "epoch": 134.7162904209884, "grad_norm": 1.2802424430847168, "learning_rate": 1.9672131147540985e-05, "loss": 0.0158, "step": 27600 }, { "epoch": 134.81391092129348, "grad_norm": 0.3811515271663666, "learning_rate": 1.9540983606557378e-05, "loss": 0.0136, "step": 27620 }, { "epoch": 134.91153142159854, "grad_norm": 0.41068577766418457, "learning_rate": 1.940983606557377e-05, "loss": 0.0166, "step": 27640 }, { "epoch": 135.0091519219036, "grad_norm": 0.690075695514679, "learning_rate": 1.9278688524590167e-05, "loss": 0.0152, "step": 27660 }, { "epoch": 135.10677242220865, "grad_norm": 0.6945540308952332, "learning_rate": 1.9147540983606556e-05, "loss": 0.0125, "step": 27680 }, { "epoch": 135.20439292251373, "grad_norm": 0.9262276291847229, "learning_rate": 1.9016393442622952e-05, "loss": 0.014, "step": 27700 }, { "epoch": 135.3020134228188, "grad_norm": 0.5992072224617004, "learning_rate": 1.8885245901639345e-05, "loss": 0.0132, "step": 27720 }, { "epoch": 135.39963392312384, "grad_norm": 0.6684610247612, "learning_rate": 1.8754098360655738e-05, "loss": 0.0147, "step": 27740 }, { "epoch": 135.49725442342893, "grad_norm": 0.647719144821167, "learning_rate": 1.8622950819672134e-05, "loss": 0.0168, "step": 27760 }, { "epoch": 135.59487492373398, "grad_norm": 1.5291879177093506, "learning_rate": 1.8491803278688523e-05, "loss": 0.0159, "step": 27780 }, { "epoch": 135.69249542403904, "grad_norm": 0.7436932325363159, "learning_rate": 1.836065573770492e-05, "loss": 0.0136, "step": 27800 }, { "epoch": 135.79011592434412, "grad_norm": 0.38243773579597473, "learning_rate": 1.8229508196721312e-05, "loss": 0.0145, "step": 27820 }, { "epoch": 135.88773642464918, "grad_norm": 0.6765353679656982, "learning_rate": 1.8098360655737705e-05, "loss": 0.0139, "step": 27840 }, { "epoch": 135.98535692495423, "grad_norm": 0.3190823495388031, "learning_rate": 1.79672131147541e-05, "loss": 0.0152, "step": 27860 }, { "epoch": 136.08297742525932, "grad_norm": 2.0219767093658447, "learning_rate": 1.7836065573770494e-05, "loss": 0.0143, "step": 27880 }, { "epoch": 136.18059792556437, "grad_norm": 0.776849627494812, "learning_rate": 1.7704918032786887e-05, "loss": 0.0135, "step": 27900 }, { "epoch": 136.27821842586943, "grad_norm": 0.5274736285209656, "learning_rate": 1.757377049180328e-05, "loss": 0.0123, "step": 27920 }, { "epoch": 136.3758389261745, "grad_norm": 0.886225700378418, "learning_rate": 1.7442622950819672e-05, "loss": 0.0146, "step": 27940 }, { "epoch": 136.47345942647956, "grad_norm": 0.5282070636749268, "learning_rate": 1.731147540983607e-05, "loss": 0.0137, "step": 27960 }, { "epoch": 136.57107992678462, "grad_norm": 0.6784070730209351, "learning_rate": 1.718032786885246e-05, "loss": 0.0143, "step": 27980 }, { "epoch": 136.66870042708968, "grad_norm": 1.7534900903701782, "learning_rate": 1.7049180327868854e-05, "loss": 0.0137, "step": 28000 }, { "epoch": 136.76632092739476, "grad_norm": 0.40347975492477417, "learning_rate": 1.6918032786885247e-05, "loss": 0.0157, "step": 28020 }, { "epoch": 136.8639414276998, "grad_norm": 1.0218480825424194, "learning_rate": 1.678688524590164e-05, "loss": 0.0145, "step": 28040 }, { "epoch": 136.96156192800487, "grad_norm": 0.2875036597251892, "learning_rate": 1.6655737704918036e-05, "loss": 0.014, "step": 28060 }, { "epoch": 137.05918242830995, "grad_norm": 1.5968719720840454, "learning_rate": 1.6524590163934428e-05, "loss": 0.0132, "step": 28080 }, { "epoch": 137.156802928615, "grad_norm": 0.39140036702156067, "learning_rate": 1.6393442622950818e-05, "loss": 0.0138, "step": 28100 }, { "epoch": 137.25442342892006, "grad_norm": 0.36571571230888367, "learning_rate": 1.6262295081967214e-05, "loss": 0.0134, "step": 28120 }, { "epoch": 137.35204392922515, "grad_norm": 0.6531932950019836, "learning_rate": 1.6131147540983607e-05, "loss": 0.0146, "step": 28140 }, { "epoch": 137.4496644295302, "grad_norm": 0.46148520708084106, "learning_rate": 1.6000000000000003e-05, "loss": 0.0136, "step": 28160 }, { "epoch": 137.54728492983526, "grad_norm": 0.5359562635421753, "learning_rate": 1.5868852459016395e-05, "loss": 0.0128, "step": 28180 }, { "epoch": 137.64490543014034, "grad_norm": 0.5632950663566589, "learning_rate": 1.5737704918032788e-05, "loss": 0.0148, "step": 28200 }, { "epoch": 137.7425259304454, "grad_norm": 0.7229663729667664, "learning_rate": 1.560655737704918e-05, "loss": 0.0147, "step": 28220 }, { "epoch": 137.84014643075045, "grad_norm": 0.5531187653541565, "learning_rate": 1.5475409836065574e-05, "loss": 0.014, "step": 28240 }, { "epoch": 137.93776693105553, "grad_norm": 0.6305696964263916, "learning_rate": 1.5344262295081966e-05, "loss": 0.0157, "step": 28260 }, { "epoch": 138.0353874313606, "grad_norm": 0.8933548331260681, "learning_rate": 1.5213114754098363e-05, "loss": 0.0156, "step": 28280 }, { "epoch": 138.13300793166565, "grad_norm": 0.39126649498939514, "learning_rate": 1.5081967213114755e-05, "loss": 0.011, "step": 28300 }, { "epoch": 138.2306284319707, "grad_norm": 0.6234102249145508, "learning_rate": 1.4950819672131146e-05, "loss": 0.0133, "step": 28320 }, { "epoch": 138.32824893227578, "grad_norm": 0.5867244005203247, "learning_rate": 1.481967213114754e-05, "loss": 0.0138, "step": 28340 }, { "epoch": 138.42586943258084, "grad_norm": 0.6564351916313171, "learning_rate": 1.4688524590163935e-05, "loss": 0.014, "step": 28360 }, { "epoch": 138.5234899328859, "grad_norm": 1.0982993841171265, "learning_rate": 1.455737704918033e-05, "loss": 0.0133, "step": 28380 }, { "epoch": 138.62111043319098, "grad_norm": 0.8140943646430969, "learning_rate": 1.4426229508196722e-05, "loss": 0.0149, "step": 28400 }, { "epoch": 138.71873093349603, "grad_norm": 0.7273306846618652, "learning_rate": 1.4295081967213114e-05, "loss": 0.0149, "step": 28420 }, { "epoch": 138.8163514338011, "grad_norm": 0.46543437242507935, "learning_rate": 1.4163934426229508e-05, "loss": 0.0143, "step": 28440 }, { "epoch": 138.91397193410617, "grad_norm": 1.823746681213379, "learning_rate": 1.4032786885245902e-05, "loss": 0.0141, "step": 28460 }, { "epoch": 139.01159243441123, "grad_norm": 0.602825939655304, "learning_rate": 1.3901639344262297e-05, "loss": 0.0129, "step": 28480 }, { "epoch": 139.10921293471628, "grad_norm": 0.30030643939971924, "learning_rate": 1.377049180327869e-05, "loss": 0.0108, "step": 28500 }, { "epoch": 139.20683343502137, "grad_norm": 0.7023382186889648, "learning_rate": 1.3639344262295084e-05, "loss": 0.013, "step": 28520 }, { "epoch": 139.30445393532642, "grad_norm": 0.8771500587463379, "learning_rate": 1.3508196721311475e-05, "loss": 0.0144, "step": 28540 }, { "epoch": 139.40207443563148, "grad_norm": 0.6988272666931152, "learning_rate": 1.337704918032787e-05, "loss": 0.0125, "step": 28560 }, { "epoch": 139.49969493593656, "grad_norm": 0.8657557368278503, "learning_rate": 1.3245901639344262e-05, "loss": 0.0138, "step": 28580 }, { "epoch": 139.59731543624162, "grad_norm": 0.6832662224769592, "learning_rate": 1.3114754098360657e-05, "loss": 0.0127, "step": 28600 }, { "epoch": 139.69493593654667, "grad_norm": 0.9065951108932495, "learning_rate": 1.2983606557377051e-05, "loss": 0.015, "step": 28620 }, { "epoch": 139.79255643685173, "grad_norm": 0.9211568236351013, "learning_rate": 1.2852459016393442e-05, "loss": 0.0131, "step": 28640 }, { "epoch": 139.8901769371568, "grad_norm": 0.6160862445831299, "learning_rate": 1.2721311475409837e-05, "loss": 0.0163, "step": 28660 }, { "epoch": 139.98779743746186, "grad_norm": 0.8593130111694336, "learning_rate": 1.259016393442623e-05, "loss": 0.0135, "step": 28680 }, { "epoch": 140.08541793776692, "grad_norm": 0.7746515274047852, "learning_rate": 1.2459016393442624e-05, "loss": 0.0141, "step": 28700 }, { "epoch": 140.183038438072, "grad_norm": 0.7830790877342224, "learning_rate": 1.2327868852459017e-05, "loss": 0.0126, "step": 28720 }, { "epoch": 140.28065893837706, "grad_norm": 0.49005040526390076, "learning_rate": 1.2196721311475411e-05, "loss": 0.0127, "step": 28740 }, { "epoch": 140.3782794386821, "grad_norm": 0.9640679359436035, "learning_rate": 1.2065573770491804e-05, "loss": 0.0125, "step": 28760 }, { "epoch": 140.4758999389872, "grad_norm": 0.8114829659461975, "learning_rate": 1.1934426229508197e-05, "loss": 0.0138, "step": 28780 }, { "epoch": 140.57352043929225, "grad_norm": 0.8460706472396851, "learning_rate": 1.1803278688524591e-05, "loss": 0.0148, "step": 28800 }, { "epoch": 140.6711409395973, "grad_norm": 0.4882986843585968, "learning_rate": 1.1672131147540984e-05, "loss": 0.0141, "step": 28820 }, { "epoch": 140.7687614399024, "grad_norm": 1.0322729349136353, "learning_rate": 1.1540983606557378e-05, "loss": 0.0148, "step": 28840 }, { "epoch": 140.86638194020745, "grad_norm": 1.2970582246780396, "learning_rate": 1.1409836065573771e-05, "loss": 0.0144, "step": 28860 }, { "epoch": 140.9640024405125, "grad_norm": 0.9063767790794373, "learning_rate": 1.1278688524590164e-05, "loss": 0.0123, "step": 28880 }, { "epoch": 141.06162294081759, "grad_norm": 0.60384202003479, "learning_rate": 1.1147540983606558e-05, "loss": 0.0122, "step": 28900 }, { "epoch": 141.15924344112264, "grad_norm": 0.5142499804496765, "learning_rate": 1.1016393442622953e-05, "loss": 0.0125, "step": 28920 }, { "epoch": 141.2568639414277, "grad_norm": 0.6854032874107361, "learning_rate": 1.0885245901639344e-05, "loss": 0.0132, "step": 28940 }, { "epoch": 141.35448444173275, "grad_norm": 1.138895034790039, "learning_rate": 1.0754098360655738e-05, "loss": 0.0138, "step": 28960 }, { "epoch": 141.45210494203783, "grad_norm": 0.5815340280532837, "learning_rate": 1.0622950819672131e-05, "loss": 0.0115, "step": 28980 }, { "epoch": 141.5497254423429, "grad_norm": 0.6024242639541626, "learning_rate": 1.0491803278688525e-05, "loss": 0.0127, "step": 29000 }, { "epoch": 141.64734594264795, "grad_norm": 0.44016191363334656, "learning_rate": 1.0360655737704918e-05, "loss": 0.0146, "step": 29020 }, { "epoch": 141.74496644295303, "grad_norm": 2.051720142364502, "learning_rate": 1.0229508196721311e-05, "loss": 0.0151, "step": 29040 }, { "epoch": 141.84258694325808, "grad_norm": 0.6961409449577332, "learning_rate": 1.0098360655737705e-05, "loss": 0.013, "step": 29060 }, { "epoch": 141.94020744356314, "grad_norm": 1.1912919282913208, "learning_rate": 9.9672131147541e-06, "loss": 0.0131, "step": 29080 }, { "epoch": 142.03782794386822, "grad_norm": 0.6203546524047852, "learning_rate": 9.836065573770493e-06, "loss": 0.013, "step": 29100 }, { "epoch": 142.13544844417328, "grad_norm": 0.5386860966682434, "learning_rate": 9.704918032786885e-06, "loss": 0.0123, "step": 29120 }, { "epoch": 142.23306894447833, "grad_norm": 0.5639663934707642, "learning_rate": 9.573770491803278e-06, "loss": 0.0123, "step": 29140 }, { "epoch": 142.33068944478342, "grad_norm": 0.577315628528595, "learning_rate": 9.442622950819673e-06, "loss": 0.0125, "step": 29160 }, { "epoch": 142.42830994508847, "grad_norm": 0.5142390727996826, "learning_rate": 9.311475409836067e-06, "loss": 0.0133, "step": 29180 }, { "epoch": 142.52593044539353, "grad_norm": 0.7933589816093445, "learning_rate": 9.18032786885246e-06, "loss": 0.0151, "step": 29200 }, { "epoch": 142.6235509456986, "grad_norm": 0.8499199151992798, "learning_rate": 9.049180327868853e-06, "loss": 0.0136, "step": 29220 }, { "epoch": 142.72117144600367, "grad_norm": 0.6795129179954529, "learning_rate": 8.918032786885247e-06, "loss": 0.0136, "step": 29240 }, { "epoch": 142.81879194630872, "grad_norm": 0.3827701210975647, "learning_rate": 8.78688524590164e-06, "loss": 0.0122, "step": 29260 }, { "epoch": 142.91641244661378, "grad_norm": 0.6248555779457092, "learning_rate": 8.655737704918034e-06, "loss": 0.0113, "step": 29280 }, { "epoch": 143.01403294691886, "grad_norm": 0.9943171739578247, "learning_rate": 8.524590163934427e-06, "loss": 0.0145, "step": 29300 }, { "epoch": 143.11165344722392, "grad_norm": 0.3848264217376709, "learning_rate": 8.39344262295082e-06, "loss": 0.0119, "step": 29320 }, { "epoch": 143.20927394752897, "grad_norm": 1.02989661693573, "learning_rate": 8.262295081967214e-06, "loss": 0.0123, "step": 29340 }, { "epoch": 143.30689444783405, "grad_norm": 0.5843254923820496, "learning_rate": 8.131147540983607e-06, "loss": 0.0124, "step": 29360 }, { "epoch": 143.4045149481391, "grad_norm": 0.5134753584861755, "learning_rate": 8.000000000000001e-06, "loss": 0.0122, "step": 29380 }, { "epoch": 143.50213544844416, "grad_norm": 0.4464253783226013, "learning_rate": 7.868852459016394e-06, "loss": 0.0116, "step": 29400 }, { "epoch": 143.59975594874925, "grad_norm": 0.445730060338974, "learning_rate": 7.737704918032787e-06, "loss": 0.0116, "step": 29420 }, { "epoch": 143.6973764490543, "grad_norm": 0.7831693887710571, "learning_rate": 7.606557377049181e-06, "loss": 0.0122, "step": 29440 }, { "epoch": 143.79499694935936, "grad_norm": 0.33939194679260254, "learning_rate": 7.475409836065573e-06, "loss": 0.0131, "step": 29460 }, { "epoch": 143.89261744966444, "grad_norm": 0.36323612928390503, "learning_rate": 7.344262295081968e-06, "loss": 0.0157, "step": 29480 }, { "epoch": 143.9902379499695, "grad_norm": 0.6487870216369629, "learning_rate": 7.213114754098361e-06, "loss": 0.0157, "step": 29500 }, { "epoch": 144.08785845027455, "grad_norm": 0.3841145932674408, "learning_rate": 7.081967213114754e-06, "loss": 0.0106, "step": 29520 }, { "epoch": 144.1854789505796, "grad_norm": 1.0142998695373535, "learning_rate": 6.9508196721311484e-06, "loss": 0.0129, "step": 29540 }, { "epoch": 144.2830994508847, "grad_norm": 1.5330740213394165, "learning_rate": 6.819672131147542e-06, "loss": 0.0142, "step": 29560 }, { "epoch": 144.38071995118975, "grad_norm": 2.0231475830078125, "learning_rate": 6.688524590163935e-06, "loss": 0.0115, "step": 29580 }, { "epoch": 144.4783404514948, "grad_norm": 0.542549192905426, "learning_rate": 6.557377049180328e-06, "loss": 0.0131, "step": 29600 }, { "epoch": 144.57596095179989, "grad_norm": 0.6942082047462463, "learning_rate": 6.426229508196721e-06, "loss": 0.013, "step": 29620 }, { "epoch": 144.67358145210494, "grad_norm": 0.4934479296207428, "learning_rate": 6.295081967213115e-06, "loss": 0.0124, "step": 29640 }, { "epoch": 144.77120195241, "grad_norm": 0.9981206655502319, "learning_rate": 6.163934426229508e-06, "loss": 0.013, "step": 29660 }, { "epoch": 144.86882245271508, "grad_norm": 0.5263285636901855, "learning_rate": 6.032786885245902e-06, "loss": 0.013, "step": 29680 }, { "epoch": 144.96644295302013, "grad_norm": 0.4131539762020111, "learning_rate": 5.9016393442622956e-06, "loss": 0.0132, "step": 29700 }, { "epoch": 145.0640634533252, "grad_norm": 0.9396491646766663, "learning_rate": 5.770491803278689e-06, "loss": 0.012, "step": 29720 }, { "epoch": 145.16168395363027, "grad_norm": 0.37081795930862427, "learning_rate": 5.639344262295082e-06, "loss": 0.0118, "step": 29740 }, { "epoch": 145.25930445393533, "grad_norm": 0.5653529167175293, "learning_rate": 5.508196721311476e-06, "loss": 0.0122, "step": 29760 }, { "epoch": 145.35692495424038, "grad_norm": 0.49712416529655457, "learning_rate": 5.377049180327869e-06, "loss": 0.012, "step": 29780 }, { "epoch": 145.45454545454547, "grad_norm": 0.6723568439483643, "learning_rate": 5.245901639344263e-06, "loss": 0.0132, "step": 29800 }, { "epoch": 145.55216595485052, "grad_norm": 0.6191849708557129, "learning_rate": 5.1147540983606555e-06, "loss": 0.0142, "step": 29820 }, { "epoch": 145.64978645515558, "grad_norm": 0.8201606273651123, "learning_rate": 4.98360655737705e-06, "loss": 0.014, "step": 29840 }, { "epoch": 145.74740695546063, "grad_norm": 0.4357975423336029, "learning_rate": 4.852459016393443e-06, "loss": 0.0119, "step": 29860 }, { "epoch": 145.84502745576572, "grad_norm": 0.5062920451164246, "learning_rate": 4.721311475409836e-06, "loss": 0.0112, "step": 29880 }, { "epoch": 145.94264795607077, "grad_norm": 0.6272954940795898, "learning_rate": 4.59016393442623e-06, "loss": 0.0121, "step": 29900 }, { "epoch": 146.04026845637583, "grad_norm": 0.3578208088874817, "learning_rate": 4.4590163934426235e-06, "loss": 0.0137, "step": 29920 }, { "epoch": 146.1378889566809, "grad_norm": 0.4044102132320404, "learning_rate": 4.327868852459017e-06, "loss": 0.0133, "step": 29940 }, { "epoch": 146.23550945698597, "grad_norm": 0.4162692725658417, "learning_rate": 4.19672131147541e-06, "loss": 0.013, "step": 29960 }, { "epoch": 146.33312995729102, "grad_norm": 0.6349827647209167, "learning_rate": 4.0655737704918034e-06, "loss": 0.0138, "step": 29980 }, { "epoch": 146.4307504575961, "grad_norm": 0.6992813348770142, "learning_rate": 3.934426229508197e-06, "loss": 0.0142, "step": 30000 } ], "logging_steps": 20, "max_steps": 30600, "num_input_tokens_seen": 0, "num_train_epochs": 150, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.9434336130018816e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }