diff --git "a/checkpoint-38430/trainer_state.json" "b/checkpoint-38430/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-38430/trainer_state.json" @@ -0,0 +1,10855 @@ +{ + "best_metric": 0.9609171748161316, + "best_model_checkpoint": "autotrain-p3-h70t8-hrfal/checkpoint-38430", + "epoch": 3.0, + "eval_steps": 500, + "global_step": 38430, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00195160031225605, + "grad_norm": 13.096447944641113, + "learning_rate": 3.2526671870934166e-07, + "loss": 1.5907, + "step": 25 + }, + { + "epoch": 0.0039032006245121, + "grad_norm": 5.703726768493652, + "learning_rate": 6.505334374186833e-07, + "loss": 1.5498, + "step": 50 + }, + { + "epoch": 0.00585480093676815, + "grad_norm": 8.548869132995605, + "learning_rate": 9.75800156128025e-07, + "loss": 1.554, + "step": 75 + }, + { + "epoch": 0.0078064012490242, + "grad_norm": 5.827751636505127, + "learning_rate": 1.3010668748373666e-06, + "loss": 1.5233, + "step": 100 + }, + { + "epoch": 0.00975800156128025, + "grad_norm": 6.007171630859375, + "learning_rate": 1.6263335935467083e-06, + "loss": 1.4967, + "step": 125 + }, + { + "epoch": 0.0117096018735363, + "grad_norm": 4.992555618286133, + "learning_rate": 1.95160031225605e-06, + "loss": 1.4704, + "step": 150 + }, + { + "epoch": 0.01366120218579235, + "grad_norm": 5.003896236419678, + "learning_rate": 2.2768670309653916e-06, + "loss": 1.4794, + "step": 175 + }, + { + "epoch": 0.0156128024980484, + "grad_norm": 5.268840789794922, + "learning_rate": 2.6021337496747333e-06, + "loss": 1.3826, + "step": 200 + }, + { + "epoch": 0.01756440281030445, + "grad_norm": 5.862738132476807, + "learning_rate": 2.9274004683840754e-06, + "loss": 1.3102, + "step": 225 + }, + { + "epoch": 0.0195160031225605, + "grad_norm": 6.582859039306641, + "learning_rate": 3.2526671870934166e-06, + "loss": 1.3735, + "step": 250 + }, + { + "epoch": 0.02146760343481655, + "grad_norm": 5.93304967880249, + "learning_rate": 3.5779339058027587e-06, + "loss": 1.3547, + "step": 275 + }, + { + "epoch": 0.0234192037470726, + "grad_norm": 74.69640350341797, + "learning_rate": 3.9032006245121e-06, + "loss": 1.3695, + "step": 300 + }, + { + "epoch": 0.02537080405932865, + "grad_norm": 7.33712911605835, + "learning_rate": 4.2284673432214424e-06, + "loss": 1.4461, + "step": 325 + }, + { + "epoch": 0.0273224043715847, + "grad_norm": 9.411476135253906, + "learning_rate": 4.553734061930783e-06, + "loss": 1.3855, + "step": 350 + }, + { + "epoch": 0.02927400468384075, + "grad_norm": 5.823555946350098, + "learning_rate": 4.879000780640125e-06, + "loss": 1.4231, + "step": 375 + }, + { + "epoch": 0.0312256049960968, + "grad_norm": 8.269110679626465, + "learning_rate": 5.2042674993494666e-06, + "loss": 1.339, + "step": 400 + }, + { + "epoch": 0.03317720530835285, + "grad_norm": 6.09420919418335, + "learning_rate": 5.529534218058808e-06, + "loss": 1.3815, + "step": 425 + }, + { + "epoch": 0.0351288056206089, + "grad_norm": 6.803513526916504, + "learning_rate": 5.854800936768151e-06, + "loss": 1.2266, + "step": 450 + }, + { + "epoch": 0.03708040593286495, + "grad_norm": 8.542003631591797, + "learning_rate": 6.1800676554774915e-06, + "loss": 1.2963, + "step": 475 + }, + { + "epoch": 0.039032006245121, + "grad_norm": 9.925923347473145, + "learning_rate": 6.505334374186833e-06, + "loss": 1.314, + "step": 500 + }, + { + "epoch": 0.040983606557377046, + "grad_norm": 10.794492721557617, + "learning_rate": 6.830601092896176e-06, + "loss": 1.2595, + "step": 525 + }, + { + "epoch": 0.0429352068696331, + "grad_norm": 8.39326000213623, + "learning_rate": 7.155867811605517e-06, + "loss": 1.256, + "step": 550 + }, + { + "epoch": 0.04488680718188915, + "grad_norm": 8.971535682678223, + "learning_rate": 7.481134530314858e-06, + "loss": 1.1876, + "step": 575 + }, + { + "epoch": 0.0468384074941452, + "grad_norm": 8.570807456970215, + "learning_rate": 7.8064012490242e-06, + "loss": 1.2649, + "step": 600 + }, + { + "epoch": 0.04879000780640125, + "grad_norm": 9.939892768859863, + "learning_rate": 8.131667967733541e-06, + "loss": 1.2652, + "step": 625 + }, + { + "epoch": 0.0507416081186573, + "grad_norm": 8.307921409606934, + "learning_rate": 8.456934686442885e-06, + "loss": 1.2099, + "step": 650 + }, + { + "epoch": 0.05269320843091335, + "grad_norm": 12.68848991394043, + "learning_rate": 8.782201405152225e-06, + "loss": 1.2878, + "step": 675 + }, + { + "epoch": 0.0546448087431694, + "grad_norm": 5.517524242401123, + "learning_rate": 9.107468123861566e-06, + "loss": 1.2559, + "step": 700 + }, + { + "epoch": 0.05659640905542545, + "grad_norm": 8.12459945678711, + "learning_rate": 9.432734842570908e-06, + "loss": 1.2696, + "step": 725 + }, + { + "epoch": 0.0585480093676815, + "grad_norm": 10.731528282165527, + "learning_rate": 9.75800156128025e-06, + "loss": 1.2317, + "step": 750 + }, + { + "epoch": 0.06049960967993755, + "grad_norm": 9.470462799072266, + "learning_rate": 1.0083268279989593e-05, + "loss": 1.1858, + "step": 775 + }, + { + "epoch": 0.0624512099921936, + "grad_norm": 5.64531946182251, + "learning_rate": 1.0408534998698933e-05, + "loss": 1.2177, + "step": 800 + }, + { + "epoch": 0.06440281030444965, + "grad_norm": 8.529309272766113, + "learning_rate": 1.0733801717408275e-05, + "loss": 1.2405, + "step": 825 + }, + { + "epoch": 0.0663544106167057, + "grad_norm": 8.614241600036621, + "learning_rate": 1.1059068436117616e-05, + "loss": 1.319, + "step": 850 + }, + { + "epoch": 0.06830601092896176, + "grad_norm": 8.681391716003418, + "learning_rate": 1.1384335154826958e-05, + "loss": 1.1548, + "step": 875 + }, + { + "epoch": 0.0702576112412178, + "grad_norm": 7.165237903594971, + "learning_rate": 1.1709601873536301e-05, + "loss": 1.1905, + "step": 900 + }, + { + "epoch": 0.07220921155347385, + "grad_norm": 10.226017951965332, + "learning_rate": 1.2034868592245641e-05, + "loss": 1.1018, + "step": 925 + }, + { + "epoch": 0.0741608118657299, + "grad_norm": 10.01928424835205, + "learning_rate": 1.2360135310954983e-05, + "loss": 1.1119, + "step": 950 + }, + { + "epoch": 0.07611241217798595, + "grad_norm": 5.862958908081055, + "learning_rate": 1.2685402029664325e-05, + "loss": 1.2183, + "step": 975 + }, + { + "epoch": 0.078064012490242, + "grad_norm": 9.766705513000488, + "learning_rate": 1.3010668748373666e-05, + "loss": 1.1418, + "step": 1000 + }, + { + "epoch": 0.08001561280249805, + "grad_norm": 13.597146987915039, + "learning_rate": 1.3335935467083008e-05, + "loss": 1.2269, + "step": 1025 + }, + { + "epoch": 0.08196721311475409, + "grad_norm": 5.752309322357178, + "learning_rate": 1.3661202185792351e-05, + "loss": 1.0217, + "step": 1050 + }, + { + "epoch": 0.08391881342701014, + "grad_norm": 11.505197525024414, + "learning_rate": 1.3986468904501693e-05, + "loss": 1.1472, + "step": 1075 + }, + { + "epoch": 0.0858704137392662, + "grad_norm": 11.984565734863281, + "learning_rate": 1.4311735623211035e-05, + "loss": 1.1677, + "step": 1100 + }, + { + "epoch": 0.08782201405152225, + "grad_norm": 11.957587242126465, + "learning_rate": 1.4637002341920375e-05, + "loss": 1.1112, + "step": 1125 + }, + { + "epoch": 0.0897736143637783, + "grad_norm": 8.09973430633545, + "learning_rate": 1.4962269060629716e-05, + "loss": 1.2213, + "step": 1150 + }, + { + "epoch": 0.09172521467603435, + "grad_norm": 7.261201858520508, + "learning_rate": 1.528753577933906e-05, + "loss": 1.2436, + "step": 1175 + }, + { + "epoch": 0.0936768149882904, + "grad_norm": 5.268815040588379, + "learning_rate": 1.56128024980484e-05, + "loss": 1.3042, + "step": 1200 + }, + { + "epoch": 0.09562841530054644, + "grad_norm": 12.64362907409668, + "learning_rate": 1.593806921675774e-05, + "loss": 1.2366, + "step": 1225 + }, + { + "epoch": 0.0975800156128025, + "grad_norm": 8.210792541503906, + "learning_rate": 1.6263335935467083e-05, + "loss": 1.1673, + "step": 1250 + }, + { + "epoch": 0.09953161592505855, + "grad_norm": 6.9734673500061035, + "learning_rate": 1.6588602654176425e-05, + "loss": 1.1746, + "step": 1275 + }, + { + "epoch": 0.1014832162373146, + "grad_norm": 5.943842887878418, + "learning_rate": 1.691386937288577e-05, + "loss": 1.1119, + "step": 1300 + }, + { + "epoch": 0.10343481654957065, + "grad_norm": 7.7994184494018555, + "learning_rate": 1.7239136091595108e-05, + "loss": 1.1247, + "step": 1325 + }, + { + "epoch": 0.1053864168618267, + "grad_norm": 8.968555450439453, + "learning_rate": 1.756440281030445e-05, + "loss": 1.1754, + "step": 1350 + }, + { + "epoch": 0.10733801717408274, + "grad_norm": 9.296995162963867, + "learning_rate": 1.788966952901379e-05, + "loss": 1.1995, + "step": 1375 + }, + { + "epoch": 0.1092896174863388, + "grad_norm": 4.078953266143799, + "learning_rate": 1.8214936247723133e-05, + "loss": 1.1598, + "step": 1400 + }, + { + "epoch": 0.11124121779859485, + "grad_norm": 9.327086448669434, + "learning_rate": 1.8540202966432478e-05, + "loss": 1.2579, + "step": 1425 + }, + { + "epoch": 0.1131928181108509, + "grad_norm": 5.903244972229004, + "learning_rate": 1.8865469685141816e-05, + "loss": 1.0912, + "step": 1450 + }, + { + "epoch": 0.11514441842310695, + "grad_norm": 5.108721733093262, + "learning_rate": 1.9190736403851158e-05, + "loss": 1.1937, + "step": 1475 + }, + { + "epoch": 0.117096018735363, + "grad_norm": 6.882597923278809, + "learning_rate": 1.95160031225605e-05, + "loss": 1.1128, + "step": 1500 + }, + { + "epoch": 0.11904761904761904, + "grad_norm": 5.640380382537842, + "learning_rate": 1.984126984126984e-05, + "loss": 1.0937, + "step": 1525 + }, + { + "epoch": 0.1209992193598751, + "grad_norm": 7.952732086181641, + "learning_rate": 2.0166536559979186e-05, + "loss": 1.182, + "step": 1550 + }, + { + "epoch": 0.12295081967213115, + "grad_norm": 7.377562046051025, + "learning_rate": 2.0491803278688525e-05, + "loss": 1.0724, + "step": 1575 + }, + { + "epoch": 0.1249024199843872, + "grad_norm": 4.367126941680908, + "learning_rate": 2.0817069997397866e-05, + "loss": 1.0855, + "step": 1600 + }, + { + "epoch": 0.12685402029664325, + "grad_norm": 4.762927532196045, + "learning_rate": 2.1142336716107208e-05, + "loss": 1.1242, + "step": 1625 + }, + { + "epoch": 0.1288056206088993, + "grad_norm": 8.367018699645996, + "learning_rate": 2.146760343481655e-05, + "loss": 1.1775, + "step": 1650 + }, + { + "epoch": 0.13075722092115535, + "grad_norm": 4.4730377197265625, + "learning_rate": 2.1792870153525895e-05, + "loss": 1.1566, + "step": 1675 + }, + { + "epoch": 0.1327088212334114, + "grad_norm": 4.108894348144531, + "learning_rate": 2.2118136872235233e-05, + "loss": 1.1069, + "step": 1700 + }, + { + "epoch": 0.13466042154566746, + "grad_norm": 4.871560096740723, + "learning_rate": 2.2443403590944575e-05, + "loss": 1.1411, + "step": 1725 + }, + { + "epoch": 0.1366120218579235, + "grad_norm": 5.6291632652282715, + "learning_rate": 2.2768670309653916e-05, + "loss": 1.1397, + "step": 1750 + }, + { + "epoch": 0.13856362217017953, + "grad_norm": 5.767420768737793, + "learning_rate": 2.3093937028363258e-05, + "loss": 1.0918, + "step": 1775 + }, + { + "epoch": 0.1405152224824356, + "grad_norm": 7.240326881408691, + "learning_rate": 2.3419203747072603e-05, + "loss": 1.0361, + "step": 1800 + }, + { + "epoch": 0.14246682279469164, + "grad_norm": 5.894730091094971, + "learning_rate": 2.374447046578194e-05, + "loss": 1.0645, + "step": 1825 + }, + { + "epoch": 0.1444184231069477, + "grad_norm": 7.513360977172852, + "learning_rate": 2.4069737184491283e-05, + "loss": 1.0948, + "step": 1850 + }, + { + "epoch": 0.14637002341920374, + "grad_norm": 18.503400802612305, + "learning_rate": 2.4395003903200624e-05, + "loss": 1.1111, + "step": 1875 + }, + { + "epoch": 0.1483216237314598, + "grad_norm": 10.58495044708252, + "learning_rate": 2.4720270621909966e-05, + "loss": 1.2131, + "step": 1900 + }, + { + "epoch": 0.15027322404371585, + "grad_norm": 4.171135425567627, + "learning_rate": 2.5045537340619308e-05, + "loss": 1.1158, + "step": 1925 + }, + { + "epoch": 0.1522248243559719, + "grad_norm": 4.908154010772705, + "learning_rate": 2.537080405932865e-05, + "loss": 1.0511, + "step": 1950 + }, + { + "epoch": 0.15417642466822795, + "grad_norm": 4.618373394012451, + "learning_rate": 2.5696070778037995e-05, + "loss": 1.0576, + "step": 1975 + }, + { + "epoch": 0.156128024980484, + "grad_norm": 3.7275032997131348, + "learning_rate": 2.6021337496747333e-05, + "loss": 1.1801, + "step": 2000 + }, + { + "epoch": 0.15807962529274006, + "grad_norm": 8.42215347290039, + "learning_rate": 2.6346604215456678e-05, + "loss": 1.2222, + "step": 2025 + }, + { + "epoch": 0.1600312256049961, + "grad_norm": 7.755763530731201, + "learning_rate": 2.6671870934166016e-05, + "loss": 1.1601, + "step": 2050 + }, + { + "epoch": 0.16198282591725216, + "grad_norm": 6.1496381759643555, + "learning_rate": 2.6997137652875358e-05, + "loss": 1.2084, + "step": 2075 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 5.3830766677856445, + "learning_rate": 2.7322404371584703e-05, + "loss": 1.0965, + "step": 2100 + }, + { + "epoch": 0.16588602654176424, + "grad_norm": 4.946489334106445, + "learning_rate": 2.764767109029404e-05, + "loss": 1.1505, + "step": 2125 + }, + { + "epoch": 0.1678376268540203, + "grad_norm": 8.09270191192627, + "learning_rate": 2.7972937809003386e-05, + "loss": 1.0981, + "step": 2150 + }, + { + "epoch": 0.16978922716627634, + "grad_norm": 13.592877388000488, + "learning_rate": 2.8298204527712724e-05, + "loss": 1.1832, + "step": 2175 + }, + { + "epoch": 0.1717408274785324, + "grad_norm": 11.041646957397461, + "learning_rate": 2.862347124642207e-05, + "loss": 1.1799, + "step": 2200 + }, + { + "epoch": 0.17369242779078845, + "grad_norm": 3.2208385467529297, + "learning_rate": 2.894873796513141e-05, + "loss": 1.0502, + "step": 2225 + }, + { + "epoch": 0.1756440281030445, + "grad_norm": 6.921932697296143, + "learning_rate": 2.927400468384075e-05, + "loss": 1.0409, + "step": 2250 + }, + { + "epoch": 0.17759562841530055, + "grad_norm": 5.773618221282959, + "learning_rate": 2.9599271402550094e-05, + "loss": 1.1298, + "step": 2275 + }, + { + "epoch": 0.1795472287275566, + "grad_norm": 4.763660430908203, + "learning_rate": 2.9924538121259433e-05, + "loss": 1.2314, + "step": 2300 + }, + { + "epoch": 0.18149882903981265, + "grad_norm": 6.696008682250977, + "learning_rate": 3.0249804839968778e-05, + "loss": 1.1259, + "step": 2325 + }, + { + "epoch": 0.1834504293520687, + "grad_norm": 10.095010757446289, + "learning_rate": 3.057507155867812e-05, + "loss": 1.0491, + "step": 2350 + }, + { + "epoch": 0.18540202966432476, + "grad_norm": 5.6859941482543945, + "learning_rate": 3.090033827738746e-05, + "loss": 1.069, + "step": 2375 + }, + { + "epoch": 0.1873536299765808, + "grad_norm": 7.328535079956055, + "learning_rate": 3.12256049960968e-05, + "loss": 1.0507, + "step": 2400 + }, + { + "epoch": 0.18930523028883683, + "grad_norm": 6.071335315704346, + "learning_rate": 3.155087171480614e-05, + "loss": 1.1024, + "step": 2425 + }, + { + "epoch": 0.1912568306010929, + "grad_norm": 3.737802028656006, + "learning_rate": 3.187613843351548e-05, + "loss": 1.0667, + "step": 2450 + }, + { + "epoch": 0.19320843091334894, + "grad_norm": 4.416076183319092, + "learning_rate": 3.220140515222483e-05, + "loss": 1.0896, + "step": 2475 + }, + { + "epoch": 0.195160031225605, + "grad_norm": 8.880372047424316, + "learning_rate": 3.2526671870934166e-05, + "loss": 1.095, + "step": 2500 + }, + { + "epoch": 0.19711163153786104, + "grad_norm": 11.645663261413574, + "learning_rate": 3.285193858964351e-05, + "loss": 1.2015, + "step": 2525 + }, + { + "epoch": 0.1990632318501171, + "grad_norm": 4.925193786621094, + "learning_rate": 3.317720530835285e-05, + "loss": 1.186, + "step": 2550 + }, + { + "epoch": 0.20101483216237315, + "grad_norm": 5.554866313934326, + "learning_rate": 3.350247202706219e-05, + "loss": 0.9954, + "step": 2575 + }, + { + "epoch": 0.2029664324746292, + "grad_norm": 3.0277769565582275, + "learning_rate": 3.382773874577154e-05, + "loss": 1.0749, + "step": 2600 + }, + { + "epoch": 0.20491803278688525, + "grad_norm": 8.346768379211426, + "learning_rate": 3.4153005464480874e-05, + "loss": 1.1147, + "step": 2625 + }, + { + "epoch": 0.2068696330991413, + "grad_norm": 9.186507225036621, + "learning_rate": 3.4478272183190216e-05, + "loss": 1.1195, + "step": 2650 + }, + { + "epoch": 0.20882123341139736, + "grad_norm": 4.687973499298096, + "learning_rate": 3.480353890189956e-05, + "loss": 1.106, + "step": 2675 + }, + { + "epoch": 0.2107728337236534, + "grad_norm": 3.2728404998779297, + "learning_rate": 3.51288056206089e-05, + "loss": 1.0834, + "step": 2700 + }, + { + "epoch": 0.21272443403590943, + "grad_norm": 4.886569976806641, + "learning_rate": 3.545407233931825e-05, + "loss": 1.1344, + "step": 2725 + }, + { + "epoch": 0.21467603434816548, + "grad_norm": 4.94088077545166, + "learning_rate": 3.577933905802758e-05, + "loss": 1.2087, + "step": 2750 + }, + { + "epoch": 0.21662763466042154, + "grad_norm": 4.101061820983887, + "learning_rate": 3.6104605776736924e-05, + "loss": 1.0964, + "step": 2775 + }, + { + "epoch": 0.2185792349726776, + "grad_norm": 3.3616273403167725, + "learning_rate": 3.6429872495446266e-05, + "loss": 1.2372, + "step": 2800 + }, + { + "epoch": 0.22053083528493364, + "grad_norm": 6.770752906799316, + "learning_rate": 3.675513921415561e-05, + "loss": 1.049, + "step": 2825 + }, + { + "epoch": 0.2224824355971897, + "grad_norm": 5.5648393630981445, + "learning_rate": 3.7080405932864956e-05, + "loss": 1.0512, + "step": 2850 + }, + { + "epoch": 0.22443403590944574, + "grad_norm": 4.148952960968018, + "learning_rate": 3.740567265157429e-05, + "loss": 1.1074, + "step": 2875 + }, + { + "epoch": 0.2263856362217018, + "grad_norm": 4.17451286315918, + "learning_rate": 3.773093937028363e-05, + "loss": 1.0759, + "step": 2900 + }, + { + "epoch": 0.22833723653395785, + "grad_norm": 1.8561843633651733, + "learning_rate": 3.8056206088992974e-05, + "loss": 1.0641, + "step": 2925 + }, + { + "epoch": 0.2302888368462139, + "grad_norm": 4.825262546539307, + "learning_rate": 3.8381472807702316e-05, + "loss": 1.1774, + "step": 2950 + }, + { + "epoch": 0.23224043715846995, + "grad_norm": 5.506011962890625, + "learning_rate": 3.8706739526411664e-05, + "loss": 1.068, + "step": 2975 + }, + { + "epoch": 0.234192037470726, + "grad_norm": 4.509649753570557, + "learning_rate": 3.9032006245121e-05, + "loss": 1.1719, + "step": 3000 + }, + { + "epoch": 0.23614363778298206, + "grad_norm": 5.580336093902588, + "learning_rate": 3.935727296383034e-05, + "loss": 1.1327, + "step": 3025 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 2.790898561477661, + "learning_rate": 3.968253968253968e-05, + "loss": 1.0681, + "step": 3050 + }, + { + "epoch": 0.24004683840749413, + "grad_norm": 7.094126224517822, + "learning_rate": 4.0007806401249024e-05, + "loss": 1.1957, + "step": 3075 + }, + { + "epoch": 0.2419984387197502, + "grad_norm": 3.570857286453247, + "learning_rate": 4.033307311995837e-05, + "loss": 1.1034, + "step": 3100 + }, + { + "epoch": 0.24395003903200624, + "grad_norm": 3.804701805114746, + "learning_rate": 4.065833983866771e-05, + "loss": 1.1447, + "step": 3125 + }, + { + "epoch": 0.2459016393442623, + "grad_norm": 3.908904552459717, + "learning_rate": 4.098360655737705e-05, + "loss": 0.9782, + "step": 3150 + }, + { + "epoch": 0.24785323965651834, + "grad_norm": 4.148895263671875, + "learning_rate": 4.130887327608639e-05, + "loss": 1.0813, + "step": 3175 + }, + { + "epoch": 0.2498048399687744, + "grad_norm": 4.743203163146973, + "learning_rate": 4.163413999479573e-05, + "loss": 1.1125, + "step": 3200 + }, + { + "epoch": 0.25175644028103045, + "grad_norm": 3.7647483348846436, + "learning_rate": 4.195940671350508e-05, + "loss": 1.0592, + "step": 3225 + }, + { + "epoch": 0.2537080405932865, + "grad_norm": 4.593599796295166, + "learning_rate": 4.2284673432214416e-05, + "loss": 1.0296, + "step": 3250 + }, + { + "epoch": 0.25565964090554255, + "grad_norm": 4.175937652587891, + "learning_rate": 4.260994015092376e-05, + "loss": 1.1492, + "step": 3275 + }, + { + "epoch": 0.2576112412177986, + "grad_norm": 3.975682020187378, + "learning_rate": 4.29352068696331e-05, + "loss": 1.1119, + "step": 3300 + }, + { + "epoch": 0.25956284153005466, + "grad_norm": 7.904545307159424, + "learning_rate": 4.326047358834244e-05, + "loss": 1.1627, + "step": 3325 + }, + { + "epoch": 0.2615144418423107, + "grad_norm": 7.504304885864258, + "learning_rate": 4.358574030705179e-05, + "loss": 1.1978, + "step": 3350 + }, + { + "epoch": 0.26346604215456676, + "grad_norm": 3.108285903930664, + "learning_rate": 4.3911007025761124e-05, + "loss": 1.0748, + "step": 3375 + }, + { + "epoch": 0.2654176424668228, + "grad_norm": 7.470887184143066, + "learning_rate": 4.4236273744470466e-05, + "loss": 1.0248, + "step": 3400 + }, + { + "epoch": 0.26736924277907886, + "grad_norm": 4.784750461578369, + "learning_rate": 4.456154046317981e-05, + "loss": 1.1805, + "step": 3425 + }, + { + "epoch": 0.2693208430913349, + "grad_norm": 6.852480411529541, + "learning_rate": 4.488680718188915e-05, + "loss": 1.1956, + "step": 3450 + }, + { + "epoch": 0.27127244340359097, + "grad_norm": 4.3307037353515625, + "learning_rate": 4.52120739005985e-05, + "loss": 1.1241, + "step": 3475 + }, + { + "epoch": 0.273224043715847, + "grad_norm": 4.881993293762207, + "learning_rate": 4.553734061930783e-05, + "loss": 1.1308, + "step": 3500 + }, + { + "epoch": 0.275175644028103, + "grad_norm": 19.279508590698242, + "learning_rate": 4.5862607338017174e-05, + "loss": 1.1272, + "step": 3525 + }, + { + "epoch": 0.27712724434035907, + "grad_norm": 4.769797325134277, + "learning_rate": 4.6187874056726516e-05, + "loss": 1.1071, + "step": 3550 + }, + { + "epoch": 0.2790788446526151, + "grad_norm": 8.924955368041992, + "learning_rate": 4.651314077543586e-05, + "loss": 1.1521, + "step": 3575 + }, + { + "epoch": 0.2810304449648712, + "grad_norm": 4.278524398803711, + "learning_rate": 4.6838407494145206e-05, + "loss": 1.0985, + "step": 3600 + }, + { + "epoch": 0.2829820452771272, + "grad_norm": 4.353513240814209, + "learning_rate": 4.716367421285454e-05, + "loss": 1.0604, + "step": 3625 + }, + { + "epoch": 0.2849336455893833, + "grad_norm": 5.537378787994385, + "learning_rate": 4.748894093156388e-05, + "loss": 1.1981, + "step": 3650 + }, + { + "epoch": 0.28688524590163933, + "grad_norm": 5.4037981033325195, + "learning_rate": 4.7814207650273224e-05, + "loss": 1.1133, + "step": 3675 + }, + { + "epoch": 0.2888368462138954, + "grad_norm": 10.91964340209961, + "learning_rate": 4.8139474368982566e-05, + "loss": 1.1587, + "step": 3700 + }, + { + "epoch": 0.29078844652615143, + "grad_norm": 7.859940528869629, + "learning_rate": 4.8464741087691914e-05, + "loss": 1.1853, + "step": 3725 + }, + { + "epoch": 0.2927400468384075, + "grad_norm": 3.7757997512817383, + "learning_rate": 4.879000780640125e-05, + "loss": 1.0812, + "step": 3750 + }, + { + "epoch": 0.29469164715066354, + "grad_norm": 4.0322418212890625, + "learning_rate": 4.911527452511059e-05, + "loss": 1.0592, + "step": 3775 + }, + { + "epoch": 0.2966432474629196, + "grad_norm": 6.0286688804626465, + "learning_rate": 4.944054124381993e-05, + "loss": 1.164, + "step": 3800 + }, + { + "epoch": 0.29859484777517564, + "grad_norm": 2.7810094356536865, + "learning_rate": 4.9765807962529274e-05, + "loss": 1.0809, + "step": 3825 + }, + { + "epoch": 0.3005464480874317, + "grad_norm": 3.493044137954712, + "learning_rate": 4.9989880590973484e-05, + "loss": 1.0978, + "step": 3850 + }, + { + "epoch": 0.30249804839968775, + "grad_norm": 8.307269096374512, + "learning_rate": 4.995373984445023e-05, + "loss": 1.1567, + "step": 3875 + }, + { + "epoch": 0.3044496487119438, + "grad_norm": 3.932588577270508, + "learning_rate": 4.991759909792697e-05, + "loss": 1.0253, + "step": 3900 + }, + { + "epoch": 0.30640124902419985, + "grad_norm": 7.0573577880859375, + "learning_rate": 4.988145835140371e-05, + "loss": 1.0473, + "step": 3925 + }, + { + "epoch": 0.3083528493364559, + "grad_norm": 5.22884464263916, + "learning_rate": 4.9845317604880446e-05, + "loss": 1.2578, + "step": 3950 + }, + { + "epoch": 0.31030444964871196, + "grad_norm": 4.004763603210449, + "learning_rate": 4.980917685835719e-05, + "loss": 1.1439, + "step": 3975 + }, + { + "epoch": 0.312256049960968, + "grad_norm": 7.770696640014648, + "learning_rate": 4.977303611183393e-05, + "loss": 1.0966, + "step": 4000 + }, + { + "epoch": 0.31420765027322406, + "grad_norm": 5.048931121826172, + "learning_rate": 4.973689536531067e-05, + "loss": 1.0414, + "step": 4025 + }, + { + "epoch": 0.3161592505854801, + "grad_norm": 6.155930042266846, + "learning_rate": 4.970075461878741e-05, + "loss": 0.9683, + "step": 4050 + }, + { + "epoch": 0.31811085089773616, + "grad_norm": 5.830320835113525, + "learning_rate": 4.966461387226415e-05, + "loss": 1.1416, + "step": 4075 + }, + { + "epoch": 0.3200624512099922, + "grad_norm": 4.284904479980469, + "learning_rate": 4.962847312574089e-05, + "loss": 1.1349, + "step": 4100 + }, + { + "epoch": 0.32201405152224827, + "grad_norm": 6.230539321899414, + "learning_rate": 4.959233237921762e-05, + "loss": 1.2262, + "step": 4125 + }, + { + "epoch": 0.3239656518345043, + "grad_norm": 4.681546688079834, + "learning_rate": 4.9556191632694364e-05, + "loss": 1.1271, + "step": 4150 + }, + { + "epoch": 0.3259172521467603, + "grad_norm": 4.804106712341309, + "learning_rate": 4.952005088617111e-05, + "loss": 1.0742, + "step": 4175 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 4.871090888977051, + "learning_rate": 4.948391013964785e-05, + "loss": 1.1764, + "step": 4200 + }, + { + "epoch": 0.3298204527712724, + "grad_norm": 4.321030139923096, + "learning_rate": 4.9447769393124586e-05, + "loss": 1.1126, + "step": 4225 + }, + { + "epoch": 0.3317720530835285, + "grad_norm": 6.380833148956299, + "learning_rate": 4.9411628646601326e-05, + "loss": 1.192, + "step": 4250 + }, + { + "epoch": 0.3337236533957845, + "grad_norm": 3.971674919128418, + "learning_rate": 4.937548790007807e-05, + "loss": 1.1536, + "step": 4275 + }, + { + "epoch": 0.3356752537080406, + "grad_norm": 37.128082275390625, + "learning_rate": 4.933934715355481e-05, + "loss": 1.13, + "step": 4300 + }, + { + "epoch": 0.33762685402029663, + "grad_norm": 3.7878243923187256, + "learning_rate": 4.930320640703154e-05, + "loss": 1.111, + "step": 4325 + }, + { + "epoch": 0.3395784543325527, + "grad_norm": 5.418118476867676, + "learning_rate": 4.926706566050829e-05, + "loss": 0.9958, + "step": 4350 + }, + { + "epoch": 0.34153005464480873, + "grad_norm": 4.387601375579834, + "learning_rate": 4.923092491398503e-05, + "loss": 0.9424, + "step": 4375 + }, + { + "epoch": 0.3434816549570648, + "grad_norm": 3.4925923347473145, + "learning_rate": 4.919478416746176e-05, + "loss": 1.1682, + "step": 4400 + }, + { + "epoch": 0.34543325526932084, + "grad_norm": 4.853846073150635, + "learning_rate": 4.91586434209385e-05, + "loss": 1.2165, + "step": 4425 + }, + { + "epoch": 0.3473848555815769, + "grad_norm": 6.108550548553467, + "learning_rate": 4.9122502674415244e-05, + "loss": 1.2025, + "step": 4450 + }, + { + "epoch": 0.34933645589383294, + "grad_norm": 4.7765960693359375, + "learning_rate": 4.9086361927891984e-05, + "loss": 1.0645, + "step": 4475 + }, + { + "epoch": 0.351288056206089, + "grad_norm": 6.34417200088501, + "learning_rate": 4.9050221181368725e-05, + "loss": 1.1926, + "step": 4500 + }, + { + "epoch": 0.35323965651834505, + "grad_norm": 5.653446674346924, + "learning_rate": 4.9014080434845466e-05, + "loss": 1.099, + "step": 4525 + }, + { + "epoch": 0.3551912568306011, + "grad_norm": 6.869150161743164, + "learning_rate": 4.8977939688322206e-05, + "loss": 1.0869, + "step": 4550 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 4.249377250671387, + "learning_rate": 4.894179894179895e-05, + "loss": 1.0478, + "step": 4575 + }, + { + "epoch": 0.3590944574551132, + "grad_norm": 5.449871063232422, + "learning_rate": 4.890565819527568e-05, + "loss": 1.1365, + "step": 4600 + }, + { + "epoch": 0.36104605776736926, + "grad_norm": 4.3751420974731445, + "learning_rate": 4.886951744875242e-05, + "loss": 1.0853, + "step": 4625 + }, + { + "epoch": 0.3629976580796253, + "grad_norm": 4.02731990814209, + "learning_rate": 4.883337670222916e-05, + "loss": 1.0923, + "step": 4650 + }, + { + "epoch": 0.36494925839188136, + "grad_norm": 6.136334419250488, + "learning_rate": 4.87972359557059e-05, + "loss": 1.1415, + "step": 4675 + }, + { + "epoch": 0.3669008587041374, + "grad_norm": 7.303816318511963, + "learning_rate": 4.876109520918264e-05, + "loss": 1.1066, + "step": 4700 + }, + { + "epoch": 0.36885245901639346, + "grad_norm": 4.0389485359191895, + "learning_rate": 4.872495446265938e-05, + "loss": 1.0511, + "step": 4725 + }, + { + "epoch": 0.3708040593286495, + "grad_norm": 4.499385833740234, + "learning_rate": 4.8688813716136124e-05, + "loss": 1.1585, + "step": 4750 + }, + { + "epoch": 0.37275565964090557, + "grad_norm": 4.839364528656006, + "learning_rate": 4.8652672969612864e-05, + "loss": 1.0976, + "step": 4775 + }, + { + "epoch": 0.3747072599531616, + "grad_norm": 3.6924235820770264, + "learning_rate": 4.86165322230896e-05, + "loss": 1.0606, + "step": 4800 + }, + { + "epoch": 0.3766588602654176, + "grad_norm": 3.1741929054260254, + "learning_rate": 4.8580391476566346e-05, + "loss": 1.068, + "step": 4825 + }, + { + "epoch": 0.37861046057767367, + "grad_norm": 5.013918876647949, + "learning_rate": 4.8544250730043086e-05, + "loss": 1.1417, + "step": 4850 + }, + { + "epoch": 0.3805620608899297, + "grad_norm": 3.984086751937866, + "learning_rate": 4.850810998351982e-05, + "loss": 1.0178, + "step": 4875 + }, + { + "epoch": 0.3825136612021858, + "grad_norm": 4.162098407745361, + "learning_rate": 4.847196923699656e-05, + "loss": 0.9814, + "step": 4900 + }, + { + "epoch": 0.3844652615144418, + "grad_norm": 4.488099575042725, + "learning_rate": 4.84358284904733e-05, + "loss": 0.9956, + "step": 4925 + }, + { + "epoch": 0.3864168618266979, + "grad_norm": 3.54471755027771, + "learning_rate": 4.839968774395004e-05, + "loss": 1.1253, + "step": 4950 + }, + { + "epoch": 0.38836846213895393, + "grad_norm": 5.596224784851074, + "learning_rate": 4.8363546997426775e-05, + "loss": 1.0257, + "step": 4975 + }, + { + "epoch": 0.39032006245121, + "grad_norm": 3.2627928256988525, + "learning_rate": 4.832740625090352e-05, + "loss": 1.0162, + "step": 5000 + }, + { + "epoch": 0.39227166276346603, + "grad_norm": 5.352809429168701, + "learning_rate": 4.8291265504380263e-05, + "loss": 1.181, + "step": 5025 + }, + { + "epoch": 0.3942232630757221, + "grad_norm": 5.056680679321289, + "learning_rate": 4.8255124757857004e-05, + "loss": 1.0824, + "step": 5050 + }, + { + "epoch": 0.39617486338797814, + "grad_norm": 4.917740345001221, + "learning_rate": 4.821898401133374e-05, + "loss": 1.0051, + "step": 5075 + }, + { + "epoch": 0.3981264637002342, + "grad_norm": 3.8183538913726807, + "learning_rate": 4.818284326481048e-05, + "loss": 1.2464, + "step": 5100 + }, + { + "epoch": 0.40007806401249024, + "grad_norm": 4.752437591552734, + "learning_rate": 4.814670251828722e-05, + "loss": 1.2254, + "step": 5125 + }, + { + "epoch": 0.4020296643247463, + "grad_norm": 6.421558380126953, + "learning_rate": 4.811056177176396e-05, + "loss": 1.0933, + "step": 5150 + }, + { + "epoch": 0.40398126463700235, + "grad_norm": 5.246910095214844, + "learning_rate": 4.80744210252407e-05, + "loss": 1.121, + "step": 5175 + }, + { + "epoch": 0.4059328649492584, + "grad_norm": 2.352116584777832, + "learning_rate": 4.803828027871744e-05, + "loss": 1.1774, + "step": 5200 + }, + { + "epoch": 0.40788446526151445, + "grad_norm": 4.930973052978516, + "learning_rate": 4.800213953219418e-05, + "loss": 1.0888, + "step": 5225 + }, + { + "epoch": 0.4098360655737705, + "grad_norm": 2.956850051879883, + "learning_rate": 4.7965998785670915e-05, + "loss": 1.0709, + "step": 5250 + }, + { + "epoch": 0.41178766588602655, + "grad_norm": 3.146444320678711, + "learning_rate": 4.7929858039147656e-05, + "loss": 1.1323, + "step": 5275 + }, + { + "epoch": 0.4137392661982826, + "grad_norm": 2.190849542617798, + "learning_rate": 4.7893717292624396e-05, + "loss": 1.0779, + "step": 5300 + }, + { + "epoch": 0.41569086651053866, + "grad_norm": 7.8777689933776855, + "learning_rate": 4.7857576546101143e-05, + "loss": 1.1449, + "step": 5325 + }, + { + "epoch": 0.4176424668227947, + "grad_norm": 3.634291648864746, + "learning_rate": 4.782143579957788e-05, + "loss": 1.1609, + "step": 5350 + }, + { + "epoch": 0.41959406713505076, + "grad_norm": 7.147963047027588, + "learning_rate": 4.778529505305462e-05, + "loss": 1.1566, + "step": 5375 + }, + { + "epoch": 0.4215456674473068, + "grad_norm": 3.821582555770874, + "learning_rate": 4.774915430653136e-05, + "loss": 1.1444, + "step": 5400 + }, + { + "epoch": 0.42349726775956287, + "grad_norm": 4.176403999328613, + "learning_rate": 4.77130135600081e-05, + "loss": 1.0199, + "step": 5425 + }, + { + "epoch": 0.42544886807181886, + "grad_norm": 6.640966892242432, + "learning_rate": 4.767687281348483e-05, + "loss": 1.1497, + "step": 5450 + }, + { + "epoch": 0.4274004683840749, + "grad_norm": 4.840256214141846, + "learning_rate": 4.764073206696158e-05, + "loss": 1.1272, + "step": 5475 + }, + { + "epoch": 0.42935206869633097, + "grad_norm": 5.981360912322998, + "learning_rate": 4.760459132043832e-05, + "loss": 1.0107, + "step": 5500 + }, + { + "epoch": 0.431303669008587, + "grad_norm": 4.916370391845703, + "learning_rate": 4.7568450573915054e-05, + "loss": 1.1782, + "step": 5525 + }, + { + "epoch": 0.4332552693208431, + "grad_norm": 5.027568340301514, + "learning_rate": 4.7532309827391795e-05, + "loss": 1.0734, + "step": 5550 + }, + { + "epoch": 0.4352068696330991, + "grad_norm": 3.8499932289123535, + "learning_rate": 4.7496169080868536e-05, + "loss": 1.1896, + "step": 5575 + }, + { + "epoch": 0.4371584699453552, + "grad_norm": 4.520782470703125, + "learning_rate": 4.7460028334345276e-05, + "loss": 1.1282, + "step": 5600 + }, + { + "epoch": 0.43911007025761123, + "grad_norm": 25.82693099975586, + "learning_rate": 4.742388758782202e-05, + "loss": 1.1077, + "step": 5625 + }, + { + "epoch": 0.4410616705698673, + "grad_norm": 7.125487804412842, + "learning_rate": 4.738774684129876e-05, + "loss": 1.1773, + "step": 5650 + }, + { + "epoch": 0.44301327088212333, + "grad_norm": 4.232584476470947, + "learning_rate": 4.73516060947755e-05, + "loss": 1.1326, + "step": 5675 + }, + { + "epoch": 0.4449648711943794, + "grad_norm": 3.191322088241577, + "learning_rate": 4.731546534825224e-05, + "loss": 1.0496, + "step": 5700 + }, + { + "epoch": 0.44691647150663544, + "grad_norm": 7.781968593597412, + "learning_rate": 4.727932460172897e-05, + "loss": 1.1592, + "step": 5725 + }, + { + "epoch": 0.4488680718188915, + "grad_norm": 2.8880882263183594, + "learning_rate": 4.724318385520571e-05, + "loss": 1.0381, + "step": 5750 + }, + { + "epoch": 0.45081967213114754, + "grad_norm": 6.664117813110352, + "learning_rate": 4.720704310868245e-05, + "loss": 1.0518, + "step": 5775 + }, + { + "epoch": 0.4527712724434036, + "grad_norm": 3.957406520843506, + "learning_rate": 4.7170902362159194e-05, + "loss": 1.0781, + "step": 5800 + }, + { + "epoch": 0.45472287275565965, + "grad_norm": 3.791996955871582, + "learning_rate": 4.7134761615635934e-05, + "loss": 1.0961, + "step": 5825 + }, + { + "epoch": 0.4566744730679157, + "grad_norm": 4.358145713806152, + "learning_rate": 4.7098620869112675e-05, + "loss": 1.047, + "step": 5850 + }, + { + "epoch": 0.45862607338017175, + "grad_norm": 5.152914524078369, + "learning_rate": 4.7062480122589416e-05, + "loss": 1.0987, + "step": 5875 + }, + { + "epoch": 0.4605776736924278, + "grad_norm": 3.973331928253174, + "learning_rate": 4.7026339376066156e-05, + "loss": 1.0982, + "step": 5900 + }, + { + "epoch": 0.46252927400468385, + "grad_norm": 18.341373443603516, + "learning_rate": 4.699019862954289e-05, + "loss": 1.1221, + "step": 5925 + }, + { + "epoch": 0.4644808743169399, + "grad_norm": 6.411871910095215, + "learning_rate": 4.695405788301964e-05, + "loss": 1.0861, + "step": 5950 + }, + { + "epoch": 0.46643247462919596, + "grad_norm": 5.147834777832031, + "learning_rate": 4.691791713649638e-05, + "loss": 1.092, + "step": 5975 + }, + { + "epoch": 0.468384074941452, + "grad_norm": 10.557292938232422, + "learning_rate": 4.688177638997311e-05, + "loss": 1.0913, + "step": 6000 + }, + { + "epoch": 0.47033567525370806, + "grad_norm": 8.907600402832031, + "learning_rate": 4.684563564344985e-05, + "loss": 1.1123, + "step": 6025 + }, + { + "epoch": 0.4722872755659641, + "grad_norm": 9.172224044799805, + "learning_rate": 4.680949489692659e-05, + "loss": 1.1519, + "step": 6050 + }, + { + "epoch": 0.47423887587822017, + "grad_norm": 12.110922813415527, + "learning_rate": 4.6773354150403333e-05, + "loss": 1.2049, + "step": 6075 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 8.593923568725586, + "learning_rate": 4.673721340388007e-05, + "loss": 1.106, + "step": 6100 + }, + { + "epoch": 0.4781420765027322, + "grad_norm": 6.362249851226807, + "learning_rate": 4.6701072657356815e-05, + "loss": 1.1438, + "step": 6125 + }, + { + "epoch": 0.48009367681498827, + "grad_norm": 5.324913501739502, + "learning_rate": 4.6664931910833555e-05, + "loss": 1.2391, + "step": 6150 + }, + { + "epoch": 0.4820452771272443, + "grad_norm": 4.286983489990234, + "learning_rate": 4.6628791164310296e-05, + "loss": 1.1207, + "step": 6175 + }, + { + "epoch": 0.4839968774395004, + "grad_norm": 14.357000350952148, + "learning_rate": 4.659265041778703e-05, + "loss": 1.0455, + "step": 6200 + }, + { + "epoch": 0.4859484777517564, + "grad_norm": 3.6988441944122314, + "learning_rate": 4.655650967126377e-05, + "loss": 1.022, + "step": 6225 + }, + { + "epoch": 0.4879000780640125, + "grad_norm": 8.205002784729004, + "learning_rate": 4.652036892474051e-05, + "loss": 1.0964, + "step": 6250 + }, + { + "epoch": 0.48985167837626853, + "grad_norm": 5.310348987579346, + "learning_rate": 4.648422817821725e-05, + "loss": 1.0683, + "step": 6275 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 5.160155296325684, + "learning_rate": 4.644808743169399e-05, + "loss": 1.0968, + "step": 6300 + }, + { + "epoch": 0.49375487900078063, + "grad_norm": 4.856478691101074, + "learning_rate": 4.641194668517073e-05, + "loss": 1.1084, + "step": 6325 + }, + { + "epoch": 0.4957064793130367, + "grad_norm": 7.521535873413086, + "learning_rate": 4.637580593864747e-05, + "loss": 1.1262, + "step": 6350 + }, + { + "epoch": 0.49765807962529274, + "grad_norm": 4.104182720184326, + "learning_rate": 4.633966519212421e-05, + "loss": 1.1058, + "step": 6375 + }, + { + "epoch": 0.4996096799375488, + "grad_norm": 6.739131450653076, + "learning_rate": 4.630352444560095e-05, + "loss": 1.0462, + "step": 6400 + }, + { + "epoch": 0.5015612802498048, + "grad_norm": 9.126935958862305, + "learning_rate": 4.626738369907769e-05, + "loss": 1.099, + "step": 6425 + }, + { + "epoch": 0.5035128805620609, + "grad_norm": 5.6247382164001465, + "learning_rate": 4.6231242952554435e-05, + "loss": 1.1525, + "step": 6450 + }, + { + "epoch": 0.505464480874317, + "grad_norm": 2.0298993587493896, + "learning_rate": 4.619510220603117e-05, + "loss": 1.1155, + "step": 6475 + }, + { + "epoch": 0.507416081186573, + "grad_norm": 2.8384835720062256, + "learning_rate": 4.615896145950791e-05, + "loss": 1.0868, + "step": 6500 + }, + { + "epoch": 0.509367681498829, + "grad_norm": 3.5315396785736084, + "learning_rate": 4.612282071298465e-05, + "loss": 1.0317, + "step": 6525 + }, + { + "epoch": 0.5113192818110851, + "grad_norm": 5.024168968200684, + "learning_rate": 4.608667996646139e-05, + "loss": 1.1498, + "step": 6550 + }, + { + "epoch": 0.5132708821233412, + "grad_norm": 4.631045341491699, + "learning_rate": 4.6050539219938124e-05, + "loss": 1.0056, + "step": 6575 + }, + { + "epoch": 0.5152224824355972, + "grad_norm": 8.196073532104492, + "learning_rate": 4.601439847341487e-05, + "loss": 1.0426, + "step": 6600 + }, + { + "epoch": 0.5171740827478533, + "grad_norm": 2.9024226665496826, + "learning_rate": 4.597825772689161e-05, + "loss": 1.1699, + "step": 6625 + }, + { + "epoch": 0.5191256830601093, + "grad_norm": 9.07722282409668, + "learning_rate": 4.5942116980368346e-05, + "loss": 1.0452, + "step": 6650 + }, + { + "epoch": 0.5210772833723654, + "grad_norm": 3.8796255588531494, + "learning_rate": 4.590597623384509e-05, + "loss": 1.0877, + "step": 6675 + }, + { + "epoch": 0.5230288836846214, + "grad_norm": 3.8506641387939453, + "learning_rate": 4.586983548732183e-05, + "loss": 1.1473, + "step": 6700 + }, + { + "epoch": 0.5249804839968775, + "grad_norm": 4.6162614822387695, + "learning_rate": 4.583369474079857e-05, + "loss": 1.0504, + "step": 6725 + }, + { + "epoch": 0.5269320843091335, + "grad_norm": 4.613024711608887, + "learning_rate": 4.579755399427531e-05, + "loss": 1.0962, + "step": 6750 + }, + { + "epoch": 0.5288836846213896, + "grad_norm": 9.391596794128418, + "learning_rate": 4.576141324775205e-05, + "loss": 1.0662, + "step": 6775 + }, + { + "epoch": 0.5308352849336456, + "grad_norm": 3.371690034866333, + "learning_rate": 4.572527250122879e-05, + "loss": 1.0777, + "step": 6800 + }, + { + "epoch": 0.5327868852459017, + "grad_norm": 6.0039873123168945, + "learning_rate": 4.568913175470553e-05, + "loss": 0.9658, + "step": 6825 + }, + { + "epoch": 0.5347384855581577, + "grad_norm": 4.096612453460693, + "learning_rate": 4.5652991008182264e-05, + "loss": 1.1356, + "step": 6850 + }, + { + "epoch": 0.5366900858704138, + "grad_norm": 4.341022968292236, + "learning_rate": 4.5616850261659004e-05, + "loss": 0.9884, + "step": 6875 + }, + { + "epoch": 0.5386416861826698, + "grad_norm": 10.07226848602295, + "learning_rate": 4.5580709515135745e-05, + "loss": 1.1034, + "step": 6900 + }, + { + "epoch": 0.5405932864949259, + "grad_norm": 5.22163724899292, + "learning_rate": 4.5544568768612486e-05, + "loss": 1.0798, + "step": 6925 + }, + { + "epoch": 0.5425448868071819, + "grad_norm": 4.374348163604736, + "learning_rate": 4.5508428022089226e-05, + "loss": 1.141, + "step": 6950 + }, + { + "epoch": 0.544496487119438, + "grad_norm": 4.529986381530762, + "learning_rate": 4.547228727556597e-05, + "loss": 1.0075, + "step": 6975 + }, + { + "epoch": 0.546448087431694, + "grad_norm": 4.563103675842285, + "learning_rate": 4.543614652904271e-05, + "loss": 1.259, + "step": 7000 + }, + { + "epoch": 0.5483996877439501, + "grad_norm": 4.728930950164795, + "learning_rate": 4.540000578251945e-05, + "loss": 1.1304, + "step": 7025 + }, + { + "epoch": 0.550351288056206, + "grad_norm": 3.5341649055480957, + "learning_rate": 4.536386503599618e-05, + "loss": 0.9653, + "step": 7050 + }, + { + "epoch": 0.5523028883684621, + "grad_norm": 8.19575309753418, + "learning_rate": 4.532772428947293e-05, + "loss": 0.9391, + "step": 7075 + }, + { + "epoch": 0.5542544886807181, + "grad_norm": 4.861355304718018, + "learning_rate": 4.529158354294967e-05, + "loss": 0.9472, + "step": 7100 + }, + { + "epoch": 0.5562060889929742, + "grad_norm": 4.463324546813965, + "learning_rate": 4.5255442796426403e-05, + "loss": 1.0469, + "step": 7125 + }, + { + "epoch": 0.5581576893052302, + "grad_norm": 4.212376117706299, + "learning_rate": 4.5219302049903144e-05, + "loss": 1.1343, + "step": 7150 + }, + { + "epoch": 0.5601092896174863, + "grad_norm": 3.6168642044067383, + "learning_rate": 4.5183161303379885e-05, + "loss": 1.1818, + "step": 7175 + }, + { + "epoch": 0.5620608899297423, + "grad_norm": 2.510784149169922, + "learning_rate": 4.5147020556856625e-05, + "loss": 1.0227, + "step": 7200 + }, + { + "epoch": 0.5640124902419984, + "grad_norm": 3.7371110916137695, + "learning_rate": 4.511087981033336e-05, + "loss": 0.9827, + "step": 7225 + }, + { + "epoch": 0.5659640905542545, + "grad_norm": 1.9721616506576538, + "learning_rate": 4.5074739063810106e-05, + "loss": 1.1665, + "step": 7250 + }, + { + "epoch": 0.5679156908665105, + "grad_norm": 5.9786810874938965, + "learning_rate": 4.503859831728685e-05, + "loss": 0.9879, + "step": 7275 + }, + { + "epoch": 0.5698672911787666, + "grad_norm": 4.709555149078369, + "learning_rate": 4.500245757076359e-05, + "loss": 1.0827, + "step": 7300 + }, + { + "epoch": 0.5718188914910226, + "grad_norm": 3.2862067222595215, + "learning_rate": 4.496631682424032e-05, + "loss": 1.0467, + "step": 7325 + }, + { + "epoch": 0.5737704918032787, + "grad_norm": 5.784111499786377, + "learning_rate": 4.493017607771706e-05, + "loss": 1.0896, + "step": 7350 + }, + { + "epoch": 0.5757220921155347, + "grad_norm": 2.884516477584839, + "learning_rate": 4.48940353311938e-05, + "loss": 1.0118, + "step": 7375 + }, + { + "epoch": 0.5776736924277908, + "grad_norm": 4.375331878662109, + "learning_rate": 4.485789458467054e-05, + "loss": 1.0787, + "step": 7400 + }, + { + "epoch": 0.5796252927400468, + "grad_norm": 4.84864616394043, + "learning_rate": 4.4821753838147283e-05, + "loss": 1.1236, + "step": 7425 + }, + { + "epoch": 0.5815768930523029, + "grad_norm": 2.9834280014038086, + "learning_rate": 4.4785613091624024e-05, + "loss": 1.0392, + "step": 7450 + }, + { + "epoch": 0.5835284933645589, + "grad_norm": 4.782969951629639, + "learning_rate": 4.4749472345100765e-05, + "loss": 1.0001, + "step": 7475 + }, + { + "epoch": 0.585480093676815, + "grad_norm": 5.125592231750488, + "learning_rate": 4.47133315985775e-05, + "loss": 1.0781, + "step": 7500 + }, + { + "epoch": 0.587431693989071, + "grad_norm": 3.909648895263672, + "learning_rate": 4.467719085205424e-05, + "loss": 1.0589, + "step": 7525 + }, + { + "epoch": 0.5893832943013271, + "grad_norm": 13.199075698852539, + "learning_rate": 4.464105010553098e-05, + "loss": 1.1148, + "step": 7550 + }, + { + "epoch": 0.5913348946135831, + "grad_norm": 3.768085241317749, + "learning_rate": 4.460490935900773e-05, + "loss": 1.1142, + "step": 7575 + }, + { + "epoch": 0.5932864949258392, + "grad_norm": 23.85594367980957, + "learning_rate": 4.456876861248446e-05, + "loss": 0.9194, + "step": 7600 + }, + { + "epoch": 0.5952380952380952, + "grad_norm": 3.88219952583313, + "learning_rate": 4.45326278659612e-05, + "loss": 1.1592, + "step": 7625 + }, + { + "epoch": 0.5971896955503513, + "grad_norm": 4.277048110961914, + "learning_rate": 4.449648711943794e-05, + "loss": 1.0734, + "step": 7650 + }, + { + "epoch": 0.5991412958626073, + "grad_norm": 4.148211479187012, + "learning_rate": 4.446034637291468e-05, + "loss": 1.1358, + "step": 7675 + }, + { + "epoch": 0.6010928961748634, + "grad_norm": 4.788566589355469, + "learning_rate": 4.4424205626391416e-05, + "loss": 1.0031, + "step": 7700 + }, + { + "epoch": 0.6030444964871194, + "grad_norm": 4.106381416320801, + "learning_rate": 4.4388064879868163e-05, + "loss": 1.1541, + "step": 7725 + }, + { + "epoch": 0.6049960967993755, + "grad_norm": 4.781994342803955, + "learning_rate": 4.4351924133344904e-05, + "loss": 1.0884, + "step": 7750 + }, + { + "epoch": 0.6069476971116315, + "grad_norm": 7.013800621032715, + "learning_rate": 4.431578338682164e-05, + "loss": 1.1502, + "step": 7775 + }, + { + "epoch": 0.6088992974238876, + "grad_norm": 6.440084934234619, + "learning_rate": 4.427964264029838e-05, + "loss": 1.0832, + "step": 7800 + }, + { + "epoch": 0.6108508977361436, + "grad_norm": 3.842294931411743, + "learning_rate": 4.424350189377512e-05, + "loss": 1.0387, + "step": 7825 + }, + { + "epoch": 0.6128024980483997, + "grad_norm": 3.1234066486358643, + "learning_rate": 4.420736114725186e-05, + "loss": 1.1062, + "step": 7850 + }, + { + "epoch": 0.6147540983606558, + "grad_norm": 4.855643272399902, + "learning_rate": 4.41712204007286e-05, + "loss": 1.0799, + "step": 7875 + }, + { + "epoch": 0.6167056986729118, + "grad_norm": 4.618957042694092, + "learning_rate": 4.413507965420534e-05, + "loss": 1.0583, + "step": 7900 + }, + { + "epoch": 0.6186572989851679, + "grad_norm": 4.589887619018555, + "learning_rate": 4.409893890768208e-05, + "loss": 1.0626, + "step": 7925 + }, + { + "epoch": 0.6206088992974239, + "grad_norm": 3.4261438846588135, + "learning_rate": 4.406279816115882e-05, + "loss": 1.1269, + "step": 7950 + }, + { + "epoch": 0.62256049960968, + "grad_norm": 10.059616088867188, + "learning_rate": 4.4026657414635556e-05, + "loss": 1.1316, + "step": 7975 + }, + { + "epoch": 0.624512099921936, + "grad_norm": 5.123266696929932, + "learning_rate": 4.3990516668112296e-05, + "loss": 0.9864, + "step": 8000 + }, + { + "epoch": 0.6264637002341921, + "grad_norm": 4.123677730560303, + "learning_rate": 4.395437592158904e-05, + "loss": 1.0769, + "step": 8025 + }, + { + "epoch": 0.6284153005464481, + "grad_norm": 5.222365379333496, + "learning_rate": 4.391823517506578e-05, + "loss": 1.0223, + "step": 8050 + }, + { + "epoch": 0.6303669008587042, + "grad_norm": 3.431854486465454, + "learning_rate": 4.388209442854252e-05, + "loss": 1.12, + "step": 8075 + }, + { + "epoch": 0.6323185011709602, + "grad_norm": 6.849370956420898, + "learning_rate": 4.384595368201926e-05, + "loss": 1.1592, + "step": 8100 + }, + { + "epoch": 0.6342701014832163, + "grad_norm": 11.810173988342285, + "learning_rate": 4.3809812935496e-05, + "loss": 1.1885, + "step": 8125 + }, + { + "epoch": 0.6362217017954723, + "grad_norm": 3.4230222702026367, + "learning_rate": 4.377367218897274e-05, + "loss": 1.0375, + "step": 8150 + }, + { + "epoch": 0.6381733021077284, + "grad_norm": 3.075012683868408, + "learning_rate": 4.3737531442449473e-05, + "loss": 0.9832, + "step": 8175 + }, + { + "epoch": 0.6401249024199844, + "grad_norm": 3.2046592235565186, + "learning_rate": 4.370139069592622e-05, + "loss": 1.0423, + "step": 8200 + }, + { + "epoch": 0.6420765027322405, + "grad_norm": 3.4060072898864746, + "learning_rate": 4.366524994940296e-05, + "loss": 1.0266, + "step": 8225 + }, + { + "epoch": 0.6440281030444965, + "grad_norm": 5.291951656341553, + "learning_rate": 4.3629109202879695e-05, + "loss": 1.121, + "step": 8250 + }, + { + "epoch": 0.6459797033567526, + "grad_norm": 14.134099006652832, + "learning_rate": 4.3592968456356436e-05, + "loss": 1.0754, + "step": 8275 + }, + { + "epoch": 0.6479313036690086, + "grad_norm": 3.5539629459381104, + "learning_rate": 4.3556827709833176e-05, + "loss": 1.1297, + "step": 8300 + }, + { + "epoch": 0.6498829039812647, + "grad_norm": 5.648612976074219, + "learning_rate": 4.352068696330992e-05, + "loss": 1.086, + "step": 8325 + }, + { + "epoch": 0.6518345042935206, + "grad_norm": 2.2942404747009277, + "learning_rate": 4.348454621678665e-05, + "loss": 1.0432, + "step": 8350 + }, + { + "epoch": 0.6537861046057767, + "grad_norm": 4.544975757598877, + "learning_rate": 4.34484054702634e-05, + "loss": 0.9759, + "step": 8375 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 17.37340545654297, + "learning_rate": 4.341226472374014e-05, + "loss": 1.1034, + "step": 8400 + }, + { + "epoch": 0.6576893052302888, + "grad_norm": 4.725184440612793, + "learning_rate": 4.337612397721688e-05, + "loss": 1.06, + "step": 8425 + }, + { + "epoch": 0.6596409055425448, + "grad_norm": 3.666466236114502, + "learning_rate": 4.333998323069361e-05, + "loss": 1.0161, + "step": 8450 + }, + { + "epoch": 0.6615925058548009, + "grad_norm": 4.941737651824951, + "learning_rate": 4.3303842484170353e-05, + "loss": 0.9781, + "step": 8475 + }, + { + "epoch": 0.663544106167057, + "grad_norm": 3.4009881019592285, + "learning_rate": 4.3267701737647094e-05, + "loss": 1.0, + "step": 8500 + }, + { + "epoch": 0.665495706479313, + "grad_norm": 3.7620911598205566, + "learning_rate": 4.3231560991123835e-05, + "loss": 0.9847, + "step": 8525 + }, + { + "epoch": 0.667447306791569, + "grad_norm": 10.591248512268066, + "learning_rate": 4.3195420244600575e-05, + "loss": 1.0492, + "step": 8550 + }, + { + "epoch": 0.6693989071038251, + "grad_norm": 99.32888793945312, + "learning_rate": 4.3159279498077316e-05, + "loss": 1.0049, + "step": 8575 + }, + { + "epoch": 0.6713505074160812, + "grad_norm": 2.936160087585449, + "learning_rate": 4.3123138751554056e-05, + "loss": 1.0243, + "step": 8600 + }, + { + "epoch": 0.6733021077283372, + "grad_norm": 4.591320991516113, + "learning_rate": 4.308699800503079e-05, + "loss": 1.0227, + "step": 8625 + }, + { + "epoch": 0.6752537080405933, + "grad_norm": 6.28738260269165, + "learning_rate": 4.305085725850753e-05, + "loss": 1.0488, + "step": 8650 + }, + { + "epoch": 0.6772053083528493, + "grad_norm": 5.117800712585449, + "learning_rate": 4.301471651198427e-05, + "loss": 1.1017, + "step": 8675 + }, + { + "epoch": 0.6791569086651054, + "grad_norm": 6.173947811126709, + "learning_rate": 4.297857576546102e-05, + "loss": 1.0487, + "step": 8700 + }, + { + "epoch": 0.6811085089773614, + "grad_norm": 4.372732162475586, + "learning_rate": 4.294243501893775e-05, + "loss": 1.1523, + "step": 8725 + }, + { + "epoch": 0.6830601092896175, + "grad_norm": 6.942607402801514, + "learning_rate": 4.290629427241449e-05, + "loss": 1.0881, + "step": 8750 + }, + { + "epoch": 0.6850117096018735, + "grad_norm": 7.410184383392334, + "learning_rate": 4.2870153525891233e-05, + "loss": 1.0009, + "step": 8775 + }, + { + "epoch": 0.6869633099141296, + "grad_norm": 39.38276290893555, + "learning_rate": 4.2834012779367974e-05, + "loss": 1.2126, + "step": 8800 + }, + { + "epoch": 0.6889149102263856, + "grad_norm": 7.96761417388916, + "learning_rate": 4.279787203284471e-05, + "loss": 0.9786, + "step": 8825 + }, + { + "epoch": 0.6908665105386417, + "grad_norm": 4.380497932434082, + "learning_rate": 4.2761731286321455e-05, + "loss": 1.1384, + "step": 8850 + }, + { + "epoch": 0.6928181108508977, + "grad_norm": 2.9372737407684326, + "learning_rate": 4.2725590539798196e-05, + "loss": 1.0961, + "step": 8875 + }, + { + "epoch": 0.6947697111631538, + "grad_norm": 11.495221138000488, + "learning_rate": 4.268944979327493e-05, + "loss": 1.1677, + "step": 8900 + }, + { + "epoch": 0.6967213114754098, + "grad_norm": 16.372867584228516, + "learning_rate": 4.265330904675167e-05, + "loss": 1.208, + "step": 8925 + }, + { + "epoch": 0.6986729117876659, + "grad_norm": 2.921988010406494, + "learning_rate": 4.261716830022841e-05, + "loss": 0.9814, + "step": 8950 + }, + { + "epoch": 0.7006245120999219, + "grad_norm": 7.3093180656433105, + "learning_rate": 4.258102755370515e-05, + "loss": 1.1336, + "step": 8975 + }, + { + "epoch": 0.702576112412178, + "grad_norm": 3.9029057025909424, + "learning_rate": 4.254488680718189e-05, + "loss": 1.1571, + "step": 9000 + }, + { + "epoch": 0.704527712724434, + "grad_norm": 3.8421742916107178, + "learning_rate": 4.250874606065863e-05, + "loss": 0.9803, + "step": 9025 + }, + { + "epoch": 0.7064793130366901, + "grad_norm": 4.519542217254639, + "learning_rate": 4.247260531413537e-05, + "loss": 1.0252, + "step": 9050 + }, + { + "epoch": 0.7084309133489461, + "grad_norm": 6.2342329025268555, + "learning_rate": 4.2436464567612114e-05, + "loss": 1.084, + "step": 9075 + }, + { + "epoch": 0.7103825136612022, + "grad_norm": 5.096877574920654, + "learning_rate": 4.240032382108885e-05, + "loss": 0.9672, + "step": 9100 + }, + { + "epoch": 0.7123341139734582, + "grad_norm": 5.999642372131348, + "learning_rate": 4.236418307456559e-05, + "loss": 1.0796, + "step": 9125 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 5.83458137512207, + "learning_rate": 4.232804232804233e-05, + "loss": 1.0442, + "step": 9150 + }, + { + "epoch": 0.7162373145979704, + "grad_norm": 3.0941836833953857, + "learning_rate": 4.229190158151907e-05, + "loss": 1.1415, + "step": 9175 + }, + { + "epoch": 0.7181889149102264, + "grad_norm": 3.5933706760406494, + "learning_rate": 4.225576083499581e-05, + "loss": 1.0658, + "step": 9200 + }, + { + "epoch": 0.7201405152224825, + "grad_norm": 42.329776763916016, + "learning_rate": 4.221962008847255e-05, + "loss": 1.0524, + "step": 9225 + }, + { + "epoch": 0.7220921155347385, + "grad_norm": 5.331321716308594, + "learning_rate": 4.218347934194929e-05, + "loss": 1.1026, + "step": 9250 + }, + { + "epoch": 0.7240437158469946, + "grad_norm": 3.164287805557251, + "learning_rate": 4.214733859542603e-05, + "loss": 1.0864, + "step": 9275 + }, + { + "epoch": 0.7259953161592506, + "grad_norm": 6.419859409332275, + "learning_rate": 4.2111197848902765e-05, + "loss": 1.0442, + "step": 9300 + }, + { + "epoch": 0.7279469164715067, + "grad_norm": 7.131369113922119, + "learning_rate": 4.207505710237951e-05, + "loss": 0.9406, + "step": 9325 + }, + { + "epoch": 0.7298985167837627, + "grad_norm": 5.3346405029296875, + "learning_rate": 4.203891635585625e-05, + "loss": 1.0692, + "step": 9350 + }, + { + "epoch": 0.7318501170960188, + "grad_norm": 3.301525354385376, + "learning_rate": 4.200277560933299e-05, + "loss": 0.9917, + "step": 9375 + }, + { + "epoch": 0.7338017174082748, + "grad_norm": 12.566885948181152, + "learning_rate": 4.196663486280973e-05, + "loss": 1.0737, + "step": 9400 + }, + { + "epoch": 0.7357533177205309, + "grad_norm": 7.1636271476745605, + "learning_rate": 4.193049411628647e-05, + "loss": 1.0542, + "step": 9425 + }, + { + "epoch": 0.7377049180327869, + "grad_norm": 5.242036819458008, + "learning_rate": 4.189435336976321e-05, + "loss": 1.2046, + "step": 9450 + }, + { + "epoch": 0.739656518345043, + "grad_norm": 7.031478404998779, + "learning_rate": 4.185821262323994e-05, + "loss": 1.1672, + "step": 9475 + }, + { + "epoch": 0.741608118657299, + "grad_norm": 9.777889251708984, + "learning_rate": 4.182207187671669e-05, + "loss": 1.0036, + "step": 9500 + }, + { + "epoch": 0.7435597189695551, + "grad_norm": 2.8060688972473145, + "learning_rate": 4.178593113019343e-05, + "loss": 1.1648, + "step": 9525 + }, + { + "epoch": 0.7455113192818111, + "grad_norm": 11.0838623046875, + "learning_rate": 4.174979038367017e-05, + "loss": 1.1561, + "step": 9550 + }, + { + "epoch": 0.7474629195940672, + "grad_norm": 2.345066785812378, + "learning_rate": 4.1713649637146905e-05, + "loss": 1.0569, + "step": 9575 + }, + { + "epoch": 0.7494145199063232, + "grad_norm": 4.051877021789551, + "learning_rate": 4.1677508890623645e-05, + "loss": 1.0629, + "step": 9600 + }, + { + "epoch": 0.7513661202185792, + "grad_norm": 7.873716831207275, + "learning_rate": 4.1641368144100386e-05, + "loss": 1.125, + "step": 9625 + }, + { + "epoch": 0.7533177205308352, + "grad_norm": 4.695271968841553, + "learning_rate": 4.1605227397577126e-05, + "loss": 1.0436, + "step": 9650 + }, + { + "epoch": 0.7552693208430913, + "grad_norm": 3.9926652908325195, + "learning_rate": 4.156908665105387e-05, + "loss": 1.2496, + "step": 9675 + }, + { + "epoch": 0.7572209211553473, + "grad_norm": 5.339399337768555, + "learning_rate": 4.153294590453061e-05, + "loss": 0.9639, + "step": 9700 + }, + { + "epoch": 0.7591725214676034, + "grad_norm": 3.5780484676361084, + "learning_rate": 4.149680515800735e-05, + "loss": 1.0743, + "step": 9725 + }, + { + "epoch": 0.7611241217798594, + "grad_norm": 3.0065765380859375, + "learning_rate": 4.146066441148408e-05, + "loss": 1.1174, + "step": 9750 + }, + { + "epoch": 0.7630757220921155, + "grad_norm": 3.351956367492676, + "learning_rate": 4.142452366496082e-05, + "loss": 0.9875, + "step": 9775 + }, + { + "epoch": 0.7650273224043715, + "grad_norm": 4.556973457336426, + "learning_rate": 4.138838291843756e-05, + "loss": 1.132, + "step": 9800 + }, + { + "epoch": 0.7669789227166276, + "grad_norm": 10.879364013671875, + "learning_rate": 4.135224217191431e-05, + "loss": 1.0696, + "step": 9825 + }, + { + "epoch": 0.7689305230288837, + "grad_norm": 4.668983459472656, + "learning_rate": 4.1316101425391044e-05, + "loss": 1.0684, + "step": 9850 + }, + { + "epoch": 0.7708821233411397, + "grad_norm": 3.5243680477142334, + "learning_rate": 4.1279960678867785e-05, + "loss": 1.0792, + "step": 9875 + }, + { + "epoch": 0.7728337236533958, + "grad_norm": 3.4856066703796387, + "learning_rate": 4.1243819932344525e-05, + "loss": 1.0118, + "step": 9900 + }, + { + "epoch": 0.7747853239656518, + "grad_norm": 6.037850379943848, + "learning_rate": 4.1207679185821266e-05, + "loss": 1.1066, + "step": 9925 + }, + { + "epoch": 0.7767369242779079, + "grad_norm": 3.93436861038208, + "learning_rate": 4.1171538439298e-05, + "loss": 1.0614, + "step": 9950 + }, + { + "epoch": 0.7786885245901639, + "grad_norm": 5.99379301071167, + "learning_rate": 4.113539769277475e-05, + "loss": 1.2036, + "step": 9975 + }, + { + "epoch": 0.78064012490242, + "grad_norm": 6.033953666687012, + "learning_rate": 4.109925694625149e-05, + "loss": 1.1062, + "step": 10000 + }, + { + "epoch": 0.782591725214676, + "grad_norm": 21.771608352661133, + "learning_rate": 4.106311619972822e-05, + "loss": 1.0094, + "step": 10025 + }, + { + "epoch": 0.7845433255269321, + "grad_norm": 3.496319532394409, + "learning_rate": 4.102697545320496e-05, + "loss": 1.0692, + "step": 10050 + }, + { + "epoch": 0.7864949258391881, + "grad_norm": 12.276361465454102, + "learning_rate": 4.09908347066817e-05, + "loss": 1.1974, + "step": 10075 + }, + { + "epoch": 0.7884465261514442, + "grad_norm": 5.650417804718018, + "learning_rate": 4.095469396015844e-05, + "loss": 1.1774, + "step": 10100 + }, + { + "epoch": 0.7903981264637002, + "grad_norm": 3.432573080062866, + "learning_rate": 4.091855321363518e-05, + "loss": 1.0274, + "step": 10125 + }, + { + "epoch": 0.7923497267759563, + "grad_norm": 4.76949405670166, + "learning_rate": 4.0882412467111924e-05, + "loss": 1.1554, + "step": 10150 + }, + { + "epoch": 0.7943013270882123, + "grad_norm": 5.882024765014648, + "learning_rate": 4.0846271720588665e-05, + "loss": 1.008, + "step": 10175 + }, + { + "epoch": 0.7962529274004684, + "grad_norm": 3.9834980964660645, + "learning_rate": 4.0810130974065405e-05, + "loss": 1.1267, + "step": 10200 + }, + { + "epoch": 0.7982045277127244, + "grad_norm": 9.958760261535645, + "learning_rate": 4.077399022754214e-05, + "loss": 1.101, + "step": 10225 + }, + { + "epoch": 0.8001561280249805, + "grad_norm": 3.8244740962982178, + "learning_rate": 4.073784948101888e-05, + "loss": 1.097, + "step": 10250 + }, + { + "epoch": 0.8021077283372365, + "grad_norm": 2.765012502670288, + "learning_rate": 4.070170873449562e-05, + "loss": 1.0523, + "step": 10275 + }, + { + "epoch": 0.8040593286494926, + "grad_norm": 9.135652542114258, + "learning_rate": 4.066556798797236e-05, + "loss": 0.9617, + "step": 10300 + }, + { + "epoch": 0.8060109289617486, + "grad_norm": 5.85486364364624, + "learning_rate": 4.06294272414491e-05, + "loss": 1.0772, + "step": 10325 + }, + { + "epoch": 0.8079625292740047, + "grad_norm": 6.288055419921875, + "learning_rate": 4.059328649492584e-05, + "loss": 1.2549, + "step": 10350 + }, + { + "epoch": 0.8099141295862607, + "grad_norm": 4.103349208831787, + "learning_rate": 4.055714574840258e-05, + "loss": 1.0722, + "step": 10375 + }, + { + "epoch": 0.8118657298985168, + "grad_norm": 5.6823883056640625, + "learning_rate": 4.052100500187932e-05, + "loss": 1.0694, + "step": 10400 + }, + { + "epoch": 0.8138173302107728, + "grad_norm": 4.441442012786865, + "learning_rate": 4.048486425535606e-05, + "loss": 1.0207, + "step": 10425 + }, + { + "epoch": 0.8157689305230289, + "grad_norm": 3.620833158493042, + "learning_rate": 4.0448723508832804e-05, + "loss": 1.0206, + "step": 10450 + }, + { + "epoch": 0.817720530835285, + "grad_norm": 4.352095603942871, + "learning_rate": 4.0412582762309545e-05, + "loss": 1.0834, + "step": 10475 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 39.897029876708984, + "learning_rate": 4.037644201578628e-05, + "loss": 1.0726, + "step": 10500 + }, + { + "epoch": 0.8216237314597971, + "grad_norm": 2.752610206604004, + "learning_rate": 4.034030126926302e-05, + "loss": 1.0651, + "step": 10525 + }, + { + "epoch": 0.8235753317720531, + "grad_norm": 71.44095611572266, + "learning_rate": 4.030416052273976e-05, + "loss": 1.0939, + "step": 10550 + }, + { + "epoch": 0.8255269320843092, + "grad_norm": 3.6584365367889404, + "learning_rate": 4.02680197762165e-05, + "loss": 1.1776, + "step": 10575 + }, + { + "epoch": 0.8274785323965652, + "grad_norm": 10.471303939819336, + "learning_rate": 4.0231879029693234e-05, + "loss": 0.9784, + "step": 10600 + }, + { + "epoch": 0.8294301327088213, + "grad_norm": 3.3324592113494873, + "learning_rate": 4.019573828316998e-05, + "loss": 1.0374, + "step": 10625 + }, + { + "epoch": 0.8313817330210773, + "grad_norm": 7.936185359954834, + "learning_rate": 4.015959753664672e-05, + "loss": 1.0445, + "step": 10650 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 2.8503174781799316, + "learning_rate": 4.012345679012346e-05, + "loss": 1.08, + "step": 10675 + }, + { + "epoch": 0.8352849336455894, + "grad_norm": 5.776984691619873, + "learning_rate": 4.0087316043600196e-05, + "loss": 1.0263, + "step": 10700 + }, + { + "epoch": 0.8372365339578455, + "grad_norm": 3.9267687797546387, + "learning_rate": 4.005117529707694e-05, + "loss": 1.0331, + "step": 10725 + }, + { + "epoch": 0.8391881342701015, + "grad_norm": 2.3287744522094727, + "learning_rate": 4.001503455055368e-05, + "loss": 1.105, + "step": 10750 + }, + { + "epoch": 0.8411397345823576, + "grad_norm": 5.739510536193848, + "learning_rate": 3.997889380403042e-05, + "loss": 1.0804, + "step": 10775 + }, + { + "epoch": 0.8430913348946136, + "grad_norm": 6.12454080581665, + "learning_rate": 3.994275305750716e-05, + "loss": 1.0025, + "step": 10800 + }, + { + "epoch": 0.8450429352068697, + "grad_norm": 21.532411575317383, + "learning_rate": 3.99066123109839e-05, + "loss": 1.1381, + "step": 10825 + }, + { + "epoch": 0.8469945355191257, + "grad_norm": 14.235223770141602, + "learning_rate": 3.987047156446064e-05, + "loss": 1.0606, + "step": 10850 + }, + { + "epoch": 0.8489461358313818, + "grad_norm": 4.551443576812744, + "learning_rate": 3.9834330817937373e-05, + "loss": 1.0604, + "step": 10875 + }, + { + "epoch": 0.8508977361436377, + "grad_norm": 6.122852325439453, + "learning_rate": 3.9798190071414114e-05, + "loss": 1.1362, + "step": 10900 + }, + { + "epoch": 0.8528493364558938, + "grad_norm": 4.3959736824035645, + "learning_rate": 3.9762049324890855e-05, + "loss": 1.1449, + "step": 10925 + }, + { + "epoch": 0.8548009367681498, + "grad_norm": 2.1433286666870117, + "learning_rate": 3.97259085783676e-05, + "loss": 1.1224, + "step": 10950 + }, + { + "epoch": 0.8567525370804059, + "grad_norm": 8.567728042602539, + "learning_rate": 3.9689767831844336e-05, + "loss": 1.0726, + "step": 10975 + }, + { + "epoch": 0.8587041373926619, + "grad_norm": 2.9822051525115967, + "learning_rate": 3.9653627085321076e-05, + "loss": 1.0527, + "step": 11000 + }, + { + "epoch": 0.860655737704918, + "grad_norm": 8.405996322631836, + "learning_rate": 3.961748633879782e-05, + "loss": 1.0375, + "step": 11025 + }, + { + "epoch": 0.862607338017174, + "grad_norm": 4.547908782958984, + "learning_rate": 3.958134559227456e-05, + "loss": 1.1136, + "step": 11050 + }, + { + "epoch": 0.8645589383294301, + "grad_norm": 3.0987839698791504, + "learning_rate": 3.954520484575129e-05, + "loss": 1.054, + "step": 11075 + }, + { + "epoch": 0.8665105386416861, + "grad_norm": 7.834712982177734, + "learning_rate": 3.950906409922804e-05, + "loss": 1.1345, + "step": 11100 + }, + { + "epoch": 0.8684621389539422, + "grad_norm": 6.487369060516357, + "learning_rate": 3.947292335270478e-05, + "loss": 1.1598, + "step": 11125 + }, + { + "epoch": 0.8704137392661982, + "grad_norm": 3.6524288654327393, + "learning_rate": 3.943678260618151e-05, + "loss": 1.1047, + "step": 11150 + }, + { + "epoch": 0.8723653395784543, + "grad_norm": 3.361725091934204, + "learning_rate": 3.9400641859658254e-05, + "loss": 1.0465, + "step": 11175 + }, + { + "epoch": 0.8743169398907104, + "grad_norm": 4.324642181396484, + "learning_rate": 3.9364501113134994e-05, + "loss": 1.0142, + "step": 11200 + }, + { + "epoch": 0.8762685402029664, + "grad_norm": 8.21617603302002, + "learning_rate": 3.9328360366611735e-05, + "loss": 1.18, + "step": 11225 + }, + { + "epoch": 0.8782201405152225, + "grad_norm": 5.842861175537109, + "learning_rate": 3.929221962008847e-05, + "loss": 1.057, + "step": 11250 + }, + { + "epoch": 0.8801717408274785, + "grad_norm": 3.564755439758301, + "learning_rate": 3.9256078873565216e-05, + "loss": 1.1334, + "step": 11275 + }, + { + "epoch": 0.8821233411397346, + "grad_norm": 6.895459175109863, + "learning_rate": 3.9219938127041956e-05, + "loss": 1.1256, + "step": 11300 + }, + { + "epoch": 0.8840749414519906, + "grad_norm": 5.680526256561279, + "learning_rate": 3.91837973805187e-05, + "loss": 1.1394, + "step": 11325 + }, + { + "epoch": 0.8860265417642467, + "grad_norm": 5.740342140197754, + "learning_rate": 3.914765663399543e-05, + "loss": 1.1705, + "step": 11350 + }, + { + "epoch": 0.8879781420765027, + "grad_norm": 6.116307258605957, + "learning_rate": 3.911151588747217e-05, + "loss": 1.0934, + "step": 11375 + }, + { + "epoch": 0.8899297423887588, + "grad_norm": 3.364652633666992, + "learning_rate": 3.907537514094891e-05, + "loss": 1.0772, + "step": 11400 + }, + { + "epoch": 0.8918813427010148, + "grad_norm": 2.465847969055176, + "learning_rate": 3.903923439442565e-05, + "loss": 1.0142, + "step": 11425 + }, + { + "epoch": 0.8938329430132709, + "grad_norm": 3.3270020484924316, + "learning_rate": 3.900309364790239e-05, + "loss": 1.0675, + "step": 11450 + }, + { + "epoch": 0.8957845433255269, + "grad_norm": 3.2493507862091064, + "learning_rate": 3.8966952901379134e-05, + "loss": 1.0183, + "step": 11475 + }, + { + "epoch": 0.897736143637783, + "grad_norm": 4.060807228088379, + "learning_rate": 3.8930812154855874e-05, + "loss": 1.1103, + "step": 11500 + }, + { + "epoch": 0.899687743950039, + "grad_norm": 3.1305429935455322, + "learning_rate": 3.8894671408332615e-05, + "loss": 0.9959, + "step": 11525 + }, + { + "epoch": 0.9016393442622951, + "grad_norm": 12.090646743774414, + "learning_rate": 3.885853066180935e-05, + "loss": 1.1075, + "step": 11550 + }, + { + "epoch": 0.9035909445745511, + "grad_norm": 3.994847536087036, + "learning_rate": 3.882238991528609e-05, + "loss": 1.0109, + "step": 11575 + }, + { + "epoch": 0.9055425448868072, + "grad_norm": 6.068265914916992, + "learning_rate": 3.8786249168762836e-05, + "loss": 1.07, + "step": 11600 + }, + { + "epoch": 0.9074941451990632, + "grad_norm": 16.828269958496094, + "learning_rate": 3.875010842223957e-05, + "loss": 1.0504, + "step": 11625 + }, + { + "epoch": 0.9094457455113193, + "grad_norm": 6.074275493621826, + "learning_rate": 3.871396767571631e-05, + "loss": 1.176, + "step": 11650 + }, + { + "epoch": 0.9113973458235753, + "grad_norm": 1.6380444765090942, + "learning_rate": 3.867782692919305e-05, + "loss": 1.1453, + "step": 11675 + }, + { + "epoch": 0.9133489461358314, + "grad_norm": 2.0245916843414307, + "learning_rate": 3.864168618266979e-05, + "loss": 1.0773, + "step": 11700 + }, + { + "epoch": 0.9153005464480874, + "grad_norm": 10.392242431640625, + "learning_rate": 3.8605545436146526e-05, + "loss": 1.0399, + "step": 11725 + }, + { + "epoch": 0.9172521467603435, + "grad_norm": 9.222771644592285, + "learning_rate": 3.856940468962327e-05, + "loss": 1.0434, + "step": 11750 + }, + { + "epoch": 0.9192037470725996, + "grad_norm": 4.098302364349365, + "learning_rate": 3.8533263943100014e-05, + "loss": 1.0762, + "step": 11775 + }, + { + "epoch": 0.9211553473848556, + "grad_norm": 3.1058058738708496, + "learning_rate": 3.8497123196576754e-05, + "loss": 1.0091, + "step": 11800 + }, + { + "epoch": 0.9231069476971117, + "grad_norm": 14.550347328186035, + "learning_rate": 3.846098245005349e-05, + "loss": 1.119, + "step": 11825 + }, + { + "epoch": 0.9250585480093677, + "grad_norm": 5.29132604598999, + "learning_rate": 3.842484170353023e-05, + "loss": 0.9882, + "step": 11850 + }, + { + "epoch": 0.9270101483216238, + "grad_norm": 4.180002689361572, + "learning_rate": 3.838870095700697e-05, + "loss": 0.9908, + "step": 11875 + }, + { + "epoch": 0.9289617486338798, + "grad_norm": 5.418637275695801, + "learning_rate": 3.835256021048371e-05, + "loss": 1.1422, + "step": 11900 + }, + { + "epoch": 0.9309133489461359, + "grad_norm": 4.400912284851074, + "learning_rate": 3.831641946396045e-05, + "loss": 0.9483, + "step": 11925 + }, + { + "epoch": 0.9328649492583919, + "grad_norm": 2.5902974605560303, + "learning_rate": 3.828027871743719e-05, + "loss": 0.9895, + "step": 11950 + }, + { + "epoch": 0.934816549570648, + "grad_norm": 2.4926915168762207, + "learning_rate": 3.824413797091393e-05, + "loss": 1.0582, + "step": 11975 + }, + { + "epoch": 0.936768149882904, + "grad_norm": 6.030407428741455, + "learning_rate": 3.8207997224390665e-05, + "loss": 1.0676, + "step": 12000 + }, + { + "epoch": 0.9387197501951601, + "grad_norm": 6.592781066894531, + "learning_rate": 3.8171856477867406e-05, + "loss": 1.0961, + "step": 12025 + }, + { + "epoch": 0.9406713505074161, + "grad_norm": 7.366015434265137, + "learning_rate": 3.8135715731344146e-05, + "loss": 1.1043, + "step": 12050 + }, + { + "epoch": 0.9426229508196722, + "grad_norm": 4.65834379196167, + "learning_rate": 3.8099574984820894e-05, + "loss": 1.0391, + "step": 12075 + }, + { + "epoch": 0.9445745511319282, + "grad_norm": 3.758898973464966, + "learning_rate": 3.806343423829763e-05, + "loss": 1.0863, + "step": 12100 + }, + { + "epoch": 0.9465261514441843, + "grad_norm": 5.561915397644043, + "learning_rate": 3.802729349177437e-05, + "loss": 1.1022, + "step": 12125 + }, + { + "epoch": 0.9484777517564403, + "grad_norm": 6.3769941329956055, + "learning_rate": 3.799115274525111e-05, + "loss": 1.0682, + "step": 12150 + }, + { + "epoch": 0.9504293520686963, + "grad_norm": 4.858483791351318, + "learning_rate": 3.795501199872785e-05, + "loss": 1.103, + "step": 12175 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 5.049714088439941, + "learning_rate": 3.791887125220458e-05, + "loss": 1.0184, + "step": 12200 + }, + { + "epoch": 0.9543325526932084, + "grad_norm": 5.8210649490356445, + "learning_rate": 3.788273050568133e-05, + "loss": 1.0723, + "step": 12225 + }, + { + "epoch": 0.9562841530054644, + "grad_norm": 4.023828029632568, + "learning_rate": 3.784658975915807e-05, + "loss": 1.0422, + "step": 12250 + }, + { + "epoch": 0.9582357533177205, + "grad_norm": 2.8047125339508057, + "learning_rate": 3.7810449012634805e-05, + "loss": 1.0603, + "step": 12275 + }, + { + "epoch": 0.9601873536299765, + "grad_norm": 4.093529224395752, + "learning_rate": 3.7774308266111545e-05, + "loss": 1.0541, + "step": 12300 + }, + { + "epoch": 0.9621389539422326, + "grad_norm": 10.252714157104492, + "learning_rate": 3.7738167519588286e-05, + "loss": 1.1168, + "step": 12325 + }, + { + "epoch": 0.9640905542544886, + "grad_norm": 9.9879732131958, + "learning_rate": 3.7702026773065026e-05, + "loss": 0.9692, + "step": 12350 + }, + { + "epoch": 0.9660421545667447, + "grad_norm": 4.01174259185791, + "learning_rate": 3.766588602654176e-05, + "loss": 1.0706, + "step": 12375 + }, + { + "epoch": 0.9679937548790007, + "grad_norm": 4.104875564575195, + "learning_rate": 3.762974528001851e-05, + "loss": 0.9965, + "step": 12400 + }, + { + "epoch": 0.9699453551912568, + "grad_norm": 12.7550048828125, + "learning_rate": 3.759360453349525e-05, + "loss": 1.1558, + "step": 12425 + }, + { + "epoch": 0.9718969555035128, + "grad_norm": 3.570772886276245, + "learning_rate": 3.755746378697199e-05, + "loss": 1.0598, + "step": 12450 + }, + { + "epoch": 0.9738485558157689, + "grad_norm": 4.351847171783447, + "learning_rate": 3.752132304044872e-05, + "loss": 1.0233, + "step": 12475 + }, + { + "epoch": 0.975800156128025, + "grad_norm": 3.9156181812286377, + "learning_rate": 3.748518229392546e-05, + "loss": 1.0164, + "step": 12500 + }, + { + "epoch": 0.977751756440281, + "grad_norm": 5.74035120010376, + "learning_rate": 3.7449041547402204e-05, + "loss": 1.1285, + "step": 12525 + }, + { + "epoch": 0.9797033567525371, + "grad_norm": 4.563347339630127, + "learning_rate": 3.7412900800878944e-05, + "loss": 1.0547, + "step": 12550 + }, + { + "epoch": 0.9816549570647931, + "grad_norm": 20.596858978271484, + "learning_rate": 3.7376760054355685e-05, + "loss": 0.9847, + "step": 12575 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 3.1608352661132812, + "learning_rate": 3.7340619307832425e-05, + "loss": 1.0047, + "step": 12600 + }, + { + "epoch": 0.9855581576893052, + "grad_norm": 17.322921752929688, + "learning_rate": 3.7304478561309166e-05, + "loss": 1.0192, + "step": 12625 + }, + { + "epoch": 0.9875097580015613, + "grad_norm": 5.369739532470703, + "learning_rate": 3.7268337814785906e-05, + "loss": 1.0804, + "step": 12650 + }, + { + "epoch": 0.9894613583138173, + "grad_norm": 4.7654924392700195, + "learning_rate": 3.723219706826264e-05, + "loss": 1.2548, + "step": 12675 + }, + { + "epoch": 0.9914129586260734, + "grad_norm": 4.569206714630127, + "learning_rate": 3.719605632173938e-05, + "loss": 1.173, + "step": 12700 + }, + { + "epoch": 0.9933645589383294, + "grad_norm": 4.424173831939697, + "learning_rate": 3.715991557521613e-05, + "loss": 1.2093, + "step": 12725 + }, + { + "epoch": 0.9953161592505855, + "grad_norm": 7.026711463928223, + "learning_rate": 3.712377482869286e-05, + "loss": 1.1057, + "step": 12750 + }, + { + "epoch": 0.9972677595628415, + "grad_norm": 4.385987281799316, + "learning_rate": 3.70876340821696e-05, + "loss": 1.1011, + "step": 12775 + }, + { + "epoch": 0.9992193598750976, + "grad_norm": 5.009171009063721, + "learning_rate": 3.705149333564634e-05, + "loss": 1.0958, + "step": 12800 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.5795081967213115, + "eval_f1_macro": 0.5087002746093271, + "eval_f1_micro": 0.5795081967213115, + "eval_f1_weighted": 0.5680119414930348, + "eval_loss": 1.0661728382110596, + "eval_precision_macro": 0.6306619727330058, + "eval_precision_micro": 0.5795081967213115, + "eval_precision_weighted": 0.6115098586288849, + "eval_recall_macro": 0.47726205888993317, + "eval_recall_micro": 0.5795081967213115, + "eval_recall_weighted": 0.5795081967213115, + "eval_runtime": 5104.9669, + "eval_samples_per_second": 5.019, + "eval_steps_per_second": 0.314, + "step": 12810 + }, + { + "epoch": 1.0011709601873535, + "grad_norm": 12.7615385055542, + "learning_rate": 3.7015352589123084e-05, + "loss": 1.0528, + "step": 12825 + }, + { + "epoch": 1.0031225604996097, + "grad_norm": 4.541055679321289, + "learning_rate": 3.697921184259982e-05, + "loss": 1.0599, + "step": 12850 + }, + { + "epoch": 1.0050741608118656, + "grad_norm": 3.845266342163086, + "learning_rate": 3.6943071096076565e-05, + "loss": 1.0058, + "step": 12875 + }, + { + "epoch": 1.0070257611241218, + "grad_norm": 7.535529136657715, + "learning_rate": 3.6906930349553305e-05, + "loss": 1.0625, + "step": 12900 + }, + { + "epoch": 1.0089773614363777, + "grad_norm": 3.3903520107269287, + "learning_rate": 3.6870789603030046e-05, + "loss": 1.0719, + "step": 12925 + }, + { + "epoch": 1.010928961748634, + "grad_norm": 69.22319793701172, + "learning_rate": 3.683464885650678e-05, + "loss": 1.0491, + "step": 12950 + }, + { + "epoch": 1.0128805620608898, + "grad_norm": 5.992664813995361, + "learning_rate": 3.679850810998352e-05, + "loss": 1.0683, + "step": 12975 + }, + { + "epoch": 1.014832162373146, + "grad_norm": 7.162235260009766, + "learning_rate": 3.676236736346026e-05, + "loss": 1.0235, + "step": 13000 + }, + { + "epoch": 1.016783762685402, + "grad_norm": 2.7051498889923096, + "learning_rate": 3.6726226616937e-05, + "loss": 0.9741, + "step": 13025 + }, + { + "epoch": 1.018735362997658, + "grad_norm": 31.953462600708008, + "learning_rate": 3.669008587041374e-05, + "loss": 1.0239, + "step": 13050 + }, + { + "epoch": 1.020686963309914, + "grad_norm": 5.982146263122559, + "learning_rate": 3.665394512389048e-05, + "loss": 0.9827, + "step": 13075 + }, + { + "epoch": 1.0226385636221702, + "grad_norm": 7.821646213531494, + "learning_rate": 3.661780437736722e-05, + "loss": 1.0686, + "step": 13100 + }, + { + "epoch": 1.0245901639344261, + "grad_norm": 4.255584239959717, + "learning_rate": 3.658166363084396e-05, + "loss": 1.0753, + "step": 13125 + }, + { + "epoch": 1.0265417642466823, + "grad_norm": 6.068447589874268, + "learning_rate": 3.65455228843207e-05, + "loss": 1.0815, + "step": 13150 + }, + { + "epoch": 1.0284933645589383, + "grad_norm": 5.350246429443359, + "learning_rate": 3.650938213779744e-05, + "loss": 1.1152, + "step": 13175 + }, + { + "epoch": 1.0304449648711944, + "grad_norm": 7.5077738761901855, + "learning_rate": 3.6473241391274185e-05, + "loss": 1.0568, + "step": 13200 + }, + { + "epoch": 1.0323965651834504, + "grad_norm": 3.612325668334961, + "learning_rate": 3.643710064475092e-05, + "loss": 1.122, + "step": 13225 + }, + { + "epoch": 1.0343481654957065, + "grad_norm": 4.679467678070068, + "learning_rate": 3.640095989822766e-05, + "loss": 1.1984, + "step": 13250 + }, + { + "epoch": 1.0362997658079625, + "grad_norm": 5.999462604522705, + "learning_rate": 3.63648191517044e-05, + "loss": 1.0953, + "step": 13275 + }, + { + "epoch": 1.0382513661202186, + "grad_norm": 2.531510591506958, + "learning_rate": 3.632867840518114e-05, + "loss": 1.0205, + "step": 13300 + }, + { + "epoch": 1.0402029664324746, + "grad_norm": 4.592498779296875, + "learning_rate": 3.6292537658657875e-05, + "loss": 1.0785, + "step": 13325 + }, + { + "epoch": 1.0421545667447307, + "grad_norm": 16.091724395751953, + "learning_rate": 3.625639691213462e-05, + "loss": 1.0874, + "step": 13350 + }, + { + "epoch": 1.0441061670569867, + "grad_norm": 3.179403066635132, + "learning_rate": 3.622025616561136e-05, + "loss": 1.1195, + "step": 13375 + }, + { + "epoch": 1.0460577673692428, + "grad_norm": 4.981616020202637, + "learning_rate": 3.6184115419088096e-05, + "loss": 1.0597, + "step": 13400 + }, + { + "epoch": 1.0480093676814988, + "grad_norm": 3.7301344871520996, + "learning_rate": 3.614797467256484e-05, + "loss": 1.092, + "step": 13425 + }, + { + "epoch": 1.049960967993755, + "grad_norm": 3.2385706901550293, + "learning_rate": 3.611183392604158e-05, + "loss": 1.0127, + "step": 13450 + }, + { + "epoch": 1.0519125683060109, + "grad_norm": 4.188447952270508, + "learning_rate": 3.607569317951832e-05, + "loss": 1.13, + "step": 13475 + }, + { + "epoch": 1.053864168618267, + "grad_norm": 5.909556865692139, + "learning_rate": 3.603955243299505e-05, + "loss": 1.0891, + "step": 13500 + }, + { + "epoch": 1.055815768930523, + "grad_norm": 5.422518730163574, + "learning_rate": 3.60034116864718e-05, + "loss": 1.0328, + "step": 13525 + }, + { + "epoch": 1.0577673692427791, + "grad_norm": 5.447854042053223, + "learning_rate": 3.596727093994854e-05, + "loss": 1.0115, + "step": 13550 + }, + { + "epoch": 1.059718969555035, + "grad_norm": 4.149133205413818, + "learning_rate": 3.593113019342528e-05, + "loss": 1.0087, + "step": 13575 + }, + { + "epoch": 1.0616705698672912, + "grad_norm": 8.521280288696289, + "learning_rate": 3.5894989446902014e-05, + "loss": 1.0971, + "step": 13600 + }, + { + "epoch": 1.0636221701795472, + "grad_norm": 3.320399045944214, + "learning_rate": 3.5858848700378755e-05, + "loss": 1.0683, + "step": 13625 + }, + { + "epoch": 1.0655737704918034, + "grad_norm": 8.03446102142334, + "learning_rate": 3.5822707953855495e-05, + "loss": 1.0696, + "step": 13650 + }, + { + "epoch": 1.0675253708040593, + "grad_norm": 3.9610068798065186, + "learning_rate": 3.5786567207332236e-05, + "loss": 1.0773, + "step": 13675 + }, + { + "epoch": 1.0694769711163155, + "grad_norm": 21.27227210998535, + "learning_rate": 3.5750426460808976e-05, + "loss": 0.9077, + "step": 13700 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 10.218512535095215, + "learning_rate": 3.571428571428572e-05, + "loss": 1.1193, + "step": 13725 + }, + { + "epoch": 1.0733801717408276, + "grad_norm": 5.707762241363525, + "learning_rate": 3.567814496776246e-05, + "loss": 1.0098, + "step": 13750 + }, + { + "epoch": 1.0753317720530835, + "grad_norm": 13.289424896240234, + "learning_rate": 3.564200422123919e-05, + "loss": 1.0516, + "step": 13775 + }, + { + "epoch": 1.0772833723653397, + "grad_norm": 4.986598968505859, + "learning_rate": 3.560586347471593e-05, + "loss": 1.0239, + "step": 13800 + }, + { + "epoch": 1.0792349726775956, + "grad_norm": 3.696572780609131, + "learning_rate": 3.556972272819267e-05, + "loss": 1.0911, + "step": 13825 + }, + { + "epoch": 1.0811865729898518, + "grad_norm": 4.757776260375977, + "learning_rate": 3.553358198166942e-05, + "loss": 1.0782, + "step": 13850 + }, + { + "epoch": 1.0831381733021077, + "grad_norm": 5.444307327270508, + "learning_rate": 3.5497441235146154e-05, + "loss": 1.1199, + "step": 13875 + }, + { + "epoch": 1.0850897736143639, + "grad_norm": 4.573936939239502, + "learning_rate": 3.5461300488622894e-05, + "loss": 1.0381, + "step": 13900 + }, + { + "epoch": 1.0870413739266198, + "grad_norm": 6.183209419250488, + "learning_rate": 3.5425159742099635e-05, + "loss": 1.0128, + "step": 13925 + }, + { + "epoch": 1.088992974238876, + "grad_norm": 3.2218897342681885, + "learning_rate": 3.5389018995576375e-05, + "loss": 1.0035, + "step": 13950 + }, + { + "epoch": 1.090944574551132, + "grad_norm": 74.6764144897461, + "learning_rate": 3.535287824905311e-05, + "loss": 0.9861, + "step": 13975 + }, + { + "epoch": 1.092896174863388, + "grad_norm": 4.7839531898498535, + "learning_rate": 3.5316737502529856e-05, + "loss": 0.991, + "step": 14000 + }, + { + "epoch": 1.094847775175644, + "grad_norm": 9.024589538574219, + "learning_rate": 3.52805967560066e-05, + "loss": 0.943, + "step": 14025 + }, + { + "epoch": 1.0967993754879002, + "grad_norm": 9.216841697692871, + "learning_rate": 3.524445600948334e-05, + "loss": 1.2165, + "step": 14050 + }, + { + "epoch": 1.0987509758001561, + "grad_norm": 4.683638572692871, + "learning_rate": 3.520831526296007e-05, + "loss": 0.9405, + "step": 14075 + }, + { + "epoch": 1.100702576112412, + "grad_norm": 9.359245300292969, + "learning_rate": 3.517217451643681e-05, + "loss": 1.0418, + "step": 14100 + }, + { + "epoch": 1.1026541764246682, + "grad_norm": 25.48531150817871, + "learning_rate": 3.513603376991355e-05, + "loss": 0.9349, + "step": 14125 + }, + { + "epoch": 1.1046057767369244, + "grad_norm": 3.779207706451416, + "learning_rate": 3.509989302339029e-05, + "loss": 1.0811, + "step": 14150 + }, + { + "epoch": 1.1065573770491803, + "grad_norm": 3.771131753921509, + "learning_rate": 3.5063752276867034e-05, + "loss": 1.0441, + "step": 14175 + }, + { + "epoch": 1.1085089773614363, + "grad_norm": 13.767167091369629, + "learning_rate": 3.5027611530343774e-05, + "loss": 1.0153, + "step": 14200 + }, + { + "epoch": 1.1104605776736924, + "grad_norm": 5.634498119354248, + "learning_rate": 3.4991470783820515e-05, + "loss": 1.0427, + "step": 14225 + }, + { + "epoch": 1.1124121779859484, + "grad_norm": 5.016213893890381, + "learning_rate": 3.495533003729725e-05, + "loss": 1.0513, + "step": 14250 + }, + { + "epoch": 1.1143637782982045, + "grad_norm": 9.435086250305176, + "learning_rate": 3.491918929077399e-05, + "loss": 1.0806, + "step": 14275 + }, + { + "epoch": 1.1163153786104605, + "grad_norm": 9.144840240478516, + "learning_rate": 3.488304854425073e-05, + "loss": 1.0873, + "step": 14300 + }, + { + "epoch": 1.1182669789227166, + "grad_norm": 8.070186614990234, + "learning_rate": 3.484690779772748e-05, + "loss": 1.0519, + "step": 14325 + }, + { + "epoch": 1.1202185792349726, + "grad_norm": 5.891928672790527, + "learning_rate": 3.481076705120421e-05, + "loss": 1.1179, + "step": 14350 + }, + { + "epoch": 1.1221701795472288, + "grad_norm": 2.934696674346924, + "learning_rate": 3.477462630468095e-05, + "loss": 0.9175, + "step": 14375 + }, + { + "epoch": 1.1241217798594847, + "grad_norm": 6.6022491455078125, + "learning_rate": 3.473848555815769e-05, + "loss": 1.0346, + "step": 14400 + }, + { + "epoch": 1.1260733801717409, + "grad_norm": 5.278355598449707, + "learning_rate": 3.470234481163443e-05, + "loss": 0.9236, + "step": 14425 + }, + { + "epoch": 1.1280249804839968, + "grad_norm": 7.8194074630737305, + "learning_rate": 3.4666204065111166e-05, + "loss": 1.0641, + "step": 14450 + }, + { + "epoch": 1.129976580796253, + "grad_norm": 6.242311477661133, + "learning_rate": 3.4630063318587914e-05, + "loss": 0.9367, + "step": 14475 + }, + { + "epoch": 1.131928181108509, + "grad_norm": 10.937076568603516, + "learning_rate": 3.4593922572064654e-05, + "loss": 1.1266, + "step": 14500 + }, + { + "epoch": 1.133879781420765, + "grad_norm": 14.266758918762207, + "learning_rate": 3.455778182554139e-05, + "loss": 1.0718, + "step": 14525 + }, + { + "epoch": 1.135831381733021, + "grad_norm": 6.686312675476074, + "learning_rate": 3.452164107901813e-05, + "loss": 1.0347, + "step": 14550 + }, + { + "epoch": 1.1377829820452772, + "grad_norm": 5.075685977935791, + "learning_rate": 3.448550033249487e-05, + "loss": 1.0828, + "step": 14575 + }, + { + "epoch": 1.139734582357533, + "grad_norm": 6.093489170074463, + "learning_rate": 3.444935958597161e-05, + "loss": 1.0761, + "step": 14600 + }, + { + "epoch": 1.1416861826697893, + "grad_norm": 4.527467727661133, + "learning_rate": 3.4413218839448344e-05, + "loss": 1.0714, + "step": 14625 + }, + { + "epoch": 1.1436377829820452, + "grad_norm": 3.6168901920318604, + "learning_rate": 3.437707809292509e-05, + "loss": 1.0826, + "step": 14650 + }, + { + "epoch": 1.1455893832943014, + "grad_norm": 6.7677812576293945, + "learning_rate": 3.434093734640183e-05, + "loss": 1.0272, + "step": 14675 + }, + { + "epoch": 1.1475409836065573, + "grad_norm": 4.555366516113281, + "learning_rate": 3.430479659987857e-05, + "loss": 1.1062, + "step": 14700 + }, + { + "epoch": 1.1494925839188135, + "grad_norm": 9.095574378967285, + "learning_rate": 3.4268655853355306e-05, + "loss": 1.0128, + "step": 14725 + }, + { + "epoch": 1.1514441842310694, + "grad_norm": 4.069788455963135, + "learning_rate": 3.4232515106832046e-05, + "loss": 1.0775, + "step": 14750 + }, + { + "epoch": 1.1533957845433256, + "grad_norm": 6.9796295166015625, + "learning_rate": 3.419637436030879e-05, + "loss": 1.0376, + "step": 14775 + }, + { + "epoch": 1.1553473848555815, + "grad_norm": 6.946877479553223, + "learning_rate": 3.416023361378553e-05, + "loss": 1.0794, + "step": 14800 + }, + { + "epoch": 1.1572989851678377, + "grad_norm": 6.081907749176025, + "learning_rate": 3.412409286726227e-05, + "loss": 0.9201, + "step": 14825 + }, + { + "epoch": 1.1592505854800936, + "grad_norm": 6.641544342041016, + "learning_rate": 3.408795212073901e-05, + "loss": 1.0133, + "step": 14850 + }, + { + "epoch": 1.1612021857923498, + "grad_norm": 9.903573989868164, + "learning_rate": 3.405181137421575e-05, + "loss": 1.1718, + "step": 14875 + }, + { + "epoch": 1.1631537861046057, + "grad_norm": 4.586569786071777, + "learning_rate": 3.401567062769248e-05, + "loss": 1.2073, + "step": 14900 + }, + { + "epoch": 1.165105386416862, + "grad_norm": 5.467164993286133, + "learning_rate": 3.3979529881169224e-05, + "loss": 1.0974, + "step": 14925 + }, + { + "epoch": 1.1670569867291178, + "grad_norm": 4.363786220550537, + "learning_rate": 3.3943389134645964e-05, + "loss": 1.1519, + "step": 14950 + }, + { + "epoch": 1.169008587041374, + "grad_norm": 4.027893543243408, + "learning_rate": 3.390724838812271e-05, + "loss": 0.9971, + "step": 14975 + }, + { + "epoch": 1.17096018735363, + "grad_norm": 5.858165740966797, + "learning_rate": 3.3871107641599445e-05, + "loss": 1.1403, + "step": 15000 + }, + { + "epoch": 1.172911787665886, + "grad_norm": 5.775414943695068, + "learning_rate": 3.3834966895076186e-05, + "loss": 1.0335, + "step": 15025 + }, + { + "epoch": 1.174863387978142, + "grad_norm": 11.699418067932129, + "learning_rate": 3.3798826148552926e-05, + "loss": 1.0177, + "step": 15050 + }, + { + "epoch": 1.1768149882903982, + "grad_norm": 7.264484405517578, + "learning_rate": 3.376268540202967e-05, + "loss": 1.0599, + "step": 15075 + }, + { + "epoch": 1.1787665886026542, + "grad_norm": 7.674377918243408, + "learning_rate": 3.37265446555064e-05, + "loss": 1.0617, + "step": 15100 + }, + { + "epoch": 1.1807181889149103, + "grad_norm": 6.782419204711914, + "learning_rate": 3.369040390898315e-05, + "loss": 1.0667, + "step": 15125 + }, + { + "epoch": 1.1826697892271663, + "grad_norm": 12.059709548950195, + "learning_rate": 3.365426316245989e-05, + "loss": 1.0631, + "step": 15150 + }, + { + "epoch": 1.1846213895394224, + "grad_norm": 3.042478084564209, + "learning_rate": 3.361812241593663e-05, + "loss": 0.9684, + "step": 15175 + }, + { + "epoch": 1.1865729898516784, + "grad_norm": 5.130613803863525, + "learning_rate": 3.358198166941336e-05, + "loss": 1.0737, + "step": 15200 + }, + { + "epoch": 1.1885245901639343, + "grad_norm": 15.406847953796387, + "learning_rate": 3.3545840922890104e-05, + "loss": 1.0377, + "step": 15225 + }, + { + "epoch": 1.1904761904761905, + "grad_norm": 7.991154193878174, + "learning_rate": 3.3509700176366844e-05, + "loss": 1.0881, + "step": 15250 + }, + { + "epoch": 1.1924277907884466, + "grad_norm": 5.082891464233398, + "learning_rate": 3.3473559429843585e-05, + "loss": 1.0444, + "step": 15275 + }, + { + "epoch": 1.1943793911007026, + "grad_norm": 15.619866371154785, + "learning_rate": 3.3437418683320325e-05, + "loss": 0.9937, + "step": 15300 + }, + { + "epoch": 1.1963309914129585, + "grad_norm": 6.623885154724121, + "learning_rate": 3.3401277936797066e-05, + "loss": 0.9967, + "step": 15325 + }, + { + "epoch": 1.1982825917252147, + "grad_norm": 4.418607234954834, + "learning_rate": 3.3365137190273806e-05, + "loss": 0.9602, + "step": 15350 + }, + { + "epoch": 1.2002341920374708, + "grad_norm": 4.006126403808594, + "learning_rate": 3.332899644375054e-05, + "loss": 1.0691, + "step": 15375 + }, + { + "epoch": 1.2021857923497268, + "grad_norm": 6.473714828491211, + "learning_rate": 3.329285569722728e-05, + "loss": 1.0886, + "step": 15400 + }, + { + "epoch": 1.2041373926619827, + "grad_norm": 2.9864890575408936, + "learning_rate": 3.325671495070402e-05, + "loss": 1.1099, + "step": 15425 + }, + { + "epoch": 1.2060889929742389, + "grad_norm": 5.299988269805908, + "learning_rate": 3.322057420418077e-05, + "loss": 1.1075, + "step": 15450 + }, + { + "epoch": 1.208040593286495, + "grad_norm": 2.7544455528259277, + "learning_rate": 3.31844334576575e-05, + "loss": 1.0473, + "step": 15475 + }, + { + "epoch": 1.209992193598751, + "grad_norm": 5.825959205627441, + "learning_rate": 3.314829271113424e-05, + "loss": 1.0988, + "step": 15500 + }, + { + "epoch": 1.211943793911007, + "grad_norm": 4.797727584838867, + "learning_rate": 3.3112151964610984e-05, + "loss": 0.9717, + "step": 15525 + }, + { + "epoch": 1.213895394223263, + "grad_norm": 2.4147379398345947, + "learning_rate": 3.3076011218087724e-05, + "loss": 0.9443, + "step": 15550 + }, + { + "epoch": 1.215846994535519, + "grad_norm": 13.073369979858398, + "learning_rate": 3.303987047156446e-05, + "loss": 1.0742, + "step": 15575 + }, + { + "epoch": 1.2177985948477752, + "grad_norm": 4.989748954772949, + "learning_rate": 3.3003729725041205e-05, + "loss": 0.9992, + "step": 15600 + }, + { + "epoch": 1.2197501951600311, + "grad_norm": 4.5544209480285645, + "learning_rate": 3.2967588978517946e-05, + "loss": 1.1223, + "step": 15625 + }, + { + "epoch": 1.2217017954722873, + "grad_norm": 4.262848377227783, + "learning_rate": 3.293144823199468e-05, + "loss": 1.1126, + "step": 15650 + }, + { + "epoch": 1.2236533957845432, + "grad_norm": 4.273756504058838, + "learning_rate": 3.289530748547142e-05, + "loss": 0.9968, + "step": 15675 + }, + { + "epoch": 1.2256049960967994, + "grad_norm": 6.735283374786377, + "learning_rate": 3.285916673894816e-05, + "loss": 0.9533, + "step": 15700 + }, + { + "epoch": 1.2275565964090553, + "grad_norm": 3.591541290283203, + "learning_rate": 3.28230259924249e-05, + "loss": 1.0845, + "step": 15725 + }, + { + "epoch": 1.2295081967213115, + "grad_norm": 19.604896545410156, + "learning_rate": 3.2786885245901635e-05, + "loss": 1.0262, + "step": 15750 + }, + { + "epoch": 1.2314597970335674, + "grad_norm": 4.898001194000244, + "learning_rate": 3.275074449937838e-05, + "loss": 0.9569, + "step": 15775 + }, + { + "epoch": 1.2334113973458236, + "grad_norm": 75.82901000976562, + "learning_rate": 3.271460375285512e-05, + "loss": 1.1277, + "step": 15800 + }, + { + "epoch": 1.2353629976580796, + "grad_norm": 11.290657043457031, + "learning_rate": 3.2678463006331864e-05, + "loss": 1.0467, + "step": 15825 + }, + { + "epoch": 1.2373145979703357, + "grad_norm": 5.339318752288818, + "learning_rate": 3.26423222598086e-05, + "loss": 1.0216, + "step": 15850 + }, + { + "epoch": 1.2392661982825917, + "grad_norm": 4.835266590118408, + "learning_rate": 3.260618151328534e-05, + "loss": 1.0557, + "step": 15875 + }, + { + "epoch": 1.2412177985948478, + "grad_norm": 8.2711820602417, + "learning_rate": 3.257004076676208e-05, + "loss": 1.1584, + "step": 15900 + }, + { + "epoch": 1.2431693989071038, + "grad_norm": 23.31690788269043, + "learning_rate": 3.253390002023882e-05, + "loss": 1.0853, + "step": 15925 + }, + { + "epoch": 1.24512099921936, + "grad_norm": 4.827496528625488, + "learning_rate": 3.249775927371556e-05, + "loss": 0.8337, + "step": 15950 + }, + { + "epoch": 1.2470725995316159, + "grad_norm": 6.998970985412598, + "learning_rate": 3.24616185271923e-05, + "loss": 1.0395, + "step": 15975 + }, + { + "epoch": 1.249024199843872, + "grad_norm": 4.085362911224365, + "learning_rate": 3.242547778066904e-05, + "loss": 1.1301, + "step": 16000 + }, + { + "epoch": 1.250975800156128, + "grad_norm": 3.71061635017395, + "learning_rate": 3.2389337034145775e-05, + "loss": 1.004, + "step": 16025 + }, + { + "epoch": 1.2529274004683841, + "grad_norm": 3.282898187637329, + "learning_rate": 3.2353196287622515e-05, + "loss": 0.9796, + "step": 16050 + }, + { + "epoch": 1.25487900078064, + "grad_norm": 3.974426746368408, + "learning_rate": 3.2317055541099256e-05, + "loss": 1.06, + "step": 16075 + }, + { + "epoch": 1.2568306010928962, + "grad_norm": 3.0458126068115234, + "learning_rate": 3.2280914794576e-05, + "loss": 1.0301, + "step": 16100 + }, + { + "epoch": 1.2587822014051522, + "grad_norm": 5.343266487121582, + "learning_rate": 3.224477404805274e-05, + "loss": 1.0106, + "step": 16125 + }, + { + "epoch": 1.2607338017174083, + "grad_norm": 11.67337417602539, + "learning_rate": 3.220863330152948e-05, + "loss": 1.0696, + "step": 16150 + }, + { + "epoch": 1.2626854020296643, + "grad_norm": 2.9566726684570312, + "learning_rate": 3.217249255500622e-05, + "loss": 0.9795, + "step": 16175 + }, + { + "epoch": 1.2646370023419204, + "grad_norm": 4.338524341583252, + "learning_rate": 3.213635180848296e-05, + "loss": 1.1341, + "step": 16200 + }, + { + "epoch": 1.2665886026541764, + "grad_norm": 4.569310188293457, + "learning_rate": 3.210021106195969e-05, + "loss": 1.0861, + "step": 16225 + }, + { + "epoch": 1.2685402029664326, + "grad_norm": 11.866361618041992, + "learning_rate": 3.206407031543644e-05, + "loss": 1.043, + "step": 16250 + }, + { + "epoch": 1.2704918032786885, + "grad_norm": 3.5181682109832764, + "learning_rate": 3.202792956891318e-05, + "loss": 1.0634, + "step": 16275 + }, + { + "epoch": 1.2724434035909447, + "grad_norm": 5.373766899108887, + "learning_rate": 3.1991788822389914e-05, + "loss": 1.0894, + "step": 16300 + }, + { + "epoch": 1.2743950039032006, + "grad_norm": 4.6050639152526855, + "learning_rate": 3.1955648075866655e-05, + "loss": 0.9494, + "step": 16325 + }, + { + "epoch": 1.2763466042154565, + "grad_norm": 2.0043582916259766, + "learning_rate": 3.1919507329343395e-05, + "loss": 0.9469, + "step": 16350 + }, + { + "epoch": 1.2782982045277127, + "grad_norm": 6.936280727386475, + "learning_rate": 3.1883366582820136e-05, + "loss": 1.0794, + "step": 16375 + }, + { + "epoch": 1.2802498048399689, + "grad_norm": 4.248229503631592, + "learning_rate": 3.1847225836296876e-05, + "loss": 0.9689, + "step": 16400 + }, + { + "epoch": 1.2822014051522248, + "grad_norm": 6.221415042877197, + "learning_rate": 3.181108508977362e-05, + "loss": 1.0542, + "step": 16425 + }, + { + "epoch": 1.2841530054644807, + "grad_norm": 13.024694442749023, + "learning_rate": 3.177494434325036e-05, + "loss": 0.9678, + "step": 16450 + }, + { + "epoch": 1.286104605776737, + "grad_norm": 7.612826347351074, + "learning_rate": 3.17388035967271e-05, + "loss": 0.9633, + "step": 16475 + }, + { + "epoch": 1.288056206088993, + "grad_norm": 4.338873863220215, + "learning_rate": 3.170266285020383e-05, + "loss": 0.9795, + "step": 16500 + }, + { + "epoch": 1.290007806401249, + "grad_norm": 2.4166529178619385, + "learning_rate": 3.166652210368057e-05, + "loss": 1.0193, + "step": 16525 + }, + { + "epoch": 1.291959406713505, + "grad_norm": 5.464421272277832, + "learning_rate": 3.163038135715731e-05, + "loss": 0.9912, + "step": 16550 + }, + { + "epoch": 1.2939110070257611, + "grad_norm": 3.446772813796997, + "learning_rate": 3.159424061063406e-05, + "loss": 1.0438, + "step": 16575 + }, + { + "epoch": 1.2958626073380173, + "grad_norm": 4.623307704925537, + "learning_rate": 3.1558099864110794e-05, + "loss": 1.0619, + "step": 16600 + }, + { + "epoch": 1.2978142076502732, + "grad_norm": 13.972589492797852, + "learning_rate": 3.1521959117587535e-05, + "loss": 0.9696, + "step": 16625 + }, + { + "epoch": 1.2997658079625292, + "grad_norm": 6.926995277404785, + "learning_rate": 3.1485818371064275e-05, + "loss": 1.0163, + "step": 16650 + }, + { + "epoch": 1.3017174082747853, + "grad_norm": 3.467139720916748, + "learning_rate": 3.1449677624541016e-05, + "loss": 0.9427, + "step": 16675 + }, + { + "epoch": 1.3036690085870415, + "grad_norm": 3.1513898372650146, + "learning_rate": 3.141353687801775e-05, + "loss": 1.1633, + "step": 16700 + }, + { + "epoch": 1.3056206088992974, + "grad_norm": 3.917092800140381, + "learning_rate": 3.13773961314945e-05, + "loss": 1.1167, + "step": 16725 + }, + { + "epoch": 1.3075722092115534, + "grad_norm": 3.3093464374542236, + "learning_rate": 3.134125538497124e-05, + "loss": 0.9464, + "step": 16750 + }, + { + "epoch": 1.3095238095238095, + "grad_norm": 45.38322448730469, + "learning_rate": 3.130511463844797e-05, + "loss": 1.054, + "step": 16775 + }, + { + "epoch": 1.3114754098360657, + "grad_norm": 3.575658082962036, + "learning_rate": 3.126897389192471e-05, + "loss": 0.9527, + "step": 16800 + }, + { + "epoch": 1.3134270101483216, + "grad_norm": 5.4109787940979, + "learning_rate": 3.123283314540145e-05, + "loss": 1.1202, + "step": 16825 + }, + { + "epoch": 1.3153786104605776, + "grad_norm": 5.802377700805664, + "learning_rate": 3.119669239887819e-05, + "loss": 0.9195, + "step": 16850 + }, + { + "epoch": 1.3173302107728337, + "grad_norm": 10.947449684143066, + "learning_rate": 3.116055165235493e-05, + "loss": 1.069, + "step": 16875 + }, + { + "epoch": 1.31928181108509, + "grad_norm": 3.5102312564849854, + "learning_rate": 3.1124410905831674e-05, + "loss": 1.03, + "step": 16900 + }, + { + "epoch": 1.3212334113973458, + "grad_norm": 4.87523889541626, + "learning_rate": 3.1088270159308415e-05, + "loss": 1.0424, + "step": 16925 + }, + { + "epoch": 1.3231850117096018, + "grad_norm": 4.235840797424316, + "learning_rate": 3.1052129412785155e-05, + "loss": 1.0084, + "step": 16950 + }, + { + "epoch": 1.325136612021858, + "grad_norm": 4.647400856018066, + "learning_rate": 3.101598866626189e-05, + "loss": 0.9846, + "step": 16975 + }, + { + "epoch": 1.327088212334114, + "grad_norm": 10.53801441192627, + "learning_rate": 3.097984791973863e-05, + "loss": 1.0123, + "step": 17000 + }, + { + "epoch": 1.32903981264637, + "grad_norm": 7.863859176635742, + "learning_rate": 3.094370717321537e-05, + "loss": 1.1541, + "step": 17025 + }, + { + "epoch": 1.330991412958626, + "grad_norm": 4.656104564666748, + "learning_rate": 3.090756642669211e-05, + "loss": 1.0377, + "step": 17050 + }, + { + "epoch": 1.3329430132708822, + "grad_norm": 14.25987720489502, + "learning_rate": 3.087142568016885e-05, + "loss": 1.0019, + "step": 17075 + }, + { + "epoch": 1.334894613583138, + "grad_norm": 4.178525447845459, + "learning_rate": 3.083528493364559e-05, + "loss": 0.9933, + "step": 17100 + }, + { + "epoch": 1.3368462138953943, + "grad_norm": 4.163548469543457, + "learning_rate": 3.079914418712233e-05, + "loss": 1.0455, + "step": 17125 + }, + { + "epoch": 1.3387978142076502, + "grad_norm": 5.098491668701172, + "learning_rate": 3.0763003440599066e-05, + "loss": 1.0853, + "step": 17150 + }, + { + "epoch": 1.3407494145199064, + "grad_norm": 10.22192668914795, + "learning_rate": 3.072686269407581e-05, + "loss": 1.0128, + "step": 17175 + }, + { + "epoch": 1.3427010148321623, + "grad_norm": 4.41799783706665, + "learning_rate": 3.069072194755255e-05, + "loss": 1.1225, + "step": 17200 + }, + { + "epoch": 1.3446526151444185, + "grad_norm": 4.849637031555176, + "learning_rate": 3.0654581201029295e-05, + "loss": 0.9925, + "step": 17225 + }, + { + "epoch": 1.3466042154566744, + "grad_norm": 5.031124591827393, + "learning_rate": 3.061844045450603e-05, + "loss": 1.0401, + "step": 17250 + }, + { + "epoch": 1.3485558157689306, + "grad_norm": 7.179988384246826, + "learning_rate": 3.058229970798277e-05, + "loss": 0.975, + "step": 17275 + }, + { + "epoch": 1.3505074160811865, + "grad_norm": 4.910855770111084, + "learning_rate": 3.054615896145951e-05, + "loss": 1.03, + "step": 17300 + }, + { + "epoch": 1.3524590163934427, + "grad_norm": 3.471041679382324, + "learning_rate": 3.0510018214936247e-05, + "loss": 0.9914, + "step": 17325 + }, + { + "epoch": 1.3544106167056986, + "grad_norm": 3.18436598777771, + "learning_rate": 3.0473877468412988e-05, + "loss": 1.0731, + "step": 17350 + }, + { + "epoch": 1.3563622170179548, + "grad_norm": 3.4437074661254883, + "learning_rate": 3.043773672188973e-05, + "loss": 0.8273, + "step": 17375 + }, + { + "epoch": 1.3583138173302107, + "grad_norm": 3.503955125808716, + "learning_rate": 3.040159597536647e-05, + "loss": 0.9826, + "step": 17400 + }, + { + "epoch": 1.360265417642467, + "grad_norm": 9.779973983764648, + "learning_rate": 3.036545522884321e-05, + "loss": 0.986, + "step": 17425 + }, + { + "epoch": 1.3622170179547228, + "grad_norm": 3.9893665313720703, + "learning_rate": 3.032931448231995e-05, + "loss": 0.9838, + "step": 17450 + }, + { + "epoch": 1.364168618266979, + "grad_norm": 6.999486446380615, + "learning_rate": 3.0293173735796687e-05, + "loss": 0.9448, + "step": 17475 + }, + { + "epoch": 1.366120218579235, + "grad_norm": 4.026136875152588, + "learning_rate": 3.0257032989273428e-05, + "loss": 1.0036, + "step": 17500 + }, + { + "epoch": 1.368071818891491, + "grad_norm": 3.2647008895874023, + "learning_rate": 3.0220892242750165e-05, + "loss": 0.9904, + "step": 17525 + }, + { + "epoch": 1.370023419203747, + "grad_norm": 2.449075222015381, + "learning_rate": 3.018475149622691e-05, + "loss": 1.0663, + "step": 17550 + }, + { + "epoch": 1.3719750195160032, + "grad_norm": 7.260342597961426, + "learning_rate": 3.014861074970365e-05, + "loss": 0.9918, + "step": 17575 + }, + { + "epoch": 1.3739266198282591, + "grad_norm": 15.406435012817383, + "learning_rate": 3.0112470003180386e-05, + "loss": 0.9492, + "step": 17600 + }, + { + "epoch": 1.3758782201405153, + "grad_norm": 41.50312423706055, + "learning_rate": 3.0076329256657127e-05, + "loss": 1.0276, + "step": 17625 + }, + { + "epoch": 1.3778298204527712, + "grad_norm": 4.095573902130127, + "learning_rate": 3.0040188510133864e-05, + "loss": 1.0613, + "step": 17650 + }, + { + "epoch": 1.3797814207650272, + "grad_norm": 4.496942043304443, + "learning_rate": 3.0004047763610605e-05, + "loss": 1.0033, + "step": 17675 + }, + { + "epoch": 1.3817330210772834, + "grad_norm": 3.4721872806549072, + "learning_rate": 2.996790701708735e-05, + "loss": 1.0714, + "step": 17700 + }, + { + "epoch": 1.3836846213895395, + "grad_norm": 5.198745250701904, + "learning_rate": 2.993176627056409e-05, + "loss": 1.0083, + "step": 17725 + }, + { + "epoch": 1.3856362217017955, + "grad_norm": 5.634047031402588, + "learning_rate": 2.9895625524040827e-05, + "loss": 1.0326, + "step": 17750 + }, + { + "epoch": 1.3875878220140514, + "grad_norm": 4.827916622161865, + "learning_rate": 2.9859484777517567e-05, + "loss": 1.1214, + "step": 17775 + }, + { + "epoch": 1.3895394223263076, + "grad_norm": 4.9483842849731445, + "learning_rate": 2.9823344030994304e-05, + "loss": 1.0853, + "step": 17800 + }, + { + "epoch": 1.3914910226385637, + "grad_norm": 4.358343601226807, + "learning_rate": 2.9787203284471045e-05, + "loss": 1.0718, + "step": 17825 + }, + { + "epoch": 1.3934426229508197, + "grad_norm": 6.446356296539307, + "learning_rate": 2.9751062537947782e-05, + "loss": 1.0735, + "step": 17850 + }, + { + "epoch": 1.3953942232630756, + "grad_norm": 6.969168186187744, + "learning_rate": 2.9714921791424526e-05, + "loss": 1.0459, + "step": 17875 + }, + { + "epoch": 1.3973458235753318, + "grad_norm": 10.049994468688965, + "learning_rate": 2.9678781044901267e-05, + "loss": 1.1259, + "step": 17900 + }, + { + "epoch": 1.399297423887588, + "grad_norm": 5.11632776260376, + "learning_rate": 2.9642640298378004e-05, + "loss": 1.0396, + "step": 17925 + }, + { + "epoch": 1.4012490241998439, + "grad_norm": 7.025031566619873, + "learning_rate": 2.9606499551854744e-05, + "loss": 1.0761, + "step": 17950 + }, + { + "epoch": 1.4032006245120998, + "grad_norm": 4.1501617431640625, + "learning_rate": 2.957035880533148e-05, + "loss": 0.9756, + "step": 17975 + }, + { + "epoch": 1.405152224824356, + "grad_norm": 5.168917655944824, + "learning_rate": 2.9534218058808222e-05, + "loss": 0.9865, + "step": 18000 + }, + { + "epoch": 1.4071038251366121, + "grad_norm": 714.6832275390625, + "learning_rate": 2.9498077312284966e-05, + "loss": 1.0472, + "step": 18025 + }, + { + "epoch": 1.409055425448868, + "grad_norm": 7.9371161460876465, + "learning_rate": 2.9461936565761707e-05, + "loss": 0.9684, + "step": 18050 + }, + { + "epoch": 1.411007025761124, + "grad_norm": 5.507622718811035, + "learning_rate": 2.9425795819238444e-05, + "loss": 1.0258, + "step": 18075 + }, + { + "epoch": 1.4129586260733802, + "grad_norm": 5.39263391494751, + "learning_rate": 2.9389655072715184e-05, + "loss": 0.9649, + "step": 18100 + }, + { + "epoch": 1.4149102263856363, + "grad_norm": 5.229122161865234, + "learning_rate": 2.935351432619192e-05, + "loss": 0.9844, + "step": 18125 + }, + { + "epoch": 1.4168618266978923, + "grad_norm": 2.2199718952178955, + "learning_rate": 2.9317373579668662e-05, + "loss": 0.8906, + "step": 18150 + }, + { + "epoch": 1.4188134270101482, + "grad_norm": 4.393389701843262, + "learning_rate": 2.9281232833145406e-05, + "loss": 0.9483, + "step": 18175 + }, + { + "epoch": 1.4207650273224044, + "grad_norm": 3.3973653316497803, + "learning_rate": 2.9245092086622143e-05, + "loss": 0.9457, + "step": 18200 + }, + { + "epoch": 1.4227166276346606, + "grad_norm": 8.53115463256836, + "learning_rate": 2.9208951340098884e-05, + "loss": 1.0234, + "step": 18225 + }, + { + "epoch": 1.4246682279469165, + "grad_norm": 3.105377197265625, + "learning_rate": 2.917281059357562e-05, + "loss": 1.0642, + "step": 18250 + }, + { + "epoch": 1.4266198282591724, + "grad_norm": 4.813482284545898, + "learning_rate": 2.913666984705236e-05, + "loss": 0.9646, + "step": 18275 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 5.369482517242432, + "learning_rate": 2.91005291005291e-05, + "loss": 0.9709, + "step": 18300 + }, + { + "epoch": 1.4305230288836845, + "grad_norm": 6.120393753051758, + "learning_rate": 2.906438835400584e-05, + "loss": 1.003, + "step": 18325 + }, + { + "epoch": 1.4324746291959407, + "grad_norm": 3.9096245765686035, + "learning_rate": 2.9028247607482583e-05, + "loss": 0.9658, + "step": 18350 + }, + { + "epoch": 1.4344262295081966, + "grad_norm": 6.922801971435547, + "learning_rate": 2.8992106860959324e-05, + "loss": 0.9676, + "step": 18375 + }, + { + "epoch": 1.4363778298204528, + "grad_norm": 4.363033294677734, + "learning_rate": 2.895596611443606e-05, + "loss": 1.0072, + "step": 18400 + }, + { + "epoch": 1.4383294301327088, + "grad_norm": 5.962978839874268, + "learning_rate": 2.89198253679128e-05, + "loss": 1.0297, + "step": 18425 + }, + { + "epoch": 1.440281030444965, + "grad_norm": 3.9251201152801514, + "learning_rate": 2.888368462138954e-05, + "loss": 1.0636, + "step": 18450 + }, + { + "epoch": 1.4422326307572209, + "grad_norm": 3.9736201763153076, + "learning_rate": 2.884754387486628e-05, + "loss": 1.0764, + "step": 18475 + }, + { + "epoch": 1.444184231069477, + "grad_norm": 3.5434916019439697, + "learning_rate": 2.8811403128343023e-05, + "loss": 1.1278, + "step": 18500 + }, + { + "epoch": 1.446135831381733, + "grad_norm": 20.971593856811523, + "learning_rate": 2.877526238181976e-05, + "loss": 1.0956, + "step": 18525 + }, + { + "epoch": 1.4480874316939891, + "grad_norm": 2.552823781967163, + "learning_rate": 2.87391216352965e-05, + "loss": 1.0464, + "step": 18550 + }, + { + "epoch": 1.450039032006245, + "grad_norm": 3.2997212409973145, + "learning_rate": 2.870298088877324e-05, + "loss": 0.9917, + "step": 18575 + }, + { + "epoch": 1.4519906323185012, + "grad_norm": 4.3301920890808105, + "learning_rate": 2.866684014224998e-05, + "loss": 0.957, + "step": 18600 + }, + { + "epoch": 1.4539422326307572, + "grad_norm": 5.454684257507324, + "learning_rate": 2.863069939572672e-05, + "loss": 1.1199, + "step": 18625 + }, + { + "epoch": 1.4558938329430133, + "grad_norm": 4.898594379425049, + "learning_rate": 2.8594558649203456e-05, + "loss": 1.0822, + "step": 18650 + }, + { + "epoch": 1.4578454332552693, + "grad_norm": 6.092761039733887, + "learning_rate": 2.85584179026802e-05, + "loss": 0.9509, + "step": 18675 + }, + { + "epoch": 1.4597970335675254, + "grad_norm": 6.443736553192139, + "learning_rate": 2.852227715615694e-05, + "loss": 1.0457, + "step": 18700 + }, + { + "epoch": 1.4617486338797814, + "grad_norm": 9.433018684387207, + "learning_rate": 2.8486136409633678e-05, + "loss": 1.028, + "step": 18725 + }, + { + "epoch": 1.4637002341920375, + "grad_norm": 7.352541923522949, + "learning_rate": 2.844999566311042e-05, + "loss": 0.9847, + "step": 18750 + }, + { + "epoch": 1.4656518345042935, + "grad_norm": 6.585604667663574, + "learning_rate": 2.8413854916587156e-05, + "loss": 1.0851, + "step": 18775 + }, + { + "epoch": 1.4676034348165496, + "grad_norm": 3.2517330646514893, + "learning_rate": 2.8377714170063897e-05, + "loss": 0.9635, + "step": 18800 + }, + { + "epoch": 1.4695550351288056, + "grad_norm": 7.05902624130249, + "learning_rate": 2.834157342354064e-05, + "loss": 1.011, + "step": 18825 + }, + { + "epoch": 1.4715066354410617, + "grad_norm": 4.2522993087768555, + "learning_rate": 2.830543267701738e-05, + "loss": 1.0048, + "step": 18850 + }, + { + "epoch": 1.4734582357533177, + "grad_norm": 5.979567527770996, + "learning_rate": 2.8269291930494118e-05, + "loss": 0.9672, + "step": 18875 + }, + { + "epoch": 1.4754098360655736, + "grad_norm": 4.375784397125244, + "learning_rate": 2.823315118397086e-05, + "loss": 1.0821, + "step": 18900 + }, + { + "epoch": 1.4773614363778298, + "grad_norm": 2.408162832260132, + "learning_rate": 2.8197010437447596e-05, + "loss": 1.1504, + "step": 18925 + }, + { + "epoch": 1.479313036690086, + "grad_norm": 4.831646919250488, + "learning_rate": 2.8160869690924337e-05, + "loss": 1.0477, + "step": 18950 + }, + { + "epoch": 1.481264637002342, + "grad_norm": 5.827836990356445, + "learning_rate": 2.8124728944401074e-05, + "loss": 0.9696, + "step": 18975 + }, + { + "epoch": 1.4832162373145978, + "grad_norm": 4.551143169403076, + "learning_rate": 2.8088588197877818e-05, + "loss": 1.0606, + "step": 19000 + }, + { + "epoch": 1.485167837626854, + "grad_norm": 6.779484748840332, + "learning_rate": 2.8052447451354558e-05, + "loss": 0.9497, + "step": 19025 + }, + { + "epoch": 1.4871194379391102, + "grad_norm": 4.364745616912842, + "learning_rate": 2.8016306704831295e-05, + "loss": 1.0972, + "step": 19050 + }, + { + "epoch": 1.489071038251366, + "grad_norm": 4.649238586425781, + "learning_rate": 2.7980165958308036e-05, + "loss": 1.0243, + "step": 19075 + }, + { + "epoch": 1.491022638563622, + "grad_norm": 7.4075188636779785, + "learning_rate": 2.7944025211784773e-05, + "loss": 1.0038, + "step": 19100 + }, + { + "epoch": 1.4929742388758782, + "grad_norm": 5.461874961853027, + "learning_rate": 2.7907884465261514e-05, + "loss": 1.0718, + "step": 19125 + }, + { + "epoch": 1.4949258391881344, + "grad_norm": 5.350560665130615, + "learning_rate": 2.7871743718738258e-05, + "loss": 1.0364, + "step": 19150 + }, + { + "epoch": 1.4968774395003903, + "grad_norm": 4.233122825622559, + "learning_rate": 2.7835602972214998e-05, + "loss": 1.0761, + "step": 19175 + }, + { + "epoch": 1.4988290398126463, + "grad_norm": 4.502426624298096, + "learning_rate": 2.7799462225691735e-05, + "loss": 1.0649, + "step": 19200 + }, + { + "epoch": 1.5007806401249024, + "grad_norm": 3.345777988433838, + "learning_rate": 2.7763321479168476e-05, + "loss": 1.0786, + "step": 19225 + }, + { + "epoch": 1.5027322404371586, + "grad_norm": 3.6552770137786865, + "learning_rate": 2.7727180732645213e-05, + "loss": 1.0938, + "step": 19250 + }, + { + "epoch": 1.5046838407494145, + "grad_norm": 6.093689918518066, + "learning_rate": 2.7691039986121954e-05, + "loss": 1.116, + "step": 19275 + }, + { + "epoch": 1.5066354410616705, + "grad_norm": 3.3782944679260254, + "learning_rate": 2.7654899239598698e-05, + "loss": 0.9904, + "step": 19300 + }, + { + "epoch": 1.5085870413739266, + "grad_norm": 11.246219635009766, + "learning_rate": 2.7618758493075435e-05, + "loss": 1.036, + "step": 19325 + }, + { + "epoch": 1.5105386416861828, + "grad_norm": 3.729729175567627, + "learning_rate": 2.7582617746552175e-05, + "loss": 1.0244, + "step": 19350 + }, + { + "epoch": 1.5124902419984387, + "grad_norm": 5.234212398529053, + "learning_rate": 2.7546477000028913e-05, + "loss": 1.0188, + "step": 19375 + }, + { + "epoch": 1.5144418423106947, + "grad_norm": 6.699758052825928, + "learning_rate": 2.7510336253505653e-05, + "loss": 1.0264, + "step": 19400 + }, + { + "epoch": 1.5163934426229508, + "grad_norm": 6.274900913238525, + "learning_rate": 2.747419550698239e-05, + "loss": 1.005, + "step": 19425 + }, + { + "epoch": 1.518345042935207, + "grad_norm": 2.9779465198516846, + "learning_rate": 2.743805476045913e-05, + "loss": 1.0164, + "step": 19450 + }, + { + "epoch": 1.520296643247463, + "grad_norm": 8.84890079498291, + "learning_rate": 2.7401914013935875e-05, + "loss": 0.9889, + "step": 19475 + }, + { + "epoch": 1.5222482435597189, + "grad_norm": 12.26213264465332, + "learning_rate": 2.7365773267412615e-05, + "loss": 1.0684, + "step": 19500 + }, + { + "epoch": 1.524199843871975, + "grad_norm": 4.924468040466309, + "learning_rate": 2.7329632520889353e-05, + "loss": 1.0348, + "step": 19525 + }, + { + "epoch": 1.5261514441842312, + "grad_norm": 3.0204434394836426, + "learning_rate": 2.7293491774366093e-05, + "loss": 0.9714, + "step": 19550 + }, + { + "epoch": 1.5281030444964872, + "grad_norm": 4.593067646026611, + "learning_rate": 2.725735102784283e-05, + "loss": 0.9517, + "step": 19575 + }, + { + "epoch": 1.530054644808743, + "grad_norm": 2.264890432357788, + "learning_rate": 2.722121028131957e-05, + "loss": 0.8162, + "step": 19600 + }, + { + "epoch": 1.5320062451209993, + "grad_norm": 12.349218368530273, + "learning_rate": 2.7185069534796315e-05, + "loss": 1.0557, + "step": 19625 + }, + { + "epoch": 1.5339578454332554, + "grad_norm": 7.269456386566162, + "learning_rate": 2.7148928788273052e-05, + "loss": 1.0353, + "step": 19650 + }, + { + "epoch": 1.5359094457455114, + "grad_norm": 5.1450514793396, + "learning_rate": 2.7112788041749793e-05, + "loss": 0.9834, + "step": 19675 + }, + { + "epoch": 1.5378610460577673, + "grad_norm": 4.423309326171875, + "learning_rate": 2.707664729522653e-05, + "loss": 0.9935, + "step": 19700 + }, + { + "epoch": 1.5398126463700235, + "grad_norm": 6.642602443695068, + "learning_rate": 2.704050654870327e-05, + "loss": 1.0082, + "step": 19725 + }, + { + "epoch": 1.5417642466822796, + "grad_norm": 4.050596237182617, + "learning_rate": 2.700436580218001e-05, + "loss": 0.9686, + "step": 19750 + }, + { + "epoch": 1.5437158469945356, + "grad_norm": 8.340785026550293, + "learning_rate": 2.6968225055656748e-05, + "loss": 1.0724, + "step": 19775 + }, + { + "epoch": 1.5456674473067915, + "grad_norm": 6.880356311798096, + "learning_rate": 2.6932084309133492e-05, + "loss": 0.9453, + "step": 19800 + }, + { + "epoch": 1.5476190476190477, + "grad_norm": 6.080481052398682, + "learning_rate": 2.6895943562610233e-05, + "loss": 1.0322, + "step": 19825 + }, + { + "epoch": 1.5495706479313038, + "grad_norm": 4.800299644470215, + "learning_rate": 2.685980281608697e-05, + "loss": 1.1176, + "step": 19850 + }, + { + "epoch": 1.5515222482435598, + "grad_norm": 2.738233804702759, + "learning_rate": 2.682366206956371e-05, + "loss": 0.9768, + "step": 19875 + }, + { + "epoch": 1.5534738485558157, + "grad_norm": 12.255475044250488, + "learning_rate": 2.6787521323040448e-05, + "loss": 0.9278, + "step": 19900 + }, + { + "epoch": 1.5554254488680717, + "grad_norm": 3.6856420040130615, + "learning_rate": 2.6751380576517188e-05, + "loss": 1.0784, + "step": 19925 + }, + { + "epoch": 1.5573770491803278, + "grad_norm": 5.496433734893799, + "learning_rate": 2.6715239829993932e-05, + "loss": 0.8721, + "step": 19950 + }, + { + "epoch": 1.559328649492584, + "grad_norm": 1.6164950132369995, + "learning_rate": 2.6679099083470673e-05, + "loss": 1.019, + "step": 19975 + }, + { + "epoch": 1.56128024980484, + "grad_norm": 7.188957214355469, + "learning_rate": 2.664295833694741e-05, + "loss": 0.9543, + "step": 20000 + }, + { + "epoch": 1.5632318501170959, + "grad_norm": 8.268742561340332, + "learning_rate": 2.660681759042415e-05, + "loss": 1.0626, + "step": 20025 + }, + { + "epoch": 1.565183450429352, + "grad_norm": 4.795127868652344, + "learning_rate": 2.6570676843900888e-05, + "loss": 1.0771, + "step": 20050 + }, + { + "epoch": 1.5671350507416082, + "grad_norm": 7.0319085121154785, + "learning_rate": 2.6534536097377628e-05, + "loss": 1.0804, + "step": 20075 + }, + { + "epoch": 1.5690866510538641, + "grad_norm": 5.493650436401367, + "learning_rate": 2.6498395350854365e-05, + "loss": 1.0185, + "step": 20100 + }, + { + "epoch": 1.57103825136612, + "grad_norm": 4.698023319244385, + "learning_rate": 2.646225460433111e-05, + "loss": 0.9612, + "step": 20125 + }, + { + "epoch": 1.5729898516783762, + "grad_norm": 2.303802013397217, + "learning_rate": 2.642611385780785e-05, + "loss": 0.9888, + "step": 20150 + }, + { + "epoch": 1.5749414519906324, + "grad_norm": 3.154298782348633, + "learning_rate": 2.6389973111284587e-05, + "loss": 1.0377, + "step": 20175 + }, + { + "epoch": 1.5768930523028883, + "grad_norm": 3.6313188076019287, + "learning_rate": 2.6353832364761328e-05, + "loss": 1.0156, + "step": 20200 + }, + { + "epoch": 1.5788446526151443, + "grad_norm": 10.81686782836914, + "learning_rate": 2.6317691618238065e-05, + "loss": 1.1013, + "step": 20225 + }, + { + "epoch": 1.5807962529274004, + "grad_norm": 5.5524396896362305, + "learning_rate": 2.6281550871714805e-05, + "loss": 1.0842, + "step": 20250 + }, + { + "epoch": 1.5827478532396566, + "grad_norm": 3.9614038467407227, + "learning_rate": 2.624541012519155e-05, + "loss": 1.0371, + "step": 20275 + }, + { + "epoch": 1.5846994535519126, + "grad_norm": 9.910189628601074, + "learning_rate": 2.620926937866829e-05, + "loss": 0.9515, + "step": 20300 + }, + { + "epoch": 1.5866510538641685, + "grad_norm": 2.5915534496307373, + "learning_rate": 2.6173128632145027e-05, + "loss": 1.0421, + "step": 20325 + }, + { + "epoch": 1.5886026541764247, + "grad_norm": 12.07827377319336, + "learning_rate": 2.6136987885621768e-05, + "loss": 1.0442, + "step": 20350 + }, + { + "epoch": 1.5905542544886808, + "grad_norm": 8.072848320007324, + "learning_rate": 2.6100847139098505e-05, + "loss": 1.0583, + "step": 20375 + }, + { + "epoch": 1.5925058548009368, + "grad_norm": 6.771178245544434, + "learning_rate": 2.6064706392575245e-05, + "loss": 0.9976, + "step": 20400 + }, + { + "epoch": 1.5944574551131927, + "grad_norm": 4.585131645202637, + "learning_rate": 2.6028565646051983e-05, + "loss": 0.9965, + "step": 20425 + }, + { + "epoch": 1.5964090554254489, + "grad_norm": 4.044947147369385, + "learning_rate": 2.5992424899528727e-05, + "loss": 0.9715, + "step": 20450 + }, + { + "epoch": 1.598360655737705, + "grad_norm": 4.791822910308838, + "learning_rate": 2.5956284153005467e-05, + "loss": 0.8945, + "step": 20475 + }, + { + "epoch": 1.600312256049961, + "grad_norm": 25.493099212646484, + "learning_rate": 2.5920143406482204e-05, + "loss": 1.0219, + "step": 20500 + }, + { + "epoch": 1.602263856362217, + "grad_norm": 5.103201866149902, + "learning_rate": 2.5884002659958945e-05, + "loss": 1.1644, + "step": 20525 + }, + { + "epoch": 1.604215456674473, + "grad_norm": 3.6445109844207764, + "learning_rate": 2.5847861913435682e-05, + "loss": 1.0763, + "step": 20550 + }, + { + "epoch": 1.6061670569867292, + "grad_norm": 4.258514404296875, + "learning_rate": 2.5811721166912423e-05, + "loss": 0.9534, + "step": 20575 + }, + { + "epoch": 1.6081186572989852, + "grad_norm": 3.511690139770508, + "learning_rate": 2.5775580420389167e-05, + "loss": 1.0279, + "step": 20600 + }, + { + "epoch": 1.6100702576112411, + "grad_norm": 3.6223819255828857, + "learning_rate": 2.5739439673865907e-05, + "loss": 1.0497, + "step": 20625 + }, + { + "epoch": 1.6120218579234973, + "grad_norm": 19.4866943359375, + "learning_rate": 2.5703298927342644e-05, + "loss": 1.1281, + "step": 20650 + }, + { + "epoch": 1.6139734582357534, + "grad_norm": 2.9575917720794678, + "learning_rate": 2.5667158180819385e-05, + "loss": 1.0877, + "step": 20675 + }, + { + "epoch": 1.6159250585480094, + "grad_norm": 4.271249771118164, + "learning_rate": 2.5631017434296122e-05, + "loss": 1.0247, + "step": 20700 + }, + { + "epoch": 1.6178766588602653, + "grad_norm": 5.442391872406006, + "learning_rate": 2.5594876687772863e-05, + "loss": 1.0574, + "step": 20725 + }, + { + "epoch": 1.6198282591725215, + "grad_norm": 6.771878719329834, + "learning_rate": 2.5558735941249607e-05, + "loss": 0.9667, + "step": 20750 + }, + { + "epoch": 1.6217798594847777, + "grad_norm": 5.302334308624268, + "learning_rate": 2.5522595194726344e-05, + "loss": 0.9703, + "step": 20775 + }, + { + "epoch": 1.6237314597970336, + "grad_norm": 5.012480735778809, + "learning_rate": 2.5486454448203084e-05, + "loss": 1.0256, + "step": 20800 + }, + { + "epoch": 1.6256830601092895, + "grad_norm": 4.493309020996094, + "learning_rate": 2.545031370167982e-05, + "loss": 1.0891, + "step": 20825 + }, + { + "epoch": 1.6276346604215457, + "grad_norm": 3.693201780319214, + "learning_rate": 2.5414172955156562e-05, + "loss": 0.995, + "step": 20850 + }, + { + "epoch": 1.6295862607338019, + "grad_norm": 4.986085414886475, + "learning_rate": 2.5378032208633303e-05, + "loss": 0.9673, + "step": 20875 + }, + { + "epoch": 1.6315378610460578, + "grad_norm": 5.518863677978516, + "learning_rate": 2.534189146211004e-05, + "loss": 1.0588, + "step": 20900 + }, + { + "epoch": 1.6334894613583137, + "grad_norm": 3.7875008583068848, + "learning_rate": 2.5305750715586784e-05, + "loss": 1.0413, + "step": 20925 + }, + { + "epoch": 1.63544106167057, + "grad_norm": 4.00389289855957, + "learning_rate": 2.5269609969063524e-05, + "loss": 1.0335, + "step": 20950 + }, + { + "epoch": 1.637392661982826, + "grad_norm": 8.645567893981934, + "learning_rate": 2.523346922254026e-05, + "loss": 0.9512, + "step": 20975 + }, + { + "epoch": 1.639344262295082, + "grad_norm": 4.029364109039307, + "learning_rate": 2.5197328476017002e-05, + "loss": 0.9917, + "step": 21000 + }, + { + "epoch": 1.641295862607338, + "grad_norm": 3.6132876873016357, + "learning_rate": 2.516118772949374e-05, + "loss": 0.9177, + "step": 21025 + }, + { + "epoch": 1.6432474629195941, + "grad_norm": 4.422287940979004, + "learning_rate": 2.512504698297048e-05, + "loss": 0.9892, + "step": 21050 + }, + { + "epoch": 1.6451990632318503, + "grad_norm": 12.263224601745605, + "learning_rate": 2.5088906236447224e-05, + "loss": 0.9686, + "step": 21075 + }, + { + "epoch": 1.6471506635441062, + "grad_norm": 11.990299224853516, + "learning_rate": 2.5052765489923964e-05, + "loss": 1.0985, + "step": 21100 + }, + { + "epoch": 1.6491022638563622, + "grad_norm": 7.674864292144775, + "learning_rate": 2.50166247434007e-05, + "loss": 1.0921, + "step": 21125 + }, + { + "epoch": 1.651053864168618, + "grad_norm": 10.45122241973877, + "learning_rate": 2.4980483996877442e-05, + "loss": 1.1323, + "step": 21150 + }, + { + "epoch": 1.6530054644808743, + "grad_norm": 3.549076795578003, + "learning_rate": 2.494434325035418e-05, + "loss": 1.0177, + "step": 21175 + }, + { + "epoch": 1.6549570647931304, + "grad_norm": 2.7426836490631104, + "learning_rate": 2.490820250383092e-05, + "loss": 1.0479, + "step": 21200 + }, + { + "epoch": 1.6569086651053864, + "grad_norm": 2.4476282596588135, + "learning_rate": 2.487206175730766e-05, + "loss": 0.9987, + "step": 21225 + }, + { + "epoch": 1.6588602654176423, + "grad_norm": 6.051702499389648, + "learning_rate": 2.4835921010784398e-05, + "loss": 1.1078, + "step": 21250 + }, + { + "epoch": 1.6608118657298985, + "grad_norm": 2.624838352203369, + "learning_rate": 2.479978026426114e-05, + "loss": 1.0376, + "step": 21275 + }, + { + "epoch": 1.6627634660421546, + "grad_norm": 6.66210412979126, + "learning_rate": 2.476363951773788e-05, + "loss": 0.9237, + "step": 21300 + }, + { + "epoch": 1.6647150663544106, + "grad_norm": 2.3470282554626465, + "learning_rate": 2.472749877121462e-05, + "loss": 0.9968, + "step": 21325 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 2.7463133335113525, + "learning_rate": 2.4691358024691357e-05, + "loss": 0.9817, + "step": 21350 + }, + { + "epoch": 1.6686182669789227, + "grad_norm": 4.73617696762085, + "learning_rate": 2.46552172781681e-05, + "loss": 1.0356, + "step": 21375 + }, + { + "epoch": 1.6705698672911788, + "grad_norm": 6.097596645355225, + "learning_rate": 2.4619076531644838e-05, + "loss": 0.9652, + "step": 21400 + }, + { + "epoch": 1.6725214676034348, + "grad_norm": 3.4013562202453613, + "learning_rate": 2.4582935785121578e-05, + "loss": 1.0503, + "step": 21425 + }, + { + "epoch": 1.6744730679156907, + "grad_norm": 6.023477554321289, + "learning_rate": 2.454679503859832e-05, + "loss": 0.9708, + "step": 21450 + }, + { + "epoch": 1.676424668227947, + "grad_norm": 3.4353835582733154, + "learning_rate": 2.451065429207506e-05, + "loss": 1.0548, + "step": 21475 + }, + { + "epoch": 1.678376268540203, + "grad_norm": 27.315839767456055, + "learning_rate": 2.4474513545551797e-05, + "loss": 0.9905, + "step": 21500 + }, + { + "epoch": 1.680327868852459, + "grad_norm": 3.304811477661133, + "learning_rate": 2.4438372799028537e-05, + "loss": 1.0425, + "step": 21525 + }, + { + "epoch": 1.682279469164715, + "grad_norm": 3.6206228733062744, + "learning_rate": 2.4402232052505278e-05, + "loss": 1.0865, + "step": 21550 + }, + { + "epoch": 1.684231069476971, + "grad_norm": 4.431848526000977, + "learning_rate": 2.4366091305982018e-05, + "loss": 0.9996, + "step": 21575 + }, + { + "epoch": 1.6861826697892273, + "grad_norm": 4.312738418579102, + "learning_rate": 2.432995055945876e-05, + "loss": 1.0168, + "step": 21600 + }, + { + "epoch": 1.6881342701014832, + "grad_norm": 3.205915689468384, + "learning_rate": 2.4293809812935496e-05, + "loss": 1.0471, + "step": 21625 + }, + { + "epoch": 1.6900858704137391, + "grad_norm": 2.957268714904785, + "learning_rate": 2.4257669066412237e-05, + "loss": 1.0682, + "step": 21650 + }, + { + "epoch": 1.6920374707259953, + "grad_norm": 2.54960298538208, + "learning_rate": 2.4221528319888977e-05, + "loss": 1.0354, + "step": 21675 + }, + { + "epoch": 1.6939890710382515, + "grad_norm": 4.215837478637695, + "learning_rate": 2.4185387573365718e-05, + "loss": 1.0701, + "step": 21700 + }, + { + "epoch": 1.6959406713505074, + "grad_norm": 4.628776550292969, + "learning_rate": 2.4149246826842455e-05, + "loss": 0.9995, + "step": 21725 + }, + { + "epoch": 1.6978922716627634, + "grad_norm": 4.389647006988525, + "learning_rate": 2.4113106080319196e-05, + "loss": 1.0387, + "step": 21750 + }, + { + "epoch": 1.6998438719750195, + "grad_norm": 3.111727237701416, + "learning_rate": 2.4076965333795936e-05, + "loss": 0.9685, + "step": 21775 + }, + { + "epoch": 1.7017954722872757, + "grad_norm": 7.494513034820557, + "learning_rate": 2.4040824587272677e-05, + "loss": 1.0698, + "step": 21800 + }, + { + "epoch": 1.7037470725995316, + "grad_norm": 6.872286796569824, + "learning_rate": 2.4004683840749414e-05, + "loss": 1.0207, + "step": 21825 + }, + { + "epoch": 1.7056986729117876, + "grad_norm": 12.551429748535156, + "learning_rate": 2.3968543094226158e-05, + "loss": 1.0061, + "step": 21850 + }, + { + "epoch": 1.7076502732240437, + "grad_norm": 20.56284523010254, + "learning_rate": 2.3932402347702895e-05, + "loss": 1.0501, + "step": 21875 + }, + { + "epoch": 1.7096018735362999, + "grad_norm": 4.5386738777160645, + "learning_rate": 2.3896261601179636e-05, + "loss": 1.1208, + "step": 21900 + }, + { + "epoch": 1.7115534738485558, + "grad_norm": 5.644140243530273, + "learning_rate": 2.3860120854656376e-05, + "loss": 0.9376, + "step": 21925 + }, + { + "epoch": 1.7135050741608118, + "grad_norm": 3.8693292140960693, + "learning_rate": 2.3823980108133113e-05, + "loss": 1.1056, + "step": 21950 + }, + { + "epoch": 1.715456674473068, + "grad_norm": 5.4579362869262695, + "learning_rate": 2.3787839361609854e-05, + "loss": 0.9725, + "step": 21975 + }, + { + "epoch": 1.717408274785324, + "grad_norm": 21.75058937072754, + "learning_rate": 2.3751698615086594e-05, + "loss": 1.0355, + "step": 22000 + }, + { + "epoch": 1.71935987509758, + "grad_norm": 6.148619651794434, + "learning_rate": 2.3715557868563335e-05, + "loss": 0.972, + "step": 22025 + }, + { + "epoch": 1.721311475409836, + "grad_norm": 8.506387710571289, + "learning_rate": 2.3679417122040072e-05, + "loss": 1.0003, + "step": 22050 + }, + { + "epoch": 1.7232630757220921, + "grad_norm": 6.961507320404053, + "learning_rate": 2.3643276375516816e-05, + "loss": 1.1188, + "step": 22075 + }, + { + "epoch": 1.7252146760343483, + "grad_norm": 3.613565444946289, + "learning_rate": 2.3607135628993553e-05, + "loss": 0.9911, + "step": 22100 + }, + { + "epoch": 1.7271662763466042, + "grad_norm": 6.051638126373291, + "learning_rate": 2.3570994882470294e-05, + "loss": 0.913, + "step": 22125 + }, + { + "epoch": 1.7291178766588602, + "grad_norm": 5.5636372566223145, + "learning_rate": 2.353485413594703e-05, + "loss": 0.9866, + "step": 22150 + }, + { + "epoch": 1.7310694769711163, + "grad_norm": 6.666276931762695, + "learning_rate": 2.3498713389423775e-05, + "loss": 1.0026, + "step": 22175 + }, + { + "epoch": 1.7330210772833725, + "grad_norm": 4.0576090812683105, + "learning_rate": 2.3462572642900512e-05, + "loss": 0.9774, + "step": 22200 + }, + { + "epoch": 1.7349726775956285, + "grad_norm": 8.885558128356934, + "learning_rate": 2.3426431896377253e-05, + "loss": 1.0566, + "step": 22225 + }, + { + "epoch": 1.7369242779078844, + "grad_norm": 4.85816764831543, + "learning_rate": 2.3390291149853993e-05, + "loss": 1.0653, + "step": 22250 + }, + { + "epoch": 1.7388758782201406, + "grad_norm": 16.29740333557129, + "learning_rate": 2.3354150403330734e-05, + "loss": 1.0159, + "step": 22275 + }, + { + "epoch": 1.7408274785323967, + "grad_norm": 6.323321342468262, + "learning_rate": 2.331800965680747e-05, + "loss": 0.9722, + "step": 22300 + }, + { + "epoch": 1.7427790788446527, + "grad_norm": 3.586937427520752, + "learning_rate": 2.328186891028421e-05, + "loss": 1.0108, + "step": 22325 + }, + { + "epoch": 1.7447306791569086, + "grad_norm": 5.599551677703857, + "learning_rate": 2.3245728163760952e-05, + "loss": 1.0232, + "step": 22350 + }, + { + "epoch": 1.7466822794691648, + "grad_norm": 3.7197461128234863, + "learning_rate": 2.320958741723769e-05, + "loss": 0.9743, + "step": 22375 + }, + { + "epoch": 1.748633879781421, + "grad_norm": 10.259845733642578, + "learning_rate": 2.3173446670714433e-05, + "loss": 1.1204, + "step": 22400 + }, + { + "epoch": 1.7505854800936769, + "grad_norm": 3.5381033420562744, + "learning_rate": 2.313730592419117e-05, + "loss": 0.9635, + "step": 22425 + }, + { + "epoch": 1.7525370804059328, + "grad_norm": 3.282846450805664, + "learning_rate": 2.310116517766791e-05, + "loss": 0.9936, + "step": 22450 + }, + { + "epoch": 1.7544886807181888, + "grad_norm": 9.679823875427246, + "learning_rate": 2.3065024431144648e-05, + "loss": 0.9971, + "step": 22475 + }, + { + "epoch": 1.756440281030445, + "grad_norm": 4.019118785858154, + "learning_rate": 2.3028883684621392e-05, + "loss": 1.0038, + "step": 22500 + }, + { + "epoch": 1.758391881342701, + "grad_norm": 4.094124794006348, + "learning_rate": 2.299274293809813e-05, + "loss": 0.9826, + "step": 22525 + }, + { + "epoch": 1.760343481654957, + "grad_norm": 12.850878715515137, + "learning_rate": 2.295660219157487e-05, + "loss": 0.9895, + "step": 22550 + }, + { + "epoch": 1.762295081967213, + "grad_norm": 2.8238022327423096, + "learning_rate": 2.292046144505161e-05, + "loss": 1.0852, + "step": 22575 + }, + { + "epoch": 1.7642466822794691, + "grad_norm": 8.482772827148438, + "learning_rate": 2.288432069852835e-05, + "loss": 0.8679, + "step": 22600 + }, + { + "epoch": 1.7661982825917253, + "grad_norm": 4.804849624633789, + "learning_rate": 2.2848179952005088e-05, + "loss": 0.8889, + "step": 22625 + }, + { + "epoch": 1.7681498829039812, + "grad_norm": 4.841886520385742, + "learning_rate": 2.281203920548183e-05, + "loss": 0.9715, + "step": 22650 + }, + { + "epoch": 1.7701014832162372, + "grad_norm": 4.325882911682129, + "learning_rate": 2.277589845895857e-05, + "loss": 1.0526, + "step": 22675 + }, + { + "epoch": 1.7720530835284933, + "grad_norm": 2.584425926208496, + "learning_rate": 2.273975771243531e-05, + "loss": 1.0832, + "step": 22700 + }, + { + "epoch": 1.7740046838407495, + "grad_norm": 8.191747665405273, + "learning_rate": 2.270361696591205e-05, + "loss": 0.935, + "step": 22725 + }, + { + "epoch": 1.7759562841530054, + "grad_norm": 2.62751841545105, + "learning_rate": 2.2667476219388788e-05, + "loss": 1.0942, + "step": 22750 + }, + { + "epoch": 1.7779078844652614, + "grad_norm": 5.567984580993652, + "learning_rate": 2.263133547286553e-05, + "loss": 0.9677, + "step": 22775 + }, + { + "epoch": 1.7798594847775175, + "grad_norm": 10.521990776062012, + "learning_rate": 2.259519472634227e-05, + "loss": 0.9444, + "step": 22800 + }, + { + "epoch": 1.7818110850897737, + "grad_norm": 4.54259729385376, + "learning_rate": 2.255905397981901e-05, + "loss": 1.0718, + "step": 22825 + }, + { + "epoch": 1.7837626854020296, + "grad_norm": 47.41788101196289, + "learning_rate": 2.2522913233295747e-05, + "loss": 1.0191, + "step": 22850 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 3.800628900527954, + "learning_rate": 2.2486772486772487e-05, + "loss": 0.9657, + "step": 22875 + }, + { + "epoch": 1.7876658860265418, + "grad_norm": 6.235889434814453, + "learning_rate": 2.2450631740249228e-05, + "loss": 1.153, + "step": 22900 + }, + { + "epoch": 1.789617486338798, + "grad_norm": 4.902355670928955, + "learning_rate": 2.241449099372597e-05, + "loss": 0.9707, + "step": 22925 + }, + { + "epoch": 1.7915690866510539, + "grad_norm": 3.86279559135437, + "learning_rate": 2.2378350247202706e-05, + "loss": 0.885, + "step": 22950 + }, + { + "epoch": 1.7935206869633098, + "grad_norm": 4.678164958953857, + "learning_rate": 2.234220950067945e-05, + "loss": 1.0981, + "step": 22975 + }, + { + "epoch": 1.795472287275566, + "grad_norm": 3.1500132083892822, + "learning_rate": 2.2306068754156187e-05, + "loss": 0.9029, + "step": 23000 + }, + { + "epoch": 1.7974238875878221, + "grad_norm": 8.401984214782715, + "learning_rate": 2.2269928007632927e-05, + "loss": 1.0803, + "step": 23025 + }, + { + "epoch": 1.799375487900078, + "grad_norm": 5.956867218017578, + "learning_rate": 2.2233787261109668e-05, + "loss": 0.9549, + "step": 23050 + }, + { + "epoch": 1.801327088212334, + "grad_norm": 9.116150856018066, + "learning_rate": 2.2197646514586405e-05, + "loss": 0.9831, + "step": 23075 + }, + { + "epoch": 1.8032786885245902, + "grad_norm": 6.088160514831543, + "learning_rate": 2.2161505768063146e-05, + "loss": 0.9992, + "step": 23100 + }, + { + "epoch": 1.8052302888368463, + "grad_norm": 3.873304843902588, + "learning_rate": 2.2125365021539886e-05, + "loss": 1.0571, + "step": 23125 + }, + { + "epoch": 1.8071818891491023, + "grad_norm": 3.630399227142334, + "learning_rate": 2.2089224275016627e-05, + "loss": 0.9468, + "step": 23150 + }, + { + "epoch": 1.8091334894613582, + "grad_norm": 9.430502891540527, + "learning_rate": 2.2053083528493364e-05, + "loss": 0.9173, + "step": 23175 + }, + { + "epoch": 1.8110850897736144, + "grad_norm": 8.494078636169434, + "learning_rate": 2.2016942781970104e-05, + "loss": 1.0227, + "step": 23200 + }, + { + "epoch": 1.8130366900858705, + "grad_norm": 8.242522239685059, + "learning_rate": 2.1980802035446845e-05, + "loss": 1.0416, + "step": 23225 + }, + { + "epoch": 1.8149882903981265, + "grad_norm": 12.202899932861328, + "learning_rate": 2.1944661288923586e-05, + "loss": 1.1328, + "step": 23250 + }, + { + "epoch": 1.8169398907103824, + "grad_norm": 37.985015869140625, + "learning_rate": 2.1908520542400323e-05, + "loss": 0.9005, + "step": 23275 + }, + { + "epoch": 1.8188914910226386, + "grad_norm": 14.483037948608398, + "learning_rate": 2.1872379795877067e-05, + "loss": 1.0088, + "step": 23300 + }, + { + "epoch": 1.8208430913348947, + "grad_norm": 4.188882827758789, + "learning_rate": 2.1836239049353804e-05, + "loss": 1.0375, + "step": 23325 + }, + { + "epoch": 1.8227946916471507, + "grad_norm": 21.65285301208496, + "learning_rate": 2.1800098302830544e-05, + "loss": 1.0126, + "step": 23350 + }, + { + "epoch": 1.8247462919594066, + "grad_norm": 8.228139877319336, + "learning_rate": 2.1763957556307285e-05, + "loss": 1.0398, + "step": 23375 + }, + { + "epoch": 1.8266978922716628, + "grad_norm": 6.503570079803467, + "learning_rate": 2.1727816809784026e-05, + "loss": 1.1157, + "step": 23400 + }, + { + "epoch": 1.828649492583919, + "grad_norm": 6.996328830718994, + "learning_rate": 2.1691676063260763e-05, + "loss": 0.9921, + "step": 23425 + }, + { + "epoch": 1.830601092896175, + "grad_norm": 72.69123077392578, + "learning_rate": 2.1655535316737503e-05, + "loss": 1.0419, + "step": 23450 + }, + { + "epoch": 1.8325526932084308, + "grad_norm": 20.86085319519043, + "learning_rate": 2.1619394570214244e-05, + "loss": 0.9514, + "step": 23475 + }, + { + "epoch": 1.834504293520687, + "grad_norm": 7.654865264892578, + "learning_rate": 2.158325382369098e-05, + "loss": 1.1413, + "step": 23500 + }, + { + "epoch": 1.8364558938329432, + "grad_norm": 3.606051445007324, + "learning_rate": 2.1547113077167725e-05, + "loss": 0.9255, + "step": 23525 + }, + { + "epoch": 1.838407494145199, + "grad_norm": 6.472594261169434, + "learning_rate": 2.1510972330644462e-05, + "loss": 0.9197, + "step": 23550 + }, + { + "epoch": 1.840359094457455, + "grad_norm": 6.906210899353027, + "learning_rate": 2.1474831584121203e-05, + "loss": 1.0964, + "step": 23575 + }, + { + "epoch": 1.8423106947697112, + "grad_norm": 7.558899402618408, + "learning_rate": 2.143869083759794e-05, + "loss": 1.0518, + "step": 23600 + }, + { + "epoch": 1.8442622950819674, + "grad_norm": 15.35103702545166, + "learning_rate": 2.1402550091074684e-05, + "loss": 1.0042, + "step": 23625 + }, + { + "epoch": 1.8462138953942233, + "grad_norm": 9.251070022583008, + "learning_rate": 2.136640934455142e-05, + "loss": 1.0123, + "step": 23650 + }, + { + "epoch": 1.8481654957064793, + "grad_norm": 13.137028694152832, + "learning_rate": 2.133026859802816e-05, + "loss": 0.9886, + "step": 23675 + }, + { + "epoch": 1.8501170960187352, + "grad_norm": 8.26199722290039, + "learning_rate": 2.1294127851504902e-05, + "loss": 1.057, + "step": 23700 + }, + { + "epoch": 1.8520686963309914, + "grad_norm": 6.371755123138428, + "learning_rate": 2.1257987104981643e-05, + "loss": 0.965, + "step": 23725 + }, + { + "epoch": 1.8540202966432475, + "grad_norm": 3.0446207523345947, + "learning_rate": 2.122184635845838e-05, + "loss": 1.0176, + "step": 23750 + }, + { + "epoch": 1.8559718969555035, + "grad_norm": 3.9427521228790283, + "learning_rate": 2.118570561193512e-05, + "loss": 1.1033, + "step": 23775 + }, + { + "epoch": 1.8579234972677594, + "grad_norm": 3.662386655807495, + "learning_rate": 2.114956486541186e-05, + "loss": 0.9452, + "step": 23800 + }, + { + "epoch": 1.8598750975800156, + "grad_norm": 10.343493461608887, + "learning_rate": 2.1113424118888602e-05, + "loss": 1.0255, + "step": 23825 + }, + { + "epoch": 1.8618266978922717, + "grad_norm": 5.656474590301514, + "learning_rate": 2.1077283372365342e-05, + "loss": 1.0743, + "step": 23850 + }, + { + "epoch": 1.8637782982045277, + "grad_norm": 5.307066917419434, + "learning_rate": 2.104114262584208e-05, + "loss": 1.0437, + "step": 23875 + }, + { + "epoch": 1.8657298985167836, + "grad_norm": 4.279754638671875, + "learning_rate": 2.100500187931882e-05, + "loss": 1.0523, + "step": 23900 + }, + { + "epoch": 1.8676814988290398, + "grad_norm": 3.5911779403686523, + "learning_rate": 2.096886113279556e-05, + "loss": 0.8772, + "step": 23925 + }, + { + "epoch": 1.869633099141296, + "grad_norm": 4.825263023376465, + "learning_rate": 2.09327203862723e-05, + "loss": 0.8774, + "step": 23950 + }, + { + "epoch": 1.8715846994535519, + "grad_norm": 4.953763484954834, + "learning_rate": 2.089657963974904e-05, + "loss": 0.9165, + "step": 23975 + }, + { + "epoch": 1.8735362997658078, + "grad_norm": 8.99000072479248, + "learning_rate": 2.086043889322578e-05, + "loss": 0.9741, + "step": 24000 + }, + { + "epoch": 1.875487900078064, + "grad_norm": 3.3210792541503906, + "learning_rate": 2.082429814670252e-05, + "loss": 1.0601, + "step": 24025 + }, + { + "epoch": 1.8774395003903201, + "grad_norm": 6.125851154327393, + "learning_rate": 2.078815740017926e-05, + "loss": 1.0938, + "step": 24050 + }, + { + "epoch": 1.879391100702576, + "grad_norm": 4.8660569190979, + "learning_rate": 2.0752016653655997e-05, + "loss": 0.9813, + "step": 24075 + }, + { + "epoch": 1.881342701014832, + "grad_norm": 5.24822998046875, + "learning_rate": 2.071587590713274e-05, + "loss": 0.908, + "step": 24100 + }, + { + "epoch": 1.8832943013270882, + "grad_norm": 4.919856548309326, + "learning_rate": 2.067973516060948e-05, + "loss": 1.1483, + "step": 24125 + }, + { + "epoch": 1.8852459016393444, + "grad_norm": 4.698136329650879, + "learning_rate": 2.064359441408622e-05, + "loss": 1.1088, + "step": 24150 + }, + { + "epoch": 1.8871975019516003, + "grad_norm": 5.378289222717285, + "learning_rate": 2.060745366756296e-05, + "loss": 1.0166, + "step": 24175 + }, + { + "epoch": 1.8891491022638562, + "grad_norm": 5.601995468139648, + "learning_rate": 2.0571312921039697e-05, + "loss": 0.9309, + "step": 24200 + }, + { + "epoch": 1.8911007025761124, + "grad_norm": 5.185031890869141, + "learning_rate": 2.0535172174516437e-05, + "loss": 0.9328, + "step": 24225 + }, + { + "epoch": 1.8930523028883686, + "grad_norm": 3.819873809814453, + "learning_rate": 2.0499031427993178e-05, + "loss": 1.1462, + "step": 24250 + }, + { + "epoch": 1.8950039032006245, + "grad_norm": 3.9771854877471924, + "learning_rate": 2.046289068146992e-05, + "loss": 0.9568, + "step": 24275 + }, + { + "epoch": 1.8969555035128804, + "grad_norm": 13.893989562988281, + "learning_rate": 2.0426749934946656e-05, + "loss": 1.0646, + "step": 24300 + }, + { + "epoch": 1.8989071038251366, + "grad_norm": 4.405734539031982, + "learning_rate": 2.0390609188423396e-05, + "loss": 1.0535, + "step": 24325 + }, + { + "epoch": 1.9008587041373928, + "grad_norm": 8.137755393981934, + "learning_rate": 2.0354468441900137e-05, + "loss": 0.9947, + "step": 24350 + }, + { + "epoch": 1.9028103044496487, + "grad_norm": 3.348464250564575, + "learning_rate": 2.0318327695376877e-05, + "loss": 0.98, + "step": 24375 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 5.1969733238220215, + "learning_rate": 2.0282186948853614e-05, + "loss": 0.9874, + "step": 24400 + }, + { + "epoch": 1.9067135050741608, + "grad_norm": 4.936718463897705, + "learning_rate": 2.024604620233036e-05, + "loss": 1.0316, + "step": 24425 + }, + { + "epoch": 1.908665105386417, + "grad_norm": 6.422952651977539, + "learning_rate": 2.0209905455807096e-05, + "loss": 1.0354, + "step": 24450 + }, + { + "epoch": 1.910616705698673, + "grad_norm": 6.350083827972412, + "learning_rate": 2.0173764709283836e-05, + "loss": 1.0285, + "step": 24475 + }, + { + "epoch": 1.9125683060109289, + "grad_norm": 3.9481732845306396, + "learning_rate": 2.0137623962760577e-05, + "loss": 0.9607, + "step": 24500 + }, + { + "epoch": 1.914519906323185, + "grad_norm": 4.927973747253418, + "learning_rate": 2.0101483216237317e-05, + "loss": 0.9261, + "step": 24525 + }, + { + "epoch": 1.9164715066354412, + "grad_norm": 5.495250225067139, + "learning_rate": 2.0065342469714054e-05, + "loss": 1.0127, + "step": 24550 + }, + { + "epoch": 1.9184231069476971, + "grad_norm": 2.946544885635376, + "learning_rate": 2.0029201723190795e-05, + "loss": 1.0471, + "step": 24575 + }, + { + "epoch": 1.920374707259953, + "grad_norm": 10.816059112548828, + "learning_rate": 1.9993060976667536e-05, + "loss": 1.0833, + "step": 24600 + }, + { + "epoch": 1.9223263075722092, + "grad_norm": 4.840624809265137, + "learning_rate": 1.9956920230144273e-05, + "loss": 1.0096, + "step": 24625 + }, + { + "epoch": 1.9242779078844654, + "grad_norm": 4.447267055511475, + "learning_rate": 1.9920779483621017e-05, + "loss": 1.006, + "step": 24650 + }, + { + "epoch": 1.9262295081967213, + "grad_norm": 6.60442590713501, + "learning_rate": 1.9884638737097754e-05, + "loss": 1.0072, + "step": 24675 + }, + { + "epoch": 1.9281811085089773, + "grad_norm": 5.035456657409668, + "learning_rate": 1.9848497990574494e-05, + "loss": 0.9091, + "step": 24700 + }, + { + "epoch": 1.9301327088212334, + "grad_norm": 5.859532833099365, + "learning_rate": 1.981235724405123e-05, + "loss": 0.9098, + "step": 24725 + }, + { + "epoch": 1.9320843091334896, + "grad_norm": 3.5434255599975586, + "learning_rate": 1.9776216497527976e-05, + "loss": 1.0425, + "step": 24750 + }, + { + "epoch": 1.9340359094457455, + "grad_norm": 4.631805419921875, + "learning_rate": 1.9740075751004713e-05, + "loss": 0.9839, + "step": 24775 + }, + { + "epoch": 1.9359875097580015, + "grad_norm": 4.86209774017334, + "learning_rate": 1.9703935004481453e-05, + "loss": 1.0382, + "step": 24800 + }, + { + "epoch": 1.9379391100702577, + "grad_norm": 6.131168365478516, + "learning_rate": 1.9667794257958194e-05, + "loss": 1.0773, + "step": 24825 + }, + { + "epoch": 1.9398907103825138, + "grad_norm": 7.604784965515137, + "learning_rate": 1.9631653511434935e-05, + "loss": 1.0322, + "step": 24850 + }, + { + "epoch": 1.9418423106947698, + "grad_norm": 3.792905569076538, + "learning_rate": 1.9595512764911672e-05, + "loss": 0.9464, + "step": 24875 + }, + { + "epoch": 1.9437939110070257, + "grad_norm": 5.633606433868408, + "learning_rate": 1.9559372018388412e-05, + "loss": 1.0914, + "step": 24900 + }, + { + "epoch": 1.9457455113192819, + "grad_norm": 7.202958583831787, + "learning_rate": 1.9523231271865153e-05, + "loss": 1.018, + "step": 24925 + }, + { + "epoch": 1.947697111631538, + "grad_norm": 2.7803268432617188, + "learning_rate": 1.9487090525341893e-05, + "loss": 0.9637, + "step": 24950 + }, + { + "epoch": 1.949648711943794, + "grad_norm": 3.412909984588623, + "learning_rate": 1.9450949778818634e-05, + "loss": 1.0248, + "step": 24975 + }, + { + "epoch": 1.95160031225605, + "grad_norm": 6.371620178222656, + "learning_rate": 1.941480903229537e-05, + "loss": 0.962, + "step": 25000 + }, + { + "epoch": 1.9535519125683058, + "grad_norm": 4.105161190032959, + "learning_rate": 1.9378668285772112e-05, + "loss": 0.9755, + "step": 25025 + }, + { + "epoch": 1.955503512880562, + "grad_norm": 4.338137149810791, + "learning_rate": 1.934252753924885e-05, + "loss": 0.9609, + "step": 25050 + }, + { + "epoch": 1.9574551131928182, + "grad_norm": 7.652178764343262, + "learning_rate": 1.9306386792725593e-05, + "loss": 1.1324, + "step": 25075 + }, + { + "epoch": 1.9594067135050741, + "grad_norm": 8.784337043762207, + "learning_rate": 1.927024604620233e-05, + "loss": 0.9207, + "step": 25100 + }, + { + "epoch": 1.96135831381733, + "grad_norm": 5.1917009353637695, + "learning_rate": 1.923410529967907e-05, + "loss": 1.0542, + "step": 25125 + }, + { + "epoch": 1.9633099141295862, + "grad_norm": 5.293478965759277, + "learning_rate": 1.919796455315581e-05, + "loss": 0.9375, + "step": 25150 + }, + { + "epoch": 1.9652615144418424, + "grad_norm": 2.8206756114959717, + "learning_rate": 1.9161823806632552e-05, + "loss": 1.1426, + "step": 25175 + }, + { + "epoch": 1.9672131147540983, + "grad_norm": 4.421519756317139, + "learning_rate": 1.912568306010929e-05, + "loss": 1.1107, + "step": 25200 + }, + { + "epoch": 1.9691647150663543, + "grad_norm": 7.0969319343566895, + "learning_rate": 1.9089542313586033e-05, + "loss": 0.9653, + "step": 25225 + }, + { + "epoch": 1.9711163153786104, + "grad_norm": 5.541957855224609, + "learning_rate": 1.905340156706277e-05, + "loss": 0.9419, + "step": 25250 + }, + { + "epoch": 1.9730679156908666, + "grad_norm": 8.783896446228027, + "learning_rate": 1.901726082053951e-05, + "loss": 0.915, + "step": 25275 + }, + { + "epoch": 1.9750195160031225, + "grad_norm": 4.210341453552246, + "learning_rate": 1.898112007401625e-05, + "loss": 0.9373, + "step": 25300 + }, + { + "epoch": 1.9769711163153785, + "grad_norm": 3.7495317459106445, + "learning_rate": 1.894497932749299e-05, + "loss": 1.0233, + "step": 25325 + }, + { + "epoch": 1.9789227166276346, + "grad_norm": 5.08139181137085, + "learning_rate": 1.890883858096973e-05, + "loss": 0.9092, + "step": 25350 + }, + { + "epoch": 1.9808743169398908, + "grad_norm": 4.388249397277832, + "learning_rate": 1.887269783444647e-05, + "loss": 1.037, + "step": 25375 + }, + { + "epoch": 1.9828259172521467, + "grad_norm": 7.213656425476074, + "learning_rate": 1.883655708792321e-05, + "loss": 1.0053, + "step": 25400 + }, + { + "epoch": 1.9847775175644027, + "grad_norm": 2.1058850288391113, + "learning_rate": 1.8800416341399947e-05, + "loss": 0.9775, + "step": 25425 + }, + { + "epoch": 1.9867291178766588, + "grad_norm": 8.376587867736816, + "learning_rate": 1.8764275594876688e-05, + "loss": 0.9299, + "step": 25450 + }, + { + "epoch": 1.988680718188915, + "grad_norm": 5.2815375328063965, + "learning_rate": 1.872813484835343e-05, + "loss": 1.0261, + "step": 25475 + }, + { + "epoch": 1.990632318501171, + "grad_norm": 10.438718795776367, + "learning_rate": 1.869199410183017e-05, + "loss": 1.0528, + "step": 25500 + }, + { + "epoch": 1.992583918813427, + "grad_norm": 3.0297138690948486, + "learning_rate": 1.8655853355306906e-05, + "loss": 1.0247, + "step": 25525 + }, + { + "epoch": 1.994535519125683, + "grad_norm": 3.8890178203582764, + "learning_rate": 1.861971260878365e-05, + "loss": 0.9809, + "step": 25550 + }, + { + "epoch": 1.9964871194379392, + "grad_norm": 4.96302604675293, + "learning_rate": 1.8583571862260387e-05, + "loss": 0.9142, + "step": 25575 + }, + { + "epoch": 1.9984387197501952, + "grad_norm": 5.465353488922119, + "learning_rate": 1.8547431115737128e-05, + "loss": 1.0277, + "step": 25600 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.5985558157689306, + "eval_f1_macro": 0.5354175326862894, + "eval_f1_micro": 0.5985558157689306, + "eval_f1_weighted": 0.5876595510316817, + "eval_loss": 1.0056464672088623, + "eval_precision_macro": 0.6223970750630463, + "eval_precision_micro": 0.5985558157689306, + "eval_precision_weighted": 0.626917525865449, + "eval_recall_macro": 0.5122851891872513, + "eval_recall_micro": 0.5985558157689306, + "eval_recall_weighted": 0.5985558157689306, + "eval_runtime": 4534.8831, + "eval_samples_per_second": 5.65, + "eval_steps_per_second": 0.353, + "step": 25620 + }, + { + "epoch": 2.000390320062451, + "grad_norm": 6.886026382446289, + "learning_rate": 1.851129036921387e-05, + "loss": 0.9663, + "step": 25625 + }, + { + "epoch": 2.002341920374707, + "grad_norm": 4.928319454193115, + "learning_rate": 1.847514962269061e-05, + "loss": 0.9497, + "step": 25650 + }, + { + "epoch": 2.0042935206869634, + "grad_norm": 9.667476654052734, + "learning_rate": 1.8439008876167346e-05, + "loss": 0.9615, + "step": 25675 + }, + { + "epoch": 2.0062451209992194, + "grad_norm": 3.7652931213378906, + "learning_rate": 1.8402868129644087e-05, + "loss": 1.0235, + "step": 25700 + }, + { + "epoch": 2.0081967213114753, + "grad_norm": 7.290821552276611, + "learning_rate": 1.8366727383120827e-05, + "loss": 1.023, + "step": 25725 + }, + { + "epoch": 2.0101483216237312, + "grad_norm": 5.406303405761719, + "learning_rate": 1.8330586636597564e-05, + "loss": 0.9168, + "step": 25750 + }, + { + "epoch": 2.0120999219359876, + "grad_norm": 5.609794616699219, + "learning_rate": 1.8294445890074305e-05, + "loss": 1.1885, + "step": 25775 + }, + { + "epoch": 2.0140515222482436, + "grad_norm": 5.354650497436523, + "learning_rate": 1.8258305143551046e-05, + "loss": 1.0624, + "step": 25800 + }, + { + "epoch": 2.0160031225604995, + "grad_norm": 3.4259674549102783, + "learning_rate": 1.8222164397027786e-05, + "loss": 0.996, + "step": 25825 + }, + { + "epoch": 2.0179547228727555, + "grad_norm": 7.254239082336426, + "learning_rate": 1.8186023650504523e-05, + "loss": 1.0123, + "step": 25850 + }, + { + "epoch": 2.019906323185012, + "grad_norm": 2.9830498695373535, + "learning_rate": 1.8149882903981267e-05, + "loss": 0.9291, + "step": 25875 + }, + { + "epoch": 2.021857923497268, + "grad_norm": 2.9003350734710693, + "learning_rate": 1.8113742157458005e-05, + "loss": 0.9475, + "step": 25900 + }, + { + "epoch": 2.0238095238095237, + "grad_norm": 4.742582321166992, + "learning_rate": 1.8077601410934745e-05, + "loss": 0.9512, + "step": 25925 + }, + { + "epoch": 2.0257611241217797, + "grad_norm": 5.183776378631592, + "learning_rate": 1.8041460664411486e-05, + "loss": 0.9392, + "step": 25950 + }, + { + "epoch": 2.027712724434036, + "grad_norm": 4.273641586303711, + "learning_rate": 1.8005319917888226e-05, + "loss": 1.007, + "step": 25975 + }, + { + "epoch": 2.029664324746292, + "grad_norm": 5.716911792755127, + "learning_rate": 1.7969179171364963e-05, + "loss": 1.1262, + "step": 26000 + }, + { + "epoch": 2.031615925058548, + "grad_norm": 8.178544998168945, + "learning_rate": 1.7933038424841704e-05, + "loss": 0.9109, + "step": 26025 + }, + { + "epoch": 2.033567525370804, + "grad_norm": 2.71571946144104, + "learning_rate": 1.7896897678318445e-05, + "loss": 0.8558, + "step": 26050 + }, + { + "epoch": 2.0355191256830603, + "grad_norm": 5.6638922691345215, + "learning_rate": 1.7860756931795182e-05, + "loss": 1.032, + "step": 26075 + }, + { + "epoch": 2.037470725995316, + "grad_norm": 9.162562370300293, + "learning_rate": 1.7824616185271926e-05, + "loss": 0.8992, + "step": 26100 + }, + { + "epoch": 2.039422326307572, + "grad_norm": 5.578387260437012, + "learning_rate": 1.7788475438748663e-05, + "loss": 0.8911, + "step": 26125 + }, + { + "epoch": 2.041373926619828, + "grad_norm": 8.655416488647461, + "learning_rate": 1.7752334692225403e-05, + "loss": 0.9516, + "step": 26150 + }, + { + "epoch": 2.0433255269320845, + "grad_norm": 4.054612159729004, + "learning_rate": 1.771619394570214e-05, + "loss": 1.0967, + "step": 26175 + }, + { + "epoch": 2.0452771272443404, + "grad_norm": 4.838535785675049, + "learning_rate": 1.7680053199178885e-05, + "loss": 0.9491, + "step": 26200 + }, + { + "epoch": 2.0472287275565964, + "grad_norm": 5.885175704956055, + "learning_rate": 1.7643912452655622e-05, + "loss": 0.9534, + "step": 26225 + }, + { + "epoch": 2.0491803278688523, + "grad_norm": 2.4291462898254395, + "learning_rate": 1.7607771706132362e-05, + "loss": 0.8206, + "step": 26250 + }, + { + "epoch": 2.0511319281811087, + "grad_norm": 11.120742797851562, + "learning_rate": 1.7571630959609103e-05, + "loss": 1.0349, + "step": 26275 + }, + { + "epoch": 2.0530835284933646, + "grad_norm": 3.825836420059204, + "learning_rate": 1.7535490213085843e-05, + "loss": 1.0561, + "step": 26300 + }, + { + "epoch": 2.0550351288056206, + "grad_norm": 7.3542680740356445, + "learning_rate": 1.749934946656258e-05, + "loss": 0.9658, + "step": 26325 + }, + { + "epoch": 2.0569867291178765, + "grad_norm": 8.210222244262695, + "learning_rate": 1.7463208720039325e-05, + "loss": 1.1254, + "step": 26350 + }, + { + "epoch": 2.058938329430133, + "grad_norm": 5.820336818695068, + "learning_rate": 1.7427067973516062e-05, + "loss": 1.0567, + "step": 26375 + }, + { + "epoch": 2.060889929742389, + "grad_norm": 5.608272075653076, + "learning_rate": 1.7390927226992802e-05, + "loss": 0.9408, + "step": 26400 + }, + { + "epoch": 2.0628415300546448, + "grad_norm": 17.765472412109375, + "learning_rate": 1.7354786480469543e-05, + "loss": 0.981, + "step": 26425 + }, + { + "epoch": 2.0647931303669007, + "grad_norm": 2.218811511993408, + "learning_rate": 1.731864573394628e-05, + "loss": 0.9093, + "step": 26450 + }, + { + "epoch": 2.066744730679157, + "grad_norm": 18.186668395996094, + "learning_rate": 1.728250498742302e-05, + "loss": 0.9166, + "step": 26475 + }, + { + "epoch": 2.068696330991413, + "grad_norm": 3.8229007720947266, + "learning_rate": 1.724636424089976e-05, + "loss": 1.0209, + "step": 26500 + }, + { + "epoch": 2.070647931303669, + "grad_norm": 2.42405104637146, + "learning_rate": 1.7210223494376502e-05, + "loss": 0.892, + "step": 26525 + }, + { + "epoch": 2.072599531615925, + "grad_norm": 5.344985485076904, + "learning_rate": 1.717408274785324e-05, + "loss": 0.8683, + "step": 26550 + }, + { + "epoch": 2.0745511319281813, + "grad_norm": 4.9546284675598145, + "learning_rate": 1.713794200132998e-05, + "loss": 0.9188, + "step": 26575 + }, + { + "epoch": 2.0765027322404372, + "grad_norm": 4.535947799682617, + "learning_rate": 1.710180125480672e-05, + "loss": 1.0352, + "step": 26600 + }, + { + "epoch": 2.078454332552693, + "grad_norm": 8.590414047241211, + "learning_rate": 1.706566050828346e-05, + "loss": 1.0745, + "step": 26625 + }, + { + "epoch": 2.080405932864949, + "grad_norm": 6.805130958557129, + "learning_rate": 1.7029519761760198e-05, + "loss": 1.0032, + "step": 26650 + }, + { + "epoch": 2.0823575331772055, + "grad_norm": 14.587897300720215, + "learning_rate": 1.6993379015236942e-05, + "loss": 0.9756, + "step": 26675 + }, + { + "epoch": 2.0843091334894615, + "grad_norm": 5.300734043121338, + "learning_rate": 1.695723826871368e-05, + "loss": 1.0155, + "step": 26700 + }, + { + "epoch": 2.0862607338017174, + "grad_norm": 3.7397916316986084, + "learning_rate": 1.692109752219042e-05, + "loss": 1.0173, + "step": 26725 + }, + { + "epoch": 2.0882123341139733, + "grad_norm": 8.332179069519043, + "learning_rate": 1.688495677566716e-05, + "loss": 0.9054, + "step": 26750 + }, + { + "epoch": 2.0901639344262297, + "grad_norm": 3.7235991954803467, + "learning_rate": 1.68488160291439e-05, + "loss": 1.0281, + "step": 26775 + }, + { + "epoch": 2.0921155347384857, + "grad_norm": 6.858097553253174, + "learning_rate": 1.6812675282620638e-05, + "loss": 1.0125, + "step": 26800 + }, + { + "epoch": 2.0940671350507416, + "grad_norm": 7.646819114685059, + "learning_rate": 1.677653453609738e-05, + "loss": 1.0365, + "step": 26825 + }, + { + "epoch": 2.0960187353629975, + "grad_norm": 4.471475124359131, + "learning_rate": 1.674039378957412e-05, + "loss": 0.8459, + "step": 26850 + }, + { + "epoch": 2.097970335675254, + "grad_norm": 11.729499816894531, + "learning_rate": 1.6704253043050856e-05, + "loss": 1.0393, + "step": 26875 + }, + { + "epoch": 2.09992193598751, + "grad_norm": 4.221553325653076, + "learning_rate": 1.6668112296527597e-05, + "loss": 0.8533, + "step": 26900 + }, + { + "epoch": 2.101873536299766, + "grad_norm": 5.375683307647705, + "learning_rate": 1.6631971550004337e-05, + "loss": 1.0006, + "step": 26925 + }, + { + "epoch": 2.1038251366120218, + "grad_norm": 3.8575401306152344, + "learning_rate": 1.6595830803481078e-05, + "loss": 1.0334, + "step": 26950 + }, + { + "epoch": 2.1057767369242777, + "grad_norm": 4.648269176483154, + "learning_rate": 1.6559690056957815e-05, + "loss": 0.9688, + "step": 26975 + }, + { + "epoch": 2.107728337236534, + "grad_norm": 7.331996440887451, + "learning_rate": 1.652354931043456e-05, + "loss": 1.0404, + "step": 27000 + }, + { + "epoch": 2.10967993754879, + "grad_norm": 12.124656677246094, + "learning_rate": 1.6487408563911296e-05, + "loss": 0.9199, + "step": 27025 + }, + { + "epoch": 2.111631537861046, + "grad_norm": 6.5369038581848145, + "learning_rate": 1.6451267817388037e-05, + "loss": 0.9601, + "step": 27050 + }, + { + "epoch": 2.113583138173302, + "grad_norm": 3.3152198791503906, + "learning_rate": 1.6415127070864777e-05, + "loss": 0.997, + "step": 27075 + }, + { + "epoch": 2.1155347384855583, + "grad_norm": 5.888739585876465, + "learning_rate": 1.6378986324341518e-05, + "loss": 0.9409, + "step": 27100 + }, + { + "epoch": 2.1174863387978142, + "grad_norm": 9.535857200622559, + "learning_rate": 1.6342845577818255e-05, + "loss": 0.9893, + "step": 27125 + }, + { + "epoch": 2.11943793911007, + "grad_norm": 6.455988883972168, + "learning_rate": 1.6306704831294996e-05, + "loss": 0.9753, + "step": 27150 + }, + { + "epoch": 2.121389539422326, + "grad_norm": 3.4995737075805664, + "learning_rate": 1.6270564084771736e-05, + "loss": 0.9336, + "step": 27175 + }, + { + "epoch": 2.1233411397345825, + "grad_norm": 9.277902603149414, + "learning_rate": 1.6234423338248473e-05, + "loss": 0.9604, + "step": 27200 + }, + { + "epoch": 2.1252927400468384, + "grad_norm": 5.127810478210449, + "learning_rate": 1.6198282591725217e-05, + "loss": 0.8887, + "step": 27225 + }, + { + "epoch": 2.1272443403590944, + "grad_norm": 5.327418804168701, + "learning_rate": 1.6162141845201955e-05, + "loss": 0.9587, + "step": 27250 + }, + { + "epoch": 2.1291959406713503, + "grad_norm": 11.63198184967041, + "learning_rate": 1.6126001098678695e-05, + "loss": 0.9409, + "step": 27275 + }, + { + "epoch": 2.1311475409836067, + "grad_norm": 4.527537822723389, + "learning_rate": 1.6089860352155432e-05, + "loss": 0.8827, + "step": 27300 + }, + { + "epoch": 2.1330991412958626, + "grad_norm": 5.738367557525635, + "learning_rate": 1.6053719605632176e-05, + "loss": 1.0558, + "step": 27325 + }, + { + "epoch": 2.1350507416081186, + "grad_norm": 3.6807265281677246, + "learning_rate": 1.6017578859108913e-05, + "loss": 0.985, + "step": 27350 + }, + { + "epoch": 2.1370023419203745, + "grad_norm": 6.1871256828308105, + "learning_rate": 1.5981438112585654e-05, + "loss": 1.0088, + "step": 27375 + }, + { + "epoch": 2.138953942232631, + "grad_norm": 7.7258782386779785, + "learning_rate": 1.5945297366062395e-05, + "loss": 1.0636, + "step": 27400 + }, + { + "epoch": 2.140905542544887, + "grad_norm": 8.524324417114258, + "learning_rate": 1.5909156619539135e-05, + "loss": 1.114, + "step": 27425 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 9.18470573425293, + "learning_rate": 1.5873015873015872e-05, + "loss": 0.9782, + "step": 27450 + }, + { + "epoch": 2.1448087431693987, + "grad_norm": 4.502004623413086, + "learning_rate": 1.5836875126492616e-05, + "loss": 0.9339, + "step": 27475 + }, + { + "epoch": 2.146760343481655, + "grad_norm": 12.300610542297363, + "learning_rate": 1.5800734379969353e-05, + "loss": 0.9906, + "step": 27500 + }, + { + "epoch": 2.148711943793911, + "grad_norm": 1.815293788909912, + "learning_rate": 1.5764593633446094e-05, + "loss": 1.0443, + "step": 27525 + }, + { + "epoch": 2.150663544106167, + "grad_norm": 9.135332107543945, + "learning_rate": 1.5728452886922835e-05, + "loss": 1.0616, + "step": 27550 + }, + { + "epoch": 2.152615144418423, + "grad_norm": 4.01249885559082, + "learning_rate": 1.5692312140399572e-05, + "loss": 0.9078, + "step": 27575 + }, + { + "epoch": 2.1545667447306793, + "grad_norm": 6.635018825531006, + "learning_rate": 1.5656171393876312e-05, + "loss": 1.0852, + "step": 27600 + }, + { + "epoch": 2.1565183450429353, + "grad_norm": 8.71415901184082, + "learning_rate": 1.562003064735305e-05, + "loss": 0.9838, + "step": 27625 + }, + { + "epoch": 2.158469945355191, + "grad_norm": 6.9681291580200195, + "learning_rate": 1.5583889900829793e-05, + "loss": 0.9293, + "step": 27650 + }, + { + "epoch": 2.160421545667447, + "grad_norm": 4.839693546295166, + "learning_rate": 1.554774915430653e-05, + "loss": 0.9648, + "step": 27675 + }, + { + "epoch": 2.1623731459797035, + "grad_norm": 7.137564659118652, + "learning_rate": 1.551160840778327e-05, + "loss": 0.9498, + "step": 27700 + }, + { + "epoch": 2.1643247462919595, + "grad_norm": 5.481379508972168, + "learning_rate": 1.5475467661260012e-05, + "loss": 1.0909, + "step": 27725 + }, + { + "epoch": 2.1662763466042154, + "grad_norm": 8.150879859924316, + "learning_rate": 1.5439326914736752e-05, + "loss": 1.0039, + "step": 27750 + }, + { + "epoch": 2.1682279469164714, + "grad_norm": 4.08789587020874, + "learning_rate": 1.540318616821349e-05, + "loss": 0.8723, + "step": 27775 + }, + { + "epoch": 2.1701795472287277, + "grad_norm": 7.489070892333984, + "learning_rate": 1.5367045421690234e-05, + "loss": 1.0127, + "step": 27800 + }, + { + "epoch": 2.1721311475409837, + "grad_norm": 13.971680641174316, + "learning_rate": 1.533090467516697e-05, + "loss": 0.9522, + "step": 27825 + }, + { + "epoch": 2.1740827478532396, + "grad_norm": 2.381237030029297, + "learning_rate": 1.529476392864371e-05, + "loss": 1.1431, + "step": 27850 + }, + { + "epoch": 2.1760343481654956, + "grad_norm": 14.066901206970215, + "learning_rate": 1.5258623182120452e-05, + "loss": 0.9485, + "step": 27875 + }, + { + "epoch": 2.177985948477752, + "grad_norm": 4.859488010406494, + "learning_rate": 1.522248243559719e-05, + "loss": 0.9191, + "step": 27900 + }, + { + "epoch": 2.179937548790008, + "grad_norm": 2.7918882369995117, + "learning_rate": 1.518634168907393e-05, + "loss": 0.9237, + "step": 27925 + }, + { + "epoch": 2.181889149102264, + "grad_norm": 9.140660285949707, + "learning_rate": 1.5150200942550672e-05, + "loss": 0.8926, + "step": 27950 + }, + { + "epoch": 2.1838407494145198, + "grad_norm": 3.5810208320617676, + "learning_rate": 1.511406019602741e-05, + "loss": 0.8827, + "step": 27975 + }, + { + "epoch": 2.185792349726776, + "grad_norm": 3.3466479778289795, + "learning_rate": 1.507791944950415e-05, + "loss": 0.8498, + "step": 28000 + }, + { + "epoch": 2.187743950039032, + "grad_norm": 5.325255393981934, + "learning_rate": 1.5041778702980888e-05, + "loss": 0.8802, + "step": 28025 + }, + { + "epoch": 2.189695550351288, + "grad_norm": 5.787388324737549, + "learning_rate": 1.500563795645763e-05, + "loss": 0.8814, + "step": 28050 + }, + { + "epoch": 2.191647150663544, + "grad_norm": 6.591071605682373, + "learning_rate": 1.496949720993437e-05, + "loss": 1.08, + "step": 28075 + }, + { + "epoch": 2.1935987509758004, + "grad_norm": 7.689511775970459, + "learning_rate": 1.4933356463411108e-05, + "loss": 1.0652, + "step": 28100 + }, + { + "epoch": 2.1955503512880563, + "grad_norm": 17.948575973510742, + "learning_rate": 1.4897215716887849e-05, + "loss": 0.928, + "step": 28125 + }, + { + "epoch": 2.1975019516003123, + "grad_norm": 12.959871292114258, + "learning_rate": 1.4861074970364588e-05, + "loss": 0.8551, + "step": 28150 + }, + { + "epoch": 2.199453551912568, + "grad_norm": 4.313554763793945, + "learning_rate": 1.4824934223841327e-05, + "loss": 1.0335, + "step": 28175 + }, + { + "epoch": 2.201405152224824, + "grad_norm": 6.400911331176758, + "learning_rate": 1.4788793477318069e-05, + "loss": 0.8744, + "step": 28200 + }, + { + "epoch": 2.2033567525370805, + "grad_norm": 3.2288403511047363, + "learning_rate": 1.4752652730794808e-05, + "loss": 1.1332, + "step": 28225 + }, + { + "epoch": 2.2053083528493365, + "grad_norm": 5.799706935882568, + "learning_rate": 1.4716511984271547e-05, + "loss": 0.9855, + "step": 28250 + }, + { + "epoch": 2.2072599531615924, + "grad_norm": 5.40717077255249, + "learning_rate": 1.4680371237748289e-05, + "loss": 0.8699, + "step": 28275 + }, + { + "epoch": 2.209211553473849, + "grad_norm": 7.853477954864502, + "learning_rate": 1.4644230491225028e-05, + "loss": 0.9791, + "step": 28300 + }, + { + "epoch": 2.2111631537861047, + "grad_norm": 5.657312393188477, + "learning_rate": 1.4608089744701767e-05, + "loss": 0.8509, + "step": 28325 + }, + { + "epoch": 2.2131147540983607, + "grad_norm": 9.418416023254395, + "learning_rate": 1.4571948998178509e-05, + "loss": 0.9158, + "step": 28350 + }, + { + "epoch": 2.2150663544106166, + "grad_norm": 5.769794940948486, + "learning_rate": 1.4535808251655248e-05, + "loss": 1.0015, + "step": 28375 + }, + { + "epoch": 2.2170179547228726, + "grad_norm": 22.790889739990234, + "learning_rate": 1.4499667505131987e-05, + "loss": 1.0266, + "step": 28400 + }, + { + "epoch": 2.218969555035129, + "grad_norm": 13.210980415344238, + "learning_rate": 1.4463526758608726e-05, + "loss": 1.0748, + "step": 28425 + }, + { + "epoch": 2.220921155347385, + "grad_norm": 5.830112934112549, + "learning_rate": 1.4427386012085466e-05, + "loss": 0.9793, + "step": 28450 + }, + { + "epoch": 2.222872755659641, + "grad_norm": 9.9698486328125, + "learning_rate": 1.4391245265562205e-05, + "loss": 0.9507, + "step": 28475 + }, + { + "epoch": 2.2248243559718968, + "grad_norm": 4.813653469085693, + "learning_rate": 1.4355104519038946e-05, + "loss": 0.9596, + "step": 28500 + }, + { + "epoch": 2.226775956284153, + "grad_norm": 7.891148090362549, + "learning_rate": 1.4318963772515686e-05, + "loss": 0.9123, + "step": 28525 + }, + { + "epoch": 2.228727556596409, + "grad_norm": 13.834811210632324, + "learning_rate": 1.4282823025992425e-05, + "loss": 1.0876, + "step": 28550 + }, + { + "epoch": 2.230679156908665, + "grad_norm": 11.975841522216797, + "learning_rate": 1.4246682279469164e-05, + "loss": 1.0164, + "step": 28575 + }, + { + "epoch": 2.232630757220921, + "grad_norm": 3.770481824874878, + "learning_rate": 1.4210541532945906e-05, + "loss": 1.0294, + "step": 28600 + }, + { + "epoch": 2.2345823575331774, + "grad_norm": 10.288094520568848, + "learning_rate": 1.4174400786422645e-05, + "loss": 0.9309, + "step": 28625 + }, + { + "epoch": 2.2365339578454333, + "grad_norm": 3.638847827911377, + "learning_rate": 1.4138260039899384e-05, + "loss": 1.0171, + "step": 28650 + }, + { + "epoch": 2.2384855581576892, + "grad_norm": 4.831700801849365, + "learning_rate": 1.4102119293376126e-05, + "loss": 0.9902, + "step": 28675 + }, + { + "epoch": 2.240437158469945, + "grad_norm": 6.579839706420898, + "learning_rate": 1.4065978546852865e-05, + "loss": 0.9476, + "step": 28700 + }, + { + "epoch": 2.2423887587822016, + "grad_norm": 5.4303154945373535, + "learning_rate": 1.4029837800329604e-05, + "loss": 0.9595, + "step": 28725 + }, + { + "epoch": 2.2443403590944575, + "grad_norm": 7.9212327003479, + "learning_rate": 1.3993697053806343e-05, + "loss": 0.8997, + "step": 28750 + }, + { + "epoch": 2.2462919594067134, + "grad_norm": 4.725170612335205, + "learning_rate": 1.3957556307283085e-05, + "loss": 1.0729, + "step": 28775 + }, + { + "epoch": 2.2482435597189694, + "grad_norm": 4.587596416473389, + "learning_rate": 1.3921415560759824e-05, + "loss": 0.9872, + "step": 28800 + }, + { + "epoch": 2.2501951600312258, + "grad_norm": 1.9277433156967163, + "learning_rate": 1.3885274814236563e-05, + "loss": 0.9775, + "step": 28825 + }, + { + "epoch": 2.2521467603434817, + "grad_norm": 4.2252583503723145, + "learning_rate": 1.3849134067713304e-05, + "loss": 1.072, + "step": 28850 + }, + { + "epoch": 2.2540983606557377, + "grad_norm": 7.165510654449463, + "learning_rate": 1.3812993321190042e-05, + "loss": 0.922, + "step": 28875 + }, + { + "epoch": 2.2560499609679936, + "grad_norm": 7.793671131134033, + "learning_rate": 1.3776852574666781e-05, + "loss": 0.9162, + "step": 28900 + }, + { + "epoch": 2.25800156128025, + "grad_norm": 3.7899179458618164, + "learning_rate": 1.3740711828143524e-05, + "loss": 0.9813, + "step": 28925 + }, + { + "epoch": 2.259953161592506, + "grad_norm": 4.094254493713379, + "learning_rate": 1.3704571081620262e-05, + "loss": 1.0857, + "step": 28950 + }, + { + "epoch": 2.261904761904762, + "grad_norm": 3.2754712104797363, + "learning_rate": 1.3668430335097001e-05, + "loss": 0.9565, + "step": 28975 + }, + { + "epoch": 2.263856362217018, + "grad_norm": 5.620739459991455, + "learning_rate": 1.3632289588573744e-05, + "loss": 0.9926, + "step": 29000 + }, + { + "epoch": 2.265807962529274, + "grad_norm": 6.594982147216797, + "learning_rate": 1.3596148842050482e-05, + "loss": 0.9974, + "step": 29025 + }, + { + "epoch": 2.26775956284153, + "grad_norm": 5.886627197265625, + "learning_rate": 1.3560008095527221e-05, + "loss": 0.995, + "step": 29050 + }, + { + "epoch": 2.269711163153786, + "grad_norm": 14.084917068481445, + "learning_rate": 1.3523867349003964e-05, + "loss": 0.9279, + "step": 29075 + }, + { + "epoch": 2.271662763466042, + "grad_norm": 5.216859817504883, + "learning_rate": 1.3487726602480702e-05, + "loss": 0.9538, + "step": 29100 + }, + { + "epoch": 2.2736143637782984, + "grad_norm": 3.426664352416992, + "learning_rate": 1.3451585855957441e-05, + "loss": 0.9421, + "step": 29125 + }, + { + "epoch": 2.2755659640905543, + "grad_norm": 3.9018309116363525, + "learning_rate": 1.341544510943418e-05, + "loss": 0.8922, + "step": 29150 + }, + { + "epoch": 2.2775175644028103, + "grad_norm": 3.0979888439178467, + "learning_rate": 1.3379304362910922e-05, + "loss": 1.0274, + "step": 29175 + }, + { + "epoch": 2.279469164715066, + "grad_norm": 4.668726921081543, + "learning_rate": 1.3343163616387661e-05, + "loss": 1.0193, + "step": 29200 + }, + { + "epoch": 2.281420765027322, + "grad_norm": 4.666767120361328, + "learning_rate": 1.33070228698644e-05, + "loss": 0.9839, + "step": 29225 + }, + { + "epoch": 2.2833723653395785, + "grad_norm": 9.055400848388672, + "learning_rate": 1.327088212334114e-05, + "loss": 0.9663, + "step": 29250 + }, + { + "epoch": 2.2853239656518345, + "grad_norm": 4.293013095855713, + "learning_rate": 1.323474137681788e-05, + "loss": 1.0619, + "step": 29275 + }, + { + "epoch": 2.2872755659640904, + "grad_norm": 6.230489730834961, + "learning_rate": 1.3198600630294618e-05, + "loss": 0.978, + "step": 29300 + }, + { + "epoch": 2.289227166276347, + "grad_norm": 6.095199108123779, + "learning_rate": 1.316245988377136e-05, + "loss": 0.903, + "step": 29325 + }, + { + "epoch": 2.2911787665886028, + "grad_norm": 3.4371697902679443, + "learning_rate": 1.31263191372481e-05, + "loss": 1.0495, + "step": 29350 + }, + { + "epoch": 2.2931303669008587, + "grad_norm": 3.5021347999572754, + "learning_rate": 1.3090178390724838e-05, + "loss": 0.9786, + "step": 29375 + }, + { + "epoch": 2.2950819672131146, + "grad_norm": 11.055572509765625, + "learning_rate": 1.305403764420158e-05, + "loss": 0.9204, + "step": 29400 + }, + { + "epoch": 2.2970335675253706, + "grad_norm": 4.738006114959717, + "learning_rate": 1.301789689767832e-05, + "loss": 1.0253, + "step": 29425 + }, + { + "epoch": 2.298985167837627, + "grad_norm": 10.708647727966309, + "learning_rate": 1.2981756151155059e-05, + "loss": 0.9317, + "step": 29450 + }, + { + "epoch": 2.300936768149883, + "grad_norm": 11.313172340393066, + "learning_rate": 1.2945615404631797e-05, + "loss": 0.8813, + "step": 29475 + }, + { + "epoch": 2.302888368462139, + "grad_norm": 6.359426975250244, + "learning_rate": 1.290947465810854e-05, + "loss": 0.9423, + "step": 29500 + }, + { + "epoch": 2.3048399687743952, + "grad_norm": 5.531627655029297, + "learning_rate": 1.2873333911585279e-05, + "loss": 0.914, + "step": 29525 + }, + { + "epoch": 2.306791569086651, + "grad_norm": 7.141120433807373, + "learning_rate": 1.2837193165062017e-05, + "loss": 1.0675, + "step": 29550 + }, + { + "epoch": 2.308743169398907, + "grad_norm": 15.569938659667969, + "learning_rate": 1.2801052418538758e-05, + "loss": 1.0527, + "step": 29575 + }, + { + "epoch": 2.310694769711163, + "grad_norm": 4.769351005554199, + "learning_rate": 1.2764911672015497e-05, + "loss": 0.9639, + "step": 29600 + }, + { + "epoch": 2.312646370023419, + "grad_norm": 9.785005569458008, + "learning_rate": 1.2728770925492237e-05, + "loss": 0.9799, + "step": 29625 + }, + { + "epoch": 2.3145979703356754, + "grad_norm": 3.6823160648345947, + "learning_rate": 1.2692630178968978e-05, + "loss": 0.9842, + "step": 29650 + }, + { + "epoch": 2.3165495706479313, + "grad_norm": 16.532907485961914, + "learning_rate": 1.2656489432445717e-05, + "loss": 1.0613, + "step": 29675 + }, + { + "epoch": 2.3185011709601873, + "grad_norm": 6.940017223358154, + "learning_rate": 1.2620348685922456e-05, + "loss": 1.0841, + "step": 29700 + }, + { + "epoch": 2.3204527712724436, + "grad_norm": 4.19795036315918, + "learning_rate": 1.2584207939399198e-05, + "loss": 0.8864, + "step": 29725 + }, + { + "epoch": 2.3224043715846996, + "grad_norm": 4.07827615737915, + "learning_rate": 1.2548067192875937e-05, + "loss": 0.9291, + "step": 29750 + }, + { + "epoch": 2.3243559718969555, + "grad_norm": 8.857775688171387, + "learning_rate": 1.2511926446352676e-05, + "loss": 1.0052, + "step": 29775 + }, + { + "epoch": 2.3263075722092115, + "grad_norm": 7.242584228515625, + "learning_rate": 1.2475785699829416e-05, + "loss": 1.0809, + "step": 29800 + }, + { + "epoch": 2.3282591725214674, + "grad_norm": 5.207695484161377, + "learning_rate": 1.2439644953306157e-05, + "loss": 0.9654, + "step": 29825 + }, + { + "epoch": 2.330210772833724, + "grad_norm": 11.311796188354492, + "learning_rate": 1.2403504206782896e-05, + "loss": 1.067, + "step": 29850 + }, + { + "epoch": 2.3321623731459797, + "grad_norm": 6.6428914070129395, + "learning_rate": 1.2367363460259636e-05, + "loss": 0.9397, + "step": 29875 + }, + { + "epoch": 2.3341139734582357, + "grad_norm": 5.000761985778809, + "learning_rate": 1.2331222713736377e-05, + "loss": 0.9863, + "step": 29900 + }, + { + "epoch": 2.3360655737704916, + "grad_norm": 3.101931095123291, + "learning_rate": 1.2295081967213116e-05, + "loss": 0.9746, + "step": 29925 + }, + { + "epoch": 2.338017174082748, + "grad_norm": 6.155749320983887, + "learning_rate": 1.2258941220689855e-05, + "loss": 1.0357, + "step": 29950 + }, + { + "epoch": 2.339968774395004, + "grad_norm": 8.24863052368164, + "learning_rate": 1.2222800474166595e-05, + "loss": 0.9721, + "step": 29975 + }, + { + "epoch": 2.34192037470726, + "grad_norm": 6.979496955871582, + "learning_rate": 1.2186659727643334e-05, + "loss": 0.9762, + "step": 30000 + }, + { + "epoch": 2.343871975019516, + "grad_norm": 8.325671195983887, + "learning_rate": 1.2150518981120075e-05, + "loss": 1.0253, + "step": 30025 + }, + { + "epoch": 2.345823575331772, + "grad_norm": 14.883063316345215, + "learning_rate": 1.2114378234596814e-05, + "loss": 0.8882, + "step": 30050 + }, + { + "epoch": 2.347775175644028, + "grad_norm": 6.894105434417725, + "learning_rate": 1.2078237488073554e-05, + "loss": 0.8812, + "step": 30075 + }, + { + "epoch": 2.349726775956284, + "grad_norm": 9.40588665008545, + "learning_rate": 1.2042096741550295e-05, + "loss": 1.0739, + "step": 30100 + }, + { + "epoch": 2.35167837626854, + "grad_norm": 6.560412406921387, + "learning_rate": 1.2005955995027034e-05, + "loss": 1.0594, + "step": 30125 + }, + { + "epoch": 2.3536299765807964, + "grad_norm": 8.218924522399902, + "learning_rate": 1.1969815248503774e-05, + "loss": 0.9933, + "step": 30150 + }, + { + "epoch": 2.3555815768930524, + "grad_norm": 8.137438774108887, + "learning_rate": 1.1933674501980513e-05, + "loss": 1.0546, + "step": 30175 + }, + { + "epoch": 2.3575331772053083, + "grad_norm": 4.003744125366211, + "learning_rate": 1.1897533755457254e-05, + "loss": 0.994, + "step": 30200 + }, + { + "epoch": 2.3594847775175642, + "grad_norm": 5.952597141265869, + "learning_rate": 1.1861393008933994e-05, + "loss": 1.0256, + "step": 30225 + }, + { + "epoch": 2.3614363778298206, + "grad_norm": 7.141972541809082, + "learning_rate": 1.1825252262410733e-05, + "loss": 0.985, + "step": 30250 + }, + { + "epoch": 2.3633879781420766, + "grad_norm": 3.663198232650757, + "learning_rate": 1.1789111515887474e-05, + "loss": 0.9883, + "step": 30275 + }, + { + "epoch": 2.3653395784543325, + "grad_norm": 8.209881782531738, + "learning_rate": 1.1752970769364212e-05, + "loss": 0.9552, + "step": 30300 + }, + { + "epoch": 2.3672911787665885, + "grad_norm": 6.044576168060303, + "learning_rate": 1.1716830022840953e-05, + "loss": 1.0128, + "step": 30325 + }, + { + "epoch": 2.369242779078845, + "grad_norm": 3.9171862602233887, + "learning_rate": 1.1680689276317692e-05, + "loss": 0.9493, + "step": 30350 + }, + { + "epoch": 2.371194379391101, + "grad_norm": 4.479526519775391, + "learning_rate": 1.164454852979443e-05, + "loss": 0.9585, + "step": 30375 + }, + { + "epoch": 2.3731459797033567, + "grad_norm": 3.9303736686706543, + "learning_rate": 1.1608407783271171e-05, + "loss": 0.9097, + "step": 30400 + }, + { + "epoch": 2.3750975800156127, + "grad_norm": 8.05853271484375, + "learning_rate": 1.1572267036747912e-05, + "loss": 0.8702, + "step": 30425 + }, + { + "epoch": 2.3770491803278686, + "grad_norm": 6.113443374633789, + "learning_rate": 1.153612629022465e-05, + "loss": 0.9927, + "step": 30450 + }, + { + "epoch": 2.379000780640125, + "grad_norm": 7.406364917755127, + "learning_rate": 1.1499985543701391e-05, + "loss": 0.9762, + "step": 30475 + }, + { + "epoch": 2.380952380952381, + "grad_norm": 7.077301979064941, + "learning_rate": 1.146384479717813e-05, + "loss": 1.0396, + "step": 30500 + }, + { + "epoch": 2.382903981264637, + "grad_norm": 10.106149673461914, + "learning_rate": 1.142770405065487e-05, + "loss": 0.9816, + "step": 30525 + }, + { + "epoch": 2.3848555815768933, + "grad_norm": 5.740723609924316, + "learning_rate": 1.1391563304131611e-05, + "loss": 1.0582, + "step": 30550 + }, + { + "epoch": 2.386807181889149, + "grad_norm": 4.246044635772705, + "learning_rate": 1.135542255760835e-05, + "loss": 0.9745, + "step": 30575 + }, + { + "epoch": 2.388758782201405, + "grad_norm": 6.6865692138671875, + "learning_rate": 1.131928181108509e-05, + "loss": 0.9335, + "step": 30600 + }, + { + "epoch": 2.390710382513661, + "grad_norm": 11.087933540344238, + "learning_rate": 1.1283141064561831e-05, + "loss": 0.9265, + "step": 30625 + }, + { + "epoch": 2.392661982825917, + "grad_norm": 9.3740816116333, + "learning_rate": 1.124700031803857e-05, + "loss": 1.0571, + "step": 30650 + }, + { + "epoch": 2.3946135831381734, + "grad_norm": 3.9068801403045654, + "learning_rate": 1.121085957151531e-05, + "loss": 0.9522, + "step": 30675 + }, + { + "epoch": 2.3965651834504293, + "grad_norm": 4.672880172729492, + "learning_rate": 1.117471882499205e-05, + "loss": 0.8745, + "step": 30700 + }, + { + "epoch": 2.3985167837626853, + "grad_norm": 8.521465301513672, + "learning_rate": 1.1138578078468789e-05, + "loss": 1.0647, + "step": 30725 + }, + { + "epoch": 2.4004683840749417, + "grad_norm": 5.655125617980957, + "learning_rate": 1.1102437331945529e-05, + "loss": 1.0152, + "step": 30750 + }, + { + "epoch": 2.4024199843871976, + "grad_norm": 5.124433517456055, + "learning_rate": 1.1066296585422268e-05, + "loss": 1.0151, + "step": 30775 + }, + { + "epoch": 2.4043715846994536, + "grad_norm": 5.483908653259277, + "learning_rate": 1.1030155838899009e-05, + "loss": 0.9329, + "step": 30800 + }, + { + "epoch": 2.4063231850117095, + "grad_norm": 6.874320030212402, + "learning_rate": 1.0994015092375749e-05, + "loss": 1.0595, + "step": 30825 + }, + { + "epoch": 2.4082747853239654, + "grad_norm": 11.788403511047363, + "learning_rate": 1.0957874345852488e-05, + "loss": 0.9025, + "step": 30850 + }, + { + "epoch": 2.410226385636222, + "grad_norm": 6.14674711227417, + "learning_rate": 1.0921733599329229e-05, + "loss": 0.9473, + "step": 30875 + }, + { + "epoch": 2.4121779859484778, + "grad_norm": 4.083067893981934, + "learning_rate": 1.0885592852805967e-05, + "loss": 0.8882, + "step": 30900 + }, + { + "epoch": 2.4141295862607337, + "grad_norm": 5.767749309539795, + "learning_rate": 1.0849452106282708e-05, + "loss": 0.9651, + "step": 30925 + }, + { + "epoch": 2.41608118657299, + "grad_norm": 8.176101684570312, + "learning_rate": 1.0813311359759449e-05, + "loss": 0.9832, + "step": 30950 + }, + { + "epoch": 2.418032786885246, + "grad_norm": 7.644573211669922, + "learning_rate": 1.0777170613236187e-05, + "loss": 1.0428, + "step": 30975 + }, + { + "epoch": 2.419984387197502, + "grad_norm": 6.1448750495910645, + "learning_rate": 1.0741029866712928e-05, + "loss": 0.9745, + "step": 31000 + }, + { + "epoch": 2.421935987509758, + "grad_norm": 9.077276229858398, + "learning_rate": 1.0704889120189669e-05, + "loss": 0.8729, + "step": 31025 + }, + { + "epoch": 2.423887587822014, + "grad_norm": 6.631700038909912, + "learning_rate": 1.0668748373666407e-05, + "loss": 1.0791, + "step": 31050 + }, + { + "epoch": 2.4258391881342702, + "grad_norm": 6.121955871582031, + "learning_rate": 1.0632607627143146e-05, + "loss": 0.9411, + "step": 31075 + }, + { + "epoch": 2.427790788446526, + "grad_norm": 3.8228302001953125, + "learning_rate": 1.0596466880619887e-05, + "loss": 0.914, + "step": 31100 + }, + { + "epoch": 2.429742388758782, + "grad_norm": 6.348528861999512, + "learning_rate": 1.0560326134096626e-05, + "loss": 0.9257, + "step": 31125 + }, + { + "epoch": 2.431693989071038, + "grad_norm": 5.243332386016846, + "learning_rate": 1.0524185387573366e-05, + "loss": 1.0549, + "step": 31150 + }, + { + "epoch": 2.4336455893832944, + "grad_norm": 7.257528305053711, + "learning_rate": 1.0488044641050105e-05, + "loss": 0.9365, + "step": 31175 + }, + { + "epoch": 2.4355971896955504, + "grad_norm": 6.477630138397217, + "learning_rate": 1.0451903894526846e-05, + "loss": 1.0519, + "step": 31200 + }, + { + "epoch": 2.4375487900078063, + "grad_norm": 5.286805629730225, + "learning_rate": 1.0415763148003585e-05, + "loss": 0.9573, + "step": 31225 + }, + { + "epoch": 2.4395003903200623, + "grad_norm": 8.109827041625977, + "learning_rate": 1.0379622401480325e-05, + "loss": 0.887, + "step": 31250 + }, + { + "epoch": 2.4414519906323187, + "grad_norm": 5.713425636291504, + "learning_rate": 1.0343481654957066e-05, + "loss": 0.9673, + "step": 31275 + }, + { + "epoch": 2.4434035909445746, + "grad_norm": 5.0396857261657715, + "learning_rate": 1.0307340908433805e-05, + "loss": 0.9674, + "step": 31300 + }, + { + "epoch": 2.4453551912568305, + "grad_norm": 11.901044845581055, + "learning_rate": 1.0271200161910545e-05, + "loss": 0.9042, + "step": 31325 + }, + { + "epoch": 2.4473067915690865, + "grad_norm": 30.320228576660156, + "learning_rate": 1.0235059415387286e-05, + "loss": 0.982, + "step": 31350 + }, + { + "epoch": 2.449258391881343, + "grad_norm": 6.0906476974487305, + "learning_rate": 1.0198918668864025e-05, + "loss": 0.9918, + "step": 31375 + }, + { + "epoch": 2.451209992193599, + "grad_norm": 8.826563835144043, + "learning_rate": 1.0162777922340765e-05, + "loss": 1.0469, + "step": 31400 + }, + { + "epoch": 2.4531615925058547, + "grad_norm": 5.827922344207764, + "learning_rate": 1.0126637175817504e-05, + "loss": 0.9668, + "step": 31425 + }, + { + "epoch": 2.4551131928181107, + "grad_norm": 9.078672409057617, + "learning_rate": 1.0090496429294245e-05, + "loss": 0.9404, + "step": 31450 + }, + { + "epoch": 2.457064793130367, + "grad_norm": 19.40699005126953, + "learning_rate": 1.0054355682770984e-05, + "loss": 0.9356, + "step": 31475 + }, + { + "epoch": 2.459016393442623, + "grad_norm": 5.999863147735596, + "learning_rate": 1.0018214936247722e-05, + "loss": 0.925, + "step": 31500 + }, + { + "epoch": 2.460967993754879, + "grad_norm": 8.135833740234375, + "learning_rate": 9.982074189724463e-06, + "loss": 1.0488, + "step": 31525 + }, + { + "epoch": 2.462919594067135, + "grad_norm": 3.9625046253204346, + "learning_rate": 9.945933443201204e-06, + "loss": 0.937, + "step": 31550 + }, + { + "epoch": 2.4648711943793913, + "grad_norm": 6.0401129722595215, + "learning_rate": 9.909792696677942e-06, + "loss": 0.992, + "step": 31575 + }, + { + "epoch": 2.4668227946916472, + "grad_norm": 5.625734329223633, + "learning_rate": 9.873651950154683e-06, + "loss": 1.0811, + "step": 31600 + }, + { + "epoch": 2.468774395003903, + "grad_norm": 5.0271759033203125, + "learning_rate": 9.837511203631422e-06, + "loss": 1.0082, + "step": 31625 + }, + { + "epoch": 2.470725995316159, + "grad_norm": 5.924317836761475, + "learning_rate": 9.801370457108162e-06, + "loss": 1.0068, + "step": 31650 + }, + { + "epoch": 2.4726775956284155, + "grad_norm": 6.7531867027282715, + "learning_rate": 9.765229710584903e-06, + "loss": 1.1194, + "step": 31675 + }, + { + "epoch": 2.4746291959406714, + "grad_norm": 8.309741020202637, + "learning_rate": 9.729088964061642e-06, + "loss": 0.9484, + "step": 31700 + }, + { + "epoch": 2.4765807962529274, + "grad_norm": 14.298741340637207, + "learning_rate": 9.692948217538382e-06, + "loss": 1.0462, + "step": 31725 + }, + { + "epoch": 2.4785323965651833, + "grad_norm": 4.0249247550964355, + "learning_rate": 9.656807471015123e-06, + "loss": 1.0424, + "step": 31750 + }, + { + "epoch": 2.4804839968774397, + "grad_norm": 5.353481769561768, + "learning_rate": 9.620666724491862e-06, + "loss": 0.9523, + "step": 31775 + }, + { + "epoch": 2.4824355971896956, + "grad_norm": 5.737078666687012, + "learning_rate": 9.584525977968602e-06, + "loss": 0.8723, + "step": 31800 + }, + { + "epoch": 2.4843871975019516, + "grad_norm": 7.522207260131836, + "learning_rate": 9.548385231445341e-06, + "loss": 1.006, + "step": 31825 + }, + { + "epoch": 2.4863387978142075, + "grad_norm": 6.205740451812744, + "learning_rate": 9.51224448492208e-06, + "loss": 0.9816, + "step": 31850 + }, + { + "epoch": 2.4882903981264635, + "grad_norm": 12.071701049804688, + "learning_rate": 9.47610373839882e-06, + "loss": 1.1396, + "step": 31875 + }, + { + "epoch": 2.49024199843872, + "grad_norm": 6.391028881072998, + "learning_rate": 9.43996299187556e-06, + "loss": 0.9251, + "step": 31900 + }, + { + "epoch": 2.492193598750976, + "grad_norm": 8.569737434387207, + "learning_rate": 9.4038222453523e-06, + "loss": 0.9092, + "step": 31925 + }, + { + "epoch": 2.4941451990632317, + "grad_norm": 7.3247199058532715, + "learning_rate": 9.36768149882904e-06, + "loss": 1.1044, + "step": 31950 + }, + { + "epoch": 2.496096799375488, + "grad_norm": 2.879774808883667, + "learning_rate": 9.33154075230578e-06, + "loss": 0.845, + "step": 31975 + }, + { + "epoch": 2.498048399687744, + "grad_norm": 4.9196577072143555, + "learning_rate": 9.29540000578252e-06, + "loss": 1.1101, + "step": 32000 + }, + { + "epoch": 2.5, + "grad_norm": 8.309597969055176, + "learning_rate": 9.259259259259259e-06, + "loss": 1.0638, + "step": 32025 + }, + { + "epoch": 2.501951600312256, + "grad_norm": 9.722742080688477, + "learning_rate": 9.223118512736e-06, + "loss": 0.8975, + "step": 32050 + }, + { + "epoch": 2.503903200624512, + "grad_norm": 2.8348076343536377, + "learning_rate": 9.18697776621274e-06, + "loss": 0.9371, + "step": 32075 + }, + { + "epoch": 2.5058548009367683, + "grad_norm": 7.504531383514404, + "learning_rate": 9.150837019689479e-06, + "loss": 0.7983, + "step": 32100 + }, + { + "epoch": 2.507806401249024, + "grad_norm": 14.54291820526123, + "learning_rate": 9.11469627316622e-06, + "loss": 0.9906, + "step": 32125 + }, + { + "epoch": 2.50975800156128, + "grad_norm": 5.026951789855957, + "learning_rate": 9.078555526642959e-06, + "loss": 1.0009, + "step": 32150 + }, + { + "epoch": 2.5117096018735365, + "grad_norm": 7.150708198547363, + "learning_rate": 9.0424147801197e-06, + "loss": 1.075, + "step": 32175 + }, + { + "epoch": 2.5136612021857925, + "grad_norm": 5.468643665313721, + "learning_rate": 9.006274033596438e-06, + "loss": 0.9829, + "step": 32200 + }, + { + "epoch": 2.5156128024980484, + "grad_norm": 8.43506145477295, + "learning_rate": 8.970133287073177e-06, + "loss": 0.8484, + "step": 32225 + }, + { + "epoch": 2.5175644028103044, + "grad_norm": 6.201877593994141, + "learning_rate": 8.933992540549917e-06, + "loss": 1.042, + "step": 32250 + }, + { + "epoch": 2.5195160031225603, + "grad_norm": 6.238094806671143, + "learning_rate": 8.897851794026658e-06, + "loss": 1.0411, + "step": 32275 + }, + { + "epoch": 2.5214676034348167, + "grad_norm": 4.352574348449707, + "learning_rate": 8.861711047503397e-06, + "loss": 0.8691, + "step": 32300 + }, + { + "epoch": 2.5234192037470726, + "grad_norm": 4.175549507141113, + "learning_rate": 8.825570300980137e-06, + "loss": 0.9434, + "step": 32325 + }, + { + "epoch": 2.5253708040593286, + "grad_norm": 5.9434099197387695, + "learning_rate": 8.789429554456876e-06, + "loss": 0.8322, + "step": 32350 + }, + { + "epoch": 2.527322404371585, + "grad_norm": 6.347434997558594, + "learning_rate": 8.753288807933617e-06, + "loss": 1.096, + "step": 32375 + }, + { + "epoch": 2.529274004683841, + "grad_norm": 4.583721160888672, + "learning_rate": 8.717148061410358e-06, + "loss": 1.0487, + "step": 32400 + }, + { + "epoch": 2.531225604996097, + "grad_norm": 4.194766998291016, + "learning_rate": 8.681007314887096e-06, + "loss": 0.9414, + "step": 32425 + }, + { + "epoch": 2.5331772053083528, + "grad_norm": 9.97851848602295, + "learning_rate": 8.644866568363837e-06, + "loss": 0.9438, + "step": 32450 + }, + { + "epoch": 2.5351288056206087, + "grad_norm": 7.122736930847168, + "learning_rate": 8.608725821840578e-06, + "loss": 0.9395, + "step": 32475 + }, + { + "epoch": 2.537080405932865, + "grad_norm": 3.8571557998657227, + "learning_rate": 8.572585075317316e-06, + "loss": 1.0715, + "step": 32500 + }, + { + "epoch": 2.539032006245121, + "grad_norm": 8.033638000488281, + "learning_rate": 8.536444328794057e-06, + "loss": 0.9242, + "step": 32525 + }, + { + "epoch": 2.540983606557377, + "grad_norm": 5.801552772521973, + "learning_rate": 8.500303582270796e-06, + "loss": 1.033, + "step": 32550 + }, + { + "epoch": 2.5429352068696334, + "grad_norm": 24.130815505981445, + "learning_rate": 8.464162835747535e-06, + "loss": 0.8794, + "step": 32575 + }, + { + "epoch": 2.5448868071818893, + "grad_norm": 4.116757392883301, + "learning_rate": 8.428022089224275e-06, + "loss": 0.9068, + "step": 32600 + }, + { + "epoch": 2.5468384074941453, + "grad_norm": 5.367023944854736, + "learning_rate": 8.391881342701014e-06, + "loss": 0.9922, + "step": 32625 + }, + { + "epoch": 2.548790007806401, + "grad_norm": 3.6237165927886963, + "learning_rate": 8.355740596177755e-06, + "loss": 0.9678, + "step": 32650 + }, + { + "epoch": 2.550741608118657, + "grad_norm": 6.481890678405762, + "learning_rate": 8.319599849654495e-06, + "loss": 0.9549, + "step": 32675 + }, + { + "epoch": 2.552693208430913, + "grad_norm": 4.162390232086182, + "learning_rate": 8.283459103131234e-06, + "loss": 1.017, + "step": 32700 + }, + { + "epoch": 2.5546448087431695, + "grad_norm": 3.7989232540130615, + "learning_rate": 8.247318356607975e-06, + "loss": 0.9482, + "step": 32725 + }, + { + "epoch": 2.5565964090554254, + "grad_norm": 3.717047691345215, + "learning_rate": 8.211177610084714e-06, + "loss": 0.9696, + "step": 32750 + }, + { + "epoch": 2.5585480093676813, + "grad_norm": 8.619587898254395, + "learning_rate": 8.175036863561454e-06, + "loss": 0.9971, + "step": 32775 + }, + { + "epoch": 2.5604996096799377, + "grad_norm": 8.369901657104492, + "learning_rate": 8.138896117038195e-06, + "loss": 0.9641, + "step": 32800 + }, + { + "epoch": 2.5624512099921937, + "grad_norm": 10.549445152282715, + "learning_rate": 8.102755370514934e-06, + "loss": 0.9465, + "step": 32825 + }, + { + "epoch": 2.5644028103044496, + "grad_norm": 4.533816337585449, + "learning_rate": 8.066614623991674e-06, + "loss": 1.055, + "step": 32850 + }, + { + "epoch": 2.5663544106167056, + "grad_norm": 4.171160697937012, + "learning_rate": 8.030473877468415e-06, + "loss": 0.9541, + "step": 32875 + }, + { + "epoch": 2.5683060109289615, + "grad_norm": 8.18663501739502, + "learning_rate": 7.994333130945154e-06, + "loss": 1.0934, + "step": 32900 + }, + { + "epoch": 2.570257611241218, + "grad_norm": 38.61558532714844, + "learning_rate": 7.958192384421894e-06, + "loss": 0.9414, + "step": 32925 + }, + { + "epoch": 2.572209211553474, + "grad_norm": 10.152268409729004, + "learning_rate": 7.922051637898633e-06, + "loss": 0.8341, + "step": 32950 + }, + { + "epoch": 2.5741608118657298, + "grad_norm": 9.300623893737793, + "learning_rate": 7.885910891375372e-06, + "loss": 0.8684, + "step": 32975 + }, + { + "epoch": 2.576112412177986, + "grad_norm": 3.9494409561157227, + "learning_rate": 7.849770144852113e-06, + "loss": 0.9547, + "step": 33000 + }, + { + "epoch": 2.578064012490242, + "grad_norm": 5.867650508880615, + "learning_rate": 7.813629398328851e-06, + "loss": 0.95, + "step": 33025 + }, + { + "epoch": 2.580015612802498, + "grad_norm": 4.412187099456787, + "learning_rate": 7.777488651805592e-06, + "loss": 0.9681, + "step": 33050 + }, + { + "epoch": 2.581967213114754, + "grad_norm": 15.531952857971191, + "learning_rate": 7.74134790528233e-06, + "loss": 0.9524, + "step": 33075 + }, + { + "epoch": 2.58391881342701, + "grad_norm": 14.39814567565918, + "learning_rate": 7.705207158759071e-06, + "loss": 0.8713, + "step": 33100 + }, + { + "epoch": 2.5858704137392663, + "grad_norm": 5.69350528717041, + "learning_rate": 7.669066412235812e-06, + "loss": 0.9885, + "step": 33125 + }, + { + "epoch": 2.5878220140515222, + "grad_norm": 10.534988403320312, + "learning_rate": 7.63292566571255e-06, + "loss": 1.0195, + "step": 33150 + }, + { + "epoch": 2.589773614363778, + "grad_norm": 4.347255706787109, + "learning_rate": 7.596784919189291e-06, + "loss": 0.9347, + "step": 33175 + }, + { + "epoch": 2.5917252146760346, + "grad_norm": 15.27342414855957, + "learning_rate": 7.560644172666031e-06, + "loss": 0.9575, + "step": 33200 + }, + { + "epoch": 2.5936768149882905, + "grad_norm": 9.028501510620117, + "learning_rate": 7.524503426142771e-06, + "loss": 1.0287, + "step": 33225 + }, + { + "epoch": 2.5956284153005464, + "grad_norm": 6.797510623931885, + "learning_rate": 7.4883626796195106e-06, + "loss": 0.9472, + "step": 33250 + }, + { + "epoch": 2.5975800156128024, + "grad_norm": 6.6254167556762695, + "learning_rate": 7.4522219330962494e-06, + "loss": 1.0768, + "step": 33275 + }, + { + "epoch": 2.5995316159250583, + "grad_norm": 10.8053560256958, + "learning_rate": 7.41608118657299e-06, + "loss": 0.9943, + "step": 33300 + }, + { + "epoch": 2.6014832162373147, + "grad_norm": 6.622986316680908, + "learning_rate": 7.379940440049731e-06, + "loss": 0.9594, + "step": 33325 + }, + { + "epoch": 2.6034348165495707, + "grad_norm": 5.385802745819092, + "learning_rate": 7.3437996935264695e-06, + "loss": 0.9264, + "step": 33350 + }, + { + "epoch": 2.6053864168618266, + "grad_norm": 5.485842704772949, + "learning_rate": 7.30765894700321e-06, + "loss": 1.0036, + "step": 33375 + }, + { + "epoch": 2.607338017174083, + "grad_norm": 9.910578727722168, + "learning_rate": 7.27151820047995e-06, + "loss": 0.9451, + "step": 33400 + }, + { + "epoch": 2.609289617486339, + "grad_norm": 75.94444274902344, + "learning_rate": 7.235377453956689e-06, + "loss": 1.0518, + "step": 33425 + }, + { + "epoch": 2.611241217798595, + "grad_norm": 5.67848014831543, + "learning_rate": 7.199236707433429e-06, + "loss": 1.0103, + "step": 33450 + }, + { + "epoch": 2.613192818110851, + "grad_norm": 5.153994083404541, + "learning_rate": 7.163095960910168e-06, + "loss": 0.9279, + "step": 33475 + }, + { + "epoch": 2.6151444184231067, + "grad_norm": 5.899696350097656, + "learning_rate": 7.126955214386909e-06, + "loss": 0.9879, + "step": 33500 + }, + { + "epoch": 2.617096018735363, + "grad_norm": 8.0438871383667, + "learning_rate": 7.090814467863649e-06, + "loss": 0.9979, + "step": 33525 + }, + { + "epoch": 2.619047619047619, + "grad_norm": 4.936447620391846, + "learning_rate": 7.054673721340388e-06, + "loss": 1.0407, + "step": 33550 + }, + { + "epoch": 2.620999219359875, + "grad_norm": 6.827628135681152, + "learning_rate": 7.018532974817129e-06, + "loss": 1.0494, + "step": 33575 + }, + { + "epoch": 2.6229508196721314, + "grad_norm": 2.877272844314575, + "learning_rate": 6.982392228293868e-06, + "loss": 0.9225, + "step": 33600 + }, + { + "epoch": 2.6249024199843873, + "grad_norm": 6.387278079986572, + "learning_rate": 6.946251481770607e-06, + "loss": 1.0567, + "step": 33625 + }, + { + "epoch": 2.6268540202966433, + "grad_norm": 3.42100191116333, + "learning_rate": 6.910110735247348e-06, + "loss": 0.9351, + "step": 33650 + }, + { + "epoch": 2.628805620608899, + "grad_norm": 5.836422443389893, + "learning_rate": 6.873969988724087e-06, + "loss": 0.9675, + "step": 33675 + }, + { + "epoch": 2.630757220921155, + "grad_norm": 7.553625106811523, + "learning_rate": 6.837829242200827e-06, + "loss": 0.979, + "step": 33700 + }, + { + "epoch": 2.6327088212334115, + "grad_norm": 4.6855244636535645, + "learning_rate": 6.801688495677568e-06, + "loss": 0.9096, + "step": 33725 + }, + { + "epoch": 2.6346604215456675, + "grad_norm": 7.591019153594971, + "learning_rate": 6.765547749154307e-06, + "loss": 0.9406, + "step": 33750 + }, + { + "epoch": 2.6366120218579234, + "grad_norm": 11.959511756896973, + "learning_rate": 6.729407002631047e-06, + "loss": 0.9963, + "step": 33775 + }, + { + "epoch": 2.63856362217018, + "grad_norm": 6.982829570770264, + "learning_rate": 6.693266256107786e-06, + "loss": 1.0124, + "step": 33800 + }, + { + "epoch": 2.6405152224824358, + "grad_norm": 5.991888523101807, + "learning_rate": 6.657125509584526e-06, + "loss": 0.9322, + "step": 33825 + }, + { + "epoch": 2.6424668227946917, + "grad_norm": 7.475541591644287, + "learning_rate": 6.6209847630612664e-06, + "loss": 1.114, + "step": 33850 + }, + { + "epoch": 2.6444184231069476, + "grad_norm": 6.789041519165039, + "learning_rate": 6.584844016538005e-06, + "loss": 1.0161, + "step": 33875 + }, + { + "epoch": 2.6463700234192036, + "grad_norm": 3.265803337097168, + "learning_rate": 6.548703270014746e-06, + "loss": 0.9941, + "step": 33900 + }, + { + "epoch": 2.64832162373146, + "grad_norm": 7.424156188964844, + "learning_rate": 6.5125625234914864e-06, + "loss": 0.8625, + "step": 33925 + }, + { + "epoch": 2.650273224043716, + "grad_norm": 2.75602388381958, + "learning_rate": 6.476421776968225e-06, + "loss": 0.8382, + "step": 33950 + }, + { + "epoch": 2.652224824355972, + "grad_norm": 4.535311698913574, + "learning_rate": 6.440281030444965e-06, + "loss": 0.9574, + "step": 33975 + }, + { + "epoch": 2.654176424668228, + "grad_norm": 3.5907976627349854, + "learning_rate": 6.404140283921705e-06, + "loss": 0.9383, + "step": 34000 + }, + { + "epoch": 2.656128024980484, + "grad_norm": 6.075243949890137, + "learning_rate": 6.3679995373984445e-06, + "loss": 0.9952, + "step": 34025 + }, + { + "epoch": 2.65807962529274, + "grad_norm": 4.383375644683838, + "learning_rate": 6.331858790875185e-06, + "loss": 0.981, + "step": 34050 + }, + { + "epoch": 2.660031225604996, + "grad_norm": 31.638050079345703, + "learning_rate": 6.295718044351924e-06, + "loss": 0.9966, + "step": 34075 + }, + { + "epoch": 2.661982825917252, + "grad_norm": 7.863098621368408, + "learning_rate": 6.2595772978286645e-06, + "loss": 0.9129, + "step": 34100 + }, + { + "epoch": 2.663934426229508, + "grad_norm": 58.13475036621094, + "learning_rate": 6.223436551305404e-06, + "loss": 0.9333, + "step": 34125 + }, + { + "epoch": 2.6658860265417643, + "grad_norm": 5.72825813293457, + "learning_rate": 6.187295804782144e-06, + "loss": 0.9327, + "step": 34150 + }, + { + "epoch": 2.6678376268540203, + "grad_norm": 9.484465599060059, + "learning_rate": 6.151155058258884e-06, + "loss": 0.9893, + "step": 34175 + }, + { + "epoch": 2.669789227166276, + "grad_norm": 19.498689651489258, + "learning_rate": 6.115014311735623e-06, + "loss": 0.9518, + "step": 34200 + }, + { + "epoch": 2.6717408274785326, + "grad_norm": 2.871004343032837, + "learning_rate": 6.078873565212363e-06, + "loss": 0.8288, + "step": 34225 + }, + { + "epoch": 2.6736924277907885, + "grad_norm": 5.968422889709473, + "learning_rate": 6.042732818689103e-06, + "loss": 0.9256, + "step": 34250 + }, + { + "epoch": 2.6756440281030445, + "grad_norm": 5.377193927764893, + "learning_rate": 6.006592072165843e-06, + "loss": 1.0264, + "step": 34275 + }, + { + "epoch": 2.6775956284153004, + "grad_norm": 6.185430526733398, + "learning_rate": 5.970451325642583e-06, + "loss": 0.9488, + "step": 34300 + }, + { + "epoch": 2.6795472287275564, + "grad_norm": 6.6664509773254395, + "learning_rate": 5.934310579119323e-06, + "loss": 1.0801, + "step": 34325 + }, + { + "epoch": 2.6814988290398127, + "grad_norm": 12.882567405700684, + "learning_rate": 5.8981698325960626e-06, + "loss": 0.9768, + "step": 34350 + }, + { + "epoch": 2.6834504293520687, + "grad_norm": 4.35813570022583, + "learning_rate": 5.862029086072802e-06, + "loss": 0.8278, + "step": 34375 + }, + { + "epoch": 2.6854020296643246, + "grad_norm": 5.229624271392822, + "learning_rate": 5.825888339549542e-06, + "loss": 0.9328, + "step": 34400 + }, + { + "epoch": 2.687353629976581, + "grad_norm": 11.509014129638672, + "learning_rate": 5.789747593026282e-06, + "loss": 1.0848, + "step": 34425 + }, + { + "epoch": 2.689305230288837, + "grad_norm": 9.492149353027344, + "learning_rate": 5.7536068465030214e-06, + "loss": 1.0494, + "step": 34450 + }, + { + "epoch": 2.691256830601093, + "grad_norm": 15.170995712280273, + "learning_rate": 5.717466099979762e-06, + "loss": 1.0186, + "step": 34475 + }, + { + "epoch": 2.693208430913349, + "grad_norm": 2.2591323852539062, + "learning_rate": 5.681325353456502e-06, + "loss": 0.9791, + "step": 34500 + }, + { + "epoch": 2.6951600312256048, + "grad_norm": 4.162281513214111, + "learning_rate": 5.6451846069332414e-06, + "loss": 1.0441, + "step": 34525 + }, + { + "epoch": 2.697111631537861, + "grad_norm": 9.68082332611084, + "learning_rate": 5.60904386040998e-06, + "loss": 1.0583, + "step": 34550 + }, + { + "epoch": 2.699063231850117, + "grad_norm": 4.857485294342041, + "learning_rate": 5.572903113886721e-06, + "loss": 0.881, + "step": 34575 + }, + { + "epoch": 2.701014832162373, + "grad_norm": 5.222204208374023, + "learning_rate": 5.536762367363461e-06, + "loss": 0.9455, + "step": 34600 + }, + { + "epoch": 2.7029664324746294, + "grad_norm": 5.1334919929504395, + "learning_rate": 5.5006216208402e-06, + "loss": 0.8701, + "step": 34625 + }, + { + "epoch": 2.7049180327868854, + "grad_norm": 12.660305976867676, + "learning_rate": 5.46448087431694e-06, + "loss": 0.9229, + "step": 34650 + }, + { + "epoch": 2.7068696330991413, + "grad_norm": 5.183673858642578, + "learning_rate": 5.42834012779368e-06, + "loss": 1.0212, + "step": 34675 + }, + { + "epoch": 2.7088212334113972, + "grad_norm": 6.06083869934082, + "learning_rate": 5.39219938127042e-06, + "loss": 0.942, + "step": 34700 + }, + { + "epoch": 2.710772833723653, + "grad_norm": 9.007758140563965, + "learning_rate": 5.356058634747159e-06, + "loss": 1.1112, + "step": 34725 + }, + { + "epoch": 2.7127244340359096, + "grad_norm": 5.921536922454834, + "learning_rate": 5.319917888223899e-06, + "loss": 0.8582, + "step": 34750 + }, + { + "epoch": 2.7146760343481655, + "grad_norm": 5.442780017852783, + "learning_rate": 5.283777141700639e-06, + "loss": 0.8355, + "step": 34775 + }, + { + "epoch": 2.7166276346604215, + "grad_norm": 7.917137622833252, + "learning_rate": 5.247636395177379e-06, + "loss": 0.9893, + "step": 34800 + }, + { + "epoch": 2.718579234972678, + "grad_norm": 6.792044162750244, + "learning_rate": 5.211495648654119e-06, + "loss": 0.9607, + "step": 34825 + }, + { + "epoch": 2.720530835284934, + "grad_norm": 13.380036354064941, + "learning_rate": 5.175354902130859e-06, + "loss": 0.7785, + "step": 34850 + }, + { + "epoch": 2.7224824355971897, + "grad_norm": 4.078965187072754, + "learning_rate": 5.139214155607598e-06, + "loss": 1.0543, + "step": 34875 + }, + { + "epoch": 2.7244340359094457, + "grad_norm": 18.490478515625, + "learning_rate": 5.103073409084338e-06, + "loss": 0.9531, + "step": 34900 + }, + { + "epoch": 2.7263856362217016, + "grad_norm": 7.501288414001465, + "learning_rate": 5.066932662561078e-06, + "loss": 1.0915, + "step": 34925 + }, + { + "epoch": 2.728337236533958, + "grad_norm": 6.161617755889893, + "learning_rate": 5.0307919160378176e-06, + "loss": 0.8864, + "step": 34950 + }, + { + "epoch": 2.730288836846214, + "grad_norm": 7.452844142913818, + "learning_rate": 4.994651169514557e-06, + "loss": 0.8479, + "step": 34975 + }, + { + "epoch": 2.73224043715847, + "grad_norm": 6.6995439529418945, + "learning_rate": 4.958510422991298e-06, + "loss": 1.1195, + "step": 35000 + }, + { + "epoch": 2.7341920374707263, + "grad_norm": 3.6286466121673584, + "learning_rate": 4.9223696764680376e-06, + "loss": 0.8767, + "step": 35025 + }, + { + "epoch": 2.736143637782982, + "grad_norm": 4.038273334503174, + "learning_rate": 4.886228929944777e-06, + "loss": 0.9594, + "step": 35050 + }, + { + "epoch": 2.738095238095238, + "grad_norm": 14.014683723449707, + "learning_rate": 4.850088183421517e-06, + "loss": 0.9756, + "step": 35075 + }, + { + "epoch": 2.740046838407494, + "grad_norm": 5.6222639083862305, + "learning_rate": 4.813947436898257e-06, + "loss": 0.9511, + "step": 35100 + }, + { + "epoch": 2.74199843871975, + "grad_norm": 7.529115200042725, + "learning_rate": 4.7778066903749965e-06, + "loss": 1.0145, + "step": 35125 + }, + { + "epoch": 2.7439500390320064, + "grad_norm": 7.758570194244385, + "learning_rate": 4.741665943851736e-06, + "loss": 0.8745, + "step": 35150 + }, + { + "epoch": 2.7459016393442623, + "grad_norm": 9.186238288879395, + "learning_rate": 4.705525197328476e-06, + "loss": 1.0008, + "step": 35175 + }, + { + "epoch": 2.7478532396565183, + "grad_norm": 5.48419189453125, + "learning_rate": 4.6693844508052165e-06, + "loss": 0.961, + "step": 35200 + }, + { + "epoch": 2.7498048399687747, + "grad_norm": 2.8047804832458496, + "learning_rate": 4.633243704281956e-06, + "loss": 1.0038, + "step": 35225 + }, + { + "epoch": 2.7517564402810306, + "grad_norm": 5.34364652633667, + "learning_rate": 4.597102957758696e-06, + "loss": 1.0205, + "step": 35250 + }, + { + "epoch": 2.7537080405932866, + "grad_norm": 4.0446271896362305, + "learning_rate": 4.560962211235436e-06, + "loss": 0.9732, + "step": 35275 + }, + { + "epoch": 2.7556596409055425, + "grad_norm": 5.469175815582275, + "learning_rate": 4.524821464712175e-06, + "loss": 0.8912, + "step": 35300 + }, + { + "epoch": 2.7576112412177984, + "grad_norm": 7.1642537117004395, + "learning_rate": 4.488680718188915e-06, + "loss": 0.8999, + "step": 35325 + }, + { + "epoch": 2.7595628415300544, + "grad_norm": 20.89712142944336, + "learning_rate": 4.452539971665655e-06, + "loss": 0.9846, + "step": 35350 + }, + { + "epoch": 2.7615144418423108, + "grad_norm": 4.992537975311279, + "learning_rate": 4.4163992251423945e-06, + "loss": 0.973, + "step": 35375 + }, + { + "epoch": 2.7634660421545667, + "grad_norm": 2.2830374240875244, + "learning_rate": 4.380258478619135e-06, + "loss": 0.9524, + "step": 35400 + }, + { + "epoch": 2.7654176424668226, + "grad_norm": 5.448328495025635, + "learning_rate": 4.344117732095875e-06, + "loss": 0.9944, + "step": 35425 + }, + { + "epoch": 2.767369242779079, + "grad_norm": 6.006536483764648, + "learning_rate": 4.3079769855726145e-06, + "loss": 1.0081, + "step": 35450 + }, + { + "epoch": 2.769320843091335, + "grad_norm": 5.312732696533203, + "learning_rate": 4.271836239049353e-06, + "loss": 0.9737, + "step": 35475 + }, + { + "epoch": 2.771272443403591, + "grad_norm": 7.838423728942871, + "learning_rate": 4.235695492526094e-06, + "loss": 0.8639, + "step": 35500 + }, + { + "epoch": 2.773224043715847, + "grad_norm": 11.665059089660645, + "learning_rate": 4.199554746002834e-06, + "loss": 0.9781, + "step": 35525 + }, + { + "epoch": 2.775175644028103, + "grad_norm": 4.239396572113037, + "learning_rate": 4.163413999479573e-06, + "loss": 1.1026, + "step": 35550 + }, + { + "epoch": 2.777127244340359, + "grad_norm": 12.657247543334961, + "learning_rate": 4.127273252956313e-06, + "loss": 1.0077, + "step": 35575 + }, + { + "epoch": 2.779078844652615, + "grad_norm": 4.813512802124023, + "learning_rate": 4.091132506433053e-06, + "loss": 0.8954, + "step": 35600 + }, + { + "epoch": 2.781030444964871, + "grad_norm": 13.750665664672852, + "learning_rate": 4.0549917599097934e-06, + "loss": 0.999, + "step": 35625 + }, + { + "epoch": 2.7829820452771274, + "grad_norm": 8.885204315185547, + "learning_rate": 4.018851013386532e-06, + "loss": 1.013, + "step": 35650 + }, + { + "epoch": 2.7849336455893834, + "grad_norm": 5.044318675994873, + "learning_rate": 3.982710266863272e-06, + "loss": 1.0112, + "step": 35675 + }, + { + "epoch": 2.7868852459016393, + "grad_norm": 3.425179958343506, + "learning_rate": 3.946569520340012e-06, + "loss": 0.9307, + "step": 35700 + }, + { + "epoch": 2.7888368462138953, + "grad_norm": 6.769057273864746, + "learning_rate": 3.910428773816752e-06, + "loss": 0.9554, + "step": 35725 + }, + { + "epoch": 2.790788446526151, + "grad_norm": 33.40279006958008, + "learning_rate": 3.874288027293492e-06, + "loss": 0.8602, + "step": 35750 + }, + { + "epoch": 2.7927400468384076, + "grad_norm": 4.830076694488525, + "learning_rate": 3.838147280770232e-06, + "loss": 1.0088, + "step": 35775 + }, + { + "epoch": 2.7946916471506635, + "grad_norm": 3.707003116607666, + "learning_rate": 3.8020065342469715e-06, + "loss": 0.9292, + "step": 35800 + }, + { + "epoch": 2.7966432474629195, + "grad_norm": 5.800565719604492, + "learning_rate": 3.7658657877237116e-06, + "loss": 0.9453, + "step": 35825 + }, + { + "epoch": 2.798594847775176, + "grad_norm": 7.302030563354492, + "learning_rate": 3.7297250412004513e-06, + "loss": 0.9787, + "step": 35850 + }, + { + "epoch": 2.800546448087432, + "grad_norm": 3.1362900733947754, + "learning_rate": 3.693584294677191e-06, + "loss": 0.8745, + "step": 35875 + }, + { + "epoch": 2.8024980483996877, + "grad_norm": 9.402700424194336, + "learning_rate": 3.6574435481539304e-06, + "loss": 0.8995, + "step": 35900 + }, + { + "epoch": 2.8044496487119437, + "grad_norm": 6.527857780456543, + "learning_rate": 3.621302801630671e-06, + "loss": 0.9443, + "step": 35925 + }, + { + "epoch": 2.8064012490241996, + "grad_norm": 4.445523738861084, + "learning_rate": 3.5851620551074107e-06, + "loss": 0.995, + "step": 35950 + }, + { + "epoch": 2.808352849336456, + "grad_norm": 12.511701583862305, + "learning_rate": 3.5490213085841504e-06, + "loss": 0.956, + "step": 35975 + }, + { + "epoch": 2.810304449648712, + "grad_norm": 5.614612102508545, + "learning_rate": 3.5128805620608897e-06, + "loss": 0.8289, + "step": 36000 + }, + { + "epoch": 2.812256049960968, + "grad_norm": 7.694569110870361, + "learning_rate": 3.4767398155376302e-06, + "loss": 0.844, + "step": 36025 + }, + { + "epoch": 2.8142076502732243, + "grad_norm": 19.2261905670166, + "learning_rate": 3.44059906901437e-06, + "loss": 0.9839, + "step": 36050 + }, + { + "epoch": 2.8161592505854802, + "grad_norm": 5.294959545135498, + "learning_rate": 3.4044583224911093e-06, + "loss": 0.952, + "step": 36075 + }, + { + "epoch": 2.818110850897736, + "grad_norm": 6.42605447769165, + "learning_rate": 3.368317575967849e-06, + "loss": 0.9319, + "step": 36100 + }, + { + "epoch": 2.820062451209992, + "grad_norm": 7.288741111755371, + "learning_rate": 3.3321768294445896e-06, + "loss": 0.8958, + "step": 36125 + }, + { + "epoch": 2.822014051522248, + "grad_norm": 6.327204704284668, + "learning_rate": 3.2960360829213293e-06, + "loss": 0.8467, + "step": 36150 + }, + { + "epoch": 2.8239656518345044, + "grad_norm": 2.487668514251709, + "learning_rate": 3.2598953363980686e-06, + "loss": 0.9137, + "step": 36175 + }, + { + "epoch": 2.8259172521467604, + "grad_norm": 6.163941860198975, + "learning_rate": 3.2237545898748083e-06, + "loss": 1.0374, + "step": 36200 + }, + { + "epoch": 2.8278688524590163, + "grad_norm": 5.312852382659912, + "learning_rate": 3.187613843351549e-06, + "loss": 0.9339, + "step": 36225 + }, + { + "epoch": 2.8298204527712727, + "grad_norm": 8.940019607543945, + "learning_rate": 3.151473096828288e-06, + "loss": 0.9153, + "step": 36250 + }, + { + "epoch": 2.8317720530835286, + "grad_norm": 9.613917350769043, + "learning_rate": 3.115332350305028e-06, + "loss": 0.9743, + "step": 36275 + }, + { + "epoch": 2.8337236533957846, + "grad_norm": 5.107303619384766, + "learning_rate": 3.079191603781768e-06, + "loss": 0.9676, + "step": 36300 + }, + { + "epoch": 2.8356752537080405, + "grad_norm": 9.088422775268555, + "learning_rate": 3.0430508572585077e-06, + "loss": 0.9481, + "step": 36325 + }, + { + "epoch": 2.8376268540202965, + "grad_norm": 9.972681999206543, + "learning_rate": 3.0069101107352475e-06, + "loss": 0.9406, + "step": 36350 + }, + { + "epoch": 2.839578454332553, + "grad_norm": 11.56908893585205, + "learning_rate": 2.970769364211987e-06, + "loss": 0.961, + "step": 36375 + }, + { + "epoch": 2.841530054644809, + "grad_norm": 5.048888683319092, + "learning_rate": 2.9346286176887273e-06, + "loss": 0.9548, + "step": 36400 + }, + { + "epoch": 2.8434816549570647, + "grad_norm": 9.328125, + "learning_rate": 2.898487871165467e-06, + "loss": 0.9682, + "step": 36425 + }, + { + "epoch": 2.845433255269321, + "grad_norm": 6.717364311218262, + "learning_rate": 2.8623471246422068e-06, + "loss": 0.9797, + "step": 36450 + }, + { + "epoch": 2.847384855581577, + "grad_norm": 5.134498596191406, + "learning_rate": 2.8262063781189465e-06, + "loss": 0.9824, + "step": 36475 + }, + { + "epoch": 2.849336455893833, + "grad_norm": 12.574528694152832, + "learning_rate": 2.7900656315956862e-06, + "loss": 0.9504, + "step": 36500 + }, + { + "epoch": 2.851288056206089, + "grad_norm": 4.967392444610596, + "learning_rate": 2.7539248850724264e-06, + "loss": 0.9407, + "step": 36525 + }, + { + "epoch": 2.853239656518345, + "grad_norm": 7.361475944519043, + "learning_rate": 2.7177841385491657e-06, + "loss": 0.9071, + "step": 36550 + }, + { + "epoch": 2.855191256830601, + "grad_norm": 9.685564994812012, + "learning_rate": 2.681643392025906e-06, + "loss": 0.9237, + "step": 36575 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 5.940114498138428, + "learning_rate": 2.6455026455026455e-06, + "loss": 0.9725, + "step": 36600 + }, + { + "epoch": 2.859094457455113, + "grad_norm": 9.44420051574707, + "learning_rate": 2.6093618989793857e-06, + "loss": 1.0293, + "step": 36625 + }, + { + "epoch": 2.861046057767369, + "grad_norm": 5.067485332489014, + "learning_rate": 2.573221152456125e-06, + "loss": 0.9725, + "step": 36650 + }, + { + "epoch": 2.8629976580796255, + "grad_norm": 4.158738613128662, + "learning_rate": 2.537080405932865e-06, + "loss": 0.9369, + "step": 36675 + }, + { + "epoch": 2.8649492583918814, + "grad_norm": 9.426555633544922, + "learning_rate": 2.500939659409605e-06, + "loss": 0.9928, + "step": 36700 + }, + { + "epoch": 2.8669008587041374, + "grad_norm": 4.986302852630615, + "learning_rate": 2.4647989128863446e-06, + "loss": 1.0516, + "step": 36725 + }, + { + "epoch": 2.8688524590163933, + "grad_norm": 7.455185413360596, + "learning_rate": 2.4286581663630843e-06, + "loss": 1.009, + "step": 36750 + }, + { + "epoch": 2.8708040593286492, + "grad_norm": 5.22102689743042, + "learning_rate": 2.3925174198398244e-06, + "loss": 0.9373, + "step": 36775 + }, + { + "epoch": 2.8727556596409056, + "grad_norm": 9.262065887451172, + "learning_rate": 2.356376673316564e-06, + "loss": 0.8514, + "step": 36800 + }, + { + "epoch": 2.8747072599531616, + "grad_norm": 4.437563419342041, + "learning_rate": 2.320235926793304e-06, + "loss": 0.9458, + "step": 36825 + }, + { + "epoch": 2.8766588602654175, + "grad_norm": 5.249630928039551, + "learning_rate": 2.2840951802700436e-06, + "loss": 1.0477, + "step": 36850 + }, + { + "epoch": 2.878610460577674, + "grad_norm": 8.485176086425781, + "learning_rate": 2.2479544337467837e-06, + "loss": 0.9119, + "step": 36875 + }, + { + "epoch": 2.88056206088993, + "grad_norm": 8.121548652648926, + "learning_rate": 2.2118136872235235e-06, + "loss": 0.9911, + "step": 36900 + }, + { + "epoch": 2.8825136612021858, + "grad_norm": 5.384830951690674, + "learning_rate": 2.175672940700263e-06, + "loss": 0.8677, + "step": 36925 + }, + { + "epoch": 2.8844652615144417, + "grad_norm": 10.310225486755371, + "learning_rate": 2.139532194177003e-06, + "loss": 0.9159, + "step": 36950 + }, + { + "epoch": 2.8864168618266977, + "grad_norm": 3.7309186458587646, + "learning_rate": 2.103391447653743e-06, + "loss": 0.9991, + "step": 36975 + }, + { + "epoch": 2.888368462138954, + "grad_norm": 5.711589813232422, + "learning_rate": 2.0672507011304828e-06, + "loss": 0.9948, + "step": 37000 + }, + { + "epoch": 2.89032006245121, + "grad_norm": 9.343750953674316, + "learning_rate": 2.0311099546072225e-06, + "loss": 0.9743, + "step": 37025 + }, + { + "epoch": 2.892271662763466, + "grad_norm": 8.20173168182373, + "learning_rate": 1.994969208083962e-06, + "loss": 1.0054, + "step": 37050 + }, + { + "epoch": 2.8942232630757223, + "grad_norm": 6.475944995880127, + "learning_rate": 1.9588284615607024e-06, + "loss": 0.9209, + "step": 37075 + }, + { + "epoch": 2.8961748633879782, + "grad_norm": 5.5318217277526855, + "learning_rate": 1.9226877150374417e-06, + "loss": 1.1114, + "step": 37100 + }, + { + "epoch": 2.898126463700234, + "grad_norm": 9.874210357666016, + "learning_rate": 1.8865469685141818e-06, + "loss": 0.962, + "step": 37125 + }, + { + "epoch": 2.90007806401249, + "grad_norm": 6.493501663208008, + "learning_rate": 1.8504062219909215e-06, + "loss": 1.0265, + "step": 37150 + }, + { + "epoch": 2.902029664324746, + "grad_norm": 5.907900810241699, + "learning_rate": 1.8142654754676615e-06, + "loss": 0.9085, + "step": 37175 + }, + { + "epoch": 2.9039812646370025, + "grad_norm": 6.20749044418335, + "learning_rate": 1.7781247289444012e-06, + "loss": 0.9389, + "step": 37200 + }, + { + "epoch": 2.9059328649492584, + "grad_norm": 8.900979042053223, + "learning_rate": 1.7419839824211411e-06, + "loss": 1.0622, + "step": 37225 + }, + { + "epoch": 2.9078844652615143, + "grad_norm": 13.516480445861816, + "learning_rate": 1.7058432358978808e-06, + "loss": 0.979, + "step": 37250 + }, + { + "epoch": 2.9098360655737707, + "grad_norm": 4.44931173324585, + "learning_rate": 1.6697024893746208e-06, + "loss": 1.0254, + "step": 37275 + }, + { + "epoch": 2.9117876658860267, + "grad_norm": 7.9147562980651855, + "learning_rate": 1.6335617428513605e-06, + "loss": 0.9681, + "step": 37300 + }, + { + "epoch": 2.9137392661982826, + "grad_norm": 14.517166137695312, + "learning_rate": 1.5974209963281004e-06, + "loss": 0.9414, + "step": 37325 + }, + { + "epoch": 2.9156908665105385, + "grad_norm": 4.41569709777832, + "learning_rate": 1.5612802498048401e-06, + "loss": 1.0306, + "step": 37350 + }, + { + "epoch": 2.9176424668227945, + "grad_norm": 10.396710395812988, + "learning_rate": 1.5251395032815799e-06, + "loss": 0.9047, + "step": 37375 + }, + { + "epoch": 2.919594067135051, + "grad_norm": 4.044954299926758, + "learning_rate": 1.4889987567583196e-06, + "loss": 1.0493, + "step": 37400 + }, + { + "epoch": 2.921545667447307, + "grad_norm": 8.15400218963623, + "learning_rate": 1.4528580102350595e-06, + "loss": 0.9675, + "step": 37425 + }, + { + "epoch": 2.9234972677595628, + "grad_norm": 4.238800525665283, + "learning_rate": 1.4167172637117992e-06, + "loss": 0.8742, + "step": 37450 + }, + { + "epoch": 2.925448868071819, + "grad_norm": 4.453863143920898, + "learning_rate": 1.3805765171885392e-06, + "loss": 1.009, + "step": 37475 + }, + { + "epoch": 2.927400468384075, + "grad_norm": 4.53254508972168, + "learning_rate": 1.3444357706652789e-06, + "loss": 0.906, + "step": 37500 + }, + { + "epoch": 2.929352068696331, + "grad_norm": 8.645501136779785, + "learning_rate": 1.3082950241420188e-06, + "loss": 0.8904, + "step": 37525 + }, + { + "epoch": 2.931303669008587, + "grad_norm": 7.76276159286499, + "learning_rate": 1.2721542776187585e-06, + "loss": 1.0786, + "step": 37550 + }, + { + "epoch": 2.933255269320843, + "grad_norm": 4.654953479766846, + "learning_rate": 1.2360135310954985e-06, + "loss": 0.9461, + "step": 37575 + }, + { + "epoch": 2.9352068696330993, + "grad_norm": 5.940478324890137, + "learning_rate": 1.1998727845722382e-06, + "loss": 0.9464, + "step": 37600 + }, + { + "epoch": 2.9371584699453552, + "grad_norm": 3.408820390701294, + "learning_rate": 1.1637320380489781e-06, + "loss": 1.095, + "step": 37625 + }, + { + "epoch": 2.939110070257611, + "grad_norm": 49.68843460083008, + "learning_rate": 1.1275912915257179e-06, + "loss": 1.0198, + "step": 37650 + }, + { + "epoch": 2.9410616705698676, + "grad_norm": 8.318230628967285, + "learning_rate": 1.0914505450024578e-06, + "loss": 0.9268, + "step": 37675 + }, + { + "epoch": 2.9430132708821235, + "grad_norm": 3.565521717071533, + "learning_rate": 1.0553097984791975e-06, + "loss": 1.0392, + "step": 37700 + }, + { + "epoch": 2.9449648711943794, + "grad_norm": 7.742121696472168, + "learning_rate": 1.0191690519559372e-06, + "loss": 0.8929, + "step": 37725 + }, + { + "epoch": 2.9469164715066354, + "grad_norm": 10.264391899108887, + "learning_rate": 9.830283054326772e-07, + "loss": 1.048, + "step": 37750 + }, + { + "epoch": 2.9488680718188913, + "grad_norm": 5.232205390930176, + "learning_rate": 9.46887558909417e-07, + "loss": 0.9733, + "step": 37775 + }, + { + "epoch": 2.9508196721311473, + "grad_norm": 8.577336311340332, + "learning_rate": 9.107468123861568e-07, + "loss": 0.925, + "step": 37800 + }, + { + "epoch": 2.9527712724434036, + "grad_norm": 4.8147382736206055, + "learning_rate": 8.746060658628964e-07, + "loss": 1.001, + "step": 37825 + }, + { + "epoch": 2.9547228727556596, + "grad_norm": 9.136549949645996, + "learning_rate": 8.384653193396363e-07, + "loss": 1.0379, + "step": 37850 + }, + { + "epoch": 2.9566744730679155, + "grad_norm": 9.757030487060547, + "learning_rate": 8.023245728163761e-07, + "loss": 0.9762, + "step": 37875 + }, + { + "epoch": 2.958626073380172, + "grad_norm": 2.8016204833984375, + "learning_rate": 7.661838262931159e-07, + "loss": 0.9598, + "step": 37900 + }, + { + "epoch": 2.960577673692428, + "grad_norm": 4.915115833282471, + "learning_rate": 7.300430797698557e-07, + "loss": 0.8749, + "step": 37925 + }, + { + "epoch": 2.962529274004684, + "grad_norm": 7.519952774047852, + "learning_rate": 6.939023332465956e-07, + "loss": 0.9948, + "step": 37950 + }, + { + "epoch": 2.9644808743169397, + "grad_norm": 5.322856426239014, + "learning_rate": 6.577615867233354e-07, + "loss": 0.8365, + "step": 37975 + }, + { + "epoch": 2.9664324746291957, + "grad_norm": 5.678027153015137, + "learning_rate": 6.216208402000752e-07, + "loss": 0.815, + "step": 38000 + }, + { + "epoch": 2.968384074941452, + "grad_norm": 4.429562091827393, + "learning_rate": 5.85480093676815e-07, + "loss": 1.0204, + "step": 38025 + }, + { + "epoch": 2.970335675253708, + "grad_norm": 20.631006240844727, + "learning_rate": 5.493393471535548e-07, + "loss": 0.8517, + "step": 38050 + }, + { + "epoch": 2.972287275565964, + "grad_norm": 5.033249855041504, + "learning_rate": 5.131986006302946e-07, + "loss": 1.0247, + "step": 38075 + }, + { + "epoch": 2.9742388758782203, + "grad_norm": 15.038820266723633, + "learning_rate": 4.770578541070344e-07, + "loss": 1.02, + "step": 38100 + }, + { + "epoch": 2.9761904761904763, + "grad_norm": 5.03165864944458, + "learning_rate": 4.4091710758377425e-07, + "loss": 0.9728, + "step": 38125 + }, + { + "epoch": 2.978142076502732, + "grad_norm": 11.223451614379883, + "learning_rate": 4.047763610605141e-07, + "loss": 0.9773, + "step": 38150 + }, + { + "epoch": 2.980093676814988, + "grad_norm": 5.706803798675537, + "learning_rate": 3.686356145372539e-07, + "loss": 0.9784, + "step": 38175 + }, + { + "epoch": 2.982045277127244, + "grad_norm": 4.822505474090576, + "learning_rate": 3.324948680139937e-07, + "loss": 0.9785, + "step": 38200 + }, + { + "epoch": 2.9839968774395005, + "grad_norm": 6.286616802215576, + "learning_rate": 2.963541214907335e-07, + "loss": 1.0136, + "step": 38225 + }, + { + "epoch": 2.9859484777517564, + "grad_norm": 5.4574432373046875, + "learning_rate": 2.6021337496747334e-07, + "loss": 0.9526, + "step": 38250 + }, + { + "epoch": 2.9879000780640124, + "grad_norm": 6.951941967010498, + "learning_rate": 2.2407262844421317e-07, + "loss": 1.0792, + "step": 38275 + }, + { + "epoch": 2.9898516783762688, + "grad_norm": 6.4223456382751465, + "learning_rate": 1.8793188192095297e-07, + "loss": 0.9462, + "step": 38300 + }, + { + "epoch": 2.9918032786885247, + "grad_norm": 4.584599018096924, + "learning_rate": 1.5179113539769277e-07, + "loss": 0.9315, + "step": 38325 + }, + { + "epoch": 2.9937548790007806, + "grad_norm": 12.312700271606445, + "learning_rate": 1.156503888744326e-07, + "loss": 0.9726, + "step": 38350 + }, + { + "epoch": 2.9957064793130366, + "grad_norm": 5.609854221343994, + "learning_rate": 7.950964235117241e-08, + "loss": 0.979, + "step": 38375 + }, + { + "epoch": 2.9976580796252925, + "grad_norm": 5.3260884284973145, + "learning_rate": 4.336889582791222e-08, + "loss": 0.9594, + "step": 38400 + }, + { + "epoch": 2.999609679937549, + "grad_norm": 7.4217329025268555, + "learning_rate": 7.228149304652037e-09, + "loss": 0.9241, + "step": 38425 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.6124121779859485, + "eval_f1_macro": 0.5546337781014297, + "eval_f1_micro": 0.6124121779859485, + "eval_f1_weighted": 0.603589451590145, + "eval_loss": 0.9609171748161316, + "eval_precision_macro": 0.6282088637908045, + "eval_precision_micro": 0.6124121779859485, + "eval_precision_weighted": 0.630997362777574, + "eval_recall_macro": 0.5298730645340564, + "eval_recall_micro": 0.6124121779859485, + "eval_recall_weighted": 0.6124121779859485, + "eval_runtime": 5486.274, + "eval_samples_per_second": 4.67, + "eval_steps_per_second": 0.292, + "step": 38430 + } + ], + "logging_steps": 25, + "max_steps": 38430, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.01 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.022247107535565e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}