diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7087 @@ +{ + "best_metric": 0.10594170403587444, + "best_model_checkpoint": "d:\\\\whisper-medium-pt-cv18-fleurs2-lr\\checkpoint-5000", + "epoch": 11.502185415228894, + "eval_steps": 5000, + "global_step": 25000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.011502185415228893, + "grad_norm": 15.794218063354492, + "learning_rate": 2.875e-08, + "loss": 0.7429, + "step": 25 + }, + { + "epoch": 0.023004370830457786, + "grad_norm": 37.62531661987305, + "learning_rate": 5.8750000000000007e-08, + "loss": 1.3461, + "step": 50 + }, + { + "epoch": 0.03450655624568668, + "grad_norm": 13.57304573059082, + "learning_rate": 9e-08, + "loss": 0.7409, + "step": 75 + }, + { + "epoch": 0.04600874166091557, + "grad_norm": 37.35552215576172, + "learning_rate": 1.2125e-07, + "loss": 1.2656, + "step": 100 + }, + { + "epoch": 0.05751092707614447, + "grad_norm": 13.972356796264648, + "learning_rate": 1.5250000000000002e-07, + "loss": 0.6919, + "step": 125 + }, + { + "epoch": 0.06901311249137336, + "grad_norm": 28.50897789001465, + "learning_rate": 1.8375000000000001e-07, + "loss": 1.131, + "step": 150 + }, + { + "epoch": 0.08051529790660225, + "grad_norm": 12.919734954833984, + "learning_rate": 2.15e-07, + "loss": 0.5826, + "step": 175 + }, + { + "epoch": 0.09201748332183114, + "grad_norm": 28.706079483032227, + "learning_rate": 2.4624999999999997e-07, + "loss": 0.7895, + "step": 200 + }, + { + "epoch": 0.10351966873706005, + "grad_norm": 8.280942916870117, + "learning_rate": 2.7750000000000004e-07, + "loss": 0.3197, + "step": 225 + }, + { + "epoch": 0.11502185415228894, + "grad_norm": 23.930252075195312, + "learning_rate": 3.0875e-07, + "loss": 0.3855, + "step": 250 + }, + { + "epoch": 0.12652403956751782, + "grad_norm": 6.460832595825195, + "learning_rate": 3.4e-07, + "loss": 0.2569, + "step": 275 + }, + { + "epoch": 0.13802622498274672, + "grad_norm": 22.88783836364746, + "learning_rate": 3.7125000000000005e-07, + "loss": 0.3504, + "step": 300 + }, + { + "epoch": 0.14952841039797563, + "grad_norm": 7.947082042694092, + "learning_rate": 4.025e-07, + "loss": 0.2123, + "step": 325 + }, + { + "epoch": 0.1610305958132045, + "grad_norm": 17.616596221923828, + "learning_rate": 4.3375000000000003e-07, + "loss": 0.2902, + "step": 350 + }, + { + "epoch": 0.1725327812284334, + "grad_norm": 4.91463565826416, + "learning_rate": 4.65e-07, + "loss": 0.1828, + "step": 375 + }, + { + "epoch": 0.18403496664366228, + "grad_norm": 18.81287956237793, + "learning_rate": 4.9625e-07, + "loss": 0.2847, + "step": 400 + }, + { + "epoch": 0.1955371520588912, + "grad_norm": 6.390377044677734, + "learning_rate": 5.275e-07, + "loss": 0.2107, + "step": 425 + }, + { + "epoch": 0.2070393374741201, + "grad_norm": 18.839292526245117, + "learning_rate": 5.587499999999999e-07, + "loss": 0.234, + "step": 450 + }, + { + "epoch": 0.21854152288934897, + "grad_norm": 6.151549816131592, + "learning_rate": 5.9e-07, + "loss": 0.2198, + "step": 475 + }, + { + "epoch": 0.23004370830457788, + "grad_norm": 13.628652572631836, + "learning_rate": 6.212500000000001e-07, + "loss": 0.2575, + "step": 500 + }, + { + "epoch": 0.24154589371980675, + "grad_norm": 5.531817436218262, + "learning_rate": 6.525000000000001e-07, + "loss": 0.1854, + "step": 525 + }, + { + "epoch": 0.25304807913503563, + "grad_norm": 14.468728065490723, + "learning_rate": 
6.8375e-07, + "loss": 0.244, + "step": 550 + }, + { + "epoch": 0.26455026455026454, + "grad_norm": 6.999868869781494, + "learning_rate": 7.15e-07, + "loss": 0.1865, + "step": 575 + }, + { + "epoch": 0.27605244996549344, + "grad_norm": 17.996356964111328, + "learning_rate": 7.462500000000001e-07, + "loss": 0.2358, + "step": 600 + }, + { + "epoch": 0.28755463538072235, + "grad_norm": 8.172720909118652, + "learning_rate": 7.775e-07, + "loss": 0.168, + "step": 625 + }, + { + "epoch": 0.29905682079595125, + "grad_norm": 16.850492477416992, + "learning_rate": 8.0875e-07, + "loss": 0.2229, + "step": 650 + }, + { + "epoch": 0.3105590062111801, + "grad_norm": 6.139720916748047, + "learning_rate": 8.4e-07, + "loss": 0.1813, + "step": 675 + }, + { + "epoch": 0.322061191626409, + "grad_norm": 13.691137313842773, + "learning_rate": 8.7125e-07, + "loss": 0.2124, + "step": 700 + }, + { + "epoch": 0.3335633770416379, + "grad_norm": 6.144876956939697, + "learning_rate": 9.025e-07, + "loss": 0.1865, + "step": 725 + }, + { + "epoch": 0.3450655624568668, + "grad_norm": 19.069873809814453, + "learning_rate": 9.337500000000001e-07, + "loss": 0.1917, + "step": 750 + }, + { + "epoch": 0.3565677478720957, + "grad_norm": 8.921222686767578, + "learning_rate": 9.65e-07, + "loss": 0.2001, + "step": 775 + }, + { + "epoch": 0.36806993328732457, + "grad_norm": 16.66543197631836, + "learning_rate": 9.9625e-07, + "loss": 0.2174, + "step": 800 + }, + { + "epoch": 0.3795721187025535, + "grad_norm": 6.839846134185791, + "learning_rate": 1.0275e-06, + "loss": 0.1836, + "step": 825 + }, + { + "epoch": 0.3910743041177824, + "grad_norm": 18.923572540283203, + "learning_rate": 1.05875e-06, + "loss": 0.2126, + "step": 850 + }, + { + "epoch": 0.4025764895330113, + "grad_norm": 4.898512840270996, + "learning_rate": 1.0900000000000002e-06, + "loss": 0.1661, + "step": 875 + }, + { + "epoch": 0.4140786749482402, + "grad_norm": 14.121219635009766, + "learning_rate": 1.12125e-06, + "loss": 0.1825, + "step": 900 + }, + { + "epoch": 0.42558086036346904, + "grad_norm": 7.532533645629883, + "learning_rate": 1.1525000000000002e-06, + "loss": 0.1767, + "step": 925 + }, + { + "epoch": 0.43708304577869794, + "grad_norm": 12.991471290588379, + "learning_rate": 1.18375e-06, + "loss": 0.1993, + "step": 950 + }, + { + "epoch": 0.44858523119392685, + "grad_norm": 6.528143882751465, + "learning_rate": 1.215e-06, + "loss": 0.1772, + "step": 975 + }, + { + "epoch": 0.46008741660915575, + "grad_norm": 15.690600395202637, + "learning_rate": 1.24625e-06, + "loss": 0.184, + "step": 1000 + }, + { + "epoch": 0.47158960202438466, + "grad_norm": 6.040137767791748, + "learning_rate": 1.2775e-06, + "loss": 0.1684, + "step": 1025 + }, + { + "epoch": 0.4830917874396135, + "grad_norm": 12.872380256652832, + "learning_rate": 1.3087500000000002e-06, + "loss": 0.1934, + "step": 1050 + }, + { + "epoch": 0.4945939728548424, + "grad_norm": 6.453334331512451, + "learning_rate": 1.34e-06, + "loss": 0.1671, + "step": 1075 + }, + { + "epoch": 0.5060961582700713, + "grad_norm": 12.546272277832031, + "learning_rate": 1.3712500000000002e-06, + "loss": 0.1901, + "step": 1100 + }, + { + "epoch": 0.5175983436853002, + "grad_norm": 6.841800689697266, + "learning_rate": 1.4025e-06, + "loss": 0.1726, + "step": 1125 + }, + { + "epoch": 0.5291005291005291, + "grad_norm": 16.96964454650879, + "learning_rate": 1.43375e-06, + "loss": 0.2027, + "step": 1150 + }, + { + "epoch": 0.540602714515758, + "grad_norm": 5.311614036560059, + "learning_rate": 1.465e-06, + "loss": 0.152, + "step": 1175 + 
}, + { + "epoch": 0.5521048999309869, + "grad_norm": 15.524170875549316, + "learning_rate": 1.49625e-06, + "loss": 0.1986, + "step": 1200 + }, + { + "epoch": 0.5636070853462157, + "grad_norm": 9.683732986450195, + "learning_rate": 1.5275000000000002e-06, + "loss": 0.1624, + "step": 1225 + }, + { + "epoch": 0.5751092707614447, + "grad_norm": 11.873454093933105, + "learning_rate": 1.5587500000000001e-06, + "loss": 0.1875, + "step": 1250 + }, + { + "epoch": 0.5866114561766735, + "grad_norm": 5.891846179962158, + "learning_rate": 1.5900000000000002e-06, + "loss": 0.1603, + "step": 1275 + }, + { + "epoch": 0.5981136415919025, + "grad_norm": 14.153804779052734, + "learning_rate": 1.6212500000000001e-06, + "loss": 0.1641, + "step": 1300 + }, + { + "epoch": 0.6096158270071314, + "grad_norm": 7.2772955894470215, + "learning_rate": 1.6525000000000003e-06, + "loss": 0.154, + "step": 1325 + }, + { + "epoch": 0.6211180124223602, + "grad_norm": 14.763301849365234, + "learning_rate": 1.68375e-06, + "loss": 0.1745, + "step": 1350 + }, + { + "epoch": 0.6326201978375892, + "grad_norm": 8.715161323547363, + "learning_rate": 1.7149999999999999e-06, + "loss": 0.1667, + "step": 1375 + }, + { + "epoch": 0.644122383252818, + "grad_norm": 11.507594108581543, + "learning_rate": 1.74625e-06, + "loss": 0.17, + "step": 1400 + }, + { + "epoch": 0.655624568668047, + "grad_norm": 5.30320405960083, + "learning_rate": 1.7775e-06, + "loss": 0.1735, + "step": 1425 + }, + { + "epoch": 0.6671267540832758, + "grad_norm": 12.242377281188965, + "learning_rate": 1.80875e-06, + "loss": 0.1834, + "step": 1450 + }, + { + "epoch": 0.6786289394985047, + "grad_norm": 7.7493462562561035, + "learning_rate": 1.84e-06, + "loss": 0.1575, + "step": 1475 + }, + { + "epoch": 0.6901311249137336, + "grad_norm": 14.08462142944336, + "learning_rate": 1.87125e-06, + "loss": 0.1818, + "step": 1500 + }, + { + "epoch": 0.7016333103289625, + "grad_norm": 5.448755264282227, + "learning_rate": 1.9025000000000002e-06, + "loss": 0.1483, + "step": 1525 + }, + { + "epoch": 0.7131354957441914, + "grad_norm": 14.599383354187012, + "learning_rate": 1.9337500000000003e-06, + "loss": 0.1877, + "step": 1550 + }, + { + "epoch": 0.7246376811594203, + "grad_norm": 6.832576274871826, + "learning_rate": 1.9650000000000002e-06, + "loss": 0.1639, + "step": 1575 + }, + { + "epoch": 0.7361398665746491, + "grad_norm": 11.764741897583008, + "learning_rate": 1.99625e-06, + "loss": 0.1887, + "step": 1600 + }, + { + "epoch": 0.7476420519898781, + "grad_norm": 5.073885917663574, + "learning_rate": 2.0275e-06, + "loss": 0.1449, + "step": 1625 + }, + { + "epoch": 0.759144237405107, + "grad_norm": 11.470847129821777, + "learning_rate": 2.0587500000000004e-06, + "loss": 0.1776, + "step": 1650 + }, + { + "epoch": 0.7706464228203359, + "grad_norm": 8.333059310913086, + "learning_rate": 2.09e-06, + "loss": 0.1572, + "step": 1675 + }, + { + "epoch": 0.7821486082355648, + "grad_norm": 10.23344612121582, + "learning_rate": 2.12125e-06, + "loss": 0.2047, + "step": 1700 + }, + { + "epoch": 0.7936507936507936, + "grad_norm": 7.107337951660156, + "learning_rate": 2.1525e-06, + "loss": 0.1635, + "step": 1725 + }, + { + "epoch": 0.8051529790660226, + "grad_norm": 12.670259475708008, + "learning_rate": 2.18375e-06, + "loss": 0.1784, + "step": 1750 + }, + { + "epoch": 0.8166551644812514, + "grad_norm": 6.77697229385376, + "learning_rate": 2.215e-06, + "loss": 0.1526, + "step": 1775 + }, + { + "epoch": 0.8281573498964804, + "grad_norm": 13.887433052062988, + "learning_rate": 2.24625e-06, + "loss": 
0.1897, + "step": 1800 + }, + { + "epoch": 0.8396595353117092, + "grad_norm": 4.762125492095947, + "learning_rate": 2.2775000000000002e-06, + "loss": 0.1464, + "step": 1825 + }, + { + "epoch": 0.8511617207269381, + "grad_norm": 19.292552947998047, + "learning_rate": 2.30875e-06, + "loss": 0.174, + "step": 1850 + }, + { + "epoch": 0.862663906142167, + "grad_norm": 7.12981653213501, + "learning_rate": 2.34e-06, + "loss": 0.1602, + "step": 1875 + }, + { + "epoch": 0.8741660915573959, + "grad_norm": 9.92496395111084, + "learning_rate": 2.3712500000000004e-06, + "loss": 0.1571, + "step": 1900 + }, + { + "epoch": 0.8856682769726248, + "grad_norm": 6.081151008605957, + "learning_rate": 2.4025000000000003e-06, + "loss": 0.1608, + "step": 1925 + }, + { + "epoch": 0.8971704623878537, + "grad_norm": 15.237141609191895, + "learning_rate": 2.43375e-06, + "loss": 0.1634, + "step": 1950 + }, + { + "epoch": 0.9086726478030825, + "grad_norm": 6.994187831878662, + "learning_rate": 2.465e-06, + "loss": 0.1531, + "step": 1975 + }, + { + "epoch": 0.9201748332183115, + "grad_norm": 16.033370971679688, + "learning_rate": 2.49625e-06, + "loss": 0.1583, + "step": 2000 + }, + { + "epoch": 0.9316770186335404, + "grad_norm": 6.178311824798584, + "learning_rate": 2.5275e-06, + "loss": 0.1502, + "step": 2025 + }, + { + "epoch": 0.9431792040487693, + "grad_norm": 10.050224304199219, + "learning_rate": 2.55875e-06, + "loss": 0.1763, + "step": 2050 + }, + { + "epoch": 0.9546813894639982, + "grad_norm": 5.254738807678223, + "learning_rate": 2.59e-06, + "loss": 0.1453, + "step": 2075 + }, + { + "epoch": 0.966183574879227, + "grad_norm": 13.425460815429688, + "learning_rate": 2.62125e-06, + "loss": 0.1536, + "step": 2100 + }, + { + "epoch": 0.977685760294456, + "grad_norm": 4.7515459060668945, + "learning_rate": 2.6525e-06, + "loss": 0.1579, + "step": 2125 + }, + { + "epoch": 0.9891879457096848, + "grad_norm": 19.656898498535156, + "learning_rate": 2.6837500000000004e-06, + "loss": 0.1984, + "step": 2150 + }, + { + "epoch": 1.0006901311249137, + "grad_norm": 2.677222728729248, + "learning_rate": 2.7150000000000003e-06, + "loss": 0.1498, + "step": 2175 + }, + { + "epoch": 1.0121923165401425, + "grad_norm": 5.08892297744751, + "learning_rate": 2.74625e-06, + "loss": 0.1184, + "step": 2200 + }, + { + "epoch": 1.0236945019553716, + "grad_norm": 3.9588983058929443, + "learning_rate": 2.7775e-06, + "loss": 0.1467, + "step": 2225 + }, + { + "epoch": 1.0351966873706004, + "grad_norm": 11.414639472961426, + "learning_rate": 2.8087500000000004e-06, + "loss": 0.1113, + "step": 2250 + }, + { + "epoch": 1.0466988727858293, + "grad_norm": 2.633138418197632, + "learning_rate": 2.8400000000000003e-06, + "loss": 0.1353, + "step": 2275 + }, + { + "epoch": 1.0582010582010581, + "grad_norm": 5.479538917541504, + "learning_rate": 2.87125e-06, + "loss": 0.1261, + "step": 2300 + }, + { + "epoch": 1.069703243616287, + "grad_norm": 3.630627393722534, + "learning_rate": 2.9025e-06, + "loss": 0.1227, + "step": 2325 + }, + { + "epoch": 1.081205429031516, + "grad_norm": 8.805641174316406, + "learning_rate": 2.93375e-06, + "loss": 0.1229, + "step": 2350 + }, + { + "epoch": 1.092707614446745, + "grad_norm": 3.6392369270324707, + "learning_rate": 2.965e-06, + "loss": 0.1476, + "step": 2375 + }, + { + "epoch": 1.1042097998619738, + "grad_norm": 4.040417194366455, + "learning_rate": 2.99625e-06, + "loss": 0.1146, + "step": 2400 + }, + { + "epoch": 1.1157119852772026, + "grad_norm": 2.933556318283081, + "learning_rate": 3.0275000000000002e-06, + "loss": 
0.1226, + "step": 2425 + }, + { + "epoch": 1.1272141706924317, + "grad_norm": 3.0352232456207275, + "learning_rate": 3.05875e-06, + "loss": 0.1224, + "step": 2450 + }, + { + "epoch": 1.1387163561076605, + "grad_norm": 3.9333884716033936, + "learning_rate": 3.09e-06, + "loss": 0.116, + "step": 2475 + }, + { + "epoch": 1.1502185415228894, + "grad_norm": 4.144917011260986, + "learning_rate": 3.1212500000000004e-06, + "loss": 0.1101, + "step": 2500 + }, + { + "epoch": 1.1617207269381182, + "grad_norm": 3.215536117553711, + "learning_rate": 3.1525e-06, + "loss": 0.1464, + "step": 2525 + }, + { + "epoch": 1.173222912353347, + "grad_norm": 10.1427640914917, + "learning_rate": 3.18375e-06, + "loss": 0.122, + "step": 2550 + }, + { + "epoch": 1.1847250977685762, + "grad_norm": 2.707651138305664, + "learning_rate": 3.215e-06, + "loss": 0.129, + "step": 2575 + }, + { + "epoch": 1.196227283183805, + "grad_norm": 4.589003562927246, + "learning_rate": 3.24625e-06, + "loss": 0.1083, + "step": 2600 + }, + { + "epoch": 1.2077294685990339, + "grad_norm": 2.6789445877075195, + "learning_rate": 3.2775e-06, + "loss": 0.1446, + "step": 2625 + }, + { + "epoch": 1.2192316540142627, + "grad_norm": 7.148416519165039, + "learning_rate": 3.30875e-06, + "loss": 0.1153, + "step": 2650 + }, + { + "epoch": 1.2307338394294915, + "grad_norm": 2.8945469856262207, + "learning_rate": 3.34e-06, + "loss": 0.1267, + "step": 2675 + }, + { + "epoch": 1.2422360248447206, + "grad_norm": 7.540188312530518, + "learning_rate": 3.37125e-06, + "loss": 0.1106, + "step": 2700 + }, + { + "epoch": 1.2537382102599495, + "grad_norm": 3.4765818119049072, + "learning_rate": 3.4025e-06, + "loss": 0.1377, + "step": 2725 + }, + { + "epoch": 1.2652403956751783, + "grad_norm": 5.349803447723389, + "learning_rate": 3.4337500000000004e-06, + "loss": 0.0954, + "step": 2750 + }, + { + "epoch": 1.2767425810904072, + "grad_norm": 2.526627779006958, + "learning_rate": 3.4650000000000003e-06, + "loss": 0.1224, + "step": 2775 + }, + { + "epoch": 1.288244766505636, + "grad_norm": 6.571626663208008, + "learning_rate": 3.49625e-06, + "loss": 0.118, + "step": 2800 + }, + { + "epoch": 1.299746951920865, + "grad_norm": 2.319915533065796, + "learning_rate": 3.5275e-06, + "loss": 0.1581, + "step": 2825 + }, + { + "epoch": 1.311249137336094, + "grad_norm": 3.2540760040283203, + "learning_rate": 3.5587500000000004e-06, + "loss": 0.1196, + "step": 2850 + }, + { + "epoch": 1.3227513227513228, + "grad_norm": 3.912529706954956, + "learning_rate": 3.5900000000000004e-06, + "loss": 0.146, + "step": 2875 + }, + { + "epoch": 1.3342535081665516, + "grad_norm": 3.1499977111816406, + "learning_rate": 3.6212500000000003e-06, + "loss": 0.1158, + "step": 2900 + }, + { + "epoch": 1.3457556935817805, + "grad_norm": 3.1882896423339844, + "learning_rate": 3.6525e-06, + "loss": 0.1517, + "step": 2925 + }, + { + "epoch": 1.3572578789970096, + "grad_norm": 5.0317888259887695, + "learning_rate": 3.6837500000000005e-06, + "loss": 0.1129, + "step": 2950 + }, + { + "epoch": 1.3687600644122382, + "grad_norm": 3.961643695831299, + "learning_rate": 3.7150000000000004e-06, + "loss": 0.1233, + "step": 2975 + }, + { + "epoch": 1.3802622498274673, + "grad_norm": 5.409346580505371, + "learning_rate": 3.7462500000000003e-06, + "loss": 0.1302, + "step": 3000 + }, + { + "epoch": 1.391764435242696, + "grad_norm": 3.978931427001953, + "learning_rate": 3.7775000000000007e-06, + "loss": 0.1361, + "step": 3025 + }, + { + "epoch": 1.403266620657925, + "grad_norm": 6.479454040527344, + "learning_rate": 
3.8087500000000006e-06, + "loss": 0.1167, + "step": 3050 + }, + { + "epoch": 1.414768806073154, + "grad_norm": 3.1381306648254395, + "learning_rate": 3.84e-06, + "loss": 0.1199, + "step": 3075 + }, + { + "epoch": 1.4262709914883827, + "grad_norm": 5.639588832855225, + "learning_rate": 3.8712499999999996e-06, + "loss": 0.0967, + "step": 3100 + }, + { + "epoch": 1.4377731769036117, + "grad_norm": 3.615877866744995, + "learning_rate": 3.9025e-06, + "loss": 0.1334, + "step": 3125 + }, + { + "epoch": 1.4492753623188406, + "grad_norm": 5.772467136383057, + "learning_rate": 3.93375e-06, + "loss": 0.114, + "step": 3150 + }, + { + "epoch": 1.4607775477340694, + "grad_norm": 3.489830255508423, + "learning_rate": 3.965e-06, + "loss": 0.1421, + "step": 3175 + }, + { + "epoch": 1.4722797331492985, + "grad_norm": 6.027266502380371, + "learning_rate": 3.99625e-06, + "loss": 0.1212, + "step": 3200 + }, + { + "epoch": 1.4837819185645271, + "grad_norm": 3.047349452972412, + "learning_rate": 4.0275e-06, + "loss": 0.122, + "step": 3225 + }, + { + "epoch": 1.4952841039797562, + "grad_norm": 7.183162689208984, + "learning_rate": 4.05875e-06, + "loss": 0.1265, + "step": 3250 + }, + { + "epoch": 1.506786289394985, + "grad_norm": 3.0059525966644287, + "learning_rate": 4.09e-06, + "loss": 0.1468, + "step": 3275 + }, + { + "epoch": 1.518288474810214, + "grad_norm": 7.402144908905029, + "learning_rate": 4.12125e-06, + "loss": 0.1156, + "step": 3300 + }, + { + "epoch": 1.529790660225443, + "grad_norm": 3.278670072555542, + "learning_rate": 4.1525000000000005e-06, + "loss": 0.1201, + "step": 3325 + }, + { + "epoch": 1.5412928456406716, + "grad_norm": 5.25584602355957, + "learning_rate": 4.18375e-06, + "loss": 0.0967, + "step": 3350 + }, + { + "epoch": 1.5527950310559007, + "grad_norm": 4.171654224395752, + "learning_rate": 4.215e-06, + "loss": 0.1219, + "step": 3375 + }, + { + "epoch": 1.5642972164711295, + "grad_norm": 4.809912204742432, + "learning_rate": 4.24625e-06, + "loss": 0.1082, + "step": 3400 + }, + { + "epoch": 1.5757994018863584, + "grad_norm": 4.222274303436279, + "learning_rate": 4.2775e-06, + "loss": 0.1219, + "step": 3425 + }, + { + "epoch": 1.5873015873015874, + "grad_norm": 7.9530792236328125, + "learning_rate": 4.30875e-06, + "loss": 0.1228, + "step": 3450 + }, + { + "epoch": 1.598803772716816, + "grad_norm": 3.8543384075164795, + "learning_rate": 4.34e-06, + "loss": 0.1422, + "step": 3475 + }, + { + "epoch": 1.6103059581320451, + "grad_norm": 7.819809436798096, + "learning_rate": 4.371250000000001e-06, + "loss": 0.1306, + "step": 3500 + }, + { + "epoch": 1.621808143547274, + "grad_norm": 3.5284693241119385, + "learning_rate": 4.402500000000001e-06, + "loss": 0.1196, + "step": 3525 + }, + { + "epoch": 1.6333103289625028, + "grad_norm": 8.282682418823242, + "learning_rate": 4.4337500000000005e-06, + "loss": 0.102, + "step": 3550 + }, + { + "epoch": 1.644812514377732, + "grad_norm": 4.162339210510254, + "learning_rate": 4.4650000000000004e-06, + "loss": 0.1474, + "step": 3575 + }, + { + "epoch": 1.6563146997929605, + "grad_norm": 6.563460350036621, + "learning_rate": 4.49625e-06, + "loss": 0.1218, + "step": 3600 + }, + { + "epoch": 1.6678168852081896, + "grad_norm": 3.1366829872131348, + "learning_rate": 4.5275e-06, + "loss": 0.1332, + "step": 3625 + }, + { + "epoch": 1.6793190706234185, + "grad_norm": 6.515697956085205, + "learning_rate": 4.55875e-06, + "loss": 0.1108, + "step": 3650 + }, + { + "epoch": 1.6908212560386473, + "grad_norm": 3.636465072631836, + "learning_rate": 4.590000000000001e-06, + 
"loss": 0.1126, + "step": 3675 + }, + { + "epoch": 1.7023234414538764, + "grad_norm": 5.450216770172119, + "learning_rate": 4.62125e-06, + "loss": 0.1014, + "step": 3700 + }, + { + "epoch": 1.713825626869105, + "grad_norm": 3.8343234062194824, + "learning_rate": 4.6525e-06, + "loss": 0.147, + "step": 3725 + }, + { + "epoch": 1.725327812284334, + "grad_norm": 6.672384738922119, + "learning_rate": 4.68375e-06, + "loss": 0.1196, + "step": 3750 + }, + { + "epoch": 1.736829997699563, + "grad_norm": 3.536450147628784, + "learning_rate": 4.715e-06, + "loss": 0.1546, + "step": 3775 + }, + { + "epoch": 1.7483321831147918, + "grad_norm": 4.106471538543701, + "learning_rate": 4.74625e-06, + "loss": 0.1016, + "step": 3800 + }, + { + "epoch": 1.7598343685300208, + "grad_norm": 3.1923904418945312, + "learning_rate": 4.7775e-06, + "loss": 0.1414, + "step": 3825 + }, + { + "epoch": 1.7713365539452495, + "grad_norm": 5.6106157302856445, + "learning_rate": 4.80875e-06, + "loss": 0.1113, + "step": 3850 + }, + { + "epoch": 1.7828387393604785, + "grad_norm": 3.33258056640625, + "learning_rate": 4.84e-06, + "loss": 0.1228, + "step": 3875 + }, + { + "epoch": 1.7943409247757074, + "grad_norm": 4.954050064086914, + "learning_rate": 4.87125e-06, + "loss": 0.1204, + "step": 3900 + }, + { + "epoch": 1.8058431101909362, + "grad_norm": 3.758305072784424, + "learning_rate": 4.9025e-06, + "loss": 0.1226, + "step": 3925 + }, + { + "epoch": 1.8173452956061653, + "grad_norm": 5.375064373016357, + "learning_rate": 4.93375e-06, + "loss": 0.1344, + "step": 3950 + }, + { + "epoch": 1.828847481021394, + "grad_norm": 3.2198784351348877, + "learning_rate": 4.965e-06, + "loss": 0.1352, + "step": 3975 + }, + { + "epoch": 1.840349666436623, + "grad_norm": 6.347688674926758, + "learning_rate": 4.996250000000001e-06, + "loss": 0.1218, + "step": 4000 + }, + { + "epoch": 1.8518518518518519, + "grad_norm": 2.7024085521698, + "learning_rate": 5.0275000000000006e-06, + "loss": 0.1361, + "step": 4025 + }, + { + "epoch": 1.8633540372670807, + "grad_norm": 4.011370658874512, + "learning_rate": 5.0587500000000005e-06, + "loss": 0.1191, + "step": 4050 + }, + { + "epoch": 1.8748562226823098, + "grad_norm": 3.9904325008392334, + "learning_rate": 5.09e-06, + "loss": 0.1446, + "step": 4075 + }, + { + "epoch": 1.8863584080975384, + "grad_norm": 9.331755638122559, + "learning_rate": 5.12125e-06, + "loss": 0.1054, + "step": 4100 + }, + { + "epoch": 1.8978605935127675, + "grad_norm": 3.661421060562134, + "learning_rate": 5.151250000000001e-06, + "loss": 0.1424, + "step": 4125 + }, + { + "epoch": 1.9093627789279963, + "grad_norm": 5.570228576660156, + "learning_rate": 5.182500000000001e-06, + "loss": 0.1106, + "step": 4150 + }, + { + "epoch": 1.9208649643432252, + "grad_norm": 2.8392717838287354, + "learning_rate": 5.213750000000001e-06, + "loss": 0.1464, + "step": 4175 + }, + { + "epoch": 1.9323671497584543, + "grad_norm": 3.853571891784668, + "learning_rate": 5.245e-06, + "loss": 0.1287, + "step": 4200 + }, + { + "epoch": 1.9438693351736829, + "grad_norm": 3.817902088165283, + "learning_rate": 5.27625e-06, + "loss": 0.1274, + "step": 4225 + }, + { + "epoch": 1.955371520588912, + "grad_norm": 7.2849297523498535, + "learning_rate": 5.3075e-06, + "loss": 0.1192, + "step": 4250 + }, + { + "epoch": 1.9668737060041408, + "grad_norm": 3.0766196250915527, + "learning_rate": 5.33875e-06, + "loss": 0.1546, + "step": 4275 + }, + { + "epoch": 1.9783758914193696, + "grad_norm": 4.272324085235596, + "learning_rate": 5.37e-06, + "loss": 0.1152, + "step": 4300 + }, + 
{ + "epoch": 1.9898780768345987, + "grad_norm": 3.574113607406616, + "learning_rate": 5.40125e-06, + "loss": 0.1537, + "step": 4325 + }, + { + "epoch": 2.0013802622498273, + "grad_norm": 3.3172659873962402, + "learning_rate": 5.4325e-06, + "loss": 0.1174, + "step": 4350 + }, + { + "epoch": 2.0128824476650564, + "grad_norm": 9.897499084472656, + "learning_rate": 5.46375e-06, + "loss": 0.0623, + "step": 4375 + }, + { + "epoch": 2.024384633080285, + "grad_norm": 2.7083864212036133, + "learning_rate": 5.495e-06, + "loss": 0.0892, + "step": 4400 + }, + { + "epoch": 2.035886818495514, + "grad_norm": 4.11065149307251, + "learning_rate": 5.52625e-06, + "loss": 0.0634, + "step": 4425 + }, + { + "epoch": 2.047389003910743, + "grad_norm": 2.104963779449463, + "learning_rate": 5.557500000000001e-06, + "loss": 0.0661, + "step": 4450 + }, + { + "epoch": 2.058891189325972, + "grad_norm": 3.848796844482422, + "learning_rate": 5.5887500000000005e-06, + "loss": 0.0586, + "step": 4475 + }, + { + "epoch": 2.070393374741201, + "grad_norm": 2.1717464923858643, + "learning_rate": 5.62e-06, + "loss": 0.0773, + "step": 4500 + }, + { + "epoch": 2.0818955601564295, + "grad_norm": 4.033133506774902, + "learning_rate": 5.65125e-06, + "loss": 0.0584, + "step": 4525 + }, + { + "epoch": 2.0933977455716586, + "grad_norm": 2.5643622875213623, + "learning_rate": 5.6825e-06, + "loss": 0.0776, + "step": 4550 + }, + { + "epoch": 2.1048999309868877, + "grad_norm": 6.559327602386475, + "learning_rate": 5.71375e-06, + "loss": 0.0643, + "step": 4575 + }, + { + "epoch": 2.1164021164021163, + "grad_norm": 2.1577560901641846, + "learning_rate": 5.745e-06, + "loss": 0.0687, + "step": 4600 + }, + { + "epoch": 2.1279043018173454, + "grad_norm": 8.438713073730469, + "learning_rate": 5.776250000000001e-06, + "loss": 0.072, + "step": 4625 + }, + { + "epoch": 2.139406487232574, + "grad_norm": 2.4797635078430176, + "learning_rate": 5.807500000000001e-06, + "loss": 0.087, + "step": 4650 + }, + { + "epoch": 2.150908672647803, + "grad_norm": 2.543196201324463, + "learning_rate": 5.838750000000001e-06, + "loss": 0.0586, + "step": 4675 + }, + { + "epoch": 2.162410858063032, + "grad_norm": 2.4707229137420654, + "learning_rate": 5.8700000000000005e-06, + "loss": 0.0723, + "step": 4700 + }, + { + "epoch": 2.1739130434782608, + "grad_norm": 5.645440101623535, + "learning_rate": 5.9012500000000005e-06, + "loss": 0.0654, + "step": 4725 + }, + { + "epoch": 2.18541522889349, + "grad_norm": 2.701606512069702, + "learning_rate": 5.9325e-06, + "loss": 0.0846, + "step": 4750 + }, + { + "epoch": 2.1969174143087185, + "grad_norm": 3.8023571968078613, + "learning_rate": 5.96375e-06, + "loss": 0.0581, + "step": 4775 + }, + { + "epoch": 2.2084195997239475, + "grad_norm": 1.9498426914215088, + "learning_rate": 5.995e-06, + "loss": 0.0911, + "step": 4800 + }, + { + "epoch": 2.2199217851391766, + "grad_norm": 2.360180139541626, + "learning_rate": 6.02625e-06, + "loss": 0.0728, + "step": 4825 + }, + { + "epoch": 2.2314239705544052, + "grad_norm": 2.91253924369812, + "learning_rate": 6.0575e-06, + "loss": 0.086, + "step": 4850 + }, + { + "epoch": 2.2429261559696343, + "grad_norm": 4.982974052429199, + "learning_rate": 6.08875e-06, + "loss": 0.071, + "step": 4875 + }, + { + "epoch": 2.2544283413848634, + "grad_norm": 2.393528461456299, + "learning_rate": 6.12e-06, + "loss": 0.0889, + "step": 4900 + }, + { + "epoch": 2.265930526800092, + "grad_norm": 3.294156312942505, + "learning_rate": 6.15125e-06, + "loss": 0.0659, + "step": 4925 + }, + { + "epoch": 
2.277432712215321, + "grad_norm": 2.5716331005096436, + "learning_rate": 6.1825e-06, + "loss": 0.072, + "step": 4950 + }, + { + "epoch": 2.2889348976305497, + "grad_norm": 5.017734527587891, + "learning_rate": 6.2137500000000004e-06, + "loss": 0.0742, + "step": 4975 + }, + { + "epoch": 2.3004370830457788, + "grad_norm": 2.866231679916382, + "learning_rate": 6.245e-06, + "loss": 0.0876, + "step": 5000 + }, + { + "epoch": 2.3004370830457788, + "eval_loss": 0.16616719961166382, + "eval_runtime": 5341.8191, + "eval_samples_per_second": 1.777, + "eval_steps_per_second": 0.222, + "eval_wer": 0.10594170403587444, + "step": 5000 + }, + { + "epoch": 2.311939268461008, + "grad_norm": 6.034395694732666, + "learning_rate": 6.2434375e-06, + "loss": 0.0644, + "step": 5025 + }, + { + "epoch": 2.3234414538762365, + "grad_norm": 2.2458648681640625, + "learning_rate": 6.235625e-06, + "loss": 0.0776, + "step": 5050 + }, + { + "epoch": 2.3349436392914655, + "grad_norm": 4.230370998382568, + "learning_rate": 6.2278125e-06, + "loss": 0.0663, + "step": 5075 + }, + { + "epoch": 2.346445824706694, + "grad_norm": 2.9568865299224854, + "learning_rate": 6.22e-06, + "loss": 0.0908, + "step": 5100 + }, + { + "epoch": 2.3579480101219232, + "grad_norm": 3.1515731811523438, + "learning_rate": 6.2121875e-06, + "loss": 0.0767, + "step": 5125 + }, + { + "epoch": 2.3694501955371523, + "grad_norm": 3.660957098007202, + "learning_rate": 6.204375e-06, + "loss": 0.08, + "step": 5150 + }, + { + "epoch": 2.380952380952381, + "grad_norm": 3.4517126083374023, + "learning_rate": 6.196562500000001e-06, + "loss": 0.0715, + "step": 5175 + }, + { + "epoch": 2.39245456636761, + "grad_norm": 3.5403709411621094, + "learning_rate": 6.18875e-06, + "loss": 0.0749, + "step": 5200 + }, + { + "epoch": 2.4039567517828386, + "grad_norm": 4.9767866134643555, + "learning_rate": 6.1809375000000005e-06, + "loss": 0.0757, + "step": 5225 + }, + { + "epoch": 2.4154589371980677, + "grad_norm": 3.120891809463501, + "learning_rate": 6.173125e-06, + "loss": 0.0968, + "step": 5250 + }, + { + "epoch": 2.4269611226132968, + "grad_norm": 3.43932843208313, + "learning_rate": 6.165312500000001e-06, + "loss": 0.0724, + "step": 5275 + }, + { + "epoch": 2.4384633080285254, + "grad_norm": 3.4927871227264404, + "learning_rate": 6.1575e-06, + "loss": 0.0831, + "step": 5300 + }, + { + "epoch": 2.4499654934437545, + "grad_norm": 3.009047746658325, + "learning_rate": 6.1496875000000006e-06, + "loss": 0.0695, + "step": 5325 + }, + { + "epoch": 2.461467678858983, + "grad_norm": 3.073551654815674, + "learning_rate": 6.141875e-06, + "loss": 0.0893, + "step": 5350 + }, + { + "epoch": 2.472969864274212, + "grad_norm": 5.566808223724365, + "learning_rate": 6.1340625e-06, + "loss": 0.0743, + "step": 5375 + }, + { + "epoch": 2.4844720496894412, + "grad_norm": 2.416825771331787, + "learning_rate": 6.12625e-06, + "loss": 0.079, + "step": 5400 + }, + { + "epoch": 2.49597423510467, + "grad_norm": 4.4972357749938965, + "learning_rate": 6.1184375e-06, + "loss": 0.0712, + "step": 5425 + }, + { + "epoch": 2.507476420519899, + "grad_norm": 3.5067849159240723, + "learning_rate": 6.1106250000000005e-06, + "loss": 0.0883, + "step": 5450 + }, + { + "epoch": 2.5189786059351276, + "grad_norm": 4.83007287979126, + "learning_rate": 6.1028125e-06, + "loss": 0.0775, + "step": 5475 + }, + { + "epoch": 2.5304807913503566, + "grad_norm": 3.0138561725616455, + "learning_rate": 6.095e-06, + "loss": 0.0814, + "step": 5500 + }, + { + "epoch": 2.5419829767655857, + "grad_norm": 5.821829795837402, + 
"learning_rate": 6.0871875e-06, + "loss": 0.0632, + "step": 5525 + }, + { + "epoch": 2.5534851621808143, + "grad_norm": 2.7620084285736084, + "learning_rate": 6.0793750000000006e-06, + "loss": 0.0789, + "step": 5550 + }, + { + "epoch": 2.5649873475960434, + "grad_norm": 5.065167427062988, + "learning_rate": 6.0715625e-06, + "loss": 0.0758, + "step": 5575 + }, + { + "epoch": 2.576489533011272, + "grad_norm": 1.777954339981079, + "learning_rate": 6.06375e-06, + "loss": 0.0881, + "step": 5600 + }, + { + "epoch": 2.587991718426501, + "grad_norm": 4.819468975067139, + "learning_rate": 6.0559375e-06, + "loss": 0.0715, + "step": 5625 + }, + { + "epoch": 2.59949390384173, + "grad_norm": 2.716526985168457, + "learning_rate": 6.048125000000001e-06, + "loss": 0.0912, + "step": 5650 + }, + { + "epoch": 2.610996089256959, + "grad_norm": 4.544143199920654, + "learning_rate": 6.0403125000000005e-06, + "loss": 0.0689, + "step": 5675 + }, + { + "epoch": 2.622498274672188, + "grad_norm": 2.0918431282043457, + "learning_rate": 6.0325e-06, + "loss": 0.0832, + "step": 5700 + }, + { + "epoch": 2.6340004600874165, + "grad_norm": 4.814356803894043, + "learning_rate": 6.0246875e-06, + "loss": 0.073, + "step": 5725 + }, + { + "epoch": 2.6455026455026456, + "grad_norm": 3.759373664855957, + "learning_rate": 6.016875e-06, + "loss": 0.0803, + "step": 5750 + }, + { + "epoch": 2.6570048309178746, + "grad_norm": 3.2967991828918457, + "learning_rate": 6.0090625000000005e-06, + "loss": 0.0767, + "step": 5775 + }, + { + "epoch": 2.6685070163331033, + "grad_norm": 4.567154884338379, + "learning_rate": 6.00125e-06, + "loss": 0.079, + "step": 5800 + }, + { + "epoch": 2.6800092017483323, + "grad_norm": 3.424586534500122, + "learning_rate": 5.9934375e-06, + "loss": 0.0761, + "step": 5825 + }, + { + "epoch": 2.691511387163561, + "grad_norm": 2.420856475830078, + "learning_rate": 5.985625e-06, + "loss": 0.0848, + "step": 5850 + }, + { + "epoch": 2.70301357257879, + "grad_norm": 6.956820487976074, + "learning_rate": 5.977812500000001e-06, + "loss": 0.0702, + "step": 5875 + }, + { + "epoch": 2.714515757994019, + "grad_norm": 2.5272533893585205, + "learning_rate": 5.9700000000000004e-06, + "loss": 0.0842, + "step": 5900 + }, + { + "epoch": 2.7260179434092477, + "grad_norm": 5.917661190032959, + "learning_rate": 5.9621875e-06, + "loss": 0.081, + "step": 5925 + }, + { + "epoch": 2.7375201288244764, + "grad_norm": 2.5169830322265625, + "learning_rate": 5.954375e-06, + "loss": 0.0929, + "step": 5950 + }, + { + "epoch": 2.7490223142397054, + "grad_norm": 8.81894588470459, + "learning_rate": 5.946562500000001e-06, + "loss": 0.0764, + "step": 5975 + }, + { + "epoch": 2.7605244996549345, + "grad_norm": 3.4220263957977295, + "learning_rate": 5.9387500000000005e-06, + "loss": 0.0821, + "step": 6000 + }, + { + "epoch": 2.7720266850701636, + "grad_norm": 3.72196626663208, + "learning_rate": 5.9309375e-06, + "loss": 0.0773, + "step": 6025 + }, + { + "epoch": 2.783528870485392, + "grad_norm": 3.5996947288513184, + "learning_rate": 5.923125e-06, + "loss": 0.0952, + "step": 6050 + }, + { + "epoch": 2.795031055900621, + "grad_norm": 3.508704423904419, + "learning_rate": 5.9153125e-06, + "loss": 0.0677, + "step": 6075 + }, + { + "epoch": 2.80653324131585, + "grad_norm": 3.768465042114258, + "learning_rate": 5.907500000000001e-06, + "loss": 0.0903, + "step": 6100 + }, + { + "epoch": 2.818035426731079, + "grad_norm": 7.676156997680664, + "learning_rate": 5.8996875000000004e-06, + "loss": 0.0889, + "step": 6125 + }, + { + "epoch": 2.829537612146308, + 
"grad_norm": 4.185784816741943, + "learning_rate": 5.891875e-06, + "loss": 0.0901, + "step": 6150 + }, + { + "epoch": 2.8410397975615367, + "grad_norm": 5.07861328125, + "learning_rate": 5.8840625e-06, + "loss": 0.0736, + "step": 6175 + }, + { + "epoch": 2.8525419829767653, + "grad_norm": 2.5589280128479004, + "learning_rate": 5.876250000000001e-06, + "loss": 0.0726, + "step": 6200 + }, + { + "epoch": 2.8640441683919944, + "grad_norm": 5.522654056549072, + "learning_rate": 5.8684375e-06, + "loss": 0.081, + "step": 6225 + }, + { + "epoch": 2.8755463538072235, + "grad_norm": 2.873734474182129, + "learning_rate": 5.860625e-06, + "loss": 0.0934, + "step": 6250 + }, + { + "epoch": 2.8870485392224525, + "grad_norm": 4.135101318359375, + "learning_rate": 5.8528125e-06, + "loss": 0.0691, + "step": 6275 + }, + { + "epoch": 2.898550724637681, + "grad_norm": 3.2022476196289062, + "learning_rate": 5.845312500000001e-06, + "loss": 0.0876, + "step": 6300 + }, + { + "epoch": 2.91005291005291, + "grad_norm": 4.878911018371582, + "learning_rate": 5.8375000000000004e-06, + "loss": 0.0784, + "step": 6325 + }, + { + "epoch": 2.921555095468139, + "grad_norm": 2.7555994987487793, + "learning_rate": 5.8296875e-06, + "loss": 0.0932, + "step": 6350 + }, + { + "epoch": 2.933057280883368, + "grad_norm": 4.3659257888793945, + "learning_rate": 5.821875e-06, + "loss": 0.0678, + "step": 6375 + }, + { + "epoch": 2.944559466298597, + "grad_norm": 2.2182586193084717, + "learning_rate": 5.814062500000001e-06, + "loss": 0.0832, + "step": 6400 + }, + { + "epoch": 2.9560616517138256, + "grad_norm": 4.891880989074707, + "learning_rate": 5.8062500000000005e-06, + "loss": 0.077, + "step": 6425 + }, + { + "epoch": 2.9675638371290542, + "grad_norm": 3.2281267642974854, + "learning_rate": 5.7984375e-06, + "loss": 0.0862, + "step": 6450 + }, + { + "epoch": 2.9790660225442833, + "grad_norm": 4.891918659210205, + "learning_rate": 5.790625e-06, + "loss": 0.0713, + "step": 6475 + }, + { + "epoch": 2.9905682079595124, + "grad_norm": 3.6200342178344727, + "learning_rate": 5.782812500000001e-06, + "loss": 0.1053, + "step": 6500 + }, + { + "epoch": 3.002070393374741, + "grad_norm": 2.0583813190460205, + "learning_rate": 5.775000000000001e-06, + "loss": 0.0679, + "step": 6525 + }, + { + "epoch": 3.01357257878997, + "grad_norm": 4.742440700531006, + "learning_rate": 5.7671875e-06, + "loss": 0.0332, + "step": 6550 + }, + { + "epoch": 3.025074764205199, + "grad_norm": 1.5032464265823364, + "learning_rate": 5.759375e-06, + "loss": 0.0537, + "step": 6575 + }, + { + "epoch": 3.036576949620428, + "grad_norm": 2.877703905105591, + "learning_rate": 5.7515625e-06, + "loss": 0.0373, + "step": 6600 + }, + { + "epoch": 3.048079135035657, + "grad_norm": 2.60418963432312, + "learning_rate": 5.743750000000001e-06, + "loss": 0.0441, + "step": 6625 + }, + { + "epoch": 3.0595813204508855, + "grad_norm": 1.8395668268203735, + "learning_rate": 5.7359375e-06, + "loss": 0.0348, + "step": 6650 + }, + { + "epoch": 3.0710835058661146, + "grad_norm": 2.9202868938446045, + "learning_rate": 5.728125e-06, + "loss": 0.0534, + "step": 6675 + }, + { + "epoch": 3.0825856912813436, + "grad_norm": 2.7743561267852783, + "learning_rate": 5.7203125e-06, + "loss": 0.0353, + "step": 6700 + }, + { + "epoch": 3.0940878766965723, + "grad_norm": 3.1167683601379395, + "learning_rate": 5.712500000000001e-06, + "loss": 0.0505, + "step": 6725 + }, + { + "epoch": 3.1055900621118013, + "grad_norm": 4.696991920471191, + "learning_rate": 5.7046875e-06, + "loss": 0.0321, + "step": 6750 + }, + { 
+ "epoch": 3.11709224752703, + "grad_norm": 1.5438411235809326, + "learning_rate": 5.696875e-06, + "loss": 0.0499, + "step": 6775 + }, + { + "epoch": 3.128594432942259, + "grad_norm": 6.239833831787109, + "learning_rate": 5.6890625e-06, + "loss": 0.0369, + "step": 6800 + }, + { + "epoch": 3.140096618357488, + "grad_norm": 4.79664421081543, + "learning_rate": 5.681250000000001e-06, + "loss": 0.0541, + "step": 6825 + }, + { + "epoch": 3.1515988037727167, + "grad_norm": 3.2525932788848877, + "learning_rate": 5.6734375e-06, + "loss": 0.0359, + "step": 6850 + }, + { + "epoch": 3.163100989187946, + "grad_norm": 2.3709487915039062, + "learning_rate": 5.6656250000000005e-06, + "loss": 0.0464, + "step": 6875 + }, + { + "epoch": 3.1746031746031744, + "grad_norm": 3.340402126312256, + "learning_rate": 5.6578125e-06, + "loss": 0.0331, + "step": 6900 + }, + { + "epoch": 3.1861053600184035, + "grad_norm": 2.4604740142822266, + "learning_rate": 5.65e-06, + "loss": 0.0578, + "step": 6925 + }, + { + "epoch": 3.1976075454336326, + "grad_norm": 4.909114837646484, + "learning_rate": 5.642187500000001e-06, + "loss": 0.0382, + "step": 6950 + }, + { + "epoch": 3.209109730848861, + "grad_norm": 4.222381591796875, + "learning_rate": 5.634375e-06, + "loss": 0.0573, + "step": 6975 + }, + { + "epoch": 3.2206119162640903, + "grad_norm": 3.5466387271881104, + "learning_rate": 5.6265625e-06, + "loss": 0.0355, + "step": 7000 + }, + { + "epoch": 3.232114101679319, + "grad_norm": 4.068739891052246, + "learning_rate": 5.61875e-06, + "loss": 0.0524, + "step": 7025 + }, + { + "epoch": 3.243616287094548, + "grad_norm": 3.5331459045410156, + "learning_rate": 5.610937500000001e-06, + "loss": 0.0388, + "step": 7050 + }, + { + "epoch": 3.255118472509777, + "grad_norm": 5.634138107299805, + "learning_rate": 5.603125e-06, + "loss": 0.0476, + "step": 7075 + }, + { + "epoch": 3.2666206579250057, + "grad_norm": 4.637297630310059, + "learning_rate": 5.5953125000000005e-06, + "loss": 0.038, + "step": 7100 + }, + { + "epoch": 3.2781228433402347, + "grad_norm": 2.1430771350860596, + "learning_rate": 5.5875e-06, + "loss": 0.0536, + "step": 7125 + }, + { + "epoch": 3.2896250287554634, + "grad_norm": 2.6287930011749268, + "learning_rate": 5.579687500000001e-06, + "loss": 0.0393, + "step": 7150 + }, + { + "epoch": 3.3011272141706924, + "grad_norm": 4.748372554779053, + "learning_rate": 5.571875e-06, + "loss": 0.0424, + "step": 7175 + }, + { + "epoch": 3.3126293995859215, + "grad_norm": 3.630303382873535, + "learning_rate": 5.5640625000000006e-06, + "loss": 0.0387, + "step": 7200 + }, + { + "epoch": 3.32413158500115, + "grad_norm": 4.786473751068115, + "learning_rate": 5.55625e-06, + "loss": 0.0561, + "step": 7225 + }, + { + "epoch": 3.335633770416379, + "grad_norm": 2.6991186141967773, + "learning_rate": 5.5484375e-06, + "loss": 0.0441, + "step": 7250 + }, + { + "epoch": 3.347135955831608, + "grad_norm": 2.1756906509399414, + "learning_rate": 5.540625e-06, + "loss": 0.0537, + "step": 7275 + }, + { + "epoch": 3.358638141246837, + "grad_norm": 1.9589180946350098, + "learning_rate": 5.5328125e-06, + "loss": 0.0341, + "step": 7300 + }, + { + "epoch": 3.370140326662066, + "grad_norm": 3.504366159439087, + "learning_rate": 5.5250000000000005e-06, + "loss": 0.0556, + "step": 7325 + }, + { + "epoch": 3.3816425120772946, + "grad_norm": 2.558767557144165, + "learning_rate": 5.5171875e-06, + "loss": 0.0411, + "step": 7350 + }, + { + "epoch": 3.3931446974925237, + "grad_norm": 2.466121196746826, + "learning_rate": 5.509375e-06, + "loss": 0.0488, + "step": 
7375 + }, + { + "epoch": 3.4046468829077523, + "grad_norm": 4.102237224578857, + "learning_rate": 5.5015625e-06, + "loss": 0.0399, + "step": 7400 + }, + { + "epoch": 3.4161490683229814, + "grad_norm": 2.008484125137329, + "learning_rate": 5.4937500000000006e-06, + "loss": 0.0612, + "step": 7425 + }, + { + "epoch": 3.4276512537382104, + "grad_norm": 4.083756446838379, + "learning_rate": 5.4859375e-06, + "loss": 0.0417, + "step": 7450 + }, + { + "epoch": 3.439153439153439, + "grad_norm": 2.526481866836548, + "learning_rate": 5.478125e-06, + "loss": 0.0572, + "step": 7475 + }, + { + "epoch": 3.450655624568668, + "grad_norm": 5.448037624359131, + "learning_rate": 5.4703125e-06, + "loss": 0.0487, + "step": 7500 + }, + { + "epoch": 3.4621578099838968, + "grad_norm": 2.4314920902252197, + "learning_rate": 5.462500000000001e-06, + "loss": 0.0465, + "step": 7525 + }, + { + "epoch": 3.473659995399126, + "grad_norm": 2.6774253845214844, + "learning_rate": 5.4546875000000004e-06, + "loss": 0.0398, + "step": 7550 + }, + { + "epoch": 3.485162180814355, + "grad_norm": 2.9031014442443848, + "learning_rate": 5.446875e-06, + "loss": 0.0629, + "step": 7575 + }, + { + "epoch": 3.4966643662295835, + "grad_norm": 3.0640227794647217, + "learning_rate": 5.4390625e-06, + "loss": 0.0344, + "step": 7600 + }, + { + "epoch": 3.5081665516448126, + "grad_norm": 3.262568950653076, + "learning_rate": 5.43125e-06, + "loss": 0.0538, + "step": 7625 + }, + { + "epoch": 3.5196687370600412, + "grad_norm": 3.67341685295105, + "learning_rate": 5.4234375000000005e-06, + "loss": 0.0409, + "step": 7650 + }, + { + "epoch": 3.5311709224752703, + "grad_norm": 2.3461291790008545, + "learning_rate": 5.415625e-06, + "loss": 0.0506, + "step": 7675 + }, + { + "epoch": 3.542673107890499, + "grad_norm": 7.1874589920043945, + "learning_rate": 5.4078125e-06, + "loss": 0.0462, + "step": 7700 + }, + { + "epoch": 3.554175293305728, + "grad_norm": 2.4068679809570312, + "learning_rate": 5.4e-06, + "loss": 0.0517, + "step": 7725 + }, + { + "epoch": 3.565677478720957, + "grad_norm": 10.364015579223633, + "learning_rate": 5.392187500000001e-06, + "loss": 0.0459, + "step": 7750 + }, + { + "epoch": 3.5771796641361857, + "grad_norm": 1.969916582107544, + "learning_rate": 5.3843750000000004e-06, + "loss": 0.0395, + "step": 7775 + }, + { + "epoch": 3.588681849551415, + "grad_norm": 2.2081713676452637, + "learning_rate": 5.3765625e-06, + "loss": 0.0333, + "step": 7800 + }, + { + "epoch": 3.6001840349666434, + "grad_norm": 2.534886598587036, + "learning_rate": 5.36875e-06, + "loss": 0.0628, + "step": 7825 + }, + { + "epoch": 3.6116862203818725, + "grad_norm": 4.844212532043457, + "learning_rate": 5.360937500000001e-06, + "loss": 0.0399, + "step": 7850 + }, + { + "epoch": 3.6231884057971016, + "grad_norm": 2.0327091217041016, + "learning_rate": 5.3531250000000005e-06, + "loss": 0.0479, + "step": 7875 + }, + { + "epoch": 3.63469059121233, + "grad_norm": 5.35178804397583, + "learning_rate": 5.3453125e-06, + "loss": 0.0449, + "step": 7900 + }, + { + "epoch": 3.6461927766275593, + "grad_norm": 1.815317153930664, + "learning_rate": 5.3375e-06, + "loss": 0.061, + "step": 7925 + }, + { + "epoch": 3.657694962042788, + "grad_norm": 4.710953235626221, + "learning_rate": 5.3296875e-06, + "loss": 0.0366, + "step": 7950 + }, + { + "epoch": 3.669197147458017, + "grad_norm": 2.8497252464294434, + "learning_rate": 5.321875000000001e-06, + "loss": 0.0523, + "step": 7975 + }, + { + "epoch": 3.680699332873246, + "grad_norm": 3.102057695388794, + "learning_rate": 
5.3140624999999996e-06, + "loss": 0.0424, + "step": 8000 + }, + { + "epoch": 3.6922015182884746, + "grad_norm": 2.153207540512085, + "learning_rate": 5.30625e-06, + "loss": 0.0524, + "step": 8025 + }, + { + "epoch": 3.7037037037037037, + "grad_norm": 2.5647189617156982, + "learning_rate": 5.2984375e-06, + "loss": 0.041, + "step": 8050 + }, + { + "epoch": 3.7152058891189323, + "grad_norm": 3.3755810260772705, + "learning_rate": 5.290625000000001e-06, + "loss": 0.0574, + "step": 8075 + }, + { + "epoch": 3.7267080745341614, + "grad_norm": 3.370281457901001, + "learning_rate": 5.2828125e-06, + "loss": 0.0427, + "step": 8100 + }, + { + "epoch": 3.7382102599493905, + "grad_norm": 3.2038800716400146, + "learning_rate": 5.275e-06, + "loss": 0.0544, + "step": 8125 + }, + { + "epoch": 3.749712445364619, + "grad_norm": 1.8960700035095215, + "learning_rate": 5.2671875e-06, + "loss": 0.0436, + "step": 8150 + }, + { + "epoch": 3.761214630779848, + "grad_norm": 3.4356021881103516, + "learning_rate": 5.259375000000001e-06, + "loss": 0.0593, + "step": 8175 + }, + { + "epoch": 3.772716816195077, + "grad_norm": 3.514998197555542, + "learning_rate": 5.251562500000001e-06, + "loss": 0.0361, + "step": 8200 + }, + { + "epoch": 3.784219001610306, + "grad_norm": 1.705615758895874, + "learning_rate": 5.24375e-06, + "loss": 0.0551, + "step": 8225 + }, + { + "epoch": 3.795721187025535, + "grad_norm": 4.432028770446777, + "learning_rate": 5.2359375e-06, + "loss": 0.0409, + "step": 8250 + }, + { + "epoch": 3.8072233724407636, + "grad_norm": 1.7736481428146362, + "learning_rate": 5.228125e-06, + "loss": 0.0431, + "step": 8275 + }, + { + "epoch": 3.8187255578559927, + "grad_norm": 6.601363658905029, + "learning_rate": 5.220312500000001e-06, + "loss": 0.0385, + "step": 8300 + }, + { + "epoch": 3.8302277432712213, + "grad_norm": 1.307138204574585, + "learning_rate": 5.2128125e-06, + "loss": 0.0551, + "step": 8325 + }, + { + "epoch": 3.8417299286864504, + "grad_norm": 4.539569854736328, + "learning_rate": 5.205e-06, + "loss": 0.0414, + "step": 8350 + }, + { + "epoch": 3.8532321141016794, + "grad_norm": 1.4391796588897705, + "learning_rate": 5.1971875e-06, + "loss": 0.0468, + "step": 8375 + }, + { + "epoch": 3.864734299516908, + "grad_norm": 6.971019268035889, + "learning_rate": 5.189375000000001e-06, + "loss": 0.0399, + "step": 8400 + }, + { + "epoch": 3.876236484932137, + "grad_norm": 3.1996214389801025, + "learning_rate": 5.1815624999999996e-06, + "loss": 0.0586, + "step": 8425 + }, + { + "epoch": 3.8877386703473658, + "grad_norm": 2.8706111907958984, + "learning_rate": 5.17375e-06, + "loss": 0.0372, + "step": 8450 + }, + { + "epoch": 3.899240855762595, + "grad_norm": 2.5482466220855713, + "learning_rate": 5.1659375e-06, + "loss": 0.0588, + "step": 8475 + }, + { + "epoch": 3.910743041177824, + "grad_norm": 3.1475095748901367, + "learning_rate": 5.158125000000001e-06, + "loss": 0.0357, + "step": 8500 + }, + { + "epoch": 3.9222452265930525, + "grad_norm": 3.1267690658569336, + "learning_rate": 5.1503125e-06, + "loss": 0.063, + "step": 8525 + }, + { + "epoch": 3.9337474120082816, + "grad_norm": 1.667446255683899, + "learning_rate": 5.1425e-06, + "loss": 0.041, + "step": 8550 + }, + { + "epoch": 3.9452495974235102, + "grad_norm": 1.526341438293457, + "learning_rate": 5.1346875e-06, + "loss": 0.0447, + "step": 8575 + }, + { + "epoch": 3.9567517828387393, + "grad_norm": 5.230031490325928, + "learning_rate": 5.126875000000001e-06, + "loss": 0.0429, + "step": 8600 + }, + { + "epoch": 3.9682539682539684, + "grad_norm": 
2.7826850414276123, + "learning_rate": 5.1190625e-06, + "loss": 0.0492, + "step": 8625 + }, + { + "epoch": 3.979756153669197, + "grad_norm": 3.674015760421753, + "learning_rate": 5.11125e-06, + "loss": 0.0391, + "step": 8650 + }, + { + "epoch": 3.991258339084426, + "grad_norm": 3.0181193351745605, + "learning_rate": 5.1034375e-06, + "loss": 0.0538, + "step": 8675 + }, + { + "epoch": 4.002760524499655, + "grad_norm": 0.9257192015647888, + "learning_rate": 5.095625e-06, + "loss": 0.0428, + "step": 8700 + }, + { + "epoch": 4.014262709914884, + "grad_norm": 3.171478033065796, + "learning_rate": 5.087812500000001e-06, + "loss": 0.0175, + "step": 8725 + }, + { + "epoch": 4.025764895330113, + "grad_norm": 1.8162753582000732, + "learning_rate": 5.08e-06, + "loss": 0.0279, + "step": 8750 + }, + { + "epoch": 4.037267080745342, + "grad_norm": 0.8982828855514526, + "learning_rate": 5.0721875e-06, + "loss": 0.0195, + "step": 8775 + }, + { + "epoch": 4.04876926616057, + "grad_norm": 0.8609257936477661, + "learning_rate": 5.064375e-06, + "loss": 0.0386, + "step": 8800 + }, + { + "epoch": 4.060271451575799, + "grad_norm": 2.8455467224121094, + "learning_rate": 5.056562500000001e-06, + "loss": 0.0224, + "step": 8825 + }, + { + "epoch": 4.071773636991028, + "grad_norm": 4.009946346282959, + "learning_rate": 5.04875e-06, + "loss": 0.0332, + "step": 8850 + }, + { + "epoch": 4.083275822406257, + "grad_norm": 6.834461688995361, + "learning_rate": 5.0409375e-06, + "loss": 0.0243, + "step": 8875 + }, + { + "epoch": 4.094778007821486, + "grad_norm": 2.6603477001190186, + "learning_rate": 5.033125e-06, + "loss": 0.0337, + "step": 8900 + }, + { + "epoch": 4.106280193236715, + "grad_norm": 0.7769069075584412, + "learning_rate": 5.025312500000001e-06, + "loss": 0.0227, + "step": 8925 + }, + { + "epoch": 4.117782378651944, + "grad_norm": 2.2610023021698, + "learning_rate": 5.0175e-06, + "loss": 0.0316, + "step": 8950 + }, + { + "epoch": 4.129284564067173, + "grad_norm": 0.38040891289711, + "learning_rate": 5.0096875000000005e-06, + "loss": 0.0222, + "step": 8975 + }, + { + "epoch": 4.140786749482402, + "grad_norm": 3.0205907821655273, + "learning_rate": 5.001875e-06, + "loss": 0.04, + "step": 9000 + }, + { + "epoch": 4.152288934897631, + "grad_norm": 2.703075647354126, + "learning_rate": 4.9940625e-06, + "loss": 0.022, + "step": 9025 + }, + { + "epoch": 4.163791120312859, + "grad_norm": 1.172072410583496, + "learning_rate": 4.98625e-06, + "loss": 0.0325, + "step": 9050 + }, + { + "epoch": 4.175293305728088, + "grad_norm": 2.555593967437744, + "learning_rate": 4.9784375e-06, + "loss": 0.0222, + "step": 9075 + }, + { + "epoch": 4.186795491143317, + "grad_norm": 1.3756543397903442, + "learning_rate": 4.970625e-06, + "loss": 0.035, + "step": 9100 + }, + { + "epoch": 4.198297676558546, + "grad_norm": 3.3377280235290527, + "learning_rate": 4.9628125e-06, + "loss": 0.0136, + "step": 9125 + }, + { + "epoch": 4.209799861973775, + "grad_norm": 3.4057676792144775, + "learning_rate": 4.955e-06, + "loss": 0.0284, + "step": 9150 + }, + { + "epoch": 4.2213020473890035, + "grad_norm": 0.6658844351768494, + "learning_rate": 4.9471875e-06, + "loss": 0.0191, + "step": 9175 + }, + { + "epoch": 4.232804232804233, + "grad_norm": 3.073923349380493, + "learning_rate": 4.9393750000000005e-06, + "loss": 0.0365, + "step": 9200 + }, + { + "epoch": 4.244306418219462, + "grad_norm": 1.391367793083191, + "learning_rate": 4.9315625e-06, + "loss": 0.0206, + "step": 9225 + }, + { + "epoch": 4.255808603634691, + "grad_norm": 2.199260711669922, + 
"learning_rate": 4.92375e-06, + "loss": 0.0325, + "step": 9250 + }, + { + "epoch": 4.26731078904992, + "grad_norm": 0.31522706151008606, + "learning_rate": 4.9159375e-06, + "loss": 0.0191, + "step": 9275 + }, + { + "epoch": 4.278812974465148, + "grad_norm": 1.3394752740859985, + "learning_rate": 4.9081250000000005e-06, + "loss": 0.031, + "step": 9300 + }, + { + "epoch": 4.290315159880377, + "grad_norm": 3.397002696990967, + "learning_rate": 4.9003125e-06, + "loss": 0.0283, + "step": 9325 + }, + { + "epoch": 4.301817345295606, + "grad_norm": 1.9605752229690552, + "learning_rate": 4.8925e-06, + "loss": 0.0334, + "step": 9350 + }, + { + "epoch": 4.313319530710835, + "grad_norm": 3.3331549167633057, + "learning_rate": 4.8846875e-06, + "loss": 0.0173, + "step": 9375 + }, + { + "epoch": 4.324821716126064, + "grad_norm": 1.6721982955932617, + "learning_rate": 4.876875e-06, + "loss": 0.0335, + "step": 9400 + }, + { + "epoch": 4.336323901541292, + "grad_norm": 2.483747720718384, + "learning_rate": 4.8690625000000004e-06, + "loss": 0.0161, + "step": 9425 + }, + { + "epoch": 4.3478260869565215, + "grad_norm": 2.7441227436065674, + "learning_rate": 4.86125e-06, + "loss": 0.0299, + "step": 9450 + }, + { + "epoch": 4.359328272371751, + "grad_norm": 6.056209087371826, + "learning_rate": 4.8534375e-06, + "loss": 0.0191, + "step": 9475 + }, + { + "epoch": 4.37083045778698, + "grad_norm": 3.665620803833008, + "learning_rate": 4.845625e-06, + "loss": 0.0413, + "step": 9500 + }, + { + "epoch": 4.382332643202209, + "grad_norm": 0.7313005924224854, + "learning_rate": 4.8378125000000005e-06, + "loss": 0.0279, + "step": 9525 + }, + { + "epoch": 4.393834828617437, + "grad_norm": 6.102551460266113, + "learning_rate": 4.83e-06, + "loss": 0.0365, + "step": 9550 + }, + { + "epoch": 4.405337014032666, + "grad_norm": 3.0939571857452393, + "learning_rate": 4.8221875e-06, + "loss": 0.0235, + "step": 9575 + }, + { + "epoch": 4.416839199447895, + "grad_norm": 6.218958854675293, + "learning_rate": 4.814375e-06, + "loss": 0.04, + "step": 9600 + }, + { + "epoch": 4.428341384863124, + "grad_norm": 3.746630907058716, + "learning_rate": 4.806562500000001e-06, + "loss": 0.0252, + "step": 9625 + }, + { + "epoch": 4.439843570278353, + "grad_norm": 2.458735466003418, + "learning_rate": 4.7987500000000004e-06, + "loss": 0.0407, + "step": 9650 + }, + { + "epoch": 4.451345755693581, + "grad_norm": 0.9115240573883057, + "learning_rate": 4.7909375e-06, + "loss": 0.0216, + "step": 9675 + }, + { + "epoch": 4.4628479411088104, + "grad_norm": 2.0379064083099365, + "learning_rate": 4.783125e-06, + "loss": 0.0326, + "step": 9700 + }, + { + "epoch": 4.4743501265240395, + "grad_norm": 3.2523980140686035, + "learning_rate": 4.7753125e-06, + "loss": 0.0216, + "step": 9725 + }, + { + "epoch": 4.485852311939269, + "grad_norm": 0.8420194387435913, + "learning_rate": 4.7675000000000005e-06, + "loss": 0.0337, + "step": 9750 + }, + { + "epoch": 4.497354497354498, + "grad_norm": 4.6802167892456055, + "learning_rate": 4.7596875e-06, + "loss": 0.0245, + "step": 9775 + }, + { + "epoch": 4.508856682769727, + "grad_norm": 1.313185214996338, + "learning_rate": 4.751875e-06, + "loss": 0.0261, + "step": 9800 + }, + { + "epoch": 4.520358868184955, + "grad_norm": 0.6525618433952332, + "learning_rate": 4.7440625e-06, + "loss": 0.0222, + "step": 9825 + }, + { + "epoch": 4.531861053600184, + "grad_norm": 1.9755631685256958, + "learning_rate": 4.736250000000001e-06, + "loss": 0.0328, + "step": 9850 + }, + { + "epoch": 4.543363239015413, + "grad_norm": 
5.463117599487305, + "learning_rate": 4.7284374999999996e-06, + "loss": 0.0243, + "step": 9875 + }, + { + "epoch": 4.554865424430642, + "grad_norm": 2.689207077026367, + "learning_rate": 4.720625e-06, + "loss": 0.0294, + "step": 9900 + }, + { + "epoch": 4.566367609845871, + "grad_norm": 2.54421067237854, + "learning_rate": 4.7128125e-06, + "loss": 0.0249, + "step": 9925 + }, + { + "epoch": 4.577869795261099, + "grad_norm": 9.304972648620605, + "learning_rate": 4.705000000000001e-06, + "loss": 0.0413, + "step": 9950 + }, + { + "epoch": 4.5893719806763285, + "grad_norm": 1.8700443506240845, + "learning_rate": 4.6971875000000005e-06, + "loss": 0.0199, + "step": 9975 + }, + { + "epoch": 4.6008741660915575, + "grad_norm": 1.8287808895111084, + "learning_rate": 4.689375e-06, + "loss": 0.0371, + "step": 10000 + }, + { + "epoch": 4.6008741660915575, + "eval_loss": 0.18389073014259338, + "eval_runtime": 5170.7691, + "eval_samples_per_second": 1.836, + "eval_steps_per_second": 0.23, + "eval_wer": 0.09987187700192185, + "step": 10000 + }, + { + "epoch": 4.612376351506787, + "grad_norm": 4.324032306671143, + "learning_rate": 4.6815625e-06, + "loss": 0.0254, + "step": 10025 + }, + { + "epoch": 4.623878536922016, + "grad_norm": 1.790111780166626, + "learning_rate": 4.67375e-06, + "loss": 0.0317, + "step": 10050 + }, + { + "epoch": 4.635380722337244, + "grad_norm": 3.236027717590332, + "learning_rate": 4.665937500000001e-06, + "loss": 0.0169, + "step": 10075 + }, + { + "epoch": 4.646882907752473, + "grad_norm": 2.1184232234954834, + "learning_rate": 4.658125e-06, + "loss": 0.0272, + "step": 10100 + }, + { + "epoch": 4.658385093167702, + "grad_norm": 2.2520174980163574, + "learning_rate": 4.6503125e-06, + "loss": 0.0195, + "step": 10125 + }, + { + "epoch": 4.669887278582931, + "grad_norm": 3.1109213829040527, + "learning_rate": 4.6425e-06, + "loss": 0.0377, + "step": 10150 + }, + { + "epoch": 4.68138946399816, + "grad_norm": 0.9269886612892151, + "learning_rate": 4.634687500000001e-06, + "loss": 0.0245, + "step": 10175 + }, + { + "epoch": 4.692891649413388, + "grad_norm": 2.5646307468414307, + "learning_rate": 4.626875e-06, + "loss": 0.0335, + "step": 10200 + }, + { + "epoch": 4.704393834828617, + "grad_norm": 2.609894037246704, + "learning_rate": 4.6190625e-06, + "loss": 0.0286, + "step": 10225 + }, + { + "epoch": 4.7158960202438465, + "grad_norm": 4.488738059997559, + "learning_rate": 4.61125e-06, + "loss": 0.0311, + "step": 10250 + }, + { + "epoch": 4.7273982056590755, + "grad_norm": 1.5146970748901367, + "learning_rate": 4.603437500000001e-06, + "loss": 0.0193, + "step": 10275 + }, + { + "epoch": 4.738900391074305, + "grad_norm": 0.9895784258842468, + "learning_rate": 4.595625e-06, + "loss": 0.0263, + "step": 10300 + }, + { + "epoch": 4.750402576489533, + "grad_norm": 5.285503387451172, + "learning_rate": 4.5878125e-06, + "loss": 0.0262, + "step": 10325 + }, + { + "epoch": 4.761904761904762, + "grad_norm": 0.3710523843765259, + "learning_rate": 4.58e-06, + "loss": 0.0414, + "step": 10350 + }, + { + "epoch": 4.773406947319991, + "grad_norm": 1.5297555923461914, + "learning_rate": 4.572187500000001e-06, + "loss": 0.0226, + "step": 10375 + }, + { + "epoch": 4.78490913273522, + "grad_norm": 2.7512269020080566, + "learning_rate": 4.564375e-06, + "loss": 0.0296, + "step": 10400 + }, + { + "epoch": 4.796411318150449, + "grad_norm": 7.514578342437744, + "learning_rate": 4.5565625000000005e-06, + "loss": 0.0252, + "step": 10425 + }, + { + "epoch": 4.807913503565677, + "grad_norm": 2.6303036212921143, + 
"learning_rate": 4.54875e-06, + "loss": 0.0281, + "step": 10450 + }, + { + "epoch": 4.819415688980906, + "grad_norm": 2.5647971630096436, + "learning_rate": 4.5409375e-06, + "loss": 0.0236, + "step": 10475 + }, + { + "epoch": 4.830917874396135, + "grad_norm": 1.5822607278823853, + "learning_rate": 4.533125e-06, + "loss": 0.03, + "step": 10500 + }, + { + "epoch": 4.8424200598113645, + "grad_norm": 3.3824303150177, + "learning_rate": 4.5253125e-06, + "loss": 0.0277, + "step": 10525 + }, + { + "epoch": 4.8539222452265935, + "grad_norm": 2.4250597953796387, + "learning_rate": 4.5175e-06, + "loss": 0.0288, + "step": 10550 + }, + { + "epoch": 4.865424430641822, + "grad_norm": 5.316204071044922, + "learning_rate": 4.5096875e-06, + "loss": 0.0248, + "step": 10575 + }, + { + "epoch": 4.876926616057051, + "grad_norm": 2.7238681316375732, + "learning_rate": 4.501875000000001e-06, + "loss": 0.0332, + "step": 10600 + }, + { + "epoch": 4.88842880147228, + "grad_norm": 3.1399307250976562, + "learning_rate": 4.4940625e-06, + "loss": 0.0222, + "step": 10625 + }, + { + "epoch": 4.899930986887509, + "grad_norm": 2.6083264350891113, + "learning_rate": 4.4862500000000005e-06, + "loss": 0.0325, + "step": 10650 + }, + { + "epoch": 4.911433172302738, + "grad_norm": 1.1990541219711304, + "learning_rate": 4.4784375e-06, + "loss": 0.0218, + "step": 10675 + }, + { + "epoch": 4.922935357717966, + "grad_norm": 2.0461394786834717, + "learning_rate": 4.470625000000001e-06, + "loss": 0.0312, + "step": 10700 + }, + { + "epoch": 4.934437543133195, + "grad_norm": 0.7773350477218628, + "learning_rate": 4.4628125e-06, + "loss": 0.0256, + "step": 10725 + }, + { + "epoch": 4.945939728548424, + "grad_norm": 2.645052671432495, + "learning_rate": 4.4550000000000005e-06, + "loss": 0.0334, + "step": 10750 + }, + { + "epoch": 4.957441913963653, + "grad_norm": 3.0270519256591797, + "learning_rate": 4.4471875e-06, + "loss": 0.0221, + "step": 10775 + }, + { + "epoch": 4.9689440993788825, + "grad_norm": 2.553375720977783, + "learning_rate": 4.439375e-06, + "loss": 0.0337, + "step": 10800 + }, + { + "epoch": 4.980446284794111, + "grad_norm": 5.496345520019531, + "learning_rate": 4.4315625e-06, + "loss": 0.0197, + "step": 10825 + }, + { + "epoch": 4.99194847020934, + "grad_norm": 2.0996806621551514, + "learning_rate": 4.4240625000000005e-06, + "loss": 0.0389, + "step": 10850 + }, + { + "epoch": 5.003450655624569, + "grad_norm": 1.9347798824310303, + "learning_rate": 4.41625e-06, + "loss": 0.0229, + "step": 10875 + }, + { + "epoch": 5.014952841039798, + "grad_norm": 3.568023443222046, + "learning_rate": 4.4084375e-06, + "loss": 0.0135, + "step": 10900 + }, + { + "epoch": 5.026455026455026, + "grad_norm": 0.6959520578384399, + "learning_rate": 4.400625e-06, + "loss": 0.0216, + "step": 10925 + }, + { + "epoch": 5.037957211870255, + "grad_norm": 0.2912887930870056, + "learning_rate": 4.3928125e-06, + "loss": 0.0157, + "step": 10950 + }, + { + "epoch": 5.049459397285484, + "grad_norm": 2.004884719848633, + "learning_rate": 4.385e-06, + "loss": 0.0208, + "step": 10975 + }, + { + "epoch": 5.060961582700713, + "grad_norm": 4.345211505889893, + "learning_rate": 4.3771875e-06, + "loss": 0.0163, + "step": 11000 + }, + { + "epoch": 5.072463768115942, + "grad_norm": 1.2300424575805664, + "learning_rate": 4.369375000000001e-06, + "loss": 0.0261, + "step": 11025 + }, + { + "epoch": 5.0839659535311705, + "grad_norm": 2.4417171478271484, + "learning_rate": 4.3615625e-06, + "loss": 0.0177, + "step": 11050 + }, + { + "epoch": 5.0954681389464, + "grad_norm": 
0.4104786217212677, + "learning_rate": 4.3537500000000005e-06, + "loss": 0.0221, + "step": 11075 + }, + { + "epoch": 5.106970324361629, + "grad_norm": 1.5912240743637085, + "learning_rate": 4.3459375e-06, + "loss": 0.0108, + "step": 11100 + }, + { + "epoch": 5.118472509776858, + "grad_norm": 2.6581289768218994, + "learning_rate": 4.338125000000001e-06, + "loss": 0.0196, + "step": 11125 + }, + { + "epoch": 5.129974695192087, + "grad_norm": 0.41683322191238403, + "learning_rate": 4.3303125e-06, + "loss": 0.012, + "step": 11150 + }, + { + "epoch": 5.141476880607315, + "grad_norm": 0.7937358021736145, + "learning_rate": 4.3225000000000005e-06, + "loss": 0.0198, + "step": 11175 + }, + { + "epoch": 5.152979066022544, + "grad_norm": 2.042461633682251, + "learning_rate": 4.3146875e-06, + "loss": 0.0157, + "step": 11200 + }, + { + "epoch": 5.164481251437773, + "grad_norm": 1.6334706544876099, + "learning_rate": 4.306875e-06, + "loss": 0.0185, + "step": 11225 + }, + { + "epoch": 5.175983436853002, + "grad_norm": 2.2739877700805664, + "learning_rate": 4.2990625e-06, + "loss": 0.0178, + "step": 11250 + }, + { + "epoch": 5.187485622268231, + "grad_norm": 1.6258018016815186, + "learning_rate": 4.29125e-06, + "loss": 0.0173, + "step": 11275 + }, + { + "epoch": 5.1989878076834595, + "grad_norm": 4.973881244659424, + "learning_rate": 4.2834375000000004e-06, + "loss": 0.0129, + "step": 11300 + }, + { + "epoch": 5.2104899930986885, + "grad_norm": 0.9437686204910278, + "learning_rate": 4.275625e-06, + "loss": 0.0237, + "step": 11325 + }, + { + "epoch": 5.221992178513918, + "grad_norm": 3.4761786460876465, + "learning_rate": 4.2678125e-06, + "loss": 0.0147, + "step": 11350 + }, + { + "epoch": 5.233494363929147, + "grad_norm": 2.692365884780884, + "learning_rate": 4.26e-06, + "loss": 0.0189, + "step": 11375 + }, + { + "epoch": 5.244996549344376, + "grad_norm": 2.9584712982177734, + "learning_rate": 4.2521875000000005e-06, + "loss": 0.0174, + "step": 11400 + }, + { + "epoch": 5.256498734759604, + "grad_norm": 9.468619346618652, + "learning_rate": 4.244375e-06, + "loss": 0.0176, + "step": 11425 + }, + { + "epoch": 5.268000920174833, + "grad_norm": 0.9750581383705139, + "learning_rate": 4.2365625e-06, + "loss": 0.021, + "step": 11450 + }, + { + "epoch": 5.279503105590062, + "grad_norm": 3.0626797676086426, + "learning_rate": 4.22875e-06, + "loss": 0.0298, + "step": 11475 + }, + { + "epoch": 5.291005291005291, + "grad_norm": 2.2305479049682617, + "learning_rate": 4.220937500000001e-06, + "loss": 0.0161, + "step": 11500 + }, + { + "epoch": 5.30250747642052, + "grad_norm": 0.5742190480232239, + "learning_rate": 4.2131250000000004e-06, + "loss": 0.0216, + "step": 11525 + }, + { + "epoch": 5.314009661835748, + "grad_norm": 2.7238235473632812, + "learning_rate": 4.2053125e-06, + "loss": 0.0178, + "step": 11550 + }, + { + "epoch": 5.3255118472509775, + "grad_norm": 1.7739872932434082, + "learning_rate": 4.1975e-06, + "loss": 0.0203, + "step": 11575 + }, + { + "epoch": 5.3370140326662066, + "grad_norm": 0.9364586472511292, + "learning_rate": 4.1896875e-06, + "loss": 0.0185, + "step": 11600 + }, + { + "epoch": 5.348516218081436, + "grad_norm": 3.462063789367676, + "learning_rate": 4.1818750000000005e-06, + "loss": 0.027, + "step": 11625 + }, + { + "epoch": 5.360018403496665, + "grad_norm": 4.4536051750183105, + "learning_rate": 4.1740625e-06, + "loss": 0.0137, + "step": 11650 + }, + { + "epoch": 5.371520588911893, + "grad_norm": 1.4285831451416016, + "learning_rate": 4.16625e-06, + "loss": 0.022, + "step": 11675 + }, + { + 
"epoch": 5.383022774327122, + "grad_norm": 0.1919490098953247, + "learning_rate": 4.1584375e-06, + "loss": 0.0127, + "step": 11700 + }, + { + "epoch": 5.394524959742351, + "grad_norm": 1.8607268333435059, + "learning_rate": 4.150625000000001e-06, + "loss": 0.0221, + "step": 11725 + }, + { + "epoch": 5.40602714515758, + "grad_norm": 2.7783443927764893, + "learning_rate": 4.1428125e-06, + "loss": 0.0114, + "step": 11750 + }, + { + "epoch": 5.417529330572809, + "grad_norm": 0.5502150058746338, + "learning_rate": 4.135e-06, + "loss": 0.0283, + "step": 11775 + }, + { + "epoch": 5.429031515988037, + "grad_norm": 4.172156810760498, + "learning_rate": 4.1271875e-06, + "loss": 0.0148, + "step": 11800 + }, + { + "epoch": 5.440533701403266, + "grad_norm": 4.028002738952637, + "learning_rate": 4.119375000000001e-06, + "loss": 0.0248, + "step": 11825 + }, + { + "epoch": 5.4520358868184955, + "grad_norm": 2.3138980865478516, + "learning_rate": 4.1115625000000005e-06, + "loss": 0.0132, + "step": 11850 + }, + { + "epoch": 5.463538072233725, + "grad_norm": 1.1849713325500488, + "learning_rate": 4.10375e-06, + "loss": 0.0226, + "step": 11875 + }, + { + "epoch": 5.475040257648954, + "grad_norm": 0.587064266204834, + "learning_rate": 4.0959375e-06, + "loss": 0.019, + "step": 11900 + }, + { + "epoch": 5.486542443064182, + "grad_norm": 2.445380449295044, + "learning_rate": 4.088125e-06, + "loss": 0.0294, + "step": 11925 + }, + { + "epoch": 5.498044628479411, + "grad_norm": 0.9706999659538269, + "learning_rate": 4.080312500000001e-06, + "loss": 0.0243, + "step": 11950 + }, + { + "epoch": 5.50954681389464, + "grad_norm": 3.1857030391693115, + "learning_rate": 4.0724999999999995e-06, + "loss": 0.0285, + "step": 11975 + }, + { + "epoch": 5.521048999309869, + "grad_norm": 2.901566982269287, + "learning_rate": 4.0646875e-06, + "loss": 0.0144, + "step": 12000 + }, + { + "epoch": 5.532551184725098, + "grad_norm": 1.5645906925201416, + "learning_rate": 4.056875e-06, + "loss": 0.0209, + "step": 12025 + }, + { + "epoch": 5.544053370140326, + "grad_norm": 0.3500732481479645, + "learning_rate": 4.049062500000001e-06, + "loss": 0.0154, + "step": 12050 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 1.8092036247253418, + "learning_rate": 4.04125e-06, + "loss": 0.0244, + "step": 12075 + }, + { + "epoch": 5.567057740970784, + "grad_norm": 0.872996985912323, + "learning_rate": 4.0334375e-06, + "loss": 0.0128, + "step": 12100 + }, + { + "epoch": 5.5785599263860135, + "grad_norm": 1.7378208637237549, + "learning_rate": 4.025625e-06, + "loss": 0.0173, + "step": 12125 + }, + { + "epoch": 5.590062111801243, + "grad_norm": 2.589615821838379, + "learning_rate": 4.017812500000001e-06, + "loss": 0.019, + "step": 12150 + }, + { + "epoch": 5.601564297216471, + "grad_norm": 1.2749011516571045, + "learning_rate": 4.01e-06, + "loss": 0.0219, + "step": 12175 + }, + { + "epoch": 5.6130664826317, + "grad_norm": 3.331284761428833, + "learning_rate": 4.0021875e-06, + "loss": 0.0187, + "step": 12200 + }, + { + "epoch": 5.624568668046929, + "grad_norm": 3.3452606201171875, + "learning_rate": 3.994375e-06, + "loss": 0.0251, + "step": 12225 + }, + { + "epoch": 5.636070853462158, + "grad_norm": 3.171482563018799, + "learning_rate": 3.9865625e-06, + "loss": 0.0111, + "step": 12250 + }, + { + "epoch": 5.647573038877387, + "grad_norm": 1.3494371175765991, + "learning_rate": 3.978750000000001e-06, + "loss": 0.0252, + "step": 12275 + }, + { + "epoch": 5.659075224292615, + "grad_norm": 5.425341606140137, + "learning_rate": 3.9709375e-06, + "loss": 
0.0154, + "step": 12300 + }, + { + "epoch": 5.670577409707844, + "grad_norm": 3.904033899307251, + "learning_rate": 3.963125e-06, + "loss": 0.0257, + "step": 12325 + }, + { + "epoch": 5.682079595123073, + "grad_norm": 6.220149993896484, + "learning_rate": 3.9553125e-06, + "loss": 0.0197, + "step": 12350 + }, + { + "epoch": 5.693581780538302, + "grad_norm": 2.4043924808502197, + "learning_rate": 3.947500000000001e-06, + "loss": 0.0206, + "step": 12375 + }, + { + "epoch": 5.7050839659535315, + "grad_norm": 1.716847538948059, + "learning_rate": 3.9396875e-06, + "loss": 0.0128, + "step": 12400 + }, + { + "epoch": 5.71658615136876, + "grad_norm": 2.778402805328369, + "learning_rate": 3.931875e-06, + "loss": 0.025, + "step": 12425 + }, + { + "epoch": 5.728088336783989, + "grad_norm": 1.2575827836990356, + "learning_rate": 3.9240625e-06, + "loss": 0.0165, + "step": 12450 + }, + { + "epoch": 5.739590522199218, + "grad_norm": 1.911150574684143, + "learning_rate": 3.916250000000001e-06, + "loss": 0.0274, + "step": 12475 + }, + { + "epoch": 5.751092707614447, + "grad_norm": 4.2152099609375, + "learning_rate": 3.9084375e-06, + "loss": 0.0205, + "step": 12500 + }, + { + "epoch": 5.762594893029676, + "grad_norm": 2.5663771629333496, + "learning_rate": 3.9006250000000005e-06, + "loss": 0.0233, + "step": 12525 + }, + { + "epoch": 5.774097078444904, + "grad_norm": 2.787692070007324, + "learning_rate": 3.8928125e-06, + "loss": 0.0168, + "step": 12550 + }, + { + "epoch": 5.785599263860133, + "grad_norm": 1.5613856315612793, + "learning_rate": 3.885e-06, + "loss": 0.0208, + "step": 12575 + }, + { + "epoch": 5.797101449275362, + "grad_norm": 4.623484134674072, + "learning_rate": 3.8771875e-06, + "loss": 0.0154, + "step": 12600 + }, + { + "epoch": 5.808603634690591, + "grad_norm": 1.0770273208618164, + "learning_rate": 3.869375e-06, + "loss": 0.0251, + "step": 12625 + }, + { + "epoch": 5.8201058201058204, + "grad_norm": 1.7411080598831177, + "learning_rate": 3.8615625e-06, + "loss": 0.0187, + "step": 12650 + }, + { + "epoch": 5.831608005521049, + "grad_norm": 1.9490394592285156, + "learning_rate": 3.85375e-06, + "loss": 0.0255, + "step": 12675 + }, + { + "epoch": 5.843110190936278, + "grad_norm": 3.750220537185669, + "learning_rate": 3.8459375e-06, + "loss": 0.015, + "step": 12700 + }, + { + "epoch": 5.854612376351507, + "grad_norm": 4.523986339569092, + "learning_rate": 3.838125e-06, + "loss": 0.0251, + "step": 12725 + }, + { + "epoch": 5.866114561766736, + "grad_norm": 0.7033637762069702, + "learning_rate": 3.8303125000000004e-06, + "loss": 0.0109, + "step": 12750 + }, + { + "epoch": 5.877616747181965, + "grad_norm": 1.5173693895339966, + "learning_rate": 3.8225e-06, + "loss": 0.026, + "step": 12775 + }, + { + "epoch": 5.889118932597193, + "grad_norm": 1.4995800256729126, + "learning_rate": 3.8146875e-06, + "loss": 0.0194, + "step": 12800 + }, + { + "epoch": 5.900621118012422, + "grad_norm": 2.203517436981201, + "learning_rate": 3.806875e-06, + "loss": 0.0292, + "step": 12825 + }, + { + "epoch": 5.912123303427651, + "grad_norm": 2.84173321723938, + "learning_rate": 3.7990625e-06, + "loss": 0.0159, + "step": 12850 + }, + { + "epoch": 5.92362548884288, + "grad_norm": 0.5612061619758606, + "learning_rate": 3.7912500000000003e-06, + "loss": 0.0288, + "step": 12875 + }, + { + "epoch": 5.935127674258109, + "grad_norm": 0.5369181632995605, + "learning_rate": 3.7834375000000006e-06, + "loss": 0.015, + "step": 12900 + }, + { + "epoch": 5.946629859673338, + "grad_norm": 2.2607927322387695, + "learning_rate": 
3.775625e-06, + "loss": 0.0286, + "step": 12925 + }, + { + "epoch": 5.958132045088567, + "grad_norm": 2.4500582218170166, + "learning_rate": 3.7678125e-06, + "loss": 0.0204, + "step": 12950 + }, + { + "epoch": 5.969634230503796, + "grad_norm": 1.60666024684906, + "learning_rate": 3.7600000000000004e-06, + "loss": 0.0199, + "step": 12975 + }, + { + "epoch": 5.981136415919025, + "grad_norm": 0.704494059085846, + "learning_rate": 3.7521875000000007e-06, + "loss": 0.0117, + "step": 13000 + }, + { + "epoch": 5.992638601334254, + "grad_norm": 0.9249849319458008, + "learning_rate": 3.744375e-06, + "loss": 0.0235, + "step": 13025 + }, + { + "epoch": 6.004140786749482, + "grad_norm": 3.008746862411499, + "learning_rate": 3.7365625000000003e-06, + "loss": 0.0129, + "step": 13050 + }, + { + "epoch": 6.015642972164711, + "grad_norm": 1.7184109687805176, + "learning_rate": 3.7287500000000005e-06, + "loss": 0.0127, + "step": 13075 + }, + { + "epoch": 6.02714515757994, + "grad_norm": 0.9730533957481384, + "learning_rate": 3.7209375000000003e-06, + "loss": 0.0175, + "step": 13100 + }, + { + "epoch": 6.038647342995169, + "grad_norm": 1.0653347969055176, + "learning_rate": 3.713125e-06, + "loss": 0.0088, + "step": 13125 + }, + { + "epoch": 6.050149528410398, + "grad_norm": 2.9114506244659424, + "learning_rate": 3.7053125e-06, + "loss": 0.0137, + "step": 13150 + }, + { + "epoch": 6.0616517138256265, + "grad_norm": 0.32335150241851807, + "learning_rate": 3.6975e-06, + "loss": 0.0077, + "step": 13175 + }, + { + "epoch": 6.073153899240856, + "grad_norm": 0.48962631821632385, + "learning_rate": 3.6896875000000004e-06, + "loss": 0.0198, + "step": 13200 + }, + { + "epoch": 6.084656084656085, + "grad_norm": 0.669650137424469, + "learning_rate": 3.681875e-06, + "loss": 0.0121, + "step": 13225 + }, + { + "epoch": 6.096158270071314, + "grad_norm": 3.413156270980835, + "learning_rate": 3.6740625e-06, + "loss": 0.0128, + "step": 13250 + }, + { + "epoch": 6.107660455486543, + "grad_norm": 2.7622175216674805, + "learning_rate": 3.6662500000000003e-06, + "loss": 0.0132, + "step": 13275 + }, + { + "epoch": 6.119162640901771, + "grad_norm": 2.454317808151245, + "learning_rate": 3.6584375000000005e-06, + "loss": 0.0189, + "step": 13300 + }, + { + "epoch": 6.130664826317, + "grad_norm": 1.0187458992004395, + "learning_rate": 3.650625e-06, + "loss": 0.0099, + "step": 13325 + }, + { + "epoch": 6.142167011732229, + "grad_norm": 1.0523874759674072, + "learning_rate": 3.6428125e-06, + "loss": 0.0186, + "step": 13350 + }, + { + "epoch": 6.153669197147458, + "grad_norm": 2.4889376163482666, + "learning_rate": 3.6350000000000003e-06, + "loss": 0.0102, + "step": 13375 + }, + { + "epoch": 6.165171382562687, + "grad_norm": 1.4157731533050537, + "learning_rate": 3.6275000000000004e-06, + "loss": 0.0232, + "step": 13400 + }, + { + "epoch": 6.1766735679779154, + "grad_norm": 2.285750389099121, + "learning_rate": 3.6196875000000007e-06, + "loss": 0.0112, + "step": 13425 + }, + { + "epoch": 6.1881757533931445, + "grad_norm": 3.0357449054718018, + "learning_rate": 3.611875e-06, + "loss": 0.0151, + "step": 13450 + }, + { + "epoch": 6.199677938808374, + "grad_norm": 1.9853347539901733, + "learning_rate": 3.6040625000000003e-06, + "loss": 0.0119, + "step": 13475 + }, + { + "epoch": 6.211180124223603, + "grad_norm": 3.489882469177246, + "learning_rate": 3.5962500000000005e-06, + "loss": 0.0156, + "step": 13500 + }, + { + "epoch": 6.222682309638832, + "grad_norm": 7.6776299476623535, + "learning_rate": 3.5884375000000003e-06, + "loss": 0.0107, + 
"step": 13525 + }, + { + "epoch": 6.23418449505406, + "grad_norm": 1.0730276107788086, + "learning_rate": 3.580625e-06, + "loss": 0.0194, + "step": 13550 + }, + { + "epoch": 6.245686680469289, + "grad_norm": 41.845558166503906, + "learning_rate": 3.5728125e-06, + "loss": 0.0115, + "step": 13575 + }, + { + "epoch": 6.257188865884518, + "grad_norm": 2.866692543029785, + "learning_rate": 3.565e-06, + "loss": 0.0159, + "step": 13600 + }, + { + "epoch": 6.268691051299747, + "grad_norm": 3.9908344745635986, + "learning_rate": 3.5571875000000004e-06, + "loss": 0.0099, + "step": 13625 + }, + { + "epoch": 6.280193236714976, + "grad_norm": 2.3821098804473877, + "learning_rate": 3.549375e-06, + "loss": 0.0191, + "step": 13650 + }, + { + "epoch": 6.291695422130204, + "grad_norm": 0.6831459403038025, + "learning_rate": 3.5415625e-06, + "loss": 0.0123, + "step": 13675 + }, + { + "epoch": 6.3031976075454335, + "grad_norm": 1.3033053874969482, + "learning_rate": 3.5337500000000003e-06, + "loss": 0.0183, + "step": 13700 + }, + { + "epoch": 6.3146997929606625, + "grad_norm": 0.3791348934173584, + "learning_rate": 3.5259375000000005e-06, + "loss": 0.0078, + "step": 13725 + }, + { + "epoch": 6.326201978375892, + "grad_norm": 0.43763110041618347, + "learning_rate": 3.518125e-06, + "loss": 0.0131, + "step": 13750 + }, + { + "epoch": 6.337704163791121, + "grad_norm": 2.630368232727051, + "learning_rate": 3.5103125e-06, + "loss": 0.0071, + "step": 13775 + }, + { + "epoch": 6.349206349206349, + "grad_norm": 0.5663381814956665, + "learning_rate": 3.5025000000000003e-06, + "loss": 0.0212, + "step": 13800 + }, + { + "epoch": 6.360708534621578, + "grad_norm": 2.189028739929199, + "learning_rate": 3.4946875000000006e-06, + "loss": 0.0137, + "step": 13825 + }, + { + "epoch": 6.372210720036807, + "grad_norm": 1.2615488767623901, + "learning_rate": 3.486875e-06, + "loss": 0.0231, + "step": 13850 + }, + { + "epoch": 6.383712905452036, + "grad_norm": 1.6485449075698853, + "learning_rate": 3.4790625e-06, + "loss": 0.0116, + "step": 13875 + }, + { + "epoch": 6.395215090867265, + "grad_norm": 3.103294610977173, + "learning_rate": 3.47125e-06, + "loss": 0.0198, + "step": 13900 + }, + { + "epoch": 6.406717276282493, + "grad_norm": 6.216635704040527, + "learning_rate": 3.4634375000000002e-06, + "loss": 0.0137, + "step": 13925 + }, + { + "epoch": 6.418219461697722, + "grad_norm": 3.772575855255127, + "learning_rate": 3.4556249999999996e-06, + "loss": 0.0207, + "step": 13950 + }, + { + "epoch": 6.4297216471129515, + "grad_norm": 0.9796826243400574, + "learning_rate": 3.4478125e-06, + "loss": 0.0094, + "step": 13975 + }, + { + "epoch": 6.4412238325281805, + "grad_norm": 4.275996208190918, + "learning_rate": 3.44e-06, + "loss": 0.019, + "step": 14000 + }, + { + "epoch": 6.45272601794341, + "grad_norm": 5.775335311889648, + "learning_rate": 3.4321875000000003e-06, + "loss": 0.0099, + "step": 14025 + }, + { + "epoch": 6.464228203358638, + "grad_norm": 2.5531623363494873, + "learning_rate": 3.4243750000000006e-06, + "loss": 0.0213, + "step": 14050 + }, + { + "epoch": 6.475730388773867, + "grad_norm": 0.7260667085647583, + "learning_rate": 3.4165625e-06, + "loss": 0.009, + "step": 14075 + }, + { + "epoch": 6.487232574189096, + "grad_norm": 1.1557809114456177, + "learning_rate": 3.40875e-06, + "loss": 0.0218, + "step": 14100 + }, + { + "epoch": 6.498734759604325, + "grad_norm": 1.5365861654281616, + "learning_rate": 3.4009375000000004e-06, + "loss": 0.0049, + "step": 14125 + }, + { + "epoch": 6.510236945019554, + "grad_norm": 
4.833078861236572, + "learning_rate": 3.3931250000000007e-06, + "loss": 0.0196, + "step": 14150 + }, + { + "epoch": 6.521739130434782, + "grad_norm": 3.0063588619232178, + "learning_rate": 3.3853125e-06, + "loss": 0.0102, + "step": 14175 + }, + { + "epoch": 6.533241315850011, + "grad_norm": 3.6960723400115967, + "learning_rate": 3.3775000000000003e-06, + "loss": 0.0182, + "step": 14200 + }, + { + "epoch": 6.54474350126524, + "grad_norm": 0.6360275149345398, + "learning_rate": 3.3696875e-06, + "loss": 0.0093, + "step": 14225 + }, + { + "epoch": 6.5562456866804695, + "grad_norm": 2.3226568698883057, + "learning_rate": 3.3618750000000003e-06, + "loss": 0.0211, + "step": 14250 + }, + { + "epoch": 6.5677478720956985, + "grad_norm": 2.793957471847534, + "learning_rate": 3.3540624999999997e-06, + "loss": 0.0118, + "step": 14275 + }, + { + "epoch": 6.579250057510927, + "grad_norm": 0.3393898904323578, + "learning_rate": 3.34625e-06, + "loss": 0.0115, + "step": 14300 + }, + { + "epoch": 6.590752242926156, + "grad_norm": 0.8301447033882141, + "learning_rate": 3.3384375e-06, + "loss": 0.0069, + "step": 14325 + }, + { + "epoch": 6.602254428341385, + "grad_norm": 0.6139010190963745, + "learning_rate": 3.3306250000000004e-06, + "loss": 0.0112, + "step": 14350 + }, + { + "epoch": 6.613756613756614, + "grad_norm": 0.24870969355106354, + "learning_rate": 3.3228125e-06, + "loss": 0.0116, + "step": 14375 + }, + { + "epoch": 6.625258799171843, + "grad_norm": 1.3729124069213867, + "learning_rate": 3.315e-06, + "loss": 0.0115, + "step": 14400 + }, + { + "epoch": 6.636760984587071, + "grad_norm": 2.537313938140869, + "learning_rate": 3.3071875000000003e-06, + "loss": 0.0081, + "step": 14425 + }, + { + "epoch": 6.6482631700023, + "grad_norm": 2.8953866958618164, + "learning_rate": 3.2993750000000005e-06, + "loss": 0.0152, + "step": 14450 + }, + { + "epoch": 6.659765355417529, + "grad_norm": 0.47888821363449097, + "learning_rate": 3.2915625e-06, + "loss": 0.0098, + "step": 14475 + }, + { + "epoch": 6.671267540832758, + "grad_norm": 0.4152248799800873, + "learning_rate": 3.28375e-06, + "loss": 0.0285, + "step": 14500 + }, + { + "epoch": 6.6827697262479875, + "grad_norm": 0.7531673312187195, + "learning_rate": 3.2759375000000003e-06, + "loss": 0.0063, + "step": 14525 + }, + { + "epoch": 6.694271911663216, + "grad_norm": 2.4068517684936523, + "learning_rate": 3.268125e-06, + "loss": 0.0213, + "step": 14550 + }, + { + "epoch": 6.705774097078445, + "grad_norm": 2.0701165199279785, + "learning_rate": 3.2603125e-06, + "loss": 0.0096, + "step": 14575 + }, + { + "epoch": 6.717276282493674, + "grad_norm": 3.544454336166382, + "learning_rate": 3.2525e-06, + "loss": 0.0234, + "step": 14600 + }, + { + "epoch": 6.728778467908903, + "grad_norm": 0.8146782517433167, + "learning_rate": 3.2446875e-06, + "loss": 0.0066, + "step": 14625 + }, + { + "epoch": 6.740280653324132, + "grad_norm": 2.016157627105713, + "learning_rate": 3.2368750000000002e-06, + "loss": 0.0139, + "step": 14650 + }, + { + "epoch": 6.75178283873936, + "grad_norm": 3.558530807495117, + "learning_rate": 3.2290625000000005e-06, + "loss": 0.016, + "step": 14675 + }, + { + "epoch": 6.763285024154589, + "grad_norm": 2.2227861881256104, + "learning_rate": 3.22125e-06, + "loss": 0.0197, + "step": 14700 + }, + { + "epoch": 6.774787209569818, + "grad_norm": 1.3837552070617676, + "learning_rate": 3.2134375e-06, + "loss": 0.0096, + "step": 14725 + }, + { + "epoch": 6.786289394985047, + "grad_norm": 7.47805643081665, + "learning_rate": 3.2056250000000003e-06, + "loss": 
0.0136, + "step": 14750 + }, + { + "epoch": 6.797791580400276, + "grad_norm": 2.1846354007720947, + "learning_rate": 3.1978125000000006e-06, + "loss": 0.0142, + "step": 14775 + }, + { + "epoch": 6.809293765815505, + "grad_norm": 0.4281105101108551, + "learning_rate": 3.19e-06, + "loss": 0.0168, + "step": 14800 + }, + { + "epoch": 6.820795951230734, + "grad_norm": 3.293972969055176, + "learning_rate": 3.1821875e-06, + "loss": 0.0125, + "step": 14825 + }, + { + "epoch": 6.832298136645963, + "grad_norm": 1.7543948888778687, + "learning_rate": 3.1743750000000004e-06, + "loss": 0.0125, + "step": 14850 + }, + { + "epoch": 6.843800322061192, + "grad_norm": 0.2645922005176544, + "learning_rate": 3.1665625000000002e-06, + "loss": 0.0094, + "step": 14875 + }, + { + "epoch": 6.855302507476421, + "grad_norm": 0.2636635899543762, + "learning_rate": 3.15875e-06, + "loss": 0.012, + "step": 14900 + }, + { + "epoch": 6.866804692891649, + "grad_norm": 7.451193332672119, + "learning_rate": 3.1509375000000003e-06, + "loss": 0.0078, + "step": 14925 + }, + { + "epoch": 6.878306878306878, + "grad_norm": 3.3548383712768555, + "learning_rate": 3.143125e-06, + "loss": 0.0155, + "step": 14950 + }, + { + "epoch": 6.889809063722107, + "grad_norm": 4.6555023193359375, + "learning_rate": 3.1353125000000003e-06, + "loss": 0.0108, + "step": 14975 + }, + { + "epoch": 6.901311249137336, + "grad_norm": 1.744326114654541, + "learning_rate": 3.1274999999999997e-06, + "loss": 0.0246, + "step": 15000 + }, + { + "epoch": 6.901311249137336, + "eval_loss": 0.20269618928432465, + "eval_runtime": 5635.5955, + "eval_samples_per_second": 1.685, + "eval_steps_per_second": 0.211, + "eval_wer": 0.09967969250480462, + "step": 15000 + }, + { + "epoch": 6.912813434552565, + "grad_norm": 0.898334801197052, + "learning_rate": 3.1196875e-06, + "loss": 0.0087, + "step": 15025 + }, + { + "epoch": 6.9243156199677935, + "grad_norm": 1.3468067646026611, + "learning_rate": 3.111875e-06, + "loss": 0.0212, + "step": 15050 + }, + { + "epoch": 6.935817805383023, + "grad_norm": 0.397684782743454, + "learning_rate": 3.1040625e-06, + "loss": 0.0127, + "step": 15075 + }, + { + "epoch": 6.947319990798252, + "grad_norm": 0.8051169514656067, + "learning_rate": 3.0962500000000002e-06, + "loss": 0.0186, + "step": 15100 + }, + { + "epoch": 6.958822176213481, + "grad_norm": 0.3554774224758148, + "learning_rate": 3.0884375e-06, + "loss": 0.0083, + "step": 15125 + }, + { + "epoch": 6.97032436162871, + "grad_norm": 2.511303424835205, + "learning_rate": 3.0806250000000003e-06, + "loss": 0.0144, + "step": 15150 + }, + { + "epoch": 6.981826547043938, + "grad_norm": 0.17072099447250366, + "learning_rate": 3.0728125e-06, + "loss": 0.0097, + "step": 15175 + }, + { + "epoch": 6.993328732459167, + "grad_norm": 0.16506649553775787, + "learning_rate": 3.0650000000000003e-06, + "loss": 0.0163, + "step": 15200 + }, + { + "epoch": 7.004830917874396, + "grad_norm": 0.12069711089134216, + "learning_rate": 3.0571875e-06, + "loss": 0.0079, + "step": 15225 + }, + { + "epoch": 7.016333103289625, + "grad_norm": 0.12159192562103271, + "learning_rate": 3.0493750000000003e-06, + "loss": 0.007, + "step": 15250 + }, + { + "epoch": 7.027835288704854, + "grad_norm": 0.15364721417427063, + "learning_rate": 3.0415625e-06, + "loss": 0.0155, + "step": 15275 + }, + { + "epoch": 7.0393374741200825, + "grad_norm": 0.516640841960907, + "learning_rate": 3.03375e-06, + "loss": 0.0092, + "step": 15300 + }, + { + "epoch": 7.0508396595353116, + "grad_norm": 0.5874799489974976, + "learning_rate": 
3.0259375e-06, + "loss": 0.0079, + "step": 15325 + }, + { + "epoch": 7.062341844950541, + "grad_norm": 1.2351374626159668, + "learning_rate": 3.018125e-06, + "loss": 0.0148, + "step": 15350 + }, + { + "epoch": 7.07384403036577, + "grad_norm": 0.7071799635887146, + "learning_rate": 3.0103125000000002e-06, + "loss": 0.015, + "step": 15375 + }, + { + "epoch": 7.085346215780999, + "grad_norm": 7.619936943054199, + "learning_rate": 3.0025e-06, + "loss": 0.0063, + "step": 15400 + }, + { + "epoch": 7.096848401196227, + "grad_norm": 0.20653395354747772, + "learning_rate": 2.9946875000000003e-06, + "loss": 0.0088, + "step": 15425 + }, + { + "epoch": 7.108350586611456, + "grad_norm": 0.3233853280544281, + "learning_rate": 2.986875e-06, + "loss": 0.0058, + "step": 15450 + }, + { + "epoch": 7.119852772026685, + "grad_norm": 1.1703747510910034, + "learning_rate": 2.9790625000000003e-06, + "loss": 0.0149, + "step": 15475 + }, + { + "epoch": 7.131354957441914, + "grad_norm": 0.07214687764644623, + "learning_rate": 2.97125e-06, + "loss": 0.0041, + "step": 15500 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 1.454972505569458, + "learning_rate": 2.9634375000000004e-06, + "loss": 0.0121, + "step": 15525 + }, + { + "epoch": 7.154359328272371, + "grad_norm": 0.7644615769386292, + "learning_rate": 2.955625e-06, + "loss": 0.0086, + "step": 15550 + }, + { + "epoch": 7.1658615136876005, + "grad_norm": 3.402597188949585, + "learning_rate": 2.9478125000000004e-06, + "loss": 0.0095, + "step": 15575 + }, + { + "epoch": 7.17736369910283, + "grad_norm": 3.2097392082214355, + "learning_rate": 2.9400000000000002e-06, + "loss": 0.0104, + "step": 15600 + }, + { + "epoch": 7.188865884518059, + "grad_norm": 1.831574559211731, + "learning_rate": 2.9325000000000003e-06, + "loss": 0.0181, + "step": 15625 + }, + { + "epoch": 7.200368069933288, + "grad_norm": 2.661018133163452, + "learning_rate": 2.9246875e-06, + "loss": 0.0048, + "step": 15650 + }, + { + "epoch": 7.211870255348516, + "grad_norm": 1.9798295497894287, + "learning_rate": 2.9168750000000003e-06, + "loss": 0.0109, + "step": 15675 + }, + { + "epoch": 7.223372440763745, + "grad_norm": 1.1260945796966553, + "learning_rate": 2.9090625e-06, + "loss": 0.0076, + "step": 15700 + }, + { + "epoch": 7.234874626178974, + "grad_norm": 0.178892120718956, + "learning_rate": 2.90125e-06, + "loss": 0.0118, + "step": 15725 + }, + { + "epoch": 7.246376811594203, + "grad_norm": 2.5324740409851074, + "learning_rate": 2.8934374999999998e-06, + "loss": 0.0068, + "step": 15750 + }, + { + "epoch": 7.257878997009432, + "grad_norm": 0.655042290687561, + "learning_rate": 2.885625e-06, + "loss": 0.0075, + "step": 15775 + }, + { + "epoch": 7.26938118242466, + "grad_norm": 5.1071624755859375, + "learning_rate": 2.8778125000000002e-06, + "loss": 0.0065, + "step": 15800 + }, + { + "epoch": 7.280883367839889, + "grad_norm": 0.35025542974472046, + "learning_rate": 2.87e-06, + "loss": 0.0171, + "step": 15825 + }, + { + "epoch": 7.2923855532551185, + "grad_norm": 0.16820687055587769, + "learning_rate": 2.8621875000000003e-06, + "loss": 0.0077, + "step": 15850 + }, + { + "epoch": 7.303887738670348, + "grad_norm": 1.0210137367248535, + "learning_rate": 2.854375e-06, + "loss": 0.0088, + "step": 15875 + }, + { + "epoch": 7.315389924085577, + "grad_norm": 0.3880836069583893, + "learning_rate": 2.8465625000000003e-06, + "loss": 0.0155, + "step": 15900 + }, + { + "epoch": 7.326892109500805, + "grad_norm": 0.7286210060119629, + "learning_rate": 2.83875e-06, + "loss": 0.0153, + "step": 15925 + }, + { + 
"epoch": 7.338394294916034, + "grad_norm": 1.0173991918563843, + "learning_rate": 2.8309375000000004e-06, + "loss": 0.0082, + "step": 15950 + }, + { + "epoch": 7.349896480331263, + "grad_norm": 0.4859057664871216, + "learning_rate": 2.823125e-06, + "loss": 0.0104, + "step": 15975 + }, + { + "epoch": 7.361398665746492, + "grad_norm": 0.1217811331152916, + "learning_rate": 2.8153125000000004e-06, + "loss": 0.0075, + "step": 16000 + }, + { + "epoch": 7.372900851161721, + "grad_norm": 1.074440360069275, + "learning_rate": 2.8075000000000002e-06, + "loss": 0.0123, + "step": 16025 + }, + { + "epoch": 7.384403036576949, + "grad_norm": 5.824402332305908, + "learning_rate": 2.7996875e-06, + "loss": 0.0057, + "step": 16050 + }, + { + "epoch": 7.395905221992178, + "grad_norm": 2.5496740341186523, + "learning_rate": 2.791875e-06, + "loss": 0.0098, + "step": 16075 + }, + { + "epoch": 7.407407407407407, + "grad_norm": 5.917174339294434, + "learning_rate": 2.7840625e-06, + "loss": 0.0057, + "step": 16100 + }, + { + "epoch": 7.4189095928226365, + "grad_norm": 0.3634544909000397, + "learning_rate": 2.77625e-06, + "loss": 0.0128, + "step": 16125 + }, + { + "epoch": 7.430411778237866, + "grad_norm": 3.414944887161255, + "learning_rate": 2.7684375e-06, + "loss": 0.0086, + "step": 16150 + }, + { + "epoch": 7.441913963653094, + "grad_norm": 4.660429000854492, + "learning_rate": 2.760625e-06, + "loss": 0.0131, + "step": 16175 + }, + { + "epoch": 7.453416149068323, + "grad_norm": 2.309429407119751, + "learning_rate": 2.7528125e-06, + "loss": 0.0058, + "step": 16200 + }, + { + "epoch": 7.464918334483552, + "grad_norm": 5.828795909881592, + "learning_rate": 2.745e-06, + "loss": 0.0101, + "step": 16225 + }, + { + "epoch": 7.476420519898781, + "grad_norm": 0.5273516774177551, + "learning_rate": 2.7371875e-06, + "loss": 0.0059, + "step": 16250 + }, + { + "epoch": 7.48792270531401, + "grad_norm": 1.4716130495071411, + "learning_rate": 2.729375e-06, + "loss": 0.014, + "step": 16275 + }, + { + "epoch": 7.499424890729238, + "grad_norm": 1.8553239107131958, + "learning_rate": 2.7215625000000003e-06, + "loss": 0.0068, + "step": 16300 + }, + { + "epoch": 7.510927076144467, + "grad_norm": 2.6758711338043213, + "learning_rate": 2.71375e-06, + "loss": 0.0112, + "step": 16325 + }, + { + "epoch": 7.522429261559696, + "grad_norm": 0.19957537949085236, + "learning_rate": 2.7059375000000003e-06, + "loss": 0.0095, + "step": 16350 + }, + { + "epoch": 7.5339314469749254, + "grad_norm": 5.1007161140441895, + "learning_rate": 2.698125e-06, + "loss": 0.0109, + "step": 16375 + }, + { + "epoch": 7.545433632390154, + "grad_norm": 8.458159446716309, + "learning_rate": 2.6903125e-06, + "loss": 0.0071, + "step": 16400 + }, + { + "epoch": 7.556935817805383, + "grad_norm": 0.6693940758705139, + "learning_rate": 2.6825e-06, + "loss": 0.0187, + "step": 16425 + }, + { + "epoch": 7.568438003220612, + "grad_norm": 1.4511332511901855, + "learning_rate": 2.6746875e-06, + "loss": 0.0079, + "step": 16450 + }, + { + "epoch": 7.579940188635841, + "grad_norm": 3.054114818572998, + "learning_rate": 2.666875e-06, + "loss": 0.0185, + "step": 16475 + }, + { + "epoch": 7.59144237405107, + "grad_norm": 6.097264766693115, + "learning_rate": 2.6590625e-06, + "loss": 0.01, + "step": 16500 + }, + { + "epoch": 7.602944559466298, + "grad_norm": 1.9860618114471436, + "learning_rate": 2.6512500000000002e-06, + "loss": 0.0188, + "step": 16525 + }, + { + "epoch": 7.614446744881527, + "grad_norm": 0.1648847460746765, + "learning_rate": 2.6434375e-06, + "loss": 0.0114, + 
"step": 16550 + }, + { + "epoch": 7.625948930296756, + "grad_norm": 2.5525155067443848, + "learning_rate": 2.6356250000000003e-06, + "loss": 0.0158, + "step": 16575 + }, + { + "epoch": 7.637451115711985, + "grad_norm": 2.294396162033081, + "learning_rate": 2.6278125e-06, + "loss": 0.0093, + "step": 16600 + }, + { + "epoch": 7.648953301127214, + "grad_norm": 3.0494441986083984, + "learning_rate": 2.6200000000000003e-06, + "loss": 0.0233, + "step": 16625 + }, + { + "epoch": 7.660455486542443, + "grad_norm": 0.1415322870016098, + "learning_rate": 2.6121875e-06, + "loss": 0.0087, + "step": 16650 + }, + { + "epoch": 7.671957671957672, + "grad_norm": 2.2960498332977295, + "learning_rate": 2.6043750000000004e-06, + "loss": 0.0139, + "step": 16675 + }, + { + "epoch": 7.683459857372901, + "grad_norm": 0.5837172269821167, + "learning_rate": 2.5965625e-06, + "loss": 0.0055, + "step": 16700 + }, + { + "epoch": 7.69496204278813, + "grad_norm": 1.8971115350723267, + "learning_rate": 2.5887500000000004e-06, + "loss": 0.0115, + "step": 16725 + }, + { + "epoch": 7.706464228203359, + "grad_norm": 4.95072603225708, + "learning_rate": 2.5809375000000002e-06, + "loss": 0.0077, + "step": 16750 + }, + { + "epoch": 7.717966413618587, + "grad_norm": 2.0752921104431152, + "learning_rate": 2.573125e-06, + "loss": 0.0132, + "step": 16775 + }, + { + "epoch": 7.729468599033816, + "grad_norm": 0.07725714892148972, + "learning_rate": 2.5653125e-06, + "loss": 0.0089, + "step": 16800 + }, + { + "epoch": 7.740970784449045, + "grad_norm": 0.4947813153266907, + "learning_rate": 2.5575e-06, + "loss": 0.0089, + "step": 16825 + }, + { + "epoch": 7.752472969864274, + "grad_norm": 3.427701234817505, + "learning_rate": 2.5496875e-06, + "loss": 0.0087, + "step": 16850 + }, + { + "epoch": 7.763975155279503, + "grad_norm": 2.016444683074951, + "learning_rate": 2.541875e-06, + "loss": 0.0099, + "step": 16875 + }, + { + "epoch": 7.7754773406947315, + "grad_norm": 0.25170424580574036, + "learning_rate": 2.5340625e-06, + "loss": 0.007, + "step": 16900 + }, + { + "epoch": 7.786979526109961, + "grad_norm": 0.6936759948730469, + "learning_rate": 2.52625e-06, + "loss": 0.0069, + "step": 16925 + }, + { + "epoch": 7.79848171152519, + "grad_norm": 1.1857188940048218, + "learning_rate": 2.5184375e-06, + "loss": 0.008, + "step": 16950 + }, + { + "epoch": 7.809983896940419, + "grad_norm": 3.1954314708709717, + "learning_rate": 2.510625e-06, + "loss": 0.0147, + "step": 16975 + }, + { + "epoch": 7.821486082355648, + "grad_norm": 1.4031758308410645, + "learning_rate": 2.5028125e-06, + "loss": 0.0081, + "step": 17000 + }, + { + "epoch": 7.832988267770876, + "grad_norm": 0.52959805727005, + "learning_rate": 2.4950000000000003e-06, + "loss": 0.0123, + "step": 17025 + }, + { + "epoch": 7.844490453186105, + "grad_norm": 4.232771396636963, + "learning_rate": 2.4871875000000005e-06, + "loss": 0.0054, + "step": 17050 + }, + { + "epoch": 7.855992638601334, + "grad_norm": 0.2118764966726303, + "learning_rate": 2.4793750000000003e-06, + "loss": 0.0149, + "step": 17075 + }, + { + "epoch": 7.867494824016563, + "grad_norm": 0.4932823181152344, + "learning_rate": 2.4715625e-06, + "loss": 0.0098, + "step": 17100 + }, + { + "epoch": 7.878997009431792, + "grad_norm": 3.5741822719573975, + "learning_rate": 2.46375e-06, + "loss": 0.0128, + "step": 17125 + }, + { + "epoch": 7.8904991948470204, + "grad_norm": 1.6678423881530762, + "learning_rate": 2.4559375e-06, + "loss": 0.0071, + "step": 17150 + }, + { + "epoch": 7.9020013802622495, + "grad_norm": 3.0652177333831787, + 
"learning_rate": 2.448125e-06, + "loss": 0.0136, + "step": 17175 + }, + { + "epoch": 7.913503565677479, + "grad_norm": 1.4408318996429443, + "learning_rate": 2.4403125e-06, + "loss": 0.0045, + "step": 17200 + }, + { + "epoch": 7.925005751092708, + "grad_norm": 0.2646098732948303, + "learning_rate": 2.4325e-06, + "loss": 0.0155, + "step": 17225 + }, + { + "epoch": 7.936507936507937, + "grad_norm": 2.7373244762420654, + "learning_rate": 2.4246875000000002e-06, + "loss": 0.0067, + "step": 17250 + }, + { + "epoch": 7.948010121923165, + "grad_norm": 6.320342540740967, + "learning_rate": 2.416875e-06, + "loss": 0.0095, + "step": 17275 + }, + { + "epoch": 7.959512307338394, + "grad_norm": 0.8445401191711426, + "learning_rate": 2.4090625000000003e-06, + "loss": 0.0061, + "step": 17300 + }, + { + "epoch": 7.971014492753623, + "grad_norm": 1.2060354948043823, + "learning_rate": 2.40125e-06, + "loss": 0.0203, + "step": 17325 + }, + { + "epoch": 7.982516678168852, + "grad_norm": 1.851037621498108, + "learning_rate": 2.3934375000000003e-06, + "loss": 0.007, + "step": 17350 + }, + { + "epoch": 7.994018863584081, + "grad_norm": 0.901386022567749, + "learning_rate": 2.385625e-06, + "loss": 0.0088, + "step": 17375 + }, + { + "epoch": 8.00552104899931, + "grad_norm": 0.07420093566179276, + "learning_rate": 2.3778125000000004e-06, + "loss": 0.013, + "step": 17400 + }, + { + "epoch": 8.01702323441454, + "grad_norm": 0.05671022832393646, + "learning_rate": 2.37e-06, + "loss": 0.007, + "step": 17425 + }, + { + "epoch": 8.028525419829768, + "grad_norm": 0.5261373519897461, + "learning_rate": 2.3621875e-06, + "loss": 0.0071, + "step": 17450 + }, + { + "epoch": 8.040027605244996, + "grad_norm": 0.4346860349178314, + "learning_rate": 2.354375e-06, + "loss": 0.0067, + "step": 17475 + }, + { + "epoch": 8.051529790660226, + "grad_norm": 0.08124396204948425, + "learning_rate": 2.3465625e-06, + "loss": 0.0113, + "step": 17500 + }, + { + "epoch": 8.063031976075454, + "grad_norm": 2.8863022327423096, + "learning_rate": 2.33875e-06, + "loss": 0.0096, + "step": 17525 + }, + { + "epoch": 8.074534161490684, + "grad_norm": 0.4603404700756073, + "learning_rate": 2.3309375e-06, + "loss": 0.0111, + "step": 17550 + }, + { + "epoch": 8.086036346905912, + "grad_norm": 2.8856966495513916, + "learning_rate": 2.323125e-06, + "loss": 0.0059, + "step": 17575 + }, + { + "epoch": 8.09753853232114, + "grad_norm": 0.10746220499277115, + "learning_rate": 2.3153125e-06, + "loss": 0.0095, + "step": 17600 + }, + { + "epoch": 8.10904071773637, + "grad_norm": 0.5317927002906799, + "learning_rate": 2.3075e-06, + "loss": 0.006, + "step": 17625 + }, + { + "epoch": 8.120542903151598, + "grad_norm": 0.14040638506412506, + "learning_rate": 2.2996875e-06, + "loss": 0.0177, + "step": 17650 + }, + { + "epoch": 8.132045088566828, + "grad_norm": 0.5447073578834534, + "learning_rate": 2.2918750000000004e-06, + "loss": 0.0088, + "step": 17675 + }, + { + "epoch": 8.143547273982056, + "grad_norm": 2.9328360557556152, + "learning_rate": 2.2840625e-06, + "loss": 0.018, + "step": 17700 + }, + { + "epoch": 8.155049459397285, + "grad_norm": 0.19200760126113892, + "learning_rate": 2.2762500000000004e-06, + "loss": 0.0087, + "step": 17725 + }, + { + "epoch": 8.166551644812515, + "grad_norm": 0.07997579872608185, + "learning_rate": 2.2684375000000003e-06, + "loss": 0.0087, + "step": 17750 + }, + { + "epoch": 8.178053830227743, + "grad_norm": 2.795677900314331, + "learning_rate": 2.260625e-06, + "loss": 0.0095, + "step": 17775 + }, + { + "epoch": 8.189556015642973, + 
"grad_norm": 0.10675506293773651, + "learning_rate": 2.2528125e-06, + "loss": 0.0103, + "step": 17800 + }, + { + "epoch": 8.201058201058201, + "grad_norm": 1.484521746635437, + "learning_rate": 2.245e-06, + "loss": 0.0077, + "step": 17825 + }, + { + "epoch": 8.21256038647343, + "grad_norm": 1.530125379562378, + "learning_rate": 2.2371875e-06, + "loss": 0.0136, + "step": 17850 + }, + { + "epoch": 8.224062571888659, + "grad_norm": 8.444382667541504, + "learning_rate": 2.229375e-06, + "loss": 0.0061, + "step": 17875 + }, + { + "epoch": 8.235564757303887, + "grad_norm": 0.18373289704322815, + "learning_rate": 2.2215625e-06, + "loss": 0.0103, + "step": 17900 + }, + { + "epoch": 8.247066942719117, + "grad_norm": 8.391464233398438, + "learning_rate": 2.21375e-06, + "loss": 0.007, + "step": 17925 + }, + { + "epoch": 8.258569128134345, + "grad_norm": 0.1739552766084671, + "learning_rate": 2.2059375e-06, + "loss": 0.0041, + "step": 17950 + }, + { + "epoch": 8.270071313549574, + "grad_norm": 2.3926053047180176, + "learning_rate": 2.1981250000000002e-06, + "loss": 0.0049, + "step": 17975 + }, + { + "epoch": 8.281573498964804, + "grad_norm": 0.11882667243480682, + "learning_rate": 2.1903125e-06, + "loss": 0.0137, + "step": 18000 + }, + { + "epoch": 8.293075684380032, + "grad_norm": 0.7215703129768372, + "learning_rate": 2.1825000000000003e-06, + "loss": 0.0125, + "step": 18025 + }, + { + "epoch": 8.304577869795262, + "grad_norm": 0.07454714179039001, + "learning_rate": 2.1746875e-06, + "loss": 0.0048, + "step": 18050 + }, + { + "epoch": 8.31608005521049, + "grad_norm": 0.11876709014177322, + "learning_rate": 2.1668750000000003e-06, + "loss": 0.0083, + "step": 18075 + }, + { + "epoch": 8.327582240625718, + "grad_norm": 2.6528899669647217, + "learning_rate": 2.1590625e-06, + "loss": 0.0211, + "step": 18100 + }, + { + "epoch": 8.339084426040948, + "grad_norm": 0.046709995716810226, + "learning_rate": 2.15125e-06, + "loss": 0.0039, + "step": 18125 + }, + { + "epoch": 8.350586611456176, + "grad_norm": 0.5029109716415405, + "learning_rate": 2.1434374999999998e-06, + "loss": 0.0137, + "step": 18150 + }, + { + "epoch": 8.362088796871406, + "grad_norm": 8.609116554260254, + "learning_rate": 2.135625e-06, + "loss": 0.0081, + "step": 18175 + }, + { + "epoch": 8.373590982286634, + "grad_norm": 0.24618186056613922, + "learning_rate": 2.1278125e-06, + "loss": 0.0127, + "step": 18200 + }, + { + "epoch": 8.385093167701863, + "grad_norm": 0.26581060886383057, + "learning_rate": 2.12e-06, + "loss": 0.0058, + "step": 18225 + }, + { + "epoch": 8.396595353117092, + "grad_norm": 1.805894374847412, + "learning_rate": 2.1125e-06, + "loss": 0.0067, + "step": 18250 + }, + { + "epoch": 8.40809753853232, + "grad_norm": 0.11837717145681381, + "learning_rate": 2.1046875e-06, + "loss": 0.0085, + "step": 18275 + }, + { + "epoch": 8.41959972394755, + "grad_norm": 1.3716498613357544, + "learning_rate": 2.096875e-06, + "loss": 0.01, + "step": 18300 + }, + { + "epoch": 8.431101909362779, + "grad_norm": 0.13939572870731354, + "learning_rate": 2.0890625e-06, + "loss": 0.0042, + "step": 18325 + }, + { + "epoch": 8.442604094778007, + "grad_norm": 0.18078701198101044, + "learning_rate": 2.08125e-06, + "loss": 0.0062, + "step": 18350 + }, + { + "epoch": 8.454106280193237, + "grad_norm": 0.35175594687461853, + "learning_rate": 2.0734375e-06, + "loss": 0.0032, + "step": 18375 + }, + { + "epoch": 8.465608465608465, + "grad_norm": 1.0184096097946167, + "learning_rate": 2.0656250000000002e-06, + "loss": 0.0135, + "step": 18400 + }, + { + "epoch": 
8.477110651023695, + "grad_norm": 0.35224124789237976, + "learning_rate": 2.0578125e-06, + "loss": 0.0061, + "step": 18425 + }, + { + "epoch": 8.488612836438923, + "grad_norm": 4.74711275100708, + "learning_rate": 2.0500000000000003e-06, + "loss": 0.0144, + "step": 18450 + }, + { + "epoch": 8.500115021854151, + "grad_norm": 0.630029022693634, + "learning_rate": 2.0421875e-06, + "loss": 0.0049, + "step": 18475 + }, + { + "epoch": 8.511617207269381, + "grad_norm": 3.0329389572143555, + "learning_rate": 2.0343750000000003e-06, + "loss": 0.0107, + "step": 18500 + }, + { + "epoch": 8.52311939268461, + "grad_norm": 1.9780049324035645, + "learning_rate": 2.0265625e-06, + "loss": 0.0048, + "step": 18525 + }, + { + "epoch": 8.53462157809984, + "grad_norm": 2.6098361015319824, + "learning_rate": 2.0187500000000004e-06, + "loss": 0.0155, + "step": 18550 + }, + { + "epoch": 8.546123763515068, + "grad_norm": 0.2777227461338043, + "learning_rate": 2.0109375e-06, + "loss": 0.0056, + "step": 18575 + }, + { + "epoch": 8.557625948930296, + "grad_norm": 2.4276092052459717, + "learning_rate": 2.003125e-06, + "loss": 0.0126, + "step": 18600 + }, + { + "epoch": 8.569128134345526, + "grad_norm": 0.2835908532142639, + "learning_rate": 1.9953125e-06, + "loss": 0.0059, + "step": 18625 + }, + { + "epoch": 8.580630319760754, + "grad_norm": 2.6612679958343506, + "learning_rate": 1.9875e-06, + "loss": 0.0107, + "step": 18650 + }, + { + "epoch": 8.592132505175984, + "grad_norm": 1.2350229024887085, + "learning_rate": 1.9796875e-06, + "loss": 0.0093, + "step": 18675 + }, + { + "epoch": 8.603634690591212, + "grad_norm": 0.15607894957065582, + "learning_rate": 1.971875e-06, + "loss": 0.0107, + "step": 18700 + }, + { + "epoch": 8.61513687600644, + "grad_norm": 0.07845500856637955, + "learning_rate": 1.9640625e-06, + "loss": 0.0073, + "step": 18725 + }, + { + "epoch": 8.62663906142167, + "grad_norm": 1.0883415937423706, + "learning_rate": 1.95625e-06, + "loss": 0.0118, + "step": 18750 + }, + { + "epoch": 8.638141246836899, + "grad_norm": 0.17504793405532837, + "learning_rate": 1.9484375000000004e-06, + "loss": 0.0067, + "step": 18775 + }, + { + "epoch": 8.649643432252129, + "grad_norm": 1.5559245347976685, + "learning_rate": 1.940625e-06, + "loss": 0.0167, + "step": 18800 + }, + { + "epoch": 8.661145617667357, + "grad_norm": 0.06407686322927475, + "learning_rate": 1.9328125000000004e-06, + "loss": 0.0036, + "step": 18825 + }, + { + "epoch": 8.672647803082585, + "grad_norm": 1.3386480808258057, + "learning_rate": 1.925e-06, + "loss": 0.011, + "step": 18850 + }, + { + "epoch": 8.684149988497815, + "grad_norm": 0.0743364468216896, + "learning_rate": 1.9171875000000004e-06, + "loss": 0.0081, + "step": 18875 + }, + { + "epoch": 8.695652173913043, + "grad_norm": 2.949956178665161, + "learning_rate": 1.9093750000000002e-06, + "loss": 0.0072, + "step": 18900 + }, + { + "epoch": 8.707154359328273, + "grad_norm": 0.7283833026885986, + "learning_rate": 1.9015625000000003e-06, + "loss": 0.0049, + "step": 18925 + }, + { + "epoch": 8.718656544743501, + "grad_norm": 0.5794746279716492, + "learning_rate": 1.89375e-06, + "loss": 0.0101, + "step": 18950 + }, + { + "epoch": 8.73015873015873, + "grad_norm": 1.550223708152771, + "learning_rate": 1.8859375e-06, + "loss": 0.0068, + "step": 18975 + }, + { + "epoch": 8.74166091557396, + "grad_norm": 0.12086187303066254, + "learning_rate": 1.8781250000000001e-06, + "loss": 0.0096, + "step": 19000 + }, + { + "epoch": 8.753163100989187, + "grad_norm": 2.682513475418091, + "learning_rate": 
1.8703125000000001e-06, + "loss": 0.0048, + "step": 19025 + }, + { + "epoch": 8.764665286404417, + "grad_norm": 0.8820897340774536, + "learning_rate": 1.8625e-06, + "loss": 0.0067, + "step": 19050 + }, + { + "epoch": 8.776167471819646, + "grad_norm": 1.6323509216308594, + "learning_rate": 1.8546875000000002e-06, + "loss": 0.0066, + "step": 19075 + }, + { + "epoch": 8.787669657234874, + "grad_norm": 0.3018206059932709, + "learning_rate": 1.846875e-06, + "loss": 0.0123, + "step": 19100 + }, + { + "epoch": 8.799171842650104, + "grad_norm": 0.10852475464344025, + "learning_rate": 1.8390625000000002e-06, + "loss": 0.0059, + "step": 19125 + }, + { + "epoch": 8.810674028065332, + "grad_norm": 3.6190342903137207, + "learning_rate": 1.83125e-06, + "loss": 0.01, + "step": 19150 + }, + { + "epoch": 8.822176213480562, + "grad_norm": 4.896969795227051, + "learning_rate": 1.8234375e-06, + "loss": 0.007, + "step": 19175 + }, + { + "epoch": 8.83367839889579, + "grad_norm": 0.1353834569454193, + "learning_rate": 1.8156249999999999e-06, + "loss": 0.0082, + "step": 19200 + }, + { + "epoch": 8.845180584311018, + "grad_norm": 0.18897338211536407, + "learning_rate": 1.8078125000000001e-06, + "loss": 0.0034, + "step": 19225 + }, + { + "epoch": 8.856682769726248, + "grad_norm": 1.5457974672317505, + "learning_rate": 1.8e-06, + "loss": 0.0066, + "step": 19250 + }, + { + "epoch": 8.868184955141476, + "grad_norm": 0.9404191374778748, + "learning_rate": 1.7921875000000002e-06, + "loss": 0.0096, + "step": 19275 + }, + { + "epoch": 8.879687140556706, + "grad_norm": 0.1098804920911789, + "learning_rate": 1.784375e-06, + "loss": 0.0073, + "step": 19300 + }, + { + "epoch": 8.891189325971935, + "grad_norm": 1.549627661705017, + "learning_rate": 1.7765625000000002e-06, + "loss": 0.0043, + "step": 19325 + }, + { + "epoch": 8.902691511387163, + "grad_norm": 0.5851069092750549, + "learning_rate": 1.76875e-06, + "loss": 0.007, + "step": 19350 + }, + { + "epoch": 8.914193696802393, + "grad_norm": 0.2933562994003296, + "learning_rate": 1.7609375e-06, + "loss": 0.0059, + "step": 19375 + }, + { + "epoch": 8.925695882217621, + "grad_norm": 0.9020884037017822, + "learning_rate": 1.7531250000000003e-06, + "loss": 0.0108, + "step": 19400 + }, + { + "epoch": 8.93719806763285, + "grad_norm": 0.11604174226522446, + "learning_rate": 1.7453125e-06, + "loss": 0.0078, + "step": 19425 + }, + { + "epoch": 8.948700253048079, + "grad_norm": 0.2992897033691406, + "learning_rate": 1.7375000000000003e-06, + "loss": 0.0159, + "step": 19450 + }, + { + "epoch": 8.960202438463307, + "grad_norm": 6.1458306312561035, + "learning_rate": 1.7296875000000001e-06, + "loss": 0.0083, + "step": 19475 + }, + { + "epoch": 8.971704623878537, + "grad_norm": 0.6744798421859741, + "learning_rate": 1.7218750000000001e-06, + "loss": 0.0126, + "step": 19500 + }, + { + "epoch": 8.983206809293765, + "grad_norm": 0.13921253383159637, + "learning_rate": 1.7140625e-06, + "loss": 0.007, + "step": 19525 + }, + { + "epoch": 8.994708994708995, + "grad_norm": 0.5519439578056335, + "learning_rate": 1.7062500000000002e-06, + "loss": 0.0081, + "step": 19550 + }, + { + "epoch": 9.006211180124224, + "grad_norm": 0.20477654039859772, + "learning_rate": 1.6984375e-06, + "loss": 0.0056, + "step": 19575 + }, + { + "epoch": 9.017713365539452, + "grad_norm": 0.7848691940307617, + "learning_rate": 1.6906250000000002e-06, + "loss": 0.0097, + "step": 19600 + }, + { + "epoch": 9.029215550954682, + "grad_norm": 0.11725778132677078, + "learning_rate": 1.6828125e-06, + "loss": 0.0098, + "step": 
19625 + }, + { + "epoch": 9.04071773636991, + "grad_norm": 0.863402247428894, + "learning_rate": 1.6750000000000003e-06, + "loss": 0.0066, + "step": 19650 + }, + { + "epoch": 9.05221992178514, + "grad_norm": 0.6256903409957886, + "learning_rate": 1.6671875e-06, + "loss": 0.0051, + "step": 19675 + }, + { + "epoch": 9.063722107200368, + "grad_norm": 0.27045947313308716, + "learning_rate": 1.659375e-06, + "loss": 0.0062, + "step": 19700 + }, + { + "epoch": 9.075224292615596, + "grad_norm": 1.267508864402771, + "learning_rate": 1.6515625e-06, + "loss": 0.0096, + "step": 19725 + }, + { + "epoch": 9.086726478030826, + "grad_norm": 0.2194228619337082, + "learning_rate": 1.6437500000000001e-06, + "loss": 0.0073, + "step": 19750 + }, + { + "epoch": 9.098228663446054, + "grad_norm": 0.7281507849693298, + "learning_rate": 1.6359375e-06, + "loss": 0.0109, + "step": 19775 + }, + { + "epoch": 9.109730848861284, + "grad_norm": 0.041582778096199036, + "learning_rate": 1.6281250000000002e-06, + "loss": 0.0053, + "step": 19800 + }, + { + "epoch": 9.121233034276512, + "grad_norm": 1.166810393333435, + "learning_rate": 1.6203125e-06, + "loss": 0.005, + "step": 19825 + }, + { + "epoch": 9.13273521969174, + "grad_norm": 1.6084290742874146, + "learning_rate": 1.6125e-06, + "loss": 0.0039, + "step": 19850 + }, + { + "epoch": 9.14423740510697, + "grad_norm": 0.07307042181491852, + "learning_rate": 1.6046875e-06, + "loss": 0.0078, + "step": 19875 + }, + { + "epoch": 9.155739590522199, + "grad_norm": 0.04875970631837845, + "learning_rate": 1.596875e-06, + "loss": 0.0052, + "step": 19900 + }, + { + "epoch": 9.167241775937429, + "grad_norm": 0.23526020348072052, + "learning_rate": 1.5890624999999999e-06, + "loss": 0.0129, + "step": 19925 + }, + { + "epoch": 9.178743961352657, + "grad_norm": 0.24947020411491394, + "learning_rate": 1.5812500000000001e-06, + "loss": 0.0061, + "step": 19950 + }, + { + "epoch": 9.190246146767885, + "grad_norm": 0.8459863066673279, + "learning_rate": 1.5734375e-06, + "loss": 0.008, + "step": 19975 + }, + { + "epoch": 9.201748332183115, + "grad_norm": 0.057964421808719635, + "learning_rate": 1.5656250000000002e-06, + "loss": 0.0072, + "step": 20000 + }, + { + "epoch": 9.201748332183115, + "eval_loss": 0.21524043381214142, + "eval_runtime": 5582.0879, + "eval_samples_per_second": 1.701, + "eval_steps_per_second": 0.213, + "eval_wer": 0.0967008327994875, + "step": 20000 + }, + { + "epoch": 9.213250517598343, + "grad_norm": 0.24494586884975433, + "learning_rate": 1.5578125000000002e-06, + "loss": 0.0123, + "step": 20025 + }, + { + "epoch": 9.224752703013573, + "grad_norm": 0.16685990989208221, + "learning_rate": 1.55e-06, + "loss": 0.0032, + "step": 20050 + }, + { + "epoch": 9.236254888428801, + "grad_norm": 1.2150516510009766, + "learning_rate": 1.5421875e-06, + "loss": 0.0076, + "step": 20075 + }, + { + "epoch": 9.24775707384403, + "grad_norm": 3.1026265621185303, + "learning_rate": 1.534375e-06, + "loss": 0.003, + "step": 20100 + }, + { + "epoch": 9.25925925925926, + "grad_norm": 1.430039882659912, + "learning_rate": 1.5265625e-06, + "loss": 0.0064, + "step": 20125 + }, + { + "epoch": 9.270761444674488, + "grad_norm": 0.9484136700630188, + "learning_rate": 1.51875e-06, + "loss": 0.0067, + "step": 20150 + }, + { + "epoch": 9.282263630089718, + "grad_norm": 2.5765624046325684, + "learning_rate": 1.5109375e-06, + "loss": 0.0079, + "step": 20175 + }, + { + "epoch": 9.293765815504946, + "grad_norm": 0.821998119354248, + "learning_rate": 1.5031250000000001e-06, + "loss": 0.0047, + "step": 20200 + 
}, + { + "epoch": 9.305268000920174, + "grad_norm": 0.3511562645435333, + "learning_rate": 1.4953125e-06, + "loss": 0.0103, + "step": 20225 + }, + { + "epoch": 9.316770186335404, + "grad_norm": 0.04969533905386925, + "learning_rate": 1.4875e-06, + "loss": 0.0034, + "step": 20250 + }, + { + "epoch": 9.328272371750632, + "grad_norm": 0.08082418143749237, + "learning_rate": 1.4796875e-06, + "loss": 0.0072, + "step": 20275 + }, + { + "epoch": 9.339774557165862, + "grad_norm": 0.06818880885839462, + "learning_rate": 1.471875e-06, + "loss": 0.0067, + "step": 20300 + }, + { + "epoch": 9.35127674258109, + "grad_norm": 1.4060240983963013, + "learning_rate": 1.4640625000000002e-06, + "loss": 0.0071, + "step": 20325 + }, + { + "epoch": 9.362778927996319, + "grad_norm": 0.20024247467517853, + "learning_rate": 1.4562500000000002e-06, + "loss": 0.0044, + "step": 20350 + }, + { + "epoch": 9.374281113411548, + "grad_norm": 1.157453179359436, + "learning_rate": 1.4484375e-06, + "loss": 0.0094, + "step": 20375 + }, + { + "epoch": 9.385783298826777, + "grad_norm": 3.1247355937957764, + "learning_rate": 1.440625e-06, + "loss": 0.0088, + "step": 20400 + }, + { + "epoch": 9.397285484242007, + "grad_norm": 0.12344136089086533, + "learning_rate": 1.4328125e-06, + "loss": 0.0114, + "step": 20425 + }, + { + "epoch": 9.408787669657235, + "grad_norm": 0.16506707668304443, + "learning_rate": 1.4250000000000001e-06, + "loss": 0.007, + "step": 20450 + }, + { + "epoch": 9.420289855072463, + "grad_norm": 0.12009504437446594, + "learning_rate": 1.4171875000000001e-06, + "loss": 0.0062, + "step": 20475 + }, + { + "epoch": 9.431792040487693, + "grad_norm": 0.33941954374313354, + "learning_rate": 1.4093750000000002e-06, + "loss": 0.0049, + "step": 20500 + }, + { + "epoch": 9.443294225902921, + "grad_norm": 2.774742603302002, + "learning_rate": 1.4015625000000002e-06, + "loss": 0.015, + "step": 20525 + }, + { + "epoch": 9.454796411318151, + "grad_norm": 2.181804656982422, + "learning_rate": 1.39375e-06, + "loss": 0.0068, + "step": 20550 + }, + { + "epoch": 9.46629859673338, + "grad_norm": 0.1634143590927124, + "learning_rate": 1.3859375e-06, + "loss": 0.0131, + "step": 20575 + }, + { + "epoch": 9.477800782148607, + "grad_norm": 0.28276145458221436, + "learning_rate": 1.378125e-06, + "loss": 0.0033, + "step": 20600 + }, + { + "epoch": 9.489302967563837, + "grad_norm": 0.23474666476249695, + "learning_rate": 1.3703125e-06, + "loss": 0.0076, + "step": 20625 + }, + { + "epoch": 9.500805152979066, + "grad_norm": 0.23035980761051178, + "learning_rate": 1.3625e-06, + "loss": 0.0081, + "step": 20650 + }, + { + "epoch": 9.512307338394296, + "grad_norm": 0.15259630978107452, + "learning_rate": 1.3546875e-06, + "loss": 0.0078, + "step": 20675 + }, + { + "epoch": 9.523809523809524, + "grad_norm": 1.3978540897369385, + "learning_rate": 1.3468750000000001e-06, + "loss": 0.0039, + "step": 20700 + }, + { + "epoch": 9.535311709224752, + "grad_norm": 1.555802345275879, + "learning_rate": 1.3390625e-06, + "loss": 0.007, + "step": 20725 + }, + { + "epoch": 9.546813894639982, + "grad_norm": 0.11839170753955841, + "learning_rate": 1.33125e-06, + "loss": 0.0045, + "step": 20750 + }, + { + "epoch": 9.55831608005521, + "grad_norm": 1.8819234371185303, + "learning_rate": 1.3234375e-06, + "loss": 0.0079, + "step": 20775 + }, + { + "epoch": 9.56981826547044, + "grad_norm": 2.1347029209136963, + "learning_rate": 1.315625e-06, + "loss": 0.005, + "step": 20800 + }, + { + "epoch": 9.581320450885668, + "grad_norm": 0.05001327767968178, + "learning_rate": 
1.3078125e-06, + "loss": 0.0094, + "step": 20825 + }, + { + "epoch": 9.592822636300896, + "grad_norm": 4.695704460144043, + "learning_rate": 1.3e-06, + "loss": 0.0041, + "step": 20850 + }, + { + "epoch": 9.604324821716126, + "grad_norm": 0.27146437764167786, + "learning_rate": 1.2921875e-06, + "loss": 0.005, + "step": 20875 + }, + { + "epoch": 9.615827007131355, + "grad_norm": 2.277533769607544, + "learning_rate": 1.284375e-06, + "loss": 0.0064, + "step": 20900 + }, + { + "epoch": 9.627329192546584, + "grad_norm": 0.10806547850370407, + "learning_rate": 1.2765625e-06, + "loss": 0.0063, + "step": 20925 + }, + { + "epoch": 9.638831377961813, + "grad_norm": 1.3636436462402344, + "learning_rate": 1.2690625000000002e-06, + "loss": 0.0103, + "step": 20950 + }, + { + "epoch": 9.65033356337704, + "grad_norm": 2.6159441471099854, + "learning_rate": 1.2612500000000002e-06, + "loss": 0.0052, + "step": 20975 + }, + { + "epoch": 9.66183574879227, + "grad_norm": 0.5515117645263672, + "learning_rate": 1.2534375e-06, + "loss": 0.0077, + "step": 21000 + }, + { + "epoch": 9.673337934207499, + "grad_norm": 0.8103013038635254, + "learning_rate": 1.245625e-06, + "loss": 0.0091, + "step": 21025 + }, + { + "epoch": 9.684840119622729, + "grad_norm": 2.336944103240967, + "learning_rate": 1.2378125e-06, + "loss": 0.0048, + "step": 21050 + }, + { + "epoch": 9.696342305037957, + "grad_norm": 3.5888826847076416, + "learning_rate": 1.23e-06, + "loss": 0.0059, + "step": 21075 + }, + { + "epoch": 9.707844490453185, + "grad_norm": 2.42950439453125, + "learning_rate": 1.2221875e-06, + "loss": 0.0071, + "step": 21100 + }, + { + "epoch": 9.719346675868415, + "grad_norm": 1.5883959531784058, + "learning_rate": 1.2143750000000001e-06, + "loss": 0.0103, + "step": 21125 + }, + { + "epoch": 9.730848861283643, + "grad_norm": 0.9261289238929749, + "learning_rate": 1.2065625000000001e-06, + "loss": 0.0101, + "step": 21150 + }, + { + "epoch": 9.742351046698873, + "grad_norm": 2.5712246894836426, + "learning_rate": 1.19875e-06, + "loss": 0.0071, + "step": 21175 + }, + { + "epoch": 9.753853232114102, + "grad_norm": 6.325390815734863, + "learning_rate": 1.1909375e-06, + "loss": 0.0042, + "step": 21200 + }, + { + "epoch": 9.76535541752933, + "grad_norm": 0.3912803828716278, + "learning_rate": 1.183125e-06, + "loss": 0.0064, + "step": 21225 + }, + { + "epoch": 9.77685760294456, + "grad_norm": 4.111480712890625, + "learning_rate": 1.1753125e-06, + "loss": 0.0056, + "step": 21250 + }, + { + "epoch": 9.788359788359788, + "grad_norm": 1.7652671337127686, + "learning_rate": 1.1675e-06, + "loss": 0.0061, + "step": 21275 + }, + { + "epoch": 9.799861973775018, + "grad_norm": 1.497182011604309, + "learning_rate": 1.1596875e-06, + "loss": 0.0045, + "step": 21300 + }, + { + "epoch": 9.811364159190246, + "grad_norm": 0.23965400457382202, + "learning_rate": 1.151875e-06, + "loss": 0.0117, + "step": 21325 + }, + { + "epoch": 9.822866344605474, + "grad_norm": 1.4831360578536987, + "learning_rate": 1.1440625e-06, + "loss": 0.0067, + "step": 21350 + }, + { + "epoch": 9.834368530020704, + "grad_norm": 0.11675738543272018, + "learning_rate": 1.13625e-06, + "loss": 0.0047, + "step": 21375 + }, + { + "epoch": 9.845870715435932, + "grad_norm": 0.06517274677753448, + "learning_rate": 1.1284375e-06, + "loss": 0.0076, + "step": 21400 + }, + { + "epoch": 9.857372900851162, + "grad_norm": 0.10633418709039688, + "learning_rate": 1.120625e-06, + "loss": 0.0061, + "step": 21425 + }, + { + "epoch": 9.86887508626639, + "grad_norm": 0.047500479966402054, + 
"learning_rate": 1.1128125000000002e-06, + "loss": 0.0058, + "step": 21450 + }, + { + "epoch": 9.880377271681619, + "grad_norm": 1.3911513090133667, + "learning_rate": 1.1050000000000002e-06, + "loss": 0.0097, + "step": 21475 + }, + { + "epoch": 9.891879457096849, + "grad_norm": 0.05551273375749588, + "learning_rate": 1.0971875e-06, + "loss": 0.0071, + "step": 21500 + }, + { + "epoch": 9.903381642512077, + "grad_norm": 0.4141998291015625, + "learning_rate": 1.089375e-06, + "loss": 0.0068, + "step": 21525 + }, + { + "epoch": 9.914883827927307, + "grad_norm": 0.11964666098356247, + "learning_rate": 1.0815625e-06, + "loss": 0.0058, + "step": 21550 + }, + { + "epoch": 9.926386013342535, + "grad_norm": 0.20705200731754303, + "learning_rate": 1.07375e-06, + "loss": 0.0082, + "step": 21575 + }, + { + "epoch": 9.937888198757763, + "grad_norm": 0.06934256851673126, + "learning_rate": 1.0659375000000001e-06, + "loss": 0.006, + "step": 21600 + }, + { + "epoch": 9.949390384172993, + "grad_norm": 0.9905650615692139, + "learning_rate": 1.0581250000000001e-06, + "loss": 0.0082, + "step": 21625 + }, + { + "epoch": 9.960892569588221, + "grad_norm": 6.720580101013184, + "learning_rate": 1.0503125000000002e-06, + "loss": 0.0087, + "step": 21650 + }, + { + "epoch": 9.972394755003451, + "grad_norm": 2.596421480178833, + "learning_rate": 1.0425e-06, + "loss": 0.0103, + "step": 21675 + }, + { + "epoch": 9.98389694041868, + "grad_norm": 0.18345965445041656, + "learning_rate": 1.0346875e-06, + "loss": 0.006, + "step": 21700 + }, + { + "epoch": 9.995399125833908, + "grad_norm": 0.20510388910770416, + "learning_rate": 1.026875e-06, + "loss": 0.01, + "step": 21725 + }, + { + "epoch": 10.006901311249138, + "grad_norm": 1.5538175106048584, + "learning_rate": 1.0190625e-06, + "loss": 0.0051, + "step": 21750 + }, + { + "epoch": 10.018403496664366, + "grad_norm": 4.992893695831299, + "learning_rate": 1.01125e-06, + "loss": 0.007, + "step": 21775 + }, + { + "epoch": 10.029905682079596, + "grad_norm": 2.5293116569519043, + "learning_rate": 1.0034375e-06, + "loss": 0.0097, + "step": 21800 + }, + { + "epoch": 10.041407867494824, + "grad_norm": 2.987748861312866, + "learning_rate": 9.95625e-07, + "loss": 0.0039, + "step": 21825 + }, + { + "epoch": 10.052910052910052, + "grad_norm": 0.8729166984558105, + "learning_rate": 9.878125000000001e-07, + "loss": 0.0102, + "step": 21850 + }, + { + "epoch": 10.064412238325282, + "grad_norm": 4.021528244018555, + "learning_rate": 9.8e-07, + "loss": 0.0032, + "step": 21875 + }, + { + "epoch": 10.07591442374051, + "grad_norm": 0.037765491753816605, + "learning_rate": 9.721875e-07, + "loss": 0.0061, + "step": 21900 + }, + { + "epoch": 10.08741660915574, + "grad_norm": 0.4817400574684143, + "learning_rate": 9.64375e-07, + "loss": 0.0052, + "step": 21925 + }, + { + "epoch": 10.098918794570968, + "grad_norm": 0.18971405923366547, + "learning_rate": 9.565625e-07, + "loss": 0.0065, + "step": 21950 + }, + { + "epoch": 10.110420979986197, + "grad_norm": 2.408292770385742, + "learning_rate": 9.4875e-07, + "loss": 0.0049, + "step": 21975 + }, + { + "epoch": 10.121923165401427, + "grad_norm": 1.3557521104812622, + "learning_rate": 9.409374999999999e-07, + "loss": 0.0069, + "step": 22000 + }, + { + "epoch": 10.133425350816655, + "grad_norm": 4.988788604736328, + "learning_rate": 9.33125e-07, + "loss": 0.0037, + "step": 22025 + }, + { + "epoch": 10.144927536231885, + "grad_norm": 1.5118516683578491, + "learning_rate": 9.253125e-07, + "loss": 0.0041, + "step": 22050 + }, + { + "epoch": 10.156429721647113, 
+ "grad_norm": 0.13832196593284607, + "learning_rate": 9.175000000000001e-07, + "loss": 0.0058, + "step": 22075 + }, + { + "epoch": 10.167931907062341, + "grad_norm": 2.6763222217559814, + "learning_rate": 9.096875000000001e-07, + "loss": 0.0073, + "step": 22100 + }, + { + "epoch": 10.179434092477571, + "grad_norm": 0.1248018741607666, + "learning_rate": 9.018750000000002e-07, + "loss": 0.01, + "step": 22125 + }, + { + "epoch": 10.1909362778928, + "grad_norm": 0.22059573233127594, + "learning_rate": 8.940625000000001e-07, + "loss": 0.0099, + "step": 22150 + }, + { + "epoch": 10.20243846330803, + "grad_norm": 0.35290199518203735, + "learning_rate": 8.862500000000001e-07, + "loss": 0.0043, + "step": 22175 + }, + { + "epoch": 10.213940648723257, + "grad_norm": 0.038650188595056534, + "learning_rate": 8.784375000000001e-07, + "loss": 0.0073, + "step": 22200 + }, + { + "epoch": 10.225442834138486, + "grad_norm": 1.4717687368392944, + "learning_rate": 8.706250000000001e-07, + "loss": 0.0084, + "step": 22225 + }, + { + "epoch": 10.236945019553715, + "grad_norm": 0.5546708703041077, + "learning_rate": 8.628125e-07, + "loss": 0.0067, + "step": 22250 + }, + { + "epoch": 10.248447204968944, + "grad_norm": 0.07813633978366852, + "learning_rate": 8.550000000000001e-07, + "loss": 0.0034, + "step": 22275 + }, + { + "epoch": 10.259949390384174, + "grad_norm": 0.40856441855430603, + "learning_rate": 8.471875000000001e-07, + "loss": 0.0091, + "step": 22300 + }, + { + "epoch": 10.271451575799402, + "grad_norm": 0.05716840550303459, + "learning_rate": 8.39375e-07, + "loss": 0.0069, + "step": 22325 + }, + { + "epoch": 10.28295376121463, + "grad_norm": 0.07827286422252655, + "learning_rate": 8.315625e-07, + "loss": 0.0066, + "step": 22350 + }, + { + "epoch": 10.29445594662986, + "grad_norm": 8.593811988830566, + "learning_rate": 8.237500000000001e-07, + "loss": 0.0088, + "step": 22375 + }, + { + "epoch": 10.305958132045088, + "grad_norm": 1.3823190927505493, + "learning_rate": 8.159375000000001e-07, + "loss": 0.0085, + "step": 22400 + }, + { + "epoch": 10.317460317460318, + "grad_norm": 5.355988025665283, + "learning_rate": 8.08125e-07, + "loss": 0.0031, + "step": 22425 + }, + { + "epoch": 10.328962502875546, + "grad_norm": 0.39019063115119934, + "learning_rate": 8.003125e-07, + "loss": 0.006, + "step": 22450 + }, + { + "epoch": 10.340464688290774, + "grad_norm": 5.189722537994385, + "learning_rate": 7.925e-07, + "loss": 0.0064, + "step": 22475 + }, + { + "epoch": 10.351966873706004, + "grad_norm": 1.724393606185913, + "learning_rate": 7.846875000000001e-07, + "loss": 0.0067, + "step": 22500 + }, + { + "epoch": 10.363469059121233, + "grad_norm": 5.20131778717041, + "learning_rate": 7.76875e-07, + "loss": 0.0077, + "step": 22525 + }, + { + "epoch": 10.374971244536463, + "grad_norm": 0.0890192911028862, + "learning_rate": 7.690625000000001e-07, + "loss": 0.0036, + "step": 22550 + }, + { + "epoch": 10.38647342995169, + "grad_norm": 0.6779229640960693, + "learning_rate": 7.612500000000001e-07, + "loss": 0.0048, + "step": 22575 + }, + { + "epoch": 10.397975615366919, + "grad_norm": 0.616266667842865, + "learning_rate": 7.534375e-07, + "loss": 0.0097, + "step": 22600 + }, + { + "epoch": 10.409477800782149, + "grad_norm": 0.03939608484506607, + "learning_rate": 7.456250000000001e-07, + "loss": 0.0047, + "step": 22625 + }, + { + "epoch": 10.420979986197377, + "grad_norm": 0.14820168912410736, + "learning_rate": 7.378125000000001e-07, + "loss": 0.0096, + "step": 22650 + }, + { + "epoch": 10.432482171612607, + 
"grad_norm": 0.12704187631607056, + "learning_rate": 7.3e-07, + "loss": 0.0066, + "step": 22675 + }, + { + "epoch": 10.443984357027835, + "grad_norm": 0.037611182779073715, + "learning_rate": 7.221875e-07, + "loss": 0.0061, + "step": 22700 + }, + { + "epoch": 10.455486542443063, + "grad_norm": 4.043584823608398, + "learning_rate": 7.14375e-07, + "loss": 0.0085, + "step": 22725 + }, + { + "epoch": 10.466988727858293, + "grad_norm": 2.1938695907592773, + "learning_rate": 7.065625000000001e-07, + "loss": 0.0123, + "step": 22750 + }, + { + "epoch": 10.478490913273522, + "grad_norm": 0.1777833104133606, + "learning_rate": 6.9875e-07, + "loss": 0.0041, + "step": 22775 + }, + { + "epoch": 10.489993098688752, + "grad_norm": 0.21180225908756256, + "learning_rate": 6.909375e-07, + "loss": 0.0089, + "step": 22800 + }, + { + "epoch": 10.50149528410398, + "grad_norm": 0.3625020682811737, + "learning_rate": 6.83125e-07, + "loss": 0.0052, + "step": 22825 + }, + { + "epoch": 10.512997469519208, + "grad_norm": 1.3957737684249878, + "learning_rate": 6.753124999999999e-07, + "loss": 0.0105, + "step": 22850 + }, + { + "epoch": 10.524499654934438, + "grad_norm": 3.243558168411255, + "learning_rate": 6.675000000000001e-07, + "loss": 0.005, + "step": 22875 + }, + { + "epoch": 10.536001840349666, + "grad_norm": 2.227569341659546, + "learning_rate": 6.596875000000001e-07, + "loss": 0.0081, + "step": 22900 + }, + { + "epoch": 10.547504025764896, + "grad_norm": 2.815009593963623, + "learning_rate": 6.51875e-07, + "loss": 0.0078, + "step": 22925 + }, + { + "epoch": 10.559006211180124, + "grad_norm": 0.16958042979240417, + "learning_rate": 6.440625e-07, + "loss": 0.0127, + "step": 22950 + }, + { + "epoch": 10.570508396595352, + "grad_norm": 4.25639009475708, + "learning_rate": 6.3625e-07, + "loss": 0.0088, + "step": 22975 + }, + { + "epoch": 10.582010582010582, + "grad_norm": 0.4477657675743103, + "learning_rate": 6.284375000000001e-07, + "loss": 0.0043, + "step": 23000 + }, + { + "epoch": 10.59351276742581, + "grad_norm": 2.4832139015197754, + "learning_rate": 6.20625e-07, + "loss": 0.0045, + "step": 23025 + }, + { + "epoch": 10.60501495284104, + "grad_norm": 0.2865842282772064, + "learning_rate": 6.128125e-07, + "loss": 0.007, + "step": 23050 + }, + { + "epoch": 10.616517138256269, + "grad_norm": 0.3463385999202728, + "learning_rate": 6.05e-07, + "loss": 0.0068, + "step": 23075 + }, + { + "epoch": 10.628019323671497, + "grad_norm": 0.08812834322452545, + "learning_rate": 5.971875e-07, + "loss": 0.005, + "step": 23100 + }, + { + "epoch": 10.639521509086727, + "grad_norm": 0.11067871749401093, + "learning_rate": 5.89375e-07, + "loss": 0.0047, + "step": 23125 + }, + { + "epoch": 10.651023694501955, + "grad_norm": 1.1273283958435059, + "learning_rate": 5.815625e-07, + "loss": 0.0049, + "step": 23150 + }, + { + "epoch": 10.662525879917185, + "grad_norm": 0.0541173480451107, + "learning_rate": 5.737500000000001e-07, + "loss": 0.0042, + "step": 23175 + }, + { + "epoch": 10.674028065332413, + "grad_norm": 0.04831864312291145, + "learning_rate": 5.659375e-07, + "loss": 0.0075, + "step": 23200 + }, + { + "epoch": 10.685530250747641, + "grad_norm": 0.07917584478855133, + "learning_rate": 5.581250000000001e-07, + "loss": 0.0044, + "step": 23225 + }, + { + "epoch": 10.697032436162871, + "grad_norm": 2.2876992225646973, + "learning_rate": 5.503125000000001e-07, + "loss": 0.0073, + "step": 23250 + }, + { + "epoch": 10.7085346215781, + "grad_norm": 0.03435774892568588, + "learning_rate": 5.425e-07, + "loss": 0.0031, + "step": 23275 
+ }, + { + "epoch": 10.72003680699333, + "grad_norm": 1.8223545551300049, + "learning_rate": 5.346875e-07, + "loss": 0.0072, + "step": 23300 + }, + { + "epoch": 10.731538992408558, + "grad_norm": 12.397570610046387, + "learning_rate": 5.26875e-07, + "loss": 0.0035, + "step": 23325 + }, + { + "epoch": 10.743041177823786, + "grad_norm": 0.5610597133636475, + "learning_rate": 5.19375e-07, + "loss": 0.0088, + "step": 23350 + }, + { + "epoch": 10.754543363239016, + "grad_norm": 0.4081664979457855, + "learning_rate": 5.115625e-07, + "loss": 0.0051, + "step": 23375 + }, + { + "epoch": 10.766045548654244, + "grad_norm": 0.123787522315979, + "learning_rate": 5.0375e-07, + "loss": 0.0069, + "step": 23400 + }, + { + "epoch": 10.777547734069474, + "grad_norm": 2.293886423110962, + "learning_rate": 4.959375000000001e-07, + "loss": 0.0057, + "step": 23425 + }, + { + "epoch": 10.789049919484702, + "grad_norm": 6.182718753814697, + "learning_rate": 4.881250000000001e-07, + "loss": 0.0082, + "step": 23450 + }, + { + "epoch": 10.80055210489993, + "grad_norm": 0.033033497631549835, + "learning_rate": 4.803125e-07, + "loss": 0.0064, + "step": 23475 + }, + { + "epoch": 10.81205429031516, + "grad_norm": 0.5063899755477905, + "learning_rate": 4.7250000000000003e-07, + "loss": 0.0107, + "step": 23500 + }, + { + "epoch": 10.823556475730388, + "grad_norm": 1.6246389150619507, + "learning_rate": 4.646875e-07, + "loss": 0.0066, + "step": 23525 + }, + { + "epoch": 10.835058661145618, + "grad_norm": 0.16620787978172302, + "learning_rate": 4.56875e-07, + "loss": 0.0059, + "step": 23550 + }, + { + "epoch": 10.846560846560847, + "grad_norm": 0.7030823826789856, + "learning_rate": 4.490625e-07, + "loss": 0.0022, + "step": 23575 + }, + { + "epoch": 10.858063031976075, + "grad_norm": 0.8228742480278015, + "learning_rate": 4.4125e-07, + "loss": 0.0125, + "step": 23600 + }, + { + "epoch": 10.869565217391305, + "grad_norm": 4.570178031921387, + "learning_rate": 4.334375e-07, + "loss": 0.0052, + "step": 23625 + }, + { + "epoch": 10.881067402806533, + "grad_norm": 2.8716719150543213, + "learning_rate": 4.25625e-07, + "loss": 0.005, + "step": 23650 + }, + { + "epoch": 10.892569588221763, + "grad_norm": 5.523809909820557, + "learning_rate": 4.178125000000001e-07, + "loss": 0.0063, + "step": 23675 + }, + { + "epoch": 10.904071773636991, + "grad_norm": 1.16761314868927, + "learning_rate": 4.1000000000000004e-07, + "loss": 0.0095, + "step": 23700 + }, + { + "epoch": 10.91557395905222, + "grad_norm": 0.024232987314462662, + "learning_rate": 4.0218750000000007e-07, + "loss": 0.0036, + "step": 23725 + }, + { + "epoch": 10.92707614446745, + "grad_norm": 4.76410436630249, + "learning_rate": 3.9437500000000004e-07, + "loss": 0.0097, + "step": 23750 + }, + { + "epoch": 10.938578329882677, + "grad_norm": 5.289037704467773, + "learning_rate": 3.8656250000000006e-07, + "loss": 0.006, + "step": 23775 + }, + { + "epoch": 10.950080515297907, + "grad_norm": 0.05810718610882759, + "learning_rate": 3.7875000000000003e-07, + "loss": 0.0063, + "step": 23800 + }, + { + "epoch": 10.961582700713135, + "grad_norm": 2.577594757080078, + "learning_rate": 3.709375e-07, + "loss": 0.0041, + "step": 23825 + }, + { + "epoch": 10.973084886128364, + "grad_norm": 0.039299797266721725, + "learning_rate": 3.63125e-07, + "loss": 0.0062, + "step": 23850 + }, + { + "epoch": 10.984587071543594, + "grad_norm": 1.8657886981964111, + "learning_rate": 3.553125e-07, + "loss": 0.0029, + "step": 23875 + }, + { + "epoch": 10.996089256958822, + "grad_norm": 0.037355002015829086, + 
"learning_rate": 3.475e-07, + "loss": 0.0068, + "step": 23900 + }, + { + "epoch": 11.007591442374052, + "grad_norm": 0.03586237132549286, + "learning_rate": 3.3968750000000003e-07, + "loss": 0.0055, + "step": 23925 + }, + { + "epoch": 11.01909362778928, + "grad_norm": 1.9938002824783325, + "learning_rate": 3.31875e-07, + "loss": 0.0054, + "step": 23950 + }, + { + "epoch": 11.030595813204508, + "grad_norm": 0.05722161382436752, + "learning_rate": 3.240625e-07, + "loss": 0.0022, + "step": 23975 + }, + { + "epoch": 11.042097998619738, + "grad_norm": 0.16819870471954346, + "learning_rate": 3.1625e-07, + "loss": 0.0077, + "step": 24000 + }, + { + "epoch": 11.053600184034966, + "grad_norm": 1.404698371887207, + "learning_rate": 3.084375e-07, + "loss": 0.0078, + "step": 24025 + }, + { + "epoch": 11.065102369450196, + "grad_norm": 1.955828309059143, + "learning_rate": 3.00625e-07, + "loss": 0.0056, + "step": 24050 + }, + { + "epoch": 11.076604554865424, + "grad_norm": 0.1421104073524475, + "learning_rate": 2.9281250000000006e-07, + "loss": 0.0044, + "step": 24075 + }, + { + "epoch": 11.088106740280653, + "grad_norm": 0.2362927496433258, + "learning_rate": 2.85e-07, + "loss": 0.0046, + "step": 24100 + }, + { + "epoch": 11.099608925695883, + "grad_norm": 0.3808918297290802, + "learning_rate": 2.771875e-07, + "loss": 0.0041, + "step": 24125 + }, + { + "epoch": 11.11111111111111, + "grad_norm": 0.596932590007782, + "learning_rate": 2.69375e-07, + "loss": 0.0036, + "step": 24150 + }, + { + "epoch": 11.12261329652634, + "grad_norm": 2.1200876235961914, + "learning_rate": 2.615625e-07, + "loss": 0.0055, + "step": 24175 + }, + { + "epoch": 11.134115481941569, + "grad_norm": 0.17938555777072906, + "learning_rate": 2.5375e-07, + "loss": 0.0035, + "step": 24200 + }, + { + "epoch": 11.145617667356797, + "grad_norm": 0.06144664064049721, + "learning_rate": 2.4593750000000003e-07, + "loss": 0.0066, + "step": 24225 + }, + { + "epoch": 11.157119852772027, + "grad_norm": 0.05789117515087128, + "learning_rate": 2.3812500000000002e-07, + "loss": 0.0033, + "step": 24250 + }, + { + "epoch": 11.168622038187255, + "grad_norm": 1.0644292831420898, + "learning_rate": 2.3031250000000002e-07, + "loss": 0.0055, + "step": 24275 + }, + { + "epoch": 11.180124223602485, + "grad_norm": 0.05568142980337143, + "learning_rate": 2.2250000000000001e-07, + "loss": 0.0096, + "step": 24300 + }, + { + "epoch": 11.191626409017713, + "grad_norm": 3.572922468185425, + "learning_rate": 2.146875e-07, + "loss": 0.0058, + "step": 24325 + }, + { + "epoch": 11.203128594432942, + "grad_norm": 0.3286091089248657, + "learning_rate": 2.06875e-07, + "loss": 0.0035, + "step": 24350 + }, + { + "epoch": 11.214630779848171, + "grad_norm": 3.0645318031311035, + "learning_rate": 1.9906250000000003e-07, + "loss": 0.007, + "step": 24375 + }, + { + "epoch": 11.2261329652634, + "grad_norm": 9.076870918273926, + "learning_rate": 1.9125e-07, + "loss": 0.0038, + "step": 24400 + }, + { + "epoch": 11.23763515067863, + "grad_norm": 0.054851166903972626, + "learning_rate": 1.8343750000000002e-07, + "loss": 0.0049, + "step": 24425 + }, + { + "epoch": 11.249137336093858, + "grad_norm": 0.7788383364677429, + "learning_rate": 1.75625e-07, + "loss": 0.0044, + "step": 24450 + }, + { + "epoch": 11.260639521509086, + "grad_norm": 0.06403613835573196, + "learning_rate": 1.678125e-07, + "loss": 0.0055, + "step": 24475 + }, + { + "epoch": 11.272141706924316, + "grad_norm": 0.1481279581785202, + "learning_rate": 1.6e-07, + "loss": 0.0043, + "step": 24500 + }, + { + "epoch": 
11.283643892339544, + "grad_norm": 1.9446523189544678, + "learning_rate": 1.521875e-07, + "loss": 0.0065, + "step": 24525 + }, + { + "epoch": 11.295146077754774, + "grad_norm": 0.01817043498158455, + "learning_rate": 1.44375e-07, + "loss": 0.0043, + "step": 24550 + }, + { + "epoch": 11.306648263170002, + "grad_norm": 2.417325258255005, + "learning_rate": 1.3656250000000002e-07, + "loss": 0.0037, + "step": 24575 + }, + { + "epoch": 11.31815044858523, + "grad_norm": 0.08347784727811813, + "learning_rate": 1.2875e-07, + "loss": 0.0025, + "step": 24600 + }, + { + "epoch": 11.32965263400046, + "grad_norm": 3.372894763946533, + "learning_rate": 1.209375e-07, + "loss": 0.0099, + "step": 24625 + }, + { + "epoch": 11.341154819415689, + "grad_norm": 0.43756258487701416, + "learning_rate": 1.1312500000000002e-07, + "loss": 0.0058, + "step": 24650 + }, + { + "epoch": 11.352657004830919, + "grad_norm": 0.4029462933540344, + "learning_rate": 1.0531250000000001e-07, + "loss": 0.0061, + "step": 24675 + }, + { + "epoch": 11.364159190246147, + "grad_norm": 4.701152324676514, + "learning_rate": 9.75e-08, + "loss": 0.0051, + "step": 24700 + }, + { + "epoch": 11.375661375661375, + "grad_norm": 0.5707642436027527, + "learning_rate": 8.96875e-08, + "loss": 0.0054, + "step": 24725 + }, + { + "epoch": 11.387163561076605, + "grad_norm": 0.0709519162774086, + "learning_rate": 8.187500000000001e-08, + "loss": 0.0031, + "step": 24750 + }, + { + "epoch": 11.398665746491833, + "grad_norm": 1.804284930229187, + "learning_rate": 7.40625e-08, + "loss": 0.0052, + "step": 24775 + }, + { + "epoch": 11.410167931907063, + "grad_norm": 0.04206651449203491, + "learning_rate": 6.625e-08, + "loss": 0.0038, + "step": 24800 + }, + { + "epoch": 11.421670117322291, + "grad_norm": 0.5502020716667175, + "learning_rate": 5.843750000000001e-08, + "loss": 0.0087, + "step": 24825 + }, + { + "epoch": 11.43317230273752, + "grad_norm": 0.21491751074790955, + "learning_rate": 5.0625e-08, + "loss": 0.003, + "step": 24850 + }, + { + "epoch": 11.44467448815275, + "grad_norm": 2.5923168659210205, + "learning_rate": 4.28125e-08, + "loss": 0.0059, + "step": 24875 + }, + { + "epoch": 11.456176673567978, + "grad_norm": 0.21680384874343872, + "learning_rate": 3.5e-08, + "loss": 0.0063, + "step": 24900 + }, + { + "epoch": 11.467678858983207, + "grad_norm": 0.0767444297671318, + "learning_rate": 2.7187499999999998e-08, + "loss": 0.0054, + "step": 24925 + }, + { + "epoch": 11.479181044398436, + "grad_norm": 0.42297929525375366, + "learning_rate": 1.9375e-08, + "loss": 0.0025, + "step": 24950 + }, + { + "epoch": 11.490683229813664, + "grad_norm": 0.13791832327842712, + "learning_rate": 1.1562500000000002e-08, + "loss": 0.0088, + "step": 24975 + }, + { + "epoch": 11.502185415228894, + "grad_norm": 5.369208812713623, + "learning_rate": 3.75e-09, + "loss": 0.0074, + "step": 25000 + }, + { + "epoch": 11.502185415228894, + "eval_loss": 0.2163197249174118, + "eval_runtime": 5391.0097, + "eval_samples_per_second": 1.761, + "eval_steps_per_second": 0.22, + "eval_wer": 0.09290518898142217, + "step": 25000 + }, + { + "epoch": 11.502185415228894, + "step": 25000, + "total_flos": 4.082307363540173e+20, + "train_loss": 0.05244532932087779, + "train_runtime": 190521.5004, + "train_samples_per_second": 2.1, + "train_steps_per_second": 0.131 + } + ], + "logging_steps": 25, + "max_steps": 25000, + "num_input_tokens_seen": 0, + "num_train_epochs": 12, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + 
"should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.082307363540173e+20, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}