diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15609 @@ +{ + "best_metric": 0.19376881420612335, + "best_model_checkpoint": "d:\\\\whisper-large-v3-pt-3000h-4\\checkpoint-5529", + "epoch": 10.0, + "eval_steps": 500, + "global_step": 55290, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004521613311629589, + "grad_norm": 2.3342323303222656, + "learning_rate": 2.5e-08, + "loss": 0.0451, + "step": 25 + }, + { + "epoch": 0.009043226623259178, + "grad_norm": 2.0830228328704834, + "learning_rate": 5e-08, + "loss": 0.0393, + "step": 50 + }, + { + "epoch": 0.013564839934888768, + "grad_norm": 3.438735008239746, + "learning_rate": 7.5e-08, + "loss": 0.0355, + "step": 75 + }, + { + "epoch": 0.018086453246518357, + "grad_norm": 14.74426555633545, + "learning_rate": 1e-07, + "loss": 0.0359, + "step": 100 + }, + { + "epoch": 0.022608066558147948, + "grad_norm": 2.5502216815948486, + "learning_rate": 1.25e-07, + "loss": 0.028, + "step": 125 + }, + { + "epoch": 0.027129679869777535, + "grad_norm": 6.838224411010742, + "learning_rate": 1.5e-07, + "loss": 0.0214, + "step": 150 + }, + { + "epoch": 0.03165129318140713, + "grad_norm": 6.352050304412842, + "learning_rate": 1.75e-07, + "loss": 0.0336, + "step": 175 + }, + { + "epoch": 0.03617290649303671, + "grad_norm": 8.222951889038086, + "learning_rate": 2e-07, + "loss": 0.028, + "step": 200 + }, + { + "epoch": 0.040694519804666304, + "grad_norm": 5.666439533233643, + "learning_rate": 2.25e-07, + "loss": 0.0295, + "step": 225 + }, + { + "epoch": 0.045216133116295895, + "grad_norm": 0.2596105635166168, + "learning_rate": 2.5e-07, + "loss": 0.0338, + "step": 250 + }, + { + "epoch": 0.049737746427925486, + "grad_norm": 4.480187892913818, + "learning_rate": 2.75e-07, + "loss": 0.0365, + "step": 275 + }, + { + "epoch": 0.05425935973955507, + "grad_norm": 
0.934718132019043, + "learning_rate": 3e-07, + "loss": 0.0284, + "step": 300 + }, + { + "epoch": 0.05878097305118466, + "grad_norm": 1.4461249113082886, + "learning_rate": 3.25e-07, + "loss": 0.0199, + "step": 325 + }, + { + "epoch": 0.06330258636281426, + "grad_norm": 3.453277587890625, + "learning_rate": 3.5e-07, + "loss": 0.0401, + "step": 350 + }, + { + "epoch": 0.06782419967444384, + "grad_norm": 6.596461772918701, + "learning_rate": 3.75e-07, + "loss": 0.0522, + "step": 375 + }, + { + "epoch": 0.07234581298607343, + "grad_norm": 0.7677063345909119, + "learning_rate": 3.99e-07, + "loss": 0.1004, + "step": 400 + }, + { + "epoch": 0.07686742629770302, + "grad_norm": 7.2375264167785645, + "learning_rate": 4.24e-07, + "loss": 0.0313, + "step": 425 + }, + { + "epoch": 0.08138903960933261, + "grad_norm": 4.461888790130615, + "learning_rate": 4.49e-07, + "loss": 0.0358, + "step": 450 + }, + { + "epoch": 0.0859106529209622, + "grad_norm": 2.9905307292938232, + "learning_rate": 4.7399999999999993e-07, + "loss": 0.035, + "step": 475 + }, + { + "epoch": 0.09043226623259179, + "grad_norm": 2.32279896736145, + "learning_rate": 4.99e-07, + "loss": 0.0296, + "step": 500 + }, + { + "epoch": 0.09495387954422138, + "grad_norm": 8.56678581237793, + "learning_rate": 5.24e-07, + "loss": 0.0357, + "step": 525 + }, + { + "epoch": 0.09947549285585097, + "grad_norm": 2.7547428607940674, + "learning_rate": 5.490000000000001e-07, + "loss": 0.0378, + "step": 550 + }, + { + "epoch": 0.10399710616748056, + "grad_norm": 1.2645742893218994, + "learning_rate": 5.739999999999999e-07, + "loss": 0.0434, + "step": 575 + }, + { + "epoch": 0.10851871947911014, + "grad_norm": 2.233053684234619, + "learning_rate": 5.989999999999999e-07, + "loss": 0.042, + "step": 600 + }, + { + "epoch": 0.11304033279073973, + "grad_norm": 13.10155200958252, + "learning_rate": 6.24e-07, + "loss": 0.0454, + "step": 625 + }, + { + "epoch": 0.11756194610236932, + "grad_norm": 9.450150489807129, + "learning_rate": 
6.49e-07, + "loss": 0.0331, + "step": 650 + }, + { + "epoch": 0.12208355941399891, + "grad_norm": 0.5919239521026611, + "learning_rate": 6.74e-07, + "loss": 0.033, + "step": 675 + }, + { + "epoch": 0.12660517272562852, + "grad_norm": 24.587112426757812, + "learning_rate": 6.989999999999999e-07, + "loss": 0.0311, + "step": 700 + }, + { + "epoch": 0.13112678603725808, + "grad_norm": 19.61304473876953, + "learning_rate": 7.24e-07, + "loss": 0.0359, + "step": 725 + }, + { + "epoch": 0.13564839934888767, + "grad_norm": 20.30430030822754, + "learning_rate": 7.489999999999999e-07, + "loss": 0.0531, + "step": 750 + }, + { + "epoch": 0.14017001266051726, + "grad_norm": 16.841527938842773, + "learning_rate": 7.74e-07, + "loss": 0.0761, + "step": 775 + }, + { + "epoch": 0.14469162597214685, + "grad_norm": 18.845434188842773, + "learning_rate": 7.99e-07, + "loss": 0.0871, + "step": 800 + }, + { + "epoch": 0.14921323928377644, + "grad_norm": 14.67888355255127, + "learning_rate": 8.24e-07, + "loss": 0.0414, + "step": 825 + }, + { + "epoch": 0.15373485259540604, + "grad_norm": 6.383974075317383, + "learning_rate": 8.489999999999999e-07, + "loss": 0.0344, + "step": 850 + }, + { + "epoch": 0.15825646590703563, + "grad_norm": 7.403707504272461, + "learning_rate": 8.739999999999999e-07, + "loss": 0.0317, + "step": 875 + }, + { + "epoch": 0.16277807921866522, + "grad_norm": 0.7672566771507263, + "learning_rate": 8.99e-07, + "loss": 0.0319, + "step": 900 + }, + { + "epoch": 0.1672996925302948, + "grad_norm": 3.935624599456787, + "learning_rate": 9.24e-07, + "loss": 0.0364, + "step": 925 + }, + { + "epoch": 0.1718213058419244, + "grad_norm": 6.021194934844971, + "learning_rate": 9.489999999999999e-07, + "loss": 0.0378, + "step": 950 + }, + { + "epoch": 0.176342919153554, + "grad_norm": 0.7235254645347595, + "learning_rate": 9.74e-07, + "loss": 0.0386, + "step": 975 + }, + { + "epoch": 0.18086453246518358, + "grad_norm": 5.369808673858643, + "learning_rate": 9.989999999999999e-07, + 
"loss": 0.0307, + "step": 1000 + }, + { + "epoch": 0.18538614577681317, + "grad_norm": 4.743622303009033, + "learning_rate": 9.995579296371338e-07, + "loss": 0.0555, + "step": 1025 + }, + { + "epoch": 0.18990775908844276, + "grad_norm": 12.76374340057373, + "learning_rate": 9.99097439675815e-07, + "loss": 0.0478, + "step": 1050 + }, + { + "epoch": 0.19442937240007235, + "grad_norm": 4.498536109924316, + "learning_rate": 9.986369497144962e-07, + "loss": 0.0443, + "step": 1075 + }, + { + "epoch": 0.19895098571170194, + "grad_norm": 9.623854637145996, + "learning_rate": 9.981764597531773e-07, + "loss": 0.044, + "step": 1100 + }, + { + "epoch": 0.20347259902333154, + "grad_norm": 1.4003485441207886, + "learning_rate": 9.977159697918585e-07, + "loss": 0.042, + "step": 1125 + }, + { + "epoch": 0.20799421233496113, + "grad_norm": 3.358299493789673, + "learning_rate": 9.972554798305397e-07, + "loss": 0.0422, + "step": 1150 + }, + { + "epoch": 0.21251582564659072, + "grad_norm": 33.19310760498047, + "learning_rate": 9.967949898692208e-07, + "loss": 0.0443, + "step": 1175 + }, + { + "epoch": 0.21703743895822028, + "grad_norm": 7.122214317321777, + "learning_rate": 9.96334499907902e-07, + "loss": 0.0962, + "step": 1200 + }, + { + "epoch": 0.22155905226984987, + "grad_norm": 8.756924629211426, + "learning_rate": 9.958740099465832e-07, + "loss": 0.0399, + "step": 1225 + }, + { + "epoch": 0.22608066558147946, + "grad_norm": 3.779580593109131, + "learning_rate": 9.954135199852643e-07, + "loss": 0.035, + "step": 1250 + }, + { + "epoch": 0.23060227889310905, + "grad_norm": 15.683145523071289, + "learning_rate": 9.949530300239455e-07, + "loss": 0.042, + "step": 1275 + }, + { + "epoch": 0.23512389220473864, + "grad_norm": 12.767990112304688, + "learning_rate": 9.944925400626267e-07, + "loss": 0.0517, + "step": 1300 + }, + { + "epoch": 0.23964550551636823, + "grad_norm": 0.5977092981338501, + "learning_rate": 9.940320501013078e-07, + "loss": 0.0266, + "step": 1325 + }, + { + "epoch": 
0.24416711882799783, + "grad_norm": 2.613328218460083, + "learning_rate": 9.935715601399888e-07, + "loss": 0.0302, + "step": 1350 + }, + { + "epoch": 0.24868873213962742, + "grad_norm": 3.43841290473938, + "learning_rate": 9.9311107017867e-07, + "loss": 0.0604, + "step": 1375 + }, + { + "epoch": 0.25321034545125704, + "grad_norm": 1.4975024461746216, + "learning_rate": 9.926505802173513e-07, + "loss": 0.0524, + "step": 1400 + }, + { + "epoch": 0.25773195876288657, + "grad_norm": 6.18322229385376, + "learning_rate": 9.921900902560325e-07, + "loss": 0.0318, + "step": 1425 + }, + { + "epoch": 0.26225357207451616, + "grad_norm": 3.4327890872955322, + "learning_rate": 9.917296002947134e-07, + "loss": 0.0449, + "step": 1450 + }, + { + "epoch": 0.26677518538614575, + "grad_norm": 19.299654006958008, + "learning_rate": 9.912691103333946e-07, + "loss": 0.0371, + "step": 1475 + }, + { + "epoch": 0.27129679869777534, + "grad_norm": 17.62229347229004, + "learning_rate": 9.908086203720758e-07, + "loss": 0.046, + "step": 1500 + }, + { + "epoch": 0.27581841200940493, + "grad_norm": 1.671424150466919, + "learning_rate": 9.90348130410757e-07, + "loss": 0.0391, + "step": 1525 + }, + { + "epoch": 0.2803400253210345, + "grad_norm": 21.561243057250977, + "learning_rate": 9.898876404494381e-07, + "loss": 0.0729, + "step": 1550 + }, + { + "epoch": 0.2848616386326641, + "grad_norm": 7.39694356918335, + "learning_rate": 9.894271504881193e-07, + "loss": 0.052, + "step": 1575 + }, + { + "epoch": 0.2893832519442937, + "grad_norm": 1.9565777778625488, + "learning_rate": 9.889850801252534e-07, + "loss": 0.082, + "step": 1600 + }, + { + "epoch": 0.2939048652559233, + "grad_norm": 1.8609459400177002, + "learning_rate": 9.885245901639343e-07, + "loss": 0.0446, + "step": 1625 + }, + { + "epoch": 0.2984264785675529, + "grad_norm": 4.542464733123779, + "learning_rate": 9.880641002026155e-07, + "loss": 0.0353, + "step": 1650 + }, + { + "epoch": 0.3029480918791825, + "grad_norm": 0.291640967130661, + 
"learning_rate": 9.876036102412967e-07, + "loss": 0.0359, + "step": 1675 + }, + { + "epoch": 0.30746970519081207, + "grad_norm": 0.6001373529434204, + "learning_rate": 9.871431202799778e-07, + "loss": 0.031, + "step": 1700 + }, + { + "epoch": 0.31199131850244166, + "grad_norm": 12.14961051940918, + "learning_rate": 9.86682630318659e-07, + "loss": 0.0447, + "step": 1725 + }, + { + "epoch": 0.31651293181407125, + "grad_norm": 23.949186325073242, + "learning_rate": 9.862221403573402e-07, + "loss": 0.0479, + "step": 1750 + }, + { + "epoch": 0.32103454512570084, + "grad_norm": 4.505954742431641, + "learning_rate": 9.857616503960213e-07, + "loss": 0.0481, + "step": 1775 + }, + { + "epoch": 0.32555615843733043, + "grad_norm": 2.596393585205078, + "learning_rate": 9.853011604347025e-07, + "loss": 0.0482, + "step": 1800 + }, + { + "epoch": 0.33007777174896, + "grad_norm": 8.913126945495605, + "learning_rate": 9.848406704733837e-07, + "loss": 0.0612, + "step": 1825 + }, + { + "epoch": 0.3345993850605896, + "grad_norm": 5.877014636993408, + "learning_rate": 9.843801805120648e-07, + "loss": 0.0476, + "step": 1850 + }, + { + "epoch": 0.3391209983722192, + "grad_norm": 22.622203826904297, + "learning_rate": 9.83919690550746e-07, + "loss": 0.0447, + "step": 1875 + }, + { + "epoch": 0.3436426116838488, + "grad_norm": 18.984943389892578, + "learning_rate": 9.834592005894271e-07, + "loss": 0.0543, + "step": 1900 + }, + { + "epoch": 0.3481642249954784, + "grad_norm": 13.041614532470703, + "learning_rate": 9.829987106281083e-07, + "loss": 0.0393, + "step": 1925 + }, + { + "epoch": 0.352685838307108, + "grad_norm": 23.434789657592773, + "learning_rate": 9.825382206667893e-07, + "loss": 0.0488, + "step": 1950 + }, + { + "epoch": 0.35720745161873757, + "grad_norm": 7.045641899108887, + "learning_rate": 9.820777307054706e-07, + "loss": 0.0557, + "step": 1975 + }, + { + "epoch": 0.36172906493036716, + "grad_norm": 17.288244247436523, + "learning_rate": 9.816172407441518e-07, + "loss": 
0.0821, + "step": 2000 + }, + { + "epoch": 0.36625067824199675, + "grad_norm": 4.284151077270508, + "learning_rate": 9.81156750782833e-07, + "loss": 0.0526, + "step": 2025 + }, + { + "epoch": 0.37077229155362634, + "grad_norm": 9.124343872070312, + "learning_rate": 9.80696260821514e-07, + "loss": 0.0309, + "step": 2050 + }, + { + "epoch": 0.37529390486525593, + "grad_norm": 4.401906490325928, + "learning_rate": 9.80235770860195e-07, + "loss": 0.046, + "step": 2075 + }, + { + "epoch": 0.3798155181768855, + "grad_norm": 0.29334700107574463, + "learning_rate": 9.797752808988765e-07, + "loss": 0.0349, + "step": 2100 + }, + { + "epoch": 0.3843371314885151, + "grad_norm": 4.6845197677612305, + "learning_rate": 9.793147909375574e-07, + "loss": 0.0398, + "step": 2125 + }, + { + "epoch": 0.3888587448001447, + "grad_norm": 10.254193305969238, + "learning_rate": 9.788543009762386e-07, + "loss": 0.0441, + "step": 2150 + }, + { + "epoch": 0.3933803581117743, + "grad_norm": 9.839592933654785, + "learning_rate": 9.783938110149198e-07, + "loss": 0.0427, + "step": 2175 + }, + { + "epoch": 0.3979019714234039, + "grad_norm": 7.145606517791748, + "learning_rate": 9.77933321053601e-07, + "loss": 0.0364, + "step": 2200 + }, + { + "epoch": 0.4024235847350335, + "grad_norm": 13.468842506408691, + "learning_rate": 9.77472831092282e-07, + "loss": 0.0578, + "step": 2225 + }, + { + "epoch": 0.40694519804666307, + "grad_norm": 5.4638519287109375, + "learning_rate": 9.770123411309633e-07, + "loss": 0.0596, + "step": 2250 + }, + { + "epoch": 0.41146681135829266, + "grad_norm": 3.9997127056121826, + "learning_rate": 9.765518511696444e-07, + "loss": 0.0304, + "step": 2275 + }, + { + "epoch": 0.41598842466992225, + "grad_norm": 1.2957385778427124, + "learning_rate": 9.760913612083256e-07, + "loss": 0.0308, + "step": 2300 + }, + { + "epoch": 0.42051003798155184, + "grad_norm": 15.630707740783691, + "learning_rate": 9.756308712470068e-07, + "loss": 0.0538, + "step": 2325 + }, + { + "epoch": 
0.42503165129318143, + "grad_norm": 0.9495472311973572, + "learning_rate": 9.75170381285688e-07, + "loss": 0.0484, + "step": 2350 + }, + { + "epoch": 0.42955326460481097, + "grad_norm": 8.299753189086914, + "learning_rate": 9.74709891324369e-07, + "loss": 0.0941, + "step": 2375 + }, + { + "epoch": 0.43407487791644056, + "grad_norm": 2.055479049682617, + "learning_rate": 9.742494013630503e-07, + "loss": 0.0871, + "step": 2400 + }, + { + "epoch": 0.43859649122807015, + "grad_norm": 2.5821428298950195, + "learning_rate": 9.737889114017314e-07, + "loss": 0.0319, + "step": 2425 + }, + { + "epoch": 0.44311810453969974, + "grad_norm": 15.306851387023926, + "learning_rate": 9.733284214404126e-07, + "loss": 0.0397, + "step": 2450 + }, + { + "epoch": 0.44763971785132933, + "grad_norm": 12.804640769958496, + "learning_rate": 9.728679314790938e-07, + "loss": 0.0397, + "step": 2475 + }, + { + "epoch": 0.4521613311629589, + "grad_norm": 2.8858513832092285, + "learning_rate": 9.72407441517775e-07, + "loss": 0.0423, + "step": 2500 + }, + { + "epoch": 0.4566829444745885, + "grad_norm": 5.125776767730713, + "learning_rate": 9.71946951556456e-07, + "loss": 0.0372, + "step": 2525 + }, + { + "epoch": 0.4612045577862181, + "grad_norm": 3.445122718811035, + "learning_rate": 9.71486461595137e-07, + "loss": 0.0597, + "step": 2550 + }, + { + "epoch": 0.4657261710978477, + "grad_norm": 1.8609939813613892, + "learning_rate": 9.710259716338184e-07, + "loss": 0.0356, + "step": 2575 + }, + { + "epoch": 0.4702477844094773, + "grad_norm": 6.8225998878479, + "learning_rate": 9.705654816724996e-07, + "loss": 0.0341, + "step": 2600 + }, + { + "epoch": 0.4747693977211069, + "grad_norm": 20.26767349243164, + "learning_rate": 9.701049917111808e-07, + "loss": 0.049, + "step": 2625 + }, + { + "epoch": 0.47929101103273647, + "grad_norm": 5.631345748901367, + "learning_rate": 9.696445017498617e-07, + "loss": 0.0616, + "step": 2650 + }, + { + "epoch": 0.48381262434436606, + "grad_norm": 20.55868148803711, + 
"learning_rate": 9.691840117885429e-07, + "loss": 0.062, + "step": 2675 + }, + { + "epoch": 0.48833423765599565, + "grad_norm": 32.73431396484375, + "learning_rate": 9.687235218272243e-07, + "loss": 0.0653, + "step": 2700 + }, + { + "epoch": 0.49285585096762524, + "grad_norm": 1.6700994968414307, + "learning_rate": 9.682630318659052e-07, + "loss": 0.0486, + "step": 2725 + }, + { + "epoch": 0.49737746427925483, + "grad_norm": 14.762917518615723, + "learning_rate": 9.678025419045864e-07, + "loss": 0.0641, + "step": 2750 + }, + { + "epoch": 0.5018990775908845, + "grad_norm": 1.297577977180481, + "learning_rate": 9.673420519432675e-07, + "loss": 0.0625, + "step": 2775 + }, + { + "epoch": 0.5064206909025141, + "grad_norm": 0.5399802327156067, + "learning_rate": 9.668815619819487e-07, + "loss": 0.0931, + "step": 2800 + }, + { + "epoch": 0.5109423042141437, + "grad_norm": 7.880299091339111, + "learning_rate": 9.664210720206299e-07, + "loss": 0.0397, + "step": 2825 + }, + { + "epoch": 0.5154639175257731, + "grad_norm": 5.446402549743652, + "learning_rate": 9.65960582059311e-07, + "loss": 0.0327, + "step": 2850 + }, + { + "epoch": 0.5199855308374027, + "grad_norm": 10.012272834777832, + "learning_rate": 9.655000920979922e-07, + "loss": 0.0405, + "step": 2875 + }, + { + "epoch": 0.5245071441490323, + "grad_norm": 4.505229949951172, + "learning_rate": 9.650396021366734e-07, + "loss": 0.0372, + "step": 2900 + }, + { + "epoch": 0.5290287574606619, + "grad_norm": 0.4095817804336548, + "learning_rate": 9.645791121753545e-07, + "loss": 0.0447, + "step": 2925 + }, + { + "epoch": 0.5335503707722915, + "grad_norm": 1.1292011737823486, + "learning_rate": 9.641186222140357e-07, + "loss": 0.0306, + "step": 2950 + }, + { + "epoch": 0.5380719840839211, + "grad_norm": 2.467514991760254, + "learning_rate": 9.636581322527169e-07, + "loss": 0.056, + "step": 2975 + }, + { + "epoch": 0.5425935973955507, + "grad_norm": 2.929332971572876, + "learning_rate": 9.63197642291398e-07, + "loss": 0.0551, 
+ "step": 3000 + }, + { + "epoch": 0.5471152107071803, + "grad_norm": 4.477908134460449, + "learning_rate": 9.627371523300792e-07, + "loss": 0.0396, + "step": 3025 + }, + { + "epoch": 0.5516368240188099, + "grad_norm": 2.9823644161224365, + "learning_rate": 9.622766623687604e-07, + "loss": 0.0394, + "step": 3050 + }, + { + "epoch": 0.5561584373304395, + "grad_norm": 19.203933715820312, + "learning_rate": 9.618161724074415e-07, + "loss": 0.0474, + "step": 3075 + }, + { + "epoch": 0.560680050642069, + "grad_norm": 3.774156093597412, + "learning_rate": 9.613556824461227e-07, + "loss": 0.0459, + "step": 3100 + }, + { + "epoch": 0.5652016639536986, + "grad_norm": 8.178613662719727, + "learning_rate": 9.608951924848039e-07, + "loss": 0.0704, + "step": 3125 + }, + { + "epoch": 0.5697232772653282, + "grad_norm": 21.61182403564453, + "learning_rate": 9.604347025234848e-07, + "loss": 0.0728, + "step": 3150 + }, + { + "epoch": 0.5742448905769578, + "grad_norm": 19.002914428710938, + "learning_rate": 9.599742125621662e-07, + "loss": 0.0839, + "step": 3175 + }, + { + "epoch": 0.5787665038885874, + "grad_norm": 4.789153099060059, + "learning_rate": 9.595137226008474e-07, + "loss": 0.0816, + "step": 3200 + }, + { + "epoch": 0.583288117200217, + "grad_norm": 0.4933367669582367, + "learning_rate": 9.590532326395283e-07, + "loss": 0.0534, + "step": 3225 + }, + { + "epoch": 0.5878097305118466, + "grad_norm": 1.4477636814117432, + "learning_rate": 9.585927426782095e-07, + "loss": 0.0479, + "step": 3250 + }, + { + "epoch": 0.5923313438234762, + "grad_norm": 5.407076358795166, + "learning_rate": 9.581322527168907e-07, + "loss": 0.0328, + "step": 3275 + }, + { + "epoch": 0.5968529571351058, + "grad_norm": 6.760631084442139, + "learning_rate": 9.57671762755572e-07, + "loss": 0.0421, + "step": 3300 + }, + { + "epoch": 0.6013745704467354, + "grad_norm": 11.133447647094727, + "learning_rate": 9.57211272794253e-07, + "loss": 0.0315, + "step": 3325 + }, + { + "epoch": 0.605896183758365, + 
"grad_norm": 8.827392578125, + "learning_rate": 9.567507828329342e-07, + "loss": 0.0533, + "step": 3350 + }, + { + "epoch": 0.6104177970699945, + "grad_norm": 4.74755334854126, + "learning_rate": 9.562902928716153e-07, + "loss": 0.0505, + "step": 3375 + }, + { + "epoch": 0.6149394103816241, + "grad_norm": 6.094758987426758, + "learning_rate": 9.558298029102965e-07, + "loss": 0.0374, + "step": 3400 + }, + { + "epoch": 0.6194610236932537, + "grad_norm": 7.404526710510254, + "learning_rate": 9.553693129489777e-07, + "loss": 0.0274, + "step": 3425 + }, + { + "epoch": 0.6239826370048833, + "grad_norm": 2.237982749938965, + "learning_rate": 9.549088229876588e-07, + "loss": 0.0395, + "step": 3450 + }, + { + "epoch": 0.6285042503165129, + "grad_norm": 20.463314056396484, + "learning_rate": 9.5444833302634e-07, + "loss": 0.0596, + "step": 3475 + }, + { + "epoch": 0.6330258636281425, + "grad_norm": 12.629451751708984, + "learning_rate": 9.539878430650212e-07, + "loss": 0.0329, + "step": 3500 + }, + { + "epoch": 0.6375474769397721, + "grad_norm": 19.752243041992188, + "learning_rate": 9.535273531037022e-07, + "loss": 0.0343, + "step": 3525 + }, + { + "epoch": 0.6420690902514017, + "grad_norm": 7.476962089538574, + "learning_rate": 9.530668631423835e-07, + "loss": 0.0724, + "step": 3550 + }, + { + "epoch": 0.6465907035630313, + "grad_norm": 31.683101654052734, + "learning_rate": 9.526063731810647e-07, + "loss": 0.07, + "step": 3575 + }, + { + "epoch": 0.6511123168746609, + "grad_norm": 3.1800105571746826, + "learning_rate": 9.521458832197457e-07, + "loss": 0.1358, + "step": 3600 + }, + { + "epoch": 0.6556339301862905, + "grad_norm": 7.005350589752197, + "learning_rate": 9.516853932584269e-07, + "loss": 0.0643, + "step": 3625 + }, + { + "epoch": 0.66015554349792, + "grad_norm": 5.049868583679199, + "learning_rate": 9.51224903297108e-07, + "loss": 0.0469, + "step": 3650 + }, + { + "epoch": 0.6646771568095496, + "grad_norm": 6.004695415496826, + "learning_rate": 
9.507644133357893e-07, + "loss": 0.0377, + "step": 3675 + }, + { + "epoch": 0.6691987701211792, + "grad_norm": 14.023488998413086, + "learning_rate": 9.503039233744704e-07, + "loss": 0.0457, + "step": 3700 + }, + { + "epoch": 0.6737203834328088, + "grad_norm": 0.5092989802360535, + "learning_rate": 9.498434334131515e-07, + "loss": 0.0467, + "step": 3725 + }, + { + "epoch": 0.6782419967444384, + "grad_norm": 9.694477081298828, + "learning_rate": 9.493829434518327e-07, + "loss": 0.0307, + "step": 3750 + }, + { + "epoch": 0.682763610056068, + "grad_norm": 8.313312530517578, + "learning_rate": 9.489224534905139e-07, + "loss": 0.0602, + "step": 3775 + }, + { + "epoch": 0.6872852233676976, + "grad_norm": 9.547676086425781, + "learning_rate": 9.48461963529195e-07, + "loss": 0.051, + "step": 3800 + }, + { + "epoch": 0.6918068366793272, + "grad_norm": 5.230764865875244, + "learning_rate": 9.480014735678762e-07, + "loss": 0.0463, + "step": 3825 + }, + { + "epoch": 0.6963284499909568, + "grad_norm": 6.57362699508667, + "learning_rate": 9.475409836065573e-07, + "loss": 0.0564, + "step": 3850 + }, + { + "epoch": 0.7008500633025864, + "grad_norm": 8.198570251464844, + "learning_rate": 9.470804936452384e-07, + "loss": 0.0345, + "step": 3875 + }, + { + "epoch": 0.705371676614216, + "grad_norm": 12.034746170043945, + "learning_rate": 9.466200036839197e-07, + "loss": 0.0728, + "step": 3900 + }, + { + "epoch": 0.7098932899258455, + "grad_norm": 14.528310775756836, + "learning_rate": 9.461595137226009e-07, + "loss": 0.0473, + "step": 3925 + }, + { + "epoch": 0.7144149032374751, + "grad_norm": 1.0361833572387695, + "learning_rate": 9.456990237612819e-07, + "loss": 0.0566, + "step": 3950 + }, + { + "epoch": 0.7189365165491047, + "grad_norm": 18.10868263244629, + "learning_rate": 9.452385337999631e-07, + "loss": 0.0647, + "step": 3975 + }, + { + "epoch": 0.7234581298607343, + "grad_norm": 0.4723142981529236, + "learning_rate": 9.447780438386443e-07, + "loss": 0.0936, + "step": 4000 + }, 
+ { + "epoch": 0.7279797431723639, + "grad_norm": 2.1907668113708496, + "learning_rate": 9.443175538773254e-07, + "loss": 0.0548, + "step": 4025 + }, + { + "epoch": 0.7325013564839935, + "grad_norm": 9.852395057678223, + "learning_rate": 9.438570639160066e-07, + "loss": 0.0568, + "step": 4050 + }, + { + "epoch": 0.7370229697956231, + "grad_norm": 4.197534084320068, + "learning_rate": 9.433965739546878e-07, + "loss": 0.0393, + "step": 4075 + }, + { + "epoch": 0.7415445831072527, + "grad_norm": 2.691697835922241, + "learning_rate": 9.429360839933689e-07, + "loss": 0.0486, + "step": 4100 + }, + { + "epoch": 0.7460661964188823, + "grad_norm": 6.5368452072143555, + "learning_rate": 9.4247559403205e-07, + "loss": 0.0346, + "step": 4125 + }, + { + "epoch": 0.7505878097305119, + "grad_norm": 5.945896148681641, + "learning_rate": 9.420151040707313e-07, + "loss": 0.0583, + "step": 4150 + }, + { + "epoch": 0.7551094230421415, + "grad_norm": 0.8333092331886292, + "learning_rate": 9.415546141094124e-07, + "loss": 0.0532, + "step": 4175 + }, + { + "epoch": 0.759631036353771, + "grad_norm": 6.2607879638671875, + "learning_rate": 9.410941241480935e-07, + "loss": 0.0341, + "step": 4200 + }, + { + "epoch": 0.7641526496654006, + "grad_norm": 20.586143493652344, + "learning_rate": 9.406336341867747e-07, + "loss": 0.0403, + "step": 4225 + }, + { + "epoch": 0.7686742629770302, + "grad_norm": 7.338830947875977, + "learning_rate": 9.401731442254558e-07, + "loss": 0.0727, + "step": 4250 + }, + { + "epoch": 0.7731958762886598, + "grad_norm": 0.2722490429878235, + "learning_rate": 9.39712654264137e-07, + "loss": 0.0489, + "step": 4275 + }, + { + "epoch": 0.7777174896002894, + "grad_norm": 0.15249581634998322, + "learning_rate": 9.392521643028182e-07, + "loss": 0.0519, + "step": 4300 + }, + { + "epoch": 0.782239102911919, + "grad_norm": 2.320035457611084, + "learning_rate": 9.38810093939952e-07, + "loss": 0.1012, + "step": 4325 + }, + { + "epoch": 0.7867607162235486, + "grad_norm": 
21.31127166748047, + "learning_rate": 9.383496039786333e-07, + "loss": 0.0479, + "step": 4350 + }, + { + "epoch": 0.7912823295351782, + "grad_norm": 0.8797614574432373, + "learning_rate": 9.378891140173144e-07, + "loss": 0.054, + "step": 4375 + }, + { + "epoch": 0.7958039428468078, + "grad_norm": 12.751317977905273, + "learning_rate": 9.374286240559955e-07, + "loss": 0.1115, + "step": 4400 + }, + { + "epoch": 0.8003255561584374, + "grad_norm": 3.1984500885009766, + "learning_rate": 9.369681340946767e-07, + "loss": 0.0431, + "step": 4425 + }, + { + "epoch": 0.804847169470067, + "grad_norm": 4.108559608459473, + "learning_rate": 9.365076441333579e-07, + "loss": 0.0326, + "step": 4450 + }, + { + "epoch": 0.8093687827816965, + "grad_norm": 9.827606201171875, + "learning_rate": 9.36047154172039e-07, + "loss": 0.0454, + "step": 4475 + }, + { + "epoch": 0.8138903960933261, + "grad_norm": 6.922112941741943, + "learning_rate": 9.355866642107202e-07, + "loss": 0.0472, + "step": 4500 + }, + { + "epoch": 0.8184120094049557, + "grad_norm": 3.6166677474975586, + "learning_rate": 9.351261742494014e-07, + "loss": 0.0495, + "step": 4525 + }, + { + "epoch": 0.8229336227165853, + "grad_norm": 6.462372303009033, + "learning_rate": 9.346656842880824e-07, + "loss": 0.0273, + "step": 4550 + }, + { + "epoch": 0.8274552360282149, + "grad_norm": 6.296558856964111, + "learning_rate": 9.342051943267636e-07, + "loss": 0.0557, + "step": 4575 + }, + { + "epoch": 0.8319768493398445, + "grad_norm": 4.71564245223999, + "learning_rate": 9.337447043654449e-07, + "loss": 0.0524, + "step": 4600 + }, + { + "epoch": 0.8364984626514741, + "grad_norm": 5.095015048980713, + "learning_rate": 9.332842144041259e-07, + "loss": 0.0468, + "step": 4625 + }, + { + "epoch": 0.8410200759631037, + "grad_norm": 1.5702929496765137, + "learning_rate": 9.328237244428071e-07, + "loss": 0.0524, + "step": 4650 + }, + { + "epoch": 0.8455416892747333, + "grad_norm": 13.924525260925293, + "learning_rate": 9.323632344814883e-07, 
+ "loss": 0.0681, + "step": 4675 + }, + { + "epoch": 0.8500633025863629, + "grad_norm": 1.5610660314559937, + "learning_rate": 9.319027445201694e-07, + "loss": 0.0361, + "step": 4700 + }, + { + "epoch": 0.8545849158979925, + "grad_norm": 15.312824249267578, + "learning_rate": 9.314422545588506e-07, + "loss": 0.0799, + "step": 4725 + }, + { + "epoch": 0.8591065292096219, + "grad_norm": 12.912773132324219, + "learning_rate": 9.309817645975318e-07, + "loss": 0.0768, + "step": 4750 + }, + { + "epoch": 0.8636281425212515, + "grad_norm": 0.2809099853038788, + "learning_rate": 9.305212746362129e-07, + "loss": 0.0832, + "step": 4775 + }, + { + "epoch": 0.8681497558328811, + "grad_norm": 16.709447860717773, + "learning_rate": 9.30060784674894e-07, + "loss": 0.1101, + "step": 4800 + }, + { + "epoch": 0.8726713691445107, + "grad_norm": 3.765148401260376, + "learning_rate": 9.296002947135751e-07, + "loss": 0.0513, + "step": 4825 + }, + { + "epoch": 0.8771929824561403, + "grad_norm": 5.886707782745361, + "learning_rate": 9.291398047522564e-07, + "loss": 0.0414, + "step": 4850 + }, + { + "epoch": 0.8817145957677699, + "grad_norm": 5.642549991607666, + "learning_rate": 9.286793147909375e-07, + "loss": 0.0418, + "step": 4875 + }, + { + "epoch": 0.8862362090793995, + "grad_norm": 2.7111074924468994, + "learning_rate": 9.282188248296186e-07, + "loss": 0.0352, + "step": 4900 + }, + { + "epoch": 0.8907578223910291, + "grad_norm": 8.905896186828613, + "learning_rate": 9.277583348682998e-07, + "loss": 0.0417, + "step": 4925 + }, + { + "epoch": 0.8952794357026587, + "grad_norm": 5.965810775756836, + "learning_rate": 9.27297844906981e-07, + "loss": 0.056, + "step": 4950 + }, + { + "epoch": 0.8998010490142883, + "grad_norm": 14.578080177307129, + "learning_rate": 9.268373549456621e-07, + "loss": 0.054, + "step": 4975 + }, + { + "epoch": 0.9043226623259178, + "grad_norm": 8.001260757446289, + "learning_rate": 9.263768649843433e-07, + "loss": 0.0379, + "step": 5000 + }, + { + "epoch": 
0.9088442756375474, + "grad_norm": 10.563271522521973, + "learning_rate": 9.259163750230245e-07, + "loss": 0.0492, + "step": 5025 + }, + { + "epoch": 0.913365888949177, + "grad_norm": 6.896139621734619, + "learning_rate": 9.254558850617055e-07, + "loss": 0.0541, + "step": 5050 + }, + { + "epoch": 0.9178875022608066, + "grad_norm": 0.6722992658615112, + "learning_rate": 9.249953951003868e-07, + "loss": 0.0523, + "step": 5075 + }, + { + "epoch": 0.9224091155724362, + "grad_norm": 0.5435565710067749, + "learning_rate": 9.24534905139068e-07, + "loss": 0.0554, + "step": 5100 + }, + { + "epoch": 0.9269307288840658, + "grad_norm": 17.15158462524414, + "learning_rate": 9.240744151777491e-07, + "loss": 0.1066, + "step": 5125 + }, + { + "epoch": 0.9314523421956954, + "grad_norm": 4.256832599639893, + "learning_rate": 9.236139252164302e-07, + "loss": 0.0761, + "step": 5150 + }, + { + "epoch": 0.935973955507325, + "grad_norm": 1.2044847011566162, + "learning_rate": 9.231534352551114e-07, + "loss": 0.05, + "step": 5175 + }, + { + "epoch": 0.9404955688189546, + "grad_norm": 36.5662956237793, + "learning_rate": 9.226929452937926e-07, + "loss": 0.1034, + "step": 5200 + }, + { + "epoch": 0.9450171821305842, + "grad_norm": 6.578591823577881, + "learning_rate": 9.222324553324737e-07, + "loss": 0.0497, + "step": 5225 + }, + { + "epoch": 0.9495387954422138, + "grad_norm": 0.6094868779182434, + "learning_rate": 9.217719653711549e-07, + "loss": 0.0396, + "step": 5250 + }, + { + "epoch": 0.9540604087538433, + "grad_norm": 2.0846738815307617, + "learning_rate": 9.21311475409836e-07, + "loss": 0.0271, + "step": 5275 + }, + { + "epoch": 0.9585820220654729, + "grad_norm": 2.4080617427825928, + "learning_rate": 9.208509854485171e-07, + "loss": 0.0568, + "step": 5300 + }, + { + "epoch": 0.9631036353771025, + "grad_norm": 4.527153968811035, + "learning_rate": 9.203904954871984e-07, + "loss": 0.0346, + "step": 5325 + }, + { + "epoch": 0.9676252486887321, + "grad_norm": 4.270484924316406, + 
"learning_rate": 9.199300055258795e-07, + "loss": 0.0649, + "step": 5350 + }, + { + "epoch": 0.9721468620003617, + "grad_norm": 0.19271469116210938, + "learning_rate": 9.194695155645607e-07, + "loss": 0.0349, + "step": 5375 + }, + { + "epoch": 0.9766684753119913, + "grad_norm": 3.624760389328003, + "learning_rate": 9.190090256032418e-07, + "loss": 0.0332, + "step": 5400 + }, + { + "epoch": 0.9811900886236209, + "grad_norm": 12.83198070526123, + "learning_rate": 9.185485356419229e-07, + "loss": 0.0392, + "step": 5425 + }, + { + "epoch": 0.9857117019352505, + "grad_norm": 2.507450580596924, + "learning_rate": 9.180880456806042e-07, + "loss": 0.0561, + "step": 5450 + }, + { + "epoch": 0.9902333152468801, + "grad_norm": 3.1027333736419678, + "learning_rate": 9.176275557192853e-07, + "loss": 0.0411, + "step": 5475 + }, + { + "epoch": 0.9947549285585097, + "grad_norm": 53.23537826538086, + "learning_rate": 9.171670657579664e-07, + "loss": 0.0892, + "step": 5500 + }, + { + "epoch": 0.9992765418701393, + "grad_norm": 53.92917251586914, + "learning_rate": 9.167065757966476e-07, + "loss": 0.0849, + "step": 5525 + }, + { + "epoch": 1.0, + "eval_loss": 0.19376881420612335, + "eval_runtime": 8685.9066, + "eval_samples_per_second": 1.093, + "eval_steps_per_second": 0.137, + "eval_wer": 0.10807174887892376, + "step": 5529 + }, + { + "epoch": 1.003798155181769, + "grad_norm": 1.1583398580551147, + "learning_rate": 9.162460858353286e-07, + "loss": 0.0418, + "step": 5550 + }, + { + "epoch": 1.0083197684933984, + "grad_norm": 3.819171190261841, + "learning_rate": 9.157855958740099e-07, + "loss": 0.0223, + "step": 5575 + }, + { + "epoch": 1.0128413818050281, + "grad_norm": 12.470620155334473, + "learning_rate": 9.153251059126911e-07, + "loss": 0.0322, + "step": 5600 + }, + { + "epoch": 1.0173629951166576, + "grad_norm": 3.7826218605041504, + "learning_rate": 9.148646159513723e-07, + "loss": 0.0159, + "step": 5625 + }, + { + "epoch": 1.0218846084282873, + "grad_norm": 
3.368657112121582, + "learning_rate": 9.144041259900533e-07, + "loss": 0.0354, + "step": 5650 + }, + { + "epoch": 1.0264062217399168, + "grad_norm": 5.757375717163086, + "learning_rate": 9.139436360287345e-07, + "loss": 0.0212, + "step": 5675 + }, + { + "epoch": 1.0309278350515463, + "grad_norm": 0.24591568112373352, + "learning_rate": 9.134831460674158e-07, + "loss": 0.0354, + "step": 5700 + }, + { + "epoch": 1.035449448363176, + "grad_norm": 1.0728533267974854, + "learning_rate": 9.130226561060968e-07, + "loss": 0.0162, + "step": 5725 + }, + { + "epoch": 1.0399710616748055, + "grad_norm": 4.130620956420898, + "learning_rate": 9.12562166144778e-07, + "loss": 0.0286, + "step": 5750 + }, + { + "epoch": 1.0444926749864352, + "grad_norm": 19.77248764038086, + "learning_rate": 9.121016761834591e-07, + "loss": 0.0346, + "step": 5775 + }, + { + "epoch": 1.0490142882980646, + "grad_norm": 1.552537202835083, + "learning_rate": 9.116411862221404e-07, + "loss": 0.0164, + "step": 5800 + }, + { + "epoch": 1.0535359016096943, + "grad_norm": 0.2987182140350342, + "learning_rate": 9.111806962608215e-07, + "loss": 0.0423, + "step": 5825 + }, + { + "epoch": 1.0580575149213238, + "grad_norm": 2.336726188659668, + "learning_rate": 9.107202062995026e-07, + "loss": 0.0431, + "step": 5850 + }, + { + "epoch": 1.0625791282329535, + "grad_norm": 1.1774622201919556, + "learning_rate": 9.102597163381838e-07, + "loss": 0.0238, + "step": 5875 + }, + { + "epoch": 1.067100741544583, + "grad_norm": 1.2562298774719238, + "learning_rate": 9.097992263768649e-07, + "loss": 0.0502, + "step": 5900 + }, + { + "epoch": 1.0716223548562127, + "grad_norm": 15.588605880737305, + "learning_rate": 9.093387364155461e-07, + "loss": 0.0622, + "step": 5925 + }, + { + "epoch": 1.0761439681678422, + "grad_norm": 2.4600257873535156, + "learning_rate": 9.088782464542273e-07, + "loss": 0.0476, + "step": 5950 + }, + { + "epoch": 1.080665581479472, + "grad_norm": 7.753191947937012, + "learning_rate": 
9.084177564929084e-07, + "loss": 0.035, + "step": 5975 + }, + { + "epoch": 1.0851871947911014, + "grad_norm": 6.821231365203857, + "learning_rate": 9.079572665315895e-07, + "loss": 0.0212, + "step": 6000 + }, + { + "epoch": 1.089708808102731, + "grad_norm": 2.2424540519714355, + "learning_rate": 9.074967765702707e-07, + "loss": 0.0163, + "step": 6025 + }, + { + "epoch": 1.0942304214143606, + "grad_norm": 23.894805908203125, + "learning_rate": 9.07036286608952e-07, + "loss": 0.0329, + "step": 6050 + }, + { + "epoch": 1.0987520347259903, + "grad_norm": 0.27833691239356995, + "learning_rate": 9.06575796647633e-07, + "loss": 0.0378, + "step": 6075 + }, + { + "epoch": 1.1032736480376197, + "grad_norm": 1.7873796224594116, + "learning_rate": 9.061153066863142e-07, + "loss": 0.035, + "step": 6100 + }, + { + "epoch": 1.1077952613492494, + "grad_norm": 8.95418930053711, + "learning_rate": 9.056548167249954e-07, + "loss": 0.0346, + "step": 6125 + }, + { + "epoch": 1.112316874660879, + "grad_norm": 0.6353895664215088, + "learning_rate": 9.051943267636764e-07, + "loss": 0.0225, + "step": 6150 + }, + { + "epoch": 1.1168384879725086, + "grad_norm": 4.148350715637207, + "learning_rate": 9.047338368023577e-07, + "loss": 0.0375, + "step": 6175 + }, + { + "epoch": 1.121360101284138, + "grad_norm": 3.327193021774292, + "learning_rate": 9.042733468410389e-07, + "loss": 0.0269, + "step": 6200 + }, + { + "epoch": 1.1258817145957678, + "grad_norm": 4.31793212890625, + "learning_rate": 9.038128568797199e-07, + "loss": 0.0242, + "step": 6225 + }, + { + "epoch": 1.1304033279073973, + "grad_norm": 20.431732177734375, + "learning_rate": 9.033523669184011e-07, + "loss": 0.0222, + "step": 6250 + }, + { + "epoch": 1.134924941219027, + "grad_norm": 6.784418106079102, + "learning_rate": 9.028918769570823e-07, + "loss": 0.0488, + "step": 6275 + }, + { + "epoch": 1.1394465545306565, + "grad_norm": 21.859533309936523, + "learning_rate": 9.024313869957635e-07, + "loss": 0.0518, + "step": 6300 + }, + { 
+ "epoch": 1.1439681678422862, + "grad_norm": 2.1316771507263184, + "learning_rate": 9.019708970344446e-07, + "loss": 0.0403, + "step": 6325 + }, + { + "epoch": 1.1484897811539156, + "grad_norm": 2.723567247390747, + "learning_rate": 9.015104070731258e-07, + "loss": 0.0487, + "step": 6350 + }, + { + "epoch": 1.1530113944655453, + "grad_norm": 4.712471961975098, + "learning_rate": 9.010499171118069e-07, + "loss": 0.0139, + "step": 6375 + }, + { + "epoch": 1.1575330077771748, + "grad_norm": 2.7237842082977295, + "learning_rate": 9.00589427150488e-07, + "loss": 0.053, + "step": 6400 + }, + { + "epoch": 1.1620546210888045, + "grad_norm": 3.783311605453491, + "learning_rate": 9.001289371891693e-07, + "loss": 0.0447, + "step": 6425 + }, + { + "epoch": 1.166576234400434, + "grad_norm": 4.771798610687256, + "learning_rate": 8.996684472278504e-07, + "loss": 0.0306, + "step": 6450 + }, + { + "epoch": 1.1710978477120637, + "grad_norm": 0.3322123885154724, + "learning_rate": 8.992079572665316e-07, + "loss": 0.024, + "step": 6475 + }, + { + "epoch": 1.1756194610236932, + "grad_norm": 17.802349090576172, + "learning_rate": 8.987658869036655e-07, + "loss": 0.0317, + "step": 6500 + }, + { + "epoch": 1.180141074335323, + "grad_norm": 0.25371697545051575, + "learning_rate": 8.983053969423466e-07, + "loss": 0.0405, + "step": 6525 + }, + { + "epoch": 1.1846626876469524, + "grad_norm": 2.539607524871826, + "learning_rate": 8.978449069810278e-07, + "loss": 0.0199, + "step": 6550 + }, + { + "epoch": 1.189184300958582, + "grad_norm": 2.3740954399108887, + "learning_rate": 8.973844170197089e-07, + "loss": 0.0252, + "step": 6575 + }, + { + "epoch": 1.1937059142702116, + "grad_norm": 2.0089480876922607, + "learning_rate": 8.9692392705839e-07, + "loss": 0.0091, + "step": 6600 + }, + { + "epoch": 1.1982275275818413, + "grad_norm": 0.539943516254425, + "learning_rate": 8.964634370970713e-07, + "loss": 0.0479, + "step": 6625 + }, + { + "epoch": 1.2027491408934707, + "grad_norm": 
15.776597023010254, + "learning_rate": 8.960029471357525e-07, + "loss": 0.0337, + "step": 6650 + }, + { + "epoch": 1.2072707542051004, + "grad_norm": 43.65888977050781, + "learning_rate": 8.955424571744335e-07, + "loss": 0.04, + "step": 6675 + }, + { + "epoch": 1.21179236751673, + "grad_norm": 0.6316529512405396, + "learning_rate": 8.950819672131147e-07, + "loss": 0.0464, + "step": 6700 + }, + { + "epoch": 1.2163139808283596, + "grad_norm": 0.06351311504840851, + "learning_rate": 8.946214772517959e-07, + "loss": 0.0596, + "step": 6725 + }, + { + "epoch": 1.220835594139989, + "grad_norm": 9.572014808654785, + "learning_rate": 8.94160987290477e-07, + "loss": 0.0398, + "step": 6750 + }, + { + "epoch": 1.2253572074516188, + "grad_norm": 9.598252296447754, + "learning_rate": 8.937004973291582e-07, + "loss": 0.0344, + "step": 6775 + }, + { + "epoch": 1.2298788207632483, + "grad_norm": 1.5910439491271973, + "learning_rate": 8.932400073678394e-07, + "loss": 0.0265, + "step": 6800 + }, + { + "epoch": 1.234400434074878, + "grad_norm": 10.684225082397461, + "learning_rate": 8.927795174065205e-07, + "loss": 0.0264, + "step": 6825 + }, + { + "epoch": 1.2389220473865075, + "grad_norm": 11.20118236541748, + "learning_rate": 8.923190274452016e-07, + "loss": 0.0345, + "step": 6850 + }, + { + "epoch": 1.2434436606981372, + "grad_norm": 12.899706840515137, + "learning_rate": 8.918585374838829e-07, + "loss": 0.0316, + "step": 6875 + }, + { + "epoch": 1.2479652740097666, + "grad_norm": 18.632545471191406, + "learning_rate": 8.91398047522564e-07, + "loss": 0.033, + "step": 6900 + }, + { + "epoch": 1.2524868873213963, + "grad_norm": 0.7847491502761841, + "learning_rate": 8.909375575612451e-07, + "loss": 0.0292, + "step": 6925 + }, + { + "epoch": 1.2570085006330258, + "grad_norm": 1.0317540168762207, + "learning_rate": 8.904770675999262e-07, + "loss": 0.0286, + "step": 6950 + }, + { + "epoch": 1.2615301139446555, + "grad_norm": 0.7965870499610901, + "learning_rate": 8.900165776386075e-07, 
+ "loss": 0.0222, + "step": 6975 + }, + { + "epoch": 1.266051727256285, + "grad_norm": 12.810919761657715, + "learning_rate": 8.895560876772886e-07, + "loss": 0.0369, + "step": 7000 + }, + { + "epoch": 1.2705733405679147, + "grad_norm": 12.809525489807129, + "learning_rate": 8.890955977159697e-07, + "loss": 0.0381, + "step": 7025 + }, + { + "epoch": 1.2750949538795442, + "grad_norm": 4.802799224853516, + "learning_rate": 8.886351077546509e-07, + "loss": 0.0274, + "step": 7050 + }, + { + "epoch": 1.279616567191174, + "grad_norm": 37.88527297973633, + "learning_rate": 8.881746177933321e-07, + "loss": 0.0266, + "step": 7075 + }, + { + "epoch": 1.2841381805028034, + "grad_norm": 23.25122833251953, + "learning_rate": 8.877141278320132e-07, + "loss": 0.0487, + "step": 7100 + }, + { + "epoch": 1.2886597938144329, + "grad_norm": 25.70662498474121, + "learning_rate": 8.872536378706944e-07, + "loss": 0.0652, + "step": 7125 + }, + { + "epoch": 1.2931814071260626, + "grad_norm": 3.8632333278656006, + "learning_rate": 8.867931479093756e-07, + "loss": 0.0442, + "step": 7150 + }, + { + "epoch": 1.2977030204376923, + "grad_norm": 1.3248151540756226, + "learning_rate": 8.863326579480566e-07, + "loss": 0.0229, + "step": 7175 + }, + { + "epoch": 1.3022246337493217, + "grad_norm": 1.7153706550598145, + "learning_rate": 8.858721679867378e-07, + "loss": 0.0341, + "step": 7200 + }, + { + "epoch": 1.3067462470609512, + "grad_norm": 8.373719215393066, + "learning_rate": 8.854116780254191e-07, + "loss": 0.0422, + "step": 7225 + }, + { + "epoch": 1.311267860372581, + "grad_norm": 2.403066635131836, + "learning_rate": 8.849511880641001e-07, + "loss": 0.0287, + "step": 7250 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 1.0376137495040894, + "learning_rate": 8.844906981027813e-07, + "loss": 0.0113, + "step": 7275 + }, + { + "epoch": 1.32031108699584, + "grad_norm": 5.192943096160889, + "learning_rate": 8.840302081414625e-07, + "loss": 0.0301, + "step": 7300 + }, + { + "epoch": 
1.3248327003074696, + "grad_norm": 1.6459161043167114, + "learning_rate": 8.835697181801436e-07, + "loss": 0.044, + "step": 7325 + }, + { + "epoch": 1.3293543136190993, + "grad_norm": 9.943346977233887, + "learning_rate": 8.831092282188248e-07, + "loss": 0.0132, + "step": 7350 + }, + { + "epoch": 1.333875926930729, + "grad_norm": 5.858924865722656, + "learning_rate": 8.82648738257506e-07, + "loss": 0.0192, + "step": 7375 + }, + { + "epoch": 1.3383975402423585, + "grad_norm": 6.490448474884033, + "learning_rate": 8.821882482961871e-07, + "loss": 0.0239, + "step": 7400 + }, + { + "epoch": 1.342919153553988, + "grad_norm": 0.2476533055305481, + "learning_rate": 8.817277583348682e-07, + "loss": 0.0463, + "step": 7425 + }, + { + "epoch": 1.3474407668656176, + "grad_norm": 14.549626350402832, + "learning_rate": 8.812672683735494e-07, + "loss": 0.0507, + "step": 7450 + }, + { + "epoch": 1.3519623801772473, + "grad_norm": 2.189336061477661, + "learning_rate": 8.808067784122306e-07, + "loss": 0.0331, + "step": 7475 + }, + { + "epoch": 1.3564839934888768, + "grad_norm": 0.9585964679718018, + "learning_rate": 8.803462884509118e-07, + "loss": 0.0572, + "step": 7500 + }, + { + "epoch": 1.3610056068005063, + "grad_norm": 0.9413008093833923, + "learning_rate": 8.798857984895929e-07, + "loss": 0.0847, + "step": 7525 + }, + { + "epoch": 1.365527220112136, + "grad_norm": 1.1694642305374146, + "learning_rate": 8.79425308528274e-07, + "loss": 0.0355, + "step": 7550 + }, + { + "epoch": 1.3700488334237657, + "grad_norm": 10.240534782409668, + "learning_rate": 8.789648185669552e-07, + "loss": 0.0264, + "step": 7575 + }, + { + "epoch": 1.3745704467353952, + "grad_norm": 0.40524232387542725, + "learning_rate": 8.785043286056364e-07, + "loss": 0.0277, + "step": 7600 + }, + { + "epoch": 1.3790920600470247, + "grad_norm": 0.6719425916671753, + "learning_rate": 8.780438386443175e-07, + "loss": 0.027, + "step": 7625 + }, + { + "epoch": 1.3836136733586544, + "grad_norm": 3.0596706867218018, + 
"learning_rate": 8.775833486829987e-07, + "loss": 0.0251, + "step": 7650 + }, + { + "epoch": 1.388135286670284, + "grad_norm": 3.454193592071533, + "learning_rate": 8.771228587216797e-07, + "loss": 0.0334, + "step": 7675 + }, + { + "epoch": 1.3926568999819136, + "grad_norm": 3.9943997859954834, + "learning_rate": 8.76662368760361e-07, + "loss": 0.0312, + "step": 7700 + }, + { + "epoch": 1.397178513293543, + "grad_norm": 8.968663215637207, + "learning_rate": 8.762018787990422e-07, + "loss": 0.027, + "step": 7725 + }, + { + "epoch": 1.4017001266051727, + "grad_norm": 1.615293025970459, + "learning_rate": 8.757413888377234e-07, + "loss": 0.0246, + "step": 7750 + }, + { + "epoch": 1.4062217399168024, + "grad_norm": 14.491230964660645, + "learning_rate": 8.752808988764044e-07, + "loss": 0.019, + "step": 7775 + }, + { + "epoch": 1.410743353228432, + "grad_norm": 0.586275577545166, + "learning_rate": 8.748204089150856e-07, + "loss": 0.0315, + "step": 7800 + }, + { + "epoch": 1.4152649665400614, + "grad_norm": 16.67391586303711, + "learning_rate": 8.743599189537669e-07, + "loss": 0.0468, + "step": 7825 + }, + { + "epoch": 1.419786579851691, + "grad_norm": 7.967570781707764, + "learning_rate": 8.738994289924479e-07, + "loss": 0.0443, + "step": 7850 + }, + { + "epoch": 1.4243081931633208, + "grad_norm": 3.4274497032165527, + "learning_rate": 8.734389390311291e-07, + "loss": 0.0464, + "step": 7875 + }, + { + "epoch": 1.4288298064749503, + "grad_norm": 14.454466819763184, + "learning_rate": 8.729784490698102e-07, + "loss": 0.0594, + "step": 7900 + }, + { + "epoch": 1.4333514197865798, + "grad_norm": 34.33506774902344, + "learning_rate": 8.725179591084913e-07, + "loss": 0.1098, + "step": 7925 + }, + { + "epoch": 1.4378730330982095, + "grad_norm": 5.2832770347595215, + "learning_rate": 8.720574691471726e-07, + "loss": 0.035, + "step": 7950 + }, + { + "epoch": 1.4423946464098392, + "grad_norm": 2.5055034160614014, + "learning_rate": 8.715969791858537e-07, + "loss": 0.0261, + 
"step": 7975 + }, + { + "epoch": 1.4469162597214686, + "grad_norm": 2.449373483657837, + "learning_rate": 8.711364892245349e-07, + "loss": 0.024, + "step": 8000 + }, + { + "epoch": 1.4514378730330981, + "grad_norm": 3.93390154838562, + "learning_rate": 8.70675999263216e-07, + "loss": 0.0248, + "step": 8025 + }, + { + "epoch": 1.4559594863447278, + "grad_norm": 14.35305404663086, + "learning_rate": 8.702155093018971e-07, + "loss": 0.0237, + "step": 8050 + }, + { + "epoch": 1.4604810996563573, + "grad_norm": 7.074910640716553, + "learning_rate": 8.697550193405784e-07, + "loss": 0.0284, + "step": 8075 + }, + { + "epoch": 1.465002712967987, + "grad_norm": 21.563901901245117, + "learning_rate": 8.692945293792595e-07, + "loss": 0.0421, + "step": 8100 + }, + { + "epoch": 1.4695243262796165, + "grad_norm": 8.473495483398438, + "learning_rate": 8.688340394179406e-07, + "loss": 0.0194, + "step": 8125 + }, + { + "epoch": 1.4740459395912462, + "grad_norm": 0.6098468899726868, + "learning_rate": 8.683735494566218e-07, + "loss": 0.0392, + "step": 8150 + }, + { + "epoch": 1.4785675529028757, + "grad_norm": 29.32798957824707, + "learning_rate": 8.679130594953029e-07, + "loss": 0.0503, + "step": 8175 + }, + { + "epoch": 1.4830891662145054, + "grad_norm": 7.364591121673584, + "learning_rate": 8.674525695339841e-07, + "loss": 0.0227, + "step": 8200 + }, + { + "epoch": 1.4876107795261349, + "grad_norm": 8.35572338104248, + "learning_rate": 8.669920795726653e-07, + "loss": 0.0346, + "step": 8225 + }, + { + "epoch": 1.4921323928377646, + "grad_norm": 9.818826675415039, + "learning_rate": 8.665315896113465e-07, + "loss": 0.0484, + "step": 8250 + }, + { + "epoch": 1.496654006149394, + "grad_norm": 8.03248405456543, + "learning_rate": 8.660710996500275e-07, + "loss": 0.0605, + "step": 8275 + }, + { + "epoch": 1.5011756194610237, + "grad_norm": 7.9050397872924805, + "learning_rate": 8.656106096887087e-07, + "loss": 0.0609, + "step": 8300 + }, + { + "epoch": 1.5056972327726532, + 
"grad_norm": 0.5489145517349243, + "learning_rate": 8.6515011972739e-07, + "loss": 0.041, + "step": 8325 + }, + { + "epoch": 1.510218846084283, + "grad_norm": 1.3398199081420898, + "learning_rate": 8.64689629766071e-07, + "loss": 0.0458, + "step": 8350 + }, + { + "epoch": 1.5147404593959126, + "grad_norm": 23.95371437072754, + "learning_rate": 8.642291398047522e-07, + "loss": 0.0322, + "step": 8375 + }, + { + "epoch": 1.519262072707542, + "grad_norm": 6.187991142272949, + "learning_rate": 8.637686498434334e-07, + "loss": 0.0186, + "step": 8400 + }, + { + "epoch": 1.5237836860191716, + "grad_norm": 2.8521807193756104, + "learning_rate": 8.633081598821146e-07, + "loss": 0.0353, + "step": 8425 + }, + { + "epoch": 1.5283052993308013, + "grad_norm": 17.128206253051758, + "learning_rate": 8.628476699207957e-07, + "loss": 0.0217, + "step": 8450 + }, + { + "epoch": 1.532826912642431, + "grad_norm": 1.29264235496521, + "learning_rate": 8.623871799594769e-07, + "loss": 0.02, + "step": 8475 + }, + { + "epoch": 1.5373485259540605, + "grad_norm": 1.0612270832061768, + "learning_rate": 8.61926689998158e-07, + "loss": 0.0273, + "step": 8500 + }, + { + "epoch": 1.54187013926569, + "grad_norm": 21.39075469970703, + "learning_rate": 8.614662000368391e-07, + "loss": 0.0326, + "step": 8525 + }, + { + "epoch": 1.5463917525773194, + "grad_norm": 22.907485961914062, + "learning_rate": 8.610057100755204e-07, + "loss": 0.0493, + "step": 8550 + }, + { + "epoch": 1.5509133658889491, + "grad_norm": 0.9095446467399597, + "learning_rate": 8.605452201142015e-07, + "loss": 0.0203, + "step": 8575 + }, + { + "epoch": 1.5554349792005788, + "grad_norm": 19.75260353088379, + "learning_rate": 8.600847301528826e-07, + "loss": 0.021, + "step": 8600 + }, + { + "epoch": 1.5599565925122083, + "grad_norm": 14.637717247009277, + "learning_rate": 8.596242401915637e-07, + "loss": 0.0392, + "step": 8625 + }, + { + "epoch": 1.5644782058238378, + "grad_norm": 16.156036376953125, + "learning_rate": 
8.591821698286977e-07, + "loss": 0.0496, + "step": 8650 + }, + { + "epoch": 1.5689998191354675, + "grad_norm": 0.6795814037322998, + "learning_rate": 8.587216798673789e-07, + "loss": 0.031, + "step": 8675 + }, + { + "epoch": 1.5735214324470972, + "grad_norm": 9.835176467895508, + "learning_rate": 8.5826118990606e-07, + "loss": 0.0413, + "step": 8700 + }, + { + "epoch": 1.5780430457587267, + "grad_norm": 98.9009017944336, + "learning_rate": 8.578006999447411e-07, + "loss": 0.0764, + "step": 8725 + }, + { + "epoch": 1.5825646590703562, + "grad_norm": 11.466421127319336, + "learning_rate": 8.573402099834223e-07, + "loss": 0.0362, + "step": 8750 + }, + { + "epoch": 1.5870862723819859, + "grad_norm": 1.9184726476669312, + "learning_rate": 8.568797200221036e-07, + "loss": 0.0199, + "step": 8775 + }, + { + "epoch": 1.5916078856936156, + "grad_norm": 1.2715715169906616, + "learning_rate": 8.564192300607846e-07, + "loss": 0.0273, + "step": 8800 + }, + { + "epoch": 1.596129499005245, + "grad_norm": 0.3399398624897003, + "learning_rate": 8.559587400994658e-07, + "loss": 0.0271, + "step": 8825 + }, + { + "epoch": 1.6006511123168745, + "grad_norm": 3.984431028366089, + "learning_rate": 8.55498250138147e-07, + "loss": 0.0395, + "step": 8850 + }, + { + "epoch": 1.6051727256285042, + "grad_norm": 0.13510115444660187, + "learning_rate": 8.550377601768281e-07, + "loss": 0.0264, + "step": 8875 + }, + { + "epoch": 1.609694338940134, + "grad_norm": 1.5098395347595215, + "learning_rate": 8.545772702155093e-07, + "loss": 0.0365, + "step": 8900 + }, + { + "epoch": 1.6142159522517634, + "grad_norm": 1.5568790435791016, + "learning_rate": 8.541167802541905e-07, + "loss": 0.0348, + "step": 8925 + }, + { + "epoch": 1.6187375655633929, + "grad_norm": 15.226325988769531, + "learning_rate": 8.536562902928715e-07, + "loss": 0.032, + "step": 8950 + }, + { + "epoch": 1.6232591788750226, + "grad_norm": 4.606542587280273, + "learning_rate": 8.531958003315527e-07, + "loss": 0.0397, + "step": 8975 + }, 
+ { + "epoch": 1.6277807921866523, + "grad_norm": 0.1431085765361786, + "learning_rate": 8.52735310370234e-07, + "loss": 0.04, + "step": 9000 + }, + { + "epoch": 1.6323024054982818, + "grad_norm": 8.834503173828125, + "learning_rate": 8.522748204089151e-07, + "loss": 0.0199, + "step": 9025 + }, + { + "epoch": 1.6368240188099112, + "grad_norm": 0.43929988145828247, + "learning_rate": 8.518143304475962e-07, + "loss": 0.0521, + "step": 9050 + }, + { + "epoch": 1.641345632121541, + "grad_norm": 0.239268958568573, + "learning_rate": 8.513538404862773e-07, + "loss": 0.028, + "step": 9075 + }, + { + "epoch": 1.6458672454331706, + "grad_norm": 7.775051593780518, + "learning_rate": 8.508933505249585e-07, + "loss": 0.0435, + "step": 9100 + }, + { + "epoch": 1.6503888587448001, + "grad_norm": 1.0783268213272095, + "learning_rate": 8.504328605636397e-07, + "loss": 0.09, + "step": 9125 + }, + { + "epoch": 1.6549104720564296, + "grad_norm": 6.303003787994385, + "learning_rate": 8.499723706023208e-07, + "loss": 0.0371, + "step": 9150 + }, + { + "epoch": 1.6594320853680593, + "grad_norm": 0.0859726294875145, + "learning_rate": 8.49511880641002e-07, + "loss": 0.0309, + "step": 9175 + }, + { + "epoch": 1.663953698679689, + "grad_norm": 7.407548427581787, + "learning_rate": 8.490513906796831e-07, + "loss": 0.0267, + "step": 9200 + }, + { + "epoch": 1.6684753119913185, + "grad_norm": 0.10481081902980804, + "learning_rate": 8.485909007183642e-07, + "loss": 0.023, + "step": 9225 + }, + { + "epoch": 1.672996925302948, + "grad_norm": 4.49674129486084, + "learning_rate": 8.481304107570455e-07, + "loss": 0.0226, + "step": 9250 + }, + { + "epoch": 1.6775185386145777, + "grad_norm": 3.7796387672424316, + "learning_rate": 8.476699207957267e-07, + "loss": 0.0452, + "step": 9275 + }, + { + "epoch": 1.6820401519262074, + "grad_norm": 4.558553218841553, + "learning_rate": 8.472094308344077e-07, + "loss": 0.0302, + "step": 9300 + }, + { + "epoch": 1.6865617652378369, + "grad_norm": 
1.0844203233718872, + "learning_rate": 8.467489408730889e-07, + "loss": 0.0213, + "step": 9325 + }, + { + "epoch": 1.6910833785494663, + "grad_norm": 0.6152500510215759, + "learning_rate": 8.462884509117701e-07, + "loss": 0.028, + "step": 9350 + }, + { + "epoch": 1.695604991861096, + "grad_norm": 0.83628249168396, + "learning_rate": 8.458279609504512e-07, + "loss": 0.0249, + "step": 9375 + }, + { + "epoch": 1.7001266051727257, + "grad_norm": 4.9916157722473145, + "learning_rate": 8.453674709891324e-07, + "loss": 0.0326, + "step": 9400 + }, + { + "epoch": 1.7046482184843552, + "grad_norm": 0.2095576971769333, + "learning_rate": 8.449069810278136e-07, + "loss": 0.0494, + "step": 9425 + }, + { + "epoch": 1.7091698317959847, + "grad_norm": 16.670848846435547, + "learning_rate": 8.444464910664947e-07, + "loss": 0.0248, + "step": 9450 + }, + { + "epoch": 1.7136914451076144, + "grad_norm": 1.0165280103683472, + "learning_rate": 8.439860011051758e-07, + "loss": 0.0371, + "step": 9475 + }, + { + "epoch": 1.718213058419244, + "grad_norm": 19.8568115234375, + "learning_rate": 8.435255111438571e-07, + "loss": 0.03, + "step": 9500 + }, + { + "epoch": 1.7227346717308736, + "grad_norm": 29.754867553710938, + "learning_rate": 8.430650211825382e-07, + "loss": 0.0836, + "step": 9525 + }, + { + "epoch": 1.727256285042503, + "grad_norm": 10.585617065429688, + "learning_rate": 8.426045312212193e-07, + "loss": 0.0253, + "step": 9550 + }, + { + "epoch": 1.7317778983541328, + "grad_norm": 0.41567254066467285, + "learning_rate": 8.421440412599005e-07, + "loss": 0.0419, + "step": 9575 + }, + { + "epoch": 1.7362995116657625, + "grad_norm": 14.9446439743042, + "learning_rate": 8.416835512985817e-07, + "loss": 0.0346, + "step": 9600 + }, + { + "epoch": 1.740821124977392, + "grad_norm": 11.810590744018555, + "learning_rate": 8.412230613372628e-07, + "loss": 0.0399, + "step": 9625 + }, + { + "epoch": 1.7453427382890214, + "grad_norm": 4.900815486907959, + "learning_rate": 8.40762571375944e-07, + 
"loss": 0.0328, + "step": 9650 + }, + { + "epoch": 1.7498643516006511, + "grad_norm": 43.67582702636719, + "learning_rate": 8.403020814146251e-07, + "loss": 0.0329, + "step": 9675 + }, + { + "epoch": 1.7543859649122808, + "grad_norm": 9.560347557067871, + "learning_rate": 8.398415914533063e-07, + "loss": 0.0455, + "step": 9700 + }, + { + "epoch": 1.7589075782239103, + "grad_norm": 20.836200714111328, + "learning_rate": 8.393811014919875e-07, + "loss": 0.0347, + "step": 9725 + }, + { + "epoch": 1.7634291915355398, + "grad_norm": 1.0109299421310425, + "learning_rate": 8.389206115306686e-07, + "loss": 0.0171, + "step": 9750 + }, + { + "epoch": 1.7679508048471695, + "grad_norm": 15.896967887878418, + "learning_rate": 8.384601215693498e-07, + "loss": 0.0558, + "step": 9775 + }, + { + "epoch": 1.7724724181587992, + "grad_norm": 8.316397666931152, + "learning_rate": 8.379996316080308e-07, + "loss": 0.042, + "step": 9800 + }, + { + "epoch": 1.7769940314704287, + "grad_norm": 2.5963985919952393, + "learning_rate": 8.37539141646712e-07, + "loss": 0.051, + "step": 9825 + }, + { + "epoch": 1.7815156447820581, + "grad_norm": 0.8551808595657349, + "learning_rate": 8.370786516853933e-07, + "loss": 0.0379, + "step": 9850 + }, + { + "epoch": 1.7860372580936879, + "grad_norm": 7.102266311645508, + "learning_rate": 8.366181617240745e-07, + "loss": 0.0252, + "step": 9875 + }, + { + "epoch": 1.7905588714053176, + "grad_norm": 1.0050630569458008, + "learning_rate": 8.361576717627555e-07, + "loss": 0.0523, + "step": 9900 + }, + { + "epoch": 1.795080484716947, + "grad_norm": 0.23266096413135529, + "learning_rate": 8.356971818014367e-07, + "loss": 0.0771, + "step": 9925 + }, + { + "epoch": 1.7996020980285765, + "grad_norm": 2.397165298461914, + "learning_rate": 8.352366918401178e-07, + "loss": 0.0383, + "step": 9950 + }, + { + "epoch": 1.8041237113402062, + "grad_norm": 3.823277711868286, + "learning_rate": 8.34776201878799e-07, + "loss": 0.0244, + "step": 9975 + }, + { + "epoch": 
1.808645324651836, + "grad_norm": 0.4726651906967163, + "learning_rate": 8.343157119174802e-07, + "loss": 0.0263, + "step": 10000 + }, + { + "epoch": 1.8131669379634654, + "grad_norm": 1.6218931674957275, + "learning_rate": 8.338552219561613e-07, + "loss": 0.0141, + "step": 10025 + }, + { + "epoch": 1.8176885512750949, + "grad_norm": 10.145578384399414, + "learning_rate": 8.333947319948424e-07, + "loss": 0.026, + "step": 10050 + }, + { + "epoch": 1.8222101645867246, + "grad_norm": 10.49772834777832, + "learning_rate": 8.329342420335236e-07, + "loss": 0.0373, + "step": 10075 + }, + { + "epoch": 1.8267317778983543, + "grad_norm": 25.12136459350586, + "learning_rate": 8.324737520722048e-07, + "loss": 0.0335, + "step": 10100 + }, + { + "epoch": 1.8312533912099838, + "grad_norm": 0.4937836229801178, + "learning_rate": 8.32013262110886e-07, + "loss": 0.038, + "step": 10125 + }, + { + "epoch": 1.8357750045216132, + "grad_norm": 0.8513910174369812, + "learning_rate": 8.315527721495671e-07, + "loss": 0.0258, + "step": 10150 + }, + { + "epoch": 1.840296617833243, + "grad_norm": 16.558271408081055, + "learning_rate": 8.310922821882482e-07, + "loss": 0.0564, + "step": 10175 + }, + { + "epoch": 1.8448182311448726, + "grad_norm": 4.88476037979126, + "learning_rate": 8.306317922269294e-07, + "loss": 0.0285, + "step": 10200 + }, + { + "epoch": 1.8493398444565021, + "grad_norm": 4.210046291351318, + "learning_rate": 8.301713022656106e-07, + "loss": 0.0293, + "step": 10225 + }, + { + "epoch": 1.8538614577681316, + "grad_norm": 11.491304397583008, + "learning_rate": 8.297108123042917e-07, + "loss": 0.0585, + "step": 10250 + }, + { + "epoch": 1.8583830710797613, + "grad_norm": 11.61664867401123, + "learning_rate": 8.292503223429729e-07, + "loss": 0.0506, + "step": 10275 + }, + { + "epoch": 1.862904684391391, + "grad_norm": 2.719242572784424, + "learning_rate": 8.28789832381654e-07, + "loss": 0.0484, + "step": 10300 + }, + { + "epoch": 1.8674262977030205, + "grad_norm": 
1.5411864519119263, + "learning_rate": 8.283293424203352e-07, + "loss": 0.0472, + "step": 10325 + }, + { + "epoch": 1.87194791101465, + "grad_norm": 9.415594100952148, + "learning_rate": 8.278688524590164e-07, + "loss": 0.0792, + "step": 10350 + }, + { + "epoch": 1.8764695243262797, + "grad_norm": 7.970459938049316, + "learning_rate": 8.274083624976976e-07, + "loss": 0.0345, + "step": 10375 + }, + { + "epoch": 1.8809911376379094, + "grad_norm": 1.801161289215088, + "learning_rate": 8.269478725363786e-07, + "loss": 0.0373, + "step": 10400 + }, + { + "epoch": 1.8855127509495389, + "grad_norm": 0.596969485282898, + "learning_rate": 8.264873825750598e-07, + "loss": 0.0399, + "step": 10425 + }, + { + "epoch": 1.8900343642611683, + "grad_norm": 2.700634717941284, + "learning_rate": 8.260268926137411e-07, + "loss": 0.0269, + "step": 10450 + }, + { + "epoch": 1.8945559775727978, + "grad_norm": 5.090729713439941, + "learning_rate": 8.255664026524221e-07, + "loss": 0.026, + "step": 10475 + }, + { + "epoch": 1.8990775908844275, + "grad_norm": 2.83105206489563, + "learning_rate": 8.251059126911033e-07, + "loss": 0.0266, + "step": 10500 + }, + { + "epoch": 1.9035992041960572, + "grad_norm": 0.7878080010414124, + "learning_rate": 8.246454227297845e-07, + "loss": 0.0449, + "step": 10525 + }, + { + "epoch": 1.9081208175076867, + "grad_norm": 14.082945823669434, + "learning_rate": 8.241849327684655e-07, + "loss": 0.027, + "step": 10550 + }, + { + "epoch": 1.9126424308193162, + "grad_norm": 0.5400319695472717, + "learning_rate": 8.237244428071468e-07, + "loss": 0.0546, + "step": 10575 + }, + { + "epoch": 1.9171640441309459, + "grad_norm": 17.12287712097168, + "learning_rate": 8.23263952845828e-07, + "loss": 0.0318, + "step": 10600 + }, + { + "epoch": 1.9216856574425756, + "grad_norm": 4.993497848510742, + "learning_rate": 8.228034628845091e-07, + "loss": 0.0428, + "step": 10625 + }, + { + "epoch": 1.926207270754205, + "grad_norm": 31.576929092407227, + "learning_rate": 
8.223429729231902e-07, + "loss": 0.0434, + "step": 10650 + }, + { + "epoch": 1.9307288840658345, + "grad_norm": 1.9357125759124756, + "learning_rate": 8.218824829618713e-07, + "loss": 0.033, + "step": 10675 + }, + { + "epoch": 1.9352504973774642, + "grad_norm": 0.617363691329956, + "learning_rate": 8.214219930005526e-07, + "loss": 0.0639, + "step": 10700 + }, + { + "epoch": 1.939772110689094, + "grad_norm": 0.753123939037323, + "learning_rate": 8.209615030392337e-07, + "loss": 0.0357, + "step": 10725 + }, + { + "epoch": 1.9442937240007234, + "grad_norm": 0.2875419855117798, + "learning_rate": 8.205010130779148e-07, + "loss": 0.0286, + "step": 10750 + }, + { + "epoch": 1.948815337312353, + "grad_norm": 1.6829754114151, + "learning_rate": 8.20040523116596e-07, + "loss": 0.0285, + "step": 10775 + }, + { + "epoch": 1.9533369506239826, + "grad_norm": 1.1577789783477783, + "learning_rate": 8.195800331552772e-07, + "loss": 0.018, + "step": 10800 + }, + { + "epoch": 1.9578585639356123, + "grad_norm": 9.512741088867188, + "learning_rate": 8.191195431939583e-07, + "loss": 0.031, + "step": 10825 + }, + { + "epoch": 1.9623801772472418, + "grad_norm": 2.2050418853759766, + "learning_rate": 8.186590532326395e-07, + "loss": 0.0314, + "step": 10850 + }, + { + "epoch": 1.9669017905588713, + "grad_norm": 15.566596984863281, + "learning_rate": 8.181985632713207e-07, + "loss": 0.0376, + "step": 10875 + }, + { + "epoch": 1.971423403870501, + "grad_norm": 8.69605827331543, + "learning_rate": 8.177380733100017e-07, + "loss": 0.0321, + "step": 10900 + }, + { + "epoch": 1.9759450171821307, + "grad_norm": 8.651535987854004, + "learning_rate": 8.172775833486829e-07, + "loss": 0.0212, + "step": 10925 + }, + { + "epoch": 1.9804666304937601, + "grad_norm": 1.8405441045761108, + "learning_rate": 8.168170933873642e-07, + "loss": 0.032, + "step": 10950 + }, + { + "epoch": 1.9849882438053896, + "grad_norm": 15.064764976501465, + "learning_rate": 8.163566034260452e-07, + "loss": 0.0133, + "step": 
10975 + }, + { + "epoch": 1.9895098571170193, + "grad_norm": 0.0899849534034729, + "learning_rate": 8.158961134647264e-07, + "loss": 0.0469, + "step": 11000 + }, + { + "epoch": 1.994031470428649, + "grad_norm": 18.745182037353516, + "learning_rate": 8.154540431018604e-07, + "loss": 0.0688, + "step": 11025 + }, + { + "epoch": 1.9985530837402785, + "grad_norm": 0.08827279508113861, + "learning_rate": 8.149935531405416e-07, + "loss": 0.0788, + "step": 11050 + }, + { + "epoch": 2.0, + "eval_loss": 0.22891011834144592, + "eval_runtime": 8887.2881, + "eval_samples_per_second": 1.068, + "eval_steps_per_second": 0.134, + "eval_wer": 0.10608584240871237, + "step": 11058 + }, + { + "epoch": 2.003074697051908, + "grad_norm": 1.0333837270736694, + "learning_rate": 8.145330631792226e-07, + "loss": 0.0349, + "step": 11075 + }, + { + "epoch": 2.007596310363538, + "grad_norm": 1.1579056978225708, + "learning_rate": 8.140725732179038e-07, + "loss": 0.0172, + "step": 11100 + }, + { + "epoch": 2.0121179236751674, + "grad_norm": 0.37552839517593384, + "learning_rate": 8.136120832565849e-07, + "loss": 0.0151, + "step": 11125 + }, + { + "epoch": 2.016639536986797, + "grad_norm": 23.273653030395508, + "learning_rate": 8.131515932952662e-07, + "loss": 0.0298, + "step": 11150 + }, + { + "epoch": 2.0211611502984264, + "grad_norm": 8.169917106628418, + "learning_rate": 8.126911033339473e-07, + "loss": 0.0215, + "step": 11175 + }, + { + "epoch": 2.0256827636100563, + "grad_norm": 1.1066598892211914, + "learning_rate": 8.122306133726284e-07, + "loss": 0.0222, + "step": 11200 + }, + { + "epoch": 2.0302043769216858, + "grad_norm": 0.5222472548484802, + "learning_rate": 8.117701234113096e-07, + "loss": 0.0125, + "step": 11225 + }, + { + "epoch": 2.0347259902333152, + "grad_norm": 0.707737922668457, + "learning_rate": 8.113096334499907e-07, + "loss": 0.0275, + "step": 11250 + }, + { + "epoch": 2.0392476035449447, + "grad_norm": 2.295354127883911, + "learning_rate": 8.108491434886719e-07, + "loss": 
0.023, + "step": 11275 + }, + { + "epoch": 2.0437692168565746, + "grad_norm": 1.3817616701126099, + "learning_rate": 8.103886535273531e-07, + "loss": 0.0125, + "step": 11300 + }, + { + "epoch": 2.048290830168204, + "grad_norm": 2.0756027698516846, + "learning_rate": 8.099281635660342e-07, + "loss": 0.0187, + "step": 11325 + }, + { + "epoch": 2.0528124434798336, + "grad_norm": 0.05818900838494301, + "learning_rate": 8.094676736047153e-07, + "loss": 0.0278, + "step": 11350 + }, + { + "epoch": 2.057334056791463, + "grad_norm": 6.99597692489624, + "learning_rate": 8.090071836433965e-07, + "loss": 0.0241, + "step": 11375 + }, + { + "epoch": 2.0618556701030926, + "grad_norm": 0.15094341337680817, + "learning_rate": 8.085466936820778e-07, + "loss": 0.0223, + "step": 11400 + }, + { + "epoch": 2.0663772834147225, + "grad_norm": 0.17221419513225555, + "learning_rate": 8.080862037207588e-07, + "loss": 0.0257, + "step": 11425 + }, + { + "epoch": 2.070898896726352, + "grad_norm": 0.5105612277984619, + "learning_rate": 8.0762571375944e-07, + "loss": 0.0523, + "step": 11450 + }, + { + "epoch": 2.0754205100379814, + "grad_norm": 1.4951982498168945, + "learning_rate": 8.071652237981212e-07, + "loss": 0.0247, + "step": 11475 + }, + { + "epoch": 2.079942123349611, + "grad_norm": 5.619513988494873, + "learning_rate": 8.067047338368023e-07, + "loss": 0.0181, + "step": 11500 + }, + { + "epoch": 2.084463736661241, + "grad_norm": 21.839784622192383, + "learning_rate": 8.062442438754835e-07, + "loss": 0.0221, + "step": 11525 + }, + { + "epoch": 2.0889853499728703, + "grad_norm": 4.323668003082275, + "learning_rate": 8.057837539141647e-07, + "loss": 0.0296, + "step": 11550 + }, + { + "epoch": 2.0935069632845, + "grad_norm": 0.17072859406471252, + "learning_rate": 8.053232639528457e-07, + "loss": 0.0196, + "step": 11575 + }, + { + "epoch": 2.0980285765961293, + "grad_norm": 13.157007217407227, + "learning_rate": 8.048627739915269e-07, + "loss": 0.0238, + "step": 11600 + }, + { + "epoch": 
2.102550189907759, + "grad_norm": 0.13449828326702118, + "learning_rate": 8.044022840302082e-07, + "loss": 0.0157, + "step": 11625 + }, + { + "epoch": 2.1070718032193887, + "grad_norm": 9.773667335510254, + "learning_rate": 8.039417940688893e-07, + "loss": 0.0167, + "step": 11650 + }, + { + "epoch": 2.111593416531018, + "grad_norm": 0.01844405196607113, + "learning_rate": 8.034813041075704e-07, + "loss": 0.0062, + "step": 11675 + }, + { + "epoch": 2.1161150298426477, + "grad_norm": 2.560438394546509, + "learning_rate": 8.030208141462516e-07, + "loss": 0.0193, + "step": 11700 + }, + { + "epoch": 2.1206366431542776, + "grad_norm": 0.5435400605201721, + "learning_rate": 8.025603241849327e-07, + "loss": 0.0203, + "step": 11725 + }, + { + "epoch": 2.125158256465907, + "grad_norm": 63.197235107421875, + "learning_rate": 8.020998342236139e-07, + "loss": 0.0358, + "step": 11750 + }, + { + "epoch": 2.1296798697775365, + "grad_norm": 1.3490138053894043, + "learning_rate": 8.016393442622951e-07, + "loss": 0.013, + "step": 11775 + }, + { + "epoch": 2.134201483089166, + "grad_norm": 34.209476470947266, + "learning_rate": 8.011788543009762e-07, + "loss": 0.0351, + "step": 11800 + }, + { + "epoch": 2.138723096400796, + "grad_norm": 0.7151913642883301, + "learning_rate": 8.007183643396574e-07, + "loss": 0.0258, + "step": 11825 + }, + { + "epoch": 2.1432447097124254, + "grad_norm": 0.9076653718948364, + "learning_rate": 8.002578743783384e-07, + "loss": 0.0662, + "step": 11850 + }, + { + "epoch": 2.147766323024055, + "grad_norm": 0.8418449759483337, + "learning_rate": 7.997973844170197e-07, + "loss": 0.0301, + "step": 11875 + }, + { + "epoch": 2.1522879363356844, + "grad_norm": 0.5430082082748413, + "learning_rate": 7.993368944557009e-07, + "loss": 0.0166, + "step": 11900 + }, + { + "epoch": 2.1568095496473143, + "grad_norm": 0.6853590607643127, + "learning_rate": 7.988764044943819e-07, + "loss": 0.0148, + "step": 11925 + }, + { + "epoch": 2.161331162958944, + "grad_norm": 
0.5972227454185486, + "learning_rate": 7.984159145330631e-07, + "loss": 0.0177, + "step": 11950 + }, + { + "epoch": 2.1658527762705733, + "grad_norm": 2.1670873165130615, + "learning_rate": 7.979554245717443e-07, + "loss": 0.0274, + "step": 11975 + }, + { + "epoch": 2.1703743895822027, + "grad_norm": 18.318012237548828, + "learning_rate": 7.974949346104254e-07, + "loss": 0.029, + "step": 12000 + }, + { + "epoch": 2.1748960028938327, + "grad_norm": 0.31500276923179626, + "learning_rate": 7.970344446491066e-07, + "loss": 0.012, + "step": 12025 + }, + { + "epoch": 2.179417616205462, + "grad_norm": 0.21801598370075226, + "learning_rate": 7.965739546877878e-07, + "loss": 0.0121, + "step": 12050 + }, + { + "epoch": 2.1839392295170916, + "grad_norm": 1.37786865234375, + "learning_rate": 7.961134647264689e-07, + "loss": 0.0066, + "step": 12075 + }, + { + "epoch": 2.188460842828721, + "grad_norm": 32.29001998901367, + "learning_rate": 7.9565297476515e-07, + "loss": 0.0341, + "step": 12100 + }, + { + "epoch": 2.192982456140351, + "grad_norm": 2.302133083343506, + "learning_rate": 7.951924848038313e-07, + "loss": 0.0188, + "step": 12125 + }, + { + "epoch": 2.1975040694519805, + "grad_norm": 14.034008979797363, + "learning_rate": 7.947319948425124e-07, + "loss": 0.0117, + "step": 12150 + }, + { + "epoch": 2.20202568276361, + "grad_norm": 0.3767974078655243, + "learning_rate": 7.942715048811935e-07, + "loss": 0.0259, + "step": 12175 + }, + { + "epoch": 2.2065472960752395, + "grad_norm": 0.26384684443473816, + "learning_rate": 7.938110149198747e-07, + "loss": 0.0371, + "step": 12200 + }, + { + "epoch": 2.2110689093868694, + "grad_norm": 8.888740539550781, + "learning_rate": 7.933505249585559e-07, + "loss": 0.0336, + "step": 12225 + }, + { + "epoch": 2.215590522698499, + "grad_norm": 0.19948595762252808, + "learning_rate": 7.928900349972371e-07, + "loss": 0.0474, + "step": 12250 + }, + { + "epoch": 2.2201121360101284, + "grad_norm": 2.2132930755615234, + "learning_rate": 
7.924295450359182e-07, + "loss": 0.0406, + "step": 12275 + }, + { + "epoch": 2.224633749321758, + "grad_norm": 11.718713760375977, + "learning_rate": 7.919690550745993e-07, + "loss": 0.0195, + "step": 12300 + }, + { + "epoch": 2.2291553626333878, + "grad_norm": 5.642462730407715, + "learning_rate": 7.915085651132805e-07, + "loss": 0.0187, + "step": 12325 + }, + { + "epoch": 2.2336769759450172, + "grad_norm": 4.084228992462158, + "learning_rate": 7.910480751519617e-07, + "loss": 0.0126, + "step": 12350 + }, + { + "epoch": 2.2381985892566467, + "grad_norm": 1.1005765199661255, + "learning_rate": 7.905875851906428e-07, + "loss": 0.0167, + "step": 12375 + }, + { + "epoch": 2.242720202568276, + "grad_norm": 9.070086479187012, + "learning_rate": 7.90127095229324e-07, + "loss": 0.0182, + "step": 12400 + }, + { + "epoch": 2.247241815879906, + "grad_norm": 15.3062162399292, + "learning_rate": 7.896666052680051e-07, + "loss": 0.0156, + "step": 12425 + }, + { + "epoch": 2.2517634291915356, + "grad_norm": 0.09264446794986725, + "learning_rate": 7.892061153066862e-07, + "loss": 0.0237, + "step": 12450 + }, + { + "epoch": 2.256285042503165, + "grad_norm": 3.3248021602630615, + "learning_rate": 7.887456253453675e-07, + "loss": 0.0139, + "step": 12475 + }, + { + "epoch": 2.2608066558147946, + "grad_norm": 3.0979135036468506, + "learning_rate": 7.882851353840487e-07, + "loss": 0.0063, + "step": 12500 + }, + { + "epoch": 2.2653282691264245, + "grad_norm": 11.612130165100098, + "learning_rate": 7.878246454227297e-07, + "loss": 0.04, + "step": 12525 + }, + { + "epoch": 2.269849882438054, + "grad_norm": 4.221678256988525, + "learning_rate": 7.873641554614109e-07, + "loss": 0.0393, + "step": 12550 + }, + { + "epoch": 2.2743714957496834, + "grad_norm": 11.065829277038574, + "learning_rate": 7.869036655000921e-07, + "loss": 0.0286, + "step": 12575 + }, + { + "epoch": 2.278893109061313, + "grad_norm": 1.366445779800415, + "learning_rate": 7.864431755387732e-07, + "loss": 0.028, + "step": 
12600 + }, + { + "epoch": 2.283414722372943, + "grad_norm": 0.7951880693435669, + "learning_rate": 7.859826855774544e-07, + "loss": 0.0661, + "step": 12625 + }, + { + "epoch": 2.2879363356845723, + "grad_norm": 10.447066307067871, + "learning_rate": 7.855221956161356e-07, + "loss": 0.0484, + "step": 12650 + }, + { + "epoch": 2.292457948996202, + "grad_norm": 0.32853183150291443, + "learning_rate": 7.850617056548166e-07, + "loss": 0.0239, + "step": 12675 + }, + { + "epoch": 2.2969795623078313, + "grad_norm": 2.0133612155914307, + "learning_rate": 7.846012156934978e-07, + "loss": 0.0106, + "step": 12700 + }, + { + "epoch": 2.301501175619461, + "grad_norm": 1.5879937410354614, + "learning_rate": 7.841407257321791e-07, + "loss": 0.0151, + "step": 12725 + }, + { + "epoch": 2.3060227889310907, + "grad_norm": 3.2537899017333984, + "learning_rate": 7.836802357708602e-07, + "loss": 0.021, + "step": 12750 + }, + { + "epoch": 2.31054440224272, + "grad_norm": 0.4321633577346802, + "learning_rate": 7.832197458095413e-07, + "loss": 0.0135, + "step": 12775 + }, + { + "epoch": 2.3150660155543497, + "grad_norm": 17.50613021850586, + "learning_rate": 7.827592558482224e-07, + "loss": 0.024, + "step": 12800 + }, + { + "epoch": 2.319587628865979, + "grad_norm": 10.587005615234375, + "learning_rate": 7.822987658869036e-07, + "loss": 0.0155, + "step": 12825 + }, + { + "epoch": 2.324109242177609, + "grad_norm": 1.703659176826477, + "learning_rate": 7.818382759255848e-07, + "loss": 0.0204, + "step": 12850 + }, + { + "epoch": 2.3286308554892385, + "grad_norm": 0.26982223987579346, + "learning_rate": 7.813777859642659e-07, + "loss": 0.0087, + "step": 12875 + }, + { + "epoch": 2.333152468800868, + "grad_norm": 4.538456916809082, + "learning_rate": 7.809172960029471e-07, + "loss": 0.0177, + "step": 12900 + }, + { + "epoch": 2.337674082112498, + "grad_norm": 5.056499481201172, + "learning_rate": 7.804568060416282e-07, + "loss": 0.0163, + "step": 12925 + }, + { + "epoch": 2.3421956954241274, + 
"grad_norm": 0.5478576421737671, + "learning_rate": 7.799963160803094e-07, + "loss": 0.0149, + "step": 12950 + }, + { + "epoch": 2.346717308735757, + "grad_norm": 1.6986396312713623, + "learning_rate": 7.795358261189906e-07, + "loss": 0.0341, + "step": 12975 + }, + { + "epoch": 2.3512389220473864, + "grad_norm": 0.026538992300629616, + "learning_rate": 7.790753361576718e-07, + "loss": 0.0143, + "step": 13000 + }, + { + "epoch": 2.355760535359016, + "grad_norm": 0.38667038083076477, + "learning_rate": 7.786148461963528e-07, + "loss": 0.0079, + "step": 13025 + }, + { + "epoch": 2.360282148670646, + "grad_norm": 15.939850807189941, + "learning_rate": 7.78154356235034e-07, + "loss": 0.0372, + "step": 13050 + }, + { + "epoch": 2.3648037619822753, + "grad_norm": 18.573139190673828, + "learning_rate": 7.776938662737153e-07, + "loss": 0.0441, + "step": 13075 + }, + { + "epoch": 2.3693253752939047, + "grad_norm": 3.2550344467163086, + "learning_rate": 7.772333763123963e-07, + "loss": 0.027, + "step": 13100 + }, + { + "epoch": 2.3738469886055347, + "grad_norm": 1.5440771579742432, + "learning_rate": 7.767728863510775e-07, + "loss": 0.0277, + "step": 13125 + }, + { + "epoch": 2.378368601917164, + "grad_norm": 1.0547950267791748, + "learning_rate": 7.763123963897587e-07, + "loss": 0.0278, + "step": 13150 + }, + { + "epoch": 2.3828902152287936, + "grad_norm": 0.08282533288002014, + "learning_rate": 7.758519064284398e-07, + "loss": 0.0101, + "step": 13175 + }, + { + "epoch": 2.387411828540423, + "grad_norm": 0.24389539659023285, + "learning_rate": 7.75391416467121e-07, + "loss": 0.0233, + "step": 13200 + }, + { + "epoch": 2.3919334418520526, + "grad_norm": 9.251720428466797, + "learning_rate": 7.749309265058022e-07, + "loss": 0.047, + "step": 13225 + }, + { + "epoch": 2.3964550551636825, + "grad_norm": 2.2844269275665283, + "learning_rate": 7.744704365444833e-07, + "loss": 0.0162, + "step": 13250 + }, + { + "epoch": 2.400976668475312, + "grad_norm": 3.2137227058410645, + 
"learning_rate": 7.740099465831644e-07, + "loss": 0.0121, + "step": 13275 + }, + { + "epoch": 2.4054982817869415, + "grad_norm": 11.308737754821777, + "learning_rate": 7.735494566218456e-07, + "loss": 0.0299, + "step": 13300 + }, + { + "epoch": 2.4100198950985714, + "grad_norm": 3.3836469650268555, + "learning_rate": 7.730889666605268e-07, + "loss": 0.0133, + "step": 13325 + }, + { + "epoch": 2.414541508410201, + "grad_norm": 0.13357259333133698, + "learning_rate": 7.726284766992079e-07, + "loss": 0.0223, + "step": 13350 + }, + { + "epoch": 2.4190631217218304, + "grad_norm": 0.5216515064239502, + "learning_rate": 7.721864063363418e-07, + "loss": 0.0212, + "step": 13375 + }, + { + "epoch": 2.42358473503346, + "grad_norm": 4.0334320068359375, + "learning_rate": 7.71725916375023e-07, + "loss": 0.0367, + "step": 13400 + }, + { + "epoch": 2.4281063483450893, + "grad_norm": 8.48493766784668, + "learning_rate": 7.712654264137042e-07, + "loss": 0.0254, + "step": 13425 + }, + { + "epoch": 2.4326279616567192, + "grad_norm": 1.6405227184295654, + "learning_rate": 7.708049364523853e-07, + "loss": 0.0331, + "step": 13450 + }, + { + "epoch": 2.4371495749683487, + "grad_norm": 13.649563789367676, + "learning_rate": 7.703444464910664e-07, + "loss": 0.018, + "step": 13475 + }, + { + "epoch": 2.441671188279978, + "grad_norm": 0.7964933514595032, + "learning_rate": 7.698839565297476e-07, + "loss": 0.0235, + "step": 13500 + }, + { + "epoch": 2.446192801591608, + "grad_norm": 0.13087065517902374, + "learning_rate": 7.694234665684289e-07, + "loss": 0.0215, + "step": 13525 + }, + { + "epoch": 2.4507144149032376, + "grad_norm": 5.35853385925293, + "learning_rate": 7.689629766071099e-07, + "loss": 0.0178, + "step": 13550 + }, + { + "epoch": 2.455236028214867, + "grad_norm": 3.688849687576294, + "learning_rate": 7.685024866457911e-07, + "loss": 0.0099, + "step": 13575 + }, + { + "epoch": 2.4597576415264966, + "grad_norm": 0.2602083384990692, + "learning_rate": 7.680419966844723e-07, + 
"loss": 0.0205, + "step": 13600 + }, + { + "epoch": 2.464279254838126, + "grad_norm": 0.10222572088241577, + "learning_rate": 7.675815067231533e-07, + "loss": 0.012, + "step": 13625 + }, + { + "epoch": 2.468800868149756, + "grad_norm": 1.9992151260375977, + "learning_rate": 7.671210167618346e-07, + "loss": 0.0295, + "step": 13650 + }, + { + "epoch": 2.4733224814613854, + "grad_norm": 3.8986308574676514, + "learning_rate": 7.666605268005158e-07, + "loss": 0.0447, + "step": 13675 + }, + { + "epoch": 2.477844094773015, + "grad_norm": 2.5787339210510254, + "learning_rate": 7.662000368391968e-07, + "loss": 0.0233, + "step": 13700 + }, + { + "epoch": 2.482365708084645, + "grad_norm": 3.943392276763916, + "learning_rate": 7.65739546877878e-07, + "loss": 0.0214, + "step": 13725 + }, + { + "epoch": 2.4868873213962743, + "grad_norm": 4.015535831451416, + "learning_rate": 7.652790569165592e-07, + "loss": 0.0146, + "step": 13750 + }, + { + "epoch": 2.491408934707904, + "grad_norm": 1.5608233213424683, + "learning_rate": 7.648185669552404e-07, + "loss": 0.0171, + "step": 13775 + }, + { + "epoch": 2.4959305480195333, + "grad_norm": 1.696368932723999, + "learning_rate": 7.643580769939215e-07, + "loss": 0.0122, + "step": 13800 + }, + { + "epoch": 2.5004521613311628, + "grad_norm": 1.1217238903045654, + "learning_rate": 7.638975870326027e-07, + "loss": 0.0128, + "step": 13825 + }, + { + "epoch": 2.5049737746427927, + "grad_norm": 40.497745513916016, + "learning_rate": 7.634370970712838e-07, + "loss": 0.0277, + "step": 13850 + }, + { + "epoch": 2.509495387954422, + "grad_norm": 6.063665390014648, + "learning_rate": 7.629766071099649e-07, + "loss": 0.0118, + "step": 13875 + }, + { + "epoch": 2.5140170012660517, + "grad_norm": 8.675702095031738, + "learning_rate": 7.625161171486462e-07, + "loss": 0.0259, + "step": 13900 + }, + { + "epoch": 2.5185386145776816, + "grad_norm": 0.8335000872612, + "learning_rate": 7.620556271873273e-07, + "loss": 0.0147, + "step": 13925 + }, + { + "epoch": 
2.523060227889311, + "grad_norm": 8.890750885009766, + "learning_rate": 7.615951372260084e-07, + "loss": 0.0226, + "step": 13950 + }, + { + "epoch": 2.5275818412009405, + "grad_norm": 0.20721301436424255, + "learning_rate": 7.611346472646895e-07, + "loss": 0.0103, + "step": 13975 + }, + { + "epoch": 2.53210345451257, + "grad_norm": 0.5705264806747437, + "learning_rate": 7.606741573033707e-07, + "loss": 0.0094, + "step": 14000 + }, + { + "epoch": 2.5366250678241995, + "grad_norm": 0.07163272053003311, + "learning_rate": 7.60213667342052e-07, + "loss": 0.0177, + "step": 14025 + }, + { + "epoch": 2.5411466811358294, + "grad_norm": 0.8082312345504761, + "learning_rate": 7.59753177380733e-07, + "loss": 0.0177, + "step": 14050 + }, + { + "epoch": 2.545668294447459, + "grad_norm": 0.3273601830005646, + "learning_rate": 7.592926874194142e-07, + "loss": 0.0429, + "step": 14075 + }, + { + "epoch": 2.5501899077590884, + "grad_norm": 1.8662065267562866, + "learning_rate": 7.588321974580954e-07, + "loss": 0.0163, + "step": 14100 + }, + { + "epoch": 2.5547115210707183, + "grad_norm": 0.7974827289581299, + "learning_rate": 7.583717074967764e-07, + "loss": 0.0159, + "step": 14125 + }, + { + "epoch": 2.559233134382348, + "grad_norm": 12.264116287231445, + "learning_rate": 7.579112175354577e-07, + "loss": 0.0329, + "step": 14150 + }, + { + "epoch": 2.5637547476939773, + "grad_norm": 0.07791896164417267, + "learning_rate": 7.574507275741389e-07, + "loss": 0.0214, + "step": 14175 + }, + { + "epoch": 2.5682763610056067, + "grad_norm": 0.7379089593887329, + "learning_rate": 7.5699023761282e-07, + "loss": 0.0366, + "step": 14200 + }, + { + "epoch": 2.5727979743172362, + "grad_norm": 91.46224975585938, + "learning_rate": 7.565297476515011e-07, + "loss": 0.027, + "step": 14225 + }, + { + "epoch": 2.5773195876288657, + "grad_norm": 14.806313514709473, + "learning_rate": 7.560692576901824e-07, + "loss": 0.0488, + "step": 14250 + }, + { + "epoch": 2.5818412009404956, + "grad_norm": 
2.083322286605835, + "learning_rate": 7.556087677288635e-07, + "loss": 0.031, + "step": 14275 + }, + { + "epoch": 2.586362814252125, + "grad_norm": 4.879816055297852, + "learning_rate": 7.551482777675446e-07, + "loss": 0.0153, + "step": 14300 + }, + { + "epoch": 2.5908844275637546, + "grad_norm": 6.237574100494385, + "learning_rate": 7.546877878062258e-07, + "loss": 0.0141, + "step": 14325 + }, + { + "epoch": 2.5954060408753845, + "grad_norm": 0.12708225846290588, + "learning_rate": 7.542272978449069e-07, + "loss": 0.0101, + "step": 14350 + }, + { + "epoch": 2.599927654187014, + "grad_norm": 1.2891823053359985, + "learning_rate": 7.537668078835881e-07, + "loss": 0.0215, + "step": 14375 + }, + { + "epoch": 2.6044492674986435, + "grad_norm": 0.09198635071516037, + "learning_rate": 7.533063179222693e-07, + "loss": 0.0202, + "step": 14400 + }, + { + "epoch": 2.608970880810273, + "grad_norm": 0.42182183265686035, + "learning_rate": 7.528458279609504e-07, + "loss": 0.0115, + "step": 14425 + }, + { + "epoch": 2.6134924941219024, + "grad_norm": 0.22351473569869995, + "learning_rate": 7.523853379996316e-07, + "loss": 0.0133, + "step": 14450 + }, + { + "epoch": 2.6180141074335324, + "grad_norm": 6.165104389190674, + "learning_rate": 7.519248480383127e-07, + "loss": 0.0381, + "step": 14475 + }, + { + "epoch": 2.622535720745162, + "grad_norm": 4.592835903167725, + "learning_rate": 7.514643580769939e-07, + "loss": 0.0118, + "step": 14500 + }, + { + "epoch": 2.6270573340567913, + "grad_norm": 13.439335823059082, + "learning_rate": 7.510038681156751e-07, + "loss": 0.0324, + "step": 14525 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 3.1381280422210693, + "learning_rate": 7.505433781543562e-07, + "loss": 0.0383, + "step": 14550 + }, + { + "epoch": 2.6361005606800507, + "grad_norm": 3.8765087127685547, + "learning_rate": 7.500828881930373e-07, + "loss": 0.03, + "step": 14575 + }, + { + "epoch": 2.64062217399168, + "grad_norm": 12.755854606628418, + "learning_rate": 
7.496223982317185e-07, + "loss": 0.0356, + "step": 14600 + }, + { + "epoch": 2.6451437873033097, + "grad_norm": 48.162261962890625, + "learning_rate": 7.491619082703998e-07, + "loss": 0.0244, + "step": 14625 + }, + { + "epoch": 2.649665400614939, + "grad_norm": 55.78163528442383, + "learning_rate": 7.487014183090808e-07, + "loss": 0.0495, + "step": 14650 + }, + { + "epoch": 2.654187013926569, + "grad_norm": 4.578949451446533, + "learning_rate": 7.48240928347762e-07, + "loss": 0.0292, + "step": 14675 + }, + { + "epoch": 2.6587086272381986, + "grad_norm": 6.509306907653809, + "learning_rate": 7.477804383864432e-07, + "loss": 0.0193, + "step": 14700 + }, + { + "epoch": 2.663230240549828, + "grad_norm": 4.97738790512085, + "learning_rate": 7.473199484251242e-07, + "loss": 0.0223, + "step": 14725 + }, + { + "epoch": 2.667751853861458, + "grad_norm": 7.346843242645264, + "learning_rate": 7.468594584638055e-07, + "loss": 0.0244, + "step": 14750 + }, + { + "epoch": 2.6722734671730874, + "grad_norm": 0.34532052278518677, + "learning_rate": 7.463989685024867e-07, + "loss": 0.0158, + "step": 14775 + }, + { + "epoch": 2.676795080484717, + "grad_norm": 1.6974883079528809, + "learning_rate": 7.459384785411677e-07, + "loss": 0.0172, + "step": 14800 + }, + { + "epoch": 2.6813166937963464, + "grad_norm": 4.6452507972717285, + "learning_rate": 7.454779885798489e-07, + "loss": 0.034, + "step": 14825 + }, + { + "epoch": 2.685838307107976, + "grad_norm": 40.83687973022461, + "learning_rate": 7.4501749861853e-07, + "loss": 0.0238, + "step": 14850 + }, + { + "epoch": 2.690359920419606, + "grad_norm": 0.27361202239990234, + "learning_rate": 7.445570086572113e-07, + "loss": 0.0217, + "step": 14875 + }, + { + "epoch": 2.6948815337312353, + "grad_norm": 0.088538758456707, + "learning_rate": 7.440965186958924e-07, + "loss": 0.0216, + "step": 14900 + }, + { + "epoch": 2.6994031470428648, + "grad_norm": 0.36908406019210815, + "learning_rate": 7.436360287345735e-07, + "loss": 0.0163, + "step": 
14925 + }, + { + "epoch": 2.7039247603544947, + "grad_norm": 0.3927018344402313, + "learning_rate": 7.431755387732547e-07, + "loss": 0.0272, + "step": 14950 + }, + { + "epoch": 2.708446373666124, + "grad_norm": 0.08495494723320007, + "learning_rate": 7.427150488119359e-07, + "loss": 0.0321, + "step": 14975 + }, + { + "epoch": 2.7129679869777537, + "grad_norm": 16.534454345703125, + "learning_rate": 7.42254558850617e-07, + "loss": 0.0346, + "step": 15000 + }, + { + "epoch": 2.717489600289383, + "grad_norm": 10.199457168579102, + "learning_rate": 7.417940688892982e-07, + "loss": 0.0236, + "step": 15025 + }, + { + "epoch": 2.7220112136010126, + "grad_norm": 12.526575088500977, + "learning_rate": 7.413335789279793e-07, + "loss": 0.0344, + "step": 15050 + }, + { + "epoch": 2.7265328269126425, + "grad_norm": 0.6113793253898621, + "learning_rate": 7.408730889666604e-07, + "loss": 0.0549, + "step": 15075 + }, + { + "epoch": 2.731054440224272, + "grad_norm": 2.575866460800171, + "learning_rate": 7.404125990053417e-07, + "loss": 0.0188, + "step": 15100 + }, + { + "epoch": 2.7355760535359015, + "grad_norm": 0.5728238821029663, + "learning_rate": 7.399521090440229e-07, + "loss": 0.0248, + "step": 15125 + }, + { + "epoch": 2.7400976668475314, + "grad_norm": 5.137115478515625, + "learning_rate": 7.394916190827039e-07, + "loss": 0.0234, + "step": 15150 + }, + { + "epoch": 2.744619280159161, + "grad_norm": 2.0585458278656006, + "learning_rate": 7.390311291213851e-07, + "loss": 0.0193, + "step": 15175 + }, + { + "epoch": 2.7491408934707904, + "grad_norm": 2.6761465072631836, + "learning_rate": 7.385706391600663e-07, + "loss": 0.0242, + "step": 15200 + }, + { + "epoch": 2.75366250678242, + "grad_norm": 0.3999291956424713, + "learning_rate": 7.381101491987474e-07, + "loss": 0.0193, + "step": 15225 + }, + { + "epoch": 2.7581841200940493, + "grad_norm": 8.237870216369629, + "learning_rate": 7.376496592374286e-07, + "loss": 0.0278, + "step": 15250 + }, + { + "epoch": 2.7627057334056793, 
+ "grad_norm": 8.460784912109375, + "learning_rate": 7.371891692761098e-07, + "loss": 0.0194, + "step": 15275 + }, + { + "epoch": 2.7672273467173087, + "grad_norm": 13.062602043151855, + "learning_rate": 7.367286793147908e-07, + "loss": 0.032, + "step": 15300 + }, + { + "epoch": 2.7717489600289382, + "grad_norm": 7.502108097076416, + "learning_rate": 7.36268189353472e-07, + "loss": 0.0076, + "step": 15325 + }, + { + "epoch": 2.776270573340568, + "grad_norm": 11.163969039916992, + "learning_rate": 7.358076993921533e-07, + "loss": 0.024, + "step": 15350 + }, + { + "epoch": 2.7807921866521976, + "grad_norm": 1.2433866262435913, + "learning_rate": 7.353472094308344e-07, + "loss": 0.0276, + "step": 15375 + }, + { + "epoch": 2.785313799963827, + "grad_norm": 0.21064484119415283, + "learning_rate": 7.348867194695155e-07, + "loss": 0.0104, + "step": 15400 + }, + { + "epoch": 2.7898354132754566, + "grad_norm": 39.60307312011719, + "learning_rate": 7.344446491066495e-07, + "loss": 0.0408, + "step": 15425 + }, + { + "epoch": 2.794357026587086, + "grad_norm": 1.9793568849563599, + "learning_rate": 7.339841591453306e-07, + "loss": 0.059, + "step": 15450 + }, + { + "epoch": 2.798878639898716, + "grad_norm": 0.8816681504249573, + "learning_rate": 7.335236691840118e-07, + "loss": 0.069, + "step": 15475 + }, + { + "epoch": 2.8034002532103455, + "grad_norm": 1.217022180557251, + "learning_rate": 7.330631792226929e-07, + "loss": 0.0182, + "step": 15500 + }, + { + "epoch": 2.807921866521975, + "grad_norm": 10.408825874328613, + "learning_rate": 7.32602689261374e-07, + "loss": 0.0192, + "step": 15525 + }, + { + "epoch": 2.812443479833605, + "grad_norm": 0.13745278120040894, + "learning_rate": 7.321421993000553e-07, + "loss": 0.02, + "step": 15550 + }, + { + "epoch": 2.8169650931452344, + "grad_norm": 4.462474346160889, + "learning_rate": 7.316817093387364e-07, + "loss": 0.0229, + "step": 15575 + }, + { + "epoch": 2.821486706456864, + "grad_norm": 0.2782382369041443, + "learning_rate": 
7.312212193774175e-07, + "loss": 0.0209, + "step": 15600 + }, + { + "epoch": 2.8260083197684933, + "grad_norm": 9.691481590270996, + "learning_rate": 7.307607294160987e-07, + "loss": 0.0226, + "step": 15625 + }, + { + "epoch": 2.830529933080123, + "grad_norm": 0.038128096610307693, + "learning_rate": 7.303002394547798e-07, + "loss": 0.0076, + "step": 15650 + }, + { + "epoch": 2.8350515463917527, + "grad_norm": 3.7006304264068604, + "learning_rate": 7.29839749493461e-07, + "loss": 0.0116, + "step": 15675 + }, + { + "epoch": 2.839573159703382, + "grad_norm": 0.9358986616134644, + "learning_rate": 7.293792595321422e-07, + "loss": 0.0176, + "step": 15700 + }, + { + "epoch": 2.8440947730150117, + "grad_norm": 6.940121173858643, + "learning_rate": 7.289187695708234e-07, + "loss": 0.018, + "step": 15725 + }, + { + "epoch": 2.8486163863266416, + "grad_norm": 0.16799919307231903, + "learning_rate": 7.284582796095044e-07, + "loss": 0.0191, + "step": 15750 + }, + { + "epoch": 2.853137999638271, + "grad_norm": 8.835237503051758, + "learning_rate": 7.279977896481856e-07, + "loss": 0.023, + "step": 15775 + }, + { + "epoch": 2.8576596129499006, + "grad_norm": 41.18638229370117, + "learning_rate": 7.275372996868669e-07, + "loss": 0.0451, + "step": 15800 + }, + { + "epoch": 2.86218122626153, + "grad_norm": 7.924957752227783, + "learning_rate": 7.270768097255479e-07, + "loss": 0.017, + "step": 15825 + }, + { + "epoch": 2.8667028395731595, + "grad_norm": 0.04123552888631821, + "learning_rate": 7.266163197642291e-07, + "loss": 0.0736, + "step": 15850 + }, + { + "epoch": 2.8712244528847894, + "grad_norm": 0.38446855545043945, + "learning_rate": 7.261558298029103e-07, + "loss": 0.0227, + "step": 15875 + }, + { + "epoch": 2.875746066196419, + "grad_norm": 2.7168097496032715, + "learning_rate": 7.256953398415914e-07, + "loss": 0.0294, + "step": 15900 + }, + { + "epoch": 2.8802676795080484, + "grad_norm": 6.317012786865234, + "learning_rate": 7.252348498802726e-07, + "loss": 0.0313, + 
"step": 15925 + }, + { + "epoch": 2.8847892928196783, + "grad_norm": 7.5447821617126465, + "learning_rate": 7.247743599189538e-07, + "loss": 0.02, + "step": 15950 + }, + { + "epoch": 2.889310906131308, + "grad_norm": 0.06321001052856445, + "learning_rate": 7.243138699576349e-07, + "loss": 0.0197, + "step": 15975 + }, + { + "epoch": 2.8938325194429373, + "grad_norm": 0.14826831221580505, + "learning_rate": 7.23853379996316e-07, + "loss": 0.0111, + "step": 16000 + }, + { + "epoch": 2.8983541327545668, + "grad_norm": 10.19642448425293, + "learning_rate": 7.233928900349971e-07, + "loss": 0.0235, + "step": 16025 + }, + { + "epoch": 2.9028757460661962, + "grad_norm": 0.09074613451957703, + "learning_rate": 7.229324000736784e-07, + "loss": 0.0177, + "step": 16050 + }, + { + "epoch": 2.907397359377826, + "grad_norm": 50.30870819091797, + "learning_rate": 7.224719101123595e-07, + "loss": 0.0166, + "step": 16075 + }, + { + "epoch": 2.9119189726894557, + "grad_norm": 0.3094925880432129, + "learning_rate": 7.220114201510406e-07, + "loss": 0.0154, + "step": 16100 + }, + { + "epoch": 2.916440586001085, + "grad_norm": 0.837853193283081, + "learning_rate": 7.215509301897218e-07, + "loss": 0.0331, + "step": 16125 + }, + { + "epoch": 2.9209621993127146, + "grad_norm": 1.2888524532318115, + "learning_rate": 7.210904402284031e-07, + "loss": 0.0331, + "step": 16150 + }, + { + "epoch": 2.9254838126243445, + "grad_norm": 0.05001299828290939, + "learning_rate": 7.206299502670841e-07, + "loss": 0.021, + "step": 16175 + }, + { + "epoch": 2.930005425935974, + "grad_norm": 4.152213096618652, + "learning_rate": 7.201694603057653e-07, + "loss": 0.0376, + "step": 16200 + }, + { + "epoch": 2.9345270392476035, + "grad_norm": 3.798003911972046, + "learning_rate": 7.197089703444465e-07, + "loss": 0.0612, + "step": 16225 + }, + { + "epoch": 2.939048652559233, + "grad_norm": 0.4344462454319, + "learning_rate": 7.192484803831275e-07, + "loss": 0.0331, + "step": 16250 + }, + { + "epoch": 
2.9435702658708625, + "grad_norm": 12.142309188842773, + "learning_rate": 7.187879904218088e-07, + "loss": 0.0369, + "step": 16275 + }, + { + "epoch": 2.9480918791824924, + "grad_norm": 1.0713512897491455, + "learning_rate": 7.1832750046049e-07, + "loss": 0.0201, + "step": 16300 + }, + { + "epoch": 2.952613492494122, + "grad_norm": 12.534268379211426, + "learning_rate": 7.17867010499171e-07, + "loss": 0.0232, + "step": 16325 + }, + { + "epoch": 2.9571351058057513, + "grad_norm": 2.9991679191589355, + "learning_rate": 7.174065205378522e-07, + "loss": 0.0212, + "step": 16350 + }, + { + "epoch": 2.9616567191173813, + "grad_norm": 0.7451911568641663, + "learning_rate": 7.169460305765334e-07, + "loss": 0.0128, + "step": 16375 + }, + { + "epoch": 2.9661783324290107, + "grad_norm": 0.15179577469825745, + "learning_rate": 7.164855406152146e-07, + "loss": 0.0148, + "step": 16400 + }, + { + "epoch": 2.9706999457406402, + "grad_norm": 0.2197951227426529, + "learning_rate": 7.160250506538957e-07, + "loss": 0.0202, + "step": 16425 + }, + { + "epoch": 2.9752215590522697, + "grad_norm": 8.621418952941895, + "learning_rate": 7.155645606925769e-07, + "loss": 0.0297, + "step": 16450 + }, + { + "epoch": 2.979743172363899, + "grad_norm": 1.0757030248641968, + "learning_rate": 7.15104070731258e-07, + "loss": 0.0164, + "step": 16475 + }, + { + "epoch": 2.984264785675529, + "grad_norm": 7.468885898590088, + "learning_rate": 7.146435807699391e-07, + "loss": 0.0222, + "step": 16500 + }, + { + "epoch": 2.9887863989871586, + "grad_norm": 22.351499557495117, + "learning_rate": 7.141830908086204e-07, + "loss": 0.032, + "step": 16525 + }, + { + "epoch": 2.993308012298788, + "grad_norm": 1.1031124591827393, + "learning_rate": 7.137226008473015e-07, + "loss": 0.0178, + "step": 16550 + }, + { + "epoch": 2.997829625610418, + "grad_norm": 0.33143824338912964, + "learning_rate": 7.132621108859827e-07, + "loss": 0.0183, + "step": 16575 + }, + { + "epoch": 3.0, + "eval_loss": 0.28094977140426636, + 
"eval_runtime": 8563.7182, + "eval_samples_per_second": 1.109, + "eval_steps_per_second": 0.139, + "eval_wer": 0.10791159513132607, + "step": 16587 + }, + { + "epoch": 3.0023512389220475, + "grad_norm": 2.0394110679626465, + "learning_rate": 7.128016209246638e-07, + "loss": 0.0198, + "step": 16600 + }, + { + "epoch": 3.006872852233677, + "grad_norm": 0.34627845883369446, + "learning_rate": 7.123411309633449e-07, + "loss": 0.0163, + "step": 16625 + }, + { + "epoch": 3.0113944655453064, + "grad_norm": 0.1096586138010025, + "learning_rate": 7.118806410020262e-07, + "loss": 0.012, + "step": 16650 + }, + { + "epoch": 3.0159160788569364, + "grad_norm": 0.8496006727218628, + "learning_rate": 7.114201510407073e-07, + "loss": 0.0078, + "step": 16675 + }, + { + "epoch": 3.020437692168566, + "grad_norm": 0.45903900265693665, + "learning_rate": 7.109596610793884e-07, + "loss": 0.0113, + "step": 16700 + }, + { + "epoch": 3.0249593054801953, + "grad_norm": 0.18622107803821564, + "learning_rate": 7.104991711180696e-07, + "loss": 0.0177, + "step": 16725 + }, + { + "epoch": 3.029480918791825, + "grad_norm": 0.07214221358299255, + "learning_rate": 7.100386811567507e-07, + "loss": 0.0167, + "step": 16750 + }, + { + "epoch": 3.0340025321034547, + "grad_norm": 1.387891173362732, + "learning_rate": 7.095781911954319e-07, + "loss": 0.0085, + "step": 16775 + }, + { + "epoch": 3.038524145415084, + "grad_norm": 79.67159271240234, + "learning_rate": 7.091177012341131e-07, + "loss": 0.0121, + "step": 16800 + }, + { + "epoch": 3.0430457587267137, + "grad_norm": 11.743141174316406, + "learning_rate": 7.086572112727943e-07, + "loss": 0.0093, + "step": 16825 + }, + { + "epoch": 3.047567372038343, + "grad_norm": 18.43634605407715, + "learning_rate": 7.081967213114753e-07, + "loss": 0.0224, + "step": 16850 + }, + { + "epoch": 3.052088985349973, + "grad_norm": 0.045197684317827225, + "learning_rate": 7.077362313501566e-07, + "loss": 0.0062, + "step": 16875 + }, + { + "epoch": 3.0566105986616026, + 
"grad_norm": 0.12885162234306335, + "learning_rate": 7.072757413888378e-07, + "loss": 0.0214, + "step": 16900 + }, + { + "epoch": 3.061132211973232, + "grad_norm": 0.09277495741844177, + "learning_rate": 7.068152514275188e-07, + "loss": 0.0275, + "step": 16925 + }, + { + "epoch": 3.0656538252848615, + "grad_norm": 15.500121116638184, + "learning_rate": 7.063547614662e-07, + "loss": 0.0328, + "step": 16950 + }, + { + "epoch": 3.0701754385964914, + "grad_norm": 21.585752487182617, + "learning_rate": 7.058942715048811e-07, + "loss": 0.0304, + "step": 16975 + }, + { + "epoch": 3.074697051908121, + "grad_norm": 2.1149275302886963, + "learning_rate": 7.054337815435624e-07, + "loss": 0.0315, + "step": 17000 + }, + { + "epoch": 3.0792186652197504, + "grad_norm": 0.16735130548477173, + "learning_rate": 7.049732915822435e-07, + "loss": 0.0136, + "step": 17025 + }, + { + "epoch": 3.08374027853138, + "grad_norm": 0.7384225130081177, + "learning_rate": 7.045128016209246e-07, + "loss": 0.0059, + "step": 17050 + }, + { + "epoch": 3.0882618918430094, + "grad_norm": 0.19367511570453644, + "learning_rate": 7.040523116596058e-07, + "loss": 0.01, + "step": 17075 + }, + { + "epoch": 3.0927835051546393, + "grad_norm": 0.5572232604026794, + "learning_rate": 7.035918216982869e-07, + "loss": 0.0135, + "step": 17100 + }, + { + "epoch": 3.0973051184662688, + "grad_norm": 0.05385562777519226, + "learning_rate": 7.031313317369681e-07, + "loss": 0.0184, + "step": 17125 + }, + { + "epoch": 3.1018267317778982, + "grad_norm": 4.722483158111572, + "learning_rate": 7.026708417756493e-07, + "loss": 0.0039, + "step": 17150 + }, + { + "epoch": 3.1063483450895277, + "grad_norm": 0.6491204500198364, + "learning_rate": 7.022103518143304e-07, + "loss": 0.0147, + "step": 17175 + }, + { + "epoch": 3.1108699584011577, + "grad_norm": 0.014111626893281937, + "learning_rate": 7.017498618530115e-07, + "loss": 0.0119, + "step": 17200 + }, + { + "epoch": 3.115391571712787, + "grad_norm": 5.42165470123291, + 
"learning_rate": 7.012893718916927e-07, + "loss": 0.0116, + "step": 17225 + }, + { + "epoch": 3.1199131850244166, + "grad_norm": 7.389101982116699, + "learning_rate": 7.00828881930374e-07, + "loss": 0.0103, + "step": 17250 + }, + { + "epoch": 3.124434798336046, + "grad_norm": 0.3708292841911316, + "learning_rate": 7.00368391969055e-07, + "loss": 0.0074, + "step": 17275 + }, + { + "epoch": 3.128956411647676, + "grad_norm": 0.08553273230791092, + "learning_rate": 6.999079020077362e-07, + "loss": 0.0081, + "step": 17300 + }, + { + "epoch": 3.1334780249593055, + "grad_norm": 5.075300216674805, + "learning_rate": 6.994474120464174e-07, + "loss": 0.0204, + "step": 17325 + }, + { + "epoch": 3.137999638270935, + "grad_norm": 8.977065086364746, + "learning_rate": 6.989869220850984e-07, + "loss": 0.0095, + "step": 17350 + }, + { + "epoch": 3.1425212515825645, + "grad_norm": 3.1058037281036377, + "learning_rate": 6.985264321237797e-07, + "loss": 0.0618, + "step": 17375 + }, + { + "epoch": 3.1470428648941944, + "grad_norm": 4.081079959869385, + "learning_rate": 6.980659421624609e-07, + "loss": 0.0441, + "step": 17400 + }, + { + "epoch": 3.151564478205824, + "grad_norm": 32.8135986328125, + "learning_rate": 6.976054522011419e-07, + "loss": 0.0182, + "step": 17425 + }, + { + "epoch": 3.1560860915174533, + "grad_norm": 0.21175616979599, + "learning_rate": 6.971449622398231e-07, + "loss": 0.0125, + "step": 17450 + }, + { + "epoch": 3.160607704829083, + "grad_norm": 0.16236978769302368, + "learning_rate": 6.966844722785043e-07, + "loss": 0.0121, + "step": 17475 + }, + { + "epoch": 3.1651293181407127, + "grad_norm": 35.438865661621094, + "learning_rate": 6.962239823171855e-07, + "loss": 0.0111, + "step": 17500 + }, + { + "epoch": 3.1696509314523422, + "grad_norm": 0.0932527482509613, + "learning_rate": 6.957634923558666e-07, + "loss": 0.0118, + "step": 17525 + }, + { + "epoch": 3.1741725447639717, + "grad_norm": 9.384880065917969, + "learning_rate": 6.953030023945478e-07, + "loss": 
0.0151, + "step": 17550 + }, + { + "epoch": 3.178694158075601, + "grad_norm": 0.24784910678863525, + "learning_rate": 6.948425124332289e-07, + "loss": 0.0106, + "step": 17575 + }, + { + "epoch": 3.183215771387231, + "grad_norm": 0.9011788964271545, + "learning_rate": 6.943820224719101e-07, + "loss": 0.0109, + "step": 17600 + }, + { + "epoch": 3.1877373846988606, + "grad_norm": 1.6620635986328125, + "learning_rate": 6.939215325105913e-07, + "loss": 0.0124, + "step": 17625 + }, + { + "epoch": 3.19225899801049, + "grad_norm": 0.09768769890069962, + "learning_rate": 6.934610425492724e-07, + "loss": 0.0131, + "step": 17650 + }, + { + "epoch": 3.1967806113221195, + "grad_norm": 3.7721073627471924, + "learning_rate": 6.930005525879535e-07, + "loss": 0.0315, + "step": 17675 + }, + { + "epoch": 3.2013022246337495, + "grad_norm": 4.030418872833252, + "learning_rate": 6.925400626266346e-07, + "loss": 0.0216, + "step": 17700 + }, + { + "epoch": 3.205823837945379, + "grad_norm": 16.64514923095703, + "learning_rate": 6.920795726653159e-07, + "loss": 0.0309, + "step": 17725 + }, + { + "epoch": 3.2103454512570084, + "grad_norm": 0.3508654236793518, + "learning_rate": 6.916190827039971e-07, + "loss": 0.0075, + "step": 17750 + }, + { + "epoch": 3.214867064568638, + "grad_norm": 0.19037003815174103, + "learning_rate": 6.911585927426781e-07, + "loss": 0.0547, + "step": 17775 + }, + { + "epoch": 3.219388677880268, + "grad_norm": 0.2579357624053955, + "learning_rate": 6.906981027813593e-07, + "loss": 0.0483, + "step": 17800 + }, + { + "epoch": 3.2239102911918973, + "grad_norm": 0.3658471405506134, + "learning_rate": 6.902376128200405e-07, + "loss": 0.0135, + "step": 17825 + }, + { + "epoch": 3.228431904503527, + "grad_norm": 7.055116176605225, + "learning_rate": 6.897771228587216e-07, + "loss": 0.0192, + "step": 17850 + }, + { + "epoch": 3.2329535178151563, + "grad_norm": 0.09685884416103363, + "learning_rate": 6.893166328974028e-07, + "loss": 0.0104, + "step": 17875 + }, + { + "epoch": 
3.237475131126786, + "grad_norm": 0.2775970697402954, + "learning_rate": 6.88856142936084e-07, + "loss": 0.0112, + "step": 17900 + }, + { + "epoch": 3.2419967444384157, + "grad_norm": 0.04564272239804268, + "learning_rate": 6.883956529747651e-07, + "loss": 0.0167, + "step": 17925 + }, + { + "epoch": 3.246518357750045, + "grad_norm": 0.1531757116317749, + "learning_rate": 6.879351630134462e-07, + "loss": 0.0034, + "step": 17950 + }, + { + "epoch": 3.2510399710616746, + "grad_norm": 0.12003475427627563, + "learning_rate": 6.874746730521275e-07, + "loss": 0.0287, + "step": 17975 + }, + { + "epoch": 3.2555615843733046, + "grad_norm": 0.1408698409795761, + "learning_rate": 6.870141830908086e-07, + "loss": 0.0058, + "step": 18000 + }, + { + "epoch": 3.260083197684934, + "grad_norm": 0.21101155877113342, + "learning_rate": 6.865536931294897e-07, + "loss": 0.0224, + "step": 18025 + }, + { + "epoch": 3.2646048109965635, + "grad_norm": 3.5580930709838867, + "learning_rate": 6.861116227666237e-07, + "loss": 0.0336, + "step": 18050 + }, + { + "epoch": 3.269126424308193, + "grad_norm": 3.6865243911743164, + "learning_rate": 6.856511328053049e-07, + "loss": 0.0253, + "step": 18075 + }, + { + "epoch": 3.273648037619823, + "grad_norm": 0.7238378524780273, + "learning_rate": 6.85190642843986e-07, + "loss": 0.0028, + "step": 18100 + }, + { + "epoch": 3.2781696509314524, + "grad_norm": 0.032546211034059525, + "learning_rate": 6.847301528826671e-07, + "loss": 0.011, + "step": 18125 + }, + { + "epoch": 3.282691264243082, + "grad_norm": 0.2295057624578476, + "learning_rate": 6.842696629213482e-07, + "loss": 0.0241, + "step": 18150 + }, + { + "epoch": 3.2872128775547114, + "grad_norm": 0.37478190660476685, + "learning_rate": 6.838091729600295e-07, + "loss": 0.0487, + "step": 18175 + }, + { + "epoch": 3.2917344908663413, + "grad_norm": 2.0155792236328125, + "learning_rate": 6.833486829987106e-07, + "loss": 0.0273, + "step": 18200 + }, + { + "epoch": 3.2962561041779708, + "grad_norm": 
1.1389836072921753, + "learning_rate": 6.828881930373917e-07, + "loss": 0.0132, + "step": 18225 + }, + { + "epoch": 3.3007777174896002, + "grad_norm": 7.253364086151123, + "learning_rate": 6.824277030760729e-07, + "loss": 0.0085, + "step": 18250 + }, + { + "epoch": 3.3052993308012297, + "grad_norm": 17.829906463623047, + "learning_rate": 6.819672131147541e-07, + "loss": 0.0189, + "step": 18275 + }, + { + "epoch": 3.3098209441128597, + "grad_norm": 3.49690842628479, + "learning_rate": 6.815067231534352e-07, + "loss": 0.01, + "step": 18300 + }, + { + "epoch": 3.314342557424489, + "grad_norm": 0.6538553237915039, + "learning_rate": 6.810462331921164e-07, + "loss": 0.0097, + "step": 18325 + }, + { + "epoch": 3.3188641707361186, + "grad_norm": 29.16781234741211, + "learning_rate": 6.805857432307976e-07, + "loss": 0.0162, + "step": 18350 + }, + { + "epoch": 3.323385784047748, + "grad_norm": 0.38442912697792053, + "learning_rate": 6.801252532694786e-07, + "loss": 0.013, + "step": 18375 + }, + { + "epoch": 3.327907397359378, + "grad_norm": 16.521053314208984, + "learning_rate": 6.796647633081598e-07, + "loss": 0.0153, + "step": 18400 + }, + { + "epoch": 3.3324290106710075, + "grad_norm": 1.947344422340393, + "learning_rate": 6.792042733468411e-07, + "loss": 0.0088, + "step": 18425 + }, + { + "epoch": 3.336950623982637, + "grad_norm": 0.08833196014165878, + "learning_rate": 6.787437833855221e-07, + "loss": 0.0169, + "step": 18450 + }, + { + "epoch": 3.3414722372942665, + "grad_norm": 0.04213396832346916, + "learning_rate": 6.782832934242033e-07, + "loss": 0.0048, + "step": 18475 + }, + { + "epoch": 3.345993850605896, + "grad_norm": 14.339350700378418, + "learning_rate": 6.778228034628845e-07, + "loss": 0.0322, + "step": 18500 + }, + { + "epoch": 3.350515463917526, + "grad_norm": 2.3550896644592285, + "learning_rate": 6.773623135015656e-07, + "loss": 0.0079, + "step": 18525 + }, + { + "epoch": 3.3550370772291553, + "grad_norm": 0.04377694055438042, + "learning_rate": 
6.769018235402468e-07, + "loss": 0.0408, + "step": 18550 + }, + { + "epoch": 3.359558690540785, + "grad_norm": 66.76377868652344, + "learning_rate": 6.76441333578928e-07, + "loss": 0.0646, + "step": 18575 + }, + { + "epoch": 3.3640803038524147, + "grad_norm": 24.474994659423828, + "learning_rate": 6.759808436176091e-07, + "loss": 0.0251, + "step": 18600 + }, + { + "epoch": 3.3686019171640442, + "grad_norm": 4.777419090270996, + "learning_rate": 6.755203536562902e-07, + "loss": 0.0127, + "step": 18625 + }, + { + "epoch": 3.3731235304756737, + "grad_norm": 1.3446674346923828, + "learning_rate": 6.750598636949714e-07, + "loss": 0.0246, + "step": 18650 + }, + { + "epoch": 3.377645143787303, + "grad_norm": 5.482788562774658, + "learning_rate": 6.745993737336526e-07, + "loss": 0.0141, + "step": 18675 + }, + { + "epoch": 3.3821667570989327, + "grad_norm": 1.5254578590393066, + "learning_rate": 6.741388837723337e-07, + "loss": 0.019, + "step": 18700 + }, + { + "epoch": 3.3866883704105626, + "grad_norm": 2.620288372039795, + "learning_rate": 6.736783938110149e-07, + "loss": 0.0257, + "step": 18725 + }, + { + "epoch": 3.391209983722192, + "grad_norm": 5.762913703918457, + "learning_rate": 6.73217903849696e-07, + "loss": 0.0033, + "step": 18750 + }, + { + "epoch": 3.3957315970338215, + "grad_norm": 5.787083625793457, + "learning_rate": 6.727574138883773e-07, + "loss": 0.0093, + "step": 18775 + }, + { + "epoch": 3.4002532103454515, + "grad_norm": 2.400695562362671, + "learning_rate": 6.722969239270584e-07, + "loss": 0.0103, + "step": 18800 + }, + { + "epoch": 3.404774823657081, + "grad_norm": 0.05644823983311653, + "learning_rate": 6.718364339657395e-07, + "loss": 0.0058, + "step": 18825 + }, + { + "epoch": 3.4092964369687104, + "grad_norm": 0.2891055941581726, + "learning_rate": 6.713759440044207e-07, + "loss": 0.0188, + "step": 18850 + }, + { + "epoch": 3.41381805028034, + "grad_norm": 0.8349182605743408, + "learning_rate": 6.709154540431017e-07, + "loss": 0.0295, + "step": 
18875 + }, + { + "epoch": 3.4183396635919694, + "grad_norm": 0.06508205085992813, + "learning_rate": 6.70454964081783e-07, + "loss": 0.0079, + "step": 18900 + }, + { + "epoch": 3.4228612769035993, + "grad_norm": 8.653030395507812, + "learning_rate": 6.699944741204642e-07, + "loss": 0.0152, + "step": 18925 + }, + { + "epoch": 3.427382890215229, + "grad_norm": 0.2311927080154419, + "learning_rate": 6.695339841591454e-07, + "loss": 0.0414, + "step": 18950 + }, + { + "epoch": 3.4319045035268583, + "grad_norm": 55.472957611083984, + "learning_rate": 6.690734941978264e-07, + "loss": 0.0386, + "step": 18975 + }, + { + "epoch": 3.436426116838488, + "grad_norm": 0.4529309570789337, + "learning_rate": 6.686130042365076e-07, + "loss": 0.0494, + "step": 19000 + }, + { + "epoch": 3.4409477301501177, + "grad_norm": 1.5187815427780151, + "learning_rate": 6.681525142751889e-07, + "loss": 0.0166, + "step": 19025 + }, + { + "epoch": 3.445469343461747, + "grad_norm": 0.964755654335022, + "learning_rate": 6.676920243138699e-07, + "loss": 0.0076, + "step": 19050 + }, + { + "epoch": 3.4499909567733766, + "grad_norm": 22.352828979492188, + "learning_rate": 6.672315343525511e-07, + "loss": 0.019, + "step": 19075 + }, + { + "epoch": 3.454512570085006, + "grad_norm": 0.08743809163570404, + "learning_rate": 6.667710443912322e-07, + "loss": 0.0132, + "step": 19100 + }, + { + "epoch": 3.459034183396636, + "grad_norm": 0.2849852740764618, + "learning_rate": 6.663105544299133e-07, + "loss": 0.0206, + "step": 19125 + }, + { + "epoch": 3.4635557967082655, + "grad_norm": 1.0750819444656372, + "learning_rate": 6.658500644685946e-07, + "loss": 0.0206, + "step": 19150 + }, + { + "epoch": 3.468077410019895, + "grad_norm": 2.5783636569976807, + "learning_rate": 6.653895745072757e-07, + "loss": 0.0086, + "step": 19175 + }, + { + "epoch": 3.472599023331525, + "grad_norm": 3.71582293510437, + "learning_rate": 6.649290845459569e-07, + "loss": 0.0055, + "step": 19200 + }, + { + "epoch": 3.4771206366431544, + 
"grad_norm": 0.3580998182296753, + "learning_rate": 6.64468594584638e-07, + "loss": 0.0128, + "step": 19225 + }, + { + "epoch": 3.481642249954784, + "grad_norm": 20.23494529724121, + "learning_rate": 6.640081046233191e-07, + "loss": 0.032, + "step": 19250 + }, + { + "epoch": 3.4861638632664134, + "grad_norm": 0.634772539138794, + "learning_rate": 6.635476146620004e-07, + "loss": 0.0041, + "step": 19275 + }, + { + "epoch": 3.490685476578043, + "grad_norm": 9.67199993133545, + "learning_rate": 6.630871247006815e-07, + "loss": 0.0264, + "step": 19300 + }, + { + "epoch": 3.4952070898896728, + "grad_norm": 0.13094273209571838, + "learning_rate": 6.626266347393626e-07, + "loss": 0.0177, + "step": 19325 + }, + { + "epoch": 3.4997287032013022, + "grad_norm": 57.88798141479492, + "learning_rate": 6.621661447780438e-07, + "loss": 0.0171, + "step": 19350 + }, + { + "epoch": 3.5042503165129317, + "grad_norm": 0.32330283522605896, + "learning_rate": 6.617056548167249e-07, + "loss": 0.0333, + "step": 19375 + }, + { + "epoch": 3.5087719298245617, + "grad_norm": 2.477999210357666, + "learning_rate": 6.612451648554061e-07, + "loss": 0.0257, + "step": 19400 + }, + { + "epoch": 3.513293543136191, + "grad_norm": 4.913309097290039, + "learning_rate": 6.607846748940873e-07, + "loss": 0.014, + "step": 19425 + }, + { + "epoch": 3.5178151564478206, + "grad_norm": 1.5322345495224, + "learning_rate": 6.603241849327685e-07, + "loss": 0.0214, + "step": 19450 + }, + { + "epoch": 3.52233676975945, + "grad_norm": 0.07133228331804276, + "learning_rate": 6.598636949714495e-07, + "loss": 0.0148, + "step": 19475 + }, + { + "epoch": 3.5268583830710796, + "grad_norm": 13.74171257019043, + "learning_rate": 6.594032050101308e-07, + "loss": 0.0121, + "step": 19500 + }, + { + "epoch": 3.5313799963827095, + "grad_norm": 27.33115577697754, + "learning_rate": 6.58942715048812e-07, + "loss": 0.013, + "step": 19525 + }, + { + "epoch": 3.535901609694339, + "grad_norm": 0.22666044533252716, + "learning_rate": 
6.58482225087493e-07, + "loss": 0.0145, + "step": 19550 + }, + { + "epoch": 3.5404232230059685, + "grad_norm": 13.335356712341309, + "learning_rate": 6.580217351261742e-07, + "loss": 0.0061, + "step": 19575 + }, + { + "epoch": 3.5449448363175984, + "grad_norm": 4.350337505340576, + "learning_rate": 6.575612451648554e-07, + "loss": 0.0038, + "step": 19600 + }, + { + "epoch": 3.549466449629228, + "grad_norm": 0.13537144660949707, + "learning_rate": 6.571007552035366e-07, + "loss": 0.0125, + "step": 19625 + }, + { + "epoch": 3.5539880629408573, + "grad_norm": 56.16394805908203, + "learning_rate": 6.566402652422177e-07, + "loss": 0.0213, + "step": 19650 + }, + { + "epoch": 3.558509676252487, + "grad_norm": 0.10288344323635101, + "learning_rate": 6.561797752808989e-07, + "loss": 0.0156, + "step": 19675 + }, + { + "epoch": 3.5630312895641163, + "grad_norm": 3.5337467193603516, + "learning_rate": 6.5571928531958e-07, + "loss": 0.019, + "step": 19700 + }, + { + "epoch": 3.5675529028757462, + "grad_norm": 0.20034463703632355, + "learning_rate": 6.552587953582611e-07, + "loss": 0.05, + "step": 19725 + }, + { + "epoch": 3.5720745161873757, + "grad_norm": 0.22690874338150024, + "learning_rate": 6.547983053969424e-07, + "loss": 0.0217, + "step": 19750 + }, + { + "epoch": 3.576596129499005, + "grad_norm": 26.994596481323242, + "learning_rate": 6.543378154356235e-07, + "loss": 0.0343, + "step": 19775 + }, + { + "epoch": 3.581117742810635, + "grad_norm": 5.310791492462158, + "learning_rate": 6.538773254743046e-07, + "loss": 0.0325, + "step": 19800 + }, + { + "epoch": 3.5856393561222646, + "grad_norm": 1.536670207977295, + "learning_rate": 6.534168355129857e-07, + "loss": 0.0057, + "step": 19825 + }, + { + "epoch": 3.590160969433894, + "grad_norm": 4.136430263519287, + "learning_rate": 6.529563455516669e-07, + "loss": 0.0292, + "step": 19850 + }, + { + "epoch": 3.5946825827455235, + "grad_norm": 0.5038244724273682, + "learning_rate": 6.524958555903482e-07, + "loss": 0.0159, + 
"step": 19875 + }, + { + "epoch": 3.599204196057153, + "grad_norm": 0.03355490043759346, + "learning_rate": 6.520353656290292e-07, + "loss": 0.0137, + "step": 19900 + }, + { + "epoch": 3.6037258093687825, + "grad_norm": 1.1418379545211792, + "learning_rate": 6.515748756677104e-07, + "loss": 0.0043, + "step": 19925 + }, + { + "epoch": 3.6082474226804124, + "grad_norm": 56.00255584716797, + "learning_rate": 6.511143857063916e-07, + "loss": 0.034, + "step": 19950 + }, + { + "epoch": 3.612769035992042, + "grad_norm": 0.5989301204681396, + "learning_rate": 6.506538957450726e-07, + "loss": 0.0094, + "step": 19975 + }, + { + "epoch": 3.6172906493036714, + "grad_norm": 5.834647178649902, + "learning_rate": 6.501934057837539e-07, + "loss": 0.0104, + "step": 20000 + }, + { + "epoch": 3.6218122626153013, + "grad_norm": 0.1490633189678192, + "learning_rate": 6.497329158224351e-07, + "loss": 0.0167, + "step": 20025 + }, + { + "epoch": 3.626333875926931, + "grad_norm": 18.232946395874023, + "learning_rate": 6.492724258611161e-07, + "loss": 0.0139, + "step": 20050 + }, + { + "epoch": 3.6308554892385603, + "grad_norm": 0.08834528177976608, + "learning_rate": 6.488119358997973e-07, + "loss": 0.0253, + "step": 20075 + }, + { + "epoch": 3.6353771025501898, + "grad_norm": 0.3102453351020813, + "learning_rate": 6.483698655369313e-07, + "loss": 0.0146, + "step": 20100 + }, + { + "epoch": 3.6398987158618192, + "grad_norm": 0.26993054151535034, + "learning_rate": 6.479093755756125e-07, + "loss": 0.0187, + "step": 20125 + }, + { + "epoch": 3.644420329173449, + "grad_norm": 0.2688346803188324, + "learning_rate": 6.474488856142935e-07, + "loss": 0.0304, + "step": 20150 + }, + { + "epoch": 3.6489419424850786, + "grad_norm": 0.11477080732584, + "learning_rate": 6.469883956529747e-07, + "loss": 0.0515, + "step": 20175 + }, + { + "epoch": 3.653463555796708, + "grad_norm": 2.2574214935302734, + "learning_rate": 6.46527905691656e-07, + "loss": 0.0248, + "step": 20200 + }, + { + "epoch": 
3.657985169108338, + "grad_norm": 0.9582886099815369, + "learning_rate": 6.460674157303371e-07, + "loss": 0.0128, + "step": 20225 + }, + { + "epoch": 3.6625067824199675, + "grad_norm": 17.577070236206055, + "learning_rate": 6.456069257690182e-07, + "loss": 0.0189, + "step": 20250 + }, + { + "epoch": 3.667028395731597, + "grad_norm": 8.872756004333496, + "learning_rate": 6.451464358076993e-07, + "loss": 0.0113, + "step": 20275 + }, + { + "epoch": 3.6715500090432265, + "grad_norm": 0.408764123916626, + "learning_rate": 6.446859458463805e-07, + "loss": 0.0155, + "step": 20300 + }, + { + "epoch": 3.676071622354856, + "grad_norm": 0.4208977520465851, + "learning_rate": 6.442254558850617e-07, + "loss": 0.0044, + "step": 20325 + }, + { + "epoch": 3.680593235666486, + "grad_norm": 0.39012089371681213, + "learning_rate": 6.437649659237428e-07, + "loss": 0.0161, + "step": 20350 + }, + { + "epoch": 3.6851148489781154, + "grad_norm": 0.0061714984476566315, + "learning_rate": 6.43304475962424e-07, + "loss": 0.0113, + "step": 20375 + }, + { + "epoch": 3.689636462289745, + "grad_norm": 0.03614750877022743, + "learning_rate": 6.428439860011051e-07, + "loss": 0.0049, + "step": 20400 + }, + { + "epoch": 3.6941580756013748, + "grad_norm": 0.015124999918043613, + "learning_rate": 6.423834960397862e-07, + "loss": 0.0111, + "step": 20425 + }, + { + "epoch": 3.6986796889130042, + "grad_norm": 1.7233740091323853, + "learning_rate": 6.419230060784675e-07, + "loss": 0.0185, + "step": 20450 + }, + { + "epoch": 3.7032013022246337, + "grad_norm": 1.9191359281539917, + "learning_rate": 6.414625161171487e-07, + "loss": 0.0043, + "step": 20475 + }, + { + "epoch": 3.707722915536263, + "grad_norm": 0.6458703875541687, + "learning_rate": 6.410020261558297e-07, + "loss": 0.0183, + "step": 20500 + }, + { + "epoch": 3.7122445288478927, + "grad_norm": 0.0197773240506649, + "learning_rate": 6.405415361945109e-07, + "loss": 0.0244, + "step": 20525 + }, + { + "epoch": 3.7167661421595226, + "grad_norm": 
0.8223174214363098, + "learning_rate": 6.400810462331921e-07, + "loss": 0.0311, + "step": 20550 + }, + { + "epoch": 3.721287755471152, + "grad_norm": 42.07274627685547, + "learning_rate": 6.396205562718732e-07, + "loss": 0.0578, + "step": 20575 + }, + { + "epoch": 3.7258093687827816, + "grad_norm": 0.2349138706922531, + "learning_rate": 6.391600663105544e-07, + "loss": 0.0253, + "step": 20600 + }, + { + "epoch": 3.7303309820944115, + "grad_norm": 0.18693946301937103, + "learning_rate": 6.386995763492356e-07, + "loss": 0.0104, + "step": 20625 + }, + { + "epoch": 3.734852595406041, + "grad_norm": 3.6481404304504395, + "learning_rate": 6.382390863879167e-07, + "loss": 0.0067, + "step": 20650 + }, + { + "epoch": 3.7393742087176705, + "grad_norm": 0.06991686671972275, + "learning_rate": 6.377785964265979e-07, + "loss": 0.0086, + "step": 20675 + }, + { + "epoch": 3.7438958220293, + "grad_norm": 0.06903531402349472, + "learning_rate": 6.373181064652791e-07, + "loss": 0.012, + "step": 20700 + }, + { + "epoch": 3.7484174353409294, + "grad_norm": 0.3118179142475128, + "learning_rate": 6.368576165039602e-07, + "loss": 0.0145, + "step": 20725 + }, + { + "epoch": 3.7529390486525593, + "grad_norm": 0.13410675525665283, + "learning_rate": 6.363971265426413e-07, + "loss": 0.0116, + "step": 20750 + }, + { + "epoch": 3.757460661964189, + "grad_norm": 6.128607273101807, + "learning_rate": 6.359366365813225e-07, + "loss": 0.0147, + "step": 20775 + }, + { + "epoch": 3.7619822752758183, + "grad_norm": 16.329174041748047, + "learning_rate": 6.354761466200037e-07, + "loss": 0.0289, + "step": 20800 + }, + { + "epoch": 3.7665038885874482, + "grad_norm": 0.022927312180399895, + "learning_rate": 6.350156566586848e-07, + "loss": 0.0194, + "step": 20825 + }, + { + "epoch": 3.7710255018990777, + "grad_norm": 0.08002068102359772, + "learning_rate": 6.34555166697366e-07, + "loss": 0.0276, + "step": 20850 + }, + { + "epoch": 3.775547115210707, + "grad_norm": 6.620906352996826, + "learning_rate": 
6.340946767360471e-07, + "loss": 0.0303, + "step": 20875 + }, + { + "epoch": 3.7800687285223367, + "grad_norm": 20.082733154296875, + "learning_rate": 6.336341867747283e-07, + "loss": 0.0285, + "step": 20900 + }, + { + "epoch": 3.784590341833966, + "grad_norm": 0.4795994758605957, + "learning_rate": 6.331736968134095e-07, + "loss": 0.0164, + "step": 20925 + }, + { + "epoch": 3.789111955145596, + "grad_norm": 0.2975751757621765, + "learning_rate": 6.327132068520906e-07, + "loss": 0.0246, + "step": 20950 + }, + { + "epoch": 3.7936335684572255, + "grad_norm": 1.9404077529907227, + "learning_rate": 6.322527168907718e-07, + "loss": 0.0274, + "step": 20975 + }, + { + "epoch": 3.798155181768855, + "grad_norm": 1.5092577934265137, + "learning_rate": 6.317922269294528e-07, + "loss": 0.0286, + "step": 21000 + }, + { + "epoch": 3.802676795080485, + "grad_norm": 3.025921106338501, + "learning_rate": 6.31331736968134e-07, + "loss": 0.0139, + "step": 21025 + }, + { + "epoch": 3.8071984083921144, + "grad_norm": 0.11709149926900864, + "learning_rate": 6.308712470068153e-07, + "loss": 0.0067, + "step": 21050 + }, + { + "epoch": 3.811720021703744, + "grad_norm": 3.3579108715057373, + "learning_rate": 6.304107570454963e-07, + "loss": 0.0077, + "step": 21075 + }, + { + "epoch": 3.8162416350153734, + "grad_norm": 4.9033050537109375, + "learning_rate": 6.299502670841775e-07, + "loss": 0.0124, + "step": 21100 + }, + { + "epoch": 3.820763248327003, + "grad_norm": 0.054667115211486816, + "learning_rate": 6.294897771228587e-07, + "loss": 0.0147, + "step": 21125 + }, + { + "epoch": 3.825284861638633, + "grad_norm": 20.414718627929688, + "learning_rate": 6.290292871615398e-07, + "loss": 0.018, + "step": 21150 + }, + { + "epoch": 3.8298064749502623, + "grad_norm": 0.12169066816568375, + "learning_rate": 6.28568797200221e-07, + "loss": 0.0266, + "step": 21175 + }, + { + "epoch": 3.8343280882618918, + "grad_norm": 0.21765799820423126, + "learning_rate": 6.281083072389022e-07, + "loss": 0.0061, + 
"step": 21200 + }, + { + "epoch": 3.8388497015735217, + "grad_norm": 0.43716397881507874, + "learning_rate": 6.276478172775833e-07, + "loss": 0.0114, + "step": 21225 + }, + { + "epoch": 3.843371314885151, + "grad_norm": 1.5071337223052979, + "learning_rate": 6.271873273162644e-07, + "loss": 0.0111, + "step": 21250 + }, + { + "epoch": 3.8478929281967806, + "grad_norm": 1.3820738792419434, + "learning_rate": 6.267268373549456e-07, + "loss": 0.0088, + "step": 21275 + }, + { + "epoch": 3.85241454150841, + "grad_norm": 18.006547927856445, + "learning_rate": 6.262663473936268e-07, + "loss": 0.0628, + "step": 21300 + }, + { + "epoch": 3.8569361548200396, + "grad_norm": 0.09640432149171829, + "learning_rate": 6.25805857432308e-07, + "loss": 0.0324, + "step": 21325 + }, + { + "epoch": 3.8614577681316695, + "grad_norm": 86.30760192871094, + "learning_rate": 6.253453674709891e-07, + "loss": 0.0348, + "step": 21350 + }, + { + "epoch": 3.865979381443299, + "grad_norm": 20.232952117919922, + "learning_rate": 6.248848775096702e-07, + "loss": 0.0564, + "step": 21375 + }, + { + "epoch": 3.8705009947549285, + "grad_norm": 0.07119341939687729, + "learning_rate": 6.244243875483515e-07, + "loss": 0.0311, + "step": 21400 + }, + { + "epoch": 3.8750226080665584, + "grad_norm": 1.7042196989059448, + "learning_rate": 6.239638975870326e-07, + "loss": 0.0114, + "step": 21425 + }, + { + "epoch": 3.879544221378188, + "grad_norm": 0.36282435059547424, + "learning_rate": 6.235034076257137e-07, + "loss": 0.0227, + "step": 21450 + }, + { + "epoch": 3.8840658346898174, + "grad_norm": 0.15856370329856873, + "learning_rate": 6.230429176643949e-07, + "loss": 0.0134, + "step": 21475 + }, + { + "epoch": 3.888587448001447, + "grad_norm": 22.399805068969727, + "learning_rate": 6.22582427703076e-07, + "loss": 0.0165, + "step": 21500 + }, + { + "epoch": 3.8931090613130763, + "grad_norm": 1.0881842374801636, + "learning_rate": 6.221219377417572e-07, + "loss": 0.0081, + "step": 21525 + }, + { + "epoch": 
3.8976306746247062, + "grad_norm": 0.025745024904608727, + "learning_rate": 6.216614477804384e-07, + "loss": 0.012, + "step": 21550 + }, + { + "epoch": 3.9021522879363357, + "grad_norm": 0.05021649971604347, + "learning_rate": 6.212009578191196e-07, + "loss": 0.0035, + "step": 21575 + }, + { + "epoch": 3.906673901247965, + "grad_norm": 0.08221199363470078, + "learning_rate": 6.207404678578006e-07, + "loss": 0.0131, + "step": 21600 + }, + { + "epoch": 3.911195514559595, + "grad_norm": 0.9112662672996521, + "learning_rate": 6.202799778964818e-07, + "loss": 0.0087, + "step": 21625 + }, + { + "epoch": 3.9157171278712246, + "grad_norm": 3.2704386711120605, + "learning_rate": 6.198194879351631e-07, + "loss": 0.0057, + "step": 21650 + }, + { + "epoch": 3.920238741182854, + "grad_norm": 0.2193220853805542, + "learning_rate": 6.193589979738441e-07, + "loss": 0.0154, + "step": 21675 + }, + { + "epoch": 3.9247603544944836, + "grad_norm": 1.7677456140518188, + "learning_rate": 6.188985080125253e-07, + "loss": 0.0338, + "step": 21700 + }, + { + "epoch": 3.929281967806113, + "grad_norm": 4.569864273071289, + "learning_rate": 6.184380180512065e-07, + "loss": 0.045, + "step": 21725 + }, + { + "epoch": 3.933803581117743, + "grad_norm": 1.5949956178665161, + "learning_rate": 6.179775280898875e-07, + "loss": 0.0126, + "step": 21750 + }, + { + "epoch": 3.9383251944293725, + "grad_norm": 0.5718483924865723, + "learning_rate": 6.175170381285688e-07, + "loss": 0.0407, + "step": 21775 + }, + { + "epoch": 3.942846807741002, + "grad_norm": 0.3438205122947693, + "learning_rate": 6.1705654816725e-07, + "loss": 0.0312, + "step": 21800 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 0.23237770795822144, + "learning_rate": 6.165960582059311e-07, + "loss": 0.0075, + "step": 21825 + }, + { + "epoch": 3.9518900343642613, + "grad_norm": 0.4722042381763458, + "learning_rate": 6.161355682446122e-07, + "loss": 0.0142, + "step": 21850 + }, + { + "epoch": 3.956411647675891, + "grad_norm": 
3.872162342071533, + "learning_rate": 6.156750782832933e-07, + "loss": 0.0061, + "step": 21875 + }, + { + "epoch": 3.9609332609875203, + "grad_norm": 12.684093475341797, + "learning_rate": 6.152145883219746e-07, + "loss": 0.0123, + "step": 21900 + }, + { + "epoch": 3.96545487429915, + "grad_norm": 1.105630874633789, + "learning_rate": 6.147540983606557e-07, + "loss": 0.0066, + "step": 21925 + }, + { + "epoch": 3.9699764876107793, + "grad_norm": 0.050917476415634155, + "learning_rate": 6.142936083993368e-07, + "loss": 0.0095, + "step": 21950 + }, + { + "epoch": 3.974498100922409, + "grad_norm": 6.288554668426514, + "learning_rate": 6.13833118438018e-07, + "loss": 0.0039, + "step": 21975 + }, + { + "epoch": 3.9790197142340387, + "grad_norm": 0.09361649304628372, + "learning_rate": 6.133726284766991e-07, + "loss": 0.0174, + "step": 22000 + }, + { + "epoch": 3.983541327545668, + "grad_norm": 0.36472177505493164, + "learning_rate": 6.129121385153803e-07, + "loss": 0.012, + "step": 22025 + }, + { + "epoch": 3.988062940857298, + "grad_norm": 0.4556725025177002, + "learning_rate": 6.124516485540615e-07, + "loss": 0.012, + "step": 22050 + }, + { + "epoch": 3.9925845541689275, + "grad_norm": 4.3723225593566895, + "learning_rate": 6.119911585927427e-07, + "loss": 0.0076, + "step": 22075 + }, + { + "epoch": 3.997106167480557, + "grad_norm": 0.04663983732461929, + "learning_rate": 6.115490882298766e-07, + "loss": 0.0322, + "step": 22100 + }, + { + "epoch": 4.0, + "eval_loss": 0.3087974786758423, + "eval_runtime": 8497.8744, + "eval_samples_per_second": 1.117, + "eval_steps_per_second": 0.14, + "eval_wer": 0.10576553491351698, + "step": 22116 + }, + { + "epoch": 4.0016277807921865, + "grad_norm": 0.3466642498970032, + "learning_rate": 6.110885982685577e-07, + "loss": 0.0404, + "step": 22125 + }, + { + "epoch": 4.006149394103816, + "grad_norm": 0.15252432227134705, + "learning_rate": 6.106281083072389e-07, + "loss": 0.0096, + "step": 22150 + }, + { + "epoch": 4.0106710074154455, 
+ "grad_norm": 19.444740295410156, + "learning_rate": 6.101676183459201e-07, + "loss": 0.0159, + "step": 22175 + }, + { + "epoch": 4.015192620727076, + "grad_norm": 10.760107040405273, + "learning_rate": 6.097071283846011e-07, + "loss": 0.0068, + "step": 22200 + }, + { + "epoch": 4.019714234038705, + "grad_norm": 0.07839302718639374, + "learning_rate": 6.092466384232824e-07, + "loss": 0.0026, + "step": 22225 + }, + { + "epoch": 4.024235847350335, + "grad_norm": 0.3171479105949402, + "learning_rate": 6.087861484619636e-07, + "loss": 0.0058, + "step": 22250 + }, + { + "epoch": 4.028757460661964, + "grad_norm": 0.2635548412799835, + "learning_rate": 6.083256585006446e-07, + "loss": 0.0104, + "step": 22275 + }, + { + "epoch": 4.033279073973594, + "grad_norm": 0.7347187995910645, + "learning_rate": 6.078651685393258e-07, + "loss": 0.0041, + "step": 22300 + }, + { + "epoch": 4.037800687285223, + "grad_norm": 0.03082045540213585, + "learning_rate": 6.074046785780069e-07, + "loss": 0.0023, + "step": 22325 + }, + { + "epoch": 4.042322300596853, + "grad_norm": 0.030160456895828247, + "learning_rate": 6.069441886166882e-07, + "loss": 0.0023, + "step": 22350 + }, + { + "epoch": 4.046843913908482, + "grad_norm": 22.204673767089844, + "learning_rate": 6.064836986553693e-07, + "loss": 0.0166, + "step": 22375 + }, + { + "epoch": 4.051365527220113, + "grad_norm": 5.911830425262451, + "learning_rate": 6.060232086940504e-07, + "loss": 0.0026, + "step": 22400 + }, + { + "epoch": 4.055887140531742, + "grad_norm": 0.5570561289787292, + "learning_rate": 6.055627187327316e-07, + "loss": 0.0142, + "step": 22425 + }, + { + "epoch": 4.0604087538433715, + "grad_norm": 0.12296438962221146, + "learning_rate": 6.051022287714127e-07, + "loss": 0.0138, + "step": 22450 + }, + { + "epoch": 4.064930367155001, + "grad_norm": 1.0237263441085815, + "learning_rate": 6.046417388100939e-07, + "loss": 0.0244, + "step": 22475 + }, + { + "epoch": 4.0694519804666305, + "grad_norm": 15.541071891784668, + 
"learning_rate": 6.041812488487751e-07, + "loss": 0.0174, + "step": 22500 + }, + { + "epoch": 4.07397359377826, + "grad_norm": 2.008063793182373, + "learning_rate": 6.037207588874562e-07, + "loss": 0.0325, + "step": 22525 + }, + { + "epoch": 4.078495207089889, + "grad_norm": 1.934888243675232, + "learning_rate": 6.032602689261373e-07, + "loss": 0.0051, + "step": 22550 + }, + { + "epoch": 4.083016820401519, + "grad_norm": 14.471735000610352, + "learning_rate": 6.027997789648186e-07, + "loss": 0.0077, + "step": 22575 + }, + { + "epoch": 4.087538433713149, + "grad_norm": 0.05337606742978096, + "learning_rate": 6.023392890034998e-07, + "loss": 0.012, + "step": 22600 + }, + { + "epoch": 4.092060047024779, + "grad_norm": 0.21558235585689545, + "learning_rate": 6.018787990421808e-07, + "loss": 0.0116, + "step": 22625 + }, + { + "epoch": 4.096581660336408, + "grad_norm": 11.160529136657715, + "learning_rate": 6.01418309080862e-07, + "loss": 0.0083, + "step": 22650 + }, + { + "epoch": 4.101103273648038, + "grad_norm": 0.33503735065460205, + "learning_rate": 6.009578191195432e-07, + "loss": 0.0071, + "step": 22675 + }, + { + "epoch": 4.105624886959667, + "grad_norm": 0.04343891143798828, + "learning_rate": 6.004973291582243e-07, + "loss": 0.0057, + "step": 22700 + }, + { + "epoch": 4.110146500271297, + "grad_norm": 0.017316769808530807, + "learning_rate": 6.000368391969055e-07, + "loss": 0.0085, + "step": 22725 + }, + { + "epoch": 4.114668113582926, + "grad_norm": 2.94707989692688, + "learning_rate": 5.995763492355867e-07, + "loss": 0.0085, + "step": 22750 + }, + { + "epoch": 4.119189726894556, + "grad_norm": 0.5166690349578857, + "learning_rate": 5.991158592742677e-07, + "loss": 0.0085, + "step": 22775 + }, + { + "epoch": 4.123711340206185, + "grad_norm": 0.27040883898735046, + "learning_rate": 5.986553693129489e-07, + "loss": 0.0115, + "step": 22800 + }, + { + "epoch": 4.1282329535178155, + "grad_norm": 0.2504747807979584, + "learning_rate": 5.981948793516302e-07, + 
"loss": 0.0193, + "step": 22825 + }, + { + "epoch": 4.132754566829445, + "grad_norm": 0.23941832780838013, + "learning_rate": 5.977343893903113e-07, + "loss": 0.0184, + "step": 22850 + }, + { + "epoch": 4.1372761801410745, + "grad_norm": 0.060054145753383636, + "learning_rate": 5.972738994289924e-07, + "loss": 0.0327, + "step": 22875 + }, + { + "epoch": 4.141797793452704, + "grad_norm": 2.8317267894744873, + "learning_rate": 5.968134094676736e-07, + "loss": 0.0261, + "step": 22900 + }, + { + "epoch": 4.146319406764333, + "grad_norm": 0.7358568906784058, + "learning_rate": 5.963529195063547e-07, + "loss": 0.0361, + "step": 22925 + }, + { + "epoch": 4.150841020075963, + "grad_norm": 0.17320603132247925, + "learning_rate": 5.958924295450359e-07, + "loss": 0.0153, + "step": 22950 + }, + { + "epoch": 4.155362633387592, + "grad_norm": 7.578225612640381, + "learning_rate": 5.954319395837171e-07, + "loss": 0.0065, + "step": 22975 + }, + { + "epoch": 4.159884246699222, + "grad_norm": 0.20573076605796814, + "learning_rate": 5.949714496223982e-07, + "loss": 0.008, + "step": 23000 + }, + { + "epoch": 4.164405860010852, + "grad_norm": 0.031806666404008865, + "learning_rate": 5.945109596610793e-07, + "loss": 0.0039, + "step": 23025 + }, + { + "epoch": 4.168927473322482, + "grad_norm": 1.0935386419296265, + "learning_rate": 5.940504696997604e-07, + "loss": 0.0039, + "step": 23050 + }, + { + "epoch": 4.173449086634111, + "grad_norm": 3.248675584793091, + "learning_rate": 5.935899797384417e-07, + "loss": 0.0084, + "step": 23075 + }, + { + "epoch": 4.177970699945741, + "grad_norm": 0.0767347663640976, + "learning_rate": 5.931294897771229e-07, + "loss": 0.0062, + "step": 23100 + }, + { + "epoch": 4.18249231325737, + "grad_norm": 0.03708457574248314, + "learning_rate": 5.92668999815804e-07, + "loss": 0.0085, + "step": 23125 + }, + { + "epoch": 4.187013926569, + "grad_norm": 0.16320359706878662, + "learning_rate": 5.922085098544851e-07, + "loss": 0.0053, + "step": 23150 + }, + { + 
"epoch": 4.191535539880629, + "grad_norm": 0.7727710008621216, + "learning_rate": 5.917480198931663e-07, + "loss": 0.0291, + "step": 23175 + }, + { + "epoch": 4.196057153192259, + "grad_norm": 0.0653238445520401, + "learning_rate": 5.912875299318474e-07, + "loss": 0.0165, + "step": 23200 + }, + { + "epoch": 4.200578766503889, + "grad_norm": 18.652076721191406, + "learning_rate": 5.908270399705286e-07, + "loss": 0.0119, + "step": 23225 + }, + { + "epoch": 4.205100379815518, + "grad_norm": 1.3249595165252686, + "learning_rate": 5.903665500092098e-07, + "loss": 0.0127, + "step": 23250 + }, + { + "epoch": 4.209621993127148, + "grad_norm": 0.1082233116030693, + "learning_rate": 5.899060600478909e-07, + "loss": 0.0312, + "step": 23275 + }, + { + "epoch": 4.214143606438777, + "grad_norm": 0.7809641361236572, + "learning_rate": 5.894455700865721e-07, + "loss": 0.0206, + "step": 23300 + }, + { + "epoch": 4.218665219750407, + "grad_norm": 0.12851662933826447, + "learning_rate": 5.889850801252533e-07, + "loss": 0.0284, + "step": 23325 + }, + { + "epoch": 4.223186833062036, + "grad_norm": 1.1004611253738403, + "learning_rate": 5.885245901639344e-07, + "loss": 0.0192, + "step": 23350 + }, + { + "epoch": 4.227708446373666, + "grad_norm": 22.368629455566406, + "learning_rate": 5.880641002026155e-07, + "loss": 0.0076, + "step": 23375 + }, + { + "epoch": 4.232230059685295, + "grad_norm": 0.16481293737888336, + "learning_rate": 5.876036102412967e-07, + "loss": 0.0121, + "step": 23400 + }, + { + "epoch": 4.236751672996926, + "grad_norm": 0.09105231612920761, + "learning_rate": 5.871431202799779e-07, + "loss": 0.0055, + "step": 23425 + }, + { + "epoch": 4.241273286308555, + "grad_norm": 0.06610502302646637, + "learning_rate": 5.86682630318659e-07, + "loss": 0.0051, + "step": 23450 + }, + { + "epoch": 4.245794899620185, + "grad_norm": 1.563685417175293, + "learning_rate": 5.862221403573402e-07, + "loss": 0.0034, + "step": 23475 + }, + { + "epoch": 4.250316512931814, + "grad_norm": 
0.07692436873912811, + "learning_rate": 5.857616503960213e-07, + "loss": 0.0049, + "step": 23500 + }, + { + "epoch": 4.254838126243444, + "grad_norm": 0.42418307065963745, + "learning_rate": 5.853011604347025e-07, + "loss": 0.0021, + "step": 23525 + }, + { + "epoch": 4.259359739555073, + "grad_norm": 0.6450216174125671, + "learning_rate": 5.848406704733837e-07, + "loss": 0.0117, + "step": 23550 + }, + { + "epoch": 4.263881352866703, + "grad_norm": 3.0769999027252197, + "learning_rate": 5.843801805120648e-07, + "loss": 0.0021, + "step": 23575 + }, + { + "epoch": 4.268402966178332, + "grad_norm": 0.060876231640577316, + "learning_rate": 5.83919690550746e-07, + "loss": 0.0241, + "step": 23600 + }, + { + "epoch": 4.272924579489962, + "grad_norm": 10.53986930847168, + "learning_rate": 5.834592005894271e-07, + "loss": 0.0232, + "step": 23625 + }, + { + "epoch": 4.277446192801592, + "grad_norm": 17.494129180908203, + "learning_rate": 5.829987106281082e-07, + "loss": 0.0222, + "step": 23650 + }, + { + "epoch": 4.281967806113221, + "grad_norm": 0.1705417037010193, + "learning_rate": 5.825382206667895e-07, + "loss": 0.0187, + "step": 23675 + }, + { + "epoch": 4.286489419424851, + "grad_norm": 2.707817316055298, + "learning_rate": 5.820777307054707e-07, + "loss": 0.0327, + "step": 23700 + }, + { + "epoch": 4.29101103273648, + "grad_norm": 1.4121204614639282, + "learning_rate": 5.816172407441517e-07, + "loss": 0.0243, + "step": 23725 + }, + { + "epoch": 4.29553264604811, + "grad_norm": 7.615264415740967, + "learning_rate": 5.811567507828329e-07, + "loss": 0.0081, + "step": 23750 + }, + { + "epoch": 4.300054259359739, + "grad_norm": 0.06863299757242203, + "learning_rate": 5.806962608215141e-07, + "loss": 0.0112, + "step": 23775 + }, + { + "epoch": 4.304575872671369, + "grad_norm": 0.018180107697844505, + "learning_rate": 5.802357708601952e-07, + "loss": 0.0046, + "step": 23800 + }, + { + "epoch": 4.309097485982999, + "grad_norm": 0.43104514479637146, + "learning_rate": 
5.797752808988764e-07, + "loss": 0.0055, + "step": 23825 + }, + { + "epoch": 4.313619099294629, + "grad_norm": 0.0967748612165451, + "learning_rate": 5.793147909375576e-07, + "loss": 0.0046, + "step": 23850 + }, + { + "epoch": 4.318140712606258, + "grad_norm": 0.15935564041137695, + "learning_rate": 5.788543009762386e-07, + "loss": 0.0086, + "step": 23875 + }, + { + "epoch": 4.322662325917888, + "grad_norm": 0.141365185379982, + "learning_rate": 5.783938110149198e-07, + "loss": 0.0167, + "step": 23900 + }, + { + "epoch": 4.327183939229517, + "grad_norm": 0.17591165006160736, + "learning_rate": 5.779333210536011e-07, + "loss": 0.0064, + "step": 23925 + }, + { + "epoch": 4.3317055525411465, + "grad_norm": 6.465728759765625, + "learning_rate": 5.774728310922822e-07, + "loss": 0.004, + "step": 23950 + }, + { + "epoch": 4.336227165852776, + "grad_norm": 0.12863588333129883, + "learning_rate": 5.770123411309633e-07, + "loss": 0.0163, + "step": 23975 + }, + { + "epoch": 4.3407487791644055, + "grad_norm": 0.3128308355808258, + "learning_rate": 5.765518511696444e-07, + "loss": 0.0216, + "step": 24000 + }, + { + "epoch": 4.345270392476036, + "grad_norm": 6.302773475646973, + "learning_rate": 5.760913612083256e-07, + "loss": 0.0115, + "step": 24025 + }, + { + "epoch": 4.349792005787665, + "grad_norm": 0.05059755593538284, + "learning_rate": 5.756308712470068e-07, + "loss": 0.0064, + "step": 24050 + }, + { + "epoch": 4.354313619099295, + "grad_norm": 0.18986685574054718, + "learning_rate": 5.75170381285688e-07, + "loss": 0.0277, + "step": 24075 + }, + { + "epoch": 4.358835232410924, + "grad_norm": 9.643892288208008, + "learning_rate": 5.747098913243691e-07, + "loss": 0.0373, + "step": 24100 + }, + { + "epoch": 4.363356845722554, + "grad_norm": 1.0030667781829834, + "learning_rate": 5.742494013630502e-07, + "loss": 0.0188, + "step": 24125 + }, + { + "epoch": 4.367878459034183, + "grad_norm": 0.44967132806777954, + "learning_rate": 5.737889114017314e-07, + "loss": 0.0095, + 
"step": 24150 + }, + { + "epoch": 4.372400072345813, + "grad_norm": 2.169846534729004, + "learning_rate": 5.733284214404126e-07, + "loss": 0.0168, + "step": 24175 + }, + { + "epoch": 4.376921685657442, + "grad_norm": 0.01923939771950245, + "learning_rate": 5.728679314790938e-07, + "loss": 0.0095, + "step": 24200 + }, + { + "epoch": 4.381443298969073, + "grad_norm": 9.444828033447266, + "learning_rate": 5.724074415177748e-07, + "loss": 0.0053, + "step": 24225 + }, + { + "epoch": 4.385964912280702, + "grad_norm": 60.225868225097656, + "learning_rate": 5.71946951556456e-07, + "loss": 0.0052, + "step": 24250 + }, + { + "epoch": 4.3904865255923315, + "grad_norm": 5.413934230804443, + "learning_rate": 5.714864615951373e-07, + "loss": 0.0125, + "step": 24275 + }, + { + "epoch": 4.395008138903961, + "grad_norm": 0.14292512834072113, + "learning_rate": 5.710259716338183e-07, + "loss": 0.0211, + "step": 24300 + }, + { + "epoch": 4.3995297522155905, + "grad_norm": 0.2194162905216217, + "learning_rate": 5.705654816724995e-07, + "loss": 0.0125, + "step": 24325 + }, + { + "epoch": 4.40405136552722, + "grad_norm": 0.029263151809573174, + "learning_rate": 5.701049917111807e-07, + "loss": 0.0045, + "step": 24350 + }, + { + "epoch": 4.4085729788388495, + "grad_norm": 0.2611495554447174, + "learning_rate": 5.696445017498617e-07, + "loss": 0.0031, + "step": 24375 + }, + { + "epoch": 4.413094592150479, + "grad_norm": 0.025238435715436935, + "learning_rate": 5.69184011788543e-07, + "loss": 0.0155, + "step": 24400 + }, + { + "epoch": 4.417616205462108, + "grad_norm": 0.043699052184820175, + "learning_rate": 5.687235218272242e-07, + "loss": 0.0156, + "step": 24425 + }, + { + "epoch": 4.422137818773739, + "grad_norm": 0.04374052584171295, + "learning_rate": 5.682630318659053e-07, + "loss": 0.0133, + "step": 24450 + }, + { + "epoch": 4.426659432085368, + "grad_norm": 3.1334991455078125, + "learning_rate": 5.678025419045864e-07, + "loss": 0.0383, + "step": 24475 + }, + { + "epoch": 
4.431181045396998, + "grad_norm": 0.17290575802326202, + "learning_rate": 5.673420519432676e-07, + "loss": 0.0098, + "step": 24500 + }, + { + "epoch": 4.435702658708627, + "grad_norm": 0.5227589011192322, + "learning_rate": 5.668815619819488e-07, + "loss": 0.0863, + "step": 24525 + }, + { + "epoch": 4.440224272020257, + "grad_norm": 1.993817687034607, + "learning_rate": 5.664210720206299e-07, + "loss": 0.0161, + "step": 24550 + }, + { + "epoch": 4.444745885331886, + "grad_norm": 0.021811481565237045, + "learning_rate": 5.659605820593111e-07, + "loss": 0.0101, + "step": 24575 + }, + { + "epoch": 4.449267498643516, + "grad_norm": 0.055265914648771286, + "learning_rate": 5.655000920979922e-07, + "loss": 0.0117, + "step": 24600 + }, + { + "epoch": 4.453789111955146, + "grad_norm": 0.027848972007632256, + "learning_rate": 5.650396021366734e-07, + "loss": 0.0046, + "step": 24625 + }, + { + "epoch": 4.4583107252667755, + "grad_norm": 0.613524317741394, + "learning_rate": 5.645791121753546e-07, + "loss": 0.0079, + "step": 24650 + }, + { + "epoch": 4.462832338578405, + "grad_norm": 0.329345166683197, + "learning_rate": 5.641186222140357e-07, + "loss": 0.0096, + "step": 24675 + }, + { + "epoch": 4.4673539518900345, + "grad_norm": 0.07431361824274063, + "learning_rate": 5.636581322527169e-07, + "loss": 0.0033, + "step": 24700 + }, + { + "epoch": 4.471875565201664, + "grad_norm": 0.01413232646882534, + "learning_rate": 5.63197642291398e-07, + "loss": 0.0145, + "step": 24725 + }, + { + "epoch": 4.476397178513293, + "grad_norm": 0.03763017803430557, + "learning_rate": 5.627371523300791e-07, + "loss": 0.0093, + "step": 24750 + }, + { + "epoch": 4.480918791824923, + "grad_norm": 0.8524808287620544, + "learning_rate": 5.622766623687604e-07, + "loss": 0.0118, + "step": 24775 + }, + { + "epoch": 4.485440405136552, + "grad_norm": 0.10143906623125076, + "learning_rate": 5.618161724074415e-07, + "loss": 0.0203, + "step": 24800 + }, + { + "epoch": 4.489962018448182, + "grad_norm": 
3.6819851398468018, + "learning_rate": 5.613556824461226e-07, + "loss": 0.0046, + "step": 24825 + }, + { + "epoch": 4.494483631759812, + "grad_norm": 17.471813201904297, + "learning_rate": 5.608951924848038e-07, + "loss": 0.0092, + "step": 24850 + }, + { + "epoch": 4.499005245071442, + "grad_norm": 0.03576793521642685, + "learning_rate": 5.604347025234851e-07, + "loss": 0.0171, + "step": 24875 + }, + { + "epoch": 4.503526858383071, + "grad_norm": 1.3747385740280151, + "learning_rate": 5.599742125621661e-07, + "loss": 0.0433, + "step": 24900 + }, + { + "epoch": 4.508048471694701, + "grad_norm": 0.658429741859436, + "learning_rate": 5.595137226008473e-07, + "loss": 0.0272, + "step": 24925 + }, + { + "epoch": 4.51257008500633, + "grad_norm": 2.7316272258758545, + "learning_rate": 5.590532326395284e-07, + "loss": 0.0061, + "step": 24950 + }, + { + "epoch": 4.51709169831796, + "grad_norm": 0.3981630206108093, + "learning_rate": 5.585927426782095e-07, + "loss": 0.0078, + "step": 24975 + }, + { + "epoch": 4.521613311629589, + "grad_norm": 0.21992315351963043, + "learning_rate": 5.581322527168908e-07, + "loss": 0.0076, + "step": 25000 + }, + { + "epoch": 4.5261349249412195, + "grad_norm": 7.3034796714782715, + "learning_rate": 5.576717627555719e-07, + "loss": 0.0073, + "step": 25025 + }, + { + "epoch": 4.530656538252849, + "grad_norm": 0.10995008796453476, + "learning_rate": 5.572112727942531e-07, + "loss": 0.0064, + "step": 25050 + }, + { + "epoch": 4.5351781515644785, + "grad_norm": 0.31085583567619324, + "learning_rate": 5.567507828329342e-07, + "loss": 0.0081, + "step": 25075 + }, + { + "epoch": 4.539699764876108, + "grad_norm": 0.062460754066705704, + "learning_rate": 5.562902928716153e-07, + "loss": 0.0158, + "step": 25100 + }, + { + "epoch": 4.544221378187737, + "grad_norm": 0.5231966376304626, + "learning_rate": 5.558298029102966e-07, + "loss": 0.0048, + "step": 25125 + }, + { + "epoch": 4.548742991499367, + "grad_norm": 0.23824147880077362, + "learning_rate": 
5.553693129489777e-07, + "loss": 0.0172, + "step": 25150 + }, + { + "epoch": 4.553264604810996, + "grad_norm": 0.060738705098629, + "learning_rate": 5.549088229876588e-07, + "loss": 0.0127, + "step": 25175 + }, + { + "epoch": 4.557786218122626, + "grad_norm": 0.017272014170885086, + "learning_rate": 5.5444833302634e-07, + "loss": 0.0124, + "step": 25200 + }, + { + "epoch": 4.562307831434255, + "grad_norm": 0.1754794865846634, + "learning_rate": 5.539878430650211e-07, + "loss": 0.0148, + "step": 25225 + }, + { + "epoch": 4.566829444745886, + "grad_norm": 0.2678035795688629, + "learning_rate": 5.535273531037023e-07, + "loss": 0.0127, + "step": 25250 + }, + { + "epoch": 4.571351058057515, + "grad_norm": 10.760842323303223, + "learning_rate": 5.530668631423835e-07, + "loss": 0.0245, + "step": 25275 + }, + { + "epoch": 4.575872671369145, + "grad_norm": 0.16525591909885406, + "learning_rate": 5.526247927795174e-07, + "loss": 0.0371, + "step": 25300 + }, + { + "epoch": 4.580394284680774, + "grad_norm": 1.5673601627349854, + "learning_rate": 5.521643028181985e-07, + "loss": 0.0264, + "step": 25325 + }, + { + "epoch": 4.584915897992404, + "grad_norm": 0.20957525074481964, + "learning_rate": 5.517038128568797e-07, + "loss": 0.0167, + "step": 25350 + }, + { + "epoch": 4.589437511304033, + "grad_norm": 0.5806902050971985, + "learning_rate": 5.512433228955609e-07, + "loss": 0.0187, + "step": 25375 + }, + { + "epoch": 4.593959124615663, + "grad_norm": 0.487657368183136, + "learning_rate": 5.507828329342419e-07, + "loss": 0.0049, + "step": 25400 + }, + { + "epoch": 4.598480737927293, + "grad_norm": 0.2314232587814331, + "learning_rate": 5.503223429729231e-07, + "loss": 0.0065, + "step": 25425 + }, + { + "epoch": 4.603002351238922, + "grad_norm": 0.11486486345529556, + "learning_rate": 5.498618530116044e-07, + "loss": 0.0032, + "step": 25450 + }, + { + "epoch": 4.607523964550552, + "grad_norm": 0.23516380786895752, + "learning_rate": 5.494013630502855e-07, + "loss": 0.0088, + 
"step": 25475 + }, + { + "epoch": 4.612045577862181, + "grad_norm": 0.26957616209983826, + "learning_rate": 5.489408730889666e-07, + "loss": 0.011, + "step": 25500 + }, + { + "epoch": 4.616567191173811, + "grad_norm": 0.9121783375740051, + "learning_rate": 5.484803831276478e-07, + "loss": 0.0125, + "step": 25525 + }, + { + "epoch": 4.62108880448544, + "grad_norm": 0.0874478742480278, + "learning_rate": 5.480198931663289e-07, + "loss": 0.0015, + "step": 25550 + }, + { + "epoch": 4.62561041779707, + "grad_norm": 0.13613806664943695, + "learning_rate": 5.475594032050101e-07, + "loss": 0.005, + "step": 25575 + }, + { + "epoch": 4.630132031108699, + "grad_norm": 12.003376960754395, + "learning_rate": 5.470989132436913e-07, + "loss": 0.0104, + "step": 25600 + }, + { + "epoch": 4.634653644420329, + "grad_norm": 0.1077638640999794, + "learning_rate": 5.466384232823724e-07, + "loss": 0.0035, + "step": 25625 + }, + { + "epoch": 4.639175257731958, + "grad_norm": 0.20498277246952057, + "learning_rate": 5.461779333210536e-07, + "loss": 0.0327, + "step": 25650 + }, + { + "epoch": 4.643696871043589, + "grad_norm": 0.12293770909309387, + "learning_rate": 5.457174433597347e-07, + "loss": 0.0208, + "step": 25675 + }, + { + "epoch": 4.648218484355218, + "grad_norm": 0.13977064192295074, + "learning_rate": 5.452569533984159e-07, + "loss": 0.0279, + "step": 25700 + }, + { + "epoch": 4.652740097666848, + "grad_norm": 0.14199872314929962, + "learning_rate": 5.447964634370971e-07, + "loss": 0.0407, + "step": 25725 + }, + { + "epoch": 4.657261710978477, + "grad_norm": 0.16517263650894165, + "learning_rate": 5.443359734757782e-07, + "loss": 0.0084, + "step": 25750 + }, + { + "epoch": 4.661783324290107, + "grad_norm": 4.908987045288086, + "learning_rate": 5.438754835144593e-07, + "loss": 0.0096, + "step": 25775 + }, + { + "epoch": 4.666304937601736, + "grad_norm": 2.0926215648651123, + "learning_rate": 5.434149935531405e-07, + "loss": 0.0069, + "step": 25800 + }, + { + "epoch": 
4.6708265509133655, + "grad_norm": 37.50233840942383, + "learning_rate": 5.429545035918217e-07, + "loss": 0.0127, + "step": 25825 + }, + { + "epoch": 4.675348164224996, + "grad_norm": 3.6432156562805176, + "learning_rate": 5.424940136305028e-07, + "loss": 0.0037, + "step": 25850 + }, + { + "epoch": 4.679869777536625, + "grad_norm": 0.005684335716068745, + "learning_rate": 5.42033523669184e-07, + "loss": 0.016, + "step": 25875 + }, + { + "epoch": 4.684391390848255, + "grad_norm": 0.06579804420471191, + "learning_rate": 5.415730337078652e-07, + "loss": 0.0046, + "step": 25900 + }, + { + "epoch": 4.688913004159884, + "grad_norm": 18.67939567565918, + "learning_rate": 5.411125437465462e-07, + "loss": 0.0067, + "step": 25925 + }, + { + "epoch": 4.693434617471514, + "grad_norm": 0.035297270864248276, + "learning_rate": 5.406520537852275e-07, + "loss": 0.0092, + "step": 25950 + }, + { + "epoch": 4.697956230783143, + "grad_norm": 0.08447632938623428, + "learning_rate": 5.401915638239087e-07, + "loss": 0.0152, + "step": 25975 + }, + { + "epoch": 4.702477844094773, + "grad_norm": 0.2718825042247772, + "learning_rate": 5.397310738625897e-07, + "loss": 0.0267, + "step": 26000 + }, + { + "epoch": 4.706999457406402, + "grad_norm": 0.02587636187672615, + "learning_rate": 5.392705839012709e-07, + "loss": 0.0032, + "step": 26025 + }, + { + "epoch": 4.711521070718032, + "grad_norm": 0.295260488986969, + "learning_rate": 5.388100939399522e-07, + "loss": 0.0249, + "step": 26050 + }, + { + "epoch": 4.716042684029662, + "grad_norm": 0.10036912560462952, + "learning_rate": 5.383496039786333e-07, + "loss": 0.0248, + "step": 26075 + }, + { + "epoch": 4.720564297341292, + "grad_norm": 19.557443618774414, + "learning_rate": 5.378891140173144e-07, + "loss": 0.0269, + "step": 26100 + }, + { + "epoch": 4.725085910652921, + "grad_norm": 13.175033569335938, + "learning_rate": 5.374286240559955e-07, + "loss": 0.0286, + "step": 26125 + }, + { + "epoch": 4.7296075239645505, + "grad_norm": 
28.173160552978516, + "learning_rate": 5.369681340946767e-07, + "loss": 0.0069, + "step": 26150 + }, + { + "epoch": 4.73412913727618, + "grad_norm": 0.07096391171216965, + "learning_rate": 5.365076441333579e-07, + "loss": 0.0089, + "step": 26175 + }, + { + "epoch": 4.7386507505878095, + "grad_norm": 0.07818924635648727, + "learning_rate": 5.36047154172039e-07, + "loss": 0.004, + "step": 26200 + }, + { + "epoch": 4.743172363899439, + "grad_norm": 2.088768243789673, + "learning_rate": 5.355866642107202e-07, + "loss": 0.0118, + "step": 26225 + }, + { + "epoch": 4.747693977211069, + "grad_norm": 0.3656996488571167, + "learning_rate": 5.351261742494013e-07, + "loss": 0.0029, + "step": 26250 + }, + { + "epoch": 4.752215590522699, + "grad_norm": 0.016400739550590515, + "learning_rate": 5.346656842880824e-07, + "loss": 0.0068, + "step": 26275 + }, + { + "epoch": 4.756737203834328, + "grad_norm": 0.023505523800849915, + "learning_rate": 5.342051943267637e-07, + "loss": 0.0053, + "step": 26300 + }, + { + "epoch": 4.761258817145958, + "grad_norm": 0.033994659781455994, + "learning_rate": 5.337447043654449e-07, + "loss": 0.0084, + "step": 26325 + }, + { + "epoch": 4.765780430457587, + "grad_norm": 0.4282836318016052, + "learning_rate": 5.332842144041259e-07, + "loss": 0.0178, + "step": 26350 + }, + { + "epoch": 4.770302043769217, + "grad_norm": 0.9596586227416992, + "learning_rate": 5.328237244428071e-07, + "loss": 0.0055, + "step": 26375 + }, + { + "epoch": 4.774823657080846, + "grad_norm": 0.07100539654493332, + "learning_rate": 5.323632344814883e-07, + "loss": 0.0035, + "step": 26400 + }, + { + "epoch": 4.779345270392476, + "grad_norm": 20.814708709716797, + "learning_rate": 5.319027445201694e-07, + "loss": 0.0018, + "step": 26425 + }, + { + "epoch": 4.783866883704105, + "grad_norm": 0.1001884788274765, + "learning_rate": 5.314422545588506e-07, + "loss": 0.0169, + "step": 26450 + }, + { + "epoch": 4.7883884970157355, + "grad_norm": 10.18614387512207, + "learning_rate": 
5.309817645975318e-07, + "loss": 0.0833, + "step": 26475 + }, + { + "epoch": 4.792910110327365, + "grad_norm": 5.15158224105835, + "learning_rate": 5.305212746362128e-07, + "loss": 0.0395, + "step": 26500 + }, + { + "epoch": 4.7974317236389945, + "grad_norm": 0.10631446540355682, + "learning_rate": 5.30060784674894e-07, + "loss": 0.019, + "step": 26525 + }, + { + "epoch": 4.801953336950624, + "grad_norm": 0.15538524091243744, + "learning_rate": 5.296002947135753e-07, + "loss": 0.0075, + "step": 26550 + }, + { + "epoch": 4.8064749502622535, + "grad_norm": 1.2634289264678955, + "learning_rate": 5.291398047522564e-07, + "loss": 0.0111, + "step": 26575 + }, + { + "epoch": 4.810996563573883, + "grad_norm": 0.11970185488462448, + "learning_rate": 5.286793147909375e-07, + "loss": 0.009, + "step": 26600 + }, + { + "epoch": 4.815518176885512, + "grad_norm": 3.1821625232696533, + "learning_rate": 5.282188248296187e-07, + "loss": 0.008, + "step": 26625 + }, + { + "epoch": 4.820039790197143, + "grad_norm": 1.2722951173782349, + "learning_rate": 5.277583348682998e-07, + "loss": 0.0083, + "step": 26650 + }, + { + "epoch": 4.824561403508772, + "grad_norm": 0.16242074966430664, + "learning_rate": 5.27297844906981e-07, + "loss": 0.0093, + "step": 26675 + }, + { + "epoch": 4.829083016820402, + "grad_norm": 0.08178609609603882, + "learning_rate": 5.268373549456622e-07, + "loss": 0.0163, + "step": 26700 + }, + { + "epoch": 4.833604630132031, + "grad_norm": 0.05239911004900932, + "learning_rate": 5.263768649843433e-07, + "loss": 0.0047, + "step": 26725 + }, + { + "epoch": 4.838126243443661, + "grad_norm": 0.11282465606927872, + "learning_rate": 5.259163750230244e-07, + "loss": 0.0057, + "step": 26750 + }, + { + "epoch": 4.84264785675529, + "grad_norm": 0.11248558014631271, + "learning_rate": 5.254558850617057e-07, + "loss": 0.011, + "step": 26775 + }, + { + "epoch": 4.84716947006692, + "grad_norm": 0.020478971302509308, + "learning_rate": 5.249953951003868e-07, + "loss": 0.013, + 
"step": 26800 + }, + { + "epoch": 4.851691083378549, + "grad_norm": 0.0279947929084301, + "learning_rate": 5.24534905139068e-07, + "loss": 0.016, + "step": 26825 + }, + { + "epoch": 4.856212696690179, + "grad_norm": 2.424943208694458, + "learning_rate": 5.24074415177749e-07, + "loss": 0.0214, + "step": 26850 + }, + { + "epoch": 4.860734310001809, + "grad_norm": 0.05842322111129761, + "learning_rate": 5.236139252164302e-07, + "loss": 0.0067, + "step": 26875 + }, + { + "epoch": 4.8652559233134385, + "grad_norm": 11.81139087677002, + "learning_rate": 5.231534352551115e-07, + "loss": 0.023, + "step": 26900 + }, + { + "epoch": 4.869777536625068, + "grad_norm": 4.321495056152344, + "learning_rate": 5.226929452937926e-07, + "loss": 0.0276, + "step": 26925 + }, + { + "epoch": 4.874299149936697, + "grad_norm": 26.13640022277832, + "learning_rate": 5.222324553324737e-07, + "loss": 0.0133, + "step": 26950 + }, + { + "epoch": 4.878820763248327, + "grad_norm": 0.09919006377458572, + "learning_rate": 5.217719653711549e-07, + "loss": 0.01, + "step": 26975 + }, + { + "epoch": 4.883342376559956, + "grad_norm": 2.89017653465271, + "learning_rate": 5.21311475409836e-07, + "loss": 0.0199, + "step": 27000 + }, + { + "epoch": 4.887863989871586, + "grad_norm": 0.1923578828573227, + "learning_rate": 5.208509854485172e-07, + "loss": 0.0055, + "step": 27025 + }, + { + "epoch": 4.892385603183216, + "grad_norm": 0.05905037745833397, + "learning_rate": 5.203904954871984e-07, + "loss": 0.007, + "step": 27050 + }, + { + "epoch": 4.896907216494846, + "grad_norm": 4.7976837158203125, + "learning_rate": 5.199300055258795e-07, + "loss": 0.0095, + "step": 27075 + }, + { + "epoch": 4.901428829806475, + "grad_norm": 0.5176482200622559, + "learning_rate": 5.194695155645606e-07, + "loss": 0.0068, + "step": 27100 + }, + { + "epoch": 4.905950443118105, + "grad_norm": 0.15327677130699158, + "learning_rate": 5.190090256032418e-07, + "loss": 0.0016, + "step": 27125 + }, + { + "epoch": 4.910472056429734, + 
"grad_norm": 23.123516082763672, + "learning_rate": 5.18548535641923e-07, + "loss": 0.0126, + "step": 27150 + }, + { + "epoch": 4.914993669741364, + "grad_norm": 5.405014514923096, + "learning_rate": 5.180880456806041e-07, + "loss": 0.0063, + "step": 27175 + }, + { + "epoch": 4.919515283052993, + "grad_norm": 0.03432038426399231, + "learning_rate": 5.176275557192853e-07, + "loss": 0.0085, + "step": 27200 + }, + { + "epoch": 4.924036896364623, + "grad_norm": 0.012576217763125896, + "learning_rate": 5.171670657579664e-07, + "loss": 0.0089, + "step": 27225 + }, + { + "epoch": 4.928558509676252, + "grad_norm": 0.03322821483016014, + "learning_rate": 5.167065757966476e-07, + "loss": 0.0117, + "step": 27250 + }, + { + "epoch": 4.9330801229878825, + "grad_norm": 0.14308449625968933, + "learning_rate": 5.162460858353288e-07, + "loss": 0.0083, + "step": 27275 + }, + { + "epoch": 4.937601736299512, + "grad_norm": 39.83473205566406, + "learning_rate": 5.157855958740099e-07, + "loss": 0.0412, + "step": 27300 + }, + { + "epoch": 4.942123349611141, + "grad_norm": 0.15658709406852722, + "learning_rate": 5.153251059126911e-07, + "loss": 0.048, + "step": 27325 + }, + { + "epoch": 4.946644962922771, + "grad_norm": 0.5169302821159363, + "learning_rate": 5.148646159513722e-07, + "loss": 0.0125, + "step": 27350 + }, + { + "epoch": 4.9511665762344, + "grad_norm": 0.021764138713479042, + "learning_rate": 5.144041259900533e-07, + "loss": 0.0044, + "step": 27375 + }, + { + "epoch": 4.95568818954603, + "grad_norm": 0.2565416395664215, + "learning_rate": 5.139436360287346e-07, + "loss": 0.0134, + "step": 27400 + }, + { + "epoch": 4.960209802857659, + "grad_norm": 3.5746593475341797, + "learning_rate": 5.134831460674158e-07, + "loss": 0.0144, + "step": 27425 + }, + { + "epoch": 4.96473141616929, + "grad_norm": 1.1976155042648315, + "learning_rate": 5.130226561060968e-07, + "loss": 0.0015, + "step": 27450 + }, + { + "epoch": 4.969253029480919, + "grad_norm": 1.560697078704834, + 
"learning_rate": 5.12562166144778e-07, + "loss": 0.0105, + "step": 27475 + }, + { + "epoch": 4.973774642792549, + "grad_norm": 0.21413296461105347, + "learning_rate": 5.121016761834593e-07, + "loss": 0.0086, + "step": 27500 + }, + { + "epoch": 4.978296256104178, + "grad_norm": 0.1948905736207962, + "learning_rate": 5.116411862221403e-07, + "loss": 0.0142, + "step": 27525 + }, + { + "epoch": 4.982817869415808, + "grad_norm": 0.0875588059425354, + "learning_rate": 5.111806962608215e-07, + "loss": 0.0086, + "step": 27550 + }, + { + "epoch": 4.987339482727437, + "grad_norm": 0.18821197748184204, + "learning_rate": 5.107202062995027e-07, + "loss": 0.0343, + "step": 27575 + }, + { + "epoch": 4.991861096039067, + "grad_norm": 3.243891954421997, + "learning_rate": 5.102597163381837e-07, + "loss": 0.0066, + "step": 27600 + }, + { + "epoch": 4.996382709350696, + "grad_norm": 0.04505661129951477, + "learning_rate": 5.09799226376865e-07, + "loss": 0.0273, + "step": 27625 + }, + { + "epoch": 5.0, + "eval_loss": 0.32219141721725464, + "eval_runtime": 8558.7304, + "eval_samples_per_second": 1.109, + "eval_steps_per_second": 0.139, + "eval_wer": 0.1037636130685458, + "step": 27645 + }, + { + "epoch": 5.0009043226623255, + "grad_norm": 3.0571129322052, + "learning_rate": 5.093387364155462e-07, + "loss": 0.0432, + "step": 27650 + }, + { + "epoch": 5.005425935973956, + "grad_norm": 0.28156787157058716, + "learning_rate": 5.088782464542273e-07, + "loss": 0.015, + "step": 27675 + }, + { + "epoch": 5.009947549285585, + "grad_norm": 1.9075210094451904, + "learning_rate": 5.084177564929084e-07, + "loss": 0.0056, + "step": 27700 + }, + { + "epoch": 5.014469162597215, + "grad_norm": 0.8747662305831909, + "learning_rate": 5.079572665315896e-07, + "loss": 0.0115, + "step": 27725 + }, + { + "epoch": 5.018990775908844, + "grad_norm": 0.4982765316963196, + "learning_rate": 5.074967765702708e-07, + "loss": 0.0029, + "step": 27750 + }, + { + "epoch": 5.023512389220474, + "grad_norm": 
0.06921929121017456, + "learning_rate": 5.070362866089519e-07, + "loss": 0.0133, + "step": 27775 + }, + { + "epoch": 5.028034002532103, + "grad_norm": 0.3169139623641968, + "learning_rate": 5.06575796647633e-07, + "loss": 0.0059, + "step": 27800 + }, + { + "epoch": 5.032555615843733, + "grad_norm": 0.11565407365560532, + "learning_rate": 5.061153066863142e-07, + "loss": 0.0092, + "step": 27825 + }, + { + "epoch": 5.037077229155362, + "grad_norm": 0.03337372466921806, + "learning_rate": 5.056548167249953e-07, + "loss": 0.0045, + "step": 27850 + }, + { + "epoch": 5.041598842466993, + "grad_norm": 0.03493885695934296, + "learning_rate": 5.051943267636765e-07, + "loss": 0.0024, + "step": 27875 + }, + { + "epoch": 5.046120455778622, + "grad_norm": 0.018299061805009842, + "learning_rate": 5.047338368023577e-07, + "loss": 0.0072, + "step": 27900 + }, + { + "epoch": 5.050642069090252, + "grad_norm": 0.10773641616106033, + "learning_rate": 5.042733468410389e-07, + "loss": 0.0122, + "step": 27925 + }, + { + "epoch": 5.055163682401881, + "grad_norm": 0.8263195157051086, + "learning_rate": 5.038128568797199e-07, + "loss": 0.0025, + "step": 27950 + }, + { + "epoch": 5.0596852957135106, + "grad_norm": 0.03416213020682335, + "learning_rate": 5.033523669184011e-07, + "loss": 0.0017, + "step": 27975 + }, + { + "epoch": 5.06420690902514, + "grad_norm": 0.05288705974817276, + "learning_rate": 5.028918769570824e-07, + "loss": 0.0196, + "step": 28000 + }, + { + "epoch": 5.0687285223367695, + "grad_norm": 1.1040371656417847, + "learning_rate": 5.024313869957634e-07, + "loss": 0.0162, + "step": 28025 + }, + { + "epoch": 5.073250135648399, + "grad_norm": 1.0380799770355225, + "learning_rate": 5.019708970344446e-07, + "loss": 0.0872, + "step": 28050 + }, + { + "epoch": 5.077771748960029, + "grad_norm": 1.2944865226745605, + "learning_rate": 5.015104070731258e-07, + "loss": 0.0196, + "step": 28075 + }, + { + "epoch": 5.082293362271659, + "grad_norm": 0.10234280675649643, + "learning_rate": 
5.010499171118068e-07, + "loss": 0.0039, + "step": 28100 + }, + { + "epoch": 5.086814975583288, + "grad_norm": 0.2737126350402832, + "learning_rate": 5.005894271504881e-07, + "loss": 0.0138, + "step": 28125 + }, + { + "epoch": 5.091336588894918, + "grad_norm": 0.04862267151474953, + "learning_rate": 5.001289371891693e-07, + "loss": 0.0084, + "step": 28150 + }, + { + "epoch": 5.095858202206547, + "grad_norm": 0.12347660213708878, + "learning_rate": 4.996684472278504e-07, + "loss": 0.0039, + "step": 28175 + }, + { + "epoch": 5.100379815518177, + "grad_norm": 1.5366371870040894, + "learning_rate": 4.992079572665316e-07, + "loss": 0.0024, + "step": 28200 + }, + { + "epoch": 5.104901428829806, + "grad_norm": 0.11356142163276672, + "learning_rate": 4.987474673052128e-07, + "loss": 0.0069, + "step": 28225 + }, + { + "epoch": 5.109423042141436, + "grad_norm": 0.019394779577851295, + "learning_rate": 4.982869773438938e-07, + "loss": 0.0026, + "step": 28250 + }, + { + "epoch": 5.113944655453066, + "grad_norm": 12.068894386291504, + "learning_rate": 4.97826487382575e-07, + "loss": 0.0064, + "step": 28275 + }, + { + "epoch": 5.118466268764696, + "grad_norm": 3.972506523132324, + "learning_rate": 4.973659974212562e-07, + "loss": 0.004, + "step": 28300 + }, + { + "epoch": 5.122987882076325, + "grad_norm": 0.024081800132989883, + "learning_rate": 4.969055074599373e-07, + "loss": 0.0079, + "step": 28325 + }, + { + "epoch": 5.1275094953879545, + "grad_norm": 0.2852739095687866, + "learning_rate": 4.964450174986185e-07, + "loss": 0.0111, + "step": 28350 + }, + { + "epoch": 5.132031108699584, + "grad_norm": 0.1766853779554367, + "learning_rate": 4.959845275372997e-07, + "loss": 0.0042, + "step": 28375 + }, + { + "epoch": 5.1365527220112135, + "grad_norm": 0.1434965282678604, + "learning_rate": 4.955240375759808e-07, + "loss": 0.0018, + "step": 28400 + }, + { + "epoch": 5.141074335322843, + "grad_norm": 15.574809074401855, + "learning_rate": 4.95063547614662e-07, + "loss": 0.0219, + 
"step": 28425 + }, + { + "epoch": 5.1455959486344724, + "grad_norm": 0.07658454775810242, + "learning_rate": 4.946030576533432e-07, + "loss": 0.0361, + "step": 28450 + }, + { + "epoch": 5.150117561946102, + "grad_norm": 2.8722949028015137, + "learning_rate": 4.941425676920243e-07, + "loss": 0.0081, + "step": 28475 + }, + { + "epoch": 5.154639175257732, + "grad_norm": 0.16390861570835114, + "learning_rate": 4.936820777307055e-07, + "loss": 0.0024, + "step": 28500 + }, + { + "epoch": 5.159160788569362, + "grad_norm": 0.5398291945457458, + "learning_rate": 4.932215877693866e-07, + "loss": 0.0106, + "step": 28525 + }, + { + "epoch": 5.163682401880991, + "grad_norm": 10.156366348266602, + "learning_rate": 4.927610978080677e-07, + "loss": 0.0137, + "step": 28550 + }, + { + "epoch": 5.168204015192621, + "grad_norm": 0.013218329288065434, + "learning_rate": 4.923006078467489e-07, + "loss": 0.0104, + "step": 28575 + }, + { + "epoch": 5.17272562850425, + "grad_norm": 8.923846244812012, + "learning_rate": 4.9184011788543e-07, + "loss": 0.0106, + "step": 28600 + }, + { + "epoch": 5.17724724181588, + "grad_norm": 0.07841784507036209, + "learning_rate": 4.913796279241112e-07, + "loss": 0.0065, + "step": 28625 + }, + { + "epoch": 5.181768855127509, + "grad_norm": 0.3685607314109802, + "learning_rate": 4.909191379627924e-07, + "loss": 0.0118, + "step": 28650 + }, + { + "epoch": 5.186290468439139, + "grad_norm": 1.6632095575332642, + "learning_rate": 4.904586480014736e-07, + "loss": 0.0019, + "step": 28675 + }, + { + "epoch": 5.190812081750769, + "grad_norm": 0.10080606490373611, + "learning_rate": 4.899981580401547e-07, + "loss": 0.0027, + "step": 28700 + }, + { + "epoch": 5.1953336950623985, + "grad_norm": 4.444336891174316, + "learning_rate": 4.895376680788359e-07, + "loss": 0.0029, + "step": 28725 + }, + { + "epoch": 5.199855308374028, + "grad_norm": 5.826324939727783, + "learning_rate": 4.89077178117517e-07, + "loss": 0.0058, + "step": 28750 + }, + { + "epoch": 
5.2043769216856575, + "grad_norm": 0.09290173649787903, + "learning_rate": 4.886166881561982e-07, + "loss": 0.0015, + "step": 28775 + }, + { + "epoch": 5.208898534997287, + "grad_norm": 0.0847366601228714, + "learning_rate": 4.881561981948794e-07, + "loss": 0.0143, + "step": 28800 + }, + { + "epoch": 5.213420148308916, + "grad_norm": 0.4414973855018616, + "learning_rate": 4.876957082335604e-07, + "loss": 0.012, + "step": 28825 + }, + { + "epoch": 5.217941761620546, + "grad_norm": 1.8146724700927734, + "learning_rate": 4.872352182722416e-07, + "loss": 0.0213, + "step": 28850 + }, + { + "epoch": 5.222463374932175, + "grad_norm": 0.3021218478679657, + "learning_rate": 4.867747283109228e-07, + "loss": 0.0052, + "step": 28875 + }, + { + "epoch": 5.226984988243806, + "grad_norm": 0.32385706901550293, + "learning_rate": 4.863142383496039e-07, + "loss": 0.006, + "step": 28900 + }, + { + "epoch": 5.231506601555435, + "grad_norm": 0.3742549419403076, + "learning_rate": 4.858537483882851e-07, + "loss": 0.0088, + "step": 28925 + }, + { + "epoch": 5.236028214867065, + "grad_norm": 0.1462940126657486, + "learning_rate": 4.853932584269663e-07, + "loss": 0.003, + "step": 28950 + }, + { + "epoch": 5.240549828178694, + "grad_norm": 0.248609721660614, + "learning_rate": 4.849327684656474e-07, + "loss": 0.0124, + "step": 28975 + }, + { + "epoch": 5.245071441490324, + "grad_norm": 0.13235850632190704, + "learning_rate": 4.844722785043286e-07, + "loss": 0.0111, + "step": 29000 + }, + { + "epoch": 5.249593054801953, + "grad_norm": 0.1916670799255371, + "learning_rate": 4.840117885430098e-07, + "loss": 0.0034, + "step": 29025 + }, + { + "epoch": 5.254114668113583, + "grad_norm": 0.0536821186542511, + "learning_rate": 4.835512985816909e-07, + "loss": 0.0025, + "step": 29050 + }, + { + "epoch": 5.258636281425212, + "grad_norm": 0.03594660758972168, + "learning_rate": 4.83090808620372e-07, + "loss": 0.0005, + "step": 29075 + }, + { + "epoch": 5.2631578947368425, + "grad_norm": 
0.012173148803412914, + "learning_rate": 4.826303186590533e-07, + "loss": 0.0037, + "step": 29100 + }, + { + "epoch": 5.267679508048472, + "grad_norm": 0.033210985362529755, + "learning_rate": 4.821698286977343e-07, + "loss": 0.0014, + "step": 29125 + }, + { + "epoch": 5.272201121360101, + "grad_norm": 0.131244495511055, + "learning_rate": 4.817093387364155e-07, + "loss": 0.0023, + "step": 29150 + }, + { + "epoch": 5.276722734671731, + "grad_norm": 6.246963977813721, + "learning_rate": 4.812488487750967e-07, + "loss": 0.002, + "step": 29175 + }, + { + "epoch": 5.28124434798336, + "grad_norm": 0.4974225163459778, + "learning_rate": 4.807883588137778e-07, + "loss": 0.0056, + "step": 29200 + }, + { + "epoch": 5.28576596129499, + "grad_norm": 16.300535202026367, + "learning_rate": 4.80327868852459e-07, + "loss": 0.0446, + "step": 29225 + }, + { + "epoch": 5.290287574606619, + "grad_norm": 2.680767059326172, + "learning_rate": 4.798673788911402e-07, + "loss": 0.0483, + "step": 29250 + }, + { + "epoch": 5.294809187918249, + "grad_norm": 8.133724212646484, + "learning_rate": 4.794068889298213e-07, + "loss": 0.0121, + "step": 29275 + }, + { + "epoch": 5.299330801229879, + "grad_norm": 0.09077363461256027, + "learning_rate": 4.789463989685025e-07, + "loss": 0.0048, + "step": 29300 + }, + { + "epoch": 5.303852414541509, + "grad_norm": 36.61612319946289, + "learning_rate": 4.784859090071836e-07, + "loss": 0.0182, + "step": 29325 + }, + { + "epoch": 5.308374027853138, + "grad_norm": 6.475874900817871, + "learning_rate": 4.780254190458648e-07, + "loss": 0.0075, + "step": 29350 + }, + { + "epoch": 5.312895641164768, + "grad_norm": 0.09107718616724014, + "learning_rate": 4.775649290845459e-07, + "loss": 0.003, + "step": 29375 + }, + { + "epoch": 5.317417254476397, + "grad_norm": 0.0676039382815361, + "learning_rate": 4.771044391232271e-07, + "loss": 0.0139, + "step": 29400 + }, + { + "epoch": 5.321938867788027, + "grad_norm": 0.01544503029435873, + "learning_rate": 
4.7664394916190827e-07, + "loss": 0.0118, + "step": 29425 + }, + { + "epoch": 5.326460481099656, + "grad_norm": 65.15055847167969, + "learning_rate": 4.761834592005894e-07, + "loss": 0.0057, + "step": 29450 + }, + { + "epoch": 5.330982094411286, + "grad_norm": 0.08361693471670151, + "learning_rate": 4.7572296923927055e-07, + "loss": 0.0023, + "step": 29475 + }, + { + "epoch": 5.335503707722916, + "grad_norm": 0.10150730609893799, + "learning_rate": 4.752624792779517e-07, + "loss": 0.0152, + "step": 29500 + }, + { + "epoch": 5.340025321034545, + "grad_norm": 0.7418703436851501, + "learning_rate": 4.748019893166329e-07, + "loss": 0.0181, + "step": 29525 + }, + { + "epoch": 5.344546934346175, + "grad_norm": 0.09599591046571732, + "learning_rate": 4.7434149935531405e-07, + "loss": 0.0076, + "step": 29550 + }, + { + "epoch": 5.349068547657804, + "grad_norm": 0.28607824444770813, + "learning_rate": 4.7388100939399516e-07, + "loss": 0.0053, + "step": 29575 + }, + { + "epoch": 5.353590160969434, + "grad_norm": 0.02649850957095623, + "learning_rate": 4.7342051943267633e-07, + "loss": 0.0114, + "step": 29600 + }, + { + "epoch": 5.358111774281063, + "grad_norm": 19.19407081604004, + "learning_rate": 4.729600294713575e-07, + "loss": 0.0629, + "step": 29625 + }, + { + "epoch": 5.362633387592693, + "grad_norm": 0.16849057376384735, + "learning_rate": 4.7251795910849143e-07, + "loss": 0.0308, + "step": 29650 + }, + { + "epoch": 5.367155000904322, + "grad_norm": 4.074181079864502, + "learning_rate": 4.720574691471726e-07, + "loss": 0.0024, + "step": 29675 + }, + { + "epoch": 5.371676614215953, + "grad_norm": 0.3396848142147064, + "learning_rate": 4.715969791858537e-07, + "loss": 0.019, + "step": 29700 + }, + { + "epoch": 5.376198227527582, + "grad_norm": 1.8326916694641113, + "learning_rate": 4.7113648922453487e-07, + "loss": 0.0116, + "step": 29725 + }, + { + "epoch": 5.380719840839212, + "grad_norm": 1.045644760131836, + "learning_rate": 4.7067599926321604e-07, + "loss": 0.0014, 
+ "step": 29750 + }, + { + "epoch": 5.385241454150841, + "grad_norm": 0.034128542989492416, + "learning_rate": 4.702155093018972e-07, + "loss": 0.0032, + "step": 29775 + }, + { + "epoch": 5.389763067462471, + "grad_norm": 0.24559779465198517, + "learning_rate": 4.6975501934057837e-07, + "loss": 0.0092, + "step": 29800 + }, + { + "epoch": 5.3942846807741, + "grad_norm": 0.08045164495706558, + "learning_rate": 4.692945293792595e-07, + "loss": 0.0039, + "step": 29825 + }, + { + "epoch": 5.3988062940857295, + "grad_norm": 0.2354724407196045, + "learning_rate": 4.6883403941794065e-07, + "loss": 0.0062, + "step": 29850 + }, + { + "epoch": 5.403327907397359, + "grad_norm": 0.02797023393213749, + "learning_rate": 4.683735494566218e-07, + "loss": 0.0071, + "step": 29875 + }, + { + "epoch": 5.407849520708989, + "grad_norm": 2.4346461296081543, + "learning_rate": 4.67913059495303e-07, + "loss": 0.005, + "step": 29900 + }, + { + "epoch": 5.412371134020619, + "grad_norm": 0.08410675823688507, + "learning_rate": 4.6745256953398415e-07, + "loss": 0.0008, + "step": 29925 + }, + { + "epoch": 5.416892747332248, + "grad_norm": 1.741323709487915, + "learning_rate": 4.6699207957266526e-07, + "loss": 0.0129, + "step": 29950 + }, + { + "epoch": 5.421414360643878, + "grad_norm": 0.08235138654708862, + "learning_rate": 4.665315896113465e-07, + "loss": 0.0283, + "step": 29975 + }, + { + "epoch": 5.425935973955507, + "grad_norm": 20.520862579345703, + "learning_rate": 4.660710996500276e-07, + "loss": 0.0526, + "step": 30000 + }, + { + "epoch": 5.430457587267137, + "grad_norm": 0.11165034025907516, + "learning_rate": 4.6561060968870876e-07, + "loss": 0.0368, + "step": 30025 + }, + { + "epoch": 5.434979200578766, + "grad_norm": 2.317666530609131, + "learning_rate": 4.6515011972738993e-07, + "loss": 0.0265, + "step": 30050 + }, + { + "epoch": 5.439500813890396, + "grad_norm": 9.61573314666748, + "learning_rate": 4.6468962976607104e-07, + "loss": 0.0167, + "step": 30075 + }, + { + "epoch": 
5.444022427202025, + "grad_norm": 0.6778357028961182, + "learning_rate": 4.6422913980475226e-07, + "loss": 0.0099, + "step": 30100 + }, + { + "epoch": 5.448544040513656, + "grad_norm": 9.778840065002441, + "learning_rate": 4.637686498434334e-07, + "loss": 0.0052, + "step": 30125 + }, + { + "epoch": 5.453065653825285, + "grad_norm": 1.3489108085632324, + "learning_rate": 4.6330815988211454e-07, + "loss": 0.0032, + "step": 30150 + }, + { + "epoch": 5.4575872671369146, + "grad_norm": 0.05276188254356384, + "learning_rate": 4.628476699207957e-07, + "loss": 0.0079, + "step": 30175 + }, + { + "epoch": 5.462108880448544, + "grad_norm": 2.3165993690490723, + "learning_rate": 4.6238717995947687e-07, + "loss": 0.0239, + "step": 30200 + }, + { + "epoch": 5.4666304937601735, + "grad_norm": 0.06526318937540054, + "learning_rate": 4.6192668999815804e-07, + "loss": 0.0006, + "step": 30225 + }, + { + "epoch": 5.471152107071803, + "grad_norm": 12.61938190460205, + "learning_rate": 4.6146620003683915e-07, + "loss": 0.0061, + "step": 30250 + }, + { + "epoch": 5.4756737203834325, + "grad_norm": 1.8737666606903076, + "learning_rate": 4.6100571007552037e-07, + "loss": 0.0126, + "step": 30275 + }, + { + "epoch": 5.480195333695063, + "grad_norm": 0.7588403820991516, + "learning_rate": 4.605452201142015e-07, + "loss": 0.0029, + "step": 30300 + }, + { + "epoch": 5.484716947006692, + "grad_norm": 0.04945962131023407, + "learning_rate": 4.6008473015288265e-07, + "loss": 0.0232, + "step": 30325 + }, + { + "epoch": 5.489238560318322, + "grad_norm": 0.03633030131459236, + "learning_rate": 4.596242401915638e-07, + "loss": 0.0049, + "step": 30350 + }, + { + "epoch": 5.493760173629951, + "grad_norm": 0.049966856837272644, + "learning_rate": 4.5916375023024493e-07, + "loss": 0.0095, + "step": 30375 + }, + { + "epoch": 5.498281786941581, + "grad_norm": 0.017245082184672356, + "learning_rate": 4.5870326026892615e-07, + "loss": 0.0172, + "step": 30400 + }, + { + "epoch": 5.50280340025321, + 
"grad_norm": 4.209415435791016, + "learning_rate": 4.5824277030760726e-07, + "loss": 0.0076, + "step": 30425 + }, + { + "epoch": 5.50732501356484, + "grad_norm": 0.1449451893568039, + "learning_rate": 4.5778228034628843e-07, + "loss": 0.0268, + "step": 30450 + }, + { + "epoch": 5.511846626876469, + "grad_norm": 6.67424201965332, + "learning_rate": 4.573217903849696e-07, + "loss": 0.0155, + "step": 30475 + }, + { + "epoch": 5.516368240188099, + "grad_norm": 0.1047978401184082, + "learning_rate": 4.568613004236507e-07, + "loss": 0.003, + "step": 30500 + }, + { + "epoch": 5.520889853499729, + "grad_norm": 11.987727165222168, + "learning_rate": 4.5640081046233193e-07, + "loss": 0.006, + "step": 30525 + }, + { + "epoch": 5.5254114668113585, + "grad_norm": 0.01995599828660488, + "learning_rate": 4.5594032050101304e-07, + "loss": 0.0024, + "step": 30550 + }, + { + "epoch": 5.529933080122988, + "grad_norm": 0.24483761191368103, + "learning_rate": 4.554798305396942e-07, + "loss": 0.0054, + "step": 30575 + }, + { + "epoch": 5.5344546934346175, + "grad_norm": 0.03671187534928322, + "learning_rate": 4.550193405783754e-07, + "loss": 0.014, + "step": 30600 + }, + { + "epoch": 5.538976306746247, + "grad_norm": 13.608981132507324, + "learning_rate": 4.545588506170565e-07, + "loss": 0.0025, + "step": 30625 + }, + { + "epoch": 5.5434979200578764, + "grad_norm": 68.79994201660156, + "learning_rate": 4.540983606557377e-07, + "loss": 0.0027, + "step": 30650 + }, + { + "epoch": 5.548019533369506, + "grad_norm": 0.06913666427135468, + "learning_rate": 4.536378706944188e-07, + "loss": 0.0039, + "step": 30675 + }, + { + "epoch": 5.552541146681136, + "grad_norm": 0.04184752330183983, + "learning_rate": 4.5317738073310004e-07, + "loss": 0.0105, + "step": 30700 + }, + { + "epoch": 5.557062759992766, + "grad_norm": 22.15312957763672, + "learning_rate": 4.5271689077178115e-07, + "loss": 0.0089, + "step": 30725 + }, + { + "epoch": 5.561584373304395, + "grad_norm": 0.17298342287540436, + 
"learning_rate": 4.5225640081046227e-07, + "loss": 0.0253, + "step": 30750 + }, + { + "epoch": 5.566105986616025, + "grad_norm": 17.002382278442383, + "learning_rate": 4.517959108491435e-07, + "loss": 0.024, + "step": 30775 + }, + { + "epoch": 5.570627599927654, + "grad_norm": 0.07881984114646912, + "learning_rate": 4.513354208878246e-07, + "loss": 0.0185, + "step": 30800 + }, + { + "epoch": 5.575149213239284, + "grad_norm": 1.0953749418258667, + "learning_rate": 4.508749309265058e-07, + "loss": 0.0238, + "step": 30825 + }, + { + "epoch": 5.579670826550913, + "grad_norm": 0.5996679067611694, + "learning_rate": 4.5041444096518693e-07, + "loss": 0.0408, + "step": 30850 + }, + { + "epoch": 5.584192439862543, + "grad_norm": 0.037622638046741486, + "learning_rate": 4.499539510038681e-07, + "loss": 0.0082, + "step": 30875 + }, + { + "epoch": 5.588714053174172, + "grad_norm": 0.8418008685112, + "learning_rate": 4.4949346104254926e-07, + "loss": 0.0049, + "step": 30900 + }, + { + "epoch": 5.5932356664858025, + "grad_norm": 0.01251909602433443, + "learning_rate": 4.490329710812304e-07, + "loss": 0.0055, + "step": 30925 + }, + { + "epoch": 5.597757279797432, + "grad_norm": 2.921856641769409, + "learning_rate": 4.485724811199116e-07, + "loss": 0.0089, + "step": 30950 + }, + { + "epoch": 5.6022788931090615, + "grad_norm": 45.41884231567383, + "learning_rate": 4.481119911585927e-07, + "loss": 0.0098, + "step": 30975 + }, + { + "epoch": 5.606800506420691, + "grad_norm": 11.943808555603027, + "learning_rate": 4.4765150119727393e-07, + "loss": 0.0201, + "step": 31000 + }, + { + "epoch": 5.61132211973232, + "grad_norm": 0.03331173211336136, + "learning_rate": 4.4719101123595504e-07, + "loss": 0.0075, + "step": 31025 + }, + { + "epoch": 5.61584373304395, + "grad_norm": 19.69361686706543, + "learning_rate": 4.4673052127463615e-07, + "loss": 0.011, + "step": 31050 + }, + { + "epoch": 5.620365346355579, + "grad_norm": 1.9068259000778198, + "learning_rate": 4.4627003131331737e-07, + 
"loss": 0.0067, + "step": 31075 + }, + { + "epoch": 5.62488695966721, + "grad_norm": 0.019552985206246376, + "learning_rate": 4.458095413519985e-07, + "loss": 0.0192, + "step": 31100 + }, + { + "epoch": 5.629408572978839, + "grad_norm": 0.02558848075568676, + "learning_rate": 4.453490513906797e-07, + "loss": 0.0064, + "step": 31125 + }, + { + "epoch": 5.633930186290469, + "grad_norm": 0.023367932066321373, + "learning_rate": 4.448885614293608e-07, + "loss": 0.0093, + "step": 31150 + }, + { + "epoch": 5.638451799602098, + "grad_norm": 0.04951472207903862, + "learning_rate": 4.4442807146804193e-07, + "loss": 0.0056, + "step": 31175 + }, + { + "epoch": 5.642973412913728, + "grad_norm": 0.042895007878541946, + "learning_rate": 4.4396758150672315e-07, + "loss": 0.0086, + "step": 31200 + }, + { + "epoch": 5.647495026225357, + "grad_norm": 3.2334864139556885, + "learning_rate": 4.4350709154540426e-07, + "loss": 0.0169, + "step": 31225 + }, + { + "epoch": 5.652016639536987, + "grad_norm": 4.943774700164795, + "learning_rate": 4.430466015840855e-07, + "loss": 0.0401, + "step": 31250 + }, + { + "epoch": 5.656538252848616, + "grad_norm": 0.135379821062088, + "learning_rate": 4.425861116227666e-07, + "loss": 0.0066, + "step": 31275 + }, + { + "epoch": 5.661059866160246, + "grad_norm": 0.023317676037549973, + "learning_rate": 4.421256216614477e-07, + "loss": 0.0035, + "step": 31300 + }, + { + "epoch": 5.665581479471875, + "grad_norm": 0.1724722981452942, + "learning_rate": 4.4166513170012893e-07, + "loss": 0.0106, + "step": 31325 + }, + { + "epoch": 5.670103092783505, + "grad_norm": 0.0674796774983406, + "learning_rate": 4.4120464173881004e-07, + "loss": 0.0078, + "step": 31350 + }, + { + "epoch": 5.674624706095135, + "grad_norm": 0.15505366027355194, + "learning_rate": 4.4074415177749126e-07, + "loss": 0.0069, + "step": 31375 + }, + { + "epoch": 5.679146319406764, + "grad_norm": 7.8905134201049805, + "learning_rate": 4.402836618161724e-07, + "loss": 0.0128, + "step": 31400 + 
}, + { + "epoch": 5.683667932718394, + "grad_norm": 24.007476806640625, + "learning_rate": 4.398231718548536e-07, + "loss": 0.0102, + "step": 31425 + }, + { + "epoch": 5.688189546030023, + "grad_norm": 0.03173492103815079, + "learning_rate": 4.393626818935347e-07, + "loss": 0.0056, + "step": 31450 + }, + { + "epoch": 5.692711159341653, + "grad_norm": 0.4066329300403595, + "learning_rate": 4.389021919322158e-07, + "loss": 0.0019, + "step": 31475 + }, + { + "epoch": 5.697232772653282, + "grad_norm": 10.2057466506958, + "learning_rate": 4.3844170197089704e-07, + "loss": 0.0063, + "step": 31500 + }, + { + "epoch": 5.701754385964913, + "grad_norm": 0.02208542451262474, + "learning_rate": 4.3798121200957815e-07, + "loss": 0.0202, + "step": 31525 + }, + { + "epoch": 5.706275999276542, + "grad_norm": 0.38067665696144104, + "learning_rate": 4.3752072204825937e-07, + "loss": 0.0076, + "step": 31550 + }, + { + "epoch": 5.710797612588172, + "grad_norm": 0.04764077439904213, + "learning_rate": 4.370602320869405e-07, + "loss": 0.0016, + "step": 31575 + }, + { + "epoch": 5.715319225899801, + "grad_norm": 0.04114853963255882, + "learning_rate": 4.365997421256216e-07, + "loss": 0.0109, + "step": 31600 + }, + { + "epoch": 5.719840839211431, + "grad_norm": 0.229792058467865, + "learning_rate": 4.361392521643028e-07, + "loss": 0.0155, + "step": 31625 + }, + { + "epoch": 5.72436245252306, + "grad_norm": 2.7913074493408203, + "learning_rate": 4.3567876220298393e-07, + "loss": 0.0199, + "step": 31650 + }, + { + "epoch": 5.72888406583469, + "grad_norm": 0.0458095520734787, + "learning_rate": 4.3521827224166515e-07, + "loss": 0.0096, + "step": 31675 + }, + { + "epoch": 5.733405679146319, + "grad_norm": 2.6488194465637207, + "learning_rate": 4.3475778228034626e-07, + "loss": 0.0095, + "step": 31700 + }, + { + "epoch": 5.7379272924579485, + "grad_norm": 0.013299129903316498, + "learning_rate": 4.3429729231902743e-07, + "loss": 0.0081, + "step": 31725 + }, + { + "epoch": 5.742448905769579, + 
"grad_norm": 0.6266424059867859, + "learning_rate": 4.338368023577086e-07, + "loss": 0.0009, + "step": 31750 + }, + { + "epoch": 5.746970519081208, + "grad_norm": 13.529029846191406, + "learning_rate": 4.333763123963897e-07, + "loss": 0.0102, + "step": 31775 + }, + { + "epoch": 5.751492132392838, + "grad_norm": 0.0697498694062233, + "learning_rate": 4.3291582243507093e-07, + "loss": 0.0053, + "step": 31800 + }, + { + "epoch": 5.756013745704467, + "grad_norm": 0.053803663700819016, + "learning_rate": 4.3245533247375204e-07, + "loss": 0.0016, + "step": 31825 + }, + { + "epoch": 5.760535359016097, + "grad_norm": 0.23834507167339325, + "learning_rate": 4.319948425124332e-07, + "loss": 0.0018, + "step": 31850 + }, + { + "epoch": 5.765056972327726, + "grad_norm": 0.01928309164941311, + "learning_rate": 4.315343525511144e-07, + "loss": 0.019, + "step": 31875 + }, + { + "epoch": 5.769578585639356, + "grad_norm": 0.20884621143341064, + "learning_rate": 4.310738625897955e-07, + "loss": 0.0331, + "step": 31900 + }, + { + "epoch": 5.774100198950986, + "grad_norm": 28.804271697998047, + "learning_rate": 4.306133726284767e-07, + "loss": 0.0103, + "step": 31925 + }, + { + "epoch": 5.778621812262616, + "grad_norm": 16.59792137145996, + "learning_rate": 4.301528826671578e-07, + "loss": 0.005, + "step": 31950 + }, + { + "epoch": 5.783143425574245, + "grad_norm": 13.525711059570312, + "learning_rate": 4.29692392705839e-07, + "loss": 0.0197, + "step": 31975 + }, + { + "epoch": 5.787665038885875, + "grad_norm": 0.2153875082731247, + "learning_rate": 4.2923190274452015e-07, + "loss": 0.0028, + "step": 32000 + }, + { + "epoch": 5.792186652197504, + "grad_norm": 21.80754852294922, + "learning_rate": 4.2877141278320127e-07, + "loss": 0.0509, + "step": 32025 + }, + { + "epoch": 5.7967082655091335, + "grad_norm": 0.027552086859941483, + "learning_rate": 4.283109228218825e-07, + "loss": 0.0455, + "step": 32050 + }, + { + "epoch": 5.801229878820763, + "grad_norm": 0.5974065065383911, + 
"learning_rate": 4.278504328605636e-07, + "loss": 0.003, + "step": 32075 + }, + { + "epoch": 5.8057514921323925, + "grad_norm": 0.0764407068490982, + "learning_rate": 4.273899428992448e-07, + "loss": 0.0073, + "step": 32100 + }, + { + "epoch": 5.810273105444022, + "grad_norm": 0.00713876448571682, + "learning_rate": 4.2692945293792593e-07, + "loss": 0.0016, + "step": 32125 + }, + { + "epoch": 5.814794718755652, + "grad_norm": 2.5511715412139893, + "learning_rate": 4.264689629766071e-07, + "loss": 0.0018, + "step": 32150 + }, + { + "epoch": 5.819316332067282, + "grad_norm": 0.024245627224445343, + "learning_rate": 4.2600847301528826e-07, + "loss": 0.0072, + "step": 32175 + }, + { + "epoch": 5.823837945378911, + "grad_norm": 0.7642532587051392, + "learning_rate": 4.255479830539694e-07, + "loss": 0.0063, + "step": 32200 + }, + { + "epoch": 5.828359558690541, + "grad_norm": 0.008916143327951431, + "learning_rate": 4.250874930926506e-07, + "loss": 0.0016, + "step": 32225 + }, + { + "epoch": 5.83288117200217, + "grad_norm": 0.04523088410496712, + "learning_rate": 4.246270031313317e-07, + "loss": 0.0068, + "step": 32250 + }, + { + "epoch": 5.8374027853138, + "grad_norm": 0.01864382065832615, + "learning_rate": 4.241665131700129e-07, + "loss": 0.0062, + "step": 32275 + }, + { + "epoch": 5.841924398625429, + "grad_norm": 0.09099319577217102, + "learning_rate": 4.2370602320869404e-07, + "loss": 0.0081, + "step": 32300 + }, + { + "epoch": 5.84644601193706, + "grad_norm": 0.14763249456882477, + "learning_rate": 4.2324553324737516e-07, + "loss": 0.008, + "step": 32325 + }, + { + "epoch": 5.850967625248689, + "grad_norm": 0.08158791810274124, + "learning_rate": 4.227850432860564e-07, + "loss": 0.0096, + "step": 32350 + }, + { + "epoch": 5.8554892385603186, + "grad_norm": 0.14870816469192505, + "learning_rate": 4.223245533247375e-07, + "loss": 0.0019, + "step": 32375 + }, + { + "epoch": 5.860010851871948, + "grad_norm": 0.02300347574055195, + "learning_rate": 
4.2186406336341865e-07, + "loss": 0.0054, + "step": 32400 + }, + { + "epoch": 5.8645324651835775, + "grad_norm": 1.2022995948791504, + "learning_rate": 4.214035734020998e-07, + "loss": 0.0355, + "step": 32425 + }, + { + "epoch": 5.869054078495207, + "grad_norm": 1.5367909669876099, + "learning_rate": 4.20943083440781e-07, + "loss": 0.0319, + "step": 32450 + }, + { + "epoch": 5.8735756918068365, + "grad_norm": 0.14563749730587006, + "learning_rate": 4.2048259347946215e-07, + "loss": 0.0059, + "step": 32475 + }, + { + "epoch": 5.878097305118466, + "grad_norm": 0.21176332235336304, + "learning_rate": 4.2002210351814327e-07, + "loss": 0.0023, + "step": 32500 + }, + { + "epoch": 5.882618918430095, + "grad_norm": 0.14737099409103394, + "learning_rate": 4.1956161355682443e-07, + "loss": 0.0029, + "step": 32525 + }, + { + "epoch": 5.887140531741726, + "grad_norm": 0.05561397597193718, + "learning_rate": 4.191011235955056e-07, + "loss": 0.0041, + "step": 32550 + }, + { + "epoch": 5.891662145053355, + "grad_norm": 0.1818033903837204, + "learning_rate": 4.1864063363418676e-07, + "loss": 0.0079, + "step": 32575 + }, + { + "epoch": 5.896183758364985, + "grad_norm": 6.998167514801025, + "learning_rate": 4.1818014367286793e-07, + "loss": 0.0076, + "step": 32600 + }, + { + "epoch": 5.900705371676614, + "grad_norm": 0.168256476521492, + "learning_rate": 4.1771965371154904e-07, + "loss": 0.0046, + "step": 32625 + }, + { + "epoch": 5.905226984988244, + "grad_norm": 0.041657544672489166, + "learning_rate": 4.172591637502302e-07, + "loss": 0.0054, + "step": 32650 + }, + { + "epoch": 5.909748598299873, + "grad_norm": 40.278446197509766, + "learning_rate": 4.167986737889114e-07, + "loss": 0.0125, + "step": 32675 + }, + { + "epoch": 5.914270211611503, + "grad_norm": 0.1068694144487381, + "learning_rate": 4.1633818382759254e-07, + "loss": 0.0023, + "step": 32700 + }, + { + "epoch": 5.918791824923133, + "grad_norm": 0.24333126842975616, + "learning_rate": 4.158776938662737e-07, + "loss": 
0.0289, + "step": 32725 + }, + { + "epoch": 5.9233134382347625, + "grad_norm": 0.16926701366901398, + "learning_rate": 4.154172039049548e-07, + "loss": 0.0123, + "step": 32750 + }, + { + "epoch": 5.927835051546392, + "grad_norm": 3.9394893646240234, + "learning_rate": 4.1495671394363604e-07, + "loss": 0.0244, + "step": 32775 + }, + { + "epoch": 5.9323566648580215, + "grad_norm": 0.06976808607578278, + "learning_rate": 4.1449622398231715e-07, + "loss": 0.0098, + "step": 32800 + }, + { + "epoch": 5.936878278169651, + "grad_norm": 24.640043258666992, + "learning_rate": 4.140357340209983e-07, + "loss": 0.0225, + "step": 32825 + }, + { + "epoch": 5.9413998914812804, + "grad_norm": 4.303247451782227, + "learning_rate": 4.135752440596795e-07, + "loss": 0.0136, + "step": 32850 + }, + { + "epoch": 5.94592150479291, + "grad_norm": 0.46884116530418396, + "learning_rate": 4.1311475409836065e-07, + "loss": 0.0156, + "step": 32875 + }, + { + "epoch": 5.950443118104539, + "grad_norm": 0.9783020615577698, + "learning_rate": 4.126542641370418e-07, + "loss": 0.0114, + "step": 32900 + }, + { + "epoch": 5.954964731416169, + "grad_norm": 0.5886393785476685, + "learning_rate": 4.1219377417572293e-07, + "loss": 0.0077, + "step": 32925 + }, + { + "epoch": 5.959486344727799, + "grad_norm": 0.019434532150626183, + "learning_rate": 4.117332842144041e-07, + "loss": 0.0025, + "step": 32950 + }, + { + "epoch": 5.964007958039429, + "grad_norm": 2.6205480098724365, + "learning_rate": 4.1127279425308527e-07, + "loss": 0.0026, + "step": 32975 + }, + { + "epoch": 5.968529571351058, + "grad_norm": 0.25357627868652344, + "learning_rate": 4.1081230429176643e-07, + "loss": 0.0056, + "step": 33000 + }, + { + "epoch": 5.973051184662688, + "grad_norm": 9.764650344848633, + "learning_rate": 4.103518143304476e-07, + "loss": 0.005, + "step": 33025 + }, + { + "epoch": 5.977572797974317, + "grad_norm": 0.15983889997005463, + "learning_rate": 4.098913243691287e-07, + "loss": 0.0093, + "step": 33050 + }, + { + 
"epoch": 5.982094411285947, + "grad_norm": 0.44210419058799744, + "learning_rate": 4.094308344078099e-07, + "loss": 0.006, + "step": 33075 + }, + { + "epoch": 5.986616024597576, + "grad_norm": 7.316641807556152, + "learning_rate": 4.0897034444649104e-07, + "loss": 0.0096, + "step": 33100 + }, + { + "epoch": 5.9911376379092065, + "grad_norm": 0.040955204516649246, + "learning_rate": 4.085098544851722e-07, + "loss": 0.002, + "step": 33125 + }, + { + "epoch": 5.995659251220836, + "grad_norm": 28.565380096435547, + "learning_rate": 4.080493645238534e-07, + "loss": 0.0204, + "step": 33150 + }, + { + "epoch": 6.0, + "eval_loss": 0.35324448347091675, + "eval_runtime": 8693.2226, + "eval_samples_per_second": 1.092, + "eval_steps_per_second": 0.137, + "eval_wer": 0.10656630365150545, + "step": 33174 + }, + { + "epoch": 6.0001808645324655, + "grad_norm": 1.004451870918274, + "learning_rate": 4.0760729416098725e-07, + "loss": 0.0388, + "step": 33175 + }, + { + "epoch": 6.004702477844095, + "grad_norm": 0.3171859383583069, + "learning_rate": 4.071468041996684e-07, + "loss": 0.0116, + "step": 33200 + }, + { + "epoch": 6.009224091155724, + "grad_norm": 0.3744235634803772, + "learning_rate": 4.066863142383496e-07, + "loss": 0.0065, + "step": 33225 + }, + { + "epoch": 6.013745704467354, + "grad_norm": 0.04872240498661995, + "learning_rate": 4.0622582427703075e-07, + "loss": 0.0118, + "step": 33250 + }, + { + "epoch": 6.018267317778983, + "grad_norm": 0.20710858702659607, + "learning_rate": 4.057653343157119e-07, + "loss": 0.0018, + "step": 33275 + }, + { + "epoch": 6.022788931090613, + "grad_norm": 0.6177895665168762, + "learning_rate": 4.0530484435439303e-07, + "loss": 0.0079, + "step": 33300 + }, + { + "epoch": 6.027310544402242, + "grad_norm": 0.042505986988544464, + "learning_rate": 4.0484435439307425e-07, + "loss": 0.0122, + "step": 33325 + }, + { + "epoch": 6.031832157713873, + "grad_norm": 0.018311861902475357, + "learning_rate": 4.0438386443175536e-07, + "loss": 0.0019, + 
"step": 33350 + }, + { + "epoch": 6.036353771025502, + "grad_norm": 0.05716663971543312, + "learning_rate": 4.0392337447043653e-07, + "loss": 0.0006, + "step": 33375 + }, + { + "epoch": 6.040875384337132, + "grad_norm": 0.03125373646616936, + "learning_rate": 4.034628845091177e-07, + "loss": 0.0008, + "step": 33400 + }, + { + "epoch": 6.045396997648761, + "grad_norm": 0.04944000765681267, + "learning_rate": 4.030023945477988e-07, + "loss": 0.0112, + "step": 33425 + }, + { + "epoch": 6.049918610960391, + "grad_norm": 3.482018232345581, + "learning_rate": 4.0254190458648003e-07, + "loss": 0.0068, + "step": 33450 + }, + { + "epoch": 6.05444022427202, + "grad_norm": 0.051414769142866135, + "learning_rate": 4.0208141462516114e-07, + "loss": 0.0032, + "step": 33475 + }, + { + "epoch": 6.05896183758365, + "grad_norm": 1.4644255638122559, + "learning_rate": 4.016209246638423e-07, + "loss": 0.0123, + "step": 33500 + }, + { + "epoch": 6.063483450895279, + "grad_norm": 0.038726359605789185, + "learning_rate": 4.011604347025235e-07, + "loss": 0.0028, + "step": 33525 + }, + { + "epoch": 6.068005064206909, + "grad_norm": 0.45498254895210266, + "learning_rate": 4.006999447412046e-07, + "loss": 0.019, + "step": 33550 + }, + { + "epoch": 6.072526677518539, + "grad_norm": 2.0771706104278564, + "learning_rate": 4.002394547798858e-07, + "loss": 0.0319, + "step": 33575 + }, + { + "epoch": 6.077048290830168, + "grad_norm": 0.0325147807598114, + "learning_rate": 3.997789648185669e-07, + "loss": 0.0073, + "step": 33600 + }, + { + "epoch": 6.081569904141798, + "grad_norm": 0.1332240253686905, + "learning_rate": 3.993184748572481e-07, + "loss": 0.0051, + "step": 33625 + }, + { + "epoch": 6.086091517453427, + "grad_norm": 0.13362543284893036, + "learning_rate": 3.9885798489592925e-07, + "loss": 0.0042, + "step": 33650 + }, + { + "epoch": 6.090613130765057, + "grad_norm": 0.09919251501560211, + "learning_rate": 3.9839749493461037e-07, + "loss": 0.0028, + "step": 33675 + }, + { + "epoch": 
6.095134744076686, + "grad_norm": 0.2754238545894623, + "learning_rate": 3.979370049732916e-07, + "loss": 0.0031, + "step": 33700 + }, + { + "epoch": 6.099656357388316, + "grad_norm": 0.028968511149287224, + "learning_rate": 3.974765150119727e-07, + "loss": 0.0141, + "step": 33725 + }, + { + "epoch": 6.104177970699946, + "grad_norm": 0.2524532675743103, + "learning_rate": 3.970160250506539e-07, + "loss": 0.0071, + "step": 33750 + }, + { + "epoch": 6.108699584011576, + "grad_norm": 0.031802672892808914, + "learning_rate": 3.9655553508933503e-07, + "loss": 0.0012, + "step": 33775 + }, + { + "epoch": 6.113221197323205, + "grad_norm": 0.05196872353553772, + "learning_rate": 3.9609504512801614e-07, + "loss": 0.0072, + "step": 33800 + }, + { + "epoch": 6.117742810634835, + "grad_norm": 0.03085019811987877, + "learning_rate": 3.9563455516669736e-07, + "loss": 0.002, + "step": 33825 + }, + { + "epoch": 6.122264423946464, + "grad_norm": 0.0648372620344162, + "learning_rate": 3.951740652053785e-07, + "loss": 0.0254, + "step": 33850 + }, + { + "epoch": 6.126786037258094, + "grad_norm": 0.00866938941180706, + "learning_rate": 3.947135752440597e-07, + "loss": 0.0108, + "step": 33875 + }, + { + "epoch": 6.131307650569723, + "grad_norm": 0.19999012351036072, + "learning_rate": 3.942530852827408e-07, + "loss": 0.0069, + "step": 33900 + }, + { + "epoch": 6.1358292638813525, + "grad_norm": 0.04982515051960945, + "learning_rate": 3.93792595321422e-07, + "loss": 0.0016, + "step": 33925 + }, + { + "epoch": 6.140350877192983, + "grad_norm": 2.1079819202423096, + "learning_rate": 3.9333210536010314e-07, + "loss": 0.0117, + "step": 33950 + }, + { + "epoch": 6.144872490504612, + "grad_norm": 0.12225229293107986, + "learning_rate": 3.9287161539878426e-07, + "loss": 0.0389, + "step": 33975 + }, + { + "epoch": 6.149394103816242, + "grad_norm": 0.014936073683202267, + "learning_rate": 3.924111254374655e-07, + "loss": 0.0173, + "step": 34000 + }, + { + "epoch": 6.153915717127871, + "grad_norm": 
0.05773633345961571, + "learning_rate": 3.919506354761466e-07, + "loss": 0.0046, + "step": 34025 + }, + { + "epoch": 6.158437330439501, + "grad_norm": 0.7598387598991394, + "learning_rate": 3.914901455148278e-07, + "loss": 0.013, + "step": 34050 + }, + { + "epoch": 6.16295894375113, + "grad_norm": 48.75155258178711, + "learning_rate": 3.910296555535089e-07, + "loss": 0.0044, + "step": 34075 + }, + { + "epoch": 6.16748055706276, + "grad_norm": 0.024905025959014893, + "learning_rate": 3.9056916559219003e-07, + "loss": 0.0021, + "step": 34100 + }, + { + "epoch": 6.172002170374389, + "grad_norm": 0.01038370281457901, + "learning_rate": 3.9010867563087125e-07, + "loss": 0.0016, + "step": 34125 + }, + { + "epoch": 6.176523783686019, + "grad_norm": 2.034224510192871, + "learning_rate": 3.8964818566955237e-07, + "loss": 0.0096, + "step": 34150 + }, + { + "epoch": 6.181045396997649, + "grad_norm": 1.9965308904647827, + "learning_rate": 3.891876957082336e-07, + "loss": 0.0176, + "step": 34175 + }, + { + "epoch": 6.185567010309279, + "grad_norm": 108.9871826171875, + "learning_rate": 3.887272057469147e-07, + "loss": 0.0172, + "step": 34200 + }, + { + "epoch": 6.190088623620908, + "grad_norm": 0.04911862686276436, + "learning_rate": 3.882667157855958e-07, + "loss": 0.0012, + "step": 34225 + }, + { + "epoch": 6.1946102369325375, + "grad_norm": 2.3518688678741455, + "learning_rate": 3.8780622582427703e-07, + "loss": 0.003, + "step": 34250 + }, + { + "epoch": 6.199131850244167, + "grad_norm": 0.08073610067367554, + "learning_rate": 3.8734573586295814e-07, + "loss": 0.0228, + "step": 34275 + }, + { + "epoch": 6.2036534635557965, + "grad_norm": 0.04245547577738762, + "learning_rate": 3.8688524590163936e-07, + "loss": 0.0008, + "step": 34300 + }, + { + "epoch": 6.208175076867426, + "grad_norm": 0.04015149176120758, + "learning_rate": 3.864247559403205e-07, + "loss": 0.003, + "step": 34325 + }, + { + "epoch": 6.2126966901790555, + "grad_norm": 0.040025342255830765, + "learning_rate": 
3.859642659790016e-07, + "loss": 0.0283, + "step": 34350 + }, + { + "epoch": 6.217218303490686, + "grad_norm": 4.162205696105957, + "learning_rate": 3.855037760176828e-07, + "loss": 0.0259, + "step": 34375 + }, + { + "epoch": 6.221739916802315, + "grad_norm": 0.10629791021347046, + "learning_rate": 3.850432860563639e-07, + "loss": 0.0046, + "step": 34400 + }, + { + "epoch": 6.226261530113945, + "grad_norm": 0.05209145322442055, + "learning_rate": 3.8458279609504514e-07, + "loss": 0.0063, + "step": 34425 + }, + { + "epoch": 6.230783143425574, + "grad_norm": 0.03029513917863369, + "learning_rate": 3.8412230613372625e-07, + "loss": 0.0035, + "step": 34450 + }, + { + "epoch": 6.235304756737204, + "grad_norm": 0.0416143536567688, + "learning_rate": 3.836618161724075e-07, + "loss": 0.0064, + "step": 34475 + }, + { + "epoch": 6.239826370048833, + "grad_norm": 17.89125633239746, + "learning_rate": 3.832013262110886e-07, + "loss": 0.0103, + "step": 34500 + }, + { + "epoch": 6.244347983360463, + "grad_norm": 0.08937986195087433, + "learning_rate": 3.827408362497697e-07, + "loss": 0.0102, + "step": 34525 + }, + { + "epoch": 6.248869596672092, + "grad_norm": 0.012076592072844505, + "learning_rate": 3.822803462884509e-07, + "loss": 0.0012, + "step": 34550 + }, + { + "epoch": 6.2533912099837226, + "grad_norm": 13.431784629821777, + "learning_rate": 3.8181985632713203e-07, + "loss": 0.0029, + "step": 34575 + }, + { + "epoch": 6.257912823295352, + "grad_norm": 0.15255410969257355, + "learning_rate": 3.8135936636581325e-07, + "loss": 0.0056, + "step": 34600 + }, + { + "epoch": 6.2624344366069815, + "grad_norm": 0.7227760553359985, + "learning_rate": 3.8089887640449437e-07, + "loss": 0.0048, + "step": 34625 + }, + { + "epoch": 6.266956049918611, + "grad_norm": 0.022182561457157135, + "learning_rate": 3.804383864431755e-07, + "loss": 0.0055, + "step": 34650 + }, + { + "epoch": 6.2714776632302405, + "grad_norm": 0.10832487791776657, + "learning_rate": 3.799778964818567e-07, + "loss": 
0.0141, + "step": 34675 + }, + { + "epoch": 6.27599927654187, + "grad_norm": 0.08272892981767654, + "learning_rate": 3.795174065205378e-07, + "loss": 0.022, + "step": 34700 + }, + { + "epoch": 6.280520889853499, + "grad_norm": 0.10777498036623001, + "learning_rate": 3.7905691655921903e-07, + "loss": 0.0062, + "step": 34725 + }, + { + "epoch": 6.285042503165129, + "grad_norm": 0.1464318186044693, + "learning_rate": 3.7859642659790014e-07, + "loss": 0.0078, + "step": 34750 + }, + { + "epoch": 6.289564116476759, + "grad_norm": 2.59875750541687, + "learning_rate": 3.781359366365813e-07, + "loss": 0.0364, + "step": 34775 + }, + { + "epoch": 6.294085729788389, + "grad_norm": 0.6128166913986206, + "learning_rate": 3.776754466752625e-07, + "loss": 0.0119, + "step": 34800 + }, + { + "epoch": 6.298607343100018, + "grad_norm": 0.3331466019153595, + "learning_rate": 3.772149567139436e-07, + "loss": 0.0147, + "step": 34825 + }, + { + "epoch": 6.303128956411648, + "grad_norm": 1.0157824754714966, + "learning_rate": 3.767544667526248e-07, + "loss": 0.005, + "step": 34850 + }, + { + "epoch": 6.307650569723277, + "grad_norm": 0.20274540781974792, + "learning_rate": 3.762939767913059e-07, + "loss": 0.0027, + "step": 34875 + }, + { + "epoch": 6.312172183034907, + "grad_norm": 0.29477596282958984, + "learning_rate": 3.758334868299871e-07, + "loss": 0.006, + "step": 34900 + }, + { + "epoch": 6.316693796346536, + "grad_norm": 0.10228332132101059, + "learning_rate": 3.7537299686866825e-07, + "loss": 0.002, + "step": 34925 + }, + { + "epoch": 6.321215409658166, + "grad_norm": 0.08996782451868057, + "learning_rate": 3.7491250690734937e-07, + "loss": 0.0048, + "step": 34950 + }, + { + "epoch": 6.325737022969796, + "grad_norm": 0.012035650201141834, + "learning_rate": 3.744520169460306e-07, + "loss": 0.0157, + "step": 34975 + }, + { + "epoch": 6.3302586362814255, + "grad_norm": 0.035582542419433594, + "learning_rate": 3.739915269847117e-07, + "loss": 0.0112, + "step": 35000 + }, + { + 
"epoch": 6.334780249593055, + "grad_norm": 0.01056807953864336, + "learning_rate": 3.7353103702339287e-07, + "loss": 0.0008, + "step": 35025 + }, + { + "epoch": 6.3393018629046844, + "grad_norm": 0.2861970067024231, + "learning_rate": 3.7307054706207403e-07, + "loss": 0.0057, + "step": 35050 + }, + { + "epoch": 6.343823476216314, + "grad_norm": 0.013209059834480286, + "learning_rate": 3.7261005710075515e-07, + "loss": 0.0134, + "step": 35075 + }, + { + "epoch": 6.348345089527943, + "grad_norm": 0.1460546851158142, + "learning_rate": 3.7214956713943636e-07, + "loss": 0.0073, + "step": 35100 + }, + { + "epoch": 6.352866702839573, + "grad_norm": 0.09581249952316284, + "learning_rate": 3.716890771781175e-07, + "loss": 0.0104, + "step": 35125 + }, + { + "epoch": 6.357388316151202, + "grad_norm": 0.08900044858455658, + "learning_rate": 3.712285872167987e-07, + "loss": 0.0052, + "step": 35150 + }, + { + "epoch": 6.361909929462833, + "grad_norm": 0.3841894865036011, + "learning_rate": 3.707680972554798e-07, + "loss": 0.0195, + "step": 35175 + }, + { + "epoch": 6.366431542774462, + "grad_norm": 0.09828022122383118, + "learning_rate": 3.70307607294161e-07, + "loss": 0.012, + "step": 35200 + }, + { + "epoch": 6.370953156086092, + "grad_norm": 6.788888931274414, + "learning_rate": 3.6984711733284214e-07, + "loss": 0.023, + "step": 35225 + }, + { + "epoch": 6.375474769397721, + "grad_norm": 2.6568901538848877, + "learning_rate": 3.6938662737152326e-07, + "loss": 0.0014, + "step": 35250 + }, + { + "epoch": 6.379996382709351, + "grad_norm": 0.021253295242786407, + "learning_rate": 3.689261374102045e-07, + "loss": 0.0089, + "step": 35275 + }, + { + "epoch": 6.38451799602098, + "grad_norm": 0.7334450483322144, + "learning_rate": 3.684656474488856e-07, + "loss": 0.0017, + "step": 35300 + }, + { + "epoch": 6.38903960933261, + "grad_norm": 0.41396549344062805, + "learning_rate": 3.6800515748756676e-07, + "loss": 0.0033, + "step": 35325 + }, + { + "epoch": 6.393561222644239, + 
"grad_norm": 1.214400291442871, + "learning_rate": 3.675446675262479e-07, + "loss": 0.0016, + "step": 35350 + }, + { + "epoch": 6.3980828359558695, + "grad_norm": 0.03277917578816414, + "learning_rate": 3.6708417756492903e-07, + "loss": 0.0009, + "step": 35375 + }, + { + "epoch": 6.402604449267499, + "grad_norm": 0.1065434068441391, + "learning_rate": 3.6662368760361025e-07, + "loss": 0.0034, + "step": 35400 + }, + { + "epoch": 6.407126062579128, + "grad_norm": 1.2677608728408813, + "learning_rate": 3.6616319764229137e-07, + "loss": 0.0024, + "step": 35425 + }, + { + "epoch": 6.411647675890758, + "grad_norm": 0.034879542887210846, + "learning_rate": 3.6570270768097253e-07, + "loss": 0.0147, + "step": 35450 + }, + { + "epoch": 6.416169289202387, + "grad_norm": 0.009821565821766853, + "learning_rate": 3.652422177196537e-07, + "loss": 0.0084, + "step": 35475 + }, + { + "epoch": 6.420690902514017, + "grad_norm": 0.15989068150520325, + "learning_rate": 3.6478172775833487e-07, + "loss": 0.0253, + "step": 35500 + }, + { + "epoch": 6.425212515825646, + "grad_norm": 0.021616969257593155, + "learning_rate": 3.6432123779701603e-07, + "loss": 0.0385, + "step": 35525 + }, + { + "epoch": 6.429734129137276, + "grad_norm": 64.16004180908203, + "learning_rate": 3.6386074783569715e-07, + "loss": 0.0218, + "step": 35550 + }, + { + "epoch": 6.434255742448906, + "grad_norm": 2.1929166316986084, + "learning_rate": 3.634002578743783e-07, + "loss": 0.0276, + "step": 35575 + }, + { + "epoch": 6.438777355760536, + "grad_norm": 0.3426229655742645, + "learning_rate": 3.629397679130595e-07, + "loss": 0.0047, + "step": 35600 + }, + { + "epoch": 6.443298969072165, + "grad_norm": 0.9598920345306396, + "learning_rate": 3.6247927795174064e-07, + "loss": 0.0171, + "step": 35625 + }, + { + "epoch": 6.447820582383795, + "grad_norm": 0.39060238003730774, + "learning_rate": 3.620187879904218e-07, + "loss": 0.011, + "step": 35650 + }, + { + "epoch": 6.452342195695424, + "grad_norm": 0.9227154850959778, + 
"learning_rate": 3.615582980291029e-07, + "loss": 0.023, + "step": 35675 + }, + { + "epoch": 6.456863809007054, + "grad_norm": 5.520915985107422, + "learning_rate": 3.610978080677841e-07, + "loss": 0.0012, + "step": 35700 + }, + { + "epoch": 6.461385422318683, + "grad_norm": 0.09858091920614243, + "learning_rate": 3.6063731810646526e-07, + "loss": 0.0038, + "step": 35725 + }, + { + "epoch": 6.4659070356303125, + "grad_norm": 0.02428305707871914, + "learning_rate": 3.601768281451464e-07, + "loss": 0.004, + "step": 35750 + }, + { + "epoch": 6.470428648941942, + "grad_norm": 0.07016027718782425, + "learning_rate": 3.597163381838276e-07, + "loss": 0.0006, + "step": 35775 + }, + { + "epoch": 6.474950262253572, + "grad_norm": 0.1356564462184906, + "learning_rate": 3.592558482225087e-07, + "loss": 0.0025, + "step": 35800 + }, + { + "epoch": 6.479471875565202, + "grad_norm": 0.056458380073308945, + "learning_rate": 3.587953582611899e-07, + "loss": 0.0128, + "step": 35825 + }, + { + "epoch": 6.483993488876831, + "grad_norm": 2.0865302085876465, + "learning_rate": 3.5833486829987103e-07, + "loss": 0.0052, + "step": 35850 + }, + { + "epoch": 6.488515102188461, + "grad_norm": 0.2598702013492584, + "learning_rate": 3.578743783385522e-07, + "loss": 0.0079, + "step": 35875 + }, + { + "epoch": 6.49303671550009, + "grad_norm": 2.376058578491211, + "learning_rate": 3.5741388837723337e-07, + "loss": 0.0096, + "step": 35900 + }, + { + "epoch": 6.49755832881172, + "grad_norm": 0.022796526551246643, + "learning_rate": 3.5695339841591453e-07, + "loss": 0.0178, + "step": 35925 + }, + { + "epoch": 6.502079942123349, + "grad_norm": 17.341182708740234, + "learning_rate": 3.564929084545957e-07, + "loss": 0.0093, + "step": 35950 + }, + { + "epoch": 6.50660155543498, + "grad_norm": 0.10820627212524414, + "learning_rate": 3.560324184932768e-07, + "loss": 0.0426, + "step": 35975 + }, + { + "epoch": 6.511123168746609, + "grad_norm": 0.10570292919874191, + "learning_rate": 3.55571928531958e-07, + 
"loss": 0.0118, + "step": 36000 + }, + { + "epoch": 6.515644782058239, + "grad_norm": 5.8104658126831055, + "learning_rate": 3.5511143857063914e-07, + "loss": 0.0083, + "step": 36025 + }, + { + "epoch": 6.520166395369868, + "grad_norm": 0.9492124915122986, + "learning_rate": 3.546509486093203e-07, + "loss": 0.0157, + "step": 36050 + }, + { + "epoch": 6.524688008681498, + "grad_norm": 0.022533750161528587, + "learning_rate": 3.541904586480015e-07, + "loss": 0.0021, + "step": 36075 + }, + { + "epoch": 6.529209621993127, + "grad_norm": 0.35308748483657837, + "learning_rate": 3.537299686866826e-07, + "loss": 0.0015, + "step": 36100 + }, + { + "epoch": 6.5337312353047565, + "grad_norm": 1.7888010740280151, + "learning_rate": 3.5326947872536376e-07, + "loss": 0.004, + "step": 36125 + }, + { + "epoch": 6.538252848616386, + "grad_norm": 0.1663166582584381, + "learning_rate": 3.528089887640449e-07, + "loss": 0.0048, + "step": 36150 + }, + { + "epoch": 6.5427744619280155, + "grad_norm": 0.4084545969963074, + "learning_rate": 3.523484988027261e-07, + "loss": 0.0004, + "step": 36175 + }, + { + "epoch": 6.547296075239646, + "grad_norm": 0.039476677775382996, + "learning_rate": 3.5188800884140726e-07, + "loss": 0.0006, + "step": 36200 + }, + { + "epoch": 6.551817688551275, + "grad_norm": 0.0072963847778737545, + "learning_rate": 3.514275188800884e-07, + "loss": 0.008, + "step": 36225 + }, + { + "epoch": 6.556339301862905, + "grad_norm": 0.01364920660853386, + "learning_rate": 3.5096702891876954e-07, + "loss": 0.0077, + "step": 36250 + }, + { + "epoch": 6.560860915174534, + "grad_norm": 0.796404242515564, + "learning_rate": 3.505065389574507e-07, + "loss": 0.0229, + "step": 36275 + }, + { + "epoch": 6.565382528486164, + "grad_norm": 0.11998113989830017, + "learning_rate": 3.5004604899613187e-07, + "loss": 0.0266, + "step": 36300 + }, + { + "epoch": 6.569904141797793, + "grad_norm": 0.41521695256233215, + "learning_rate": 3.4958555903481303e-07, + "loss": 0.0025, + "step": 36325 + 
}, + { + "epoch": 6.574425755109423, + "grad_norm": 0.07342156767845154, + "learning_rate": 3.491250690734942e-07, + "loss": 0.0189, + "step": 36350 + }, + { + "epoch": 6.578947368421053, + "grad_norm": 0.369840145111084, + "learning_rate": 3.486645791121753e-07, + "loss": 0.0209, + "step": 36375 + }, + { + "epoch": 6.583468981732683, + "grad_norm": 0.04731602966785431, + "learning_rate": 3.482040891508565e-07, + "loss": 0.0051, + "step": 36400 + }, + { + "epoch": 6.587990595044312, + "grad_norm": 0.21776604652404785, + "learning_rate": 3.4774359918953765e-07, + "loss": 0.0017, + "step": 36425 + }, + { + "epoch": 6.5925122083559415, + "grad_norm": 0.05085720121860504, + "learning_rate": 3.472831092282188e-07, + "loss": 0.0014, + "step": 36450 + }, + { + "epoch": 6.597033821667571, + "grad_norm": 1.6889132261276245, + "learning_rate": 3.468226192669e-07, + "loss": 0.0029, + "step": 36475 + }, + { + "epoch": 6.6015554349792005, + "grad_norm": 7.945947170257568, + "learning_rate": 3.4636212930558114e-07, + "loss": 0.0041, + "step": 36500 + }, + { + "epoch": 6.60607704829083, + "grad_norm": 0.7570422291755676, + "learning_rate": 3.4590163934426226e-07, + "loss": 0.0015, + "step": 36525 + }, + { + "epoch": 6.6105986616024595, + "grad_norm": 0.05319717898964882, + "learning_rate": 3.454411493829434e-07, + "loss": 0.0042, + "step": 36550 + }, + { + "epoch": 6.615120274914089, + "grad_norm": 0.012651624158024788, + "learning_rate": 3.449806594216246e-07, + "loss": 0.0028, + "step": 36575 + }, + { + "epoch": 6.619641888225719, + "grad_norm": 0.06157555803656578, + "learning_rate": 3.4452016946030576e-07, + "loss": 0.0004, + "step": 36600 + }, + { + "epoch": 6.624163501537349, + "grad_norm": 0.009101797826588154, + "learning_rate": 3.440596794989869e-07, + "loss": 0.0071, + "step": 36625 + }, + { + "epoch": 6.628685114848978, + "grad_norm": 0.08508434146642685, + "learning_rate": 3.435991895376681e-07, + "loss": 0.0046, + "step": 36650 + }, + { + "epoch": 6.633206728160608, 
+ "grad_norm": 0.026564456522464752, + "learning_rate": 3.431386995763492e-07, + "loss": 0.0059, + "step": 36675 + }, + { + "epoch": 6.637728341472237, + "grad_norm": 0.935742199420929, + "learning_rate": 3.4267820961503037e-07, + "loss": 0.0014, + "step": 36700 + }, + { + "epoch": 6.642249954783867, + "grad_norm": 7.623640060424805, + "learning_rate": 3.4221771965371153e-07, + "loss": 0.009, + "step": 36725 + }, + { + "epoch": 6.646771568095496, + "grad_norm": 0.023885022848844528, + "learning_rate": 3.417572296923927e-07, + "loss": 0.0017, + "step": 36750 + }, + { + "epoch": 6.6512931814071266, + "grad_norm": 0.20136716961860657, + "learning_rate": 3.4129673973107387e-07, + "loss": 0.0513, + "step": 36775 + }, + { + "epoch": 6.655814794718756, + "grad_norm": 0.06223003938794136, + "learning_rate": 3.40836249769755e-07, + "loss": 0.0029, + "step": 36800 + }, + { + "epoch": 6.6603364080303855, + "grad_norm": 0.20519289374351501, + "learning_rate": 3.4037575980843615e-07, + "loss": 0.0041, + "step": 36825 + }, + { + "epoch": 6.664858021342015, + "grad_norm": 101.21697998046875, + "learning_rate": 3.399152698471173e-07, + "loss": 0.0043, + "step": 36850 + }, + { + "epoch": 6.6693796346536445, + "grad_norm": 0.015112169086933136, + "learning_rate": 3.394547798857985e-07, + "loss": 0.0037, + "step": 36875 + }, + { + "epoch": 6.673901247965274, + "grad_norm": 0.8060915470123291, + "learning_rate": 3.3899428992447965e-07, + "loss": 0.0029, + "step": 36900 + }, + { + "epoch": 6.678422861276903, + "grad_norm": 0.3591912090778351, + "learning_rate": 3.3853379996316076e-07, + "loss": 0.0042, + "step": 36925 + }, + { + "epoch": 6.682944474588533, + "grad_norm": 0.3402554392814636, + "learning_rate": 3.38073310001842e-07, + "loss": 0.0005, + "step": 36950 + }, + { + "epoch": 6.687466087900162, + "grad_norm": 0.0494844950735569, + "learning_rate": 3.376128200405231e-07, + "loss": 0.0041, + "step": 36975 + }, + { + "epoch": 6.691987701211792, + "grad_norm": 0.08879272639751434, 
+ "learning_rate": 3.3715233007920426e-07, + "loss": 0.0045, + "step": 37000 + }, + { + "epoch": 6.696509314523422, + "grad_norm": 8.207866668701172, + "learning_rate": 3.366918401178854e-07, + "loss": 0.0093, + "step": 37025 + }, + { + "epoch": 6.701030927835052, + "grad_norm": 0.1084265485405922, + "learning_rate": 3.3624976975501935e-07, + "loss": 0.0175, + "step": 37050 + }, + { + "epoch": 6.705552541146681, + "grad_norm": 0.03057611919939518, + "learning_rate": 3.3578927979370047e-07, + "loss": 0.0027, + "step": 37075 + }, + { + "epoch": 6.710074154458311, + "grad_norm": 0.14145521819591522, + "learning_rate": 3.353287898323817e-07, + "loss": 0.0155, + "step": 37100 + }, + { + "epoch": 6.71459576776994, + "grad_norm": 0.03683609515428543, + "learning_rate": 3.348682998710628e-07, + "loss": 0.0121, + "step": 37125 + }, + { + "epoch": 6.71911738108157, + "grad_norm": 0.2000865340232849, + "learning_rate": 3.344078099097439e-07, + "loss": 0.0104, + "step": 37150 + }, + { + "epoch": 6.723638994393199, + "grad_norm": 1.1748223304748535, + "learning_rate": 3.3394731994842513e-07, + "loss": 0.0266, + "step": 37175 + }, + { + "epoch": 6.7281606077048295, + "grad_norm": 0.04432156682014465, + "learning_rate": 3.3348682998710625e-07, + "loss": 0.0047, + "step": 37200 + }, + { + "epoch": 6.732682221016459, + "grad_norm": 0.1042926087975502, + "learning_rate": 3.3302634002578746e-07, + "loss": 0.0017, + "step": 37225 + }, + { + "epoch": 6.7372038343280884, + "grad_norm": 0.025923365727066994, + "learning_rate": 3.325658500644686e-07, + "loss": 0.0023, + "step": 37250 + }, + { + "epoch": 6.741725447639718, + "grad_norm": 5.998639106750488, + "learning_rate": 3.321053601031497e-07, + "loss": 0.0066, + "step": 37275 + }, + { + "epoch": 6.746247060951347, + "grad_norm": 0.038715049624443054, + "learning_rate": 3.316448701418309e-07, + "loss": 0.0114, + "step": 37300 + }, + { + "epoch": 6.750768674262977, + "grad_norm": 0.2970021963119507, + "learning_rate": 
3.31184380180512e-07, + "loss": 0.0028, + "step": 37325 + }, + { + "epoch": 6.755290287574606, + "grad_norm": 0.021920220926404, + "learning_rate": 3.3072389021919324e-07, + "loss": 0.0028, + "step": 37350 + }, + { + "epoch": 6.759811900886236, + "grad_norm": 0.009460126049816608, + "learning_rate": 3.3026340025787436e-07, + "loss": 0.0038, + "step": 37375 + }, + { + "epoch": 6.764333514197865, + "grad_norm": 10.395684242248535, + "learning_rate": 3.298029102965555e-07, + "loss": 0.004, + "step": 37400 + }, + { + "epoch": 6.768855127509496, + "grad_norm": 0.3913702070713043, + "learning_rate": 3.293424203352367e-07, + "loss": 0.007, + "step": 37425 + }, + { + "epoch": 6.773376740821125, + "grad_norm": 0.009950965642929077, + "learning_rate": 3.288819303739178e-07, + "loss": 0.0049, + "step": 37450 + }, + { + "epoch": 6.777898354132755, + "grad_norm": 0.06827165186405182, + "learning_rate": 3.28421440412599e-07, + "loss": 0.0006, + "step": 37475 + }, + { + "epoch": 6.782419967444384, + "grad_norm": 0.044129110872745514, + "learning_rate": 3.2796095045128013e-07, + "loss": 0.0108, + "step": 37500 + }, + { + "epoch": 6.786941580756014, + "grad_norm": 0.08213396370410919, + "learning_rate": 3.2750046048996135e-07, + "loss": 0.001, + "step": 37525 + }, + { + "epoch": 6.791463194067643, + "grad_norm": 0.8949390053749084, + "learning_rate": 3.2703997052864247e-07, + "loss": 0.0508, + "step": 37550 + }, + { + "epoch": 6.795984807379273, + "grad_norm": 3.0352182388305664, + "learning_rate": 3.265794805673236e-07, + "loss": 0.0366, + "step": 37575 + }, + { + "epoch": 6.800506420690903, + "grad_norm": 0.03377075120806694, + "learning_rate": 3.261189906060048e-07, + "loss": 0.0061, + "step": 37600 + }, + { + "epoch": 6.805028034002532, + "grad_norm": 0.012610839679837227, + "learning_rate": 3.256585006446859e-07, + "loss": 0.0029, + "step": 37625 + }, + { + "epoch": 6.809549647314162, + "grad_norm": 0.3035992681980133, + "learning_rate": 3.2519801068336713e-07, + "loss": 
0.0075, + "step": 37650 + }, + { + "epoch": 6.814071260625791, + "grad_norm": 0.06034184619784355, + "learning_rate": 3.2473752072204824e-07, + "loss": 0.0042, + "step": 37675 + }, + { + "epoch": 6.818592873937421, + "grad_norm": 0.10909536480903625, + "learning_rate": 3.2427703076072936e-07, + "loss": 0.0116, + "step": 37700 + }, + { + "epoch": 6.82311448724905, + "grad_norm": 13.51667594909668, + "learning_rate": 3.238165407994106e-07, + "loss": 0.0039, + "step": 37725 + }, + { + "epoch": 6.82763610056068, + "grad_norm": 0.026187343522906303, + "learning_rate": 3.233560508380917e-07, + "loss": 0.0117, + "step": 37750 + }, + { + "epoch": 6.832157713872309, + "grad_norm": 0.04201328754425049, + "learning_rate": 3.228955608767729e-07, + "loss": 0.0197, + "step": 37775 + }, + { + "epoch": 6.836679327183939, + "grad_norm": 0.05836571007966995, + "learning_rate": 3.22435070915454e-07, + "loss": 0.0122, + "step": 37800 + }, + { + "epoch": 6.841200940495569, + "grad_norm": 0.005761469714343548, + "learning_rate": 3.219745809541352e-07, + "loss": 0.0194, + "step": 37825 + }, + { + "epoch": 6.845722553807199, + "grad_norm": 0.019468627870082855, + "learning_rate": 3.2151409099281636e-07, + "loss": 0.0053, + "step": 37850 + }, + { + "epoch": 6.850244167118828, + "grad_norm": 0.005167699884623289, + "learning_rate": 3.2105360103149747e-07, + "loss": 0.0013, + "step": 37875 + }, + { + "epoch": 6.854765780430458, + "grad_norm": 0.06815607845783234, + "learning_rate": 3.205931110701787e-07, + "loss": 0.0078, + "step": 37900 + }, + { + "epoch": 6.859287393742087, + "grad_norm": 0.05461608245968819, + "learning_rate": 3.201326211088598e-07, + "loss": 0.0046, + "step": 37925 + }, + { + "epoch": 6.8638090070537165, + "grad_norm": 0.3847118020057678, + "learning_rate": 3.1967213114754097e-07, + "loss": 0.0215, + "step": 37950 + }, + { + "epoch": 6.868330620365346, + "grad_norm": 0.15587353706359863, + "learning_rate": 3.1921164118622213e-07, + "loss": 0.03, + "step": 37975 + }, + { 
+ "epoch": 6.872852233676976, + "grad_norm": 0.06245379522442818, + "learning_rate": 3.1875115122490325e-07, + "loss": 0.0102, + "step": 38000 + }, + { + "epoch": 6.877373846988606, + "grad_norm": 0.03094295971095562, + "learning_rate": 3.1829066126358447e-07, + "loss": 0.0065, + "step": 38025 + }, + { + "epoch": 6.881895460300235, + "grad_norm": 0.1180916577577591, + "learning_rate": 3.178301713022656e-07, + "loss": 0.0013, + "step": 38050 + }, + { + "epoch": 6.886417073611865, + "grad_norm": 0.5813055634498596, + "learning_rate": 3.1736968134094675e-07, + "loss": 0.0071, + "step": 38075 + }, + { + "epoch": 6.890938686923494, + "grad_norm": 0.14167048037052155, + "learning_rate": 3.169091913796279e-07, + "loss": 0.0033, + "step": 38100 + }, + { + "epoch": 6.895460300235124, + "grad_norm": 0.1619337499141693, + "learning_rate": 3.164487014183091e-07, + "loss": 0.0045, + "step": 38125 + }, + { + "epoch": 6.899981913546753, + "grad_norm": 0.03847223520278931, + "learning_rate": 3.1598821145699024e-07, + "loss": 0.0043, + "step": 38150 + }, + { + "epoch": 6.904503526858383, + "grad_norm": 0.11694706231355667, + "learning_rate": 3.1552772149567136e-07, + "loss": 0.0097, + "step": 38175 + }, + { + "epoch": 6.909025140170012, + "grad_norm": 0.07207904756069183, + "learning_rate": 3.150672315343526e-07, + "loss": 0.0004, + "step": 38200 + }, + { + "epoch": 6.913546753481643, + "grad_norm": 1.5510050058364868, + "learning_rate": 3.146067415730337e-07, + "loss": 0.007, + "step": 38225 + }, + { + "epoch": 6.918068366793272, + "grad_norm": 0.13146886229515076, + "learning_rate": 3.1414625161171486e-07, + "loss": 0.0121, + "step": 38250 + }, + { + "epoch": 6.922589980104902, + "grad_norm": 0.09875814616680145, + "learning_rate": 3.13685761650396e-07, + "loss": 0.0128, + "step": 38275 + }, + { + "epoch": 6.927111593416531, + "grad_norm": 0.02351992577314377, + "learning_rate": 3.1322527168907714e-07, + "loss": 0.017, + "step": 38300 + }, + { + "epoch": 6.9316332067281605, + 
"grad_norm": 1.6462205648422241, + "learning_rate": 3.1276478172775835e-07, + "loss": 0.0443, + "step": 38325 + }, + { + "epoch": 6.93615482003979, + "grad_norm": 1.7699528932571411, + "learning_rate": 3.1230429176643947e-07, + "loss": 0.0535, + "step": 38350 + }, + { + "epoch": 6.9406764333514195, + "grad_norm": 0.2231673002243042, + "learning_rate": 3.1184380180512063e-07, + "loss": 0.0369, + "step": 38375 + }, + { + "epoch": 6.94519804666305, + "grad_norm": 4.6962738037109375, + "learning_rate": 3.113833118438018e-07, + "loss": 0.006, + "step": 38400 + }, + { + "epoch": 6.949719659974679, + "grad_norm": 0.08258053660392761, + "learning_rate": 3.109228218824829e-07, + "loss": 0.0052, + "step": 38425 + }, + { + "epoch": 6.954241273286309, + "grad_norm": 1.8889635801315308, + "learning_rate": 3.1046233192116413e-07, + "loss": 0.0061, + "step": 38450 + }, + { + "epoch": 6.958762886597938, + "grad_norm": 0.09585348516702652, + "learning_rate": 3.1000184195984525e-07, + "loss": 0.0047, + "step": 38475 + }, + { + "epoch": 6.963284499909568, + "grad_norm": 0.10662294924259186, + "learning_rate": 3.095413519985264e-07, + "loss": 0.0076, + "step": 38500 + }, + { + "epoch": 6.967806113221197, + "grad_norm": 0.006054690573364496, + "learning_rate": 3.090808620372076e-07, + "loss": 0.0095, + "step": 38525 + }, + { + "epoch": 6.972327726532827, + "grad_norm": 27.81556510925293, + "learning_rate": 3.0862037207588875e-07, + "loss": 0.0016, + "step": 38550 + }, + { + "epoch": 6.976849339844456, + "grad_norm": 0.030143573880195618, + "learning_rate": 3.081598821145699e-07, + "loss": 0.0136, + "step": 38575 + }, + { + "epoch": 6.981370953156086, + "grad_norm": 2.358839750289917, + "learning_rate": 3.07699392153251e-07, + "loss": 0.0021, + "step": 38600 + }, + { + "epoch": 6.985892566467716, + "grad_norm": 12.556412696838379, + "learning_rate": 3.072389021919322e-07, + "loss": 0.0012, + "step": 38625 + }, + { + "epoch": 6.9904141797793455, + "grad_norm": 0.13631652295589447, + 
"learning_rate": 3.0677841223061336e-07, + "loss": 0.0066, + "step": 38650 + }, + { + "epoch": 6.994935793090975, + "grad_norm": 41.28492736816406, + "learning_rate": 3.063179222692945e-07, + "loss": 0.0265, + "step": 38675 + }, + { + "epoch": 6.9994574064026045, + "grad_norm": 0.12053684145212173, + "learning_rate": 3.058574323079757e-07, + "loss": 0.0605, + "step": 38700 + }, + { + "epoch": 7.0, + "eval_loss": 0.3541729748249054, + "eval_runtime": 8626.509, + "eval_samples_per_second": 1.101, + "eval_steps_per_second": 0.138, + "eval_wer": 0.10525304292120435, + "step": 38703 + }, + { + "epoch": 7.003979019714234, + "grad_norm": 1.1688233613967896, + "learning_rate": 3.053969423466568e-07, + "loss": 0.0075, + "step": 38725 + }, + { + "epoch": 7.0085006330258635, + "grad_norm": 0.06878869980573654, + "learning_rate": 3.0493645238533797e-07, + "loss": 0.0013, + "step": 38750 + }, + { + "epoch": 7.013022246337493, + "grad_norm": 0.12803910672664642, + "learning_rate": 3.0447596242401914e-07, + "loss": 0.006, + "step": 38775 + }, + { + "epoch": 7.017543859649122, + "grad_norm": 11.873225212097168, + "learning_rate": 3.040154724627003e-07, + "loss": 0.0017, + "step": 38800 + }, + { + "epoch": 7.022065472960753, + "grad_norm": 0.09927644580602646, + "learning_rate": 3.0355498250138147e-07, + "loss": 0.0011, + "step": 38825 + }, + { + "epoch": 7.026587086272382, + "grad_norm": 0.014069181866943836, + "learning_rate": 3.030944925400626e-07, + "loss": 0.0007, + "step": 38850 + }, + { + "epoch": 7.031108699584012, + "grad_norm": 3.6158812046051025, + "learning_rate": 3.026340025787438e-07, + "loss": 0.0076, + "step": 38875 + }, + { + "epoch": 7.035630312895641, + "grad_norm": 0.049011897295713425, + "learning_rate": 3.021735126174249e-07, + "loss": 0.0027, + "step": 38900 + }, + { + "epoch": 7.040151926207271, + "grad_norm": 0.02783900685608387, + "learning_rate": 3.017130226561061e-07, + "loss": 0.0026, + "step": 38925 + }, + { + "epoch": 7.0446735395189, + "grad_norm": 
0.024276690557599068, + "learning_rate": 3.0125253269478725e-07, + "loss": 0.0018, + "step": 38950 + }, + { + "epoch": 7.04919515283053, + "grad_norm": 0.052325211465358734, + "learning_rate": 3.007920427334684e-07, + "loss": 0.0089, + "step": 38975 + }, + { + "epoch": 7.053716766142159, + "grad_norm": 0.021905574947595596, + "learning_rate": 3.003315527721496e-07, + "loss": 0.0027, + "step": 39000 + }, + { + "epoch": 7.0582383794537895, + "grad_norm": 0.013508542440831661, + "learning_rate": 2.998710628108307e-07, + "loss": 0.002, + "step": 39025 + }, + { + "epoch": 7.062759992765419, + "grad_norm": 0.052605342119932175, + "learning_rate": 2.9941057284951186e-07, + "loss": 0.0069, + "step": 39050 + }, + { + "epoch": 7.0672816060770485, + "grad_norm": 0.578596830368042, + "learning_rate": 2.98950082888193e-07, + "loss": 0.0269, + "step": 39075 + }, + { + "epoch": 7.071803219388678, + "grad_norm": 21.0313663482666, + "learning_rate": 2.984895929268742e-07, + "loss": 0.0125, + "step": 39100 + }, + { + "epoch": 7.076324832700307, + "grad_norm": 0.19108355045318604, + "learning_rate": 2.9802910296555536e-07, + "loss": 0.0074, + "step": 39125 + }, + { + "epoch": 7.080846446011937, + "grad_norm": 2.0000555515289307, + "learning_rate": 2.9756861300423647e-07, + "loss": 0.0028, + "step": 39150 + }, + { + "epoch": 7.085368059323566, + "grad_norm": 0.03665238618850708, + "learning_rate": 2.9710812304291764e-07, + "loss": 0.0046, + "step": 39175 + }, + { + "epoch": 7.089889672635196, + "grad_norm": 0.03791901841759682, + "learning_rate": 2.966476330815988e-07, + "loss": 0.0016, + "step": 39200 + }, + { + "epoch": 7.094411285946826, + "grad_norm": 0.016226934269070625, + "learning_rate": 2.9618714312027997e-07, + "loss": 0.0028, + "step": 39225 + }, + { + "epoch": 7.098932899258456, + "grad_norm": 12.398660659790039, + "learning_rate": 2.9572665315896113e-07, + "loss": 0.0017, + "step": 39250 + }, + { + "epoch": 7.103454512570085, + "grad_norm": 0.012261465191841125, + 
"learning_rate": 2.952661631976423e-07, + "loss": 0.001, + "step": 39275 + }, + { + "epoch": 7.107976125881715, + "grad_norm": 14.556396484375, + "learning_rate": 2.948056732363234e-07, + "loss": 0.0109, + "step": 39300 + }, + { + "epoch": 7.112497739193344, + "grad_norm": 0.11572438478469849, + "learning_rate": 2.943451832750046e-07, + "loss": 0.0049, + "step": 39325 + }, + { + "epoch": 7.117019352504974, + "grad_norm": 0.28125593066215515, + "learning_rate": 2.9388469331368575e-07, + "loss": 0.0114, + "step": 39350 + }, + { + "epoch": 7.121540965816603, + "grad_norm": 0.05056861415505409, + "learning_rate": 2.934242033523669e-07, + "loss": 0.0012, + "step": 39375 + }, + { + "epoch": 7.126062579128233, + "grad_norm": 0.22000250220298767, + "learning_rate": 2.929637133910481e-07, + "loss": 0.0305, + "step": 39400 + }, + { + "epoch": 7.130584192439863, + "grad_norm": 0.020098086446523666, + "learning_rate": 2.925032234297292e-07, + "loss": 0.0013, + "step": 39425 + }, + { + "epoch": 7.1351058057514924, + "grad_norm": 0.026488734409213066, + "learning_rate": 2.9204273346841036e-07, + "loss": 0.0356, + "step": 39450 + }, + { + "epoch": 7.139627419063122, + "grad_norm": 0.024454880505800247, + "learning_rate": 2.915822435070915e-07, + "loss": 0.0077, + "step": 39475 + }, + { + "epoch": 7.144149032374751, + "grad_norm": 0.017868295311927795, + "learning_rate": 2.911217535457727e-07, + "loss": 0.0215, + "step": 39500 + }, + { + "epoch": 7.148670645686381, + "grad_norm": 1.8522053956985474, + "learning_rate": 2.9066126358445386e-07, + "loss": 0.0043, + "step": 39525 + }, + { + "epoch": 7.15319225899801, + "grad_norm": 0.20884989202022552, + "learning_rate": 2.90200773623135e-07, + "loss": 0.0055, + "step": 39550 + }, + { + "epoch": 7.15771387230964, + "grad_norm": 0.1342606246471405, + "learning_rate": 2.8974028366181614e-07, + "loss": 0.0012, + "step": 39575 + }, + { + "epoch": 7.162235485621269, + "grad_norm": 0.14675575494766235, + "learning_rate": 
2.892797937004973e-07, + "loss": 0.0013, + "step": 39600 + }, + { + "epoch": 7.1667570989329, + "grad_norm": 0.034802380949258804, + "learning_rate": 2.8881930373917847e-07, + "loss": 0.0014, + "step": 39625 + }, + { + "epoch": 7.171278712244529, + "grad_norm": 0.03734385594725609, + "learning_rate": 2.8835881377785964e-07, + "loss": 0.0026, + "step": 39650 + }, + { + "epoch": 7.175800325556159, + "grad_norm": 0.7827721834182739, + "learning_rate": 2.878983238165408e-07, + "loss": 0.0009, + "step": 39675 + }, + { + "epoch": 7.180321938867788, + "grad_norm": 0.06746553629636765, + "learning_rate": 2.8743783385522197e-07, + "loss": 0.0024, + "step": 39700 + }, + { + "epoch": 7.184843552179418, + "grad_norm": 0.2701489329338074, + "learning_rate": 2.869773438939031e-07, + "loss": 0.0058, + "step": 39725 + }, + { + "epoch": 7.189365165491047, + "grad_norm": 0.021761702373623848, + "learning_rate": 2.8651685393258425e-07, + "loss": 0.0078, + "step": 39750 + }, + { + "epoch": 7.193886778802677, + "grad_norm": 0.004122884478420019, + "learning_rate": 2.860563639712654e-07, + "loss": 0.0009, + "step": 39775 + }, + { + "epoch": 7.198408392114306, + "grad_norm": 0.05977817252278328, + "learning_rate": 2.855958740099466e-07, + "loss": 0.0061, + "step": 39800 + }, + { + "epoch": 7.2029300054259355, + "grad_norm": 0.7055386900901794, + "learning_rate": 2.8513538404862775e-07, + "loss": 0.0058, + "step": 39825 + }, + { + "epoch": 7.207451618737566, + "grad_norm": 13.389866828918457, + "learning_rate": 2.8467489408730886e-07, + "loss": 0.0142, + "step": 39850 + }, + { + "epoch": 7.211973232049195, + "grad_norm": 0.13937248289585114, + "learning_rate": 2.8421440412599e-07, + "loss": 0.0028, + "step": 39875 + }, + { + "epoch": 7.216494845360825, + "grad_norm": 0.6235303282737732, + "learning_rate": 2.837539141646712e-07, + "loss": 0.0083, + "step": 39900 + }, + { + "epoch": 7.221016458672454, + "grad_norm": 2.9348771572113037, + "learning_rate": 2.8329342420335236e-07, + "loss": 
0.0374, + "step": 39925 + }, + { + "epoch": 7.225538071984084, + "grad_norm": 3.077686071395874, + "learning_rate": 2.828329342420335e-07, + "loss": 0.0082, + "step": 39950 + }, + { + "epoch": 7.230059685295713, + "grad_norm": 0.22787833213806152, + "learning_rate": 2.8237244428071464e-07, + "loss": 0.0012, + "step": 39975 + }, + { + "epoch": 7.234581298607343, + "grad_norm": 0.06242289021611214, + "learning_rate": 2.8191195431939586e-07, + "loss": 0.0061, + "step": 40000 + }, + { + "epoch": 7.239102911918972, + "grad_norm": 0.9802669882774353, + "learning_rate": 2.8145146435807697e-07, + "loss": 0.0092, + "step": 40025 + }, + { + "epoch": 7.243624525230603, + "grad_norm": 0.012614204548299313, + "learning_rate": 2.8099097439675814e-07, + "loss": 0.0005, + "step": 40050 + }, + { + "epoch": 7.248146138542232, + "grad_norm": 0.013021476566791534, + "learning_rate": 2.805304844354393e-07, + "loss": 0.0101, + "step": 40075 + }, + { + "epoch": 7.252667751853862, + "grad_norm": 0.008002633228898048, + "learning_rate": 2.800699944741204e-07, + "loss": 0.0083, + "step": 40100 + }, + { + "epoch": 7.257189365165491, + "grad_norm": 0.04996323958039284, + "learning_rate": 2.7960950451280164e-07, + "loss": 0.0011, + "step": 40125 + }, + { + "epoch": 7.2617109784771205, + "grad_norm": 11.185647964477539, + "learning_rate": 2.7914901455148275e-07, + "loss": 0.0034, + "step": 40150 + }, + { + "epoch": 7.26623259178875, + "grad_norm": 0.5441507697105408, + "learning_rate": 2.786885245901639e-07, + "loss": 0.012, + "step": 40175 + }, + { + "epoch": 7.2707542051003795, + "grad_norm": 0.020439432933926582, + "learning_rate": 2.782280346288451e-07, + "loss": 0.0094, + "step": 40200 + }, + { + "epoch": 7.275275818412009, + "grad_norm": 1.3434791564941406, + "learning_rate": 2.77785964265979e-07, + "loss": 0.0097, + "step": 40225 + }, + { + "epoch": 7.279797431723639, + "grad_norm": 0.302569180727005, + "learning_rate": 2.773254743046601e-07, + "loss": 0.0277, + "step": 40250 + }, + { + 
"epoch": 7.284319045035269, + "grad_norm": 0.06300198286771774, + "learning_rate": 2.7686498434334134e-07, + "loss": 0.0133, + "step": 40275 + }, + { + "epoch": 7.288840658346898, + "grad_norm": 22.47533416748047, + "learning_rate": 2.7640449438202246e-07, + "loss": 0.0173, + "step": 40300 + }, + { + "epoch": 7.293362271658528, + "grad_norm": 0.6076884865760803, + "learning_rate": 2.7594400442070357e-07, + "loss": 0.003, + "step": 40325 + }, + { + "epoch": 7.297883884970157, + "grad_norm": 0.18815143406391144, + "learning_rate": 2.754835144593848e-07, + "loss": 0.0089, + "step": 40350 + }, + { + "epoch": 7.302405498281787, + "grad_norm": 0.004078809637576342, + "learning_rate": 2.750230244980659e-07, + "loss": 0.0007, + "step": 40375 + }, + { + "epoch": 7.306927111593416, + "grad_norm": 0.06133987382054329, + "learning_rate": 2.745625345367471e-07, + "loss": 0.0008, + "step": 40400 + }, + { + "epoch": 7.311448724905046, + "grad_norm": 0.43629419803619385, + "learning_rate": 2.7410204457542824e-07, + "loss": 0.0055, + "step": 40425 + }, + { + "epoch": 7.315970338216676, + "grad_norm": 16.26517105102539, + "learning_rate": 2.736415546141094e-07, + "loss": 0.0016, + "step": 40450 + }, + { + "epoch": 7.320491951528306, + "grad_norm": 0.05941370874643326, + "learning_rate": 2.7318106465279057e-07, + "loss": 0.0071, + "step": 40475 + }, + { + "epoch": 7.325013564839935, + "grad_norm": 0.15563565492630005, + "learning_rate": 2.727205746914717e-07, + "loss": 0.0013, + "step": 40500 + }, + { + "epoch": 7.3295351781515645, + "grad_norm": 0.5691679120063782, + "learning_rate": 2.722600847301529e-07, + "loss": 0.0005, + "step": 40525 + }, + { + "epoch": 7.334056791463194, + "grad_norm": 0.004803875926882029, + "learning_rate": 2.71799594768834e-07, + "loss": 0.008, + "step": 40550 + }, + { + "epoch": 7.3385784047748235, + "grad_norm": 0.08438849449157715, + "learning_rate": 2.7133910480751523e-07, + "loss": 0.0012, + "step": 40575 + }, + { + "epoch": 7.343100018086453, + 
"grad_norm": 10.578065872192383, + "learning_rate": 2.7087861484619635e-07, + "loss": 0.0042, + "step": 40600 + }, + { + "epoch": 7.347621631398082, + "grad_norm": 0.00701162638142705, + "learning_rate": 2.7041812488487746e-07, + "loss": 0.0098, + "step": 40625 + }, + { + "epoch": 7.352143244709713, + "grad_norm": 0.029753483831882477, + "learning_rate": 2.699576349235587e-07, + "loss": 0.0032, + "step": 40650 + }, + { + "epoch": 7.356664858021342, + "grad_norm": 0.18117420375347137, + "learning_rate": 2.694971449622398e-07, + "loss": 0.0027, + "step": 40675 + }, + { + "epoch": 7.361186471332972, + "grad_norm": 0.9770309329032898, + "learning_rate": 2.69036655000921e-07, + "loss": 0.0229, + "step": 40700 + }, + { + "epoch": 7.365708084644601, + "grad_norm": 0.0630781352519989, + "learning_rate": 2.685761650396021e-07, + "loss": 0.0112, + "step": 40725 + }, + { + "epoch": 7.370229697956231, + "grad_norm": 0.0655864030122757, + "learning_rate": 2.6811567507828324e-07, + "loss": 0.0023, + "step": 40750 + }, + { + "epoch": 7.37475131126786, + "grad_norm": 0.28010591864585876, + "learning_rate": 2.6765518511696446e-07, + "loss": 0.0016, + "step": 40775 + }, + { + "epoch": 7.37927292457949, + "grad_norm": 0.04630829766392708, + "learning_rate": 2.6719469515564557e-07, + "loss": 0.0011, + "step": 40800 + }, + { + "epoch": 7.383794537891119, + "grad_norm": 0.03749445080757141, + "learning_rate": 2.667342051943268e-07, + "loss": 0.0038, + "step": 40825 + }, + { + "epoch": 7.3883161512027495, + "grad_norm": 0.22030295431613922, + "learning_rate": 2.662737152330079e-07, + "loss": 0.0049, + "step": 40850 + }, + { + "epoch": 7.392837764514379, + "grad_norm": 0.07691410928964615, + "learning_rate": 2.6581322527168907e-07, + "loss": 0.0004, + "step": 40875 + }, + { + "epoch": 7.3973593778260085, + "grad_norm": 0.027430010959506035, + "learning_rate": 2.6535273531037023e-07, + "loss": 0.0022, + "step": 40900 + }, + { + "epoch": 7.401880991137638, + "grad_norm": 
0.07501472532749176, + "learning_rate": 2.6489224534905135e-07, + "loss": 0.0013, + "step": 40925 + }, + { + "epoch": 7.4064026044492675, + "grad_norm": 0.004892929922789335, + "learning_rate": 2.6443175538773257e-07, + "loss": 0.0044, + "step": 40950 + }, + { + "epoch": 7.410924217760897, + "grad_norm": 0.11225175857543945, + "learning_rate": 2.639712654264137e-07, + "loss": 0.0145, + "step": 40975 + }, + { + "epoch": 7.415445831072526, + "grad_norm": 0.01234606932848692, + "learning_rate": 2.6351077546509485e-07, + "loss": 0.0007, + "step": 41000 + }, + { + "epoch": 7.419967444384156, + "grad_norm": 20.348722457885742, + "learning_rate": 2.63050285503776e-07, + "loss": 0.0173, + "step": 41025 + }, + { + "epoch": 7.424489057695786, + "grad_norm": 0.01828809268772602, + "learning_rate": 2.625897955424571e-07, + "loss": 0.0154, + "step": 41050 + }, + { + "epoch": 7.429010671007416, + "grad_norm": 0.1676705926656723, + "learning_rate": 2.6212930558113835e-07, + "loss": 0.0254, + "step": 41075 + }, + { + "epoch": 7.433532284319045, + "grad_norm": 0.04343516007065773, + "learning_rate": 2.6166881561981946e-07, + "loss": 0.0271, + "step": 41100 + }, + { + "epoch": 7.438053897630675, + "grad_norm": 0.06619753688573837, + "learning_rate": 2.612083256585006e-07, + "loss": 0.0081, + "step": 41125 + }, + { + "epoch": 7.442575510942304, + "grad_norm": 0.2102379947900772, + "learning_rate": 2.607478356971818e-07, + "loss": 0.0045, + "step": 41150 + }, + { + "epoch": 7.447097124253934, + "grad_norm": 0.020573345944285393, + "learning_rate": 2.6028734573586296e-07, + "loss": 0.0027, + "step": 41175 + }, + { + "epoch": 7.451618737565563, + "grad_norm": 0.0597822479903698, + "learning_rate": 2.598268557745441e-07, + "loss": 0.0018, + "step": 41200 + }, + { + "epoch": 7.456140350877193, + "grad_norm": 2.0168440341949463, + "learning_rate": 2.5936636581322524e-07, + "loss": 0.0125, + "step": 41225 + }, + { + "epoch": 7.460661964188823, + "grad_norm": 0.42403435707092285, + 
"learning_rate": 2.5890587585190646e-07, + "loss": 0.0032, + "step": 41250 + }, + { + "epoch": 7.4651835775004525, + "grad_norm": 0.03630862757563591, + "learning_rate": 2.5844538589058757e-07, + "loss": 0.0011, + "step": 41275 + }, + { + "epoch": 7.469705190812082, + "grad_norm": 0.008765432052314281, + "learning_rate": 2.5798489592926874e-07, + "loss": 0.0095, + "step": 41300 + }, + { + "epoch": 7.474226804123711, + "grad_norm": 0.019622275605797768, + "learning_rate": 2.575244059679499e-07, + "loss": 0.0237, + "step": 41325 + }, + { + "epoch": 7.478748417435341, + "grad_norm": 0.14788568019866943, + "learning_rate": 2.57063916006631e-07, + "loss": 0.0071, + "step": 41350 + }, + { + "epoch": 7.48327003074697, + "grad_norm": 9.201263427734375, + "learning_rate": 2.5660342604531223e-07, + "loss": 0.0059, + "step": 41375 + }, + { + "epoch": 7.4877916440586, + "grad_norm": 9.586228370666504, + "learning_rate": 2.5614293608399335e-07, + "loss": 0.001, + "step": 41400 + }, + { + "epoch": 7.492313257370229, + "grad_norm": 0.035080842673778534, + "learning_rate": 2.556824461226745e-07, + "loss": 0.008, + "step": 41425 + }, + { + "epoch": 7.496834870681859, + "grad_norm": 0.09500127285718918, + "learning_rate": 2.552219561613557e-07, + "loss": 0.0203, + "step": 41450 + }, + { + "epoch": 7.501356483993489, + "grad_norm": 0.030213013291358948, + "learning_rate": 2.547614662000368e-07, + "loss": 0.0226, + "step": 41475 + }, + { + "epoch": 7.505878097305119, + "grad_norm": 0.16554242372512817, + "learning_rate": 2.54300976238718e-07, + "loss": 0.0396, + "step": 41500 + }, + { + "epoch": 7.510399710616748, + "grad_norm": 0.049443017691373825, + "learning_rate": 2.538404862773991e-07, + "loss": 0.0085, + "step": 41525 + }, + { + "epoch": 7.514921323928378, + "grad_norm": 0.045605577528476715, + "learning_rate": 2.533799963160803e-07, + "loss": 0.0045, + "step": 41550 + }, + { + "epoch": 7.519442937240007, + "grad_norm": 0.023337364196777344, + "learning_rate": 
2.5291950635476146e-07, + "loss": 0.0102, + "step": 41575 + }, + { + "epoch": 7.523964550551637, + "grad_norm": 0.78485107421875, + "learning_rate": 2.524590163934426e-07, + "loss": 0.0024, + "step": 41600 + }, + { + "epoch": 7.528486163863266, + "grad_norm": 0.014340350404381752, + "learning_rate": 2.519985264321238e-07, + "loss": 0.0019, + "step": 41625 + }, + { + "epoch": 7.5330077771748964, + "grad_norm": 0.004216828849166632, + "learning_rate": 2.515380364708049e-07, + "loss": 0.003, + "step": 41650 + }, + { + "epoch": 7.537529390486526, + "grad_norm": 14.32552719116211, + "learning_rate": 2.5107754650948607e-07, + "loss": 0.0066, + "step": 41675 + }, + { + "epoch": 7.542051003798155, + "grad_norm": 0.160021111369133, + "learning_rate": 2.5061705654816724e-07, + "loss": 0.0053, + "step": 41700 + }, + { + "epoch": 7.546572617109785, + "grad_norm": 0.020444253459572792, + "learning_rate": 2.501565665868484e-07, + "loss": 0.0011, + "step": 41725 + }, + { + "epoch": 7.551094230421414, + "grad_norm": 1.1539157629013062, + "learning_rate": 2.4969607662552957e-07, + "loss": 0.0011, + "step": 41750 + }, + { + "epoch": 7.555615843733044, + "grad_norm": 0.10138887166976929, + "learning_rate": 2.4923558666421074e-07, + "loss": 0.001, + "step": 41775 + }, + { + "epoch": 7.560137457044673, + "grad_norm": 0.034172624349594116, + "learning_rate": 2.4877509670289185e-07, + "loss": 0.0013, + "step": 41800 + }, + { + "epoch": 7.564659070356303, + "grad_norm": 0.02480352483689785, + "learning_rate": 2.48314606741573e-07, + "loss": 0.01, + "step": 41825 + }, + { + "epoch": 7.569180683667932, + "grad_norm": 0.03536754474043846, + "learning_rate": 2.478541167802542e-07, + "loss": 0.0208, + "step": 41850 + }, + { + "epoch": 7.573702296979563, + "grad_norm": 0.20276236534118652, + "learning_rate": 2.4739362681893535e-07, + "loss": 0.0131, + "step": 41875 + }, + { + "epoch": 7.578223910291192, + "grad_norm": 0.03076460212469101, + "learning_rate": 2.469331368576165e-07, + "loss": 
0.0082, + "step": 41900 + }, + { + "epoch": 7.582745523602822, + "grad_norm": 0.057225510478019714, + "learning_rate": 2.464726468962977e-07, + "loss": 0.0082, + "step": 41925 + }, + { + "epoch": 7.587267136914451, + "grad_norm": 0.07822202891111374, + "learning_rate": 2.460121569349788e-07, + "loss": 0.0038, + "step": 41950 + }, + { + "epoch": 7.591788750226081, + "grad_norm": 0.01712547056376934, + "learning_rate": 2.4555166697365996e-07, + "loss": 0.0063, + "step": 41975 + }, + { + "epoch": 7.59631036353771, + "grad_norm": 0.06029786914587021, + "learning_rate": 2.450911770123411e-07, + "loss": 0.0069, + "step": 42000 + }, + { + "epoch": 7.6008319768493395, + "grad_norm": 0.025808099657297134, + "learning_rate": 2.446306870510223e-07, + "loss": 0.0042, + "step": 42025 + }, + { + "epoch": 7.60535359016097, + "grad_norm": 0.01600913517177105, + "learning_rate": 2.4417019708970346e-07, + "loss": 0.0007, + "step": 42050 + }, + { + "epoch": 7.609875203472599, + "grad_norm": 7.5506815910339355, + "learning_rate": 2.4370970712838457e-07, + "loss": 0.0054, + "step": 42075 + }, + { + "epoch": 7.614396816784229, + "grad_norm": 0.04834285005927086, + "learning_rate": 2.4324921716706574e-07, + "loss": 0.0015, + "step": 42100 + }, + { + "epoch": 7.618918430095858, + "grad_norm": 0.04366715997457504, + "learning_rate": 2.427887272057469e-07, + "loss": 0.0037, + "step": 42125 + }, + { + "epoch": 7.623440043407488, + "grad_norm": 0.01586577482521534, + "learning_rate": 2.4232823724442807e-07, + "loss": 0.017, + "step": 42150 + }, + { + "epoch": 7.627961656719117, + "grad_norm": 21.55689239501953, + "learning_rate": 2.4186774728310924e-07, + "loss": 0.0098, + "step": 42175 + }, + { + "epoch": 7.632483270030747, + "grad_norm": 0.019474711269140244, + "learning_rate": 2.4140725732179035e-07, + "loss": 0.008, + "step": 42200 + }, + { + "epoch": 7.637004883342376, + "grad_norm": 0.20361949503421783, + "learning_rate": 2.409467673604715e-07, + "loss": 0.0021, + "step": 42225 + }, + { 
+ "epoch": 7.641526496654006, + "grad_norm": 0.061369773000478745, + "learning_rate": 2.404862773991527e-07, + "loss": 0.0167, + "step": 42250 + }, + { + "epoch": 7.646048109965636, + "grad_norm": 0.03836612030863762, + "learning_rate": 2.400442070362866e-07, + "loss": 0.0152, + "step": 42275 + }, + { + "epoch": 7.650569723277266, + "grad_norm": 0.6904728412628174, + "learning_rate": 2.395837170749677e-07, + "loss": 0.0254, + "step": 42300 + }, + { + "epoch": 7.655091336588895, + "grad_norm": 9.530180931091309, + "learning_rate": 2.391232271136489e-07, + "loss": 0.0247, + "step": 42325 + }, + { + "epoch": 7.6596129499005245, + "grad_norm": 0.010562293231487274, + "learning_rate": 2.3866273715233006e-07, + "loss": 0.0021, + "step": 42350 + }, + { + "epoch": 7.664134563212154, + "grad_norm": 0.032406773418188095, + "learning_rate": 2.3820224719101122e-07, + "loss": 0.0026, + "step": 42375 + }, + { + "epoch": 7.6686561765237835, + "grad_norm": 7.6475958824157715, + "learning_rate": 2.377417572296924e-07, + "loss": 0.0095, + "step": 42400 + }, + { + "epoch": 7.673177789835413, + "grad_norm": 0.03494368493556976, + "learning_rate": 2.3728126726837353e-07, + "loss": 0.0029, + "step": 42425 + }, + { + "epoch": 7.677699403147043, + "grad_norm": 14.583782196044922, + "learning_rate": 2.368207773070547e-07, + "loss": 0.0128, + "step": 42450 + }, + { + "epoch": 7.682221016458673, + "grad_norm": 0.12016825377941132, + "learning_rate": 2.3636028734573584e-07, + "loss": 0.0124, + "step": 42475 + }, + { + "epoch": 7.686742629770302, + "grad_norm": 0.11401164531707764, + "learning_rate": 2.35899797384417e-07, + "loss": 0.0006, + "step": 42500 + }, + { + "epoch": 7.691264243081932, + "grad_norm": 0.030007168650627136, + "learning_rate": 2.3543930742309817e-07, + "loss": 0.0013, + "step": 42525 + }, + { + "epoch": 7.695785856393561, + "grad_norm": 0.01560743898153305, + "learning_rate": 2.3497881746177933e-07, + "loss": 0.0132, + "step": 42550 + }, + { + "epoch": 7.700307469705191, 
+ "grad_norm": 0.03253242000937462, + "learning_rate": 2.3451832750046047e-07, + "loss": 0.0104, + "step": 42575 + }, + { + "epoch": 7.70482908301682, + "grad_norm": 0.004338828381150961, + "learning_rate": 2.3405783753914164e-07, + "loss": 0.0108, + "step": 42600 + }, + { + "epoch": 7.70935069632845, + "grad_norm": 0.06748280674219131, + "learning_rate": 2.3359734757782278e-07, + "loss": 0.0169, + "step": 42625 + }, + { + "epoch": 7.713872309640079, + "grad_norm": 0.0437793992459774, + "learning_rate": 2.3313685761650395e-07, + "loss": 0.0142, + "step": 42650 + }, + { + "epoch": 7.718393922951709, + "grad_norm": 0.06706016510725021, + "learning_rate": 2.326763676551851e-07, + "loss": 0.0117, + "step": 42675 + }, + { + "epoch": 7.722915536263339, + "grad_norm": 25.65509796142578, + "learning_rate": 2.3221587769386628e-07, + "loss": 0.0218, + "step": 42700 + }, + { + "epoch": 7.7274371495749685, + "grad_norm": 0.0024367687292397022, + "learning_rate": 2.3175538773254742e-07, + "loss": 0.0028, + "step": 42725 + }, + { + "epoch": 7.731958762886598, + "grad_norm": 0.3435537815093994, + "learning_rate": 2.3129489777122856e-07, + "loss": 0.0041, + "step": 42750 + }, + { + "epoch": 7.7364803761982275, + "grad_norm": 0.23171645402908325, + "learning_rate": 2.3083440780990972e-07, + "loss": 0.0054, + "step": 42775 + }, + { + "epoch": 7.741001989509857, + "grad_norm": 0.06181083992123604, + "learning_rate": 2.303739178485909e-07, + "loss": 0.0006, + "step": 42800 + }, + { + "epoch": 7.745523602821486, + "grad_norm": 0.10873863101005554, + "learning_rate": 2.2991342788727206e-07, + "loss": 0.0035, + "step": 42825 + }, + { + "epoch": 7.750045216133116, + "grad_norm": 0.0161910280585289, + "learning_rate": 2.2945293792595322e-07, + "loss": 0.0048, + "step": 42850 + }, + { + "epoch": 7.754566829444746, + "grad_norm": 0.07214026153087616, + "learning_rate": 2.2899244796463434e-07, + "loss": 0.002, + "step": 42875 + }, + { + "epoch": 7.759088442756376, + "grad_norm": 
0.01364390179514885, + "learning_rate": 2.285319580033155e-07, + "loss": 0.0007, + "step": 42900 + }, + { + "epoch": 7.763610056068005, + "grad_norm": 0.023643679916858673, + "learning_rate": 2.2807146804199667e-07, + "loss": 0.0022, + "step": 42925 + }, + { + "epoch": 7.768131669379635, + "grad_norm": 0.021116966381669044, + "learning_rate": 2.2761097808067784e-07, + "loss": 0.0101, + "step": 42950 + }, + { + "epoch": 7.772653282691264, + "grad_norm": 0.10896778851747513, + "learning_rate": 2.27150488119359e-07, + "loss": 0.0031, + "step": 42975 + }, + { + "epoch": 7.777174896002894, + "grad_norm": 0.010054018348455429, + "learning_rate": 2.2668999815804017e-07, + "loss": 0.0057, + "step": 43000 + }, + { + "epoch": 7.781696509314523, + "grad_norm": 0.06566222012042999, + "learning_rate": 2.2622950819672128e-07, + "loss": 0.007, + "step": 43025 + }, + { + "epoch": 7.786218122626153, + "grad_norm": 0.05465374514460564, + "learning_rate": 2.2576901823540245e-07, + "loss": 0.0005, + "step": 43050 + }, + { + "epoch": 7.790739735937782, + "grad_norm": 0.6215785145759583, + "learning_rate": 2.2530852827408361e-07, + "loss": 0.0265, + "step": 43075 + }, + { + "epoch": 7.7952613492494125, + "grad_norm": 3.957580089569092, + "learning_rate": 2.2484803831276478e-07, + "loss": 0.0708, + "step": 43100 + }, + { + "epoch": 7.799782962561042, + "grad_norm": 0.034929584711790085, + "learning_rate": 2.2438754835144595e-07, + "loss": 0.0043, + "step": 43125 + }, + { + "epoch": 7.8043045758726715, + "grad_norm": 0.06470832228660583, + "learning_rate": 2.2392705839012706e-07, + "loss": 0.0059, + "step": 43150 + }, + { + "epoch": 7.808826189184301, + "grad_norm": 3.028578758239746, + "learning_rate": 2.2346656842880823e-07, + "loss": 0.0054, + "step": 43175 + }, + { + "epoch": 7.81334780249593, + "grad_norm": 0.060492563992738724, + "learning_rate": 2.230060784674894e-07, + "loss": 0.0099, + "step": 43200 + }, + { + "epoch": 7.81786941580756, + "grad_norm": 1.7288694381713867, + 
"learning_rate": 2.2254558850617056e-07, + "loss": 0.0016, + "step": 43225 + }, + { + "epoch": 7.822391029119189, + "grad_norm": 0.06679031252861023, + "learning_rate": 2.2208509854485172e-07, + "loss": 0.0042, + "step": 43250 + }, + { + "epoch": 7.82691264243082, + "grad_norm": 17.381139755249023, + "learning_rate": 2.216246085835329e-07, + "loss": 0.0047, + "step": 43275 + }, + { + "epoch": 7.831434255742449, + "grad_norm": 0.03778848424553871, + "learning_rate": 2.21164118622214e-07, + "loss": 0.0077, + "step": 43300 + }, + { + "epoch": 7.835955869054079, + "grad_norm": 0.009451249614357948, + "learning_rate": 2.2070362866089517e-07, + "loss": 0.0021, + "step": 43325 + }, + { + "epoch": 7.840477482365708, + "grad_norm": 25.109268188476562, + "learning_rate": 2.2024313869957634e-07, + "loss": 0.0042, + "step": 43350 + }, + { + "epoch": 7.844999095677338, + "grad_norm": 0.015822693705558777, + "learning_rate": 2.197826487382575e-07, + "loss": 0.0063, + "step": 43375 + }, + { + "epoch": 7.849520708988967, + "grad_norm": 0.016655854880809784, + "learning_rate": 2.1932215877693867e-07, + "loss": 0.0081, + "step": 43400 + }, + { + "epoch": 7.854042322300597, + "grad_norm": 0.25390344858169556, + "learning_rate": 2.188616688156198e-07, + "loss": 0.0037, + "step": 43425 + }, + { + "epoch": 7.858563935612226, + "grad_norm": 0.09659765660762787, + "learning_rate": 2.1840117885430095e-07, + "loss": 0.0146, + "step": 43450 + }, + { + "epoch": 7.863085548923856, + "grad_norm": 0.028894655406475067, + "learning_rate": 2.1794068889298211e-07, + "loss": 0.0147, + "step": 43475 + }, + { + "epoch": 7.867607162235486, + "grad_norm": 0.015346791595220566, + "learning_rate": 2.1748019893166328e-07, + "loss": 0.0241, + "step": 43500 + }, + { + "epoch": 7.872128775547115, + "grad_norm": 0.43620771169662476, + "learning_rate": 2.1701970897034445e-07, + "loss": 0.0101, + "step": 43525 + }, + { + "epoch": 7.876650388858745, + "grad_norm": 0.023062733933329582, + "learning_rate": 
2.1655921900902561e-07, + "loss": 0.0048, + "step": 43550 + }, + { + "epoch": 7.881172002170374, + "grad_norm": 0.011817359365522861, + "learning_rate": 2.1609872904770675e-07, + "loss": 0.0033, + "step": 43575 + }, + { + "epoch": 7.885693615482004, + "grad_norm": 0.3834367096424103, + "learning_rate": 2.156382390863879e-07, + "loss": 0.0012, + "step": 43600 + }, + { + "epoch": 7.890215228793633, + "grad_norm": 4.274073600769043, + "learning_rate": 2.1517774912506906e-07, + "loss": 0.0077, + "step": 43625 + }, + { + "epoch": 7.894736842105263, + "grad_norm": 0.024657847359776497, + "learning_rate": 2.1471725916375023e-07, + "loss": 0.0069, + "step": 43650 + }, + { + "epoch": 7.899258455416893, + "grad_norm": 7.262208461761475, + "learning_rate": 2.142567692024314e-07, + "loss": 0.0073, + "step": 43675 + }, + { + "epoch": 7.903780068728523, + "grad_norm": 0.035356614738702774, + "learning_rate": 2.1379627924111253e-07, + "loss": 0.0162, + "step": 43700 + }, + { + "epoch": 7.908301682040152, + "grad_norm": 0.07969994097948074, + "learning_rate": 2.133357892797937e-07, + "loss": 0.0014, + "step": 43725 + }, + { + "epoch": 7.912823295351782, + "grad_norm": 0.013595969416201115, + "learning_rate": 2.1287529931847484e-07, + "loss": 0.0012, + "step": 43750 + }, + { + "epoch": 7.917344908663411, + "grad_norm": 0.02247740514576435, + "learning_rate": 2.12414809357156e-07, + "loss": 0.0012, + "step": 43775 + }, + { + "epoch": 7.921866521975041, + "grad_norm": 0.07739664614200592, + "learning_rate": 2.1195431939583717e-07, + "loss": 0.0047, + "step": 43800 + }, + { + "epoch": 7.92638813528667, + "grad_norm": 0.32251256704330444, + "learning_rate": 2.114938294345183e-07, + "loss": 0.0056, + "step": 43825 + }, + { + "epoch": 7.9309097485983, + "grad_norm": 0.007755752187222242, + "learning_rate": 2.1103333947319948e-07, + "loss": 0.0188, + "step": 43850 + }, + { + "epoch": 7.935431361909929, + "grad_norm": 8.146416664123535, + "learning_rate": 2.1057284951188062e-07, + "loss": 
0.014, + "step": 43875 + }, + { + "epoch": 7.939952975221559, + "grad_norm": 22.875139236450195, + "learning_rate": 2.1011235955056178e-07, + "loss": 0.0218, + "step": 43900 + }, + { + "epoch": 7.944474588533189, + "grad_norm": 0.0895143672823906, + "learning_rate": 2.0965186958924295e-07, + "loss": 0.0331, + "step": 43925 + }, + { + "epoch": 7.948996201844818, + "grad_norm": 0.019100898876786232, + "learning_rate": 2.0919137962792411e-07, + "loss": 0.0013, + "step": 43950 + }, + { + "epoch": 7.953517815156448, + "grad_norm": 0.0297053474932909, + "learning_rate": 2.0873088966660525e-07, + "loss": 0.0007, + "step": 43975 + }, + { + "epoch": 7.958039428468077, + "grad_norm": 0.4429755210876465, + "learning_rate": 2.0827039970528642e-07, + "loss": 0.006, + "step": 44000 + }, + { + "epoch": 7.962561041779707, + "grad_norm": 0.021714074537158012, + "learning_rate": 2.0780990974396756e-07, + "loss": 0.0126, + "step": 44025 + }, + { + "epoch": 7.967082655091336, + "grad_norm": 0.048985399305820465, + "learning_rate": 2.0734941978264873e-07, + "loss": 0.0055, + "step": 44050 + }, + { + "epoch": 7.971604268402967, + "grad_norm": 0.22394807636737823, + "learning_rate": 2.068889298213299e-07, + "loss": 0.0006, + "step": 44075 + }, + { + "epoch": 7.976125881714596, + "grad_norm": 0.030540427193045616, + "learning_rate": 2.0642843986001103e-07, + "loss": 0.0146, + "step": 44100 + }, + { + "epoch": 7.980647495026226, + "grad_norm": 1.285352110862732, + "learning_rate": 2.059679498986922e-07, + "loss": 0.0027, + "step": 44125 + }, + { + "epoch": 7.985169108337855, + "grad_norm": 0.025696493685245514, + "learning_rate": 2.0550745993737336e-07, + "loss": 0.0072, + "step": 44150 + }, + { + "epoch": 7.989690721649485, + "grad_norm": 0.1506178379058838, + "learning_rate": 2.050469699760545e-07, + "loss": 0.0024, + "step": 44175 + }, + { + "epoch": 7.994212334961114, + "grad_norm": 0.03548096492886543, + "learning_rate": 2.0458648001473567e-07, + "loss": 0.0208, + "step": 44200 + }, + 
{ + "epoch": 7.9987339482727435, + "grad_norm": 1.0646295547485352, + "learning_rate": 2.0412599005341684e-07, + "loss": 0.043, + "step": 44225 + }, + { + "epoch": 8.0, + "eval_loss": 0.36690396070480347, + "eval_runtime": 8374.4331, + "eval_samples_per_second": 1.134, + "eval_steps_per_second": 0.142, + "eval_wer": 0.10485265855221013, + "step": 44232 + }, + { + "epoch": 8.003255561584373, + "grad_norm": 0.08020028471946716, + "learning_rate": 2.0366550009209798e-07, + "loss": 0.02, + "step": 44250 + }, + { + "epoch": 8.007777174896002, + "grad_norm": 0.013830600306391716, + "learning_rate": 2.0320501013077914e-07, + "loss": 0.0032, + "step": 44275 + }, + { + "epoch": 8.012298788207632, + "grad_norm": 0.05772462114691734, + "learning_rate": 2.027445201694603e-07, + "loss": 0.0008, + "step": 44300 + }, + { + "epoch": 8.016820401519261, + "grad_norm": 0.021993961185216904, + "learning_rate": 2.0228403020814145e-07, + "loss": 0.0041, + "step": 44325 + }, + { + "epoch": 8.021342014830891, + "grad_norm": 5.4001383781433105, + "learning_rate": 2.0182354024682262e-07, + "loss": 0.0048, + "step": 44350 + }, + { + "epoch": 8.02586362814252, + "grad_norm": 0.08318978548049927, + "learning_rate": 2.0136305028550375e-07, + "loss": 0.0045, + "step": 44375 + }, + { + "epoch": 8.030385241454152, + "grad_norm": 0.09250658750534058, + "learning_rate": 2.0090256032418492e-07, + "loss": 0.0032, + "step": 44400 + }, + { + "epoch": 8.034906854765781, + "grad_norm": 0.23829813301563263, + "learning_rate": 2.004420703628661e-07, + "loss": 0.0019, + "step": 44425 + }, + { + "epoch": 8.03942846807741, + "grad_norm": 0.016779888421297073, + "learning_rate": 1.9998158040154725e-07, + "loss": 0.0033, + "step": 44450 + }, + { + "epoch": 8.04395008138904, + "grad_norm": 0.08098112046718597, + "learning_rate": 1.995210904402284e-07, + "loss": 0.0017, + "step": 44475 + }, + { + "epoch": 8.04847169470067, + "grad_norm": 0.042912207543849945, + "learning_rate": 1.9906060047890953e-07, + "loss": 
0.0038, + "step": 44500 + }, + { + "epoch": 8.052993308012299, + "grad_norm": 0.052388470619916916, + "learning_rate": 1.986001105175907e-07, + "loss": 0.0004, + "step": 44525 + }, + { + "epoch": 8.057514921323929, + "grad_norm": 3.6011130809783936, + "learning_rate": 1.9813962055627187e-07, + "loss": 0.0064, + "step": 44550 + }, + { + "epoch": 8.062036534635558, + "grad_norm": 0.05901797115802765, + "learning_rate": 1.9767913059495303e-07, + "loss": 0.0098, + "step": 44575 + }, + { + "epoch": 8.066558147947188, + "grad_norm": 0.08754336833953857, + "learning_rate": 1.9721864063363417e-07, + "loss": 0.0342, + "step": 44600 + }, + { + "epoch": 8.071079761258817, + "grad_norm": 33.210994720458984, + "learning_rate": 1.9675815067231534e-07, + "loss": 0.0427, + "step": 44625 + }, + { + "epoch": 8.075601374570446, + "grad_norm": 0.3902020752429962, + "learning_rate": 1.9629766071099648e-07, + "loss": 0.0331, + "step": 44650 + }, + { + "epoch": 8.080122987882076, + "grad_norm": 0.5214375853538513, + "learning_rate": 1.9583717074967764e-07, + "loss": 0.0022, + "step": 44675 + }, + { + "epoch": 8.084644601193705, + "grad_norm": 0.03465314209461212, + "learning_rate": 1.953766807883588e-07, + "loss": 0.0085, + "step": 44700 + }, + { + "epoch": 8.089166214505335, + "grad_norm": 5.756930828094482, + "learning_rate": 1.9491619082703998e-07, + "loss": 0.0024, + "step": 44725 + }, + { + "epoch": 8.093687827816964, + "grad_norm": 0.10550093650817871, + "learning_rate": 1.9445570086572112e-07, + "loss": 0.0109, + "step": 44750 + }, + { + "epoch": 8.098209441128594, + "grad_norm": 2.2892873287200928, + "learning_rate": 1.9399521090440226e-07, + "loss": 0.0022, + "step": 44775 + }, + { + "epoch": 8.102731054440225, + "grad_norm": 0.008341608569025993, + "learning_rate": 1.9353472094308342e-07, + "loss": 0.0004, + "step": 44800 + }, + { + "epoch": 8.107252667751855, + "grad_norm": 4.5169782638549805, + "learning_rate": 1.930742309817646e-07, + "loss": 0.0021, + "step": 44825 + }, + { 
+ "epoch": 8.111774281063484, + "grad_norm": 0.0315685048699379, + "learning_rate": 1.9261374102044575e-07, + "loss": 0.007, + "step": 44850 + }, + { + "epoch": 8.116295894375114, + "grad_norm": 0.12863166630268097, + "learning_rate": 1.9215325105912692e-07, + "loss": 0.0056, + "step": 44875 + }, + { + "epoch": 8.120817507686743, + "grad_norm": 0.6251371502876282, + "learning_rate": 1.9169276109780806e-07, + "loss": 0.0006, + "step": 44900 + }, + { + "epoch": 8.125339120998373, + "grad_norm": 0.8310458064079285, + "learning_rate": 1.912322711364892e-07, + "loss": 0.0084, + "step": 44925 + }, + { + "epoch": 8.129860734310002, + "grad_norm": 0.02388242445886135, + "learning_rate": 1.9077178117517037e-07, + "loss": 0.0113, + "step": 44950 + }, + { + "epoch": 8.134382347621631, + "grad_norm": 0.00754801370203495, + "learning_rate": 1.9031129121385153e-07, + "loss": 0.0031, + "step": 44975 + }, + { + "epoch": 8.138903960933261, + "grad_norm": 0.41686248779296875, + "learning_rate": 1.898508012525327e-07, + "loss": 0.0042, + "step": 45000 + }, + { + "epoch": 8.14342557424489, + "grad_norm": 3.3315136432647705, + "learning_rate": 1.8939031129121387e-07, + "loss": 0.0198, + "step": 45025 + }, + { + "epoch": 8.14794718755652, + "grad_norm": 0.44836312532424927, + "learning_rate": 1.8892982132989498e-07, + "loss": 0.0191, + "step": 45050 + }, + { + "epoch": 8.15246880086815, + "grad_norm": 0.06881590187549591, + "learning_rate": 1.8846933136857614e-07, + "loss": 0.0016, + "step": 45075 + }, + { + "epoch": 8.156990414179779, + "grad_norm": 0.020737141370773315, + "learning_rate": 1.880088414072573e-07, + "loss": 0.0054, + "step": 45100 + }, + { + "epoch": 8.161512027491408, + "grad_norm": 0.033259179443120956, + "learning_rate": 1.8754835144593848e-07, + "loss": 0.0007, + "step": 45125 + }, + { + "epoch": 8.166033640803038, + "grad_norm": 14.501016616821289, + "learning_rate": 1.8708786148461964e-07, + "loss": 0.0066, + "step": 45150 + }, + { + "epoch": 8.170555254114667, + 
"grad_norm": 0.03625442460179329, + "learning_rate": 1.866273715233008e-07, + "loss": 0.0036, + "step": 45175 + }, + { + "epoch": 8.175076867426299, + "grad_norm": 0.027331039309501648, + "learning_rate": 1.8616688156198192e-07, + "loss": 0.0058, + "step": 45200 + }, + { + "epoch": 8.179598480737928, + "grad_norm": 1.0191278457641602, + "learning_rate": 1.857063916006631e-07, + "loss": 0.0185, + "step": 45225 + }, + { + "epoch": 8.184120094049558, + "grad_norm": 0.65291428565979, + "learning_rate": 1.8524590163934426e-07, + "loss": 0.0031, + "step": 45250 + }, + { + "epoch": 8.188641707361187, + "grad_norm": 0.14517787098884583, + "learning_rate": 1.8478541167802542e-07, + "loss": 0.0004, + "step": 45275 + }, + { + "epoch": 8.193163320672816, + "grad_norm": 0.016481177881360054, + "learning_rate": 1.8434334131515933e-07, + "loss": 0.0271, + "step": 45300 + }, + { + "epoch": 8.197684933984446, + "grad_norm": 2.0038397312164307, + "learning_rate": 1.838828513538405e-07, + "loss": 0.0157, + "step": 45325 + }, + { + "epoch": 8.202206547296075, + "grad_norm": 0.07896800339221954, + "learning_rate": 1.8342236139252163e-07, + "loss": 0.0092, + "step": 45350 + }, + { + "epoch": 8.206728160607705, + "grad_norm": 0.04571648687124252, + "learning_rate": 1.829618714312028e-07, + "loss": 0.0112, + "step": 45375 + }, + { + "epoch": 8.211249773919334, + "grad_norm": 0.03423347696661949, + "learning_rate": 1.8250138146988394e-07, + "loss": 0.013, + "step": 45400 + }, + { + "epoch": 8.215771387230964, + "grad_norm": 34.946712493896484, + "learning_rate": 1.820408915085651e-07, + "loss": 0.0322, + "step": 45425 + }, + { + "epoch": 8.220293000542593, + "grad_norm": 0.0242567490786314, + "learning_rate": 1.8158040154724627e-07, + "loss": 0.0108, + "step": 45450 + }, + { + "epoch": 8.224814613854223, + "grad_norm": 0.04625415802001953, + "learning_rate": 1.8111991158592744e-07, + "loss": 0.0005, + "step": 45475 + }, + { + "epoch": 8.229336227165852, + "grad_norm": 0.16521072387695312, 
+ "learning_rate": 1.8065942162460858e-07, + "loss": 0.0038, + "step": 45500 + }, + { + "epoch": 8.233857840477482, + "grad_norm": 0.01613481342792511, + "learning_rate": 1.8019893166328972e-07, + "loss": 0.0036, + "step": 45525 + }, + { + "epoch": 8.238379453789111, + "grad_norm": 8.090846061706543, + "learning_rate": 1.7973844170197088e-07, + "loss": 0.011, + "step": 45550 + }, + { + "epoch": 8.24290106710074, + "grad_norm": 0.062387678772211075, + "learning_rate": 1.7927795174065205e-07, + "loss": 0.0022, + "step": 45575 + }, + { + "epoch": 8.24742268041237, + "grad_norm": 0.11970767378807068, + "learning_rate": 1.7881746177933321e-07, + "loss": 0.0078, + "step": 45600 + }, + { + "epoch": 8.251944293724002, + "grad_norm": 0.30487823486328125, + "learning_rate": 1.7835697181801435e-07, + "loss": 0.0005, + "step": 45625 + }, + { + "epoch": 8.256465907035631, + "grad_norm": 0.12851175665855408, + "learning_rate": 1.7789648185669552e-07, + "loss": 0.0041, + "step": 45650 + }, + { + "epoch": 8.26098752034726, + "grad_norm": 0.7285523414611816, + "learning_rate": 1.7743599189537666e-07, + "loss": 0.0012, + "step": 45675 + }, + { + "epoch": 8.26550913365889, + "grad_norm": 0.006563634146004915, + "learning_rate": 1.7697550193405783e-07, + "loss": 0.0006, + "step": 45700 + }, + { + "epoch": 8.27003074697052, + "grad_norm": 0.08779245615005493, + "learning_rate": 1.76515011972739e-07, + "loss": 0.0176, + "step": 45725 + }, + { + "epoch": 8.274552360282149, + "grad_norm": 1.7844347953796387, + "learning_rate": 1.7605452201142016e-07, + "loss": 0.0035, + "step": 45750 + }, + { + "epoch": 8.279073973593778, + "grad_norm": 0.026561090722680092, + "learning_rate": 1.755940320501013e-07, + "loss": 0.0154, + "step": 45775 + }, + { + "epoch": 8.283595586905408, + "grad_norm": 0.08869388699531555, + "learning_rate": 1.7513354208878244e-07, + "loss": 0.0189, + "step": 45800 + }, + { + "epoch": 8.288117200217037, + "grad_norm": 0.09968849271535873, + "learning_rate": 
1.746730521274636e-07, + "loss": 0.0057, + "step": 45825 + }, + { + "epoch": 8.292638813528667, + "grad_norm": 0.7243178486824036, + "learning_rate": 1.7421256216614477e-07, + "loss": 0.0214, + "step": 45850 + }, + { + "epoch": 8.297160426840296, + "grad_norm": 0.019970480352640152, + "learning_rate": 1.7375207220482594e-07, + "loss": 0.0058, + "step": 45875 + }, + { + "epoch": 8.301682040151926, + "grad_norm": 0.08685352653265, + "learning_rate": 1.732915822435071e-07, + "loss": 0.0053, + "step": 45900 + }, + { + "epoch": 8.306203653463555, + "grad_norm": 0.02851727232336998, + "learning_rate": 1.7283109228218822e-07, + "loss": 0.0085, + "step": 45925 + }, + { + "epoch": 8.310725266775185, + "grad_norm": 0.10517676919698715, + "learning_rate": 1.7237060232086938e-07, + "loss": 0.0043, + "step": 45950 + }, + { + "epoch": 8.315246880086814, + "grad_norm": 0.03421909734606743, + "learning_rate": 1.7191011235955055e-07, + "loss": 0.0009, + "step": 45975 + }, + { + "epoch": 8.319768493398444, + "grad_norm": 8.051795959472656, + "learning_rate": 1.7144962239823171e-07, + "loss": 0.0012, + "step": 46000 + }, + { + "epoch": 8.324290106710075, + "grad_norm": 0.04528782516717911, + "learning_rate": 1.7098913243691288e-07, + "loss": 0.0049, + "step": 46025 + }, + { + "epoch": 8.328811720021704, + "grad_norm": 0.024573039263486862, + "learning_rate": 1.7052864247559405e-07, + "loss": 0.0055, + "step": 46050 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 13.220183372497559, + "learning_rate": 1.7006815251427516e-07, + "loss": 0.0102, + "step": 46075 + }, + { + "epoch": 8.337854946644963, + "grad_norm": 3.289721727371216, + "learning_rate": 1.6960766255295633e-07, + "loss": 0.0292, + "step": 46100 + }, + { + "epoch": 8.342376559956593, + "grad_norm": 10.872947692871094, + "learning_rate": 1.691471725916375e-07, + "loss": 0.0028, + "step": 46125 + }, + { + "epoch": 8.346898173268222, + "grad_norm": 0.3125823438167572, + "learning_rate": 1.6868668263031866e-07, + "loss": 
0.0007, + "step": 46150 + }, + { + "epoch": 8.351419786579852, + "grad_norm": 42.284271240234375, + "learning_rate": 1.6822619266899983e-07, + "loss": 0.0214, + "step": 46175 + }, + { + "epoch": 8.355941399891481, + "grad_norm": 40.723052978515625, + "learning_rate": 1.6776570270768097e-07, + "loss": 0.0119, + "step": 46200 + }, + { + "epoch": 8.36046301320311, + "grad_norm": 15.188941955566406, + "learning_rate": 1.673052127463621e-07, + "loss": 0.0174, + "step": 46225 + }, + { + "epoch": 8.36498462651474, + "grad_norm": 0.19278444349765778, + "learning_rate": 1.6684472278504327e-07, + "loss": 0.0107, + "step": 46250 + }, + { + "epoch": 8.36950623982637, + "grad_norm": 0.961004912853241, + "learning_rate": 1.6638423282372444e-07, + "loss": 0.0023, + "step": 46275 + }, + { + "epoch": 8.374027853138, + "grad_norm": 0.09470321238040924, + "learning_rate": 1.659237428624056e-07, + "loss": 0.0023, + "step": 46300 + }, + { + "epoch": 8.378549466449629, + "grad_norm": 0.10626068711280823, + "learning_rate": 1.6546325290108677e-07, + "loss": 0.0022, + "step": 46325 + }, + { + "epoch": 8.383071079761258, + "grad_norm": 0.07527956366539001, + "learning_rate": 1.6500276293976788e-07, + "loss": 0.0021, + "step": 46350 + }, + { + "epoch": 8.387592693072888, + "grad_norm": 0.07098814100027084, + "learning_rate": 1.6454227297844905e-07, + "loss": 0.0032, + "step": 46375 + }, + { + "epoch": 8.392114306384517, + "grad_norm": 0.06297345459461212, + "learning_rate": 1.6408178301713022e-07, + "loss": 0.0009, + "step": 46400 + }, + { + "epoch": 8.396635919696148, + "grad_norm": 0.6129382252693176, + "learning_rate": 1.6362129305581138e-07, + "loss": 0.0023, + "step": 46425 + }, + { + "epoch": 8.401157533007778, + "grad_norm": 0.07391338050365448, + "learning_rate": 1.6316080309449255e-07, + "loss": 0.0192, + "step": 46450 + }, + { + "epoch": 8.405679146319407, + "grad_norm": 0.009829241782426834, + "learning_rate": 1.627003131331737e-07, + "loss": 0.0009, + "step": 46475 + }, + { + 
"epoch": 8.410200759631037, + "grad_norm": 0.02008306048810482, + "learning_rate": 1.6223982317185483e-07, + "loss": 0.0012, + "step": 46500 + }, + { + "epoch": 8.414722372942666, + "grad_norm": 0.032892853021621704, + "learning_rate": 1.61779333210536e-07, + "loss": 0.0117, + "step": 46525 + }, + { + "epoch": 8.419243986254296, + "grad_norm": 0.3454951047897339, + "learning_rate": 1.6131884324921716e-07, + "loss": 0.0041, + "step": 46550 + }, + { + "epoch": 8.423765599565925, + "grad_norm": 41.06159973144531, + "learning_rate": 1.6085835328789833e-07, + "loss": 0.0189, + "step": 46575 + }, + { + "epoch": 8.428287212877555, + "grad_norm": 11.987919807434082, + "learning_rate": 1.603978633265795e-07, + "loss": 0.01, + "step": 46600 + }, + { + "epoch": 8.432808826189184, + "grad_norm": 0.24590350687503815, + "learning_rate": 1.5993737336526063e-07, + "loss": 0.0019, + "step": 46625 + }, + { + "epoch": 8.437330439500814, + "grad_norm": 0.11725660413503647, + "learning_rate": 1.5947688340394177e-07, + "loss": 0.0196, + "step": 46650 + }, + { + "epoch": 8.441852052812443, + "grad_norm": 0.04116886481642723, + "learning_rate": 1.5901639344262294e-07, + "loss": 0.0011, + "step": 46675 + }, + { + "epoch": 8.446373666124073, + "grad_norm": 0.03313690423965454, + "learning_rate": 1.585559034813041e-07, + "loss": 0.0051, + "step": 46700 + }, + { + "epoch": 8.450895279435702, + "grad_norm": 0.1109694391489029, + "learning_rate": 1.5809541351998527e-07, + "loss": 0.0042, + "step": 46725 + }, + { + "epoch": 8.455416892747332, + "grad_norm": 0.2245902568101883, + "learning_rate": 1.576349235586664e-07, + "loss": 0.0059, + "step": 46750 + }, + { + "epoch": 8.459938506058961, + "grad_norm": 0.014154641889035702, + "learning_rate": 1.5717443359734758e-07, + "loss": 0.0023, + "step": 46775 + }, + { + "epoch": 8.46446011937059, + "grad_norm": 0.16116198897361755, + "learning_rate": 1.5671394363602872e-07, + "loss": 0.0158, + "step": 46800 + }, + { + "epoch": 8.468981732682222, + 
"grad_norm": 0.00956847332417965, + "learning_rate": 1.5625345367470988e-07, + "loss": 0.0024, + "step": 46825 + }, + { + "epoch": 8.473503345993851, + "grad_norm": 0.007467388175427914, + "learning_rate": 1.5579296371339105e-07, + "loss": 0.0074, + "step": 46850 + }, + { + "epoch": 8.47802495930548, + "grad_norm": 0.028611036017537117, + "learning_rate": 1.553324737520722e-07, + "loss": 0.0052, + "step": 46875 + }, + { + "epoch": 8.48254657261711, + "grad_norm": 0.23077279329299927, + "learning_rate": 1.5487198379075336e-07, + "loss": 0.0137, + "step": 46900 + }, + { + "epoch": 8.48706818592874, + "grad_norm": 0.1357181966304779, + "learning_rate": 1.5441149382943452e-07, + "loss": 0.0082, + "step": 46925 + }, + { + "epoch": 8.49158979924037, + "grad_norm": 12.11319351196289, + "learning_rate": 1.5395100386811566e-07, + "loss": 0.0021, + "step": 46950 + }, + { + "epoch": 8.496111412551999, + "grad_norm": 1.0207489728927612, + "learning_rate": 1.5349051390679683e-07, + "loss": 0.0012, + "step": 46975 + }, + { + "epoch": 8.500633025863628, + "grad_norm": 0.05066627636551857, + "learning_rate": 1.53030023945478e-07, + "loss": 0.0033, + "step": 47000 + }, + { + "epoch": 8.505154639175258, + "grad_norm": 0.18461652100086212, + "learning_rate": 1.5256953398415913e-07, + "loss": 0.0051, + "step": 47025 + }, + { + "epoch": 8.509676252486887, + "grad_norm": 0.058383241295814514, + "learning_rate": 1.521090440228403e-07, + "loss": 0.0079, + "step": 47050 + }, + { + "epoch": 8.514197865798517, + "grad_norm": 0.08401685953140259, + "learning_rate": 1.5164855406152144e-07, + "loss": 0.005, + "step": 47075 + }, + { + "epoch": 8.518719479110146, + "grad_norm": 0.12280333787202835, + "learning_rate": 1.511880641002026e-07, + "loss": 0.0027, + "step": 47100 + }, + { + "epoch": 8.523241092421776, + "grad_norm": 0.0439714677631855, + "learning_rate": 1.5072757413888377e-07, + "loss": 0.002, + "step": 47125 + }, + { + "epoch": 8.527762705733405, + "grad_norm": 0.02209680713713169, + 
"learning_rate": 1.502670841775649e-07, + "loss": 0.0009, + "step": 47150 + }, + { + "epoch": 8.532284319045035, + "grad_norm": 6.216536521911621, + "learning_rate": 1.4980659421624608e-07, + "loss": 0.0197, + "step": 47175 + }, + { + "epoch": 8.536805932356664, + "grad_norm": 0.030912378802895546, + "learning_rate": 1.4934610425492724e-07, + "loss": 0.0003, + "step": 47200 + }, + { + "epoch": 8.541327545668295, + "grad_norm": 0.02260858565568924, + "learning_rate": 1.4888561429360838e-07, + "loss": 0.0013, + "step": 47225 + }, + { + "epoch": 8.545849158979925, + "grad_norm": 0.04053138568997383, + "learning_rate": 1.4842512433228955e-07, + "loss": 0.0036, + "step": 47250 + }, + { + "epoch": 8.550370772291554, + "grad_norm": 0.10423174500465393, + "learning_rate": 1.4796463437097072e-07, + "loss": 0.0005, + "step": 47275 + }, + { + "epoch": 8.554892385603184, + "grad_norm": 0.060733165591955185, + "learning_rate": 1.4750414440965186e-07, + "loss": 0.0017, + "step": 47300 + }, + { + "epoch": 8.559413998914813, + "grad_norm": 0.07485374808311462, + "learning_rate": 1.4704365444833302e-07, + "loss": 0.0007, + "step": 47325 + }, + { + "epoch": 8.563935612226443, + "grad_norm": 0.03115830197930336, + "learning_rate": 1.465831644870142e-07, + "loss": 0.0112, + "step": 47350 + }, + { + "epoch": 8.568457225538072, + "grad_norm": 0.02164495922625065, + "learning_rate": 1.4612267452569533e-07, + "loss": 0.0006, + "step": 47375 + }, + { + "epoch": 8.572978838849702, + "grad_norm": 0.024691320955753326, + "learning_rate": 1.456621845643765e-07, + "loss": 0.0259, + "step": 47400 + }, + { + "epoch": 8.577500452161331, + "grad_norm": 17.621110916137695, + "learning_rate": 1.4520169460305763e-07, + "loss": 0.0218, + "step": 47425 + }, + { + "epoch": 8.58202206547296, + "grad_norm": 4.349635601043701, + "learning_rate": 1.447412046417388e-07, + "loss": 0.0055, + "step": 47450 + }, + { + "epoch": 8.58654367878459, + "grad_norm": 1.4642325639724731, + "learning_rate": 
1.4428071468041997e-07, + "loss": 0.0065, + "step": 47475 + }, + { + "epoch": 8.59106529209622, + "grad_norm": 0.0913548618555069, + "learning_rate": 1.4382022471910113e-07, + "loss": 0.0034, + "step": 47500 + }, + { + "epoch": 8.595586905407849, + "grad_norm": 0.025698378682136536, + "learning_rate": 1.4335973475778227e-07, + "loss": 0.0023, + "step": 47525 + }, + { + "epoch": 8.600108518719479, + "grad_norm": 0.07771757990121841, + "learning_rate": 1.428992447964634e-07, + "loss": 0.0022, + "step": 47550 + }, + { + "epoch": 8.604630132031108, + "grad_norm": 5.4854254722595215, + "learning_rate": 1.4243875483514458e-07, + "loss": 0.005, + "step": 47575 + }, + { + "epoch": 8.609151745342738, + "grad_norm": 0.031100405380129814, + "learning_rate": 1.4197826487382574e-07, + "loss": 0.0067, + "step": 47600 + }, + { + "epoch": 8.613673358654367, + "grad_norm": 0.015138168819248676, + "learning_rate": 1.415177749125069e-07, + "loss": 0.0094, + "step": 47625 + }, + { + "epoch": 8.618194971965998, + "grad_norm": 1.461853265762329, + "learning_rate": 1.4105728495118808e-07, + "loss": 0.002, + "step": 47650 + }, + { + "epoch": 8.622716585277628, + "grad_norm": 0.05030835047364235, + "learning_rate": 1.4059679498986922e-07, + "loss": 0.0014, + "step": 47675 + }, + { + "epoch": 8.627238198589257, + "grad_norm": 0.15744911134243011, + "learning_rate": 1.4013630502855036e-07, + "loss": 0.0006, + "step": 47700 + }, + { + "epoch": 8.631759811900887, + "grad_norm": 0.6732610464096069, + "learning_rate": 1.3967581506723152e-07, + "loss": 0.0024, + "step": 47725 + }, + { + "epoch": 8.636281425212516, + "grad_norm": 3.470303773880005, + "learning_rate": 1.392153251059127e-07, + "loss": 0.0079, + "step": 47750 + }, + { + "epoch": 8.640803038524146, + "grad_norm": 27.621957778930664, + "learning_rate": 1.3875483514459386e-07, + "loss": 0.0164, + "step": 47775 + }, + { + "epoch": 8.645324651835775, + "grad_norm": 0.01883932389318943, + "learning_rate": 1.38294345183275e-07, + "loss": 
0.0145, + "step": 47800 + }, + { + "epoch": 8.649846265147405, + "grad_norm": 0.08440113812685013, + "learning_rate": 1.3783385522195614e-07, + "loss": 0.0188, + "step": 47825 + }, + { + "epoch": 8.654367878459034, + "grad_norm": 0.0456276573240757, + "learning_rate": 1.373733652606373e-07, + "loss": 0.0116, + "step": 47850 + }, + { + "epoch": 8.658889491770664, + "grad_norm": 8.457013130187988, + "learning_rate": 1.3691287529931847e-07, + "loss": 0.005, + "step": 47875 + }, + { + "epoch": 8.663411105082293, + "grad_norm": 2.870605707168579, + "learning_rate": 1.3645238533799963e-07, + "loss": 0.0051, + "step": 47900 + }, + { + "epoch": 8.667932718393923, + "grad_norm": 0.0406540222465992, + "learning_rate": 1.359918953766808e-07, + "loss": 0.0103, + "step": 47925 + }, + { + "epoch": 8.672454331705552, + "grad_norm": 0.12393586337566376, + "learning_rate": 1.3553140541536194e-07, + "loss": 0.0089, + "step": 47950 + }, + { + "epoch": 8.676975945017182, + "grad_norm": 0.04684567451477051, + "learning_rate": 1.3507091545404308e-07, + "loss": 0.005, + "step": 47975 + }, + { + "epoch": 8.681497558328811, + "grad_norm": 0.005455177277326584, + "learning_rate": 1.3461042549272425e-07, + "loss": 0.0011, + "step": 48000 + }, + { + "epoch": 8.686019171640442, + "grad_norm": 0.0036073036026209593, + "learning_rate": 1.341499355314054e-07, + "loss": 0.0109, + "step": 48025 + }, + { + "epoch": 8.690540784952072, + "grad_norm": 0.015157288871705532, + "learning_rate": 1.3368944557008658e-07, + "loss": 0.0028, + "step": 48050 + }, + { + "epoch": 8.695062398263701, + "grad_norm": 0.045515723526477814, + "learning_rate": 1.3322895560876774e-07, + "loss": 0.0061, + "step": 48075 + }, + { + "epoch": 8.69958401157533, + "grad_norm": 0.3657865822315216, + "learning_rate": 1.3276846564744886e-07, + "loss": 0.0024, + "step": 48100 + }, + { + "epoch": 8.70410562488696, + "grad_norm": 0.009580901823937893, + "learning_rate": 1.3230797568613002e-07, + "loss": 0.0132, + "step": 48125 + }, + 
{ + "epoch": 8.70862723819859, + "grad_norm": 0.033869609236717224, + "learning_rate": 1.318474857248112e-07, + "loss": 0.0049, + "step": 48150 + }, + { + "epoch": 8.71314885151022, + "grad_norm": 11.958950996398926, + "learning_rate": 1.3138699576349236e-07, + "loss": 0.0098, + "step": 48175 + }, + { + "epoch": 8.717670464821849, + "grad_norm": 0.02979426644742489, + "learning_rate": 1.3092650580217352e-07, + "loss": 0.0011, + "step": 48200 + }, + { + "epoch": 8.722192078133478, + "grad_norm": 1.0056304931640625, + "learning_rate": 1.304660158408547e-07, + "loss": 0.0352, + "step": 48225 + }, + { + "epoch": 8.726713691445108, + "grad_norm": 1.8282511234283447, + "learning_rate": 1.300055258795358e-07, + "loss": 0.0204, + "step": 48250 + }, + { + "epoch": 8.731235304756737, + "grad_norm": 0.0187783632427454, + "learning_rate": 1.2954503591821697e-07, + "loss": 0.0129, + "step": 48275 + }, + { + "epoch": 8.735756918068367, + "grad_norm": 0.3944437801837921, + "learning_rate": 1.2908454595689813e-07, + "loss": 0.0009, + "step": 48300 + }, + { + "epoch": 8.740278531379996, + "grad_norm": 11.324723243713379, + "learning_rate": 1.286240559955793e-07, + "loss": 0.0015, + "step": 48325 + }, + { + "epoch": 8.744800144691625, + "grad_norm": 0.03956815227866173, + "learning_rate": 1.2816356603426047e-07, + "loss": 0.003, + "step": 48350 + }, + { + "epoch": 8.749321758003255, + "grad_norm": 0.06833972781896591, + "learning_rate": 1.277030760729416e-07, + "loss": 0.0005, + "step": 48375 + }, + { + "epoch": 8.753843371314884, + "grad_norm": 0.029176251962780952, + "learning_rate": 1.2724258611162275e-07, + "loss": 0.001, + "step": 48400 + }, + { + "epoch": 8.758364984626514, + "grad_norm": 0.02110159769654274, + "learning_rate": 1.267820961503039e-07, + "loss": 0.0009, + "step": 48425 + }, + { + "epoch": 8.762886597938145, + "grad_norm": 0.023354971781373024, + "learning_rate": 1.2632160618898508e-07, + "loss": 0.0012, + "step": 48450 + }, + { + "epoch": 8.767408211249775, + 
"grad_norm": 0.0024972474202513695, + "learning_rate": 1.2586111622766625e-07, + "loss": 0.0014, + "step": 48475 + }, + { + "epoch": 8.771929824561404, + "grad_norm": 10.001263618469238, + "learning_rate": 1.2540062626634739e-07, + "loss": 0.0187, + "step": 48500 + }, + { + "epoch": 8.776451437873034, + "grad_norm": 0.003959618508815765, + "learning_rate": 1.2494013630502855e-07, + "loss": 0.0181, + "step": 48525 + }, + { + "epoch": 8.780973051184663, + "grad_norm": 0.05257127806544304, + "learning_rate": 1.2447964634370972e-07, + "loss": 0.0041, + "step": 48550 + }, + { + "epoch": 8.785494664496293, + "grad_norm": 1.0013961791992188, + "learning_rate": 1.2401915638239086e-07, + "loss": 0.0028, + "step": 48575 + }, + { + "epoch": 8.790016277807922, + "grad_norm": 0.04151635989546776, + "learning_rate": 1.2355866642107202e-07, + "loss": 0.0439, + "step": 48600 + }, + { + "epoch": 8.794537891119552, + "grad_norm": 0.07857254147529602, + "learning_rate": 1.230981764597532e-07, + "loss": 0.0176, + "step": 48625 + }, + { + "epoch": 8.799059504431181, + "grad_norm": 0.0398498959839344, + "learning_rate": 1.2263768649843433e-07, + "loss": 0.0035, + "step": 48650 + }, + { + "epoch": 8.80358111774281, + "grad_norm": 0.18911878764629364, + "learning_rate": 1.221771965371155e-07, + "loss": 0.0041, + "step": 48675 + }, + { + "epoch": 8.80810273105444, + "grad_norm": 0.5053550601005554, + "learning_rate": 1.2171670657579666e-07, + "loss": 0.0029, + "step": 48700 + }, + { + "epoch": 8.81262434436607, + "grad_norm": 0.11117665469646454, + "learning_rate": 1.212562166144778e-07, + "loss": 0.0022, + "step": 48725 + }, + { + "epoch": 8.817145957677699, + "grad_norm": 0.03761090338230133, + "learning_rate": 1.2079572665315897e-07, + "loss": 0.0014, + "step": 48750 + }, + { + "epoch": 8.821667570989328, + "grad_norm": 0.0750727653503418, + "learning_rate": 1.203352366918401e-07, + "loss": 0.0051, + "step": 48775 + }, + { + "epoch": 8.826189184300958, + "grad_norm": 0.1517859250307083, 
+ "learning_rate": 1.1987474673052127e-07, + "loss": 0.0072, + "step": 48800 + }, + { + "epoch": 8.830710797612587, + "grad_norm": 0.02724577859044075, + "learning_rate": 1.1941425676920244e-07, + "loss": 0.0025, + "step": 48825 + }, + { + "epoch": 8.835232410924217, + "grad_norm": 0.02841174229979515, + "learning_rate": 1.1895376680788358e-07, + "loss": 0.0017, + "step": 48850 + }, + { + "epoch": 8.839754024235848, + "grad_norm": 0.03608907014131546, + "learning_rate": 1.1849327684656473e-07, + "loss": 0.0041, + "step": 48875 + }, + { + "epoch": 8.844275637547478, + "grad_norm": 0.022049210965633392, + "learning_rate": 1.180327868852459e-07, + "loss": 0.0011, + "step": 48900 + }, + { + "epoch": 8.848797250859107, + "grad_norm": 13.626410484313965, + "learning_rate": 1.1757229692392705e-07, + "loss": 0.0021, + "step": 48925 + }, + { + "epoch": 8.853318864170737, + "grad_norm": 0.021010667085647583, + "learning_rate": 1.171118069626082e-07, + "loss": 0.0022, + "step": 48950 + }, + { + "epoch": 8.857840477482366, + "grad_norm": 0.009972944855690002, + "learning_rate": 1.1665131700128937e-07, + "loss": 0.0047, + "step": 48975 + }, + { + "epoch": 8.862362090793996, + "grad_norm": 0.04962944611907005, + "learning_rate": 1.1619082703997052e-07, + "loss": 0.0036, + "step": 49000 + }, + { + "epoch": 8.866883704105625, + "grad_norm": 1.5837059020996094, + "learning_rate": 1.1573033707865168e-07, + "loss": 0.0017, + "step": 49025 + }, + { + "epoch": 8.871405317417254, + "grad_norm": 4.073685646057129, + "learning_rate": 1.1526984711733284e-07, + "loss": 0.0092, + "step": 49050 + }, + { + "epoch": 8.875926930728884, + "grad_norm": 0.0222158282995224, + "learning_rate": 1.1480935715601398e-07, + "loss": 0.0051, + "step": 49075 + }, + { + "epoch": 8.880448544040513, + "grad_norm": 0.055091459304094315, + "learning_rate": 1.1434886719469515e-07, + "loss": 0.0077, + "step": 49100 + }, + { + "epoch": 8.884970157352143, + "grad_norm": 0.4387997090816498, + "learning_rate": 
1.1388837723337632e-07, + "loss": 0.0013, + "step": 49125 + }, + { + "epoch": 8.889491770663772, + "grad_norm": 0.020865125581622124, + "learning_rate": 1.1342788727205746e-07, + "loss": 0.0005, + "step": 49150 + }, + { + "epoch": 8.894013383975402, + "grad_norm": 0.05737067386507988, + "learning_rate": 1.1296739731073862e-07, + "loss": 0.0016, + "step": 49175 + }, + { + "epoch": 8.898534997287031, + "grad_norm": 0.04351663962006569, + "learning_rate": 1.1250690734941979e-07, + "loss": 0.0049, + "step": 49200 + }, + { + "epoch": 8.90305661059866, + "grad_norm": 0.09150709211826324, + "learning_rate": 1.1204641738810093e-07, + "loss": 0.0047, + "step": 49225 + }, + { + "epoch": 8.907578223910292, + "grad_norm": 0.019495923072099686, + "learning_rate": 1.115859274267821e-07, + "loss": 0.0004, + "step": 49250 + }, + { + "epoch": 8.912099837221922, + "grad_norm": 0.002537541324272752, + "learning_rate": 1.1112543746546326e-07, + "loss": 0.0021, + "step": 49275 + }, + { + "epoch": 8.916621450533551, + "grad_norm": 0.00815527979284525, + "learning_rate": 1.106649475041444e-07, + "loss": 0.0002, + "step": 49300 + }, + { + "epoch": 8.92114306384518, + "grad_norm": 4.410140037536621, + "learning_rate": 1.1020445754282557e-07, + "loss": 0.0086, + "step": 49325 + }, + { + "epoch": 8.92566467715681, + "grad_norm": 0.3327310383319855, + "learning_rate": 1.0974396758150672e-07, + "loss": 0.0179, + "step": 49350 + }, + { + "epoch": 8.93018629046844, + "grad_norm": 0.052819494158029556, + "learning_rate": 1.0930189721864062e-07, + "loss": 0.0199, + "step": 49375 + }, + { + "epoch": 8.934707903780069, + "grad_norm": 29.91364097595215, + "learning_rate": 1.0884140725732179e-07, + "loss": 0.0274, + "step": 49400 + }, + { + "epoch": 8.939229517091698, + "grad_norm": null, + "learning_rate": 1.0839933689445569e-07, + "loss": 0.031, + "step": 49425 + }, + { + "epoch": 8.943751130403328, + "grad_norm": 0.09751866012811661, + "learning_rate": 1.0793884693313686e-07, + "loss": 0.025, + 
"step": 49450 + }, + { + "epoch": 8.948272743714957, + "grad_norm": 0.38296425342559814, + "learning_rate": 1.0747835697181801e-07, + "loss": 0.004, + "step": 49475 + }, + { + "epoch": 8.952794357026587, + "grad_norm": 0.05933375656604767, + "learning_rate": 1.0701786701049917e-07, + "loss": 0.0018, + "step": 49500 + }, + { + "epoch": 8.957315970338216, + "grad_norm": 0.14764128625392914, + "learning_rate": 1.0655737704918032e-07, + "loss": 0.0005, + "step": 49525 + }, + { + "epoch": 8.961837583649846, + "grad_norm": 0.034122079610824585, + "learning_rate": 1.0609688708786148e-07, + "loss": 0.0029, + "step": 49550 + }, + { + "epoch": 8.966359196961475, + "grad_norm": 0.04565449431538582, + "learning_rate": 1.0563639712654264e-07, + "loss": 0.0004, + "step": 49575 + }, + { + "epoch": 8.970880810273105, + "grad_norm": 0.01443118043243885, + "learning_rate": 1.0517590716522379e-07, + "loss": 0.0041, + "step": 49600 + }, + { + "epoch": 8.975402423584734, + "grad_norm": 0.005753234960138798, + "learning_rate": 1.0471541720390494e-07, + "loss": 0.0026, + "step": 49625 + }, + { + "epoch": 8.979924036896364, + "grad_norm": 0.02034948766231537, + "learning_rate": 1.0425492724258611e-07, + "loss": 0.0007, + "step": 49650 + }, + { + "epoch": 8.984445650207995, + "grad_norm": 0.012954095378518105, + "learning_rate": 1.0379443728126726e-07, + "loss": 0.0028, + "step": 49675 + }, + { + "epoch": 8.988967263519624, + "grad_norm": 11.640181541442871, + "learning_rate": 1.0333394731994842e-07, + "loss": 0.0033, + "step": 49700 + }, + { + "epoch": 8.993488876831254, + "grad_norm": 0.07725071907043457, + "learning_rate": 1.0287345735862958e-07, + "loss": 0.0125, + "step": 49725 + }, + { + "epoch": 8.998010490142883, + "grad_norm": 0.12377389520406723, + "learning_rate": 1.0241296739731073e-07, + "loss": 0.0204, + "step": 49750 + }, + { + "epoch": 9.0, + "eval_loss": 0.370661199092865, + "eval_runtime": 8351.69, + "eval_samples_per_second": 1.137, + "eval_steps_per_second": 0.142, + 
"eval_wer": 0.10357142857142858, + "step": 49761 + }, + { + "epoch": 9.002532103454513, + "grad_norm": 0.027261001989245415, + "learning_rate": 1.0195247743599189e-07, + "loss": 0.0049, + "step": 49775 + }, + { + "epoch": 9.007053716766142, + "grad_norm": 0.1393139511346817, + "learning_rate": 1.0149198747467305e-07, + "loss": 0.0031, + "step": 49800 + }, + { + "epoch": 9.011575330077772, + "grad_norm": 0.31711727380752563, + "learning_rate": 1.010314975133542e-07, + "loss": 0.0015, + "step": 49825 + }, + { + "epoch": 9.016096943389401, + "grad_norm": 0.021990245208144188, + "learning_rate": 1.0057100755203536e-07, + "loss": 0.0064, + "step": 49850 + }, + { + "epoch": 9.02061855670103, + "grad_norm": 0.028167344629764557, + "learning_rate": 1.0011051759071653e-07, + "loss": 0.0019, + "step": 49875 + }, + { + "epoch": 9.02514017001266, + "grad_norm": 12.446762084960938, + "learning_rate": 9.965002762939767e-08, + "loss": 0.0111, + "step": 49900 + }, + { + "epoch": 9.02966178332429, + "grad_norm": 0.024779673665761948, + "learning_rate": 9.918953766807883e-08, + "loss": 0.0027, + "step": 49925 + }, + { + "epoch": 9.03418339663592, + "grad_norm": 2.8062970638275146, + "learning_rate": 9.872904770676e-08, + "loss": 0.0046, + "step": 49950 + }, + { + "epoch": 9.038705009947549, + "grad_norm": 0.011668199673295021, + "learning_rate": 9.826855774544114e-08, + "loss": 0.0315, + "step": 49975 + }, + { + "epoch": 9.043226623259178, + "grad_norm": 0.005240909289568663, + "learning_rate": 9.78080677841223e-08, + "loss": 0.0067, + "step": 50000 + }, + { + "epoch": 9.047748236570808, + "grad_norm": 0.015816286206245422, + "learning_rate": 9.734757782280347e-08, + "loss": 0.0018, + "step": 50025 + }, + { + "epoch": 9.052269849882437, + "grad_norm": 0.0809655636548996, + "learning_rate": 9.688708786148461e-08, + "loss": 0.0169, + "step": 50050 + }, + { + "epoch": 9.056791463194068, + "grad_norm": 0.1855606585741043, + "learning_rate": 9.642659790016578e-08, + "loss": 0.0003, + 
"step": 50075 + }, + { + "epoch": 9.061313076505698, + "grad_norm": 1.0818848609924316, + "learning_rate": 9.596610793884693e-08, + "loss": 0.0114, + "step": 50100 + }, + { + "epoch": 9.065834689817327, + "grad_norm": 1.6838890314102173, + "learning_rate": 9.550561797752808e-08, + "loss": 0.0142, + "step": 50125 + }, + { + "epoch": 9.070356303128957, + "grad_norm": 0.051931653171777725, + "learning_rate": 9.504512801620925e-08, + "loss": 0.0267, + "step": 50150 + }, + { + "epoch": 9.074877916440586, + "grad_norm": 0.03803849592804909, + "learning_rate": 9.45846380548904e-08, + "loss": 0.0079, + "step": 50175 + }, + { + "epoch": 9.079399529752216, + "grad_norm": 0.17854449152946472, + "learning_rate": 9.412414809357155e-08, + "loss": 0.0032, + "step": 50200 + }, + { + "epoch": 9.083921143063845, + "grad_norm": 0.07141181081533432, + "learning_rate": 9.366365813225272e-08, + "loss": 0.0009, + "step": 50225 + }, + { + "epoch": 9.088442756375475, + "grad_norm": 0.11261973530054092, + "learning_rate": 9.320316817093386e-08, + "loss": 0.0006, + "step": 50250 + }, + { + "epoch": 9.092964369687104, + "grad_norm": 0.009887372143566608, + "learning_rate": 9.274267820961503e-08, + "loss": 0.0028, + "step": 50275 + }, + { + "epoch": 9.097485982998734, + "grad_norm": 0.03133253753185272, + "learning_rate": 9.228218824829618e-08, + "loss": 0.0106, + "step": 50300 + }, + { + "epoch": 9.102007596310363, + "grad_norm": 0.11163907498121262, + "learning_rate": 9.182169828697733e-08, + "loss": 0.0012, + "step": 50325 + }, + { + "epoch": 9.106529209621993, + "grad_norm": 0.004720740485936403, + "learning_rate": 9.13612083256585e-08, + "loss": 0.0051, + "step": 50350 + }, + { + "epoch": 9.111050822933622, + "grad_norm": 0.031010426580905914, + "learning_rate": 9.090071836433965e-08, + "loss": 0.0008, + "step": 50375 + }, + { + "epoch": 9.115572436245252, + "grad_norm": 0.02335488423705101, + "learning_rate": 9.04402284030208e-08, + "loss": 0.0129, + "step": 50400 + }, + { + "epoch": 
9.120094049556881, + "grad_norm": 0.0240317415446043, + "learning_rate": 8.997973844170197e-08, + "loss": 0.0024, + "step": 50425 + }, + { + "epoch": 9.12461566286851, + "grad_norm": 0.03213008865714073, + "learning_rate": 8.951924848038312e-08, + "loss": 0.0052, + "step": 50450 + }, + { + "epoch": 9.129137276180142, + "grad_norm": 0.04678371921181679, + "learning_rate": 8.905875851906428e-08, + "loss": 0.0014, + "step": 50475 + }, + { + "epoch": 9.133658889491771, + "grad_norm": 1.1161173582077026, + "learning_rate": 8.859826855774543e-08, + "loss": 0.0005, + "step": 50500 + }, + { + "epoch": 9.1381805028034, + "grad_norm": 0.1399179995059967, + "learning_rate": 8.81377785964266e-08, + "loss": 0.0063, + "step": 50525 + }, + { + "epoch": 9.14270211611503, + "grad_norm": 0.05243779718875885, + "learning_rate": 8.767728863510775e-08, + "loss": 0.002, + "step": 50550 + }, + { + "epoch": 9.14722372942666, + "grad_norm": 0.03155532851815224, + "learning_rate": 8.72167986737889e-08, + "loss": 0.0164, + "step": 50575 + }, + { + "epoch": 9.15174534273829, + "grad_norm": 0.025026287883520126, + "learning_rate": 8.675630871247007e-08, + "loss": 0.0037, + "step": 50600 + }, + { + "epoch": 9.156266956049919, + "grad_norm": 0.03365040570497513, + "learning_rate": 8.629581875115122e-08, + "loss": 0.0006, + "step": 50625 + }, + { + "epoch": 9.160788569361548, + "grad_norm": 0.01698467880487442, + "learning_rate": 8.583532878983237e-08, + "loss": 0.0075, + "step": 50650 + }, + { + "epoch": 9.165310182673178, + "grad_norm": 2.0226006507873535, + "learning_rate": 8.537483882851354e-08, + "loss": 0.0007, + "step": 50675 + }, + { + "epoch": 9.169831795984807, + "grad_norm": 0.2596571445465088, + "learning_rate": 8.49143488671947e-08, + "loss": 0.0019, + "step": 50700 + }, + { + "epoch": 9.174353409296437, + "grad_norm": 0.1478416472673416, + "learning_rate": 8.445385890587585e-08, + "loss": 0.0014, + "step": 50725 + }, + { + "epoch": 9.178875022608066, + "grad_norm": 
0.015776338055729866, + "learning_rate": 8.399336894455701e-08, + "loss": 0.0152, + "step": 50750 + }, + { + "epoch": 9.183396635919696, + "grad_norm": 0.011738612316548824, + "learning_rate": 8.353287898323815e-08, + "loss": 0.0007, + "step": 50775 + }, + { + "epoch": 9.187918249231325, + "grad_norm": 0.031315308064222336, + "learning_rate": 8.307238902191932e-08, + "loss": 0.0013, + "step": 50800 + }, + { + "epoch": 9.192439862542955, + "grad_norm": 0.19211354851722717, + "learning_rate": 8.261189906060049e-08, + "loss": 0.001, + "step": 50825 + }, + { + "epoch": 9.196961475854584, + "grad_norm": 0.03229336068034172, + "learning_rate": 8.215140909928163e-08, + "loss": 0.0097, + "step": 50850 + }, + { + "epoch": 9.201483089166215, + "grad_norm": 0.010991367511451244, + "learning_rate": 8.169091913796279e-08, + "loss": 0.0149, + "step": 50875 + }, + { + "epoch": 9.206004702477845, + "grad_norm": 0.014507956802845001, + "learning_rate": 8.123042917664394e-08, + "loss": 0.0074, + "step": 50900 + }, + { + "epoch": 9.210526315789474, + "grad_norm": 2.2018537521362305, + "learning_rate": 8.07699392153251e-08, + "loss": 0.0245, + "step": 50925 + }, + { + "epoch": 9.215047929101104, + "grad_norm": 0.4151630699634552, + "learning_rate": 8.030944925400626e-08, + "loss": 0.0402, + "step": 50950 + }, + { + "epoch": 9.219569542412733, + "grad_norm": 0.041080426424741745, + "learning_rate": 7.98489592926874e-08, + "loss": 0.0219, + "step": 50975 + }, + { + "epoch": 9.224091155724363, + "grad_norm": 0.026268433779478073, + "learning_rate": 7.938846933136857e-08, + "loss": 0.0091, + "step": 51000 + }, + { + "epoch": 9.228612769035992, + "grad_norm": 0.04076811671257019, + "learning_rate": 7.892797937004974e-08, + "loss": 0.0012, + "step": 51025 + }, + { + "epoch": 9.233134382347622, + "grad_norm": 0.011797059327363968, + "learning_rate": 7.846748940873088e-08, + "loss": 0.005, + "step": 51050 + }, + { + "epoch": 9.237655995659251, + "grad_norm": 2.813450574874878, + 
"learning_rate": 7.800699944741204e-08, + "loss": 0.001, + "step": 51075 + }, + { + "epoch": 9.24217760897088, + "grad_norm": 0.11257357895374298, + "learning_rate": 7.754650948609321e-08, + "loss": 0.0089, + "step": 51100 + }, + { + "epoch": 9.24669922228251, + "grad_norm": 0.01619679108262062, + "learning_rate": 7.708601952477435e-08, + "loss": 0.0009, + "step": 51125 + }, + { + "epoch": 9.25122083559414, + "grad_norm": 0.0227675624191761, + "learning_rate": 7.662552956345551e-08, + "loss": 0.001, + "step": 51150 + }, + { + "epoch": 9.25574244890577, + "grad_norm": 19.843429565429688, + "learning_rate": 7.616503960213668e-08, + "loss": 0.0012, + "step": 51175 + }, + { + "epoch": 9.260264062217399, + "grad_norm": 0.17110028862953186, + "learning_rate": 7.570454964081782e-08, + "loss": 0.0048, + "step": 51200 + }, + { + "epoch": 9.264785675529028, + "grad_norm": 0.034038007259368896, + "learning_rate": 7.524405967949899e-08, + "loss": 0.0061, + "step": 51225 + }, + { + "epoch": 9.269307288840658, + "grad_norm": 0.8703758716583252, + "learning_rate": 7.478356971818014e-08, + "loss": 0.0041, + "step": 51250 + }, + { + "epoch": 9.273828902152289, + "grad_norm": 0.019711392000317574, + "learning_rate": 7.432307975686129e-08, + "loss": 0.0105, + "step": 51275 + }, + { + "epoch": 9.278350515463918, + "grad_norm": 12.372844696044922, + "learning_rate": 7.386258979554246e-08, + "loss": 0.0258, + "step": 51300 + }, + { + "epoch": 9.282872128775548, + "grad_norm": 0.03570927679538727, + "learning_rate": 7.340209983422361e-08, + "loss": 0.027, + "step": 51325 + }, + { + "epoch": 9.287393742087177, + "grad_norm": 0.2612115144729614, + "learning_rate": 7.294160987290476e-08, + "loss": 0.0166, + "step": 51350 + }, + { + "epoch": 9.291915355398807, + "grad_norm": 0.10630948096513748, + "learning_rate": 7.248111991158593e-08, + "loss": 0.0233, + "step": 51375 + }, + { + "epoch": 9.296436968710436, + "grad_norm": 0.08389817923307419, + "learning_rate": 7.202062995026708e-08, + 
"loss": 0.0012, + "step": 51400 + }, + { + "epoch": 9.300958582022066, + "grad_norm": 0.012098311446607113, + "learning_rate": 7.156013998894824e-08, + "loss": 0.0049, + "step": 51425 + }, + { + "epoch": 9.305480195333695, + "grad_norm": 1.7195242643356323, + "learning_rate": 7.109965002762939e-08, + "loss": 0.0013, + "step": 51450 + }, + { + "epoch": 9.310001808645325, + "grad_norm": 0.005529410671442747, + "learning_rate": 7.063916006631056e-08, + "loss": 0.0005, + "step": 51475 + }, + { + "epoch": 9.314523421956954, + "grad_norm": 14.70275592803955, + "learning_rate": 7.017867010499171e-08, + "loss": 0.0074, + "step": 51500 + }, + { + "epoch": 9.319045035268584, + "grad_norm": 0.03697911649942398, + "learning_rate": 6.971818014367286e-08, + "loss": 0.0019, + "step": 51525 + }, + { + "epoch": 9.323566648580213, + "grad_norm": 0.023678451776504517, + "learning_rate": 6.927610978080678e-08, + "loss": 0.0067, + "step": 51550 + }, + { + "epoch": 9.328088261891843, + "grad_norm": 0.056655462831258774, + "learning_rate": 6.881561981948793e-08, + "loss": 0.0029, + "step": 51575 + }, + { + "epoch": 9.332609875203472, + "grad_norm": 0.2097851186990738, + "learning_rate": 6.835512985816909e-08, + "loss": 0.0002, + "step": 51600 + }, + { + "epoch": 9.337131488515102, + "grad_norm": 0.030322756618261337, + "learning_rate": 6.789463989685025e-08, + "loss": 0.02, + "step": 51625 + }, + { + "epoch": 9.341653101826731, + "grad_norm": 0.05204546079039574, + "learning_rate": 6.74341499355314e-08, + "loss": 0.0002, + "step": 51650 + }, + { + "epoch": 9.34617471513836, + "grad_norm": 5.798641204833984, + "learning_rate": 6.697365997421256e-08, + "loss": 0.0012, + "step": 51675 + }, + { + "epoch": 9.350696328449992, + "grad_norm": 1.0529894828796387, + "learning_rate": 6.651317001289372e-08, + "loss": 0.016, + "step": 51700 + }, + { + "epoch": 9.355217941761621, + "grad_norm": 0.03712335228919983, + "learning_rate": 6.605268005157486e-08, + "loss": 0.0015, + "step": 51725 + }, + { + 
"epoch": 9.35973955507325, + "grad_norm": 0.20770879089832306, + "learning_rate": 6.559219009025603e-08, + "loss": 0.0245, + "step": 51750 + }, + { + "epoch": 9.36426116838488, + "grad_norm": 0.034741051495075226, + "learning_rate": 6.51317001289372e-08, + "loss": 0.0384, + "step": 51775 + }, + { + "epoch": 9.36878278169651, + "grad_norm": 0.06246571242809296, + "learning_rate": 6.467121016761834e-08, + "loss": 0.0029, + "step": 51800 + }, + { + "epoch": 9.37330439500814, + "grad_norm": 0.019802218303084373, + "learning_rate": 6.42107202062995e-08, + "loss": 0.0017, + "step": 51825 + }, + { + "epoch": 9.377826008319769, + "grad_norm": 0.035850297659635544, + "learning_rate": 6.375023024498067e-08, + "loss": 0.0024, + "step": 51850 + }, + { + "epoch": 9.382347621631398, + "grad_norm": 0.01637556403875351, + "learning_rate": 6.328974028366181e-08, + "loss": 0.0005, + "step": 51875 + }, + { + "epoch": 9.386869234943028, + "grad_norm": 0.04414265602827072, + "learning_rate": 6.282925032234297e-08, + "loss": 0.0038, + "step": 51900 + }, + { + "epoch": 9.391390848254657, + "grad_norm": 0.048500921577215195, + "learning_rate": 6.236876036102413e-08, + "loss": 0.0088, + "step": 51925 + }, + { + "epoch": 9.395912461566287, + "grad_norm": 0.22167189419269562, + "learning_rate": 6.190827039970528e-08, + "loss": 0.0033, + "step": 51950 + }, + { + "epoch": 9.400434074877916, + "grad_norm": 0.02152320370078087, + "learning_rate": 6.144778043838645e-08, + "loss": 0.0014, + "step": 51975 + }, + { + "epoch": 9.404955688189546, + "grad_norm": 0.016991982236504555, + "learning_rate": 6.09872904770676e-08, + "loss": 0.007, + "step": 52000 + }, + { + "epoch": 9.409477301501175, + "grad_norm": 0.01895890012383461, + "learning_rate": 6.052680051574875e-08, + "loss": 0.0049, + "step": 52025 + }, + { + "epoch": 9.413998914812804, + "grad_norm": 0.02496323734521866, + "learning_rate": 6.006631055442992e-08, + "loss": 0.0107, + "step": 52050 + }, + { + "epoch": 9.418520528124434, + 
"grad_norm": 5.717367649078369, + "learning_rate": 5.960582059311107e-08, + "loss": 0.0118, + "step": 52075 + }, + { + "epoch": 9.423042141436065, + "grad_norm": 0.02132461778819561, + "learning_rate": 5.9145330631792224e-08, + "loss": 0.008, + "step": 52100 + }, + { + "epoch": 9.427563754747695, + "grad_norm": 0.018231956288218498, + "learning_rate": 5.8684840670473384e-08, + "loss": 0.0376, + "step": 52125 + }, + { + "epoch": 9.432085368059324, + "grad_norm": 19.91530418395996, + "learning_rate": 5.822435070915454e-08, + "loss": 0.0246, + "step": 52150 + }, + { + "epoch": 9.436606981370954, + "grad_norm": 0.037468716502189636, + "learning_rate": 5.7763860747835697e-08, + "loss": 0.0306, + "step": 52175 + }, + { + "epoch": 9.441128594682583, + "grad_norm": 0.013554728589951992, + "learning_rate": 5.730337078651685e-08, + "loss": 0.0054, + "step": 52200 + }, + { + "epoch": 9.445650207994213, + "grad_norm": 0.023764220997691154, + "learning_rate": 5.684288082519801e-08, + "loss": 0.006, + "step": 52225 + }, + { + "epoch": 9.450171821305842, + "grad_norm": 0.040070127695798874, + "learning_rate": 5.638239086387916e-08, + "loss": 0.0008, + "step": 52250 + }, + { + "epoch": 9.454693434617472, + "grad_norm": 0.10571596771478653, + "learning_rate": 5.592190090256032e-08, + "loss": 0.0013, + "step": 52275 + }, + { + "epoch": 9.459215047929101, + "grad_norm": 0.14270947873592377, + "learning_rate": 5.546141094124148e-08, + "loss": 0.0129, + "step": 52300 + }, + { + "epoch": 9.46373666124073, + "grad_norm": 0.04533557966351509, + "learning_rate": 5.5000920979922634e-08, + "loss": 0.0006, + "step": 52325 + }, + { + "epoch": 9.46825827455236, + "grad_norm": 0.06353598833084106, + "learning_rate": 5.454043101860379e-08, + "loss": 0.0005, + "step": 52350 + }, + { + "epoch": 9.47277988786399, + "grad_norm": 0.016834860667586327, + "learning_rate": 5.4079941057284954e-08, + "loss": 0.0031, + "step": 52375 + }, + { + "epoch": 9.477301501175619, + "grad_norm": 0.012295857071876526, 
+ "learning_rate": 5.3619451095966107e-08, + "loss": 0.0003, + "step": 52400 + }, + { + "epoch": 9.481823114487248, + "grad_norm": 10.514667510986328, + "learning_rate": 5.315896113464726e-08, + "loss": 0.008, + "step": 52425 + }, + { + "epoch": 9.486344727798878, + "grad_norm": 0.813252329826355, + "learning_rate": 5.269847117332842e-08, + "loss": 0.0014, + "step": 52450 + }, + { + "epoch": 9.490866341110507, + "grad_norm": 2.42742657661438, + "learning_rate": 5.223798121200958e-08, + "loss": 0.0026, + "step": 52475 + }, + { + "epoch": 9.495387954422139, + "grad_norm": 0.03881525248289108, + "learning_rate": 5.177749125069073e-08, + "loss": 0.0107, + "step": 52500 + }, + { + "epoch": 9.499909567733768, + "grad_norm": 0.09639015793800354, + "learning_rate": 5.1317001289371885e-08, + "loss": 0.0052, + "step": 52525 + }, + { + "epoch": 9.504431181045398, + "grad_norm": 0.16553708910942078, + "learning_rate": 5.085651132805305e-08, + "loss": 0.0188, + "step": 52550 + }, + { + "epoch": 9.508952794357027, + "grad_norm": 9.378378868103027, + "learning_rate": 5.0396021366734204e-08, + "loss": 0.0339, + "step": 52575 + }, + { + "epoch": 9.513474407668657, + "grad_norm": 0.2166759967803955, + "learning_rate": 4.993553140541536e-08, + "loss": 0.0012, + "step": 52600 + }, + { + "epoch": 9.517996020980286, + "grad_norm": 0.026047270745038986, + "learning_rate": 4.9475041444096517e-08, + "loss": 0.0091, + "step": 52625 + }, + { + "epoch": 9.522517634291916, + "grad_norm": 0.03240982070565224, + "learning_rate": 4.9014551482777676e-08, + "loss": 0.0085, + "step": 52650 + }, + { + "epoch": 9.527039247603545, + "grad_norm": 0.07469449937343597, + "learning_rate": 4.855406152145883e-08, + "loss": 0.0016, + "step": 52675 + }, + { + "epoch": 9.531560860915175, + "grad_norm": 0.034668173640966415, + "learning_rate": 4.809357156013999e-08, + "loss": 0.0008, + "step": 52700 + }, + { + "epoch": 9.536082474226804, + "grad_norm": 0.00523386849090457, + "learning_rate": 
4.763308159882114e-08, + "loss": 0.0074, + "step": 52725 + }, + { + "epoch": 9.540604087538433, + "grad_norm": 0.018630068749189377, + "learning_rate": 4.71725916375023e-08, + "loss": 0.0108, + "step": 52750 + }, + { + "epoch": 9.545125700850063, + "grad_norm": 0.02157723344862461, + "learning_rate": 4.671210167618346e-08, + "loss": 0.0078, + "step": 52775 + }, + { + "epoch": 9.549647314161692, + "grad_norm": 0.016346458345651627, + "learning_rate": 4.6251611714864614e-08, + "loss": 0.0123, + "step": 52800 + }, + { + "epoch": 9.554168927473322, + "grad_norm": 0.027240611612796783, + "learning_rate": 4.579112175354577e-08, + "loss": 0.0002, + "step": 52825 + }, + { + "epoch": 9.558690540784951, + "grad_norm": 1.3178123235702515, + "learning_rate": 4.533063179222693e-08, + "loss": 0.0038, + "step": 52850 + }, + { + "epoch": 9.56321215409658, + "grad_norm": 0.025551458820700645, + "learning_rate": 4.4870141830908086e-08, + "loss": 0.011, + "step": 52875 + }, + { + "epoch": 9.56773376740821, + "grad_norm": 0.06759845465421677, + "learning_rate": 4.440965186958924e-08, + "loss": 0.0013, + "step": 52900 + }, + { + "epoch": 9.572255380719842, + "grad_norm": 0.01742335967719555, + "learning_rate": 4.394916190827039e-08, + "loss": 0.0124, + "step": 52925 + }, + { + "epoch": 9.576776994031471, + "grad_norm": 0.09457490593194962, + "learning_rate": 4.348867194695156e-08, + "loss": 0.0146, + "step": 52950 + }, + { + "epoch": 9.5812986073431, + "grad_norm": 0.11490760743618011, + "learning_rate": 4.302818198563271e-08, + "loss": 0.0145, + "step": 52975 + }, + { + "epoch": 9.58582022065473, + "grad_norm": 0.17125943303108215, + "learning_rate": 4.2567692024313865e-08, + "loss": 0.0047, + "step": 53000 + }, + { + "epoch": 9.59034183396636, + "grad_norm": 0.08460250496864319, + "learning_rate": 4.2107202062995024e-08, + "loss": 0.0045, + "step": 53025 + }, + { + "epoch": 9.594863447277989, + "grad_norm": 0.5364235639572144, + "learning_rate": 4.1646712101676184e-08, + "loss": 
0.0014, + "step": 53050 + }, + { + "epoch": 9.599385060589618, + "grad_norm": 0.2898434102535248, + "learning_rate": 4.118622214035734e-08, + "loss": 0.0034, + "step": 53075 + }, + { + "epoch": 9.603906673901248, + "grad_norm": 0.003405811497941613, + "learning_rate": 4.0725732179038496e-08, + "loss": 0.0064, + "step": 53100 + }, + { + "epoch": 9.608428287212877, + "grad_norm": 0.03517955541610718, + "learning_rate": 4.0265242217719656e-08, + "loss": 0.005, + "step": 53125 + }, + { + "epoch": 9.612949900524507, + "grad_norm": 0.005290856584906578, + "learning_rate": 3.980475225640081e-08, + "loss": 0.006, + "step": 53150 + }, + { + "epoch": 9.617471513836136, + "grad_norm": 0.026814907789230347, + "learning_rate": 3.934426229508197e-08, + "loss": 0.0026, + "step": 53175 + }, + { + "epoch": 9.621993127147766, + "grad_norm": 0.03042653575539589, + "learning_rate": 3.888377233376312e-08, + "loss": 0.0061, + "step": 53200 + }, + { + "epoch": 9.626514740459395, + "grad_norm": 0.010026361793279648, + "learning_rate": 3.842328237244428e-08, + "loss": 0.0002, + "step": 53225 + }, + { + "epoch": 9.631036353771025, + "grad_norm": 0.01914265938103199, + "learning_rate": 3.7962792411125434e-08, + "loss": 0.005, + "step": 53250 + }, + { + "epoch": 9.635557967082654, + "grad_norm": 0.05811558663845062, + "learning_rate": 3.7502302449806594e-08, + "loss": 0.0126, + "step": 53275 + }, + { + "epoch": 9.640079580394286, + "grad_norm": 0.013714855536818504, + "learning_rate": 3.704181248848775e-08, + "loss": 0.0215, + "step": 53300 + }, + { + "epoch": 9.644601193705915, + "grad_norm": 0.0346146784722805, + "learning_rate": 3.6581322527168906e-08, + "loss": 0.0144, + "step": 53325 + }, + { + "epoch": 9.649122807017545, + "grad_norm": 0.0485786534845829, + "learning_rate": 3.6120832565850066e-08, + "loss": 0.015, + "step": 53350 + }, + { + "epoch": 9.653644420329174, + "grad_norm": 0.06555884331464767, + "learning_rate": 3.566034260453122e-08, + "loss": 0.0078, + "step": 53375 + }, + { 
+ "epoch": 9.658166033640804, + "grad_norm": 0.03746599331498146, + "learning_rate": 3.519985264321237e-08, + "loss": 0.0021, + "step": 53400 + }, + { + "epoch": 9.662687646952433, + "grad_norm": 0.04957371950149536, + "learning_rate": 3.473936268189354e-08, + "loss": 0.009, + "step": 53425 + }, + { + "epoch": 9.667209260264062, + "grad_norm": 0.020930081605911255, + "learning_rate": 3.427887272057469e-08, + "loss": 0.0011, + "step": 53450 + }, + { + "epoch": 9.671730873575692, + "grad_norm": 0.014865943230688572, + "learning_rate": 3.3818382759255844e-08, + "loss": 0.0005, + "step": 53475 + }, + { + "epoch": 9.676252486887321, + "grad_norm": 0.025175156071782112, + "learning_rate": 3.3357892797937004e-08, + "loss": 0.0012, + "step": 53500 + }, + { + "epoch": 9.680774100198951, + "grad_norm": 0.03397619351744652, + "learning_rate": 3.2897402836618163e-08, + "loss": 0.0011, + "step": 53525 + }, + { + "epoch": 9.68529571351058, + "grad_norm": 0.02394242398440838, + "learning_rate": 3.2436912875299316e-08, + "loss": 0.0036, + "step": 53550 + }, + { + "epoch": 9.68981732682221, + "grad_norm": 0.08645796030759811, + "learning_rate": 3.197642291398047e-08, + "loss": 0.0005, + "step": 53575 + }, + { + "epoch": 9.69433894013384, + "grad_norm": 0.08837512135505676, + "learning_rate": 3.151593295266163e-08, + "loss": 0.0034, + "step": 53600 + }, + { + "epoch": 9.698860553445469, + "grad_norm": 0.0059136999770998955, + "learning_rate": 3.105544299134279e-08, + "loss": 0.0028, + "step": 53625 + }, + { + "epoch": 9.703382166757098, + "grad_norm": 0.06296961009502411, + "learning_rate": 3.059495303002394e-08, + "loss": 0.0057, + "step": 53650 + }, + { + "epoch": 9.707903780068728, + "grad_norm": 24.09531593322754, + "learning_rate": 3.01344630687051e-08, + "loss": 0.0048, + "step": 53675 + }, + { + "epoch": 9.712425393380357, + "grad_norm": 0.07932830601930618, + "learning_rate": 2.9673973107386258e-08, + "loss": 0.0354, + "step": 53700 + }, + { + "epoch": 9.716947006691989, + 
"grad_norm": 0.6279693841934204, + "learning_rate": 2.9213483146067417e-08, + "loss": 0.0024, + "step": 53725 + }, + { + "epoch": 9.721468620003618, + "grad_norm": 0.018382834270596504, + "learning_rate": 2.875299318474857e-08, + "loss": 0.0033, + "step": 53750 + }, + { + "epoch": 9.725990233315247, + "grad_norm": 0.13161396980285645, + "learning_rate": 2.829250322342973e-08, + "loss": 0.0269, + "step": 53775 + }, + { + "epoch": 9.730511846626877, + "grad_norm": 0.08527792245149612, + "learning_rate": 2.7832013262110883e-08, + "loss": 0.0098, + "step": 53800 + }, + { + "epoch": 9.735033459938506, + "grad_norm": 0.020729778334498405, + "learning_rate": 2.7371523300792042e-08, + "loss": 0.0041, + "step": 53825 + }, + { + "epoch": 9.739555073250136, + "grad_norm": 0.19172513484954834, + "learning_rate": 2.69110333394732e-08, + "loss": 0.0041, + "step": 53850 + }, + { + "epoch": 9.744076686561765, + "grad_norm": 0.03183312341570854, + "learning_rate": 2.6450543378154355e-08, + "loss": 0.0012, + "step": 53875 + }, + { + "epoch": 9.748598299873395, + "grad_norm": 5.034526824951172, + "learning_rate": 2.599005341683551e-08, + "loss": 0.0018, + "step": 53900 + }, + { + "epoch": 9.753119913185024, + "grad_norm": 0.0769072026014328, + "learning_rate": 2.552956345551667e-08, + "loss": 0.0005, + "step": 53925 + }, + { + "epoch": 9.757641526496654, + "grad_norm": 0.032027099281549454, + "learning_rate": 2.5069073494197824e-08, + "loss": 0.0056, + "step": 53950 + }, + { + "epoch": 9.762163139808283, + "grad_norm": 0.08156726509332657, + "learning_rate": 2.4608583532878984e-08, + "loss": 0.0068, + "step": 53975 + }, + { + "epoch": 9.766684753119913, + "grad_norm": 0.010565202683210373, + "learning_rate": 2.4148093571560137e-08, + "loss": 0.0032, + "step": 54000 + }, + { + "epoch": 9.771206366431542, + "grad_norm": 0.027527930215001106, + "learning_rate": 2.3687603610241296e-08, + "loss": 0.0003, + "step": 54025 + }, + { + "epoch": 9.775727979743172, + "grad_norm": 
0.07169647514820099, + "learning_rate": 2.3227113648922452e-08, + "loss": 0.0066, + "step": 54050 + }, + { + "epoch": 9.780249593054801, + "grad_norm": 0.0017488193698227406, + "learning_rate": 2.276662368760361e-08, + "loss": 0.0004, + "step": 54075 + }, + { + "epoch": 9.78477120636643, + "grad_norm": 0.011047018691897392, + "learning_rate": 2.2306133726284765e-08, + "loss": 0.0077, + "step": 54100 + }, + { + "epoch": 9.78929281967806, + "grad_norm": 0.05196432024240494, + "learning_rate": 2.1845643764965925e-08, + "loss": 0.0067, + "step": 54125 + }, + { + "epoch": 9.793814432989691, + "grad_norm": 0.19424718618392944, + "learning_rate": 2.1385153803647078e-08, + "loss": 0.0115, + "step": 54150 + }, + { + "epoch": 9.798336046301321, + "grad_norm": 0.016021044924855232, + "learning_rate": 2.0924663842328237e-08, + "loss": 0.0218, + "step": 54175 + }, + { + "epoch": 9.80285765961295, + "grad_norm": 0.031037848442792892, + "learning_rate": 2.0482593479462148e-08, + "loss": 0.0081, + "step": 54200 + }, + { + "epoch": 9.80737927292458, + "grad_norm": 5.485635757446289, + "learning_rate": 2.0022103518143304e-08, + "loss": 0.0037, + "step": 54225 + }, + { + "epoch": 9.81190088623621, + "grad_norm": 0.02747327648103237, + "learning_rate": 1.956161355682446e-08, + "loss": 0.0044, + "step": 54250 + }, + { + "epoch": 9.816422499547839, + "grad_norm": 0.1059911772608757, + "learning_rate": 1.9101123595505617e-08, + "loss": 0.0038, + "step": 54275 + }, + { + "epoch": 9.820944112859468, + "grad_norm": 0.010986040346324444, + "learning_rate": 1.8640633634186776e-08, + "loss": 0.0066, + "step": 54300 + }, + { + "epoch": 9.825465726171098, + "grad_norm": 0.17033414542675018, + "learning_rate": 1.818014367286793e-08, + "loss": 0.0047, + "step": 54325 + }, + { + "epoch": 9.829987339482727, + "grad_norm": 0.007194284815341234, + "learning_rate": 1.771965371154909e-08, + "loss": 0.0036, + "step": 54350 + }, + { + "epoch": 9.834508952794357, + "grad_norm": 0.02418128214776516, + 
"learning_rate": 1.7259163750230242e-08, + "loss": 0.002, + "step": 54375 + }, + { + "epoch": 9.839030566105986, + "grad_norm": 0.11744951456785202, + "learning_rate": 1.67986737889114e-08, + "loss": 0.0021, + "step": 54400 + }, + { + "epoch": 9.843552179417616, + "grad_norm": 0.0121499327942729, + "learning_rate": 1.6338183827592558e-08, + "loss": 0.0079, + "step": 54425 + }, + { + "epoch": 9.848073792729245, + "grad_norm": 0.014060701243579388, + "learning_rate": 1.5877693866273714e-08, + "loss": 0.001, + "step": 54450 + }, + { + "epoch": 9.852595406040875, + "grad_norm": 2.7887675762176514, + "learning_rate": 1.541720390495487e-08, + "loss": 0.0038, + "step": 54475 + }, + { + "epoch": 9.857117019352504, + "grad_norm": 0.006395564880222082, + "learning_rate": 1.4956713943636027e-08, + "loss": 0.029, + "step": 54500 + }, + { + "epoch": 9.861638632664135, + "grad_norm": 0.06236935779452324, + "learning_rate": 1.4496223982317185e-08, + "loss": 0.0107, + "step": 54525 + }, + { + "epoch": 9.866160245975765, + "grad_norm": 4.1836981773376465, + "learning_rate": 1.4035734020998342e-08, + "loss": 0.0343, + "step": 54550 + }, + { + "epoch": 9.870681859287394, + "grad_norm": 0.0794193297624588, + "learning_rate": 1.3575244059679499e-08, + "loss": 0.0146, + "step": 54575 + }, + { + "epoch": 9.875203472599024, + "grad_norm": 0.023560110479593277, + "learning_rate": 1.3114754098360655e-08, + "loss": 0.0052, + "step": 54600 + }, + { + "epoch": 9.879725085910653, + "grad_norm": 0.2117091715335846, + "learning_rate": 1.2654264137041811e-08, + "loss": 0.005, + "step": 54625 + }, + { + "epoch": 9.884246699222283, + "grad_norm": 0.0047377352602779865, + "learning_rate": 1.2193774175722968e-08, + "loss": 0.0005, + "step": 54650 + }, + { + "epoch": 9.888768312533912, + "grad_norm": 0.25400134921073914, + "learning_rate": 1.1733284214404126e-08, + "loss": 0.008, + "step": 54675 + }, + { + "epoch": 9.893289925845542, + "grad_norm": 0.09486464411020279, + "learning_rate": 
1.1272794253085282e-08, + "loss": 0.0044, + "step": 54700 + }, + { + "epoch": 9.897811539157171, + "grad_norm": 0.02684643305838108, + "learning_rate": 1.0812304291766438e-08, + "loss": 0.0069, + "step": 54725 + }, + { + "epoch": 9.9023331524688, + "grad_norm": 0.025187574326992035, + "learning_rate": 1.0351814330447595e-08, + "loss": 0.0007, + "step": 54750 + }, + { + "epoch": 9.90685476578043, + "grad_norm": 0.034975674003362656, + "learning_rate": 9.891324369128753e-09, + "loss": 0.0071, + "step": 54775 + }, + { + "epoch": 9.91137637909206, + "grad_norm": 15.95773983001709, + "learning_rate": 9.430834407809909e-09, + "loss": 0.011, + "step": 54800 + }, + { + "epoch": 9.91589799240369, + "grad_norm": 0.026726465672254562, + "learning_rate": 8.970344446491065e-09, + "loss": 0.0006, + "step": 54825 + }, + { + "epoch": 9.920419605715319, + "grad_norm": 0.020367203280329704, + "learning_rate": 8.509854485172221e-09, + "loss": 0.0035, + "step": 54850 + }, + { + "epoch": 9.924941219026948, + "grad_norm": 0.12988033890724182, + "learning_rate": 8.049364523853381e-09, + "loss": 0.005, + "step": 54875 + }, + { + "epoch": 9.929462832338578, + "grad_norm": 0.06096798926591873, + "learning_rate": 7.588874562534537e-09, + "loss": 0.004, + "step": 54900 + }, + { + "epoch": 9.933984445650207, + "grad_norm": 0.9536722898483276, + "learning_rate": 7.128384601215693e-09, + "loss": 0.0036, + "step": 54925 + }, + { + "epoch": 9.938506058961838, + "grad_norm": 17.520708084106445, + "learning_rate": 6.66789463989685e-09, + "loss": 0.022, + "step": 54950 + }, + { + "epoch": 9.943027672273468, + "grad_norm": 0.010756449773907661, + "learning_rate": 6.207404678578006e-09, + "loss": 0.0402, + "step": 54975 + }, + { + "epoch": 9.947549285585097, + "grad_norm": 0.015861673280596733, + "learning_rate": 5.746914717259163e-09, + "loss": 0.0017, + "step": 55000 + }, + { + "epoch": 9.952070898896727, + "grad_norm": 0.008379822596907616, + "learning_rate": 5.2864247559403205e-09, + "loss": 
0.0052, + "step": 55025 + }, + { + "epoch": 9.956592512208356, + "grad_norm": 0.15586745738983154, + "learning_rate": 4.825934794621478e-09, + "loss": 0.0054, + "step": 55050 + }, + { + "epoch": 9.961114125519986, + "grad_norm": 0.022011611610651016, + "learning_rate": 4.365444833302634e-09, + "loss": 0.0012, + "step": 55075 + }, + { + "epoch": 9.965635738831615, + "grad_norm": 0.012549543753266335, + "learning_rate": 3.90495487198379e-09, + "loss": 0.0007, + "step": 55100 + }, + { + "epoch": 9.970157352143245, + "grad_norm": 0.8270652890205383, + "learning_rate": 3.4444649106649474e-09, + "loss": 0.0063, + "step": 55125 + }, + { + "epoch": 9.974678965454874, + "grad_norm": 0.0325373113155365, + "learning_rate": 2.983974949346104e-09, + "loss": 0.0008, + "step": 55150 + }, + { + "epoch": 9.979200578766504, + "grad_norm": 0.015854543074965477, + "learning_rate": 2.523484988027261e-09, + "loss": 0.0065, + "step": 55175 + }, + { + "epoch": 9.983722192078133, + "grad_norm": 16.109289169311523, + "learning_rate": 2.0629950267084176e-09, + "loss": 0.0122, + "step": 55200 + }, + { + "epoch": 9.988243805389763, + "grad_norm": 3.6053996086120605, + "learning_rate": 1.6025050653895745e-09, + "loss": 0.0018, + "step": 55225 + }, + { + "epoch": 9.992765418701392, + "grad_norm": 0.009253941476345062, + "learning_rate": 1.1420151040707312e-09, + "loss": 0.0008, + "step": 55250 + }, + { + "epoch": 9.997287032013022, + "grad_norm": 11.093884468078613, + "learning_rate": 6.815251427518879e-10, + "loss": 0.0159, + "step": 55275 + }, + { + "epoch": 10.0, + "eval_loss": 0.3697284162044525, + "eval_runtime": 8323.367, + "eval_samples_per_second": 1.141, + "eval_steps_per_second": 0.143, + "eval_wer": 0.10309096732863549, + "step": 55290 + }, + { + "epoch": 10.0, + "step": 55290, + "total_flos": 7.517848352823706e+20, + "train_loss": 0.018776766294569685, + "train_runtime": 360719.5114, + "train_samples_per_second": 0.613, + "train_steps_per_second": 0.153 + } + ], + "logging_steps": 
25, + "max_steps": 55290, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.517848352823706e+20, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}