diff --git "a/idefics2/checkpoint-10000/trainer_state.json" "b/idefics2/checkpoint-10000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/idefics2/checkpoint-10000/trainer_state.json" @@ -0,0 +1,7483 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5127448755767339, + "eval_steps": 200, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001512744875576734, + "grad_norm": 0.8282566070556641, + "learning_rate": 0.00019996, + "loss": 3.4576, + "step": 10 + }, + { + "epoch": 0.003025489751153468, + "grad_norm": 0.1628154069185257, + "learning_rate": 0.00019992000000000002, + "loss": 0.0992, + "step": 20 + }, + { + "epoch": 0.004538234626730202, + "grad_norm": 0.17421123385429382, + "learning_rate": 0.00019988, + "loss": 0.0666, + "step": 30 + }, + { + "epoch": 0.006050979502306936, + "grad_norm": 0.08850277960300446, + "learning_rate": 0.00019984, + "loss": 0.0661, + "step": 40 + }, + { + "epoch": 0.00756372437788367, + "grad_norm": 0.11368270963430405, + "learning_rate": 0.0001998, + "loss": 0.0639, + "step": 50 + }, + { + "epoch": 0.009076469253460404, + "grad_norm": 0.12990300357341766, + "learning_rate": 0.00019976000000000003, + "loss": 0.0617, + "step": 60 + }, + { + "epoch": 0.010589214129037138, + "grad_norm": 0.08885369449853897, + "learning_rate": 0.00019972000000000002, + "loss": 0.0643, + "step": 70 + }, + { + "epoch": 0.012101959004613872, + "grad_norm": 0.07073435187339783, + "learning_rate": 0.00019968, + "loss": 0.0629, + "step": 80 + }, + { + "epoch": 0.013614703880190605, + "grad_norm": 0.061856113374233246, + "learning_rate": 0.00019964, + "loss": 0.061, + "step": 90 + }, + { + "epoch": 0.01512744875576734, + "grad_norm": 0.06827201694250107, + "learning_rate": 0.0001996, + "loss": 0.0586, + "step": 100 + }, + { + "epoch": 0.016640193631344075, + "grad_norm": 0.07220456004142761, + "learning_rate": 0.00019956000000000002, + "loss": 0.055, + "step": 110 + }, + { + "epoch": 0.018152938506920808, + "grad_norm": 0.06632555276155472, + "learning_rate": 0.00019952000000000001, + "loss": 0.0586, + "step": 120 + }, + { + "epoch": 0.01966568338249754, + "grad_norm": 0.09966724365949631, + "learning_rate": 0.00019948, + "loss": 0.0621, + "step": 130 + }, + { + "epoch": 0.021178428258074276, + "grad_norm": 0.0833888053894043, + "learning_rate": 0.00019944, + "loss": 0.0591, + "step": 140 + }, + { + "epoch": 0.02269117313365101, + "grad_norm": 0.08170727640390396, + "learning_rate": 0.00019940000000000002, + "loss": 0.055, + "step": 150 + }, + { + "epoch": 0.024203918009227745, + "grad_norm": 0.07089231163263321, + "learning_rate": 0.00019936000000000002, + "loss": 0.0582, + "step": 160 + }, + { + "epoch": 0.025716662884804477, + "grad_norm": 0.09390200674533844, + "learning_rate": 0.00019932, + "loss": 0.0628, + "step": 170 + }, + { + "epoch": 0.02722940776038121, + "grad_norm": 0.06722863018512726, + "learning_rate": 0.00019928, + "loss": 0.0591, + "step": 180 + }, + { + "epoch": 0.028742152635957946, + "grad_norm": 0.0743609368801117, + "learning_rate": 0.00019924, + "loss": 0.0626, + "step": 190 + }, + { + "epoch": 0.03025489751153468, + "grad_norm": 0.08125407248735428, + "learning_rate": 0.00019920000000000002, + "loss": 0.0601, + "step": 200 + }, + { + "epoch": 0.03025489751153468, + "eval_cer": 0.5356160728183765, + "eval_loss": 0.05078176036477089, + "eval_runtime": 10281.8657, + "eval_samples_per_second": 2.047, + "eval_steps_per_second": 0.256, + "step": 200 + }, + { + "epoch": 0.03176764238711141, + "grad_norm": 0.07030890136957169, + "learning_rate": 0.00019916, + "loss": 0.0597, + "step": 210 + }, + { + "epoch": 0.03328038726268815, + "grad_norm": 0.05290469154715538, + "learning_rate": 0.00019912, + "loss": 0.0587, + "step": 220 + }, + { + "epoch": 0.03479313213826488, + "grad_norm": 0.07339277863502502, + "learning_rate": 0.00019908, + "loss": 0.0529, + "step": 230 + }, + { + "epoch": 0.036305877013841616, + "grad_norm": 0.0727711170911789, + "learning_rate": 0.00019904, + "loss": 0.0539, + "step": 240 + }, + { + "epoch": 0.03781862188941835, + "grad_norm": 0.07383541762828827, + "learning_rate": 0.000199, + "loss": 0.0532, + "step": 250 + }, + { + "epoch": 0.03933136676499508, + "grad_norm": 0.07042526453733444, + "learning_rate": 0.00019896, + "loss": 0.0571, + "step": 260 + }, + { + "epoch": 0.04084411164057182, + "grad_norm": 0.08188482373952866, + "learning_rate": 0.00019892000000000003, + "loss": 0.0521, + "step": 270 + }, + { + "epoch": 0.04235685651614855, + "grad_norm": 0.07334589958190918, + "learning_rate": 0.00019888, + "loss": 0.0532, + "step": 280 + }, + { + "epoch": 0.043869601391725285, + "grad_norm": 0.06326377391815186, + "learning_rate": 0.00019884000000000001, + "loss": 0.0528, + "step": 290 + }, + { + "epoch": 0.04538234626730202, + "grad_norm": 0.05303795263171196, + "learning_rate": 0.0001988, + "loss": 0.0539, + "step": 300 + }, + { + "epoch": 0.04689509114287875, + "grad_norm": 0.058723289519548416, + "learning_rate": 0.00019876, + "loss": 0.0469, + "step": 310 + }, + { + "epoch": 0.04840783601845549, + "grad_norm": 0.08683237433433533, + "learning_rate": 0.00019872000000000002, + "loss": 0.0601, + "step": 320 + }, + { + "epoch": 0.04992058089403222, + "grad_norm": 0.07650341093540192, + "learning_rate": 0.00019868, + "loss": 0.0582, + "step": 330 + }, + { + "epoch": 0.051433325769608955, + "grad_norm": 0.054965659976005554, + "learning_rate": 0.00019864, + "loss": 0.0548, + "step": 340 + }, + { + "epoch": 0.05294607064518569, + "grad_norm": 0.06949716061353683, + "learning_rate": 0.0001986, + "loss": 0.0581, + "step": 350 + }, + { + "epoch": 0.05445881552076242, + "grad_norm": 0.10514732450246811, + "learning_rate": 0.00019856000000000002, + "loss": 0.0587, + "step": 360 + }, + { + "epoch": 0.05597156039633916, + "grad_norm": 0.06586117297410965, + "learning_rate": 0.00019852000000000002, + "loss": 0.0561, + "step": 370 + }, + { + "epoch": 0.05748430527191589, + "grad_norm": 0.09821395576000214, + "learning_rate": 0.00019848, + "loss": 0.0556, + "step": 380 + }, + { + "epoch": 0.058997050147492625, + "grad_norm": 0.06488014757633209, + "learning_rate": 0.00019844, + "loss": 0.0634, + "step": 390 + }, + { + "epoch": 0.06050979502306936, + "grad_norm": 0.06910958141088486, + "learning_rate": 0.0001984, + "loss": 0.052, + "step": 400 + }, + { + "epoch": 0.06050979502306936, + "eval_cer": 0.2714758865721352, + "eval_loss": 0.04847713187336922, + "eval_runtime": 10484.76, + "eval_samples_per_second": 2.008, + "eval_steps_per_second": 0.251, + "step": 400 + }, + { + "epoch": 0.0620225398986461, + "grad_norm": 0.048563435673713684, + "learning_rate": 0.00019836000000000002, + "loss": 0.0565, + "step": 410 + }, + { + "epoch": 0.06353528477422282, + "grad_norm": 0.055841896682977676, + "learning_rate": 0.00019832, + "loss": 0.0547, + "step": 420 + }, + { + "epoch": 0.06504802964979955, + "grad_norm": 0.05644605681300163, + "learning_rate": 0.00019828, + "loss": 0.0575, + "step": 430 + }, + { + "epoch": 0.0665607745253763, + "grad_norm": 0.05617703124880791, + "learning_rate": 0.00019824, + "loss": 0.0514, + "step": 440 + }, + { + "epoch": 0.06807351940095303, + "grad_norm": 0.11480820178985596, + "learning_rate": 0.00019820000000000002, + "loss": 0.0562, + "step": 450 + }, + { + "epoch": 0.06958626427652977, + "grad_norm": 0.06004955247044563, + "learning_rate": 0.00019816000000000001, + "loss": 0.0575, + "step": 460 + }, + { + "epoch": 0.0710990091521065, + "grad_norm": 0.07830873131752014, + "learning_rate": 0.00019812, + "loss": 0.0621, + "step": 470 + }, + { + "epoch": 0.07261175402768323, + "grad_norm": 0.052650969475507736, + "learning_rate": 0.00019808, + "loss": 0.0599, + "step": 480 + }, + { + "epoch": 0.07412449890325996, + "grad_norm": 0.09298545122146606, + "learning_rate": 0.00019804, + "loss": 0.0559, + "step": 490 + }, + { + "epoch": 0.0756372437788367, + "grad_norm": 0.06198689714074135, + "learning_rate": 0.00019800000000000002, + "loss": 0.047, + "step": 500 + }, + { + "epoch": 0.07714998865441343, + "grad_norm": 0.06688915193080902, + "learning_rate": 0.00019796, + "loss": 0.0523, + "step": 510 + }, + { + "epoch": 0.07866273352999016, + "grad_norm": 0.06676903367042542, + "learning_rate": 0.00019792000000000003, + "loss": 0.0509, + "step": 520 + }, + { + "epoch": 0.08017547840556691, + "grad_norm": 0.06219707056879997, + "learning_rate": 0.00019788, + "loss": 0.0553, + "step": 530 + }, + { + "epoch": 0.08168822328114364, + "grad_norm": 0.07905440032482147, + "learning_rate": 0.00019784, + "loss": 0.0506, + "step": 540 + }, + { + "epoch": 0.08320096815672037, + "grad_norm": 0.08591905236244202, + "learning_rate": 0.0001978, + "loss": 0.0603, + "step": 550 + }, + { + "epoch": 0.0847137130322971, + "grad_norm": 0.05921874940395355, + "learning_rate": 0.00019776, + "loss": 0.0562, + "step": 560 + }, + { + "epoch": 0.08622645790787384, + "grad_norm": 0.058868613094091415, + "learning_rate": 0.00019772000000000002, + "loss": 0.0517, + "step": 570 + }, + { + "epoch": 0.08773920278345057, + "grad_norm": 0.06818246096372604, + "learning_rate": 0.00019768, + "loss": 0.0478, + "step": 580 + }, + { + "epoch": 0.0892519476590273, + "grad_norm": 0.07364825904369354, + "learning_rate": 0.00019764, + "loss": 0.0553, + "step": 590 + }, + { + "epoch": 0.09076469253460404, + "grad_norm": 0.07647281885147095, + "learning_rate": 0.0001976, + "loss": 0.0527, + "step": 600 + }, + { + "epoch": 0.09076469253460404, + "eval_cer": 0.282631389088609, + "eval_loss": 0.047340717166662216, + "eval_runtime": 10466.4392, + "eval_samples_per_second": 2.011, + "eval_steps_per_second": 0.251, + "step": 600 + }, + { + "epoch": 0.09227743741018077, + "grad_norm": 0.0819125548005104, + "learning_rate": 0.00019756, + "loss": 0.0509, + "step": 610 + }, + { + "epoch": 0.0937901822857575, + "grad_norm": 0.06566735357046127, + "learning_rate": 0.00019752000000000002, + "loss": 0.0583, + "step": 620 + }, + { + "epoch": 0.09530292716133425, + "grad_norm": 0.06856215745210648, + "learning_rate": 0.00019748, + "loss": 0.0465, + "step": 630 + }, + { + "epoch": 0.09681567203691098, + "grad_norm": 0.06130633130669594, + "learning_rate": 0.00019744, + "loss": 0.0509, + "step": 640 + }, + { + "epoch": 0.09832841691248771, + "grad_norm": 0.08208902925252914, + "learning_rate": 0.0001974, + "loss": 0.0549, + "step": 650 + }, + { + "epoch": 0.09984116178806444, + "grad_norm": 0.08106379210948944, + "learning_rate": 0.00019736000000000002, + "loss": 0.0584, + "step": 660 + }, + { + "epoch": 0.10135390666364118, + "grad_norm": 0.08364614844322205, + "learning_rate": 0.00019732000000000001, + "loss": 0.0543, + "step": 670 + }, + { + "epoch": 0.10286665153921791, + "grad_norm": 0.06432674080133438, + "learning_rate": 0.00019728, + "loss": 0.0535, + "step": 680 + }, + { + "epoch": 0.10437939641479464, + "grad_norm": 0.07217614352703094, + "learning_rate": 0.00019724, + "loss": 0.0521, + "step": 690 + }, + { + "epoch": 0.10589214129037137, + "grad_norm": 0.06074230372905731, + "learning_rate": 0.0001972, + "loss": 0.0545, + "step": 700 + }, + { + "epoch": 0.10740488616594811, + "grad_norm": 0.04888018220663071, + "learning_rate": 0.00019716000000000002, + "loss": 0.0445, + "step": 710 + }, + { + "epoch": 0.10891763104152484, + "grad_norm": 0.07705683261156082, + "learning_rate": 0.00019712, + "loss": 0.0491, + "step": 720 + }, + { + "epoch": 0.11043037591710159, + "grad_norm": 0.06741231679916382, + "learning_rate": 0.00019708000000000003, + "loss": 0.053, + "step": 730 + }, + { + "epoch": 0.11194312079267832, + "grad_norm": 0.0673738569021225, + "learning_rate": 0.00019704, + "loss": 0.0473, + "step": 740 + }, + { + "epoch": 0.11345586566825505, + "grad_norm": 0.06236235797405243, + "learning_rate": 0.00019700000000000002, + "loss": 0.0538, + "step": 750 + }, + { + "epoch": 0.11496861054383178, + "grad_norm": 0.0538531057536602, + "learning_rate": 0.00019696, + "loss": 0.0414, + "step": 760 + }, + { + "epoch": 0.11648135541940852, + "grad_norm": 0.09818791598081589, + "learning_rate": 0.00019692, + "loss": 0.0551, + "step": 770 + }, + { + "epoch": 0.11799410029498525, + "grad_norm": 0.06459952145814896, + "learning_rate": 0.00019688000000000003, + "loss": 0.0543, + "step": 780 + }, + { + "epoch": 0.11950684517056198, + "grad_norm": 0.09495878219604492, + "learning_rate": 0.00019684, + "loss": 0.0566, + "step": 790 + }, + { + "epoch": 0.12101959004613871, + "grad_norm": 0.06249309703707695, + "learning_rate": 0.0001968, + "loss": 0.0492, + "step": 800 + }, + { + "epoch": 0.12101959004613871, + "eval_cer": 0.0030890735373690806, + "eval_loss": 0.046879783272743225, + "eval_runtime": 10443.0859, + "eval_samples_per_second": 2.016, + "eval_steps_per_second": 0.252, + "step": 800 + }, + { + "epoch": 0.12253233492171545, + "grad_norm": 0.06483816355466843, + "learning_rate": 0.00019676, + "loss": 0.048, + "step": 810 + }, + { + "epoch": 0.1240450797972922, + "grad_norm": 0.05618014931678772, + "learning_rate": 0.00019672000000000003, + "loss": 0.0484, + "step": 820 + }, + { + "epoch": 0.1255578246728689, + "grad_norm": 0.07441507279872894, + "learning_rate": 0.00019668000000000002, + "loss": 0.0548, + "step": 830 + }, + { + "epoch": 0.12707056954844564, + "grad_norm": 0.05274181067943573, + "learning_rate": 0.00019664000000000001, + "loss": 0.0619, + "step": 840 + }, + { + "epoch": 0.12858331442402238, + "grad_norm": 0.06264190375804901, + "learning_rate": 0.0001966, + "loss": 0.0525, + "step": 850 + }, + { + "epoch": 0.1300960592995991, + "grad_norm": 0.07662319391965866, + "learning_rate": 0.00019656, + "loss": 0.0532, + "step": 860 + }, + { + "epoch": 0.13160880417517587, + "grad_norm": 0.06203316152095795, + "learning_rate": 0.00019652000000000002, + "loss": 0.0525, + "step": 870 + }, + { + "epoch": 0.1331215490507526, + "grad_norm": 0.1326906681060791, + "learning_rate": 0.00019648000000000002, + "loss": 0.0539, + "step": 880 + }, + { + "epoch": 0.13463429392632933, + "grad_norm": 0.10350421816110611, + "learning_rate": 0.00019644, + "loss": 0.0556, + "step": 890 + }, + { + "epoch": 0.13614703880190607, + "grad_norm": 0.049543242901563644, + "learning_rate": 0.0001964, + "loss": 0.0482, + "step": 900 + }, + { + "epoch": 0.1376597836774828, + "grad_norm": 0.11776097118854523, + "learning_rate": 0.00019636000000000002, + "loss": 0.0538, + "step": 910 + }, + { + "epoch": 0.13917252855305953, + "grad_norm": 0.05535553768277168, + "learning_rate": 0.00019632000000000002, + "loss": 0.052, + "step": 920 + }, + { + "epoch": 0.14068527342863626, + "grad_norm": 0.05945896357297897, + "learning_rate": 0.00019628, + "loss": 0.0491, + "step": 930 + }, + { + "epoch": 0.142198018304213, + "grad_norm": 0.1228972002863884, + "learning_rate": 0.00019624, + "loss": 0.0511, + "step": 940 + }, + { + "epoch": 0.14371076317978973, + "grad_norm": 0.08868791162967682, + "learning_rate": 0.0001962, + "loss": 0.057, + "step": 950 + }, + { + "epoch": 0.14522350805536646, + "grad_norm": 0.07960449159145355, + "learning_rate": 0.00019616000000000002, + "loss": 0.0514, + "step": 960 + }, + { + "epoch": 0.1467362529309432, + "grad_norm": 0.06392108649015427, + "learning_rate": 0.00019612, + "loss": 0.0558, + "step": 970 + }, + { + "epoch": 0.14824899780651993, + "grad_norm": 0.07048727571964264, + "learning_rate": 0.00019608, + "loss": 0.053, + "step": 980 + }, + { + "epoch": 0.14976174268209666, + "grad_norm": 0.10491488873958588, + "learning_rate": 0.00019604, + "loss": 0.0489, + "step": 990 + }, + { + "epoch": 0.1512744875576734, + "grad_norm": 0.059835776686668396, + "learning_rate": 0.000196, + "loss": 0.0474, + "step": 1000 + }, + { + "epoch": 0.1512744875576734, + "eval_cer": 0.4367181574025345, + "eval_loss": 0.04569260776042938, + "eval_runtime": 10457.5718, + "eval_samples_per_second": 2.013, + "eval_steps_per_second": 0.252, + "step": 1000 + }, + { + "epoch": 0.15278723243325013, + "grad_norm": 0.07570289075374603, + "learning_rate": 0.00019596000000000001, + "loss": 0.0522, + "step": 1010 + }, + { + "epoch": 0.15429997730882686, + "grad_norm": 0.09082864969968796, + "learning_rate": 0.00019592, + "loss": 0.0516, + "step": 1020 + }, + { + "epoch": 0.1558127221844036, + "grad_norm": 0.06894449889659882, + "learning_rate": 0.00019588000000000003, + "loss": 0.0489, + "step": 1030 + }, + { + "epoch": 0.15732546705998032, + "grad_norm": 0.05989064276218414, + "learning_rate": 0.00019584, + "loss": 0.0514, + "step": 1040 + }, + { + "epoch": 0.15883821193555706, + "grad_norm": 0.060047443956136703, + "learning_rate": 0.00019580000000000002, + "loss": 0.047, + "step": 1050 + }, + { + "epoch": 0.16035095681113382, + "grad_norm": 0.06459174305200577, + "learning_rate": 0.00019576, + "loss": 0.0532, + "step": 1060 + }, + { + "epoch": 0.16186370168671055, + "grad_norm": 0.061583805829286575, + "learning_rate": 0.00019572, + "loss": 0.0485, + "step": 1070 + }, + { + "epoch": 0.16337644656228728, + "grad_norm": 0.060534268617630005, + "learning_rate": 0.00019568000000000002, + "loss": 0.0468, + "step": 1080 + }, + { + "epoch": 0.164889191437864, + "grad_norm": 0.06731607764959335, + "learning_rate": 0.00019564, + "loss": 0.0481, + "step": 1090 + }, + { + "epoch": 0.16640193631344075, + "grad_norm": 0.0757998675107956, + "learning_rate": 0.0001956, + "loss": 0.056, + "step": 1100 + }, + { + "epoch": 0.16791468118901748, + "grad_norm": 0.08009450882673264, + "learning_rate": 0.00019556, + "loss": 0.0523, + "step": 1110 + }, + { + "epoch": 0.1694274260645942, + "grad_norm": 2.663090944290161, + "learning_rate": 0.00019552000000000003, + "loss": 0.1404, + "step": 1120 + }, + { + "epoch": 0.17094017094017094, + "grad_norm": 14.877944946289062, + "learning_rate": 0.00019548000000000002, + "loss": 0.1442, + "step": 1130 + }, + { + "epoch": 0.17245291581574768, + "grad_norm": 2.8173887729644775, + "learning_rate": 0.000195448, + "loss": 0.5461, + "step": 1140 + }, + { + "epoch": 0.1739656606913244, + "grad_norm": 9.367515563964844, + "learning_rate": 0.00019540800000000002, + "loss": 0.2832, + "step": 1150 + }, + { + "epoch": 0.17547840556690114, + "grad_norm": 0.34991636872291565, + "learning_rate": 0.00019536800000000002, + "loss": 0.1497, + "step": 1160 + }, + { + "epoch": 0.17699115044247787, + "grad_norm": 0.10464385151863098, + "learning_rate": 0.000195328, + "loss": 0.0686, + "step": 1170 + }, + { + "epoch": 0.1785038953180546, + "grad_norm": 0.8961012363433838, + "learning_rate": 0.000195288, + "loss": 0.0822, + "step": 1180 + }, + { + "epoch": 0.18001664019363134, + "grad_norm": 8.467473983764648, + "learning_rate": 0.000195248, + "loss": 0.0949, + "step": 1190 + }, + { + "epoch": 0.18152938506920807, + "grad_norm": 0.08059060573577881, + "learning_rate": 0.00019520800000000002, + "loss": 0.0552, + "step": 1200 + }, + { + "epoch": 0.18152938506920807, + "eval_cer": 0.0833932493767496, + "eval_loss": 0.04637393727898598, + "eval_runtime": 10459.5021, + "eval_samples_per_second": 2.013, + "eval_steps_per_second": 0.252, + "step": 1200 + }, + { + "epoch": 0.1830421299447848, + "grad_norm": 0.08795847743749619, + "learning_rate": 0.000195168, + "loss": 0.055, + "step": 1210 + }, + { + "epoch": 0.18455487482036154, + "grad_norm": 0.10272721946239471, + "learning_rate": 0.000195128, + "loss": 0.0557, + "step": 1220 + }, + { + "epoch": 0.18606761969593827, + "grad_norm": 0.23404774069786072, + "learning_rate": 0.000195088, + "loss": 0.0611, + "step": 1230 + }, + { + "epoch": 0.187580364571515, + "grad_norm": 0.2968621253967285, + "learning_rate": 0.00019504800000000002, + "loss": 0.0817, + "step": 1240 + }, + { + "epoch": 0.18909310944709176, + "grad_norm": 0.08634278923273087, + "learning_rate": 0.00019500800000000001, + "loss": 0.0685, + "step": 1250 + }, + { + "epoch": 0.1906058543226685, + "grad_norm": 0.11241244524717331, + "learning_rate": 0.000194968, + "loss": 0.0563, + "step": 1260 + }, + { + "epoch": 0.19211859919824523, + "grad_norm": 0.17380298674106598, + "learning_rate": 0.000194928, + "loss": 0.065, + "step": 1270 + }, + { + "epoch": 0.19363134407382196, + "grad_norm": 0.13615791499614716, + "learning_rate": 0.000194888, + "loss": 0.0667, + "step": 1280 + }, + { + "epoch": 0.1951440889493987, + "grad_norm": 0.0854301005601883, + "learning_rate": 0.00019484800000000002, + "loss": 0.0507, + "step": 1290 + }, + { + "epoch": 0.19665683382497542, + "grad_norm": 0.08915933966636658, + "learning_rate": 0.000194808, + "loss": 0.0561, + "step": 1300 + }, + { + "epoch": 0.19816957870055216, + "grad_norm": 0.09583040326833725, + "learning_rate": 0.00019476800000000003, + "loss": 0.0514, + "step": 1310 + }, + { + "epoch": 0.1996823235761289, + "grad_norm": 0.09624961763620377, + "learning_rate": 0.000194728, + "loss": 0.052, + "step": 1320 + }, + { + "epoch": 0.20119506845170562, + "grad_norm": 0.05612370744347572, + "learning_rate": 0.00019468800000000002, + "loss": 0.0471, + "step": 1330 + }, + { + "epoch": 0.20270781332728235, + "grad_norm": 0.0653730109333992, + "learning_rate": 0.000194648, + "loss": 0.0521, + "step": 1340 + }, + { + "epoch": 0.2042205582028591, + "grad_norm": 0.07432978600263596, + "learning_rate": 0.000194608, + "loss": 0.0577, + "step": 1350 + }, + { + "epoch": 0.20573330307843582, + "grad_norm": 0.05863150209188461, + "learning_rate": 0.00019456800000000003, + "loss": 0.0435, + "step": 1360 + }, + { + "epoch": 0.20724604795401255, + "grad_norm": 0.056969739496707916, + "learning_rate": 0.000194528, + "loss": 0.0502, + "step": 1370 + }, + { + "epoch": 0.20875879282958928, + "grad_norm": 0.10658754408359528, + "learning_rate": 0.000194488, + "loss": 0.0469, + "step": 1380 + }, + { + "epoch": 0.21027153770516602, + "grad_norm": 0.06535681337118149, + "learning_rate": 0.000194448, + "loss": 0.0519, + "step": 1390 + }, + { + "epoch": 0.21178428258074275, + "grad_norm": 0.08987314254045486, + "learning_rate": 0.000194408, + "loss": 0.0482, + "step": 1400 + }, + { + "epoch": 0.21178428258074275, + "eval_cer": 0.14607469615771385, + "eval_loss": 0.04351452365517616, + "eval_runtime": 10473.9712, + "eval_samples_per_second": 2.01, + "eval_steps_per_second": 0.251, + "step": 1400 + }, + { + "epoch": 0.21329702745631948, + "grad_norm": 0.09238473325967789, + "learning_rate": 0.00019436800000000002, + "loss": 0.0483, + "step": 1410 + }, + { + "epoch": 0.21480977233189621, + "grad_norm": 0.10443761199712753, + "learning_rate": 0.000194328, + "loss": 0.054, + "step": 1420 + }, + { + "epoch": 0.21632251720747295, + "grad_norm": 0.0742131844162941, + "learning_rate": 0.000194288, + "loss": 0.0507, + "step": 1430 + }, + { + "epoch": 0.21783526208304968, + "grad_norm": 0.09358492493629456, + "learning_rate": 0.000194248, + "loss": 0.0496, + "step": 1440 + }, + { + "epoch": 0.21934800695862644, + "grad_norm": 0.07695715129375458, + "learning_rate": 0.00019420800000000002, + "loss": 0.046, + "step": 1450 + }, + { + "epoch": 0.22086075183420317, + "grad_norm": 0.07772234827280045, + "learning_rate": 0.00019416800000000002, + "loss": 0.0468, + "step": 1460 + }, + { + "epoch": 0.2223734967097799, + "grad_norm": 0.04500894993543625, + "learning_rate": 0.000194128, + "loss": 0.0428, + "step": 1470 + }, + { + "epoch": 0.22388624158535664, + "grad_norm": 0.08258084207773209, + "learning_rate": 0.000194088, + "loss": 0.0542, + "step": 1480 + }, + { + "epoch": 0.22539898646093337, + "grad_norm": 0.06530752032995224, + "learning_rate": 0.000194048, + "loss": 0.0477, + "step": 1490 + }, + { + "epoch": 0.2269117313365101, + "grad_norm": 0.06770725548267365, + "learning_rate": 0.00019400800000000002, + "loss": 0.052, + "step": 1500 + }, + { + "epoch": 0.22842447621208684, + "grad_norm": 0.04499737173318863, + "learning_rate": 0.000193968, + "loss": 0.0392, + "step": 1510 + }, + { + "epoch": 0.22993722108766357, + "grad_norm": 0.0594199039041996, + "learning_rate": 0.000193928, + "loss": 0.0469, + "step": 1520 + }, + { + "epoch": 0.2314499659632403, + "grad_norm": 0.05143499746918678, + "learning_rate": 0.000193888, + "loss": 0.0384, + "step": 1530 + }, + { + "epoch": 0.23296271083881703, + "grad_norm": 0.05464276298880577, + "learning_rate": 0.00019384800000000002, + "loss": 0.0479, + "step": 1540 + }, + { + "epoch": 0.23447545571439377, + "grad_norm": 0.0698809027671814, + "learning_rate": 0.000193808, + "loss": 0.0493, + "step": 1550 + }, + { + "epoch": 0.2359882005899705, + "grad_norm": 0.059237249195575714, + "learning_rate": 0.000193768, + "loss": 0.0493, + "step": 1560 + }, + { + "epoch": 0.23750094546554723, + "grad_norm": 0.08654357492923737, + "learning_rate": 0.000193728, + "loss": 0.0481, + "step": 1570 + }, + { + "epoch": 0.23901369034112396, + "grad_norm": 0.19063305854797363, + "learning_rate": 0.000193688, + "loss": 0.051, + "step": 1580 + }, + { + "epoch": 0.2405264352167007, + "grad_norm": 0.08095410466194153, + "learning_rate": 0.000193648, + "loss": 0.0447, + "step": 1590 + }, + { + "epoch": 0.24203918009227743, + "grad_norm": 0.056007932871580124, + "learning_rate": 0.000193608, + "loss": 0.0431, + "step": 1600 + }, + { + "epoch": 0.24203918009227743, + "eval_cer": 0.1667197881072213, + "eval_loss": 0.04373455420136452, + "eval_runtime": 10595.1515, + "eval_samples_per_second": 1.987, + "eval_steps_per_second": 0.248, + "step": 1600 + }, + { + "epoch": 0.24355192496785416, + "grad_norm": 0.06981740891933441, + "learning_rate": 0.00019356800000000003, + "loss": 0.0442, + "step": 1610 + }, + { + "epoch": 0.2450646698434309, + "grad_norm": 0.10189545899629593, + "learning_rate": 0.000193528, + "loss": 0.0477, + "step": 1620 + }, + { + "epoch": 0.24657741471900763, + "grad_norm": 0.06565351039171219, + "learning_rate": 0.00019348800000000002, + "loss": 0.0532, + "step": 1630 + }, + { + "epoch": 0.2480901595945844, + "grad_norm": 0.06872796267271042, + "learning_rate": 0.000193448, + "loss": 0.0472, + "step": 1640 + }, + { + "epoch": 0.24960290447016112, + "grad_norm": 0.06040889397263527, + "learning_rate": 0.000193408, + "loss": 0.0463, + "step": 1650 + }, + { + "epoch": 0.2511156493457378, + "grad_norm": 0.08789139986038208, + "learning_rate": 0.00019336800000000002, + "loss": 0.0495, + "step": 1660 + }, + { + "epoch": 0.25262839422131456, + "grad_norm": 0.0869157686829567, + "learning_rate": 0.00019332800000000002, + "loss": 0.0491, + "step": 1670 + }, + { + "epoch": 0.2541411390968913, + "grad_norm": 0.06886725127696991, + "learning_rate": 0.000193288, + "loss": 0.0508, + "step": 1680 + }, + { + "epoch": 0.255653883972468, + "grad_norm": 0.06138046458363533, + "learning_rate": 0.000193248, + "loss": 0.0435, + "step": 1690 + }, + { + "epoch": 0.25716662884804475, + "grad_norm": 0.05554139241576195, + "learning_rate": 0.00019320800000000002, + "loss": 0.0483, + "step": 1700 + }, + { + "epoch": 0.2586793737236215, + "grad_norm": 0.06712419539690018, + "learning_rate": 0.00019316800000000002, + "loss": 0.0545, + "step": 1710 + }, + { + "epoch": 0.2601921185991982, + "grad_norm": 0.07289120554924011, + "learning_rate": 0.000193128, + "loss": 0.0481, + "step": 1720 + }, + { + "epoch": 0.261704863474775, + "grad_norm": 0.07003842294216156, + "learning_rate": 0.000193088, + "loss": 0.0493, + "step": 1730 + }, + { + "epoch": 0.26321760835035174, + "grad_norm": 0.06333723664283752, + "learning_rate": 0.000193048, + "loss": 0.0536, + "step": 1740 + }, + { + "epoch": 0.26473035322592847, + "grad_norm": 0.0609460324048996, + "learning_rate": 0.00019300800000000002, + "loss": 0.0516, + "step": 1750 + }, + { + "epoch": 0.2662430981015052, + "grad_norm": 0.14176234602928162, + "learning_rate": 0.000192968, + "loss": 0.0522, + "step": 1760 + }, + { + "epoch": 0.26775584297708194, + "grad_norm": 0.09526730328798294, + "learning_rate": 0.000192928, + "loss": 0.0468, + "step": 1770 + }, + { + "epoch": 0.26926858785265867, + "grad_norm": 0.05794398859143257, + "learning_rate": 0.000192888, + "loss": 0.051, + "step": 1780 + }, + { + "epoch": 0.2707813327282354, + "grad_norm": 0.07408788055181503, + "learning_rate": 0.000192848, + "loss": 0.0482, + "step": 1790 + }, + { + "epoch": 0.27229407760381213, + "grad_norm": 0.07873456180095673, + "learning_rate": 0.00019280800000000001, + "loss": 0.0576, + "step": 1800 + }, + { + "epoch": 0.27229407760381213, + "eval_cer": 0.28151275038111545, + "eval_loss": 0.042666129767894745, + "eval_runtime": 10460.0372, + "eval_samples_per_second": 2.013, + "eval_steps_per_second": 0.252, + "step": 1800 + }, + { + "epoch": 0.27380682247938887, + "grad_norm": 0.06786733120679855, + "learning_rate": 0.000192768, + "loss": 0.0505, + "step": 1810 + }, + { + "epoch": 0.2753195673549656, + "grad_norm": 0.090096116065979, + "learning_rate": 0.00019272800000000003, + "loss": 0.0458, + "step": 1820 + }, + { + "epoch": 0.27683231223054233, + "grad_norm": 0.058033574372529984, + "learning_rate": 0.000192688, + "loss": 0.0415, + "step": 1830 + }, + { + "epoch": 0.27834505710611906, + "grad_norm": 0.09522871673107147, + "learning_rate": 0.00019264800000000002, + "loss": 0.0456, + "step": 1840 + }, + { + "epoch": 0.2798578019816958, + "grad_norm": 0.06533698737621307, + "learning_rate": 0.000192608, + "loss": 0.045, + "step": 1850 + }, + { + "epoch": 0.28137054685727253, + "grad_norm": 0.07162319868803024, + "learning_rate": 0.000192568, + "loss": 0.0511, + "step": 1860 + }, + { + "epoch": 0.28288329173284926, + "grad_norm": 0.06015852093696594, + "learning_rate": 0.00019252800000000002, + "loss": 0.0453, + "step": 1870 + }, + { + "epoch": 0.284396036608426, + "grad_norm": 0.0789792612195015, + "learning_rate": 0.000192488, + "loss": 0.0498, + "step": 1880 + }, + { + "epoch": 0.2859087814840027, + "grad_norm": 0.05619093030691147, + "learning_rate": 0.000192448, + "loss": 0.0454, + "step": 1890 + }, + { + "epoch": 0.28742152635957946, + "grad_norm": 0.061943668872117996, + "learning_rate": 0.000192408, + "loss": 0.0496, + "step": 1900 + }, + { + "epoch": 0.2889342712351562, + "grad_norm": 0.07192958891391754, + "learning_rate": 0.00019236800000000003, + "loss": 0.05, + "step": 1910 + }, + { + "epoch": 0.2904470161107329, + "grad_norm": 0.07053862512111664, + "learning_rate": 0.00019232800000000002, + "loss": 0.0504, + "step": 1920 + }, + { + "epoch": 0.29195976098630966, + "grad_norm": 0.06491555273532867, + "learning_rate": 0.000192288, + "loss": 0.0478, + "step": 1930 + }, + { + "epoch": 0.2934725058618864, + "grad_norm": 0.06389233469963074, + "learning_rate": 0.000192248, + "loss": 0.0469, + "step": 1940 + }, + { + "epoch": 0.2949852507374631, + "grad_norm": 0.06336333602666855, + "learning_rate": 0.000192208, + "loss": 0.0472, + "step": 1950 + }, + { + "epoch": 0.29649799561303986, + "grad_norm": 0.06351201981306076, + "learning_rate": 0.00019216800000000002, + "loss": 0.0459, + "step": 1960 + }, + { + "epoch": 0.2980107404886166, + "grad_norm": 0.0773550271987915, + "learning_rate": 0.00019212800000000001, + "loss": 0.0435, + "step": 1970 + }, + { + "epoch": 0.2995234853641933, + "grad_norm": 0.07999245822429657, + "learning_rate": 0.000192088, + "loss": 0.051, + "step": 1980 + }, + { + "epoch": 0.30103623023977005, + "grad_norm": 0.05664638802409172, + "learning_rate": 0.000192048, + "loss": 0.0493, + "step": 1990 + }, + { + "epoch": 0.3025489751153468, + "grad_norm": 0.050149012356996536, + "learning_rate": 0.00019200800000000002, + "loss": 0.0491, + "step": 2000 + }, + { + "epoch": 0.3025489751153468, + "eval_cer": 0.10787543886957575, + "eval_loss": 0.042158011347055435, + "eval_runtime": 10458.1763, + "eval_samples_per_second": 2.013, + "eval_steps_per_second": 0.252, + "step": 2000 + }, + { + "epoch": 0.3040617199909235, + "grad_norm": 0.06383787840604782, + "learning_rate": 0.00019196800000000002, + "loss": 0.0421, + "step": 2010 + }, + { + "epoch": 0.30557446486650025, + "grad_norm": 0.05740641430020332, + "learning_rate": 0.000191928, + "loss": 0.0499, + "step": 2020 + }, + { + "epoch": 0.307087209742077, + "grad_norm": 0.07163075357675552, + "learning_rate": 0.000191888, + "loss": 0.0431, + "step": 2030 + }, + { + "epoch": 0.3085999546176537, + "grad_norm": 0.05976075306534767, + "learning_rate": 0.000191848, + "loss": 0.0476, + "step": 2040 + }, + { + "epoch": 0.31011269949323045, + "grad_norm": 0.0871894434094429, + "learning_rate": 0.00019180800000000002, + "loss": 0.0449, + "step": 2050 + }, + { + "epoch": 0.3116254443688072, + "grad_norm": 0.07474277913570404, + "learning_rate": 0.000191768, + "loss": 0.0422, + "step": 2060 + }, + { + "epoch": 0.3131381892443839, + "grad_norm": 0.05594407767057419, + "learning_rate": 0.00019172800000000003, + "loss": 0.0479, + "step": 2070 + }, + { + "epoch": 0.31465093411996065, + "grad_norm": 0.06565164029598236, + "learning_rate": 0.000191688, + "loss": 0.0501, + "step": 2080 + }, + { + "epoch": 0.3161636789955374, + "grad_norm": 0.07224603742361069, + "learning_rate": 0.000191648, + "loss": 0.0474, + "step": 2090 + }, + { + "epoch": 0.3176764238711141, + "grad_norm": 0.07781083881855011, + "learning_rate": 0.000191608, + "loss": 0.0401, + "step": 2100 + }, + { + "epoch": 0.31918916874669084, + "grad_norm": 0.08147955685853958, + "learning_rate": 0.000191568, + "loss": 0.0486, + "step": 2110 + }, + { + "epoch": 0.32070191362226763, + "grad_norm": 0.05572337657213211, + "learning_rate": 0.00019152800000000003, + "loss": 0.0488, + "step": 2120 + }, + { + "epoch": 0.32221465849784436, + "grad_norm": 0.06601813435554504, + "learning_rate": 0.000191488, + "loss": 0.0466, + "step": 2130 + }, + { + "epoch": 0.3237274033734211, + "grad_norm": 0.057904861867427826, + "learning_rate": 0.00019144800000000001, + "loss": 0.0479, + "step": 2140 + }, + { + "epoch": 0.32524014824899783, + "grad_norm": 0.057231709361076355, + "learning_rate": 0.000191408, + "loss": 0.0522, + "step": 2150 + }, + { + "epoch": 0.32675289312457456, + "grad_norm": 0.08306867629289627, + "learning_rate": 0.000191368, + "loss": 0.0439, + "step": 2160 + }, + { + "epoch": 0.3282656380001513, + "grad_norm": 0.0742512047290802, + "learning_rate": 0.00019132800000000002, + "loss": 0.0434, + "step": 2170 + }, + { + "epoch": 0.329778382875728, + "grad_norm": 0.07260335236787796, + "learning_rate": 0.000191288, + "loss": 0.0505, + "step": 2180 + }, + { + "epoch": 0.33129112775130476, + "grad_norm": 0.07398936152458191, + "learning_rate": 0.000191248, + "loss": 0.0519, + "step": 2190 + }, + { + "epoch": 0.3328038726268815, + "grad_norm": 0.069728784263134, + "learning_rate": 0.000191208, + "loss": 0.0501, + "step": 2200 + }, + { + "epoch": 0.3328038726268815, + "eval_cer": 0.07287520414693144, + "eval_loss": 0.041937489062547684, + "eval_runtime": 10449.7877, + "eval_samples_per_second": 2.015, + "eval_steps_per_second": 0.252, + "step": 2200 + }, + { + "epoch": 0.3343166175024582, + "grad_norm": 0.07778773456811905, + "learning_rate": 0.00019116800000000002, + "loss": 0.0485, + "step": 2210 + }, + { + "epoch": 0.33582936237803496, + "grad_norm": 0.08489017933607101, + "learning_rate": 0.00019112800000000002, + "loss": 0.047, + "step": 2220 + }, + { + "epoch": 0.3373421072536117, + "grad_norm": 0.0746629610657692, + "learning_rate": 0.000191088, + "loss": 0.0444, + "step": 2230 + }, + { + "epoch": 0.3388548521291884, + "grad_norm": 0.07858649641275406, + "learning_rate": 0.000191048, + "loss": 0.0537, + "step": 2240 + }, + { + "epoch": 0.34036759700476515, + "grad_norm": 0.08357574045658112, + "learning_rate": 0.000191008, + "loss": 0.054, + "step": 2250 + }, + { + "epoch": 0.3418803418803419, + "grad_norm": 0.05976574867963791, + "learning_rate": 0.00019096800000000002, + "loss": 0.0465, + "step": 2260 + }, + { + "epoch": 0.3433930867559186, + "grad_norm": 0.07549616694450378, + "learning_rate": 0.000190928, + "loss": 0.0479, + "step": 2270 + }, + { + "epoch": 0.34490583163149535, + "grad_norm": 0.07128783315420151, + "learning_rate": 0.000190888, + "loss": 0.0481, + "step": 2280 + }, + { + "epoch": 0.3464185765070721, + "grad_norm": 0.05093182995915413, + "learning_rate": 0.000190848, + "loss": 0.039, + "step": 2290 + }, + { + "epoch": 0.3479313213826488, + "grad_norm": 0.07213055342435837, + "learning_rate": 0.00019080800000000002, + "loss": 0.0486, + "step": 2300 + }, + { + "epoch": 0.34944406625822555, + "grad_norm": 0.08296896517276764, + "learning_rate": 0.00019076800000000001, + "loss": 0.0436, + "step": 2310 + }, + { + "epoch": 0.3509568111338023, + "grad_norm": 0.05904708430171013, + "learning_rate": 0.000190728, + "loss": 0.0457, + "step": 2320 + }, + { + "epoch": 0.352469556009379, + "grad_norm": 0.07709085941314697, + "learning_rate": 0.000190688, + "loss": 0.0456, + "step": 2330 + }, + { + "epoch": 0.35398230088495575, + "grad_norm": 0.061139535158872604, + "learning_rate": 0.000190648, + "loss": 0.0484, + "step": 2340 + }, + { + "epoch": 0.3554950457605325, + "grad_norm": 0.11013538390398026, + "learning_rate": 0.00019060800000000002, + "loss": 0.0463, + "step": 2350 + }, + { + "epoch": 0.3570077906361092, + "grad_norm": 0.04920123890042305, + "learning_rate": 0.000190568, + "loss": 0.0404, + "step": 2360 + }, + { + "epoch": 0.35852053551168594, + "grad_norm": 0.05916327238082886, + "learning_rate": 0.00019052800000000003, + "loss": 0.0506, + "step": 2370 + }, + { + "epoch": 0.3600332803872627, + "grad_norm": 0.08169171214103699, + "learning_rate": 0.000190488, + "loss": 0.0422, + "step": 2380 + }, + { + "epoch": 0.3615460252628394, + "grad_norm": 0.07195686548948288, + "learning_rate": 0.00019044800000000002, + "loss": 0.0476, + "step": 2390 + }, + { + "epoch": 0.36305877013841614, + "grad_norm": 0.06132512912154198, + "learning_rate": 0.000190408, + "loss": 0.0451, + "step": 2400 + }, + { + "epoch": 0.36305877013841614, + "eval_cer": 0.22885396051223894, + "eval_loss": 0.04164993762969971, + "eval_runtime": 10444.7845, + "eval_samples_per_second": 2.016, + "eval_steps_per_second": 0.252, + "step": 2400 + }, + { + "epoch": 0.3645715150139929, + "grad_norm": 0.06889329850673676, + "learning_rate": 0.000190368, + "loss": 0.0536, + "step": 2410 + }, + { + "epoch": 0.3660842598895696, + "grad_norm": 0.06513672322034836, + "learning_rate": 0.00019032800000000002, + "loss": 0.0472, + "step": 2420 + }, + { + "epoch": 0.36759700476514634, + "grad_norm": 0.06588304787874222, + "learning_rate": 0.000190288, + "loss": 0.046, + "step": 2430 + }, + { + "epoch": 0.3691097496407231, + "grad_norm": 0.07162468135356903, + "learning_rate": 0.000190248, + "loss": 0.0444, + "step": 2440 + }, + { + "epoch": 0.3706224945162998, + "grad_norm": 0.05831474810838699, + "learning_rate": 0.000190208, + "loss": 0.0448, + "step": 2450 + }, + { + "epoch": 0.37213523939187654, + "grad_norm": 0.11214031279087067, + "learning_rate": 0.000190168, + "loss": 0.0491, + "step": 2460 + }, + { + "epoch": 0.37364798426745327, + "grad_norm": 0.07672178000211716, + "learning_rate": 0.00019012800000000002, + "loss": 0.0489, + "step": 2470 + }, + { + "epoch": 0.37516072914303, + "grad_norm": 0.07850979268550873, + "learning_rate": 0.000190088, + "loss": 0.047, + "step": 2480 + }, + { + "epoch": 0.37667347401860674, + "grad_norm": 0.0473526194691658, + "learning_rate": 0.000190048, + "loss": 0.0436, + "step": 2490 + }, + { + "epoch": 0.3781862188941835, + "grad_norm": 0.08313214778900146, + "learning_rate": 0.000190008, + "loss": 0.0457, + "step": 2500 + }, + { + "epoch": 0.37969896376976026, + "grad_norm": 0.07851678878068924, + "learning_rate": 0.00018996800000000002, + "loss": 0.0399, + "step": 2510 + }, + { + "epoch": 0.381211708645337, + "grad_norm": 0.06067463755607605, + "learning_rate": 0.00018992800000000002, + "loss": 0.0406, + "step": 2520 + }, + { + "epoch": 0.3827244535209137, + "grad_norm": 0.07291869819164276, + "learning_rate": 0.000189888, + "loss": 0.0411, + "step": 2530 + }, + { + "epoch": 0.38423719839649045, + "grad_norm": 0.05576318874955177, + "learning_rate": 0.000189848, + "loss": 0.0412, + "step": 2540 + }, + { + "epoch": 0.3857499432720672, + "grad_norm": 0.05669853091239929, + "learning_rate": 0.000189808, + "loss": 0.0462, + "step": 2550 + }, + { + "epoch": 0.3872626881476439, + "grad_norm": 0.0653596743941307, + "learning_rate": 0.00018976800000000002, + "loss": 0.0504, + "step": 2560 + }, + { + "epoch": 0.38877543302322065, + "grad_norm": 0.07938168197870255, + "learning_rate": 0.000189728, + "loss": 0.0423, + "step": 2570 + }, + { + "epoch": 0.3902881778987974, + "grad_norm": 0.19600598514080048, + "learning_rate": 0.000189688, + "loss": 0.0422, + "step": 2580 + }, + { + "epoch": 0.3918009227743741, + "grad_norm": 0.08753781765699387, + "learning_rate": 0.000189648, + "loss": 0.0485, + "step": 2590 + }, + { + "epoch": 0.39331366764995085, + "grad_norm": 0.07059615105390549, + "learning_rate": 0.00018960800000000002, + "loss": 0.0441, + "step": 2600 + }, + { + "epoch": 0.39331366764995085, + "eval_cer": 0.12797016798729038, + "eval_loss": 0.040877681225538254, + "eval_runtime": 10426.9488, + "eval_samples_per_second": 2.019, + "eval_steps_per_second": 0.252, + "step": 2600 + }, + { + "epoch": 0.3948264125255276, + "grad_norm": 0.07426866888999939, + "learning_rate": 0.000189568, + "loss": 0.0456, + "step": 2610 + }, + { + "epoch": 0.3963391574011043, + "grad_norm": 0.05869770795106888, + "learning_rate": 0.000189528, + "loss": 0.047, + "step": 2620 + }, + { + "epoch": 0.39785190227668105, + "grad_norm": 0.09353045374155045, + "learning_rate": 0.000189488, + "loss": 0.0457, + "step": 2630 + }, + { + "epoch": 0.3993646471522578, + "grad_norm": 0.083396315574646, + "learning_rate": 0.000189448, + "loss": 0.0441, + "step": 2640 + }, + { + "epoch": 0.4008773920278345, + "grad_norm": 0.0698527917265892, + "learning_rate": 0.000189408, + "loss": 0.0469, + "step": 2650 + }, + { + "epoch": 0.40239013690341124, + "grad_norm": 0.07554033398628235, + "learning_rate": 0.000189368, + "loss": 0.0523, + "step": 2660 + }, + { + "epoch": 0.403902881778988, + "grad_norm": 0.08026187121868134, + "learning_rate": 0.00018932800000000003, + "loss": 0.0492, + "step": 2670 + }, + { + "epoch": 0.4054156266545647, + "grad_norm": 0.0758117213845253, + "learning_rate": 0.000189288, + "loss": 0.0471, + "step": 2680 + }, + { + "epoch": 0.40692837153014144, + "grad_norm": 0.0716470330953598, + "learning_rate": 0.00018924800000000001, + "loss": 0.0401, + "step": 2690 + }, + { + "epoch": 0.4084411164057182, + "grad_norm": 0.07114976644515991, + "learning_rate": 0.000189208, + "loss": 0.0483, + "step": 2700 + }, + { + "epoch": 0.4099538612812949, + "grad_norm": 0.059242133051157, + "learning_rate": 0.000189168, + "loss": 0.0416, + "step": 2710 + }, + { + "epoch": 0.41146660615687164, + "grad_norm": 0.07214327901601791, + "learning_rate": 0.00018912800000000002, + "loss": 0.0446, + "step": 2720 + }, + { + "epoch": 0.41297935103244837, + "grad_norm": 0.0404672808945179, + "learning_rate": 0.000189088, + "loss": 0.0445, + "step": 2730 + }, + { + "epoch": 0.4144920959080251, + "grad_norm": 0.06663410365581512, + "learning_rate": 0.000189048, + "loss": 0.0435, + "step": 2740 + }, + { + "epoch": 0.41600484078360184, + "grad_norm": 0.0690486952662468, + "learning_rate": 0.000189008, + "loss": 0.048, + "step": 2750 + }, + { + "epoch": 0.41751758565917857, + "grad_norm": 0.07034830003976822, + "learning_rate": 0.00018896800000000002, + "loss": 0.0423, + "step": 2760 + }, + { + "epoch": 0.4190303305347553, + "grad_norm": 0.08420894294977188, + "learning_rate": 0.00018892800000000002, + "loss": 0.0525, + "step": 2770 + }, + { + "epoch": 0.42054307541033203, + "grad_norm": 0.07617480307817459, + "learning_rate": 0.000188888, + "loss": 0.0492, + "step": 2780 + }, + { + "epoch": 0.42205582028590877, + "grad_norm": 0.06841789186000824, + "learning_rate": 0.000188848, + "loss": 0.0427, + "step": 2790 + }, + { + "epoch": 0.4235685651614855, + "grad_norm": 0.07013357430696487, + "learning_rate": 0.000188808, + "loss": 0.04, + "step": 2800 + }, + { + "epoch": 0.4235685651614855, + "eval_cer": 0.26005539454405746, + "eval_loss": 0.04089580848813057, + "eval_runtime": 10530.3682, + "eval_samples_per_second": 1.999, + "eval_steps_per_second": 0.25, + "step": 2800 + }, + { + "epoch": 0.42508131003706223, + "grad_norm": 0.06432001292705536, + "learning_rate": 0.00018876800000000002, + "loss": 0.0402, + "step": 2810 + }, + { + "epoch": 0.42659405491263896, + "grad_norm": 0.06437406688928604, + "learning_rate": 0.000188728, + "loss": 0.0397, + "step": 2820 + }, + { + "epoch": 0.4281067997882157, + "grad_norm": 0.0579422190785408, + "learning_rate": 0.000188688, + "loss": 0.0431, + "step": 2830 + }, + { + "epoch": 0.42961954466379243, + "grad_norm": 0.0628400593996048, + "learning_rate": 0.000188648, + "loss": 0.0426, + "step": 2840 + }, + { + "epoch": 0.43113228953936916, + "grad_norm": 0.04976367950439453, + "learning_rate": 0.000188608, + "loss": 0.0448, + "step": 2850 + }, + { + "epoch": 0.4326450344149459, + "grad_norm": 0.07479149103164673, + "learning_rate": 0.00018856800000000001, + "loss": 0.0458, + "step": 2860 + }, + { + "epoch": 0.4341577792905226, + "grad_norm": 0.06853318214416504, + "learning_rate": 0.000188528, + "loss": 0.045, + "step": 2870 + }, + { + "epoch": 0.43567052416609936, + "grad_norm": 0.08534535765647888, + "learning_rate": 0.00018848800000000003, + "loss": 0.044, + "step": 2880 + }, + { + "epoch": 0.43718326904167615, + "grad_norm": 0.05148012563586235, + "learning_rate": 0.000188448, + "loss": 0.0448, + "step": 2890 + }, + { + "epoch": 0.4386960139172529, + "grad_norm": 0.073714479804039, + "learning_rate": 0.00018840800000000002, + "loss": 0.0388, + "step": 2900 + }, + { + "epoch": 0.4402087587928296, + "grad_norm": 0.06875050067901611, + "learning_rate": 0.000188368, + "loss": 0.0476, + "step": 2910 + }, + { + "epoch": 0.44172150366840635, + "grad_norm": 0.07048488408327103, + "learning_rate": 0.000188328, + "loss": 0.0537, + "step": 2920 + }, + { + "epoch": 0.4432342485439831, + "grad_norm": 0.06159156188368797, + "learning_rate": 0.00018828800000000002, + "loss": 0.0523, + "step": 2930 + }, + { + "epoch": 0.4447469934195598, + "grad_norm": 0.0851297378540039, + "learning_rate": 0.000188248, + "loss": 0.0466, + "step": 2940 + }, + { + "epoch": 0.44625973829513654, + "grad_norm": 0.07920840382575989, + "learning_rate": 0.000188208, + "loss": 0.0434, + "step": 2950 + }, + { + "epoch": 0.4477724831707133, + "grad_norm": 0.06767392158508301, + "learning_rate": 0.000188168, + "loss": 0.0446, + "step": 2960 + }, + { + "epoch": 0.44928522804629, + "grad_norm": 0.0621979758143425, + "learning_rate": 0.00018812800000000003, + "loss": 0.0514, + "step": 2970 + }, + { + "epoch": 0.45079797292186674, + "grad_norm": 0.06485885381698608, + "learning_rate": 0.00018808800000000002, + "loss": 0.0403, + "step": 2980 + }, + { + "epoch": 0.4523107177974435, + "grad_norm": 0.07618974149227142, + "learning_rate": 0.000188048, + "loss": 0.046, + "step": 2990 + }, + { + "epoch": 0.4538234626730202, + "grad_norm": 0.050627488642930984, + "learning_rate": 0.000188008, + "loss": 0.04, + "step": 3000 + }, + { + "epoch": 0.4538234626730202, + "eval_cer": 0.027385337988253985, + "eval_loss": 0.0410909466445446, + "eval_runtime": 11737.0194, + "eval_samples_per_second": 1.794, + "eval_steps_per_second": 0.224, + "step": 3000 + }, + { + "epoch": 0.45533620754859694, + "grad_norm": 0.07569224387407303, + "learning_rate": 0.000187968, + "loss": 0.0453, + "step": 3010 + }, + { + "epoch": 0.45684895242417367, + "grad_norm": 0.06267885118722916, + "learning_rate": 0.00018792800000000002, + "loss": 0.0519, + "step": 3020 + }, + { + "epoch": 0.4583616972997504, + "grad_norm": 0.0801217257976532, + "learning_rate": 0.00018788800000000001, + "loss": 0.0452, + "step": 3030 + }, + { + "epoch": 0.45987444217532714, + "grad_norm": 0.06966337561607361, + "learning_rate": 0.000187848, + "loss": 0.0459, + "step": 3040 + }, + { + "epoch": 0.46138718705090387, + "grad_norm": 0.05708028003573418, + "learning_rate": 0.000187808, + "loss": 0.0462, + "step": 3050 + }, + { + "epoch": 0.4628999319264806, + "grad_norm": 0.06033516675233841, + "learning_rate": 0.00018776800000000002, + "loss": 0.0459, + "step": 3060 + }, + { + "epoch": 0.46441267680205733, + "grad_norm": 0.06908197700977325, + "learning_rate": 0.00018772800000000002, + "loss": 0.048, + "step": 3070 + }, + { + "epoch": 0.46592542167763407, + "grad_norm": 0.0723978653550148, + "learning_rate": 0.000187688, + "loss": 0.047, + "step": 3080 + }, + { + "epoch": 0.4674381665532108, + "grad_norm": 0.06268727034330368, + "learning_rate": 0.000187648, + "loss": 0.0387, + "step": 3090 + }, + { + "epoch": 0.46895091142878753, + "grad_norm": 0.06796183437108994, + "learning_rate": 0.000187608, + "loss": 0.0379, + "step": 3100 + }, + { + "epoch": 0.47046365630436426, + "grad_norm": 0.08227751404047012, + "learning_rate": 0.00018756800000000002, + "loss": 0.0497, + "step": 3110 + }, + { + "epoch": 0.471976401179941, + "grad_norm": 0.06391087174415588, + "learning_rate": 0.000187528, + "loss": 0.045, + "step": 3120 + }, + { + "epoch": 0.47348914605551773, + "grad_norm": 0.09645809978246689, + "learning_rate": 0.00018748800000000003, + "loss": 0.0479, + "step": 3130 + }, + { + "epoch": 0.47500189093109446, + "grad_norm": 0.07187838107347488, + "learning_rate": 0.000187448, + "loss": 0.0438, + "step": 3140 + }, + { + "epoch": 0.4765146358066712, + "grad_norm": 0.06578271836042404, + "learning_rate": 0.00018740800000000002, + "loss": 0.0471, + "step": 3150 + }, + { + "epoch": 0.4780273806822479, + "grad_norm": 0.06598031520843506, + "learning_rate": 0.000187368, + "loss": 0.0463, + "step": 3160 + }, + { + "epoch": 0.47954012555782466, + "grad_norm": 0.06380560249090195, + "learning_rate": 0.000187328, + "loss": 0.0439, + "step": 3170 + }, + { + "epoch": 0.4810528704334014, + "grad_norm": 0.05300907790660858, + "learning_rate": 0.00018728800000000003, + "loss": 0.0385, + "step": 3180 + }, + { + "epoch": 0.4825656153089781, + "grad_norm": 0.08515879511833191, + "learning_rate": 0.000187248, + "loss": 0.0444, + "step": 3190 + }, + { + "epoch": 0.48407836018455486, + "grad_norm": 0.0779171735048294, + "learning_rate": 0.00018720800000000001, + "loss": 0.0453, + "step": 3200 + }, + { + "epoch": 0.48407836018455486, + "eval_cer": 0.010036246117811001, + "eval_loss": 0.04116720333695412, + "eval_runtime": 10575.268, + "eval_samples_per_second": 1.991, + "eval_steps_per_second": 0.249, + "step": 3200 + }, + { + "epoch": 0.4855911050601316, + "grad_norm": 0.07719563692808151, + "learning_rate": 0.000187168, + "loss": 0.0516, + "step": 3210 + }, + { + "epoch": 0.4871038499357083, + "grad_norm": 0.0623527429997921, + "learning_rate": 0.000187128, + "loss": 0.0412, + "step": 3220 + }, + { + "epoch": 0.48861659481128505, + "grad_norm": 0.05286158621311188, + "learning_rate": 0.00018708800000000002, + "loss": 0.0433, + "step": 3230 + }, + { + "epoch": 0.4901293396868618, + "grad_norm": 0.05317120626568794, + "learning_rate": 0.000187048, + "loss": 0.0451, + "step": 3240 + }, + { + "epoch": 0.4916420845624385, + "grad_norm": 0.06447257846593857, + "learning_rate": 0.000187008, + "loss": 0.0552, + "step": 3250 + }, + { + "epoch": 0.49315482943801525, + "grad_norm": 0.05432993173599243, + "learning_rate": 0.000186968, + "loss": 0.0454, + "step": 3260 + }, + { + "epoch": 0.49466757431359204, + "grad_norm": 0.07853369414806366, + "learning_rate": 0.00018692800000000002, + "loss": 0.0513, + "step": 3270 + }, + { + "epoch": 0.4961803191891688, + "grad_norm": 0.07532196491956711, + "learning_rate": 0.00018688800000000002, + "loss": 0.0494, + "step": 3280 + }, + { + "epoch": 0.4976930640647455, + "grad_norm": 0.0591423436999321, + "learning_rate": 0.000186848, + "loss": 0.0406, + "step": 3290 + }, + { + "epoch": 0.49920580894032224, + "grad_norm": 0.05588558688759804, + "learning_rate": 0.000186808, + "loss": 0.0454, + "step": 3300 + }, + { + "epoch": 0.5007185538158989, + "grad_norm": 0.06208329647779465, + "learning_rate": 0.000186768, + "loss": 0.0379, + "step": 3310 + }, + { + "epoch": 0.5022312986914756, + "grad_norm": 0.09954684972763062, + "learning_rate": 0.00018672800000000002, + "loss": 0.0441, + "step": 3320 + }, + { + "epoch": 0.5037440435670524, + "grad_norm": 0.06522241979837418, + "learning_rate": 0.000186688, + "loss": 0.0435, + "step": 3330 + }, + { + "epoch": 0.5052567884426291, + "grad_norm": 0.06771814823150635, + "learning_rate": 0.000186648, + "loss": 0.0407, + "step": 3340 + }, + { + "epoch": 0.5067695333182058, + "grad_norm": 0.09186646342277527, + "learning_rate": 0.000186608, + "loss": 0.0468, + "step": 3350 + }, + { + "epoch": 0.5082822781937826, + "grad_norm": 0.05741488188505173, + "learning_rate": 0.00018656800000000002, + "loss": 0.0427, + "step": 3360 + }, + { + "epoch": 0.5097950230693593, + "grad_norm": 0.078957200050354, + "learning_rate": 0.00018652800000000001, + "loss": 0.0524, + "step": 3370 + }, + { + "epoch": 0.511307767944936, + "grad_norm": 0.06480754166841507, + "learning_rate": 0.000186488, + "loss": 0.0491, + "step": 3380 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.07016266882419586, + "learning_rate": 0.000186448, + "loss": 0.0455, + "step": 3390 + }, + { + "epoch": 0.5143332576960895, + "grad_norm": 0.09549427777528763, + "learning_rate": 0.000186408, + "loss": 0.0435, + "step": 3400 + }, + { + "epoch": 0.5143332576960895, + "eval_cer": 0.06014582453123417, + "eval_loss": 0.040756821632385254, + "eval_runtime": 10458.365, + "eval_samples_per_second": 2.013, + "eval_steps_per_second": 0.252, + "step": 3400 + }, + { + "epoch": 0.5158460025716662, + "grad_norm": 0.06771855056285858, + "learning_rate": 0.00018636800000000002, + "loss": 0.0496, + "step": 3410 + }, + { + "epoch": 0.517358747447243, + "grad_norm": 0.051270436495542526, + "learning_rate": 0.000186328, + "loss": 0.0376, + "step": 3420 + }, + { + "epoch": 0.5188714923228197, + "grad_norm": 0.05424557998776436, + "learning_rate": 0.00018628800000000003, + "loss": 0.0455, + "step": 3430 + }, + { + "epoch": 0.5203842371983964, + "grad_norm": 0.07000952959060669, + "learning_rate": 0.000186248, + "loss": 0.0494, + "step": 3440 + }, + { + "epoch": 0.5218969820739732, + "grad_norm": 0.06696450710296631, + "learning_rate": 0.00018620800000000002, + "loss": 0.0449, + "step": 3450 + }, + { + "epoch": 0.52340972694955, + "grad_norm": 0.07243742048740387, + "learning_rate": 0.000186168, + "loss": 0.0481, + "step": 3460 + }, + { + "epoch": 0.5249224718251267, + "grad_norm": 0.07457748800516129, + "learning_rate": 0.000186128, + "loss": 0.0413, + "step": 3470 + }, + { + "epoch": 0.5264352167007035, + "grad_norm": 0.05373325198888779, + "learning_rate": 0.00018608800000000002, + "loss": 0.046, + "step": 3480 + }, + { + "epoch": 0.5279479615762802, + "grad_norm": 0.07769589871168137, + "learning_rate": 0.000186048, + "loss": 0.0443, + "step": 3490 + }, + { + "epoch": 0.5294607064518569, + "grad_norm": 0.05949350818991661, + "learning_rate": 0.000186008, + "loss": 0.0426, + "step": 3500 + }, + { + "epoch": 0.5309734513274337, + "grad_norm": 0.08557622879743576, + "learning_rate": 0.000185968, + "loss": 0.0436, + "step": 3510 + }, + { + "epoch": 0.5324861962030104, + "grad_norm": 0.07504332065582275, + "learning_rate": 0.00018592800000000003, + "loss": 0.045, + "step": 3520 + }, + { + "epoch": 0.5339989410785871, + "grad_norm": 0.08510497957468033, + "learning_rate": 0.00018588800000000002, + "loss": 0.0451, + "step": 3530 + }, + { + "epoch": 0.5355116859541639, + "grad_norm": 0.06645802408456802, + "learning_rate": 0.000185848, + "loss": 0.0459, + "step": 3540 + }, + { + "epoch": 0.5370244308297406, + "grad_norm": 0.05905970185995102, + "learning_rate": 0.000185808, + "loss": 0.0431, + "step": 3550 + }, + { + "epoch": 0.5385371757053173, + "grad_norm": 0.059341125190258026, + "learning_rate": 0.000185768, + "loss": 0.0521, + "step": 3560 + }, + { + "epoch": 0.5400499205808941, + "grad_norm": 0.07676515728235245, + "learning_rate": 0.00018572800000000002, + "loss": 0.0446, + "step": 3570 + }, + { + "epoch": 0.5415626654564708, + "grad_norm": 0.05860384181141853, + "learning_rate": 0.00018568800000000002, + "loss": 0.041, + "step": 3580 + }, + { + "epoch": 0.5430754103320475, + "grad_norm": 0.07133147865533829, + "learning_rate": 0.000185648, + "loss": 0.0479, + "step": 3590 + }, + { + "epoch": 0.5445881552076243, + "grad_norm": 0.058478474617004395, + "learning_rate": 0.000185608, + "loss": 0.0447, + "step": 3600 + }, + { + "epoch": 0.5445881552076243, + "eval_cer": 0.16368877753976077, + "eval_loss": 0.04047335311770439, + "eval_runtime": 10446.0422, + "eval_samples_per_second": 2.015, + "eval_steps_per_second": 0.252, + "step": 3600 + }, + { + "epoch": 0.546100900083201, + "grad_norm": 0.06725309789180756, + "learning_rate": 0.000185568, + "loss": 0.053, + "step": 3610 + }, + { + "epoch": 0.5476136449587777, + "grad_norm": 0.06334862858057022, + "learning_rate": 0.00018552800000000002, + "loss": 0.0451, + "step": 3620 + }, + { + "epoch": 0.5491263898343545, + "grad_norm": 0.12283937633037567, + "learning_rate": 0.000185488, + "loss": 0.0437, + "step": 3630 + }, + { + "epoch": 0.5506391347099312, + "grad_norm": 0.05931037664413452, + "learning_rate": 0.000185448, + "loss": 0.0431, + "step": 3640 + }, + { + "epoch": 0.5521518795855079, + "grad_norm": 0.05501909554004669, + "learning_rate": 0.000185408, + "loss": 0.0398, + "step": 3650 + }, + { + "epoch": 0.5536646244610847, + "grad_norm": 0.06066635251045227, + "learning_rate": 0.00018536800000000002, + "loss": 0.0497, + "step": 3660 + }, + { + "epoch": 0.5551773693366614, + "grad_norm": 0.1352480947971344, + "learning_rate": 0.000185328, + "loss": 0.0445, + "step": 3670 + }, + { + "epoch": 0.5566901142122381, + "grad_norm": 0.08712221682071686, + "learning_rate": 0.000185288, + "loss": 0.0485, + "step": 3680 + }, + { + "epoch": 0.5582028590878149, + "grad_norm": 0.06511665135622025, + "learning_rate": 0.000185248, + "loss": 0.0464, + "step": 3690 + }, + { + "epoch": 0.5597156039633916, + "grad_norm": 0.052760981023311615, + "learning_rate": 0.000185208, + "loss": 0.0417, + "step": 3700 + }, + { + "epoch": 0.5612283488389683, + "grad_norm": 0.05113260820508003, + "learning_rate": 0.000185168, + "loss": 0.0426, + "step": 3710 + }, + { + "epoch": 0.5627410937145451, + "grad_norm": 0.06565012037754059, + "learning_rate": 0.000185128, + "loss": 0.0397, + "step": 3720 + }, + { + "epoch": 0.5642538385901218, + "grad_norm": 0.0608823299407959, + "learning_rate": 0.00018508800000000003, + "loss": 0.0411, + "step": 3730 + }, + { + "epoch": 0.5657665834656985, + "grad_norm": 0.0670706033706665, + "learning_rate": 0.000185048, + "loss": 0.0495, + "step": 3740 + }, + { + "epoch": 0.5672793283412753, + "grad_norm": 0.07000606507062912, + "learning_rate": 0.00018500800000000001, + "loss": 0.0457, + "step": 3750 + }, + { + "epoch": 0.568792073216852, + "grad_norm": 0.08072007447481155, + "learning_rate": 0.000184968, + "loss": 0.0484, + "step": 3760 + }, + { + "epoch": 0.5703048180924287, + "grad_norm": 0.06795356422662735, + "learning_rate": 0.000184928, + "loss": 0.0495, + "step": 3770 + }, + { + "epoch": 0.5718175629680055, + "grad_norm": 0.3031274974346161, + "learning_rate": 0.00018488800000000002, + "loss": 0.0504, + "step": 3780 + }, + { + "epoch": 0.5733303078435822, + "grad_norm": 0.05166814848780632, + "learning_rate": 0.000184848, + "loss": 0.0442, + "step": 3790 + }, + { + "epoch": 0.5748430527191589, + "grad_norm": 0.08816450089216232, + "learning_rate": 0.000184808, + "loss": 0.0525, + "step": 3800 + }, + { + "epoch": 0.5748430527191589, + "eval_cer": 0.09852050611143642, + "eval_loss": 0.041136305779218674, + "eval_runtime": 10432.1011, + "eval_samples_per_second": 2.018, + "eval_steps_per_second": 0.252, + "step": 3800 + }, + { + "epoch": 0.5763557975947357, + "grad_norm": 0.06531400233507156, + "learning_rate": 0.000184768, + "loss": 0.0459, + "step": 3810 + }, + { + "epoch": 0.5778685424703124, + "grad_norm": 0.07049426436424255, + "learning_rate": 0.00018472800000000002, + "loss": 0.0386, + "step": 3820 + }, + { + "epoch": 0.5793812873458891, + "grad_norm": 0.07954803854227066, + "learning_rate": 0.00018468800000000002, + "loss": 0.0451, + "step": 3830 + }, + { + "epoch": 0.5808940322214659, + "grad_norm": 0.07543455064296722, + "learning_rate": 0.000184648, + "loss": 0.0406, + "step": 3840 + }, + { + "epoch": 0.5824067770970426, + "grad_norm": 0.08292882144451141, + "learning_rate": 0.000184608, + "loss": 0.0544, + "step": 3850 + }, + { + "epoch": 0.5839195219726193, + "grad_norm": 0.05814971402287483, + "learning_rate": 0.000184568, + "loss": 0.0441, + "step": 3860 + }, + { + "epoch": 0.585432266848196, + "grad_norm": 0.06112606078386307, + "learning_rate": 0.00018452800000000002, + "loss": 0.0482, + "step": 3870 + }, + { + "epoch": 0.5869450117237728, + "grad_norm": 0.08487452566623688, + "learning_rate": 0.000184488, + "loss": 0.0446, + "step": 3880 + }, + { + "epoch": 0.5884577565993495, + "grad_norm": 0.05025780200958252, + "learning_rate": 0.000184448, + "loss": 0.0453, + "step": 3890 + }, + { + "epoch": 0.5899705014749262, + "grad_norm": 0.10276935994625092, + "learning_rate": 0.000184408, + "loss": 0.0427, + "step": 3900 + }, + { + "epoch": 0.591483246350503, + "grad_norm": 0.11926810443401337, + "learning_rate": 0.000184368, + "loss": 0.0472, + "step": 3910 + }, + { + "epoch": 0.5929959912260797, + "grad_norm": 0.08615875244140625, + "learning_rate": 0.00018432800000000001, + "loss": 0.0504, + "step": 3920 + }, + { + "epoch": 0.5945087361016564, + "grad_norm": 0.05418393015861511, + "learning_rate": 0.000184288, + "loss": 0.0397, + "step": 3930 + }, + { + "epoch": 0.5960214809772332, + "grad_norm": 0.06980731338262558, + "learning_rate": 0.000184248, + "loss": 0.0407, + "step": 3940 + }, + { + "epoch": 0.5975342258528099, + "grad_norm": 0.07121722400188446, + "learning_rate": 0.000184208, + "loss": 0.0441, + "step": 3950 + }, + { + "epoch": 0.5990469707283866, + "grad_norm": 0.05750627815723419, + "learning_rate": 0.00018416800000000002, + "loss": 0.049, + "step": 3960 + }, + { + "epoch": 0.6005597156039634, + "grad_norm": 0.08207126706838608, + "learning_rate": 0.000184128, + "loss": 0.0475, + "step": 3970 + }, + { + "epoch": 0.6020724604795401, + "grad_norm": 0.07319646328687668, + "learning_rate": 0.000184088, + "loss": 0.0517, + "step": 3980 + }, + { + "epoch": 0.6035852053551168, + "grad_norm": 0.06762152910232544, + "learning_rate": 0.000184048, + "loss": 0.042, + "step": 3990 + }, + { + "epoch": 0.6050979502306936, + "grad_norm": 0.05603775382041931, + "learning_rate": 0.000184008, + "loss": 0.0434, + "step": 4000 + }, + { + "epoch": 0.6050979502306936, + "eval_cer": 0.2283245991802003, + "eval_loss": 0.03986261412501335, + "eval_runtime": 10464.7689, + "eval_samples_per_second": 2.012, + "eval_steps_per_second": 0.252, + "step": 4000 + }, + { + "epoch": 0.6066106951062703, + "grad_norm": 0.05094938725233078, + "learning_rate": 0.000183968, + "loss": 0.0493, + "step": 4010 + }, + { + "epoch": 0.608123439981847, + "grad_norm": 0.08996951580047607, + "learning_rate": 0.000183928, + "loss": 0.0475, + "step": 4020 + }, + { + "epoch": 0.6096361848574238, + "grad_norm": 0.07369961589574814, + "learning_rate": 0.00018388800000000003, + "loss": 0.0441, + "step": 4030 + }, + { + "epoch": 0.6111489297330005, + "grad_norm": 0.06135983020067215, + "learning_rate": 0.000183848, + "loss": 0.0421, + "step": 4040 + }, + { + "epoch": 0.6126616746085772, + "grad_norm": 0.04601254314184189, + "learning_rate": 0.000183808, + "loss": 0.037, + "step": 4050 + }, + { + "epoch": 0.614174419484154, + "grad_norm": 0.04949349910020828, + "learning_rate": 0.000183768, + "loss": 0.0424, + "step": 4060 + }, + { + "epoch": 0.6156871643597307, + "grad_norm": 0.08714490383863449, + "learning_rate": 0.000183728, + "loss": 0.0459, + "step": 4070 + }, + { + "epoch": 0.6171999092353074, + "grad_norm": 0.07733121514320374, + "learning_rate": 0.00018368800000000002, + "loss": 0.0423, + "step": 4080 + }, + { + "epoch": 0.6187126541108842, + "grad_norm": 0.070652537047863, + "learning_rate": 0.000183648, + "loss": 0.0417, + "step": 4090 + }, + { + "epoch": 0.6202253989864609, + "grad_norm": 0.08538975566625595, + "learning_rate": 0.000183608, + "loss": 0.045, + "step": 4100 + }, + { + "epoch": 0.6217381438620376, + "grad_norm": 0.07866961508989334, + "learning_rate": 0.000183568, + "loss": 0.0435, + "step": 4110 + }, + { + "epoch": 0.6232508887376144, + "grad_norm": 0.052214980125427246, + "learning_rate": 0.00018352800000000002, + "loss": 0.0389, + "step": 4120 + }, + { + "epoch": 0.6247636336131911, + "grad_norm": 0.07548975199460983, + "learning_rate": 0.00018348800000000002, + "loss": 0.0406, + "step": 4130 + }, + { + "epoch": 0.6262763784887678, + "grad_norm": 0.06064745783805847, + "learning_rate": 0.000183448, + "loss": 0.0405, + "step": 4140 + }, + { + "epoch": 0.6277891233643446, + "grad_norm": 0.06255548447370529, + "learning_rate": 0.000183408, + "loss": 0.0426, + "step": 4150 + }, + { + "epoch": 0.6293018682399213, + "grad_norm": 0.05550558492541313, + "learning_rate": 0.000183368, + "loss": 0.0432, + "step": 4160 + }, + { + "epoch": 0.630814613115498, + "grad_norm": 0.06224781274795532, + "learning_rate": 0.00018332800000000002, + "loss": 0.0489, + "step": 4170 + }, + { + "epoch": 0.6323273579910748, + "grad_norm": 0.04567689448595047, + "learning_rate": 0.000183288, + "loss": 0.0392, + "step": 4180 + }, + { + "epoch": 0.6338401028666515, + "grad_norm": 0.08686509728431702, + "learning_rate": 0.00018324800000000003, + "loss": 0.0503, + "step": 4190 + }, + { + "epoch": 0.6353528477422282, + "grad_norm": 0.039897847920656204, + "learning_rate": 0.000183208, + "loss": 0.0437, + "step": 4200 + }, + { + "epoch": 0.6353528477422282, + "eval_cer": 0.0028697931722888917, + "eval_loss": 0.03980256989598274, + "eval_runtime": 10439.5254, + "eval_samples_per_second": 2.017, + "eval_steps_per_second": 0.252, + "step": 4200 + }, + { + "epoch": 0.636865592617805, + "grad_norm": 0.07222657650709152, + "learning_rate": 0.00018316800000000002, + "loss": 0.0445, + "step": 4210 + }, + { + "epoch": 0.6383783374933817, + "grad_norm": 0.06796406954526901, + "learning_rate": 0.000183128, + "loss": 0.0452, + "step": 4220 + }, + { + "epoch": 0.6398910823689585, + "grad_norm": 0.07380914688110352, + "learning_rate": 0.000183088, + "loss": 0.0456, + "step": 4230 + }, + { + "epoch": 0.6414038272445353, + "grad_norm": 0.05780802294611931, + "learning_rate": 0.00018304800000000003, + "loss": 0.043, + "step": 4240 + }, + { + "epoch": 0.642916572120112, + "grad_norm": 0.07155787944793701, + "learning_rate": 0.000183008, + "loss": 0.0422, + "step": 4250 + }, + { + "epoch": 0.6444293169956887, + "grad_norm": 0.06419336050748825, + "learning_rate": 0.00018296800000000001, + "loss": 0.0453, + "step": 4260 + }, + { + "epoch": 0.6459420618712655, + "grad_norm": 0.06702402234077454, + "learning_rate": 0.000182928, + "loss": 0.0416, + "step": 4270 + }, + { + "epoch": 0.6474548067468422, + "grad_norm": 0.062247395515441895, + "learning_rate": 0.00018288800000000003, + "loss": 0.0431, + "step": 4280 + }, + { + "epoch": 0.6489675516224189, + "grad_norm": 0.05556045100092888, + "learning_rate": 0.00018284800000000002, + "loss": 0.0542, + "step": 4290 + }, + { + "epoch": 0.6504802964979957, + "grad_norm": 0.07586701959371567, + "learning_rate": 0.000182808, + "loss": 0.0476, + "step": 4300 + }, + { + "epoch": 0.6519930413735724, + "grad_norm": 0.056563302874565125, + "learning_rate": 0.000182768, + "loss": 0.0441, + "step": 4310 + }, + { + "epoch": 0.6535057862491491, + "grad_norm": 0.08210831135511398, + "learning_rate": 0.000182728, + "loss": 0.0428, + "step": 4320 + }, + { + "epoch": 0.6550185311247259, + "grad_norm": 0.06154036149382591, + "learning_rate": 0.00018268800000000002, + "loss": 0.0437, + "step": 4330 + }, + { + "epoch": 0.6565312760003026, + "grad_norm": 0.06387040764093399, + "learning_rate": 0.00018264800000000002, + "loss": 0.0503, + "step": 4340 + }, + { + "epoch": 0.6580440208758793, + "grad_norm": 0.07460694015026093, + "learning_rate": 0.000182608, + "loss": 0.0388, + "step": 4350 + }, + { + "epoch": 0.659556765751456, + "grad_norm": 0.05871427804231644, + "learning_rate": 0.000182568, + "loss": 0.0409, + "step": 4360 + }, + { + "epoch": 0.6610695106270328, + "grad_norm": 0.05525946244597435, + "learning_rate": 0.000182528, + "loss": 0.0403, + "step": 4370 + }, + { + "epoch": 0.6625822555026095, + "grad_norm": 0.07400190085172653, + "learning_rate": 0.00018248800000000002, + "loss": 0.0544, + "step": 4380 + }, + { + "epoch": 0.6640950003781863, + "grad_norm": 0.05236358568072319, + "learning_rate": 0.000182448, + "loss": 0.0424, + "step": 4390 + }, + { + "epoch": 0.665607745253763, + "grad_norm": 0.07223962247371674, + "learning_rate": 0.000182408, + "loss": 0.0427, + "step": 4400 + }, + { + "epoch": 0.665607745253763, + "eval_cer": 0.22895526186399429, + "eval_loss": 0.039881668984889984, + "eval_runtime": 10486.5948, + "eval_samples_per_second": 2.008, + "eval_steps_per_second": 0.251, + "step": 4400 + }, + { + "epoch": 0.6671204901293397, + "grad_norm": 0.04777299240231514, + "learning_rate": 0.000182368, + "loss": 0.0365, + "step": 4410 + }, + { + "epoch": 0.6686332350049164, + "grad_norm": 0.06789238750934601, + "learning_rate": 0.00018232800000000002, + "loss": 0.041, + "step": 4420 + }, + { + "epoch": 0.6701459798804932, + "grad_norm": 0.07556366920471191, + "learning_rate": 0.00018228800000000001, + "loss": 0.0454, + "step": 4430 + }, + { + "epoch": 0.6716587247560699, + "grad_norm": 0.05699057877063751, + "learning_rate": 0.000182248, + "loss": 0.0412, + "step": 4440 + }, + { + "epoch": 0.6731714696316466, + "grad_norm": 0.06115678697824478, + "learning_rate": 0.000182208, + "loss": 0.0494, + "step": 4450 + }, + { + "epoch": 0.6746842145072234, + "grad_norm": 0.16907750070095062, + "learning_rate": 0.000182168, + "loss": 0.0457, + "step": 4460 + }, + { + "epoch": 0.6761969593828001, + "grad_norm": 0.23710806667804718, + "learning_rate": 0.00018212800000000002, + "loss": 0.0491, + "step": 4470 + }, + { + "epoch": 0.6777097042583768, + "grad_norm": 0.13006287813186646, + "learning_rate": 0.000182088, + "loss": 0.0528, + "step": 4480 + }, + { + "epoch": 0.6792224491339536, + "grad_norm": 0.24661995470523834, + "learning_rate": 0.00018204800000000003, + "loss": 0.043, + "step": 4490 + }, + { + "epoch": 0.6807351940095303, + "grad_norm": 0.2757125198841095, + "learning_rate": 0.000182008, + "loss": 0.0477, + "step": 4500 + }, + { + "epoch": 0.682247938885107, + "grad_norm": 0.27585530281066895, + "learning_rate": 0.00018196800000000002, + "loss": 0.0486, + "step": 4510 + }, + { + "epoch": 0.6837606837606838, + "grad_norm": 0.10548703372478485, + "learning_rate": 0.000181928, + "loss": 0.0448, + "step": 4520 + }, + { + "epoch": 0.6852734286362605, + "grad_norm": 0.1989259272813797, + "learning_rate": 0.000181888, + "loss": 0.0508, + "step": 4530 + }, + { + "epoch": 0.6867861735118372, + "grad_norm": 0.10586623847484589, + "learning_rate": 0.00018184800000000002, + "loss": 0.0486, + "step": 4540 + }, + { + "epoch": 0.688298918387414, + "grad_norm": 0.09687965363264084, + "learning_rate": 0.000181808, + "loss": 0.0463, + "step": 4550 + }, + { + "epoch": 0.6898116632629907, + "grad_norm": 0.13362692296504974, + "learning_rate": 0.000181768, + "loss": 0.0441, + "step": 4560 + }, + { + "epoch": 0.6913244081385674, + "grad_norm": 0.07124081254005432, + "learning_rate": 0.000181728, + "loss": 0.0479, + "step": 4570 + }, + { + "epoch": 0.6928371530141442, + "grad_norm": 0.060886889696121216, + "learning_rate": 0.00018168800000000003, + "loss": 0.0425, + "step": 4580 + }, + { + "epoch": 0.6943498978897209, + "grad_norm": 0.09697773307561874, + "learning_rate": 0.00018164800000000002, + "loss": 0.0466, + "step": 4590 + }, + { + "epoch": 0.6958626427652976, + "grad_norm": 0.09655246883630753, + "learning_rate": 0.00018160800000000001, + "loss": 0.0423, + "step": 4600 + }, + { + "epoch": 0.6958626427652976, + "eval_cer": 0.3264485475609846, + "eval_loss": 0.04431215673685074, + "eval_runtime": 9966.6677, + "eval_samples_per_second": 2.112, + "eval_steps_per_second": 0.264, + "step": 4600 + }, + { + "epoch": 0.6973753876408744, + "grad_norm": 0.6920335292816162, + "learning_rate": 0.000181568, + "loss": 0.0612, + "step": 4610 + }, + { + "epoch": 0.6988881325164511, + "grad_norm": 21.773630142211914, + "learning_rate": 0.00018153600000000002, + "loss": 0.3452, + "step": 4620 + }, + { + "epoch": 0.7004008773920278, + "grad_norm": 0.6047945022583008, + "learning_rate": 0.0001815, + "loss": 0.8043, + "step": 4630 + }, + { + "epoch": 0.7019136222676046, + "grad_norm": 0.30588680505752563, + "learning_rate": 0.00018146000000000001, + "loss": 0.094, + "step": 4640 + }, + { + "epoch": 0.7034263671431813, + "grad_norm": 2.5436811447143555, + "learning_rate": 0.00018142, + "loss": 0.1421, + "step": 4650 + }, + { + "epoch": 0.704939112018758, + "grad_norm": 3.3921713829040527, + "learning_rate": 0.00018138000000000003, + "loss": 0.2285, + "step": 4660 + }, + { + "epoch": 0.7064518568943348, + "grad_norm": 6.751514434814453, + "learning_rate": 0.00018134, + "loss": 0.1609, + "step": 4670 + }, + { + "epoch": 0.7079646017699115, + "grad_norm": 0.2919982075691223, + "learning_rate": 0.00018130000000000002, + "loss": 0.0731, + "step": 4680 + }, + { + "epoch": 0.7094773466454882, + "grad_norm": 0.2757503092288971, + "learning_rate": 0.00018126, + "loss": 0.0553, + "step": 4690 + }, + { + "epoch": 0.710990091521065, + "grad_norm": 0.12121643126010895, + "learning_rate": 0.00018122, + "loss": 0.0637, + "step": 4700 + }, + { + "epoch": 0.7125028363966417, + "grad_norm": 0.6880851984024048, + "learning_rate": 0.00018118000000000002, + "loss": 0.0556, + "step": 4710 + }, + { + "epoch": 0.7140155812722184, + "grad_norm": 0.17397326231002808, + "learning_rate": 0.00018114, + "loss": 0.0619, + "step": 4720 + }, + { + "epoch": 0.7155283261477952, + "grad_norm": 0.4361652433872223, + "learning_rate": 0.0001811, + "loss": 0.052, + "step": 4730 + }, + { + "epoch": 0.7170410710233719, + "grad_norm": 0.08802498877048492, + "learning_rate": 0.00018106, + "loss": 0.0531, + "step": 4740 + }, + { + "epoch": 0.7185538158989486, + "grad_norm": 0.16508696973323822, + "learning_rate": 0.00018102000000000003, + "loss": 0.0519, + "step": 4750 + }, + { + "epoch": 0.7200665607745254, + "grad_norm": 0.1359723061323166, + "learning_rate": 0.00018098000000000002, + "loss": 0.0559, + "step": 4760 + }, + { + "epoch": 0.7215793056501021, + "grad_norm": 0.12716355919837952, + "learning_rate": 0.00018093999999999999, + "loss": 0.0478, + "step": 4770 + }, + { + "epoch": 0.7230920505256788, + "grad_norm": 0.24563723802566528, + "learning_rate": 0.0001809, + "loss": 0.0508, + "step": 4780 + }, + { + "epoch": 0.7246047954012556, + "grad_norm": 0.15526343882083893, + "learning_rate": 0.00018086, + "loss": 0.053, + "step": 4790 + }, + { + "epoch": 0.7261175402768323, + "grad_norm": 0.39961257576942444, + "learning_rate": 0.00018082000000000002, + "loss": 0.0543, + "step": 4800 + }, + { + "epoch": 0.7261175402768323, + "eval_cer": 0.8969592299120654, + "eval_loss": 0.04724743589758873, + "eval_runtime": 9508.4862, + "eval_samples_per_second": 2.214, + "eval_steps_per_second": 0.277, + "step": 4800 + }, + { + "epoch": 0.727630285152409, + "grad_norm": 0.11674599349498749, + "learning_rate": 0.00018078000000000001, + "loss": 0.045, + "step": 4810 + }, + { + "epoch": 0.7291430300279858, + "grad_norm": 0.12775878608226776, + "learning_rate": 0.00018074, + "loss": 0.0507, + "step": 4820 + }, + { + "epoch": 0.7306557749035625, + "grad_norm": 0.21720856428146362, + "learning_rate": 0.0001807, + "loss": 0.0507, + "step": 4830 + }, + { + "epoch": 0.7321685197791392, + "grad_norm": 0.09953787177801132, + "learning_rate": 0.00018066, + "loss": 0.0455, + "step": 4840 + }, + { + "epoch": 0.733681264654716, + "grad_norm": 0.1652969866991043, + "learning_rate": 0.00018062000000000002, + "loss": 0.058, + "step": 4850 + }, + { + "epoch": 0.7351940095302927, + "grad_norm": 0.15136420726776123, + "learning_rate": 0.00018058, + "loss": 0.0403, + "step": 4860 + }, + { + "epoch": 0.7367067544058694, + "grad_norm": 0.09294873476028442, + "learning_rate": 0.00018054, + "loss": 0.0454, + "step": 4870 + }, + { + "epoch": 0.7382194992814461, + "grad_norm": 0.06313528120517731, + "learning_rate": 0.0001805, + "loss": 0.0486, + "step": 4880 + }, + { + "epoch": 0.7397322441570229, + "grad_norm": 0.10854914039373398, + "learning_rate": 0.00018046000000000002, + "loss": 0.0419, + "step": 4890 + }, + { + "epoch": 0.7412449890325996, + "grad_norm": 0.08302963525056839, + "learning_rate": 0.00018042, + "loss": 0.0447, + "step": 4900 + }, + { + "epoch": 0.7427577339081763, + "grad_norm": 0.0761631429195404, + "learning_rate": 0.00018038, + "loss": 0.0446, + "step": 4910 + }, + { + "epoch": 0.7442704787837531, + "grad_norm": 0.10130470246076584, + "learning_rate": 0.00018034, + "loss": 0.045, + "step": 4920 + }, + { + "epoch": 0.7457832236593298, + "grad_norm": 0.18436622619628906, + "learning_rate": 0.0001803, + "loss": 0.0429, + "step": 4930 + }, + { + "epoch": 0.7472959685349065, + "grad_norm": 0.08756496757268906, + "learning_rate": 0.00018026, + "loss": 0.0444, + "step": 4940 + }, + { + "epoch": 0.7488087134104833, + "grad_norm": 0.0750514343380928, + "learning_rate": 0.00018022, + "loss": 0.0507, + "step": 4950 + }, + { + "epoch": 0.75032145828606, + "grad_norm": 0.07460404187440872, + "learning_rate": 0.00018018000000000003, + "loss": 0.0397, + "step": 4960 + }, + { + "epoch": 0.7518342031616367, + "grad_norm": 0.12696300446987152, + "learning_rate": 0.00018014, + "loss": 0.0412, + "step": 4970 + }, + { + "epoch": 0.7533469480372135, + "grad_norm": 0.09411120414733887, + "learning_rate": 0.00018010000000000001, + "loss": 0.0431, + "step": 4980 + }, + { + "epoch": 0.7548596929127902, + "grad_norm": 0.08611701428890228, + "learning_rate": 0.00018006, + "loss": 0.041, + "step": 4990 + }, + { + "epoch": 0.756372437788367, + "grad_norm": 0.07411106675863266, + "learning_rate": 0.00018002, + "loss": 0.0448, + "step": 5000 + }, + { + "epoch": 0.756372437788367, + "eval_cer": 0.9283299113242558, + "eval_loss": 0.0398402214050293, + "eval_runtime": 9972.2961, + "eval_samples_per_second": 2.111, + "eval_steps_per_second": 0.264, + "step": 5000 + }, + { + "epoch": 0.7578851826639438, + "grad_norm": 0.06552145630121231, + "learning_rate": 0.00017998000000000002, + "loss": 0.0411, + "step": 5010 + }, + { + "epoch": 0.7593979275395205, + "grad_norm": 0.14544987678527832, + "learning_rate": 0.00017994000000000002, + "loss": 0.0401, + "step": 5020 + }, + { + "epoch": 0.7609106724150972, + "grad_norm": 0.06693132221698761, + "learning_rate": 0.0001799, + "loss": 0.045, + "step": 5030 + }, + { + "epoch": 0.762423417290674, + "grad_norm": 0.08100226521492004, + "learning_rate": 0.00017986, + "loss": 0.0478, + "step": 5040 + }, + { + "epoch": 0.7639361621662507, + "grad_norm": 0.10020666569471359, + "learning_rate": 0.00017982000000000002, + "loss": 0.0484, + "step": 5050 + }, + { + "epoch": 0.7654489070418274, + "grad_norm": 0.055785536766052246, + "learning_rate": 0.00017978000000000002, + "loss": 0.0423, + "step": 5060 + }, + { + "epoch": 0.7669616519174042, + "grad_norm": 0.08791428059339523, + "learning_rate": 0.00017974, + "loss": 0.0433, + "step": 5070 + }, + { + "epoch": 0.7684743967929809, + "grad_norm": 0.10156507045030594, + "learning_rate": 0.0001797, + "loss": 0.0447, + "step": 5080 + }, + { + "epoch": 0.7699871416685576, + "grad_norm": 0.1160702183842659, + "learning_rate": 0.00017966, + "loss": 0.0388, + "step": 5090 + }, + { + "epoch": 0.7714998865441344, + "grad_norm": 0.08716849237680435, + "learning_rate": 0.00017962000000000002, + "loss": 0.0492, + "step": 5100 + }, + { + "epoch": 0.7730126314197111, + "grad_norm": 0.046968474984169006, + "learning_rate": 0.00017958, + "loss": 0.0434, + "step": 5110 + }, + { + "epoch": 0.7745253762952878, + "grad_norm": 0.06234806030988693, + "learning_rate": 0.00017954000000000003, + "loss": 0.0504, + "step": 5120 + }, + { + "epoch": 0.7760381211708646, + "grad_norm": 0.102174311876297, + "learning_rate": 0.0001795, + "loss": 0.044, + "step": 5130 + }, + { + "epoch": 0.7775508660464413, + "grad_norm": 0.0620570033788681, + "learning_rate": 0.00017946, + "loss": 0.0386, + "step": 5140 + }, + { + "epoch": 0.779063610922018, + "grad_norm": 0.057656314224004745, + "learning_rate": 0.00017942, + "loss": 0.043, + "step": 5150 + }, + { + "epoch": 0.7805763557975948, + "grad_norm": 0.08451346307992935, + "learning_rate": 0.00017938, + "loss": 0.0452, + "step": 5160 + }, + { + "epoch": 0.7820891006731715, + "grad_norm": 0.09557165950536728, + "learning_rate": 0.00017934000000000003, + "loss": 0.0437, + "step": 5170 + }, + { + "epoch": 0.7836018455487482, + "grad_norm": 0.12275496870279312, + "learning_rate": 0.0001793, + "loss": 0.0427, + "step": 5180 + }, + { + "epoch": 0.785114590424325, + "grad_norm": 0.3277435600757599, + "learning_rate": 0.00017926000000000002, + "loss": 0.045, + "step": 5190 + }, + { + "epoch": 0.7866273352999017, + "grad_norm": 0.12806734442710876, + "learning_rate": 0.00017922, + "loss": 0.0383, + "step": 5200 + }, + { + "epoch": 0.7866273352999017, + "eval_cer": 0.8426215554451947, + "eval_loss": 0.03898792341351509, + "eval_runtime": 10404.4584, + "eval_samples_per_second": 2.023, + "eval_steps_per_second": 0.253, + "step": 5200 + }, + { + "epoch": 0.7881400801754784, + "grad_norm": 0.07969816774129868, + "learning_rate": 0.00017918, + "loss": 0.0474, + "step": 5210 + }, + { + "epoch": 0.7896528250510552, + "grad_norm": 0.20492368936538696, + "learning_rate": 0.00017914000000000002, + "loss": 0.0423, + "step": 5220 + }, + { + "epoch": 0.7911655699266319, + "grad_norm": 0.0960281640291214, + "learning_rate": 0.0001791, + "loss": 0.0392, + "step": 5230 + }, + { + "epoch": 0.7926783148022086, + "grad_norm": 0.16566351056098938, + "learning_rate": 0.00017906, + "loss": 0.0415, + "step": 5240 + }, + { + "epoch": 0.7941910596777854, + "grad_norm": 0.12343327701091766, + "learning_rate": 0.00017902, + "loss": 0.0439, + "step": 5250 + }, + { + "epoch": 0.7957038045533621, + "grad_norm": 0.0732201486825943, + "learning_rate": 0.00017898000000000002, + "loss": 0.0462, + "step": 5260 + }, + { + "epoch": 0.7972165494289388, + "grad_norm": 0.07991164177656174, + "learning_rate": 0.00017894000000000002, + "loss": 0.0412, + "step": 5270 + }, + { + "epoch": 0.7987292943045156, + "grad_norm": 0.07868771255016327, + "learning_rate": 0.0001789, + "loss": 0.0458, + "step": 5280 + }, + { + "epoch": 0.8002420391800923, + "grad_norm": 0.07392987608909607, + "learning_rate": 0.00017886, + "loss": 0.0489, + "step": 5290 + }, + { + "epoch": 0.801754784055669, + "grad_norm": 0.08330372720956802, + "learning_rate": 0.00017882, + "loss": 0.0448, + "step": 5300 + }, + { + "epoch": 0.8032675289312458, + "grad_norm": 0.06118497997522354, + "learning_rate": 0.00017878000000000002, + "loss": 0.0406, + "step": 5310 + }, + { + "epoch": 0.8047802738068225, + "grad_norm": 0.14288772642612457, + "learning_rate": 0.00017874, + "loss": 0.0439, + "step": 5320 + }, + { + "epoch": 0.8062930186823992, + "grad_norm": 0.06868502497673035, + "learning_rate": 0.0001787, + "loss": 0.0439, + "step": 5330 + }, + { + "epoch": 0.807805763557976, + "grad_norm": 0.08165542781352997, + "learning_rate": 0.00017866, + "loss": 0.0449, + "step": 5340 + }, + { + "epoch": 0.8093185084335527, + "grad_norm": 0.08748511224985123, + "learning_rate": 0.00017862000000000002, + "loss": 0.0455, + "step": 5350 + }, + { + "epoch": 0.8108312533091294, + "grad_norm": 0.0799604058265686, + "learning_rate": 0.00017858000000000001, + "loss": 0.0466, + "step": 5360 + }, + { + "epoch": 0.8123439981847062, + "grad_norm": 0.09606848657131195, + "learning_rate": 0.00017854, + "loss": 0.0452, + "step": 5370 + }, + { + "epoch": 0.8138567430602829, + "grad_norm": 0.07232715934515, + "learning_rate": 0.0001785, + "loss": 0.0426, + "step": 5380 + }, + { + "epoch": 0.8153694879358596, + "grad_norm": 0.07278240472078323, + "learning_rate": 0.00017846, + "loss": 0.0468, + "step": 5390 + }, + { + "epoch": 0.8168822328114363, + "grad_norm": 0.06568820029497147, + "learning_rate": 0.00017842000000000002, + "loss": 0.0407, + "step": 5400 + }, + { + "epoch": 0.8168822328114363, + "eval_cer": 0.9304918304165957, + "eval_loss": 0.039248276501894, + "eval_runtime": 10433.9841, + "eval_samples_per_second": 2.018, + "eval_steps_per_second": 0.252, + "step": 5400 + }, + { + "epoch": 0.8183949776870131, + "grad_norm": 0.08667409420013428, + "learning_rate": 0.00017838, + "loss": 0.0504, + "step": 5410 + }, + { + "epoch": 0.8199077225625898, + "grad_norm": 0.0701778307557106, + "learning_rate": 0.00017834000000000003, + "loss": 0.0425, + "step": 5420 + }, + { + "epoch": 0.8214204674381665, + "grad_norm": 0.07078663259744644, + "learning_rate": 0.0001783, + "loss": 0.0456, + "step": 5430 + }, + { + "epoch": 0.8229332123137433, + "grad_norm": 0.08540530502796173, + "learning_rate": 0.00017826000000000002, + "loss": 0.0437, + "step": 5440 + }, + { + "epoch": 0.82444595718932, + "grad_norm": 0.044258490204811096, + "learning_rate": 0.00017822, + "loss": 0.0373, + "step": 5450 + }, + { + "epoch": 0.8259587020648967, + "grad_norm": 0.08837467432022095, + "learning_rate": 0.00017818, + "loss": 0.0418, + "step": 5460 + }, + { + "epoch": 0.8274714469404735, + "grad_norm": 0.06399261206388474, + "learning_rate": 0.00017814000000000003, + "loss": 0.0461, + "step": 5470 + }, + { + "epoch": 0.8289841918160502, + "grad_norm": 0.07160426676273346, + "learning_rate": 0.0001781, + "loss": 0.0384, + "step": 5480 + }, + { + "epoch": 0.8304969366916269, + "grad_norm": 0.06335125118494034, + "learning_rate": 0.00017806, + "loss": 0.04, + "step": 5490 + }, + { + "epoch": 0.8320096815672037, + "grad_norm": 0.10239727795124054, + "learning_rate": 0.00017802, + "loss": 0.0396, + "step": 5500 + }, + { + "epoch": 0.8335224264427804, + "grad_norm": 0.06797724217176437, + "learning_rate": 0.00017798, + "loss": 0.0406, + "step": 5510 + }, + { + "epoch": 0.8350351713183571, + "grad_norm": 0.08448281139135361, + "learning_rate": 0.00017794000000000002, + "loss": 0.0489, + "step": 5520 + }, + { + "epoch": 0.8365479161939339, + "grad_norm": 0.0817868560552597, + "learning_rate": 0.0001779, + "loss": 0.0437, + "step": 5530 + }, + { + "epoch": 0.8380606610695106, + "grad_norm": 0.12232506275177002, + "learning_rate": 0.00017786, + "loss": 0.0475, + "step": 5540 + }, + { + "epoch": 0.8395734059450873, + "grad_norm": 0.0839553102850914, + "learning_rate": 0.00017782, + "loss": 0.0447, + "step": 5550 + }, + { + "epoch": 0.8410861508206641, + "grad_norm": 0.07315023243427277, + "learning_rate": 0.00017778000000000002, + "loss": 0.0441, + "step": 5560 + }, + { + "epoch": 0.8425988956962408, + "grad_norm": 0.07943390309810638, + "learning_rate": 0.00017774000000000002, + "loss": 0.0457, + "step": 5570 + }, + { + "epoch": 0.8441116405718175, + "grad_norm": 0.07185439020395279, + "learning_rate": 0.0001777, + "loss": 0.0429, + "step": 5580 + }, + { + "epoch": 0.8456243854473943, + "grad_norm": 0.06304585933685303, + "learning_rate": 0.00017766, + "loss": 0.046, + "step": 5590 + }, + { + "epoch": 0.847137130322971, + "grad_norm": 0.07005342841148376, + "learning_rate": 0.00017762, + "loss": 0.0359, + "step": 5600 + }, + { + "epoch": 0.847137130322971, + "eval_cer": 0.5003496132017898, + "eval_loss": 0.038213107734918594, + "eval_runtime": 10454.3437, + "eval_samples_per_second": 2.014, + "eval_steps_per_second": 0.252, + "step": 5600 + }, + { + "epoch": 0.8486498751985477, + "grad_norm": 0.08005109429359436, + "learning_rate": 0.00017758000000000002, + "loss": 0.0491, + "step": 5610 + }, + { + "epoch": 0.8501626200741245, + "grad_norm": 0.07554598152637482, + "learning_rate": 0.00017754, + "loss": 0.0384, + "step": 5620 + }, + { + "epoch": 0.8516753649497012, + "grad_norm": 0.08396964520215988, + "learning_rate": 0.0001775, + "loss": 0.0439, + "step": 5630 + }, + { + "epoch": 0.8531881098252779, + "grad_norm": 0.08719771355390549, + "learning_rate": 0.00017746, + "loss": 0.0417, + "step": 5640 + }, + { + "epoch": 0.8547008547008547, + "grad_norm": 0.09563528001308441, + "learning_rate": 0.00017742000000000002, + "loss": 0.0456, + "step": 5650 + }, + { + "epoch": 0.8562135995764314, + "grad_norm": 0.07019315659999847, + "learning_rate": 0.00017738, + "loss": 0.0394, + "step": 5660 + }, + { + "epoch": 0.8577263444520081, + "grad_norm": 0.06756678968667984, + "learning_rate": 0.00017734, + "loss": 0.046, + "step": 5670 + }, + { + "epoch": 0.8592390893275849, + "grad_norm": 0.06660816073417664, + "learning_rate": 0.0001773, + "loss": 0.0415, + "step": 5680 + }, + { + "epoch": 0.8607518342031616, + "grad_norm": 0.10737419873476028, + "learning_rate": 0.00017726, + "loss": 0.0402, + "step": 5690 + }, + { + "epoch": 0.8622645790787383, + "grad_norm": 0.06818167865276337, + "learning_rate": 0.00017722000000000001, + "loss": 0.039, + "step": 5700 + }, + { + "epoch": 0.8637773239543151, + "grad_norm": 0.05077315866947174, + "learning_rate": 0.00017718, + "loss": 0.0376, + "step": 5710 + }, + { + "epoch": 0.8652900688298918, + "grad_norm": 0.08248795568943024, + "learning_rate": 0.00017714000000000003, + "loss": 0.0427, + "step": 5720 + }, + { + "epoch": 0.8668028137054685, + "grad_norm": 0.06273633241653442, + "learning_rate": 0.0001771, + "loss": 0.0405, + "step": 5730 + }, + { + "epoch": 0.8683155585810453, + "grad_norm": 0.11920665949583054, + "learning_rate": 0.00017706000000000002, + "loss": 0.0416, + "step": 5740 + }, + { + "epoch": 0.869828303456622, + "grad_norm": 0.061835162341594696, + "learning_rate": 0.00017702, + "loss": 0.0456, + "step": 5750 + }, + { + "epoch": 0.8713410483321987, + "grad_norm": 0.06891065835952759, + "learning_rate": 0.00017698, + "loss": 0.0435, + "step": 5760 + }, + { + "epoch": 0.8728537932077756, + "grad_norm": 0.06323794275522232, + "learning_rate": 0.00017694000000000002, + "loss": 0.0424, + "step": 5770 + }, + { + "epoch": 0.8743665380833523, + "grad_norm": 0.08218410611152649, + "learning_rate": 0.0001769, + "loss": 0.0428, + "step": 5780 + }, + { + "epoch": 0.875879282958929, + "grad_norm": 0.05943075567483902, + "learning_rate": 0.00017686, + "loss": 0.0373, + "step": 5790 + }, + { + "epoch": 0.8773920278345058, + "grad_norm": 0.09316141158342361, + "learning_rate": 0.00017682, + "loss": 0.0436, + "step": 5800 + }, + { + "epoch": 0.8773920278345058, + "eval_cer": 0.5988355286077488, + "eval_loss": 0.0380551740527153, + "eval_runtime": 10439.6932, + "eval_samples_per_second": 2.017, + "eval_steps_per_second": 0.252, + "step": 5800 + }, + { + "epoch": 0.8789047727100825, + "grad_norm": 0.06791754812002182, + "learning_rate": 0.00017678000000000003, + "loss": 0.0424, + "step": 5810 + }, + { + "epoch": 0.8804175175856592, + "grad_norm": 0.06572896242141724, + "learning_rate": 0.00017674000000000002, + "loss": 0.0446, + "step": 5820 + }, + { + "epoch": 0.881930262461236, + "grad_norm": 0.07208286970853806, + "learning_rate": 0.00017669999999999999, + "loss": 0.0438, + "step": 5830 + }, + { + "epoch": 0.8834430073368127, + "grad_norm": 0.08518756181001663, + "learning_rate": 0.00017666, + "loss": 0.0401, + "step": 5840 + }, + { + "epoch": 0.8849557522123894, + "grad_norm": 0.060736026614904404, + "learning_rate": 0.00017662, + "loss": 0.0393, + "step": 5850 + }, + { + "epoch": 0.8864684970879662, + "grad_norm": 0.0627061128616333, + "learning_rate": 0.00017658000000000002, + "loss": 0.0358, + "step": 5860 + }, + { + "epoch": 0.8879812419635429, + "grad_norm": 0.06178157031536102, + "learning_rate": 0.00017654000000000001, + "loss": 0.0467, + "step": 5870 + }, + { + "epoch": 0.8894939868391196, + "grad_norm": 0.0688227042555809, + "learning_rate": 0.0001765, + "loss": 0.0415, + "step": 5880 + }, + { + "epoch": 0.8910067317146964, + "grad_norm": 0.06773985177278519, + "learning_rate": 0.00017646, + "loss": 0.0354, + "step": 5890 + }, + { + "epoch": 0.8925194765902731, + "grad_norm": 0.09130257368087769, + "learning_rate": 0.00017642, + "loss": 0.0414, + "step": 5900 + }, + { + "epoch": 0.8940322214658498, + "grad_norm": 0.06815651059150696, + "learning_rate": 0.00017638000000000002, + "loss": 0.0495, + "step": 5910 + }, + { + "epoch": 0.8955449663414266, + "grad_norm": 0.07239062339067459, + "learning_rate": 0.00017634, + "loss": 0.0459, + "step": 5920 + }, + { + "epoch": 0.8970577112170033, + "grad_norm": 0.08951979130506516, + "learning_rate": 0.0001763, + "loss": 0.047, + "step": 5930 + }, + { + "epoch": 0.89857045609258, + "grad_norm": 0.07267329841852188, + "learning_rate": 0.00017626, + "loss": 0.0384, + "step": 5940 + }, + { + "epoch": 0.9000832009681567, + "grad_norm": 0.06272245943546295, + "learning_rate": 0.00017622000000000002, + "loss": 0.0373, + "step": 5950 + }, + { + "epoch": 0.9015959458437335, + "grad_norm": 0.07484642416238785, + "learning_rate": 0.00017618, + "loss": 0.0445, + "step": 5960 + }, + { + "epoch": 0.9031086907193102, + "grad_norm": 0.06894571334123611, + "learning_rate": 0.00017614, + "loss": 0.0418, + "step": 5970 + }, + { + "epoch": 0.904621435594887, + "grad_norm": 0.07352825254201889, + "learning_rate": 0.0001761, + "loss": 0.0361, + "step": 5980 + }, + { + "epoch": 0.9061341804704637, + "grad_norm": 0.07955580949783325, + "learning_rate": 0.00017606, + "loss": 0.0418, + "step": 5990 + }, + { + "epoch": 0.9076469253460404, + "grad_norm": 0.057830698788166046, + "learning_rate": 0.00017602, + "loss": 0.0359, + "step": 6000 + }, + { + "epoch": 0.9076469253460404, + "eval_cer": 0.5058427407698408, + "eval_loss": 0.038296379148960114, + "eval_runtime": 10426.1739, + "eval_samples_per_second": 2.019, + "eval_steps_per_second": 0.252, + "step": 6000 + }, + { + "epoch": 0.9091596702216171, + "grad_norm": 0.08560307323932648, + "learning_rate": 0.00017598, + "loss": 0.0465, + "step": 6010 + }, + { + "epoch": 0.9106724150971939, + "grad_norm": 0.06908106803894043, + "learning_rate": 0.00017594000000000003, + "loss": 0.0469, + "step": 6020 + }, + { + "epoch": 0.9121851599727706, + "grad_norm": 0.058405641466379166, + "learning_rate": 0.0001759, + "loss": 0.0459, + "step": 6030 + }, + { + "epoch": 0.9136979048483473, + "grad_norm": 0.06696103513240814, + "learning_rate": 0.00017586000000000001, + "loss": 0.0389, + "step": 6040 + }, + { + "epoch": 0.9152106497239241, + "grad_norm": 0.06927672773599625, + "learning_rate": 0.00017582, + "loss": 0.0369, + "step": 6050 + }, + { + "epoch": 0.9167233945995008, + "grad_norm": 0.11847919970750809, + "learning_rate": 0.00017578, + "loss": 0.0379, + "step": 6060 + }, + { + "epoch": 0.9182361394750775, + "grad_norm": 0.06731213629245758, + "learning_rate": 0.00017574000000000002, + "loss": 0.0492, + "step": 6070 + }, + { + "epoch": 0.9197488843506543, + "grad_norm": 0.06238566339015961, + "learning_rate": 0.0001757, + "loss": 0.0351, + "step": 6080 + }, + { + "epoch": 0.921261629226231, + "grad_norm": 0.07023432850837708, + "learning_rate": 0.00017566, + "loss": 0.0418, + "step": 6090 + }, + { + "epoch": 0.9227743741018077, + "grad_norm": 0.07269687950611115, + "learning_rate": 0.00017562, + "loss": 0.0473, + "step": 6100 + }, + { + "epoch": 0.9242871189773845, + "grad_norm": 0.0714830756187439, + "learning_rate": 0.00017558000000000002, + "loss": 0.0419, + "step": 6110 + }, + { + "epoch": 0.9257998638529612, + "grad_norm": 0.06455916166305542, + "learning_rate": 0.00017554000000000002, + "loss": 0.0386, + "step": 6120 + }, + { + "epoch": 0.9273126087285379, + "grad_norm": 0.0797223374247551, + "learning_rate": 0.0001755, + "loss": 0.0425, + "step": 6130 + }, + { + "epoch": 0.9288253536041147, + "grad_norm": 0.08360251039266586, + "learning_rate": 0.00017546, + "loss": 0.0414, + "step": 6140 + }, + { + "epoch": 0.9303380984796914, + "grad_norm": 0.06491956114768982, + "learning_rate": 0.00017542, + "loss": 0.0367, + "step": 6150 + }, + { + "epoch": 0.9318508433552681, + "grad_norm": 0.06236764043569565, + "learning_rate": 0.00017538000000000002, + "loss": 0.0514, + "step": 6160 + }, + { + "epoch": 0.9333635882308449, + "grad_norm": 0.08555632829666138, + "learning_rate": 0.00017534, + "loss": 0.041, + "step": 6170 + }, + { + "epoch": 0.9348763331064216, + "grad_norm": 0.08949322998523712, + "learning_rate": 0.0001753, + "loss": 0.0462, + "step": 6180 + }, + { + "epoch": 0.9363890779819983, + "grad_norm": 0.07832244038581848, + "learning_rate": 0.00017526, + "loss": 0.0471, + "step": 6190 + }, + { + "epoch": 0.9379018228575751, + "grad_norm": 0.06077546253800392, + "learning_rate": 0.00017522000000000002, + "loss": 0.0457, + "step": 6200 + }, + { + "epoch": 0.9379018228575751, + "eval_cer": 0.3344013213649492, + "eval_loss": 0.03830147907137871, + "eval_runtime": 10461.8882, + "eval_samples_per_second": 2.012, + "eval_steps_per_second": 0.252, + "step": 6200 + }, + { + "epoch": 0.9394145677331518, + "grad_norm": 0.048287175595760345, + "learning_rate": 0.00017518, + "loss": 0.0393, + "step": 6210 + }, + { + "epoch": 0.9409273126087285, + "grad_norm": 0.08072841167449951, + "learning_rate": 0.00017514, + "loss": 0.0447, + "step": 6220 + }, + { + "epoch": 0.9424400574843053, + "grad_norm": 0.07255307585000992, + "learning_rate": 0.0001751, + "loss": 0.0492, + "step": 6230 + }, + { + "epoch": 0.943952802359882, + "grad_norm": 0.05136171355843544, + "learning_rate": 0.00017506, + "loss": 0.0438, + "step": 6240 + }, + { + "epoch": 0.9454655472354587, + "grad_norm": 0.079404316842556, + "learning_rate": 0.00017502000000000001, + "loss": 0.0383, + "step": 6250 + }, + { + "epoch": 0.9469782921110355, + "grad_norm": 0.10744167119264603, + "learning_rate": 0.00017498, + "loss": 0.0406, + "step": 6260 + }, + { + "epoch": 0.9484910369866122, + "grad_norm": 0.09439695626497269, + "learning_rate": 0.00017494, + "loss": 0.0448, + "step": 6270 + }, + { + "epoch": 0.9500037818621889, + "grad_norm": 0.07746788114309311, + "learning_rate": 0.00017490000000000002, + "loss": 0.0425, + "step": 6280 + }, + { + "epoch": 0.9515165267377657, + "grad_norm": 0.161416694521904, + "learning_rate": 0.00017486, + "loss": 0.04, + "step": 6290 + }, + { + "epoch": 0.9530292716133424, + "grad_norm": 0.05279407650232315, + "learning_rate": 0.00017482, + "loss": 0.0387, + "step": 6300 + }, + { + "epoch": 0.9545420164889191, + "grad_norm": 0.06324402987957001, + "learning_rate": 0.00017478, + "loss": 0.0425, + "step": 6310 + }, + { + "epoch": 0.9560547613644959, + "grad_norm": 0.08716294914484024, + "learning_rate": 0.00017474000000000002, + "loss": 0.0436, + "step": 6320 + }, + { + "epoch": 0.9575675062400726, + "grad_norm": 0.08212625980377197, + "learning_rate": 0.00017470000000000002, + "loss": 0.0445, + "step": 6330 + }, + { + "epoch": 0.9590802511156493, + "grad_norm": 0.08856002241373062, + "learning_rate": 0.00017466, + "loss": 0.0385, + "step": 6340 + }, + { + "epoch": 0.960592995991226, + "grad_norm": 0.08907803148031235, + "learning_rate": 0.00017462, + "loss": 0.0451, + "step": 6350 + }, + { + "epoch": 0.9621057408668028, + "grad_norm": 0.053175825625658035, + "learning_rate": 0.00017458, + "loss": 0.0428, + "step": 6360 + }, + { + "epoch": 0.9636184857423795, + "grad_norm": 0.055600494146347046, + "learning_rate": 0.00017454000000000002, + "loss": 0.047, + "step": 6370 + }, + { + "epoch": 0.9651312306179562, + "grad_norm": 0.10455228388309479, + "learning_rate": 0.0001745, + "loss": 0.0517, + "step": 6380 + }, + { + "epoch": 0.966643975493533, + "grad_norm": 0.11780910938978195, + "learning_rate": 0.00017446, + "loss": 0.0414, + "step": 6390 + }, + { + "epoch": 0.9681567203691097, + "grad_norm": 0.12388743460178375, + "learning_rate": 0.00017442, + "loss": 0.0438, + "step": 6400 + }, + { + "epoch": 0.9681567203691097, + "eval_cer": 0.5869913004375724, + "eval_loss": 0.03873522952198982, + "eval_runtime": 10437.6142, + "eval_samples_per_second": 2.017, + "eval_steps_per_second": 0.252, + "step": 6400 + }, + { + "epoch": 0.9696694652446864, + "grad_norm": 0.07916050404310226, + "learning_rate": 0.00017438000000000002, + "loss": 0.0402, + "step": 6410 + }, + { + "epoch": 0.9711822101202632, + "grad_norm": 0.05646761879324913, + "learning_rate": 0.00017434000000000001, + "loss": 0.0425, + "step": 6420 + }, + { + "epoch": 0.9726949549958399, + "grad_norm": 0.08374381810426712, + "learning_rate": 0.0001743, + "loss": 0.041, + "step": 6430 + }, + { + "epoch": 0.9742076998714166, + "grad_norm": 0.06789222359657288, + "learning_rate": 0.00017426, + "loss": 0.0391, + "step": 6440 + }, + { + "epoch": 0.9757204447469934, + "grad_norm": 0.0788172036409378, + "learning_rate": 0.00017422, + "loss": 0.0449, + "step": 6450 + }, + { + "epoch": 0.9772331896225701, + "grad_norm": 0.1257173717021942, + "learning_rate": 0.00017418000000000002, + "loss": 0.0484, + "step": 6460 + }, + { + "epoch": 0.9787459344981468, + "grad_norm": 0.05888710170984268, + "learning_rate": 0.00017414, + "loss": 0.0387, + "step": 6470 + }, + { + "epoch": 0.9802586793737236, + "grad_norm": 0.07102910429239273, + "learning_rate": 0.00017410000000000003, + "loss": 0.0386, + "step": 6480 + }, + { + "epoch": 0.9817714242493003, + "grad_norm": 0.058048397302627563, + "learning_rate": 0.00017406, + "loss": 0.0415, + "step": 6490 + }, + { + "epoch": 0.983284169124877, + "grad_norm": 0.07222626358270645, + "learning_rate": 0.00017402000000000002, + "loss": 0.0378, + "step": 6500 + }, + { + "epoch": 0.9847969140004538, + "grad_norm": 0.06445878744125366, + "learning_rate": 0.00017398, + "loss": 0.0409, + "step": 6510 + }, + { + "epoch": 0.9863096588760305, + "grad_norm": 0.09191201627254486, + "learning_rate": 0.00017394, + "loss": 0.0414, + "step": 6520 + }, + { + "epoch": 0.9878224037516072, + "grad_norm": 0.08073204010725021, + "learning_rate": 0.00017390000000000003, + "loss": 0.0404, + "step": 6530 + }, + { + "epoch": 0.9893351486271841, + "grad_norm": 0.08427068591117859, + "learning_rate": 0.00017386, + "loss": 0.0398, + "step": 6540 + }, + { + "epoch": 0.9908478935027608, + "grad_norm": 0.19870494306087494, + "learning_rate": 0.00017382, + "loss": 0.0388, + "step": 6550 + }, + { + "epoch": 0.9923606383783375, + "grad_norm": 0.34985288977622986, + "learning_rate": 0.00017378, + "loss": 0.051, + "step": 6560 + }, + { + "epoch": 0.9938733832539143, + "grad_norm": 0.12121633440256119, + "learning_rate": 0.00017374000000000003, + "loss": 0.0385, + "step": 6570 + }, + { + "epoch": 0.995386128129491, + "grad_norm": 0.140520840883255, + "learning_rate": 0.00017370000000000002, + "loss": 0.0417, + "step": 6580 + }, + { + "epoch": 0.9968988730050677, + "grad_norm": 0.06655796617269516, + "learning_rate": 0.00017366, + "loss": 0.0394, + "step": 6590 + }, + { + "epoch": 0.9984116178806445, + "grad_norm": 0.07498542964458466, + "learning_rate": 0.00017362, + "loss": 0.0419, + "step": 6600 + }, + { + "epoch": 0.9984116178806445, + "eval_cer": 0.25282902555511905, + "eval_loss": 0.038411665707826614, + "eval_runtime": 10433.3935, + "eval_samples_per_second": 2.018, + "eval_steps_per_second": 0.252, + "step": 6600 + }, + { + "epoch": 0.9999243627562212, + "grad_norm": 0.25646254420280457, + "learning_rate": 0.00017358, + "loss": 0.039, + "step": 6610 + }, + { + "epoch": 1.0014371076317978, + "grad_norm": 0.07744245231151581, + "learning_rate": 0.00017354000000000002, + "loss": 0.0371, + "step": 6620 + }, + { + "epoch": 1.0029498525073746, + "grad_norm": 0.11968632787466049, + "learning_rate": 0.00017350000000000002, + "loss": 0.0303, + "step": 6630 + }, + { + "epoch": 1.0044625973829513, + "grad_norm": 0.07235859334468842, + "learning_rate": 0.00017346, + "loss": 0.0387, + "step": 6640 + }, + { + "epoch": 1.005975342258528, + "grad_norm": 0.12598702311515808, + "learning_rate": 0.00017342, + "loss": 0.0355, + "step": 6650 + }, + { + "epoch": 1.0074880871341048, + "grad_norm": 0.10832694917917252, + "learning_rate": 0.00017338, + "loss": 0.0297, + "step": 6660 + }, + { + "epoch": 1.0090008320096815, + "grad_norm": 0.13988302648067474, + "learning_rate": 0.00017334000000000002, + "loss": 0.0352, + "step": 6670 + }, + { + "epoch": 1.0105135768852582, + "grad_norm": 0.09534142911434174, + "learning_rate": 0.0001733, + "loss": 0.0308, + "step": 6680 + }, + { + "epoch": 1.012026321760835, + "grad_norm": 0.05622931197285652, + "learning_rate": 0.00017326, + "loss": 0.0311, + "step": 6690 + }, + { + "epoch": 1.0135390666364117, + "grad_norm": 0.06480368971824646, + "learning_rate": 0.00017322, + "loss": 0.033, + "step": 6700 + }, + { + "epoch": 1.0150518115119884, + "grad_norm": 0.08531224727630615, + "learning_rate": 0.00017318000000000002, + "loss": 0.0345, + "step": 6710 + }, + { + "epoch": 1.0165645563875652, + "grad_norm": 0.11494185030460358, + "learning_rate": 0.00017314, + "loss": 0.0292, + "step": 6720 + }, + { + "epoch": 1.0180773012631419, + "grad_norm": 0.06993953883647919, + "learning_rate": 0.0001731, + "loss": 0.0343, + "step": 6730 + }, + { + "epoch": 1.0195900461387186, + "grad_norm": 0.09449311345815659, + "learning_rate": 0.00017306, + "loss": 0.0285, + "step": 6740 + }, + { + "epoch": 1.0211027910142954, + "grad_norm": 0.10550418496131897, + "learning_rate": 0.00017302, + "loss": 0.0337, + "step": 6750 + }, + { + "epoch": 1.022615535889872, + "grad_norm": 0.06987041234970093, + "learning_rate": 0.00017298000000000001, + "loss": 0.0273, + "step": 6760 + }, + { + "epoch": 1.0241282807654488, + "grad_norm": 0.08014168590307236, + "learning_rate": 0.00017294, + "loss": 0.0318, + "step": 6770 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 0.04886119067668915, + "learning_rate": 0.00017290000000000003, + "loss": 0.0318, + "step": 6780 + }, + { + "epoch": 1.0271537705166023, + "grad_norm": 0.07735268771648407, + "learning_rate": 0.00017286, + "loss": 0.0377, + "step": 6790 + }, + { + "epoch": 1.028666515392179, + "grad_norm": 0.07365155220031738, + "learning_rate": 0.00017282000000000002, + "loss": 0.0397, + "step": 6800 + }, + { + "epoch": 1.028666515392179, + "eval_cer": 0.5956908628651482, + "eval_loss": 0.03884879872202873, + "eval_runtime": 10443.3198, + "eval_samples_per_second": 2.016, + "eval_steps_per_second": 0.252, + "step": 6800 + }, + { + "epoch": 1.0301792602677557, + "grad_norm": 0.08235965669155121, + "learning_rate": 0.00017278, + "loss": 0.0356, + "step": 6810 + }, + { + "epoch": 1.0316920051433325, + "grad_norm": 0.1203494668006897, + "learning_rate": 0.00017274, + "loss": 0.0391, + "step": 6820 + }, + { + "epoch": 1.0332047500189092, + "grad_norm": 0.059709157794713974, + "learning_rate": 0.00017270000000000002, + "loss": 0.036, + "step": 6830 + }, + { + "epoch": 1.034717494894486, + "grad_norm": 0.08380923420190811, + "learning_rate": 0.00017266, + "loss": 0.0311, + "step": 6840 + }, + { + "epoch": 1.0362302397700627, + "grad_norm": 0.0642111599445343, + "learning_rate": 0.00017262, + "loss": 0.0296, + "step": 6850 + }, + { + "epoch": 1.0377429846456394, + "grad_norm": 0.07701337337493896, + "learning_rate": 0.00017258, + "loss": 0.0318, + "step": 6860 + }, + { + "epoch": 1.0392557295212161, + "grad_norm": 0.09674856811761856, + "learning_rate": 0.00017254000000000003, + "loss": 0.0294, + "step": 6870 + }, + { + "epoch": 1.0407684743967929, + "grad_norm": 0.08543815463781357, + "learning_rate": 0.00017250000000000002, + "loss": 0.0322, + "step": 6880 + }, + { + "epoch": 1.0422812192723696, + "grad_norm": 0.08181754499673843, + "learning_rate": 0.00017246, + "loss": 0.031, + "step": 6890 + }, + { + "epoch": 1.0437939641479463, + "grad_norm": 0.07326922565698624, + "learning_rate": 0.00017242, + "loss": 0.0298, + "step": 6900 + }, + { + "epoch": 1.045306709023523, + "grad_norm": 0.060128018260002136, + "learning_rate": 0.00017238, + "loss": 0.0351, + "step": 6910 + }, + { + "epoch": 1.0468194538990998, + "grad_norm": 0.055250383913517, + "learning_rate": 0.00017234000000000002, + "loss": 0.0322, + "step": 6920 + }, + { + "epoch": 1.0483321987746765, + "grad_norm": 0.07841707766056061, + "learning_rate": 0.00017230000000000001, + "loss": 0.0311, + "step": 6930 + }, + { + "epoch": 1.0498449436502533, + "grad_norm": 0.06094701215624809, + "learning_rate": 0.00017226, + "loss": 0.0331, + "step": 6940 + }, + { + "epoch": 1.0513576885258302, + "grad_norm": 0.0738435760140419, + "learning_rate": 0.00017222, + "loss": 0.0385, + "step": 6950 + }, + { + "epoch": 1.052870433401407, + "grad_norm": 0.0741799846291542, + "learning_rate": 0.00017218, + "loss": 0.0332, + "step": 6960 + }, + { + "epoch": 1.0543831782769837, + "grad_norm": 0.11769600957632065, + "learning_rate": 0.00017214000000000002, + "loss": 0.0288, + "step": 6970 + }, + { + "epoch": 1.0558959231525604, + "grad_norm": 0.05547551065683365, + "learning_rate": 0.0001721, + "loss": 0.0351, + "step": 6980 + }, + { + "epoch": 1.0574086680281372, + "grad_norm": 0.059602439403533936, + "learning_rate": 0.00017206, + "loss": 0.0315, + "step": 6990 + }, + { + "epoch": 1.0589214129037139, + "grad_norm": 0.07523063570261002, + "learning_rate": 0.00017202, + "loss": 0.0344, + "step": 7000 + }, + { + "epoch": 1.0589214129037139, + "eval_cer": 0.06192848124566072, + "eval_loss": 0.03872867301106453, + "eval_runtime": 10423.0915, + "eval_samples_per_second": 2.02, + "eval_steps_per_second": 0.253, + "step": 7000 + }, + { + "epoch": 1.0604341577792906, + "grad_norm": 0.07334991544485092, + "learning_rate": 0.00017198000000000002, + "loss": 0.0394, + "step": 7010 + }, + { + "epoch": 1.0619469026548674, + "grad_norm": 0.08875437080860138, + "learning_rate": 0.00017194, + "loss": 0.0316, + "step": 7020 + }, + { + "epoch": 1.063459647530444, + "grad_norm": 0.06492207199335098, + "learning_rate": 0.0001719, + "loss": 0.0375, + "step": 7030 + }, + { + "epoch": 1.0649723924060208, + "grad_norm": 0.08707519620656967, + "learning_rate": 0.00017186, + "loss": 0.0333, + "step": 7040 + }, + { + "epoch": 1.0664851372815976, + "grad_norm": 0.06477733701467514, + "learning_rate": 0.00017182, + "loss": 0.036, + "step": 7050 + }, + { + "epoch": 1.0679978821571743, + "grad_norm": 0.05914880335330963, + "learning_rate": 0.00017178, + "loss": 0.0307, + "step": 7060 + }, + { + "epoch": 1.069510627032751, + "grad_norm": 0.11167873442173004, + "learning_rate": 0.00017174, + "loss": 0.0355, + "step": 7070 + }, + { + "epoch": 1.0710233719083277, + "grad_norm": 0.08664342761039734, + "learning_rate": 0.00017170000000000003, + "loss": 0.0373, + "step": 7080 + }, + { + "epoch": 1.0725361167839045, + "grad_norm": 0.06912154704332352, + "learning_rate": 0.00017166, + "loss": 0.0283, + "step": 7090 + }, + { + "epoch": 1.0740488616594812, + "grad_norm": 0.09120757132768631, + "learning_rate": 0.00017162000000000001, + "loss": 0.0313, + "step": 7100 + }, + { + "epoch": 1.075561606535058, + "grad_norm": 0.08159112185239792, + "learning_rate": 0.00017158, + "loss": 0.0413, + "step": 7110 + }, + { + "epoch": 1.0770743514106347, + "grad_norm": 0.095944344997406, + "learning_rate": 0.00017154, + "loss": 0.0355, + "step": 7120 + }, + { + "epoch": 1.0785870962862114, + "grad_norm": 0.10682930797338486, + "learning_rate": 0.00017150000000000002, + "loss": 0.0278, + "step": 7130 + }, + { + "epoch": 1.0800998411617881, + "grad_norm": 0.06514004617929459, + "learning_rate": 0.00017146, + "loss": 0.0306, + "step": 7140 + }, + { + "epoch": 1.0816125860373649, + "grad_norm": 0.07849156856536865, + "learning_rate": 0.00017142, + "loss": 0.0379, + "step": 7150 + }, + { + "epoch": 1.0831253309129416, + "grad_norm": 0.0788741260766983, + "learning_rate": 0.00017138, + "loss": 0.032, + "step": 7160 + }, + { + "epoch": 1.0846380757885183, + "grad_norm": 0.10495191067457199, + "learning_rate": 0.00017134000000000002, + "loss": 0.0358, + "step": 7170 + }, + { + "epoch": 1.086150820664095, + "grad_norm": 0.07463409751653671, + "learning_rate": 0.00017130000000000002, + "loss": 0.0356, + "step": 7180 + }, + { + "epoch": 1.0876635655396718, + "grad_norm": 0.08425049483776093, + "learning_rate": 0.00017126, + "loss": 0.0327, + "step": 7190 + }, + { + "epoch": 1.0891763104152485, + "grad_norm": 0.07767146825790405, + "learning_rate": 0.00017122, + "loss": 0.034, + "step": 7200 + }, + { + "epoch": 1.0891763104152485, + "eval_cer": 0.09758161553419167, + "eval_loss": 0.037929706275463104, + "eval_runtime": 10420.1284, + "eval_samples_per_second": 2.02, + "eval_steps_per_second": 0.253, + "step": 7200 + }, + { + "epoch": 1.0906890552908253, + "grad_norm": 0.07770776748657227, + "learning_rate": 0.00017118, + "loss": 0.0321, + "step": 7210 + }, + { + "epoch": 1.092201800166402, + "grad_norm": 0.06977003812789917, + "learning_rate": 0.00017114000000000002, + "loss": 0.0315, + "step": 7220 + }, + { + "epoch": 1.0937145450419787, + "grad_norm": 0.077842116355896, + "learning_rate": 0.0001711, + "loss": 0.0317, + "step": 7230 + }, + { + "epoch": 1.0952272899175555, + "grad_norm": 0.11414997279644012, + "learning_rate": 0.00017106, + "loss": 0.0392, + "step": 7240 + }, + { + "epoch": 1.0967400347931322, + "grad_norm": 0.07568582892417908, + "learning_rate": 0.00017102, + "loss": 0.0369, + "step": 7250 + }, + { + "epoch": 1.098252779668709, + "grad_norm": 0.07864728569984436, + "learning_rate": 0.00017098000000000002, + "loss": 0.038, + "step": 7260 + }, + { + "epoch": 1.0997655245442857, + "grad_norm": 0.0852401927113533, + "learning_rate": 0.00017094, + "loss": 0.0323, + "step": 7270 + }, + { + "epoch": 1.1012782694198624, + "grad_norm": 0.06548303365707397, + "learning_rate": 0.0001709, + "loss": 0.0373, + "step": 7280 + }, + { + "epoch": 1.1027910142954391, + "grad_norm": 0.10153812170028687, + "learning_rate": 0.00017086, + "loss": 0.0321, + "step": 7290 + }, + { + "epoch": 1.1043037591710159, + "grad_norm": 0.09032442420721054, + "learning_rate": 0.00017082, + "loss": 0.0306, + "step": 7300 + }, + { + "epoch": 1.1058165040465926, + "grad_norm": 0.12109789252281189, + "learning_rate": 0.00017078000000000001, + "loss": 0.0355, + "step": 7310 + }, + { + "epoch": 1.1073292489221693, + "grad_norm": 0.08515240997076035, + "learning_rate": 0.00017074, + "loss": 0.0374, + "step": 7320 + }, + { + "epoch": 1.108841993797746, + "grad_norm": 0.06838446855545044, + "learning_rate": 0.0001707, + "loss": 0.0309, + "step": 7330 + }, + { + "epoch": 1.1103547386733228, + "grad_norm": 0.10029911994934082, + "learning_rate": 0.00017066, + "loss": 0.0377, + "step": 7340 + }, + { + "epoch": 1.1118674835488995, + "grad_norm": 0.08499938994646072, + "learning_rate": 0.00017062, + "loss": 0.0317, + "step": 7350 + }, + { + "epoch": 1.1133802284244763, + "grad_norm": 0.10972133278846741, + "learning_rate": 0.00017058, + "loss": 0.0344, + "step": 7360 + }, + { + "epoch": 1.114892973300053, + "grad_norm": 0.06848263740539551, + "learning_rate": 0.00017054, + "loss": 0.0356, + "step": 7370 + }, + { + "epoch": 1.1164057181756297, + "grad_norm": 0.06813491135835648, + "learning_rate": 0.00017050000000000002, + "loss": 0.0291, + "step": 7380 + }, + { + "epoch": 1.1179184630512065, + "grad_norm": 0.053215883672237396, + "learning_rate": 0.00017046, + "loss": 0.0297, + "step": 7390 + }, + { + "epoch": 1.1194312079267832, + "grad_norm": 0.08575928211212158, + "learning_rate": 0.00017042, + "loss": 0.0378, + "step": 7400 + }, + { + "epoch": 1.1194312079267832, + "eval_cer": 0.05163898174846133, + "eval_loss": 0.03768303617835045, + "eval_runtime": 10418.7834, + "eval_samples_per_second": 2.021, + "eval_steps_per_second": 0.253, + "step": 7400 + }, + { + "epoch": 1.12094395280236, + "grad_norm": 0.07621601223945618, + "learning_rate": 0.00017038, + "loss": 0.032, + "step": 7410 + }, + { + "epoch": 1.1224566976779367, + "grad_norm": 0.11499703675508499, + "learning_rate": 0.00017034, + "loss": 0.0331, + "step": 7420 + }, + { + "epoch": 1.1239694425535134, + "grad_norm": 0.08789568394422531, + "learning_rate": 0.00017030000000000002, + "loss": 0.0332, + "step": 7430 + }, + { + "epoch": 1.1254821874290901, + "grad_norm": 0.0887342318892479, + "learning_rate": 0.00017025999999999999, + "loss": 0.0374, + "step": 7440 + }, + { + "epoch": 1.1269949323046669, + "grad_norm": 0.11794856935739517, + "learning_rate": 0.00017022, + "loss": 0.0347, + "step": 7450 + }, + { + "epoch": 1.1285076771802436, + "grad_norm": 0.07593784481287003, + "learning_rate": 0.00017018, + "loss": 0.0323, + "step": 7460 + }, + { + "epoch": 1.1300204220558203, + "grad_norm": 0.06868909299373627, + "learning_rate": 0.00017014000000000002, + "loss": 0.0311, + "step": 7470 + }, + { + "epoch": 1.131533166931397, + "grad_norm": 0.1010032370686531, + "learning_rate": 0.00017010000000000001, + "loss": 0.0333, + "step": 7480 + }, + { + "epoch": 1.1330459118069738, + "grad_norm": 0.08664656430482864, + "learning_rate": 0.00017006, + "loss": 0.0358, + "step": 7490 + }, + { + "epoch": 1.1345586566825505, + "grad_norm": 0.09153386205434799, + "learning_rate": 0.00017002, + "loss": 0.0288, + "step": 7500 + }, + { + "epoch": 1.1360714015581272, + "grad_norm": 0.10042116045951843, + "learning_rate": 0.00016998, + "loss": 0.0324, + "step": 7510 + }, + { + "epoch": 1.137584146433704, + "grad_norm": 0.09703629463911057, + "learning_rate": 0.00016994000000000002, + "loss": 0.0356, + "step": 7520 + }, + { + "epoch": 1.1390968913092807, + "grad_norm": 0.07961410284042358, + "learning_rate": 0.0001699, + "loss": 0.0279, + "step": 7530 + }, + { + "epoch": 1.1406096361848574, + "grad_norm": 0.09164062142372131, + "learning_rate": 0.00016986000000000003, + "loss": 0.033, + "step": 7540 + }, + { + "epoch": 1.1421223810604342, + "grad_norm": 0.0804910659790039, + "learning_rate": 0.00016982, + "loss": 0.033, + "step": 7550 + }, + { + "epoch": 1.143635125936011, + "grad_norm": 0.07923970371484756, + "learning_rate": 0.00016978000000000002, + "loss": 0.0366, + "step": 7560 + }, + { + "epoch": 1.1451478708115876, + "grad_norm": 0.1198810487985611, + "learning_rate": 0.00016974, + "loss": 0.0361, + "step": 7570 + }, + { + "epoch": 1.1466606156871644, + "grad_norm": 0.08409520238637924, + "learning_rate": 0.0001697, + "loss": 0.0323, + "step": 7580 + }, + { + "epoch": 1.148173360562741, + "grad_norm": 0.09524326026439667, + "learning_rate": 0.00016966000000000003, + "loss": 0.0338, + "step": 7590 + }, + { + "epoch": 1.1496861054383178, + "grad_norm": 0.0670013502240181, + "learning_rate": 0.00016962, + "loss": 0.033, + "step": 7600 + }, + { + "epoch": 1.1496861054383178, + "eval_cer": 0.04317970118571997, + "eval_loss": 0.03775278851389885, + "eval_runtime": 10413.2831, + "eval_samples_per_second": 2.022, + "eval_steps_per_second": 0.253, + "step": 7600 + }, + { + "epoch": 1.1511988503138946, + "grad_norm": 0.07331959903240204, + "learning_rate": 0.00016958, + "loss": 0.0331, + "step": 7610 + }, + { + "epoch": 1.1527115951894713, + "grad_norm": 0.06851343810558319, + "learning_rate": 0.00016954, + "loss": 0.0306, + "step": 7620 + }, + { + "epoch": 1.154224340065048, + "grad_norm": 0.07627418637275696, + "learning_rate": 0.00016950000000000003, + "loss": 0.0334, + "step": 7630 + }, + { + "epoch": 1.1557370849406248, + "grad_norm": 0.08676694333553314, + "learning_rate": 0.00016946000000000002, + "loss": 0.0322, + "step": 7640 + }, + { + "epoch": 1.1572498298162015, + "grad_norm": 0.07023747265338898, + "learning_rate": 0.00016942000000000001, + "loss": 0.0358, + "step": 7650 + }, + { + "epoch": 1.1587625746917782, + "grad_norm": 0.07805462926626205, + "learning_rate": 0.00016938, + "loss": 0.0325, + "step": 7660 + }, + { + "epoch": 1.160275319567355, + "grad_norm": 0.0867529958486557, + "learning_rate": 0.00016934, + "loss": 0.0318, + "step": 7670 + }, + { + "epoch": 1.1617880644429317, + "grad_norm": 0.08449842035770416, + "learning_rate": 0.00016930000000000002, + "loss": 0.0408, + "step": 7680 + }, + { + "epoch": 1.1633008093185084, + "grad_norm": 0.08054087311029434, + "learning_rate": 0.00016926000000000002, + "loss": 0.0306, + "step": 7690 + }, + { + "epoch": 1.1648135541940852, + "grad_norm": 0.08645962178707123, + "learning_rate": 0.00016922, + "loss": 0.0299, + "step": 7700 + }, + { + "epoch": 1.166326299069662, + "grad_norm": 0.0892554521560669, + "learning_rate": 0.00016918, + "loss": 0.0352, + "step": 7710 + }, + { + "epoch": 1.1678390439452386, + "grad_norm": 0.06643500924110413, + "learning_rate": 0.00016914, + "loss": 0.0284, + "step": 7720 + }, + { + "epoch": 1.1693517888208154, + "grad_norm": 0.06918591260910034, + "learning_rate": 0.00016910000000000002, + "loss": 0.0278, + "step": 7730 + }, + { + "epoch": 1.170864533696392, + "grad_norm": 0.08370740711688995, + "learning_rate": 0.00016906, + "loss": 0.0316, + "step": 7740 + }, + { + "epoch": 1.1723772785719688, + "grad_norm": 0.053777385503053665, + "learning_rate": 0.00016902, + "loss": 0.036, + "step": 7750 + }, + { + "epoch": 1.1738900234475456, + "grad_norm": 0.0665329247713089, + "learning_rate": 0.00016898, + "loss": 0.0333, + "step": 7760 + }, + { + "epoch": 1.1754027683231223, + "grad_norm": 0.07484222948551178, + "learning_rate": 0.00016894000000000002, + "loss": 0.0319, + "step": 7770 + }, + { + "epoch": 1.176915513198699, + "grad_norm": 0.08218715339899063, + "learning_rate": 0.0001689, + "loss": 0.0308, + "step": 7780 + }, + { + "epoch": 1.1784282580742758, + "grad_norm": 0.06873024255037308, + "learning_rate": 0.00016886, + "loss": 0.0349, + "step": 7790 + }, + { + "epoch": 1.1799410029498525, + "grad_norm": 0.07846609503030777, + "learning_rate": 0.00016882, + "loss": 0.0359, + "step": 7800 + }, + { + "epoch": 1.1799410029498525, + "eval_cer": 0.1078840865459451, + "eval_loss": 0.03878456726670265, + "eval_runtime": 10398.1972, + "eval_samples_per_second": 2.025, + "eval_steps_per_second": 0.253, + "step": 7800 + }, + { + "epoch": 1.1814537478254292, + "grad_norm": 0.06112883612513542, + "learning_rate": 0.00016878, + "loss": 0.0324, + "step": 7810 + }, + { + "epoch": 1.182966492701006, + "grad_norm": 0.07065495103597641, + "learning_rate": 0.00016874000000000001, + "loss": 0.0333, + "step": 7820 + }, + { + "epoch": 1.1844792375765827, + "grad_norm": 0.10944267362356186, + "learning_rate": 0.0001687, + "loss": 0.0322, + "step": 7830 + }, + { + "epoch": 1.1859919824521594, + "grad_norm": 0.08741329610347748, + "learning_rate": 0.00016866000000000003, + "loss": 0.0339, + "step": 7840 + }, + { + "epoch": 1.1875047273277362, + "grad_norm": 0.06457091867923737, + "learning_rate": 0.00016862, + "loss": 0.0345, + "step": 7850 + }, + { + "epoch": 1.1890174722033129, + "grad_norm": 0.0570165179669857, + "learning_rate": 0.00016858000000000002, + "loss": 0.032, + "step": 7860 + }, + { + "epoch": 1.1905302170788896, + "grad_norm": 0.07944530248641968, + "learning_rate": 0.00016854, + "loss": 0.0347, + "step": 7870 + }, + { + "epoch": 1.1920429619544664, + "grad_norm": 0.06981216371059418, + "learning_rate": 0.0001685, + "loss": 0.0329, + "step": 7880 + }, + { + "epoch": 1.193555706830043, + "grad_norm": 0.052252449095249176, + "learning_rate": 0.00016846000000000002, + "loss": 0.0327, + "step": 7890 + }, + { + "epoch": 1.1950684517056198, + "grad_norm": 0.05333190783858299, + "learning_rate": 0.00016842, + "loss": 0.0269, + "step": 7900 + }, + { + "epoch": 1.1965811965811965, + "grad_norm": 0.18012838065624237, + "learning_rate": 0.00016838, + "loss": 0.0324, + "step": 7910 + }, + { + "epoch": 1.1980939414567733, + "grad_norm": 0.06892676651477814, + "learning_rate": 0.00016834, + "loss": 0.0294, + "step": 7920 + }, + { + "epoch": 1.19960668633235, + "grad_norm": 0.07558593899011612, + "learning_rate": 0.00016830000000000003, + "loss": 0.0371, + "step": 7930 + }, + { + "epoch": 1.2011194312079267, + "grad_norm": 0.08046507835388184, + "learning_rate": 0.00016826000000000002, + "loss": 0.0311, + "step": 7940 + }, + { + "epoch": 1.2026321760835035, + "grad_norm": 0.07986424118280411, + "learning_rate": 0.00016822, + "loss": 0.0357, + "step": 7950 + }, + { + "epoch": 1.2041449209590802, + "grad_norm": 0.07394195348024368, + "learning_rate": 0.00016818, + "loss": 0.0341, + "step": 7960 + }, + { + "epoch": 1.205657665834657, + "grad_norm": 0.06269822269678116, + "learning_rate": 0.00016814, + "loss": 0.0329, + "step": 7970 + }, + { + "epoch": 1.2071704107102337, + "grad_norm": 0.07179784774780273, + "learning_rate": 0.00016810000000000002, + "loss": 0.0329, + "step": 7980 + }, + { + "epoch": 1.2086831555858104, + "grad_norm": 0.10174887627363205, + "learning_rate": 0.00016806000000000001, + "loss": 0.0262, + "step": 7990 + }, + { + "epoch": 1.2101959004613871, + "grad_norm": 0.06536643952131271, + "learning_rate": 0.00016802, + "loss": 0.034, + "step": 8000 + }, + { + "epoch": 1.2101959004613871, + "eval_cer": 0.15941559003095868, + "eval_loss": 0.03837862238287926, + "eval_runtime": 10390.1541, + "eval_samples_per_second": 2.026, + "eval_steps_per_second": 0.253, + "step": 8000 + }, + { + "epoch": 1.2117086453369639, + "grad_norm": 0.13079065084457397, + "learning_rate": 0.00016798, + "loss": 0.037, + "step": 8010 + }, + { + "epoch": 1.2132213902125406, + "grad_norm": 0.07293607294559479, + "learning_rate": 0.00016794000000000002, + "loss": 0.0295, + "step": 8020 + }, + { + "epoch": 1.2147341350881173, + "grad_norm": 0.07390507310628891, + "learning_rate": 0.00016790000000000002, + "loss": 0.0309, + "step": 8030 + }, + { + "epoch": 1.216246879963694, + "grad_norm": 0.22675780951976776, + "learning_rate": 0.00016786, + "loss": 0.0341, + "step": 8040 + }, + { + "epoch": 1.2177596248392708, + "grad_norm": 0.06630139797925949, + "learning_rate": 0.00016782, + "loss": 0.0359, + "step": 8050 + }, + { + "epoch": 1.2192723697148475, + "grad_norm": 0.09231210500001907, + "learning_rate": 0.00016778, + "loss": 0.0325, + "step": 8060 + }, + { + "epoch": 1.2207851145904243, + "grad_norm": 0.067893847823143, + "learning_rate": 0.00016774000000000002, + "loss": 0.0338, + "step": 8070 + }, + { + "epoch": 1.222297859466001, + "grad_norm": 0.16284491121768951, + "learning_rate": 0.0001677, + "loss": 0.0362, + "step": 8080 + }, + { + "epoch": 1.2238106043415777, + "grad_norm": 0.07695828378200531, + "learning_rate": 0.00016766, + "loss": 0.0367, + "step": 8090 + }, + { + "epoch": 1.2253233492171545, + "grad_norm": 0.07685229927301407, + "learning_rate": 0.00016762, + "loss": 0.0383, + "step": 8100 + }, + { + "epoch": 1.2268360940927312, + "grad_norm": 0.08510534465312958, + "learning_rate": 0.00016758, + "loss": 0.0346, + "step": 8110 + }, + { + "epoch": 1.228348838968308, + "grad_norm": 0.16018977761268616, + "learning_rate": 0.00016754, + "loss": 0.0314, + "step": 8120 + }, + { + "epoch": 1.2298615838438847, + "grad_norm": 0.10644716769456863, + "learning_rate": 0.0001675, + "loss": 0.0427, + "step": 8130 + }, + { + "epoch": 1.2313743287194614, + "grad_norm": 0.06390608847141266, + "learning_rate": 0.00016746000000000003, + "loss": 0.0333, + "step": 8140 + }, + { + "epoch": 1.2328870735950381, + "grad_norm": 0.1173742264509201, + "learning_rate": 0.00016742, + "loss": 0.0335, + "step": 8150 + }, + { + "epoch": 1.2343998184706149, + "grad_norm": 0.08506636321544647, + "learning_rate": 0.00016738000000000001, + "loss": 0.0393, + "step": 8160 + }, + { + "epoch": 1.2359125633461916, + "grad_norm": 0.08176897466182709, + "learning_rate": 0.00016734, + "loss": 0.0306, + "step": 8170 + }, + { + "epoch": 1.2374253082217683, + "grad_norm": 0.11272590607404709, + "learning_rate": 0.0001673, + "loss": 0.0368, + "step": 8180 + }, + { + "epoch": 1.238938053097345, + "grad_norm": 0.10923430323600769, + "learning_rate": 0.00016726000000000002, + "loss": 0.0389, + "step": 8190 + }, + { + "epoch": 1.2404507979729218, + "grad_norm": 0.05665091425180435, + "learning_rate": 0.00016722, + "loss": 0.0352, + "step": 8200 + }, + { + "epoch": 1.2404507979729218, + "eval_cer": 0.195939668868118, + "eval_loss": 0.03837649151682854, + "eval_runtime": 10379.5895, + "eval_samples_per_second": 2.028, + "eval_steps_per_second": 0.254, + "step": 8200 + }, + { + "epoch": 1.2419635428484985, + "grad_norm": 0.08927123993635178, + "learning_rate": 0.00016718, + "loss": 0.0356, + "step": 8210 + }, + { + "epoch": 1.2434762877240753, + "grad_norm": 0.09398534893989563, + "learning_rate": 0.00016714, + "loss": 0.0365, + "step": 8220 + }, + { + "epoch": 1.244989032599652, + "grad_norm": 0.0905461311340332, + "learning_rate": 0.00016710000000000002, + "loss": 0.0335, + "step": 8230 + }, + { + "epoch": 1.2465017774752287, + "grad_norm": 0.09033455699682236, + "learning_rate": 0.00016706000000000002, + "loss": 0.0376, + "step": 8240 + }, + { + "epoch": 1.2480145223508055, + "grad_norm": 0.08217161148786545, + "learning_rate": 0.00016702, + "loss": 0.032, + "step": 8250 + }, + { + "epoch": 1.2495272672263822, + "grad_norm": 0.0694824755191803, + "learning_rate": 0.00016698, + "loss": 0.0354, + "step": 8260 + }, + { + "epoch": 1.2510400121019591, + "grad_norm": 0.08535374701023102, + "learning_rate": 0.00016694, + "loss": 0.0288, + "step": 8270 + }, + { + "epoch": 1.2525527569775359, + "grad_norm": 0.10267391055822372, + "learning_rate": 0.00016690000000000002, + "loss": 0.0331, + "step": 8280 + }, + { + "epoch": 1.2540655018531126, + "grad_norm": 0.0720328763127327, + "learning_rate": 0.00016686, + "loss": 0.0324, + "step": 8290 + }, + { + "epoch": 1.2555782467286893, + "grad_norm": 0.15617039799690247, + "learning_rate": 0.00016682, + "loss": 0.0374, + "step": 8300 + }, + { + "epoch": 1.257090991604266, + "grad_norm": 0.09863468259572983, + "learning_rate": 0.00016678, + "loss": 0.0363, + "step": 8310 + }, + { + "epoch": 1.2586037364798428, + "grad_norm": 0.08562877029180527, + "learning_rate": 0.00016674000000000002, + "loss": 0.0347, + "step": 8320 + }, + { + "epoch": 1.2601164813554195, + "grad_norm": 0.09868349879980087, + "learning_rate": 0.0001667, + "loss": 0.0362, + "step": 8330 + }, + { + "epoch": 1.2616292262309963, + "grad_norm": 0.09744835644960403, + "learning_rate": 0.00016666, + "loss": 0.0364, + "step": 8340 + }, + { + "epoch": 1.263141971106573, + "grad_norm": 0.19243358075618744, + "learning_rate": 0.00016662, + "loss": 0.0378, + "step": 8350 + }, + { + "epoch": 1.2646547159821497, + "grad_norm": 0.06478457897901535, + "learning_rate": 0.00016658, + "loss": 0.033, + "step": 8360 + }, + { + "epoch": 1.2661674608577265, + "grad_norm": 0.09313791990280151, + "learning_rate": 0.00016654000000000001, + "loss": 0.04, + "step": 8370 + }, + { + "epoch": 1.2676802057333032, + "grad_norm": 0.0906825065612793, + "learning_rate": 0.0001665, + "loss": 0.0341, + "step": 8380 + }, + { + "epoch": 1.26919295060888, + "grad_norm": 0.08549359440803528, + "learning_rate": 0.00016646000000000003, + "loss": 0.0376, + "step": 8390 + }, + { + "epoch": 1.2707056954844567, + "grad_norm": 0.0915452241897583, + "learning_rate": 0.00016642, + "loss": 0.029, + "step": 8400 + }, + { + "epoch": 1.2707056954844567, + "eval_cer": 0.19141261028875828, + "eval_loss": 0.03777679055929184, + "eval_runtime": 10360.722, + "eval_samples_per_second": 2.032, + "eval_steps_per_second": 0.254, + "step": 8400 + }, + { + "epoch": 1.2722184403600334, + "grad_norm": 0.07039971649646759, + "learning_rate": 0.00016638, + "loss": 0.0355, + "step": 8410 + }, + { + "epoch": 1.2737311852356101, + "grad_norm": 0.08890164643526077, + "learning_rate": 0.00016634, + "loss": 0.03, + "step": 8420 + }, + { + "epoch": 1.2752439301111869, + "grad_norm": 0.07611805945634842, + "learning_rate": 0.0001663, + "loss": 0.037, + "step": 8430 + }, + { + "epoch": 1.2767566749867636, + "grad_norm": 0.10268427431583405, + "learning_rate": 0.00016626000000000002, + "loss": 0.0346, + "step": 8440 + }, + { + "epoch": 1.2782694198623403, + "grad_norm": 0.07185817509889603, + "learning_rate": 0.00016622, + "loss": 0.0334, + "step": 8450 + }, + { + "epoch": 1.279782164737917, + "grad_norm": 0.09720634669065475, + "learning_rate": 0.00016618, + "loss": 0.0328, + "step": 8460 + }, + { + "epoch": 1.2812949096134938, + "grad_norm": 0.08373324573040009, + "learning_rate": 0.00016614, + "loss": 0.0342, + "step": 8470 + }, + { + "epoch": 1.2828076544890705, + "grad_norm": 0.05525701493024826, + "learning_rate": 0.0001661, + "loss": 0.0295, + "step": 8480 + }, + { + "epoch": 1.2843203993646473, + "grad_norm": 0.08398504555225372, + "learning_rate": 0.00016606000000000002, + "loss": 0.0336, + "step": 8490 + }, + { + "epoch": 1.285833144240224, + "grad_norm": 0.11384329944849014, + "learning_rate": 0.00016601999999999999, + "loss": 0.0335, + "step": 8500 + }, + { + "epoch": 1.2873458891158007, + "grad_norm": 0.05366117134690285, + "learning_rate": 0.00016598, + "loss": 0.0303, + "step": 8510 + }, + { + "epoch": 1.2888586339913775, + "grad_norm": 0.09270923584699631, + "learning_rate": 0.00016594, + "loss": 0.0309, + "step": 8520 + }, + { + "epoch": 1.2903713788669542, + "grad_norm": 0.09621911495923996, + "learning_rate": 0.00016590000000000002, + "loss": 0.0326, + "step": 8530 + }, + { + "epoch": 1.291884123742531, + "grad_norm": 0.09750113636255264, + "learning_rate": 0.00016586000000000001, + "loss": 0.032, + "step": 8540 + }, + { + "epoch": 1.2933968686181077, + "grad_norm": 0.08557499945163727, + "learning_rate": 0.00016582, + "loss": 0.0331, + "step": 8550 + }, + { + "epoch": 1.2949096134936844, + "grad_norm": 0.0842200294137001, + "learning_rate": 0.00016578, + "loss": 0.0339, + "step": 8560 + }, + { + "epoch": 1.2964223583692611, + "grad_norm": 0.06341574341058731, + "learning_rate": 0.00016574, + "loss": 0.0369, + "step": 8570 + }, + { + "epoch": 1.2979351032448379, + "grad_norm": 0.07687686383724213, + "learning_rate": 0.00016570000000000002, + "loss": 0.0291, + "step": 8580 + }, + { + "epoch": 1.2994478481204146, + "grad_norm": 0.07118263840675354, + "learning_rate": 0.00016566, + "loss": 0.0331, + "step": 8590 + }, + { + "epoch": 1.3009605929959913, + "grad_norm": 0.10967772454023361, + "learning_rate": 0.00016562, + "loss": 0.04, + "step": 8600 + }, + { + "epoch": 1.3009605929959913, + "eval_cer": 0.15955704130871465, + "eval_loss": 0.03786647692322731, + "eval_runtime": 10383.8112, + "eval_samples_per_second": 2.027, + "eval_steps_per_second": 0.253, + "step": 8600 + }, + { + "epoch": 1.302473337871568, + "grad_norm": 0.09102348983287811, + "learning_rate": 0.00016558, + "loss": 0.0337, + "step": 8610 + }, + { + "epoch": 1.3039860827471448, + "grad_norm": 0.0596625916659832, + "learning_rate": 0.00016554000000000002, + "loss": 0.0341, + "step": 8620 + }, + { + "epoch": 1.3054988276227215, + "grad_norm": 0.0790410116314888, + "learning_rate": 0.0001655, + "loss": 0.0348, + "step": 8630 + }, + { + "epoch": 1.3070115724982982, + "grad_norm": 0.08243832737207413, + "learning_rate": 0.00016546, + "loss": 0.0351, + "step": 8640 + }, + { + "epoch": 1.308524317373875, + "grad_norm": 0.07890262454748154, + "learning_rate": 0.00016542, + "loss": 0.0331, + "step": 8650 + }, + { + "epoch": 1.3100370622494517, + "grad_norm": 0.06424404680728912, + "learning_rate": 0.00016538, + "loss": 0.032, + "step": 8660 + }, + { + "epoch": 1.3115498071250284, + "grad_norm": 0.08828658610582352, + "learning_rate": 0.00016534, + "loss": 0.0351, + "step": 8670 + }, + { + "epoch": 1.3130625520006052, + "grad_norm": 0.07190482318401337, + "learning_rate": 0.0001653, + "loss": 0.0334, + "step": 8680 + }, + { + "epoch": 1.314575296876182, + "grad_norm": 0.1207108125090599, + "learning_rate": 0.00016526000000000003, + "loss": 0.0333, + "step": 8690 + }, + { + "epoch": 1.3160880417517586, + "grad_norm": 0.057197410613298416, + "learning_rate": 0.00016522, + "loss": 0.0273, + "step": 8700 + }, + { + "epoch": 1.3176007866273354, + "grad_norm": 0.0845530703663826, + "learning_rate": 0.00016518000000000001, + "loss": 0.0398, + "step": 8710 + }, + { + "epoch": 1.319113531502912, + "grad_norm": 0.07357069104909897, + "learning_rate": 0.00016514, + "loss": 0.0334, + "step": 8720 + }, + { + "epoch": 1.3206262763784888, + "grad_norm": 0.07419273257255554, + "learning_rate": 0.0001651, + "loss": 0.0267, + "step": 8730 + }, + { + "epoch": 1.3221390212540656, + "grad_norm": 0.08293847739696503, + "learning_rate": 0.00016506000000000002, + "loss": 0.0286, + "step": 8740 + }, + { + "epoch": 1.3236517661296423, + "grad_norm": 0.09437254071235657, + "learning_rate": 0.00016502, + "loss": 0.0411, + "step": 8750 + }, + { + "epoch": 1.325164511005219, + "grad_norm": 0.06988554447889328, + "learning_rate": 0.00016498, + "loss": 0.0288, + "step": 8760 + }, + { + "epoch": 1.3266772558807958, + "grad_norm": 0.11081293970346451, + "learning_rate": 0.00016494, + "loss": 0.0342, + "step": 8770 + }, + { + "epoch": 1.3281900007563725, + "grad_norm": 0.0911073237657547, + "learning_rate": 0.0001649, + "loss": 0.0324, + "step": 8780 + }, + { + "epoch": 1.3297027456319492, + "grad_norm": 0.08337673544883728, + "learning_rate": 0.00016486000000000002, + "loss": 0.0297, + "step": 8790 + }, + { + "epoch": 1.331215490507526, + "grad_norm": 0.09077824652194977, + "learning_rate": 0.00016482, + "loss": 0.0319, + "step": 8800 + }, + { + "epoch": 1.331215490507526, + "eval_cer": 0.050760007214632856, + "eval_loss": 0.03842457756400108, + "eval_runtime": 10378.6583, + "eval_samples_per_second": 2.028, + "eval_steps_per_second": 0.254, + "step": 8800 + }, + { + "epoch": 1.3327282353831027, + "grad_norm": 0.12336084991693497, + "learning_rate": 0.00016478, + "loss": 0.0371, + "step": 8810 + }, + { + "epoch": 1.3342409802586794, + "grad_norm": 0.07978357374668121, + "learning_rate": 0.00016474, + "loss": 0.0349, + "step": 8820 + }, + { + "epoch": 1.3357537251342562, + "grad_norm": 0.1073361411690712, + "learning_rate": 0.00016470000000000002, + "loss": 0.0417, + "step": 8830 + }, + { + "epoch": 1.337266470009833, + "grad_norm": 0.05822708085179329, + "learning_rate": 0.00016466, + "loss": 0.0302, + "step": 8840 + }, + { + "epoch": 1.3387792148854096, + "grad_norm": 0.06241593137383461, + "learning_rate": 0.00016462, + "loss": 0.0365, + "step": 8850 + }, + { + "epoch": 1.3402919597609864, + "grad_norm": 0.10107123106718063, + "learning_rate": 0.00016458, + "loss": 0.0345, + "step": 8860 + }, + { + "epoch": 1.341804704636563, + "grad_norm": 0.09659604728221893, + "learning_rate": 0.00016454, + "loss": 0.0324, + "step": 8870 + }, + { + "epoch": 1.3433174495121398, + "grad_norm": 0.07501540333032608, + "learning_rate": 0.00016450000000000001, + "loss": 0.0317, + "step": 8880 + }, + { + "epoch": 1.3448301943877166, + "grad_norm": 0.071120485663414, + "learning_rate": 0.00016446, + "loss": 0.0299, + "step": 8890 + }, + { + "epoch": 1.3463429392632933, + "grad_norm": 0.07235920429229736, + "learning_rate": 0.00016442000000000003, + "loss": 0.0337, + "step": 8900 + }, + { + "epoch": 1.34785568413887, + "grad_norm": 0.08588097244501114, + "learning_rate": 0.00016438, + "loss": 0.0302, + "step": 8910 + }, + { + "epoch": 1.3493684290144468, + "grad_norm": 0.052244190126657486, + "learning_rate": 0.00016434000000000002, + "loss": 0.0326, + "step": 8920 + }, + { + "epoch": 1.3508811738900235, + "grad_norm": 0.0702931210398674, + "learning_rate": 0.0001643, + "loss": 0.0372, + "step": 8930 + }, + { + "epoch": 1.3523939187656002, + "grad_norm": 0.10441485792398453, + "learning_rate": 0.00016426, + "loss": 0.037, + "step": 8940 + }, + { + "epoch": 1.353906663641177, + "grad_norm": 0.10514800250530243, + "learning_rate": 0.00016422000000000002, + "loss": 0.037, + "step": 8950 + }, + { + "epoch": 1.3554194085167537, + "grad_norm": 0.07011867314577103, + "learning_rate": 0.00016418, + "loss": 0.0314, + "step": 8960 + }, + { + "epoch": 1.3569321533923304, + "grad_norm": 0.06335943937301636, + "learning_rate": 0.00016414, + "loss": 0.0311, + "step": 8970 + }, + { + "epoch": 1.3584448982679072, + "grad_norm": 0.07194424420595169, + "learning_rate": 0.0001641, + "loss": 0.0336, + "step": 8980 + }, + { + "epoch": 1.3599576431434839, + "grad_norm": 0.07171431183815002, + "learning_rate": 0.00016406000000000003, + "loss": 0.0312, + "step": 8990 + }, + { + "epoch": 1.3614703880190606, + "grad_norm": 0.14893119037151337, + "learning_rate": 0.00016402000000000002, + "loss": 0.0348, + "step": 9000 + }, + { + "epoch": 1.3614703880190606, + "eval_cer": 0.23852391576669063, + "eval_loss": 0.03737874701619148, + "eval_runtime": 10378.6671, + "eval_samples_per_second": 2.028, + "eval_steps_per_second": 0.254, + "step": 9000 + }, + { + "epoch": 1.3629831328946374, + "grad_norm": 0.09854207932949066, + "learning_rate": 0.00016398, + "loss": 0.0334, + "step": 9010 + }, + { + "epoch": 1.364495877770214, + "grad_norm": 0.0829731673002243, + "learning_rate": 0.00016394, + "loss": 0.0367, + "step": 9020 + }, + { + "epoch": 1.3660086226457908, + "grad_norm": 0.05378841981291771, + "learning_rate": 0.0001639, + "loss": 0.0328, + "step": 9030 + }, + { + "epoch": 1.3675213675213675, + "grad_norm": 0.08590775728225708, + "learning_rate": 0.00016386000000000002, + "loss": 0.0337, + "step": 9040 + }, + { + "epoch": 1.3690341123969443, + "grad_norm": 0.06473217159509659, + "learning_rate": 0.00016382000000000001, + "loss": 0.0309, + "step": 9050 + }, + { + "epoch": 1.370546857272521, + "grad_norm": 0.14496292173862457, + "learning_rate": 0.00016378, + "loss": 0.0362, + "step": 9060 + }, + { + "epoch": 1.3720596021480977, + "grad_norm": 0.0658840760588646, + "learning_rate": 0.00016374, + "loss": 0.0316, + "step": 9070 + }, + { + "epoch": 1.3735723470236745, + "grad_norm": 0.0722692534327507, + "learning_rate": 0.00016370000000000002, + "loss": 0.0321, + "step": 9080 + }, + { + "epoch": 1.3750850918992512, + "grad_norm": 0.0751873180270195, + "learning_rate": 0.00016366000000000002, + "loss": 0.0357, + "step": 9090 + }, + { + "epoch": 1.376597836774828, + "grad_norm": 0.07309116423130035, + "learning_rate": 0.00016362, + "loss": 0.0329, + "step": 9100 + }, + { + "epoch": 1.3781105816504047, + "grad_norm": 0.09205902367830276, + "learning_rate": 0.00016358, + "loss": 0.0311, + "step": 9110 + }, + { + "epoch": 1.3796233265259814, + "grad_norm": 0.06787604093551636, + "learning_rate": 0.00016354, + "loss": 0.0308, + "step": 9120 + }, + { + "epoch": 1.3811360714015581, + "grad_norm": 0.08365906029939651, + "learning_rate": 0.00016350000000000002, + "loss": 0.0344, + "step": 9130 + }, + { + "epoch": 1.3826488162771349, + "grad_norm": 0.07461418211460114, + "learning_rate": 0.00016346, + "loss": 0.0286, + "step": 9140 + }, + { + "epoch": 1.3841615611527116, + "grad_norm": 0.11862760782241821, + "learning_rate": 0.00016342, + "loss": 0.0361, + "step": 9150 + }, + { + "epoch": 1.3856743060282883, + "grad_norm": 0.07170487195253372, + "learning_rate": 0.00016338, + "loss": 0.0335, + "step": 9160 + }, + { + "epoch": 1.387187050903865, + "grad_norm": 0.05578533932566643, + "learning_rate": 0.00016334, + "loss": 0.0311, + "step": 9170 + }, + { + "epoch": 1.3886997957794418, + "grad_norm": 0.08838359266519547, + "learning_rate": 0.0001633, + "loss": 0.0341, + "step": 9180 + }, + { + "epoch": 1.3902125406550185, + "grad_norm": 0.09284081310033798, + "learning_rate": 0.00016326, + "loss": 0.0322, + "step": 9190 + }, + { + "epoch": 1.3917252855305953, + "grad_norm": 0.07425800710916519, + "learning_rate": 0.00016322000000000003, + "loss": 0.0319, + "step": 9200 + }, + { + "epoch": 1.3917252855305953, + "eval_cer": 0.11210909414354649, + "eval_loss": 0.036687206476926804, + "eval_runtime": 10439.2076, + "eval_samples_per_second": 2.017, + "eval_steps_per_second": 0.252, + "step": 9200 + }, + { + "epoch": 1.393238030406172, + "grad_norm": 0.0754477009177208, + "learning_rate": 0.00016318, + "loss": 0.0355, + "step": 9210 + }, + { + "epoch": 1.3947507752817487, + "grad_norm": 0.06408898532390594, + "learning_rate": 0.00016314, + "loss": 0.0345, + "step": 9220 + }, + { + "epoch": 1.3962635201573255, + "grad_norm": 0.06003674492239952, + "learning_rate": 0.0001631, + "loss": 0.0316, + "step": 9230 + }, + { + "epoch": 1.3977762650329022, + "grad_norm": 0.07409165799617767, + "learning_rate": 0.00016306, + "loss": 0.03, + "step": 9240 + }, + { + "epoch": 1.399289009908479, + "grad_norm": 0.07411226630210876, + "learning_rate": 0.00016302000000000002, + "loss": 0.0325, + "step": 9250 + }, + { + "epoch": 1.4008017547840557, + "grad_norm": 0.09041300415992737, + "learning_rate": 0.00016298, + "loss": 0.034, + "step": 9260 + }, + { + "epoch": 1.4023144996596324, + "grad_norm": 0.0684356689453125, + "learning_rate": 0.00016294, + "loss": 0.0345, + "step": 9270 + }, + { + "epoch": 1.4038272445352091, + "grad_norm": 0.08621818572282791, + "learning_rate": 0.0001629, + "loss": 0.0287, + "step": 9280 + }, + { + "epoch": 1.4053399894107859, + "grad_norm": 0.09592179954051971, + "learning_rate": 0.00016286000000000002, + "loss": 0.0371, + "step": 9290 + }, + { + "epoch": 1.4068527342863626, + "grad_norm": 0.061489395797252655, + "learning_rate": 0.00016282000000000002, + "loss": 0.0297, + "step": 9300 + }, + { + "epoch": 1.4083654791619393, + "grad_norm": 0.08933687955141068, + "learning_rate": 0.00016278, + "loss": 0.0329, + "step": 9310 + }, + { + "epoch": 1.409878224037516, + "grad_norm": 0.06542832404375076, + "learning_rate": 0.00016274, + "loss": 0.0359, + "step": 9320 + }, + { + "epoch": 1.4113909689130928, + "grad_norm": 0.10515543818473816, + "learning_rate": 0.0001627, + "loss": 0.0282, + "step": 9330 + }, + { + "epoch": 1.4129037137886695, + "grad_norm": 0.11535684019327164, + "learning_rate": 0.00016266000000000002, + "loss": 0.0346, + "step": 9340 + }, + { + "epoch": 1.4144164586642463, + "grad_norm": 0.10359009355306625, + "learning_rate": 0.00016262, + "loss": 0.0326, + "step": 9350 + }, + { + "epoch": 1.415929203539823, + "grad_norm": 0.08905740082263947, + "learning_rate": 0.00016258, + "loss": 0.0353, + "step": 9360 + }, + { + "epoch": 1.4174419484153997, + "grad_norm": 0.0570446141064167, + "learning_rate": 0.00016254, + "loss": 0.0282, + "step": 9370 + }, + { + "epoch": 1.4189546932909765, + "grad_norm": 0.0748140960931778, + "learning_rate": 0.00016250000000000002, + "loss": 0.0304, + "step": 9380 + }, + { + "epoch": 1.4204674381665532, + "grad_norm": 0.07355400919914246, + "learning_rate": 0.00016246, + "loss": 0.031, + "step": 9390 + }, + { + "epoch": 1.42198018304213, + "grad_norm": 0.09431416541337967, + "learning_rate": 0.00016242, + "loss": 0.0355, + "step": 9400 + }, + { + "epoch": 1.42198018304213, + "eval_cer": 0.09460805024547048, + "eval_loss": 0.03653513640165329, + "eval_runtime": 10519.6629, + "eval_samples_per_second": 2.001, + "eval_steps_per_second": 0.25, + "step": 9400 + }, + { + "epoch": 1.4234929279177067, + "grad_norm": 0.10641132295131683, + "learning_rate": 0.00016238, + "loss": 0.0299, + "step": 9410 + }, + { + "epoch": 1.4250056727932834, + "grad_norm": 0.051270656287670135, + "learning_rate": 0.00016234, + "loss": 0.0317, + "step": 9420 + }, + { + "epoch": 1.4265184176688601, + "grad_norm": 0.07362283766269684, + "learning_rate": 0.00016230000000000001, + "loss": 0.0269, + "step": 9430 + }, + { + "epoch": 1.4280311625444368, + "grad_norm": 0.060159552842378616, + "learning_rate": 0.00016226, + "loss": 0.0335, + "step": 9440 + }, + { + "epoch": 1.4295439074200136, + "grad_norm": 0.08667318522930145, + "learning_rate": 0.00016222000000000003, + "loss": 0.0361, + "step": 9450 + }, + { + "epoch": 1.4310566522955903, + "grad_norm": 0.06154588237404823, + "learning_rate": 0.00016218, + "loss": 0.0334, + "step": 9460 + }, + { + "epoch": 1.432569397171167, + "grad_norm": 0.10563425719738007, + "learning_rate": 0.00016214000000000002, + "loss": 0.0362, + "step": 9470 + }, + { + "epoch": 1.4340821420467438, + "grad_norm": 0.10325556248426437, + "learning_rate": 0.0001621, + "loss": 0.0343, + "step": 9480 + }, + { + "epoch": 1.4355948869223205, + "grad_norm": 0.08902329206466675, + "learning_rate": 0.00016206, + "loss": 0.032, + "step": 9490 + }, + { + "epoch": 1.4371076317978972, + "grad_norm": 0.07280543446540833, + "learning_rate": 0.00016202000000000002, + "loss": 0.0366, + "step": 9500 + }, + { + "epoch": 1.438620376673474, + "grad_norm": 0.09071139991283417, + "learning_rate": 0.00016198, + "loss": 0.0299, + "step": 9510 + }, + { + "epoch": 1.4401331215490507, + "grad_norm": 0.06658421456813812, + "learning_rate": 0.00016194, + "loss": 0.0281, + "step": 9520 + }, + { + "epoch": 1.4416458664246274, + "grad_norm": 0.0793207511305809, + "learning_rate": 0.0001619, + "loss": 0.0292, + "step": 9530 + }, + { + "epoch": 1.4431586113002042, + "grad_norm": 0.0829392522573471, + "learning_rate": 0.00016186, + "loss": 0.0337, + "step": 9540 + }, + { + "epoch": 1.444671356175781, + "grad_norm": 0.061817191541194916, + "learning_rate": 0.00016182000000000002, + "loss": 0.0298, + "step": 9550 + }, + { + "epoch": 1.4461841010513576, + "grad_norm": 0.09837779402732849, + "learning_rate": 0.00016177999999999999, + "loss": 0.037, + "step": 9560 + }, + { + "epoch": 1.4476968459269344, + "grad_norm": 0.05777046084403992, + "learning_rate": 0.00016174, + "loss": 0.0339, + "step": 9570 + }, + { + "epoch": 1.449209590802511, + "grad_norm": 0.07731931656599045, + "learning_rate": 0.0001617, + "loss": 0.0338, + "step": 9580 + }, + { + "epoch": 1.4507223356780878, + "grad_norm": 0.08898504078388214, + "learning_rate": 0.00016166000000000002, + "loss": 0.0358, + "step": 9590 + }, + { + "epoch": 1.4522350805536646, + "grad_norm": 0.0696534812450409, + "learning_rate": 0.00016162000000000001, + "loss": 0.0318, + "step": 9600 + }, + { + "epoch": 1.4522350805536646, + "eval_cer": 0.08453906649568975, + "eval_loss": 0.036363635212183, + "eval_runtime": 10514.0599, + "eval_samples_per_second": 2.002, + "eval_steps_per_second": 0.25, + "step": 9600 + }, + { + "epoch": 1.4537478254292413, + "grad_norm": 0.059242941439151764, + "learning_rate": 0.00016158, + "loss": 0.0313, + "step": 9610 + }, + { + "epoch": 1.455260570304818, + "grad_norm": 0.0844852551817894, + "learning_rate": 0.00016154, + "loss": 0.034, + "step": 9620 + }, + { + "epoch": 1.4567733151803948, + "grad_norm": 0.08737514168024063, + "learning_rate": 0.0001615, + "loss": 0.0314, + "step": 9630 + }, + { + "epoch": 1.4582860600559715, + "grad_norm": 0.08028477430343628, + "learning_rate": 0.00016146000000000002, + "loss": 0.028, + "step": 9640 + }, + { + "epoch": 1.4597988049315482, + "grad_norm": 0.08293917775154114, + "learning_rate": 0.00016142, + "loss": 0.0344, + "step": 9650 + }, + { + "epoch": 1.461311549807125, + "grad_norm": 0.07055462896823883, + "learning_rate": 0.00016138, + "loss": 0.0329, + "step": 9660 + }, + { + "epoch": 1.4628242946827017, + "grad_norm": 0.08431320637464523, + "learning_rate": 0.00016134, + "loss": 0.0313, + "step": 9670 + }, + { + "epoch": 1.4643370395582784, + "grad_norm": 0.09756868332624435, + "learning_rate": 0.00016130000000000002, + "loss": 0.0305, + "step": 9680 + }, + { + "epoch": 1.4658497844338552, + "grad_norm": 0.07265082001686096, + "learning_rate": 0.00016126, + "loss": 0.0333, + "step": 9690 + }, + { + "epoch": 1.467362529309432, + "grad_norm": 0.09156455099582672, + "learning_rate": 0.00016122, + "loss": 0.0356, + "step": 9700 + }, + { + "epoch": 1.4688752741850086, + "grad_norm": 0.06957582384347916, + "learning_rate": 0.00016118, + "loss": 0.0313, + "step": 9710 + }, + { + "epoch": 1.4703880190605854, + "grad_norm": 0.06783420592546463, + "learning_rate": 0.00016114, + "loss": 0.0297, + "step": 9720 + }, + { + "epoch": 1.471900763936162, + "grad_norm": 0.07193417102098465, + "learning_rate": 0.0001611, + "loss": 0.0302, + "step": 9730 + }, + { + "epoch": 1.4734135088117388, + "grad_norm": 0.08238872140645981, + "learning_rate": 0.00016106, + "loss": 0.0335, + "step": 9740 + }, + { + "epoch": 1.4749262536873156, + "grad_norm": 0.07197025418281555, + "learning_rate": 0.00016102000000000003, + "loss": 0.0369, + "step": 9750 + }, + { + "epoch": 1.4764389985628923, + "grad_norm": 0.08109525591135025, + "learning_rate": 0.00016098, + "loss": 0.0327, + "step": 9760 + }, + { + "epoch": 1.477951743438469, + "grad_norm": 0.12331151217222214, + "learning_rate": 0.00016094000000000001, + "loss": 0.0372, + "step": 9770 + }, + { + "epoch": 1.4794644883140458, + "grad_norm": 0.08190298080444336, + "learning_rate": 0.0001609, + "loss": 0.0293, + "step": 9780 + }, + { + "epoch": 1.4809772331896225, + "grad_norm": 0.05840008333325386, + "learning_rate": 0.00016086, + "loss": 0.0349, + "step": 9790 + }, + { + "epoch": 1.4824899780651992, + "grad_norm": 0.07874023169279099, + "learning_rate": 0.00016082000000000002, + "loss": 0.0322, + "step": 9800 + }, + { + "epoch": 1.4824899780651992, + "eval_cer": 0.24973192203254985, + "eval_loss": 0.036100711673498154, + "eval_runtime": 10381.657, + "eval_samples_per_second": 2.028, + "eval_steps_per_second": 0.254, + "step": 9800 + }, + { + "epoch": 1.484002722940776, + "grad_norm": 0.0776941329240799, + "learning_rate": 0.00016078, + "loss": 0.0358, + "step": 9810 + }, + { + "epoch": 1.4855154678163527, + "grad_norm": 0.12248267233371735, + "learning_rate": 0.00016074, + "loss": 0.0356, + "step": 9820 + }, + { + "epoch": 1.4870282126919294, + "grad_norm": 0.08847146481275558, + "learning_rate": 0.0001607, + "loss": 0.0274, + "step": 9830 + }, + { + "epoch": 1.4885409575675062, + "grad_norm": 0.0689850002527237, + "learning_rate": 0.00016066000000000002, + "loss": 0.0266, + "step": 9840 + }, + { + "epoch": 1.4900537024430829, + "grad_norm": 0.06342552602291107, + "learning_rate": 0.00016062000000000002, + "loss": 0.031, + "step": 9850 + }, + { + "epoch": 1.4915664473186596, + "grad_norm": 0.11846140772104263, + "learning_rate": 0.00016057999999999998, + "loss": 0.0348, + "step": 9860 + }, + { + "epoch": 1.4930791921942363, + "grad_norm": 0.07698410004377365, + "learning_rate": 0.00016054, + "loss": 0.0259, + "step": 9870 + }, + { + "epoch": 1.494591937069813, + "grad_norm": 0.11177106946706772, + "learning_rate": 0.0001605, + "loss": 0.0301, + "step": 9880 + }, + { + "epoch": 1.4961046819453898, + "grad_norm": 0.09459209442138672, + "learning_rate": 0.00016046000000000002, + "loss": 0.0349, + "step": 9890 + }, + { + "epoch": 1.4976174268209665, + "grad_norm": 0.08800119906663895, + "learning_rate": 0.00016042, + "loss": 0.0335, + "step": 9900 + }, + { + "epoch": 1.4991301716965433, + "grad_norm": 0.09330447763204575, + "learning_rate": 0.00016038, + "loss": 0.0326, + "step": 9910 + }, + { + "epoch": 1.50064291657212, + "grad_norm": 0.10210063308477402, + "learning_rate": 0.00016034, + "loss": 0.035, + "step": 9920 + }, + { + "epoch": 1.5021556614476967, + "grad_norm": 0.11886809766292572, + "learning_rate": 0.0001603, + "loss": 0.036, + "step": 9930 + }, + { + "epoch": 1.5036684063232735, + "grad_norm": 0.07646410167217255, + "learning_rate": 0.00016026000000000001, + "loss": 0.0269, + "step": 9940 + }, + { + "epoch": 1.5051811511988502, + "grad_norm": 0.09994587302207947, + "learning_rate": 0.00016022, + "loss": 0.0298, + "step": 9950 + }, + { + "epoch": 1.506693896074427, + "grad_norm": 0.0781632736325264, + "learning_rate": 0.00016018, + "loss": 0.0299, + "step": 9960 + }, + { + "epoch": 1.5082066409500037, + "grad_norm": 0.09286709874868393, + "learning_rate": 0.00016014, + "loss": 0.0334, + "step": 9970 + }, + { + "epoch": 1.5097193858255804, + "grad_norm": 0.08658807724714279, + "learning_rate": 0.00016010000000000002, + "loss": 0.032, + "step": 9980 + }, + { + "epoch": 1.5112321307011571, + "grad_norm": 0.09535326808691025, + "learning_rate": 0.00016006, + "loss": 0.032, + "step": 9990 + }, + { + "epoch": 1.5127448755767339, + "grad_norm": 0.056372299790382385, + "learning_rate": 0.00016002, + "loss": 0.033, + "step": 10000 + }, + { + "epoch": 1.5127448755767339, + "eval_cer": 0.1808933296766016, + "eval_loss": 0.03580623120069504, + "eval_runtime": 10388.4948, + "eval_samples_per_second": 2.026, + "eval_steps_per_second": 0.253, + "step": 10000 + } + ], + "logging_steps": 10, + "max_steps": 50000, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.622822387689695e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}