{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 12313, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.121497604158207e-05, "grad_norm": 6.562383818863084, "learning_rate": 1.3513513513513516e-08, "loss": 1.0675, "step": 1 }, { "epoch": 0.00016242995208316414, "grad_norm": 8.277066733441483, "learning_rate": 2.702702702702703e-08, "loss": 1.3477, "step": 2 }, { "epoch": 0.0002436449281247462, "grad_norm": 5.566945299122935, "learning_rate": 4.0540540540540545e-08, "loss": 0.982, "step": 3 }, { "epoch": 0.0003248599041663283, "grad_norm": 20.123631677375503, "learning_rate": 5.405405405405406e-08, "loss": 1.1009, "step": 4 }, { "epoch": 0.00040607488020791033, "grad_norm": 7.516929368064332, "learning_rate": 6.756756756756757e-08, "loss": 1.1767, "step": 5 }, { "epoch": 0.0004872898562494924, "grad_norm": 6.534930634637095, "learning_rate": 8.108108108108109e-08, "loss": 0.9279, "step": 6 }, { "epoch": 0.0005685048322910744, "grad_norm": 8.547137659048731, "learning_rate": 9.459459459459461e-08, "loss": 1.1229, "step": 7 }, { "epoch": 0.0006497198083326566, "grad_norm": 8.40063081083683, "learning_rate": 1.0810810810810812e-07, "loss": 0.9784, "step": 8 }, { "epoch": 0.0007309347843742387, "grad_norm": 7.113188011014463, "learning_rate": 1.2162162162162163e-07, "loss": 1.007, "step": 9 }, { "epoch": 0.0008121497604158207, "grad_norm": 9.511299992179831, "learning_rate": 1.3513513513513515e-07, "loss": 1.0264, "step": 10 }, { "epoch": 0.0008933647364574028, "grad_norm": 6.234366333859408, "learning_rate": 1.4864864864864866e-07, "loss": 1.1152, "step": 11 }, { "epoch": 0.0009745797124989848, "grad_norm": 8.300258843057527, "learning_rate": 1.6216216216216218e-07, "loss": 0.9787, "step": 12 }, { "epoch": 0.0010557946885405669, "grad_norm": 6.394938331200344, "learning_rate": 1.756756756756757e-07, "loss": 1.0764, "step": 13 }, { "epoch": 0.0011370096645821489, "grad_norm": 5.924100005331387, "learning_rate": 1.8918918918918921e-07, "loss": 1.0187, "step": 14 }, { "epoch": 0.001218224640623731, "grad_norm": 7.076697509559219, "learning_rate": 2.0270270270270273e-07, "loss": 1.3016, "step": 15 }, { "epoch": 0.001299439616665313, "grad_norm": 7.949795337583316, "learning_rate": 2.1621621621621625e-07, "loss": 1.1151, "step": 16 }, { "epoch": 0.001380654592706895, "grad_norm": 7.324802587536972, "learning_rate": 2.2972972972972977e-07, "loss": 1.0339, "step": 17 }, { "epoch": 0.0014618695687484773, "grad_norm": 7.644757636442122, "learning_rate": 2.4324324324324326e-07, "loss": 1.0653, "step": 18 }, { "epoch": 0.0015430845447900593, "grad_norm": 19.41567123436543, "learning_rate": 2.567567567567568e-07, "loss": 1.2616, "step": 19 }, { "epoch": 0.0016242995208316413, "grad_norm": 7.096406348798535, "learning_rate": 2.702702702702703e-07, "loss": 1.0637, "step": 20 }, { "epoch": 0.0017055144968732233, "grad_norm": 9.633919383798274, "learning_rate": 2.837837837837838e-07, "loss": 1.185, "step": 21 }, { "epoch": 0.0017867294729148055, "grad_norm": 10.180555284457757, "learning_rate": 2.972972972972973e-07, "loss": 1.0474, "step": 22 }, { "epoch": 0.0018679444489563875, "grad_norm": 8.634935943189125, "learning_rate": 3.1081081081081084e-07, "loss": 0.9087, "step": 23 }, { "epoch": 0.0019491594249979695, "grad_norm": 7.712256121372059, "learning_rate": 3.2432432432432436e-07, "loss": 1.2098, "step": 24 }, { "epoch": 0.0020303744010395515, "grad_norm": 7.212382477186713, "learning_rate": 3.378378378378379e-07, "loss": 0.9682, "step": 25 }, { "epoch": 0.0021115893770811338, "grad_norm": 8.965382654928913, "learning_rate": 3.513513513513514e-07, "loss": 0.9742, "step": 26 }, { "epoch": 0.002192804353122716, "grad_norm": 9.333155078594814, "learning_rate": 3.648648648648649e-07, "loss": 0.9463, "step": 27 }, { "epoch": 0.0022740193291642978, "grad_norm": 5.770329415577018, "learning_rate": 3.7837837837837843e-07, "loss": 0.9216, "step": 28 }, { "epoch": 0.00235523430520588, "grad_norm": 7.19909286290371, "learning_rate": 3.9189189189189195e-07, "loss": 0.9299, "step": 29 }, { "epoch": 0.002436449281247462, "grad_norm": 8.198959079831749, "learning_rate": 4.0540540540540546e-07, "loss": 1.2135, "step": 30 }, { "epoch": 0.002517664257289044, "grad_norm": 7.469329596056761, "learning_rate": 4.18918918918919e-07, "loss": 1.162, "step": 31 }, { "epoch": 0.002598879233330626, "grad_norm": 6.064802318250161, "learning_rate": 4.324324324324325e-07, "loss": 1.0726, "step": 32 }, { "epoch": 0.0026800942093722084, "grad_norm": 7.561733885808792, "learning_rate": 4.45945945945946e-07, "loss": 0.9132, "step": 33 }, { "epoch": 0.00276130918541379, "grad_norm": 5.510848606396675, "learning_rate": 4.5945945945945953e-07, "loss": 0.8835, "step": 34 }, { "epoch": 0.0028425241614553724, "grad_norm": 5.05147854422008, "learning_rate": 4.7297297297297305e-07, "loss": 1.025, "step": 35 }, { "epoch": 0.0029237391374969546, "grad_norm": 7.449384043723888, "learning_rate": 4.864864864864865e-07, "loss": 1.0634, "step": 36 }, { "epoch": 0.0030049541135385364, "grad_norm": 5.554970539302449, "learning_rate": 5.000000000000001e-07, "loss": 0.9037, "step": 37 }, { "epoch": 0.0030861690895801186, "grad_norm": 5.648676923146146, "learning_rate": 5.135135135135135e-07, "loss": 1.0197, "step": 38 }, { "epoch": 0.0031673840656217004, "grad_norm": 5.581517917957071, "learning_rate": 5.270270270270271e-07, "loss": 0.9723, "step": 39 }, { "epoch": 0.0032485990416632827, "grad_norm": 6.811513573858421, "learning_rate": 5.405405405405406e-07, "loss": 1.0647, "step": 40 }, { "epoch": 0.003329814017704865, "grad_norm": 13.02023304402827, "learning_rate": 5.540540540540542e-07, "loss": 0.9535, "step": 41 }, { "epoch": 0.0034110289937464467, "grad_norm": 5.903154525353516, "learning_rate": 5.675675675675676e-07, "loss": 0.9207, "step": 42 }, { "epoch": 0.003492243969788029, "grad_norm": 5.713506629308762, "learning_rate": 5.810810810810812e-07, "loss": 0.901, "step": 43 }, { "epoch": 0.003573458945829611, "grad_norm": 7.568618399596037, "learning_rate": 5.945945945945947e-07, "loss": 0.8387, "step": 44 }, { "epoch": 0.003654673921871193, "grad_norm": 4.684278865041181, "learning_rate": 6.081081081081082e-07, "loss": 0.9369, "step": 45 }, { "epoch": 0.003735888897912775, "grad_norm": 4.719765340896398, "learning_rate": 6.216216216216217e-07, "loss": 0.9138, "step": 46 }, { "epoch": 0.0038171038739543573, "grad_norm": 6.215441613276706, "learning_rate": 6.351351351351353e-07, "loss": 0.8665, "step": 47 }, { "epoch": 0.003898318849995939, "grad_norm": 5.80882937072084, "learning_rate": 6.486486486486487e-07, "loss": 0.8277, "step": 48 }, { "epoch": 0.003979533826037522, "grad_norm": 4.4336706590480786, "learning_rate": 6.621621621621623e-07, "loss": 0.9194, "step": 49 }, { "epoch": 0.004060748802079103, "grad_norm": 6.287882016956921, "learning_rate": 6.756756756756758e-07, "loss": 0.997, "step": 50 }, { "epoch": 0.004141963778120685, "grad_norm": 5.639526953982635, "learning_rate": 6.891891891891893e-07, "loss": 0.9637, "step": 51 }, { "epoch": 0.0042231787541622675, "grad_norm": 5.51310604984265, "learning_rate": 7.027027027027028e-07, "loss": 0.8232, "step": 52 }, { "epoch": 0.00430439373020385, "grad_norm": 4.517639703226352, "learning_rate": 7.162162162162164e-07, "loss": 0.8983, "step": 53 }, { "epoch": 0.004385608706245432, "grad_norm": 6.666343453939387, "learning_rate": 7.297297297297298e-07, "loss": 0.7985, "step": 54 }, { "epoch": 0.004466823682287013, "grad_norm": 5.422191276786231, "learning_rate": 7.432432432432434e-07, "loss": 0.9058, "step": 55 }, { "epoch": 0.0045480386583285955, "grad_norm": 4.624429643253304, "learning_rate": 7.567567567567569e-07, "loss": 0.7943, "step": 56 }, { "epoch": 0.004629253634370178, "grad_norm": 4.391972463636353, "learning_rate": 7.702702702702704e-07, "loss": 0.8046, "step": 57 }, { "epoch": 0.00471046861041176, "grad_norm": 7.0317366536589745, "learning_rate": 7.837837837837839e-07, "loss": 0.96, "step": 58 }, { "epoch": 0.004791683586453342, "grad_norm": 6.253750275868285, "learning_rate": 7.972972972972974e-07, "loss": 0.8267, "step": 59 }, { "epoch": 0.004872898562494924, "grad_norm": 4.2004573670066545, "learning_rate": 8.108108108108109e-07, "loss": 0.7541, "step": 60 }, { "epoch": 0.004954113538536506, "grad_norm": 4.843922828461304, "learning_rate": 8.243243243243244e-07, "loss": 0.8652, "step": 61 }, { "epoch": 0.005035328514578088, "grad_norm": 6.663532165513732, "learning_rate": 8.37837837837838e-07, "loss": 0.9765, "step": 62 }, { "epoch": 0.00511654349061967, "grad_norm": 5.518475179846077, "learning_rate": 8.513513513513514e-07, "loss": 0.8343, "step": 63 }, { "epoch": 0.005197758466661252, "grad_norm": 4.16743993524973, "learning_rate": 8.64864864864865e-07, "loss": 0.8835, "step": 64 }, { "epoch": 0.005278973442702835, "grad_norm": 4.724546088137992, "learning_rate": 8.783783783783785e-07, "loss": 0.8031, "step": 65 }, { "epoch": 0.005360188418744417, "grad_norm": 5.965079554752788, "learning_rate": 8.91891891891892e-07, "loss": 0.8464, "step": 66 }, { "epoch": 0.005441403394785998, "grad_norm": 7.388225521826432, "learning_rate": 9.054054054054055e-07, "loss": 0.9254, "step": 67 }, { "epoch": 0.00552261837082758, "grad_norm": 4.486467374218727, "learning_rate": 9.189189189189191e-07, "loss": 0.7852, "step": 68 }, { "epoch": 0.005603833346869163, "grad_norm": 4.818486349074325, "learning_rate": 9.324324324324325e-07, "loss": 0.6409, "step": 69 }, { "epoch": 0.005685048322910745, "grad_norm": 12.591926984529453, "learning_rate": 9.459459459459461e-07, "loss": 0.6555, "step": 70 }, { "epoch": 0.005766263298952327, "grad_norm": 3.582506346281597, "learning_rate": 9.594594594594596e-07, "loss": 0.7594, "step": 71 }, { "epoch": 0.005847478274993909, "grad_norm": 6.316272242206502, "learning_rate": 9.72972972972973e-07, "loss": 0.8258, "step": 72 }, { "epoch": 0.005928693251035491, "grad_norm": 7.768203703731759, "learning_rate": 9.864864864864867e-07, "loss": 0.7343, "step": 73 }, { "epoch": 0.006009908227077073, "grad_norm": 4.411865830101082, "learning_rate": 1.0000000000000002e-06, "loss": 0.8984, "step": 74 }, { "epoch": 0.006091123203118655, "grad_norm": 6.221797107498061, "learning_rate": 1.0135135135135136e-06, "loss": 0.8646, "step": 75 }, { "epoch": 0.006172338179160237, "grad_norm": 4.668292899364404, "learning_rate": 1.027027027027027e-06, "loss": 0.7082, "step": 76 }, { "epoch": 0.0062535531552018195, "grad_norm": 5.307505469950394, "learning_rate": 1.0405405405405408e-06, "loss": 0.7584, "step": 77 }, { "epoch": 0.006334768131243401, "grad_norm": 5.411978113961716, "learning_rate": 1.0540540540540542e-06, "loss": 0.8715, "step": 78 }, { "epoch": 0.006415983107284983, "grad_norm": 11.725200388824204, "learning_rate": 1.0675675675675677e-06, "loss": 0.8187, "step": 79 }, { "epoch": 0.006497198083326565, "grad_norm": 7.066619804350216, "learning_rate": 1.0810810810810812e-06, "loss": 0.7657, "step": 80 }, { "epoch": 0.0065784130593681475, "grad_norm": 5.206447831213838, "learning_rate": 1.0945945945945948e-06, "loss": 1.0916, "step": 81 }, { "epoch": 0.00665962803540973, "grad_norm": 6.270913752130497, "learning_rate": 1.1081081081081083e-06, "loss": 0.695, "step": 82 }, { "epoch": 0.006740843011451312, "grad_norm": 5.602218933207931, "learning_rate": 1.1216216216216218e-06, "loss": 0.7361, "step": 83 }, { "epoch": 0.006822057987492893, "grad_norm": 5.592916777746583, "learning_rate": 1.1351351351351352e-06, "loss": 0.6823, "step": 84 }, { "epoch": 0.0069032729635344755, "grad_norm": 6.7335919368870565, "learning_rate": 1.148648648648649e-06, "loss": 0.7927, "step": 85 }, { "epoch": 0.006984487939576058, "grad_norm": 7.941076590684681, "learning_rate": 1.1621621621621624e-06, "loss": 0.8417, "step": 86 }, { "epoch": 0.00706570291561764, "grad_norm": 5.969244160464252, "learning_rate": 1.1756756756756758e-06, "loss": 0.689, "step": 87 }, { "epoch": 0.007146917891659222, "grad_norm": 5.118618058340234, "learning_rate": 1.1891891891891893e-06, "loss": 0.9608, "step": 88 }, { "epoch": 0.007228132867700804, "grad_norm": 8.55651778885645, "learning_rate": 1.2027027027027028e-06, "loss": 0.6376, "step": 89 }, { "epoch": 0.007309347843742386, "grad_norm": 8.9312254966113, "learning_rate": 1.2162162162162164e-06, "loss": 0.8781, "step": 90 }, { "epoch": 0.007390562819783968, "grad_norm": 5.435421842215974, "learning_rate": 1.22972972972973e-06, "loss": 0.7296, "step": 91 }, { "epoch": 0.00747177779582555, "grad_norm": 7.00575424705863, "learning_rate": 1.2432432432432434e-06, "loss": 0.7004, "step": 92 }, { "epoch": 0.007552992771867132, "grad_norm": 5.126051571596671, "learning_rate": 1.2567567567567568e-06, "loss": 0.6933, "step": 93 }, { "epoch": 0.007634207747908715, "grad_norm": 7.546722570854549, "learning_rate": 1.2702702702702705e-06, "loss": 0.7443, "step": 94 }, { "epoch": 0.007715422723950297, "grad_norm": 5.82298529351105, "learning_rate": 1.2837837837837838e-06, "loss": 0.6609, "step": 95 }, { "epoch": 0.007796637699991878, "grad_norm": 5.395134658996866, "learning_rate": 1.2972972972972974e-06, "loss": 0.6966, "step": 96 }, { "epoch": 0.007877852676033461, "grad_norm": 5.715845533979447, "learning_rate": 1.310810810810811e-06, "loss": 0.8075, "step": 97 }, { "epoch": 0.007959067652075043, "grad_norm": 4.346663727077859, "learning_rate": 1.3243243243243246e-06, "loss": 0.8382, "step": 98 }, { "epoch": 0.008040282628116624, "grad_norm": 4.510360891483355, "learning_rate": 1.3378378378378378e-06, "loss": 0.8409, "step": 99 }, { "epoch": 0.008121497604158206, "grad_norm": 6.252214797052545, "learning_rate": 1.3513513513513515e-06, "loss": 0.8968, "step": 100 }, { "epoch": 0.008202712580199788, "grad_norm": 4.133668140760274, "learning_rate": 1.364864864864865e-06, "loss": 0.7697, "step": 101 }, { "epoch": 0.00828392755624137, "grad_norm": 5.13995005645122, "learning_rate": 1.3783783783783786e-06, "loss": 0.8302, "step": 102 }, { "epoch": 0.008365142532282953, "grad_norm": 5.645133972443234, "learning_rate": 1.391891891891892e-06, "loss": 0.8553, "step": 103 }, { "epoch": 0.008446357508324535, "grad_norm": 5.768137707230809, "learning_rate": 1.4054054054054056e-06, "loss": 0.7269, "step": 104 }, { "epoch": 0.008527572484366117, "grad_norm": 3.9384737579894358, "learning_rate": 1.418918918918919e-06, "loss": 0.8465, "step": 105 }, { "epoch": 0.0086087874604077, "grad_norm": 6.608951862752225, "learning_rate": 1.4324324324324327e-06, "loss": 0.7358, "step": 106 }, { "epoch": 0.008690002436449282, "grad_norm": 4.8856541626722425, "learning_rate": 1.445945945945946e-06, "loss": 0.9122, "step": 107 }, { "epoch": 0.008771217412490864, "grad_norm": 3.744812822198274, "learning_rate": 1.4594594594594596e-06, "loss": 0.7847, "step": 108 }, { "epoch": 0.008852432388532446, "grad_norm": 4.555012637892476, "learning_rate": 1.4729729729729731e-06, "loss": 0.6125, "step": 109 }, { "epoch": 0.008933647364574027, "grad_norm": 5.417265029398178, "learning_rate": 1.4864864864864868e-06, "loss": 0.6426, "step": 110 }, { "epoch": 0.009014862340615609, "grad_norm": 6.978404404546592, "learning_rate": 1.5e-06, "loss": 0.8351, "step": 111 }, { "epoch": 0.009096077316657191, "grad_norm": 7.486793674946231, "learning_rate": 1.5135135135135137e-06, "loss": 0.9104, "step": 112 }, { "epoch": 0.009177292292698773, "grad_norm": 6.427379250554024, "learning_rate": 1.5270270270270272e-06, "loss": 0.7182, "step": 113 }, { "epoch": 0.009258507268740356, "grad_norm": 7.581127803220287, "learning_rate": 1.5405405405405409e-06, "loss": 0.7966, "step": 114 }, { "epoch": 0.009339722244781938, "grad_norm": 4.51084883429586, "learning_rate": 1.5540540540540541e-06, "loss": 0.7349, "step": 115 }, { "epoch": 0.00942093722082352, "grad_norm": 5.521276273858657, "learning_rate": 1.5675675675675678e-06, "loss": 0.6476, "step": 116 }, { "epoch": 0.009502152196865102, "grad_norm": 10.232542435537884, "learning_rate": 1.5810810810810812e-06, "loss": 0.7003, "step": 117 }, { "epoch": 0.009583367172906684, "grad_norm": 5.670487468601197, "learning_rate": 1.5945945945945947e-06, "loss": 0.921, "step": 118 }, { "epoch": 0.009664582148948267, "grad_norm": 5.136554782606984, "learning_rate": 1.6081081081081082e-06, "loss": 0.6932, "step": 119 }, { "epoch": 0.009745797124989849, "grad_norm": 3.81643952542769, "learning_rate": 1.6216216216216219e-06, "loss": 0.7676, "step": 120 }, { "epoch": 0.009827012101031431, "grad_norm": 7.129104413729071, "learning_rate": 1.6351351351351353e-06, "loss": 0.7903, "step": 121 }, { "epoch": 0.009908227077073012, "grad_norm": 5.89283670831236, "learning_rate": 1.6486486486486488e-06, "loss": 0.6741, "step": 122 }, { "epoch": 0.009989442053114594, "grad_norm": 4.728889968775353, "learning_rate": 1.6621621621621622e-06, "loss": 0.7493, "step": 123 }, { "epoch": 0.010070657029156176, "grad_norm": 5.400043416105325, "learning_rate": 1.675675675675676e-06, "loss": 0.6254, "step": 124 }, { "epoch": 0.010151872005197758, "grad_norm": 5.7442992971209605, "learning_rate": 1.6891891891891894e-06, "loss": 0.6588, "step": 125 }, { "epoch": 0.01023308698123934, "grad_norm": 5.225132595645112, "learning_rate": 1.7027027027027028e-06, "loss": 0.721, "step": 126 }, { "epoch": 0.010314301957280923, "grad_norm": 4.917827752874381, "learning_rate": 1.7162162162162163e-06, "loss": 0.6719, "step": 127 }, { "epoch": 0.010395516933322505, "grad_norm": 5.96105392795946, "learning_rate": 1.72972972972973e-06, "loss": 0.8192, "step": 128 }, { "epoch": 0.010476731909364087, "grad_norm": 5.348491477184876, "learning_rate": 1.7432432432432432e-06, "loss": 0.741, "step": 129 }, { "epoch": 0.01055794688540567, "grad_norm": 4.046650669596968, "learning_rate": 1.756756756756757e-06, "loss": 0.6943, "step": 130 }, { "epoch": 0.010639161861447251, "grad_norm": 5.045700573097575, "learning_rate": 1.7702702702702704e-06, "loss": 0.5737, "step": 131 }, { "epoch": 0.010720376837488834, "grad_norm": 4.332973323075942, "learning_rate": 1.783783783783784e-06, "loss": 0.7721, "step": 132 }, { "epoch": 0.010801591813530414, "grad_norm": 12.053968494279287, "learning_rate": 1.7972972972972973e-06, "loss": 0.7081, "step": 133 }, { "epoch": 0.010882806789571996, "grad_norm": 4.314977931658641, "learning_rate": 1.810810810810811e-06, "loss": 0.7476, "step": 134 }, { "epoch": 0.010964021765613579, "grad_norm": 4.372992825514325, "learning_rate": 1.8243243243243245e-06, "loss": 0.5623, "step": 135 }, { "epoch": 0.01104523674165516, "grad_norm": 5.06501074840964, "learning_rate": 1.8378378378378381e-06, "loss": 0.8409, "step": 136 }, { "epoch": 0.011126451717696743, "grad_norm": 5.486341378807148, "learning_rate": 1.8513513513513514e-06, "loss": 0.7097, "step": 137 }, { "epoch": 0.011207666693738325, "grad_norm": 5.885624763272731, "learning_rate": 1.864864864864865e-06, "loss": 0.8605, "step": 138 }, { "epoch": 0.011288881669779907, "grad_norm": 5.810017768183461, "learning_rate": 1.8783783783783785e-06, "loss": 0.7589, "step": 139 }, { "epoch": 0.01137009664582149, "grad_norm": 5.241258116276019, "learning_rate": 1.8918918918918922e-06, "loss": 0.6821, "step": 140 }, { "epoch": 0.011451311621863072, "grad_norm": 5.141543904486299, "learning_rate": 1.9054054054054054e-06, "loss": 0.6702, "step": 141 }, { "epoch": 0.011532526597904654, "grad_norm": 6.614627634564522, "learning_rate": 1.918918918918919e-06, "loss": 0.7541, "step": 142 }, { "epoch": 0.011613741573946236, "grad_norm": 4.270716146835968, "learning_rate": 1.9324324324324326e-06, "loss": 0.6118, "step": 143 }, { "epoch": 0.011694956549987819, "grad_norm": 4.833439173611212, "learning_rate": 1.945945945945946e-06, "loss": 0.6631, "step": 144 }, { "epoch": 0.011776171526029399, "grad_norm": 6.670666111898159, "learning_rate": 1.9594594594594595e-06, "loss": 0.5792, "step": 145 }, { "epoch": 0.011857386502070981, "grad_norm": 5.320699935552583, "learning_rate": 1.9729729729729734e-06, "loss": 0.6649, "step": 146 }, { "epoch": 0.011938601478112564, "grad_norm": 10.048210540269071, "learning_rate": 1.9864864864864864e-06, "loss": 0.6055, "step": 147 }, { "epoch": 0.012019816454154146, "grad_norm": 4.8358064531705525, "learning_rate": 2.0000000000000003e-06, "loss": 0.7224, "step": 148 }, { "epoch": 0.012101031430195728, "grad_norm": 8.585169822222198, "learning_rate": 2.013513513513514e-06, "loss": 0.7546, "step": 149 }, { "epoch": 0.01218224640623731, "grad_norm": 4.847625342598578, "learning_rate": 2.0270270270270273e-06, "loss": 0.6867, "step": 150 }, { "epoch": 0.012263461382278892, "grad_norm": 3.7722782135223594, "learning_rate": 2.0405405405405407e-06, "loss": 0.7262, "step": 151 }, { "epoch": 0.012344676358320475, "grad_norm": 5.149292525910975, "learning_rate": 2.054054054054054e-06, "loss": 0.8792, "step": 152 }, { "epoch": 0.012425891334362057, "grad_norm": 6.26503383092344, "learning_rate": 2.0675675675675677e-06, "loss": 0.7837, "step": 153 }, { "epoch": 0.012507106310403639, "grad_norm": 4.188912763837703, "learning_rate": 2.0810810810810815e-06, "loss": 0.5692, "step": 154 }, { "epoch": 0.012588321286445221, "grad_norm": 5.04456436284514, "learning_rate": 2.0945945945945946e-06, "loss": 0.652, "step": 155 }, { "epoch": 0.012669536262486802, "grad_norm": 7.405968257695848, "learning_rate": 2.1081081081081085e-06, "loss": 0.6266, "step": 156 }, { "epoch": 0.012750751238528384, "grad_norm": 5.310169429627866, "learning_rate": 2.121621621621622e-06, "loss": 0.6542, "step": 157 }, { "epoch": 0.012831966214569966, "grad_norm": 5.046936455927635, "learning_rate": 2.1351351351351354e-06, "loss": 0.8108, "step": 158 }, { "epoch": 0.012913181190611548, "grad_norm": 4.036458244440137, "learning_rate": 2.148648648648649e-06, "loss": 0.6339, "step": 159 }, { "epoch": 0.01299439616665313, "grad_norm": 5.254253720134002, "learning_rate": 2.1621621621621623e-06, "loss": 0.7129, "step": 160 }, { "epoch": 0.013075611142694713, "grad_norm": 4.842001153737952, "learning_rate": 2.175675675675676e-06, "loss": 0.7669, "step": 161 }, { "epoch": 0.013156826118736295, "grad_norm": 3.45883013736874, "learning_rate": 2.1891891891891897e-06, "loss": 0.7403, "step": 162 }, { "epoch": 0.013238041094777877, "grad_norm": 6.615557242600742, "learning_rate": 2.2027027027027027e-06, "loss": 0.6072, "step": 163 }, { "epoch": 0.01331925607081946, "grad_norm": 5.295390317262873, "learning_rate": 2.2162162162162166e-06, "loss": 0.4518, "step": 164 }, { "epoch": 0.013400471046861042, "grad_norm": 4.553708607425031, "learning_rate": 2.22972972972973e-06, "loss": 0.7405, "step": 165 }, { "epoch": 0.013481686022902624, "grad_norm": 5.730983978090373, "learning_rate": 2.2432432432432435e-06, "loss": 0.7216, "step": 166 }, { "epoch": 0.013562900998944206, "grad_norm": 6.28068035640207, "learning_rate": 2.256756756756757e-06, "loss": 0.543, "step": 167 }, { "epoch": 0.013644115974985787, "grad_norm": 5.534240505988875, "learning_rate": 2.2702702702702705e-06, "loss": 0.7273, "step": 168 }, { "epoch": 0.013725330951027369, "grad_norm": 10.022692075346896, "learning_rate": 2.283783783783784e-06, "loss": 0.6259, "step": 169 }, { "epoch": 0.013806545927068951, "grad_norm": 4.133335838910277, "learning_rate": 2.297297297297298e-06, "loss": 0.5538, "step": 170 }, { "epoch": 0.013887760903110533, "grad_norm": 6.849412426150848, "learning_rate": 2.310810810810811e-06, "loss": 0.5594, "step": 171 }, { "epoch": 0.013968975879152115, "grad_norm": 7.2240081386637955, "learning_rate": 2.3243243243243247e-06, "loss": 0.6406, "step": 172 }, { "epoch": 0.014050190855193698, "grad_norm": 4.948498140450114, "learning_rate": 2.337837837837838e-06, "loss": 0.697, "step": 173 }, { "epoch": 0.01413140583123528, "grad_norm": 3.8421671175611465, "learning_rate": 2.3513513513513517e-06, "loss": 0.709, "step": 174 }, { "epoch": 0.014212620807276862, "grad_norm": 4.9263286743150925, "learning_rate": 2.364864864864865e-06, "loss": 0.5944, "step": 175 }, { "epoch": 0.014293835783318444, "grad_norm": 4.181352300851268, "learning_rate": 2.3783783783783786e-06, "loss": 0.7951, "step": 176 }, { "epoch": 0.014375050759360027, "grad_norm": 5.217958869841234, "learning_rate": 2.391891891891892e-06, "loss": 0.8847, "step": 177 }, { "epoch": 0.014456265735401609, "grad_norm": 5.53287305794427, "learning_rate": 2.4054054054054055e-06, "loss": 0.6506, "step": 178 }, { "epoch": 0.01453748071144319, "grad_norm": 4.977958862414195, "learning_rate": 2.418918918918919e-06, "loss": 0.649, "step": 179 }, { "epoch": 0.014618695687484771, "grad_norm": 7.079003805106819, "learning_rate": 2.432432432432433e-06, "loss": 0.5173, "step": 180 }, { "epoch": 0.014699910663526354, "grad_norm": 6.4006738662761675, "learning_rate": 2.4459459459459463e-06, "loss": 0.7425, "step": 181 }, { "epoch": 0.014781125639567936, "grad_norm": 7.223867008336038, "learning_rate": 2.45945945945946e-06, "loss": 0.7116, "step": 182 }, { "epoch": 0.014862340615609518, "grad_norm": 4.74150217223079, "learning_rate": 2.4729729729729733e-06, "loss": 0.8202, "step": 183 }, { "epoch": 0.0149435555916511, "grad_norm": 5.838162332180424, "learning_rate": 2.4864864864864867e-06, "loss": 0.7202, "step": 184 }, { "epoch": 0.015024770567692683, "grad_norm": 7.3164412014837845, "learning_rate": 2.5e-06, "loss": 0.6318, "step": 185 }, { "epoch": 0.015105985543734265, "grad_norm": 4.0466470539819674, "learning_rate": 2.5135135135135137e-06, "loss": 0.6161, "step": 186 }, { "epoch": 0.015187200519775847, "grad_norm": 4.20696930651959, "learning_rate": 2.527027027027027e-06, "loss": 0.8844, "step": 187 }, { "epoch": 0.01526841549581743, "grad_norm": 4.619062654793844, "learning_rate": 2.540540540540541e-06, "loss": 0.7893, "step": 188 }, { "epoch": 0.015349630471859011, "grad_norm": 4.028095158003242, "learning_rate": 2.554054054054054e-06, "loss": 0.9053, "step": 189 }, { "epoch": 0.015430845447900594, "grad_norm": 5.474147278368468, "learning_rate": 2.5675675675675675e-06, "loss": 0.6785, "step": 190 }, { "epoch": 0.015512060423942174, "grad_norm": 7.068401125839219, "learning_rate": 2.581081081081081e-06, "loss": 0.6353, "step": 191 }, { "epoch": 0.015593275399983756, "grad_norm": 6.211801926350057, "learning_rate": 2.594594594594595e-06, "loss": 0.6171, "step": 192 }, { "epoch": 0.01567449037602534, "grad_norm": 6.8766856866833805, "learning_rate": 2.6081081081081083e-06, "loss": 0.6291, "step": 193 }, { "epoch": 0.015755705352066923, "grad_norm": 4.450746282218491, "learning_rate": 2.621621621621622e-06, "loss": 0.7322, "step": 194 }, { "epoch": 0.015836920328108503, "grad_norm": 6.737626992175194, "learning_rate": 2.6351351351351353e-06, "loss": 0.8431, "step": 195 }, { "epoch": 0.015918135304150087, "grad_norm": 5.202217528227061, "learning_rate": 2.648648648648649e-06, "loss": 0.6027, "step": 196 }, { "epoch": 0.015999350280191667, "grad_norm": 6.136140183171829, "learning_rate": 2.662162162162162e-06, "loss": 0.7227, "step": 197 }, { "epoch": 0.016080565256233248, "grad_norm": 6.20592135137412, "learning_rate": 2.6756756756756757e-06, "loss": 0.8206, "step": 198 }, { "epoch": 0.016161780232274832, "grad_norm": 5.542234344965935, "learning_rate": 2.689189189189189e-06, "loss": 0.5755, "step": 199 }, { "epoch": 0.016242995208316412, "grad_norm": 6.481129594142135, "learning_rate": 2.702702702702703e-06, "loss": 0.9124, "step": 200 }, { "epoch": 0.016324210184357996, "grad_norm": 8.819932985194717, "learning_rate": 2.7162162162162165e-06, "loss": 0.7077, "step": 201 }, { "epoch": 0.016405425160399577, "grad_norm": 3.7133046294352496, "learning_rate": 2.72972972972973e-06, "loss": 0.7187, "step": 202 }, { "epoch": 0.01648664013644116, "grad_norm": 4.768904441511917, "learning_rate": 2.7432432432432434e-06, "loss": 0.7079, "step": 203 }, { "epoch": 0.01656785511248274, "grad_norm": 5.4109232849671445, "learning_rate": 2.7567567567567573e-06, "loss": 0.8741, "step": 204 }, { "epoch": 0.016649070088524325, "grad_norm": 11.784275280928684, "learning_rate": 2.7702702702702703e-06, "loss": 0.6088, "step": 205 }, { "epoch": 0.016730285064565906, "grad_norm": 6.596444076197084, "learning_rate": 2.783783783783784e-06, "loss": 0.7085, "step": 206 }, { "epoch": 0.01681150004060749, "grad_norm": 4.7347497389094055, "learning_rate": 2.7972972972972973e-06, "loss": 0.7641, "step": 207 }, { "epoch": 0.01689271501664907, "grad_norm": 4.895873979702855, "learning_rate": 2.810810810810811e-06, "loss": 0.613, "step": 208 }, { "epoch": 0.01697392999269065, "grad_norm": 4.112688584540271, "learning_rate": 2.8243243243243246e-06, "loss": 0.6551, "step": 209 }, { "epoch": 0.017055144968732235, "grad_norm": 4.280798819098727, "learning_rate": 2.837837837837838e-06, "loss": 0.8091, "step": 210 }, { "epoch": 0.017136359944773815, "grad_norm": 4.953151146240816, "learning_rate": 2.851351351351351e-06, "loss": 0.6225, "step": 211 }, { "epoch": 0.0172175749208154, "grad_norm": 4.858934778537565, "learning_rate": 2.8648648648648654e-06, "loss": 0.7894, "step": 212 }, { "epoch": 0.01729878989685698, "grad_norm": 3.776464089377743, "learning_rate": 2.8783783783783785e-06, "loss": 0.6714, "step": 213 }, { "epoch": 0.017380004872898563, "grad_norm": 5.927211431887128, "learning_rate": 2.891891891891892e-06, "loss": 0.6115, "step": 214 }, { "epoch": 0.017461219848940144, "grad_norm": 4.290882863865101, "learning_rate": 2.9054054054054054e-06, "loss": 0.8019, "step": 215 }, { "epoch": 0.017542434824981728, "grad_norm": 5.014996919723652, "learning_rate": 2.9189189189189193e-06, "loss": 0.9165, "step": 216 }, { "epoch": 0.01762364980102331, "grad_norm": 4.678677528211619, "learning_rate": 2.9324324324324328e-06, "loss": 0.7034, "step": 217 }, { "epoch": 0.017704864777064892, "grad_norm": 5.856721537107752, "learning_rate": 2.9459459459459462e-06, "loss": 0.6187, "step": 218 }, { "epoch": 0.017786079753106473, "grad_norm": 5.675292275975791, "learning_rate": 2.9594594594594593e-06, "loss": 0.6282, "step": 219 }, { "epoch": 0.017867294729148053, "grad_norm": 4.026148472844569, "learning_rate": 2.9729729729729736e-06, "loss": 0.6984, "step": 220 }, { "epoch": 0.017948509705189637, "grad_norm": 3.7553668002568115, "learning_rate": 2.9864864864864866e-06, "loss": 0.7059, "step": 221 }, { "epoch": 0.018029724681231218, "grad_norm": 5.010656407185249, "learning_rate": 3e-06, "loss": 0.6676, "step": 222 }, { "epoch": 0.0181109396572728, "grad_norm": 5.174403860939526, "learning_rate": 3.0135135135135135e-06, "loss": 0.6528, "step": 223 }, { "epoch": 0.018192154633314382, "grad_norm": 9.300176785765524, "learning_rate": 3.0270270270270274e-06, "loss": 0.6981, "step": 224 }, { "epoch": 0.018273369609355966, "grad_norm": 6.1866583084404185, "learning_rate": 3.040540540540541e-06, "loss": 0.704, "step": 225 }, { "epoch": 0.018354584585397547, "grad_norm": 7.4629748217872605, "learning_rate": 3.0540540540540544e-06, "loss": 0.593, "step": 226 }, { "epoch": 0.01843579956143913, "grad_norm": 4.530659236075315, "learning_rate": 3.0675675675675674e-06, "loss": 0.5932, "step": 227 }, { "epoch": 0.01851701453748071, "grad_norm": 5.748440319813538, "learning_rate": 3.0810810810810817e-06, "loss": 0.7238, "step": 228 }, { "epoch": 0.018598229513522295, "grad_norm": 40.72028295804037, "learning_rate": 3.0945945945945947e-06, "loss": 0.7017, "step": 229 }, { "epoch": 0.018679444489563875, "grad_norm": 8.886281188290711, "learning_rate": 3.1081081081081082e-06, "loss": 0.5761, "step": 230 }, { "epoch": 0.018760659465605456, "grad_norm": 4.343869317099014, "learning_rate": 3.1216216216216217e-06, "loss": 0.8567, "step": 231 }, { "epoch": 0.01884187444164704, "grad_norm": 4.744970167283222, "learning_rate": 3.1351351351351356e-06, "loss": 0.7763, "step": 232 }, { "epoch": 0.01892308941768862, "grad_norm": 7.495714776851687, "learning_rate": 3.148648648648649e-06, "loss": 0.8186, "step": 233 }, { "epoch": 0.019004304393730204, "grad_norm": 4.102840820567404, "learning_rate": 3.1621621621621625e-06, "loss": 0.7489, "step": 234 }, { "epoch": 0.019085519369771785, "grad_norm": 3.8421345026888027, "learning_rate": 3.1756756756756755e-06, "loss": 0.6933, "step": 235 }, { "epoch": 0.01916673434581337, "grad_norm": 7.387916364375026, "learning_rate": 3.1891891891891894e-06, "loss": 0.6052, "step": 236 }, { "epoch": 0.01924794932185495, "grad_norm": 3.849533522847183, "learning_rate": 3.202702702702703e-06, "loss": 0.7804, "step": 237 }, { "epoch": 0.019329164297896533, "grad_norm": 5.396298934498324, "learning_rate": 3.2162162162162164e-06, "loss": 0.6641, "step": 238 }, { "epoch": 0.019410379273938114, "grad_norm": 4.940398668509269, "learning_rate": 3.22972972972973e-06, "loss": 0.5532, "step": 239 }, { "epoch": 0.019491594249979698, "grad_norm": 6.577994445331441, "learning_rate": 3.2432432432432437e-06, "loss": 0.7118, "step": 240 }, { "epoch": 0.019572809226021278, "grad_norm": 5.519259337961804, "learning_rate": 3.256756756756757e-06, "loss": 0.7421, "step": 241 }, { "epoch": 0.019654024202062862, "grad_norm": 6.0576611783888215, "learning_rate": 3.2702702702702706e-06, "loss": 0.6083, "step": 242 }, { "epoch": 0.019735239178104443, "grad_norm": 8.457628617936836, "learning_rate": 3.2837837837837837e-06, "loss": 0.678, "step": 243 }, { "epoch": 0.019816454154146023, "grad_norm": 7.364746035461257, "learning_rate": 3.2972972972972976e-06, "loss": 0.5896, "step": 244 }, { "epoch": 0.019897669130187607, "grad_norm": 7.750984536568317, "learning_rate": 3.310810810810811e-06, "loss": 0.6219, "step": 245 }, { "epoch": 0.019978884106229187, "grad_norm": 4.981685980750705, "learning_rate": 3.3243243243243245e-06, "loss": 0.6942, "step": 246 }, { "epoch": 0.02006009908227077, "grad_norm": 6.278603739216485, "learning_rate": 3.337837837837838e-06, "loss": 0.7533, "step": 247 }, { "epoch": 0.020141314058312352, "grad_norm": 4.801507815370232, "learning_rate": 3.351351351351352e-06, "loss": 0.6608, "step": 248 }, { "epoch": 0.020222529034353936, "grad_norm": 7.021222735394233, "learning_rate": 3.3648648648648653e-06, "loss": 0.8164, "step": 249 }, { "epoch": 0.020303744010395516, "grad_norm": 5.832060700035103, "learning_rate": 3.3783783783783788e-06, "loss": 0.6886, "step": 250 }, { "epoch": 0.0203849589864371, "grad_norm": 4.387381573209668, "learning_rate": 3.391891891891892e-06, "loss": 0.6018, "step": 251 }, { "epoch": 0.02046617396247868, "grad_norm": 4.588515154439844, "learning_rate": 3.4054054054054057e-06, "loss": 0.8009, "step": 252 }, { "epoch": 0.020547388938520265, "grad_norm": 5.9906722613866865, "learning_rate": 3.418918918918919e-06, "loss": 0.6694, "step": 253 }, { "epoch": 0.020628603914561845, "grad_norm": 4.955892303287632, "learning_rate": 3.4324324324324326e-06, "loss": 0.8309, "step": 254 }, { "epoch": 0.020709818890603426, "grad_norm": 4.824146659749785, "learning_rate": 3.445945945945946e-06, "loss": 0.8692, "step": 255 }, { "epoch": 0.02079103386664501, "grad_norm": 5.5270982189809015, "learning_rate": 3.45945945945946e-06, "loss": 0.5992, "step": 256 }, { "epoch": 0.02087224884268659, "grad_norm": 6.204903985337891, "learning_rate": 3.4729729729729734e-06, "loss": 0.777, "step": 257 }, { "epoch": 0.020953463818728174, "grad_norm": 3.695647833466083, "learning_rate": 3.4864864864864865e-06, "loss": 0.6435, "step": 258 }, { "epoch": 0.021034678794769755, "grad_norm": 5.291080353938334, "learning_rate": 3.5e-06, "loss": 0.6796, "step": 259 }, { "epoch": 0.02111589377081134, "grad_norm": 4.5185266685147285, "learning_rate": 3.513513513513514e-06, "loss": 0.6448, "step": 260 }, { "epoch": 0.02119710874685292, "grad_norm": 4.851027601896295, "learning_rate": 3.5270270270270273e-06, "loss": 0.682, "step": 261 }, { "epoch": 0.021278323722894503, "grad_norm": 5.6100378122852925, "learning_rate": 3.5405405405405408e-06, "loss": 0.7084, "step": 262 }, { "epoch": 0.021359538698936083, "grad_norm": 5.135851719796815, "learning_rate": 3.5540540540540542e-06, "loss": 0.8671, "step": 263 }, { "epoch": 0.021440753674977667, "grad_norm": 5.735868784715722, "learning_rate": 3.567567567567568e-06, "loss": 0.5465, "step": 264 }, { "epoch": 0.021521968651019248, "grad_norm": 4.041664795202519, "learning_rate": 3.5810810810810816e-06, "loss": 0.6117, "step": 265 }, { "epoch": 0.02160318362706083, "grad_norm": 4.742239360128328, "learning_rate": 3.5945945945945946e-06, "loss": 0.6543, "step": 266 }, { "epoch": 0.021684398603102412, "grad_norm": 5.6376075863303114, "learning_rate": 3.608108108108108e-06, "loss": 0.7132, "step": 267 }, { "epoch": 0.021765613579143993, "grad_norm": 4.108413552937677, "learning_rate": 3.621621621621622e-06, "loss": 0.9163, "step": 268 }, { "epoch": 0.021846828555185577, "grad_norm": 7.767807764665652, "learning_rate": 3.6351351351351354e-06, "loss": 0.6385, "step": 269 }, { "epoch": 0.021928043531227157, "grad_norm": 5.864609873584057, "learning_rate": 3.648648648648649e-06, "loss": 0.6538, "step": 270 }, { "epoch": 0.02200925850726874, "grad_norm": 4.820511968405664, "learning_rate": 3.6621621621621624e-06, "loss": 0.6993, "step": 271 }, { "epoch": 0.02209047348331032, "grad_norm": 4.967071259608292, "learning_rate": 3.6756756756756763e-06, "loss": 0.8915, "step": 272 }, { "epoch": 0.022171688459351906, "grad_norm": 3.7176252188194723, "learning_rate": 3.6891891891891897e-06, "loss": 0.906, "step": 273 }, { "epoch": 0.022252903435393486, "grad_norm": 7.432800095587209, "learning_rate": 3.7027027027027028e-06, "loss": 0.5487, "step": 274 }, { "epoch": 0.02233411841143507, "grad_norm": 4.725582603425111, "learning_rate": 3.7162162162162162e-06, "loss": 0.6497, "step": 275 }, { "epoch": 0.02241533338747665, "grad_norm": 7.338793955644148, "learning_rate": 3.72972972972973e-06, "loss": 0.6416, "step": 276 }, { "epoch": 0.02249654836351823, "grad_norm": 5.306407989783936, "learning_rate": 3.7432432432432436e-06, "loss": 0.6278, "step": 277 }, { "epoch": 0.022577763339559815, "grad_norm": 5.1495205257748164, "learning_rate": 3.756756756756757e-06, "loss": 0.7679, "step": 278 }, { "epoch": 0.022658978315601395, "grad_norm": 5.708997533226292, "learning_rate": 3.7702702702702705e-06, "loss": 0.7532, "step": 279 }, { "epoch": 0.02274019329164298, "grad_norm": 8.83003188884847, "learning_rate": 3.7837837837837844e-06, "loss": 0.7133, "step": 280 }, { "epoch": 0.02282140826768456, "grad_norm": 5.539341071050379, "learning_rate": 3.797297297297298e-06, "loss": 0.7097, "step": 281 }, { "epoch": 0.022902623243726144, "grad_norm": 8.240407300947401, "learning_rate": 3.810810810810811e-06, "loss": 0.6048, "step": 282 }, { "epoch": 0.022983838219767724, "grad_norm": 4.587467280833204, "learning_rate": 3.824324324324324e-06, "loss": 0.8175, "step": 283 }, { "epoch": 0.02306505319580931, "grad_norm": 7.179515697903426, "learning_rate": 3.837837837837838e-06, "loss": 0.8436, "step": 284 }, { "epoch": 0.02314626817185089, "grad_norm": 5.309912678695326, "learning_rate": 3.851351351351352e-06, "loss": 0.625, "step": 285 }, { "epoch": 0.023227483147892473, "grad_norm": 4.941339777332484, "learning_rate": 3.864864864864865e-06, "loss": 0.6989, "step": 286 }, { "epoch": 0.023308698123934053, "grad_norm": 5.2052407828367215, "learning_rate": 3.878378378378378e-06, "loss": 0.5503, "step": 287 }, { "epoch": 0.023389913099975637, "grad_norm": 4.238862573234204, "learning_rate": 3.891891891891892e-06, "loss": 0.8404, "step": 288 }, { "epoch": 0.023471128076017218, "grad_norm": 9.415691207617115, "learning_rate": 3.905405405405406e-06, "loss": 0.6679, "step": 289 }, { "epoch": 0.023552343052058798, "grad_norm": 4.698876605885647, "learning_rate": 3.918918918918919e-06, "loss": 0.5115, "step": 290 }, { "epoch": 0.023633558028100382, "grad_norm": 8.590289244453471, "learning_rate": 3.932432432432433e-06, "loss": 0.8333, "step": 291 }, { "epoch": 0.023714773004141963, "grad_norm": 3.9911010631765764, "learning_rate": 3.945945945945947e-06, "loss": 0.5686, "step": 292 }, { "epoch": 0.023795987980183547, "grad_norm": 4.926542248398489, "learning_rate": 3.95945945945946e-06, "loss": 0.6576, "step": 293 }, { "epoch": 0.023877202956225127, "grad_norm": 7.808087195520982, "learning_rate": 3.972972972972973e-06, "loss": 0.7971, "step": 294 }, { "epoch": 0.02395841793226671, "grad_norm": 4.140299491048756, "learning_rate": 3.986486486486487e-06, "loss": 0.649, "step": 295 }, { "epoch": 0.02403963290830829, "grad_norm": 6.598136262319362, "learning_rate": 4.000000000000001e-06, "loss": 0.6101, "step": 296 }, { "epoch": 0.024120847884349875, "grad_norm": 6.657448338847534, "learning_rate": 4.013513513513514e-06, "loss": 0.5807, "step": 297 }, { "epoch": 0.024202062860391456, "grad_norm": 6.049362925798347, "learning_rate": 4.027027027027028e-06, "loss": 0.614, "step": 298 }, { "epoch": 0.02428327783643304, "grad_norm": 7.1798282609319415, "learning_rate": 4.040540540540541e-06, "loss": 0.6665, "step": 299 }, { "epoch": 0.02436449281247462, "grad_norm": 15.049209513289172, "learning_rate": 4.0540540540540545e-06, "loss": 0.6214, "step": 300 }, { "epoch": 0.0244457077885162, "grad_norm": 3.8053242555293796, "learning_rate": 4.067567567567568e-06, "loss": 0.9301, "step": 301 }, { "epoch": 0.024526922764557785, "grad_norm": 5.676808515237133, "learning_rate": 4.0810810810810815e-06, "loss": 0.7352, "step": 302 }, { "epoch": 0.024608137740599365, "grad_norm": 3.930706133352666, "learning_rate": 4.0945945945945945e-06, "loss": 0.6721, "step": 303 }, { "epoch": 0.02468935271664095, "grad_norm": 4.260405802198655, "learning_rate": 4.108108108108108e-06, "loss": 0.6061, "step": 304 }, { "epoch": 0.02477056769268253, "grad_norm": 3.8039307407950544, "learning_rate": 4.121621621621622e-06, "loss": 0.7034, "step": 305 }, { "epoch": 0.024851782668724114, "grad_norm": 4.487160345392378, "learning_rate": 4.135135135135135e-06, "loss": 0.703, "step": 306 }, { "epoch": 0.024932997644765694, "grad_norm": 7.679594903469531, "learning_rate": 4.148648648648649e-06, "loss": 0.6759, "step": 307 }, { "epoch": 0.025014212620807278, "grad_norm": 3.8651431834889203, "learning_rate": 4.162162162162163e-06, "loss": 0.6642, "step": 308 }, { "epoch": 0.02509542759684886, "grad_norm": 7.531093080992881, "learning_rate": 4.175675675675676e-06, "loss": 0.6803, "step": 309 }, { "epoch": 0.025176642572890442, "grad_norm": 5.135386924538244, "learning_rate": 4.189189189189189e-06, "loss": 0.9062, "step": 310 }, { "epoch": 0.025257857548932023, "grad_norm": 6.817605774173451, "learning_rate": 4.202702702702703e-06, "loss": 0.5363, "step": 311 }, { "epoch": 0.025339072524973603, "grad_norm": 4.999220532436134, "learning_rate": 4.216216216216217e-06, "loss": 0.7114, "step": 312 }, { "epoch": 0.025420287501015187, "grad_norm": 5.1328146811650495, "learning_rate": 4.22972972972973e-06, "loss": 0.6691, "step": 313 }, { "epoch": 0.025501502477056768, "grad_norm": 5.421243989150761, "learning_rate": 4.243243243243244e-06, "loss": 0.5742, "step": 314 }, { "epoch": 0.025582717453098352, "grad_norm": 5.473638240229686, "learning_rate": 4.256756756756757e-06, "loss": 0.6078, "step": 315 }, { "epoch": 0.025663932429139932, "grad_norm": 4.005836520564742, "learning_rate": 4.270270270270271e-06, "loss": 0.6261, "step": 316 }, { "epoch": 0.025745147405181516, "grad_norm": 4.474936034323712, "learning_rate": 4.283783783783784e-06, "loss": 0.7862, "step": 317 }, { "epoch": 0.025826362381223097, "grad_norm": 7.022757009649874, "learning_rate": 4.297297297297298e-06, "loss": 0.7268, "step": 318 }, { "epoch": 0.02590757735726468, "grad_norm": 4.756298282781381, "learning_rate": 4.310810810810811e-06, "loss": 0.9672, "step": 319 }, { "epoch": 0.02598879233330626, "grad_norm": 3.7972155322987797, "learning_rate": 4.324324324324325e-06, "loss": 0.7338, "step": 320 }, { "epoch": 0.026070007309347845, "grad_norm": 6.4304814258504, "learning_rate": 4.3378378378378385e-06, "loss": 0.618, "step": 321 }, { "epoch": 0.026151222285389426, "grad_norm": 4.985880984448781, "learning_rate": 4.351351351351352e-06, "loss": 0.752, "step": 322 }, { "epoch": 0.02623243726143101, "grad_norm": 6.965800077960176, "learning_rate": 4.364864864864865e-06, "loss": 0.6961, "step": 323 }, { "epoch": 0.02631365223747259, "grad_norm": 5.990584472824659, "learning_rate": 4.378378378378379e-06, "loss": 0.68, "step": 324 }, { "epoch": 0.02639486721351417, "grad_norm": 3.9564290883110647, "learning_rate": 4.391891891891892e-06, "loss": 0.6824, "step": 325 }, { "epoch": 0.026476082189555755, "grad_norm": 4.712991398333253, "learning_rate": 4.4054054054054054e-06, "loss": 0.7004, "step": 326 }, { "epoch": 0.026557297165597335, "grad_norm": 4.011893314732997, "learning_rate": 4.418918918918919e-06, "loss": 0.5713, "step": 327 }, { "epoch": 0.02663851214163892, "grad_norm": 10.789632918330955, "learning_rate": 4.432432432432433e-06, "loss": 0.5751, "step": 328 }, { "epoch": 0.0267197271176805, "grad_norm": 5.480368608439147, "learning_rate": 4.445945945945946e-06, "loss": 0.621, "step": 329 }, { "epoch": 0.026800942093722083, "grad_norm": 5.7028472331846, "learning_rate": 4.45945945945946e-06, "loss": 0.6287, "step": 330 }, { "epoch": 0.026882157069763664, "grad_norm": 4.278443647971252, "learning_rate": 4.472972972972973e-06, "loss": 0.7512, "step": 331 }, { "epoch": 0.026963372045805248, "grad_norm": 5.540973909463085, "learning_rate": 4.486486486486487e-06, "loss": 0.6958, "step": 332 }, { "epoch": 0.02704458702184683, "grad_norm": 8.48363442658351, "learning_rate": 4.5e-06, "loss": 0.6622, "step": 333 }, { "epoch": 0.027125801997888412, "grad_norm": 5.265277541309016, "learning_rate": 4.513513513513514e-06, "loss": 0.7521, "step": 334 }, { "epoch": 0.027207016973929993, "grad_norm": 6.782051797558551, "learning_rate": 4.527027027027027e-06, "loss": 0.5695, "step": 335 }, { "epoch": 0.027288231949971573, "grad_norm": 7.707736747999393, "learning_rate": 4.540540540540541e-06, "loss": 0.7251, "step": 336 }, { "epoch": 0.027369446926013157, "grad_norm": 9.996464962694214, "learning_rate": 4.554054054054055e-06, "loss": 0.7112, "step": 337 }, { "epoch": 0.027450661902054738, "grad_norm": 5.660779752011064, "learning_rate": 4.567567567567568e-06, "loss": 0.7041, "step": 338 }, { "epoch": 0.02753187687809632, "grad_norm": 6.58420296764706, "learning_rate": 4.581081081081081e-06, "loss": 0.7357, "step": 339 }, { "epoch": 0.027613091854137902, "grad_norm": 6.154533828404867, "learning_rate": 4.594594594594596e-06, "loss": 0.8672, "step": 340 }, { "epoch": 0.027694306830179486, "grad_norm": 3.6916825043347874, "learning_rate": 4.608108108108109e-06, "loss": 0.7116, "step": 341 }, { "epoch": 0.027775521806221067, "grad_norm": 5.536606059040381, "learning_rate": 4.621621621621622e-06, "loss": 0.5388, "step": 342 }, { "epoch": 0.02785673678226265, "grad_norm": 5.0131032158949305, "learning_rate": 4.635135135135136e-06, "loss": 0.63, "step": 343 }, { "epoch": 0.02793795175830423, "grad_norm": 8.645264812499297, "learning_rate": 4.6486486486486495e-06, "loss": 0.6898, "step": 344 }, { "epoch": 0.028019166734345815, "grad_norm": 8.845007826985789, "learning_rate": 4.6621621621621625e-06, "loss": 0.7366, "step": 345 }, { "epoch": 0.028100381710387395, "grad_norm": 6.800329318246896, "learning_rate": 4.675675675675676e-06, "loss": 0.6626, "step": 346 }, { "epoch": 0.028181596686428976, "grad_norm": 3.843752947734245, "learning_rate": 4.6891891891891895e-06, "loss": 0.9352, "step": 347 }, { "epoch": 0.02826281166247056, "grad_norm": 4.717520289005672, "learning_rate": 4.702702702702703e-06, "loss": 0.558, "step": 348 }, { "epoch": 0.02834402663851214, "grad_norm": 5.18843657142752, "learning_rate": 4.716216216216216e-06, "loss": 0.6733, "step": 349 }, { "epoch": 0.028425241614553724, "grad_norm": 6.203969455338302, "learning_rate": 4.72972972972973e-06, "loss": 0.7618, "step": 350 }, { "epoch": 0.028506456590595305, "grad_norm": 5.787025528310779, "learning_rate": 4.743243243243243e-06, "loss": 0.6203, "step": 351 }, { "epoch": 0.02858767156663689, "grad_norm": 5.74015922758883, "learning_rate": 4.756756756756757e-06, "loss": 0.7899, "step": 352 }, { "epoch": 0.02866888654267847, "grad_norm": 3.468789863957611, "learning_rate": 4.770270270270271e-06, "loss": 0.5911, "step": 353 }, { "epoch": 0.028750101518720053, "grad_norm": 4.222539599204078, "learning_rate": 4.783783783783784e-06, "loss": 0.6077, "step": 354 }, { "epoch": 0.028831316494761634, "grad_norm": 5.987825387101823, "learning_rate": 4.797297297297297e-06, "loss": 0.631, "step": 355 }, { "epoch": 0.028912531470803218, "grad_norm": 3.494510978483797, "learning_rate": 4.810810810810811e-06, "loss": 0.6175, "step": 356 }, { "epoch": 0.028993746446844798, "grad_norm": 5.341804933211992, "learning_rate": 4.824324324324325e-06, "loss": 0.8241, "step": 357 }, { "epoch": 0.02907496142288638, "grad_norm": 5.332752685486192, "learning_rate": 4.837837837837838e-06, "loss": 0.6647, "step": 358 }, { "epoch": 0.029156176398927963, "grad_norm": 4.174093461478239, "learning_rate": 4.851351351351352e-06, "loss": 0.681, "step": 359 }, { "epoch": 0.029237391374969543, "grad_norm": 4.796111353710757, "learning_rate": 4.864864864864866e-06, "loss": 0.7803, "step": 360 }, { "epoch": 0.029318606351011127, "grad_norm": 4.430094054311749, "learning_rate": 4.878378378378379e-06, "loss": 0.5628, "step": 361 }, { "epoch": 0.029399821327052707, "grad_norm": 4.736436195711088, "learning_rate": 4.891891891891893e-06, "loss": 0.6637, "step": 362 }, { "epoch": 0.02948103630309429, "grad_norm": 5.52908335708342, "learning_rate": 4.905405405405406e-06, "loss": 0.6167, "step": 363 }, { "epoch": 0.029562251279135872, "grad_norm": 5.203669203589008, "learning_rate": 4.91891891891892e-06, "loss": 0.7164, "step": 364 }, { "epoch": 0.029643466255177456, "grad_norm": 5.632857406640382, "learning_rate": 4.932432432432433e-06, "loss": 0.6265, "step": 365 }, { "epoch": 0.029724681231219036, "grad_norm": 4.525898082221645, "learning_rate": 4.9459459459459466e-06, "loss": 0.6532, "step": 366 }, { "epoch": 0.02980589620726062, "grad_norm": 3.993279600481055, "learning_rate": 4.95945945945946e-06, "loss": 0.6465, "step": 367 }, { "epoch": 0.0298871111833022, "grad_norm": 4.976392222325483, "learning_rate": 4.9729729729729735e-06, "loss": 0.6768, "step": 368 }, { "epoch": 0.029968326159343785, "grad_norm": 7.027194066182947, "learning_rate": 4.986486486486487e-06, "loss": 0.7201, "step": 369 }, { "epoch": 0.030049541135385365, "grad_norm": 5.6159327874829605, "learning_rate": 5e-06, "loss": 0.6772, "step": 370 }, { "epoch": 0.030130756111426946, "grad_norm": 4.876745048221957, "learning_rate": 4.999999913506616e-06, "loss": 0.6551, "step": 371 }, { "epoch": 0.03021197108746853, "grad_norm": 11.157591685761751, "learning_rate": 4.999999654026468e-06, "loss": 0.5551, "step": 372 }, { "epoch": 0.03029318606351011, "grad_norm": 4.729581532775085, "learning_rate": 4.999999221559576e-06, "loss": 0.7666, "step": 373 }, { "epoch": 0.030374401039551694, "grad_norm": 5.683222450900519, "learning_rate": 4.9999986161059685e-06, "loss": 0.7107, "step": 374 }, { "epoch": 0.030455616015593275, "grad_norm": 20.13916455542101, "learning_rate": 4.9999978376656875e-06, "loss": 0.5651, "step": 375 }, { "epoch": 0.03053683099163486, "grad_norm": 4.663636959572572, "learning_rate": 4.999996886238788e-06, "loss": 0.6166, "step": 376 }, { "epoch": 0.03061804596767644, "grad_norm": 3.547301470431729, "learning_rate": 4.999995761825335e-06, "loss": 0.6379, "step": 377 }, { "epoch": 0.030699260943718023, "grad_norm": 4.026761047432411, "learning_rate": 4.999994464425406e-06, "loss": 0.6795, "step": 378 }, { "epoch": 0.030780475919759603, "grad_norm": 4.177399049954423, "learning_rate": 4.99999299403909e-06, "loss": 0.6356, "step": 379 }, { "epoch": 0.030861690895801187, "grad_norm": 8.689971440287565, "learning_rate": 4.999991350666491e-06, "loss": 0.5655, "step": 380 }, { "epoch": 0.030942905871842768, "grad_norm": 4.286085314862286, "learning_rate": 4.999989534307722e-06, "loss": 0.7988, "step": 381 }, { "epoch": 0.03102412084788435, "grad_norm": 4.468190941115001, "learning_rate": 4.999987544962908e-06, "loss": 0.6375, "step": 382 }, { "epoch": 0.031105335823925932, "grad_norm": 11.088261247896137, "learning_rate": 4.999985382632186e-06, "loss": 0.6834, "step": 383 }, { "epoch": 0.031186550799967513, "grad_norm": 4.746884755745409, "learning_rate": 4.9999830473157065e-06, "loss": 0.6825, "step": 384 }, { "epoch": 0.03126776577600909, "grad_norm": 5.042936323367531, "learning_rate": 4.9999805390136315e-06, "loss": 0.6341, "step": 385 }, { "epoch": 0.03134898075205068, "grad_norm": 4.6209559647373375, "learning_rate": 4.999977857726135e-06, "loss": 0.5923, "step": 386 }, { "epoch": 0.03143019572809226, "grad_norm": 5.563156193085517, "learning_rate": 4.999975003453401e-06, "loss": 0.6442, "step": 387 }, { "epoch": 0.031511410704133845, "grad_norm": 5.076759188588105, "learning_rate": 4.999971976195628e-06, "loss": 0.6856, "step": 388 }, { "epoch": 0.03159262568017542, "grad_norm": 3.752599670476179, "learning_rate": 4.999968775953025e-06, "loss": 0.7078, "step": 389 }, { "epoch": 0.031673840656217006, "grad_norm": 4.540276130175966, "learning_rate": 4.999965402725812e-06, "loss": 0.6747, "step": 390 }, { "epoch": 0.03175505563225859, "grad_norm": 6.85554781706532, "learning_rate": 4.999961856514226e-06, "loss": 0.58, "step": 391 }, { "epoch": 0.031836270608300174, "grad_norm": 4.470561799528824, "learning_rate": 4.99995813731851e-06, "loss": 0.544, "step": 392 }, { "epoch": 0.03191748558434175, "grad_norm": 3.381515929810499, "learning_rate": 4.999954245138921e-06, "loss": 0.6551, "step": 393 }, { "epoch": 0.031998700560383335, "grad_norm": 5.642691704207667, "learning_rate": 4.99995017997573e-06, "loss": 0.7476, "step": 394 }, { "epoch": 0.03207991553642492, "grad_norm": 4.423437336138073, "learning_rate": 4.999945941829217e-06, "loss": 0.6501, "step": 395 }, { "epoch": 0.032161130512466496, "grad_norm": 5.470944485898885, "learning_rate": 4.999941530699675e-06, "loss": 0.5093, "step": 396 }, { "epoch": 0.03224234548850808, "grad_norm": 6.234514653013104, "learning_rate": 4.999936946587412e-06, "loss": 0.5785, "step": 397 }, { "epoch": 0.032323560464549664, "grad_norm": 4.217022001267068, "learning_rate": 4.999932189492741e-06, "loss": 0.6715, "step": 398 }, { "epoch": 0.03240477544059125, "grad_norm": 3.997512319922242, "learning_rate": 4.999927259415994e-06, "loss": 0.6798, "step": 399 }, { "epoch": 0.032485990416632825, "grad_norm": 5.231675871788692, "learning_rate": 4.99992215635751e-06, "loss": 0.581, "step": 400 }, { "epoch": 0.03256720539267441, "grad_norm": 6.544773650290789, "learning_rate": 4.999916880317645e-06, "loss": 0.608, "step": 401 }, { "epoch": 0.03264842036871599, "grad_norm": 5.966992739092824, "learning_rate": 4.999911431296762e-06, "loss": 0.6306, "step": 402 }, { "epoch": 0.03272963534475758, "grad_norm": 9.143165980782102, "learning_rate": 4.999905809295239e-06, "loss": 0.7834, "step": 403 }, { "epoch": 0.032810850320799154, "grad_norm": 5.104500215594029, "learning_rate": 4.999900014313464e-06, "loss": 0.5444, "step": 404 }, { "epoch": 0.03289206529684074, "grad_norm": 3.982825484617581, "learning_rate": 4.999894046351839e-06, "loss": 0.6663, "step": 405 }, { "epoch": 0.03297328027288232, "grad_norm": 9.429533561374097, "learning_rate": 4.999887905410775e-06, "loss": 0.6111, "step": 406 }, { "epoch": 0.0330544952489239, "grad_norm": 6.206743996051552, "learning_rate": 4.9998815914907e-06, "loss": 0.6354, "step": 407 }, { "epoch": 0.03313571022496548, "grad_norm": 8.266987795691529, "learning_rate": 4.9998751045920494e-06, "loss": 0.5287, "step": 408 }, { "epoch": 0.033216925201007066, "grad_norm": 5.87817379773943, "learning_rate": 4.999868444715271e-06, "loss": 0.7346, "step": 409 }, { "epoch": 0.03329814017704865, "grad_norm": 5.524749698493624, "learning_rate": 4.999861611860827e-06, "loss": 0.702, "step": 410 }, { "epoch": 0.03337935515309023, "grad_norm": 4.172513273585029, "learning_rate": 4.99985460602919e-06, "loss": 0.6599, "step": 411 }, { "epoch": 0.03346057012913181, "grad_norm": 4.750265866846706, "learning_rate": 4.9998474272208445e-06, "loss": 0.5904, "step": 412 }, { "epoch": 0.033541785105173395, "grad_norm": 5.899694142476412, "learning_rate": 4.999840075436286e-06, "loss": 0.5972, "step": 413 }, { "epoch": 0.03362300008121498, "grad_norm": 8.320548493491238, "learning_rate": 4.999832550676026e-06, "loss": 0.5492, "step": 414 }, { "epoch": 0.033704215057256556, "grad_norm": 4.509397796756542, "learning_rate": 4.999824852940583e-06, "loss": 0.7786, "step": 415 }, { "epoch": 0.03378543003329814, "grad_norm": 5.616284004220855, "learning_rate": 4.999816982230491e-06, "loss": 0.7635, "step": 416 }, { "epoch": 0.033866645009339724, "grad_norm": 5.599669292826977, "learning_rate": 4.999808938546294e-06, "loss": 0.759, "step": 417 }, { "epoch": 0.0339478599853813, "grad_norm": 6.6642180967818945, "learning_rate": 4.999800721888548e-06, "loss": 0.6626, "step": 418 }, { "epoch": 0.034029074961422885, "grad_norm": 5.882885604144428, "learning_rate": 4.999792332257822e-06, "loss": 0.8418, "step": 419 }, { "epoch": 0.03411028993746447, "grad_norm": 4.363979980545484, "learning_rate": 4.999783769654697e-06, "loss": 0.6134, "step": 420 }, { "epoch": 0.03419150491350605, "grad_norm": 4.776238235905251, "learning_rate": 4.999775034079765e-06, "loss": 0.7331, "step": 421 }, { "epoch": 0.03427271988954763, "grad_norm": 4.945197802004096, "learning_rate": 4.99976612553363e-06, "loss": 0.692, "step": 422 }, { "epoch": 0.034353934865589214, "grad_norm": 3.1538233404162628, "learning_rate": 4.999757044016909e-06, "loss": 0.7029, "step": 423 }, { "epoch": 0.0344351498416308, "grad_norm": 5.660874509416241, "learning_rate": 4.99974778953023e-06, "loss": 0.6652, "step": 424 }, { "epoch": 0.03451636481767238, "grad_norm": 3.5726277574942857, "learning_rate": 4.9997383620742354e-06, "loss": 0.7189, "step": 425 }, { "epoch": 0.03459757979371396, "grad_norm": 5.076928873914476, "learning_rate": 4.9997287616495745e-06, "loss": 0.6376, "step": 426 }, { "epoch": 0.03467879476975554, "grad_norm": 7.805331425608438, "learning_rate": 4.999718988256913e-06, "loss": 0.6691, "step": 427 }, { "epoch": 0.03476000974579713, "grad_norm": 6.422161317682913, "learning_rate": 4.999709041896927e-06, "loss": 0.6528, "step": 428 }, { "epoch": 0.034841224721838704, "grad_norm": 4.561480725928831, "learning_rate": 4.9996989225703055e-06, "loss": 0.5714, "step": 429 }, { "epoch": 0.03492243969788029, "grad_norm": 5.799407616311238, "learning_rate": 4.9996886302777466e-06, "loss": 0.6321, "step": 430 }, { "epoch": 0.03500365467392187, "grad_norm": 6.86134449223638, "learning_rate": 4.9996781650199655e-06, "loss": 0.6045, "step": 431 }, { "epoch": 0.035084869649963456, "grad_norm": 4.7834450971379034, "learning_rate": 4.999667526797685e-06, "loss": 0.8719, "step": 432 }, { "epoch": 0.03516608462600503, "grad_norm": 8.624024638101803, "learning_rate": 4.9996567156116395e-06, "loss": 0.7095, "step": 433 }, { "epoch": 0.03524729960204662, "grad_norm": 4.366180557531383, "learning_rate": 4.9996457314625794e-06, "loss": 0.5537, "step": 434 }, { "epoch": 0.0353285145780882, "grad_norm": 5.128282050398408, "learning_rate": 4.9996345743512635e-06, "loss": 0.8241, "step": 435 }, { "epoch": 0.035409729554129785, "grad_norm": 5.611359253780588, "learning_rate": 4.999623244278464e-06, "loss": 0.6241, "step": 436 }, { "epoch": 0.03549094453017136, "grad_norm": 14.430464729028463, "learning_rate": 4.999611741244965e-06, "loss": 0.5835, "step": 437 }, { "epoch": 0.035572159506212946, "grad_norm": 5.4664011456664285, "learning_rate": 4.999600065251563e-06, "loss": 0.6353, "step": 438 }, { "epoch": 0.03565337448225453, "grad_norm": 3.9622611695813172, "learning_rate": 4.999588216299065e-06, "loss": 0.5976, "step": 439 }, { "epoch": 0.03573458945829611, "grad_norm": 5.613516770561363, "learning_rate": 4.999576194388292e-06, "loss": 0.7192, "step": 440 }, { "epoch": 0.03581580443433769, "grad_norm": 4.503094326677596, "learning_rate": 4.999563999520075e-06, "loss": 0.587, "step": 441 }, { "epoch": 0.035897019410379274, "grad_norm": 4.636439079968228, "learning_rate": 4.999551631695257e-06, "loss": 0.4959, "step": 442 }, { "epoch": 0.03597823438642086, "grad_norm": 4.842745847770766, "learning_rate": 4.999539090914696e-06, "loss": 0.8282, "step": 443 }, { "epoch": 0.036059449362462435, "grad_norm": 19.890875208483, "learning_rate": 4.999526377179259e-06, "loss": 0.6707, "step": 444 }, { "epoch": 0.03614066433850402, "grad_norm": 4.903212810732332, "learning_rate": 4.999513490489824e-06, "loss": 0.5994, "step": 445 }, { "epoch": 0.0362218793145456, "grad_norm": 4.394920629425536, "learning_rate": 4.999500430847284e-06, "loss": 0.667, "step": 446 }, { "epoch": 0.03630309429058719, "grad_norm": 3.62470443817531, "learning_rate": 4.9994871982525425e-06, "loss": 0.703, "step": 447 }, { "epoch": 0.036384309266628764, "grad_norm": 6.417116392361057, "learning_rate": 4.999473792706516e-06, "loss": 0.6643, "step": 448 }, { "epoch": 0.03646552424267035, "grad_norm": 4.999543742426304, "learning_rate": 4.999460214210131e-06, "loss": 0.7914, "step": 449 }, { "epoch": 0.03654673921871193, "grad_norm": 5.788280407646206, "learning_rate": 4.999446462764327e-06, "loss": 0.6302, "step": 450 }, { "epoch": 0.03662795419475351, "grad_norm": 4.2978695329143495, "learning_rate": 4.999432538370057e-06, "loss": 0.6356, "step": 451 }, { "epoch": 0.03670916917079509, "grad_norm": 4.224970563305071, "learning_rate": 4.999418441028283e-06, "loss": 0.7923, "step": 452 }, { "epoch": 0.03679038414683668, "grad_norm": 6.438067501026122, "learning_rate": 4.9994041707399794e-06, "loss": 0.4936, "step": 453 }, { "epoch": 0.03687159912287826, "grad_norm": 4.579554653548933, "learning_rate": 4.999389727506137e-06, "loss": 0.58, "step": 454 }, { "epoch": 0.03695281409891984, "grad_norm": 4.4684771869350035, "learning_rate": 4.999375111327753e-06, "loss": 0.5475, "step": 455 }, { "epoch": 0.03703402907496142, "grad_norm": 7.907710639614374, "learning_rate": 4.999360322205838e-06, "loss": 0.6459, "step": 456 }, { "epoch": 0.037115244051003006, "grad_norm": 5.848886945428208, "learning_rate": 4.999345360141417e-06, "loss": 0.6723, "step": 457 }, { "epoch": 0.03719645902704459, "grad_norm": 3.055736509730621, "learning_rate": 4.999330225135525e-06, "loss": 0.7079, "step": 458 }, { "epoch": 0.03727767400308617, "grad_norm": 4.84395863310794, "learning_rate": 4.999314917189209e-06, "loss": 0.8212, "step": 459 }, { "epoch": 0.03735888897912775, "grad_norm": 4.4914093742426, "learning_rate": 4.999299436303527e-06, "loss": 0.7551, "step": 460 }, { "epoch": 0.037440103955169335, "grad_norm": 5.747535926175868, "learning_rate": 4.999283782479552e-06, "loss": 0.755, "step": 461 }, { "epoch": 0.03752131893121091, "grad_norm": 4.124807094235786, "learning_rate": 4.999267955718367e-06, "loss": 0.6151, "step": 462 }, { "epoch": 0.037602533907252496, "grad_norm": 4.951699574302496, "learning_rate": 4.999251956021066e-06, "loss": 0.6636, "step": 463 }, { "epoch": 0.03768374888329408, "grad_norm": 3.634905291149469, "learning_rate": 4.999235783388757e-06, "loss": 0.7115, "step": 464 }, { "epoch": 0.037764963859335664, "grad_norm": 7.068387813096013, "learning_rate": 4.999219437822559e-06, "loss": 0.689, "step": 465 }, { "epoch": 0.03784617883537724, "grad_norm": 4.404856833972122, "learning_rate": 4.999202919323603e-06, "loss": 0.6214, "step": 466 }, { "epoch": 0.037927393811418825, "grad_norm": 4.578684856908467, "learning_rate": 4.9991862278930315e-06, "loss": 0.7647, "step": 467 }, { "epoch": 0.03800860878746041, "grad_norm": 4.898513292964633, "learning_rate": 4.9991693635320005e-06, "loss": 0.6884, "step": 468 }, { "epoch": 0.03808982376350199, "grad_norm": 4.575368944744301, "learning_rate": 4.999152326241675e-06, "loss": 0.6647, "step": 469 }, { "epoch": 0.03817103873954357, "grad_norm": 4.238070688134001, "learning_rate": 4.999135116023236e-06, "loss": 0.6667, "step": 470 }, { "epoch": 0.038252253715585154, "grad_norm": 6.035566032840847, "learning_rate": 4.999117732877873e-06, "loss": 0.6754, "step": 471 }, { "epoch": 0.03833346869162674, "grad_norm": 3.607010706319493, "learning_rate": 4.9991001768067895e-06, "loss": 0.8115, "step": 472 }, { "epoch": 0.03841468366766832, "grad_norm": 6.191780527301594, "learning_rate": 4.9990824478112e-06, "loss": 0.6837, "step": 473 }, { "epoch": 0.0384958986437099, "grad_norm": 5.355617747355118, "learning_rate": 4.999064545892331e-06, "loss": 0.7097, "step": 474 }, { "epoch": 0.03857711361975148, "grad_norm": 4.233673909331852, "learning_rate": 4.999046471051422e-06, "loss": 0.5915, "step": 475 }, { "epoch": 0.038658328595793066, "grad_norm": 4.780173065160517, "learning_rate": 4.999028223289724e-06, "loss": 0.9316, "step": 476 }, { "epoch": 0.03873954357183464, "grad_norm": 4.063084409048435, "learning_rate": 4.999009802608497e-06, "loss": 0.6671, "step": 477 }, { "epoch": 0.03882075854787623, "grad_norm": 4.851439856688829, "learning_rate": 4.998991209009019e-06, "loss": 0.8034, "step": 478 }, { "epoch": 0.03890197352391781, "grad_norm": 7.105889513775342, "learning_rate": 4.998972442492575e-06, "loss": 0.532, "step": 479 }, { "epoch": 0.038983188499959395, "grad_norm": 3.8451222439770683, "learning_rate": 4.9989535030604615e-06, "loss": 0.5993, "step": 480 }, { "epoch": 0.03906440347600097, "grad_norm": 5.621997119878575, "learning_rate": 4.998934390713994e-06, "loss": 0.5273, "step": 481 }, { "epoch": 0.039145618452042556, "grad_norm": 4.81307738692358, "learning_rate": 4.9989151054544905e-06, "loss": 0.6318, "step": 482 }, { "epoch": 0.03922683342808414, "grad_norm": 3.849217538759433, "learning_rate": 4.998895647283287e-06, "loss": 0.6886, "step": 483 }, { "epoch": 0.039308048404125724, "grad_norm": 3.909888024088152, "learning_rate": 4.99887601620173e-06, "loss": 0.7443, "step": 484 }, { "epoch": 0.0393892633801673, "grad_norm": 5.4327964670836835, "learning_rate": 4.9988562122111785e-06, "loss": 0.6783, "step": 485 }, { "epoch": 0.039470478356208885, "grad_norm": 4.60325431470408, "learning_rate": 4.998836235313001e-06, "loss": 0.6589, "step": 486 }, { "epoch": 0.03955169333225047, "grad_norm": 4.5476339338388945, "learning_rate": 4.998816085508582e-06, "loss": 0.5747, "step": 487 }, { "epoch": 0.039632908308292046, "grad_norm": 4.970415173334207, "learning_rate": 4.9987957627993145e-06, "loss": 0.6026, "step": 488 }, { "epoch": 0.03971412328433363, "grad_norm": 4.315065332489597, "learning_rate": 4.998775267186605e-06, "loss": 0.7375, "step": 489 }, { "epoch": 0.039795338260375214, "grad_norm": 5.5992123809226, "learning_rate": 4.998754598671871e-06, "loss": 0.6427, "step": 490 }, { "epoch": 0.0398765532364168, "grad_norm": 3.7014622620168405, "learning_rate": 4.998733757256544e-06, "loss": 0.7897, "step": 491 }, { "epoch": 0.039957768212458375, "grad_norm": 6.7379942751786235, "learning_rate": 4.998712742942065e-06, "loss": 0.6393, "step": 492 }, { "epoch": 0.04003898318849996, "grad_norm": 4.857719068955753, "learning_rate": 4.998691555729888e-06, "loss": 0.6606, "step": 493 }, { "epoch": 0.04012019816454154, "grad_norm": 4.0539838554657885, "learning_rate": 4.9986701956214804e-06, "loss": 0.7552, "step": 494 }, { "epoch": 0.04020141314058313, "grad_norm": 4.957439145405436, "learning_rate": 4.998648662618318e-06, "loss": 0.666, "step": 495 }, { "epoch": 0.040282628116624704, "grad_norm": 4.953757290265213, "learning_rate": 4.998626956721894e-06, "loss": 0.6508, "step": 496 }, { "epoch": 0.04036384309266629, "grad_norm": 3.84200026766362, "learning_rate": 4.998605077933706e-06, "loss": 0.6307, "step": 497 }, { "epoch": 0.04044505806870787, "grad_norm": 6.038380469894045, "learning_rate": 4.998583026255272e-06, "loss": 0.794, "step": 498 }, { "epoch": 0.04052627304474945, "grad_norm": 5.862309248926957, "learning_rate": 4.998560801688116e-06, "loss": 0.5471, "step": 499 }, { "epoch": 0.04060748802079103, "grad_norm": 5.376571375980098, "learning_rate": 4.998538404233776e-06, "loss": 0.6685, "step": 500 }, { "epoch": 0.04068870299683262, "grad_norm": 3.972813169207523, "learning_rate": 4.998515833893801e-06, "loss": 0.5943, "step": 501 }, { "epoch": 0.0407699179728742, "grad_norm": 5.457428585886831, "learning_rate": 4.998493090669754e-06, "loss": 0.6821, "step": 502 }, { "epoch": 0.04085113294891578, "grad_norm": 5.526128969258129, "learning_rate": 4.998470174563208e-06, "loss": 0.6268, "step": 503 }, { "epoch": 0.04093234792495736, "grad_norm": 5.00358557141072, "learning_rate": 4.9984470855757485e-06, "loss": 0.5511, "step": 504 }, { "epoch": 0.041013562900998946, "grad_norm": 9.228964746118006, "learning_rate": 4.998423823708974e-06, "loss": 0.7192, "step": 505 }, { "epoch": 0.04109477787704053, "grad_norm": 7.043937824113614, "learning_rate": 4.998400388964494e-06, "loss": 0.6898, "step": 506 }, { "epoch": 0.041175992853082106, "grad_norm": 7.469052434654404, "learning_rate": 4.998376781343929e-06, "loss": 0.6673, "step": 507 }, { "epoch": 0.04125720782912369, "grad_norm": 5.819947119265823, "learning_rate": 4.998353000848913e-06, "loss": 0.8352, "step": 508 }, { "epoch": 0.041338422805165274, "grad_norm": 4.632267764249478, "learning_rate": 4.998329047481093e-06, "loss": 0.6627, "step": 509 }, { "epoch": 0.04141963778120685, "grad_norm": 5.04622538690542, "learning_rate": 4.998304921242124e-06, "loss": 0.6928, "step": 510 }, { "epoch": 0.041500852757248435, "grad_norm": 4.929054550494181, "learning_rate": 4.998280622133677e-06, "loss": 0.6864, "step": 511 }, { "epoch": 0.04158206773329002, "grad_norm": 5.630142199208348, "learning_rate": 4.998256150157433e-06, "loss": 0.5752, "step": 512 }, { "epoch": 0.0416632827093316, "grad_norm": 4.327018939734424, "learning_rate": 4.998231505315085e-06, "loss": 0.6679, "step": 513 }, { "epoch": 0.04174449768537318, "grad_norm": 5.207414191020043, "learning_rate": 4.998206687608339e-06, "loss": 0.5235, "step": 514 }, { "epoch": 0.041825712661414764, "grad_norm": 5.078181936565571, "learning_rate": 4.998181697038912e-06, "loss": 0.5066, "step": 515 }, { "epoch": 0.04190692763745635, "grad_norm": 4.596975319564036, "learning_rate": 4.998156533608531e-06, "loss": 0.5846, "step": 516 }, { "epoch": 0.04198814261349793, "grad_norm": 6.0532848525291545, "learning_rate": 4.998131197318942e-06, "loss": 0.5737, "step": 517 }, { "epoch": 0.04206935758953951, "grad_norm": 4.35820106724424, "learning_rate": 4.998105688171893e-06, "loss": 0.6762, "step": 518 }, { "epoch": 0.04215057256558109, "grad_norm": 6.664678597912458, "learning_rate": 4.998080006169153e-06, "loss": 0.5852, "step": 519 }, { "epoch": 0.04223178754162268, "grad_norm": 8.897090441557413, "learning_rate": 4.9980541513124966e-06, "loss": 0.7535, "step": 520 }, { "epoch": 0.042313002517664254, "grad_norm": 5.576078105545307, "learning_rate": 4.998028123603714e-06, "loss": 0.5836, "step": 521 }, { "epoch": 0.04239421749370584, "grad_norm": 5.498839425247008, "learning_rate": 4.998001923044605e-06, "loss": 0.638, "step": 522 }, { "epoch": 0.04247543246974742, "grad_norm": 7.495880701431175, "learning_rate": 4.997975549636985e-06, "loss": 0.6051, "step": 523 }, { "epoch": 0.042556647445789006, "grad_norm": 8.634014617651163, "learning_rate": 4.997949003382676e-06, "loss": 0.5373, "step": 524 }, { "epoch": 0.04263786242183058, "grad_norm": 3.6876707339560153, "learning_rate": 4.997922284283517e-06, "loss": 0.8439, "step": 525 }, { "epoch": 0.04271907739787217, "grad_norm": 4.386768122021177, "learning_rate": 4.997895392341356e-06, "loss": 0.9273, "step": 526 }, { "epoch": 0.04280029237391375, "grad_norm": 4.887505724579102, "learning_rate": 4.997868327558053e-06, "loss": 1.0235, "step": 527 }, { "epoch": 0.042881507349955335, "grad_norm": 4.002537001522704, "learning_rate": 4.997841089935482e-06, "loss": 0.7851, "step": 528 }, { "epoch": 0.04296272232599691, "grad_norm": 7.16699779080668, "learning_rate": 4.997813679475528e-06, "loss": 0.7571, "step": 529 }, { "epoch": 0.043043937302038496, "grad_norm": 4.775024972507428, "learning_rate": 4.997786096180086e-06, "loss": 0.721, "step": 530 }, { "epoch": 0.04312515227808008, "grad_norm": 5.099681936395401, "learning_rate": 4.997758340051066e-06, "loss": 0.5692, "step": 531 }, { "epoch": 0.04320636725412166, "grad_norm": 4.818659941047612, "learning_rate": 4.997730411090387e-06, "loss": 0.643, "step": 532 }, { "epoch": 0.04328758223016324, "grad_norm": 6.604728440380809, "learning_rate": 4.997702309299983e-06, "loss": 0.7149, "step": 533 }, { "epoch": 0.043368797206204825, "grad_norm": 5.6047753258513415, "learning_rate": 4.997674034681799e-06, "loss": 0.6094, "step": 534 }, { "epoch": 0.04345001218224641, "grad_norm": 5.772050070771113, "learning_rate": 4.99764558723779e-06, "loss": 0.5527, "step": 535 }, { "epoch": 0.043531227158287986, "grad_norm": 6.695215883564865, "learning_rate": 4.997616966969925e-06, "loss": 0.8073, "step": 536 }, { "epoch": 0.04361244213432957, "grad_norm": 5.0342584361934035, "learning_rate": 4.997588173880184e-06, "loss": 0.7011, "step": 537 }, { "epoch": 0.043693657110371154, "grad_norm": 4.746592585436567, "learning_rate": 4.99755920797056e-06, "loss": 0.6319, "step": 538 }, { "epoch": 0.04377487208641274, "grad_norm": 5.2897074099894095, "learning_rate": 4.997530069243057e-06, "loss": 0.5789, "step": 539 }, { "epoch": 0.043856087062454314, "grad_norm": 3.8007318490045416, "learning_rate": 4.997500757699691e-06, "loss": 0.6953, "step": 540 }, { "epoch": 0.0439373020384959, "grad_norm": 5.144861861634209, "learning_rate": 4.9974712733424905e-06, "loss": 0.621, "step": 541 }, { "epoch": 0.04401851701453748, "grad_norm": 5.905282410348868, "learning_rate": 4.997441616173495e-06, "loss": 0.6511, "step": 542 }, { "epoch": 0.04409973199057906, "grad_norm": 5.309964350881816, "learning_rate": 4.997411786194758e-06, "loss": 0.5526, "step": 543 }, { "epoch": 0.04418094696662064, "grad_norm": 11.149541991220845, "learning_rate": 4.997381783408343e-06, "loss": 0.6067, "step": 544 }, { "epoch": 0.04426216194266223, "grad_norm": 5.410844505382304, "learning_rate": 4.9973516078163256e-06, "loss": 0.6418, "step": 545 }, { "epoch": 0.04434337691870381, "grad_norm": 5.430764606424544, "learning_rate": 4.997321259420793e-06, "loss": 0.6921, "step": 546 }, { "epoch": 0.04442459189474539, "grad_norm": 11.412874034754271, "learning_rate": 4.997290738223847e-06, "loss": 0.5558, "step": 547 }, { "epoch": 0.04450580687078697, "grad_norm": 4.133154872685264, "learning_rate": 4.9972600442275985e-06, "loss": 0.6049, "step": 548 }, { "epoch": 0.044587021846828556, "grad_norm": 5.592148797903198, "learning_rate": 4.997229177434171e-06, "loss": 0.5138, "step": 549 }, { "epoch": 0.04466823682287014, "grad_norm": 8.487263757967648, "learning_rate": 4.997198137845702e-06, "loss": 0.7193, "step": 550 }, { "epoch": 0.04474945179891172, "grad_norm": 7.830506522230207, "learning_rate": 4.997166925464337e-06, "loss": 0.7594, "step": 551 }, { "epoch": 0.0448306667749533, "grad_norm": 8.206186247100188, "learning_rate": 4.997135540292237e-06, "loss": 0.5762, "step": 552 }, { "epoch": 0.044911881750994885, "grad_norm": 4.437021611163288, "learning_rate": 4.997103982331574e-06, "loss": 0.5914, "step": 553 }, { "epoch": 0.04499309672703646, "grad_norm": 5.38722516332683, "learning_rate": 4.997072251584531e-06, "loss": 0.5335, "step": 554 }, { "epoch": 0.045074311703078046, "grad_norm": 7.791956117401669, "learning_rate": 4.997040348053304e-06, "loss": 0.6441, "step": 555 }, { "epoch": 0.04515552667911963, "grad_norm": 4.66481520418914, "learning_rate": 4.9970082717401e-06, "loss": 0.8322, "step": 556 }, { "epoch": 0.045236741655161214, "grad_norm": 5.615716066994191, "learning_rate": 4.9969760226471385e-06, "loss": 0.6918, "step": 557 }, { "epoch": 0.04531795663120279, "grad_norm": 5.424151701577734, "learning_rate": 4.9969436007766514e-06, "loss": 0.5777, "step": 558 }, { "epoch": 0.045399171607244375, "grad_norm": 4.7203985695060195, "learning_rate": 4.9969110061308826e-06, "loss": 0.7466, "step": 559 }, { "epoch": 0.04548038658328596, "grad_norm": 4.730334252790068, "learning_rate": 4.996878238712087e-06, "loss": 0.679, "step": 560 }, { "epoch": 0.04556160155932754, "grad_norm": 3.979712937320014, "learning_rate": 4.996845298522531e-06, "loss": 0.5825, "step": 561 }, { "epoch": 0.04564281653536912, "grad_norm": 6.730522946218954, "learning_rate": 4.996812185564496e-06, "loss": 0.6467, "step": 562 }, { "epoch": 0.045724031511410704, "grad_norm": 3.33620867884503, "learning_rate": 4.99677889984027e-06, "loss": 0.7073, "step": 563 }, { "epoch": 0.04580524648745229, "grad_norm": 5.926130876855334, "learning_rate": 4.996745441352159e-06, "loss": 0.667, "step": 564 }, { "epoch": 0.04588646146349387, "grad_norm": 5.572153026812714, "learning_rate": 4.996711810102478e-06, "loss": 0.8168, "step": 565 }, { "epoch": 0.04596767643953545, "grad_norm": 8.174580316906296, "learning_rate": 4.996678006093553e-06, "loss": 0.6412, "step": 566 }, { "epoch": 0.04604889141557703, "grad_norm": 4.09571754078007, "learning_rate": 4.996644029327723e-06, "loss": 0.6742, "step": 567 }, { "epoch": 0.04613010639161862, "grad_norm": 4.109112417478182, "learning_rate": 4.996609879807341e-06, "loss": 0.8325, "step": 568 }, { "epoch": 0.046211321367660194, "grad_norm": 4.752061462760559, "learning_rate": 4.9965755575347665e-06, "loss": 0.6972, "step": 569 }, { "epoch": 0.04629253634370178, "grad_norm": 3.9521387750727524, "learning_rate": 4.996541062512377e-06, "loss": 0.5081, "step": 570 }, { "epoch": 0.04637375131974336, "grad_norm": 7.487630136380704, "learning_rate": 4.996506394742559e-06, "loss": 0.5242, "step": 571 }, { "epoch": 0.046454966295784945, "grad_norm": 5.41182957872003, "learning_rate": 4.996471554227711e-06, "loss": 0.6066, "step": 572 }, { "epoch": 0.04653618127182652, "grad_norm": 4.448050180267281, "learning_rate": 4.996436540970243e-06, "loss": 0.647, "step": 573 }, { "epoch": 0.046617396247868106, "grad_norm": 7.285104243616152, "learning_rate": 4.99640135497258e-06, "loss": 0.5929, "step": 574 }, { "epoch": 0.04669861122390969, "grad_norm": 3.4657873419662413, "learning_rate": 4.996365996237155e-06, "loss": 0.616, "step": 575 }, { "epoch": 0.046779826199951274, "grad_norm": 6.130204069422916, "learning_rate": 4.996330464766414e-06, "loss": 0.7037, "step": 576 }, { "epoch": 0.04686104117599285, "grad_norm": 4.045238400737803, "learning_rate": 4.996294760562817e-06, "loss": 0.735, "step": 577 }, { "epoch": 0.046942256152034435, "grad_norm": 4.541687240117253, "learning_rate": 4.996258883628834e-06, "loss": 0.5942, "step": 578 }, { "epoch": 0.04702347112807602, "grad_norm": 3.8724612963620326, "learning_rate": 4.996222833966947e-06, "loss": 0.5935, "step": 579 }, { "epoch": 0.047104686104117596, "grad_norm": 6.548773937549966, "learning_rate": 4.996186611579652e-06, "loss": 0.5016, "step": 580 }, { "epoch": 0.04718590108015918, "grad_norm": 4.80061928529426, "learning_rate": 4.996150216469454e-06, "loss": 0.7185, "step": 581 }, { "epoch": 0.047267116056200764, "grad_norm": 3.5697724511020974, "learning_rate": 4.996113648638872e-06, "loss": 0.6544, "step": 582 }, { "epoch": 0.04734833103224235, "grad_norm": 8.657160084841168, "learning_rate": 4.996076908090435e-06, "loss": 0.712, "step": 583 }, { "epoch": 0.047429546008283925, "grad_norm": 5.811863937951256, "learning_rate": 4.9960399948266865e-06, "loss": 0.6283, "step": 584 }, { "epoch": 0.04751076098432551, "grad_norm": 4.144290801020178, "learning_rate": 4.9960029088501814e-06, "loss": 0.5942, "step": 585 }, { "epoch": 0.04759197596036709, "grad_norm": 11.532211557025384, "learning_rate": 4.995965650163485e-06, "loss": 0.7404, "step": 586 }, { "epoch": 0.04767319093640868, "grad_norm": 4.498137515882452, "learning_rate": 4.995928218769174e-06, "loss": 0.5951, "step": 587 }, { "epoch": 0.047754405912450254, "grad_norm": 5.1824043128703465, "learning_rate": 4.99589061466984e-06, "loss": 0.6123, "step": 588 }, { "epoch": 0.04783562088849184, "grad_norm": 5.787634937924542, "learning_rate": 4.995852837868086e-06, "loss": 0.5376, "step": 589 }, { "epoch": 0.04791683586453342, "grad_norm": 7.146634422767184, "learning_rate": 4.995814888366523e-06, "loss": 0.599, "step": 590 }, { "epoch": 0.047998050840575, "grad_norm": 4.672683860091777, "learning_rate": 4.995776766167781e-06, "loss": 0.7119, "step": 591 }, { "epoch": 0.04807926581661658, "grad_norm": 4.546708179061631, "learning_rate": 4.9957384712744935e-06, "loss": 0.5744, "step": 592 }, { "epoch": 0.04816048079265817, "grad_norm": 5.406893771451819, "learning_rate": 4.9957000036893124e-06, "loss": 0.6749, "step": 593 }, { "epoch": 0.04824169576869975, "grad_norm": 4.6310196800989685, "learning_rate": 4.9956613634149e-06, "loss": 0.9589, "step": 594 }, { "epoch": 0.04832291074474133, "grad_norm": 5.64526302845874, "learning_rate": 4.995622550453929e-06, "loss": 0.7278, "step": 595 }, { "epoch": 0.04840412572078291, "grad_norm": 7.582504845192103, "learning_rate": 4.995583564809086e-06, "loss": 0.56, "step": 596 }, { "epoch": 0.048485340696824496, "grad_norm": 4.631867662632177, "learning_rate": 4.995544406483067e-06, "loss": 0.7885, "step": 597 }, { "epoch": 0.04856655567286608, "grad_norm": 4.646870659330431, "learning_rate": 4.9955050754785835e-06, "loss": 0.7185, "step": 598 }, { "epoch": 0.04864777064890766, "grad_norm": 4.724376677010658, "learning_rate": 4.995465571798356e-06, "loss": 0.6945, "step": 599 }, { "epoch": 0.04872898562494924, "grad_norm": 3.7877986447817684, "learning_rate": 4.995425895445118e-06, "loss": 0.7329, "step": 600 }, { "epoch": 0.048810200600990825, "grad_norm": 4.6352382568877335, "learning_rate": 4.995386046421614e-06, "loss": 0.6972, "step": 601 }, { "epoch": 0.0488914155770324, "grad_norm": 4.7603716353361305, "learning_rate": 4.9953460247306035e-06, "loss": 0.502, "step": 602 }, { "epoch": 0.048972630553073986, "grad_norm": 4.752679614641838, "learning_rate": 4.995305830374854e-06, "loss": 0.9242, "step": 603 }, { "epoch": 0.04905384552911557, "grad_norm": 3.4417838568029855, "learning_rate": 4.995265463357147e-06, "loss": 0.7566, "step": 604 }, { "epoch": 0.04913506050515715, "grad_norm": 6.888094078901035, "learning_rate": 4.995224923680277e-06, "loss": 0.6201, "step": 605 }, { "epoch": 0.04921627548119873, "grad_norm": 4.161212593195377, "learning_rate": 4.995184211347046e-06, "loss": 0.7392, "step": 606 }, { "epoch": 0.049297490457240314, "grad_norm": 7.5719084378043515, "learning_rate": 4.995143326360274e-06, "loss": 0.6514, "step": 607 }, { "epoch": 0.0493787054332819, "grad_norm": 10.224202349098311, "learning_rate": 4.99510226872279e-06, "loss": 0.5716, "step": 608 }, { "epoch": 0.04945992040932348, "grad_norm": 5.767252392060593, "learning_rate": 4.995061038437434e-06, "loss": 0.6206, "step": 609 }, { "epoch": 0.04954113538536506, "grad_norm": 5.065547686279448, "learning_rate": 4.995019635507059e-06, "loss": 0.5774, "step": 610 }, { "epoch": 0.04962235036140664, "grad_norm": 5.65097481771209, "learning_rate": 4.9949780599345295e-06, "loss": 0.5498, "step": 611 }, { "epoch": 0.04970356533744823, "grad_norm": 6.523300211900963, "learning_rate": 4.994936311722723e-06, "loss": 0.6711, "step": 612 }, { "epoch": 0.049784780313489804, "grad_norm": 6.2556246954187715, "learning_rate": 4.994894390874527e-06, "loss": 0.7809, "step": 613 }, { "epoch": 0.04986599528953139, "grad_norm": 5.630465680189662, "learning_rate": 4.994852297392845e-06, "loss": 0.5909, "step": 614 }, { "epoch": 0.04994721026557297, "grad_norm": 5.019843596472891, "learning_rate": 4.994810031280587e-06, "loss": 0.8296, "step": 615 }, { "epoch": 0.050028425241614556, "grad_norm": 6.614977325258197, "learning_rate": 4.994767592540678e-06, "loss": 0.5424, "step": 616 }, { "epoch": 0.05010964021765613, "grad_norm": 6.173835775193037, "learning_rate": 4.9947249811760555e-06, "loss": 0.6781, "step": 617 }, { "epoch": 0.05019085519369772, "grad_norm": 7.520930623536426, "learning_rate": 4.994682197189667e-06, "loss": 0.6022, "step": 618 }, { "epoch": 0.0502720701697393, "grad_norm": 8.187916646181824, "learning_rate": 4.994639240584474e-06, "loss": 0.6612, "step": 619 }, { "epoch": 0.050353285145780885, "grad_norm": 7.453069836355382, "learning_rate": 4.994596111363448e-06, "loss": 0.7443, "step": 620 }, { "epoch": 0.05043450012182246, "grad_norm": 7.747733385771382, "learning_rate": 4.994552809529573e-06, "loss": 0.5906, "step": 621 }, { "epoch": 0.050515715097864046, "grad_norm": 9.01741469170529, "learning_rate": 4.994509335085847e-06, "loss": 0.591, "step": 622 }, { "epoch": 0.05059693007390563, "grad_norm": 6.323489129605991, "learning_rate": 4.994465688035276e-06, "loss": 0.9078, "step": 623 }, { "epoch": 0.05067814504994721, "grad_norm": 13.299317733180853, "learning_rate": 4.994421868380881e-06, "loss": 0.525, "step": 624 }, { "epoch": 0.05075936002598879, "grad_norm": 4.296125414620111, "learning_rate": 4.994377876125695e-06, "loss": 0.5857, "step": 625 }, { "epoch": 0.050840575002030375, "grad_norm": 3.877492783088978, "learning_rate": 4.994333711272761e-06, "loss": 0.6115, "step": 626 }, { "epoch": 0.05092178997807196, "grad_norm": 7.011247326243034, "learning_rate": 4.9942893738251355e-06, "loss": 0.5045, "step": 627 }, { "epoch": 0.051003004954113536, "grad_norm": 7.046298816782971, "learning_rate": 4.994244863785887e-06, "loss": 0.6668, "step": 628 }, { "epoch": 0.05108421993015512, "grad_norm": 13.568359594461246, "learning_rate": 4.994200181158093e-06, "loss": 0.5775, "step": 629 }, { "epoch": 0.051165434906196704, "grad_norm": 7.5605209242829545, "learning_rate": 4.9941553259448475e-06, "loss": 0.5511, "step": 630 }, { "epoch": 0.05124664988223829, "grad_norm": 7.085727266285509, "learning_rate": 4.994110298149253e-06, "loss": 0.6272, "step": 631 }, { "epoch": 0.051327864858279865, "grad_norm": 4.890560092384863, "learning_rate": 4.994065097774426e-06, "loss": 0.6843, "step": 632 }, { "epoch": 0.05140907983432145, "grad_norm": 5.665397086701724, "learning_rate": 4.994019724823495e-06, "loss": 0.9322, "step": 633 }, { "epoch": 0.05149029481036303, "grad_norm": 12.520866191934164, "learning_rate": 4.993974179299597e-06, "loss": 0.9683, "step": 634 }, { "epoch": 0.05157150978640461, "grad_norm": 6.353771093218972, "learning_rate": 4.993928461205885e-06, "loss": 0.7176, "step": 635 }, { "epoch": 0.051652724762446194, "grad_norm": 6.398842006345174, "learning_rate": 4.993882570545523e-06, "loss": 0.6833, "step": 636 }, { "epoch": 0.05173393973848778, "grad_norm": 4.720771899259599, "learning_rate": 4.993836507321686e-06, "loss": 0.554, "step": 637 }, { "epoch": 0.05181515471452936, "grad_norm": 6.5529107721965945, "learning_rate": 4.9937902715375605e-06, "loss": 0.8592, "step": 638 }, { "epoch": 0.05189636969057094, "grad_norm": 6.335582258551991, "learning_rate": 4.993743863196348e-06, "loss": 0.6642, "step": 639 }, { "epoch": 0.05197758466661252, "grad_norm": 5.488427009047834, "learning_rate": 4.993697282301256e-06, "loss": 0.6354, "step": 640 }, { "epoch": 0.052058799642654106, "grad_norm": 4.435230831471148, "learning_rate": 4.99365052885551e-06, "loss": 0.6361, "step": 641 }, { "epoch": 0.05214001461869569, "grad_norm": 3.973892603431311, "learning_rate": 4.9936036028623465e-06, "loss": 0.6415, "step": 642 }, { "epoch": 0.05222122959473727, "grad_norm": 11.239148633823167, "learning_rate": 4.99355650432501e-06, "loss": 0.6895, "step": 643 }, { "epoch": 0.05230244457077885, "grad_norm": 4.274914346041189, "learning_rate": 4.993509233246761e-06, "loss": 0.5007, "step": 644 }, { "epoch": 0.052383659546820435, "grad_norm": 6.237193126781422, "learning_rate": 4.9934617896308675e-06, "loss": 0.6882, "step": 645 }, { "epoch": 0.05246487452286202, "grad_norm": 3.98788426140263, "learning_rate": 4.993414173480617e-06, "loss": 0.6089, "step": 646 }, { "epoch": 0.052546089498903596, "grad_norm": 14.335138449996226, "learning_rate": 4.9933663847993005e-06, "loss": 0.7351, "step": 647 }, { "epoch": 0.05262730447494518, "grad_norm": 7.149335425005816, "learning_rate": 4.9933184235902275e-06, "loss": 0.7582, "step": 648 }, { "epoch": 0.052708519450986764, "grad_norm": 4.593224187186037, "learning_rate": 4.993270289856714e-06, "loss": 0.5349, "step": 649 }, { "epoch": 0.05278973442702834, "grad_norm": 11.086443268562299, "learning_rate": 4.993221983602093e-06, "loss": 0.5782, "step": 650 }, { "epoch": 0.052870949403069925, "grad_norm": 4.566408185176231, "learning_rate": 4.993173504829705e-06, "loss": 0.5413, "step": 651 }, { "epoch": 0.05295216437911151, "grad_norm": 4.3786114129219165, "learning_rate": 4.993124853542906e-06, "loss": 0.7764, "step": 652 }, { "epoch": 0.05303337935515309, "grad_norm": 4.4667797149749955, "learning_rate": 4.993076029745061e-06, "loss": 0.5315, "step": 653 }, { "epoch": 0.05311459433119467, "grad_norm": 4.402424708224538, "learning_rate": 4.99302703343955e-06, "loss": 0.6889, "step": 654 }, { "epoch": 0.053195809307236254, "grad_norm": 5.124489236968013, "learning_rate": 4.992977864629762e-06, "loss": 0.7257, "step": 655 }, { "epoch": 0.05327702428327784, "grad_norm": 5.821195473769332, "learning_rate": 4.9929285233191005e-06, "loss": 0.6547, "step": 656 }, { "epoch": 0.05335823925931942, "grad_norm": 6.888571998941501, "learning_rate": 4.992879009510978e-06, "loss": 0.5126, "step": 657 }, { "epoch": 0.053439454235361, "grad_norm": 5.619280693409271, "learning_rate": 4.992829323208822e-06, "loss": 0.6526, "step": 658 }, { "epoch": 0.05352066921140258, "grad_norm": 6.967418216632179, "learning_rate": 4.992779464416069e-06, "loss": 0.596, "step": 659 }, { "epoch": 0.05360188418744417, "grad_norm": 4.9434745741217645, "learning_rate": 4.992729433136171e-06, "loss": 0.6319, "step": 660 }, { "epoch": 0.053683099163485744, "grad_norm": 8.648403799491454, "learning_rate": 4.992679229372588e-06, "loss": 0.5502, "step": 661 }, { "epoch": 0.05376431413952733, "grad_norm": 11.185178358108589, "learning_rate": 4.9926288531287946e-06, "loss": 0.6938, "step": 662 }, { "epoch": 0.05384552911556891, "grad_norm": 5.6934860327185115, "learning_rate": 4.992578304408278e-06, "loss": 0.6992, "step": 663 }, { "epoch": 0.053926744091610496, "grad_norm": 5.313881451433935, "learning_rate": 4.992527583214533e-06, "loss": 0.738, "step": 664 }, { "epoch": 0.05400795906765207, "grad_norm": 6.054202350907215, "learning_rate": 4.992476689551071e-06, "loss": 0.6326, "step": 665 }, { "epoch": 0.05408917404369366, "grad_norm": 7.4133417376434725, "learning_rate": 4.992425623421414e-06, "loss": 0.7014, "step": 666 }, { "epoch": 0.05417038901973524, "grad_norm": 4.221490444985979, "learning_rate": 4.992374384829094e-06, "loss": 0.7564, "step": 667 }, { "epoch": 0.054251603995776825, "grad_norm": 10.257715188715267, "learning_rate": 4.992322973777658e-06, "loss": 0.6686, "step": 668 }, { "epoch": 0.0543328189718184, "grad_norm": 4.701020797968543, "learning_rate": 4.992271390270662e-06, "loss": 0.5668, "step": 669 }, { "epoch": 0.054414033947859985, "grad_norm": 5.172562987556883, "learning_rate": 4.992219634311677e-06, "loss": 0.563, "step": 670 }, { "epoch": 0.05449524892390157, "grad_norm": 8.921514201488444, "learning_rate": 4.992167705904282e-06, "loss": 0.694, "step": 671 }, { "epoch": 0.054576463899943146, "grad_norm": 3.807857859349466, "learning_rate": 4.992115605052072e-06, "loss": 0.5912, "step": 672 }, { "epoch": 0.05465767887598473, "grad_norm": 3.2898787814816357, "learning_rate": 4.992063331758651e-06, "loss": 0.4573, "step": 673 }, { "epoch": 0.054738893852026314, "grad_norm": 3.4263761182268526, "learning_rate": 4.9920108860276375e-06, "loss": 0.6525, "step": 674 }, { "epoch": 0.0548201088280679, "grad_norm": 4.749436044976401, "learning_rate": 4.991958267862659e-06, "loss": 0.51, "step": 675 }, { "epoch": 0.054901323804109475, "grad_norm": 5.875542464144077, "learning_rate": 4.991905477267356e-06, "loss": 0.6024, "step": 676 }, { "epoch": 0.05498253878015106, "grad_norm": 5.5252942081115695, "learning_rate": 4.991852514245384e-06, "loss": 0.6975, "step": 677 }, { "epoch": 0.05506375375619264, "grad_norm": 3.5503049002474336, "learning_rate": 4.991799378800404e-06, "loss": 0.7191, "step": 678 }, { "epoch": 0.05514496873223423, "grad_norm": 3.9287521299019947, "learning_rate": 4.9917460709360955e-06, "loss": 0.6743, "step": 679 }, { "epoch": 0.055226183708275804, "grad_norm": 7.458997142187155, "learning_rate": 4.991692590656146e-06, "loss": 0.6277, "step": 680 }, { "epoch": 0.05530739868431739, "grad_norm": 5.196449898312582, "learning_rate": 4.991638937964257e-06, "loss": 0.5941, "step": 681 }, { "epoch": 0.05538861366035897, "grad_norm": 7.808673495814518, "learning_rate": 4.9915851128641405e-06, "loss": 0.4626, "step": 682 }, { "epoch": 0.05546982863640055, "grad_norm": 10.233712958106928, "learning_rate": 4.991531115359519e-06, "loss": 0.6285, "step": 683 }, { "epoch": 0.05555104361244213, "grad_norm": 4.467794770096275, "learning_rate": 4.991476945454133e-06, "loss": 0.5607, "step": 684 }, { "epoch": 0.05563225858848372, "grad_norm": 5.521835573749263, "learning_rate": 4.991422603151727e-06, "loss": 0.5919, "step": 685 }, { "epoch": 0.0557134735645253, "grad_norm": 5.041631495422818, "learning_rate": 4.991368088456062e-06, "loss": 0.5745, "step": 686 }, { "epoch": 0.05579468854056688, "grad_norm": 4.822878210770882, "learning_rate": 4.99131340137091e-06, "loss": 0.7395, "step": 687 }, { "epoch": 0.05587590351660846, "grad_norm": 8.016561518905647, "learning_rate": 4.991258541900058e-06, "loss": 0.587, "step": 688 }, { "epoch": 0.055957118492650046, "grad_norm": 8.66516164876571, "learning_rate": 4.991203510047299e-06, "loss": 0.7209, "step": 689 }, { "epoch": 0.05603833346869163, "grad_norm": 4.158292161529495, "learning_rate": 4.991148305816441e-06, "loss": 0.5705, "step": 690 }, { "epoch": 0.05611954844473321, "grad_norm": 8.004317430680866, "learning_rate": 4.991092929211305e-06, "loss": 0.6297, "step": 691 }, { "epoch": 0.05620076342077479, "grad_norm": 7.148231924748558, "learning_rate": 4.9910373802357214e-06, "loss": 0.5579, "step": 692 }, { "epoch": 0.056281978396816375, "grad_norm": 4.294720627427241, "learning_rate": 4.990981658893535e-06, "loss": 0.5622, "step": 693 }, { "epoch": 0.05636319337285795, "grad_norm": 7.230336818000405, "learning_rate": 4.990925765188602e-06, "loss": 0.8874, "step": 694 }, { "epoch": 0.056444408348899536, "grad_norm": 3.9400202583073662, "learning_rate": 4.9908696991247885e-06, "loss": 0.6678, "step": 695 }, { "epoch": 0.05652562332494112, "grad_norm": 4.144892106492086, "learning_rate": 4.990813460705975e-06, "loss": 0.6598, "step": 696 }, { "epoch": 0.056606838300982704, "grad_norm": 5.029300780914533, "learning_rate": 4.990757049936051e-06, "loss": 0.5704, "step": 697 }, { "epoch": 0.05668805327702428, "grad_norm": 5.440381358402857, "learning_rate": 4.990700466818923e-06, "loss": 0.4935, "step": 698 }, { "epoch": 0.056769268253065865, "grad_norm": 64.88422852309522, "learning_rate": 4.990643711358504e-06, "loss": 0.6541, "step": 699 }, { "epoch": 0.05685048322910745, "grad_norm": 9.761223335464411, "learning_rate": 4.990586783558722e-06, "loss": 0.6356, "step": 700 }, { "epoch": 0.05693169820514903, "grad_norm": 8.471075449596462, "learning_rate": 4.990529683423515e-06, "loss": 0.6032, "step": 701 }, { "epoch": 0.05701291318119061, "grad_norm": 8.610029295506125, "learning_rate": 4.990472410956835e-06, "loss": 0.6641, "step": 702 }, { "epoch": 0.057094128157232193, "grad_norm": 8.074345676422041, "learning_rate": 4.9904149661626456e-06, "loss": 0.7349, "step": 703 }, { "epoch": 0.05717534313327378, "grad_norm": 6.475287058051118, "learning_rate": 4.99035734904492e-06, "loss": 0.601, "step": 704 }, { "epoch": 0.057256558109315354, "grad_norm": 5.1928926651854335, "learning_rate": 4.990299559607646e-06, "loss": 0.6079, "step": 705 }, { "epoch": 0.05733777308535694, "grad_norm": 5.520091117650318, "learning_rate": 4.990241597854822e-06, "loss": 0.7949, "step": 706 }, { "epoch": 0.05741898806139852, "grad_norm": 6.225321905485309, "learning_rate": 4.99018346379046e-06, "loss": 0.5618, "step": 707 }, { "epoch": 0.057500203037440106, "grad_norm": 4.444351207370442, "learning_rate": 4.99012515741858e-06, "loss": 0.5987, "step": 708 }, { "epoch": 0.05758141801348168, "grad_norm": 11.383212330047579, "learning_rate": 4.990066678743219e-06, "loss": 0.5409, "step": 709 }, { "epoch": 0.05766263298952327, "grad_norm": 5.0174724436449765, "learning_rate": 4.9900080277684224e-06, "loss": 0.8077, "step": 710 }, { "epoch": 0.05774384796556485, "grad_norm": 6.82758331108689, "learning_rate": 4.989949204498248e-06, "loss": 0.7131, "step": 711 }, { "epoch": 0.057825062941606435, "grad_norm": 4.599805153792023, "learning_rate": 4.989890208936767e-06, "loss": 0.7143, "step": 712 }, { "epoch": 0.05790627791764801, "grad_norm": 5.615806754693292, "learning_rate": 4.98983104108806e-06, "loss": 0.4989, "step": 713 }, { "epoch": 0.057987492893689596, "grad_norm": 5.944167262991299, "learning_rate": 4.989771700956223e-06, "loss": 0.6002, "step": 714 }, { "epoch": 0.05806870786973118, "grad_norm": 5.363075574799061, "learning_rate": 4.989712188545362e-06, "loss": 0.5249, "step": 715 }, { "epoch": 0.05814992284577276, "grad_norm": 5.71188897749546, "learning_rate": 4.989652503859592e-06, "loss": 0.6282, "step": 716 }, { "epoch": 0.05823113782181434, "grad_norm": 6.4060919441849835, "learning_rate": 4.989592646903047e-06, "loss": 0.8009, "step": 717 }, { "epoch": 0.058312352797855925, "grad_norm": 5.806526365359627, "learning_rate": 4.989532617679866e-06, "loss": 0.5443, "step": 718 }, { "epoch": 0.05839356777389751, "grad_norm": 5.459430108939258, "learning_rate": 4.989472416194204e-06, "loss": 0.5308, "step": 719 }, { "epoch": 0.058474782749939086, "grad_norm": 5.688152424438068, "learning_rate": 4.9894120424502254e-06, "loss": 0.6284, "step": 720 }, { "epoch": 0.05855599772598067, "grad_norm": 7.784647717876923, "learning_rate": 4.989351496452109e-06, "loss": 0.5055, "step": 721 }, { "epoch": 0.058637212702022254, "grad_norm": 4.6787370535165245, "learning_rate": 4.9892907782040435e-06, "loss": 0.6513, "step": 722 }, { "epoch": 0.05871842767806384, "grad_norm": 6.799832486685082, "learning_rate": 4.9892298877102305e-06, "loss": 0.7293, "step": 723 }, { "epoch": 0.058799642654105415, "grad_norm": 5.66494166268401, "learning_rate": 4.989168824974884e-06, "loss": 0.5597, "step": 724 }, { "epoch": 0.058880857630147, "grad_norm": 6.3608187122909206, "learning_rate": 4.989107590002228e-06, "loss": 0.6171, "step": 725 }, { "epoch": 0.05896207260618858, "grad_norm": 6.134461895108829, "learning_rate": 4.989046182796501e-06, "loss": 0.5414, "step": 726 }, { "epoch": 0.05904328758223017, "grad_norm": 7.100189240185222, "learning_rate": 4.988984603361949e-06, "loss": 0.6406, "step": 727 }, { "epoch": 0.059124502558271744, "grad_norm": 5.1992788570507535, "learning_rate": 4.988922851702837e-06, "loss": 0.6249, "step": 728 }, { "epoch": 0.05920571753431333, "grad_norm": 4.3545253550275635, "learning_rate": 4.988860927823436e-06, "loss": 0.8036, "step": 729 }, { "epoch": 0.05928693251035491, "grad_norm": 5.3206883020912, "learning_rate": 4.988798831728031e-06, "loss": 0.5943, "step": 730 }, { "epoch": 0.05936814748639649, "grad_norm": 6.089917627663601, "learning_rate": 4.9887365634209186e-06, "loss": 0.7094, "step": 731 }, { "epoch": 0.05944936246243807, "grad_norm": 3.956275748674879, "learning_rate": 4.9886741229064075e-06, "loss": 0.5626, "step": 732 }, { "epoch": 0.05953057743847966, "grad_norm": 4.511627354156277, "learning_rate": 4.988611510188818e-06, "loss": 0.5764, "step": 733 }, { "epoch": 0.05961179241452124, "grad_norm": 6.4116489379834025, "learning_rate": 4.988548725272482e-06, "loss": 0.5982, "step": 734 }, { "epoch": 0.05969300739056282, "grad_norm": 6.311885375734862, "learning_rate": 4.988485768161746e-06, "loss": 0.5339, "step": 735 }, { "epoch": 0.0597742223666044, "grad_norm": 3.9308120481243902, "learning_rate": 4.988422638860964e-06, "loss": 0.584, "step": 736 }, { "epoch": 0.059855437342645985, "grad_norm": 4.880711425301184, "learning_rate": 4.988359337374505e-06, "loss": 0.5078, "step": 737 }, { "epoch": 0.05993665231868757, "grad_norm": 5.281870836321762, "learning_rate": 4.988295863706751e-06, "loss": 0.5754, "step": 738 }, { "epoch": 0.060017867294729146, "grad_norm": 5.0230810112537485, "learning_rate": 4.988232217862091e-06, "loss": 0.6391, "step": 739 }, { "epoch": 0.06009908227077073, "grad_norm": 6.955537743254657, "learning_rate": 4.988168399844931e-06, "loss": 0.5283, "step": 740 }, { "epoch": 0.060180297246812314, "grad_norm": 4.147704339162062, "learning_rate": 4.988104409659685e-06, "loss": 0.653, "step": 741 }, { "epoch": 0.06026151222285389, "grad_norm": 7.537477012145328, "learning_rate": 4.988040247310783e-06, "loss": 0.6525, "step": 742 }, { "epoch": 0.060342727198895475, "grad_norm": 4.863974888473347, "learning_rate": 4.987975912802663e-06, "loss": 0.5931, "step": 743 }, { "epoch": 0.06042394217493706, "grad_norm": 5.366327422788624, "learning_rate": 4.9879114061397784e-06, "loss": 0.4838, "step": 744 }, { "epoch": 0.06050515715097864, "grad_norm": 4.656417177398882, "learning_rate": 4.987846727326591e-06, "loss": 0.681, "step": 745 }, { "epoch": 0.06058637212702022, "grad_norm": 20.788942947113796, "learning_rate": 4.987781876367576e-06, "loss": 0.6331, "step": 746 }, { "epoch": 0.060667587103061804, "grad_norm": 7.459258545982763, "learning_rate": 4.987716853267222e-06, "loss": 0.5166, "step": 747 }, { "epoch": 0.06074880207910339, "grad_norm": 7.51821304798715, "learning_rate": 4.9876516580300285e-06, "loss": 0.4729, "step": 748 }, { "epoch": 0.06083001705514497, "grad_norm": 6.15090790030722, "learning_rate": 4.987586290660506e-06, "loss": 0.5615, "step": 749 }, { "epoch": 0.06091123203118655, "grad_norm": 5.226446559415324, "learning_rate": 4.987520751163176e-06, "loss": 0.4638, "step": 750 }, { "epoch": 0.06099244700722813, "grad_norm": 6.752685506869791, "learning_rate": 4.9874550395425764e-06, "loss": 0.5201, "step": 751 }, { "epoch": 0.06107366198326972, "grad_norm": 4.989408238882795, "learning_rate": 4.987389155803252e-06, "loss": 0.7494, "step": 752 }, { "epoch": 0.061154876959311294, "grad_norm": 6.750783797733722, "learning_rate": 4.987323099949763e-06, "loss": 0.65, "step": 753 }, { "epoch": 0.06123609193535288, "grad_norm": 7.609585837462416, "learning_rate": 4.9872568719866795e-06, "loss": 0.6736, "step": 754 }, { "epoch": 0.06131730691139446, "grad_norm": 4.452864412048146, "learning_rate": 4.987190471918584e-06, "loss": 0.5907, "step": 755 }, { "epoch": 0.061398521887436046, "grad_norm": 4.734835124703949, "learning_rate": 4.98712389975007e-06, "loss": 0.653, "step": 756 }, { "epoch": 0.06147973686347762, "grad_norm": 5.928107826920242, "learning_rate": 4.987057155485746e-06, "loss": 0.7772, "step": 757 }, { "epoch": 0.06156095183951921, "grad_norm": 9.884828624607355, "learning_rate": 4.98699023913023e-06, "loss": 0.5529, "step": 758 }, { "epoch": 0.06164216681556079, "grad_norm": 4.2764549097941, "learning_rate": 4.986923150688151e-06, "loss": 0.6011, "step": 759 }, { "epoch": 0.061723381791602375, "grad_norm": 7.233144313683277, "learning_rate": 4.986855890164152e-06, "loss": 0.6427, "step": 760 }, { "epoch": 0.06180459676764395, "grad_norm": 5.536503055820394, "learning_rate": 4.986788457562887e-06, "loss": 0.6915, "step": 761 }, { "epoch": 0.061885811743685536, "grad_norm": 5.309270582449383, "learning_rate": 4.986720852889021e-06, "loss": 0.5991, "step": 762 }, { "epoch": 0.06196702671972712, "grad_norm": 4.284356637879479, "learning_rate": 4.9866530761472335e-06, "loss": 0.8062, "step": 763 }, { "epoch": 0.0620482416957687, "grad_norm": 5.643793776418088, "learning_rate": 4.986585127342214e-06, "loss": 0.7215, "step": 764 }, { "epoch": 0.06212945667181028, "grad_norm": 4.295093117802471, "learning_rate": 4.986517006478663e-06, "loss": 0.4926, "step": 765 }, { "epoch": 0.062210671647851865, "grad_norm": 5.881778143353826, "learning_rate": 4.986448713561295e-06, "loss": 0.5777, "step": 766 }, { "epoch": 0.06229188662389345, "grad_norm": 3.5376491877561533, "learning_rate": 4.986380248594835e-06, "loss": 0.8141, "step": 767 }, { "epoch": 0.062373101599935026, "grad_norm": 15.197506269463709, "learning_rate": 4.9863116115840215e-06, "loss": 0.6347, "step": 768 }, { "epoch": 0.06245431657597661, "grad_norm": 5.5635913852411765, "learning_rate": 4.986242802533603e-06, "loss": 0.7381, "step": 769 }, { "epoch": 0.06253553155201819, "grad_norm": 4.921640572698924, "learning_rate": 4.986173821448341e-06, "loss": 0.5331, "step": 770 }, { "epoch": 0.06261674652805978, "grad_norm": 4.243356448186626, "learning_rate": 4.9861046683330085e-06, "loss": 0.7414, "step": 771 }, { "epoch": 0.06269796150410135, "grad_norm": 4.7676286369533685, "learning_rate": 4.986035343192389e-06, "loss": 0.7152, "step": 772 }, { "epoch": 0.06277917648014295, "grad_norm": 5.052963380949943, "learning_rate": 4.985965846031283e-06, "loss": 0.5741, "step": 773 }, { "epoch": 0.06286039145618452, "grad_norm": 5.830703109939469, "learning_rate": 4.985896176854496e-06, "loss": 0.5442, "step": 774 }, { "epoch": 0.0629416064322261, "grad_norm": 28.565367004080375, "learning_rate": 4.9858263356668505e-06, "loss": 0.6674, "step": 775 }, { "epoch": 0.06302282140826769, "grad_norm": 5.2243621774870475, "learning_rate": 4.985756322473178e-06, "loss": 0.5452, "step": 776 }, { "epoch": 0.06310403638430927, "grad_norm": 5.380977330836419, "learning_rate": 4.9856861372783236e-06, "loss": 0.5991, "step": 777 }, { "epoch": 0.06318525136035084, "grad_norm": 6.151320235729201, "learning_rate": 4.9856157800871455e-06, "loss": 0.661, "step": 778 }, { "epoch": 0.06326646633639244, "grad_norm": 4.520649099692102, "learning_rate": 4.985545250904509e-06, "loss": 0.5931, "step": 779 }, { "epoch": 0.06334768131243401, "grad_norm": 3.8505363861712336, "learning_rate": 4.985474549735296e-06, "loss": 0.6474, "step": 780 }, { "epoch": 0.06342889628847559, "grad_norm": 4.383396583650249, "learning_rate": 4.985403676584397e-06, "loss": 0.7205, "step": 781 }, { "epoch": 0.06351011126451718, "grad_norm": 5.432225271309565, "learning_rate": 4.985332631456719e-06, "loss": 0.6348, "step": 782 }, { "epoch": 0.06359132624055876, "grad_norm": 8.317084663016479, "learning_rate": 4.9852614143571755e-06, "loss": 0.6672, "step": 783 }, { "epoch": 0.06367254121660035, "grad_norm": 6.050699233785058, "learning_rate": 4.985190025290696e-06, "loss": 0.5816, "step": 784 }, { "epoch": 0.06375375619264192, "grad_norm": 4.054546774991104, "learning_rate": 4.985118464262219e-06, "loss": 0.5229, "step": 785 }, { "epoch": 0.0638349711686835, "grad_norm": 6.331916697325334, "learning_rate": 4.985046731276697e-06, "loss": 0.6044, "step": 786 }, { "epoch": 0.06391618614472509, "grad_norm": 5.448136315607667, "learning_rate": 4.984974826339093e-06, "loss": 0.6758, "step": 787 }, { "epoch": 0.06399740112076667, "grad_norm": 3.0701662409934065, "learning_rate": 4.984902749454382e-06, "loss": 0.5999, "step": 788 }, { "epoch": 0.06407861609680825, "grad_norm": 4.6802757341120556, "learning_rate": 4.9848305006275525e-06, "loss": 0.6773, "step": 789 }, { "epoch": 0.06415983107284984, "grad_norm": 5.59540917186297, "learning_rate": 4.984758079863603e-06, "loss": 0.5526, "step": 790 }, { "epoch": 0.06424104604889141, "grad_norm": 4.319537345372907, "learning_rate": 4.984685487167544e-06, "loss": 0.4746, "step": 791 }, { "epoch": 0.06432226102493299, "grad_norm": 4.812410544795939, "learning_rate": 4.9846127225444e-06, "loss": 0.5985, "step": 792 }, { "epoch": 0.06440347600097458, "grad_norm": 3.8747262789638, "learning_rate": 4.984539785999205e-06, "loss": 0.8711, "step": 793 }, { "epoch": 0.06448469097701616, "grad_norm": 9.168257138239873, "learning_rate": 4.984466677537007e-06, "loss": 0.6323, "step": 794 }, { "epoch": 0.06456590595305775, "grad_norm": 9.621282105988284, "learning_rate": 4.984393397162862e-06, "loss": 0.6117, "step": 795 }, { "epoch": 0.06464712092909933, "grad_norm": 4.3783026160070895, "learning_rate": 4.984319944881844e-06, "loss": 0.6991, "step": 796 }, { "epoch": 0.0647283359051409, "grad_norm": 4.35590663167429, "learning_rate": 4.984246320699033e-06, "loss": 0.7034, "step": 797 }, { "epoch": 0.0648095508811825, "grad_norm": 4.746827833135994, "learning_rate": 4.984172524619525e-06, "loss": 0.6776, "step": 798 }, { "epoch": 0.06489076585722407, "grad_norm": 5.905725252167514, "learning_rate": 4.984098556648425e-06, "loss": 0.5483, "step": 799 }, { "epoch": 0.06497198083326565, "grad_norm": 4.596246589585779, "learning_rate": 4.984024416790852e-06, "loss": 0.688, "step": 800 }, { "epoch": 0.06505319580930724, "grad_norm": 4.9534579923413595, "learning_rate": 4.983950105051936e-06, "loss": 0.6562, "step": 801 }, { "epoch": 0.06513441078534882, "grad_norm": 5.123985851109989, "learning_rate": 4.9838756214368185e-06, "loss": 0.6707, "step": 802 }, { "epoch": 0.0652156257613904, "grad_norm": 5.402799215185337, "learning_rate": 4.9838009659506535e-06, "loss": 0.6378, "step": 803 }, { "epoch": 0.06529684073743199, "grad_norm": 4.492757760289716, "learning_rate": 4.983726138598608e-06, "loss": 0.5916, "step": 804 }, { "epoch": 0.06537805571347356, "grad_norm": 3.95999131962753, "learning_rate": 4.9836511393858575e-06, "loss": 0.5023, "step": 805 }, { "epoch": 0.06545927068951515, "grad_norm": 3.920207373866997, "learning_rate": 4.983575968317593e-06, "loss": 0.7991, "step": 806 }, { "epoch": 0.06554048566555673, "grad_norm": 7.315668961691789, "learning_rate": 4.983500625399017e-06, "loss": 0.5301, "step": 807 }, { "epoch": 0.06562170064159831, "grad_norm": 3.912121127147346, "learning_rate": 4.98342511063534e-06, "loss": 0.7372, "step": 808 }, { "epoch": 0.0657029156176399, "grad_norm": 7.552130637632774, "learning_rate": 4.983349424031789e-06, "loss": 0.5946, "step": 809 }, { "epoch": 0.06578413059368148, "grad_norm": 6.533691238009157, "learning_rate": 4.983273565593601e-06, "loss": 0.642, "step": 810 }, { "epoch": 0.06586534556972305, "grad_norm": 9.916532299872616, "learning_rate": 4.983197535326024e-06, "loss": 0.6117, "step": 811 }, { "epoch": 0.06594656054576464, "grad_norm": 5.29822144361094, "learning_rate": 4.983121333234321e-06, "loss": 0.576, "step": 812 }, { "epoch": 0.06602777552180622, "grad_norm": 7.011546578891193, "learning_rate": 4.983044959323763e-06, "loss": 0.6305, "step": 813 }, { "epoch": 0.0661089904978478, "grad_norm": 7.6729419826090695, "learning_rate": 4.982968413599635e-06, "loss": 0.5282, "step": 814 }, { "epoch": 0.06619020547388939, "grad_norm": 3.568801069591388, "learning_rate": 4.982891696067234e-06, "loss": 0.5766, "step": 815 }, { "epoch": 0.06627142044993097, "grad_norm": 3.822836645624077, "learning_rate": 4.9828148067318675e-06, "loss": 0.647, "step": 816 }, { "epoch": 0.06635263542597256, "grad_norm": 4.578694026237904, "learning_rate": 4.982737745598857e-06, "loss": 0.8134, "step": 817 }, { "epoch": 0.06643385040201413, "grad_norm": 6.7533794019944065, "learning_rate": 4.982660512673534e-06, "loss": 0.6404, "step": 818 }, { "epoch": 0.06651506537805571, "grad_norm": 5.791281720391061, "learning_rate": 4.982583107961243e-06, "loss": 0.5909, "step": 819 }, { "epoch": 0.0665962803540973, "grad_norm": 4.161592546084501, "learning_rate": 4.982505531467339e-06, "loss": 0.5977, "step": 820 }, { "epoch": 0.06667749533013888, "grad_norm": 6.394531347329611, "learning_rate": 4.982427783197191e-06, "loss": 0.5928, "step": 821 }, { "epoch": 0.06675871030618045, "grad_norm": 8.488158051535715, "learning_rate": 4.982349863156179e-06, "loss": 0.7508, "step": 822 }, { "epoch": 0.06683992528222205, "grad_norm": 5.428324952116931, "learning_rate": 4.982271771349694e-06, "loss": 0.5392, "step": 823 }, { "epoch": 0.06692114025826362, "grad_norm": 4.025855738456331, "learning_rate": 4.98219350778314e-06, "loss": 0.4991, "step": 824 }, { "epoch": 0.0670023552343052, "grad_norm": 5.231237358948681, "learning_rate": 4.982115072461932e-06, "loss": 0.522, "step": 825 }, { "epoch": 0.06708357021034679, "grad_norm": 3.4702942997245616, "learning_rate": 4.9820364653914964e-06, "loss": 0.6092, "step": 826 }, { "epoch": 0.06716478518638837, "grad_norm": 8.455649789106133, "learning_rate": 4.981957686577275e-06, "loss": 0.8485, "step": 827 }, { "epoch": 0.06724600016242996, "grad_norm": 6.0384535186038955, "learning_rate": 4.981878736024716e-06, "loss": 0.5669, "step": 828 }, { "epoch": 0.06732721513847154, "grad_norm": 3.4751635293718732, "learning_rate": 4.981799613739284e-06, "loss": 0.5404, "step": 829 }, { "epoch": 0.06740843011451311, "grad_norm": 4.516175343460089, "learning_rate": 4.981720319726453e-06, "loss": 0.512, "step": 830 }, { "epoch": 0.0674896450905547, "grad_norm": 6.121636308285373, "learning_rate": 4.981640853991712e-06, "loss": 0.7, "step": 831 }, { "epoch": 0.06757086006659628, "grad_norm": 4.916730473198547, "learning_rate": 4.981561216540556e-06, "loss": 0.6203, "step": 832 }, { "epoch": 0.06765207504263786, "grad_norm": 6.140736960651221, "learning_rate": 4.981481407378498e-06, "loss": 0.6689, "step": 833 }, { "epoch": 0.06773329001867945, "grad_norm": 3.651255962877865, "learning_rate": 4.981401426511059e-06, "loss": 0.6711, "step": 834 }, { "epoch": 0.06781450499472103, "grad_norm": 4.433646055135197, "learning_rate": 4.981321273943775e-06, "loss": 0.5962, "step": 835 }, { "epoch": 0.0678957199707626, "grad_norm": 4.214671257503501, "learning_rate": 4.98124094968219e-06, "loss": 0.6696, "step": 836 }, { "epoch": 0.0679769349468042, "grad_norm": 8.938425585700115, "learning_rate": 4.981160453731864e-06, "loss": 0.5597, "step": 837 }, { "epoch": 0.06805814992284577, "grad_norm": 5.5327509204918215, "learning_rate": 4.981079786098365e-06, "loss": 0.6733, "step": 838 }, { "epoch": 0.06813936489888736, "grad_norm": 4.203367930591512, "learning_rate": 4.980998946787276e-06, "loss": 0.6717, "step": 839 }, { "epoch": 0.06822057987492894, "grad_norm": 4.743933277670276, "learning_rate": 4.98091793580419e-06, "loss": 0.6467, "step": 840 }, { "epoch": 0.06830179485097052, "grad_norm": 5.519327116976564, "learning_rate": 4.9808367531547144e-06, "loss": 0.5875, "step": 841 }, { "epoch": 0.0683830098270121, "grad_norm": 5.421051052782078, "learning_rate": 4.980755398844464e-06, "loss": 0.5742, "step": 842 }, { "epoch": 0.06846422480305368, "grad_norm": 5.205786851548457, "learning_rate": 4.980673872879069e-06, "loss": 0.734, "step": 843 }, { "epoch": 0.06854543977909526, "grad_norm": 7.795985083428892, "learning_rate": 4.980592175264172e-06, "loss": 0.6009, "step": 844 }, { "epoch": 0.06862665475513685, "grad_norm": 4.005038090148275, "learning_rate": 4.9805103060054235e-06, "loss": 0.5548, "step": 845 }, { "epoch": 0.06870786973117843, "grad_norm": 5.440366650852374, "learning_rate": 4.980428265108491e-06, "loss": 0.5983, "step": 846 }, { "epoch": 0.06878908470722, "grad_norm": 6.265163613283854, "learning_rate": 4.980346052579049e-06, "loss": 0.5767, "step": 847 }, { "epoch": 0.0688702996832616, "grad_norm": 4.18497253484256, "learning_rate": 4.9802636684227875e-06, "loss": 0.6295, "step": 848 }, { "epoch": 0.06895151465930317, "grad_norm": 36.607837712422636, "learning_rate": 4.980181112645407e-06, "loss": 0.5392, "step": 849 }, { "epoch": 0.06903272963534476, "grad_norm": 4.250875496889967, "learning_rate": 4.9800983852526195e-06, "loss": 0.7019, "step": 850 }, { "epoch": 0.06911394461138634, "grad_norm": 4.20052946210975, "learning_rate": 4.980015486250149e-06, "loss": 0.6024, "step": 851 }, { "epoch": 0.06919515958742792, "grad_norm": 8.36262034310868, "learning_rate": 4.979932415643733e-06, "loss": 0.505, "step": 852 }, { "epoch": 0.06927637456346951, "grad_norm": 3.9569054991402206, "learning_rate": 4.9798491734391185e-06, "loss": 0.6134, "step": 853 }, { "epoch": 0.06935758953951109, "grad_norm": 8.784347823002246, "learning_rate": 4.9797657596420655e-06, "loss": 0.6008, "step": 854 }, { "epoch": 0.06943880451555266, "grad_norm": 6.296297313209, "learning_rate": 4.979682174258346e-06, "loss": 0.5597, "step": 855 }, { "epoch": 0.06952001949159425, "grad_norm": 5.72675894986214, "learning_rate": 4.979598417293743e-06, "loss": 0.7964, "step": 856 }, { "epoch": 0.06960123446763583, "grad_norm": 9.153121757631224, "learning_rate": 4.979514488754053e-06, "loss": 0.6276, "step": 857 }, { "epoch": 0.06968244944367741, "grad_norm": 3.4432015000564156, "learning_rate": 4.979430388645083e-06, "loss": 0.6616, "step": 858 }, { "epoch": 0.069763664419719, "grad_norm": 4.626309196229603, "learning_rate": 4.979346116972653e-06, "loss": 0.6686, "step": 859 }, { "epoch": 0.06984487939576058, "grad_norm": 4.3625603630162555, "learning_rate": 4.979261673742592e-06, "loss": 0.7034, "step": 860 }, { "epoch": 0.06992609437180217, "grad_norm": 3.987951551641592, "learning_rate": 4.9791770589607455e-06, "loss": 0.7321, "step": 861 }, { "epoch": 0.07000730934784374, "grad_norm": 4.806090340673884, "learning_rate": 4.979092272632968e-06, "loss": 0.6409, "step": 862 }, { "epoch": 0.07008852432388532, "grad_norm": 8.252678583793005, "learning_rate": 4.979007314765124e-06, "loss": 0.6183, "step": 863 }, { "epoch": 0.07016973929992691, "grad_norm": 7.048613756795895, "learning_rate": 4.978922185363095e-06, "loss": 0.5649, "step": 864 }, { "epoch": 0.07025095427596849, "grad_norm": 7.107709811977146, "learning_rate": 4.97883688443277e-06, "loss": 0.7172, "step": 865 }, { "epoch": 0.07033216925201007, "grad_norm": 5.254387891978319, "learning_rate": 4.9787514119800515e-06, "loss": 0.756, "step": 866 }, { "epoch": 0.07041338422805166, "grad_norm": 4.158496862178936, "learning_rate": 4.9786657680108545e-06, "loss": 0.4973, "step": 867 }, { "epoch": 0.07049459920409323, "grad_norm": 4.086601599664647, "learning_rate": 4.978579952531104e-06, "loss": 0.7624, "step": 868 }, { "epoch": 0.07057581418013481, "grad_norm": 4.066778591214372, "learning_rate": 4.978493965546738e-06, "loss": 0.4797, "step": 869 }, { "epoch": 0.0706570291561764, "grad_norm": 4.458730486420523, "learning_rate": 4.9784078070637076e-06, "loss": 0.7739, "step": 870 }, { "epoch": 0.07073824413221798, "grad_norm": 4.2724723822092425, "learning_rate": 4.978321477087974e-06, "loss": 0.5737, "step": 871 }, { "epoch": 0.07081945910825957, "grad_norm": 4.54951709242148, "learning_rate": 4.97823497562551e-06, "loss": 0.5447, "step": 872 }, { "epoch": 0.07090067408430115, "grad_norm": 4.839083574408698, "learning_rate": 4.978148302682301e-06, "loss": 0.7224, "step": 873 }, { "epoch": 0.07098188906034272, "grad_norm": 5.90958403636814, "learning_rate": 4.978061458264346e-06, "loss": 0.6231, "step": 874 }, { "epoch": 0.07106310403638431, "grad_norm": 4.950954034585027, "learning_rate": 4.977974442377652e-06, "loss": 0.5298, "step": 875 }, { "epoch": 0.07114431901242589, "grad_norm": 4.705158462536222, "learning_rate": 4.977887255028241e-06, "loss": 0.5893, "step": 876 }, { "epoch": 0.07122553398846747, "grad_norm": 6.237199358592749, "learning_rate": 4.977799896222148e-06, "loss": 0.6342, "step": 877 }, { "epoch": 0.07130674896450906, "grad_norm": 5.842398936618045, "learning_rate": 4.977712365965414e-06, "loss": 0.6228, "step": 878 }, { "epoch": 0.07138796394055064, "grad_norm": 4.674400039131552, "learning_rate": 4.9776246642640965e-06, "loss": 0.639, "step": 879 }, { "epoch": 0.07146917891659221, "grad_norm": 6.64025918584984, "learning_rate": 4.977536791124267e-06, "loss": 0.6026, "step": 880 }, { "epoch": 0.0715503938926338, "grad_norm": 6.036395710793269, "learning_rate": 4.9774487465520025e-06, "loss": 0.8198, "step": 881 }, { "epoch": 0.07163160886867538, "grad_norm": 4.761294703683568, "learning_rate": 4.977360530553397e-06, "loss": 0.5251, "step": 882 }, { "epoch": 0.07171282384471697, "grad_norm": 4.922109235469366, "learning_rate": 4.977272143134554e-06, "loss": 0.6486, "step": 883 }, { "epoch": 0.07179403882075855, "grad_norm": 4.672737077545084, "learning_rate": 4.97718358430159e-06, "loss": 0.6457, "step": 884 }, { "epoch": 0.07187525379680013, "grad_norm": 8.289465047137561, "learning_rate": 4.977094854060631e-06, "loss": 0.6234, "step": 885 }, { "epoch": 0.07195646877284172, "grad_norm": 5.587306517848276, "learning_rate": 4.977005952417818e-06, "loss": 0.6662, "step": 886 }, { "epoch": 0.0720376837488833, "grad_norm": 3.711622831241775, "learning_rate": 4.9769168793793036e-06, "loss": 0.7314, "step": 887 }, { "epoch": 0.07211889872492487, "grad_norm": 7.5781498502969145, "learning_rate": 4.976827634951249e-06, "loss": 0.6133, "step": 888 }, { "epoch": 0.07220011370096646, "grad_norm": 4.823514400373474, "learning_rate": 4.976738219139831e-06, "loss": 0.6408, "step": 889 }, { "epoch": 0.07228132867700804, "grad_norm": 8.761667850604423, "learning_rate": 4.976648631951236e-06, "loss": 0.6452, "step": 890 }, { "epoch": 0.07236254365304962, "grad_norm": 3.571671073730977, "learning_rate": 4.976558873391663e-06, "loss": 0.6598, "step": 891 }, { "epoch": 0.0724437586290912, "grad_norm": 5.322524151802511, "learning_rate": 4.976468943467323e-06, "loss": 0.5386, "step": 892 }, { "epoch": 0.07252497360513278, "grad_norm": 5.66163748773043, "learning_rate": 4.976378842184439e-06, "loss": 0.5516, "step": 893 }, { "epoch": 0.07260618858117437, "grad_norm": 5.353033651730894, "learning_rate": 4.9762885695492454e-06, "loss": 0.6012, "step": 894 }, { "epoch": 0.07268740355721595, "grad_norm": 6.855863397519137, "learning_rate": 4.976198125567988e-06, "loss": 0.6303, "step": 895 }, { "epoch": 0.07276861853325753, "grad_norm": 4.84169944440175, "learning_rate": 4.976107510246925e-06, "loss": 0.528, "step": 896 }, { "epoch": 0.07284983350929912, "grad_norm": 5.092992144693499, "learning_rate": 4.976016723592328e-06, "loss": 0.5648, "step": 897 }, { "epoch": 0.0729310484853407, "grad_norm": 4.858637678276697, "learning_rate": 4.975925765610476e-06, "loss": 0.5921, "step": 898 }, { "epoch": 0.07301226346138227, "grad_norm": 4.995201775650152, "learning_rate": 4.975834636307667e-06, "loss": 0.6145, "step": 899 }, { "epoch": 0.07309347843742386, "grad_norm": 6.2150093032360365, "learning_rate": 4.975743335690203e-06, "loss": 0.5049, "step": 900 }, { "epoch": 0.07317469341346544, "grad_norm": 5.737916586640195, "learning_rate": 4.975651863764403e-06, "loss": 0.5949, "step": 901 }, { "epoch": 0.07325590838950702, "grad_norm": 5.21474627504135, "learning_rate": 4.975560220536596e-06, "loss": 0.8498, "step": 902 }, { "epoch": 0.07333712336554861, "grad_norm": 3.9495934594747877, "learning_rate": 4.975468406013124e-06, "loss": 0.6854, "step": 903 }, { "epoch": 0.07341833834159019, "grad_norm": 7.580396023436531, "learning_rate": 4.97537642020034e-06, "loss": 0.585, "step": 904 }, { "epoch": 0.07349955331763178, "grad_norm": 6.081348176726435, "learning_rate": 4.9752842631046075e-06, "loss": 0.5681, "step": 905 }, { "epoch": 0.07358076829367335, "grad_norm": 7.6219497191818695, "learning_rate": 4.975191934732304e-06, "loss": 0.5283, "step": 906 }, { "epoch": 0.07366198326971493, "grad_norm": 7.644719180949304, "learning_rate": 4.975099435089819e-06, "loss": 0.544, "step": 907 }, { "epoch": 0.07374319824575652, "grad_norm": 4.967279914364186, "learning_rate": 4.975006764183552e-06, "loss": 0.6976, "step": 908 }, { "epoch": 0.0738244132217981, "grad_norm": 4.727348872362556, "learning_rate": 4.974913922019916e-06, "loss": 0.6466, "step": 909 }, { "epoch": 0.07390562819783968, "grad_norm": 6.6416921364354895, "learning_rate": 4.974820908605336e-06, "loss": 0.5407, "step": 910 }, { "epoch": 0.07398684317388127, "grad_norm": 4.934581673705169, "learning_rate": 4.974727723946245e-06, "loss": 0.6653, "step": 911 }, { "epoch": 0.07406805814992284, "grad_norm": 3.2488987014947286, "learning_rate": 4.974634368049094e-06, "loss": 0.5007, "step": 912 }, { "epoch": 0.07414927312596442, "grad_norm": 6.227255018738214, "learning_rate": 4.974540840920341e-06, "loss": 0.5501, "step": 913 }, { "epoch": 0.07423048810200601, "grad_norm": 4.977844807059578, "learning_rate": 4.974447142566458e-06, "loss": 0.7246, "step": 914 }, { "epoch": 0.07431170307804759, "grad_norm": 4.9582732140375105, "learning_rate": 4.974353272993929e-06, "loss": 0.6714, "step": 915 }, { "epoch": 0.07439291805408918, "grad_norm": 5.123191354056708, "learning_rate": 4.974259232209249e-06, "loss": 0.7354, "step": 916 }, { "epoch": 0.07447413303013076, "grad_norm": 5.206554108430837, "learning_rate": 4.9741650202189245e-06, "loss": 0.6421, "step": 917 }, { "epoch": 0.07455534800617233, "grad_norm": 8.459450047833085, "learning_rate": 4.9740706370294755e-06, "loss": 0.8359, "step": 918 }, { "epoch": 0.07463656298221392, "grad_norm": 4.78007740463755, "learning_rate": 4.973976082647432e-06, "loss": 0.6941, "step": 919 }, { "epoch": 0.0747177779582555, "grad_norm": 4.06407753833687, "learning_rate": 4.9738813570793365e-06, "loss": 0.6078, "step": 920 }, { "epoch": 0.07479899293429708, "grad_norm": 6.4634082754205915, "learning_rate": 4.973786460331744e-06, "loss": 0.6072, "step": 921 }, { "epoch": 0.07488020791033867, "grad_norm": 4.794171216936538, "learning_rate": 4.973691392411221e-06, "loss": 0.6489, "step": 922 }, { "epoch": 0.07496142288638025, "grad_norm": 4.048880670381879, "learning_rate": 4.973596153324346e-06, "loss": 0.6415, "step": 923 }, { "epoch": 0.07504263786242182, "grad_norm": 4.260077508939967, "learning_rate": 4.973500743077707e-06, "loss": 0.5398, "step": 924 }, { "epoch": 0.07512385283846341, "grad_norm": 4.630184924560269, "learning_rate": 4.9734051616779085e-06, "loss": 0.5155, "step": 925 }, { "epoch": 0.07520506781450499, "grad_norm": 6.107378791436437, "learning_rate": 4.973309409131564e-06, "loss": 0.5784, "step": 926 }, { "epoch": 0.07528628279054658, "grad_norm": 4.991440001564157, "learning_rate": 4.973213485445298e-06, "loss": 0.6463, "step": 927 }, { "epoch": 0.07536749776658816, "grad_norm": 3.9596877989124017, "learning_rate": 4.973117390625746e-06, "loss": 0.5694, "step": 928 }, { "epoch": 0.07544871274262974, "grad_norm": 5.9210563584390385, "learning_rate": 4.9730211246795614e-06, "loss": 0.5044, "step": 929 }, { "epoch": 0.07552992771867133, "grad_norm": 6.400935356399938, "learning_rate": 4.9729246876134015e-06, "loss": 0.5711, "step": 930 }, { "epoch": 0.0756111426947129, "grad_norm": 6.074570162257794, "learning_rate": 4.9728280794339426e-06, "loss": 0.7171, "step": 931 }, { "epoch": 0.07569235767075448, "grad_norm": 7.525931570888455, "learning_rate": 4.972731300147867e-06, "loss": 0.5338, "step": 932 }, { "epoch": 0.07577357264679607, "grad_norm": 5.22558586071865, "learning_rate": 4.972634349761873e-06, "loss": 0.5591, "step": 933 }, { "epoch": 0.07585478762283765, "grad_norm": 10.047768244682082, "learning_rate": 4.972537228282668e-06, "loss": 0.6477, "step": 934 }, { "epoch": 0.07593600259887923, "grad_norm": 6.876714921725991, "learning_rate": 4.972439935716972e-06, "loss": 0.5482, "step": 935 }, { "epoch": 0.07601721757492082, "grad_norm": 6.550177057547305, "learning_rate": 4.972342472071518e-06, "loss": 0.5923, "step": 936 }, { "epoch": 0.0760984325509624, "grad_norm": 5.709520994341491, "learning_rate": 4.97224483735305e-06, "loss": 0.6293, "step": 937 }, { "epoch": 0.07617964752700399, "grad_norm": 4.727570596744493, "learning_rate": 4.972147031568322e-06, "loss": 0.5793, "step": 938 }, { "epoch": 0.07626086250304556, "grad_norm": 5.251994860118902, "learning_rate": 4.972049054724104e-06, "loss": 0.7869, "step": 939 }, { "epoch": 0.07634207747908714, "grad_norm": 7.152014425413055, "learning_rate": 4.9719509068271755e-06, "loss": 0.5164, "step": 940 }, { "epoch": 0.07642329245512873, "grad_norm": 9.90794133427569, "learning_rate": 4.971852587884325e-06, "loss": 0.5625, "step": 941 }, { "epoch": 0.07650450743117031, "grad_norm": 5.980631957938455, "learning_rate": 4.97175409790236e-06, "loss": 0.6737, "step": 942 }, { "epoch": 0.07658572240721188, "grad_norm": 11.229665313768436, "learning_rate": 4.97165543688809e-06, "loss": 0.6511, "step": 943 }, { "epoch": 0.07666693738325348, "grad_norm": 15.507076859457626, "learning_rate": 4.971556604848346e-06, "loss": 0.595, "step": 944 }, { "epoch": 0.07674815235929505, "grad_norm": 4.875894631175207, "learning_rate": 4.971457601789966e-06, "loss": 0.5992, "step": 945 }, { "epoch": 0.07682936733533664, "grad_norm": 4.406952012061629, "learning_rate": 4.9713584277198e-06, "loss": 0.5121, "step": 946 }, { "epoch": 0.07691058231137822, "grad_norm": 5.211822267384949, "learning_rate": 4.97125908264471e-06, "loss": 0.521, "step": 947 }, { "epoch": 0.0769917972874198, "grad_norm": 6.646319909846443, "learning_rate": 4.97115956657157e-06, "loss": 0.5977, "step": 948 }, { "epoch": 0.07707301226346139, "grad_norm": 5.951258269254235, "learning_rate": 4.971059879507268e-06, "loss": 0.6193, "step": 949 }, { "epoch": 0.07715422723950296, "grad_norm": 15.849437156314497, "learning_rate": 4.970960021458699e-06, "loss": 0.7991, "step": 950 }, { "epoch": 0.07723544221554454, "grad_norm": 4.470056142523615, "learning_rate": 4.9708599924327735e-06, "loss": 0.5992, "step": 951 }, { "epoch": 0.07731665719158613, "grad_norm": 7.357239226247135, "learning_rate": 4.970759792436414e-06, "loss": 0.574, "step": 952 }, { "epoch": 0.07739787216762771, "grad_norm": 6.193524739615741, "learning_rate": 4.970659421476553e-06, "loss": 0.6871, "step": 953 }, { "epoch": 0.07747908714366929, "grad_norm": 6.192316599355833, "learning_rate": 4.970558879560137e-06, "loss": 0.6069, "step": 954 }, { "epoch": 0.07756030211971088, "grad_norm": 7.025324708867777, "learning_rate": 4.97045816669412e-06, "loss": 0.45, "step": 955 }, { "epoch": 0.07764151709575245, "grad_norm": 5.808792647688984, "learning_rate": 4.970357282885473e-06, "loss": 0.7007, "step": 956 }, { "epoch": 0.07772273207179405, "grad_norm": 5.800444379893061, "learning_rate": 4.970256228141177e-06, "loss": 0.5379, "step": 957 }, { "epoch": 0.07780394704783562, "grad_norm": 4.764091635977128, "learning_rate": 4.970155002468223e-06, "loss": 0.6805, "step": 958 }, { "epoch": 0.0778851620238772, "grad_norm": 5.223528426735964, "learning_rate": 4.970053605873616e-06, "loss": 0.6757, "step": 959 }, { "epoch": 0.07796637699991879, "grad_norm": 7.117293048534837, "learning_rate": 4.969952038364372e-06, "loss": 0.5716, "step": 960 }, { "epoch": 0.07804759197596037, "grad_norm": 8.849272822881796, "learning_rate": 4.96985029994752e-06, "loss": 0.7158, "step": 961 }, { "epoch": 0.07812880695200194, "grad_norm": 4.323247038552405, "learning_rate": 4.969748390630097e-06, "loss": 0.7014, "step": 962 }, { "epoch": 0.07821002192804354, "grad_norm": 3.15078482423997, "learning_rate": 4.969646310419157e-06, "loss": 0.6552, "step": 963 }, { "epoch": 0.07829123690408511, "grad_norm": 5.142051674034742, "learning_rate": 4.9695440593217635e-06, "loss": 0.5731, "step": 964 }, { "epoch": 0.07837245188012669, "grad_norm": 4.530754006814891, "learning_rate": 4.96944163734499e-06, "loss": 0.4648, "step": 965 }, { "epoch": 0.07845366685616828, "grad_norm": 5.999396192869684, "learning_rate": 4.969339044495925e-06, "loss": 0.6981, "step": 966 }, { "epoch": 0.07853488183220986, "grad_norm": 4.570316628434694, "learning_rate": 4.969236280781667e-06, "loss": 0.5834, "step": 967 }, { "epoch": 0.07861609680825145, "grad_norm": 29.36059594145154, "learning_rate": 4.9691333462093264e-06, "loss": 0.6442, "step": 968 }, { "epoch": 0.07869731178429303, "grad_norm": 4.95470560540118, "learning_rate": 4.969030240786026e-06, "loss": 0.4758, "step": 969 }, { "epoch": 0.0787785267603346, "grad_norm": 3.893402872277116, "learning_rate": 4.9689269645189e-06, "loss": 0.6544, "step": 970 }, { "epoch": 0.0788597417363762, "grad_norm": 5.3148301362547885, "learning_rate": 4.968823517415095e-06, "loss": 0.611, "step": 971 }, { "epoch": 0.07894095671241777, "grad_norm": 5.03652657763463, "learning_rate": 4.9687198994817685e-06, "loss": 0.6703, "step": 972 }, { "epoch": 0.07902217168845935, "grad_norm": 4.100966531743051, "learning_rate": 4.9686161107260906e-06, "loss": 0.5816, "step": 973 }, { "epoch": 0.07910338666450094, "grad_norm": 5.116827473240173, "learning_rate": 4.968512151155242e-06, "loss": 0.5917, "step": 974 }, { "epoch": 0.07918460164054252, "grad_norm": 3.2335435030359596, "learning_rate": 4.968408020776419e-06, "loss": 0.5538, "step": 975 }, { "epoch": 0.07926581661658409, "grad_norm": 7.5591035161901035, "learning_rate": 4.968303719596823e-06, "loss": 0.461, "step": 976 }, { "epoch": 0.07934703159262568, "grad_norm": 10.075832303176476, "learning_rate": 4.9681992476236725e-06, "loss": 0.6354, "step": 977 }, { "epoch": 0.07942824656866726, "grad_norm": 5.561625086524589, "learning_rate": 4.968094604864198e-06, "loss": 0.5383, "step": 978 }, { "epoch": 0.07950946154470885, "grad_norm": 5.6568874477353095, "learning_rate": 4.967989791325639e-06, "loss": 0.6593, "step": 979 }, { "epoch": 0.07959067652075043, "grad_norm": 6.416186383122935, "learning_rate": 4.967884807015247e-06, "loss": 0.833, "step": 980 }, { "epoch": 0.079671891496792, "grad_norm": 5.7530564697103355, "learning_rate": 4.967779651940289e-06, "loss": 0.7025, "step": 981 }, { "epoch": 0.0797531064728336, "grad_norm": 6.850642228085424, "learning_rate": 4.967674326108039e-06, "loss": 0.5582, "step": 982 }, { "epoch": 0.07983432144887517, "grad_norm": 4.208095970275836, "learning_rate": 4.9675688295257855e-06, "loss": 0.514, "step": 983 }, { "epoch": 0.07991553642491675, "grad_norm": 5.1148065579680075, "learning_rate": 4.967463162200828e-06, "loss": 0.6708, "step": 984 }, { "epoch": 0.07999675140095834, "grad_norm": 7.806584044546556, "learning_rate": 4.967357324140479e-06, "loss": 0.639, "step": 985 }, { "epoch": 0.08007796637699992, "grad_norm": 7.822904916898733, "learning_rate": 4.967251315352062e-06, "loss": 0.7296, "step": 986 }, { "epoch": 0.0801591813530415, "grad_norm": 4.251509260453048, "learning_rate": 4.9671451358429115e-06, "loss": 0.6169, "step": 987 }, { "epoch": 0.08024039632908309, "grad_norm": 3.9712450920352014, "learning_rate": 4.967038785620374e-06, "loss": 0.6307, "step": 988 }, { "epoch": 0.08032161130512466, "grad_norm": 5.013282143950785, "learning_rate": 4.96693226469181e-06, "loss": 0.6896, "step": 989 }, { "epoch": 0.08040282628116625, "grad_norm": 6.309239309855388, "learning_rate": 4.966825573064589e-06, "loss": 0.5816, "step": 990 }, { "epoch": 0.08048404125720783, "grad_norm": 17.18761480554951, "learning_rate": 4.9667187107460934e-06, "loss": 0.7024, "step": 991 }, { "epoch": 0.08056525623324941, "grad_norm": 8.511668880504516, "learning_rate": 4.966611677743719e-06, "loss": 0.6545, "step": 992 }, { "epoch": 0.080646471209291, "grad_norm": 7.07740712784662, "learning_rate": 4.96650447406487e-06, "loss": 0.6241, "step": 993 }, { "epoch": 0.08072768618533258, "grad_norm": 6.4092146264144505, "learning_rate": 4.966397099716965e-06, "loss": 0.6096, "step": 994 }, { "epoch": 0.08080890116137415, "grad_norm": 3.371066265022369, "learning_rate": 4.9662895547074345e-06, "loss": 0.6807, "step": 995 }, { "epoch": 0.08089011613741574, "grad_norm": 11.15283861894202, "learning_rate": 4.96618183904372e-06, "loss": 0.69, "step": 996 }, { "epoch": 0.08097133111345732, "grad_norm": 5.584728402833689, "learning_rate": 4.966073952733273e-06, "loss": 0.6545, "step": 997 }, { "epoch": 0.0810525460894989, "grad_norm": 4.928404998187488, "learning_rate": 4.965965895783561e-06, "loss": 0.7363, "step": 998 }, { "epoch": 0.08113376106554049, "grad_norm": 5.18162526836767, "learning_rate": 4.96585766820206e-06, "loss": 0.7405, "step": 999 }, { "epoch": 0.08121497604158207, "grad_norm": 5.108535543984558, "learning_rate": 4.965749269996258e-06, "loss": 0.4666, "step": 1000 }, { "epoch": 0.08129619101762366, "grad_norm": 4.081215207318422, "learning_rate": 4.965640701173657e-06, "loss": 0.6248, "step": 1001 }, { "epoch": 0.08137740599366523, "grad_norm": 4.819602648972772, "learning_rate": 4.9655319617417674e-06, "loss": 0.6029, "step": 1002 }, { "epoch": 0.08145862096970681, "grad_norm": 3.672836794950573, "learning_rate": 4.965423051708116e-06, "loss": 0.7782, "step": 1003 }, { "epoch": 0.0815398359457484, "grad_norm": 4.726346322374398, "learning_rate": 4.965313971080237e-06, "loss": 0.5324, "step": 1004 }, { "epoch": 0.08162105092178998, "grad_norm": 5.155470667426197, "learning_rate": 4.96520471986568e-06, "loss": 0.666, "step": 1005 }, { "epoch": 0.08170226589783156, "grad_norm": 4.923143643344893, "learning_rate": 4.965095298072001e-06, "loss": 0.5542, "step": 1006 }, { "epoch": 0.08178348087387315, "grad_norm": 4.1341959962667305, "learning_rate": 4.964985705706775e-06, "loss": 0.5911, "step": 1007 }, { "epoch": 0.08186469584991472, "grad_norm": 5.790946529810634, "learning_rate": 4.964875942777584e-06, "loss": 0.7514, "step": 1008 }, { "epoch": 0.0819459108259563, "grad_norm": 5.188744184884793, "learning_rate": 4.964766009292022e-06, "loss": 0.704, "step": 1009 }, { "epoch": 0.08202712580199789, "grad_norm": 9.318471720754397, "learning_rate": 4.9646559052576985e-06, "loss": 0.6622, "step": 1010 }, { "epoch": 0.08210834077803947, "grad_norm": 4.902314685975934, "learning_rate": 4.9645456306822285e-06, "loss": 0.5447, "step": 1011 }, { "epoch": 0.08218955575408106, "grad_norm": 5.828651116573771, "learning_rate": 4.964435185573245e-06, "loss": 0.5288, "step": 1012 }, { "epoch": 0.08227077073012264, "grad_norm": 2.864291283393185, "learning_rate": 4.96432456993839e-06, "loss": 0.6403, "step": 1013 }, { "epoch": 0.08235198570616421, "grad_norm": 5.284219787159016, "learning_rate": 4.964213783785317e-06, "loss": 0.6291, "step": 1014 }, { "epoch": 0.0824332006822058, "grad_norm": 4.43415065344841, "learning_rate": 4.9641028271216905e-06, "loss": 0.8372, "step": 1015 }, { "epoch": 0.08251441565824738, "grad_norm": 4.37746143116903, "learning_rate": 4.9639916999551905e-06, "loss": 0.5823, "step": 1016 }, { "epoch": 0.08259563063428896, "grad_norm": 15.987254695136924, "learning_rate": 4.963880402293506e-06, "loss": 0.6104, "step": 1017 }, { "epoch": 0.08267684561033055, "grad_norm": 5.7244481019012925, "learning_rate": 4.963768934144336e-06, "loss": 0.7552, "step": 1018 }, { "epoch": 0.08275806058637213, "grad_norm": 4.217271904051636, "learning_rate": 4.963657295515396e-06, "loss": 0.6159, "step": 1019 }, { "epoch": 0.0828392755624137, "grad_norm": 7.382613542004634, "learning_rate": 4.963545486414411e-06, "loss": 0.6277, "step": 1020 }, { "epoch": 0.0829204905384553, "grad_norm": 5.353844268940195, "learning_rate": 4.963433506849115e-06, "loss": 0.738, "step": 1021 }, { "epoch": 0.08300170551449687, "grad_norm": 10.18166117587427, "learning_rate": 4.963321356827258e-06, "loss": 0.6173, "step": 1022 }, { "epoch": 0.08308292049053846, "grad_norm": 7.319414111585189, "learning_rate": 4.9632090363565995e-06, "loss": 0.6377, "step": 1023 }, { "epoch": 0.08316413546658004, "grad_norm": 6.252416872046548, "learning_rate": 4.963096545444913e-06, "loss": 0.5095, "step": 1024 }, { "epoch": 0.08324535044262162, "grad_norm": 4.976076953773711, "learning_rate": 4.962983884099981e-06, "loss": 0.5225, "step": 1025 }, { "epoch": 0.0833265654186632, "grad_norm": 6.865653287550914, "learning_rate": 4.9628710523296e-06, "loss": 0.5425, "step": 1026 }, { "epoch": 0.08340778039470478, "grad_norm": 5.6514607039180005, "learning_rate": 4.962758050141576e-06, "loss": 0.5492, "step": 1027 }, { "epoch": 0.08348899537074636, "grad_norm": 8.200634823957119, "learning_rate": 4.962644877543729e-06, "loss": 0.5683, "step": 1028 }, { "epoch": 0.08357021034678795, "grad_norm": 4.089877382417978, "learning_rate": 4.96253153454389e-06, "loss": 0.7186, "step": 1029 }, { "epoch": 0.08365142532282953, "grad_norm": 4.918100288171265, "learning_rate": 4.9624180211499004e-06, "loss": 0.4817, "step": 1030 }, { "epoch": 0.0837326402988711, "grad_norm": 5.979359137308161, "learning_rate": 4.962304337369618e-06, "loss": 0.575, "step": 1031 }, { "epoch": 0.0838138552749127, "grad_norm": 5.9147018999524645, "learning_rate": 4.962190483210906e-06, "loss": 0.5979, "step": 1032 }, { "epoch": 0.08389507025095427, "grad_norm": 3.8882956927363614, "learning_rate": 4.962076458681642e-06, "loss": 0.5231, "step": 1033 }, { "epoch": 0.08397628522699586, "grad_norm": 8.61999834460005, "learning_rate": 4.96196226378972e-06, "loss": 0.6006, "step": 1034 }, { "epoch": 0.08405750020303744, "grad_norm": 4.813530840618122, "learning_rate": 4.961847898543038e-06, "loss": 0.7872, "step": 1035 }, { "epoch": 0.08413871517907902, "grad_norm": 6.218523263233032, "learning_rate": 4.96173336294951e-06, "loss": 0.6882, "step": 1036 }, { "epoch": 0.08421993015512061, "grad_norm": 5.495336585299689, "learning_rate": 4.961618657017063e-06, "loss": 0.828, "step": 1037 }, { "epoch": 0.08430114513116219, "grad_norm": 6.714525494166789, "learning_rate": 4.961503780753633e-06, "loss": 0.8362, "step": 1038 }, { "epoch": 0.08438236010720376, "grad_norm": 3.212772209979488, "learning_rate": 4.9613887341671675e-06, "loss": 0.4224, "step": 1039 }, { "epoch": 0.08446357508324535, "grad_norm": 7.183836025951974, "learning_rate": 4.961273517265629e-06, "loss": 0.5646, "step": 1040 }, { "epoch": 0.08454479005928693, "grad_norm": 6.286950241807605, "learning_rate": 4.961158130056989e-06, "loss": 0.5711, "step": 1041 }, { "epoch": 0.08462600503532851, "grad_norm": 5.85967031646826, "learning_rate": 4.961042572549232e-06, "loss": 0.4819, "step": 1042 }, { "epoch": 0.0847072200113701, "grad_norm": 4.788980234170179, "learning_rate": 4.960926844750353e-06, "loss": 0.5843, "step": 1043 }, { "epoch": 0.08478843498741168, "grad_norm": 6.955485093136964, "learning_rate": 4.960810946668362e-06, "loss": 0.5618, "step": 1044 }, { "epoch": 0.08486964996345327, "grad_norm": 5.484890003960757, "learning_rate": 4.960694878311276e-06, "loss": 0.6486, "step": 1045 }, { "epoch": 0.08495086493949484, "grad_norm": 5.542258816933063, "learning_rate": 4.960578639687129e-06, "loss": 0.4978, "step": 1046 }, { "epoch": 0.08503207991553642, "grad_norm": 5.931712433067937, "learning_rate": 4.960462230803961e-06, "loss": 0.6843, "step": 1047 }, { "epoch": 0.08511329489157801, "grad_norm": 5.759837139562491, "learning_rate": 4.960345651669829e-06, "loss": 0.5678, "step": 1048 }, { "epoch": 0.08519450986761959, "grad_norm": 8.019301092937635, "learning_rate": 4.960228902292799e-06, "loss": 0.5629, "step": 1049 }, { "epoch": 0.08527572484366117, "grad_norm": 4.806150258046436, "learning_rate": 4.96011198268095e-06, "loss": 0.4966, "step": 1050 }, { "epoch": 0.08535693981970276, "grad_norm": 6.278721005688989, "learning_rate": 4.959994892842371e-06, "loss": 0.6009, "step": 1051 }, { "epoch": 0.08543815479574433, "grad_norm": 4.006330485950786, "learning_rate": 4.959877632785166e-06, "loss": 0.538, "step": 1052 }, { "epoch": 0.08551936977178591, "grad_norm": 6.0015668910114, "learning_rate": 4.959760202517446e-06, "loss": 0.5693, "step": 1053 }, { "epoch": 0.0856005847478275, "grad_norm": 17.347793241402837, "learning_rate": 4.959642602047339e-06, "loss": 0.5231, "step": 1054 }, { "epoch": 0.08568179972386908, "grad_norm": 4.788609767567765, "learning_rate": 4.959524831382981e-06, "loss": 0.6282, "step": 1055 }, { "epoch": 0.08576301469991067, "grad_norm": 10.272501989404129, "learning_rate": 4.9594068905325225e-06, "loss": 0.7099, "step": 1056 }, { "epoch": 0.08584422967595225, "grad_norm": 5.360284497181169, "learning_rate": 4.959288779504122e-06, "loss": 0.5964, "step": 1057 }, { "epoch": 0.08592544465199382, "grad_norm": 4.376816971311538, "learning_rate": 4.959170498305955e-06, "loss": 0.6034, "step": 1058 }, { "epoch": 0.08600665962803541, "grad_norm": 7.410246905461928, "learning_rate": 4.959052046946203e-06, "loss": 0.5496, "step": 1059 }, { "epoch": 0.08608787460407699, "grad_norm": 3.455502795284755, "learning_rate": 4.958933425433065e-06, "loss": 0.6964, "step": 1060 }, { "epoch": 0.08616908958011857, "grad_norm": 5.754779871120552, "learning_rate": 4.958814633774747e-06, "loss": 0.5759, "step": 1061 }, { "epoch": 0.08625030455616016, "grad_norm": 5.304809027548407, "learning_rate": 4.95869567197947e-06, "loss": 0.807, "step": 1062 }, { "epoch": 0.08633151953220174, "grad_norm": 6.315153689798026, "learning_rate": 4.958576540055464e-06, "loss": 0.6127, "step": 1063 }, { "epoch": 0.08641273450824331, "grad_norm": 5.159058407796672, "learning_rate": 4.958457238010974e-06, "loss": 0.5677, "step": 1064 }, { "epoch": 0.0864939494842849, "grad_norm": 6.440705718742274, "learning_rate": 4.958337765854254e-06, "loss": 0.6875, "step": 1065 }, { "epoch": 0.08657516446032648, "grad_norm": 5.084236342123366, "learning_rate": 4.958218123593572e-06, "loss": 0.7366, "step": 1066 }, { "epoch": 0.08665637943636807, "grad_norm": 3.7240825747904447, "learning_rate": 4.958098311237205e-06, "loss": 0.674, "step": 1067 }, { "epoch": 0.08673759441240965, "grad_norm": 5.058020669894491, "learning_rate": 4.9579783287934445e-06, "loss": 0.6569, "step": 1068 }, { "epoch": 0.08681880938845123, "grad_norm": 4.0274556261885515, "learning_rate": 4.957858176270591e-06, "loss": 0.6855, "step": 1069 }, { "epoch": 0.08690002436449282, "grad_norm": 7.179073356157699, "learning_rate": 4.957737853676961e-06, "loss": 0.58, "step": 1070 }, { "epoch": 0.0869812393405344, "grad_norm": 4.03973215146239, "learning_rate": 4.957617361020879e-06, "loss": 0.515, "step": 1071 }, { "epoch": 0.08706245431657597, "grad_norm": 7.191602127334415, "learning_rate": 4.9574966983106824e-06, "loss": 0.7152, "step": 1072 }, { "epoch": 0.08714366929261756, "grad_norm": 4.274773215099363, "learning_rate": 4.95737586555472e-06, "loss": 0.6789, "step": 1073 }, { "epoch": 0.08722488426865914, "grad_norm": 4.114285344199829, "learning_rate": 4.957254862761354e-06, "loss": 0.7913, "step": 1074 }, { "epoch": 0.08730609924470072, "grad_norm": 6.861324021554996, "learning_rate": 4.957133689938955e-06, "loss": 0.7393, "step": 1075 }, { "epoch": 0.08738731422074231, "grad_norm": 6.978671101537034, "learning_rate": 4.95701234709591e-06, "loss": 0.6303, "step": 1076 }, { "epoch": 0.08746852919678388, "grad_norm": 9.578370134137916, "learning_rate": 4.956890834240613e-06, "loss": 0.6739, "step": 1077 }, { "epoch": 0.08754974417282547, "grad_norm": 4.953780236460816, "learning_rate": 4.956769151381474e-06, "loss": 0.6609, "step": 1078 }, { "epoch": 0.08763095914886705, "grad_norm": 6.102476761076953, "learning_rate": 4.9566472985269125e-06, "loss": 0.5512, "step": 1079 }, { "epoch": 0.08771217412490863, "grad_norm": 4.373415052928376, "learning_rate": 4.956525275685358e-06, "loss": 0.5459, "step": 1080 }, { "epoch": 0.08779338910095022, "grad_norm": 5.532793766061284, "learning_rate": 4.9564030828652565e-06, "loss": 0.6867, "step": 1081 }, { "epoch": 0.0878746040769918, "grad_norm": 3.8285573439072254, "learning_rate": 4.956280720075062e-06, "loss": 0.7538, "step": 1082 }, { "epoch": 0.08795581905303337, "grad_norm": 6.492766812771604, "learning_rate": 4.9561581873232415e-06, "loss": 0.5405, "step": 1083 }, { "epoch": 0.08803703402907496, "grad_norm": 12.542445075286674, "learning_rate": 4.956035484618272e-06, "loss": 0.5867, "step": 1084 }, { "epoch": 0.08811824900511654, "grad_norm": 5.8788039197140005, "learning_rate": 4.955912611968648e-06, "loss": 0.5364, "step": 1085 }, { "epoch": 0.08819946398115812, "grad_norm": 6.820020186044584, "learning_rate": 4.955789569382866e-06, "loss": 0.5554, "step": 1086 }, { "epoch": 0.08828067895719971, "grad_norm": 5.44097898708547, "learning_rate": 4.955666356869445e-06, "loss": 0.6424, "step": 1087 }, { "epoch": 0.08836189393324129, "grad_norm": 6.451838410522676, "learning_rate": 4.955542974436908e-06, "loss": 0.6376, "step": 1088 }, { "epoch": 0.08844310890928288, "grad_norm": 16.847875335305353, "learning_rate": 4.955419422093792e-06, "loss": 0.5683, "step": 1089 }, { "epoch": 0.08852432388532445, "grad_norm": 5.250679262222032, "learning_rate": 4.955295699848649e-06, "loss": 0.5726, "step": 1090 }, { "epoch": 0.08860553886136603, "grad_norm": 3.708739891574932, "learning_rate": 4.955171807710037e-06, "loss": 0.664, "step": 1091 }, { "epoch": 0.08868675383740762, "grad_norm": 6.291250688681625, "learning_rate": 4.955047745686529e-06, "loss": 0.5727, "step": 1092 }, { "epoch": 0.0887679688134492, "grad_norm": 9.32506667375153, "learning_rate": 4.954923513786711e-06, "loss": 0.6444, "step": 1093 }, { "epoch": 0.08884918378949078, "grad_norm": 5.9130602901334735, "learning_rate": 4.954799112019178e-06, "loss": 0.6713, "step": 1094 }, { "epoch": 0.08893039876553237, "grad_norm": 7.27169259231534, "learning_rate": 4.9546745403925385e-06, "loss": 0.598, "step": 1095 }, { "epoch": 0.08901161374157394, "grad_norm": 4.39443693321864, "learning_rate": 4.954549798915412e-06, "loss": 0.5987, "step": 1096 }, { "epoch": 0.08909282871761552, "grad_norm": 5.108191362159325, "learning_rate": 4.95442488759643e-06, "loss": 0.5794, "step": 1097 }, { "epoch": 0.08917404369365711, "grad_norm": 4.394532094744748, "learning_rate": 4.954299806444236e-06, "loss": 0.6292, "step": 1098 }, { "epoch": 0.08925525866969869, "grad_norm": 5.785973974044764, "learning_rate": 4.954174555467484e-06, "loss": 0.7976, "step": 1099 }, { "epoch": 0.08933647364574028, "grad_norm": 8.854881338118322, "learning_rate": 4.954049134674842e-06, "loss": 0.4992, "step": 1100 }, { "epoch": 0.08941768862178186, "grad_norm": 10.211232540471157, "learning_rate": 4.953923544074987e-06, "loss": 0.612, "step": 1101 }, { "epoch": 0.08949890359782343, "grad_norm": 3.8402408710819724, "learning_rate": 4.953797783676611e-06, "loss": 0.7663, "step": 1102 }, { "epoch": 0.08958011857386503, "grad_norm": 4.979965541200029, "learning_rate": 4.9536718534884136e-06, "loss": 0.5334, "step": 1103 }, { "epoch": 0.0896613335499066, "grad_norm": 5.601550825200515, "learning_rate": 4.9535457535191104e-06, "loss": 0.7872, "step": 1104 }, { "epoch": 0.08974254852594818, "grad_norm": 7.030742327798493, "learning_rate": 4.953419483777427e-06, "loss": 0.5551, "step": 1105 }, { "epoch": 0.08982376350198977, "grad_norm": 11.753328029891325, "learning_rate": 4.953293044272099e-06, "loss": 0.5917, "step": 1106 }, { "epoch": 0.08990497847803135, "grad_norm": 4.23164973424406, "learning_rate": 4.953166435011876e-06, "loss": 0.7211, "step": 1107 }, { "epoch": 0.08998619345407292, "grad_norm": 4.427467426240099, "learning_rate": 4.953039656005519e-06, "loss": 0.725, "step": 1108 }, { "epoch": 0.09006740843011452, "grad_norm": 6.656857308336458, "learning_rate": 4.9529127072618e-06, "loss": 0.7282, "step": 1109 }, { "epoch": 0.09014862340615609, "grad_norm": 4.606494946768081, "learning_rate": 4.952785588789504e-06, "loss": 0.6008, "step": 1110 }, { "epoch": 0.09022983838219768, "grad_norm": 4.861377291580884, "learning_rate": 4.9526583005974275e-06, "loss": 0.7185, "step": 1111 }, { "epoch": 0.09031105335823926, "grad_norm": 5.1172810841957785, "learning_rate": 4.952530842694375e-06, "loss": 0.451, "step": 1112 }, { "epoch": 0.09039226833428084, "grad_norm": 5.226532353483265, "learning_rate": 4.95240321508917e-06, "loss": 0.658, "step": 1113 }, { "epoch": 0.09047348331032243, "grad_norm": 5.970819410829736, "learning_rate": 4.952275417790641e-06, "loss": 0.6415, "step": 1114 }, { "epoch": 0.090554698286364, "grad_norm": 5.235598768790702, "learning_rate": 4.95214745080763e-06, "loss": 0.5784, "step": 1115 }, { "epoch": 0.09063591326240558, "grad_norm": 4.456877671742799, "learning_rate": 4.952019314148995e-06, "loss": 0.6458, "step": 1116 }, { "epoch": 0.09071712823844717, "grad_norm": 5.531350494822164, "learning_rate": 4.951891007823601e-06, "loss": 0.7039, "step": 1117 }, { "epoch": 0.09079834321448875, "grad_norm": 3.056795429652102, "learning_rate": 4.951762531840325e-06, "loss": 0.6661, "step": 1118 }, { "epoch": 0.09087955819053034, "grad_norm": 5.721632966434159, "learning_rate": 4.951633886208057e-06, "loss": 0.591, "step": 1119 }, { "epoch": 0.09096077316657192, "grad_norm": 8.349255186085697, "learning_rate": 4.951505070935699e-06, "loss": 0.6245, "step": 1120 }, { "epoch": 0.0910419881426135, "grad_norm": 4.572535564011483, "learning_rate": 4.951376086032166e-06, "loss": 0.7326, "step": 1121 }, { "epoch": 0.09112320311865509, "grad_norm": 4.793988248982188, "learning_rate": 4.95124693150638e-06, "loss": 0.6501, "step": 1122 }, { "epoch": 0.09120441809469666, "grad_norm": 4.303548964129903, "learning_rate": 4.951117607367281e-06, "loss": 0.507, "step": 1123 }, { "epoch": 0.09128563307073824, "grad_norm": 3.9997657093256613, "learning_rate": 4.9509881136238144e-06, "loss": 0.647, "step": 1124 }, { "epoch": 0.09136684804677983, "grad_norm": 6.458649539085403, "learning_rate": 4.950858450284943e-06, "loss": 0.7264, "step": 1125 }, { "epoch": 0.09144806302282141, "grad_norm": 8.005259138006465, "learning_rate": 4.950728617359637e-06, "loss": 0.7207, "step": 1126 }, { "epoch": 0.09152927799886298, "grad_norm": 10.787406113255107, "learning_rate": 4.950598614856882e-06, "loss": 0.7295, "step": 1127 }, { "epoch": 0.09161049297490458, "grad_norm": 7.384428417264019, "learning_rate": 4.950468442785672e-06, "loss": 0.7061, "step": 1128 }, { "epoch": 0.09169170795094615, "grad_norm": 6.688743726208685, "learning_rate": 4.9503381011550145e-06, "loss": 0.5785, "step": 1129 }, { "epoch": 0.09177292292698774, "grad_norm": 7.409666261351277, "learning_rate": 4.950207589973929e-06, "loss": 0.6196, "step": 1130 }, { "epoch": 0.09185413790302932, "grad_norm": 4.151052125921888, "learning_rate": 4.950076909251445e-06, "loss": 0.6793, "step": 1131 }, { "epoch": 0.0919353528790709, "grad_norm": 6.945429669541111, "learning_rate": 4.949946058996606e-06, "loss": 0.7033, "step": 1132 }, { "epoch": 0.09201656785511249, "grad_norm": 5.827624293232605, "learning_rate": 4.949815039218467e-06, "loss": 0.673, "step": 1133 }, { "epoch": 0.09209778283115407, "grad_norm": 7.170155218321833, "learning_rate": 4.949683849926092e-06, "loss": 0.5155, "step": 1134 }, { "epoch": 0.09217899780719564, "grad_norm": 4.143518961770841, "learning_rate": 4.949552491128559e-06, "loss": 0.4783, "step": 1135 }, { "epoch": 0.09226021278323723, "grad_norm": 4.759778607114406, "learning_rate": 4.9494209628349585e-06, "loss": 0.7102, "step": 1136 }, { "epoch": 0.09234142775927881, "grad_norm": 4.218386626424805, "learning_rate": 4.94928926505439e-06, "loss": 0.7196, "step": 1137 }, { "epoch": 0.09242264273532039, "grad_norm": 3.168695853330223, "learning_rate": 4.949157397795967e-06, "loss": 0.5415, "step": 1138 }, { "epoch": 0.09250385771136198, "grad_norm": 3.826938431817392, "learning_rate": 4.949025361068814e-06, "loss": 0.7162, "step": 1139 }, { "epoch": 0.09258507268740356, "grad_norm": 6.290926748077408, "learning_rate": 4.9488931548820685e-06, "loss": 0.5282, "step": 1140 }, { "epoch": 0.09266628766344515, "grad_norm": 7.032750557333736, "learning_rate": 4.9487607792448765e-06, "loss": 0.5308, "step": 1141 }, { "epoch": 0.09274750263948672, "grad_norm": 5.947078730213946, "learning_rate": 4.948628234166398e-06, "loss": 0.5358, "step": 1142 }, { "epoch": 0.0928287176155283, "grad_norm": 5.843171180813921, "learning_rate": 4.948495519655805e-06, "loss": 0.5523, "step": 1143 }, { "epoch": 0.09290993259156989, "grad_norm": 4.227961968923302, "learning_rate": 4.948362635722281e-06, "loss": 0.6363, "step": 1144 }, { "epoch": 0.09299114756761147, "grad_norm": 7.599291580837134, "learning_rate": 4.948229582375021e-06, "loss": 0.5243, "step": 1145 }, { "epoch": 0.09307236254365304, "grad_norm": 6.482980832600019, "learning_rate": 4.948096359623229e-06, "loss": 0.5535, "step": 1146 }, { "epoch": 0.09315357751969464, "grad_norm": 8.221446398590253, "learning_rate": 4.9479629674761265e-06, "loss": 0.603, "step": 1147 }, { "epoch": 0.09323479249573621, "grad_norm": 3.448295224589768, "learning_rate": 4.947829405942942e-06, "loss": 0.667, "step": 1148 }, { "epoch": 0.09331600747177779, "grad_norm": 3.719043167127091, "learning_rate": 4.947695675032919e-06, "loss": 0.5841, "step": 1149 }, { "epoch": 0.09339722244781938, "grad_norm": 4.156557050698605, "learning_rate": 4.947561774755307e-06, "loss": 0.6107, "step": 1150 }, { "epoch": 0.09347843742386096, "grad_norm": 4.192630167978793, "learning_rate": 4.947427705119375e-06, "loss": 0.4772, "step": 1151 }, { "epoch": 0.09355965239990255, "grad_norm": 5.508607665282113, "learning_rate": 4.947293466134399e-06, "loss": 0.6472, "step": 1152 }, { "epoch": 0.09364086737594413, "grad_norm": 6.1116074527140185, "learning_rate": 4.947159057809668e-06, "loss": 0.5252, "step": 1153 }, { "epoch": 0.0937220823519857, "grad_norm": 4.348443564825216, "learning_rate": 4.9470244801544794e-06, "loss": 0.6287, "step": 1154 }, { "epoch": 0.0938032973280273, "grad_norm": 4.398778822812722, "learning_rate": 4.94688973317815e-06, "loss": 0.6383, "step": 1155 }, { "epoch": 0.09388451230406887, "grad_norm": 8.481255729214475, "learning_rate": 4.946754816889999e-06, "loss": 0.6607, "step": 1156 }, { "epoch": 0.09396572728011045, "grad_norm": 8.572467423080703, "learning_rate": 4.946619731299365e-06, "loss": 0.5745, "step": 1157 }, { "epoch": 0.09404694225615204, "grad_norm": 7.588248988044493, "learning_rate": 4.946484476415593e-06, "loss": 0.5236, "step": 1158 }, { "epoch": 0.09412815723219362, "grad_norm": 6.803366010970415, "learning_rate": 4.946349052248044e-06, "loss": 0.4852, "step": 1159 }, { "epoch": 0.09420937220823519, "grad_norm": 3.614463840238927, "learning_rate": 4.946213458806088e-06, "loss": 0.4897, "step": 1160 }, { "epoch": 0.09429058718427678, "grad_norm": 4.806863459042568, "learning_rate": 4.946077696099107e-06, "loss": 0.6462, "step": 1161 }, { "epoch": 0.09437180216031836, "grad_norm": 5.367629362476872, "learning_rate": 4.945941764136494e-06, "loss": 0.5871, "step": 1162 }, { "epoch": 0.09445301713635995, "grad_norm": 3.4996137215165826, "learning_rate": 4.945805662927657e-06, "loss": 0.5799, "step": 1163 }, { "epoch": 0.09453423211240153, "grad_norm": 5.909531771722533, "learning_rate": 4.9456693924820124e-06, "loss": 0.672, "step": 1164 }, { "epoch": 0.0946154470884431, "grad_norm": 11.436424144343768, "learning_rate": 4.945532952808989e-06, "loss": 0.5327, "step": 1165 }, { "epoch": 0.0946966620644847, "grad_norm": 3.885963408729381, "learning_rate": 4.945396343918027e-06, "loss": 0.6624, "step": 1166 }, { "epoch": 0.09477787704052627, "grad_norm": 6.229973134745262, "learning_rate": 4.945259565818582e-06, "loss": 0.5427, "step": 1167 }, { "epoch": 0.09485909201656785, "grad_norm": 4.17803636708481, "learning_rate": 4.9451226185201155e-06, "loss": 0.5436, "step": 1168 }, { "epoch": 0.09494030699260944, "grad_norm": 3.7870816927723228, "learning_rate": 4.9449855020321045e-06, "loss": 0.6648, "step": 1169 }, { "epoch": 0.09502152196865102, "grad_norm": 5.063697581248885, "learning_rate": 4.944848216364036e-06, "loss": 0.6312, "step": 1170 }, { "epoch": 0.0951027369446926, "grad_norm": 7.690605712577253, "learning_rate": 4.944710761525411e-06, "loss": 0.62, "step": 1171 }, { "epoch": 0.09518395192073419, "grad_norm": 5.020085939206364, "learning_rate": 4.94457313752574e-06, "loss": 0.5703, "step": 1172 }, { "epoch": 0.09526516689677576, "grad_norm": 4.5490891920736525, "learning_rate": 4.944435344374544e-06, "loss": 0.7024, "step": 1173 }, { "epoch": 0.09534638187281735, "grad_norm": 5.913707485877068, "learning_rate": 4.944297382081361e-06, "loss": 0.5042, "step": 1174 }, { "epoch": 0.09542759684885893, "grad_norm": 3.759445496054164, "learning_rate": 4.944159250655734e-06, "loss": 0.7242, "step": 1175 }, { "epoch": 0.09550881182490051, "grad_norm": 4.721355611086034, "learning_rate": 4.944020950107224e-06, "loss": 0.6527, "step": 1176 }, { "epoch": 0.0955900268009421, "grad_norm": 4.718530932065967, "learning_rate": 4.943882480445398e-06, "loss": 0.6651, "step": 1177 }, { "epoch": 0.09567124177698368, "grad_norm": 3.7323028514817764, "learning_rate": 4.943743841679839e-06, "loss": 0.4774, "step": 1178 }, { "epoch": 0.09575245675302525, "grad_norm": 7.764398624185714, "learning_rate": 4.943605033820138e-06, "loss": 0.5687, "step": 1179 }, { "epoch": 0.09583367172906684, "grad_norm": 5.7460830112551715, "learning_rate": 4.943466056875903e-06, "loss": 0.6266, "step": 1180 }, { "epoch": 0.09591488670510842, "grad_norm": 6.12456544392407, "learning_rate": 4.943326910856749e-06, "loss": 0.9285, "step": 1181 }, { "epoch": 0.09599610168115, "grad_norm": 5.9339500045832505, "learning_rate": 4.943187595772302e-06, "loss": 0.572, "step": 1182 }, { "epoch": 0.09607731665719159, "grad_norm": 5.327929545992472, "learning_rate": 4.943048111632205e-06, "loss": 0.7426, "step": 1183 }, { "epoch": 0.09615853163323317, "grad_norm": 4.422671758582798, "learning_rate": 4.942908458446107e-06, "loss": 0.5256, "step": 1184 }, { "epoch": 0.09623974660927476, "grad_norm": 4.210710659594965, "learning_rate": 4.942768636223674e-06, "loss": 0.6544, "step": 1185 }, { "epoch": 0.09632096158531633, "grad_norm": 5.86594483722807, "learning_rate": 4.94262864497458e-06, "loss": 0.6285, "step": 1186 }, { "epoch": 0.09640217656135791, "grad_norm": 11.95231315351679, "learning_rate": 4.94248848470851e-06, "loss": 0.5907, "step": 1187 }, { "epoch": 0.0964833915373995, "grad_norm": 4.610476439616043, "learning_rate": 4.9423481554351636e-06, "loss": 0.5891, "step": 1188 }, { "epoch": 0.09656460651344108, "grad_norm": 15.02936414694867, "learning_rate": 4.9422076571642516e-06, "loss": 0.6007, "step": 1189 }, { "epoch": 0.09664582148948266, "grad_norm": 7.408105632703543, "learning_rate": 4.942066989905494e-06, "loss": 0.5263, "step": 1190 }, { "epoch": 0.09672703646552425, "grad_norm": 6.6983744546854105, "learning_rate": 4.941926153668626e-06, "loss": 0.6209, "step": 1191 }, { "epoch": 0.09680825144156582, "grad_norm": 12.37964028499769, "learning_rate": 4.941785148463391e-06, "loss": 0.5479, "step": 1192 }, { "epoch": 0.0968894664176074, "grad_norm": 7.949353882372964, "learning_rate": 4.941643974299547e-06, "loss": 0.6356, "step": 1193 }, { "epoch": 0.09697068139364899, "grad_norm": 12.259554197141354, "learning_rate": 4.941502631186863e-06, "loss": 0.5876, "step": 1194 }, { "epoch": 0.09705189636969057, "grad_norm": 4.5151134917162254, "learning_rate": 4.941361119135118e-06, "loss": 0.6081, "step": 1195 }, { "epoch": 0.09713311134573216, "grad_norm": 5.027871122019048, "learning_rate": 4.941219438154103e-06, "loss": 0.7848, "step": 1196 }, { "epoch": 0.09721432632177374, "grad_norm": 6.044100876833535, "learning_rate": 4.941077588253624e-06, "loss": 0.6873, "step": 1197 }, { "epoch": 0.09729554129781531, "grad_norm": 5.889644758237491, "learning_rate": 4.940935569443496e-06, "loss": 0.5557, "step": 1198 }, { "epoch": 0.0973767562738569, "grad_norm": 15.297541275971485, "learning_rate": 4.940793381733544e-06, "loss": 0.547, "step": 1199 }, { "epoch": 0.09745797124989848, "grad_norm": 12.207281968202853, "learning_rate": 4.940651025133607e-06, "loss": 0.7011, "step": 1200 }, { "epoch": 0.09753918622594006, "grad_norm": 5.195830064818431, "learning_rate": 4.9405084996535376e-06, "loss": 0.7094, "step": 1201 }, { "epoch": 0.09762040120198165, "grad_norm": 4.519169304548104, "learning_rate": 4.940365805303195e-06, "loss": 0.5373, "step": 1202 }, { "epoch": 0.09770161617802323, "grad_norm": 4.18578492619036, "learning_rate": 4.940222942092455e-06, "loss": 0.5373, "step": 1203 }, { "epoch": 0.0977828311540648, "grad_norm": 5.871433014218688, "learning_rate": 4.940079910031201e-06, "loss": 0.6525, "step": 1204 }, { "epoch": 0.0978640461301064, "grad_norm": 7.770798201538124, "learning_rate": 4.939936709129333e-06, "loss": 0.6541, "step": 1205 }, { "epoch": 0.09794526110614797, "grad_norm": 10.00163900865208, "learning_rate": 4.939793339396756e-06, "loss": 0.6166, "step": 1206 }, { "epoch": 0.09802647608218956, "grad_norm": 6.793365007116543, "learning_rate": 4.939649800843394e-06, "loss": 0.5834, "step": 1207 }, { "epoch": 0.09810769105823114, "grad_norm": 3.8998039709967927, "learning_rate": 4.939506093479176e-06, "loss": 0.654, "step": 1208 }, { "epoch": 0.09818890603427272, "grad_norm": 3.5977727046620473, "learning_rate": 4.939362217314048e-06, "loss": 0.5947, "step": 1209 }, { "epoch": 0.0982701210103143, "grad_norm": 3.430672786433987, "learning_rate": 4.939218172357965e-06, "loss": 0.4971, "step": 1210 }, { "epoch": 0.09835133598635588, "grad_norm": 4.703232976940314, "learning_rate": 4.9390739586208926e-06, "loss": 0.6256, "step": 1211 }, { "epoch": 0.09843255096239746, "grad_norm": 8.889711877291688, "learning_rate": 4.938929576112812e-06, "loss": 0.6425, "step": 1212 }, { "epoch": 0.09851376593843905, "grad_norm": 5.215270092203213, "learning_rate": 4.938785024843712e-06, "loss": 0.5402, "step": 1213 }, { "epoch": 0.09859498091448063, "grad_norm": 4.480490370978996, "learning_rate": 4.938640304823596e-06, "loss": 0.4592, "step": 1214 }, { "epoch": 0.0986761958905222, "grad_norm": 4.5775356603291995, "learning_rate": 4.938495416062477e-06, "loss": 0.5733, "step": 1215 }, { "epoch": 0.0987574108665638, "grad_norm": 5.374607118889353, "learning_rate": 4.93835035857038e-06, "loss": 0.4699, "step": 1216 }, { "epoch": 0.09883862584260537, "grad_norm": 8.062779666258928, "learning_rate": 4.938205132357344e-06, "loss": 0.6582, "step": 1217 }, { "epoch": 0.09891984081864696, "grad_norm": 5.276936843727084, "learning_rate": 4.938059737433416e-06, "loss": 0.4957, "step": 1218 }, { "epoch": 0.09900105579468854, "grad_norm": 5.321609718799565, "learning_rate": 4.9379141738086575e-06, "loss": 0.5664, "step": 1219 }, { "epoch": 0.09908227077073012, "grad_norm": 4.479754054431104, "learning_rate": 4.9377684414931415e-06, "loss": 0.7467, "step": 1220 }, { "epoch": 0.09916348574677171, "grad_norm": 4.74529856971756, "learning_rate": 4.937622540496951e-06, "loss": 0.5793, "step": 1221 }, { "epoch": 0.09924470072281329, "grad_norm": 3.911535284660022, "learning_rate": 4.937476470830181e-06, "loss": 0.6115, "step": 1222 }, { "epoch": 0.09932591569885486, "grad_norm": 4.166632824497643, "learning_rate": 4.937330232502939e-06, "loss": 0.5927, "step": 1223 }, { "epoch": 0.09940713067489645, "grad_norm": 5.587700034937294, "learning_rate": 4.937183825525346e-06, "loss": 0.9838, "step": 1224 }, { "epoch": 0.09948834565093803, "grad_norm": 4.01441412316951, "learning_rate": 4.937037249907529e-06, "loss": 0.6088, "step": 1225 }, { "epoch": 0.09956956062697961, "grad_norm": 5.422702256253332, "learning_rate": 4.9368905056596336e-06, "loss": 0.8051, "step": 1226 }, { "epoch": 0.0996507756030212, "grad_norm": 4.730421753529755, "learning_rate": 4.936743592791812e-06, "loss": 0.8022, "step": 1227 }, { "epoch": 0.09973199057906278, "grad_norm": 6.417481219598529, "learning_rate": 4.936596511314229e-06, "loss": 0.7359, "step": 1228 }, { "epoch": 0.09981320555510437, "grad_norm": 5.936442260399057, "learning_rate": 4.936449261237064e-06, "loss": 0.4835, "step": 1229 }, { "epoch": 0.09989442053114594, "grad_norm": 5.161367234189236, "learning_rate": 4.936301842570505e-06, "loss": 0.6098, "step": 1230 }, { "epoch": 0.09997563550718752, "grad_norm": 6.171812579141409, "learning_rate": 4.936154255324751e-06, "loss": 0.5208, "step": 1231 }, { "epoch": 0.10005685048322911, "grad_norm": 6.898043493723934, "learning_rate": 4.936006499510016e-06, "loss": 0.6272, "step": 1232 }, { "epoch": 0.10013806545927069, "grad_norm": 5.279799808361505, "learning_rate": 4.935858575136525e-06, "loss": 0.6761, "step": 1233 }, { "epoch": 0.10021928043531227, "grad_norm": 6.1780863372983, "learning_rate": 4.935710482214512e-06, "loss": 0.5666, "step": 1234 }, { "epoch": 0.10030049541135386, "grad_norm": 4.127005405020764, "learning_rate": 4.935562220754224e-06, "loss": 0.7762, "step": 1235 }, { "epoch": 0.10038171038739543, "grad_norm": 6.939945712344684, "learning_rate": 4.935413790765919e-06, "loss": 0.5601, "step": 1236 }, { "epoch": 0.10046292536343701, "grad_norm": 5.4373785364212965, "learning_rate": 4.935265192259871e-06, "loss": 0.5489, "step": 1237 }, { "epoch": 0.1005441403394786, "grad_norm": 3.19887214936069, "learning_rate": 4.935116425246359e-06, "loss": 0.6456, "step": 1238 }, { "epoch": 0.10062535531552018, "grad_norm": 7.543707007832977, "learning_rate": 4.934967489735679e-06, "loss": 0.5061, "step": 1239 }, { "epoch": 0.10070657029156177, "grad_norm": 4.008848618860738, "learning_rate": 4.934818385738135e-06, "loss": 0.6719, "step": 1240 }, { "epoch": 0.10078778526760335, "grad_norm": 4.006099731057698, "learning_rate": 4.934669113264044e-06, "loss": 0.6852, "step": 1241 }, { "epoch": 0.10086900024364492, "grad_norm": 8.793769739342025, "learning_rate": 4.934519672323737e-06, "loss": 0.5916, "step": 1242 }, { "epoch": 0.10095021521968651, "grad_norm": 5.1218716406597835, "learning_rate": 4.9343700629275525e-06, "loss": 0.4989, "step": 1243 }, { "epoch": 0.10103143019572809, "grad_norm": 4.3513298577507245, "learning_rate": 4.934220285085843e-06, "loss": 0.5374, "step": 1244 }, { "epoch": 0.10111264517176967, "grad_norm": 5.858277317975228, "learning_rate": 4.934070338808974e-06, "loss": 0.4365, "step": 1245 }, { "epoch": 0.10119386014781126, "grad_norm": 4.971141537823225, "learning_rate": 4.933920224107319e-06, "loss": 0.6175, "step": 1246 }, { "epoch": 0.10127507512385284, "grad_norm": 7.572976492490089, "learning_rate": 4.933769940991266e-06, "loss": 0.6484, "step": 1247 }, { "epoch": 0.10135629009989441, "grad_norm": 7.466443178328625, "learning_rate": 4.933619489471213e-06, "loss": 0.6078, "step": 1248 }, { "epoch": 0.101437505075936, "grad_norm": 3.837978031503154, "learning_rate": 4.933468869557572e-06, "loss": 0.672, "step": 1249 }, { "epoch": 0.10151872005197758, "grad_norm": 4.587774401604302, "learning_rate": 4.933318081260763e-06, "loss": 0.6828, "step": 1250 }, { "epoch": 0.10159993502801917, "grad_norm": 5.728533677177494, "learning_rate": 4.933167124591222e-06, "loss": 0.5304, "step": 1251 }, { "epoch": 0.10168115000406075, "grad_norm": 7.693599571430989, "learning_rate": 4.9330159995593926e-06, "loss": 0.6125, "step": 1252 }, { "epoch": 0.10176236498010233, "grad_norm": 5.284936139752091, "learning_rate": 4.9328647061757326e-06, "loss": 0.6306, "step": 1253 }, { "epoch": 0.10184357995614392, "grad_norm": 4.502661220996693, "learning_rate": 4.932713244450712e-06, "loss": 0.6503, "step": 1254 }, { "epoch": 0.1019247949321855, "grad_norm": 30.435010504189158, "learning_rate": 4.932561614394809e-06, "loss": 0.4843, "step": 1255 }, { "epoch": 0.10200600990822707, "grad_norm": 3.2680414669887226, "learning_rate": 4.932409816018516e-06, "loss": 0.5335, "step": 1256 }, { "epoch": 0.10208722488426866, "grad_norm": 5.550536415470609, "learning_rate": 4.932257849332337e-06, "loss": 0.6274, "step": 1257 }, { "epoch": 0.10216843986031024, "grad_norm": 6.490325484791516, "learning_rate": 4.932105714346788e-06, "loss": 0.7055, "step": 1258 }, { "epoch": 0.10224965483635182, "grad_norm": 4.635391717749872, "learning_rate": 4.931953411072395e-06, "loss": 0.7677, "step": 1259 }, { "epoch": 0.10233086981239341, "grad_norm": 5.213698736457178, "learning_rate": 4.931800939519697e-06, "loss": 0.6283, "step": 1260 }, { "epoch": 0.10241208478843498, "grad_norm": 12.81670585201401, "learning_rate": 4.931648299699245e-06, "loss": 0.5411, "step": 1261 }, { "epoch": 0.10249329976447658, "grad_norm": 4.7979729375829425, "learning_rate": 4.931495491621598e-06, "loss": 0.6, "step": 1262 }, { "epoch": 0.10257451474051815, "grad_norm": 5.738885766711591, "learning_rate": 4.931342515297333e-06, "loss": 0.6029, "step": 1263 }, { "epoch": 0.10265572971655973, "grad_norm": 4.5708865991196586, "learning_rate": 4.931189370737033e-06, "loss": 0.6877, "step": 1264 }, { "epoch": 0.10273694469260132, "grad_norm": 4.292203823138975, "learning_rate": 4.931036057951295e-06, "loss": 0.6054, "step": 1265 }, { "epoch": 0.1028181596686429, "grad_norm": 5.094248884490348, "learning_rate": 4.930882576950728e-06, "loss": 1.016, "step": 1266 }, { "epoch": 0.10289937464468447, "grad_norm": 7.74744023190108, "learning_rate": 4.930728927745954e-06, "loss": 0.6266, "step": 1267 }, { "epoch": 0.10298058962072607, "grad_norm": 7.178060412399686, "learning_rate": 4.930575110347601e-06, "loss": 0.5436, "step": 1268 }, { "epoch": 0.10306180459676764, "grad_norm": 4.8955814375005, "learning_rate": 4.9304211247663135e-06, "loss": 0.6069, "step": 1269 }, { "epoch": 0.10314301957280922, "grad_norm": 6.4873010558747675, "learning_rate": 4.930266971012748e-06, "loss": 0.5739, "step": 1270 }, { "epoch": 0.10322423454885081, "grad_norm": 5.62659346934576, "learning_rate": 4.930112649097569e-06, "loss": 0.7317, "step": 1271 }, { "epoch": 0.10330544952489239, "grad_norm": 5.596039021432254, "learning_rate": 4.929958159031457e-06, "loss": 0.4922, "step": 1272 }, { "epoch": 0.10338666450093398, "grad_norm": 5.488922386676459, "learning_rate": 4.9298035008251e-06, "loss": 0.5276, "step": 1273 }, { "epoch": 0.10346787947697555, "grad_norm": 5.79593069742807, "learning_rate": 4.929648674489201e-06, "loss": 0.5529, "step": 1274 }, { "epoch": 0.10354909445301713, "grad_norm": 9.122388238453604, "learning_rate": 4.929493680034472e-06, "loss": 0.6647, "step": 1275 }, { "epoch": 0.10363030942905872, "grad_norm": 4.654226716442984, "learning_rate": 4.929338517471638e-06, "loss": 0.6721, "step": 1276 }, { "epoch": 0.1037115244051003, "grad_norm": 4.921506870025976, "learning_rate": 4.929183186811436e-06, "loss": 0.4812, "step": 1277 }, { "epoch": 0.10379273938114188, "grad_norm": 8.206987541104693, "learning_rate": 4.9290276880646144e-06, "loss": 0.6183, "step": 1278 }, { "epoch": 0.10387395435718347, "grad_norm": 6.664251212109854, "learning_rate": 4.928872021241932e-06, "loss": 0.5676, "step": 1279 }, { "epoch": 0.10395516933322504, "grad_norm": 5.059968838894893, "learning_rate": 4.92871618635416e-06, "loss": 0.5516, "step": 1280 }, { "epoch": 0.10403638430926662, "grad_norm": 4.462582725976913, "learning_rate": 4.928560183412081e-06, "loss": 0.637, "step": 1281 }, { "epoch": 0.10411759928530821, "grad_norm": 13.013107165252704, "learning_rate": 4.928404012426491e-06, "loss": 0.585, "step": 1282 }, { "epoch": 0.10419881426134979, "grad_norm": 4.877077629277337, "learning_rate": 4.9282476734081955e-06, "loss": 0.4416, "step": 1283 }, { "epoch": 0.10428002923739138, "grad_norm": 5.339858862218509, "learning_rate": 4.928091166368013e-06, "loss": 0.534, "step": 1284 }, { "epoch": 0.10436124421343296, "grad_norm": 4.638726269664911, "learning_rate": 4.927934491316771e-06, "loss": 0.5402, "step": 1285 }, { "epoch": 0.10444245918947453, "grad_norm": 5.279866806349571, "learning_rate": 4.927777648265313e-06, "loss": 0.8809, "step": 1286 }, { "epoch": 0.10452367416551613, "grad_norm": 10.172012498738642, "learning_rate": 4.927620637224489e-06, "loss": 0.6804, "step": 1287 }, { "epoch": 0.1046048891415577, "grad_norm": 4.0125921901692605, "learning_rate": 4.927463458205167e-06, "loss": 0.484, "step": 1288 }, { "epoch": 0.10468610411759928, "grad_norm": 5.038605450511745, "learning_rate": 4.9273061112182195e-06, "loss": 0.4823, "step": 1289 }, { "epoch": 0.10476731909364087, "grad_norm": 11.514242344873601, "learning_rate": 4.9271485962745356e-06, "loss": 0.5736, "step": 1290 }, { "epoch": 0.10484853406968245, "grad_norm": 5.2507131812652705, "learning_rate": 4.9269909133850146e-06, "loss": 0.4464, "step": 1291 }, { "epoch": 0.10492974904572404, "grad_norm": 3.587842597108573, "learning_rate": 4.926833062560566e-06, "loss": 0.5488, "step": 1292 }, { "epoch": 0.10501096402176562, "grad_norm": 4.9325898595523405, "learning_rate": 4.926675043812115e-06, "loss": 0.7937, "step": 1293 }, { "epoch": 0.10509217899780719, "grad_norm": 4.089069618738186, "learning_rate": 4.926516857150593e-06, "loss": 0.6927, "step": 1294 }, { "epoch": 0.10517339397384878, "grad_norm": 4.157543455041517, "learning_rate": 4.926358502586948e-06, "loss": 0.6499, "step": 1295 }, { "epoch": 0.10525460894989036, "grad_norm": 4.591475297442821, "learning_rate": 4.9261999801321345e-06, "loss": 0.6702, "step": 1296 }, { "epoch": 0.10533582392593194, "grad_norm": 5.076099157718857, "learning_rate": 4.9260412897971225e-06, "loss": 0.5649, "step": 1297 }, { "epoch": 0.10541703890197353, "grad_norm": 5.92574396887693, "learning_rate": 4.9258824315928935e-06, "loss": 0.6207, "step": 1298 }, { "epoch": 0.1054982538780151, "grad_norm": 4.295889777812483, "learning_rate": 4.925723405530439e-06, "loss": 0.6171, "step": 1299 }, { "epoch": 0.10557946885405668, "grad_norm": 6.0185008543132055, "learning_rate": 4.925564211620764e-06, "loss": 0.6708, "step": 1300 }, { "epoch": 0.10566068383009827, "grad_norm": 4.288628260503543, "learning_rate": 4.9254048498748804e-06, "loss": 0.5312, "step": 1301 }, { "epoch": 0.10574189880613985, "grad_norm": 5.912622580277487, "learning_rate": 4.925245320303819e-06, "loss": 0.6256, "step": 1302 }, { "epoch": 0.10582311378218144, "grad_norm": 4.036381733012363, "learning_rate": 4.925085622918618e-06, "loss": 0.6512, "step": 1303 }, { "epoch": 0.10590432875822302, "grad_norm": 11.770792105747262, "learning_rate": 4.924925757730324e-06, "loss": 0.6243, "step": 1304 }, { "epoch": 0.1059855437342646, "grad_norm": 7.6314249401022085, "learning_rate": 4.924765724750002e-06, "loss": 0.5521, "step": 1305 }, { "epoch": 0.10606675871030619, "grad_norm": 4.238420473450789, "learning_rate": 4.9246055239887255e-06, "loss": 0.626, "step": 1306 }, { "epoch": 0.10614797368634776, "grad_norm": 5.023366885034705, "learning_rate": 4.924445155457578e-06, "loss": 0.7469, "step": 1307 }, { "epoch": 0.10622918866238934, "grad_norm": 5.927044022062121, "learning_rate": 4.924284619167657e-06, "loss": 0.6673, "step": 1308 }, { "epoch": 0.10631040363843093, "grad_norm": 27.667866009821907, "learning_rate": 4.924123915130072e-06, "loss": 0.4591, "step": 1309 }, { "epoch": 0.10639161861447251, "grad_norm": 6.067033293424752, "learning_rate": 4.92396304335594e-06, "loss": 0.7129, "step": 1310 }, { "epoch": 0.10647283359051408, "grad_norm": 4.48662582291358, "learning_rate": 4.923802003856395e-06, "loss": 0.7251, "step": 1311 }, { "epoch": 0.10655404856655568, "grad_norm": 8.910396684118473, "learning_rate": 4.923640796642578e-06, "loss": 0.5621, "step": 1312 }, { "epoch": 0.10663526354259725, "grad_norm": 10.709244615053429, "learning_rate": 4.923479421725646e-06, "loss": 0.7027, "step": 1313 }, { "epoch": 0.10671647851863884, "grad_norm": 6.558186461825374, "learning_rate": 4.923317879116764e-06, "loss": 0.6991, "step": 1314 }, { "epoch": 0.10679769349468042, "grad_norm": 4.402808201885828, "learning_rate": 4.923156168827109e-06, "loss": 0.5193, "step": 1315 }, { "epoch": 0.106878908470722, "grad_norm": 3.9531074248473677, "learning_rate": 4.922994290867872e-06, "loss": 0.5984, "step": 1316 }, { "epoch": 0.10696012344676359, "grad_norm": 5.634957726444024, "learning_rate": 4.922832245250254e-06, "loss": 0.5759, "step": 1317 }, { "epoch": 0.10704133842280517, "grad_norm": 4.687012040425469, "learning_rate": 4.922670031985467e-06, "loss": 0.5291, "step": 1318 }, { "epoch": 0.10712255339884674, "grad_norm": 7.331234418642, "learning_rate": 4.922507651084736e-06, "loss": 0.631, "step": 1319 }, { "epoch": 0.10720376837488833, "grad_norm": 6.179571341056749, "learning_rate": 4.9223451025592965e-06, "loss": 0.5047, "step": 1320 }, { "epoch": 0.10728498335092991, "grad_norm": 4.797755420619416, "learning_rate": 4.9221823864203955e-06, "loss": 0.5616, "step": 1321 }, { "epoch": 0.10736619832697149, "grad_norm": 4.7796380411767885, "learning_rate": 4.922019502679292e-06, "loss": 0.7038, "step": 1322 }, { "epoch": 0.10744741330301308, "grad_norm": 4.880297830435732, "learning_rate": 4.921856451347258e-06, "loss": 0.8187, "step": 1323 }, { "epoch": 0.10752862827905466, "grad_norm": 4.4975518069828295, "learning_rate": 4.9216932324355755e-06, "loss": 0.6288, "step": 1324 }, { "epoch": 0.10760984325509625, "grad_norm": 5.693686439938185, "learning_rate": 4.921529845955537e-06, "loss": 0.5948, "step": 1325 }, { "epoch": 0.10769105823113782, "grad_norm": 9.014157893199854, "learning_rate": 4.9213662919184495e-06, "loss": 0.7244, "step": 1326 }, { "epoch": 0.1077722732071794, "grad_norm": 17.656772040857238, "learning_rate": 4.921202570335629e-06, "loss": 0.5731, "step": 1327 }, { "epoch": 0.10785348818322099, "grad_norm": 7.338056929014862, "learning_rate": 4.921038681218405e-06, "loss": 0.5082, "step": 1328 }, { "epoch": 0.10793470315926257, "grad_norm": 4.851712422566327, "learning_rate": 4.920874624578118e-06, "loss": 0.7288, "step": 1329 }, { "epoch": 0.10801591813530415, "grad_norm": 5.876176027514485, "learning_rate": 4.920710400426118e-06, "loss": 0.4265, "step": 1330 }, { "epoch": 0.10809713311134574, "grad_norm": 5.880882787741879, "learning_rate": 4.920546008773771e-06, "loss": 0.4902, "step": 1331 }, { "epoch": 0.10817834808738731, "grad_norm": 5.58193279698704, "learning_rate": 4.920381449632451e-06, "loss": 0.7027, "step": 1332 }, { "epoch": 0.10825956306342889, "grad_norm": 5.173776451487993, "learning_rate": 4.920216723013544e-06, "loss": 0.646, "step": 1333 }, { "epoch": 0.10834077803947048, "grad_norm": 4.388587188343882, "learning_rate": 4.920051828928448e-06, "loss": 0.4977, "step": 1334 }, { "epoch": 0.10842199301551206, "grad_norm": 4.776839443402457, "learning_rate": 4.919886767388573e-06, "loss": 0.5767, "step": 1335 }, { "epoch": 0.10850320799155365, "grad_norm": 3.4824258344375725, "learning_rate": 4.919721538405341e-06, "loss": 0.533, "step": 1336 }, { "epoch": 0.10858442296759523, "grad_norm": 4.742639309949176, "learning_rate": 4.919556141990186e-06, "loss": 0.6688, "step": 1337 }, { "epoch": 0.1086656379436368, "grad_norm": 4.653988617546264, "learning_rate": 4.919390578154551e-06, "loss": 0.631, "step": 1338 }, { "epoch": 0.1087468529196784, "grad_norm": 5.238136207553621, "learning_rate": 4.919224846909891e-06, "loss": 0.5437, "step": 1339 }, { "epoch": 0.10882806789571997, "grad_norm": 6.815351329648775, "learning_rate": 4.919058948267677e-06, "loss": 0.8166, "step": 1340 }, { "epoch": 0.10890928287176155, "grad_norm": 5.2640299180243355, "learning_rate": 4.918892882239384e-06, "loss": 0.6044, "step": 1341 }, { "epoch": 0.10899049784780314, "grad_norm": 4.358517148619114, "learning_rate": 4.918726648836507e-06, "loss": 0.7873, "step": 1342 }, { "epoch": 0.10907171282384472, "grad_norm": 6.052945574950095, "learning_rate": 4.918560248070547e-06, "loss": 0.5615, "step": 1343 }, { "epoch": 0.10915292779988629, "grad_norm": 4.566253758686044, "learning_rate": 4.918393679953018e-06, "loss": 0.5893, "step": 1344 }, { "epoch": 0.10923414277592788, "grad_norm": 3.497997250474178, "learning_rate": 4.918226944495445e-06, "loss": 0.6582, "step": 1345 }, { "epoch": 0.10931535775196946, "grad_norm": 6.920939485791569, "learning_rate": 4.918060041709366e-06, "loss": 0.563, "step": 1346 }, { "epoch": 0.10939657272801105, "grad_norm": 13.15024709863077, "learning_rate": 4.917892971606329e-06, "loss": 0.5954, "step": 1347 }, { "epoch": 0.10947778770405263, "grad_norm": 4.208801375695993, "learning_rate": 4.917725734197896e-06, "loss": 0.548, "step": 1348 }, { "epoch": 0.1095590026800942, "grad_norm": 4.648874348064858, "learning_rate": 4.917558329495636e-06, "loss": 0.6602, "step": 1349 }, { "epoch": 0.1096402176561358, "grad_norm": 4.308899285098474, "learning_rate": 4.917390757511136e-06, "loss": 0.5401, "step": 1350 }, { "epoch": 0.10972143263217737, "grad_norm": 5.129516631257819, "learning_rate": 4.917223018255989e-06, "loss": 0.5193, "step": 1351 }, { "epoch": 0.10980264760821895, "grad_norm": 6.107816197257616, "learning_rate": 4.917055111741802e-06, "loss": 0.5366, "step": 1352 }, { "epoch": 0.10988386258426054, "grad_norm": 4.707618481744731, "learning_rate": 4.916887037980193e-06, "loss": 0.5354, "step": 1353 }, { "epoch": 0.10996507756030212, "grad_norm": 4.1837375718082255, "learning_rate": 4.916718796982793e-06, "loss": 0.7129, "step": 1354 }, { "epoch": 0.1100462925363437, "grad_norm": 3.4672179946418975, "learning_rate": 4.916550388761242e-06, "loss": 0.5567, "step": 1355 }, { "epoch": 0.11012750751238529, "grad_norm": 4.01249160395805, "learning_rate": 4.916381813327194e-06, "loss": 0.5612, "step": 1356 }, { "epoch": 0.11020872248842686, "grad_norm": 4.478566912854505, "learning_rate": 4.916213070692312e-06, "loss": 0.5274, "step": 1357 }, { "epoch": 0.11028993746446845, "grad_norm": 7.752567209797171, "learning_rate": 4.916044160868273e-06, "loss": 0.5645, "step": 1358 }, { "epoch": 0.11037115244051003, "grad_norm": 4.915949482614922, "learning_rate": 4.915875083866766e-06, "loss": 0.5816, "step": 1359 }, { "epoch": 0.11045236741655161, "grad_norm": 4.872300208909787, "learning_rate": 4.915705839699488e-06, "loss": 0.5693, "step": 1360 }, { "epoch": 0.1105335823925932, "grad_norm": 6.8669686866426645, "learning_rate": 4.915536428378152e-06, "loss": 0.6373, "step": 1361 }, { "epoch": 0.11061479736863478, "grad_norm": 9.217173762118762, "learning_rate": 4.915366849914479e-06, "loss": 0.5895, "step": 1362 }, { "epoch": 0.11069601234467635, "grad_norm": 5.851967895961872, "learning_rate": 4.915197104320203e-06, "loss": 0.6099, "step": 1363 }, { "epoch": 0.11077722732071794, "grad_norm": 6.051580089476815, "learning_rate": 4.915027191607069e-06, "loss": 0.6278, "step": 1364 }, { "epoch": 0.11085844229675952, "grad_norm": 4.182706621669257, "learning_rate": 4.914857111786835e-06, "loss": 0.7995, "step": 1365 }, { "epoch": 0.1109396572728011, "grad_norm": 6.4377665961998, "learning_rate": 4.9146868648712694e-06, "loss": 0.5338, "step": 1366 }, { "epoch": 0.11102087224884269, "grad_norm": 4.1795950899223255, "learning_rate": 4.914516450872152e-06, "loss": 0.4989, "step": 1367 }, { "epoch": 0.11110208722488427, "grad_norm": 5.590546828289337, "learning_rate": 4.914345869801276e-06, "loss": 0.5973, "step": 1368 }, { "epoch": 0.11118330220092586, "grad_norm": 6.538514315603108, "learning_rate": 4.914175121670443e-06, "loss": 0.5177, "step": 1369 }, { "epoch": 0.11126451717696743, "grad_norm": 3.691433117466898, "learning_rate": 4.914004206491467e-06, "loss": 0.5844, "step": 1370 }, { "epoch": 0.11134573215300901, "grad_norm": 4.522420652396943, "learning_rate": 4.913833124276177e-06, "loss": 0.5717, "step": 1371 }, { "epoch": 0.1114269471290506, "grad_norm": 5.582588983452335, "learning_rate": 4.9136618750364105e-06, "loss": 0.5929, "step": 1372 }, { "epoch": 0.11150816210509218, "grad_norm": 3.5822825901443145, "learning_rate": 4.913490458784016e-06, "loss": 0.6584, "step": 1373 }, { "epoch": 0.11158937708113376, "grad_norm": 6.955323322713761, "learning_rate": 4.913318875530855e-06, "loss": 0.7448, "step": 1374 }, { "epoch": 0.11167059205717535, "grad_norm": 7.999732286808689, "learning_rate": 4.9131471252887995e-06, "loss": 0.5943, "step": 1375 }, { "epoch": 0.11175180703321692, "grad_norm": 5.699553884482589, "learning_rate": 4.912975208069735e-06, "loss": 0.5888, "step": 1376 }, { "epoch": 0.1118330220092585, "grad_norm": 9.937720162090995, "learning_rate": 4.912803123885555e-06, "loss": 0.579, "step": 1377 }, { "epoch": 0.11191423698530009, "grad_norm": 6.426480257301145, "learning_rate": 4.912630872748171e-06, "loss": 0.4428, "step": 1378 }, { "epoch": 0.11199545196134167, "grad_norm": 4.6331109760310625, "learning_rate": 4.912458454669498e-06, "loss": 0.5904, "step": 1379 }, { "epoch": 0.11207666693738326, "grad_norm": 5.28940520266137, "learning_rate": 4.912285869661467e-06, "loss": 0.5371, "step": 1380 }, { "epoch": 0.11215788191342484, "grad_norm": 3.8233787152045418, "learning_rate": 4.912113117736022e-06, "loss": 0.7556, "step": 1381 }, { "epoch": 0.11223909688946641, "grad_norm": 5.065689586582994, "learning_rate": 4.911940198905114e-06, "loss": 0.6213, "step": 1382 }, { "epoch": 0.112320311865508, "grad_norm": 6.1114483413633245, "learning_rate": 4.91176711318071e-06, "loss": 0.6723, "step": 1383 }, { "epoch": 0.11240152684154958, "grad_norm": 7.783537136958568, "learning_rate": 4.911593860574785e-06, "loss": 0.5786, "step": 1384 }, { "epoch": 0.11248274181759116, "grad_norm": 8.312210011930208, "learning_rate": 4.911420441099329e-06, "loss": 0.6476, "step": 1385 }, { "epoch": 0.11256395679363275, "grad_norm": 4.230328963619588, "learning_rate": 4.911246854766341e-06, "loss": 0.686, "step": 1386 }, { "epoch": 0.11264517176967433, "grad_norm": 12.499607763263779, "learning_rate": 4.911073101587831e-06, "loss": 0.4509, "step": 1387 }, { "epoch": 0.1127263867457159, "grad_norm": 4.97136722337538, "learning_rate": 4.9108991815758225e-06, "loss": 0.8017, "step": 1388 }, { "epoch": 0.1128076017217575, "grad_norm": 4.68367623760023, "learning_rate": 4.9107250947423516e-06, "loss": 0.574, "step": 1389 }, { "epoch": 0.11288881669779907, "grad_norm": 4.5482183003098, "learning_rate": 4.910550841099462e-06, "loss": 0.6234, "step": 1390 }, { "epoch": 0.11297003167384066, "grad_norm": 3.734134582087491, "learning_rate": 4.910376420659211e-06, "loss": 0.7674, "step": 1391 }, { "epoch": 0.11305124664988224, "grad_norm": 9.222540748041622, "learning_rate": 4.91020183343367e-06, "loss": 0.6176, "step": 1392 }, { "epoch": 0.11313246162592382, "grad_norm": 7.2087300631114095, "learning_rate": 4.910027079434917e-06, "loss": 0.7806, "step": 1393 }, { "epoch": 0.11321367660196541, "grad_norm": 4.209780015944576, "learning_rate": 4.909852158675045e-06, "loss": 0.6478, "step": 1394 }, { "epoch": 0.11329489157800698, "grad_norm": 4.672108189183994, "learning_rate": 4.9096770711661575e-06, "loss": 0.7015, "step": 1395 }, { "epoch": 0.11337610655404856, "grad_norm": 5.391522206407413, "learning_rate": 4.90950181692037e-06, "loss": 0.6014, "step": 1396 }, { "epoch": 0.11345732153009015, "grad_norm": 6.512018993656574, "learning_rate": 4.909326395949809e-06, "loss": 0.6083, "step": 1397 }, { "epoch": 0.11353853650613173, "grad_norm": 5.621883708458305, "learning_rate": 4.909150808266613e-06, "loss": 0.6495, "step": 1398 }, { "epoch": 0.1136197514821733, "grad_norm": 7.16365741465048, "learning_rate": 4.908975053882931e-06, "loss": 0.5368, "step": 1399 }, { "epoch": 0.1137009664582149, "grad_norm": 5.860792942986273, "learning_rate": 4.908799132810924e-06, "loss": 0.7422, "step": 1400 }, { "epoch": 0.11378218143425647, "grad_norm": 3.673990330967191, "learning_rate": 4.9086230450627655e-06, "loss": 0.61, "step": 1401 }, { "epoch": 0.11386339641029807, "grad_norm": 3.927055438430478, "learning_rate": 4.908446790650641e-06, "loss": 0.7079, "step": 1402 }, { "epoch": 0.11394461138633964, "grad_norm": 7.144929200227564, "learning_rate": 4.908270369586744e-06, "loss": 0.5993, "step": 1403 }, { "epoch": 0.11402582636238122, "grad_norm": 4.458920511681134, "learning_rate": 4.908093781883283e-06, "loss": 0.7028, "step": 1404 }, { "epoch": 0.11410704133842281, "grad_norm": 4.699619928678632, "learning_rate": 4.9079170275524765e-06, "loss": 0.5911, "step": 1405 }, { "epoch": 0.11418825631446439, "grad_norm": 5.941174948945086, "learning_rate": 4.907740106606557e-06, "loss": 0.5615, "step": 1406 }, { "epoch": 0.11426947129050596, "grad_norm": 6.471908401673319, "learning_rate": 4.9075630190577634e-06, "loss": 0.5378, "step": 1407 }, { "epoch": 0.11435068626654755, "grad_norm": 3.9643546125770825, "learning_rate": 4.907385764918351e-06, "loss": 0.6547, "step": 1408 }, { "epoch": 0.11443190124258913, "grad_norm": 4.878410036784967, "learning_rate": 4.907208344200585e-06, "loss": 0.6994, "step": 1409 }, { "epoch": 0.11451311621863071, "grad_norm": 5.156488131990337, "learning_rate": 4.907030756916741e-06, "loss": 0.5712, "step": 1410 }, { "epoch": 0.1145943311946723, "grad_norm": 4.790335390127153, "learning_rate": 4.906853003079108e-06, "loss": 0.6316, "step": 1411 }, { "epoch": 0.11467554617071388, "grad_norm": 4.153017717675372, "learning_rate": 4.9066750826999855e-06, "loss": 0.6906, "step": 1412 }, { "epoch": 0.11475676114675547, "grad_norm": 3.87389513168029, "learning_rate": 4.906496995791684e-06, "loss": 0.512, "step": 1413 }, { "epoch": 0.11483797612279704, "grad_norm": 4.564186801844, "learning_rate": 4.906318742366527e-06, "loss": 0.5193, "step": 1414 }, { "epoch": 0.11491919109883862, "grad_norm": 5.555381562592889, "learning_rate": 4.906140322436849e-06, "loss": 0.7675, "step": 1415 }, { "epoch": 0.11500040607488021, "grad_norm": 8.412815182964685, "learning_rate": 4.9059617360149936e-06, "loss": 0.4897, "step": 1416 }, { "epoch": 0.11508162105092179, "grad_norm": 5.9497894212896325, "learning_rate": 4.905782983113321e-06, "loss": 0.8408, "step": 1417 }, { "epoch": 0.11516283602696337, "grad_norm": 8.78483062159324, "learning_rate": 4.905604063744197e-06, "loss": 0.6732, "step": 1418 }, { "epoch": 0.11524405100300496, "grad_norm": 9.970264715659578, "learning_rate": 4.905424977920004e-06, "loss": 0.532, "step": 1419 }, { "epoch": 0.11532526597904653, "grad_norm": 5.671192067715544, "learning_rate": 4.9052457256531325e-06, "loss": 0.5852, "step": 1420 }, { "epoch": 0.11540648095508811, "grad_norm": 7.150271447974999, "learning_rate": 4.905066306955986e-06, "loss": 0.6486, "step": 1421 }, { "epoch": 0.1154876959311297, "grad_norm": 6.958143840228272, "learning_rate": 4.904886721840981e-06, "loss": 0.6012, "step": 1422 }, { "epoch": 0.11556891090717128, "grad_norm": 4.760774004700414, "learning_rate": 4.904706970320542e-06, "loss": 0.6706, "step": 1423 }, { "epoch": 0.11565012588321287, "grad_norm": 4.299376204301504, "learning_rate": 4.904527052407107e-06, "loss": 0.633, "step": 1424 }, { "epoch": 0.11573134085925445, "grad_norm": 4.095629515769702, "learning_rate": 4.904346968113126e-06, "loss": 0.8379, "step": 1425 }, { "epoch": 0.11581255583529602, "grad_norm": 6.181276965379973, "learning_rate": 4.904166717451059e-06, "loss": 0.7148, "step": 1426 }, { "epoch": 0.11589377081133762, "grad_norm": 3.8573979500957676, "learning_rate": 4.90398630043338e-06, "loss": 0.6601, "step": 1427 }, { "epoch": 0.11597498578737919, "grad_norm": 5.572617436631759, "learning_rate": 4.903805717072572e-06, "loss": 0.8698, "step": 1428 }, { "epoch": 0.11605620076342077, "grad_norm": 6.9707268541736775, "learning_rate": 4.90362496738113e-06, "loss": 0.5288, "step": 1429 }, { "epoch": 0.11613741573946236, "grad_norm": 4.5564529770959945, "learning_rate": 4.9034440513715605e-06, "loss": 0.4828, "step": 1430 }, { "epoch": 0.11621863071550394, "grad_norm": 4.109199531053192, "learning_rate": 4.9032629690563835e-06, "loss": 0.7356, "step": 1431 }, { "epoch": 0.11629984569154551, "grad_norm": 4.437378918259062, "learning_rate": 4.903081720448128e-06, "loss": 0.5222, "step": 1432 }, { "epoch": 0.1163810606675871, "grad_norm": 6.399358574915106, "learning_rate": 4.902900305559336e-06, "loss": 0.5954, "step": 1433 }, { "epoch": 0.11646227564362868, "grad_norm": 4.343118949951745, "learning_rate": 4.9027187244025594e-06, "loss": 0.5096, "step": 1434 }, { "epoch": 0.11654349061967027, "grad_norm": 4.3409440129123515, "learning_rate": 4.902536976990364e-06, "loss": 0.5396, "step": 1435 }, { "epoch": 0.11662470559571185, "grad_norm": 6.602474988405446, "learning_rate": 4.902355063335324e-06, "loss": 0.5902, "step": 1436 }, { "epoch": 0.11670592057175343, "grad_norm": 5.5514520331211425, "learning_rate": 4.902172983450029e-06, "loss": 0.8389, "step": 1437 }, { "epoch": 0.11678713554779502, "grad_norm": 6.703176030649621, "learning_rate": 4.901990737347076e-06, "loss": 0.8458, "step": 1438 }, { "epoch": 0.1168683505238366, "grad_norm": 4.454570008772595, "learning_rate": 4.901808325039077e-06, "loss": 0.6968, "step": 1439 }, { "epoch": 0.11694956549987817, "grad_norm": 4.842565250482216, "learning_rate": 4.901625746538653e-06, "loss": 0.5741, "step": 1440 }, { "epoch": 0.11703078047591976, "grad_norm": 8.75110385587914, "learning_rate": 4.901443001858438e-06, "loss": 0.5164, "step": 1441 }, { "epoch": 0.11711199545196134, "grad_norm": 4.468121329238175, "learning_rate": 4.901260091011076e-06, "loss": 0.6103, "step": 1442 }, { "epoch": 0.11719321042800292, "grad_norm": 7.4646000814738045, "learning_rate": 4.901077014009225e-06, "loss": 0.4323, "step": 1443 }, { "epoch": 0.11727442540404451, "grad_norm": 3.3258631715598055, "learning_rate": 4.900893770865552e-06, "loss": 0.5174, "step": 1444 }, { "epoch": 0.11735564038008608, "grad_norm": 8.792100552657674, "learning_rate": 4.900710361592737e-06, "loss": 0.6525, "step": 1445 }, { "epoch": 0.11743685535612768, "grad_norm": 4.961731173202635, "learning_rate": 4.9005267862034695e-06, "loss": 0.6754, "step": 1446 }, { "epoch": 0.11751807033216925, "grad_norm": 3.814651002868712, "learning_rate": 4.900343044710453e-06, "loss": 0.6968, "step": 1447 }, { "epoch": 0.11759928530821083, "grad_norm": 4.911489896369547, "learning_rate": 4.900159137126402e-06, "loss": 0.5924, "step": 1448 }, { "epoch": 0.11768050028425242, "grad_norm": 7.2633083972944945, "learning_rate": 4.899975063464042e-06, "loss": 0.5496, "step": 1449 }, { "epoch": 0.117761715260294, "grad_norm": 5.005595908510799, "learning_rate": 4.899790823736108e-06, "loss": 0.6644, "step": 1450 }, { "epoch": 0.11784293023633557, "grad_norm": 4.702266439098655, "learning_rate": 4.89960641795535e-06, "loss": 0.8283, "step": 1451 }, { "epoch": 0.11792414521237717, "grad_norm": 6.422288029967876, "learning_rate": 4.899421846134529e-06, "loss": 0.5909, "step": 1452 }, { "epoch": 0.11800536018841874, "grad_norm": 6.425496839266026, "learning_rate": 4.899237108286414e-06, "loss": 0.499, "step": 1453 }, { "epoch": 0.11808657516446033, "grad_norm": 3.4591295878059247, "learning_rate": 4.8990522044237884e-06, "loss": 0.5702, "step": 1454 }, { "epoch": 0.11816779014050191, "grad_norm": 4.321892903241153, "learning_rate": 4.898867134559448e-06, "loss": 0.4269, "step": 1455 }, { "epoch": 0.11824900511654349, "grad_norm": 6.352156069028349, "learning_rate": 4.898681898706197e-06, "loss": 0.5002, "step": 1456 }, { "epoch": 0.11833022009258508, "grad_norm": 4.739299344449501, "learning_rate": 4.898496496876854e-06, "loss": 0.7199, "step": 1457 }, { "epoch": 0.11841143506862666, "grad_norm": 5.475577877089898, "learning_rate": 4.898310929084247e-06, "loss": 0.6848, "step": 1458 }, { "epoch": 0.11849265004466823, "grad_norm": 4.312752315143542, "learning_rate": 4.898125195341217e-06, "loss": 0.6698, "step": 1459 }, { "epoch": 0.11857386502070982, "grad_norm": 6.101152761282113, "learning_rate": 4.897939295660615e-06, "loss": 0.5399, "step": 1460 }, { "epoch": 0.1186550799967514, "grad_norm": 5.730964159782436, "learning_rate": 4.897753230055304e-06, "loss": 0.5391, "step": 1461 }, { "epoch": 0.11873629497279298, "grad_norm": 4.602835724556902, "learning_rate": 4.89756699853816e-06, "loss": 0.6377, "step": 1462 }, { "epoch": 0.11881750994883457, "grad_norm": 5.577323364531521, "learning_rate": 4.8973806011220695e-06, "loss": 0.6413, "step": 1463 }, { "epoch": 0.11889872492487615, "grad_norm": 6.109634102169282, "learning_rate": 4.897194037819928e-06, "loss": 0.5454, "step": 1464 }, { "epoch": 0.11897993990091774, "grad_norm": 4.473657871957378, "learning_rate": 4.897007308644647e-06, "loss": 0.5516, "step": 1465 }, { "epoch": 0.11906115487695931, "grad_norm": 4.6381132522690995, "learning_rate": 4.896820413609146e-06, "loss": 0.6565, "step": 1466 }, { "epoch": 0.11914236985300089, "grad_norm": 5.386818421013147, "learning_rate": 4.896633352726357e-06, "loss": 0.7063, "step": 1467 }, { "epoch": 0.11922358482904248, "grad_norm": 3.1581048679010646, "learning_rate": 4.896446126009224e-06, "loss": 0.7565, "step": 1468 }, { "epoch": 0.11930479980508406, "grad_norm": 6.401004722074878, "learning_rate": 4.896258733470702e-06, "loss": 0.5026, "step": 1469 }, { "epoch": 0.11938601478112564, "grad_norm": 2.7519458408541797, "learning_rate": 4.896071175123758e-06, "loss": 0.6255, "step": 1470 }, { "epoch": 0.11946722975716723, "grad_norm": 4.404610542747091, "learning_rate": 4.8958834509813706e-06, "loss": 0.5504, "step": 1471 }, { "epoch": 0.1195484447332088, "grad_norm": 6.077687424149937, "learning_rate": 4.8956955610565275e-06, "loss": 0.6899, "step": 1472 }, { "epoch": 0.11962965970925038, "grad_norm": 5.523332768156323, "learning_rate": 4.895507505362231e-06, "loss": 0.5743, "step": 1473 }, { "epoch": 0.11971087468529197, "grad_norm": 5.840235758929547, "learning_rate": 4.895319283911492e-06, "loss": 0.4483, "step": 1474 }, { "epoch": 0.11979208966133355, "grad_norm": 3.9853018352679177, "learning_rate": 4.895130896717336e-06, "loss": 0.6511, "step": 1475 }, { "epoch": 0.11987330463737514, "grad_norm": 4.087084936629081, "learning_rate": 4.894942343792799e-06, "loss": 0.6048, "step": 1476 }, { "epoch": 0.11995451961341672, "grad_norm": 4.9948060996575325, "learning_rate": 4.894753625150927e-06, "loss": 0.5193, "step": 1477 }, { "epoch": 0.12003573458945829, "grad_norm": 5.148303769586792, "learning_rate": 4.894564740804777e-06, "loss": 0.4947, "step": 1478 }, { "epoch": 0.12011694956549988, "grad_norm": 4.926289780739354, "learning_rate": 4.89437569076742e-06, "loss": 0.641, "step": 1479 }, { "epoch": 0.12019816454154146, "grad_norm": 4.420640360809511, "learning_rate": 4.894186475051938e-06, "loss": 0.6427, "step": 1480 }, { "epoch": 0.12027937951758304, "grad_norm": 4.552192514027042, "learning_rate": 4.893997093671422e-06, "loss": 0.6684, "step": 1481 }, { "epoch": 0.12036059449362463, "grad_norm": 3.6702893426104306, "learning_rate": 4.893807546638979e-06, "loss": 0.6778, "step": 1482 }, { "epoch": 0.1204418094696662, "grad_norm": 7.860150046179448, "learning_rate": 4.893617833967721e-06, "loss": 0.6479, "step": 1483 }, { "epoch": 0.12052302444570778, "grad_norm": 3.6225218661319083, "learning_rate": 4.893427955670778e-06, "loss": 0.5258, "step": 1484 }, { "epoch": 0.12060423942174937, "grad_norm": 7.286179190372548, "learning_rate": 4.893237911761287e-06, "loss": 0.5499, "step": 1485 }, { "epoch": 0.12068545439779095, "grad_norm": 5.426351678323337, "learning_rate": 4.893047702252399e-06, "loss": 0.4595, "step": 1486 }, { "epoch": 0.12076666937383254, "grad_norm": 4.641027529381159, "learning_rate": 4.892857327157275e-06, "loss": 0.7437, "step": 1487 }, { "epoch": 0.12084788434987412, "grad_norm": 5.910005928653577, "learning_rate": 4.892666786489087e-06, "loss": 0.5187, "step": 1488 }, { "epoch": 0.1209290993259157, "grad_norm": 4.433359476670774, "learning_rate": 4.8924760802610215e-06, "loss": 0.6245, "step": 1489 }, { "epoch": 0.12101031430195729, "grad_norm": 16.649957190107745, "learning_rate": 4.8922852084862734e-06, "loss": 0.6885, "step": 1490 }, { "epoch": 0.12109152927799886, "grad_norm": 5.101531362441216, "learning_rate": 4.892094171178049e-06, "loss": 0.4662, "step": 1491 }, { "epoch": 0.12117274425404044, "grad_norm": 5.1657125457609725, "learning_rate": 4.891902968349568e-06, "loss": 0.5948, "step": 1492 }, { "epoch": 0.12125395923008203, "grad_norm": 4.342894827579331, "learning_rate": 4.8917116000140614e-06, "loss": 0.5409, "step": 1493 }, { "epoch": 0.12133517420612361, "grad_norm": 3.8330951289725124, "learning_rate": 4.8915200661847695e-06, "loss": 0.52, "step": 1494 }, { "epoch": 0.12141638918216519, "grad_norm": 5.179925909022087, "learning_rate": 4.891328366874946e-06, "loss": 0.5504, "step": 1495 }, { "epoch": 0.12149760415820678, "grad_norm": 5.330823987896905, "learning_rate": 4.891136502097855e-06, "loss": 0.6273, "step": 1496 }, { "epoch": 0.12157881913424835, "grad_norm": 5.585945845508732, "learning_rate": 4.890944471866774e-06, "loss": 0.6236, "step": 1497 }, { "epoch": 0.12166003411028994, "grad_norm": 3.83199043102917, "learning_rate": 4.890752276194989e-06, "loss": 0.7202, "step": 1498 }, { "epoch": 0.12174124908633152, "grad_norm": 3.6411956112594677, "learning_rate": 4.890559915095798e-06, "loss": 0.5213, "step": 1499 }, { "epoch": 0.1218224640623731, "grad_norm": 4.786252322922742, "learning_rate": 4.890367388582514e-06, "loss": 0.9004, "step": 1500 }, { "epoch": 0.12190367903841469, "grad_norm": 4.979342968709159, "learning_rate": 4.890174696668458e-06, "loss": 0.6452, "step": 1501 }, { "epoch": 0.12198489401445627, "grad_norm": 4.11868570254399, "learning_rate": 4.889981839366962e-06, "loss": 0.4785, "step": 1502 }, { "epoch": 0.12206610899049784, "grad_norm": 4.908815740621781, "learning_rate": 4.889788816691372e-06, "loss": 0.6467, "step": 1503 }, { "epoch": 0.12214732396653943, "grad_norm": 6.3622155832910625, "learning_rate": 4.889595628655044e-06, "loss": 0.61, "step": 1504 }, { "epoch": 0.12222853894258101, "grad_norm": 3.6609655168066064, "learning_rate": 4.8894022752713445e-06, "loss": 0.7134, "step": 1505 }, { "epoch": 0.12230975391862259, "grad_norm": 4.846573857319596, "learning_rate": 4.8892087565536535e-06, "loss": 0.6646, "step": 1506 }, { "epoch": 0.12239096889466418, "grad_norm": 5.788899246359694, "learning_rate": 4.889015072515361e-06, "loss": 0.7547, "step": 1507 }, { "epoch": 0.12247218387070576, "grad_norm": 4.583902093938661, "learning_rate": 4.888821223169869e-06, "loss": 0.5206, "step": 1508 }, { "epoch": 0.12255339884674735, "grad_norm": 3.3647325384813915, "learning_rate": 4.888627208530592e-06, "loss": 0.5928, "step": 1509 }, { "epoch": 0.12263461382278892, "grad_norm": 3.8539584897978654, "learning_rate": 4.8884330286109535e-06, "loss": 0.5397, "step": 1510 }, { "epoch": 0.1227158287988305, "grad_norm": 15.266199610075871, "learning_rate": 4.88823868342439e-06, "loss": 0.6754, "step": 1511 }, { "epoch": 0.12279704377487209, "grad_norm": 5.095675930247071, "learning_rate": 4.888044172984349e-06, "loss": 0.7183, "step": 1512 }, { "epoch": 0.12287825875091367, "grad_norm": 5.589937550753531, "learning_rate": 4.887849497304289e-06, "loss": 0.7005, "step": 1513 }, { "epoch": 0.12295947372695525, "grad_norm": 6.394170257036341, "learning_rate": 4.8876546563976825e-06, "loss": 0.6316, "step": 1514 }, { "epoch": 0.12304068870299684, "grad_norm": 5.521671875850051, "learning_rate": 4.88745965027801e-06, "loss": 0.6913, "step": 1515 }, { "epoch": 0.12312190367903841, "grad_norm": 4.969659762897969, "learning_rate": 4.887264478958765e-06, "loss": 0.5295, "step": 1516 }, { "epoch": 0.12320311865507999, "grad_norm": 6.2253045417503845, "learning_rate": 4.887069142453453e-06, "loss": 0.6538, "step": 1517 }, { "epoch": 0.12328433363112158, "grad_norm": 9.570082889106901, "learning_rate": 4.886873640775588e-06, "loss": 0.5829, "step": 1518 }, { "epoch": 0.12336554860716316, "grad_norm": 5.609163036466005, "learning_rate": 4.886677973938701e-06, "loss": 0.5647, "step": 1519 }, { "epoch": 0.12344676358320475, "grad_norm": 7.818406834616618, "learning_rate": 4.886482141956329e-06, "loss": 0.6357, "step": 1520 }, { "epoch": 0.12352797855924633, "grad_norm": 3.1695857831672067, "learning_rate": 4.8862861448420234e-06, "loss": 0.5878, "step": 1521 }, { "epoch": 0.1236091935352879, "grad_norm": 4.384307163664372, "learning_rate": 4.886089982609345e-06, "loss": 0.5771, "step": 1522 }, { "epoch": 0.1236904085113295, "grad_norm": 4.469922202011447, "learning_rate": 4.885893655271869e-06, "loss": 0.5124, "step": 1523 }, { "epoch": 0.12377162348737107, "grad_norm": 4.594952235845705, "learning_rate": 4.885697162843179e-06, "loss": 0.6882, "step": 1524 }, { "epoch": 0.12385283846341265, "grad_norm": 11.952868311490398, "learning_rate": 4.8855005053368715e-06, "loss": 0.5141, "step": 1525 }, { "epoch": 0.12393405343945424, "grad_norm": 5.107326813624944, "learning_rate": 4.885303682766554e-06, "loss": 0.6077, "step": 1526 }, { "epoch": 0.12401526841549582, "grad_norm": 3.90801909322924, "learning_rate": 4.885106695145846e-06, "loss": 0.739, "step": 1527 }, { "epoch": 0.1240964833915374, "grad_norm": 4.458856274877279, "learning_rate": 4.884909542488377e-06, "loss": 0.4853, "step": 1528 }, { "epoch": 0.12417769836757898, "grad_norm": 7.488057909444421, "learning_rate": 4.88471222480779e-06, "loss": 0.5267, "step": 1529 }, { "epoch": 0.12425891334362056, "grad_norm": 10.709236752099061, "learning_rate": 4.8845147421177375e-06, "loss": 0.5767, "step": 1530 }, { "epoch": 0.12434012831966215, "grad_norm": 4.227280902938617, "learning_rate": 4.8843170944318855e-06, "loss": 0.6582, "step": 1531 }, { "epoch": 0.12442134329570373, "grad_norm": 5.304472983158379, "learning_rate": 4.88411928176391e-06, "loss": 0.4994, "step": 1532 }, { "epoch": 0.1245025582717453, "grad_norm": 6.083933660892841, "learning_rate": 4.8839213041274955e-06, "loss": 0.454, "step": 1533 }, { "epoch": 0.1245837732477869, "grad_norm": 7.150062978858339, "learning_rate": 4.8837231615363455e-06, "loss": 0.7561, "step": 1534 }, { "epoch": 0.12466498822382847, "grad_norm": 6.602096765289932, "learning_rate": 4.883524854004168e-06, "loss": 0.6597, "step": 1535 }, { "epoch": 0.12474620319987005, "grad_norm": 4.853497195307292, "learning_rate": 4.883326381544686e-06, "loss": 0.5989, "step": 1536 }, { "epoch": 0.12482741817591164, "grad_norm": 5.57662629799743, "learning_rate": 4.88312774417163e-06, "loss": 0.555, "step": 1537 }, { "epoch": 0.12490863315195322, "grad_norm": 4.64783655989287, "learning_rate": 4.882928941898748e-06, "loss": 0.5814, "step": 1538 }, { "epoch": 0.1249898481279948, "grad_norm": 8.657184018704008, "learning_rate": 4.882729974739794e-06, "loss": 0.5567, "step": 1539 }, { "epoch": 0.12507106310403637, "grad_norm": 9.124561847833686, "learning_rate": 4.882530842708537e-06, "loss": 0.5428, "step": 1540 }, { "epoch": 0.12515227808007798, "grad_norm": 5.93162727377288, "learning_rate": 4.882331545818755e-06, "loss": 0.5641, "step": 1541 }, { "epoch": 0.12523349305611955, "grad_norm": 4.417596995333459, "learning_rate": 4.882132084084238e-06, "loss": 0.6554, "step": 1542 }, { "epoch": 0.12531470803216113, "grad_norm": 5.238384789515402, "learning_rate": 4.8819324575187875e-06, "loss": 0.8369, "step": 1543 }, { "epoch": 0.1253959230082027, "grad_norm": 5.490158000040072, "learning_rate": 4.881732666136217e-06, "loss": 0.7737, "step": 1544 }, { "epoch": 0.12547713798424429, "grad_norm": 18.610112814184124, "learning_rate": 4.881532709950352e-06, "loss": 0.5733, "step": 1545 }, { "epoch": 0.1255583529602859, "grad_norm": 4.3346186230095585, "learning_rate": 4.8813325889750275e-06, "loss": 0.518, "step": 1546 }, { "epoch": 0.12563956793632747, "grad_norm": 5.394377171648292, "learning_rate": 4.881132303224091e-06, "loss": 0.4973, "step": 1547 }, { "epoch": 0.12572078291236904, "grad_norm": 6.281282652469185, "learning_rate": 4.880931852711401e-06, "loss": 0.53, "step": 1548 }, { "epoch": 0.12580199788841062, "grad_norm": 5.063604963463662, "learning_rate": 4.880731237450828e-06, "loss": 0.543, "step": 1549 }, { "epoch": 0.1258832128644522, "grad_norm": 5.589066320656067, "learning_rate": 4.880530457456252e-06, "loss": 0.5307, "step": 1550 }, { "epoch": 0.12596442784049378, "grad_norm": 5.313990606526498, "learning_rate": 4.880329512741568e-06, "loss": 0.6096, "step": 1551 }, { "epoch": 0.12604564281653538, "grad_norm": 5.093580296509499, "learning_rate": 4.88012840332068e-06, "loss": 0.6667, "step": 1552 }, { "epoch": 0.12612685779257696, "grad_norm": 3.074951611991778, "learning_rate": 4.879927129207502e-06, "loss": 0.669, "step": 1553 }, { "epoch": 0.12620807276861853, "grad_norm": 9.209748336487731, "learning_rate": 4.8797256904159625e-06, "loss": 0.7399, "step": 1554 }, { "epoch": 0.1262892877446601, "grad_norm": 5.6697335990100965, "learning_rate": 4.87952408696e-06, "loss": 0.6407, "step": 1555 }, { "epoch": 0.1263705027207017, "grad_norm": 8.545213357048508, "learning_rate": 4.879322318853564e-06, "loss": 0.5582, "step": 1556 }, { "epoch": 0.1264517176967433, "grad_norm": 7.569125385036642, "learning_rate": 4.879120386110616e-06, "loss": 0.6649, "step": 1557 }, { "epoch": 0.12653293267278487, "grad_norm": 16.126868678635603, "learning_rate": 4.878918288745128e-06, "loss": 0.6249, "step": 1558 }, { "epoch": 0.12661414764882645, "grad_norm": 5.161708427963795, "learning_rate": 4.878716026771086e-06, "loss": 0.5061, "step": 1559 }, { "epoch": 0.12669536262486802, "grad_norm": 4.94640934937331, "learning_rate": 4.878513600202483e-06, "loss": 0.6149, "step": 1560 }, { "epoch": 0.1267765776009096, "grad_norm": 5.628454265223888, "learning_rate": 4.878311009053328e-06, "loss": 0.5789, "step": 1561 }, { "epoch": 0.12685779257695118, "grad_norm": 5.099342067120917, "learning_rate": 4.878108253337638e-06, "loss": 0.6344, "step": 1562 }, { "epoch": 0.12693900755299278, "grad_norm": 5.258000831985147, "learning_rate": 4.877905333069442e-06, "loss": 0.5775, "step": 1563 }, { "epoch": 0.12702022252903436, "grad_norm": 6.236383483926228, "learning_rate": 4.877702248262782e-06, "loss": 0.5334, "step": 1564 }, { "epoch": 0.12710143750507594, "grad_norm": 4.048534369160581, "learning_rate": 4.87749899893171e-06, "loss": 0.7184, "step": 1565 }, { "epoch": 0.12718265248111751, "grad_norm": 4.398427223579315, "learning_rate": 4.8772955850902914e-06, "loss": 0.6671, "step": 1566 }, { "epoch": 0.1272638674571591, "grad_norm": 5.637044630053773, "learning_rate": 4.877092006752599e-06, "loss": 0.621, "step": 1567 }, { "epoch": 0.1273450824332007, "grad_norm": 6.7178341766236125, "learning_rate": 4.876888263932721e-06, "loss": 0.5828, "step": 1568 }, { "epoch": 0.12742629740924227, "grad_norm": 5.624169626907889, "learning_rate": 4.876684356644754e-06, "loss": 0.6779, "step": 1569 }, { "epoch": 0.12750751238528385, "grad_norm": 4.476842415269882, "learning_rate": 4.876480284902807e-06, "loss": 0.4934, "step": 1570 }, { "epoch": 0.12758872736132543, "grad_norm": 3.7336585247616885, "learning_rate": 4.8762760487210035e-06, "loss": 0.5301, "step": 1571 }, { "epoch": 0.127669942337367, "grad_norm": 8.61154812995492, "learning_rate": 4.876071648113473e-06, "loss": 0.6757, "step": 1572 }, { "epoch": 0.12775115731340858, "grad_norm": 6.969888194514512, "learning_rate": 4.875867083094359e-06, "loss": 0.7971, "step": 1573 }, { "epoch": 0.12783237228945019, "grad_norm": 7.7890484350762765, "learning_rate": 4.875662353677818e-06, "loss": 0.4918, "step": 1574 }, { "epoch": 0.12791358726549176, "grad_norm": 4.545675942015323, "learning_rate": 4.875457459878014e-06, "loss": 0.6069, "step": 1575 }, { "epoch": 0.12799480224153334, "grad_norm": 7.615402665241847, "learning_rate": 4.875252401709126e-06, "loss": 0.5996, "step": 1576 }, { "epoch": 0.12807601721757492, "grad_norm": 5.198690570704765, "learning_rate": 4.8750471791853436e-06, "loss": 0.6422, "step": 1577 }, { "epoch": 0.1281572321936165, "grad_norm": 5.875388513113994, "learning_rate": 4.874841792320865e-06, "loss": 0.4701, "step": 1578 }, { "epoch": 0.1282384471696581, "grad_norm": 6.491950200516407, "learning_rate": 4.874636241129904e-06, "loss": 0.5815, "step": 1579 }, { "epoch": 0.12831966214569968, "grad_norm": 4.293195229926921, "learning_rate": 4.874430525626682e-06, "loss": 0.6365, "step": 1580 }, { "epoch": 0.12840087712174125, "grad_norm": 8.402150286140357, "learning_rate": 4.874224645825435e-06, "loss": 0.4979, "step": 1581 }, { "epoch": 0.12848209209778283, "grad_norm": 6.3159806141726, "learning_rate": 4.874018601740407e-06, "loss": 0.504, "step": 1582 }, { "epoch": 0.1285633070738244, "grad_norm": 5.865695583703645, "learning_rate": 4.873812393385856e-06, "loss": 0.5678, "step": 1583 }, { "epoch": 0.12864452204986598, "grad_norm": 4.259448849973863, "learning_rate": 4.873606020776051e-06, "loss": 0.637, "step": 1584 }, { "epoch": 0.1287257370259076, "grad_norm": 4.795031800681299, "learning_rate": 4.873399483925272e-06, "loss": 0.6518, "step": 1585 }, { "epoch": 0.12880695200194917, "grad_norm": 6.982468505996596, "learning_rate": 4.8731927828478085e-06, "loss": 0.5015, "step": 1586 }, { "epoch": 0.12888816697799074, "grad_norm": 4.194815922907224, "learning_rate": 4.872985917557965e-06, "loss": 0.6631, "step": 1587 }, { "epoch": 0.12896938195403232, "grad_norm": 6.979913270961702, "learning_rate": 4.872778888070055e-06, "loss": 0.477, "step": 1588 }, { "epoch": 0.1290505969300739, "grad_norm": 6.539545690632459, "learning_rate": 4.872571694398403e-06, "loss": 0.5583, "step": 1589 }, { "epoch": 0.1291318119061155, "grad_norm": 7.207244340353426, "learning_rate": 4.872364336557348e-06, "loss": 0.6374, "step": 1590 }, { "epoch": 0.12921302688215708, "grad_norm": 5.271822346774599, "learning_rate": 4.8721568145612355e-06, "loss": 0.5826, "step": 1591 }, { "epoch": 0.12929424185819866, "grad_norm": 6.699431722524916, "learning_rate": 4.8719491284244256e-06, "loss": 0.5468, "step": 1592 }, { "epoch": 0.12937545683424023, "grad_norm": 4.303099582925413, "learning_rate": 4.871741278161291e-06, "loss": 0.5465, "step": 1593 }, { "epoch": 0.1294566718102818, "grad_norm": 3.318090123286297, "learning_rate": 4.87153326378621e-06, "loss": 0.5562, "step": 1594 }, { "epoch": 0.1295378867863234, "grad_norm": 3.8336332872072507, "learning_rate": 4.87132508531358e-06, "loss": 0.6022, "step": 1595 }, { "epoch": 0.129619101762365, "grad_norm": 6.539483107346492, "learning_rate": 4.871116742757803e-06, "loss": 0.4905, "step": 1596 }, { "epoch": 0.12970031673840657, "grad_norm": 3.2101449234700072, "learning_rate": 4.870908236133297e-06, "loss": 0.6639, "step": 1597 }, { "epoch": 0.12978153171444815, "grad_norm": 11.166157350084525, "learning_rate": 4.870699565454489e-06, "loss": 0.5098, "step": 1598 }, { "epoch": 0.12986274669048972, "grad_norm": 4.137968227782281, "learning_rate": 4.870490730735818e-06, "loss": 0.4853, "step": 1599 }, { "epoch": 0.1299439616665313, "grad_norm": 3.555952898094085, "learning_rate": 4.870281731991733e-06, "loss": 0.5433, "step": 1600 }, { "epoch": 0.1300251766425729, "grad_norm": 6.292559801998863, "learning_rate": 4.870072569236697e-06, "loss": 0.5833, "step": 1601 }, { "epoch": 0.13010639161861448, "grad_norm": 4.245679851338393, "learning_rate": 4.869863242485183e-06, "loss": 0.5839, "step": 1602 }, { "epoch": 0.13018760659465606, "grad_norm": 10.695591016769804, "learning_rate": 4.8696537517516754e-06, "loss": 0.5284, "step": 1603 }, { "epoch": 0.13026882157069763, "grad_norm": 5.501068173317865, "learning_rate": 4.869444097050668e-06, "loss": 0.5927, "step": 1604 }, { "epoch": 0.1303500365467392, "grad_norm": 5.717664746661781, "learning_rate": 4.8692342783966706e-06, "loss": 0.5258, "step": 1605 }, { "epoch": 0.1304312515227808, "grad_norm": 5.029968109519692, "learning_rate": 4.869024295804199e-06, "loss": 0.6064, "step": 1606 }, { "epoch": 0.1305124664988224, "grad_norm": 3.7931205356074575, "learning_rate": 4.868814149287785e-06, "loss": 0.5764, "step": 1607 }, { "epoch": 0.13059368147486397, "grad_norm": 4.944616527342133, "learning_rate": 4.868603838861969e-06, "loss": 0.5991, "step": 1608 }, { "epoch": 0.13067489645090555, "grad_norm": 5.178859852231613, "learning_rate": 4.868393364541301e-06, "loss": 0.6465, "step": 1609 }, { "epoch": 0.13075611142694712, "grad_norm": 4.567090745940825, "learning_rate": 4.868182726340349e-06, "loss": 0.6649, "step": 1610 }, { "epoch": 0.1308373264029887, "grad_norm": 5.802293496504625, "learning_rate": 4.867971924273685e-06, "loss": 0.5043, "step": 1611 }, { "epoch": 0.1309185413790303, "grad_norm": 4.914755653217928, "learning_rate": 4.8677609583558956e-06, "loss": 0.5295, "step": 1612 }, { "epoch": 0.13099975635507188, "grad_norm": 7.208070575519608, "learning_rate": 4.867549828601579e-06, "loss": 0.6164, "step": 1613 }, { "epoch": 0.13108097133111346, "grad_norm": 7.77772857949289, "learning_rate": 4.8673385350253454e-06, "loss": 0.5208, "step": 1614 }, { "epoch": 0.13116218630715504, "grad_norm": 8.264639243565528, "learning_rate": 4.867127077641813e-06, "loss": 0.6235, "step": 1615 }, { "epoch": 0.13124340128319661, "grad_norm": 5.062219960669989, "learning_rate": 4.866915456465615e-06, "loss": 0.57, "step": 1616 }, { "epoch": 0.1313246162592382, "grad_norm": 16.478111147319943, "learning_rate": 4.866703671511395e-06, "loss": 0.5729, "step": 1617 }, { "epoch": 0.1314058312352798, "grad_norm": 4.4225512989681475, "learning_rate": 4.8664917227938056e-06, "loss": 0.5722, "step": 1618 }, { "epoch": 0.13148704621132137, "grad_norm": 5.041669688436434, "learning_rate": 4.866279610327514e-06, "loss": 0.7651, "step": 1619 }, { "epoch": 0.13156826118736295, "grad_norm": 5.8825838835020505, "learning_rate": 4.8660673341271966e-06, "loss": 0.6381, "step": 1620 }, { "epoch": 0.13164947616340453, "grad_norm": 4.034188407645285, "learning_rate": 4.865854894207541e-06, "loss": 0.6949, "step": 1621 }, { "epoch": 0.1317306911394461, "grad_norm": 3.6403419129784864, "learning_rate": 4.865642290583249e-06, "loss": 0.5172, "step": 1622 }, { "epoch": 0.1318119061154877, "grad_norm": 4.836824286973765, "learning_rate": 4.86542952326903e-06, "loss": 0.6615, "step": 1623 }, { "epoch": 0.1318931210915293, "grad_norm": 4.388993168301412, "learning_rate": 4.865216592279607e-06, "loss": 0.6131, "step": 1624 }, { "epoch": 0.13197433606757086, "grad_norm": 6.591216319051455, "learning_rate": 4.865003497629713e-06, "loss": 0.6583, "step": 1625 }, { "epoch": 0.13205555104361244, "grad_norm": 4.389020157372549, "learning_rate": 4.8647902393340955e-06, "loss": 0.6026, "step": 1626 }, { "epoch": 0.13213676601965402, "grad_norm": 4.250808042920454, "learning_rate": 4.864576817407507e-06, "loss": 0.6323, "step": 1627 }, { "epoch": 0.1322179809956956, "grad_norm": 6.881832060317491, "learning_rate": 4.864363231864717e-06, "loss": 0.6207, "step": 1628 }, { "epoch": 0.1322991959717372, "grad_norm": 10.412011238660916, "learning_rate": 4.864149482720505e-06, "loss": 0.5453, "step": 1629 }, { "epoch": 0.13238041094777878, "grad_norm": 5.774868373588398, "learning_rate": 4.863935569989662e-06, "loss": 0.5582, "step": 1630 }, { "epoch": 0.13246162592382035, "grad_norm": 5.245799631843284, "learning_rate": 4.863721493686987e-06, "loss": 0.6431, "step": 1631 }, { "epoch": 0.13254284089986193, "grad_norm": 4.177337851592638, "learning_rate": 4.8635072538272954e-06, "loss": 0.5681, "step": 1632 }, { "epoch": 0.1326240558759035, "grad_norm": 3.517896942856102, "learning_rate": 4.863292850425409e-06, "loss": 0.6479, "step": 1633 }, { "epoch": 0.1327052708519451, "grad_norm": 5.350168873979354, "learning_rate": 4.863078283496167e-06, "loss": 0.6847, "step": 1634 }, { "epoch": 0.1327864858279867, "grad_norm": 6.986629354106734, "learning_rate": 4.862863553054413e-06, "loss": 0.6294, "step": 1635 }, { "epoch": 0.13286770080402827, "grad_norm": 4.997326232828624, "learning_rate": 4.862648659115007e-06, "loss": 0.7014, "step": 1636 }, { "epoch": 0.13294891578006984, "grad_norm": 4.2346207052457485, "learning_rate": 4.8624336016928175e-06, "loss": 0.5626, "step": 1637 }, { "epoch": 0.13303013075611142, "grad_norm": 4.331783623394118, "learning_rate": 4.8622183808027255e-06, "loss": 0.6618, "step": 1638 }, { "epoch": 0.133111345732153, "grad_norm": 5.008484418976238, "learning_rate": 4.8620029964596234e-06, "loss": 0.6353, "step": 1639 }, { "epoch": 0.1331925607081946, "grad_norm": 6.50139784970913, "learning_rate": 4.861787448678416e-06, "loss": 0.486, "step": 1640 }, { "epoch": 0.13327377568423618, "grad_norm": 6.369833370069241, "learning_rate": 4.861571737474015e-06, "loss": 0.5904, "step": 1641 }, { "epoch": 0.13335499066027776, "grad_norm": 3.8537585558162317, "learning_rate": 4.8613558628613494e-06, "loss": 0.8424, "step": 1642 }, { "epoch": 0.13343620563631933, "grad_norm": 6.904987154180493, "learning_rate": 4.8611398248553554e-06, "loss": 0.5671, "step": 1643 }, { "epoch": 0.1335174206123609, "grad_norm": 3.1433510533347415, "learning_rate": 4.860923623470981e-06, "loss": 0.4704, "step": 1644 }, { "epoch": 0.13359863558840251, "grad_norm": 5.418253266261615, "learning_rate": 4.860707258723187e-06, "loss": 0.5313, "step": 1645 }, { "epoch": 0.1336798505644441, "grad_norm": 5.2359535500265615, "learning_rate": 4.860490730626945e-06, "loss": 0.5742, "step": 1646 }, { "epoch": 0.13376106554048567, "grad_norm": 11.436418505055565, "learning_rate": 4.860274039197237e-06, "loss": 0.5654, "step": 1647 }, { "epoch": 0.13384228051652725, "grad_norm": 6.262589762855426, "learning_rate": 4.860057184449057e-06, "loss": 0.4724, "step": 1648 }, { "epoch": 0.13392349549256882, "grad_norm": 5.518995358402323, "learning_rate": 4.85984016639741e-06, "loss": 0.5537, "step": 1649 }, { "epoch": 0.1340047104686104, "grad_norm": 5.088469039450566, "learning_rate": 4.859622985057313e-06, "loss": 0.5638, "step": 1650 }, { "epoch": 0.134085925444652, "grad_norm": 4.614369100716233, "learning_rate": 4.859405640443793e-06, "loss": 0.6239, "step": 1651 }, { "epoch": 0.13416714042069358, "grad_norm": 6.877855199561925, "learning_rate": 4.85918813257189e-06, "loss": 0.5406, "step": 1652 }, { "epoch": 0.13424835539673516, "grad_norm": 6.336440018776646, "learning_rate": 4.858970461456655e-06, "loss": 0.5815, "step": 1653 }, { "epoch": 0.13432957037277674, "grad_norm": 6.858031875975209, "learning_rate": 4.858752627113148e-06, "loss": 0.6946, "step": 1654 }, { "epoch": 0.1344107853488183, "grad_norm": 3.403378239632881, "learning_rate": 4.8585346295564425e-06, "loss": 0.7919, "step": 1655 }, { "epoch": 0.13449200032485992, "grad_norm": 6.487088297683663, "learning_rate": 4.858316468801624e-06, "loss": 0.4289, "step": 1656 }, { "epoch": 0.1345732153009015, "grad_norm": 9.756045801760866, "learning_rate": 4.858098144863786e-06, "loss": 0.5673, "step": 1657 }, { "epoch": 0.13465443027694307, "grad_norm": 4.769994844405226, "learning_rate": 4.857879657758037e-06, "loss": 0.6929, "step": 1658 }, { "epoch": 0.13473564525298465, "grad_norm": 4.567963041913676, "learning_rate": 4.857661007499493e-06, "loss": 0.6402, "step": 1659 }, { "epoch": 0.13481686022902623, "grad_norm": 7.539959586550248, "learning_rate": 4.857442194103287e-06, "loss": 0.6195, "step": 1660 }, { "epoch": 0.1348980752050678, "grad_norm": 3.320972169580362, "learning_rate": 4.8572232175845574e-06, "loss": 0.5644, "step": 1661 }, { "epoch": 0.1349792901811094, "grad_norm": 6.466693615210596, "learning_rate": 4.857004077958456e-06, "loss": 0.6244, "step": 1662 }, { "epoch": 0.13506050515715098, "grad_norm": 5.503111817650875, "learning_rate": 4.8567847752401476e-06, "loss": 0.5756, "step": 1663 }, { "epoch": 0.13514172013319256, "grad_norm": 5.858104142956886, "learning_rate": 4.8565653094448054e-06, "loss": 0.6822, "step": 1664 }, { "epoch": 0.13522293510923414, "grad_norm": 4.782981817213062, "learning_rate": 4.856345680587616e-06, "loss": 0.7815, "step": 1665 }, { "epoch": 0.13530415008527572, "grad_norm": 15.4570240471019, "learning_rate": 4.856125888683775e-06, "loss": 0.4739, "step": 1666 }, { "epoch": 0.13538536506131732, "grad_norm": 4.282142809688187, "learning_rate": 4.855905933748492e-06, "loss": 0.5324, "step": 1667 }, { "epoch": 0.1354665800373589, "grad_norm": 5.095680068440331, "learning_rate": 4.855685815796989e-06, "loss": 0.5745, "step": 1668 }, { "epoch": 0.13554779501340047, "grad_norm": 4.141977804675802, "learning_rate": 4.855465534844494e-06, "loss": 0.7417, "step": 1669 }, { "epoch": 0.13562900998944205, "grad_norm": 7.6969307870140895, "learning_rate": 4.8552450909062494e-06, "loss": 0.5206, "step": 1670 }, { "epoch": 0.13571022496548363, "grad_norm": 6.450650416903025, "learning_rate": 4.855024483997509e-06, "loss": 0.655, "step": 1671 }, { "epoch": 0.1357914399415252, "grad_norm": 4.810253355009561, "learning_rate": 4.85480371413354e-06, "loss": 0.5323, "step": 1672 }, { "epoch": 0.1358726549175668, "grad_norm": 3.3253261110937116, "learning_rate": 4.8545827813296154e-06, "loss": 0.5753, "step": 1673 }, { "epoch": 0.1359538698936084, "grad_norm": 3.8466882509432496, "learning_rate": 4.8543616856010235e-06, "loss": 0.7926, "step": 1674 }, { "epoch": 0.13603508486964996, "grad_norm": 4.8978637912258, "learning_rate": 4.854140426963064e-06, "loss": 0.5541, "step": 1675 }, { "epoch": 0.13611629984569154, "grad_norm": 5.229352786538515, "learning_rate": 4.853919005431046e-06, "loss": 0.4727, "step": 1676 }, { "epoch": 0.13619751482173312, "grad_norm": 2.9378106286602095, "learning_rate": 4.85369742102029e-06, "loss": 0.7667, "step": 1677 }, { "epoch": 0.13627872979777472, "grad_norm": 9.698335327128627, "learning_rate": 4.8534756737461305e-06, "loss": 0.7053, "step": 1678 }, { "epoch": 0.1363599447738163, "grad_norm": 4.258566186707566, "learning_rate": 4.853253763623909e-06, "loss": 0.7551, "step": 1679 }, { "epoch": 0.13644115974985788, "grad_norm": 7.742141939352821, "learning_rate": 4.853031690668982e-06, "loss": 0.6012, "step": 1680 }, { "epoch": 0.13652237472589945, "grad_norm": 7.324297389610232, "learning_rate": 4.852809454896715e-06, "loss": 0.8128, "step": 1681 }, { "epoch": 0.13660358970194103, "grad_norm": 6.79191875565859, "learning_rate": 4.852587056322485e-06, "loss": 0.4649, "step": 1682 }, { "epoch": 0.1366848046779826, "grad_norm": 4.346976001633204, "learning_rate": 4.852364494961684e-06, "loss": 0.5741, "step": 1683 }, { "epoch": 0.1367660196540242, "grad_norm": 5.599252491473373, "learning_rate": 4.852141770829707e-06, "loss": 0.5591, "step": 1684 }, { "epoch": 0.1368472346300658, "grad_norm": 7.657958990969802, "learning_rate": 4.851918883941969e-06, "loss": 0.5636, "step": 1685 }, { "epoch": 0.13692844960610737, "grad_norm": 3.4450921267982126, "learning_rate": 4.851695834313892e-06, "loss": 0.7366, "step": 1686 }, { "epoch": 0.13700966458214894, "grad_norm": 5.273690756756951, "learning_rate": 4.851472621960909e-06, "loss": 0.5808, "step": 1687 }, { "epoch": 0.13709087955819052, "grad_norm": 4.952280541512243, "learning_rate": 4.851249246898465e-06, "loss": 0.6648, "step": 1688 }, { "epoch": 0.13717209453423213, "grad_norm": 7.707004969920144, "learning_rate": 4.851025709142018e-06, "loss": 0.6268, "step": 1689 }, { "epoch": 0.1372533095102737, "grad_norm": 4.430949277747921, "learning_rate": 4.850802008707034e-06, "loss": 0.4935, "step": 1690 }, { "epoch": 0.13733452448631528, "grad_norm": 4.0862917668071175, "learning_rate": 4.8505781456089926e-06, "loss": 0.7303, "step": 1691 }, { "epoch": 0.13741573946235686, "grad_norm": 8.15399495830509, "learning_rate": 4.850354119863384e-06, "loss": 0.573, "step": 1692 }, { "epoch": 0.13749695443839843, "grad_norm": 4.4854550799183, "learning_rate": 4.850129931485709e-06, "loss": 0.6696, "step": 1693 }, { "epoch": 0.13757816941444, "grad_norm": 6.062165358428213, "learning_rate": 4.849905580491481e-06, "loss": 0.506, "step": 1694 }, { "epoch": 0.13765938439048162, "grad_norm": 4.30707595006544, "learning_rate": 4.849681066896224e-06, "loss": 0.5298, "step": 1695 }, { "epoch": 0.1377405993665232, "grad_norm": 4.645280994473025, "learning_rate": 4.849456390715471e-06, "loss": 0.5178, "step": 1696 }, { "epoch": 0.13782181434256477, "grad_norm": 4.408267231508574, "learning_rate": 4.849231551964771e-06, "loss": 0.5725, "step": 1697 }, { "epoch": 0.13790302931860635, "grad_norm": 9.45812967426135, "learning_rate": 4.849006550659681e-06, "loss": 0.6328, "step": 1698 }, { "epoch": 0.13798424429464792, "grad_norm": 4.491014201207106, "learning_rate": 4.84878138681577e-06, "loss": 0.5886, "step": 1699 }, { "epoch": 0.13806545927068953, "grad_norm": 6.727338627008777, "learning_rate": 4.848556060448617e-06, "loss": 0.5413, "step": 1700 }, { "epoch": 0.1381466742467311, "grad_norm": 6.965832541226805, "learning_rate": 4.848330571573815e-06, "loss": 0.582, "step": 1701 }, { "epoch": 0.13822788922277268, "grad_norm": 5.263090461409852, "learning_rate": 4.848104920206964e-06, "loss": 0.5413, "step": 1702 }, { "epoch": 0.13830910419881426, "grad_norm": 4.916463781199789, "learning_rate": 4.847879106363681e-06, "loss": 0.5164, "step": 1703 }, { "epoch": 0.13839031917485584, "grad_norm": 4.020484866288476, "learning_rate": 4.847653130059591e-06, "loss": 0.5987, "step": 1704 }, { "epoch": 0.1384715341508974, "grad_norm": 7.925299595269813, "learning_rate": 4.847426991310327e-06, "loss": 0.5019, "step": 1705 }, { "epoch": 0.13855274912693902, "grad_norm": 4.765085041505615, "learning_rate": 4.84720069013154e-06, "loss": 0.6024, "step": 1706 }, { "epoch": 0.1386339641029806, "grad_norm": 3.5389423477868704, "learning_rate": 4.846974226538887e-06, "loss": 0.5936, "step": 1707 }, { "epoch": 0.13871517907902217, "grad_norm": 4.437272587336693, "learning_rate": 4.846747600548039e-06, "loss": 0.6592, "step": 1708 }, { "epoch": 0.13879639405506375, "grad_norm": 6.923436463835136, "learning_rate": 4.8465208121746775e-06, "loss": 0.7827, "step": 1709 }, { "epoch": 0.13887760903110533, "grad_norm": 4.075078776605313, "learning_rate": 4.846293861434494e-06, "loss": 0.6561, "step": 1710 }, { "epoch": 0.13895882400714693, "grad_norm": 4.965543277677793, "learning_rate": 4.846066748343193e-06, "loss": 0.6394, "step": 1711 }, { "epoch": 0.1390400389831885, "grad_norm": 4.952847589325617, "learning_rate": 4.84583947291649e-06, "loss": 0.5429, "step": 1712 }, { "epoch": 0.13912125395923008, "grad_norm": 6.231762044145288, "learning_rate": 4.84561203517011e-06, "loss": 0.6042, "step": 1713 }, { "epoch": 0.13920246893527166, "grad_norm": 7.90195822251666, "learning_rate": 4.8453844351197906e-06, "loss": 0.6262, "step": 1714 }, { "epoch": 0.13928368391131324, "grad_norm": 5.163740585708867, "learning_rate": 4.845156672781283e-06, "loss": 0.4433, "step": 1715 }, { "epoch": 0.13936489888735482, "grad_norm": 6.772106592052472, "learning_rate": 4.844928748170343e-06, "loss": 0.5625, "step": 1716 }, { "epoch": 0.13944611386339642, "grad_norm": 7.967404236571118, "learning_rate": 4.844700661302745e-06, "loss": 0.5634, "step": 1717 }, { "epoch": 0.139527328839438, "grad_norm": 4.952354575898061, "learning_rate": 4.844472412194271e-06, "loss": 0.4938, "step": 1718 }, { "epoch": 0.13960854381547957, "grad_norm": 4.882182783614095, "learning_rate": 4.844244000860713e-06, "loss": 0.514, "step": 1719 }, { "epoch": 0.13968975879152115, "grad_norm": 4.517018593698755, "learning_rate": 4.844015427317878e-06, "loss": 0.5567, "step": 1720 }, { "epoch": 0.13977097376756273, "grad_norm": 5.353241935249897, "learning_rate": 4.84378669158158e-06, "loss": 0.6774, "step": 1721 }, { "epoch": 0.13985218874360433, "grad_norm": 4.71686990816346, "learning_rate": 4.843557793667647e-06, "loss": 0.5591, "step": 1722 }, { "epoch": 0.1399334037196459, "grad_norm": 4.136924741202888, "learning_rate": 4.843328733591918e-06, "loss": 0.711, "step": 1723 }, { "epoch": 0.1400146186956875, "grad_norm": 5.450901778292648, "learning_rate": 4.843099511370243e-06, "loss": 0.6455, "step": 1724 }, { "epoch": 0.14009583367172906, "grad_norm": 5.05521301047581, "learning_rate": 4.842870127018482e-06, "loss": 0.5929, "step": 1725 }, { "epoch": 0.14017704864777064, "grad_norm": 5.278566981624996, "learning_rate": 4.842640580552508e-06, "loss": 0.5624, "step": 1726 }, { "epoch": 0.14025826362381222, "grad_norm": 3.187653564030086, "learning_rate": 4.842410871988204e-06, "loss": 0.4295, "step": 1727 }, { "epoch": 0.14033947859985382, "grad_norm": 4.96925386948935, "learning_rate": 4.842181001341465e-06, "loss": 0.6622, "step": 1728 }, { "epoch": 0.1404206935758954, "grad_norm": 4.620002867331911, "learning_rate": 4.8419509686281965e-06, "loss": 0.6541, "step": 1729 }, { "epoch": 0.14050190855193698, "grad_norm": 4.351427980680414, "learning_rate": 4.841720773864315e-06, "loss": 0.5794, "step": 1730 }, { "epoch": 0.14058312352797855, "grad_norm": 6.055475530739616, "learning_rate": 4.84149041706575e-06, "loss": 0.6507, "step": 1731 }, { "epoch": 0.14066433850402013, "grad_norm": 4.39137607633036, "learning_rate": 4.8412598982484396e-06, "loss": 0.5077, "step": 1732 }, { "epoch": 0.14074555348006174, "grad_norm": 5.347025072258548, "learning_rate": 4.8410292174283356e-06, "loss": 0.5427, "step": 1733 }, { "epoch": 0.1408267684561033, "grad_norm": 5.982794959612033, "learning_rate": 4.840798374621399e-06, "loss": 0.6222, "step": 1734 }, { "epoch": 0.1409079834321449, "grad_norm": 4.786923948878069, "learning_rate": 4.8405673698436046e-06, "loss": 0.5887, "step": 1735 }, { "epoch": 0.14098919840818647, "grad_norm": 4.34136443925787, "learning_rate": 4.840336203110934e-06, "loss": 0.8337, "step": 1736 }, { "epoch": 0.14107041338422804, "grad_norm": 6.40416263003894, "learning_rate": 4.840104874439385e-06, "loss": 0.4646, "step": 1737 }, { "epoch": 0.14115162836026962, "grad_norm": 4.993914992295412, "learning_rate": 4.839873383844964e-06, "loss": 0.6091, "step": 1738 }, { "epoch": 0.14123284333631123, "grad_norm": 4.072501067349767, "learning_rate": 4.839641731343688e-06, "loss": 0.5803, "step": 1739 }, { "epoch": 0.1413140583123528, "grad_norm": 4.916864007591204, "learning_rate": 4.839409916951586e-06, "loss": 0.6042, "step": 1740 }, { "epoch": 0.14139527328839438, "grad_norm": 7.127573823763706, "learning_rate": 4.839177940684699e-06, "loss": 0.5263, "step": 1741 }, { "epoch": 0.14147648826443596, "grad_norm": 4.634413084773449, "learning_rate": 4.838945802559079e-06, "loss": 0.6171, "step": 1742 }, { "epoch": 0.14155770324047753, "grad_norm": 4.578421538253424, "learning_rate": 4.8387135025907885e-06, "loss": 0.5441, "step": 1743 }, { "epoch": 0.14163891821651914, "grad_norm": 4.586524758850179, "learning_rate": 4.8384810407959e-06, "loss": 0.5399, "step": 1744 }, { "epoch": 0.14172013319256072, "grad_norm": 5.918670545030279, "learning_rate": 4.8382484171905006e-06, "loss": 0.4686, "step": 1745 }, { "epoch": 0.1418013481686023, "grad_norm": 5.704207192778562, "learning_rate": 4.8380156317906855e-06, "loss": 0.6141, "step": 1746 }, { "epoch": 0.14188256314464387, "grad_norm": 4.921937978250558, "learning_rate": 4.837782684612562e-06, "loss": 0.5936, "step": 1747 }, { "epoch": 0.14196377812068545, "grad_norm": 8.366604146884352, "learning_rate": 4.83754957567225e-06, "loss": 0.5511, "step": 1748 }, { "epoch": 0.14204499309672702, "grad_norm": 3.91804902399355, "learning_rate": 4.837316304985879e-06, "loss": 0.6865, "step": 1749 }, { "epoch": 0.14212620807276863, "grad_norm": 7.888393422464691, "learning_rate": 4.8370828725695885e-06, "loss": 0.5439, "step": 1750 }, { "epoch": 0.1422074230488102, "grad_norm": 4.847280666549644, "learning_rate": 4.836849278439532e-06, "loss": 0.6774, "step": 1751 }, { "epoch": 0.14228863802485178, "grad_norm": 5.563719824565103, "learning_rate": 4.836615522611874e-06, "loss": 0.5419, "step": 1752 }, { "epoch": 0.14236985300089336, "grad_norm": 9.244694159995396, "learning_rate": 4.8363816051027875e-06, "loss": 0.5761, "step": 1753 }, { "epoch": 0.14245106797693494, "grad_norm": 4.37956947048255, "learning_rate": 4.8361475259284604e-06, "loss": 0.6103, "step": 1754 }, { "epoch": 0.14253228295297654, "grad_norm": 11.159158566456547, "learning_rate": 4.8359132851050875e-06, "loss": 0.4938, "step": 1755 }, { "epoch": 0.14261349792901812, "grad_norm": 5.121940842643738, "learning_rate": 4.835678882648878e-06, "loss": 0.7047, "step": 1756 }, { "epoch": 0.1426947129050597, "grad_norm": 4.634689837382904, "learning_rate": 4.8354443185760505e-06, "loss": 0.5441, "step": 1757 }, { "epoch": 0.14277592788110127, "grad_norm": 4.696643113982221, "learning_rate": 4.835209592902837e-06, "loss": 0.6011, "step": 1758 }, { "epoch": 0.14285714285714285, "grad_norm": 33.651614779388595, "learning_rate": 4.834974705645478e-06, "loss": 0.5516, "step": 1759 }, { "epoch": 0.14293835783318443, "grad_norm": 7.739691900140675, "learning_rate": 4.834739656820228e-06, "loss": 0.6453, "step": 1760 }, { "epoch": 0.14301957280922603, "grad_norm": 6.791535657044752, "learning_rate": 4.83450444644335e-06, "loss": 0.7097, "step": 1761 }, { "epoch": 0.1431007877852676, "grad_norm": 5.503204544875674, "learning_rate": 4.834269074531119e-06, "loss": 0.5335, "step": 1762 }, { "epoch": 0.14318200276130919, "grad_norm": 4.4158123615058935, "learning_rate": 4.834033541099822e-06, "loss": 0.5697, "step": 1763 }, { "epoch": 0.14326321773735076, "grad_norm": 8.45132186133372, "learning_rate": 4.833797846165758e-06, "loss": 0.575, "step": 1764 }, { "epoch": 0.14334443271339234, "grad_norm": 4.117636505890885, "learning_rate": 4.833561989745232e-06, "loss": 0.7003, "step": 1765 }, { "epoch": 0.14342564768943394, "grad_norm": 5.425663068958977, "learning_rate": 4.833325971854568e-06, "loss": 0.5898, "step": 1766 }, { "epoch": 0.14350686266547552, "grad_norm": 4.870653806817092, "learning_rate": 4.8330897925100966e-06, "loss": 0.6641, "step": 1767 }, { "epoch": 0.1435880776415171, "grad_norm": 4.047961871711326, "learning_rate": 4.8328534517281575e-06, "loss": 0.6453, "step": 1768 }, { "epoch": 0.14366929261755867, "grad_norm": 4.458252548214351, "learning_rate": 4.832616949525107e-06, "loss": 0.4695, "step": 1769 }, { "epoch": 0.14375050759360025, "grad_norm": 7.708237827828306, "learning_rate": 4.832380285917309e-06, "loss": 0.5208, "step": 1770 }, { "epoch": 0.14383172256964183, "grad_norm": 6.499621307199643, "learning_rate": 4.8321434609211386e-06, "loss": 0.6735, "step": 1771 }, { "epoch": 0.14391293754568343, "grad_norm": 5.872483962693705, "learning_rate": 4.831906474552983e-06, "loss": 0.467, "step": 1772 }, { "epoch": 0.143994152521725, "grad_norm": 3.669849200249159, "learning_rate": 4.831669326829242e-06, "loss": 0.6378, "step": 1773 }, { "epoch": 0.1440753674977666, "grad_norm": 4.3559205214849674, "learning_rate": 4.831432017766323e-06, "loss": 0.652, "step": 1774 }, { "epoch": 0.14415658247380816, "grad_norm": 4.444082580638693, "learning_rate": 4.831194547380647e-06, "loss": 0.6826, "step": 1775 }, { "epoch": 0.14423779744984974, "grad_norm": 7.791669775176109, "learning_rate": 4.830956915688647e-06, "loss": 0.5328, "step": 1776 }, { "epoch": 0.14431901242589135, "grad_norm": 3.828853400773109, "learning_rate": 4.830719122706764e-06, "loss": 0.7301, "step": 1777 }, { "epoch": 0.14440022740193292, "grad_norm": 4.69526829505865, "learning_rate": 4.830481168451453e-06, "loss": 0.6871, "step": 1778 }, { "epoch": 0.1444814423779745, "grad_norm": 3.7376839261484927, "learning_rate": 4.830243052939179e-06, "loss": 0.6172, "step": 1779 }, { "epoch": 0.14456265735401608, "grad_norm": 5.5007109017935365, "learning_rate": 4.830004776186419e-06, "loss": 0.6784, "step": 1780 }, { "epoch": 0.14464387233005765, "grad_norm": 9.156018680302578, "learning_rate": 4.82976633820966e-06, "loss": 0.5217, "step": 1781 }, { "epoch": 0.14472508730609923, "grad_norm": 3.661157732313108, "learning_rate": 4.829527739025399e-06, "loss": 0.7003, "step": 1782 }, { "epoch": 0.14480630228214084, "grad_norm": 6.275804371905447, "learning_rate": 4.829288978650149e-06, "loss": 0.5237, "step": 1783 }, { "epoch": 0.1448875172581824, "grad_norm": 6.549033525463337, "learning_rate": 4.829050057100428e-06, "loss": 0.5854, "step": 1784 }, { "epoch": 0.144968732234224, "grad_norm": 3.236921347753896, "learning_rate": 4.82881097439277e-06, "loss": 0.6303, "step": 1785 }, { "epoch": 0.14504994721026557, "grad_norm": 5.188648638663512, "learning_rate": 4.828571730543718e-06, "loss": 0.5043, "step": 1786 }, { "epoch": 0.14513116218630714, "grad_norm": 4.430201943407788, "learning_rate": 4.828332325569825e-06, "loss": 0.6505, "step": 1787 }, { "epoch": 0.14521237716234875, "grad_norm": 4.8572855854705645, "learning_rate": 4.828092759487658e-06, "loss": 0.6374, "step": 1788 }, { "epoch": 0.14529359213839033, "grad_norm": 18.826306527694214, "learning_rate": 4.827853032313793e-06, "loss": 0.4907, "step": 1789 }, { "epoch": 0.1453748071144319, "grad_norm": 7.143933819169128, "learning_rate": 4.827613144064819e-06, "loss": 0.5695, "step": 1790 }, { "epoch": 0.14545602209047348, "grad_norm": 5.6480765506427755, "learning_rate": 4.827373094757334e-06, "loss": 0.6081, "step": 1791 }, { "epoch": 0.14553723706651506, "grad_norm": 3.6245079382385987, "learning_rate": 4.827132884407948e-06, "loss": 0.6509, "step": 1792 }, { "epoch": 0.14561845204255663, "grad_norm": 4.493541124449762, "learning_rate": 4.826892513033283e-06, "loss": 0.6714, "step": 1793 }, { "epoch": 0.14569966701859824, "grad_norm": 4.222014821722098, "learning_rate": 4.8266519806499705e-06, "loss": 0.6869, "step": 1794 }, { "epoch": 0.14578088199463982, "grad_norm": 5.40579336748145, "learning_rate": 4.826411287274655e-06, "loss": 0.5754, "step": 1795 }, { "epoch": 0.1458620969706814, "grad_norm": 7.803642190908184, "learning_rate": 4.82617043292399e-06, "loss": 0.5751, "step": 1796 }, { "epoch": 0.14594331194672297, "grad_norm": 5.10891502640099, "learning_rate": 4.825929417614643e-06, "loss": 0.5584, "step": 1797 }, { "epoch": 0.14602452692276455, "grad_norm": 5.342228163843412, "learning_rate": 4.825688241363289e-06, "loss": 0.6378, "step": 1798 }, { "epoch": 0.14610574189880615, "grad_norm": 3.4939671214065506, "learning_rate": 4.825446904186619e-06, "loss": 0.7725, "step": 1799 }, { "epoch": 0.14618695687484773, "grad_norm": 7.740693551879523, "learning_rate": 4.825205406101328e-06, "loss": 0.5815, "step": 1800 }, { "epoch": 0.1462681718508893, "grad_norm": 3.8127593882048676, "learning_rate": 4.824963747124132e-06, "loss": 0.6857, "step": 1801 }, { "epoch": 0.14634938682693088, "grad_norm": 7.341843917850864, "learning_rate": 4.824721927271747e-06, "loss": 0.5974, "step": 1802 }, { "epoch": 0.14643060180297246, "grad_norm": 5.649042550435367, "learning_rate": 4.8244799465609095e-06, "loss": 0.6106, "step": 1803 }, { "epoch": 0.14651181677901404, "grad_norm": 6.644722885021702, "learning_rate": 4.82423780500836e-06, "loss": 0.5848, "step": 1804 }, { "epoch": 0.14659303175505564, "grad_norm": 3.4917280474695853, "learning_rate": 4.823995502630857e-06, "loss": 0.5819, "step": 1805 }, { "epoch": 0.14667424673109722, "grad_norm": 4.938957664863549, "learning_rate": 4.823753039445164e-06, "loss": 0.6797, "step": 1806 }, { "epoch": 0.1467554617071388, "grad_norm": 3.8798922539217955, "learning_rate": 4.823510415468059e-06, "loss": 0.6352, "step": 1807 }, { "epoch": 0.14683667668318037, "grad_norm": 4.629237568082465, "learning_rate": 4.82326763071633e-06, "loss": 0.6874, "step": 1808 }, { "epoch": 0.14691789165922195, "grad_norm": 4.720441802830114, "learning_rate": 4.8230246852067784e-06, "loss": 0.587, "step": 1809 }, { "epoch": 0.14699910663526355, "grad_norm": 5.910430878822207, "learning_rate": 4.822781578956212e-06, "loss": 0.4856, "step": 1810 }, { "epoch": 0.14708032161130513, "grad_norm": 4.138152132805585, "learning_rate": 4.8225383119814526e-06, "loss": 0.6528, "step": 1811 }, { "epoch": 0.1471615365873467, "grad_norm": 4.151700339228477, "learning_rate": 4.822294884299335e-06, "loss": 0.552, "step": 1812 }, { "epoch": 0.14724275156338829, "grad_norm": 4.1614483287452915, "learning_rate": 4.822051295926701e-06, "loss": 0.7129, "step": 1813 }, { "epoch": 0.14732396653942986, "grad_norm": 6.707224091798943, "learning_rate": 4.821807546880407e-06, "loss": 0.6328, "step": 1814 }, { "epoch": 0.14740518151547144, "grad_norm": 6.368579491491943, "learning_rate": 4.8215636371773186e-06, "loss": 0.7908, "step": 1815 }, { "epoch": 0.14748639649151304, "grad_norm": 4.199767598115914, "learning_rate": 4.821319566834314e-06, "loss": 0.5206, "step": 1816 }, { "epoch": 0.14756761146755462, "grad_norm": 4.709792455873804, "learning_rate": 4.82107533586828e-06, "loss": 0.6195, "step": 1817 }, { "epoch": 0.1476488264435962, "grad_norm": 4.319113558733222, "learning_rate": 4.820830944296117e-06, "loss": 0.6602, "step": 1818 }, { "epoch": 0.14773004141963778, "grad_norm": 5.544772853522585, "learning_rate": 4.820586392134735e-06, "loss": 0.5952, "step": 1819 }, { "epoch": 0.14781125639567935, "grad_norm": 4.015521528170778, "learning_rate": 4.820341679401057e-06, "loss": 0.5967, "step": 1820 }, { "epoch": 0.14789247137172096, "grad_norm": 3.9474000516118215, "learning_rate": 4.820096806112015e-06, "loss": 0.6292, "step": 1821 }, { "epoch": 0.14797368634776253, "grad_norm": 4.8859410343320615, "learning_rate": 4.8198517722845524e-06, "loss": 0.4472, "step": 1822 }, { "epoch": 0.1480549013238041, "grad_norm": 4.277013788450521, "learning_rate": 4.819606577935626e-06, "loss": 0.6099, "step": 1823 }, { "epoch": 0.1481361162998457, "grad_norm": 4.8797029682772095, "learning_rate": 4.8193612230822e-06, "loss": 0.598, "step": 1824 }, { "epoch": 0.14821733127588727, "grad_norm": 7.742169376350744, "learning_rate": 4.819115707741252e-06, "loss": 0.6474, "step": 1825 }, { "epoch": 0.14829854625192884, "grad_norm": 5.12268577416012, "learning_rate": 4.818870031929771e-06, "loss": 0.4876, "step": 1826 }, { "epoch": 0.14837976122797045, "grad_norm": 3.7709678599136605, "learning_rate": 4.818624195664756e-06, "loss": 0.727, "step": 1827 }, { "epoch": 0.14846097620401202, "grad_norm": 2.5702762627716833, "learning_rate": 4.818378198963218e-06, "loss": 0.7224, "step": 1828 }, { "epoch": 0.1485421911800536, "grad_norm": 3.097076580981341, "learning_rate": 4.81813204184218e-06, "loss": 0.6025, "step": 1829 }, { "epoch": 0.14862340615609518, "grad_norm": 7.135529520333667, "learning_rate": 4.817885724318671e-06, "loss": 0.6685, "step": 1830 }, { "epoch": 0.14870462113213676, "grad_norm": 3.4538569552830003, "learning_rate": 4.817639246409738e-06, "loss": 0.6588, "step": 1831 }, { "epoch": 0.14878583610817836, "grad_norm": 4.3881359378371965, "learning_rate": 4.817392608132435e-06, "loss": 0.4665, "step": 1832 }, { "epoch": 0.14886705108421994, "grad_norm": 3.7661484270198304, "learning_rate": 4.817145809503828e-06, "loss": 0.5136, "step": 1833 }, { "epoch": 0.1489482660602615, "grad_norm": 5.494631448466639, "learning_rate": 4.816898850540995e-06, "loss": 0.5539, "step": 1834 }, { "epoch": 0.1490294810363031, "grad_norm": 10.552148803818461, "learning_rate": 4.816651731261023e-06, "loss": 0.6492, "step": 1835 }, { "epoch": 0.14911069601234467, "grad_norm": 4.963897986383459, "learning_rate": 4.816404451681012e-06, "loss": 0.6898, "step": 1836 }, { "epoch": 0.14919191098838624, "grad_norm": 4.656896578348469, "learning_rate": 4.816157011818073e-06, "loss": 0.6373, "step": 1837 }, { "epoch": 0.14927312596442785, "grad_norm": 6.1635228629484935, "learning_rate": 4.815909411689326e-06, "loss": 0.61, "step": 1838 }, { "epoch": 0.14935434094046943, "grad_norm": 5.077711445021842, "learning_rate": 4.815661651311905e-06, "loss": 0.5255, "step": 1839 }, { "epoch": 0.149435555916511, "grad_norm": 3.5697847643358926, "learning_rate": 4.815413730702953e-06, "loss": 0.5737, "step": 1840 }, { "epoch": 0.14951677089255258, "grad_norm": 4.3625630887558025, "learning_rate": 4.8151656498796245e-06, "loss": 0.5621, "step": 1841 }, { "epoch": 0.14959798586859416, "grad_norm": 3.632848050826748, "learning_rate": 4.814917408859087e-06, "loss": 0.6579, "step": 1842 }, { "epoch": 0.14967920084463576, "grad_norm": 5.537325255170911, "learning_rate": 4.8146690076585145e-06, "loss": 0.6326, "step": 1843 }, { "epoch": 0.14976041582067734, "grad_norm": 4.502954580803916, "learning_rate": 4.8144204462950985e-06, "loss": 0.5227, "step": 1844 }, { "epoch": 0.14984163079671892, "grad_norm": 9.989837223985806, "learning_rate": 4.8141717247860355e-06, "loss": 0.5365, "step": 1845 }, { "epoch": 0.1499228457727605, "grad_norm": 9.268156392807445, "learning_rate": 4.813922843148537e-06, "loss": 0.7256, "step": 1846 }, { "epoch": 0.15000406074880207, "grad_norm": 4.723473199759222, "learning_rate": 4.813673801399825e-06, "loss": 0.6814, "step": 1847 }, { "epoch": 0.15008527572484365, "grad_norm": 5.682840300388035, "learning_rate": 4.81342459955713e-06, "loss": 0.6083, "step": 1848 }, { "epoch": 0.15016649070088525, "grad_norm": 3.7968600757264417, "learning_rate": 4.813175237637697e-06, "loss": 0.5635, "step": 1849 }, { "epoch": 0.15024770567692683, "grad_norm": 4.69369222204755, "learning_rate": 4.812925715658779e-06, "loss": 0.5237, "step": 1850 }, { "epoch": 0.1503289206529684, "grad_norm": 4.060309446961183, "learning_rate": 4.812676033637643e-06, "loss": 0.6771, "step": 1851 }, { "epoch": 0.15041013562900998, "grad_norm": 4.068231340082502, "learning_rate": 4.812426191591565e-06, "loss": 0.4976, "step": 1852 }, { "epoch": 0.15049135060505156, "grad_norm": 2.940489927194899, "learning_rate": 4.812176189537833e-06, "loss": 0.6372, "step": 1853 }, { "epoch": 0.15057256558109317, "grad_norm": 18.577063761435014, "learning_rate": 4.811926027493745e-06, "loss": 0.5688, "step": 1854 }, { "epoch": 0.15065378055713474, "grad_norm": 9.15886334831888, "learning_rate": 4.811675705476613e-06, "loss": 0.5274, "step": 1855 }, { "epoch": 0.15073499553317632, "grad_norm": 8.116139489468573, "learning_rate": 4.811425223503755e-06, "loss": 0.6006, "step": 1856 }, { "epoch": 0.1508162105092179, "grad_norm": 5.340837408995612, "learning_rate": 4.811174581592506e-06, "loss": 0.4627, "step": 1857 }, { "epoch": 0.15089742548525947, "grad_norm": 7.861074956324514, "learning_rate": 4.810923779760207e-06, "loss": 0.6517, "step": 1858 }, { "epoch": 0.15097864046130105, "grad_norm": 5.046613794107396, "learning_rate": 4.810672818024212e-06, "loss": 0.8007, "step": 1859 }, { "epoch": 0.15105985543734265, "grad_norm": 3.78641177578102, "learning_rate": 4.810421696401889e-06, "loss": 0.4462, "step": 1860 }, { "epoch": 0.15114107041338423, "grad_norm": 5.953501964167005, "learning_rate": 4.810170414910611e-06, "loss": 0.6483, "step": 1861 }, { "epoch": 0.1512222853894258, "grad_norm": 4.508454868931279, "learning_rate": 4.809918973567767e-06, "loss": 0.4402, "step": 1862 }, { "epoch": 0.15130350036546739, "grad_norm": 6.1091026815566645, "learning_rate": 4.809667372390755e-06, "loss": 0.5401, "step": 1863 }, { "epoch": 0.15138471534150896, "grad_norm": 4.269870818997486, "learning_rate": 4.809415611396984e-06, "loss": 0.6984, "step": 1864 }, { "epoch": 0.15146593031755057, "grad_norm": 8.488877241499456, "learning_rate": 4.809163690603877e-06, "loss": 0.6111, "step": 1865 }, { "epoch": 0.15154714529359214, "grad_norm": 5.571115307591249, "learning_rate": 4.808911610028861e-06, "loss": 0.5755, "step": 1866 }, { "epoch": 0.15162836026963372, "grad_norm": 5.592522309071258, "learning_rate": 4.808659369689384e-06, "loss": 0.7285, "step": 1867 }, { "epoch": 0.1517095752456753, "grad_norm": 3.3631906733071535, "learning_rate": 4.808406969602895e-06, "loss": 0.6066, "step": 1868 }, { "epoch": 0.15179079022171688, "grad_norm": 5.558011990131933, "learning_rate": 4.8081544097868615e-06, "loss": 0.599, "step": 1869 }, { "epoch": 0.15187200519775845, "grad_norm": 4.196854563685466, "learning_rate": 4.8079016902587586e-06, "loss": 0.6429, "step": 1870 }, { "epoch": 0.15195322017380006, "grad_norm": 5.663489456766142, "learning_rate": 4.807648811036073e-06, "loss": 0.4956, "step": 1871 }, { "epoch": 0.15203443514984163, "grad_norm": 4.348733009474466, "learning_rate": 4.807395772136303e-06, "loss": 0.5331, "step": 1872 }, { "epoch": 0.1521156501258832, "grad_norm": 5.237186173256271, "learning_rate": 4.807142573576958e-06, "loss": 0.7043, "step": 1873 }, { "epoch": 0.1521968651019248, "grad_norm": 5.71550907696815, "learning_rate": 4.806889215375556e-06, "loss": 0.491, "step": 1874 }, { "epoch": 0.15227808007796637, "grad_norm": 5.112913674384685, "learning_rate": 4.80663569754963e-06, "loss": 0.5455, "step": 1875 }, { "epoch": 0.15235929505400797, "grad_norm": 4.422551589450753, "learning_rate": 4.806382020116721e-06, "loss": 0.6936, "step": 1876 }, { "epoch": 0.15244051003004955, "grad_norm": 4.418250640226396, "learning_rate": 4.806128183094383e-06, "loss": 0.6371, "step": 1877 }, { "epoch": 0.15252172500609112, "grad_norm": 4.611528104096768, "learning_rate": 4.805874186500179e-06, "loss": 0.5783, "step": 1878 }, { "epoch": 0.1526029399821327, "grad_norm": 4.3459558003142345, "learning_rate": 4.805620030351686e-06, "loss": 0.6227, "step": 1879 }, { "epoch": 0.15268415495817428, "grad_norm": 6.170103993151758, "learning_rate": 4.805365714666489e-06, "loss": 0.5419, "step": 1880 }, { "epoch": 0.15276536993421586, "grad_norm": 2.958664035946074, "learning_rate": 4.805111239462185e-06, "loss": 0.7513, "step": 1881 }, { "epoch": 0.15284658491025746, "grad_norm": 13.416473280689857, "learning_rate": 4.8048566047563835e-06, "loss": 0.4584, "step": 1882 }, { "epoch": 0.15292779988629904, "grad_norm": 4.99824146528109, "learning_rate": 4.8046018105667024e-06, "loss": 0.6614, "step": 1883 }, { "epoch": 0.15300901486234061, "grad_norm": 9.102398270856987, "learning_rate": 4.8043468569107735e-06, "loss": 0.7377, "step": 1884 }, { "epoch": 0.1530902298383822, "grad_norm": 3.746424579208254, "learning_rate": 4.804091743806237e-06, "loss": 0.4525, "step": 1885 }, { "epoch": 0.15317144481442377, "grad_norm": 6.368130414333825, "learning_rate": 4.803836471270748e-06, "loss": 0.5629, "step": 1886 }, { "epoch": 0.15325265979046537, "grad_norm": 6.446103551439812, "learning_rate": 4.803581039321966e-06, "loss": 0.5416, "step": 1887 }, { "epoch": 0.15333387476650695, "grad_norm": 5.229434076566928, "learning_rate": 4.803325447977568e-06, "loss": 0.5842, "step": 1888 }, { "epoch": 0.15341508974254853, "grad_norm": 5.954690104051001, "learning_rate": 4.80306969725524e-06, "loss": 0.5898, "step": 1889 }, { "epoch": 0.1534963047185901, "grad_norm": 6.150359963901614, "learning_rate": 4.802813787172678e-06, "loss": 0.5839, "step": 1890 }, { "epoch": 0.15357751969463168, "grad_norm": 5.125992935197739, "learning_rate": 4.802557717747588e-06, "loss": 0.5668, "step": 1891 }, { "epoch": 0.15365873467067329, "grad_norm": 4.364632434646274, "learning_rate": 4.802301488997691e-06, "loss": 0.4172, "step": 1892 }, { "epoch": 0.15373994964671486, "grad_norm": 4.882188374905198, "learning_rate": 4.802045100940715e-06, "loss": 0.6242, "step": 1893 }, { "epoch": 0.15382116462275644, "grad_norm": 3.3401070060949403, "learning_rate": 4.801788553594403e-06, "loss": 0.5872, "step": 1894 }, { "epoch": 0.15390237959879802, "grad_norm": 4.868516648657934, "learning_rate": 4.801531846976504e-06, "loss": 0.5688, "step": 1895 }, { "epoch": 0.1539835945748396, "grad_norm": 5.105870944801884, "learning_rate": 4.801274981104781e-06, "loss": 0.6434, "step": 1896 }, { "epoch": 0.15406480955088117, "grad_norm": 3.8171439187123903, "learning_rate": 4.80101795599701e-06, "loss": 0.5289, "step": 1897 }, { "epoch": 0.15414602452692278, "grad_norm": 16.280448607344095, "learning_rate": 4.800760771670974e-06, "loss": 0.4721, "step": 1898 }, { "epoch": 0.15422723950296435, "grad_norm": 3.7231117337427055, "learning_rate": 4.800503428144469e-06, "loss": 0.6052, "step": 1899 }, { "epoch": 0.15430845447900593, "grad_norm": 3.648455745421052, "learning_rate": 4.800245925435302e-06, "loss": 0.6106, "step": 1900 }, { "epoch": 0.1543896694550475, "grad_norm": 6.0421687476954995, "learning_rate": 4.7999882635612916e-06, "loss": 0.5272, "step": 1901 }, { "epoch": 0.15447088443108908, "grad_norm": 4.462318149762371, "learning_rate": 4.799730442540265e-06, "loss": 0.4801, "step": 1902 }, { "epoch": 0.1545520994071307, "grad_norm": 5.097311868178371, "learning_rate": 4.7994724623900636e-06, "loss": 0.6126, "step": 1903 }, { "epoch": 0.15463331438317227, "grad_norm": 10.491382918494583, "learning_rate": 4.799214323128537e-06, "loss": 0.7263, "step": 1904 }, { "epoch": 0.15471452935921384, "grad_norm": 5.632700323452676, "learning_rate": 4.798956024773548e-06, "loss": 0.427, "step": 1905 }, { "epoch": 0.15479574433525542, "grad_norm": 4.125424526878242, "learning_rate": 4.798697567342969e-06, "loss": 0.8174, "step": 1906 }, { "epoch": 0.154876959311297, "grad_norm": 3.2529153125095682, "learning_rate": 4.798438950854685e-06, "loss": 0.5607, "step": 1907 }, { "epoch": 0.15495817428733857, "grad_norm": 6.639548071036903, "learning_rate": 4.798180175326589e-06, "loss": 0.6207, "step": 1908 }, { "epoch": 0.15503938926338018, "grad_norm": 3.419026777929521, "learning_rate": 4.797921240776587e-06, "loss": 0.6548, "step": 1909 }, { "epoch": 0.15512060423942176, "grad_norm": 6.3759242855440315, "learning_rate": 4.797662147222598e-06, "loss": 0.722, "step": 1910 }, { "epoch": 0.15520181921546333, "grad_norm": 4.2074330009622525, "learning_rate": 4.797402894682548e-06, "loss": 0.7711, "step": 1911 }, { "epoch": 0.1552830341915049, "grad_norm": 5.86918562130916, "learning_rate": 4.797143483174377e-06, "loss": 0.5241, "step": 1912 }, { "epoch": 0.1553642491675465, "grad_norm": 4.110856189622478, "learning_rate": 4.796883912716034e-06, "loss": 0.6855, "step": 1913 }, { "epoch": 0.1554454641435881, "grad_norm": 4.190357484842863, "learning_rate": 4.79662418332548e-06, "loss": 0.616, "step": 1914 }, { "epoch": 0.15552667911962967, "grad_norm": 6.818021899205355, "learning_rate": 4.796364295020688e-06, "loss": 0.554, "step": 1915 }, { "epoch": 0.15560789409567125, "grad_norm": 4.400441609364029, "learning_rate": 4.7961042478196394e-06, "loss": 0.5425, "step": 1916 }, { "epoch": 0.15568910907171282, "grad_norm": 3.755867976669089, "learning_rate": 4.7958440417403295e-06, "loss": 0.5295, "step": 1917 }, { "epoch": 0.1557703240477544, "grad_norm": 3.799331044287829, "learning_rate": 4.795583676800762e-06, "loss": 0.4355, "step": 1918 }, { "epoch": 0.15585153902379598, "grad_norm": 5.47739825987387, "learning_rate": 4.795323153018953e-06, "loss": 0.6229, "step": 1919 }, { "epoch": 0.15593275399983758, "grad_norm": 4.656969062795371, "learning_rate": 4.795062470412931e-06, "loss": 0.6301, "step": 1920 }, { "epoch": 0.15601396897587916, "grad_norm": 4.914269261314513, "learning_rate": 4.794801629000732e-06, "loss": 0.6974, "step": 1921 }, { "epoch": 0.15609518395192074, "grad_norm": 6.004759949716761, "learning_rate": 4.794540628800405e-06, "loss": 0.6272, "step": 1922 }, { "epoch": 0.1561763989279623, "grad_norm": 5.879866804838136, "learning_rate": 4.79427946983001e-06, "loss": 0.624, "step": 1923 }, { "epoch": 0.1562576139040039, "grad_norm": 4.9010570268519835, "learning_rate": 4.794018152107618e-06, "loss": 0.5972, "step": 1924 }, { "epoch": 0.1563388288800455, "grad_norm": 6.1005563812014625, "learning_rate": 4.793756675651311e-06, "loss": 0.5372, "step": 1925 }, { "epoch": 0.15642004385608707, "grad_norm": 7.657069975595423, "learning_rate": 4.7934950404791815e-06, "loss": 0.5059, "step": 1926 }, { "epoch": 0.15650125883212865, "grad_norm": 3.7164386302842325, "learning_rate": 4.793233246609333e-06, "loss": 0.5145, "step": 1927 }, { "epoch": 0.15658247380817022, "grad_norm": 6.7098122658626025, "learning_rate": 4.792971294059882e-06, "loss": 0.5909, "step": 1928 }, { "epoch": 0.1566636887842118, "grad_norm": 4.5647090456408765, "learning_rate": 4.792709182848951e-06, "loss": 0.5851, "step": 1929 }, { "epoch": 0.15674490376025338, "grad_norm": 5.659279097889884, "learning_rate": 4.792446912994679e-06, "loss": 0.509, "step": 1930 }, { "epoch": 0.15682611873629498, "grad_norm": 4.660823647963301, "learning_rate": 4.792184484515214e-06, "loss": 0.6214, "step": 1931 }, { "epoch": 0.15690733371233656, "grad_norm": 5.379888896939118, "learning_rate": 4.791921897428714e-06, "loss": 0.6083, "step": 1932 }, { "epoch": 0.15698854868837814, "grad_norm": 3.7091543807048555, "learning_rate": 4.791659151753348e-06, "loss": 0.5366, "step": 1933 }, { "epoch": 0.15706976366441971, "grad_norm": 4.46536923401703, "learning_rate": 4.791396247507297e-06, "loss": 0.5548, "step": 1934 }, { "epoch": 0.1571509786404613, "grad_norm": 3.0478693770891, "learning_rate": 4.791133184708753e-06, "loss": 0.6562, "step": 1935 }, { "epoch": 0.1572321936165029, "grad_norm": 5.052248542138617, "learning_rate": 4.790869963375918e-06, "loss": 0.6266, "step": 1936 }, { "epoch": 0.15731340859254447, "grad_norm": 6.299780666458338, "learning_rate": 4.790606583527006e-06, "loss": 0.6602, "step": 1937 }, { "epoch": 0.15739462356858605, "grad_norm": 7.111506262099773, "learning_rate": 4.790343045180242e-06, "loss": 0.6822, "step": 1938 }, { "epoch": 0.15747583854462763, "grad_norm": 3.5717735290621015, "learning_rate": 4.790079348353859e-06, "loss": 0.5579, "step": 1939 }, { "epoch": 0.1575570535206692, "grad_norm": 4.539252075565922, "learning_rate": 4.789815493066106e-06, "loss": 0.5513, "step": 1940 }, { "epoch": 0.15763826849671078, "grad_norm": 3.19405646131031, "learning_rate": 4.78955147933524e-06, "loss": 0.6305, "step": 1941 }, { "epoch": 0.1577194834727524, "grad_norm": 3.9600914109632805, "learning_rate": 4.7892873071795285e-06, "loss": 0.5575, "step": 1942 }, { "epoch": 0.15780069844879396, "grad_norm": 5.033361013107082, "learning_rate": 4.789022976617251e-06, "loss": 0.6199, "step": 1943 }, { "epoch": 0.15788191342483554, "grad_norm": 15.306333789764619, "learning_rate": 4.7887584876666984e-06, "loss": 0.458, "step": 1944 }, { "epoch": 0.15796312840087712, "grad_norm": 4.337274209547185, "learning_rate": 4.788493840346172e-06, "loss": 0.691, "step": 1945 }, { "epoch": 0.1580443433769187, "grad_norm": 4.2820007012074415, "learning_rate": 4.788229034673983e-06, "loss": 0.7317, "step": 1946 }, { "epoch": 0.1581255583529603, "grad_norm": 4.361412328827347, "learning_rate": 4.787964070668455e-06, "loss": 0.5208, "step": 1947 }, { "epoch": 0.15820677332900188, "grad_norm": 4.543017266735986, "learning_rate": 4.787698948347922e-06, "loss": 0.7488, "step": 1948 }, { "epoch": 0.15828798830504345, "grad_norm": 6.9360624754566444, "learning_rate": 4.78743366773073e-06, "loss": 0.478, "step": 1949 }, { "epoch": 0.15836920328108503, "grad_norm": 4.321466474176901, "learning_rate": 4.787168228835234e-06, "loss": 0.8373, "step": 1950 }, { "epoch": 0.1584504182571266, "grad_norm": 10.29187836219349, "learning_rate": 4.7869026316798005e-06, "loss": 0.5927, "step": 1951 }, { "epoch": 0.15853163323316818, "grad_norm": 5.206315709924438, "learning_rate": 4.7866368762828095e-06, "loss": 0.7284, "step": 1952 }, { "epoch": 0.1586128482092098, "grad_norm": 4.695971985775556, "learning_rate": 4.786370962662647e-06, "loss": 0.6129, "step": 1953 }, { "epoch": 0.15869406318525137, "grad_norm": 3.810668859896304, "learning_rate": 4.786104890837715e-06, "loss": 0.7817, "step": 1954 }, { "epoch": 0.15877527816129294, "grad_norm": 5.945278033090391, "learning_rate": 4.785838660826424e-06, "loss": 0.4929, "step": 1955 }, { "epoch": 0.15885649313733452, "grad_norm": 5.613790832997751, "learning_rate": 4.785572272647196e-06, "loss": 0.5365, "step": 1956 }, { "epoch": 0.1589377081133761, "grad_norm": 12.194006996853599, "learning_rate": 4.785305726318461e-06, "loss": 0.5274, "step": 1957 }, { "epoch": 0.1590189230894177, "grad_norm": 6.67246997329038, "learning_rate": 4.785039021858665e-06, "loss": 0.5129, "step": 1958 }, { "epoch": 0.15910013806545928, "grad_norm": 4.361772829570909, "learning_rate": 4.784772159286263e-06, "loss": 0.5762, "step": 1959 }, { "epoch": 0.15918135304150086, "grad_norm": 3.6171339798903115, "learning_rate": 4.784505138619719e-06, "loss": 0.5687, "step": 1960 }, { "epoch": 0.15926256801754243, "grad_norm": 3.3498401020879696, "learning_rate": 4.78423795987751e-06, "loss": 0.6731, "step": 1961 }, { "epoch": 0.159343782993584, "grad_norm": 4.218304560758474, "learning_rate": 4.783970623078124e-06, "loss": 0.5832, "step": 1962 }, { "epoch": 0.1594249979696256, "grad_norm": 3.2954022912979473, "learning_rate": 4.783703128240058e-06, "loss": 0.626, "step": 1963 }, { "epoch": 0.1595062129456672, "grad_norm": 5.637076756015832, "learning_rate": 4.783435475381822e-06, "loss": 0.549, "step": 1964 }, { "epoch": 0.15958742792170877, "grad_norm": 4.690580073274557, "learning_rate": 4.7831676645219364e-06, "loss": 0.6577, "step": 1965 }, { "epoch": 0.15966864289775035, "grad_norm": 5.3749569715672125, "learning_rate": 4.782899695678931e-06, "loss": 0.5685, "step": 1966 }, { "epoch": 0.15974985787379192, "grad_norm": 6.809233492187388, "learning_rate": 4.782631568871349e-06, "loss": 0.6891, "step": 1967 }, { "epoch": 0.1598310728498335, "grad_norm": 4.530421811396376, "learning_rate": 4.782363284117744e-06, "loss": 0.3833, "step": 1968 }, { "epoch": 0.1599122878258751, "grad_norm": 4.835132250782642, "learning_rate": 4.782094841436677e-06, "loss": 0.4926, "step": 1969 }, { "epoch": 0.15999350280191668, "grad_norm": 3.8849675683937184, "learning_rate": 4.781826240846726e-06, "loss": 0.7052, "step": 1970 }, { "epoch": 0.16007471777795826, "grad_norm": 4.260063677720038, "learning_rate": 4.781557482366477e-06, "loss": 0.6791, "step": 1971 }, { "epoch": 0.16015593275399984, "grad_norm": 9.298148375842267, "learning_rate": 4.781288566014524e-06, "loss": 0.6356, "step": 1972 }, { "epoch": 0.1602371477300414, "grad_norm": 4.484161226398304, "learning_rate": 4.781019491809475e-06, "loss": 0.5682, "step": 1973 }, { "epoch": 0.160318362706083, "grad_norm": 3.7283315441063984, "learning_rate": 4.78075025976995e-06, "loss": 0.6072, "step": 1974 }, { "epoch": 0.1603995776821246, "grad_norm": 6.059353678550269, "learning_rate": 4.780480869914578e-06, "loss": 0.5845, "step": 1975 }, { "epoch": 0.16048079265816617, "grad_norm": 4.554935916876435, "learning_rate": 4.780211322261998e-06, "loss": 0.5095, "step": 1976 }, { "epoch": 0.16056200763420775, "grad_norm": 6.981060106812001, "learning_rate": 4.779941616830863e-06, "loss": 0.5098, "step": 1977 }, { "epoch": 0.16064322261024933, "grad_norm": 4.9440805505487, "learning_rate": 4.779671753639835e-06, "loss": 0.7675, "step": 1978 }, { "epoch": 0.1607244375862909, "grad_norm": 8.280400518698295, "learning_rate": 4.779401732707586e-06, "loss": 0.5639, "step": 1979 }, { "epoch": 0.1608056525623325, "grad_norm": 6.056327257248986, "learning_rate": 4.779131554052801e-06, "loss": 0.5981, "step": 1980 }, { "epoch": 0.16088686753837408, "grad_norm": 5.385241894979969, "learning_rate": 4.778861217694174e-06, "loss": 0.6512, "step": 1981 }, { "epoch": 0.16096808251441566, "grad_norm": 4.550144423102369, "learning_rate": 4.778590723650413e-06, "loss": 0.5836, "step": 1982 }, { "epoch": 0.16104929749045724, "grad_norm": 4.3719124593201055, "learning_rate": 4.778320071940231e-06, "loss": 0.7309, "step": 1983 }, { "epoch": 0.16113051246649882, "grad_norm": 4.283507192707898, "learning_rate": 4.77804926258236e-06, "loss": 0.5897, "step": 1984 }, { "epoch": 0.1612117274425404, "grad_norm": 3.9610170071985524, "learning_rate": 4.777778295595535e-06, "loss": 0.6077, "step": 1985 }, { "epoch": 0.161292942418582, "grad_norm": 6.219189683636367, "learning_rate": 4.777507170998508e-06, "loss": 0.7406, "step": 1986 }, { "epoch": 0.16137415739462357, "grad_norm": 4.465198885140077, "learning_rate": 4.777235888810037e-06, "loss": 0.5908, "step": 1987 }, { "epoch": 0.16145537237066515, "grad_norm": 6.450650746265666, "learning_rate": 4.776964449048895e-06, "loss": 0.5436, "step": 1988 }, { "epoch": 0.16153658734670673, "grad_norm": 9.863744897399046, "learning_rate": 4.776692851733864e-06, "loss": 0.4796, "step": 1989 }, { "epoch": 0.1616178023227483, "grad_norm": 3.703087572235551, "learning_rate": 4.776421096883737e-06, "loss": 0.737, "step": 1990 }, { "epoch": 0.1616990172987899, "grad_norm": 5.885859853270686, "learning_rate": 4.776149184517318e-06, "loss": 0.7037, "step": 1991 }, { "epoch": 0.1617802322748315, "grad_norm": 4.7082939980756136, "learning_rate": 4.775877114653422e-06, "loss": 0.5807, "step": 1992 }, { "epoch": 0.16186144725087306, "grad_norm": 4.719665547064216, "learning_rate": 4.775604887310874e-06, "loss": 0.5684, "step": 1993 }, { "epoch": 0.16194266222691464, "grad_norm": 4.372013687616748, "learning_rate": 4.775332502508511e-06, "loss": 0.5176, "step": 1994 }, { "epoch": 0.16202387720295622, "grad_norm": 3.574773949292872, "learning_rate": 4.775059960265181e-06, "loss": 0.6306, "step": 1995 }, { "epoch": 0.1621050921789978, "grad_norm": 3.0419771807588134, "learning_rate": 4.774787260599744e-06, "loss": 0.5673, "step": 1996 }, { "epoch": 0.1621863071550394, "grad_norm": 4.11991510271793, "learning_rate": 4.7745144035310656e-06, "loss": 0.5221, "step": 1997 }, { "epoch": 0.16226752213108098, "grad_norm": 8.878399802384537, "learning_rate": 4.77424138907803e-06, "loss": 0.5639, "step": 1998 }, { "epoch": 0.16234873710712255, "grad_norm": 5.661428317678857, "learning_rate": 4.773968217259525e-06, "loss": 0.6172, "step": 1999 }, { "epoch": 0.16242995208316413, "grad_norm": 4.587434812251549, "learning_rate": 4.773694888094454e-06, "loss": 0.5641, "step": 2000 }, { "epoch": 0.1625111670592057, "grad_norm": 6.782537400712025, "learning_rate": 4.773421401601731e-06, "loss": 0.6165, "step": 2001 }, { "epoch": 0.1625923820352473, "grad_norm": 5.248837296847351, "learning_rate": 4.773147757800279e-06, "loss": 0.5165, "step": 2002 }, { "epoch": 0.1626735970112889, "grad_norm": 6.771172761964919, "learning_rate": 4.772873956709032e-06, "loss": 0.5348, "step": 2003 }, { "epoch": 0.16275481198733047, "grad_norm": 3.7091479668688407, "learning_rate": 4.772599998346937e-06, "loss": 0.7519, "step": 2004 }, { "epoch": 0.16283602696337204, "grad_norm": 4.478594715066707, "learning_rate": 4.772325882732949e-06, "loss": 0.5111, "step": 2005 }, { "epoch": 0.16291724193941362, "grad_norm": 5.771923023591905, "learning_rate": 4.772051609886036e-06, "loss": 0.4556, "step": 2006 }, { "epoch": 0.1629984569154552, "grad_norm": 4.808968764339891, "learning_rate": 4.771777179825176e-06, "loss": 0.5635, "step": 2007 }, { "epoch": 0.1630796718914968, "grad_norm": 6.317225738730894, "learning_rate": 4.7715025925693595e-06, "loss": 0.8752, "step": 2008 }, { "epoch": 0.16316088686753838, "grad_norm": 4.920049883294183, "learning_rate": 4.771227848137585e-06, "loss": 0.6393, "step": 2009 }, { "epoch": 0.16324210184357996, "grad_norm": 4.256612076091997, "learning_rate": 4.770952946548864e-06, "loss": 0.6003, "step": 2010 }, { "epoch": 0.16332331681962153, "grad_norm": 4.5734192613631315, "learning_rate": 4.770677887822217e-06, "loss": 0.4413, "step": 2011 }, { "epoch": 0.1634045317956631, "grad_norm": 4.163070055535816, "learning_rate": 4.770402671976677e-06, "loss": 0.6597, "step": 2012 }, { "epoch": 0.16348574677170472, "grad_norm": 3.0742054711235887, "learning_rate": 4.77012729903129e-06, "loss": 0.4076, "step": 2013 }, { "epoch": 0.1635669617477463, "grad_norm": 5.55441667151701, "learning_rate": 4.769851769005107e-06, "loss": 0.5314, "step": 2014 }, { "epoch": 0.16364817672378787, "grad_norm": 4.576121873555356, "learning_rate": 4.769576081917195e-06, "loss": 0.6124, "step": 2015 }, { "epoch": 0.16372939169982945, "grad_norm": 6.382821144731639, "learning_rate": 4.7693002377866295e-06, "loss": 0.6729, "step": 2016 }, { "epoch": 0.16381060667587102, "grad_norm": 9.489241945586448, "learning_rate": 4.769024236632498e-06, "loss": 0.4966, "step": 2017 }, { "epoch": 0.1638918216519126, "grad_norm": 10.574632939063292, "learning_rate": 4.768748078473898e-06, "loss": 0.6579, "step": 2018 }, { "epoch": 0.1639730366279542, "grad_norm": 3.323125086340652, "learning_rate": 4.768471763329938e-06, "loss": 0.9138, "step": 2019 }, { "epoch": 0.16405425160399578, "grad_norm": 14.315467246083593, "learning_rate": 4.768195291219738e-06, "loss": 0.4129, "step": 2020 }, { "epoch": 0.16413546658003736, "grad_norm": 3.5370073104621613, "learning_rate": 4.767918662162428e-06, "loss": 0.6045, "step": 2021 }, { "epoch": 0.16421668155607894, "grad_norm": 4.952315377512936, "learning_rate": 4.767641876177149e-06, "loss": 0.5661, "step": 2022 }, { "epoch": 0.1642978965321205, "grad_norm": 2.9834860448273273, "learning_rate": 4.767364933283053e-06, "loss": 0.5964, "step": 2023 }, { "epoch": 0.16437911150816212, "grad_norm": 38.67374390447997, "learning_rate": 4.767087833499305e-06, "loss": 0.545, "step": 2024 }, { "epoch": 0.1644603264842037, "grad_norm": 8.792176852895958, "learning_rate": 4.7668105768450755e-06, "loss": 0.6039, "step": 2025 }, { "epoch": 0.16454154146024527, "grad_norm": 8.176394720323723, "learning_rate": 4.766533163339553e-06, "loss": 0.5289, "step": 2026 }, { "epoch": 0.16462275643628685, "grad_norm": 6.977860465026282, "learning_rate": 4.766255593001929e-06, "loss": 0.4951, "step": 2027 }, { "epoch": 0.16470397141232843, "grad_norm": 5.399038234082669, "learning_rate": 4.765977865851413e-06, "loss": 0.4714, "step": 2028 }, { "epoch": 0.16478518638837, "grad_norm": 6.62984701521281, "learning_rate": 4.765699981907221e-06, "loss": 0.5556, "step": 2029 }, { "epoch": 0.1648664013644116, "grad_norm": 6.225819915723398, "learning_rate": 4.765421941188582e-06, "loss": 0.8441, "step": 2030 }, { "epoch": 0.16494761634045318, "grad_norm": 5.344504032565665, "learning_rate": 4.765143743714734e-06, "loss": 0.411, "step": 2031 }, { "epoch": 0.16502883131649476, "grad_norm": 6.407678819914006, "learning_rate": 4.764865389504927e-06, "loss": 0.5036, "step": 2032 }, { "epoch": 0.16511004629253634, "grad_norm": 4.502779183775894, "learning_rate": 4.764586878578421e-06, "loss": 0.5595, "step": 2033 }, { "epoch": 0.16519126126857792, "grad_norm": 5.385105120731812, "learning_rate": 4.7643082109544894e-06, "loss": 0.5466, "step": 2034 }, { "epoch": 0.16527247624461952, "grad_norm": 5.3322559203413435, "learning_rate": 4.764029386652412e-06, "loss": 0.5369, "step": 2035 }, { "epoch": 0.1653536912206611, "grad_norm": 5.983847101773067, "learning_rate": 4.763750405691483e-06, "loss": 0.5569, "step": 2036 }, { "epoch": 0.16543490619670267, "grad_norm": 4.212501542827999, "learning_rate": 4.7634712680910075e-06, "loss": 0.6493, "step": 2037 }, { "epoch": 0.16551612117274425, "grad_norm": 5.116892931497582, "learning_rate": 4.7631919738703e-06, "loss": 0.511, "step": 2038 }, { "epoch": 0.16559733614878583, "grad_norm": 6.147176716463638, "learning_rate": 4.762912523048685e-06, "loss": 0.5332, "step": 2039 }, { "epoch": 0.1656785511248274, "grad_norm": 4.334801382262547, "learning_rate": 4.7626329156455e-06, "loss": 0.843, "step": 2040 }, { "epoch": 0.165759766100869, "grad_norm": 6.491344666003298, "learning_rate": 4.7623531516800916e-06, "loss": 0.5301, "step": 2041 }, { "epoch": 0.1658409810769106, "grad_norm": 4.47643734321392, "learning_rate": 4.762073231171819e-06, "loss": 0.7063, "step": 2042 }, { "epoch": 0.16592219605295216, "grad_norm": 4.934657642144628, "learning_rate": 4.76179315414005e-06, "loss": 0.6217, "step": 2043 }, { "epoch": 0.16600341102899374, "grad_norm": 6.800502146070916, "learning_rate": 4.761512920604165e-06, "loss": 0.5184, "step": 2044 }, { "epoch": 0.16608462600503532, "grad_norm": 4.106268196492193, "learning_rate": 4.761232530583556e-06, "loss": 0.4322, "step": 2045 }, { "epoch": 0.16616584098107692, "grad_norm": 4.860918988738907, "learning_rate": 4.760951984097622e-06, "loss": 0.541, "step": 2046 }, { "epoch": 0.1662470559571185, "grad_norm": 3.6112840346969564, "learning_rate": 4.760671281165777e-06, "loss": 0.7025, "step": 2047 }, { "epoch": 0.16632827093316008, "grad_norm": 3.363415520275136, "learning_rate": 4.760390421807445e-06, "loss": 0.5791, "step": 2048 }, { "epoch": 0.16640948590920165, "grad_norm": 5.52653404100446, "learning_rate": 4.760109406042057e-06, "loss": 0.545, "step": 2049 }, { "epoch": 0.16649070088524323, "grad_norm": 6.546707591746984, "learning_rate": 4.759828233889061e-06, "loss": 0.5705, "step": 2050 }, { "epoch": 0.1665719158612848, "grad_norm": 5.661580374415145, "learning_rate": 4.75954690536791e-06, "loss": 0.5044, "step": 2051 }, { "epoch": 0.1666531308373264, "grad_norm": 4.883386412845315, "learning_rate": 4.759265420498073e-06, "loss": 0.5467, "step": 2052 }, { "epoch": 0.166734345813368, "grad_norm": 9.411178068543851, "learning_rate": 4.758983779299025e-06, "loss": 0.5892, "step": 2053 }, { "epoch": 0.16681556078940957, "grad_norm": 4.132413410970519, "learning_rate": 4.758701981790255e-06, "loss": 0.986, "step": 2054 }, { "epoch": 0.16689677576545114, "grad_norm": 34.42736555216525, "learning_rate": 4.7584200279912614e-06, "loss": 0.6445, "step": 2055 }, { "epoch": 0.16697799074149272, "grad_norm": 4.276245246172677, "learning_rate": 4.7581379179215545e-06, "loss": 0.7808, "step": 2056 }, { "epoch": 0.16705920571753433, "grad_norm": 3.964870477401249, "learning_rate": 4.757855651600656e-06, "loss": 0.7379, "step": 2057 }, { "epoch": 0.1671404206935759, "grad_norm": 3.606188650388298, "learning_rate": 4.757573229048095e-06, "loss": 0.7927, "step": 2058 }, { "epoch": 0.16722163566961748, "grad_norm": 4.909495914927784, "learning_rate": 4.757290650283414e-06, "loss": 0.5828, "step": 2059 }, { "epoch": 0.16730285064565906, "grad_norm": 4.647983652975215, "learning_rate": 4.757007915326167e-06, "loss": 0.5334, "step": 2060 }, { "epoch": 0.16738406562170063, "grad_norm": 6.091135267313866, "learning_rate": 4.756725024195918e-06, "loss": 0.7908, "step": 2061 }, { "epoch": 0.1674652805977422, "grad_norm": 4.236635859135639, "learning_rate": 4.75644197691224e-06, "loss": 0.4387, "step": 2062 }, { "epoch": 0.16754649557378382, "grad_norm": 5.254900085055447, "learning_rate": 4.7561587734947195e-06, "loss": 0.5377, "step": 2063 }, { "epoch": 0.1676277105498254, "grad_norm": 6.08722014972973, "learning_rate": 4.755875413962953e-06, "loss": 0.4746, "step": 2064 }, { "epoch": 0.16770892552586697, "grad_norm": 6.117784644667623, "learning_rate": 4.7555918983365456e-06, "loss": 0.4134, "step": 2065 }, { "epoch": 0.16779014050190855, "grad_norm": 3.931238911985374, "learning_rate": 4.755308226635117e-06, "loss": 0.6176, "step": 2066 }, { "epoch": 0.16787135547795012, "grad_norm": 5.993875492451563, "learning_rate": 4.755024398878296e-06, "loss": 0.5139, "step": 2067 }, { "epoch": 0.16795257045399173, "grad_norm": 4.120681576998816, "learning_rate": 4.75474041508572e-06, "loss": 0.7734, "step": 2068 }, { "epoch": 0.1680337854300333, "grad_norm": 4.2419734083458716, "learning_rate": 4.7544562752770415e-06, "loss": 0.6459, "step": 2069 }, { "epoch": 0.16811500040607488, "grad_norm": 8.463809226446845, "learning_rate": 4.75417197947192e-06, "loss": 0.4706, "step": 2070 }, { "epoch": 0.16819621538211646, "grad_norm": 3.8343323335367128, "learning_rate": 4.753887527690027e-06, "loss": 0.4932, "step": 2071 }, { "epoch": 0.16827743035815804, "grad_norm": 8.218415227607817, "learning_rate": 4.753602919951046e-06, "loss": 0.5316, "step": 2072 }, { "epoch": 0.1683586453341996, "grad_norm": 3.6050617955271216, "learning_rate": 4.753318156274669e-06, "loss": 0.5167, "step": 2073 }, { "epoch": 0.16843986031024122, "grad_norm": 4.841750253634242, "learning_rate": 4.753033236680602e-06, "loss": 0.47, "step": 2074 }, { "epoch": 0.1685210752862828, "grad_norm": 5.173184919410824, "learning_rate": 4.75274816118856e-06, "loss": 0.7099, "step": 2075 }, { "epoch": 0.16860229026232437, "grad_norm": 3.9701269933467622, "learning_rate": 4.7524629298182655e-06, "loss": 0.7438, "step": 2076 }, { "epoch": 0.16868350523836595, "grad_norm": 5.896346024335442, "learning_rate": 4.752177542589459e-06, "loss": 0.5601, "step": 2077 }, { "epoch": 0.16876472021440753, "grad_norm": 4.249527053571267, "learning_rate": 4.7518919995218854e-06, "loss": 0.6669, "step": 2078 }, { "epoch": 0.16884593519044913, "grad_norm": 6.037758485869208, "learning_rate": 4.7516063006353035e-06, "loss": 0.5073, "step": 2079 }, { "epoch": 0.1689271501664907, "grad_norm": 3.7856446494128266, "learning_rate": 4.7513204459494825e-06, "loss": 0.5701, "step": 2080 }, { "epoch": 0.16900836514253229, "grad_norm": 7.188036974920219, "learning_rate": 4.751034435484201e-06, "loss": 0.5066, "step": 2081 }, { "epoch": 0.16908958011857386, "grad_norm": 4.317432007080439, "learning_rate": 4.75074826925925e-06, "loss": 0.7534, "step": 2082 }, { "epoch": 0.16917079509461544, "grad_norm": 5.902119726898797, "learning_rate": 4.750461947294431e-06, "loss": 0.6171, "step": 2083 }, { "epoch": 0.16925201007065702, "grad_norm": 3.7384712420136523, "learning_rate": 4.750175469609555e-06, "loss": 0.6519, "step": 2084 }, { "epoch": 0.16933322504669862, "grad_norm": 3.71902625991903, "learning_rate": 4.749888836224446e-06, "loss": 0.6105, "step": 2085 }, { "epoch": 0.1694144400227402, "grad_norm": 4.570880922255994, "learning_rate": 4.749602047158937e-06, "loss": 0.8081, "step": 2086 }, { "epoch": 0.16949565499878178, "grad_norm": 5.288276551549225, "learning_rate": 4.749315102432872e-06, "loss": 0.6383, "step": 2087 }, { "epoch": 0.16957686997482335, "grad_norm": 11.77165556632558, "learning_rate": 4.749028002066106e-06, "loss": 0.5472, "step": 2088 }, { "epoch": 0.16965808495086493, "grad_norm": 4.971856688210904, "learning_rate": 4.748740746078505e-06, "loss": 0.3578, "step": 2089 }, { "epoch": 0.16973929992690653, "grad_norm": 3.215058873377653, "learning_rate": 4.748453334489947e-06, "loss": 0.6725, "step": 2090 }, { "epoch": 0.1698205149029481, "grad_norm": 5.921520125153694, "learning_rate": 4.748165767320316e-06, "loss": 0.604, "step": 2091 }, { "epoch": 0.1699017298789897, "grad_norm": 16.542194531090658, "learning_rate": 4.747878044589513e-06, "loss": 0.527, "step": 2092 }, { "epoch": 0.16998294485503126, "grad_norm": 6.697998722104277, "learning_rate": 4.747590166317447e-06, "loss": 0.6809, "step": 2093 }, { "epoch": 0.17006415983107284, "grad_norm": 4.365123433682997, "learning_rate": 4.7473021325240355e-06, "loss": 0.6052, "step": 2094 }, { "epoch": 0.17014537480711442, "grad_norm": 4.341123013344583, "learning_rate": 4.74701394322921e-06, "loss": 0.7157, "step": 2095 }, { "epoch": 0.17022658978315602, "grad_norm": 7.815755600154752, "learning_rate": 4.7467255984529124e-06, "loss": 0.4511, "step": 2096 }, { "epoch": 0.1703078047591976, "grad_norm": 3.5475344534604276, "learning_rate": 4.746437098215094e-06, "loss": 0.5972, "step": 2097 }, { "epoch": 0.17038901973523918, "grad_norm": 5.6422926548593635, "learning_rate": 4.746148442535717e-06, "loss": 0.6985, "step": 2098 }, { "epoch": 0.17047023471128075, "grad_norm": 3.7587908987149032, "learning_rate": 4.745859631434757e-06, "loss": 0.7605, "step": 2099 }, { "epoch": 0.17055144968732233, "grad_norm": 6.674895969578671, "learning_rate": 4.745570664932195e-06, "loss": 0.7068, "step": 2100 }, { "epoch": 0.17063266466336394, "grad_norm": 5.123301796152971, "learning_rate": 4.745281543048027e-06, "loss": 0.4672, "step": 2101 }, { "epoch": 0.1707138796394055, "grad_norm": 5.4550745050456655, "learning_rate": 4.744992265802261e-06, "loss": 0.4934, "step": 2102 }, { "epoch": 0.1707950946154471, "grad_norm": 5.148605782096688, "learning_rate": 4.74470283321491e-06, "loss": 0.5513, "step": 2103 }, { "epoch": 0.17087630959148867, "grad_norm": 4.960917833837893, "learning_rate": 4.7444132453060046e-06, "loss": 0.5779, "step": 2104 }, { "epoch": 0.17095752456753024, "grad_norm": 5.30615437756679, "learning_rate": 4.744123502095579e-06, "loss": 0.4572, "step": 2105 }, { "epoch": 0.17103873954357182, "grad_norm": 12.9603922902617, "learning_rate": 4.743833603603685e-06, "loss": 0.6257, "step": 2106 }, { "epoch": 0.17111995451961343, "grad_norm": 4.802287043719077, "learning_rate": 4.743543549850381e-06, "loss": 0.695, "step": 2107 }, { "epoch": 0.171201169495655, "grad_norm": 10.403228449617778, "learning_rate": 4.743253340855737e-06, "loss": 0.6116, "step": 2108 }, { "epoch": 0.17128238447169658, "grad_norm": 6.756935813007018, "learning_rate": 4.742962976639835e-06, "loss": 0.7586, "step": 2109 }, { "epoch": 0.17136359944773816, "grad_norm": 10.07886788901266, "learning_rate": 4.742672457222764e-06, "loss": 0.4777, "step": 2110 }, { "epoch": 0.17144481442377973, "grad_norm": 5.62161561654736, "learning_rate": 4.742381782624629e-06, "loss": 0.4269, "step": 2111 }, { "epoch": 0.17152602939982134, "grad_norm": 5.949206043889717, "learning_rate": 4.7420909528655416e-06, "loss": 0.7793, "step": 2112 }, { "epoch": 0.17160724437586292, "grad_norm": 4.263095903085148, "learning_rate": 4.741799967965627e-06, "loss": 0.5637, "step": 2113 }, { "epoch": 0.1716884593519045, "grad_norm": 10.633709854598653, "learning_rate": 4.74150882794502e-06, "loss": 0.82, "step": 2114 }, { "epoch": 0.17176967432794607, "grad_norm": 3.032916783451706, "learning_rate": 4.741217532823864e-06, "loss": 0.72, "step": 2115 }, { "epoch": 0.17185088930398765, "grad_norm": 3.8608167614052484, "learning_rate": 4.740926082622316e-06, "loss": 0.7894, "step": 2116 }, { "epoch": 0.17193210428002922, "grad_norm": 3.9765186741541436, "learning_rate": 4.740634477360544e-06, "loss": 0.5949, "step": 2117 }, { "epoch": 0.17201331925607083, "grad_norm": 3.6048556532254246, "learning_rate": 4.740342717058723e-06, "loss": 0.6829, "step": 2118 }, { "epoch": 0.1720945342321124, "grad_norm": 3.612675372859013, "learning_rate": 4.740050801737045e-06, "loss": 0.4803, "step": 2119 }, { "epoch": 0.17217574920815398, "grad_norm": 4.645131781522265, "learning_rate": 4.739758731415705e-06, "loss": 0.7209, "step": 2120 }, { "epoch": 0.17225696418419556, "grad_norm": 5.631469115723518, "learning_rate": 4.739466506114916e-06, "loss": 0.6008, "step": 2121 }, { "epoch": 0.17233817916023714, "grad_norm": 4.1088983111658735, "learning_rate": 4.739174125854896e-06, "loss": 0.6917, "step": 2122 }, { "epoch": 0.17241939413627874, "grad_norm": 5.169477468086094, "learning_rate": 4.738881590655877e-06, "loss": 0.5403, "step": 2123 }, { "epoch": 0.17250060911232032, "grad_norm": 4.946107948001271, "learning_rate": 4.738588900538102e-06, "loss": 0.608, "step": 2124 }, { "epoch": 0.1725818240883619, "grad_norm": 3.559651178268444, "learning_rate": 4.738296055521821e-06, "loss": 0.6463, "step": 2125 }, { "epoch": 0.17266303906440347, "grad_norm": 6.129364916078834, "learning_rate": 4.738003055627301e-06, "loss": 0.4888, "step": 2126 }, { "epoch": 0.17274425404044505, "grad_norm": 4.278147231585998, "learning_rate": 4.7377099008748125e-06, "loss": 0.8504, "step": 2127 }, { "epoch": 0.17282546901648663, "grad_norm": 8.024624331958586, "learning_rate": 4.737416591284643e-06, "loss": 0.6727, "step": 2128 }, { "epoch": 0.17290668399252823, "grad_norm": 29.45093767091559, "learning_rate": 4.737123126877086e-06, "loss": 0.7422, "step": 2129 }, { "epoch": 0.1729878989685698, "grad_norm": 3.13136259957198, "learning_rate": 4.736829507672449e-06, "loss": 0.5307, "step": 2130 }, { "epoch": 0.17306911394461139, "grad_norm": 7.567136439537917, "learning_rate": 4.736535733691048e-06, "loss": 0.6303, "step": 2131 }, { "epoch": 0.17315032892065296, "grad_norm": 5.220061316823007, "learning_rate": 4.73624180495321e-06, "loss": 0.5315, "step": 2132 }, { "epoch": 0.17323154389669454, "grad_norm": 7.195772361468456, "learning_rate": 4.7359477214792754e-06, "loss": 0.5764, "step": 2133 }, { "epoch": 0.17331275887273614, "grad_norm": 7.3854852778069615, "learning_rate": 4.735653483289591e-06, "loss": 0.5438, "step": 2134 }, { "epoch": 0.17339397384877772, "grad_norm": 6.686734429776475, "learning_rate": 4.7353590904045184e-06, "loss": 0.6625, "step": 2135 }, { "epoch": 0.1734751888248193, "grad_norm": 3.9164463638901514, "learning_rate": 4.735064542844428e-06, "loss": 0.5811, "step": 2136 }, { "epoch": 0.17355640380086088, "grad_norm": 4.957310787853842, "learning_rate": 4.734769840629699e-06, "loss": 0.4821, "step": 2137 }, { "epoch": 0.17363761877690245, "grad_norm": 8.419842894201476, "learning_rate": 4.734474983780724e-06, "loss": 0.5227, "step": 2138 }, { "epoch": 0.17371883375294403, "grad_norm": 8.041916137901524, "learning_rate": 4.734179972317907e-06, "loss": 0.6015, "step": 2139 }, { "epoch": 0.17380004872898563, "grad_norm": 4.1188752273792195, "learning_rate": 4.73388480626166e-06, "loss": 0.6766, "step": 2140 }, { "epoch": 0.1738812637050272, "grad_norm": 5.065614073087699, "learning_rate": 4.733589485632407e-06, "loss": 0.5939, "step": 2141 }, { "epoch": 0.1739624786810688, "grad_norm": 3.8622027019274077, "learning_rate": 4.733294010450583e-06, "loss": 0.5127, "step": 2142 }, { "epoch": 0.17404369365711037, "grad_norm": 5.31622281327025, "learning_rate": 4.732998380736632e-06, "loss": 0.6016, "step": 2143 }, { "epoch": 0.17412490863315194, "grad_norm": 3.832733531671153, "learning_rate": 4.732702596511012e-06, "loss": 0.4538, "step": 2144 }, { "epoch": 0.17420612360919355, "grad_norm": 4.720524116716847, "learning_rate": 4.732406657794188e-06, "loss": 0.5959, "step": 2145 }, { "epoch": 0.17428733858523512, "grad_norm": 3.3574370172144734, "learning_rate": 4.732110564606639e-06, "loss": 0.6259, "step": 2146 }, { "epoch": 0.1743685535612767, "grad_norm": 3.845949435304935, "learning_rate": 4.7318143169688515e-06, "loss": 0.5966, "step": 2147 }, { "epoch": 0.17444976853731828, "grad_norm": 3.7946357565371973, "learning_rate": 4.731517914901324e-06, "loss": 0.6552, "step": 2148 }, { "epoch": 0.17453098351335986, "grad_norm": 8.89191739354207, "learning_rate": 4.731221358424569e-06, "loss": 0.7327, "step": 2149 }, { "epoch": 0.17461219848940143, "grad_norm": 6.740032168112452, "learning_rate": 4.730924647559103e-06, "loss": 0.6989, "step": 2150 }, { "epoch": 0.17469341346544304, "grad_norm": 4.6320616184970875, "learning_rate": 4.730627782325459e-06, "loss": 0.5567, "step": 2151 }, { "epoch": 0.17477462844148461, "grad_norm": 4.627022975502933, "learning_rate": 4.730330762744178e-06, "loss": 0.5177, "step": 2152 }, { "epoch": 0.1748558434175262, "grad_norm": 5.2547160244571876, "learning_rate": 4.730033588835812e-06, "loss": 0.578, "step": 2153 }, { "epoch": 0.17493705839356777, "grad_norm": 4.545741644009064, "learning_rate": 4.729736260620924e-06, "loss": 0.5025, "step": 2154 }, { "epoch": 0.17501827336960935, "grad_norm": 3.1745271529048074, "learning_rate": 4.729438778120088e-06, "loss": 0.6065, "step": 2155 }, { "epoch": 0.17509948834565095, "grad_norm": 3.614800398396882, "learning_rate": 4.729141141353887e-06, "loss": 0.5726, "step": 2156 }, { "epoch": 0.17518070332169253, "grad_norm": 4.093940125645144, "learning_rate": 4.7288433503429165e-06, "loss": 0.7513, "step": 2157 }, { "epoch": 0.1752619182977341, "grad_norm": 6.4087498177915885, "learning_rate": 4.728545405107782e-06, "loss": 0.5511, "step": 2158 }, { "epoch": 0.17534313327377568, "grad_norm": 6.964419687177815, "learning_rate": 4.7282473056691e-06, "loss": 0.5388, "step": 2159 }, { "epoch": 0.17542434824981726, "grad_norm": 7.381195247577337, "learning_rate": 4.727949052047498e-06, "loss": 0.5553, "step": 2160 }, { "epoch": 0.17550556322585883, "grad_norm": 3.447447501574935, "learning_rate": 4.7276506442636125e-06, "loss": 0.6407, "step": 2161 }, { "epoch": 0.17558677820190044, "grad_norm": 3.624265660451841, "learning_rate": 4.727352082338092e-06, "loss": 0.5699, "step": 2162 }, { "epoch": 0.17566799317794202, "grad_norm": 4.736948616378692, "learning_rate": 4.727053366291595e-06, "loss": 0.5561, "step": 2163 }, { "epoch": 0.1757492081539836, "grad_norm": 5.557189925086155, "learning_rate": 4.726754496144792e-06, "loss": 0.6388, "step": 2164 }, { "epoch": 0.17583042313002517, "grad_norm": 5.654257813296, "learning_rate": 4.726455471918363e-06, "loss": 0.5798, "step": 2165 }, { "epoch": 0.17591163810606675, "grad_norm": 4.017186015236763, "learning_rate": 4.726156293632998e-06, "loss": 0.6494, "step": 2166 }, { "epoch": 0.17599285308210835, "grad_norm": 3.2976783263541036, "learning_rate": 4.725856961309401e-06, "loss": 0.6894, "step": 2167 }, { "epoch": 0.17607406805814993, "grad_norm": 9.505301533813855, "learning_rate": 4.725557474968281e-06, "loss": 0.5775, "step": 2168 }, { "epoch": 0.1761552830341915, "grad_norm": 5.237101724122944, "learning_rate": 4.725257834630362e-06, "loss": 0.5132, "step": 2169 }, { "epoch": 0.17623649801023308, "grad_norm": 4.666530981374312, "learning_rate": 4.7249580403163786e-06, "loss": 0.4708, "step": 2170 }, { "epoch": 0.17631771298627466, "grad_norm": 3.82098458375026, "learning_rate": 4.7246580920470746e-06, "loss": 0.6887, "step": 2171 }, { "epoch": 0.17639892796231624, "grad_norm": 4.794596840480159, "learning_rate": 4.7243579898432035e-06, "loss": 0.5326, "step": 2172 }, { "epoch": 0.17648014293835784, "grad_norm": 5.163380467947997, "learning_rate": 4.724057733725532e-06, "loss": 0.4342, "step": 2173 }, { "epoch": 0.17656135791439942, "grad_norm": 6.130524877364227, "learning_rate": 4.723757323714836e-06, "loss": 0.5504, "step": 2174 }, { "epoch": 0.176642572890441, "grad_norm": 3.6388040943311566, "learning_rate": 4.723456759831903e-06, "loss": 0.519, "step": 2175 }, { "epoch": 0.17672378786648257, "grad_norm": 5.217405099472009, "learning_rate": 4.7231560420975294e-06, "loss": 0.4867, "step": 2176 }, { "epoch": 0.17680500284252415, "grad_norm": 8.145512643236385, "learning_rate": 4.722855170532523e-06, "loss": 0.532, "step": 2177 }, { "epoch": 0.17688621781856576, "grad_norm": 6.513554956269954, "learning_rate": 4.7225541451577035e-06, "loss": 0.7263, "step": 2178 }, { "epoch": 0.17696743279460733, "grad_norm": 5.647381362856042, "learning_rate": 4.7222529659939e-06, "loss": 0.5248, "step": 2179 }, { "epoch": 0.1770486477706489, "grad_norm": 8.634842181250088, "learning_rate": 4.721951633061952e-06, "loss": 0.5163, "step": 2180 }, { "epoch": 0.1771298627466905, "grad_norm": 5.804232556564542, "learning_rate": 4.721650146382711e-06, "loss": 0.5169, "step": 2181 }, { "epoch": 0.17721107772273206, "grad_norm": 5.170134284377379, "learning_rate": 4.721348505977037e-06, "loss": 0.569, "step": 2182 }, { "epoch": 0.17729229269877364, "grad_norm": 6.347534050197857, "learning_rate": 4.721046711865803e-06, "loss": 0.7412, "step": 2183 }, { "epoch": 0.17737350767481525, "grad_norm": 3.947255210170048, "learning_rate": 4.720744764069892e-06, "loss": 0.6272, "step": 2184 }, { "epoch": 0.17745472265085682, "grad_norm": 8.963510036415032, "learning_rate": 4.7204426626101955e-06, "loss": 0.5039, "step": 2185 }, { "epoch": 0.1775359376268984, "grad_norm": 5.0045242805343255, "learning_rate": 4.720140407507619e-06, "loss": 0.5961, "step": 2186 }, { "epoch": 0.17761715260293998, "grad_norm": 80.66410418515012, "learning_rate": 4.719837998783075e-06, "loss": 0.6292, "step": 2187 }, { "epoch": 0.17769836757898155, "grad_norm": 3.8541304256097804, "learning_rate": 4.7195354364574915e-06, "loss": 0.5465, "step": 2188 }, { "epoch": 0.17777958255502316, "grad_norm": 3.524866314545861, "learning_rate": 4.719232720551802e-06, "loss": 0.6409, "step": 2189 }, { "epoch": 0.17786079753106473, "grad_norm": 3.929607398758829, "learning_rate": 4.718929851086953e-06, "loss": 0.5239, "step": 2190 }, { "epoch": 0.1779420125071063, "grad_norm": 6.4534562177957575, "learning_rate": 4.718626828083902e-06, "loss": 0.515, "step": 2191 }, { "epoch": 0.1780232274831479, "grad_norm": 4.416334886882551, "learning_rate": 4.718323651563616e-06, "loss": 0.4572, "step": 2192 }, { "epoch": 0.17810444245918947, "grad_norm": 4.2150922659750085, "learning_rate": 4.718020321547075e-06, "loss": 0.6128, "step": 2193 }, { "epoch": 0.17818565743523104, "grad_norm": 3.3177339549952705, "learning_rate": 4.717716838055265e-06, "loss": 0.642, "step": 2194 }, { "epoch": 0.17826687241127265, "grad_norm": 4.248446599343683, "learning_rate": 4.717413201109187e-06, "loss": 0.8113, "step": 2195 }, { "epoch": 0.17834808738731422, "grad_norm": 4.561435467958204, "learning_rate": 4.717109410729851e-06, "loss": 0.6368, "step": 2196 }, { "epoch": 0.1784293023633558, "grad_norm": 5.104529981932405, "learning_rate": 4.716805466938278e-06, "loss": 0.6438, "step": 2197 }, { "epoch": 0.17851051733939738, "grad_norm": 3.5323774996376374, "learning_rate": 4.7165013697555e-06, "loss": 0.6165, "step": 2198 }, { "epoch": 0.17859173231543896, "grad_norm": 6.640997619244458, "learning_rate": 4.716197119202556e-06, "loss": 0.6347, "step": 2199 }, { "epoch": 0.17867294729148056, "grad_norm": 11.632083395111554, "learning_rate": 4.715892715300501e-06, "loss": 0.7224, "step": 2200 }, { "epoch": 0.17875416226752214, "grad_norm": 4.224735988915179, "learning_rate": 4.7155881580703984e-06, "loss": 0.5769, "step": 2201 }, { "epoch": 0.17883537724356371, "grad_norm": 7.9851742695130685, "learning_rate": 4.71528344753332e-06, "loss": 0.4985, "step": 2202 }, { "epoch": 0.1789165922196053, "grad_norm": 6.25379024192091, "learning_rate": 4.714978583710352e-06, "loss": 0.5391, "step": 2203 }, { "epoch": 0.17899780719564687, "grad_norm": 3.486820916489094, "learning_rate": 4.714673566622589e-06, "loss": 0.567, "step": 2204 }, { "epoch": 0.17907902217168845, "grad_norm": 3.4864795044696617, "learning_rate": 4.714368396291135e-06, "loss": 0.7785, "step": 2205 }, { "epoch": 0.17916023714773005, "grad_norm": 3.6141664132762172, "learning_rate": 4.714063072737108e-06, "loss": 0.4835, "step": 2206 }, { "epoch": 0.17924145212377163, "grad_norm": 5.839740334780792, "learning_rate": 4.713757595981634e-06, "loss": 0.5031, "step": 2207 }, { "epoch": 0.1793226670998132, "grad_norm": 8.510815914171106, "learning_rate": 4.713451966045851e-06, "loss": 0.6997, "step": 2208 }, { "epoch": 0.17940388207585478, "grad_norm": 4.803667331216526, "learning_rate": 4.713146182950905e-06, "loss": 0.5733, "step": 2209 }, { "epoch": 0.17948509705189636, "grad_norm": 3.5388152829208015, "learning_rate": 4.7128402467179575e-06, "loss": 0.5907, "step": 2210 }, { "epoch": 0.17956631202793796, "grad_norm": 3.03616427792538, "learning_rate": 4.712534157368176e-06, "loss": 0.5774, "step": 2211 }, { "epoch": 0.17964752700397954, "grad_norm": 9.760631640572191, "learning_rate": 4.7122279149227405e-06, "loss": 0.5262, "step": 2212 }, { "epoch": 0.17972874198002112, "grad_norm": 4.386181821659895, "learning_rate": 4.711921519402841e-06, "loss": 0.6952, "step": 2213 }, { "epoch": 0.1798099569560627, "grad_norm": 3.3351010256653573, "learning_rate": 4.711614970829679e-06, "loss": 0.5091, "step": 2214 }, { "epoch": 0.17989117193210427, "grad_norm": 6.9608846044796024, "learning_rate": 4.711308269224466e-06, "loss": 0.4963, "step": 2215 }, { "epoch": 0.17997238690814585, "grad_norm": 5.215289754630284, "learning_rate": 4.7110014146084235e-06, "loss": 0.5736, "step": 2216 }, { "epoch": 0.18005360188418745, "grad_norm": 11.93663447822261, "learning_rate": 4.710694407002785e-06, "loss": 0.7406, "step": 2217 }, { "epoch": 0.18013481686022903, "grad_norm": 5.163645505705543, "learning_rate": 4.710387246428794e-06, "loss": 0.6448, "step": 2218 }, { "epoch": 0.1802160318362706, "grad_norm": 4.661698490114284, "learning_rate": 4.710079932907703e-06, "loss": 0.7224, "step": 2219 }, { "epoch": 0.18029724681231218, "grad_norm": 3.401258624128954, "learning_rate": 4.7097724664607775e-06, "loss": 0.5251, "step": 2220 }, { "epoch": 0.18037846178835376, "grad_norm": 5.125047047447445, "learning_rate": 4.709464847109292e-06, "loss": 0.6384, "step": 2221 }, { "epoch": 0.18045967676439537, "grad_norm": 4.023853533344583, "learning_rate": 4.709157074874533e-06, "loss": 0.5047, "step": 2222 }, { "epoch": 0.18054089174043694, "grad_norm": 6.949692493391, "learning_rate": 4.7088491497777965e-06, "loss": 0.5204, "step": 2223 }, { "epoch": 0.18062210671647852, "grad_norm": 4.10788473501353, "learning_rate": 4.708541071840388e-06, "loss": 0.5865, "step": 2224 }, { "epoch": 0.1807033216925201, "grad_norm": 4.694036134420709, "learning_rate": 4.708232841083628e-06, "loss": 0.5692, "step": 2225 }, { "epoch": 0.18078453666856167, "grad_norm": 5.994257706825592, "learning_rate": 4.70792445752884e-06, "loss": 0.5018, "step": 2226 }, { "epoch": 0.18086575164460328, "grad_norm": 8.767129488205637, "learning_rate": 4.707615921197366e-06, "loss": 0.5381, "step": 2227 }, { "epoch": 0.18094696662064486, "grad_norm": 4.847865205503708, "learning_rate": 4.707307232110554e-06, "loss": 0.7474, "step": 2228 }, { "epoch": 0.18102818159668643, "grad_norm": 5.668800604259548, "learning_rate": 4.706998390289763e-06, "loss": 0.6027, "step": 2229 }, { "epoch": 0.181109396572728, "grad_norm": 6.745634612012881, "learning_rate": 4.706689395756363e-06, "loss": 0.5229, "step": 2230 }, { "epoch": 0.1811906115487696, "grad_norm": 4.895045129217889, "learning_rate": 4.706380248531737e-06, "loss": 0.5253, "step": 2231 }, { "epoch": 0.18127182652481116, "grad_norm": 4.810666628926101, "learning_rate": 4.706070948637274e-06, "loss": 0.6069, "step": 2232 }, { "epoch": 0.18135304150085277, "grad_norm": 7.912604183646548, "learning_rate": 4.705761496094377e-06, "loss": 0.5576, "step": 2233 }, { "epoch": 0.18143425647689435, "grad_norm": 4.074525434845986, "learning_rate": 4.705451890924459e-06, "loss": 0.6926, "step": 2234 }, { "epoch": 0.18151547145293592, "grad_norm": 9.471081474256104, "learning_rate": 4.705142133148943e-06, "loss": 0.5167, "step": 2235 }, { "epoch": 0.1815966864289775, "grad_norm": 4.8687622437884555, "learning_rate": 4.70483222278926e-06, "loss": 0.6418, "step": 2236 }, { "epoch": 0.18167790140501908, "grad_norm": 4.3746758828868995, "learning_rate": 4.704522159866857e-06, "loss": 0.5135, "step": 2237 }, { "epoch": 0.18175911638106068, "grad_norm": 4.148684813653182, "learning_rate": 4.704211944403188e-06, "loss": 0.6289, "step": 2238 }, { "epoch": 0.18184033135710226, "grad_norm": 4.101220246406701, "learning_rate": 4.703901576419717e-06, "loss": 0.473, "step": 2239 }, { "epoch": 0.18192154633314384, "grad_norm": 4.884276965609116, "learning_rate": 4.703591055937922e-06, "loss": 0.5761, "step": 2240 }, { "epoch": 0.1820027613091854, "grad_norm": 9.506027279588086, "learning_rate": 4.7032803829792875e-06, "loss": 0.5745, "step": 2241 }, { "epoch": 0.182083976285227, "grad_norm": 4.5640936040307825, "learning_rate": 4.702969557565312e-06, "loss": 0.6804, "step": 2242 }, { "epoch": 0.18216519126126857, "grad_norm": 5.16553253038635, "learning_rate": 4.702658579717502e-06, "loss": 0.4686, "step": 2243 }, { "epoch": 0.18224640623731017, "grad_norm": 5.1964649031938395, "learning_rate": 4.702347449457375e-06, "loss": 0.5184, "step": 2244 }, { "epoch": 0.18232762121335175, "grad_norm": 8.556515016420585, "learning_rate": 4.702036166806461e-06, "loss": 0.3698, "step": 2245 }, { "epoch": 0.18240883618939333, "grad_norm": 7.054847431413247, "learning_rate": 4.7017247317862976e-06, "loss": 0.5364, "step": 2246 }, { "epoch": 0.1824900511654349, "grad_norm": 5.019891726347235, "learning_rate": 4.701413144418437e-06, "loss": 0.4901, "step": 2247 }, { "epoch": 0.18257126614147648, "grad_norm": 4.195016853555098, "learning_rate": 4.701101404724435e-06, "loss": 0.4601, "step": 2248 }, { "epoch": 0.18265248111751808, "grad_norm": 5.378788487799258, "learning_rate": 4.700789512725867e-06, "loss": 0.6267, "step": 2249 }, { "epoch": 0.18273369609355966, "grad_norm": 3.793906030224367, "learning_rate": 4.700477468444311e-06, "loss": 0.6438, "step": 2250 }, { "epoch": 0.18281491106960124, "grad_norm": 4.868151253449179, "learning_rate": 4.700165271901361e-06, "loss": 0.6933, "step": 2251 }, { "epoch": 0.18289612604564282, "grad_norm": 7.777361957555963, "learning_rate": 4.699852923118618e-06, "loss": 0.4894, "step": 2252 }, { "epoch": 0.1829773410216844, "grad_norm": 4.106355721520342, "learning_rate": 4.699540422117695e-06, "loss": 0.4232, "step": 2253 }, { "epoch": 0.18305855599772597, "grad_norm": 5.430574051132866, "learning_rate": 4.699227768920216e-06, "loss": 0.5387, "step": 2254 }, { "epoch": 0.18313977097376757, "grad_norm": 6.575598696133923, "learning_rate": 4.6989149635478145e-06, "loss": 0.5371, "step": 2255 }, { "epoch": 0.18322098594980915, "grad_norm": 5.4676777555816765, "learning_rate": 4.698602006022136e-06, "loss": 0.5012, "step": 2256 }, { "epoch": 0.18330220092585073, "grad_norm": 4.576711281154184, "learning_rate": 4.698288896364834e-06, "loss": 0.6324, "step": 2257 }, { "epoch": 0.1833834159018923, "grad_norm": 11.976819517596883, "learning_rate": 4.697975634597574e-06, "loss": 0.5405, "step": 2258 }, { "epoch": 0.18346463087793388, "grad_norm": 6.301644428134142, "learning_rate": 4.697662220742033e-06, "loss": 0.5355, "step": 2259 }, { "epoch": 0.1835458458539755, "grad_norm": 16.567162364593425, "learning_rate": 4.697348654819898e-06, "loss": 0.5812, "step": 2260 }, { "epoch": 0.18362706083001706, "grad_norm": 5.845436104952721, "learning_rate": 4.697034936852865e-06, "loss": 0.6177, "step": 2261 }, { "epoch": 0.18370827580605864, "grad_norm": 3.4478757836782608, "learning_rate": 4.6967210668626415e-06, "loss": 0.6487, "step": 2262 }, { "epoch": 0.18378949078210022, "grad_norm": 8.26455582424995, "learning_rate": 4.696407044870947e-06, "loss": 0.6865, "step": 2263 }, { "epoch": 0.1838707057581418, "grad_norm": 3.6489257014011924, "learning_rate": 4.696092870899509e-06, "loss": 0.4881, "step": 2264 }, { "epoch": 0.18395192073418337, "grad_norm": 7.476743860852032, "learning_rate": 4.695778544970066e-06, "loss": 0.5365, "step": 2265 }, { "epoch": 0.18403313571022498, "grad_norm": 5.335228274279977, "learning_rate": 4.695464067104371e-06, "loss": 0.5978, "step": 2266 }, { "epoch": 0.18411435068626655, "grad_norm": 7.0816544622030175, "learning_rate": 4.6951494373241805e-06, "loss": 0.769, "step": 2267 }, { "epoch": 0.18419556566230813, "grad_norm": 3.851216245811756, "learning_rate": 4.694834655651266e-06, "loss": 0.627, "step": 2268 }, { "epoch": 0.1842767806383497, "grad_norm": 4.805289628209942, "learning_rate": 4.6945197221074104e-06, "loss": 0.8659, "step": 2269 }, { "epoch": 0.18435799561439128, "grad_norm": 7.794985897939018, "learning_rate": 4.694204636714403e-06, "loss": 0.5485, "step": 2270 }, { "epoch": 0.1844392105904329, "grad_norm": 20.84411786544806, "learning_rate": 4.693889399494049e-06, "loss": 0.5583, "step": 2271 }, { "epoch": 0.18452042556647447, "grad_norm": 4.8093342027022326, "learning_rate": 4.693574010468159e-06, "loss": 0.7422, "step": 2272 }, { "epoch": 0.18460164054251604, "grad_norm": 4.897421838824223, "learning_rate": 4.693258469658557e-06, "loss": 0.6693, "step": 2273 }, { "epoch": 0.18468285551855762, "grad_norm": 5.418227314133965, "learning_rate": 4.692942777087076e-06, "loss": 0.5361, "step": 2274 }, { "epoch": 0.1847640704945992, "grad_norm": 4.1871350066526745, "learning_rate": 4.692626932775561e-06, "loss": 0.6755, "step": 2275 }, { "epoch": 0.18484528547064077, "grad_norm": 10.739798887484266, "learning_rate": 4.6923109367458665e-06, "loss": 0.7373, "step": 2276 }, { "epoch": 0.18492650044668238, "grad_norm": 4.134435714566178, "learning_rate": 4.6919947890198585e-06, "loss": 0.5737, "step": 2277 }, { "epoch": 0.18500771542272396, "grad_norm": 4.222974710982374, "learning_rate": 4.691678489619411e-06, "loss": 0.7376, "step": 2278 }, { "epoch": 0.18508893039876553, "grad_norm": 4.6176435172773065, "learning_rate": 4.691362038566411e-06, "loss": 0.6068, "step": 2279 }, { "epoch": 0.1851701453748071, "grad_norm": 3.8516607452654577, "learning_rate": 4.691045435882758e-06, "loss": 0.5776, "step": 2280 }, { "epoch": 0.1852513603508487, "grad_norm": 4.380266220174846, "learning_rate": 4.690728681590354e-06, "loss": 0.4589, "step": 2281 }, { "epoch": 0.1853325753268903, "grad_norm": 20.10704260314573, "learning_rate": 4.6904117757111215e-06, "loss": 0.4806, "step": 2282 }, { "epoch": 0.18541379030293187, "grad_norm": 4.622570996308856, "learning_rate": 4.6900947182669855e-06, "loss": 0.5437, "step": 2283 }, { "epoch": 0.18549500527897345, "grad_norm": 3.4999814335004125, "learning_rate": 4.689777509279886e-06, "loss": 0.6253, "step": 2284 }, { "epoch": 0.18557622025501502, "grad_norm": 8.159646406598347, "learning_rate": 4.689460148771773e-06, "loss": 0.6941, "step": 2285 }, { "epoch": 0.1856574352310566, "grad_norm": 4.119601877397449, "learning_rate": 4.6891426367646046e-06, "loss": 0.5396, "step": 2286 }, { "epoch": 0.18573865020709818, "grad_norm": 5.783579437576225, "learning_rate": 4.6888249732803516e-06, "loss": 0.3815, "step": 2287 }, { "epoch": 0.18581986518313978, "grad_norm": 4.479086903423486, "learning_rate": 4.688507158340994e-06, "loss": 0.5961, "step": 2288 }, { "epoch": 0.18590108015918136, "grad_norm": 5.521280967881707, "learning_rate": 4.688189191968524e-06, "loss": 0.5213, "step": 2289 }, { "epoch": 0.18598229513522294, "grad_norm": 5.560685189930327, "learning_rate": 4.687871074184944e-06, "loss": 0.5069, "step": 2290 }, { "epoch": 0.1860635101112645, "grad_norm": 8.713540857848809, "learning_rate": 4.687552805012263e-06, "loss": 0.4698, "step": 2291 }, { "epoch": 0.1861447250873061, "grad_norm": 7.120678804920857, "learning_rate": 4.687234384472506e-06, "loss": 0.5941, "step": 2292 }, { "epoch": 0.1862259400633477, "grad_norm": 3.7747766229799105, "learning_rate": 4.686915812587706e-06, "loss": 0.532, "step": 2293 }, { "epoch": 0.18630715503938927, "grad_norm": 5.78663404222734, "learning_rate": 4.686597089379905e-06, "loss": 0.6332, "step": 2294 }, { "epoch": 0.18638837001543085, "grad_norm": 5.344791467556041, "learning_rate": 4.6862782148711584e-06, "loss": 0.551, "step": 2295 }, { "epoch": 0.18646958499147243, "grad_norm": 3.2548480768417183, "learning_rate": 4.685959189083531e-06, "loss": 0.7081, "step": 2296 }, { "epoch": 0.186550799967514, "grad_norm": 4.522340647304046, "learning_rate": 4.685640012039095e-06, "loss": 0.6577, "step": 2297 }, { "epoch": 0.18663201494355558, "grad_norm": 4.317215335517002, "learning_rate": 4.685320683759939e-06, "loss": 0.5544, "step": 2298 }, { "epoch": 0.18671322991959718, "grad_norm": 5.363458185770196, "learning_rate": 4.685001204268156e-06, "loss": 0.4849, "step": 2299 }, { "epoch": 0.18679444489563876, "grad_norm": 25.593738509492916, "learning_rate": 4.684681573585854e-06, "loss": 0.5735, "step": 2300 }, { "epoch": 0.18687565987168034, "grad_norm": 6.172329696323195, "learning_rate": 4.684361791735149e-06, "loss": 0.5465, "step": 2301 }, { "epoch": 0.18695687484772192, "grad_norm": 4.8669050067958874, "learning_rate": 4.684041858738169e-06, "loss": 0.6078, "step": 2302 }, { "epoch": 0.1870380898237635, "grad_norm": 12.729610338065322, "learning_rate": 4.683721774617052e-06, "loss": 0.5647, "step": 2303 }, { "epoch": 0.1871193047998051, "grad_norm": 5.8134799742582475, "learning_rate": 4.6834015393939445e-06, "loss": 0.5026, "step": 2304 }, { "epoch": 0.18720051977584667, "grad_norm": 3.5817996920030257, "learning_rate": 4.683081153091006e-06, "loss": 0.4811, "step": 2305 }, { "epoch": 0.18728173475188825, "grad_norm": 4.114121320477102, "learning_rate": 4.682760615730405e-06, "loss": 0.6439, "step": 2306 }, { "epoch": 0.18736294972792983, "grad_norm": 7.202440009743923, "learning_rate": 4.682439927334323e-06, "loss": 0.6019, "step": 2307 }, { "epoch": 0.1874441647039714, "grad_norm": 5.323112544773871, "learning_rate": 4.682119087924948e-06, "loss": 0.7116, "step": 2308 }, { "epoch": 0.18752537968001298, "grad_norm": 3.6253125361558967, "learning_rate": 4.681798097524479e-06, "loss": 0.6427, "step": 2309 }, { "epoch": 0.1876065946560546, "grad_norm": 4.355698784301835, "learning_rate": 4.681476956155131e-06, "loss": 0.4505, "step": 2310 }, { "epoch": 0.18768780963209616, "grad_norm": 3.395486667581571, "learning_rate": 4.681155663839122e-06, "loss": 0.5654, "step": 2311 }, { "epoch": 0.18776902460813774, "grad_norm": 5.363908094574731, "learning_rate": 4.680834220598685e-06, "loss": 0.5545, "step": 2312 }, { "epoch": 0.18785023958417932, "grad_norm": 4.304796229403271, "learning_rate": 4.6805126264560605e-06, "loss": 0.6393, "step": 2313 }, { "epoch": 0.1879314545602209, "grad_norm": 4.291548258367333, "learning_rate": 4.680190881433504e-06, "loss": 0.6884, "step": 2314 }, { "epoch": 0.1880126695362625, "grad_norm": 9.784318271093905, "learning_rate": 4.679868985553276e-06, "loss": 0.5777, "step": 2315 }, { "epoch": 0.18809388451230408, "grad_norm": 7.761537889560317, "learning_rate": 4.6795469388376525e-06, "loss": 0.47, "step": 2316 }, { "epoch": 0.18817509948834565, "grad_norm": 6.995156273431482, "learning_rate": 4.6792247413089145e-06, "loss": 0.6137, "step": 2317 }, { "epoch": 0.18825631446438723, "grad_norm": 4.313226522351422, "learning_rate": 4.678902392989359e-06, "loss": 0.6229, "step": 2318 }, { "epoch": 0.1883375294404288, "grad_norm": 5.081068946857541, "learning_rate": 4.678579893901288e-06, "loss": 0.5712, "step": 2319 }, { "epoch": 0.18841874441647039, "grad_norm": 4.809789843673943, "learning_rate": 4.678257244067019e-06, "loss": 0.5531, "step": 2320 }, { "epoch": 0.188499959392512, "grad_norm": 3.8452865625471255, "learning_rate": 4.677934443508877e-06, "loss": 0.5659, "step": 2321 }, { "epoch": 0.18858117436855357, "grad_norm": 4.917273325549435, "learning_rate": 4.6776114922491985e-06, "loss": 0.5518, "step": 2322 }, { "epoch": 0.18866238934459514, "grad_norm": 4.459277191818069, "learning_rate": 4.67728839031033e-06, "loss": 0.5119, "step": 2323 }, { "epoch": 0.18874360432063672, "grad_norm": 8.098290875630656, "learning_rate": 4.676965137714626e-06, "loss": 0.5824, "step": 2324 }, { "epoch": 0.1888248192966783, "grad_norm": 5.792865775052247, "learning_rate": 4.676641734484457e-06, "loss": 0.5991, "step": 2325 }, { "epoch": 0.1889060342727199, "grad_norm": 6.158616589247779, "learning_rate": 4.6763181806422e-06, "loss": 0.6869, "step": 2326 }, { "epoch": 0.18898724924876148, "grad_norm": 6.1672909888851395, "learning_rate": 4.675994476210243e-06, "loss": 0.4876, "step": 2327 }, { "epoch": 0.18906846422480306, "grad_norm": 4.374487244986078, "learning_rate": 4.675670621210985e-06, "loss": 0.4851, "step": 2328 }, { "epoch": 0.18914967920084463, "grad_norm": 4.378802636321053, "learning_rate": 4.675346615666834e-06, "loss": 0.5938, "step": 2329 }, { "epoch": 0.1892308941768862, "grad_norm": 6.805865659937306, "learning_rate": 4.675022459600209e-06, "loss": 0.5517, "step": 2330 }, { "epoch": 0.1893121091529278, "grad_norm": 10.069257270167085, "learning_rate": 4.674698153033542e-06, "loss": 0.5793, "step": 2331 }, { "epoch": 0.1893933241289694, "grad_norm": 5.002867430656955, "learning_rate": 4.674373695989272e-06, "loss": 0.4501, "step": 2332 }, { "epoch": 0.18947453910501097, "grad_norm": 6.5558875829671726, "learning_rate": 4.67404908848985e-06, "loss": 0.6941, "step": 2333 }, { "epoch": 0.18955575408105255, "grad_norm": 3.310181376592309, "learning_rate": 4.673724330557737e-06, "loss": 0.5876, "step": 2334 }, { "epoch": 0.18963696905709412, "grad_norm": 4.3922451080442615, "learning_rate": 4.673399422215405e-06, "loss": 0.6729, "step": 2335 }, { "epoch": 0.1897181840331357, "grad_norm": 3.2619584312724124, "learning_rate": 4.673074363485336e-06, "loss": 0.6935, "step": 2336 }, { "epoch": 0.1897993990091773, "grad_norm": 4.553167577761572, "learning_rate": 4.672749154390021e-06, "loss": 0.4633, "step": 2337 }, { "epoch": 0.18988061398521888, "grad_norm": 3.149250992164633, "learning_rate": 4.6724237949519635e-06, "loss": 0.6547, "step": 2338 }, { "epoch": 0.18996182896126046, "grad_norm": 3.8540236289067646, "learning_rate": 4.672098285193677e-06, "loss": 0.5031, "step": 2339 }, { "epoch": 0.19004304393730204, "grad_norm": 5.029855666317853, "learning_rate": 4.671772625137685e-06, "loss": 0.5433, "step": 2340 }, { "epoch": 0.1901242589133436, "grad_norm": 4.4089431032158, "learning_rate": 4.6714468148065215e-06, "loss": 0.6162, "step": 2341 }, { "epoch": 0.1902054738893852, "grad_norm": 4.951136934389378, "learning_rate": 4.67112085422273e-06, "loss": 0.5549, "step": 2342 }, { "epoch": 0.1902866888654268, "grad_norm": 4.969440488977865, "learning_rate": 4.6707947434088665e-06, "loss": 0.5311, "step": 2343 }, { "epoch": 0.19036790384146837, "grad_norm": 7.467675141875417, "learning_rate": 4.670468482387495e-06, "loss": 0.5161, "step": 2344 }, { "epoch": 0.19044911881750995, "grad_norm": 5.817463721744332, "learning_rate": 4.670142071181192e-06, "loss": 0.5573, "step": 2345 }, { "epoch": 0.19053033379355153, "grad_norm": 3.588261641157244, "learning_rate": 4.6698155098125435e-06, "loss": 0.4921, "step": 2346 }, { "epoch": 0.1906115487695931, "grad_norm": 5.247075946408668, "learning_rate": 4.6694887983041434e-06, "loss": 0.6335, "step": 2347 }, { "epoch": 0.1906927637456347, "grad_norm": 5.98092813691066, "learning_rate": 4.669161936678602e-06, "loss": 0.6301, "step": 2348 }, { "epoch": 0.19077397872167629, "grad_norm": 4.882612458777484, "learning_rate": 4.668834924958534e-06, "loss": 0.605, "step": 2349 }, { "epoch": 0.19085519369771786, "grad_norm": 4.135928802898827, "learning_rate": 4.668507763166568e-06, "loss": 0.7428, "step": 2350 }, { "epoch": 0.19093640867375944, "grad_norm": 6.436181992978232, "learning_rate": 4.668180451325341e-06, "loss": 0.6098, "step": 2351 }, { "epoch": 0.19101762364980102, "grad_norm": 8.974548488637932, "learning_rate": 4.667852989457502e-06, "loss": 0.5165, "step": 2352 }, { "epoch": 0.1910988386258426, "grad_norm": 4.064278950992003, "learning_rate": 4.6675253775857096e-06, "loss": 0.5658, "step": 2353 }, { "epoch": 0.1911800536018842, "grad_norm": 4.497922853159247, "learning_rate": 4.667197615732633e-06, "loss": 0.6748, "step": 2354 }, { "epoch": 0.19126126857792577, "grad_norm": 4.494597527974153, "learning_rate": 4.66686970392095e-06, "loss": 0.5488, "step": 2355 }, { "epoch": 0.19134248355396735, "grad_norm": 5.505569141881063, "learning_rate": 4.666541642173352e-06, "loss": 0.5822, "step": 2356 }, { "epoch": 0.19142369853000893, "grad_norm": 3.730344521450571, "learning_rate": 4.666213430512538e-06, "loss": 0.634, "step": 2357 }, { "epoch": 0.1915049135060505, "grad_norm": 5.750851724066535, "learning_rate": 4.66588506896122e-06, "loss": 0.5967, "step": 2358 }, { "epoch": 0.1915861284820921, "grad_norm": 5.307685486956423, "learning_rate": 4.665556557542118e-06, "loss": 0.7144, "step": 2359 }, { "epoch": 0.1916673434581337, "grad_norm": 5.143047388945135, "learning_rate": 4.6652278962779625e-06, "loss": 0.6383, "step": 2360 }, { "epoch": 0.19174855843417526, "grad_norm": 4.999760195133255, "learning_rate": 4.664899085191496e-06, "loss": 0.6477, "step": 2361 }, { "epoch": 0.19182977341021684, "grad_norm": 7.787092748476295, "learning_rate": 4.664570124305472e-06, "loss": 0.5304, "step": 2362 }, { "epoch": 0.19191098838625842, "grad_norm": 4.318743886643792, "learning_rate": 4.66424101364265e-06, "loss": 0.4677, "step": 2363 }, { "epoch": 0.1919922033623, "grad_norm": 4.9984105511208545, "learning_rate": 4.663911753225803e-06, "loss": 0.5786, "step": 2364 }, { "epoch": 0.1920734183383416, "grad_norm": 3.7096978585490556, "learning_rate": 4.663582343077716e-06, "loss": 0.4654, "step": 2365 }, { "epoch": 0.19215463331438318, "grad_norm": 5.0966976399052815, "learning_rate": 4.663252783221182e-06, "loss": 0.4797, "step": 2366 }, { "epoch": 0.19223584829042475, "grad_norm": 11.273482637934793, "learning_rate": 4.662923073679003e-06, "loss": 0.5612, "step": 2367 }, { "epoch": 0.19231706326646633, "grad_norm": 5.498434540946209, "learning_rate": 4.662593214473995e-06, "loss": 0.6027, "step": 2368 }, { "epoch": 0.1923982782425079, "grad_norm": 3.859479042229787, "learning_rate": 4.662263205628983e-06, "loss": 0.7433, "step": 2369 }, { "epoch": 0.1924794932185495, "grad_norm": 4.418822540532742, "learning_rate": 4.661933047166799e-06, "loss": 0.4785, "step": 2370 }, { "epoch": 0.1925607081945911, "grad_norm": 5.051803928696753, "learning_rate": 4.661602739110291e-06, "loss": 0.5654, "step": 2371 }, { "epoch": 0.19264192317063267, "grad_norm": 4.0841298739845895, "learning_rate": 4.661272281482313e-06, "loss": 0.7034, "step": 2372 }, { "epoch": 0.19272313814667424, "grad_norm": 7.928386324620138, "learning_rate": 4.660941674305732e-06, "loss": 0.57, "step": 2373 }, { "epoch": 0.19280435312271582, "grad_norm": 6.641168636182495, "learning_rate": 4.660610917603423e-06, "loss": 0.4108, "step": 2374 }, { "epoch": 0.1928855680987574, "grad_norm": 7.233453638420033, "learning_rate": 4.6602800113982746e-06, "loss": 0.5886, "step": 2375 }, { "epoch": 0.192966783074799, "grad_norm": 5.3241909788384385, "learning_rate": 4.659948955713181e-06, "loss": 0.6516, "step": 2376 }, { "epoch": 0.19304799805084058, "grad_norm": 6.517892337897202, "learning_rate": 4.659617750571052e-06, "loss": 0.5341, "step": 2377 }, { "epoch": 0.19312921302688216, "grad_norm": 8.78691467540056, "learning_rate": 4.659286395994806e-06, "loss": 0.7056, "step": 2378 }, { "epoch": 0.19321042800292373, "grad_norm": 5.510965559445886, "learning_rate": 4.658954892007367e-06, "loss": 0.4839, "step": 2379 }, { "epoch": 0.1932916429789653, "grad_norm": 5.065066805659556, "learning_rate": 4.658623238631675e-06, "loss": 0.5688, "step": 2380 }, { "epoch": 0.19337285795500692, "grad_norm": 4.049904522221411, "learning_rate": 4.658291435890681e-06, "loss": 0.5662, "step": 2381 }, { "epoch": 0.1934540729310485, "grad_norm": 5.778500245146091, "learning_rate": 4.657959483807342e-06, "loss": 0.52, "step": 2382 }, { "epoch": 0.19353528790709007, "grad_norm": 4.275606455147308, "learning_rate": 4.657627382404627e-06, "loss": 0.4357, "step": 2383 }, { "epoch": 0.19361650288313165, "grad_norm": 4.9185646028031345, "learning_rate": 4.657295131705516e-06, "loss": 0.6067, "step": 2384 }, { "epoch": 0.19369771785917322, "grad_norm": 4.858322104090867, "learning_rate": 4.6569627317329995e-06, "loss": 0.58, "step": 2385 }, { "epoch": 0.1937789328352148, "grad_norm": 5.010825230384572, "learning_rate": 4.656630182510078e-06, "loss": 0.5096, "step": 2386 }, { "epoch": 0.1938601478112564, "grad_norm": 4.971344756018251, "learning_rate": 4.656297484059761e-06, "loss": 0.5781, "step": 2387 }, { "epoch": 0.19394136278729798, "grad_norm": 5.7140608376861906, "learning_rate": 4.655964636405071e-06, "loss": 0.5301, "step": 2388 }, { "epoch": 0.19402257776333956, "grad_norm": 5.25952348586539, "learning_rate": 4.655631639569037e-06, "loss": 0.5345, "step": 2389 }, { "epoch": 0.19410379273938114, "grad_norm": 4.858225371501829, "learning_rate": 4.655298493574704e-06, "loss": 0.5165, "step": 2390 }, { "epoch": 0.19418500771542271, "grad_norm": 7.9815798526373145, "learning_rate": 4.65496519844512e-06, "loss": 0.5336, "step": 2391 }, { "epoch": 0.19426622269146432, "grad_norm": 6.221631927317744, "learning_rate": 4.654631754203351e-06, "loss": 0.5233, "step": 2392 }, { "epoch": 0.1943474376675059, "grad_norm": 5.199395166051189, "learning_rate": 4.6542981608724665e-06, "loss": 0.5399, "step": 2393 }, { "epoch": 0.19442865264354747, "grad_norm": 5.85510080538778, "learning_rate": 4.6539644184755515e-06, "loss": 0.6658, "step": 2394 }, { "epoch": 0.19450986761958905, "grad_norm": 5.187441178217061, "learning_rate": 4.6536305270356975e-06, "loss": 0.5119, "step": 2395 }, { "epoch": 0.19459108259563063, "grad_norm": 4.41810789985307, "learning_rate": 4.65329648657601e-06, "loss": 0.3997, "step": 2396 }, { "epoch": 0.1946722975716722, "grad_norm": 5.73516894480618, "learning_rate": 4.652962297119601e-06, "loss": 0.7692, "step": 2397 }, { "epoch": 0.1947535125477138, "grad_norm": 7.959466481454268, "learning_rate": 4.652627958689596e-06, "loss": 0.4979, "step": 2398 }, { "epoch": 0.19483472752375539, "grad_norm": 5.346996268227039, "learning_rate": 4.65229347130913e-06, "loss": 0.6685, "step": 2399 }, { "epoch": 0.19491594249979696, "grad_norm": 14.105527004476242, "learning_rate": 4.651958835001345e-06, "loss": 0.5679, "step": 2400 }, { "epoch": 0.19499715747583854, "grad_norm": 5.030593581231049, "learning_rate": 4.651624049789397e-06, "loss": 0.4162, "step": 2401 }, { "epoch": 0.19507837245188012, "grad_norm": 10.896072520758361, "learning_rate": 4.651289115696454e-06, "loss": 0.479, "step": 2402 }, { "epoch": 0.19515958742792172, "grad_norm": 4.958720750747442, "learning_rate": 4.650954032745689e-06, "loss": 0.5053, "step": 2403 }, { "epoch": 0.1952408024039633, "grad_norm": 3.974794068624415, "learning_rate": 4.6506188009602885e-06, "loss": 0.574, "step": 2404 }, { "epoch": 0.19532201738000488, "grad_norm": 8.24891618792213, "learning_rate": 4.65028342036345e-06, "loss": 0.514, "step": 2405 }, { "epoch": 0.19540323235604645, "grad_norm": 4.335525878369569, "learning_rate": 4.6499478909783764e-06, "loss": 0.427, "step": 2406 }, { "epoch": 0.19548444733208803, "grad_norm": 6.424849313389938, "learning_rate": 4.649612212828289e-06, "loss": 0.4575, "step": 2407 }, { "epoch": 0.1955656623081296, "grad_norm": 4.535461924057379, "learning_rate": 4.6492763859364134e-06, "loss": 0.6179, "step": 2408 }, { "epoch": 0.1956468772841712, "grad_norm": 7.409680541443817, "learning_rate": 4.648940410325987e-06, "loss": 0.6637, "step": 2409 }, { "epoch": 0.1957280922602128, "grad_norm": 4.6863434003973845, "learning_rate": 4.648604286020256e-06, "loss": 0.5738, "step": 2410 }, { "epoch": 0.19580930723625437, "grad_norm": 5.544427789484983, "learning_rate": 4.64826801304248e-06, "loss": 0.4905, "step": 2411 }, { "epoch": 0.19589052221229594, "grad_norm": 6.119211167714467, "learning_rate": 4.647931591415929e-06, "loss": 0.7071, "step": 2412 }, { "epoch": 0.19597173718833752, "grad_norm": 3.1340827150045714, "learning_rate": 4.647595021163878e-06, "loss": 0.6431, "step": 2413 }, { "epoch": 0.19605295216437912, "grad_norm": 5.376283662779039, "learning_rate": 4.647258302309618e-06, "loss": 0.6288, "step": 2414 }, { "epoch": 0.1961341671404207, "grad_norm": 6.177768802745785, "learning_rate": 4.646921434876447e-06, "loss": 0.4535, "step": 2415 }, { "epoch": 0.19621538211646228, "grad_norm": 12.088856729370683, "learning_rate": 4.646584418887675e-06, "loss": 0.6907, "step": 2416 }, { "epoch": 0.19629659709250386, "grad_norm": 8.938823511128325, "learning_rate": 4.646247254366622e-06, "loss": 0.5471, "step": 2417 }, { "epoch": 0.19637781206854543, "grad_norm": 6.009413684644707, "learning_rate": 4.645909941336619e-06, "loss": 0.6261, "step": 2418 }, { "epoch": 0.196459027044587, "grad_norm": 5.0207649299589034, "learning_rate": 4.645572479821004e-06, "loss": 0.5226, "step": 2419 }, { "epoch": 0.1965402420206286, "grad_norm": 5.8465718027493025, "learning_rate": 4.645234869843129e-06, "loss": 0.7304, "step": 2420 }, { "epoch": 0.1966214569966702, "grad_norm": 4.44988664177105, "learning_rate": 4.644897111426355e-06, "loss": 0.4947, "step": 2421 }, { "epoch": 0.19670267197271177, "grad_norm": 4.080957228552931, "learning_rate": 4.6445592045940515e-06, "loss": 0.6638, "step": 2422 }, { "epoch": 0.19678388694875334, "grad_norm": 6.67424212054808, "learning_rate": 4.644221149369602e-06, "loss": 0.5218, "step": 2423 }, { "epoch": 0.19686510192479492, "grad_norm": 4.3515760143355084, "learning_rate": 4.643882945776397e-06, "loss": 0.5116, "step": 2424 }, { "epoch": 0.19694631690083653, "grad_norm": 4.665310299217762, "learning_rate": 4.6435445938378375e-06, "loss": 0.5858, "step": 2425 }, { "epoch": 0.1970275318768781, "grad_norm": 3.4118287749102367, "learning_rate": 4.643206093577338e-06, "loss": 0.3923, "step": 2426 }, { "epoch": 0.19710874685291968, "grad_norm": 4.722707242292216, "learning_rate": 4.642867445018318e-06, "loss": 0.5732, "step": 2427 }, { "epoch": 0.19718996182896126, "grad_norm": 4.984185032243032, "learning_rate": 4.642528648184213e-06, "loss": 0.4799, "step": 2428 }, { "epoch": 0.19727117680500283, "grad_norm": 4.175601292098027, "learning_rate": 4.642189703098466e-06, "loss": 0.5171, "step": 2429 }, { "epoch": 0.1973523917810444, "grad_norm": 8.539390583313239, "learning_rate": 4.6418506097845264e-06, "loss": 0.7051, "step": 2430 }, { "epoch": 0.19743360675708602, "grad_norm": 6.857246634219236, "learning_rate": 4.641511368265861e-06, "loss": 0.491, "step": 2431 }, { "epoch": 0.1975148217331276, "grad_norm": 8.537762993370494, "learning_rate": 4.641171978565943e-06, "loss": 0.7512, "step": 2432 }, { "epoch": 0.19759603670916917, "grad_norm": 4.408249935592557, "learning_rate": 4.640832440708256e-06, "loss": 0.6716, "step": 2433 }, { "epoch": 0.19767725168521075, "grad_norm": 5.109570036404427, "learning_rate": 4.640492754716294e-06, "loss": 0.5133, "step": 2434 }, { "epoch": 0.19775846666125232, "grad_norm": 4.736765394511924, "learning_rate": 4.640152920613562e-06, "loss": 0.9746, "step": 2435 }, { "epoch": 0.19783968163729393, "grad_norm": 5.063603609846527, "learning_rate": 4.639812938423574e-06, "loss": 0.4769, "step": 2436 }, { "epoch": 0.1979208966133355, "grad_norm": 5.272415199098991, "learning_rate": 4.639472808169857e-06, "loss": 0.5295, "step": 2437 }, { "epoch": 0.19800211158937708, "grad_norm": 8.257542098149266, "learning_rate": 4.639132529875943e-06, "loss": 0.6467, "step": 2438 }, { "epoch": 0.19808332656541866, "grad_norm": 5.3139070377916315, "learning_rate": 4.63879210356538e-06, "loss": 0.5433, "step": 2439 }, { "epoch": 0.19816454154146024, "grad_norm": 6.841621560493213, "learning_rate": 4.6384515292617226e-06, "loss": 0.5335, "step": 2440 }, { "epoch": 0.19824575651750181, "grad_norm": 8.402046267183342, "learning_rate": 4.6381108069885376e-06, "loss": 0.5433, "step": 2441 }, { "epoch": 0.19832697149354342, "grad_norm": 3.6536735919997674, "learning_rate": 4.6377699367694e-06, "loss": 0.5768, "step": 2442 }, { "epoch": 0.198408186469585, "grad_norm": 6.961987557428991, "learning_rate": 4.637428918627896e-06, "loss": 0.5184, "step": 2443 }, { "epoch": 0.19848940144562657, "grad_norm": 5.119773267513261, "learning_rate": 4.637087752587624e-06, "loss": 0.6723, "step": 2444 }, { "epoch": 0.19857061642166815, "grad_norm": 5.37579569573017, "learning_rate": 4.636746438672189e-06, "loss": 0.4536, "step": 2445 }, { "epoch": 0.19865183139770973, "grad_norm": 4.8653070857140825, "learning_rate": 4.63640497690521e-06, "loss": 0.7141, "step": 2446 }, { "epoch": 0.19873304637375133, "grad_norm": 5.213638271403077, "learning_rate": 4.636063367310313e-06, "loss": 0.4026, "step": 2447 }, { "epoch": 0.1988142613497929, "grad_norm": 4.022917999876865, "learning_rate": 4.635721609911137e-06, "loss": 0.8184, "step": 2448 }, { "epoch": 0.19889547632583449, "grad_norm": 5.721442337840194, "learning_rate": 4.635379704731327e-06, "loss": 0.5173, "step": 2449 }, { "epoch": 0.19897669130187606, "grad_norm": 4.746208323114987, "learning_rate": 4.635037651794544e-06, "loss": 0.6091, "step": 2450 }, { "epoch": 0.19905790627791764, "grad_norm": 3.7466714374073393, "learning_rate": 4.634695451124454e-06, "loss": 0.5977, "step": 2451 }, { "epoch": 0.19913912125395922, "grad_norm": 5.06981488833001, "learning_rate": 4.634353102744737e-06, "loss": 0.4463, "step": 2452 }, { "epoch": 0.19922033623000082, "grad_norm": 4.625187502181156, "learning_rate": 4.634010606679081e-06, "loss": 0.5105, "step": 2453 }, { "epoch": 0.1993015512060424, "grad_norm": 4.590440891315889, "learning_rate": 4.633667962951186e-06, "loss": 0.5703, "step": 2454 }, { "epoch": 0.19938276618208398, "grad_norm": 8.282884396118154, "learning_rate": 4.6333251715847595e-06, "loss": 0.5573, "step": 2455 }, { "epoch": 0.19946398115812555, "grad_norm": 5.99824415434856, "learning_rate": 4.6329822326035214e-06, "loss": 0.6637, "step": 2456 }, { "epoch": 0.19954519613416713, "grad_norm": 4.69181027886392, "learning_rate": 4.632639146031201e-06, "loss": 0.548, "step": 2457 }, { "epoch": 0.19962641111020873, "grad_norm": 4.774146261889797, "learning_rate": 4.63229591189154e-06, "loss": 0.5267, "step": 2458 }, { "epoch": 0.1997076260862503, "grad_norm": 4.0388362634362505, "learning_rate": 4.631952530208286e-06, "loss": 0.6004, "step": 2459 }, { "epoch": 0.1997888410622919, "grad_norm": 11.372135655939333, "learning_rate": 4.6316090010052006e-06, "loss": 0.5377, "step": 2460 }, { "epoch": 0.19987005603833347, "grad_norm": 8.258738666338818, "learning_rate": 4.631265324306053e-06, "loss": 0.6255, "step": 2461 }, { "epoch": 0.19995127101437504, "grad_norm": 4.838265403873367, "learning_rate": 4.630921500134625e-06, "loss": 0.513, "step": 2462 }, { "epoch": 0.20003248599041662, "grad_norm": 6.210942277246636, "learning_rate": 4.630577528514707e-06, "loss": 0.5705, "step": 2463 }, { "epoch": 0.20011370096645822, "grad_norm": 4.867014588828138, "learning_rate": 4.6302334094701e-06, "loss": 0.7147, "step": 2464 }, { "epoch": 0.2001949159424998, "grad_norm": 3.4107828105330116, "learning_rate": 4.629889143024615e-06, "loss": 0.4722, "step": 2465 }, { "epoch": 0.20027613091854138, "grad_norm": 6.2252845176976574, "learning_rate": 4.6295447292020735e-06, "loss": 0.5165, "step": 2466 }, { "epoch": 0.20035734589458296, "grad_norm": 5.732379914152118, "learning_rate": 4.629200168026307e-06, "loss": 0.4561, "step": 2467 }, { "epoch": 0.20043856087062453, "grad_norm": 6.463671611218234, "learning_rate": 4.6288554595211575e-06, "loss": 0.5683, "step": 2468 }, { "epoch": 0.20051977584666614, "grad_norm": 6.359868474828741, "learning_rate": 4.628510603710478e-06, "loss": 0.5942, "step": 2469 }, { "epoch": 0.20060099082270771, "grad_norm": 5.920271897280425, "learning_rate": 4.628165600618129e-06, "loss": 0.5336, "step": 2470 }, { "epoch": 0.2006822057987493, "grad_norm": 4.578491403510653, "learning_rate": 4.627820450267984e-06, "loss": 0.4962, "step": 2471 }, { "epoch": 0.20076342077479087, "grad_norm": 3.829851338031233, "learning_rate": 4.627475152683924e-06, "loss": 0.554, "step": 2472 }, { "epoch": 0.20084463575083245, "grad_norm": 5.016784542180406, "learning_rate": 4.627129707889843e-06, "loss": 0.7465, "step": 2473 }, { "epoch": 0.20092585072687402, "grad_norm": 7.808176500440273, "learning_rate": 4.626784115909645e-06, "loss": 0.5234, "step": 2474 }, { "epoch": 0.20100706570291563, "grad_norm": 3.777859086820638, "learning_rate": 4.626438376767241e-06, "loss": 0.7113, "step": 2475 }, { "epoch": 0.2010882806789572, "grad_norm": 6.167573513223816, "learning_rate": 4.626092490486557e-06, "loss": 0.5625, "step": 2476 }, { "epoch": 0.20116949565499878, "grad_norm": 4.609400429566531, "learning_rate": 4.6257464570915235e-06, "loss": 0.39, "step": 2477 }, { "epoch": 0.20125071063104036, "grad_norm": 4.207144293674372, "learning_rate": 4.625400276606086e-06, "loss": 0.4698, "step": 2478 }, { "epoch": 0.20133192560708194, "grad_norm": 4.988407235613174, "learning_rate": 4.625053949054198e-06, "loss": 0.6785, "step": 2479 }, { "epoch": 0.20141314058312354, "grad_norm": 3.9367620435534314, "learning_rate": 4.6247074744598234e-06, "loss": 0.7332, "step": 2480 }, { "epoch": 0.20149435555916512, "grad_norm": 3.9271827060994564, "learning_rate": 4.6243608528469356e-06, "loss": 0.489, "step": 2481 }, { "epoch": 0.2015755705352067, "grad_norm": 5.02867311978453, "learning_rate": 4.6240140842395205e-06, "loss": 0.5228, "step": 2482 }, { "epoch": 0.20165678551124827, "grad_norm": 3.9679711814084406, "learning_rate": 4.623667168661572e-06, "loss": 0.8265, "step": 2483 }, { "epoch": 0.20173800048728985, "grad_norm": 4.395768395942478, "learning_rate": 4.623320106137095e-06, "loss": 0.5777, "step": 2484 }, { "epoch": 0.20181921546333143, "grad_norm": 4.203898879590677, "learning_rate": 4.6229728966901036e-06, "loss": 0.7345, "step": 2485 }, { "epoch": 0.20190043043937303, "grad_norm": 3.5440873158516673, "learning_rate": 4.622625540344623e-06, "loss": 0.6419, "step": 2486 }, { "epoch": 0.2019816454154146, "grad_norm": 2.545901550496782, "learning_rate": 4.62227803712469e-06, "loss": 0.6928, "step": 2487 }, { "epoch": 0.20206286039145618, "grad_norm": 4.89784077750463, "learning_rate": 4.621930387054349e-06, "loss": 0.5345, "step": 2488 }, { "epoch": 0.20214407536749776, "grad_norm": 5.15304789976238, "learning_rate": 4.621582590157654e-06, "loss": 0.5168, "step": 2489 }, { "epoch": 0.20222529034353934, "grad_norm": 3.6944859365300062, "learning_rate": 4.621234646458673e-06, "loss": 0.6469, "step": 2490 }, { "epoch": 0.20230650531958094, "grad_norm": 4.253884701651634, "learning_rate": 4.6208865559814805e-06, "loss": 0.6652, "step": 2491 }, { "epoch": 0.20238772029562252, "grad_norm": 5.40192379256261, "learning_rate": 4.620538318750163e-06, "loss": 0.5256, "step": 2492 }, { "epoch": 0.2024689352716641, "grad_norm": 3.4100834853157704, "learning_rate": 4.620189934788817e-06, "loss": 0.6897, "step": 2493 }, { "epoch": 0.20255015024770567, "grad_norm": 3.8199295718971102, "learning_rate": 4.6198414041215484e-06, "loss": 0.5001, "step": 2494 }, { "epoch": 0.20263136522374725, "grad_norm": 3.145895750195355, "learning_rate": 4.619492726772473e-06, "loss": 0.6773, "step": 2495 }, { "epoch": 0.20271258019978883, "grad_norm": 4.373203705255476, "learning_rate": 4.619143902765719e-06, "loss": 0.6648, "step": 2496 }, { "epoch": 0.20279379517583043, "grad_norm": 6.177848983200393, "learning_rate": 4.618794932125422e-06, "loss": 0.4791, "step": 2497 }, { "epoch": 0.202875010151872, "grad_norm": 3.952402300031322, "learning_rate": 4.61844581487573e-06, "loss": 0.8067, "step": 2498 }, { "epoch": 0.2029562251279136, "grad_norm": 3.670343152102567, "learning_rate": 4.618096551040798e-06, "loss": 0.502, "step": 2499 }, { "epoch": 0.20303744010395516, "grad_norm": 4.359414649768362, "learning_rate": 4.617747140644796e-06, "loss": 0.5255, "step": 2500 }, { "epoch": 0.20311865507999674, "grad_norm": 6.653021158021161, "learning_rate": 4.617397583711899e-06, "loss": 0.6263, "step": 2501 }, { "epoch": 0.20319987005603835, "grad_norm": 4.748016005088914, "learning_rate": 4.617047880266295e-06, "loss": 0.6699, "step": 2502 }, { "epoch": 0.20328108503207992, "grad_norm": 3.35437246059798, "learning_rate": 4.616698030332183e-06, "loss": 0.4972, "step": 2503 }, { "epoch": 0.2033623000081215, "grad_norm": 14.870459669223077, "learning_rate": 4.616348033933769e-06, "loss": 0.5515, "step": 2504 }, { "epoch": 0.20344351498416308, "grad_norm": 5.088873364458127, "learning_rate": 4.615997891095272e-06, "loss": 0.6022, "step": 2505 }, { "epoch": 0.20352472996020465, "grad_norm": 3.7990841422951447, "learning_rate": 4.6156476018409204e-06, "loss": 0.6326, "step": 2506 }, { "epoch": 0.20360594493624623, "grad_norm": 4.333396061155222, "learning_rate": 4.61529716619495e-06, "loss": 0.6267, "step": 2507 }, { "epoch": 0.20368715991228784, "grad_norm": 4.685302979204798, "learning_rate": 4.614946584181612e-06, "loss": 0.6191, "step": 2508 }, { "epoch": 0.2037683748883294, "grad_norm": 4.124779647332827, "learning_rate": 4.614595855825164e-06, "loss": 0.4998, "step": 2509 }, { "epoch": 0.203849589864371, "grad_norm": 4.501692024239236, "learning_rate": 4.6142449811498725e-06, "loss": 0.5782, "step": 2510 }, { "epoch": 0.20393080484041257, "grad_norm": 4.707923879718902, "learning_rate": 4.613893960180018e-06, "loss": 0.5297, "step": 2511 }, { "epoch": 0.20401201981645414, "grad_norm": 7.3215672264551035, "learning_rate": 4.613542792939891e-06, "loss": 0.7369, "step": 2512 }, { "epoch": 0.20409323479249575, "grad_norm": 5.2423338527201615, "learning_rate": 4.613191479453787e-06, "loss": 0.8081, "step": 2513 }, { "epoch": 0.20417444976853732, "grad_norm": 5.115816586092132, "learning_rate": 4.612840019746016e-06, "loss": 0.67, "step": 2514 }, { "epoch": 0.2042556647445789, "grad_norm": 3.4887642989488348, "learning_rate": 4.612488413840899e-06, "loss": 0.8183, "step": 2515 }, { "epoch": 0.20433687972062048, "grad_norm": 5.202669123420781, "learning_rate": 4.6121366617627635e-06, "loss": 0.4956, "step": 2516 }, { "epoch": 0.20441809469666206, "grad_norm": 4.783952508433326, "learning_rate": 4.6117847635359494e-06, "loss": 0.5427, "step": 2517 }, { "epoch": 0.20449930967270363, "grad_norm": 3.8836746569798, "learning_rate": 4.611432719184806e-06, "loss": 0.7236, "step": 2518 }, { "epoch": 0.20458052464874524, "grad_norm": 4.277705470777233, "learning_rate": 4.611080528733693e-06, "loss": 0.5379, "step": 2519 }, { "epoch": 0.20466173962478681, "grad_norm": 4.597920469594989, "learning_rate": 4.6107281922069805e-06, "loss": 0.6815, "step": 2520 }, { "epoch": 0.2047429546008284, "grad_norm": 5.510027910722754, "learning_rate": 4.610375709629047e-06, "loss": 0.5213, "step": 2521 }, { "epoch": 0.20482416957686997, "grad_norm": 26.571846192694977, "learning_rate": 4.610023081024284e-06, "loss": 0.6494, "step": 2522 }, { "epoch": 0.20490538455291155, "grad_norm": 7.215116215564211, "learning_rate": 4.6096703064170915e-06, "loss": 0.5739, "step": 2523 }, { "epoch": 0.20498659952895315, "grad_norm": 5.568682872058116, "learning_rate": 4.609317385831879e-06, "loss": 0.7281, "step": 2524 }, { "epoch": 0.20506781450499473, "grad_norm": 7.457927768406811, "learning_rate": 4.608964319293066e-06, "loss": 0.6219, "step": 2525 }, { "epoch": 0.2051490294810363, "grad_norm": 4.3162604433484155, "learning_rate": 4.6086111068250834e-06, "loss": 0.5108, "step": 2526 }, { "epoch": 0.20523024445707788, "grad_norm": 5.339053755707136, "learning_rate": 4.608257748452372e-06, "loss": 0.4834, "step": 2527 }, { "epoch": 0.20531145943311946, "grad_norm": 3.319564859771241, "learning_rate": 4.607904244199384e-06, "loss": 0.6459, "step": 2528 }, { "epoch": 0.20539267440916104, "grad_norm": 3.3949377280505657, "learning_rate": 4.6075505940905765e-06, "loss": 0.8157, "step": 2529 }, { "epoch": 0.20547388938520264, "grad_norm": 4.846601443639954, "learning_rate": 4.607196798150423e-06, "loss": 0.6862, "step": 2530 }, { "epoch": 0.20555510436124422, "grad_norm": 3.634653145365433, "learning_rate": 4.606842856403402e-06, "loss": 0.4921, "step": 2531 }, { "epoch": 0.2056363193372858, "grad_norm": 3.453187944742966, "learning_rate": 4.6064887688740065e-06, "loss": 0.7135, "step": 2532 }, { "epoch": 0.20571753431332737, "grad_norm": 6.8028434368171, "learning_rate": 4.606134535586737e-06, "loss": 0.7577, "step": 2533 }, { "epoch": 0.20579874928936895, "grad_norm": 5.81957789301719, "learning_rate": 4.605780156566103e-06, "loss": 0.6454, "step": 2534 }, { "epoch": 0.20587996426541055, "grad_norm": 3.726486607180334, "learning_rate": 4.6054256318366275e-06, "loss": 0.5258, "step": 2535 }, { "epoch": 0.20596117924145213, "grad_norm": 6.553307065329235, "learning_rate": 4.6050709614228416e-06, "loss": 0.6497, "step": 2536 }, { "epoch": 0.2060423942174937, "grad_norm": 6.182500794690838, "learning_rate": 4.604716145349285e-06, "loss": 0.5451, "step": 2537 }, { "epoch": 0.20612360919353528, "grad_norm": 3.621812427485916, "learning_rate": 4.604361183640511e-06, "loss": 0.5455, "step": 2538 }, { "epoch": 0.20620482416957686, "grad_norm": 6.106652776271362, "learning_rate": 4.60400607632108e-06, "loss": 0.5335, "step": 2539 }, { "epoch": 0.20628603914561844, "grad_norm": 8.175917425138248, "learning_rate": 4.603650823415563e-06, "loss": 0.5764, "step": 2540 }, { "epoch": 0.20636725412166004, "grad_norm": 4.015447399797649, "learning_rate": 4.603295424948544e-06, "loss": 0.514, "step": 2541 }, { "epoch": 0.20644846909770162, "grad_norm": 4.883808409792754, "learning_rate": 4.602939880944612e-06, "loss": 0.5623, "step": 2542 }, { "epoch": 0.2065296840737432, "grad_norm": 3.0577448372785745, "learning_rate": 4.6025841914283705e-06, "loss": 0.5176, "step": 2543 }, { "epoch": 0.20661089904978477, "grad_norm": 3.73178717833002, "learning_rate": 4.602228356424431e-06, "loss": 0.5816, "step": 2544 }, { "epoch": 0.20669211402582635, "grad_norm": 3.105000510085083, "learning_rate": 4.601872375957414e-06, "loss": 0.7313, "step": 2545 }, { "epoch": 0.20677332900186796, "grad_norm": 5.053370814100823, "learning_rate": 4.601516250051954e-06, "loss": 0.4578, "step": 2546 }, { "epoch": 0.20685454397790953, "grad_norm": 3.758806458666759, "learning_rate": 4.601159978732691e-06, "loss": 0.5377, "step": 2547 }, { "epoch": 0.2069357589539511, "grad_norm": 3.486408365033022, "learning_rate": 4.600803562024277e-06, "loss": 0.6031, "step": 2548 }, { "epoch": 0.2070169739299927, "grad_norm": 4.0286968600378925, "learning_rate": 4.6004469999513755e-06, "loss": 0.6561, "step": 2549 }, { "epoch": 0.20709818890603426, "grad_norm": 4.653795014118944, "learning_rate": 4.600090292538658e-06, "loss": 0.5098, "step": 2550 }, { "epoch": 0.20717940388207584, "grad_norm": 4.008738162793329, "learning_rate": 4.599733439810807e-06, "loss": 0.596, "step": 2551 }, { "epoch": 0.20726061885811745, "grad_norm": 6.379187521913423, "learning_rate": 4.5993764417925145e-06, "loss": 0.4166, "step": 2552 }, { "epoch": 0.20734183383415902, "grad_norm": 5.162913747178339, "learning_rate": 4.599019298508482e-06, "loss": 0.6534, "step": 2553 }, { "epoch": 0.2074230488102006, "grad_norm": 4.626210052826402, "learning_rate": 4.598662009983424e-06, "loss": 0.459, "step": 2554 }, { "epoch": 0.20750426378624218, "grad_norm": 4.343058752050401, "learning_rate": 4.598304576242063e-06, "loss": 0.5086, "step": 2555 }, { "epoch": 0.20758547876228375, "grad_norm": 7.300206881482361, "learning_rate": 4.597946997309129e-06, "loss": 0.5035, "step": 2556 }, { "epoch": 0.20766669373832536, "grad_norm": 3.0899015840363075, "learning_rate": 4.597589273209366e-06, "loss": 0.5187, "step": 2557 }, { "epoch": 0.20774790871436694, "grad_norm": 6.052813956883026, "learning_rate": 4.597231403967527e-06, "loss": 0.5379, "step": 2558 }, { "epoch": 0.2078291236904085, "grad_norm": 5.517294439794208, "learning_rate": 4.5968733896083745e-06, "loss": 0.8438, "step": 2559 }, { "epoch": 0.2079103386664501, "grad_norm": 4.128375503613039, "learning_rate": 4.59651523015668e-06, "loss": 0.6151, "step": 2560 }, { "epoch": 0.20799155364249167, "grad_norm": 6.118250286731413, "learning_rate": 4.5961569256372285e-06, "loss": 0.5982, "step": 2561 }, { "epoch": 0.20807276861853324, "grad_norm": 6.553871079659764, "learning_rate": 4.595798476074811e-06, "loss": 0.4833, "step": 2562 }, { "epoch": 0.20815398359457485, "grad_norm": 5.380395025356601, "learning_rate": 4.59543988149423e-06, "loss": 0.506, "step": 2563 }, { "epoch": 0.20823519857061643, "grad_norm": 5.37381105799694, "learning_rate": 4.595081141920301e-06, "loss": 0.4393, "step": 2564 }, { "epoch": 0.208316413546658, "grad_norm": 4.617124660591345, "learning_rate": 4.594722257377844e-06, "loss": 0.6313, "step": 2565 }, { "epoch": 0.20839762852269958, "grad_norm": 3.1785490526390974, "learning_rate": 4.594363227891693e-06, "loss": 0.6085, "step": 2566 }, { "epoch": 0.20847884349874116, "grad_norm": 4.209871969565939, "learning_rate": 4.5940040534866905e-06, "loss": 0.8759, "step": 2567 }, { "epoch": 0.20856005847478276, "grad_norm": 5.084178440847334, "learning_rate": 4.59364473418769e-06, "loss": 0.4414, "step": 2568 }, { "epoch": 0.20864127345082434, "grad_norm": 4.799592885282473, "learning_rate": 4.593285270019555e-06, "loss": 0.4927, "step": 2569 }, { "epoch": 0.20872248842686592, "grad_norm": 3.748127121013163, "learning_rate": 4.592925661007157e-06, "loss": 0.7891, "step": 2570 }, { "epoch": 0.2088037034029075, "grad_norm": 7.646211226477787, "learning_rate": 4.592565907175381e-06, "loss": 0.4914, "step": 2571 }, { "epoch": 0.20888491837894907, "grad_norm": 4.10621409423765, "learning_rate": 4.592206008549118e-06, "loss": 0.5923, "step": 2572 }, { "epoch": 0.20896613335499067, "grad_norm": 5.917514515415448, "learning_rate": 4.591845965153272e-06, "loss": 0.6719, "step": 2573 }, { "epoch": 0.20904734833103225, "grad_norm": 4.0238274192183985, "learning_rate": 4.591485777012757e-06, "loss": 0.636, "step": 2574 }, { "epoch": 0.20912856330707383, "grad_norm": 5.0280430135245116, "learning_rate": 4.591125444152495e-06, "loss": 0.7144, "step": 2575 }, { "epoch": 0.2092097782831154, "grad_norm": 5.449721642811628, "learning_rate": 4.590764966597419e-06, "loss": 0.5855, "step": 2576 }, { "epoch": 0.20929099325915698, "grad_norm": 5.452281756916645, "learning_rate": 4.590404344372472e-06, "loss": 0.5043, "step": 2577 }, { "epoch": 0.20937220823519856, "grad_norm": 7.9224346525989375, "learning_rate": 4.590043577502609e-06, "loss": 0.5569, "step": 2578 }, { "epoch": 0.20945342321124016, "grad_norm": 5.570197940829358, "learning_rate": 4.589682666012791e-06, "loss": 0.5516, "step": 2579 }, { "epoch": 0.20953463818728174, "grad_norm": 4.7228930255767425, "learning_rate": 4.5893216099279925e-06, "loss": 0.7413, "step": 2580 }, { "epoch": 0.20961585316332332, "grad_norm": 4.327696809821123, "learning_rate": 4.588960409273196e-06, "loss": 0.6607, "step": 2581 }, { "epoch": 0.2096970681393649, "grad_norm": 3.271070383423843, "learning_rate": 4.588599064073395e-06, "loss": 0.6295, "step": 2582 }, { "epoch": 0.20977828311540647, "grad_norm": 4.928203939520188, "learning_rate": 4.588237574353592e-06, "loss": 0.4415, "step": 2583 }, { "epoch": 0.20985949809144808, "grad_norm": 4.524773784130613, "learning_rate": 4.587875940138801e-06, "loss": 0.4493, "step": 2584 }, { "epoch": 0.20994071306748965, "grad_norm": 5.4612869658664005, "learning_rate": 4.587514161454045e-06, "loss": 0.6702, "step": 2585 }, { "epoch": 0.21002192804353123, "grad_norm": 5.877089399039422, "learning_rate": 4.587152238324357e-06, "loss": 0.5599, "step": 2586 }, { "epoch": 0.2101031430195728, "grad_norm": 5.615645343923605, "learning_rate": 4.58679017077478e-06, "loss": 0.6475, "step": 2587 }, { "epoch": 0.21018435799561438, "grad_norm": 4.466358103769594, "learning_rate": 4.586427958830367e-06, "loss": 0.7045, "step": 2588 }, { "epoch": 0.21026557297165596, "grad_norm": 3.905668647886352, "learning_rate": 4.586065602516182e-06, "loss": 0.5548, "step": 2589 }, { "epoch": 0.21034678794769757, "grad_norm": 5.834395303055314, "learning_rate": 4.585703101857298e-06, "loss": 0.4948, "step": 2590 }, { "epoch": 0.21042800292373914, "grad_norm": 5.414482998853449, "learning_rate": 4.585340456878798e-06, "loss": 0.4951, "step": 2591 }, { "epoch": 0.21050921789978072, "grad_norm": 3.1815206113624535, "learning_rate": 4.584977667605774e-06, "loss": 0.5363, "step": 2592 }, { "epoch": 0.2105904328758223, "grad_norm": 3.1419631204303915, "learning_rate": 4.5846147340633305e-06, "loss": 0.5075, "step": 2593 }, { "epoch": 0.21067164785186387, "grad_norm": 3.5141257327029205, "learning_rate": 4.58425165627658e-06, "loss": 0.543, "step": 2594 }, { "epoch": 0.21075286282790548, "grad_norm": 4.156860647624618, "learning_rate": 4.583888434270645e-06, "loss": 0.5039, "step": 2595 }, { "epoch": 0.21083407780394706, "grad_norm": 4.750813630585856, "learning_rate": 4.58352506807066e-06, "loss": 0.5138, "step": 2596 }, { "epoch": 0.21091529277998863, "grad_norm": 9.312381581764441, "learning_rate": 4.583161557701767e-06, "loss": 0.5763, "step": 2597 }, { "epoch": 0.2109965077560302, "grad_norm": 8.915059751168238, "learning_rate": 4.582797903189119e-06, "loss": 0.5809, "step": 2598 }, { "epoch": 0.2110777227320718, "grad_norm": 3.648070795784496, "learning_rate": 4.582434104557879e-06, "loss": 0.482, "step": 2599 }, { "epoch": 0.21115893770811336, "grad_norm": 4.2519549882216845, "learning_rate": 4.582070161833221e-06, "loss": 0.5429, "step": 2600 }, { "epoch": 0.21124015268415497, "grad_norm": 4.546548576906945, "learning_rate": 4.581706075040326e-06, "loss": 0.5398, "step": 2601 }, { "epoch": 0.21132136766019655, "grad_norm": 5.001987184534323, "learning_rate": 4.5813418442043885e-06, "loss": 0.5318, "step": 2602 }, { "epoch": 0.21140258263623812, "grad_norm": 6.362483259653969, "learning_rate": 4.58097746935061e-06, "loss": 0.5363, "step": 2603 }, { "epoch": 0.2114837976122797, "grad_norm": 4.0989340540279064, "learning_rate": 4.580612950504204e-06, "loss": 0.5545, "step": 2604 }, { "epoch": 0.21156501258832128, "grad_norm": 5.132444357319185, "learning_rate": 4.580248287690394e-06, "loss": 0.4948, "step": 2605 }, { "epoch": 0.21164622756436288, "grad_norm": 7.374853871968238, "learning_rate": 4.579883480934413e-06, "loss": 0.5281, "step": 2606 }, { "epoch": 0.21172744254040446, "grad_norm": 8.014579215103465, "learning_rate": 4.579518530261501e-06, "loss": 0.6517, "step": 2607 }, { "epoch": 0.21180865751644604, "grad_norm": 4.7524183309310315, "learning_rate": 4.579153435696913e-06, "loss": 0.5649, "step": 2608 }, { "epoch": 0.2118898724924876, "grad_norm": 3.2946505830547697, "learning_rate": 4.578788197265911e-06, "loss": 0.4996, "step": 2609 }, { "epoch": 0.2119710874685292, "grad_norm": 4.2400296703901645, "learning_rate": 4.578422814993768e-06, "loss": 0.5894, "step": 2610 }, { "epoch": 0.21205230244457077, "grad_norm": 5.74324372003261, "learning_rate": 4.578057288905766e-06, "loss": 0.6345, "step": 2611 }, { "epoch": 0.21213351742061237, "grad_norm": 3.66763262733531, "learning_rate": 4.577691619027197e-06, "loss": 0.5196, "step": 2612 }, { "epoch": 0.21221473239665395, "grad_norm": 7.157752603621458, "learning_rate": 4.577325805383364e-06, "loss": 0.4639, "step": 2613 }, { "epoch": 0.21229594737269553, "grad_norm": 6.775920682174733, "learning_rate": 4.57695984799958e-06, "loss": 0.5971, "step": 2614 }, { "epoch": 0.2123771623487371, "grad_norm": 5.5027350147320275, "learning_rate": 4.576593746901166e-06, "loss": 0.6259, "step": 2615 }, { "epoch": 0.21245837732477868, "grad_norm": 4.893766528880051, "learning_rate": 4.576227502113455e-06, "loss": 0.5943, "step": 2616 }, { "epoch": 0.21253959230082028, "grad_norm": 4.639805956669454, "learning_rate": 4.575861113661791e-06, "loss": 0.7361, "step": 2617 }, { "epoch": 0.21262080727686186, "grad_norm": 4.20860766014754, "learning_rate": 4.575494581571521e-06, "loss": 0.3826, "step": 2618 }, { "epoch": 0.21270202225290344, "grad_norm": 5.655888239003785, "learning_rate": 4.575127905868013e-06, "loss": 0.5891, "step": 2619 }, { "epoch": 0.21278323722894502, "grad_norm": 5.032532015247516, "learning_rate": 4.574761086576635e-06, "loss": 0.4914, "step": 2620 }, { "epoch": 0.2128644522049866, "grad_norm": 11.837098472717457, "learning_rate": 4.57439412372277e-06, "loss": 0.4658, "step": 2621 }, { "epoch": 0.21294566718102817, "grad_norm": 14.561447419694415, "learning_rate": 4.574027017331812e-06, "loss": 0.5611, "step": 2622 }, { "epoch": 0.21302688215706977, "grad_norm": 3.633636275262492, "learning_rate": 4.57365976742916e-06, "loss": 0.5933, "step": 2623 }, { "epoch": 0.21310809713311135, "grad_norm": 6.065413078817048, "learning_rate": 4.573292374040227e-06, "loss": 0.5984, "step": 2624 }, { "epoch": 0.21318931210915293, "grad_norm": 12.083369823179202, "learning_rate": 4.572924837190434e-06, "loss": 0.4037, "step": 2625 }, { "epoch": 0.2132705270851945, "grad_norm": 4.512701432450698, "learning_rate": 4.572557156905213e-06, "loss": 0.6599, "step": 2626 }, { "epoch": 0.21335174206123608, "grad_norm": 4.466963402457013, "learning_rate": 4.572189333210007e-06, "loss": 0.6408, "step": 2627 }, { "epoch": 0.2134329570372777, "grad_norm": 4.5137793929351115, "learning_rate": 4.571821366130265e-06, "loss": 0.4764, "step": 2628 }, { "epoch": 0.21351417201331926, "grad_norm": 3.8656056596236184, "learning_rate": 4.571453255691449e-06, "loss": 0.4507, "step": 2629 }, { "epoch": 0.21359538698936084, "grad_norm": 4.382341801906267, "learning_rate": 4.571085001919031e-06, "loss": 0.6934, "step": 2630 }, { "epoch": 0.21367660196540242, "grad_norm": 8.589875710292333, "learning_rate": 4.570716604838492e-06, "loss": 0.6249, "step": 2631 }, { "epoch": 0.213757816941444, "grad_norm": 4.007630827099167, "learning_rate": 4.570348064475323e-06, "loss": 0.6744, "step": 2632 }, { "epoch": 0.21383903191748557, "grad_norm": 5.484359947965143, "learning_rate": 4.569979380855025e-06, "loss": 0.5493, "step": 2633 }, { "epoch": 0.21392024689352718, "grad_norm": 4.493010697403236, "learning_rate": 4.56961055400311e-06, "loss": 0.5519, "step": 2634 }, { "epoch": 0.21400146186956875, "grad_norm": 4.045159603215263, "learning_rate": 4.5692415839450965e-06, "loss": 0.59, "step": 2635 }, { "epoch": 0.21408267684561033, "grad_norm": 4.821023174942057, "learning_rate": 4.568872470706518e-06, "loss": 0.3977, "step": 2636 }, { "epoch": 0.2141638918216519, "grad_norm": 5.675194155783661, "learning_rate": 4.568503214312913e-06, "loss": 0.5558, "step": 2637 }, { "epoch": 0.21424510679769349, "grad_norm": 4.7601824577948015, "learning_rate": 4.568133814789833e-06, "loss": 0.5638, "step": 2638 }, { "epoch": 0.2143263217737351, "grad_norm": 5.107493849030825, "learning_rate": 4.567764272162839e-06, "loss": 0.5493, "step": 2639 }, { "epoch": 0.21440753674977667, "grad_norm": 4.8431041110711766, "learning_rate": 4.567394586457501e-06, "loss": 0.5961, "step": 2640 }, { "epoch": 0.21448875172581824, "grad_norm": 6.24655286207524, "learning_rate": 4.567024757699399e-06, "loss": 0.5455, "step": 2641 }, { "epoch": 0.21456996670185982, "grad_norm": 4.819548308107245, "learning_rate": 4.566654785914123e-06, "loss": 0.5451, "step": 2642 }, { "epoch": 0.2146511816779014, "grad_norm": 8.148594207313314, "learning_rate": 4.566284671127273e-06, "loss": 0.4268, "step": 2643 }, { "epoch": 0.21473239665394298, "grad_norm": 5.200028510658096, "learning_rate": 4.56591441336446e-06, "loss": 0.5667, "step": 2644 }, { "epoch": 0.21481361162998458, "grad_norm": 3.953005099419873, "learning_rate": 4.565544012651304e-06, "loss": 0.9163, "step": 2645 }, { "epoch": 0.21489482660602616, "grad_norm": 6.166119347251547, "learning_rate": 4.565173469013432e-06, "loss": 0.4929, "step": 2646 }, { "epoch": 0.21497604158206773, "grad_norm": 17.196433928853956, "learning_rate": 4.564802782476487e-06, "loss": 0.5905, "step": 2647 }, { "epoch": 0.2150572565581093, "grad_norm": 6.178896351260531, "learning_rate": 4.564431953066118e-06, "loss": 0.7147, "step": 2648 }, { "epoch": 0.2151384715341509, "grad_norm": 6.440122516675341, "learning_rate": 4.564060980807983e-06, "loss": 0.7101, "step": 2649 }, { "epoch": 0.2152196865101925, "grad_norm": 5.2000555006994125, "learning_rate": 4.563689865727752e-06, "loss": 0.6096, "step": 2650 }, { "epoch": 0.21530090148623407, "grad_norm": 4.6419662478035155, "learning_rate": 4.563318607851104e-06, "loss": 0.5928, "step": 2651 }, { "epoch": 0.21538211646227565, "grad_norm": 8.141047545711643, "learning_rate": 4.562947207203728e-06, "loss": 0.4954, "step": 2652 }, { "epoch": 0.21546333143831722, "grad_norm": 3.9486721093318153, "learning_rate": 4.562575663811324e-06, "loss": 0.6568, "step": 2653 }, { "epoch": 0.2155445464143588, "grad_norm": 5.149290757056804, "learning_rate": 4.5622039776996006e-06, "loss": 0.5941, "step": 2654 }, { "epoch": 0.21562576139040038, "grad_norm": 6.391781782039198, "learning_rate": 4.561832148894275e-06, "loss": 0.4204, "step": 2655 }, { "epoch": 0.21570697636644198, "grad_norm": 9.968941914044036, "learning_rate": 4.561460177421078e-06, "loss": 0.5599, "step": 2656 }, { "epoch": 0.21578819134248356, "grad_norm": 5.522046324373681, "learning_rate": 4.561088063305745e-06, "loss": 0.6469, "step": 2657 }, { "epoch": 0.21586940631852514, "grad_norm": 3.8464087079545277, "learning_rate": 4.560715806574028e-06, "loss": 0.6275, "step": 2658 }, { "epoch": 0.2159506212945667, "grad_norm": 5.078649015582283, "learning_rate": 4.560343407251682e-06, "loss": 0.5032, "step": 2659 }, { "epoch": 0.2160318362706083, "grad_norm": 4.334473048243304, "learning_rate": 4.559970865364477e-06, "loss": 0.6853, "step": 2660 }, { "epoch": 0.2161130512466499, "grad_norm": 6.419291069877329, "learning_rate": 4.55959818093819e-06, "loss": 0.5029, "step": 2661 }, { "epoch": 0.21619426622269147, "grad_norm": 5.8025560914497385, "learning_rate": 4.559225353998609e-06, "loss": 0.4812, "step": 2662 }, { "epoch": 0.21627548119873305, "grad_norm": 4.30038594406105, "learning_rate": 4.558852384571533e-06, "loss": 0.5198, "step": 2663 }, { "epoch": 0.21635669617477463, "grad_norm": 4.101496826162754, "learning_rate": 4.558479272682768e-06, "loss": 0.6267, "step": 2664 }, { "epoch": 0.2164379111508162, "grad_norm": 5.736035126104208, "learning_rate": 4.558106018358131e-06, "loss": 0.5187, "step": 2665 }, { "epoch": 0.21651912612685778, "grad_norm": 5.345843996754261, "learning_rate": 4.557732621623449e-06, "loss": 0.5716, "step": 2666 }, { "epoch": 0.21660034110289939, "grad_norm": 4.853630853027857, "learning_rate": 4.557359082504562e-06, "loss": 0.4583, "step": 2667 }, { "epoch": 0.21668155607894096, "grad_norm": 3.473558582520133, "learning_rate": 4.556985401027314e-06, "loss": 0.4872, "step": 2668 }, { "epoch": 0.21676277105498254, "grad_norm": 6.011752197100141, "learning_rate": 4.556611577217563e-06, "loss": 0.5814, "step": 2669 }, { "epoch": 0.21684398603102412, "grad_norm": 4.738766968848329, "learning_rate": 4.5562376111011745e-06, "loss": 0.4709, "step": 2670 }, { "epoch": 0.2169252010070657, "grad_norm": 9.10744576822217, "learning_rate": 4.5558635027040265e-06, "loss": 0.542, "step": 2671 }, { "epoch": 0.2170064159831073, "grad_norm": 3.811056086790161, "learning_rate": 4.555489252052005e-06, "loss": 0.6779, "step": 2672 }, { "epoch": 0.21708763095914888, "grad_norm": 5.148351495523538, "learning_rate": 4.5551148591710045e-06, "loss": 0.491, "step": 2673 }, { "epoch": 0.21716884593519045, "grad_norm": 5.581434436680359, "learning_rate": 4.5547403240869335e-06, "loss": 0.6449, "step": 2674 }, { "epoch": 0.21725006091123203, "grad_norm": 3.742718625839809, "learning_rate": 4.554365646825706e-06, "loss": 0.7014, "step": 2675 }, { "epoch": 0.2173312758872736, "grad_norm": 6.612469532237822, "learning_rate": 4.5539908274132485e-06, "loss": 0.596, "step": 2676 }, { "epoch": 0.21741249086331518, "grad_norm": 4.306520677998056, "learning_rate": 4.553615865875496e-06, "loss": 0.7381, "step": 2677 }, { "epoch": 0.2174937058393568, "grad_norm": 4.614381185208202, "learning_rate": 4.553240762238394e-06, "loss": 0.6964, "step": 2678 }, { "epoch": 0.21757492081539836, "grad_norm": 7.406343400623716, "learning_rate": 4.552865516527899e-06, "loss": 0.5004, "step": 2679 }, { "epoch": 0.21765613579143994, "grad_norm": 3.9606433285981266, "learning_rate": 4.552490128769975e-06, "loss": 0.6269, "step": 2680 }, { "epoch": 0.21773735076748152, "grad_norm": 4.11495963139638, "learning_rate": 4.5521145989905955e-06, "loss": 0.5763, "step": 2681 }, { "epoch": 0.2178185657435231, "grad_norm": 4.588404789142679, "learning_rate": 4.551738927215747e-06, "loss": 0.4674, "step": 2682 }, { "epoch": 0.2178997807195647, "grad_norm": 7.297594706002203, "learning_rate": 4.5513631134714235e-06, "loss": 0.5801, "step": 2683 }, { "epoch": 0.21798099569560628, "grad_norm": 4.3047090539293045, "learning_rate": 4.550987157783629e-06, "loss": 0.4813, "step": 2684 }, { "epoch": 0.21806221067164785, "grad_norm": 5.768465577812249, "learning_rate": 4.550611060178378e-06, "loss": 0.567, "step": 2685 }, { "epoch": 0.21814342564768943, "grad_norm": 5.651861901253761, "learning_rate": 4.550234820681695e-06, "loss": 0.6873, "step": 2686 }, { "epoch": 0.218224640623731, "grad_norm": 9.183694716367905, "learning_rate": 4.549858439319612e-06, "loss": 0.6324, "step": 2687 }, { "epoch": 0.21830585559977259, "grad_norm": 7.585915804173281, "learning_rate": 4.549481916118174e-06, "loss": 0.5873, "step": 2688 }, { "epoch": 0.2183870705758142, "grad_norm": 3.760740144941256, "learning_rate": 4.5491052511034345e-06, "loss": 0.6382, "step": 2689 }, { "epoch": 0.21846828555185577, "grad_norm": 4.945745347505512, "learning_rate": 4.548728444301456e-06, "loss": 0.5818, "step": 2690 }, { "epoch": 0.21854950052789734, "grad_norm": 4.9769795576864215, "learning_rate": 4.548351495738312e-06, "loss": 0.7123, "step": 2691 }, { "epoch": 0.21863071550393892, "grad_norm": 4.481119259930459, "learning_rate": 4.547974405440085e-06, "loss": 0.4976, "step": 2692 }, { "epoch": 0.2187119304799805, "grad_norm": 5.7374511609884955, "learning_rate": 4.547597173432869e-06, "loss": 0.5487, "step": 2693 }, { "epoch": 0.2187931454560221, "grad_norm": 5.502760303179846, "learning_rate": 4.547219799742765e-06, "loss": 0.7687, "step": 2694 }, { "epoch": 0.21887436043206368, "grad_norm": 10.421957536535567, "learning_rate": 4.5468422843958845e-06, "loss": 0.5052, "step": 2695 }, { "epoch": 0.21895557540810526, "grad_norm": 3.6707221757502917, "learning_rate": 4.546464627418351e-06, "loss": 0.5888, "step": 2696 }, { "epoch": 0.21903679038414683, "grad_norm": 3.304249792857001, "learning_rate": 4.546086828836297e-06, "loss": 0.6277, "step": 2697 }, { "epoch": 0.2191180053601884, "grad_norm": 4.76062394973188, "learning_rate": 4.545708888675862e-06, "loss": 0.6074, "step": 2698 }, { "epoch": 0.21919922033623, "grad_norm": 4.462680720514244, "learning_rate": 4.5453308069632e-06, "loss": 0.5367, "step": 2699 }, { "epoch": 0.2192804353122716, "grad_norm": 7.372740705732592, "learning_rate": 4.54495258372447e-06, "loss": 0.5074, "step": 2700 }, { "epoch": 0.21936165028831317, "grad_norm": 7.860054087415267, "learning_rate": 4.544574218985845e-06, "loss": 0.4761, "step": 2701 }, { "epoch": 0.21944286526435475, "grad_norm": 8.081036919499534, "learning_rate": 4.544195712773504e-06, "loss": 0.5659, "step": 2702 }, { "epoch": 0.21952408024039632, "grad_norm": 5.8269057668228985, "learning_rate": 4.543817065113638e-06, "loss": 0.462, "step": 2703 }, { "epoch": 0.2196052952164379, "grad_norm": 4.2668146590202305, "learning_rate": 4.543438276032448e-06, "loss": 0.4101, "step": 2704 }, { "epoch": 0.2196865101924795, "grad_norm": 5.140738961702395, "learning_rate": 4.543059345556145e-06, "loss": 0.48, "step": 2705 }, { "epoch": 0.21976772516852108, "grad_norm": 5.170374218467899, "learning_rate": 4.542680273710947e-06, "loss": 0.5453, "step": 2706 }, { "epoch": 0.21984894014456266, "grad_norm": 5.753753477369303, "learning_rate": 4.542301060523086e-06, "loss": 0.5962, "step": 2707 }, { "epoch": 0.21993015512060424, "grad_norm": 4.532213136821538, "learning_rate": 4.541921706018799e-06, "loss": 0.561, "step": 2708 }, { "epoch": 0.22001137009664581, "grad_norm": 5.5635492887810125, "learning_rate": 4.541542210224337e-06, "loss": 0.6049, "step": 2709 }, { "epoch": 0.2200925850726874, "grad_norm": 4.789661183080903, "learning_rate": 4.5411625731659595e-06, "loss": 0.5815, "step": 2710 }, { "epoch": 0.220173800048729, "grad_norm": 3.231674262508777, "learning_rate": 4.540782794869933e-06, "loss": 0.5184, "step": 2711 }, { "epoch": 0.22025501502477057, "grad_norm": 6.341088144720809, "learning_rate": 4.5404028753625396e-06, "loss": 0.4839, "step": 2712 }, { "epoch": 0.22033623000081215, "grad_norm": 4.816690291787395, "learning_rate": 4.5400228146700654e-06, "loss": 0.6509, "step": 2713 }, { "epoch": 0.22041744497685373, "grad_norm": 3.6214196801416474, "learning_rate": 4.539642612818809e-06, "loss": 0.627, "step": 2714 }, { "epoch": 0.2204986599528953, "grad_norm": 7.585870445781305, "learning_rate": 4.539262269835078e-06, "loss": 0.4846, "step": 2715 }, { "epoch": 0.2205798749289369, "grad_norm": 5.185206149837472, "learning_rate": 4.538881785745191e-06, "loss": 0.5169, "step": 2716 }, { "epoch": 0.22066108990497849, "grad_norm": 9.120252331770077, "learning_rate": 4.538501160575475e-06, "loss": 0.5466, "step": 2717 }, { "epoch": 0.22074230488102006, "grad_norm": 4.271633224077144, "learning_rate": 4.538120394352267e-06, "loss": 0.5654, "step": 2718 }, { "epoch": 0.22082351985706164, "grad_norm": 6.425265204493728, "learning_rate": 4.5377394871019145e-06, "loss": 0.5984, "step": 2719 }, { "epoch": 0.22090473483310322, "grad_norm": 5.662199724686198, "learning_rate": 4.5373584388507745e-06, "loss": 0.5098, "step": 2720 }, { "epoch": 0.2209859498091448, "grad_norm": 7.194165344057847, "learning_rate": 4.536977249625213e-06, "loss": 0.49, "step": 2721 }, { "epoch": 0.2210671647851864, "grad_norm": 6.981685701050804, "learning_rate": 4.536595919451606e-06, "loss": 0.6383, "step": 2722 }, { "epoch": 0.22114837976122798, "grad_norm": 4.116671598914044, "learning_rate": 4.53621444835634e-06, "loss": 0.5659, "step": 2723 }, { "epoch": 0.22122959473726955, "grad_norm": 5.134961603692183, "learning_rate": 4.535832836365811e-06, "loss": 0.4805, "step": 2724 }, { "epoch": 0.22131080971331113, "grad_norm": 4.576284219113115, "learning_rate": 4.535451083506424e-06, "loss": 0.7364, "step": 2725 }, { "epoch": 0.2213920246893527, "grad_norm": 6.0501880740602365, "learning_rate": 4.535069189804594e-06, "loss": 0.5578, "step": 2726 }, { "epoch": 0.2214732396653943, "grad_norm": 4.658417802807258, "learning_rate": 4.534687155286747e-06, "loss": 0.5017, "step": 2727 }, { "epoch": 0.2215544546414359, "grad_norm": 3.5188270873838117, "learning_rate": 4.534304979979317e-06, "loss": 0.5166, "step": 2728 }, { "epoch": 0.22163566961747747, "grad_norm": 3.6771178180337865, "learning_rate": 4.53392266390875e-06, "loss": 0.717, "step": 2729 }, { "epoch": 0.22171688459351904, "grad_norm": 4.874076485337167, "learning_rate": 4.533540207101498e-06, "loss": 0.676, "step": 2730 }, { "epoch": 0.22179809956956062, "grad_norm": 3.3053339197225307, "learning_rate": 4.533157609584026e-06, "loss": 0.7047, "step": 2731 }, { "epoch": 0.2218793145456022, "grad_norm": 4.0334080892398845, "learning_rate": 4.532774871382807e-06, "loss": 0.6683, "step": 2732 }, { "epoch": 0.2219605295216438, "grad_norm": 7.9679246058681885, "learning_rate": 4.532391992524327e-06, "loss": 0.5633, "step": 2733 }, { "epoch": 0.22204174449768538, "grad_norm": 5.020060727741971, "learning_rate": 4.532008973035076e-06, "loss": 0.6868, "step": 2734 }, { "epoch": 0.22212295947372696, "grad_norm": 3.77106126429378, "learning_rate": 4.531625812941559e-06, "loss": 0.5032, "step": 2735 }, { "epoch": 0.22220417444976853, "grad_norm": 3.682569403476453, "learning_rate": 4.531242512270287e-06, "loss": 0.7004, "step": 2736 }, { "epoch": 0.2222853894258101, "grad_norm": 4.487348592439573, "learning_rate": 4.530859071047785e-06, "loss": 0.5239, "step": 2737 }, { "epoch": 0.22236660440185171, "grad_norm": 4.188000337446031, "learning_rate": 4.530475489300583e-06, "loss": 0.4732, "step": 2738 }, { "epoch": 0.2224478193778933, "grad_norm": 8.498349813305607, "learning_rate": 4.530091767055223e-06, "loss": 0.4986, "step": 2739 }, { "epoch": 0.22252903435393487, "grad_norm": 5.52528756591483, "learning_rate": 4.5297079043382566e-06, "loss": 0.6785, "step": 2740 }, { "epoch": 0.22261024932997645, "grad_norm": 5.752497762609521, "learning_rate": 4.529323901176245e-06, "loss": 0.4531, "step": 2741 }, { "epoch": 0.22269146430601802, "grad_norm": 3.092790657666851, "learning_rate": 4.52893975759576e-06, "loss": 0.7052, "step": 2742 }, { "epoch": 0.2227726792820596, "grad_norm": 4.559669547606855, "learning_rate": 4.528555473623381e-06, "loss": 0.5464, "step": 2743 }, { "epoch": 0.2228538942581012, "grad_norm": 3.5783978557136127, "learning_rate": 4.5281710492857e-06, "loss": 0.6876, "step": 2744 }, { "epoch": 0.22293510923414278, "grad_norm": 4.328813594256833, "learning_rate": 4.527786484609316e-06, "loss": 0.564, "step": 2745 }, { "epoch": 0.22301632421018436, "grad_norm": 3.952478916018802, "learning_rate": 4.52740177962084e-06, "loss": 0.5783, "step": 2746 }, { "epoch": 0.22309753918622593, "grad_norm": 4.924032799834654, "learning_rate": 4.52701693434689e-06, "loss": 0.7729, "step": 2747 }, { "epoch": 0.2231787541622675, "grad_norm": 5.139762166088582, "learning_rate": 4.526631948814096e-06, "loss": 0.5408, "step": 2748 }, { "epoch": 0.22325996913830912, "grad_norm": 4.468079919633165, "learning_rate": 4.5262468230490975e-06, "loss": 0.6876, "step": 2749 }, { "epoch": 0.2233411841143507, "grad_norm": 4.463738118530915, "learning_rate": 4.525861557078542e-06, "loss": 0.7465, "step": 2750 }, { "epoch": 0.22342239909039227, "grad_norm": 7.852415705301734, "learning_rate": 4.525476150929089e-06, "loss": 0.5134, "step": 2751 }, { "epoch": 0.22350361406643385, "grad_norm": 4.963966987188751, "learning_rate": 4.525090604627406e-06, "loss": 0.5476, "step": 2752 }, { "epoch": 0.22358482904247542, "grad_norm": 6.07590271752492, "learning_rate": 4.52470491820017e-06, "loss": 0.4523, "step": 2753 }, { "epoch": 0.223666044018517, "grad_norm": 4.302538032685536, "learning_rate": 4.52431909167407e-06, "loss": 0.6684, "step": 2754 }, { "epoch": 0.2237472589945586, "grad_norm": 4.256570317991106, "learning_rate": 4.5239331250758025e-06, "loss": 0.5804, "step": 2755 }, { "epoch": 0.22382847397060018, "grad_norm": 3.987772037631392, "learning_rate": 4.523547018432074e-06, "loss": 0.5361, "step": 2756 }, { "epoch": 0.22390968894664176, "grad_norm": 4.25893742048742, "learning_rate": 4.523160771769602e-06, "loss": 0.5403, "step": 2757 }, { "epoch": 0.22399090392268334, "grad_norm": 4.754148515001833, "learning_rate": 4.52277438511511e-06, "loss": 0.6913, "step": 2758 }, { "epoch": 0.22407211889872491, "grad_norm": 11.158110474612203, "learning_rate": 4.522387858495337e-06, "loss": 0.4877, "step": 2759 }, { "epoch": 0.22415333387476652, "grad_norm": 5.122504334003654, "learning_rate": 4.522001191937028e-06, "loss": 0.4932, "step": 2760 }, { "epoch": 0.2242345488508081, "grad_norm": 4.861511116519465, "learning_rate": 4.521614385466938e-06, "loss": 0.5527, "step": 2761 }, { "epoch": 0.22431576382684967, "grad_norm": 3.6596485107845003, "learning_rate": 4.521227439111831e-06, "loss": 0.7121, "step": 2762 }, { "epoch": 0.22439697880289125, "grad_norm": 4.522445266045689, "learning_rate": 4.520840352898483e-06, "loss": 0.5672, "step": 2763 }, { "epoch": 0.22447819377893283, "grad_norm": 7.737848493782743, "learning_rate": 4.520453126853677e-06, "loss": 0.5862, "step": 2764 }, { "epoch": 0.2245594087549744, "grad_norm": 6.141302741029408, "learning_rate": 4.520065761004209e-06, "loss": 0.5703, "step": 2765 }, { "epoch": 0.224640623731016, "grad_norm": 6.569966061095918, "learning_rate": 4.51967825537688e-06, "loss": 0.5038, "step": 2766 }, { "epoch": 0.2247218387070576, "grad_norm": 5.8858674844088235, "learning_rate": 4.5192906099985055e-06, "loss": 0.5216, "step": 2767 }, { "epoch": 0.22480305368309916, "grad_norm": 4.116267752952551, "learning_rate": 4.518902824895908e-06, "loss": 0.4604, "step": 2768 }, { "epoch": 0.22488426865914074, "grad_norm": 6.293436608006783, "learning_rate": 4.518514900095919e-06, "loss": 0.466, "step": 2769 }, { "epoch": 0.22496548363518232, "grad_norm": 8.339236282987327, "learning_rate": 4.518126835625382e-06, "loss": 0.5357, "step": 2770 }, { "epoch": 0.22504669861122392, "grad_norm": 4.726236800961924, "learning_rate": 4.51773863151115e-06, "loss": 0.6255, "step": 2771 }, { "epoch": 0.2251279135872655, "grad_norm": 4.1928620515740365, "learning_rate": 4.517350287780081e-06, "loss": 0.553, "step": 2772 }, { "epoch": 0.22520912856330708, "grad_norm": 6.445972691759003, "learning_rate": 4.51696180445905e-06, "loss": 0.4875, "step": 2773 }, { "epoch": 0.22529034353934865, "grad_norm": 14.794833956873413, "learning_rate": 4.516573181574937e-06, "loss": 0.5604, "step": 2774 }, { "epoch": 0.22537155851539023, "grad_norm": 5.562154003766993, "learning_rate": 4.516184419154633e-06, "loss": 0.5572, "step": 2775 }, { "epoch": 0.2254527734914318, "grad_norm": 3.647681045658577, "learning_rate": 4.515795517225037e-06, "loss": 0.617, "step": 2776 }, { "epoch": 0.2255339884674734, "grad_norm": 5.695037511735684, "learning_rate": 4.51540647581306e-06, "loss": 0.412, "step": 2777 }, { "epoch": 0.225615203443515, "grad_norm": 4.475228081800305, "learning_rate": 4.51501729494562e-06, "loss": 0.6703, "step": 2778 }, { "epoch": 0.22569641841955657, "grad_norm": 4.611656081080196, "learning_rate": 4.514627974649649e-06, "loss": 0.6964, "step": 2779 }, { "epoch": 0.22577763339559814, "grad_norm": 3.968385112319059, "learning_rate": 4.514238514952084e-06, "loss": 0.486, "step": 2780 }, { "epoch": 0.22585884837163972, "grad_norm": 4.069304366874197, "learning_rate": 4.513848915879874e-06, "loss": 0.501, "step": 2781 }, { "epoch": 0.22594006334768132, "grad_norm": 5.496815740257597, "learning_rate": 4.513459177459977e-06, "loss": 0.6377, "step": 2782 }, { "epoch": 0.2260212783237229, "grad_norm": 6.857579000068127, "learning_rate": 4.513069299719361e-06, "loss": 0.5332, "step": 2783 }, { "epoch": 0.22610249329976448, "grad_norm": 3.9422493886680354, "learning_rate": 4.512679282685003e-06, "loss": 0.7389, "step": 2784 }, { "epoch": 0.22618370827580606, "grad_norm": 3.4503956109068135, "learning_rate": 4.512289126383892e-06, "loss": 0.4416, "step": 2785 }, { "epoch": 0.22626492325184763, "grad_norm": 4.6592622473563665, "learning_rate": 4.511898830843022e-06, "loss": 0.5942, "step": 2786 }, { "epoch": 0.2263461382278892, "grad_norm": 5.905176155853469, "learning_rate": 4.511508396089401e-06, "loss": 0.5971, "step": 2787 }, { "epoch": 0.22642735320393081, "grad_norm": 3.3680130138338744, "learning_rate": 4.5111178221500455e-06, "loss": 0.5056, "step": 2788 }, { "epoch": 0.2265085681799724, "grad_norm": 3.6474870201796006, "learning_rate": 4.51072710905198e-06, "loss": 0.623, "step": 2789 }, { "epoch": 0.22658978315601397, "grad_norm": 5.380202578661765, "learning_rate": 4.5103362568222395e-06, "loss": 0.5094, "step": 2790 }, { "epoch": 0.22667099813205555, "grad_norm": 3.51083847592959, "learning_rate": 4.509945265487871e-06, "loss": 0.5929, "step": 2791 }, { "epoch": 0.22675221310809712, "grad_norm": 4.91331163188237, "learning_rate": 4.5095541350759265e-06, "loss": 0.5545, "step": 2792 }, { "epoch": 0.22683342808413873, "grad_norm": 3.869643139237871, "learning_rate": 4.5091628656134715e-06, "loss": 0.5104, "step": 2793 }, { "epoch": 0.2269146430601803, "grad_norm": 11.289546788075414, "learning_rate": 4.508771457127579e-06, "loss": 0.4783, "step": 2794 }, { "epoch": 0.22699585803622188, "grad_norm": 4.701834247380548, "learning_rate": 4.508379909645334e-06, "loss": 0.6242, "step": 2795 }, { "epoch": 0.22707707301226346, "grad_norm": 3.8081500149721608, "learning_rate": 4.5079882231938274e-06, "loss": 0.6682, "step": 2796 }, { "epoch": 0.22715828798830504, "grad_norm": 8.196015374000737, "learning_rate": 4.5075963978001634e-06, "loss": 0.5618, "step": 2797 }, { "epoch": 0.2272395029643466, "grad_norm": 3.4819053096272743, "learning_rate": 4.5072044334914546e-06, "loss": 0.4528, "step": 2798 }, { "epoch": 0.22732071794038822, "grad_norm": 5.156427328355886, "learning_rate": 4.506812330294821e-06, "loss": 0.5095, "step": 2799 }, { "epoch": 0.2274019329164298, "grad_norm": 3.7634245699967477, "learning_rate": 4.506420088237395e-06, "loss": 0.6707, "step": 2800 }, { "epoch": 0.22748314789247137, "grad_norm": 3.431587308735926, "learning_rate": 4.5060277073463174e-06, "loss": 0.566, "step": 2801 }, { "epoch": 0.22756436286851295, "grad_norm": 4.437013694098261, "learning_rate": 4.50563518764874e-06, "loss": 0.7123, "step": 2802 }, { "epoch": 0.22764557784455453, "grad_norm": 4.567183665178771, "learning_rate": 4.505242529171822e-06, "loss": 0.5152, "step": 2803 }, { "epoch": 0.22772679282059613, "grad_norm": 3.9017580026940664, "learning_rate": 4.504849731942734e-06, "loss": 0.5201, "step": 2804 }, { "epoch": 0.2278080077966377, "grad_norm": 4.536121689617816, "learning_rate": 4.504456795988654e-06, "loss": 0.6886, "step": 2805 }, { "epoch": 0.22788922277267928, "grad_norm": 10.84869509304053, "learning_rate": 4.504063721336773e-06, "loss": 0.5154, "step": 2806 }, { "epoch": 0.22797043774872086, "grad_norm": 5.404245994389873, "learning_rate": 4.503670508014289e-06, "loss": 0.5609, "step": 2807 }, { "epoch": 0.22805165272476244, "grad_norm": 6.579796252814421, "learning_rate": 4.50327715604841e-06, "loss": 0.523, "step": 2808 }, { "epoch": 0.22813286770080402, "grad_norm": 3.973362927458198, "learning_rate": 4.5028836654663535e-06, "loss": 0.5637, "step": 2809 }, { "epoch": 0.22821408267684562, "grad_norm": 5.793419725047325, "learning_rate": 4.502490036295348e-06, "loss": 0.5813, "step": 2810 }, { "epoch": 0.2282952976528872, "grad_norm": 4.201946994996218, "learning_rate": 4.50209626856263e-06, "loss": 0.3738, "step": 2811 }, { "epoch": 0.22837651262892877, "grad_norm": 4.182121020616893, "learning_rate": 4.501702362295446e-06, "loss": 0.5465, "step": 2812 }, { "epoch": 0.22845772760497035, "grad_norm": 3.5727363113753645, "learning_rate": 4.501308317521052e-06, "loss": 0.5189, "step": 2813 }, { "epoch": 0.22853894258101193, "grad_norm": 5.356568404800836, "learning_rate": 4.500914134266715e-06, "loss": 0.8021, "step": 2814 }, { "epoch": 0.22862015755705353, "grad_norm": 4.550631173863838, "learning_rate": 4.500519812559709e-06, "loss": 0.583, "step": 2815 }, { "epoch": 0.2287013725330951, "grad_norm": 4.328142108387039, "learning_rate": 4.50012535242732e-06, "loss": 0.4888, "step": 2816 }, { "epoch": 0.2287825875091367, "grad_norm": 4.774451426228126, "learning_rate": 4.499730753896841e-06, "loss": 0.4892, "step": 2817 }, { "epoch": 0.22886380248517826, "grad_norm": 4.5207191018836355, "learning_rate": 4.4993360169955784e-06, "loss": 0.4906, "step": 2818 }, { "epoch": 0.22894501746121984, "grad_norm": 3.12349199747274, "learning_rate": 4.498941141750845e-06, "loss": 0.6733, "step": 2819 }, { "epoch": 0.22902623243726142, "grad_norm": 3.911794379996745, "learning_rate": 4.498546128189963e-06, "loss": 0.5263, "step": 2820 }, { "epoch": 0.22910744741330302, "grad_norm": 5.139451559263501, "learning_rate": 4.498150976340266e-06, "loss": 0.5766, "step": 2821 }, { "epoch": 0.2291886623893446, "grad_norm": 4.27868628115041, "learning_rate": 4.497755686229097e-06, "loss": 0.529, "step": 2822 }, { "epoch": 0.22926987736538618, "grad_norm": 3.9641320918354372, "learning_rate": 4.497360257883808e-06, "loss": 0.5311, "step": 2823 }, { "epoch": 0.22935109234142775, "grad_norm": 3.5108973371563055, "learning_rate": 4.496964691331759e-06, "loss": 0.5227, "step": 2824 }, { "epoch": 0.22943230731746933, "grad_norm": 5.403815218671819, "learning_rate": 4.496568986600323e-06, "loss": 0.9042, "step": 2825 }, { "epoch": 0.22951352229351094, "grad_norm": 4.12742663870129, "learning_rate": 4.4961731437168795e-06, "loss": 0.7359, "step": 2826 }, { "epoch": 0.2295947372695525, "grad_norm": 3.568114706661449, "learning_rate": 4.4957771627088185e-06, "loss": 0.6484, "step": 2827 }, { "epoch": 0.2296759522455941, "grad_norm": 5.842995081486275, "learning_rate": 4.495381043603541e-06, "loss": 0.6022, "step": 2828 }, { "epoch": 0.22975716722163567, "grad_norm": 5.602774830593073, "learning_rate": 4.494984786428455e-06, "loss": 0.6084, "step": 2829 }, { "epoch": 0.22983838219767724, "grad_norm": 5.779320891496907, "learning_rate": 4.494588391210981e-06, "loss": 0.5428, "step": 2830 }, { "epoch": 0.22991959717371882, "grad_norm": 5.734619447546111, "learning_rate": 4.494191857978546e-06, "loss": 0.5494, "step": 2831 }, { "epoch": 0.23000081214976043, "grad_norm": 3.540278577269194, "learning_rate": 4.493795186758589e-06, "loss": 0.6195, "step": 2832 }, { "epoch": 0.230082027125802, "grad_norm": 5.458025507421001, "learning_rate": 4.493398377578557e-06, "loss": 0.5911, "step": 2833 }, { "epoch": 0.23016324210184358, "grad_norm": 3.0958255282858596, "learning_rate": 4.4930014304659066e-06, "loss": 0.6099, "step": 2834 }, { "epoch": 0.23024445707788516, "grad_norm": 4.075827299092992, "learning_rate": 4.492604345448106e-06, "loss": 0.5688, "step": 2835 }, { "epoch": 0.23032567205392673, "grad_norm": 5.7956480372939065, "learning_rate": 4.492207122552629e-06, "loss": 0.6, "step": 2836 }, { "epoch": 0.23040688702996834, "grad_norm": 5.13610708333408, "learning_rate": 4.491809761806964e-06, "loss": 0.5496, "step": 2837 }, { "epoch": 0.23048810200600992, "grad_norm": 5.910698565673017, "learning_rate": 4.491412263238605e-06, "loss": 0.4669, "step": 2838 }, { "epoch": 0.2305693169820515, "grad_norm": 3.4340630962930305, "learning_rate": 4.4910146268750555e-06, "loss": 0.5895, "step": 2839 }, { "epoch": 0.23065053195809307, "grad_norm": 4.029802742225141, "learning_rate": 4.490616852743832e-06, "loss": 0.5887, "step": 2840 }, { "epoch": 0.23073174693413465, "grad_norm": 5.37046614141328, "learning_rate": 4.490218940872457e-06, "loss": 0.6715, "step": 2841 }, { "epoch": 0.23081296191017622, "grad_norm": 6.574976668750347, "learning_rate": 4.489820891288466e-06, "loss": 0.6401, "step": 2842 }, { "epoch": 0.23089417688621783, "grad_norm": 4.12936501875235, "learning_rate": 4.489422704019399e-06, "loss": 0.6287, "step": 2843 }, { "epoch": 0.2309753918622594, "grad_norm": 4.078499752253989, "learning_rate": 4.489024379092809e-06, "loss": 0.5193, "step": 2844 }, { "epoch": 0.23105660683830098, "grad_norm": 8.139115119863513, "learning_rate": 4.48862591653626e-06, "loss": 0.415, "step": 2845 }, { "epoch": 0.23113782181434256, "grad_norm": 6.159373760750005, "learning_rate": 4.488227316377322e-06, "loss": 0.5214, "step": 2846 }, { "epoch": 0.23121903679038414, "grad_norm": 3.4411255569736956, "learning_rate": 4.487828578643576e-06, "loss": 0.541, "step": 2847 }, { "epoch": 0.23130025176642574, "grad_norm": 4.121631308793716, "learning_rate": 4.4874297033626126e-06, "loss": 0.6158, "step": 2848 }, { "epoch": 0.23138146674246732, "grad_norm": 3.8683262343429647, "learning_rate": 4.487030690562032e-06, "loss": 0.5957, "step": 2849 }, { "epoch": 0.2314626817185089, "grad_norm": 4.36066324441673, "learning_rate": 4.486631540269445e-06, "loss": 0.5908, "step": 2850 }, { "epoch": 0.23154389669455047, "grad_norm": 23.856582803005605, "learning_rate": 4.486232252512468e-06, "loss": 0.6196, "step": 2851 }, { "epoch": 0.23162511167059205, "grad_norm": 4.565500746372573, "learning_rate": 4.485832827318733e-06, "loss": 0.517, "step": 2852 }, { "epoch": 0.23170632664663363, "grad_norm": 5.415515197041984, "learning_rate": 4.485433264715874e-06, "loss": 0.7045, "step": 2853 }, { "epoch": 0.23178754162267523, "grad_norm": 7.611491735007953, "learning_rate": 4.485033564731542e-06, "loss": 0.632, "step": 2854 }, { "epoch": 0.2318687565987168, "grad_norm": 5.094950046339584, "learning_rate": 4.484633727393393e-06, "loss": 0.5768, "step": 2855 }, { "epoch": 0.23194997157475838, "grad_norm": 46.65401837853409, "learning_rate": 4.484233752729093e-06, "loss": 0.5038, "step": 2856 }, { "epoch": 0.23203118655079996, "grad_norm": 25.235913155436222, "learning_rate": 4.483833640766319e-06, "loss": 0.6252, "step": 2857 }, { "epoch": 0.23211240152684154, "grad_norm": 7.36939234328555, "learning_rate": 4.4834333915327564e-06, "loss": 0.6505, "step": 2858 }, { "epoch": 0.23219361650288314, "grad_norm": 4.830643690141342, "learning_rate": 4.483033005056101e-06, "loss": 0.5895, "step": 2859 }, { "epoch": 0.23227483147892472, "grad_norm": 9.183888328065596, "learning_rate": 4.482632481364055e-06, "loss": 0.4614, "step": 2860 }, { "epoch": 0.2323560464549663, "grad_norm": 4.613246918041221, "learning_rate": 4.482231820484336e-06, "loss": 0.6206, "step": 2861 }, { "epoch": 0.23243726143100787, "grad_norm": 7.324346436954002, "learning_rate": 4.4818310224446645e-06, "loss": 0.4812, "step": 2862 }, { "epoch": 0.23251847640704945, "grad_norm": 6.5981856748393755, "learning_rate": 4.481430087272776e-06, "loss": 0.6606, "step": 2863 }, { "epoch": 0.23259969138309103, "grad_norm": 4.619941208113225, "learning_rate": 4.481029014996412e-06, "loss": 0.4754, "step": 2864 }, { "epoch": 0.23268090635913263, "grad_norm": 5.7207154634398085, "learning_rate": 4.480627805643324e-06, "loss": 0.5482, "step": 2865 }, { "epoch": 0.2327621213351742, "grad_norm": 7.298594910122886, "learning_rate": 4.480226459241275e-06, "loss": 0.6597, "step": 2866 }, { "epoch": 0.2328433363112158, "grad_norm": 9.363398730309251, "learning_rate": 4.479824975818034e-06, "loss": 0.6121, "step": 2867 }, { "epoch": 0.23292455128725736, "grad_norm": 4.781902048228461, "learning_rate": 4.4794233554013835e-06, "loss": 0.5151, "step": 2868 }, { "epoch": 0.23300576626329894, "grad_norm": 2.949065575080919, "learning_rate": 4.479021598019113e-06, "loss": 0.5063, "step": 2869 }, { "epoch": 0.23308698123934055, "grad_norm": 4.021888781896201, "learning_rate": 4.4786197036990205e-06, "loss": 0.5932, "step": 2870 }, { "epoch": 0.23316819621538212, "grad_norm": 5.096647466230962, "learning_rate": 4.478217672468918e-06, "loss": 0.6553, "step": 2871 }, { "epoch": 0.2332494111914237, "grad_norm": 3.3544437426357434, "learning_rate": 4.47781550435662e-06, "loss": 0.7021, "step": 2872 }, { "epoch": 0.23333062616746528, "grad_norm": 3.955029448497719, "learning_rate": 4.4774131993899585e-06, "loss": 0.5843, "step": 2873 }, { "epoch": 0.23341184114350685, "grad_norm": 5.718278060740205, "learning_rate": 4.477010757596768e-06, "loss": 0.5778, "step": 2874 }, { "epoch": 0.23349305611954843, "grad_norm": 3.558212336992993, "learning_rate": 4.4766081790048965e-06, "loss": 0.4537, "step": 2875 }, { "epoch": 0.23357427109559004, "grad_norm": 3.821349848295602, "learning_rate": 4.4762054636422005e-06, "loss": 0.5913, "step": 2876 }, { "epoch": 0.2336554860716316, "grad_norm": 5.178508133946879, "learning_rate": 4.475802611536545e-06, "loss": 0.5516, "step": 2877 }, { "epoch": 0.2337367010476732, "grad_norm": 4.64155733262824, "learning_rate": 4.475399622715805e-06, "loss": 0.5186, "step": 2878 }, { "epoch": 0.23381791602371477, "grad_norm": 5.700083589065183, "learning_rate": 4.474996497207866e-06, "loss": 0.6616, "step": 2879 }, { "epoch": 0.23389913099975634, "grad_norm": 5.357424859826312, "learning_rate": 4.4745932350406225e-06, "loss": 0.6395, "step": 2880 }, { "epoch": 0.23398034597579795, "grad_norm": 5.301441690670492, "learning_rate": 4.474189836241976e-06, "loss": 0.5116, "step": 2881 }, { "epoch": 0.23406156095183953, "grad_norm": 3.7935528708576784, "learning_rate": 4.473786300839843e-06, "loss": 0.537, "step": 2882 }, { "epoch": 0.2341427759278811, "grad_norm": 4.6354063500549945, "learning_rate": 4.4733826288621435e-06, "loss": 0.4027, "step": 2883 }, { "epoch": 0.23422399090392268, "grad_norm": 4.258268978553181, "learning_rate": 4.47297882033681e-06, "loss": 0.5345, "step": 2884 }, { "epoch": 0.23430520587996426, "grad_norm": 4.645513866526441, "learning_rate": 4.472574875291784e-06, "loss": 0.3998, "step": 2885 }, { "epoch": 0.23438642085600583, "grad_norm": 5.592577886479116, "learning_rate": 4.472170793755016e-06, "loss": 0.5404, "step": 2886 }, { "epoch": 0.23446763583204744, "grad_norm": 4.864175858056895, "learning_rate": 4.471766575754467e-06, "loss": 0.4769, "step": 2887 }, { "epoch": 0.23454885080808902, "grad_norm": 5.3559791822991984, "learning_rate": 4.471362221318106e-06, "loss": 0.5522, "step": 2888 }, { "epoch": 0.2346300657841306, "grad_norm": 3.985841294995745, "learning_rate": 4.470957730473913e-06, "loss": 0.5913, "step": 2889 }, { "epoch": 0.23471128076017217, "grad_norm": 7.9982149419628765, "learning_rate": 4.470553103249876e-06, "loss": 0.5101, "step": 2890 }, { "epoch": 0.23479249573621375, "grad_norm": 3.748824385107628, "learning_rate": 4.470148339673993e-06, "loss": 0.5213, "step": 2891 }, { "epoch": 0.23487371071225535, "grad_norm": 4.816022996866285, "learning_rate": 4.469743439774272e-06, "loss": 0.5209, "step": 2892 }, { "epoch": 0.23495492568829693, "grad_norm": 5.863963017632638, "learning_rate": 4.46933840357873e-06, "loss": 0.5256, "step": 2893 }, { "epoch": 0.2350361406643385, "grad_norm": 4.876008648396625, "learning_rate": 4.468933231115393e-06, "loss": 0.53, "step": 2894 }, { "epoch": 0.23511735564038008, "grad_norm": 6.008481144166018, "learning_rate": 4.468527922412297e-06, "loss": 0.5812, "step": 2895 }, { "epoch": 0.23519857061642166, "grad_norm": 6.418455057806858, "learning_rate": 4.468122477497486e-06, "loss": 0.5318, "step": 2896 }, { "epoch": 0.23527978559246324, "grad_norm": 2.5240378044740415, "learning_rate": 4.467716896399017e-06, "loss": 0.5948, "step": 2897 }, { "epoch": 0.23536100056850484, "grad_norm": 4.361122855517343, "learning_rate": 4.4673111791449515e-06, "loss": 0.5233, "step": 2898 }, { "epoch": 0.23544221554454642, "grad_norm": 4.528523763290337, "learning_rate": 4.466905325763365e-06, "loss": 0.6362, "step": 2899 }, { "epoch": 0.235523430520588, "grad_norm": 6.538860668916424, "learning_rate": 4.4664993362823394e-06, "loss": 0.5745, "step": 2900 }, { "epoch": 0.23560464549662957, "grad_norm": 5.847755183433358, "learning_rate": 4.466093210729967e-06, "loss": 0.6584, "step": 2901 }, { "epoch": 0.23568586047267115, "grad_norm": 4.179474365670219, "learning_rate": 4.465686949134351e-06, "loss": 0.6396, "step": 2902 }, { "epoch": 0.23576707544871275, "grad_norm": 5.774986352129162, "learning_rate": 4.465280551523601e-06, "loss": 0.5861, "step": 2903 }, { "epoch": 0.23584829042475433, "grad_norm": 3.2478970012240325, "learning_rate": 4.464874017925837e-06, "loss": 0.6337, "step": 2904 }, { "epoch": 0.2359295054007959, "grad_norm": 6.068452434980089, "learning_rate": 4.46446734836919e-06, "loss": 0.6407, "step": 2905 }, { "epoch": 0.23601072037683749, "grad_norm": 3.2686213630532026, "learning_rate": 4.4640605428818e-06, "loss": 0.6955, "step": 2906 }, { "epoch": 0.23609193535287906, "grad_norm": 4.07905119935594, "learning_rate": 4.463653601491815e-06, "loss": 0.4519, "step": 2907 }, { "epoch": 0.23617315032892067, "grad_norm": 6.169592515743247, "learning_rate": 4.463246524227393e-06, "loss": 0.523, "step": 2908 }, { "epoch": 0.23625436530496224, "grad_norm": 4.321690950129896, "learning_rate": 4.462839311116702e-06, "loss": 0.522, "step": 2909 }, { "epoch": 0.23633558028100382, "grad_norm": 7.237413918755258, "learning_rate": 4.462431962187919e-06, "loss": 0.747, "step": 2910 }, { "epoch": 0.2364167952570454, "grad_norm": 3.734173317333828, "learning_rate": 4.46202447746923e-06, "loss": 0.6299, "step": 2911 }, { "epoch": 0.23649801023308697, "grad_norm": 5.880101867809025, "learning_rate": 4.461616856988831e-06, "loss": 0.6253, "step": 2912 }, { "epoch": 0.23657922520912855, "grad_norm": 4.370253406800247, "learning_rate": 4.461209100774928e-06, "loss": 0.5456, "step": 2913 }, { "epoch": 0.23666044018517016, "grad_norm": 4.39926590709237, "learning_rate": 4.460801208855734e-06, "loss": 0.427, "step": 2914 }, { "epoch": 0.23674165516121173, "grad_norm": 6.280728294974258, "learning_rate": 4.4603931812594735e-06, "loss": 0.7197, "step": 2915 }, { "epoch": 0.2368228701372533, "grad_norm": 6.982111134101197, "learning_rate": 4.45998501801438e-06, "loss": 0.4599, "step": 2916 }, { "epoch": 0.2369040851132949, "grad_norm": 7.601832967298451, "learning_rate": 4.459576719148697e-06, "loss": 0.4797, "step": 2917 }, { "epoch": 0.23698530008933646, "grad_norm": 5.8714956548009924, "learning_rate": 4.459168284690676e-06, "loss": 0.5447, "step": 2918 }, { "epoch": 0.23706651506537807, "grad_norm": 7.1059975498172685, "learning_rate": 4.458759714668578e-06, "loss": 0.6044, "step": 2919 }, { "epoch": 0.23714773004141965, "grad_norm": 4.612443635513808, "learning_rate": 4.458351009110675e-06, "loss": 0.5054, "step": 2920 }, { "epoch": 0.23722894501746122, "grad_norm": 4.117158983616684, "learning_rate": 4.457942168045246e-06, "loss": 0.6243, "step": 2921 }, { "epoch": 0.2373101599935028, "grad_norm": 4.243517625792392, "learning_rate": 4.457533191500581e-06, "loss": 0.6199, "step": 2922 }, { "epoch": 0.23739137496954438, "grad_norm": 6.196728526337707, "learning_rate": 4.45712407950498e-06, "loss": 0.4745, "step": 2923 }, { "epoch": 0.23747258994558595, "grad_norm": 4.089855406887254, "learning_rate": 4.45671483208675e-06, "loss": 0.5762, "step": 2924 }, { "epoch": 0.23755380492162756, "grad_norm": 8.680736194237051, "learning_rate": 4.45630544927421e-06, "loss": 0.6403, "step": 2925 }, { "epoch": 0.23763501989766914, "grad_norm": 3.4364871224606452, "learning_rate": 4.4558959310956865e-06, "loss": 0.5683, "step": 2926 }, { "epoch": 0.2377162348737107, "grad_norm": 4.94476918130175, "learning_rate": 4.4554862775795146e-06, "loss": 0.4727, "step": 2927 }, { "epoch": 0.2377974498497523, "grad_norm": 3.68514712087398, "learning_rate": 4.455076488754043e-06, "loss": 0.5719, "step": 2928 }, { "epoch": 0.23787866482579387, "grad_norm": 4.764481615544404, "learning_rate": 4.4546665646476254e-06, "loss": 0.575, "step": 2929 }, { "epoch": 0.23795987980183547, "grad_norm": 4.8777757605766405, "learning_rate": 4.4542565052886256e-06, "loss": 0.6751, "step": 2930 }, { "epoch": 0.23804109477787705, "grad_norm": 4.182540781833306, "learning_rate": 4.45384631070542e-06, "loss": 0.6563, "step": 2931 }, { "epoch": 0.23812230975391863, "grad_norm": 3.5345505370213264, "learning_rate": 4.453435980926388e-06, "loss": 0.5779, "step": 2932 }, { "epoch": 0.2382035247299602, "grad_norm": 8.12041102050761, "learning_rate": 4.453025515979926e-06, "loss": 0.5615, "step": 2933 }, { "epoch": 0.23828473970600178, "grad_norm": 6.278197290858452, "learning_rate": 4.452614915894434e-06, "loss": 0.4717, "step": 2934 }, { "epoch": 0.23836595468204336, "grad_norm": 3.470804618193489, "learning_rate": 4.452204180698325e-06, "loss": 0.5222, "step": 2935 }, { "epoch": 0.23844716965808496, "grad_norm": 4.736573933269891, "learning_rate": 4.451793310420017e-06, "loss": 0.6273, "step": 2936 }, { "epoch": 0.23852838463412654, "grad_norm": 6.791656553293532, "learning_rate": 4.451382305087943e-06, "loss": 0.6002, "step": 2937 }, { "epoch": 0.23860959961016812, "grad_norm": 4.478548259706515, "learning_rate": 4.450971164730541e-06, "loss": 0.4814, "step": 2938 }, { "epoch": 0.2386908145862097, "grad_norm": 3.681202303319773, "learning_rate": 4.4505598893762595e-06, "loss": 0.5895, "step": 2939 }, { "epoch": 0.23877202956225127, "grad_norm": 4.1452045492879055, "learning_rate": 4.4501484790535555e-06, "loss": 0.6095, "step": 2940 }, { "epoch": 0.23885324453829287, "grad_norm": 6.265763596141227, "learning_rate": 4.449736933790899e-06, "loss": 0.6445, "step": 2941 }, { "epoch": 0.23893445951433445, "grad_norm": 3.195443239329471, "learning_rate": 4.449325253616765e-06, "loss": 0.6796, "step": 2942 }, { "epoch": 0.23901567449037603, "grad_norm": 4.681861481447409, "learning_rate": 4.448913438559641e-06, "loss": 0.5013, "step": 2943 }, { "epoch": 0.2390968894664176, "grad_norm": 3.9893399361235873, "learning_rate": 4.448501488648021e-06, "loss": 0.4841, "step": 2944 }, { "epoch": 0.23917810444245918, "grad_norm": 8.26472685534627, "learning_rate": 4.448089403910411e-06, "loss": 0.65, "step": 2945 }, { "epoch": 0.23925931941850076, "grad_norm": 7.279481691054026, "learning_rate": 4.447677184375323e-06, "loss": 0.6863, "step": 2946 }, { "epoch": 0.23934053439454236, "grad_norm": 5.107617453615278, "learning_rate": 4.447264830071282e-06, "loss": 0.4314, "step": 2947 }, { "epoch": 0.23942174937058394, "grad_norm": 10.45135842568149, "learning_rate": 4.446852341026822e-06, "loss": 0.5368, "step": 2948 }, { "epoch": 0.23950296434662552, "grad_norm": 3.384814108308006, "learning_rate": 4.4464397172704825e-06, "loss": 0.5992, "step": 2949 }, { "epoch": 0.2395841793226671, "grad_norm": 3.6038417485455723, "learning_rate": 4.446026958830816e-06, "loss": 0.6577, "step": 2950 }, { "epoch": 0.23966539429870867, "grad_norm": 5.105585351046193, "learning_rate": 4.4456140657363824e-06, "loss": 0.5815, "step": 2951 }, { "epoch": 0.23974660927475028, "grad_norm": 6.436128437554316, "learning_rate": 4.445201038015753e-06, "loss": 0.6109, "step": 2952 }, { "epoch": 0.23982782425079185, "grad_norm": 6.0286114195287, "learning_rate": 4.4447878756975074e-06, "loss": 0.5737, "step": 2953 }, { "epoch": 0.23990903922683343, "grad_norm": 6.340347602254372, "learning_rate": 4.444374578810233e-06, "loss": 0.6333, "step": 2954 }, { "epoch": 0.239990254202875, "grad_norm": 3.611889348535864, "learning_rate": 4.443961147382528e-06, "loss": 0.5584, "step": 2955 }, { "epoch": 0.24007146917891659, "grad_norm": 4.508170768419845, "learning_rate": 4.4435475814429995e-06, "loss": 0.5662, "step": 2956 }, { "epoch": 0.24015268415495816, "grad_norm": 3.857621418576937, "learning_rate": 4.4431338810202655e-06, "loss": 0.5413, "step": 2957 }, { "epoch": 0.24023389913099977, "grad_norm": 5.167525653044904, "learning_rate": 4.4427200461429494e-06, "loss": 0.5279, "step": 2958 }, { "epoch": 0.24031511410704134, "grad_norm": 9.873321863622351, "learning_rate": 4.442306076839689e-06, "loss": 0.4993, "step": 2959 }, { "epoch": 0.24039632908308292, "grad_norm": 9.73450941091562, "learning_rate": 4.441891973139127e-06, "loss": 0.5868, "step": 2960 }, { "epoch": 0.2404775440591245, "grad_norm": 3.7053040122979297, "learning_rate": 4.441477735069918e-06, "loss": 0.6352, "step": 2961 }, { "epoch": 0.24055875903516608, "grad_norm": 5.920625186279193, "learning_rate": 4.441063362660726e-06, "loss": 0.6923, "step": 2962 }, { "epoch": 0.24063997401120768, "grad_norm": 6.875957645683971, "learning_rate": 4.44064885594022e-06, "loss": 0.5829, "step": 2963 }, { "epoch": 0.24072118898724926, "grad_norm": 6.3088089208901215, "learning_rate": 4.440234214937086e-06, "loss": 0.4949, "step": 2964 }, { "epoch": 0.24080240396329083, "grad_norm": 6.283915304694119, "learning_rate": 4.439819439680012e-06, "loss": 0.5373, "step": 2965 }, { "epoch": 0.2408836189393324, "grad_norm": 5.544344295999567, "learning_rate": 4.439404530197699e-06, "loss": 0.5615, "step": 2966 }, { "epoch": 0.240964833915374, "grad_norm": 5.522735082552348, "learning_rate": 4.438989486518856e-06, "loss": 0.496, "step": 2967 }, { "epoch": 0.24104604889141557, "grad_norm": 6.134276115094066, "learning_rate": 4.438574308672203e-06, "loss": 0.3989, "step": 2968 }, { "epoch": 0.24112726386745717, "grad_norm": 18.34187318276363, "learning_rate": 4.438158996686468e-06, "loss": 0.4992, "step": 2969 }, { "epoch": 0.24120847884349875, "grad_norm": 5.7407597574290605, "learning_rate": 4.4377435505903876e-06, "loss": 0.7617, "step": 2970 }, { "epoch": 0.24128969381954032, "grad_norm": 5.129242694583421, "learning_rate": 4.4373279704127095e-06, "loss": 0.6338, "step": 2971 }, { "epoch": 0.2413709087955819, "grad_norm": 5.956882401490225, "learning_rate": 4.4369122561821885e-06, "loss": 0.4831, "step": 2972 }, { "epoch": 0.24145212377162348, "grad_norm": 4.390822342695221, "learning_rate": 4.436496407927591e-06, "loss": 0.5962, "step": 2973 }, { "epoch": 0.24153333874766508, "grad_norm": 8.138342846156773, "learning_rate": 4.436080425677689e-06, "loss": 0.5129, "step": 2974 }, { "epoch": 0.24161455372370666, "grad_norm": 3.692323205845064, "learning_rate": 4.43566430946127e-06, "loss": 0.6631, "step": 2975 }, { "epoch": 0.24169576869974824, "grad_norm": 4.598524784324876, "learning_rate": 4.435248059307124e-06, "loss": 0.5203, "step": 2976 }, { "epoch": 0.2417769836757898, "grad_norm": 36.50302303476803, "learning_rate": 4.434831675244056e-06, "loss": 0.5976, "step": 2977 }, { "epoch": 0.2418581986518314, "grad_norm": 4.225579732329154, "learning_rate": 4.434415157300875e-06, "loss": 0.7187, "step": 2978 }, { "epoch": 0.24193941362787297, "grad_norm": 6.144364914404981, "learning_rate": 4.433998505506402e-06, "loss": 0.5036, "step": 2979 }, { "epoch": 0.24202062860391457, "grad_norm": 5.890869286549283, "learning_rate": 4.433581719889469e-06, "loss": 0.5174, "step": 2980 }, { "epoch": 0.24210184357995615, "grad_norm": 4.666862090401287, "learning_rate": 4.433164800478914e-06, "loss": 0.5758, "step": 2981 }, { "epoch": 0.24218305855599773, "grad_norm": 4.488080886068684, "learning_rate": 4.432747747303586e-06, "loss": 0.5223, "step": 2982 }, { "epoch": 0.2422642735320393, "grad_norm": 4.722733122087884, "learning_rate": 4.432330560392343e-06, "loss": 0.5231, "step": 2983 }, { "epoch": 0.24234548850808088, "grad_norm": 10.302897259600046, "learning_rate": 4.431913239774052e-06, "loss": 0.6269, "step": 2984 }, { "epoch": 0.24242670348412249, "grad_norm": 4.986694907260506, "learning_rate": 4.4314957854775895e-06, "loss": 0.4772, "step": 2985 }, { "epoch": 0.24250791846016406, "grad_norm": 28.644633867544297, "learning_rate": 4.43107819753184e-06, "loss": 0.4776, "step": 2986 }, { "epoch": 0.24258913343620564, "grad_norm": 4.650389954451338, "learning_rate": 4.4306604759657e-06, "loss": 0.6489, "step": 2987 }, { "epoch": 0.24267034841224722, "grad_norm": 8.110031601217097, "learning_rate": 4.430242620808073e-06, "loss": 0.4797, "step": 2988 }, { "epoch": 0.2427515633882888, "grad_norm": 4.435012545474071, "learning_rate": 4.429824632087873e-06, "loss": 0.5772, "step": 2989 }, { "epoch": 0.24283277836433037, "grad_norm": 6.6461768049947505, "learning_rate": 4.42940650983402e-06, "loss": 0.6512, "step": 2990 }, { "epoch": 0.24291399334037198, "grad_norm": 4.2488570760327935, "learning_rate": 4.428988254075449e-06, "loss": 0.4932, "step": 2991 }, { "epoch": 0.24299520831641355, "grad_norm": 3.4583725498967657, "learning_rate": 4.4285698648411005e-06, "loss": 0.6507, "step": 2992 }, { "epoch": 0.24307642329245513, "grad_norm": 8.442412186418991, "learning_rate": 4.428151342159923e-06, "loss": 0.7045, "step": 2993 }, { "epoch": 0.2431576382684967, "grad_norm": 4.275956058237837, "learning_rate": 4.427732686060877e-06, "loss": 0.9022, "step": 2994 }, { "epoch": 0.24323885324453828, "grad_norm": 3.5996383051852328, "learning_rate": 4.427313896572933e-06, "loss": 0.5063, "step": 2995 }, { "epoch": 0.2433200682205799, "grad_norm": 8.226615556870824, "learning_rate": 4.426894973725066e-06, "loss": 0.5054, "step": 2996 }, { "epoch": 0.24340128319662147, "grad_norm": 4.428674996775757, "learning_rate": 4.426475917546266e-06, "loss": 0.5189, "step": 2997 }, { "epoch": 0.24348249817266304, "grad_norm": 3.292995498057493, "learning_rate": 4.426056728065527e-06, "loss": 0.5175, "step": 2998 }, { "epoch": 0.24356371314870462, "grad_norm": 5.396362723485166, "learning_rate": 4.425637405311857e-06, "loss": 0.5239, "step": 2999 }, { "epoch": 0.2436449281247462, "grad_norm": 4.409324480057902, "learning_rate": 4.425217949314269e-06, "loss": 0.5758, "step": 3000 }, { "epoch": 0.24372614310078777, "grad_norm": 4.772191402586126, "learning_rate": 4.424798360101788e-06, "loss": 0.5178, "step": 3001 }, { "epoch": 0.24380735807682938, "grad_norm": 4.70270174443938, "learning_rate": 4.424378637703448e-06, "loss": 0.5634, "step": 3002 }, { "epoch": 0.24388857305287096, "grad_norm": 4.384411856516953, "learning_rate": 4.423958782148291e-06, "loss": 0.6138, "step": 3003 }, { "epoch": 0.24396978802891253, "grad_norm": 4.849333757605469, "learning_rate": 4.423538793465368e-06, "loss": 0.5399, "step": 3004 }, { "epoch": 0.2440510030049541, "grad_norm": 7.683916564358679, "learning_rate": 4.423118671683741e-06, "loss": 0.5882, "step": 3005 }, { "epoch": 0.24413221798099569, "grad_norm": 6.414015664183622, "learning_rate": 4.42269841683248e-06, "loss": 0.501, "step": 3006 }, { "epoch": 0.2442134329570373, "grad_norm": 3.445748481334447, "learning_rate": 4.422278028940664e-06, "loss": 0.7989, "step": 3007 }, { "epoch": 0.24429464793307887, "grad_norm": 3.446395382611435, "learning_rate": 4.4218575080373825e-06, "loss": 0.6564, "step": 3008 }, { "epoch": 0.24437586290912044, "grad_norm": 7.075485495123882, "learning_rate": 4.421436854151731e-06, "loss": 0.5491, "step": 3009 }, { "epoch": 0.24445707788516202, "grad_norm": 13.379409500342488, "learning_rate": 4.421016067312821e-06, "loss": 0.5192, "step": 3010 }, { "epoch": 0.2445382928612036, "grad_norm": 6.268741236760656, "learning_rate": 4.420595147549764e-06, "loss": 0.5197, "step": 3011 }, { "epoch": 0.24461950783724518, "grad_norm": 4.324676469837308, "learning_rate": 4.420174094891688e-06, "loss": 0.6056, "step": 3012 }, { "epoch": 0.24470072281328678, "grad_norm": 4.740074003918016, "learning_rate": 4.419752909367727e-06, "loss": 0.4472, "step": 3013 }, { "epoch": 0.24478193778932836, "grad_norm": 4.345165270453679, "learning_rate": 4.419331591007025e-06, "loss": 0.662, "step": 3014 }, { "epoch": 0.24486315276536993, "grad_norm": 3.2912342359960913, "learning_rate": 4.418910139838734e-06, "loss": 0.6347, "step": 3015 }, { "epoch": 0.2449443677414115, "grad_norm": 4.197613659900217, "learning_rate": 4.418488555892018e-06, "loss": 0.709, "step": 3016 }, { "epoch": 0.2450255827174531, "grad_norm": 6.165218832788323, "learning_rate": 4.418066839196047e-06, "loss": 0.5097, "step": 3017 }, { "epoch": 0.2451067976934947, "grad_norm": 5.754235502940853, "learning_rate": 4.4176449897800025e-06, "loss": 0.4602, "step": 3018 }, { "epoch": 0.24518801266953627, "grad_norm": 6.807261983784002, "learning_rate": 4.417223007673073e-06, "loss": 0.6116, "step": 3019 }, { "epoch": 0.24526922764557785, "grad_norm": 5.874117891239969, "learning_rate": 4.4168008929044585e-06, "loss": 0.4809, "step": 3020 }, { "epoch": 0.24535044262161942, "grad_norm": 4.771234322125993, "learning_rate": 4.416378645503366e-06, "loss": 0.5298, "step": 3021 }, { "epoch": 0.245431657597661, "grad_norm": 5.033152836788516, "learning_rate": 4.415956265499014e-06, "loss": 0.5011, "step": 3022 }, { "epoch": 0.24551287257370258, "grad_norm": 4.892045552702636, "learning_rate": 4.415533752920629e-06, "loss": 0.5847, "step": 3023 }, { "epoch": 0.24559408754974418, "grad_norm": 3.158494131947643, "learning_rate": 4.415111107797445e-06, "loss": 0.5626, "step": 3024 }, { "epoch": 0.24567530252578576, "grad_norm": 19.734315350148133, "learning_rate": 4.414688330158709e-06, "loss": 0.6228, "step": 3025 }, { "epoch": 0.24575651750182734, "grad_norm": 3.975333547273237, "learning_rate": 4.4142654200336735e-06, "loss": 0.6962, "step": 3026 }, { "epoch": 0.24583773247786891, "grad_norm": 5.615922654297775, "learning_rate": 4.413842377451602e-06, "loss": 0.541, "step": 3027 }, { "epoch": 0.2459189474539105, "grad_norm": 9.003336691465453, "learning_rate": 4.4134192024417674e-06, "loss": 0.4546, "step": 3028 }, { "epoch": 0.2460001624299521, "grad_norm": 3.891965066900615, "learning_rate": 4.412995895033449e-06, "loss": 0.4849, "step": 3029 }, { "epoch": 0.24608137740599367, "grad_norm": 4.878722949580337, "learning_rate": 4.412572455255942e-06, "loss": 0.615, "step": 3030 }, { "epoch": 0.24616259238203525, "grad_norm": 7.419504517267929, "learning_rate": 4.412148883138541e-06, "loss": 0.5744, "step": 3031 }, { "epoch": 0.24624380735807683, "grad_norm": 15.433110249433737, "learning_rate": 4.4117251787105566e-06, "loss": 0.4246, "step": 3032 }, { "epoch": 0.2463250223341184, "grad_norm": 4.626381274322953, "learning_rate": 4.411301342001309e-06, "loss": 0.4779, "step": 3033 }, { "epoch": 0.24640623731015998, "grad_norm": 3.140055018404333, "learning_rate": 4.4108773730401235e-06, "loss": 0.5733, "step": 3034 }, { "epoch": 0.24648745228620159, "grad_norm": 5.288841429027363, "learning_rate": 4.410453271856337e-06, "loss": 0.5525, "step": 3035 }, { "epoch": 0.24656866726224316, "grad_norm": 5.257436077679411, "learning_rate": 4.410029038479295e-06, "loss": 0.5659, "step": 3036 }, { "epoch": 0.24664988223828474, "grad_norm": 5.634845248429157, "learning_rate": 4.409604672938352e-06, "loss": 0.5136, "step": 3037 }, { "epoch": 0.24673109721432632, "grad_norm": 6.3457504654963195, "learning_rate": 4.409180175262872e-06, "loss": 0.5139, "step": 3038 }, { "epoch": 0.2468123121903679, "grad_norm": 6.705941236376485, "learning_rate": 4.408755545482229e-06, "loss": 0.6184, "step": 3039 }, { "epoch": 0.2468935271664095, "grad_norm": 4.672344302149794, "learning_rate": 4.408330783625803e-06, "loss": 0.6296, "step": 3040 }, { "epoch": 0.24697474214245108, "grad_norm": 7.4047101228444685, "learning_rate": 4.407905889722987e-06, "loss": 0.3766, "step": 3041 }, { "epoch": 0.24705595711849265, "grad_norm": 4.2484417683054545, "learning_rate": 4.407480863803181e-06, "loss": 0.5838, "step": 3042 }, { "epoch": 0.24713717209453423, "grad_norm": 5.209047094212918, "learning_rate": 4.407055705895794e-06, "loss": 0.6218, "step": 3043 }, { "epoch": 0.2472183870705758, "grad_norm": 4.0255565337154025, "learning_rate": 4.4066304160302455e-06, "loss": 0.5566, "step": 3044 }, { "epoch": 0.24729960204661738, "grad_norm": 4.333677201999764, "learning_rate": 4.4062049942359634e-06, "loss": 0.5295, "step": 3045 }, { "epoch": 0.247380817022659, "grad_norm": 4.229077150168856, "learning_rate": 4.405779440542383e-06, "loss": 0.7309, "step": 3046 }, { "epoch": 0.24746203199870057, "grad_norm": 7.678488339873706, "learning_rate": 4.405353754978952e-06, "loss": 0.5675, "step": 3047 }, { "epoch": 0.24754324697474214, "grad_norm": 5.135214318233568, "learning_rate": 4.404927937575125e-06, "loss": 0.6678, "step": 3048 }, { "epoch": 0.24762446195078372, "grad_norm": 5.25362634341942, "learning_rate": 4.4045019883603676e-06, "loss": 0.544, "step": 3049 }, { "epoch": 0.2477056769268253, "grad_norm": 5.0585319314475194, "learning_rate": 4.40407590736415e-06, "loss": 0.6509, "step": 3050 }, { "epoch": 0.2477868919028669, "grad_norm": 6.248691395394384, "learning_rate": 4.403649694615959e-06, "loss": 0.5899, "step": 3051 }, { "epoch": 0.24786810687890848, "grad_norm": 10.336963003211816, "learning_rate": 4.403223350145283e-06, "loss": 0.4696, "step": 3052 }, { "epoch": 0.24794932185495006, "grad_norm": 4.541662859187703, "learning_rate": 4.402796873981623e-06, "loss": 0.4006, "step": 3053 }, { "epoch": 0.24803053683099163, "grad_norm": 6.296840627991974, "learning_rate": 4.402370266154491e-06, "loss": 0.5062, "step": 3054 }, { "epoch": 0.2481117518070332, "grad_norm": 5.894042406789113, "learning_rate": 4.401943526693404e-06, "loss": 0.7281, "step": 3055 }, { "epoch": 0.2481929667830748, "grad_norm": 5.656446562333912, "learning_rate": 4.401516655627891e-06, "loss": 0.6206, "step": 3056 }, { "epoch": 0.2482741817591164, "grad_norm": 5.686281684309384, "learning_rate": 4.401089652987489e-06, "loss": 0.4256, "step": 3057 }, { "epoch": 0.24835539673515797, "grad_norm": 6.0150133514660284, "learning_rate": 4.4006625188017445e-06, "loss": 0.502, "step": 3058 }, { "epoch": 0.24843661171119955, "grad_norm": 5.249632475937014, "learning_rate": 4.400235253100214e-06, "loss": 0.5524, "step": 3059 }, { "epoch": 0.24851782668724112, "grad_norm": 4.835925595719573, "learning_rate": 4.399807855912459e-06, "loss": 0.6269, "step": 3060 }, { "epoch": 0.2485990416632827, "grad_norm": 4.606327993826141, "learning_rate": 4.3993803272680555e-06, "loss": 0.6161, "step": 3061 }, { "epoch": 0.2486802566393243, "grad_norm": 7.236306200132777, "learning_rate": 4.398952667196585e-06, "loss": 0.479, "step": 3062 }, { "epoch": 0.24876147161536588, "grad_norm": 5.858172893023612, "learning_rate": 4.398524875727641e-06, "loss": 0.568, "step": 3063 }, { "epoch": 0.24884268659140746, "grad_norm": 4.056227046494207, "learning_rate": 4.398096952890823e-06, "loss": 0.631, "step": 3064 }, { "epoch": 0.24892390156744904, "grad_norm": 8.0717170292666, "learning_rate": 4.397668898715743e-06, "loss": 0.5397, "step": 3065 }, { "epoch": 0.2490051165434906, "grad_norm": 6.347095752035782, "learning_rate": 4.397240713232016e-06, "loss": 0.5774, "step": 3066 }, { "epoch": 0.2490863315195322, "grad_norm": 4.659148068120233, "learning_rate": 4.3968123964692745e-06, "loss": 0.4825, "step": 3067 }, { "epoch": 0.2491675464955738, "grad_norm": 5.987640448722061, "learning_rate": 4.396383948457153e-06, "loss": 0.6587, "step": 3068 }, { "epoch": 0.24924876147161537, "grad_norm": 3.9258521372891573, "learning_rate": 4.395955369225299e-06, "loss": 0.8384, "step": 3069 }, { "epoch": 0.24932997644765695, "grad_norm": 5.296577196209221, "learning_rate": 4.395526658803367e-06, "loss": 0.5995, "step": 3070 }, { "epoch": 0.24941119142369853, "grad_norm": 6.077098107367331, "learning_rate": 4.395097817221023e-06, "loss": 0.5141, "step": 3071 }, { "epoch": 0.2494924063997401, "grad_norm": 9.074255129254627, "learning_rate": 4.39466884450794e-06, "loss": 0.5545, "step": 3072 }, { "epoch": 0.2495736213757817, "grad_norm": 3.837360404731743, "learning_rate": 4.3942397406937996e-06, "loss": 0.7089, "step": 3073 }, { "epoch": 0.24965483635182328, "grad_norm": 24.0705694528878, "learning_rate": 4.393810505808294e-06, "loss": 0.685, "step": 3074 }, { "epoch": 0.24973605132786486, "grad_norm": 29.345212216374478, "learning_rate": 4.393381139881125e-06, "loss": 0.5362, "step": 3075 }, { "epoch": 0.24981726630390644, "grad_norm": 2.6160968338209645, "learning_rate": 4.392951642942001e-06, "loss": 0.5189, "step": 3076 }, { "epoch": 0.24989848127994801, "grad_norm": 3.2671386577254586, "learning_rate": 4.392522015020643e-06, "loss": 0.526, "step": 3077 }, { "epoch": 0.2499796962559896, "grad_norm": 3.4560985141832905, "learning_rate": 4.392092256146776e-06, "loss": 0.484, "step": 3078 }, { "epoch": 0.25006091123203117, "grad_norm": 4.1989067824587405, "learning_rate": 4.391662366350139e-06, "loss": 0.482, "step": 3079 }, { "epoch": 0.25014212620807275, "grad_norm": 5.326290581465668, "learning_rate": 4.3912323456604785e-06, "loss": 0.6224, "step": 3080 }, { "epoch": 0.2502233411841144, "grad_norm": 7.502364694548779, "learning_rate": 4.390802194107548e-06, "loss": 0.5254, "step": 3081 }, { "epoch": 0.25030455616015596, "grad_norm": 4.007678084848024, "learning_rate": 4.390371911721113e-06, "loss": 0.6956, "step": 3082 }, { "epoch": 0.25038577113619753, "grad_norm": 6.194239862786131, "learning_rate": 4.389941498530946e-06, "loss": 0.6204, "step": 3083 }, { "epoch": 0.2504669861122391, "grad_norm": 5.28161752223026, "learning_rate": 4.38951095456683e-06, "loss": 0.6249, "step": 3084 }, { "epoch": 0.2505482010882807, "grad_norm": 6.6004264867031415, "learning_rate": 4.389080279858556e-06, "loss": 0.6299, "step": 3085 }, { "epoch": 0.25062941606432226, "grad_norm": 4.08243439857223, "learning_rate": 4.388649474435925e-06, "loss": 0.5395, "step": 3086 }, { "epoch": 0.25071063104036384, "grad_norm": 10.991487088043705, "learning_rate": 4.388218538328746e-06, "loss": 0.4487, "step": 3087 }, { "epoch": 0.2507918460164054, "grad_norm": 5.374906431529036, "learning_rate": 4.387787471566837e-06, "loss": 0.6908, "step": 3088 }, { "epoch": 0.250873060992447, "grad_norm": 6.4091647653829344, "learning_rate": 4.387356274180025e-06, "loss": 0.559, "step": 3089 }, { "epoch": 0.25095427596848857, "grad_norm": 3.189931420557015, "learning_rate": 4.386924946198148e-06, "loss": 0.5294, "step": 3090 }, { "epoch": 0.25103549094453015, "grad_norm": 3.881260853633071, "learning_rate": 4.386493487651052e-06, "loss": 0.7014, "step": 3091 }, { "epoch": 0.2511167059205718, "grad_norm": 9.67421505771052, "learning_rate": 4.38606189856859e-06, "loss": 0.506, "step": 3092 }, { "epoch": 0.25119792089661336, "grad_norm": 12.331043817036711, "learning_rate": 4.385630178980627e-06, "loss": 0.539, "step": 3093 }, { "epoch": 0.25127913587265494, "grad_norm": 4.246731275050449, "learning_rate": 4.385198328917034e-06, "loss": 0.5523, "step": 3094 }, { "epoch": 0.2513603508486965, "grad_norm": 5.769645319972767, "learning_rate": 4.384766348407695e-06, "loss": 0.5281, "step": 3095 }, { "epoch": 0.2514415658247381, "grad_norm": 4.286081820286076, "learning_rate": 4.3843342374825e-06, "loss": 0.5954, "step": 3096 }, { "epoch": 0.25152278080077967, "grad_norm": 3.6620430541424187, "learning_rate": 4.383901996171348e-06, "loss": 0.809, "step": 3097 }, { "epoch": 0.25160399577682124, "grad_norm": 4.730554157695557, "learning_rate": 4.383469624504148e-06, "loss": 0.5542, "step": 3098 }, { "epoch": 0.2516852107528628, "grad_norm": 5.354061606302798, "learning_rate": 4.3830371225108185e-06, "loss": 0.4754, "step": 3099 }, { "epoch": 0.2517664257289044, "grad_norm": 7.122075601960895, "learning_rate": 4.382604490221286e-06, "loss": 0.4081, "step": 3100 }, { "epoch": 0.251847640704946, "grad_norm": 4.487864560668316, "learning_rate": 4.382171727665486e-06, "loss": 0.71, "step": 3101 }, { "epoch": 0.25192885568098755, "grad_norm": 5.423045386515985, "learning_rate": 4.381738834873364e-06, "loss": 0.518, "step": 3102 }, { "epoch": 0.2520100706570292, "grad_norm": 4.117696423281904, "learning_rate": 4.381305811874873e-06, "loss": 0.3472, "step": 3103 }, { "epoch": 0.25209128563307076, "grad_norm": 4.590924951658724, "learning_rate": 4.3808726586999766e-06, "loss": 0.6369, "step": 3104 }, { "epoch": 0.25217250060911234, "grad_norm": 6.69015366947571, "learning_rate": 4.380439375378646e-06, "loss": 0.5335, "step": 3105 }, { "epoch": 0.2522537155851539, "grad_norm": 9.103381660915732, "learning_rate": 4.380005961940864e-06, "loss": 0.4577, "step": 3106 }, { "epoch": 0.2523349305611955, "grad_norm": 7.434831084210403, "learning_rate": 4.379572418416619e-06, "loss": 0.5355, "step": 3107 }, { "epoch": 0.25241614553723707, "grad_norm": 8.173889824189096, "learning_rate": 4.37913874483591e-06, "loss": 0.4481, "step": 3108 }, { "epoch": 0.25249736051327865, "grad_norm": 10.499303258759314, "learning_rate": 4.378704941228746e-06, "loss": 0.5386, "step": 3109 }, { "epoch": 0.2525785754893202, "grad_norm": 5.028078700952843, "learning_rate": 4.378271007625141e-06, "loss": 0.4977, "step": 3110 }, { "epoch": 0.2526597904653618, "grad_norm": 4.3897095234161245, "learning_rate": 4.377836944055124e-06, "loss": 0.5747, "step": 3111 }, { "epoch": 0.2527410054414034, "grad_norm": 5.593416675150363, "learning_rate": 4.377402750548729e-06, "loss": 0.6102, "step": 3112 }, { "epoch": 0.25282222041744495, "grad_norm": 5.376961891531233, "learning_rate": 4.376968427135999e-06, "loss": 0.6942, "step": 3113 }, { "epoch": 0.2529034353934866, "grad_norm": 7.126459999216967, "learning_rate": 4.376533973846988e-06, "loss": 0.7788, "step": 3114 }, { "epoch": 0.25298465036952816, "grad_norm": 4.854425202352291, "learning_rate": 4.376099390711758e-06, "loss": 0.4626, "step": 3115 }, { "epoch": 0.25306586534556974, "grad_norm": 8.756831124179643, "learning_rate": 4.375664677760378e-06, "loss": 0.6323, "step": 3116 }, { "epoch": 0.2531470803216113, "grad_norm": 4.267269088423493, "learning_rate": 4.375229835022929e-06, "loss": 0.6067, "step": 3117 }, { "epoch": 0.2532282952976529, "grad_norm": 5.198815978500407, "learning_rate": 4.374794862529501e-06, "loss": 0.5796, "step": 3118 }, { "epoch": 0.25330951027369447, "grad_norm": 4.95003494031335, "learning_rate": 4.374359760310191e-06, "loss": 0.4924, "step": 3119 }, { "epoch": 0.25339072524973605, "grad_norm": 4.604098155034568, "learning_rate": 4.373924528395105e-06, "loss": 0.6536, "step": 3120 }, { "epoch": 0.2534719402257776, "grad_norm": 5.033107338044671, "learning_rate": 4.373489166814358e-06, "loss": 0.6378, "step": 3121 }, { "epoch": 0.2535531552018192, "grad_norm": 5.806258612021515, "learning_rate": 4.3730536755980776e-06, "loss": 0.4952, "step": 3122 }, { "epoch": 0.2536343701778608, "grad_norm": 4.217899119753335, "learning_rate": 4.372618054776395e-06, "loss": 0.7578, "step": 3123 }, { "epoch": 0.25371558515390236, "grad_norm": 4.5310110282850085, "learning_rate": 4.372182304379455e-06, "loss": 0.5417, "step": 3124 }, { "epoch": 0.253796800129944, "grad_norm": 5.792695550840339, "learning_rate": 4.371746424437406e-06, "loss": 0.5578, "step": 3125 }, { "epoch": 0.25387801510598557, "grad_norm": 9.674533837851303, "learning_rate": 4.371310414980412e-06, "loss": 0.5975, "step": 3126 }, { "epoch": 0.25395923008202714, "grad_norm": 4.234861474964158, "learning_rate": 4.37087427603864e-06, "loss": 0.5482, "step": 3127 }, { "epoch": 0.2540404450580687, "grad_norm": 5.754506563667726, "learning_rate": 4.37043800764227e-06, "loss": 0.5447, "step": 3128 }, { "epoch": 0.2541216600341103, "grad_norm": 10.678832172560531, "learning_rate": 4.37000160982149e-06, "loss": 0.7462, "step": 3129 }, { "epoch": 0.2542028750101519, "grad_norm": 4.948964860269852, "learning_rate": 4.369565082606495e-06, "loss": 0.7041, "step": 3130 }, { "epoch": 0.25428408998619345, "grad_norm": 3.6881446493621657, "learning_rate": 4.369128426027489e-06, "loss": 0.6471, "step": 3131 }, { "epoch": 0.25436530496223503, "grad_norm": 4.0215825826282625, "learning_rate": 4.36869164011469e-06, "loss": 0.6399, "step": 3132 }, { "epoch": 0.2544465199382766, "grad_norm": 5.583956863498479, "learning_rate": 4.368254724898319e-06, "loss": 0.7778, "step": 3133 }, { "epoch": 0.2545277349143182, "grad_norm": 4.313157219895635, "learning_rate": 4.367817680408609e-06, "loss": 0.5685, "step": 3134 }, { "epoch": 0.25460894989035976, "grad_norm": 5.121719815553165, "learning_rate": 4.3673805066758e-06, "loss": 0.6546, "step": 3135 }, { "epoch": 0.2546901648664014, "grad_norm": 6.598529582533699, "learning_rate": 4.366943203730144e-06, "loss": 0.5776, "step": 3136 }, { "epoch": 0.25477137984244297, "grad_norm": 8.179382375263176, "learning_rate": 4.366505771601898e-06, "loss": 0.655, "step": 3137 }, { "epoch": 0.25485259481848455, "grad_norm": 3.5740269182727085, "learning_rate": 4.366068210321331e-06, "loss": 0.5393, "step": 3138 }, { "epoch": 0.2549338097945261, "grad_norm": 9.451730618946357, "learning_rate": 4.3656305199187195e-06, "loss": 0.5082, "step": 3139 }, { "epoch": 0.2550150247705677, "grad_norm": 6.028800369548608, "learning_rate": 4.365192700424351e-06, "loss": 0.6664, "step": 3140 }, { "epoch": 0.2550962397466093, "grad_norm": 5.858773502768749, "learning_rate": 4.364754751868519e-06, "loss": 0.5164, "step": 3141 }, { "epoch": 0.25517745472265085, "grad_norm": 7.052601898082844, "learning_rate": 4.364316674281526e-06, "loss": 0.544, "step": 3142 }, { "epoch": 0.25525866969869243, "grad_norm": 5.076254392341973, "learning_rate": 4.363878467693686e-06, "loss": 0.5765, "step": 3143 }, { "epoch": 0.255339884674734, "grad_norm": 4.671237955123445, "learning_rate": 4.363440132135322e-06, "loss": 0.6367, "step": 3144 }, { "epoch": 0.2554210996507756, "grad_norm": 4.135246305916654, "learning_rate": 4.363001667636762e-06, "loss": 0.725, "step": 3145 }, { "epoch": 0.25550231462681716, "grad_norm": 5.579014444017926, "learning_rate": 4.362563074228346e-06, "loss": 0.5253, "step": 3146 }, { "epoch": 0.2555835296028588, "grad_norm": 4.386851280267495, "learning_rate": 4.3621243519404235e-06, "loss": 0.5849, "step": 3147 }, { "epoch": 0.25566474457890037, "grad_norm": 4.068805050530992, "learning_rate": 4.36168550080335e-06, "loss": 0.5252, "step": 3148 }, { "epoch": 0.25574595955494195, "grad_norm": 5.079228239353849, "learning_rate": 4.361246520847493e-06, "loss": 0.4509, "step": 3149 }, { "epoch": 0.2558271745309835, "grad_norm": 4.417633075136854, "learning_rate": 4.360807412103228e-06, "loss": 0.5594, "step": 3150 }, { "epoch": 0.2559083895070251, "grad_norm": 5.457155176446224, "learning_rate": 4.3603681746009374e-06, "loss": 0.5581, "step": 3151 }, { "epoch": 0.2559896044830667, "grad_norm": 4.311137196331857, "learning_rate": 4.3599288083710155e-06, "loss": 0.5777, "step": 3152 }, { "epoch": 0.25607081945910826, "grad_norm": 5.779855459554121, "learning_rate": 4.359489313443864e-06, "loss": 0.6154, "step": 3153 }, { "epoch": 0.25615203443514983, "grad_norm": 5.203623644425137, "learning_rate": 4.359049689849893e-06, "loss": 0.5689, "step": 3154 }, { "epoch": 0.2562332494111914, "grad_norm": 6.2014101236501995, "learning_rate": 4.358609937619522e-06, "loss": 0.6593, "step": 3155 }, { "epoch": 0.256314464387233, "grad_norm": 6.085891873335906, "learning_rate": 4.358170056783179e-06, "loss": 0.425, "step": 3156 }, { "epoch": 0.25639567936327456, "grad_norm": 3.58366241946543, "learning_rate": 4.357730047371304e-06, "loss": 0.538, "step": 3157 }, { "epoch": 0.2564768943393162, "grad_norm": 5.44533166154409, "learning_rate": 4.357289909414341e-06, "loss": 0.5894, "step": 3158 }, { "epoch": 0.2565581093153578, "grad_norm": 3.8173019501608625, "learning_rate": 4.356849642942746e-06, "loss": 0.5385, "step": 3159 }, { "epoch": 0.25663932429139935, "grad_norm": 6.388033240415221, "learning_rate": 4.356409247986982e-06, "loss": 0.5628, "step": 3160 }, { "epoch": 0.25672053926744093, "grad_norm": 10.182238415534854, "learning_rate": 4.355968724577523e-06, "loss": 0.5962, "step": 3161 }, { "epoch": 0.2568017542434825, "grad_norm": 3.7342682561919998, "learning_rate": 4.355528072744851e-06, "loss": 0.6925, "step": 3162 }, { "epoch": 0.2568829692195241, "grad_norm": 3.771086888643871, "learning_rate": 4.355087292519458e-06, "loss": 0.7971, "step": 3163 }, { "epoch": 0.25696418419556566, "grad_norm": 4.185200079768148, "learning_rate": 4.354646383931841e-06, "loss": 0.6958, "step": 3164 }, { "epoch": 0.25704539917160724, "grad_norm": 4.773930727509564, "learning_rate": 4.3542053470125104e-06, "loss": 0.5607, "step": 3165 }, { "epoch": 0.2571266141476488, "grad_norm": 4.658976115697843, "learning_rate": 4.353764181791983e-06, "loss": 0.5175, "step": 3166 }, { "epoch": 0.2572078291236904, "grad_norm": 6.33815354108174, "learning_rate": 4.353322888300785e-06, "loss": 0.4758, "step": 3167 }, { "epoch": 0.25728904409973197, "grad_norm": 3.6435625688001423, "learning_rate": 4.3528814665694515e-06, "loss": 0.6159, "step": 3168 }, { "epoch": 0.2573702590757736, "grad_norm": 3.901416738242754, "learning_rate": 4.352439916628527e-06, "loss": 0.6253, "step": 3169 }, { "epoch": 0.2574514740518152, "grad_norm": 5.97609923012503, "learning_rate": 4.351998238508563e-06, "loss": 0.4725, "step": 3170 }, { "epoch": 0.25753268902785675, "grad_norm": 3.836440857096687, "learning_rate": 4.351556432240124e-06, "loss": 0.4923, "step": 3171 }, { "epoch": 0.25761390400389833, "grad_norm": 6.0800255854901595, "learning_rate": 4.351114497853779e-06, "loss": 0.6401, "step": 3172 }, { "epoch": 0.2576951189799399, "grad_norm": 5.622546507114739, "learning_rate": 4.350672435380107e-06, "loss": 0.7105, "step": 3173 }, { "epoch": 0.2577763339559815, "grad_norm": 5.822121836191669, "learning_rate": 4.350230244849697e-06, "loss": 0.6403, "step": 3174 }, { "epoch": 0.25785754893202306, "grad_norm": 3.5905578627539123, "learning_rate": 4.349787926293146e-06, "loss": 0.6267, "step": 3175 }, { "epoch": 0.25793876390806464, "grad_norm": 3.4379879993140126, "learning_rate": 4.349345479741062e-06, "loss": 0.7763, "step": 3176 }, { "epoch": 0.2580199788841062, "grad_norm": 5.296351841281579, "learning_rate": 4.348902905224057e-06, "loss": 0.5434, "step": 3177 }, { "epoch": 0.2581011938601478, "grad_norm": 4.502659862665315, "learning_rate": 4.348460202772756e-06, "loss": 0.6596, "step": 3178 }, { "epoch": 0.25818240883618937, "grad_norm": 5.66077638544368, "learning_rate": 4.348017372417792e-06, "loss": 0.4524, "step": 3179 }, { "epoch": 0.258263623812231, "grad_norm": 6.837754388761909, "learning_rate": 4.347574414189807e-06, "loss": 0.539, "step": 3180 }, { "epoch": 0.2583448387882726, "grad_norm": 4.065776898335388, "learning_rate": 4.347131328119451e-06, "loss": 0.4425, "step": 3181 }, { "epoch": 0.25842605376431416, "grad_norm": 6.317177110076131, "learning_rate": 4.346688114237381e-06, "loss": 0.5503, "step": 3182 }, { "epoch": 0.25850726874035573, "grad_norm": 3.9597286603343207, "learning_rate": 4.346244772574268e-06, "loss": 0.565, "step": 3183 }, { "epoch": 0.2585884837163973, "grad_norm": 4.617361547979039, "learning_rate": 4.345801303160788e-06, "loss": 0.6364, "step": 3184 }, { "epoch": 0.2586696986924389, "grad_norm": 3.869309442994426, "learning_rate": 4.3453577060276264e-06, "loss": 0.7814, "step": 3185 }, { "epoch": 0.25875091366848046, "grad_norm": 5.268139660706938, "learning_rate": 4.344913981205479e-06, "loss": 0.6746, "step": 3186 }, { "epoch": 0.25883212864452204, "grad_norm": 8.40326725980448, "learning_rate": 4.344470128725047e-06, "loss": 0.5914, "step": 3187 }, { "epoch": 0.2589133436205636, "grad_norm": 6.871592559950185, "learning_rate": 4.344026148617043e-06, "loss": 0.5718, "step": 3188 }, { "epoch": 0.2589945585966052, "grad_norm": 5.20815182160167, "learning_rate": 4.343582040912191e-06, "loss": 0.5049, "step": 3189 }, { "epoch": 0.2590757735726468, "grad_norm": 5.335316024532678, "learning_rate": 4.343137805641217e-06, "loss": 0.6226, "step": 3190 }, { "epoch": 0.2591569885486884, "grad_norm": 3.8793130716336335, "learning_rate": 4.3426934428348624e-06, "loss": 0.4488, "step": 3191 }, { "epoch": 0.25923820352473, "grad_norm": 5.127013360030541, "learning_rate": 4.342248952523874e-06, "loss": 0.4591, "step": 3192 }, { "epoch": 0.25931941850077156, "grad_norm": 4.532500566508718, "learning_rate": 4.341804334739008e-06, "loss": 0.4925, "step": 3193 }, { "epoch": 0.25940063347681314, "grad_norm": 5.195487587147855, "learning_rate": 4.34135958951103e-06, "loss": 0.5041, "step": 3194 }, { "epoch": 0.2594818484528547, "grad_norm": 3.983184952506586, "learning_rate": 4.3409147168707124e-06, "loss": 0.5198, "step": 3195 }, { "epoch": 0.2595630634288963, "grad_norm": 4.96090598003835, "learning_rate": 4.34046971684884e-06, "loss": 0.4985, "step": 3196 }, { "epoch": 0.25964427840493787, "grad_norm": 4.583631169515384, "learning_rate": 4.340024589476204e-06, "loss": 0.634, "step": 3197 }, { "epoch": 0.25972549338097944, "grad_norm": 5.545926672500218, "learning_rate": 4.3395793347836034e-06, "loss": 0.4752, "step": 3198 }, { "epoch": 0.259806708357021, "grad_norm": 6.711460341815773, "learning_rate": 4.33913395280185e-06, "loss": 0.7308, "step": 3199 }, { "epoch": 0.2598879233330626, "grad_norm": 3.7919381803697276, "learning_rate": 4.33868844356176e-06, "loss": 0.5049, "step": 3200 }, { "epoch": 0.2599691383091042, "grad_norm": 6.205008626331162, "learning_rate": 4.338242807094161e-06, "loss": 0.6063, "step": 3201 }, { "epoch": 0.2600503532851458, "grad_norm": 4.694118634605663, "learning_rate": 4.3377970434298885e-06, "loss": 0.6166, "step": 3202 }, { "epoch": 0.2601315682611874, "grad_norm": 5.435770883892459, "learning_rate": 4.337351152599787e-06, "loss": 0.5616, "step": 3203 }, { "epoch": 0.26021278323722896, "grad_norm": 3.8614175476698485, "learning_rate": 4.33690513463471e-06, "loss": 0.5997, "step": 3204 }, { "epoch": 0.26029399821327054, "grad_norm": 5.804567048793658, "learning_rate": 4.336458989565519e-06, "loss": 0.5467, "step": 3205 }, { "epoch": 0.2603752131893121, "grad_norm": 6.725056494128386, "learning_rate": 4.336012717423085e-06, "loss": 0.4595, "step": 3206 }, { "epoch": 0.2604564281653537, "grad_norm": 5.3233685676327065, "learning_rate": 4.335566318238289e-06, "loss": 0.5616, "step": 3207 }, { "epoch": 0.26053764314139527, "grad_norm": 3.4875685060457218, "learning_rate": 4.335119792042017e-06, "loss": 0.5414, "step": 3208 }, { "epoch": 0.26061885811743685, "grad_norm": 4.305522238103512, "learning_rate": 4.334673138865169e-06, "loss": 0.5734, "step": 3209 }, { "epoch": 0.2607000730934784, "grad_norm": 6.18157123882514, "learning_rate": 4.334226358738649e-06, "loss": 0.462, "step": 3210 }, { "epoch": 0.26078128806952, "grad_norm": 14.713830091090161, "learning_rate": 4.333779451693373e-06, "loss": 0.6092, "step": 3211 }, { "epoch": 0.2608625030455616, "grad_norm": 11.006487008864882, "learning_rate": 4.333332417760263e-06, "loss": 0.4972, "step": 3212 }, { "epoch": 0.2609437180216032, "grad_norm": 4.400134883259066, "learning_rate": 4.332885256970253e-06, "loss": 0.5909, "step": 3213 }, { "epoch": 0.2610249329976448, "grad_norm": 5.443017416295305, "learning_rate": 4.332437969354284e-06, "loss": 0.4379, "step": 3214 }, { "epoch": 0.26110614797368636, "grad_norm": 4.4738174242506235, "learning_rate": 4.331990554943305e-06, "loss": 0.6238, "step": 3215 }, { "epoch": 0.26118736294972794, "grad_norm": 5.0416535662280655, "learning_rate": 4.331543013768276e-06, "loss": 0.3969, "step": 3216 }, { "epoch": 0.2612685779257695, "grad_norm": 4.237641745664451, "learning_rate": 4.331095345860162e-06, "loss": 0.4619, "step": 3217 }, { "epoch": 0.2613497929018111, "grad_norm": 5.09656551194579, "learning_rate": 4.330647551249942e-06, "loss": 0.5224, "step": 3218 }, { "epoch": 0.2614310078778527, "grad_norm": 4.113097985628701, "learning_rate": 4.330199629968601e-06, "loss": 0.7045, "step": 3219 }, { "epoch": 0.26151222285389425, "grad_norm": 4.391696991616136, "learning_rate": 4.329751582047132e-06, "loss": 0.4772, "step": 3220 }, { "epoch": 0.2615934378299358, "grad_norm": 6.284634192395287, "learning_rate": 4.3293034075165355e-06, "loss": 0.7468, "step": 3221 }, { "epoch": 0.2616746528059774, "grad_norm": 5.31239683535975, "learning_rate": 4.328855106407826e-06, "loss": 0.5164, "step": 3222 }, { "epoch": 0.261755867782019, "grad_norm": 4.2612158979297945, "learning_rate": 4.328406678752022e-06, "loss": 0.6003, "step": 3223 }, { "epoch": 0.2618370827580606, "grad_norm": 5.243856895565067, "learning_rate": 4.3279581245801515e-06, "loss": 0.5995, "step": 3224 }, { "epoch": 0.2619182977341022, "grad_norm": 5.3541602152619046, "learning_rate": 4.327509443923254e-06, "loss": 0.4661, "step": 3225 }, { "epoch": 0.26199951271014377, "grad_norm": 5.615300922143262, "learning_rate": 4.327060636812375e-06, "loss": 0.634, "step": 3226 }, { "epoch": 0.26208072768618534, "grad_norm": 5.152013169697966, "learning_rate": 4.32661170327857e-06, "loss": 0.6963, "step": 3227 }, { "epoch": 0.2621619426622269, "grad_norm": 5.2073960016961545, "learning_rate": 4.326162643352901e-06, "loss": 0.7672, "step": 3228 }, { "epoch": 0.2622431576382685, "grad_norm": 5.909318364895728, "learning_rate": 4.325713457066443e-06, "loss": 0.6173, "step": 3229 }, { "epoch": 0.2623243726143101, "grad_norm": 16.26621162135915, "learning_rate": 4.325264144450276e-06, "loss": 0.3936, "step": 3230 }, { "epoch": 0.26240558759035165, "grad_norm": 4.238425289228819, "learning_rate": 4.324814705535491e-06, "loss": 0.5425, "step": 3231 }, { "epoch": 0.26248680256639323, "grad_norm": 7.639214601519878, "learning_rate": 4.324365140353185e-06, "loss": 0.6787, "step": 3232 }, { "epoch": 0.2625680175424348, "grad_norm": 4.776323919697988, "learning_rate": 4.323915448934466e-06, "loss": 0.7179, "step": 3233 }, { "epoch": 0.2626492325184764, "grad_norm": 4.829079791949981, "learning_rate": 4.323465631310452e-06, "loss": 0.5759, "step": 3234 }, { "epoch": 0.262730447494518, "grad_norm": 5.074740707653109, "learning_rate": 4.323015687512267e-06, "loss": 0.5894, "step": 3235 }, { "epoch": 0.2628116624705596, "grad_norm": 9.615050714429785, "learning_rate": 4.322565617571044e-06, "loss": 0.5466, "step": 3236 }, { "epoch": 0.26289287744660117, "grad_norm": 5.123025702283434, "learning_rate": 4.322115421517926e-06, "loss": 0.5913, "step": 3237 }, { "epoch": 0.26297409242264275, "grad_norm": 4.751136267011921, "learning_rate": 4.321665099384064e-06, "loss": 0.4789, "step": 3238 }, { "epoch": 0.2630553073986843, "grad_norm": 10.947055513952483, "learning_rate": 4.321214651200619e-06, "loss": 0.5118, "step": 3239 }, { "epoch": 0.2631365223747259, "grad_norm": 5.687786414091623, "learning_rate": 4.320764076998759e-06, "loss": 0.6006, "step": 3240 }, { "epoch": 0.2632177373507675, "grad_norm": 4.846530548806665, "learning_rate": 4.32031337680966e-06, "loss": 0.6819, "step": 3241 }, { "epoch": 0.26329895232680905, "grad_norm": 4.29992728461369, "learning_rate": 4.31986255066451e-06, "loss": 0.6645, "step": 3242 }, { "epoch": 0.26338016730285063, "grad_norm": 4.728540495274627, "learning_rate": 4.319411598594503e-06, "loss": 0.65, "step": 3243 }, { "epoch": 0.2634613822788922, "grad_norm": 4.044743720033708, "learning_rate": 4.318960520630842e-06, "loss": 0.5344, "step": 3244 }, { "epoch": 0.2635425972549338, "grad_norm": 4.062414651992789, "learning_rate": 4.3185093168047395e-06, "loss": 0.6314, "step": 3245 }, { "epoch": 0.2636238122309754, "grad_norm": 5.77814635308223, "learning_rate": 4.318057987147418e-06, "loss": 0.5496, "step": 3246 }, { "epoch": 0.263705027207017, "grad_norm": 8.868723303386513, "learning_rate": 4.317606531690104e-06, "loss": 0.5484, "step": 3247 }, { "epoch": 0.2637862421830586, "grad_norm": 5.931221128514847, "learning_rate": 4.317154950464039e-06, "loss": 0.5311, "step": 3248 }, { "epoch": 0.26386745715910015, "grad_norm": 5.582327237175479, "learning_rate": 4.316703243500467e-06, "loss": 0.532, "step": 3249 }, { "epoch": 0.2639486721351417, "grad_norm": 3.5277983786023235, "learning_rate": 4.3162514108306465e-06, "loss": 0.7615, "step": 3250 }, { "epoch": 0.2640298871111833, "grad_norm": 6.701677321017773, "learning_rate": 4.315799452485841e-06, "loss": 0.4753, "step": 3251 }, { "epoch": 0.2641111020872249, "grad_norm": 4.478895629503352, "learning_rate": 4.3153473684973226e-06, "loss": 0.5828, "step": 3252 }, { "epoch": 0.26419231706326646, "grad_norm": 6.992879628361261, "learning_rate": 4.314895158896374e-06, "loss": 0.5683, "step": 3253 }, { "epoch": 0.26427353203930803, "grad_norm": 4.622185359910068, "learning_rate": 4.314442823714286e-06, "loss": 0.4762, "step": 3254 }, { "epoch": 0.2643547470153496, "grad_norm": 3.3396459737640587, "learning_rate": 4.313990362982357e-06, "loss": 0.7585, "step": 3255 }, { "epoch": 0.2644359619913912, "grad_norm": 9.71525006135501, "learning_rate": 4.313537776731895e-06, "loss": 0.4599, "step": 3256 }, { "epoch": 0.2645171769674328, "grad_norm": 5.997259305627276, "learning_rate": 4.313085064994218e-06, "loss": 0.59, "step": 3257 }, { "epoch": 0.2645983919434744, "grad_norm": 3.4896157532430285, "learning_rate": 4.3126322278006496e-06, "loss": 0.6535, "step": 3258 }, { "epoch": 0.264679606919516, "grad_norm": 3.163990271513905, "learning_rate": 4.312179265182523e-06, "loss": 0.4253, "step": 3259 }, { "epoch": 0.26476082189555755, "grad_norm": 4.468139415385732, "learning_rate": 4.311726177171184e-06, "loss": 0.5902, "step": 3260 }, { "epoch": 0.26484203687159913, "grad_norm": 3.7893680396692053, "learning_rate": 4.311272963797981e-06, "loss": 0.5972, "step": 3261 }, { "epoch": 0.2649232518476407, "grad_norm": 5.50109202559223, "learning_rate": 4.3108196250942746e-06, "loss": 0.4875, "step": 3262 }, { "epoch": 0.2650044668236823, "grad_norm": 10.031595831871455, "learning_rate": 4.310366161091435e-06, "loss": 0.7109, "step": 3263 }, { "epoch": 0.26508568179972386, "grad_norm": 5.5919773594492, "learning_rate": 4.309912571820837e-06, "loss": 0.521, "step": 3264 }, { "epoch": 0.26516689677576544, "grad_norm": 4.444420349163222, "learning_rate": 4.309458857313868e-06, "loss": 0.6694, "step": 3265 }, { "epoch": 0.265248111751807, "grad_norm": 3.8949106613348947, "learning_rate": 4.309005017601924e-06, "loss": 0.6129, "step": 3266 }, { "epoch": 0.2653293267278486, "grad_norm": 3.3059067568093847, "learning_rate": 4.308551052716406e-06, "loss": 0.6001, "step": 3267 }, { "epoch": 0.2654105417038902, "grad_norm": 5.114398735836412, "learning_rate": 4.308096962688726e-06, "loss": 0.5416, "step": 3268 }, { "epoch": 0.2654917566799318, "grad_norm": 3.1154319915363855, "learning_rate": 4.307642747550306e-06, "loss": 0.5931, "step": 3269 }, { "epoch": 0.2655729716559734, "grad_norm": 4.307648831780373, "learning_rate": 4.307188407332574e-06, "loss": 0.5852, "step": 3270 }, { "epoch": 0.26565418663201495, "grad_norm": 4.424486660650266, "learning_rate": 4.306733942066969e-06, "loss": 0.5399, "step": 3271 }, { "epoch": 0.26573540160805653, "grad_norm": 6.036748818342054, "learning_rate": 4.306279351784938e-06, "loss": 0.4185, "step": 3272 }, { "epoch": 0.2658166165840981, "grad_norm": 3.6762274889278195, "learning_rate": 4.305824636517935e-06, "loss": 0.4484, "step": 3273 }, { "epoch": 0.2658978315601397, "grad_norm": 4.098602260190931, "learning_rate": 4.305369796297424e-06, "loss": 0.4614, "step": 3274 }, { "epoch": 0.26597904653618126, "grad_norm": 5.298237691361923, "learning_rate": 4.3049148311548785e-06, "loss": 0.5971, "step": 3275 }, { "epoch": 0.26606026151222284, "grad_norm": 5.05302699787098, "learning_rate": 4.304459741121778e-06, "loss": 0.4597, "step": 3276 }, { "epoch": 0.2661414764882644, "grad_norm": 4.995834340661036, "learning_rate": 4.304004526229614e-06, "loss": 0.5854, "step": 3277 }, { "epoch": 0.266222691464306, "grad_norm": 2.5777374640004838, "learning_rate": 4.303549186509884e-06, "loss": 0.498, "step": 3278 }, { "epoch": 0.2663039064403476, "grad_norm": 3.390125198708796, "learning_rate": 4.303093721994096e-06, "loss": 0.4985, "step": 3279 }, { "epoch": 0.2663851214163892, "grad_norm": 5.071175646539791, "learning_rate": 4.302638132713766e-06, "loss": 0.5427, "step": 3280 }, { "epoch": 0.2664663363924308, "grad_norm": 4.587539332384161, "learning_rate": 4.302182418700415e-06, "loss": 0.4533, "step": 3281 }, { "epoch": 0.26654755136847236, "grad_norm": 10.760547007231233, "learning_rate": 4.301726579985581e-06, "loss": 0.5711, "step": 3282 }, { "epoch": 0.26662876634451393, "grad_norm": 6.241807773314562, "learning_rate": 4.301270616600802e-06, "loss": 0.7594, "step": 3283 }, { "epoch": 0.2667099813205555, "grad_norm": 5.23430106311079, "learning_rate": 4.30081452857763e-06, "loss": 0.5922, "step": 3284 }, { "epoch": 0.2667911962965971, "grad_norm": 5.314419566343465, "learning_rate": 4.300358315947622e-06, "loss": 0.6298, "step": 3285 }, { "epoch": 0.26687241127263867, "grad_norm": 5.02528385387479, "learning_rate": 4.299901978742349e-06, "loss": 0.5465, "step": 3286 }, { "epoch": 0.26695362624868024, "grad_norm": 4.18489447765815, "learning_rate": 4.2994455169933835e-06, "loss": 0.5514, "step": 3287 }, { "epoch": 0.2670348412247218, "grad_norm": 4.050947136506907, "learning_rate": 4.298988930732312e-06, "loss": 0.6557, "step": 3288 }, { "epoch": 0.2671160562007634, "grad_norm": 4.82281153344797, "learning_rate": 4.2985322199907275e-06, "loss": 0.4805, "step": 3289 }, { "epoch": 0.26719727117680503, "grad_norm": 3.2786394296234787, "learning_rate": 4.298075384800232e-06, "loss": 0.7165, "step": 3290 }, { "epoch": 0.2672784861528466, "grad_norm": 4.134763536721899, "learning_rate": 4.297618425192436e-06, "loss": 0.6281, "step": 3291 }, { "epoch": 0.2673597011288882, "grad_norm": 5.393028330188706, "learning_rate": 4.297161341198957e-06, "loss": 0.4522, "step": 3292 }, { "epoch": 0.26744091610492976, "grad_norm": 3.3337915433920178, "learning_rate": 4.296704132851427e-06, "loss": 0.5315, "step": 3293 }, { "epoch": 0.26752213108097134, "grad_norm": 4.494608257605046, "learning_rate": 4.296246800181479e-06, "loss": 0.594, "step": 3294 }, { "epoch": 0.2676033460570129, "grad_norm": 6.986279435908919, "learning_rate": 4.29578934322076e-06, "loss": 0.4113, "step": 3295 }, { "epoch": 0.2676845610330545, "grad_norm": 4.754454279458852, "learning_rate": 4.295331762000921e-06, "loss": 0.656, "step": 3296 }, { "epoch": 0.26776577600909607, "grad_norm": 4.557941560884389, "learning_rate": 4.294874056553626e-06, "loss": 0.6003, "step": 3297 }, { "epoch": 0.26784699098513765, "grad_norm": 3.5253434081641593, "learning_rate": 4.294416226910546e-06, "loss": 0.6373, "step": 3298 }, { "epoch": 0.2679282059611792, "grad_norm": 5.390157405133369, "learning_rate": 4.2939582731033605e-06, "loss": 0.5665, "step": 3299 }, { "epoch": 0.2680094209372208, "grad_norm": 4.444716348064678, "learning_rate": 4.293500195163756e-06, "loss": 0.4936, "step": 3300 }, { "epoch": 0.26809063591326243, "grad_norm": 4.929185123300809, "learning_rate": 4.29304199312343e-06, "loss": 0.6039, "step": 3301 }, { "epoch": 0.268171850889304, "grad_norm": 5.005771469258986, "learning_rate": 4.292583667014087e-06, "loss": 0.6245, "step": 3302 }, { "epoch": 0.2682530658653456, "grad_norm": 6.794918253939256, "learning_rate": 4.292125216867443e-06, "loss": 0.4753, "step": 3303 }, { "epoch": 0.26833428084138716, "grad_norm": 4.918313944683376, "learning_rate": 4.2916666427152175e-06, "loss": 0.6796, "step": 3304 }, { "epoch": 0.26841549581742874, "grad_norm": 5.184653793095706, "learning_rate": 4.291207944589143e-06, "loss": 0.5299, "step": 3305 }, { "epoch": 0.2684967107934703, "grad_norm": 5.158508970573285, "learning_rate": 4.290749122520959e-06, "loss": 0.5115, "step": 3306 }, { "epoch": 0.2685779257695119, "grad_norm": 5.019559380014884, "learning_rate": 4.290290176542412e-06, "loss": 0.6411, "step": 3307 }, { "epoch": 0.26865914074555347, "grad_norm": 5.867738157836222, "learning_rate": 4.289831106685261e-06, "loss": 0.5529, "step": 3308 }, { "epoch": 0.26874035572159505, "grad_norm": 4.019129715439323, "learning_rate": 4.289371912981268e-06, "loss": 0.5655, "step": 3309 }, { "epoch": 0.2688215706976366, "grad_norm": 4.959547313791584, "learning_rate": 4.28891259546221e-06, "loss": 0.7881, "step": 3310 }, { "epoch": 0.2689027856736782, "grad_norm": 3.3894537647637466, "learning_rate": 4.288453154159869e-06, "loss": 0.6476, "step": 3311 }, { "epoch": 0.26898400064971983, "grad_norm": 4.576484804759739, "learning_rate": 4.287993589106034e-06, "loss": 0.4195, "step": 3312 }, { "epoch": 0.2690652156257614, "grad_norm": 5.5159598677639075, "learning_rate": 4.287533900332506e-06, "loss": 0.5731, "step": 3313 }, { "epoch": 0.269146430601803, "grad_norm": 3.476369390277886, "learning_rate": 4.287074087871092e-06, "loss": 0.7422, "step": 3314 }, { "epoch": 0.26922764557784457, "grad_norm": 4.3872226024253305, "learning_rate": 4.2866141517536085e-06, "loss": 0.5666, "step": 3315 }, { "epoch": 0.26930886055388614, "grad_norm": 4.1888189167131795, "learning_rate": 4.286154092011882e-06, "loss": 0.6373, "step": 3316 }, { "epoch": 0.2693900755299277, "grad_norm": 6.277380300582101, "learning_rate": 4.285693908677746e-06, "loss": 0.5566, "step": 3317 }, { "epoch": 0.2694712905059693, "grad_norm": 3.4248947125674802, "learning_rate": 4.285233601783041e-06, "loss": 0.7619, "step": 3318 }, { "epoch": 0.2695525054820109, "grad_norm": 9.083452534210942, "learning_rate": 4.28477317135962e-06, "loss": 0.441, "step": 3319 }, { "epoch": 0.26963372045805245, "grad_norm": 5.194205565045371, "learning_rate": 4.28431261743934e-06, "loss": 0.5573, "step": 3320 }, { "epoch": 0.269714935434094, "grad_norm": 4.678152409953697, "learning_rate": 4.2838519400540715e-06, "loss": 0.5867, "step": 3321 }, { "epoch": 0.2697961504101356, "grad_norm": 6.079974041201289, "learning_rate": 4.283391139235688e-06, "loss": 0.5913, "step": 3322 }, { "epoch": 0.26987736538617724, "grad_norm": 4.43636337967898, "learning_rate": 4.282930215016078e-06, "loss": 0.4492, "step": 3323 }, { "epoch": 0.2699585803622188, "grad_norm": 5.658242363279942, "learning_rate": 4.282469167427132e-06, "loss": 0.7522, "step": 3324 }, { "epoch": 0.2700397953382604, "grad_norm": 6.996911479616853, "learning_rate": 4.2820079965007545e-06, "loss": 0.5871, "step": 3325 }, { "epoch": 0.27012101031430197, "grad_norm": 4.53448881154164, "learning_rate": 4.281546702268853e-06, "loss": 0.3827, "step": 3326 }, { "epoch": 0.27020222529034355, "grad_norm": 6.424034548623486, "learning_rate": 4.28108528476335e-06, "loss": 0.3635, "step": 3327 }, { "epoch": 0.2702834402663851, "grad_norm": 4.466125493504516, "learning_rate": 4.280623744016171e-06, "loss": 0.5539, "step": 3328 }, { "epoch": 0.2703646552424267, "grad_norm": 3.693236745360508, "learning_rate": 4.280162080059252e-06, "loss": 0.5083, "step": 3329 }, { "epoch": 0.2704458702184683, "grad_norm": 7.92497787565039, "learning_rate": 4.279700292924539e-06, "loss": 0.6099, "step": 3330 }, { "epoch": 0.27052708519450985, "grad_norm": 6.126906391884235, "learning_rate": 4.279238382643985e-06, "loss": 0.4806, "step": 3331 }, { "epoch": 0.27060830017055143, "grad_norm": 10.147235977729743, "learning_rate": 4.278776349249551e-06, "loss": 0.5868, "step": 3332 }, { "epoch": 0.270689515146593, "grad_norm": 4.694547184635383, "learning_rate": 4.278314192773208e-06, "loss": 0.5579, "step": 3333 }, { "epoch": 0.27077073012263464, "grad_norm": 5.790255949685903, "learning_rate": 4.277851913246934e-06, "loss": 0.5259, "step": 3334 }, { "epoch": 0.2708519450986762, "grad_norm": 4.610604823465653, "learning_rate": 4.277389510702717e-06, "loss": 0.5614, "step": 3335 }, { "epoch": 0.2709331600747178, "grad_norm": 6.063379795121963, "learning_rate": 4.276926985172553e-06, "loss": 0.5133, "step": 3336 }, { "epoch": 0.27101437505075937, "grad_norm": 4.576106292327785, "learning_rate": 4.276464336688445e-06, "loss": 0.4748, "step": 3337 }, { "epoch": 0.27109559002680095, "grad_norm": 5.762207004631984, "learning_rate": 4.2760015652824074e-06, "loss": 0.7092, "step": 3338 }, { "epoch": 0.2711768050028425, "grad_norm": 4.920850361695753, "learning_rate": 4.27553867098646e-06, "loss": 0.4025, "step": 3339 }, { "epoch": 0.2712580199788841, "grad_norm": 4.965952642074921, "learning_rate": 4.275075653832635e-06, "loss": 0.5994, "step": 3340 }, { "epoch": 0.2713392349549257, "grad_norm": 6.661938259420609, "learning_rate": 4.274612513852968e-06, "loss": 0.6148, "step": 3341 }, { "epoch": 0.27142044993096726, "grad_norm": 4.176847484212596, "learning_rate": 4.274149251079507e-06, "loss": 0.4974, "step": 3342 }, { "epoch": 0.27150166490700883, "grad_norm": 9.324003992934651, "learning_rate": 4.273685865544308e-06, "loss": 0.518, "step": 3343 }, { "epoch": 0.2715828798830504, "grad_norm": 10.84005640144426, "learning_rate": 4.273222357279434e-06, "loss": 0.5259, "step": 3344 }, { "epoch": 0.27166409485909204, "grad_norm": 4.583615144085001, "learning_rate": 4.272758726316958e-06, "loss": 0.5688, "step": 3345 }, { "epoch": 0.2717453098351336, "grad_norm": 5.704188407998161, "learning_rate": 4.272294972688959e-06, "loss": 0.7256, "step": 3346 }, { "epoch": 0.2718265248111752, "grad_norm": 5.611046782367686, "learning_rate": 4.2718310964275285e-06, "loss": 0.4029, "step": 3347 }, { "epoch": 0.2719077397872168, "grad_norm": 3.432494651225952, "learning_rate": 4.271367097564763e-06, "loss": 0.5548, "step": 3348 }, { "epoch": 0.27198895476325835, "grad_norm": 5.993392392105414, "learning_rate": 4.27090297613277e-06, "loss": 0.4336, "step": 3349 }, { "epoch": 0.2720701697392999, "grad_norm": 5.973550784862252, "learning_rate": 4.270438732163663e-06, "loss": 0.5539, "step": 3350 }, { "epoch": 0.2721513847153415, "grad_norm": 7.021609581750342, "learning_rate": 4.269974365689565e-06, "loss": 0.5499, "step": 3351 }, { "epoch": 0.2722325996913831, "grad_norm": 4.0778999962904106, "learning_rate": 4.269509876742609e-06, "loss": 0.5756, "step": 3352 }, { "epoch": 0.27231381466742466, "grad_norm": 4.339699329280206, "learning_rate": 4.269045265354935e-06, "loss": 0.6475, "step": 3353 }, { "epoch": 0.27239502964346624, "grad_norm": 3.557598291542795, "learning_rate": 4.26858053155869e-06, "loss": 0.6789, "step": 3354 }, { "epoch": 0.2724762446195078, "grad_norm": 5.638513919453793, "learning_rate": 4.268115675386033e-06, "loss": 0.5661, "step": 3355 }, { "epoch": 0.27255745959554945, "grad_norm": 7.007916663451633, "learning_rate": 4.267650696869129e-06, "loss": 0.6213, "step": 3356 }, { "epoch": 0.272638674571591, "grad_norm": 6.707656670971426, "learning_rate": 4.267185596040152e-06, "loss": 0.6185, "step": 3357 }, { "epoch": 0.2727198895476326, "grad_norm": 5.468221363182492, "learning_rate": 4.266720372931285e-06, "loss": 0.6918, "step": 3358 }, { "epoch": 0.2728011045236742, "grad_norm": 3.4031280276219826, "learning_rate": 4.2662550275747175e-06, "loss": 0.4968, "step": 3359 }, { "epoch": 0.27288231949971575, "grad_norm": 3.7289743320117914, "learning_rate": 4.26578956000265e-06, "loss": 0.5128, "step": 3360 }, { "epoch": 0.27296353447575733, "grad_norm": 4.058327224361656, "learning_rate": 4.26532397024729e-06, "loss": 0.5662, "step": 3361 }, { "epoch": 0.2730447494517989, "grad_norm": 6.483948262408605, "learning_rate": 4.264858258340854e-06, "loss": 0.5985, "step": 3362 }, { "epoch": 0.2731259644278405, "grad_norm": 6.7970175715192065, "learning_rate": 4.264392424315568e-06, "loss": 0.5611, "step": 3363 }, { "epoch": 0.27320717940388206, "grad_norm": 4.719015734535453, "learning_rate": 4.263926468203663e-06, "loss": 0.5915, "step": 3364 }, { "epoch": 0.27328839437992364, "grad_norm": 5.439068319982008, "learning_rate": 4.2634603900373825e-06, "loss": 0.6571, "step": 3365 }, { "epoch": 0.2733696093559652, "grad_norm": 9.218141495545822, "learning_rate": 4.262994189848976e-06, "loss": 0.5393, "step": 3366 }, { "epoch": 0.27345082433200685, "grad_norm": 4.307338467347387, "learning_rate": 4.262527867670702e-06, "loss": 0.5521, "step": 3367 }, { "epoch": 0.2735320393080484, "grad_norm": 4.015516448961165, "learning_rate": 4.2620614235348265e-06, "loss": 0.4343, "step": 3368 }, { "epoch": 0.27361325428409, "grad_norm": 5.503396831053377, "learning_rate": 4.261594857473628e-06, "loss": 0.4884, "step": 3369 }, { "epoch": 0.2736944692601316, "grad_norm": 5.761220256833325, "learning_rate": 4.261128169519388e-06, "loss": 0.5385, "step": 3370 }, { "epoch": 0.27377568423617316, "grad_norm": 7.751355969511051, "learning_rate": 4.2606613597043975e-06, "loss": 0.5659, "step": 3371 }, { "epoch": 0.27385689921221473, "grad_norm": 4.671040413073545, "learning_rate": 4.260194428060961e-06, "loss": 0.4999, "step": 3372 }, { "epoch": 0.2739381141882563, "grad_norm": 7.901394405330174, "learning_rate": 4.2597273746213855e-06, "loss": 0.6527, "step": 3373 }, { "epoch": 0.2740193291642979, "grad_norm": 6.603352470726141, "learning_rate": 4.259260199417988e-06, "loss": 0.5473, "step": 3374 }, { "epoch": 0.27410054414033946, "grad_norm": 5.40024658636952, "learning_rate": 4.2587929024830964e-06, "loss": 0.5654, "step": 3375 }, { "epoch": 0.27418175911638104, "grad_norm": 9.759841121522395, "learning_rate": 4.258325483849044e-06, "loss": 0.6142, "step": 3376 }, { "epoch": 0.2742629740924226, "grad_norm": 5.125699081398096, "learning_rate": 4.257857943548173e-06, "loss": 0.5213, "step": 3377 }, { "epoch": 0.27434418906846425, "grad_norm": 4.34028312916136, "learning_rate": 4.257390281612837e-06, "loss": 0.4747, "step": 3378 }, { "epoch": 0.2744254040445058, "grad_norm": 4.263312229046128, "learning_rate": 4.256922498075394e-06, "loss": 0.5938, "step": 3379 }, { "epoch": 0.2745066190205474, "grad_norm": 4.797086281941267, "learning_rate": 4.256454592968212e-06, "loss": 0.4205, "step": 3380 }, { "epoch": 0.274587833996589, "grad_norm": 7.033829432880135, "learning_rate": 4.255986566323668e-06, "loss": 0.6218, "step": 3381 }, { "epoch": 0.27466904897263056, "grad_norm": 7.8902900782976095, "learning_rate": 4.255518418174148e-06, "loss": 0.5008, "step": 3382 }, { "epoch": 0.27475026394867214, "grad_norm": 3.2994006635611655, "learning_rate": 4.2550501485520445e-06, "loss": 0.694, "step": 3383 }, { "epoch": 0.2748314789247137, "grad_norm": 5.401884002359012, "learning_rate": 4.254581757489758e-06, "loss": 0.5462, "step": 3384 }, { "epoch": 0.2749126939007553, "grad_norm": 11.938262897785561, "learning_rate": 4.254113245019701e-06, "loss": 0.5148, "step": 3385 }, { "epoch": 0.27499390887679687, "grad_norm": 6.8820894517380555, "learning_rate": 4.25364461117429e-06, "loss": 0.6357, "step": 3386 }, { "epoch": 0.27507512385283844, "grad_norm": 3.919541658984626, "learning_rate": 4.2531758559859535e-06, "loss": 0.609, "step": 3387 }, { "epoch": 0.27515633882888, "grad_norm": 3.9922271367677475, "learning_rate": 4.252706979487127e-06, "loss": 0.5394, "step": 3388 }, { "epoch": 0.27523755380492165, "grad_norm": 4.307590756579932, "learning_rate": 4.2522379817102525e-06, "loss": 0.4479, "step": 3389 }, { "epoch": 0.27531876878096323, "grad_norm": 4.6485314568936715, "learning_rate": 4.251768862687783e-06, "loss": 0.5758, "step": 3390 }, { "epoch": 0.2753999837570048, "grad_norm": 3.219961667439573, "learning_rate": 4.25129962245218e-06, "loss": 0.5229, "step": 3391 }, { "epoch": 0.2754811987330464, "grad_norm": 3.6351810295222955, "learning_rate": 4.250830261035911e-06, "loss": 0.5625, "step": 3392 }, { "epoch": 0.27556241370908796, "grad_norm": 5.369015436096168, "learning_rate": 4.250360778471455e-06, "loss": 0.6667, "step": 3393 }, { "epoch": 0.27564362868512954, "grad_norm": 4.427072372118147, "learning_rate": 4.249891174791297e-06, "loss": 0.6424, "step": 3394 }, { "epoch": 0.2757248436611711, "grad_norm": 5.994983906902898, "learning_rate": 4.249421450027929e-06, "loss": 0.6288, "step": 3395 }, { "epoch": 0.2758060586372127, "grad_norm": 5.740886381814397, "learning_rate": 4.248951604213858e-06, "loss": 0.6017, "step": 3396 }, { "epoch": 0.27588727361325427, "grad_norm": 6.578063566695588, "learning_rate": 4.24848163738159e-06, "loss": 0.5481, "step": 3397 }, { "epoch": 0.27596848858929585, "grad_norm": 4.114106520635557, "learning_rate": 4.248011549563647e-06, "loss": 0.601, "step": 3398 }, { "epoch": 0.2760497035653374, "grad_norm": 5.445436802534245, "learning_rate": 4.247541340792557e-06, "loss": 0.5067, "step": 3399 }, { "epoch": 0.27613091854137906, "grad_norm": 5.480238969386964, "learning_rate": 4.247071011100855e-06, "loss": 0.4474, "step": 3400 }, { "epoch": 0.27621213351742063, "grad_norm": 3.9075643825539865, "learning_rate": 4.246600560521084e-06, "loss": 0.5349, "step": 3401 }, { "epoch": 0.2762933484934622, "grad_norm": 4.585770407486754, "learning_rate": 4.246129989085798e-06, "loss": 0.6055, "step": 3402 }, { "epoch": 0.2763745634695038, "grad_norm": 6.06730610156998, "learning_rate": 4.245659296827559e-06, "loss": 0.5557, "step": 3403 }, { "epoch": 0.27645577844554536, "grad_norm": 4.632787296567119, "learning_rate": 4.245188483778935e-06, "loss": 0.5214, "step": 3404 }, { "epoch": 0.27653699342158694, "grad_norm": 8.317514548088848, "learning_rate": 4.244717549972504e-06, "loss": 0.5625, "step": 3405 }, { "epoch": 0.2766182083976285, "grad_norm": 3.6101896303329255, "learning_rate": 4.2442464954408524e-06, "loss": 0.5844, "step": 3406 }, { "epoch": 0.2766994233736701, "grad_norm": 4.099925210417941, "learning_rate": 4.243775320216575e-06, "loss": 0.5043, "step": 3407 }, { "epoch": 0.27678063834971167, "grad_norm": 5.864792263094429, "learning_rate": 4.243304024332273e-06, "loss": 0.5581, "step": 3408 }, { "epoch": 0.27686185332575325, "grad_norm": 4.148641967222804, "learning_rate": 4.24283260782056e-06, "loss": 0.7032, "step": 3409 }, { "epoch": 0.2769430683017948, "grad_norm": 5.951997919915405, "learning_rate": 4.2423610707140545e-06, "loss": 0.4561, "step": 3410 }, { "epoch": 0.27702428327783646, "grad_norm": 3.9198889092781233, "learning_rate": 4.241889413045384e-06, "loss": 0.6652, "step": 3411 }, { "epoch": 0.27710549825387804, "grad_norm": 4.4203783957213725, "learning_rate": 4.2414176348471845e-06, "loss": 0.7011, "step": 3412 }, { "epoch": 0.2771867132299196, "grad_norm": 2.6236418539062303, "learning_rate": 4.240945736152101e-06, "loss": 0.5444, "step": 3413 }, { "epoch": 0.2772679282059612, "grad_norm": 3.283626852296422, "learning_rate": 4.240473716992786e-06, "loss": 0.5089, "step": 3414 }, { "epoch": 0.27734914318200277, "grad_norm": 4.388557235644921, "learning_rate": 4.240001577401903e-06, "loss": 0.6291, "step": 3415 }, { "epoch": 0.27743035815804434, "grad_norm": 5.882384300446532, "learning_rate": 4.239529317412118e-06, "loss": 0.3563, "step": 3416 }, { "epoch": 0.2775115731340859, "grad_norm": 5.663696285420261, "learning_rate": 4.239056937056111e-06, "loss": 0.4948, "step": 3417 }, { "epoch": 0.2775927881101275, "grad_norm": 5.913404763781138, "learning_rate": 4.238584436366568e-06, "loss": 0.7371, "step": 3418 }, { "epoch": 0.2776740030861691, "grad_norm": 2.9595554999598455, "learning_rate": 4.238111815376182e-06, "loss": 0.5524, "step": 3419 }, { "epoch": 0.27775521806221065, "grad_norm": 6.622001053281284, "learning_rate": 4.23763907411766e-06, "loss": 0.436, "step": 3420 }, { "epoch": 0.27783643303825223, "grad_norm": 4.194004633417891, "learning_rate": 4.237166212623708e-06, "loss": 0.4323, "step": 3421 }, { "epoch": 0.27791764801429386, "grad_norm": 5.700219795773494, "learning_rate": 4.236693230927048e-06, "loss": 0.496, "step": 3422 }, { "epoch": 0.27799886299033544, "grad_norm": 4.15322324920816, "learning_rate": 4.2362201290604085e-06, "loss": 0.5907, "step": 3423 }, { "epoch": 0.278080077966377, "grad_norm": 5.540030074318849, "learning_rate": 4.235746907056525e-06, "loss": 0.652, "step": 3424 }, { "epoch": 0.2781612929424186, "grad_norm": 6.39998859259988, "learning_rate": 4.235273564948142e-06, "loss": 0.4876, "step": 3425 }, { "epoch": 0.27824250791846017, "grad_norm": 3.7525190738960816, "learning_rate": 4.234800102768012e-06, "loss": 0.7998, "step": 3426 }, { "epoch": 0.27832372289450175, "grad_norm": 4.294740550636778, "learning_rate": 4.234326520548895e-06, "loss": 0.6744, "step": 3427 }, { "epoch": 0.2784049378705433, "grad_norm": 5.52457863696226, "learning_rate": 4.233852818323563e-06, "loss": 0.4104, "step": 3428 }, { "epoch": 0.2784861528465849, "grad_norm": 5.182650646362298, "learning_rate": 4.233378996124792e-06, "loss": 0.5443, "step": 3429 }, { "epoch": 0.2785673678226265, "grad_norm": 8.063914556948795, "learning_rate": 4.232905053985368e-06, "loss": 0.6733, "step": 3430 }, { "epoch": 0.27864858279866805, "grad_norm": 3.9395695955779186, "learning_rate": 4.232430991938085e-06, "loss": 0.5109, "step": 3431 }, { "epoch": 0.27872979777470963, "grad_norm": 2.5171709827838993, "learning_rate": 4.231956810015747e-06, "loss": 0.672, "step": 3432 }, { "epoch": 0.27881101275075126, "grad_norm": 5.452147154881219, "learning_rate": 4.231482508251164e-06, "loss": 0.5055, "step": 3433 }, { "epoch": 0.27889222772679284, "grad_norm": 10.095151440140139, "learning_rate": 4.231008086677154e-06, "loss": 0.4714, "step": 3434 }, { "epoch": 0.2789734427028344, "grad_norm": 4.246691334687768, "learning_rate": 4.230533545326547e-06, "loss": 0.5819, "step": 3435 }, { "epoch": 0.279054657678876, "grad_norm": 7.118049034428091, "learning_rate": 4.230058884232177e-06, "loss": 0.7539, "step": 3436 }, { "epoch": 0.27913587265491757, "grad_norm": 4.493070103171718, "learning_rate": 4.229584103426888e-06, "loss": 0.5447, "step": 3437 }, { "epoch": 0.27921708763095915, "grad_norm": 3.9508050807048236, "learning_rate": 4.229109202943533e-06, "loss": 0.5914, "step": 3438 }, { "epoch": 0.2792983026070007, "grad_norm": 6.3185262215803775, "learning_rate": 4.228634182814972e-06, "loss": 0.5831, "step": 3439 }, { "epoch": 0.2793795175830423, "grad_norm": 4.369461605720211, "learning_rate": 4.228159043074075e-06, "loss": 0.6527, "step": 3440 }, { "epoch": 0.2794607325590839, "grad_norm": 6.77973389079385, "learning_rate": 4.227683783753717e-06, "loss": 0.5189, "step": 3441 }, { "epoch": 0.27954194753512546, "grad_norm": 4.379921845032675, "learning_rate": 4.227208404886787e-06, "loss": 0.5284, "step": 3442 }, { "epoch": 0.27962316251116703, "grad_norm": 4.507924635404893, "learning_rate": 4.2267329065061745e-06, "loss": 0.5291, "step": 3443 }, { "epoch": 0.27970437748720867, "grad_norm": 6.075130482555993, "learning_rate": 4.226257288644784e-06, "loss": 0.5183, "step": 3444 }, { "epoch": 0.27978559246325024, "grad_norm": 4.72530804483612, "learning_rate": 4.225781551335526e-06, "loss": 0.5276, "step": 3445 }, { "epoch": 0.2798668074392918, "grad_norm": 5.2670855376913535, "learning_rate": 4.225305694611318e-06, "loss": 0.5282, "step": 3446 }, { "epoch": 0.2799480224153334, "grad_norm": 4.900878051447264, "learning_rate": 4.224829718505087e-06, "loss": 0.5453, "step": 3447 }, { "epoch": 0.280029237391375, "grad_norm": 7.178013534818383, "learning_rate": 4.224353623049767e-06, "loss": 0.6766, "step": 3448 }, { "epoch": 0.28011045236741655, "grad_norm": 4.664935512001858, "learning_rate": 4.2238774082783025e-06, "loss": 0.6089, "step": 3449 }, { "epoch": 0.28019166734345813, "grad_norm": 4.852476116024416, "learning_rate": 4.223401074223646e-06, "loss": 0.6572, "step": 3450 }, { "epoch": 0.2802728823194997, "grad_norm": 8.563098982741186, "learning_rate": 4.222924620918755e-06, "loss": 0.5789, "step": 3451 }, { "epoch": 0.2803540972955413, "grad_norm": 4.014234539171271, "learning_rate": 4.222448048396599e-06, "loss": 0.6374, "step": 3452 }, { "epoch": 0.28043531227158286, "grad_norm": 4.822310015531145, "learning_rate": 4.221971356690154e-06, "loss": 0.5382, "step": 3453 }, { "epoch": 0.28051652724762444, "grad_norm": 4.267417151682677, "learning_rate": 4.221494545832405e-06, "loss": 0.4697, "step": 3454 }, { "epoch": 0.28059774222366607, "grad_norm": 8.796169733126867, "learning_rate": 4.221017615856344e-06, "loss": 0.467, "step": 3455 }, { "epoch": 0.28067895719970765, "grad_norm": 5.742915475497312, "learning_rate": 4.220540566794972e-06, "loss": 0.5189, "step": 3456 }, { "epoch": 0.2807601721757492, "grad_norm": 4.702000191158769, "learning_rate": 4.220063398681299e-06, "loss": 0.6394, "step": 3457 }, { "epoch": 0.2808413871517908, "grad_norm": 7.827322462489466, "learning_rate": 4.219586111548342e-06, "loss": 0.427, "step": 3458 }, { "epoch": 0.2809226021278324, "grad_norm": 4.5952790790019495, "learning_rate": 4.219108705429127e-06, "loss": 0.5275, "step": 3459 }, { "epoch": 0.28100381710387395, "grad_norm": 4.260809941886097, "learning_rate": 4.218631180356688e-06, "loss": 0.6774, "step": 3460 }, { "epoch": 0.28108503207991553, "grad_norm": 5.933145706552524, "learning_rate": 4.218153536364067e-06, "loss": 0.5556, "step": 3461 }, { "epoch": 0.2811662470559571, "grad_norm": 5.5215075629008545, "learning_rate": 4.217675773484314e-06, "loss": 0.6402, "step": 3462 }, { "epoch": 0.2812474620319987, "grad_norm": 4.367099875870321, "learning_rate": 4.217197891750488e-06, "loss": 0.553, "step": 3463 }, { "epoch": 0.28132867700804026, "grad_norm": 8.120031592132388, "learning_rate": 4.216719891195657e-06, "loss": 0.7601, "step": 3464 }, { "epoch": 0.28140989198408184, "grad_norm": 13.915399996993806, "learning_rate": 4.216241771852895e-06, "loss": 0.3699, "step": 3465 }, { "epoch": 0.28149110696012347, "grad_norm": 4.475259050725444, "learning_rate": 4.215763533755285e-06, "loss": 0.6308, "step": 3466 }, { "epoch": 0.28157232193616505, "grad_norm": 4.246073015911453, "learning_rate": 4.215285176935919e-06, "loss": 0.5941, "step": 3467 }, { "epoch": 0.2816535369122066, "grad_norm": 4.463656799978896, "learning_rate": 4.214806701427896e-06, "loss": 0.5504, "step": 3468 }, { "epoch": 0.2817347518882482, "grad_norm": 2.9321912614363517, "learning_rate": 4.214328107264326e-06, "loss": 0.5993, "step": 3469 }, { "epoch": 0.2818159668642898, "grad_norm": 5.912480103116702, "learning_rate": 4.213849394478323e-06, "loss": 0.8218, "step": 3470 }, { "epoch": 0.28189718184033136, "grad_norm": 3.0648081596549996, "learning_rate": 4.213370563103013e-06, "loss": 0.5549, "step": 3471 }, { "epoch": 0.28197839681637293, "grad_norm": 7.610704699641839, "learning_rate": 4.212891613171528e-06, "loss": 0.539, "step": 3472 }, { "epoch": 0.2820596117924145, "grad_norm": 4.821090407393734, "learning_rate": 4.212412544717009e-06, "loss": 0.5433, "step": 3473 }, { "epoch": 0.2821408267684561, "grad_norm": 4.480177003679071, "learning_rate": 4.211933357772604e-06, "loss": 0.5649, "step": 3474 }, { "epoch": 0.28222204174449766, "grad_norm": 6.0539202446851546, "learning_rate": 4.211454052371471e-06, "loss": 0.5074, "step": 3475 }, { "epoch": 0.28230325672053924, "grad_norm": 3.2128791712651066, "learning_rate": 4.210974628546776e-06, "loss": 0.6066, "step": 3476 }, { "epoch": 0.2823844716965809, "grad_norm": 5.748988331106903, "learning_rate": 4.210495086331691e-06, "loss": 0.5114, "step": 3477 }, { "epoch": 0.28246568667262245, "grad_norm": 2.985462121611765, "learning_rate": 4.2100154257594e-06, "loss": 0.6491, "step": 3478 }, { "epoch": 0.28254690164866403, "grad_norm": 4.631989287533722, "learning_rate": 4.20953564686309e-06, "loss": 0.5046, "step": 3479 }, { "epoch": 0.2826281166247056, "grad_norm": 7.645616919092413, "learning_rate": 4.2090557496759615e-06, "loss": 0.5868, "step": 3480 }, { "epoch": 0.2827093316007472, "grad_norm": 13.216352857254574, "learning_rate": 4.208575734231221e-06, "loss": 0.539, "step": 3481 }, { "epoch": 0.28279054657678876, "grad_norm": 4.282296217606311, "learning_rate": 4.208095600562081e-06, "loss": 0.6534, "step": 3482 }, { "epoch": 0.28287176155283034, "grad_norm": 3.613475303783212, "learning_rate": 4.2076153487017655e-06, "loss": 0.5671, "step": 3483 }, { "epoch": 0.2829529765288719, "grad_norm": 4.958757945118231, "learning_rate": 4.207134978683506e-06, "loss": 0.5416, "step": 3484 }, { "epoch": 0.2830341915049135, "grad_norm": 4.199726726733855, "learning_rate": 4.206654490540541e-06, "loss": 0.6141, "step": 3485 }, { "epoch": 0.28311540648095507, "grad_norm": 4.600582479141581, "learning_rate": 4.206173884306116e-06, "loss": 0.6657, "step": 3486 }, { "epoch": 0.28319662145699664, "grad_norm": 7.6947988469297375, "learning_rate": 4.20569316001349e-06, "loss": 0.5541, "step": 3487 }, { "epoch": 0.2832778364330383, "grad_norm": 5.367699544702637, "learning_rate": 4.205212317695924e-06, "loss": 0.6524, "step": 3488 }, { "epoch": 0.28335905140907985, "grad_norm": 6.029688715188512, "learning_rate": 4.204731357386689e-06, "loss": 0.5518, "step": 3489 }, { "epoch": 0.28344026638512143, "grad_norm": 4.352090436645465, "learning_rate": 4.204250279119068e-06, "loss": 0.5141, "step": 3490 }, { "epoch": 0.283521481361163, "grad_norm": 4.073146574157928, "learning_rate": 4.203769082926346e-06, "loss": 0.5047, "step": 3491 }, { "epoch": 0.2836026963372046, "grad_norm": 4.576078022715138, "learning_rate": 4.203287768841822e-06, "loss": 0.5063, "step": 3492 }, { "epoch": 0.28368391131324616, "grad_norm": 5.141508394238785, "learning_rate": 4.202806336898798e-06, "loss": 0.5, "step": 3493 }, { "epoch": 0.28376512628928774, "grad_norm": 4.33832646184562, "learning_rate": 4.202324787130587e-06, "loss": 0.5375, "step": 3494 }, { "epoch": 0.2838463412653293, "grad_norm": 7.671372488757828, "learning_rate": 4.201843119570511e-06, "loss": 0.6192, "step": 3495 }, { "epoch": 0.2839275562413709, "grad_norm": 6.000625361865301, "learning_rate": 4.201361334251898e-06, "loss": 0.5039, "step": 3496 }, { "epoch": 0.28400877121741247, "grad_norm": 10.56462611054689, "learning_rate": 4.200879431208084e-06, "loss": 0.4379, "step": 3497 }, { "epoch": 0.28408998619345405, "grad_norm": 5.2118113489010325, "learning_rate": 4.200397410472416e-06, "loss": 0.4859, "step": 3498 }, { "epoch": 0.2841712011694957, "grad_norm": 3.7680592065666447, "learning_rate": 4.199915272078247e-06, "loss": 0.5509, "step": 3499 }, { "epoch": 0.28425241614553726, "grad_norm": 4.998842782205934, "learning_rate": 4.199433016058936e-06, "loss": 0.5495, "step": 3500 }, { "epoch": 0.28433363112157883, "grad_norm": 3.6541213589146166, "learning_rate": 4.198950642447856e-06, "loss": 0.7963, "step": 3501 }, { "epoch": 0.2844148460976204, "grad_norm": 4.6320107487602655, "learning_rate": 4.198468151278382e-06, "loss": 0.5354, "step": 3502 }, { "epoch": 0.284496061073662, "grad_norm": 16.856109294832514, "learning_rate": 4.197985542583902e-06, "loss": 0.5639, "step": 3503 }, { "epoch": 0.28457727604970356, "grad_norm": 4.341047779732985, "learning_rate": 4.197502816397809e-06, "loss": 0.5327, "step": 3504 }, { "epoch": 0.28465849102574514, "grad_norm": 6.54679516358405, "learning_rate": 4.197019972753504e-06, "loss": 0.5685, "step": 3505 }, { "epoch": 0.2847397060017867, "grad_norm": 8.538628468801688, "learning_rate": 4.1965370116843985e-06, "loss": 0.6608, "step": 3506 }, { "epoch": 0.2848209209778283, "grad_norm": 6.781146736308602, "learning_rate": 4.1960539332239115e-06, "loss": 0.5363, "step": 3507 }, { "epoch": 0.2849021359538699, "grad_norm": 6.568218511710609, "learning_rate": 4.195570737405468e-06, "loss": 0.4654, "step": 3508 }, { "epoch": 0.28498335092991145, "grad_norm": 6.14037981907578, "learning_rate": 4.195087424262503e-06, "loss": 0.6075, "step": 3509 }, { "epoch": 0.2850645659059531, "grad_norm": 3.9057111334726016, "learning_rate": 4.194603993828459e-06, "loss": 0.4975, "step": 3510 }, { "epoch": 0.28514578088199466, "grad_norm": 6.56859758615991, "learning_rate": 4.194120446136788e-06, "loss": 0.6143, "step": 3511 }, { "epoch": 0.28522699585803624, "grad_norm": 7.248281271806781, "learning_rate": 4.193636781220948e-06, "loss": 0.6135, "step": 3512 }, { "epoch": 0.2853082108340778, "grad_norm": 3.6814056969521265, "learning_rate": 4.1931529991144056e-06, "loss": 0.644, "step": 3513 }, { "epoch": 0.2853894258101194, "grad_norm": 4.124004556862366, "learning_rate": 4.192669099850637e-06, "loss": 0.4091, "step": 3514 }, { "epoch": 0.28547064078616097, "grad_norm": 9.237020510741628, "learning_rate": 4.192185083463125e-06, "loss": 0.6916, "step": 3515 }, { "epoch": 0.28555185576220254, "grad_norm": 4.325518645446921, "learning_rate": 4.19170094998536e-06, "loss": 0.641, "step": 3516 }, { "epoch": 0.2856330707382441, "grad_norm": 3.772328467417333, "learning_rate": 4.191216699450844e-06, "loss": 0.5248, "step": 3517 }, { "epoch": 0.2857142857142857, "grad_norm": 4.5828680511801, "learning_rate": 4.190732331893083e-06, "loss": 0.5488, "step": 3518 }, { "epoch": 0.2857955006903273, "grad_norm": 6.7144709613110045, "learning_rate": 4.190247847345591e-06, "loss": 0.6085, "step": 3519 }, { "epoch": 0.28587671566636885, "grad_norm": 6.61578411126398, "learning_rate": 4.189763245841895e-06, "loss": 0.4582, "step": 3520 }, { "epoch": 0.2859579306424105, "grad_norm": 7.697721977409483, "learning_rate": 4.189278527415524e-06, "loss": 0.4666, "step": 3521 }, { "epoch": 0.28603914561845206, "grad_norm": 9.805505848445298, "learning_rate": 4.188793692100021e-06, "loss": 0.5197, "step": 3522 }, { "epoch": 0.28612036059449364, "grad_norm": 4.38159024051503, "learning_rate": 4.1883087399289315e-06, "loss": 0.6191, "step": 3523 }, { "epoch": 0.2862015755705352, "grad_norm": 5.443162368213296, "learning_rate": 4.187823670935812e-06, "loss": 0.4839, "step": 3524 }, { "epoch": 0.2862827905465768, "grad_norm": 9.060902521439406, "learning_rate": 4.187338485154228e-06, "loss": 0.5212, "step": 3525 }, { "epoch": 0.28636400552261837, "grad_norm": 4.5276722061450405, "learning_rate": 4.186853182617751e-06, "loss": 0.5874, "step": 3526 }, { "epoch": 0.28644522049865995, "grad_norm": 4.787775939104659, "learning_rate": 4.1863677633599605e-06, "loss": 0.6216, "step": 3527 }, { "epoch": 0.2865264354747015, "grad_norm": 4.032333523480888, "learning_rate": 4.1858822274144465e-06, "loss": 0.8107, "step": 3528 }, { "epoch": 0.2866076504507431, "grad_norm": 4.164487711997063, "learning_rate": 4.185396574814804e-06, "loss": 0.5834, "step": 3529 }, { "epoch": 0.2866888654267847, "grad_norm": 4.736119308914805, "learning_rate": 4.184910805594639e-06, "loss": 0.5395, "step": 3530 }, { "epoch": 0.28677008040282626, "grad_norm": 4.206365573240851, "learning_rate": 4.184424919787563e-06, "loss": 0.634, "step": 3531 }, { "epoch": 0.2868512953788679, "grad_norm": 3.609471547339424, "learning_rate": 4.183938917427198e-06, "loss": 0.512, "step": 3532 }, { "epoch": 0.28693251035490946, "grad_norm": 3.3756577667802303, "learning_rate": 4.183452798547171e-06, "loss": 0.6594, "step": 3533 }, { "epoch": 0.28701372533095104, "grad_norm": 4.577782600286523, "learning_rate": 4.1829665631811214e-06, "loss": 0.5162, "step": 3534 }, { "epoch": 0.2870949403069926, "grad_norm": 5.423760632233463, "learning_rate": 4.182480211362691e-06, "loss": 0.6164, "step": 3535 }, { "epoch": 0.2871761552830342, "grad_norm": 8.37175609091639, "learning_rate": 4.181993743125535e-06, "loss": 0.5202, "step": 3536 }, { "epoch": 0.2872573702590758, "grad_norm": 4.332707609863406, "learning_rate": 4.181507158503314e-06, "loss": 0.7249, "step": 3537 }, { "epoch": 0.28733858523511735, "grad_norm": 4.538954914725168, "learning_rate": 4.1810204575296966e-06, "loss": 0.5169, "step": 3538 }, { "epoch": 0.2874198002111589, "grad_norm": 4.191746753570993, "learning_rate": 4.180533640238361e-06, "loss": 0.6053, "step": 3539 }, { "epoch": 0.2875010151872005, "grad_norm": 4.85627080590084, "learning_rate": 4.180046706662991e-06, "loss": 0.5235, "step": 3540 }, { "epoch": 0.2875822301632421, "grad_norm": 6.5813372383035915, "learning_rate": 4.17955965683728e-06, "loss": 0.5635, "step": 3541 }, { "epoch": 0.28766344513928366, "grad_norm": 6.353502644005653, "learning_rate": 4.17907249079493e-06, "loss": 0.5271, "step": 3542 }, { "epoch": 0.2877446601153253, "grad_norm": 3.316222423088927, "learning_rate": 4.17858520856965e-06, "loss": 0.5198, "step": 3543 }, { "epoch": 0.28782587509136687, "grad_norm": 3.6027797950419846, "learning_rate": 4.178097810195157e-06, "loss": 0.5364, "step": 3544 }, { "epoch": 0.28790709006740844, "grad_norm": 6.196439592693107, "learning_rate": 4.177610295705178e-06, "loss": 0.5973, "step": 3545 }, { "epoch": 0.28798830504345, "grad_norm": 3.281604318639601, "learning_rate": 4.177122665133444e-06, "loss": 0.6097, "step": 3546 }, { "epoch": 0.2880695200194916, "grad_norm": 5.0788056380548525, "learning_rate": 4.176634918513698e-06, "loss": 0.479, "step": 3547 }, { "epoch": 0.2881507349955332, "grad_norm": 10.492165661109485, "learning_rate": 4.176147055879689e-06, "loss": 0.6935, "step": 3548 }, { "epoch": 0.28823194997157475, "grad_norm": 5.566980762163421, "learning_rate": 4.175659077265175e-06, "loss": 0.5723, "step": 3549 }, { "epoch": 0.28831316494761633, "grad_norm": 4.510747031993934, "learning_rate": 4.175170982703921e-06, "loss": 0.4667, "step": 3550 }, { "epoch": 0.2883943799236579, "grad_norm": 5.795017468211081, "learning_rate": 4.1746827722297e-06, "loss": 0.6429, "step": 3551 }, { "epoch": 0.2884755948996995, "grad_norm": 4.90772281384199, "learning_rate": 4.174194445876295e-06, "loss": 0.6138, "step": 3552 }, { "epoch": 0.28855680987574106, "grad_norm": 9.38458036391076, "learning_rate": 4.1737060036774945e-06, "loss": 0.5942, "step": 3553 }, { "epoch": 0.2886380248517827, "grad_norm": 3.744621239251605, "learning_rate": 4.173217445667097e-06, "loss": 0.4725, "step": 3554 }, { "epoch": 0.28871923982782427, "grad_norm": 3.5760812252375858, "learning_rate": 4.172728771878908e-06, "loss": 0.4955, "step": 3555 }, { "epoch": 0.28880045480386585, "grad_norm": 5.300448357602322, "learning_rate": 4.17223998234674e-06, "loss": 0.5496, "step": 3556 }, { "epoch": 0.2888816697799074, "grad_norm": 4.301151535524972, "learning_rate": 4.171751077104415e-06, "loss": 0.6269, "step": 3557 }, { "epoch": 0.288962884755949, "grad_norm": 4.150238159594849, "learning_rate": 4.171262056185764e-06, "loss": 0.6023, "step": 3558 }, { "epoch": 0.2890440997319906, "grad_norm": 6.588110479356063, "learning_rate": 4.170772919624624e-06, "loss": 0.5044, "step": 3559 }, { "epoch": 0.28912531470803216, "grad_norm": 9.52780556407402, "learning_rate": 4.170283667454839e-06, "loss": 0.4627, "step": 3560 }, { "epoch": 0.28920652968407373, "grad_norm": 3.112608140290074, "learning_rate": 4.169794299710266e-06, "loss": 0.5403, "step": 3561 }, { "epoch": 0.2892877446601153, "grad_norm": 5.782425679496101, "learning_rate": 4.169304816424763e-06, "loss": 0.6422, "step": 3562 }, { "epoch": 0.2893689596361569, "grad_norm": 6.327129043676947, "learning_rate": 4.168815217632202e-06, "loss": 0.5983, "step": 3563 }, { "epoch": 0.28945017461219846, "grad_norm": 5.64892446517355, "learning_rate": 4.168325503366461e-06, "loss": 0.6639, "step": 3564 }, { "epoch": 0.2895313895882401, "grad_norm": 4.212091644362981, "learning_rate": 4.167835673661422e-06, "loss": 0.5173, "step": 3565 }, { "epoch": 0.2896126045642817, "grad_norm": 4.627484710444349, "learning_rate": 4.167345728550984e-06, "loss": 0.5776, "step": 3566 }, { "epoch": 0.28969381954032325, "grad_norm": 6.431815623240765, "learning_rate": 4.166855668069045e-06, "loss": 0.5357, "step": 3567 }, { "epoch": 0.2897750345163648, "grad_norm": 5.104463737375422, "learning_rate": 4.166365492249514e-06, "loss": 0.4888, "step": 3568 }, { "epoch": 0.2898562494924064, "grad_norm": 2.6553169339995755, "learning_rate": 4.1658752011263125e-06, "loss": 0.4652, "step": 3569 }, { "epoch": 0.289937464468448, "grad_norm": 4.522379342995965, "learning_rate": 4.1653847947333625e-06, "loss": 0.6268, "step": 3570 }, { "epoch": 0.29001867944448956, "grad_norm": 4.206423587982746, "learning_rate": 4.164894273104599e-06, "loss": 0.7023, "step": 3571 }, { "epoch": 0.29009989442053113, "grad_norm": 3.550921658943635, "learning_rate": 4.164403636273963e-06, "loss": 0.6467, "step": 3572 }, { "epoch": 0.2901811093965727, "grad_norm": 4.41960430630749, "learning_rate": 4.163912884275403e-06, "loss": 0.6449, "step": 3573 }, { "epoch": 0.2902623243726143, "grad_norm": 5.335473970114484, "learning_rate": 4.163422017142879e-06, "loss": 0.532, "step": 3574 }, { "epoch": 0.29034353934865587, "grad_norm": 9.485322332094816, "learning_rate": 4.162931034910354e-06, "loss": 0.5057, "step": 3575 }, { "epoch": 0.2904247543246975, "grad_norm": 5.472990906211878, "learning_rate": 4.162439937611803e-06, "loss": 0.5523, "step": 3576 }, { "epoch": 0.2905059693007391, "grad_norm": 4.626244114541653, "learning_rate": 4.161948725281206e-06, "loss": 0.5859, "step": 3577 }, { "epoch": 0.29058718427678065, "grad_norm": 4.102620389372604, "learning_rate": 4.161457397952553e-06, "loss": 0.5745, "step": 3578 }, { "epoch": 0.29066839925282223, "grad_norm": 5.532218377747797, "learning_rate": 4.160965955659843e-06, "loss": 0.4707, "step": 3579 }, { "epoch": 0.2907496142288638, "grad_norm": 4.2544863217335545, "learning_rate": 4.160474398437077e-06, "loss": 0.5238, "step": 3580 }, { "epoch": 0.2908308292049054, "grad_norm": 5.496605956094565, "learning_rate": 4.159982726318271e-06, "loss": 0.8256, "step": 3581 }, { "epoch": 0.29091204418094696, "grad_norm": 4.300837180788803, "learning_rate": 4.159490939337447e-06, "loss": 0.6179, "step": 3582 }, { "epoch": 0.29099325915698854, "grad_norm": 3.957326025756882, "learning_rate": 4.158999037528632e-06, "loss": 0.5216, "step": 3583 }, { "epoch": 0.2910744741330301, "grad_norm": 5.327052291363593, "learning_rate": 4.1585070209258635e-06, "loss": 0.859, "step": 3584 }, { "epoch": 0.2911556891090717, "grad_norm": 6.708637795707786, "learning_rate": 4.158014889563187e-06, "loss": 0.4936, "step": 3585 }, { "epoch": 0.29123690408511327, "grad_norm": 3.6081169619085487, "learning_rate": 4.157522643474654e-06, "loss": 0.4951, "step": 3586 }, { "epoch": 0.2913181190611549, "grad_norm": 5.692407639431199, "learning_rate": 4.157030282694328e-06, "loss": 0.4514, "step": 3587 }, { "epoch": 0.2913993340371965, "grad_norm": 5.96319840346763, "learning_rate": 4.156537807256275e-06, "loss": 0.6435, "step": 3588 }, { "epoch": 0.29148054901323805, "grad_norm": 4.057628153052015, "learning_rate": 4.156045217194573e-06, "loss": 0.5785, "step": 3589 }, { "epoch": 0.29156176398927963, "grad_norm": 4.725549294999352, "learning_rate": 4.1555525125433074e-06, "loss": 0.561, "step": 3590 }, { "epoch": 0.2916429789653212, "grad_norm": 4.897497425355875, "learning_rate": 4.155059693336569e-06, "loss": 0.4877, "step": 3591 }, { "epoch": 0.2917241939413628, "grad_norm": 6.7306605891617, "learning_rate": 4.1545667596084596e-06, "loss": 0.6536, "step": 3592 }, { "epoch": 0.29180540891740436, "grad_norm": 4.458589616639652, "learning_rate": 4.154073711393087e-06, "loss": 0.6075, "step": 3593 }, { "epoch": 0.29188662389344594, "grad_norm": 16.172763248388275, "learning_rate": 4.153580548724567e-06, "loss": 0.5503, "step": 3594 }, { "epoch": 0.2919678388694875, "grad_norm": 5.597703145374191, "learning_rate": 4.153087271637025e-06, "loss": 0.5837, "step": 3595 }, { "epoch": 0.2920490538455291, "grad_norm": 11.754470585418524, "learning_rate": 4.1525938801645926e-06, "loss": 0.5668, "step": 3596 }, { "epoch": 0.29213026882157067, "grad_norm": 6.254803456678843, "learning_rate": 4.152100374341409e-06, "loss": 0.5732, "step": 3597 }, { "epoch": 0.2922114837976123, "grad_norm": 5.190101012718226, "learning_rate": 4.151606754201625e-06, "loss": 0.6051, "step": 3598 }, { "epoch": 0.2922926987736539, "grad_norm": 10.669928997016616, "learning_rate": 4.151113019779393e-06, "loss": 0.639, "step": 3599 }, { "epoch": 0.29237391374969546, "grad_norm": 5.619799524172815, "learning_rate": 4.150619171108879e-06, "loss": 0.5745, "step": 3600 }, { "epoch": 0.29245512872573703, "grad_norm": 4.562644028883115, "learning_rate": 4.150125208224255e-06, "loss": 0.6914, "step": 3601 }, { "epoch": 0.2925363437017786, "grad_norm": 4.491512432892291, "learning_rate": 4.149631131159698e-06, "loss": 0.4882, "step": 3602 }, { "epoch": 0.2926175586778202, "grad_norm": 4.214753071853409, "learning_rate": 4.149136939949399e-06, "loss": 0.5967, "step": 3603 }, { "epoch": 0.29269877365386177, "grad_norm": 6.53315802645859, "learning_rate": 4.14864263462755e-06, "loss": 0.5239, "step": 3604 }, { "epoch": 0.29277998862990334, "grad_norm": 5.806026900388708, "learning_rate": 4.148148215228357e-06, "loss": 0.6479, "step": 3605 }, { "epoch": 0.2928612036059449, "grad_norm": 4.392767448225316, "learning_rate": 4.147653681786031e-06, "loss": 0.5045, "step": 3606 }, { "epoch": 0.2929424185819865, "grad_norm": 7.680285801817868, "learning_rate": 4.147159034334789e-06, "loss": 0.5433, "step": 3607 }, { "epoch": 0.2930236335580281, "grad_norm": 5.275584522954317, "learning_rate": 4.146664272908859e-06, "loss": 0.5867, "step": 3608 }, { "epoch": 0.2931048485340697, "grad_norm": 5.37369190885388, "learning_rate": 4.146169397542478e-06, "loss": 0.5683, "step": 3609 }, { "epoch": 0.2931860635101113, "grad_norm": 3.8049589428305413, "learning_rate": 4.145674408269885e-06, "loss": 0.6545, "step": 3610 }, { "epoch": 0.29326727848615286, "grad_norm": 5.091756495748853, "learning_rate": 4.145179305125333e-06, "loss": 0.5765, "step": 3611 }, { "epoch": 0.29334849346219444, "grad_norm": 13.084271350497739, "learning_rate": 4.14468408814308e-06, "loss": 0.5044, "step": 3612 }, { "epoch": 0.293429708438236, "grad_norm": 5.905329389388773, "learning_rate": 4.1441887573573935e-06, "loss": 0.5513, "step": 3613 }, { "epoch": 0.2935109234142776, "grad_norm": 4.255974685046024, "learning_rate": 4.143693312802546e-06, "loss": 0.4885, "step": 3614 }, { "epoch": 0.29359213839031917, "grad_norm": 7.189955017278807, "learning_rate": 4.143197754512821e-06, "loss": 0.5293, "step": 3615 }, { "epoch": 0.29367335336636075, "grad_norm": 4.434545028745304, "learning_rate": 4.142702082522507e-06, "loss": 0.4807, "step": 3616 }, { "epoch": 0.2937545683424023, "grad_norm": 5.613891388134908, "learning_rate": 4.142206296865904e-06, "loss": 0.5229, "step": 3617 }, { "epoch": 0.2938357833184439, "grad_norm": 3.159086352727405, "learning_rate": 4.141710397577315e-06, "loss": 0.6718, "step": 3618 }, { "epoch": 0.2939169982944855, "grad_norm": 3.5367459467799858, "learning_rate": 4.141214384691056e-06, "loss": 0.5547, "step": 3619 }, { "epoch": 0.2939982132705271, "grad_norm": 7.18465729456046, "learning_rate": 4.1407182582414476e-06, "loss": 0.5301, "step": 3620 }, { "epoch": 0.2940794282465687, "grad_norm": 5.6826492806068405, "learning_rate": 4.140222018262818e-06, "loss": 0.6391, "step": 3621 }, { "epoch": 0.29416064322261026, "grad_norm": 3.357835141229998, "learning_rate": 4.139725664789507e-06, "loss": 0.7172, "step": 3622 }, { "epoch": 0.29424185819865184, "grad_norm": 17.104412542873206, "learning_rate": 4.139229197855857e-06, "loss": 0.441, "step": 3623 }, { "epoch": 0.2943230731746934, "grad_norm": 4.059380496052101, "learning_rate": 4.138732617496223e-06, "loss": 0.5264, "step": 3624 }, { "epoch": 0.294404288150735, "grad_norm": 4.907343820967712, "learning_rate": 4.138235923744964e-06, "loss": 0.4865, "step": 3625 }, { "epoch": 0.29448550312677657, "grad_norm": 4.767491409739287, "learning_rate": 4.13773911663645e-06, "loss": 0.5446, "step": 3626 }, { "epoch": 0.29456671810281815, "grad_norm": 11.043074971749009, "learning_rate": 4.137242196205056e-06, "loss": 0.514, "step": 3627 }, { "epoch": 0.2946479330788597, "grad_norm": 8.751284247365856, "learning_rate": 4.136745162485168e-06, "loss": 0.5782, "step": 3628 }, { "epoch": 0.2947291480549013, "grad_norm": 4.259342421928949, "learning_rate": 4.1362480155111764e-06, "loss": 0.4735, "step": 3629 }, { "epoch": 0.2948103630309429, "grad_norm": 5.982588574242516, "learning_rate": 4.135750755317481e-06, "loss": 0.6233, "step": 3630 }, { "epoch": 0.2948915780069845, "grad_norm": 4.717722006131577, "learning_rate": 4.135253381938492e-06, "loss": 0.6496, "step": 3631 }, { "epoch": 0.2949727929830261, "grad_norm": 3.9506979794735337, "learning_rate": 4.134755895408623e-06, "loss": 0.6076, "step": 3632 }, { "epoch": 0.29505400795906767, "grad_norm": 6.244712141178371, "learning_rate": 4.134258295762297e-06, "loss": 0.5481, "step": 3633 }, { "epoch": 0.29513522293510924, "grad_norm": 5.110391491927889, "learning_rate": 4.1337605830339465e-06, "loss": 0.5417, "step": 3634 }, { "epoch": 0.2952164379111508, "grad_norm": 4.8675201483594615, "learning_rate": 4.133262757258011e-06, "loss": 0.6065, "step": 3635 }, { "epoch": 0.2952976528871924, "grad_norm": 11.498154785191828, "learning_rate": 4.132764818468936e-06, "loss": 0.5392, "step": 3636 }, { "epoch": 0.295378867863234, "grad_norm": 6.065167654609438, "learning_rate": 4.1322667667011774e-06, "loss": 0.6767, "step": 3637 }, { "epoch": 0.29546008283927555, "grad_norm": 9.853858235216926, "learning_rate": 4.131768601989196e-06, "loss": 0.5793, "step": 3638 }, { "epoch": 0.2955412978153171, "grad_norm": 5.19821566952064, "learning_rate": 4.131270324367464e-06, "loss": 0.7266, "step": 3639 }, { "epoch": 0.2956225127913587, "grad_norm": 6.474598040312163, "learning_rate": 4.130771933870459e-06, "loss": 0.6649, "step": 3640 }, { "epoch": 0.2957037277674003, "grad_norm": 6.622824718236689, "learning_rate": 4.130273430532667e-06, "loss": 0.4317, "step": 3641 }, { "epoch": 0.2957849427434419, "grad_norm": 8.781051249556342, "learning_rate": 4.129774814388582e-06, "loss": 0.4864, "step": 3642 }, { "epoch": 0.2958661577194835, "grad_norm": 4.776911195727594, "learning_rate": 4.1292760854727045e-06, "loss": 0.5531, "step": 3643 }, { "epoch": 0.29594737269552507, "grad_norm": 4.828770110545079, "learning_rate": 4.128777243819546e-06, "loss": 0.5435, "step": 3644 }, { "epoch": 0.29602858767156665, "grad_norm": 4.550741300328438, "learning_rate": 4.128278289463621e-06, "loss": 0.4474, "step": 3645 }, { "epoch": 0.2961098026476082, "grad_norm": 3.03442154965047, "learning_rate": 4.127779222439457e-06, "loss": 0.5896, "step": 3646 }, { "epoch": 0.2961910176236498, "grad_norm": 4.7725701865486085, "learning_rate": 4.127280042781585e-06, "loss": 0.6183, "step": 3647 }, { "epoch": 0.2962722325996914, "grad_norm": 5.751607562129402, "learning_rate": 4.126780750524546e-06, "loss": 0.4919, "step": 3648 }, { "epoch": 0.29635344757573295, "grad_norm": 3.5887135989563057, "learning_rate": 4.126281345702889e-06, "loss": 0.5275, "step": 3649 }, { "epoch": 0.29643466255177453, "grad_norm": 6.464495222017677, "learning_rate": 4.125781828351171e-06, "loss": 0.8401, "step": 3650 }, { "epoch": 0.2965158775278161, "grad_norm": 5.980573147403461, "learning_rate": 4.125282198503953e-06, "loss": 0.5954, "step": 3651 }, { "epoch": 0.2965970925038577, "grad_norm": 5.228111544471104, "learning_rate": 4.124782456195809e-06, "loss": 0.5105, "step": 3652 }, { "epoch": 0.2966783074798993, "grad_norm": 7.53149544733575, "learning_rate": 4.124282601461319e-06, "loss": 0.4924, "step": 3653 }, { "epoch": 0.2967595224559409, "grad_norm": 5.556355856990694, "learning_rate": 4.123782634335068e-06, "loss": 0.4124, "step": 3654 }, { "epoch": 0.29684073743198247, "grad_norm": 4.358048092574572, "learning_rate": 4.123282554851654e-06, "loss": 0.5824, "step": 3655 }, { "epoch": 0.29692195240802405, "grad_norm": 3.5786979265887187, "learning_rate": 4.122782363045677e-06, "loss": 0.4748, "step": 3656 }, { "epoch": 0.2970031673840656, "grad_norm": 4.524617536184045, "learning_rate": 4.12228205895175e-06, "loss": 0.4482, "step": 3657 }, { "epoch": 0.2970843823601072, "grad_norm": 6.2371920872247735, "learning_rate": 4.12178164260449e-06, "loss": 0.565, "step": 3658 }, { "epoch": 0.2971655973361488, "grad_norm": 3.54005929501434, "learning_rate": 4.121281114038524e-06, "loss": 0.4706, "step": 3659 }, { "epoch": 0.29724681231219036, "grad_norm": 6.2010580244373905, "learning_rate": 4.120780473288485e-06, "loss": 0.4807, "step": 3660 }, { "epoch": 0.29732802728823193, "grad_norm": 3.39391672989296, "learning_rate": 4.120279720389015e-06, "loss": 0.5279, "step": 3661 }, { "epoch": 0.2974092422642735, "grad_norm": 7.485601794086229, "learning_rate": 4.119778855374763e-06, "loss": 0.6333, "step": 3662 }, { "epoch": 0.2974904572403151, "grad_norm": 4.283931201224972, "learning_rate": 4.1192778782803875e-06, "loss": 0.5886, "step": 3663 }, { "epoch": 0.2975716722163567, "grad_norm": 6.09549459673354, "learning_rate": 4.118776789140551e-06, "loss": 0.4158, "step": 3664 }, { "epoch": 0.2976528871923983, "grad_norm": 3.9183720909856645, "learning_rate": 4.1182755879899305e-06, "loss": 0.5857, "step": 3665 }, { "epoch": 0.2977341021684399, "grad_norm": 4.554155011331954, "learning_rate": 4.117774274863203e-06, "loss": 0.5828, "step": 3666 }, { "epoch": 0.29781531714448145, "grad_norm": 6.026037573474456, "learning_rate": 4.117272849795057e-06, "loss": 0.7132, "step": 3667 }, { "epoch": 0.297896532120523, "grad_norm": 5.369767546922633, "learning_rate": 4.116771312820189e-06, "loss": 0.6505, "step": 3668 }, { "epoch": 0.2979777470965646, "grad_norm": 3.49258490401417, "learning_rate": 4.116269663973304e-06, "loss": 0.537, "step": 3669 }, { "epoch": 0.2980589620726062, "grad_norm": 8.038084077662122, "learning_rate": 4.115767903289112e-06, "loss": 0.6225, "step": 3670 }, { "epoch": 0.29814017704864776, "grad_norm": 10.562283989123358, "learning_rate": 4.115266030802332e-06, "loss": 0.4825, "step": 3671 }, { "epoch": 0.29822139202468934, "grad_norm": 5.676004986730943, "learning_rate": 4.114764046547691e-06, "loss": 0.59, "step": 3672 }, { "epoch": 0.2983026070007309, "grad_norm": 8.432493876901468, "learning_rate": 4.114261950559924e-06, "loss": 0.5298, "step": 3673 }, { "epoch": 0.2983838219767725, "grad_norm": 4.1760389034352015, "learning_rate": 4.113759742873774e-06, "loss": 0.6273, "step": 3674 }, { "epoch": 0.2984650369528141, "grad_norm": 6.694938489409981, "learning_rate": 4.11325742352399e-06, "loss": 0.4842, "step": 3675 }, { "epoch": 0.2985462519288557, "grad_norm": 4.346992428927781, "learning_rate": 4.112754992545331e-06, "loss": 0.5768, "step": 3676 }, { "epoch": 0.2986274669048973, "grad_norm": 4.528058186506179, "learning_rate": 4.112252449972562e-06, "loss": 0.5956, "step": 3677 }, { "epoch": 0.29870868188093885, "grad_norm": 4.764966099793393, "learning_rate": 4.111749795840455e-06, "loss": 0.5005, "step": 3678 }, { "epoch": 0.29878989685698043, "grad_norm": 10.20025692058819, "learning_rate": 4.111247030183793e-06, "loss": 0.5522, "step": 3679 }, { "epoch": 0.298871111833022, "grad_norm": 31.38171024066769, "learning_rate": 4.110744153037363e-06, "loss": 0.505, "step": 3680 }, { "epoch": 0.2989523268090636, "grad_norm": 4.523115084329832, "learning_rate": 4.110241164435964e-06, "loss": 0.4673, "step": 3681 }, { "epoch": 0.29903354178510516, "grad_norm": 3.7531700099140393, "learning_rate": 4.109738064414397e-06, "loss": 0.4812, "step": 3682 }, { "epoch": 0.29911475676114674, "grad_norm": 5.06537300558853, "learning_rate": 4.1092348530074764e-06, "loss": 0.5328, "step": 3683 }, { "epoch": 0.2991959717371883, "grad_norm": 6.176828372115656, "learning_rate": 4.10873153025002e-06, "loss": 0.4383, "step": 3684 }, { "epoch": 0.2992771867132299, "grad_norm": 5.037654343199056, "learning_rate": 4.108228096176856e-06, "loss": 0.6148, "step": 3685 }, { "epoch": 0.2993584016892715, "grad_norm": 6.942767647531231, "learning_rate": 4.10772455082282e-06, "loss": 0.558, "step": 3686 }, { "epoch": 0.2994396166653131, "grad_norm": 4.277595045669092, "learning_rate": 4.107220894222753e-06, "loss": 0.7493, "step": 3687 }, { "epoch": 0.2995208316413547, "grad_norm": 6.128933496939369, "learning_rate": 4.106717126411506e-06, "loss": 0.4093, "step": 3688 }, { "epoch": 0.29960204661739626, "grad_norm": 3.4094618806546713, "learning_rate": 4.106213247423938e-06, "loss": 0.5386, "step": 3689 }, { "epoch": 0.29968326159343783, "grad_norm": 5.429499060264869, "learning_rate": 4.105709257294914e-06, "loss": 0.5606, "step": 3690 }, { "epoch": 0.2997644765694794, "grad_norm": 8.126911051862221, "learning_rate": 4.105205156059307e-06, "loss": 0.4964, "step": 3691 }, { "epoch": 0.299845691545521, "grad_norm": 5.576298258657682, "learning_rate": 4.104700943751999e-06, "loss": 0.5032, "step": 3692 }, { "epoch": 0.29992690652156256, "grad_norm": 5.428256954230044, "learning_rate": 4.104196620407878e-06, "loss": 0.5164, "step": 3693 }, { "epoch": 0.30000812149760414, "grad_norm": 5.490233849085779, "learning_rate": 4.1036921860618415e-06, "loss": 0.4619, "step": 3694 }, { "epoch": 0.3000893364736457, "grad_norm": 3.6578438271498683, "learning_rate": 4.103187640748792e-06, "loss": 0.4709, "step": 3695 }, { "epoch": 0.3001705514496873, "grad_norm": 4.119105065976182, "learning_rate": 4.102682984503644e-06, "loss": 0.5943, "step": 3696 }, { "epoch": 0.3002517664257289, "grad_norm": 4.816227234767905, "learning_rate": 4.102178217361315e-06, "loss": 0.5706, "step": 3697 }, { "epoch": 0.3003329814017705, "grad_norm": 3.586911969028518, "learning_rate": 4.101673339356733e-06, "loss": 0.4774, "step": 3698 }, { "epoch": 0.3004141963778121, "grad_norm": 9.66099144536593, "learning_rate": 4.101168350524832e-06, "loss": 0.4695, "step": 3699 }, { "epoch": 0.30049541135385366, "grad_norm": 4.619743032013704, "learning_rate": 4.100663250900556e-06, "loss": 0.4937, "step": 3700 }, { "epoch": 0.30057662632989524, "grad_norm": 5.0528198312806705, "learning_rate": 4.100158040518854e-06, "loss": 0.4896, "step": 3701 }, { "epoch": 0.3006578413059368, "grad_norm": 6.192798320269057, "learning_rate": 4.099652719414684e-06, "loss": 0.4393, "step": 3702 }, { "epoch": 0.3007390562819784, "grad_norm": 4.544818358602807, "learning_rate": 4.099147287623012e-06, "loss": 0.4435, "step": 3703 }, { "epoch": 0.30082027125801997, "grad_norm": 3.843568388195506, "learning_rate": 4.098641745178812e-06, "loss": 0.5758, "step": 3704 }, { "epoch": 0.30090148623406154, "grad_norm": 7.032873685349521, "learning_rate": 4.098136092117063e-06, "loss": 0.5837, "step": 3705 }, { "epoch": 0.3009827012101031, "grad_norm": 4.039586791406998, "learning_rate": 4.097630328472755e-06, "loss": 0.4516, "step": 3706 }, { "epoch": 0.3010639161861447, "grad_norm": 5.893186542446594, "learning_rate": 4.097124454280883e-06, "loss": 0.4981, "step": 3707 }, { "epoch": 0.30114513116218633, "grad_norm": 4.234572464495813, "learning_rate": 4.096618469576451e-06, "loss": 0.721, "step": 3708 }, { "epoch": 0.3012263461382279, "grad_norm": 4.138584167113376, "learning_rate": 4.0961123743944715e-06, "loss": 0.4969, "step": 3709 }, { "epoch": 0.3013075611142695, "grad_norm": 4.733715325450099, "learning_rate": 4.095606168769964e-06, "loss": 0.65, "step": 3710 }, { "epoch": 0.30138877609031106, "grad_norm": 5.217622232509772, "learning_rate": 4.095099852737953e-06, "loss": 0.6026, "step": 3711 }, { "epoch": 0.30146999106635264, "grad_norm": 6.573735753701009, "learning_rate": 4.094593426333474e-06, "loss": 0.6201, "step": 3712 }, { "epoch": 0.3015512060423942, "grad_norm": 7.983639715927653, "learning_rate": 4.09408688959157e-06, "loss": 0.6581, "step": 3713 }, { "epoch": 0.3016324210184358, "grad_norm": 5.652732617432702, "learning_rate": 4.093580242547289e-06, "loss": 0.7463, "step": 3714 }, { "epoch": 0.30171363599447737, "grad_norm": 5.775875513388475, "learning_rate": 4.09307348523569e-06, "loss": 0.4833, "step": 3715 }, { "epoch": 0.30179485097051895, "grad_norm": 5.373772908608717, "learning_rate": 4.092566617691837e-06, "loss": 0.4648, "step": 3716 }, { "epoch": 0.3018760659465605, "grad_norm": 4.681484734835625, "learning_rate": 4.092059639950802e-06, "loss": 0.543, "step": 3717 }, { "epoch": 0.3019572809226021, "grad_norm": 3.293957563526287, "learning_rate": 4.0915525520476665e-06, "loss": 0.7871, "step": 3718 }, { "epoch": 0.30203849589864373, "grad_norm": 9.736689094555402, "learning_rate": 4.091045354017517e-06, "loss": 0.54, "step": 3719 }, { "epoch": 0.3021197108746853, "grad_norm": 7.004070985864832, "learning_rate": 4.090538045895449e-06, "loss": 0.4686, "step": 3720 }, { "epoch": 0.3022009258507269, "grad_norm": 6.304287102750301, "learning_rate": 4.090030627716567e-06, "loss": 0.6621, "step": 3721 }, { "epoch": 0.30228214082676846, "grad_norm": 5.678988794221106, "learning_rate": 4.08952309951598e-06, "loss": 0.6359, "step": 3722 }, { "epoch": 0.30236335580281004, "grad_norm": 7.331654562862624, "learning_rate": 4.0890154613288066e-06, "loss": 0.6547, "step": 3723 }, { "epoch": 0.3024445707788516, "grad_norm": 3.7966525875405277, "learning_rate": 4.088507713190174e-06, "loss": 0.5675, "step": 3724 }, { "epoch": 0.3025257857548932, "grad_norm": 4.688025131548344, "learning_rate": 4.087999855135215e-06, "loss": 0.6325, "step": 3725 }, { "epoch": 0.30260700073093477, "grad_norm": 4.823758976025031, "learning_rate": 4.087491887199069e-06, "loss": 0.5405, "step": 3726 }, { "epoch": 0.30268821570697635, "grad_norm": 3.9620946246180884, "learning_rate": 4.086983809416887e-06, "loss": 0.4993, "step": 3727 }, { "epoch": 0.3027694306830179, "grad_norm": 4.5032403641906305, "learning_rate": 4.086475621823824e-06, "loss": 0.4079, "step": 3728 }, { "epoch": 0.3028506456590595, "grad_norm": 20.477342042531873, "learning_rate": 4.085967324455045e-06, "loss": 0.5201, "step": 3729 }, { "epoch": 0.30293186063510114, "grad_norm": 6.231510482092364, "learning_rate": 4.085458917345721e-06, "loss": 0.7878, "step": 3730 }, { "epoch": 0.3030130756111427, "grad_norm": 4.154775199567728, "learning_rate": 4.084950400531029e-06, "loss": 0.5936, "step": 3731 }, { "epoch": 0.3030942905871843, "grad_norm": 4.694151372288833, "learning_rate": 4.0844417740461586e-06, "loss": 0.4481, "step": 3732 }, { "epoch": 0.30317550556322587, "grad_norm": 4.709650246764357, "learning_rate": 4.083933037926303e-06, "loss": 0.6051, "step": 3733 }, { "epoch": 0.30325672053926744, "grad_norm": 5.029331738383321, "learning_rate": 4.0834241922066644e-06, "loss": 0.5312, "step": 3734 }, { "epoch": 0.303337935515309, "grad_norm": 4.293546250505703, "learning_rate": 4.082915236922451e-06, "loss": 0.5698, "step": 3735 }, { "epoch": 0.3034191504913506, "grad_norm": 3.9815078810069218, "learning_rate": 4.082406172108882e-06, "loss": 0.6138, "step": 3736 }, { "epoch": 0.3035003654673922, "grad_norm": 8.25343748834109, "learning_rate": 4.0818969978011795e-06, "loss": 0.5962, "step": 3737 }, { "epoch": 0.30358158044343375, "grad_norm": 8.293232016506174, "learning_rate": 4.081387714034577e-06, "loss": 0.5964, "step": 3738 }, { "epoch": 0.30366279541947533, "grad_norm": 9.876486919775553, "learning_rate": 4.080878320844315e-06, "loss": 0.5288, "step": 3739 }, { "epoch": 0.3037440103955169, "grad_norm": 4.90237518330183, "learning_rate": 4.080368818265639e-06, "loss": 0.542, "step": 3740 }, { "epoch": 0.30382522537155854, "grad_norm": 5.433619699450851, "learning_rate": 4.079859206333805e-06, "loss": 0.5346, "step": 3741 }, { "epoch": 0.3039064403476001, "grad_norm": 6.417127497404236, "learning_rate": 4.079349485084074e-06, "loss": 0.5068, "step": 3742 }, { "epoch": 0.3039876553236417, "grad_norm": 9.395212073819222, "learning_rate": 4.078839654551718e-06, "loss": 0.5981, "step": 3743 }, { "epoch": 0.30406887029968327, "grad_norm": 6.080896733160445, "learning_rate": 4.078329714772015e-06, "loss": 0.5716, "step": 3744 }, { "epoch": 0.30415008527572485, "grad_norm": 3.9795202208766463, "learning_rate": 4.0778196657802484e-06, "loss": 0.589, "step": 3745 }, { "epoch": 0.3042313002517664, "grad_norm": 5.978058004337624, "learning_rate": 4.077309507611711e-06, "loss": 0.559, "step": 3746 }, { "epoch": 0.304312515227808, "grad_norm": 10.73728162343595, "learning_rate": 4.076799240301703e-06, "loss": 0.5419, "step": 3747 }, { "epoch": 0.3043937302038496, "grad_norm": 5.705981837585083, "learning_rate": 4.076288863885533e-06, "loss": 0.5277, "step": 3748 }, { "epoch": 0.30447494517989115, "grad_norm": 4.148490626037343, "learning_rate": 4.0757783783985164e-06, "loss": 0.6104, "step": 3749 }, { "epoch": 0.30455616015593273, "grad_norm": 7.3071974055301006, "learning_rate": 4.0752677838759755e-06, "loss": 0.6195, "step": 3750 }, { "epoch": 0.3046373751319743, "grad_norm": 3.536531811654719, "learning_rate": 4.074757080353241e-06, "loss": 0.6436, "step": 3751 }, { "epoch": 0.30471859010801594, "grad_norm": 4.25870092662668, "learning_rate": 4.074246267865652e-06, "loss": 0.6274, "step": 3752 }, { "epoch": 0.3047998050840575, "grad_norm": 3.6079741565893895, "learning_rate": 4.073735346448551e-06, "loss": 0.6034, "step": 3753 }, { "epoch": 0.3048810200600991, "grad_norm": 7.935117748318247, "learning_rate": 4.073224316137293e-06, "loss": 0.5586, "step": 3754 }, { "epoch": 0.30496223503614067, "grad_norm": 6.813447251757785, "learning_rate": 4.072713176967239e-06, "loss": 0.5641, "step": 3755 }, { "epoch": 0.30504345001218225, "grad_norm": 6.715619033709069, "learning_rate": 4.072201928973757e-06, "loss": 0.5836, "step": 3756 }, { "epoch": 0.3051246649882238, "grad_norm": 3.5300282565737944, "learning_rate": 4.071690572192222e-06, "loss": 0.6304, "step": 3757 }, { "epoch": 0.3052058799642654, "grad_norm": 9.284044398768938, "learning_rate": 4.071179106658017e-06, "loss": 0.5776, "step": 3758 }, { "epoch": 0.305287094940307, "grad_norm": 6.469344755338421, "learning_rate": 4.070667532406534e-06, "loss": 0.518, "step": 3759 }, { "epoch": 0.30536830991634856, "grad_norm": 9.204050268328828, "learning_rate": 4.070155849473169e-06, "loss": 0.4228, "step": 3760 }, { "epoch": 0.30544952489239013, "grad_norm": 7.33932480094826, "learning_rate": 4.06964405789333e-06, "loss": 0.5276, "step": 3761 }, { "epoch": 0.3055307398684317, "grad_norm": 4.5551115854633695, "learning_rate": 4.06913215770243e-06, "loss": 0.5275, "step": 3762 }, { "epoch": 0.30561195484447334, "grad_norm": 7.628505640253093, "learning_rate": 4.068620148935889e-06, "loss": 0.5706, "step": 3763 }, { "epoch": 0.3056931698205149, "grad_norm": 7.296776756806712, "learning_rate": 4.0681080316291355e-06, "loss": 0.4332, "step": 3764 }, { "epoch": 0.3057743847965565, "grad_norm": 6.211933336592377, "learning_rate": 4.067595805817604e-06, "loss": 0.5385, "step": 3765 }, { "epoch": 0.3058555997725981, "grad_norm": 5.2682390368881125, "learning_rate": 4.0670834715367405e-06, "loss": 0.6172, "step": 3766 }, { "epoch": 0.30593681474863965, "grad_norm": 5.7266015459873465, "learning_rate": 4.066571028821994e-06, "loss": 0.663, "step": 3767 }, { "epoch": 0.30601802972468123, "grad_norm": 4.566744030336992, "learning_rate": 4.066058477708824e-06, "loss": 0.6204, "step": 3768 }, { "epoch": 0.3060992447007228, "grad_norm": 5.241528003933075, "learning_rate": 4.065545818232695e-06, "loss": 0.5374, "step": 3769 }, { "epoch": 0.3061804596767644, "grad_norm": 4.210043089768735, "learning_rate": 4.06503305042908e-06, "loss": 0.5627, "step": 3770 }, { "epoch": 0.30626167465280596, "grad_norm": 24.358962625710497, "learning_rate": 4.064520174333462e-06, "loss": 0.4803, "step": 3771 }, { "epoch": 0.30634288962884754, "grad_norm": 5.001904028877196, "learning_rate": 4.0640071899813284e-06, "loss": 0.5884, "step": 3772 }, { "epoch": 0.30642410460488917, "grad_norm": 3.685895651696086, "learning_rate": 4.0634940974081735e-06, "loss": 0.5976, "step": 3773 }, { "epoch": 0.30650531958093075, "grad_norm": 4.690707174624639, "learning_rate": 4.062980896649502e-06, "loss": 0.4432, "step": 3774 }, { "epoch": 0.3065865345569723, "grad_norm": 7.66057017392417, "learning_rate": 4.062467587740825e-06, "loss": 0.5297, "step": 3775 }, { "epoch": 0.3066677495330139, "grad_norm": 4.80151641086556, "learning_rate": 4.0619541707176595e-06, "loss": 0.5717, "step": 3776 }, { "epoch": 0.3067489645090555, "grad_norm": 5.073279099762993, "learning_rate": 4.061440645615532e-06, "loss": 0.4853, "step": 3777 }, { "epoch": 0.30683017948509705, "grad_norm": 6.406852301630108, "learning_rate": 4.060927012469976e-06, "loss": 0.5093, "step": 3778 }, { "epoch": 0.30691139446113863, "grad_norm": 3.919683009512748, "learning_rate": 4.060413271316531e-06, "loss": 0.4647, "step": 3779 }, { "epoch": 0.3069926094371802, "grad_norm": 4.810874854512679, "learning_rate": 4.059899422190747e-06, "loss": 0.3816, "step": 3780 }, { "epoch": 0.3070738244132218, "grad_norm": 7.470318516960077, "learning_rate": 4.059385465128179e-06, "loss": 0.4818, "step": 3781 }, { "epoch": 0.30715503938926336, "grad_norm": 4.142497626949441, "learning_rate": 4.058871400164388e-06, "loss": 0.6359, "step": 3782 }, { "epoch": 0.30723625436530494, "grad_norm": 4.107423918701493, "learning_rate": 4.058357227334947e-06, "loss": 0.5755, "step": 3783 }, { "epoch": 0.30731746934134657, "grad_norm": 7.190290906025693, "learning_rate": 4.057842946675434e-06, "loss": 0.4618, "step": 3784 }, { "epoch": 0.30739868431738815, "grad_norm": 6.779887515845196, "learning_rate": 4.057328558221434e-06, "loss": 0.4605, "step": 3785 }, { "epoch": 0.3074798992934297, "grad_norm": 6.8626494300279495, "learning_rate": 4.056814062008539e-06, "loss": 0.4735, "step": 3786 }, { "epoch": 0.3075611142694713, "grad_norm": 4.43663040407582, "learning_rate": 4.056299458072351e-06, "loss": 0.5549, "step": 3787 }, { "epoch": 0.3076423292455129, "grad_norm": 9.927091358251289, "learning_rate": 4.0557847464484766e-06, "loss": 0.6012, "step": 3788 }, { "epoch": 0.30772354422155446, "grad_norm": 3.3047332458402874, "learning_rate": 4.055269927172532e-06, "loss": 0.5754, "step": 3789 }, { "epoch": 0.30780475919759603, "grad_norm": 4.086984576275844, "learning_rate": 4.054755000280139e-06, "loss": 0.6482, "step": 3790 }, { "epoch": 0.3078859741736376, "grad_norm": 10.124343594059361, "learning_rate": 4.054239965806929e-06, "loss": 0.6541, "step": 3791 }, { "epoch": 0.3079671891496792, "grad_norm": 9.088285597560201, "learning_rate": 4.053724823788538e-06, "loss": 0.5313, "step": 3792 }, { "epoch": 0.30804840412572077, "grad_norm": 10.64301926225871, "learning_rate": 4.053209574260614e-06, "loss": 0.5672, "step": 3793 }, { "epoch": 0.30812961910176234, "grad_norm": 4.299438598485095, "learning_rate": 4.052694217258806e-06, "loss": 0.4908, "step": 3794 }, { "epoch": 0.308210834077804, "grad_norm": 50.64834820262253, "learning_rate": 4.052178752818776e-06, "loss": 0.4808, "step": 3795 }, { "epoch": 0.30829204905384555, "grad_norm": 3.318247558859262, "learning_rate": 4.051663180976192e-06, "loss": 0.6804, "step": 3796 }, { "epoch": 0.30837326402988713, "grad_norm": 15.92488834517386, "learning_rate": 4.051147501766727e-06, "loss": 0.5227, "step": 3797 }, { "epoch": 0.3084544790059287, "grad_norm": 3.980666622212895, "learning_rate": 4.050631715226064e-06, "loss": 0.5733, "step": 3798 }, { "epoch": 0.3085356939819703, "grad_norm": 5.924342358250464, "learning_rate": 4.050115821389894e-06, "loss": 0.4722, "step": 3799 }, { "epoch": 0.30861690895801186, "grad_norm": 6.826343924107241, "learning_rate": 4.049599820293913e-06, "loss": 0.5608, "step": 3800 }, { "epoch": 0.30869812393405344, "grad_norm": 5.798254498370531, "learning_rate": 4.049083711973824e-06, "loss": 0.482, "step": 3801 }, { "epoch": 0.308779338910095, "grad_norm": 4.3433768010776905, "learning_rate": 4.0485674964653424e-06, "loss": 0.4834, "step": 3802 }, { "epoch": 0.3088605538861366, "grad_norm": 4.326553027616668, "learning_rate": 4.048051173804185e-06, "loss": 0.5556, "step": 3803 }, { "epoch": 0.30894176886217817, "grad_norm": 6.800606044283711, "learning_rate": 4.047534744026079e-06, "loss": 0.6, "step": 3804 }, { "epoch": 0.30902298383821974, "grad_norm": 10.590880966190726, "learning_rate": 4.04701820716676e-06, "loss": 0.4796, "step": 3805 }, { "epoch": 0.3091041988142614, "grad_norm": 5.6795091144428635, "learning_rate": 4.046501563261968e-06, "loss": 0.5129, "step": 3806 }, { "epoch": 0.30918541379030295, "grad_norm": 3.210645271484872, "learning_rate": 4.045984812347452e-06, "loss": 0.5332, "step": 3807 }, { "epoch": 0.30926662876634453, "grad_norm": 6.294410231720947, "learning_rate": 4.045467954458969e-06, "loss": 0.6181, "step": 3808 }, { "epoch": 0.3093478437423861, "grad_norm": 4.474575703083307, "learning_rate": 4.044950989632283e-06, "loss": 0.4792, "step": 3809 }, { "epoch": 0.3094290587184277, "grad_norm": 4.779056570525405, "learning_rate": 4.044433917903166e-06, "loss": 0.5667, "step": 3810 }, { "epoch": 0.30951027369446926, "grad_norm": 4.782139569666095, "learning_rate": 4.043916739307394e-06, "loss": 0.7153, "step": 3811 }, { "epoch": 0.30959148867051084, "grad_norm": 6.550010939029318, "learning_rate": 4.0433994538807564e-06, "loss": 0.5078, "step": 3812 }, { "epoch": 0.3096727036465524, "grad_norm": 7.70154315773568, "learning_rate": 4.042882061659043e-06, "loss": 0.5411, "step": 3813 }, { "epoch": 0.309753918622594, "grad_norm": 5.468435592136346, "learning_rate": 4.042364562678059e-06, "loss": 0.6321, "step": 3814 }, { "epoch": 0.30983513359863557, "grad_norm": 4.143944075441312, "learning_rate": 4.041846956973608e-06, "loss": 0.46, "step": 3815 }, { "epoch": 0.30991634857467715, "grad_norm": 4.798354632196031, "learning_rate": 4.041329244581509e-06, "loss": 0.4921, "step": 3816 }, { "epoch": 0.3099975635507188, "grad_norm": 5.155312927681158, "learning_rate": 4.040811425537583e-06, "loss": 0.6239, "step": 3817 }, { "epoch": 0.31007877852676036, "grad_norm": 3.701165442814568, "learning_rate": 4.040293499877661e-06, "loss": 0.6161, "step": 3818 }, { "epoch": 0.31015999350280193, "grad_norm": 5.396166406448343, "learning_rate": 4.039775467637581e-06, "loss": 0.4338, "step": 3819 }, { "epoch": 0.3102412084788435, "grad_norm": 4.72888620508123, "learning_rate": 4.039257328853188e-06, "loss": 0.7018, "step": 3820 }, { "epoch": 0.3103224234548851, "grad_norm": 4.775493593228451, "learning_rate": 4.038739083560334e-06, "loss": 0.526, "step": 3821 }, { "epoch": 0.31040363843092666, "grad_norm": 4.103968235635027, "learning_rate": 4.038220731794878e-06, "loss": 0.6318, "step": 3822 }, { "epoch": 0.31048485340696824, "grad_norm": 8.31477225841899, "learning_rate": 4.03770227359269e-06, "loss": 0.5303, "step": 3823 }, { "epoch": 0.3105660683830098, "grad_norm": 3.8725355108338766, "learning_rate": 4.037183708989642e-06, "loss": 0.4775, "step": 3824 }, { "epoch": 0.3106472833590514, "grad_norm": 7.614229033887325, "learning_rate": 4.0366650380216165e-06, "loss": 0.4905, "step": 3825 }, { "epoch": 0.310728498335093, "grad_norm": 5.153358446059254, "learning_rate": 4.036146260724503e-06, "loss": 0.5249, "step": 3826 }, { "epoch": 0.31080971331113455, "grad_norm": 8.578326851480215, "learning_rate": 4.0356273771341984e-06, "loss": 0.6979, "step": 3827 }, { "epoch": 0.3108909282871762, "grad_norm": 5.785574362196001, "learning_rate": 4.035108387286607e-06, "loss": 0.4557, "step": 3828 }, { "epoch": 0.31097214326321776, "grad_norm": 3.55379778886365, "learning_rate": 4.03458929121764e-06, "loss": 0.6176, "step": 3829 }, { "epoch": 0.31105335823925934, "grad_norm": 7.612980594268243, "learning_rate": 4.0340700889632145e-06, "loss": 0.7673, "step": 3830 }, { "epoch": 0.3111345732153009, "grad_norm": 9.252507294094109, "learning_rate": 4.033550780559258e-06, "loss": 0.5036, "step": 3831 }, { "epoch": 0.3112157881913425, "grad_norm": 3.594686390622405, "learning_rate": 4.033031366041704e-06, "loss": 0.4821, "step": 3832 }, { "epoch": 0.31129700316738407, "grad_norm": 6.285694736961024, "learning_rate": 4.0325118454464935e-06, "loss": 0.5007, "step": 3833 }, { "epoch": 0.31137821814342564, "grad_norm": 3.160762643660874, "learning_rate": 4.031992218809573e-06, "loss": 0.5703, "step": 3834 }, { "epoch": 0.3114594331194672, "grad_norm": 8.890289793696613, "learning_rate": 4.0314724861669e-06, "loss": 0.5602, "step": 3835 }, { "epoch": 0.3115406480955088, "grad_norm": 4.321571181434268, "learning_rate": 4.0309526475544355e-06, "loss": 0.6061, "step": 3836 }, { "epoch": 0.3116218630715504, "grad_norm": 7.365273193938057, "learning_rate": 4.03043270300815e-06, "loss": 0.5242, "step": 3837 }, { "epoch": 0.31170307804759195, "grad_norm": 6.152202896973093, "learning_rate": 4.029912652564022e-06, "loss": 0.527, "step": 3838 }, { "epoch": 0.3117842930236336, "grad_norm": 9.939281394272458, "learning_rate": 4.029392496258035e-06, "loss": 0.6952, "step": 3839 }, { "epoch": 0.31186550799967516, "grad_norm": 7.9876078180108685, "learning_rate": 4.028872234126181e-06, "loss": 0.4901, "step": 3840 }, { "epoch": 0.31194672297571674, "grad_norm": 4.522557141556952, "learning_rate": 4.02835186620446e-06, "loss": 0.4127, "step": 3841 }, { "epoch": 0.3120279379517583, "grad_norm": 5.5314558917789505, "learning_rate": 4.027831392528879e-06, "loss": 0.5045, "step": 3842 }, { "epoch": 0.3121091529277999, "grad_norm": 6.372253503387104, "learning_rate": 4.027310813135451e-06, "loss": 0.5654, "step": 3843 }, { "epoch": 0.31219036790384147, "grad_norm": 3.5189797244740446, "learning_rate": 4.0267901280601985e-06, "loss": 0.4954, "step": 3844 }, { "epoch": 0.31227158287988305, "grad_norm": 6.001913797204444, "learning_rate": 4.026269337339149e-06, "loss": 0.5535, "step": 3845 }, { "epoch": 0.3123527978559246, "grad_norm": 6.805227975140487, "learning_rate": 4.025748441008339e-06, "loss": 0.5175, "step": 3846 }, { "epoch": 0.3124340128319662, "grad_norm": 3.6933697953285423, "learning_rate": 4.0252274391038125e-06, "loss": 0.6538, "step": 3847 }, { "epoch": 0.3125152278080078, "grad_norm": 4.975606249432986, "learning_rate": 4.024706331661618e-06, "loss": 0.6194, "step": 3848 }, { "epoch": 0.31259644278404936, "grad_norm": 21.413001819693548, "learning_rate": 4.024185118717816e-06, "loss": 0.5108, "step": 3849 }, { "epoch": 0.312677657760091, "grad_norm": 4.688146949121658, "learning_rate": 4.023663800308471e-06, "loss": 0.5959, "step": 3850 }, { "epoch": 0.31275887273613256, "grad_norm": 4.929413723785714, "learning_rate": 4.023142376469653e-06, "loss": 0.4974, "step": 3851 }, { "epoch": 0.31284008771217414, "grad_norm": 7.165568666050521, "learning_rate": 4.022620847237445e-06, "loss": 0.5106, "step": 3852 }, { "epoch": 0.3129213026882157, "grad_norm": 5.314358756089364, "learning_rate": 4.022099212647933e-06, "loss": 0.6104, "step": 3853 }, { "epoch": 0.3130025176642573, "grad_norm": 4.330125714880257, "learning_rate": 4.021577472737209e-06, "loss": 0.587, "step": 3854 }, { "epoch": 0.3130837326402989, "grad_norm": 4.654860553507325, "learning_rate": 4.021055627541379e-06, "loss": 0.4938, "step": 3855 }, { "epoch": 0.31316494761634045, "grad_norm": 5.006258634790825, "learning_rate": 4.020533677096549e-06, "loss": 0.5747, "step": 3856 }, { "epoch": 0.313246162592382, "grad_norm": 6.0280062807614545, "learning_rate": 4.020011621438836e-06, "loss": 0.5764, "step": 3857 }, { "epoch": 0.3133273775684236, "grad_norm": 6.574325175703227, "learning_rate": 4.019489460604364e-06, "loss": 0.5299, "step": 3858 }, { "epoch": 0.3134085925444652, "grad_norm": 4.080793981103117, "learning_rate": 4.018967194629261e-06, "loss": 0.7868, "step": 3859 }, { "epoch": 0.31348980752050676, "grad_norm": 3.847772634998914, "learning_rate": 4.0184448235496685e-06, "loss": 0.6058, "step": 3860 }, { "epoch": 0.3135710224965484, "grad_norm": 4.404444873697644, "learning_rate": 4.017922347401731e-06, "loss": 0.523, "step": 3861 }, { "epoch": 0.31365223747258997, "grad_norm": 9.45105154996585, "learning_rate": 4.017399766221599e-06, "loss": 0.4319, "step": 3862 }, { "epoch": 0.31373345244863154, "grad_norm": 4.265430944389896, "learning_rate": 4.016877080045435e-06, "loss": 0.4388, "step": 3863 }, { "epoch": 0.3138146674246731, "grad_norm": 4.626292776222082, "learning_rate": 4.016354288909405e-06, "loss": 0.4834, "step": 3864 }, { "epoch": 0.3138958824007147, "grad_norm": 3.893386508353787, "learning_rate": 4.0158313928496826e-06, "loss": 0.5888, "step": 3865 }, { "epoch": 0.3139770973767563, "grad_norm": 5.310652622885137, "learning_rate": 4.015308391902452e-06, "loss": 0.5323, "step": 3866 }, { "epoch": 0.31405831235279785, "grad_norm": 6.885983808946358, "learning_rate": 4.014785286103898e-06, "loss": 0.5397, "step": 3867 }, { "epoch": 0.31413952732883943, "grad_norm": 7.793035367661995, "learning_rate": 4.014262075490221e-06, "loss": 0.4684, "step": 3868 }, { "epoch": 0.314220742304881, "grad_norm": 14.532888308919345, "learning_rate": 4.013738760097622e-06, "loss": 0.5751, "step": 3869 }, { "epoch": 0.3143019572809226, "grad_norm": 6.316239281714061, "learning_rate": 4.0132153399623106e-06, "loss": 0.5754, "step": 3870 }, { "epoch": 0.31438317225696416, "grad_norm": 5.218130390396817, "learning_rate": 4.012691815120508e-06, "loss": 0.5718, "step": 3871 }, { "epoch": 0.3144643872330058, "grad_norm": 10.486211867182627, "learning_rate": 4.012168185608437e-06, "loss": 0.4341, "step": 3872 }, { "epoch": 0.31454560220904737, "grad_norm": 13.04476716032136, "learning_rate": 4.011644451462331e-06, "loss": 0.6967, "step": 3873 }, { "epoch": 0.31462681718508895, "grad_norm": 4.374789125818755, "learning_rate": 4.011120612718429e-06, "loss": 0.5778, "step": 3874 }, { "epoch": 0.3147080321611305, "grad_norm": 4.427126765436375, "learning_rate": 4.010596669412978e-06, "loss": 0.5147, "step": 3875 }, { "epoch": 0.3147892471371721, "grad_norm": 6.121898739230059, "learning_rate": 4.010072621582233e-06, "loss": 0.4541, "step": 3876 }, { "epoch": 0.3148704621132137, "grad_norm": 8.191495131008242, "learning_rate": 4.009548469262453e-06, "loss": 0.7081, "step": 3877 }, { "epoch": 0.31495167708925526, "grad_norm": 5.939103893735796, "learning_rate": 4.009024212489909e-06, "loss": 0.588, "step": 3878 }, { "epoch": 0.31503289206529683, "grad_norm": 5.468809556850152, "learning_rate": 4.0084998513008765e-06, "loss": 0.5442, "step": 3879 }, { "epoch": 0.3151141070413384, "grad_norm": 3.760788030695487, "learning_rate": 4.007975385731637e-06, "loss": 0.771, "step": 3880 }, { "epoch": 0.31519532201738, "grad_norm": 4.150500328342237, "learning_rate": 4.007450815818481e-06, "loss": 0.5091, "step": 3881 }, { "epoch": 0.31527653699342156, "grad_norm": 4.7154167246286285, "learning_rate": 4.0069261415977075e-06, "loss": 0.5819, "step": 3882 }, { "epoch": 0.3153577519694632, "grad_norm": 4.032822924231925, "learning_rate": 4.006401363105621e-06, "loss": 0.5721, "step": 3883 }, { "epoch": 0.3154389669455048, "grad_norm": 4.862866759718993, "learning_rate": 4.0058764803785325e-06, "loss": 0.6929, "step": 3884 }, { "epoch": 0.31552018192154635, "grad_norm": 3.918915992085951, "learning_rate": 4.00535149345276e-06, "loss": 0.5521, "step": 3885 }, { "epoch": 0.3156013968975879, "grad_norm": 5.105255048171445, "learning_rate": 4.0048264023646325e-06, "loss": 0.5472, "step": 3886 }, { "epoch": 0.3156826118736295, "grad_norm": 8.077151245590299, "learning_rate": 4.004301207150482e-06, "loss": 0.4525, "step": 3887 }, { "epoch": 0.3157638268496711, "grad_norm": 8.364488519427972, "learning_rate": 4.003775907846648e-06, "loss": 0.566, "step": 3888 }, { "epoch": 0.31584504182571266, "grad_norm": 4.65775942487372, "learning_rate": 4.003250504489481e-06, "loss": 0.4639, "step": 3889 }, { "epoch": 0.31592625680175423, "grad_norm": 10.681842547013567, "learning_rate": 4.002724997115335e-06, "loss": 0.3836, "step": 3890 }, { "epoch": 0.3160074717777958, "grad_norm": 4.910721299022231, "learning_rate": 4.002199385760571e-06, "loss": 0.5612, "step": 3891 }, { "epoch": 0.3160886867538374, "grad_norm": 5.963214780659344, "learning_rate": 4.001673670461561e-06, "loss": 0.6232, "step": 3892 }, { "epoch": 0.31616990172987897, "grad_norm": 6.652157701535889, "learning_rate": 4.0011478512546805e-06, "loss": 0.5664, "step": 3893 }, { "epoch": 0.3162511167059206, "grad_norm": 3.8712273283479233, "learning_rate": 4.000621928176313e-06, "loss": 0.5419, "step": 3894 }, { "epoch": 0.3163323316819622, "grad_norm": 5.721147211228416, "learning_rate": 4.000095901262851e-06, "loss": 0.5002, "step": 3895 }, { "epoch": 0.31641354665800375, "grad_norm": 5.522015478399547, "learning_rate": 3.99956977055069e-06, "loss": 0.5785, "step": 3896 }, { "epoch": 0.31649476163404533, "grad_norm": 12.710524189514066, "learning_rate": 3.999043536076238e-06, "loss": 0.5987, "step": 3897 }, { "epoch": 0.3165759766100869, "grad_norm": 15.432871133292979, "learning_rate": 3.998517197875908e-06, "loss": 0.5758, "step": 3898 }, { "epoch": 0.3166571915861285, "grad_norm": 3.2538000933185622, "learning_rate": 3.997990755986117e-06, "loss": 0.3758, "step": 3899 }, { "epoch": 0.31673840656217006, "grad_norm": 6.504572502780165, "learning_rate": 3.9974642104432945e-06, "loss": 0.6192, "step": 3900 }, { "epoch": 0.31681962153821164, "grad_norm": 5.380343912168226, "learning_rate": 3.996937561283874e-06, "loss": 0.5603, "step": 3901 }, { "epoch": 0.3169008365142532, "grad_norm": 8.424493108304915, "learning_rate": 3.996410808544296e-06, "loss": 0.5061, "step": 3902 }, { "epoch": 0.3169820514902948, "grad_norm": 5.3477146060500225, "learning_rate": 3.99588395226101e-06, "loss": 0.5234, "step": 3903 }, { "epoch": 0.31706326646633637, "grad_norm": 12.609875677912806, "learning_rate": 3.9953569924704715e-06, "loss": 0.5047, "step": 3904 }, { "epoch": 0.317144481442378, "grad_norm": 6.884048976659566, "learning_rate": 3.994829929209143e-06, "loss": 0.5393, "step": 3905 }, { "epoch": 0.3172256964184196, "grad_norm": 5.4396250935337465, "learning_rate": 3.994302762513496e-06, "loss": 0.5701, "step": 3906 }, { "epoch": 0.31730691139446116, "grad_norm": 3.9789051349804905, "learning_rate": 3.993775492420005e-06, "loss": 0.8038, "step": 3907 }, { "epoch": 0.31738812637050273, "grad_norm": 6.0730946208938335, "learning_rate": 3.993248118965155e-06, "loss": 0.5355, "step": 3908 }, { "epoch": 0.3174693413465443, "grad_norm": 7.073767654590764, "learning_rate": 3.992720642185439e-06, "loss": 0.4949, "step": 3909 }, { "epoch": 0.3175505563225859, "grad_norm": 6.028367836015118, "learning_rate": 3.992193062117354e-06, "loss": 0.5065, "step": 3910 }, { "epoch": 0.31763177129862746, "grad_norm": 5.478458346134022, "learning_rate": 3.991665378797408e-06, "loss": 0.6328, "step": 3911 }, { "epoch": 0.31771298627466904, "grad_norm": 5.80032080288049, "learning_rate": 3.991137592262111e-06, "loss": 0.3775, "step": 3912 }, { "epoch": 0.3177942012507106, "grad_norm": 5.364477054479598, "learning_rate": 3.990609702547985e-06, "loss": 0.6169, "step": 3913 }, { "epoch": 0.3178754162267522, "grad_norm": 6.357814340494215, "learning_rate": 3.990081709691556e-06, "loss": 0.5666, "step": 3914 }, { "epoch": 0.31795663120279377, "grad_norm": 3.0964122690765055, "learning_rate": 3.989553613729359e-06, "loss": 0.5291, "step": 3915 }, { "epoch": 0.3180378461788354, "grad_norm": 6.1633146191090615, "learning_rate": 3.989025414697935e-06, "loss": 0.4764, "step": 3916 }, { "epoch": 0.318119061154877, "grad_norm": 5.380367903861603, "learning_rate": 3.988497112633834e-06, "loss": 0.4923, "step": 3917 }, { "epoch": 0.31820027613091856, "grad_norm": 4.635335577517618, "learning_rate": 3.98796870757361e-06, "loss": 0.5211, "step": 3918 }, { "epoch": 0.31828149110696013, "grad_norm": 4.837892560831567, "learning_rate": 3.987440199553826e-06, "loss": 0.5608, "step": 3919 }, { "epoch": 0.3183627060830017, "grad_norm": 5.412511747936819, "learning_rate": 3.986911588611052e-06, "loss": 0.5652, "step": 3920 }, { "epoch": 0.3184439210590433, "grad_norm": 6.754512301450377, "learning_rate": 3.986382874781866e-06, "loss": 0.6123, "step": 3921 }, { "epoch": 0.31852513603508487, "grad_norm": 3.851560698751757, "learning_rate": 3.985854058102851e-06, "loss": 0.4627, "step": 3922 }, { "epoch": 0.31860635101112644, "grad_norm": 9.180563910365933, "learning_rate": 3.9853251386106e-06, "loss": 0.5239, "step": 3923 }, { "epoch": 0.318687565987168, "grad_norm": 6.398627354244881, "learning_rate": 3.9847961163417094e-06, "loss": 0.4989, "step": 3924 }, { "epoch": 0.3187687809632096, "grad_norm": 4.516597706332039, "learning_rate": 3.984266991332787e-06, "loss": 0.5573, "step": 3925 }, { "epoch": 0.3188499959392512, "grad_norm": 6.97958059945626, "learning_rate": 3.9837377636204435e-06, "loss": 0.3659, "step": 3926 }, { "epoch": 0.3189312109152928, "grad_norm": 9.380173505814613, "learning_rate": 3.983208433241298e-06, "loss": 0.5665, "step": 3927 }, { "epoch": 0.3190124258913344, "grad_norm": 5.179839950219316, "learning_rate": 3.98267900023198e-06, "loss": 0.6098, "step": 3928 }, { "epoch": 0.31909364086737596, "grad_norm": 11.243873982204496, "learning_rate": 3.982149464629123e-06, "loss": 0.5793, "step": 3929 }, { "epoch": 0.31917485584341754, "grad_norm": 4.1291560709585164, "learning_rate": 3.981619826469366e-06, "loss": 0.5195, "step": 3930 }, { "epoch": 0.3192560708194591, "grad_norm": 4.373741253760804, "learning_rate": 3.981090085789359e-06, "loss": 0.5506, "step": 3931 }, { "epoch": 0.3193372857955007, "grad_norm": 6.318431118742852, "learning_rate": 3.980560242625756e-06, "loss": 0.6606, "step": 3932 }, { "epoch": 0.31941850077154227, "grad_norm": 4.240677812054353, "learning_rate": 3.9800302970152205e-06, "loss": 0.6272, "step": 3933 }, { "epoch": 0.31949971574758385, "grad_norm": 5.672962140852729, "learning_rate": 3.9795002489944216e-06, "loss": 0.6, "step": 3934 }, { "epoch": 0.3195809307236254, "grad_norm": 6.714289970142845, "learning_rate": 3.978970098600035e-06, "loss": 0.5626, "step": 3935 }, { "epoch": 0.319662145699667, "grad_norm": 4.761339278563214, "learning_rate": 3.978439845868745e-06, "loss": 0.4723, "step": 3936 }, { "epoch": 0.3197433606757086, "grad_norm": 13.677606899149971, "learning_rate": 3.977909490837242e-06, "loss": 0.5259, "step": 3937 }, { "epoch": 0.3198245756517502, "grad_norm": 6.128118023978932, "learning_rate": 3.977379033542225e-06, "loss": 0.438, "step": 3938 }, { "epoch": 0.3199057906277918, "grad_norm": 6.303253700795032, "learning_rate": 3.976848474020397e-06, "loss": 0.4292, "step": 3939 }, { "epoch": 0.31998700560383336, "grad_norm": 10.396013099121385, "learning_rate": 3.97631781230847e-06, "loss": 0.635, "step": 3940 }, { "epoch": 0.32006822057987494, "grad_norm": 3.2724038031097575, "learning_rate": 3.975787048443165e-06, "loss": 0.7294, "step": 3941 }, { "epoch": 0.3201494355559165, "grad_norm": 5.0292671115469645, "learning_rate": 3.975256182461206e-06, "loss": 0.5492, "step": 3942 }, { "epoch": 0.3202306505319581, "grad_norm": 15.222409035535888, "learning_rate": 3.9747252143993265e-06, "loss": 0.5846, "step": 3943 }, { "epoch": 0.32031186550799967, "grad_norm": 3.721523920711308, "learning_rate": 3.9741941442942685e-06, "loss": 0.7537, "step": 3944 }, { "epoch": 0.32039308048404125, "grad_norm": 9.490205761589818, "learning_rate": 3.973662972182777e-06, "loss": 0.4823, "step": 3945 }, { "epoch": 0.3204742954600828, "grad_norm": 5.92729992222592, "learning_rate": 3.973131698101606e-06, "loss": 0.6342, "step": 3946 }, { "epoch": 0.3205555104361244, "grad_norm": 3.889265293501103, "learning_rate": 3.97260032208752e-06, "loss": 0.6895, "step": 3947 }, { "epoch": 0.320636725412166, "grad_norm": 5.718930345280851, "learning_rate": 3.972068844177284e-06, "loss": 0.5711, "step": 3948 }, { "epoch": 0.3207179403882076, "grad_norm": 5.665338399507542, "learning_rate": 3.971537264407674e-06, "loss": 0.4434, "step": 3949 }, { "epoch": 0.3207991553642492, "grad_norm": 5.195191497507738, "learning_rate": 3.971005582815475e-06, "loss": 0.5603, "step": 3950 }, { "epoch": 0.32088037034029077, "grad_norm": 4.595813111552932, "learning_rate": 3.970473799437475e-06, "loss": 0.5229, "step": 3951 }, { "epoch": 0.32096158531633234, "grad_norm": 7.011551829999244, "learning_rate": 3.969941914310469e-06, "loss": 0.5054, "step": 3952 }, { "epoch": 0.3210428002923739, "grad_norm": 5.133064719413326, "learning_rate": 3.969409927471263e-06, "loss": 0.4704, "step": 3953 }, { "epoch": 0.3211240152684155, "grad_norm": 4.606505759316552, "learning_rate": 3.968877838956667e-06, "loss": 0.525, "step": 3954 }, { "epoch": 0.3212052302444571, "grad_norm": 5.298403703345239, "learning_rate": 3.968345648803497e-06, "loss": 0.4454, "step": 3955 }, { "epoch": 0.32128644522049865, "grad_norm": 3.561393835187397, "learning_rate": 3.96781335704858e-06, "loss": 0.5577, "step": 3956 }, { "epoch": 0.32136766019654023, "grad_norm": 4.307683957878129, "learning_rate": 3.967280963728748e-06, "loss": 0.4494, "step": 3957 }, { "epoch": 0.3214488751725818, "grad_norm": 6.747635355041429, "learning_rate": 3.966748468880838e-06, "loss": 0.6197, "step": 3958 }, { "epoch": 0.3215300901486234, "grad_norm": 7.367076215445246, "learning_rate": 3.9662158725416964e-06, "loss": 0.5736, "step": 3959 }, { "epoch": 0.321611305124665, "grad_norm": 4.986542323666267, "learning_rate": 3.965683174748176e-06, "loss": 0.6222, "step": 3960 }, { "epoch": 0.3216925201007066, "grad_norm": 4.343391538341711, "learning_rate": 3.965150375537137e-06, "loss": 0.4032, "step": 3961 }, { "epoch": 0.32177373507674817, "grad_norm": 10.317677958429377, "learning_rate": 3.964617474945447e-06, "loss": 0.5128, "step": 3962 }, { "epoch": 0.32185495005278975, "grad_norm": 5.753574432660448, "learning_rate": 3.9640844730099795e-06, "loss": 0.5081, "step": 3963 }, { "epoch": 0.3219361650288313, "grad_norm": 3.9575067307483045, "learning_rate": 3.963551369767613e-06, "loss": 0.5913, "step": 3964 }, { "epoch": 0.3220173800048729, "grad_norm": 4.7333217124955365, "learning_rate": 3.963018165255239e-06, "loss": 0.5454, "step": 3965 }, { "epoch": 0.3220985949809145, "grad_norm": 4.107636292267094, "learning_rate": 3.962484859509751e-06, "loss": 0.4283, "step": 3966 }, { "epoch": 0.32217980995695605, "grad_norm": 3.710054258246495, "learning_rate": 3.96195145256805e-06, "loss": 0.5421, "step": 3967 }, { "epoch": 0.32226102493299763, "grad_norm": 4.914093323605704, "learning_rate": 3.961417944467046e-06, "loss": 0.624, "step": 3968 }, { "epoch": 0.3223422399090392, "grad_norm": 6.367334808510856, "learning_rate": 3.960884335243655e-06, "loss": 0.557, "step": 3969 }, { "epoch": 0.3224234548850808, "grad_norm": 7.4083421441872455, "learning_rate": 3.9603506249348e-06, "loss": 0.7381, "step": 3970 }, { "epoch": 0.3225046698611224, "grad_norm": 5.365590371657556, "learning_rate": 3.959816813577409e-06, "loss": 0.4419, "step": 3971 }, { "epoch": 0.322585884837164, "grad_norm": 4.032690708340649, "learning_rate": 3.959282901208422e-06, "loss": 0.5859, "step": 3972 }, { "epoch": 0.32266709981320557, "grad_norm": 5.285352909806383, "learning_rate": 3.9587488878647816e-06, "loss": 0.5464, "step": 3973 }, { "epoch": 0.32274831478924715, "grad_norm": 6.70864234598179, "learning_rate": 3.958214773583437e-06, "loss": 0.5481, "step": 3974 }, { "epoch": 0.3228295297652887, "grad_norm": 8.065418291262025, "learning_rate": 3.957680558401348e-06, "loss": 0.5129, "step": 3975 }, { "epoch": 0.3229107447413303, "grad_norm": 5.8858306697747755, "learning_rate": 3.95714624235548e-06, "loss": 0.5457, "step": 3976 }, { "epoch": 0.3229919597173719, "grad_norm": 4.006111281727507, "learning_rate": 3.956611825482803e-06, "loss": 0.5355, "step": 3977 }, { "epoch": 0.32307317469341346, "grad_norm": 5.150201779401238, "learning_rate": 3.956077307820296e-06, "loss": 0.4682, "step": 3978 }, { "epoch": 0.32315438966945503, "grad_norm": 5.941131977925157, "learning_rate": 3.955542689404948e-06, "loss": 0.5036, "step": 3979 }, { "epoch": 0.3232356046454966, "grad_norm": 5.8735315327446, "learning_rate": 3.955007970273747e-06, "loss": 0.7358, "step": 3980 }, { "epoch": 0.3233168196215382, "grad_norm": 6.004080782992699, "learning_rate": 3.954473150463696e-06, "loss": 0.4277, "step": 3981 }, { "epoch": 0.3233980345975798, "grad_norm": 3.969562702014225, "learning_rate": 3.9539382300117995e-06, "loss": 0.6674, "step": 3982 }, { "epoch": 0.3234792495736214, "grad_norm": 5.552544120013278, "learning_rate": 3.953403208955074e-06, "loss": 0.5466, "step": 3983 }, { "epoch": 0.323560464549663, "grad_norm": 6.198037037595219, "learning_rate": 3.952868087330537e-06, "loss": 0.5557, "step": 3984 }, { "epoch": 0.32364167952570455, "grad_norm": 3.7865591645101278, "learning_rate": 3.952332865175218e-06, "loss": 0.622, "step": 3985 }, { "epoch": 0.32372289450174613, "grad_norm": 4.933123244556156, "learning_rate": 3.951797542526151e-06, "loss": 0.5681, "step": 3986 }, { "epoch": 0.3238041094777877, "grad_norm": 5.397627483042807, "learning_rate": 3.951262119420378e-06, "loss": 0.4988, "step": 3987 }, { "epoch": 0.3238853244538293, "grad_norm": 4.851399386478982, "learning_rate": 3.950726595894947e-06, "loss": 0.4111, "step": 3988 }, { "epoch": 0.32396653942987086, "grad_norm": 3.1067739446054636, "learning_rate": 3.950190971986913e-06, "loss": 0.6122, "step": 3989 }, { "epoch": 0.32404775440591244, "grad_norm": 5.367173504535532, "learning_rate": 3.9496552477333396e-06, "loss": 0.5503, "step": 3990 }, { "epoch": 0.324128969381954, "grad_norm": 6.546774201268415, "learning_rate": 3.9491194231712945e-06, "loss": 0.4951, "step": 3991 }, { "epoch": 0.3242101843579956, "grad_norm": 3.3008359196847876, "learning_rate": 3.948583498337854e-06, "loss": 0.4695, "step": 3992 }, { "epoch": 0.3242913993340372, "grad_norm": 6.2106989027955315, "learning_rate": 3.9480474732701034e-06, "loss": 0.6426, "step": 3993 }, { "epoch": 0.3243726143100788, "grad_norm": 7.437097992706253, "learning_rate": 3.9475113480051305e-06, "loss": 0.5088, "step": 3994 }, { "epoch": 0.3244538292861204, "grad_norm": 4.649897071407217, "learning_rate": 3.9469751225800344e-06, "loss": 0.3348, "step": 3995 }, { "epoch": 0.32453504426216195, "grad_norm": 7.807412990752628, "learning_rate": 3.946438797031916e-06, "loss": 0.5809, "step": 3996 }, { "epoch": 0.32461625923820353, "grad_norm": 10.891441494223873, "learning_rate": 3.9459023713978895e-06, "loss": 0.4846, "step": 3997 }, { "epoch": 0.3246974742142451, "grad_norm": 8.904826186725101, "learning_rate": 3.94536584571507e-06, "loss": 0.6143, "step": 3998 }, { "epoch": 0.3247786891902867, "grad_norm": 5.6954977350735785, "learning_rate": 3.944829220020584e-06, "loss": 0.5203, "step": 3999 }, { "epoch": 0.32485990416632826, "grad_norm": 4.93125171789273, "learning_rate": 3.944292494351563e-06, "loss": 0.6325, "step": 4000 }, { "epoch": 0.32494111914236984, "grad_norm": 4.965862139362356, "learning_rate": 3.943755668745145e-06, "loss": 0.5805, "step": 4001 }, { "epoch": 0.3250223341184114, "grad_norm": 5.16787159248045, "learning_rate": 3.943218743238476e-06, "loss": 0.5562, "step": 4002 }, { "epoch": 0.325103549094453, "grad_norm": 7.5362717832466055, "learning_rate": 3.942681717868707e-06, "loss": 0.5688, "step": 4003 }, { "epoch": 0.3251847640704946, "grad_norm": 4.981243847271942, "learning_rate": 3.942144592673e-06, "loss": 0.5032, "step": 4004 }, { "epoch": 0.3252659790465362, "grad_norm": 5.040020101083127, "learning_rate": 3.941607367688518e-06, "loss": 0.6819, "step": 4005 }, { "epoch": 0.3253471940225778, "grad_norm": 5.368844738609336, "learning_rate": 3.941070042952437e-06, "loss": 0.602, "step": 4006 }, { "epoch": 0.32542840899861936, "grad_norm": 6.620173821128225, "learning_rate": 3.940532618501935e-06, "loss": 0.4266, "step": 4007 }, { "epoch": 0.32550962397466093, "grad_norm": 3.763790781156802, "learning_rate": 3.9399950943742e-06, "loss": 0.6278, "step": 4008 }, { "epoch": 0.3255908389507025, "grad_norm": 5.280317791344952, "learning_rate": 3.939457470606426e-06, "loss": 0.5235, "step": 4009 }, { "epoch": 0.3256720539267441, "grad_norm": 6.207070016608509, "learning_rate": 3.938919747235812e-06, "loss": 0.498, "step": 4010 }, { "epoch": 0.32575326890278566, "grad_norm": 3.9148661505895923, "learning_rate": 3.938381924299568e-06, "loss": 0.6105, "step": 4011 }, { "epoch": 0.32583448387882724, "grad_norm": 11.982123261840117, "learning_rate": 3.937844001834907e-06, "loss": 0.4771, "step": 4012 }, { "epoch": 0.3259156988548688, "grad_norm": 8.177076369022492, "learning_rate": 3.93730597987905e-06, "loss": 0.5762, "step": 4013 }, { "epoch": 0.3259969138309104, "grad_norm": 4.802261707731817, "learning_rate": 3.936767858469228e-06, "loss": 0.4574, "step": 4014 }, { "epoch": 0.32607812880695203, "grad_norm": 4.140558850278409, "learning_rate": 3.936229637642672e-06, "loss": 0.4855, "step": 4015 }, { "epoch": 0.3261593437829936, "grad_norm": 3.5682197931038644, "learning_rate": 3.935691317436628e-06, "loss": 0.6508, "step": 4016 }, { "epoch": 0.3262405587590352, "grad_norm": 4.124877825005949, "learning_rate": 3.9351528978883425e-06, "loss": 0.6438, "step": 4017 }, { "epoch": 0.32632177373507676, "grad_norm": 5.830213074804944, "learning_rate": 3.934614379035071e-06, "loss": 0.5822, "step": 4018 }, { "epoch": 0.32640298871111834, "grad_norm": 8.315171390345546, "learning_rate": 3.9340757609140785e-06, "loss": 0.5308, "step": 4019 }, { "epoch": 0.3264842036871599, "grad_norm": 3.4244913994169313, "learning_rate": 3.933537043562632e-06, "loss": 0.4855, "step": 4020 }, { "epoch": 0.3265654186632015, "grad_norm": 4.156835180938438, "learning_rate": 3.932998227018009e-06, "loss": 0.6688, "step": 4021 }, { "epoch": 0.32664663363924307, "grad_norm": 3.4822522397229956, "learning_rate": 3.932459311317494e-06, "loss": 0.5877, "step": 4022 }, { "epoch": 0.32672784861528464, "grad_norm": 4.707979946257839, "learning_rate": 3.931920296498374e-06, "loss": 0.5496, "step": 4023 }, { "epoch": 0.3268090635913262, "grad_norm": 5.369501192774783, "learning_rate": 3.931381182597949e-06, "loss": 0.4053, "step": 4024 }, { "epoch": 0.3268902785673678, "grad_norm": 7.0165291185775, "learning_rate": 3.930841969653521e-06, "loss": 0.5032, "step": 4025 }, { "epoch": 0.32697149354340943, "grad_norm": 3.7285656291064844, "learning_rate": 3.930302657702402e-06, "loss": 0.5277, "step": 4026 }, { "epoch": 0.327052708519451, "grad_norm": 4.479547869133663, "learning_rate": 3.929763246781909e-06, "loss": 0.5248, "step": 4027 }, { "epoch": 0.3271339234954926, "grad_norm": 5.115409985178063, "learning_rate": 3.929223736929366e-06, "loss": 0.4248, "step": 4028 }, { "epoch": 0.32721513847153416, "grad_norm": 6.753042464948224, "learning_rate": 3.928684128182104e-06, "loss": 0.5976, "step": 4029 }, { "epoch": 0.32729635344757574, "grad_norm": 6.503506149392067, "learning_rate": 3.9281444205774625e-06, "loss": 0.5214, "step": 4030 }, { "epoch": 0.3273775684236173, "grad_norm": 3.8958786789276787, "learning_rate": 3.927604614152784e-06, "loss": 0.6028, "step": 4031 }, { "epoch": 0.3274587833996589, "grad_norm": 4.602684285068746, "learning_rate": 3.927064708945423e-06, "loss": 0.5836, "step": 4032 }, { "epoch": 0.32753999837570047, "grad_norm": 5.4356792062574515, "learning_rate": 3.926524704992736e-06, "loss": 0.4976, "step": 4033 }, { "epoch": 0.32762121335174205, "grad_norm": 5.1282337398644, "learning_rate": 3.9259846023320895e-06, "loss": 0.6328, "step": 4034 }, { "epoch": 0.3277024283277836, "grad_norm": 5.574289088333828, "learning_rate": 3.925444401000855e-06, "loss": 0.5181, "step": 4035 }, { "epoch": 0.3277836433038252, "grad_norm": 4.638010244640014, "learning_rate": 3.924904101036413e-06, "loss": 0.8373, "step": 4036 }, { "epoch": 0.32786485827986683, "grad_norm": 3.1082588154104225, "learning_rate": 3.924363702476147e-06, "loss": 0.5468, "step": 4037 }, { "epoch": 0.3279460732559084, "grad_norm": 3.8088490874389427, "learning_rate": 3.923823205357453e-06, "loss": 0.555, "step": 4038 }, { "epoch": 0.32802728823195, "grad_norm": 3.64966498327372, "learning_rate": 3.923282609717727e-06, "loss": 0.4554, "step": 4039 }, { "epoch": 0.32810850320799156, "grad_norm": 5.086174759269899, "learning_rate": 3.922741915594378e-06, "loss": 0.5311, "step": 4040 }, { "epoch": 0.32818971818403314, "grad_norm": 3.596353017219979, "learning_rate": 3.9222011230248175e-06, "loss": 0.6889, "step": 4041 }, { "epoch": 0.3282709331600747, "grad_norm": 4.391572461946081, "learning_rate": 3.9216602320464655e-06, "loss": 0.6185, "step": 4042 }, { "epoch": 0.3283521481361163, "grad_norm": 3.615775625258503, "learning_rate": 3.921119242696751e-06, "loss": 0.5438, "step": 4043 }, { "epoch": 0.3284333631121579, "grad_norm": 5.820794815401509, "learning_rate": 3.920578155013106e-06, "loss": 0.6468, "step": 4044 }, { "epoch": 0.32851457808819945, "grad_norm": 5.467081465670503, "learning_rate": 3.92003696903297e-06, "loss": 0.4855, "step": 4045 }, { "epoch": 0.328595793064241, "grad_norm": 5.561814863652521, "learning_rate": 3.919495684793792e-06, "loss": 0.6468, "step": 4046 }, { "epoch": 0.3286770080402826, "grad_norm": 3.807058673477291, "learning_rate": 3.918954302333025e-06, "loss": 0.5918, "step": 4047 }, { "epoch": 0.32875822301632424, "grad_norm": 13.31883734675245, "learning_rate": 3.91841282168813e-06, "loss": 0.6432, "step": 4048 }, { "epoch": 0.3288394379923658, "grad_norm": 7.056921038692695, "learning_rate": 3.917871242896575e-06, "loss": 0.4209, "step": 4049 }, { "epoch": 0.3289206529684074, "grad_norm": 3.9195936437606025, "learning_rate": 3.917329565995833e-06, "loss": 0.5187, "step": 4050 }, { "epoch": 0.32900186794444897, "grad_norm": 4.550380167725966, "learning_rate": 3.916787791023386e-06, "loss": 0.5355, "step": 4051 }, { "epoch": 0.32908308292049054, "grad_norm": 3.6800755148359148, "learning_rate": 3.916245918016724e-06, "loss": 0.5518, "step": 4052 }, { "epoch": 0.3291642978965321, "grad_norm": 3.478748983074201, "learning_rate": 3.915703947013338e-06, "loss": 0.7117, "step": 4053 }, { "epoch": 0.3292455128725737, "grad_norm": 3.7123489856745957, "learning_rate": 3.9151618780507316e-06, "loss": 0.5708, "step": 4054 }, { "epoch": 0.3293267278486153, "grad_norm": 5.643512507779105, "learning_rate": 3.914619711166413e-06, "loss": 0.5249, "step": 4055 }, { "epoch": 0.32940794282465685, "grad_norm": 3.920365869559619, "learning_rate": 3.914077446397897e-06, "loss": 0.493, "step": 4056 }, { "epoch": 0.32948915780069843, "grad_norm": 5.253760200255425, "learning_rate": 3.913535083782707e-06, "loss": 0.4857, "step": 4057 }, { "epoch": 0.32957037277674, "grad_norm": 5.037318300624211, "learning_rate": 3.912992623358368e-06, "loss": 0.5694, "step": 4058 }, { "epoch": 0.32965158775278164, "grad_norm": 16.068629096649826, "learning_rate": 3.91245006516242e-06, "loss": 0.407, "step": 4059 }, { "epoch": 0.3297328027288232, "grad_norm": 8.438998004731252, "learning_rate": 3.911907409232402e-06, "loss": 0.5709, "step": 4060 }, { "epoch": 0.3298140177048648, "grad_norm": 4.031413575021964, "learning_rate": 3.911364655605863e-06, "loss": 0.5698, "step": 4061 }, { "epoch": 0.32989523268090637, "grad_norm": 6.129803113623354, "learning_rate": 3.9108218043203595e-06, "loss": 0.5103, "step": 4062 }, { "epoch": 0.32997644765694795, "grad_norm": 5.08608452932562, "learning_rate": 3.910278855413454e-06, "loss": 0.6426, "step": 4063 }, { "epoch": 0.3300576626329895, "grad_norm": 4.487322847861987, "learning_rate": 3.909735808922716e-06, "loss": 0.5326, "step": 4064 }, { "epoch": 0.3301388776090311, "grad_norm": 5.951925940505768, "learning_rate": 3.90919266488572e-06, "loss": 0.481, "step": 4065 }, { "epoch": 0.3302200925850727, "grad_norm": 5.479126876701789, "learning_rate": 3.908649423340049e-06, "loss": 0.5089, "step": 4066 }, { "epoch": 0.33030130756111425, "grad_norm": 4.922261403979778, "learning_rate": 3.908106084323295e-06, "loss": 0.3857, "step": 4067 }, { "epoch": 0.33038252253715583, "grad_norm": 5.574237760697828, "learning_rate": 3.9075626478730515e-06, "loss": 0.6416, "step": 4068 }, { "epoch": 0.3304637375131974, "grad_norm": 3.790844587172487, "learning_rate": 3.907019114026922e-06, "loss": 0.5985, "step": 4069 }, { "epoch": 0.33054495248923904, "grad_norm": 6.112434755772752, "learning_rate": 3.906475482822517e-06, "loss": 0.5749, "step": 4070 }, { "epoch": 0.3306261674652806, "grad_norm": 3.6083314382705836, "learning_rate": 3.905931754297451e-06, "loss": 0.5349, "step": 4071 }, { "epoch": 0.3307073824413222, "grad_norm": 7.284570880294314, "learning_rate": 3.905387928489349e-06, "loss": 1.0363, "step": 4072 }, { "epoch": 0.33078859741736377, "grad_norm": 4.232160296762864, "learning_rate": 3.904844005435841e-06, "loss": 0.5585, "step": 4073 }, { "epoch": 0.33086981239340535, "grad_norm": 6.814310707158021, "learning_rate": 3.904299985174562e-06, "loss": 0.6862, "step": 4074 }, { "epoch": 0.3309510273694469, "grad_norm": 4.171877579913468, "learning_rate": 3.903755867743156e-06, "loss": 0.5983, "step": 4075 }, { "epoch": 0.3310322423454885, "grad_norm": 7.101609370383942, "learning_rate": 3.9032116531792745e-06, "loss": 0.5303, "step": 4076 }, { "epoch": 0.3311134573215301, "grad_norm": 5.802670937749591, "learning_rate": 3.902667341520572e-06, "loss": 0.5949, "step": 4077 }, { "epoch": 0.33119467229757166, "grad_norm": 4.414654939089757, "learning_rate": 3.902122932804713e-06, "loss": 0.5337, "step": 4078 }, { "epoch": 0.33127588727361323, "grad_norm": 5.085650564698827, "learning_rate": 3.901578427069368e-06, "loss": 0.5897, "step": 4079 }, { "epoch": 0.3313571022496548, "grad_norm": 3.7722391508616036, "learning_rate": 3.901033824352213e-06, "loss": 0.5486, "step": 4080 }, { "epoch": 0.33143831722569644, "grad_norm": 6.237526026700791, "learning_rate": 3.9004891246909325e-06, "loss": 0.5186, "step": 4081 }, { "epoch": 0.331519532201738, "grad_norm": 4.2314785310516365, "learning_rate": 3.8999443281232175e-06, "loss": 0.727, "step": 4082 }, { "epoch": 0.3316007471777796, "grad_norm": 4.575453746497402, "learning_rate": 3.899399434686762e-06, "loss": 0.4363, "step": 4083 }, { "epoch": 0.3316819621538212, "grad_norm": 5.329498253898677, "learning_rate": 3.898854444419274e-06, "loss": 0.4231, "step": 4084 }, { "epoch": 0.33176317712986275, "grad_norm": 7.175459151315008, "learning_rate": 3.8983093573584605e-06, "loss": 0.4597, "step": 4085 }, { "epoch": 0.33184439210590433, "grad_norm": 4.3131878789803855, "learning_rate": 3.89776417354204e-06, "loss": 0.4702, "step": 4086 }, { "epoch": 0.3319256070819459, "grad_norm": 4.921638293579504, "learning_rate": 3.897218893007737e-06, "loss": 0.5382, "step": 4087 }, { "epoch": 0.3320068220579875, "grad_norm": 5.520197777335041, "learning_rate": 3.896673515793281e-06, "loss": 0.5162, "step": 4088 }, { "epoch": 0.33208803703402906, "grad_norm": 4.2991891020242265, "learning_rate": 3.89612804193641e-06, "loss": 0.4928, "step": 4089 }, { "epoch": 0.33216925201007064, "grad_norm": 5.160012794460195, "learning_rate": 3.895582471474866e-06, "loss": 0.5771, "step": 4090 }, { "epoch": 0.3322504669861122, "grad_norm": 4.051259036428905, "learning_rate": 3.895036804446402e-06, "loss": 0.4006, "step": 4091 }, { "epoch": 0.33233168196215385, "grad_norm": 8.065546594512844, "learning_rate": 3.894491040888774e-06, "loss": 0.7044, "step": 4092 }, { "epoch": 0.3324128969381954, "grad_norm": 5.703512084951918, "learning_rate": 3.893945180839747e-06, "loss": 0.6109, "step": 4093 }, { "epoch": 0.332494111914237, "grad_norm": 6.178693238141215, "learning_rate": 3.893399224337089e-06, "loss": 0.5347, "step": 4094 }, { "epoch": 0.3325753268902786, "grad_norm": 3.9476533225077293, "learning_rate": 3.892853171418581e-06, "loss": 0.7827, "step": 4095 }, { "epoch": 0.33265654186632015, "grad_norm": 6.5472391710424755, "learning_rate": 3.8923070221220035e-06, "loss": 0.5795, "step": 4096 }, { "epoch": 0.33273775684236173, "grad_norm": 6.934804355699152, "learning_rate": 3.891760776485151e-06, "loss": 0.4096, "step": 4097 }, { "epoch": 0.3328189718184033, "grad_norm": 4.704286066411406, "learning_rate": 3.891214434545817e-06, "loss": 0.5433, "step": 4098 }, { "epoch": 0.3329001867944449, "grad_norm": 6.41984014774679, "learning_rate": 3.890667996341806e-06, "loss": 0.4911, "step": 4099 }, { "epoch": 0.33298140177048646, "grad_norm": 5.021861875613623, "learning_rate": 3.8901214619109315e-06, "loss": 0.6617, "step": 4100 }, { "epoch": 0.33306261674652804, "grad_norm": 5.862846079718947, "learning_rate": 3.889574831291008e-06, "loss": 0.5158, "step": 4101 }, { "epoch": 0.3331438317225696, "grad_norm": 3.942541898863967, "learning_rate": 3.88902810451986e-06, "loss": 0.6377, "step": 4102 }, { "epoch": 0.33322504669861125, "grad_norm": 4.915257424154396, "learning_rate": 3.88848128163532e-06, "loss": 0.516, "step": 4103 }, { "epoch": 0.3333062616746528, "grad_norm": 4.848991134824205, "learning_rate": 3.887934362675223e-06, "loss": 0.427, "step": 4104 }, { "epoch": 0.3333874766506944, "grad_norm": 5.880446166851823, "learning_rate": 3.887387347677413e-06, "loss": 0.4525, "step": 4105 }, { "epoch": 0.333468691626736, "grad_norm": 7.1524682868349405, "learning_rate": 3.886840236679742e-06, "loss": 0.4618, "step": 4106 }, { "epoch": 0.33354990660277756, "grad_norm": 4.096340585575771, "learning_rate": 3.8862930297200665e-06, "loss": 0.4553, "step": 4107 }, { "epoch": 0.33363112157881913, "grad_norm": 3.963306450481558, "learning_rate": 3.885745726836249e-06, "loss": 0.5802, "step": 4108 }, { "epoch": 0.3337123365548607, "grad_norm": 7.460142742604042, "learning_rate": 3.885198328066163e-06, "loss": 0.4519, "step": 4109 }, { "epoch": 0.3337935515309023, "grad_norm": 4.260196594904515, "learning_rate": 3.8846508334476824e-06, "loss": 0.5484, "step": 4110 }, { "epoch": 0.33387476650694387, "grad_norm": 3.7783084323283074, "learning_rate": 3.884103243018693e-06, "loss": 0.5141, "step": 4111 }, { "epoch": 0.33395598148298544, "grad_norm": 5.894477787381174, "learning_rate": 3.883555556817083e-06, "loss": 0.6465, "step": 4112 }, { "epoch": 0.334037196459027, "grad_norm": 3.0063514235175917, "learning_rate": 3.883007774880753e-06, "loss": 0.4949, "step": 4113 }, { "epoch": 0.33411841143506865, "grad_norm": 5.033096732224493, "learning_rate": 3.882459897247603e-06, "loss": 0.3705, "step": 4114 }, { "epoch": 0.33419962641111023, "grad_norm": 5.991095754571292, "learning_rate": 3.881911923955545e-06, "loss": 0.4903, "step": 4115 }, { "epoch": 0.3342808413871518, "grad_norm": 6.237218731747614, "learning_rate": 3.881363855042496e-06, "loss": 0.5439, "step": 4116 }, { "epoch": 0.3343620563631934, "grad_norm": 4.522440392622378, "learning_rate": 3.880815690546378e-06, "loss": 0.4756, "step": 4117 }, { "epoch": 0.33444327133923496, "grad_norm": 12.342772678402822, "learning_rate": 3.880267430505123e-06, "loss": 0.5839, "step": 4118 }, { "epoch": 0.33452448631527654, "grad_norm": 4.148599127985545, "learning_rate": 3.879719074956667e-06, "loss": 0.5127, "step": 4119 }, { "epoch": 0.3346057012913181, "grad_norm": 4.58831638518393, "learning_rate": 3.879170623938951e-06, "loss": 0.6514, "step": 4120 }, { "epoch": 0.3346869162673597, "grad_norm": 4.795703097401847, "learning_rate": 3.878622077489929e-06, "loss": 0.5412, "step": 4121 }, { "epoch": 0.33476813124340127, "grad_norm": 2.8973056633483325, "learning_rate": 3.8780734356475555e-06, "loss": 0.5741, "step": 4122 }, { "epoch": 0.33484934621944284, "grad_norm": 3.5866605917438474, "learning_rate": 3.8775246984497924e-06, "loss": 0.5285, "step": 4123 }, { "epoch": 0.3349305611954844, "grad_norm": 4.6085635915185, "learning_rate": 3.876975865934612e-06, "loss": 0.6005, "step": 4124 }, { "epoch": 0.33501177617152605, "grad_norm": 8.852381490643886, "learning_rate": 3.876426938139988e-06, "loss": 0.3847, "step": 4125 }, { "epoch": 0.33509299114756763, "grad_norm": 4.677167179306538, "learning_rate": 3.875877915103905e-06, "loss": 0.5997, "step": 4126 }, { "epoch": 0.3351742061236092, "grad_norm": 5.192252334603382, "learning_rate": 3.875328796864353e-06, "loss": 0.4351, "step": 4127 }, { "epoch": 0.3352554210996508, "grad_norm": 5.52994136456474, "learning_rate": 3.8747795834593255e-06, "loss": 0.5296, "step": 4128 }, { "epoch": 0.33533663607569236, "grad_norm": 9.716260977214207, "learning_rate": 3.8742302749268264e-06, "loss": 0.5424, "step": 4129 }, { "epoch": 0.33541785105173394, "grad_norm": 5.297918507808388, "learning_rate": 3.873680871304867e-06, "loss": 0.6156, "step": 4130 }, { "epoch": 0.3354990660277755, "grad_norm": 6.873079350204181, "learning_rate": 3.8731313726314615e-06, "loss": 0.5344, "step": 4131 }, { "epoch": 0.3355802810038171, "grad_norm": 7.36681510636934, "learning_rate": 3.87258177894463e-06, "loss": 0.4929, "step": 4132 }, { "epoch": 0.33566149597985867, "grad_norm": 5.602356345828153, "learning_rate": 3.872032090282406e-06, "loss": 0.5442, "step": 4133 }, { "epoch": 0.33574271095590025, "grad_norm": 7.3672095285990435, "learning_rate": 3.871482306682821e-06, "loss": 0.492, "step": 4134 }, { "epoch": 0.3358239259319418, "grad_norm": 3.5366167997788427, "learning_rate": 3.8709324281839205e-06, "loss": 0.5198, "step": 4135 }, { "epoch": 0.33590514090798346, "grad_norm": 6.858040921971297, "learning_rate": 3.87038245482375e-06, "loss": 0.5951, "step": 4136 }, { "epoch": 0.33598635588402503, "grad_norm": 6.617980225631782, "learning_rate": 3.869832386640367e-06, "loss": 0.5853, "step": 4137 }, { "epoch": 0.3360675708600666, "grad_norm": 4.219229609437151, "learning_rate": 3.8692822236718334e-06, "loss": 0.5662, "step": 4138 }, { "epoch": 0.3361487858361082, "grad_norm": 7.439696385730913, "learning_rate": 3.868731965956215e-06, "loss": 0.4658, "step": 4139 }, { "epoch": 0.33623000081214977, "grad_norm": 5.487826086727248, "learning_rate": 3.86818161353159e-06, "loss": 0.4537, "step": 4140 }, { "epoch": 0.33631121578819134, "grad_norm": 4.200360102230179, "learning_rate": 3.867631166436038e-06, "loss": 0.5663, "step": 4141 }, { "epoch": 0.3363924307642329, "grad_norm": 6.128604530592701, "learning_rate": 3.867080624707647e-06, "loss": 0.6134, "step": 4142 }, { "epoch": 0.3364736457402745, "grad_norm": 8.81104997626726, "learning_rate": 3.866529988384512e-06, "loss": 0.5773, "step": 4143 }, { "epoch": 0.3365548607163161, "grad_norm": 5.16122326363201, "learning_rate": 3.865979257504734e-06, "loss": 0.6132, "step": 4144 }, { "epoch": 0.33663607569235765, "grad_norm": 5.870897322328888, "learning_rate": 3.8654284321064205e-06, "loss": 0.6016, "step": 4145 }, { "epoch": 0.3367172906683992, "grad_norm": 5.314926965529579, "learning_rate": 3.864877512227686e-06, "loss": 0.5678, "step": 4146 }, { "epoch": 0.33679850564444086, "grad_norm": 4.818645386450623, "learning_rate": 3.864326497906652e-06, "loss": 0.5558, "step": 4147 }, { "epoch": 0.33687972062048244, "grad_norm": 14.41001497081986, "learning_rate": 3.8637753891814435e-06, "loss": 0.5669, "step": 4148 }, { "epoch": 0.336960935596524, "grad_norm": 10.608448522114792, "learning_rate": 3.863224186090197e-06, "loss": 0.5596, "step": 4149 }, { "epoch": 0.3370421505725656, "grad_norm": 6.7059941990937775, "learning_rate": 3.862672888671051e-06, "loss": 0.5213, "step": 4150 }, { "epoch": 0.33712336554860717, "grad_norm": 3.7804743609104237, "learning_rate": 3.862121496962153e-06, "loss": 0.7185, "step": 4151 }, { "epoch": 0.33720458052464874, "grad_norm": 5.24213546954736, "learning_rate": 3.861570011001658e-06, "loss": 0.5231, "step": 4152 }, { "epoch": 0.3372857955006903, "grad_norm": 7.547577006376037, "learning_rate": 3.8610184308277216e-06, "loss": 0.4785, "step": 4153 }, { "epoch": 0.3373670104767319, "grad_norm": 2.5300512242174387, "learning_rate": 3.860466756478514e-06, "loss": 0.6946, "step": 4154 }, { "epoch": 0.3374482254527735, "grad_norm": 4.653767821488039, "learning_rate": 3.859914987992207e-06, "loss": 0.7683, "step": 4155 }, { "epoch": 0.33752944042881505, "grad_norm": 4.930191892040406, "learning_rate": 3.85936312540698e-06, "loss": 0.5301, "step": 4156 }, { "epoch": 0.33761065540485663, "grad_norm": 5.429582397608964, "learning_rate": 3.858811168761019e-06, "loss": 0.4413, "step": 4157 }, { "epoch": 0.33769187038089826, "grad_norm": 4.863739044933661, "learning_rate": 3.8582591180925164e-06, "loss": 0.4454, "step": 4158 }, { "epoch": 0.33777308535693984, "grad_norm": 5.522590385270159, "learning_rate": 3.857706973439672e-06, "loss": 0.507, "step": 4159 }, { "epoch": 0.3378543003329814, "grad_norm": 5.8411296020442, "learning_rate": 3.85715473484069e-06, "loss": 0.5373, "step": 4160 }, { "epoch": 0.337935515309023, "grad_norm": 6.215435578421796, "learning_rate": 3.856602402333783e-06, "loss": 0.5099, "step": 4161 }, { "epoch": 0.33801673028506457, "grad_norm": 3.6082614721689215, "learning_rate": 3.85604997595717e-06, "loss": 0.5364, "step": 4162 }, { "epoch": 0.33809794526110615, "grad_norm": 6.122674136049005, "learning_rate": 3.855497455749076e-06, "loss": 0.5525, "step": 4163 }, { "epoch": 0.3381791602371477, "grad_norm": 7.470369967210209, "learning_rate": 3.854944841747731e-06, "loss": 0.4537, "step": 4164 }, { "epoch": 0.3382603752131893, "grad_norm": 3.13970191041356, "learning_rate": 3.854392133991373e-06, "loss": 0.4119, "step": 4165 }, { "epoch": 0.3383415901892309, "grad_norm": 5.817426875860977, "learning_rate": 3.853839332518249e-06, "loss": 0.6385, "step": 4166 }, { "epoch": 0.33842280516527246, "grad_norm": 6.211091908912435, "learning_rate": 3.8532864373666076e-06, "loss": 0.5794, "step": 4167 }, { "epoch": 0.33850402014131403, "grad_norm": 5.831264510593337, "learning_rate": 3.852733448574707e-06, "loss": 0.53, "step": 4168 }, { "epoch": 0.33858523511735567, "grad_norm": 6.324531527655222, "learning_rate": 3.8521803661808105e-06, "loss": 0.4364, "step": 4169 }, { "epoch": 0.33866645009339724, "grad_norm": 4.057887664623083, "learning_rate": 3.851627190223189e-06, "loss": 0.6626, "step": 4170 }, { "epoch": 0.3387476650694388, "grad_norm": 6.855870724748997, "learning_rate": 3.85107392074012e-06, "loss": 0.5495, "step": 4171 }, { "epoch": 0.3388288800454804, "grad_norm": 4.761555193030634, "learning_rate": 3.850520557769886e-06, "loss": 0.5127, "step": 4172 }, { "epoch": 0.338910095021522, "grad_norm": 4.705148709464969, "learning_rate": 3.849967101350777e-06, "loss": 0.4324, "step": 4173 }, { "epoch": 0.33899130999756355, "grad_norm": 6.188404278264953, "learning_rate": 3.849413551521089e-06, "loss": 0.4936, "step": 4174 }, { "epoch": 0.3390725249736051, "grad_norm": 4.011266087707078, "learning_rate": 3.848859908319124e-06, "loss": 0.4387, "step": 4175 }, { "epoch": 0.3391537399496467, "grad_norm": 3.339221489587599, "learning_rate": 3.8483061717831935e-06, "loss": 0.6163, "step": 4176 }, { "epoch": 0.3392349549256883, "grad_norm": 5.751859436326058, "learning_rate": 3.8477523419516115e-06, "loss": 0.5683, "step": 4177 }, { "epoch": 0.33931616990172986, "grad_norm": 5.088627831398807, "learning_rate": 3.8471984188627e-06, "loss": 0.5431, "step": 4178 }, { "epoch": 0.33939738487777144, "grad_norm": 5.37045736248948, "learning_rate": 3.846644402554788e-06, "loss": 0.7555, "step": 4179 }, { "epoch": 0.33947859985381307, "grad_norm": 5.257725496690353, "learning_rate": 3.84609029306621e-06, "loss": 0.4728, "step": 4180 }, { "epoch": 0.33955981482985464, "grad_norm": 4.575264421254666, "learning_rate": 3.845536090435308e-06, "loss": 0.5399, "step": 4181 }, { "epoch": 0.3396410298058962, "grad_norm": 8.153018797116616, "learning_rate": 3.84498179470043e-06, "loss": 0.6235, "step": 4182 }, { "epoch": 0.3397222447819378, "grad_norm": 6.682613939663991, "learning_rate": 3.8444274058999295e-06, "loss": 0.4972, "step": 4183 }, { "epoch": 0.3398034597579794, "grad_norm": 4.184180026129862, "learning_rate": 3.843872924072168e-06, "loss": 0.4771, "step": 4184 }, { "epoch": 0.33988467473402095, "grad_norm": 5.132706408054481, "learning_rate": 3.843318349255512e-06, "loss": 0.4656, "step": 4185 }, { "epoch": 0.33996588971006253, "grad_norm": 4.503889789040779, "learning_rate": 3.842763681488337e-06, "loss": 0.5627, "step": 4186 }, { "epoch": 0.3400471046861041, "grad_norm": 8.418497213540109, "learning_rate": 3.84220892080902e-06, "loss": 0.4399, "step": 4187 }, { "epoch": 0.3401283196621457, "grad_norm": 5.704270836674129, "learning_rate": 3.841654067255951e-06, "loss": 0.4365, "step": 4188 }, { "epoch": 0.34020953463818726, "grad_norm": 3.7971463715319813, "learning_rate": 3.84109912086752e-06, "loss": 0.6448, "step": 4189 }, { "epoch": 0.34029074961422884, "grad_norm": 5.144040812396076, "learning_rate": 3.840544081682128e-06, "loss": 0.7299, "step": 4190 }, { "epoch": 0.34037196459027047, "grad_norm": 4.053682645967981, "learning_rate": 3.839988949738179e-06, "loss": 0.4847, "step": 4191 }, { "epoch": 0.34045317956631205, "grad_norm": 4.2685472291584245, "learning_rate": 3.8394337250740886e-06, "loss": 0.4542, "step": 4192 }, { "epoch": 0.3405343945423536, "grad_norm": 5.165758993343546, "learning_rate": 3.838878407728272e-06, "loss": 0.5573, "step": 4193 }, { "epoch": 0.3406156095183952, "grad_norm": 4.470607814383757, "learning_rate": 3.838322997739155e-06, "loss": 0.6386, "step": 4194 }, { "epoch": 0.3406968244944368, "grad_norm": 6.678206627035974, "learning_rate": 3.837767495145171e-06, "loss": 0.6893, "step": 4195 }, { "epoch": 0.34077803947047836, "grad_norm": 3.54047842198611, "learning_rate": 3.837211899984756e-06, "loss": 0.6608, "step": 4196 }, { "epoch": 0.34085925444651993, "grad_norm": 2.4752964852325023, "learning_rate": 3.836656212296353e-06, "loss": 0.752, "step": 4197 }, { "epoch": 0.3409404694225615, "grad_norm": 5.131230574367217, "learning_rate": 3.836100432118416e-06, "loss": 0.5224, "step": 4198 }, { "epoch": 0.3410216843986031, "grad_norm": 7.654106670047192, "learning_rate": 3.8355445594894e-06, "loss": 0.5236, "step": 4199 }, { "epoch": 0.34110289937464466, "grad_norm": 5.475141337183228, "learning_rate": 3.834988594447768e-06, "loss": 0.3241, "step": 4200 }, { "epoch": 0.34118411435068624, "grad_norm": 4.9511971467274964, "learning_rate": 3.8344325370319914e-06, "loss": 0.5689, "step": 4201 }, { "epoch": 0.3412653293267279, "grad_norm": 4.8632486482468, "learning_rate": 3.833876387280546e-06, "loss": 0.6184, "step": 4202 }, { "epoch": 0.34134654430276945, "grad_norm": 8.274806032942742, "learning_rate": 3.833320145231913e-06, "loss": 0.6777, "step": 4203 }, { "epoch": 0.341427759278811, "grad_norm": 3.411012348367635, "learning_rate": 3.832763810924583e-06, "loss": 0.5455, "step": 4204 }, { "epoch": 0.3415089742548526, "grad_norm": 4.294223710510589, "learning_rate": 3.832207384397051e-06, "loss": 0.5742, "step": 4205 }, { "epoch": 0.3415901892308942, "grad_norm": 7.056992590176889, "learning_rate": 3.831650865687818e-06, "loss": 0.8163, "step": 4206 }, { "epoch": 0.34167140420693576, "grad_norm": 4.513215530320878, "learning_rate": 3.831094254835393e-06, "loss": 0.4881, "step": 4207 }, { "epoch": 0.34175261918297734, "grad_norm": 7.046767735752562, "learning_rate": 3.8305375518782905e-06, "loss": 0.5084, "step": 4208 }, { "epoch": 0.3418338341590189, "grad_norm": 4.65613704507043, "learning_rate": 3.829980756855032e-06, "loss": 0.4564, "step": 4209 }, { "epoch": 0.3419150491350605, "grad_norm": 5.501379878695886, "learning_rate": 3.829423869804143e-06, "loss": 0.5426, "step": 4210 }, { "epoch": 0.34199626411110207, "grad_norm": 7.787760682762794, "learning_rate": 3.828866890764157e-06, "loss": 0.4953, "step": 4211 }, { "epoch": 0.34207747908714364, "grad_norm": 4.761244297351993, "learning_rate": 3.828309819773617e-06, "loss": 0.7606, "step": 4212 }, { "epoch": 0.3421586940631853, "grad_norm": 6.816779994137413, "learning_rate": 3.827752656871067e-06, "loss": 0.4022, "step": 4213 }, { "epoch": 0.34223990903922685, "grad_norm": 5.030056000377152, "learning_rate": 3.827195402095059e-06, "loss": 0.5904, "step": 4214 }, { "epoch": 0.34232112401526843, "grad_norm": 5.852140493007803, "learning_rate": 3.826638055484154e-06, "loss": 0.423, "step": 4215 }, { "epoch": 0.34240233899131, "grad_norm": 4.727427200553123, "learning_rate": 3.826080617076917e-06, "loss": 0.6244, "step": 4216 }, { "epoch": 0.3424835539673516, "grad_norm": 8.080683087721372, "learning_rate": 3.825523086911919e-06, "loss": 0.4194, "step": 4217 }, { "epoch": 0.34256476894339316, "grad_norm": 3.4664753816902834, "learning_rate": 3.824965465027739e-06, "loss": 0.5493, "step": 4218 }, { "epoch": 0.34264598391943474, "grad_norm": 8.5527804211556, "learning_rate": 3.824407751462962e-06, "loss": 0.5962, "step": 4219 }, { "epoch": 0.3427271988954763, "grad_norm": 9.91272395777568, "learning_rate": 3.823849946256176e-06, "loss": 0.5413, "step": 4220 }, { "epoch": 0.3428084138715179, "grad_norm": 5.49826382214047, "learning_rate": 3.82329204944598e-06, "loss": 0.4414, "step": 4221 }, { "epoch": 0.34288962884755947, "grad_norm": 5.466358398271932, "learning_rate": 3.822734061070979e-06, "loss": 0.653, "step": 4222 }, { "epoch": 0.34297084382360105, "grad_norm": 11.56485219391147, "learning_rate": 3.8221759811697814e-06, "loss": 0.6647, "step": 4223 }, { "epoch": 0.3430520587996427, "grad_norm": 17.302847263409163, "learning_rate": 3.821617809781004e-06, "loss": 0.5001, "step": 4224 }, { "epoch": 0.34313327377568426, "grad_norm": 4.9393254820446, "learning_rate": 3.821059546943268e-06, "loss": 0.4429, "step": 4225 }, { "epoch": 0.34321448875172583, "grad_norm": 4.737433746889213, "learning_rate": 3.820501192695202e-06, "loss": 0.6441, "step": 4226 }, { "epoch": 0.3432957037277674, "grad_norm": 5.321040703740516, "learning_rate": 3.819942747075443e-06, "loss": 0.4669, "step": 4227 }, { "epoch": 0.343376918703809, "grad_norm": 5.28796113251718, "learning_rate": 3.819384210122631e-06, "loss": 0.682, "step": 4228 }, { "epoch": 0.34345813367985056, "grad_norm": 4.339078198914844, "learning_rate": 3.818825581875415e-06, "loss": 0.5883, "step": 4229 }, { "epoch": 0.34353934865589214, "grad_norm": 4.128567458915762, "learning_rate": 3.818266862372449e-06, "loss": 0.5184, "step": 4230 }, { "epoch": 0.3436205636319337, "grad_norm": 3.9206753012437146, "learning_rate": 3.817708051652392e-06, "loss": 0.6334, "step": 4231 }, { "epoch": 0.3437017786079753, "grad_norm": 4.303153750803405, "learning_rate": 3.817149149753912e-06, "loss": 0.5891, "step": 4232 }, { "epoch": 0.34378299358401687, "grad_norm": 4.474909831406013, "learning_rate": 3.816590156715682e-06, "loss": 0.5449, "step": 4233 }, { "epoch": 0.34386420856005845, "grad_norm": 4.504556577904222, "learning_rate": 3.81603107257638e-06, "loss": 0.4061, "step": 4234 }, { "epoch": 0.3439454235361001, "grad_norm": 2.939069265586592, "learning_rate": 3.815471897374695e-06, "loss": 0.5448, "step": 4235 }, { "epoch": 0.34402663851214166, "grad_norm": 4.384216019391756, "learning_rate": 3.814912631149315e-06, "loss": 0.4506, "step": 4236 }, { "epoch": 0.34410785348818324, "grad_norm": 5.19610868271772, "learning_rate": 3.8143532739389403e-06, "loss": 0.5923, "step": 4237 }, { "epoch": 0.3441890684642248, "grad_norm": 4.2187938416631185, "learning_rate": 3.813793825782276e-06, "loss": 0.6411, "step": 4238 }, { "epoch": 0.3442702834402664, "grad_norm": 5.412344609770045, "learning_rate": 3.8132342867180318e-06, "loss": 0.6972, "step": 4239 }, { "epoch": 0.34435149841630797, "grad_norm": 3.8719949393667887, "learning_rate": 3.812674656784924e-06, "loss": 0.6241, "step": 4240 }, { "epoch": 0.34443271339234954, "grad_norm": 6.272686879652281, "learning_rate": 3.812114936021678e-06, "loss": 0.4416, "step": 4241 }, { "epoch": 0.3445139283683911, "grad_norm": 4.522533410945012, "learning_rate": 3.811555124467023e-06, "loss": 0.6361, "step": 4242 }, { "epoch": 0.3445951433444327, "grad_norm": 3.7359506733942554, "learning_rate": 3.8109952221596948e-06, "loss": 0.5521, "step": 4243 }, { "epoch": 0.3446763583204743, "grad_norm": 5.599008701288348, "learning_rate": 3.810435229138435e-06, "loss": 0.6105, "step": 4244 }, { "epoch": 0.34475757329651585, "grad_norm": 4.77660925406526, "learning_rate": 3.8098751454419925e-06, "loss": 0.6491, "step": 4245 }, { "epoch": 0.3448387882725575, "grad_norm": 5.070740770359748, "learning_rate": 3.8093149711091227e-06, "loss": 0.4824, "step": 4246 }, { "epoch": 0.34492000324859906, "grad_norm": 5.29618525895374, "learning_rate": 3.8087547061785864e-06, "loss": 0.591, "step": 4247 }, { "epoch": 0.34500121822464064, "grad_norm": 4.011238439114341, "learning_rate": 3.8081943506891505e-06, "loss": 0.6667, "step": 4248 }, { "epoch": 0.3450824332006822, "grad_norm": 4.640224694054099, "learning_rate": 3.8076339046795897e-06, "loss": 0.5231, "step": 4249 }, { "epoch": 0.3451636481767238, "grad_norm": 9.930057551543705, "learning_rate": 3.807073368188683e-06, "loss": 0.7279, "step": 4250 }, { "epoch": 0.34524486315276537, "grad_norm": 10.280051980432813, "learning_rate": 3.8065127412552172e-06, "loss": 0.7463, "step": 4251 }, { "epoch": 0.34532607812880695, "grad_norm": 4.260596124125055, "learning_rate": 3.8059520239179836e-06, "loss": 0.7628, "step": 4252 }, { "epoch": 0.3454072931048485, "grad_norm": 6.511349904040633, "learning_rate": 3.805391216215782e-06, "loss": 0.4457, "step": 4253 }, { "epoch": 0.3454885080808901, "grad_norm": 7.176157624874702, "learning_rate": 3.8048303181874167e-06, "loss": 0.4474, "step": 4254 }, { "epoch": 0.3455697230569317, "grad_norm": 5.9798967177829345, "learning_rate": 3.8042693298717e-06, "loss": 0.7248, "step": 4255 }, { "epoch": 0.34565093803297325, "grad_norm": 5.3870467284981, "learning_rate": 3.8037082513074468e-06, "loss": 0.4971, "step": 4256 }, { "epoch": 0.3457321530090149, "grad_norm": 6.0361440421296, "learning_rate": 3.8031470825334838e-06, "loss": 0.5185, "step": 4257 }, { "epoch": 0.34581336798505646, "grad_norm": 3.7390306125476243, "learning_rate": 3.8025858235886394e-06, "loss": 0.7618, "step": 4258 }, { "epoch": 0.34589458296109804, "grad_norm": 6.423197360909206, "learning_rate": 3.802024474511749e-06, "loss": 0.5033, "step": 4259 }, { "epoch": 0.3459757979371396, "grad_norm": 4.523265015480748, "learning_rate": 3.801463035341656e-06, "loss": 0.5696, "step": 4260 }, { "epoch": 0.3460570129131812, "grad_norm": 6.030313585209124, "learning_rate": 3.8009015061172095e-06, "loss": 0.5074, "step": 4261 }, { "epoch": 0.34613822788922277, "grad_norm": 9.4302885564273, "learning_rate": 3.8003398868772635e-06, "loss": 0.4557, "step": 4262 }, { "epoch": 0.34621944286526435, "grad_norm": 7.919711884223888, "learning_rate": 3.799778177660679e-06, "loss": 0.5377, "step": 4263 }, { "epoch": 0.3463006578413059, "grad_norm": 4.176073302897445, "learning_rate": 3.7992163785063236e-06, "loss": 0.5704, "step": 4264 }, { "epoch": 0.3463818728173475, "grad_norm": 5.09213708548835, "learning_rate": 3.798654489453071e-06, "loss": 0.554, "step": 4265 }, { "epoch": 0.3464630877933891, "grad_norm": 6.0436953163637295, "learning_rate": 3.7980925105398004e-06, "loss": 0.526, "step": 4266 }, { "epoch": 0.34654430276943066, "grad_norm": 6.156030657781099, "learning_rate": 3.7975304418053986e-06, "loss": 0.4978, "step": 4267 }, { "epoch": 0.3466255177454723, "grad_norm": 8.589190270382112, "learning_rate": 3.796968283288758e-06, "loss": 0.6419, "step": 4268 }, { "epoch": 0.34670673272151387, "grad_norm": 3.828448877259962, "learning_rate": 3.7964060350287747e-06, "loss": 0.5272, "step": 4269 }, { "epoch": 0.34678794769755544, "grad_norm": 4.335800260568463, "learning_rate": 3.795843697064355e-06, "loss": 0.5072, "step": 4270 }, { "epoch": 0.346869162673597, "grad_norm": 4.112493113260704, "learning_rate": 3.795281269434411e-06, "loss": 0.5839, "step": 4271 }, { "epoch": 0.3469503776496386, "grad_norm": 7.2088319680381225, "learning_rate": 3.794718752177857e-06, "loss": 0.7074, "step": 4272 }, { "epoch": 0.3470315926256802, "grad_norm": 3.766153162278056, "learning_rate": 3.7941561453336184e-06, "loss": 0.5309, "step": 4273 }, { "epoch": 0.34711280760172175, "grad_norm": 5.845126706777332, "learning_rate": 3.7935934489406232e-06, "loss": 0.3967, "step": 4274 }, { "epoch": 0.34719402257776333, "grad_norm": 6.9520337995936785, "learning_rate": 3.7930306630378085e-06, "loss": 0.7975, "step": 4275 }, { "epoch": 0.3472752375538049, "grad_norm": 4.890242996197673, "learning_rate": 3.7924677876641147e-06, "loss": 0.4203, "step": 4276 }, { "epoch": 0.3473564525298465, "grad_norm": 3.8913871864084255, "learning_rate": 3.79190482285849e-06, "loss": 0.7376, "step": 4277 }, { "epoch": 0.34743766750588806, "grad_norm": 3.544505934601897, "learning_rate": 3.7913417686598886e-06, "loss": 0.6519, "step": 4278 }, { "epoch": 0.3475188824819297, "grad_norm": 9.771874636578227, "learning_rate": 3.790778625107272e-06, "loss": 0.5073, "step": 4279 }, { "epoch": 0.34760009745797127, "grad_norm": 3.505229136041032, "learning_rate": 3.790215392239606e-06, "loss": 0.5759, "step": 4280 }, { "epoch": 0.34768131243401285, "grad_norm": 5.243230166877009, "learning_rate": 3.7896520700958616e-06, "loss": 0.5125, "step": 4281 }, { "epoch": 0.3477625274100544, "grad_norm": 5.947817581890729, "learning_rate": 3.789088658715021e-06, "loss": 0.5668, "step": 4282 }, { "epoch": 0.347843742386096, "grad_norm": 4.83590576883994, "learning_rate": 3.788525158136067e-06, "loss": 0.5129, "step": 4283 }, { "epoch": 0.3479249573621376, "grad_norm": 4.089901831736869, "learning_rate": 3.787961568397992e-06, "loss": 0.45, "step": 4284 }, { "epoch": 0.34800617233817915, "grad_norm": 4.292363577189182, "learning_rate": 3.787397889539792e-06, "loss": 0.699, "step": 4285 }, { "epoch": 0.34808738731422073, "grad_norm": 3.731168656658335, "learning_rate": 3.786834121600472e-06, "loss": 0.4771, "step": 4286 }, { "epoch": 0.3481686022902623, "grad_norm": 6.663689172947077, "learning_rate": 3.7862702646190415e-06, "loss": 0.6777, "step": 4287 }, { "epoch": 0.3482498172663039, "grad_norm": 7.125149701358761, "learning_rate": 3.7857063186345156e-06, "loss": 0.62, "step": 4288 }, { "epoch": 0.34833103224234546, "grad_norm": 3.677264775633149, "learning_rate": 3.7851422836859177e-06, "loss": 0.541, "step": 4289 }, { "epoch": 0.3484122472183871, "grad_norm": 7.094843934887101, "learning_rate": 3.7845781598122743e-06, "loss": 0.4561, "step": 4290 }, { "epoch": 0.34849346219442867, "grad_norm": 4.886642342030592, "learning_rate": 3.7840139470526215e-06, "loss": 0.4937, "step": 4291 }, { "epoch": 0.34857467717047025, "grad_norm": 5.62838070355612, "learning_rate": 3.783449645445999e-06, "loss": 0.513, "step": 4292 }, { "epoch": 0.3486558921465118, "grad_norm": 4.6812249317048025, "learning_rate": 3.782885255031453e-06, "loss": 0.5147, "step": 4293 }, { "epoch": 0.3487371071225534, "grad_norm": 6.938556921862774, "learning_rate": 3.782320775848038e-06, "loss": 0.3674, "step": 4294 }, { "epoch": 0.348818322098595, "grad_norm": 6.723176214680427, "learning_rate": 3.7817562079348114e-06, "loss": 0.4626, "step": 4295 }, { "epoch": 0.34889953707463656, "grad_norm": 5.553775061400944, "learning_rate": 3.7811915513308382e-06, "loss": 0.4768, "step": 4296 }, { "epoch": 0.34898075205067813, "grad_norm": 5.884939622386394, "learning_rate": 3.7806268060751914e-06, "loss": 0.4368, "step": 4297 }, { "epoch": 0.3490619670267197, "grad_norm": 6.5667330861720234, "learning_rate": 3.7800619722069464e-06, "loss": 0.444, "step": 4298 }, { "epoch": 0.3491431820027613, "grad_norm": 6.393075158086913, "learning_rate": 3.7794970497651877e-06, "loss": 0.4569, "step": 4299 }, { "epoch": 0.34922439697880286, "grad_norm": 7.173171177630671, "learning_rate": 3.7789320387890056e-06, "loss": 0.6791, "step": 4300 }, { "epoch": 0.3493056119548445, "grad_norm": 4.165736108586291, "learning_rate": 3.778366939317494e-06, "loss": 0.5986, "step": 4301 }, { "epoch": 0.3493868269308861, "grad_norm": 3.349799899482817, "learning_rate": 3.777801751389757e-06, "loss": 0.5417, "step": 4302 }, { "epoch": 0.34946804190692765, "grad_norm": 5.017370500577692, "learning_rate": 3.7772364750449002e-06, "loss": 0.7214, "step": 4303 }, { "epoch": 0.34954925688296923, "grad_norm": 6.384615890109909, "learning_rate": 3.77667111032204e-06, "loss": 0.4845, "step": 4304 }, { "epoch": 0.3496304718590108, "grad_norm": 5.139067733395603, "learning_rate": 3.776105657260295e-06, "loss": 0.5029, "step": 4305 }, { "epoch": 0.3497116868350524, "grad_norm": 30.243450603041254, "learning_rate": 3.7755401158987926e-06, "loss": 0.6012, "step": 4306 }, { "epoch": 0.34979290181109396, "grad_norm": 8.484420160765751, "learning_rate": 3.774974486276664e-06, "loss": 0.4776, "step": 4307 }, { "epoch": 0.34987411678713554, "grad_norm": 6.935408912143071, "learning_rate": 3.77440876843305e-06, "loss": 0.4172, "step": 4308 }, { "epoch": 0.3499553317631771, "grad_norm": 4.024333397155922, "learning_rate": 3.773842962407093e-06, "loss": 0.7109, "step": 4309 }, { "epoch": 0.3500365467392187, "grad_norm": 4.11100179002336, "learning_rate": 3.773277068237945e-06, "loss": 0.5926, "step": 4310 }, { "epoch": 0.35011776171526027, "grad_norm": 4.947318220787382, "learning_rate": 3.7727110859647627e-06, "loss": 0.6069, "step": 4311 }, { "epoch": 0.3501989766913019, "grad_norm": 3.6871411628254696, "learning_rate": 3.772145015626709e-06, "loss": 0.5277, "step": 4312 }, { "epoch": 0.3502801916673435, "grad_norm": 10.305081134224809, "learning_rate": 3.771578857262953e-06, "loss": 0.4547, "step": 4313 }, { "epoch": 0.35036140664338505, "grad_norm": 4.429330825416072, "learning_rate": 3.771012610912669e-06, "loss": 0.6503, "step": 4314 }, { "epoch": 0.35044262161942663, "grad_norm": 8.456969721413778, "learning_rate": 3.7704462766150396e-06, "loss": 0.5715, "step": 4315 }, { "epoch": 0.3505238365954682, "grad_norm": 5.543349480716538, "learning_rate": 3.7698798544092525e-06, "loss": 0.3989, "step": 4316 }, { "epoch": 0.3506050515715098, "grad_norm": 4.0226607503884315, "learning_rate": 3.7693133443344986e-06, "loss": 0.7712, "step": 4317 }, { "epoch": 0.35068626654755136, "grad_norm": 3.3414221408894798, "learning_rate": 3.7687467464299797e-06, "loss": 0.5677, "step": 4318 }, { "epoch": 0.35076748152359294, "grad_norm": 3.3587069368493263, "learning_rate": 3.7681800607349017e-06, "loss": 0.5779, "step": 4319 }, { "epoch": 0.3508486964996345, "grad_norm": 7.359687891817828, "learning_rate": 3.767613287288474e-06, "loss": 0.5286, "step": 4320 }, { "epoch": 0.3509299114756761, "grad_norm": 4.5580897772433095, "learning_rate": 3.767046426129917e-06, "loss": 0.6514, "step": 4321 }, { "epoch": 0.35101112645171767, "grad_norm": 6.868236388509833, "learning_rate": 3.7664794772984515e-06, "loss": 0.4803, "step": 4322 }, { "epoch": 0.3510923414277593, "grad_norm": 4.0864068693665985, "learning_rate": 3.7659124408333094e-06, "loss": 0.4627, "step": 4323 }, { "epoch": 0.3511735564038009, "grad_norm": 4.220792384857057, "learning_rate": 3.7653453167737263e-06, "loss": 0.6832, "step": 4324 }, { "epoch": 0.35125477137984246, "grad_norm": 6.391965671992134, "learning_rate": 3.7647781051589436e-06, "loss": 0.7653, "step": 4325 }, { "epoch": 0.35133598635588403, "grad_norm": 6.3932460863379905, "learning_rate": 3.76421080602821e-06, "loss": 0.4652, "step": 4326 }, { "epoch": 0.3514172013319256, "grad_norm": 5.223581522055953, "learning_rate": 3.76364341942078e-06, "loss": 0.6826, "step": 4327 }, { "epoch": 0.3514984163079672, "grad_norm": 6.233483510000385, "learning_rate": 3.7630759453759123e-06, "loss": 0.4624, "step": 4328 }, { "epoch": 0.35157963128400876, "grad_norm": 4.989931764108342, "learning_rate": 3.7625083839328747e-06, "loss": 0.4736, "step": 4329 }, { "epoch": 0.35166084626005034, "grad_norm": 3.2608228815714755, "learning_rate": 3.7619407351309377e-06, "loss": 0.4673, "step": 4330 }, { "epoch": 0.3517420612360919, "grad_norm": 4.925326222217201, "learning_rate": 3.761372999009381e-06, "loss": 0.5807, "step": 4331 }, { "epoch": 0.3518232762121335, "grad_norm": 3.6058579203829844, "learning_rate": 3.7608051756074894e-06, "loss": 0.4816, "step": 4332 }, { "epoch": 0.3519044911881751, "grad_norm": 5.412076104516127, "learning_rate": 3.7602372649645512e-06, "loss": 0.6296, "step": 4333 }, { "epoch": 0.3519857061642167, "grad_norm": 4.984953620322977, "learning_rate": 3.759669267119864e-06, "loss": 0.5238, "step": 4334 }, { "epoch": 0.3520669211402583, "grad_norm": 3.348233179207022, "learning_rate": 3.759101182112731e-06, "loss": 0.6843, "step": 4335 }, { "epoch": 0.35214813611629986, "grad_norm": 3.9680570346308204, "learning_rate": 3.758533009982459e-06, "loss": 0.4943, "step": 4336 }, { "epoch": 0.35222935109234144, "grad_norm": 7.538460902230132, "learning_rate": 3.7579647507683636e-06, "loss": 0.4964, "step": 4337 }, { "epoch": 0.352310566068383, "grad_norm": 4.840273083850319, "learning_rate": 3.7573964045097655e-06, "loss": 0.6186, "step": 4338 }, { "epoch": 0.3523917810444246, "grad_norm": 4.652079301299005, "learning_rate": 3.7568279712459908e-06, "loss": 0.4687, "step": 4339 }, { "epoch": 0.35247299602046617, "grad_norm": 4.542830944823784, "learning_rate": 3.7562594510163718e-06, "loss": 0.6544, "step": 4340 }, { "epoch": 0.35255421099650774, "grad_norm": 5.562323687848427, "learning_rate": 3.755690843860248e-06, "loss": 0.598, "step": 4341 }, { "epoch": 0.3526354259725493, "grad_norm": 3.056102850388366, "learning_rate": 3.7551221498169633e-06, "loss": 0.46, "step": 4342 }, { "epoch": 0.3527166409485909, "grad_norm": 6.370494566318182, "learning_rate": 3.7545533689258683e-06, "loss": 0.5024, "step": 4343 }, { "epoch": 0.3527978559246325, "grad_norm": 3.674196156661001, "learning_rate": 3.75398450122632e-06, "loss": 0.4952, "step": 4344 }, { "epoch": 0.3528790709006741, "grad_norm": 5.618292271167161, "learning_rate": 3.7534155467576805e-06, "loss": 0.545, "step": 4345 }, { "epoch": 0.3529602858767157, "grad_norm": 7.155057050102085, "learning_rate": 3.7528465055593186e-06, "loss": 0.7136, "step": 4346 }, { "epoch": 0.35304150085275726, "grad_norm": 4.835019495981251, "learning_rate": 3.75227737767061e-06, "loss": 0.5152, "step": 4347 }, { "epoch": 0.35312271582879884, "grad_norm": 6.80042426573597, "learning_rate": 3.7517081631309336e-06, "loss": 0.5799, "step": 4348 }, { "epoch": 0.3532039308048404, "grad_norm": 7.706095450683844, "learning_rate": 3.751138861979678e-06, "loss": 0.5641, "step": 4349 }, { "epoch": 0.353285145780882, "grad_norm": 8.667289733551144, "learning_rate": 3.750569474256233e-06, "loss": 0.5249, "step": 4350 }, { "epoch": 0.35336636075692357, "grad_norm": 4.311680279675531, "learning_rate": 3.7500000000000005e-06, "loss": 0.4826, "step": 4351 }, { "epoch": 0.35344757573296515, "grad_norm": 5.148268090014955, "learning_rate": 3.7494304392503826e-06, "loss": 0.5173, "step": 4352 }, { "epoch": 0.3535287907090067, "grad_norm": 5.2133873799522705, "learning_rate": 3.7488607920467912e-06, "loss": 0.7347, "step": 4353 }, { "epoch": 0.3536100056850483, "grad_norm": 5.251842412363714, "learning_rate": 3.7482910584286424e-06, "loss": 0.4798, "step": 4354 }, { "epoch": 0.3536912206610899, "grad_norm": 4.4703093132843, "learning_rate": 3.747721238435359e-06, "loss": 0.6538, "step": 4355 }, { "epoch": 0.3537724356371315, "grad_norm": 4.814603055251707, "learning_rate": 3.747151332106369e-06, "loss": 0.487, "step": 4356 }, { "epoch": 0.3538536506131731, "grad_norm": 5.542109813139826, "learning_rate": 3.746581339481108e-06, "loss": 0.5932, "step": 4357 }, { "epoch": 0.35393486558921466, "grad_norm": 5.541416801300144, "learning_rate": 3.746011260599015e-06, "loss": 0.605, "step": 4358 }, { "epoch": 0.35401608056525624, "grad_norm": 4.197242421152716, "learning_rate": 3.7454410954995375e-06, "loss": 0.4671, "step": 4359 }, { "epoch": 0.3540972955412978, "grad_norm": 6.220911708551648, "learning_rate": 3.7448708442221277e-06, "loss": 0.5889, "step": 4360 }, { "epoch": 0.3541785105173394, "grad_norm": 4.456428745711773, "learning_rate": 3.744300506806243e-06, "loss": 0.5731, "step": 4361 }, { "epoch": 0.354259725493381, "grad_norm": 4.103784001652695, "learning_rate": 3.7437300832913503e-06, "loss": 0.4561, "step": 4362 }, { "epoch": 0.35434094046942255, "grad_norm": 16.445100012740834, "learning_rate": 3.743159573716917e-06, "loss": 0.6277, "step": 4363 }, { "epoch": 0.3544221554454641, "grad_norm": 4.616837308526195, "learning_rate": 3.7425889781224204e-06, "loss": 0.5266, "step": 4364 }, { "epoch": 0.3545033704215057, "grad_norm": 8.670573299062688, "learning_rate": 3.742018296547344e-06, "loss": 0.5105, "step": 4365 }, { "epoch": 0.3545845853975473, "grad_norm": 6.449318189960221, "learning_rate": 3.741447529031173e-06, "loss": 0.7134, "step": 4366 }, { "epoch": 0.3546658003735889, "grad_norm": 4.211623063096266, "learning_rate": 3.7408766756134046e-06, "loss": 0.5255, "step": 4367 }, { "epoch": 0.3547470153496305, "grad_norm": 4.907991529043284, "learning_rate": 3.740305736333537e-06, "loss": 0.6893, "step": 4368 }, { "epoch": 0.35482823032567207, "grad_norm": 4.845747717269982, "learning_rate": 3.7397347112310767e-06, "loss": 0.6383, "step": 4369 }, { "epoch": 0.35490944530171364, "grad_norm": 4.148877926596722, "learning_rate": 3.7391636003455355e-06, "loss": 0.4795, "step": 4370 }, { "epoch": 0.3549906602777552, "grad_norm": 5.568518592335486, "learning_rate": 3.7385924037164316e-06, "loss": 0.5019, "step": 4371 }, { "epoch": 0.3550718752537968, "grad_norm": 11.380910354185877, "learning_rate": 3.7380211213832882e-06, "loss": 0.4622, "step": 4372 }, { "epoch": 0.3551530902298384, "grad_norm": 4.898875896209035, "learning_rate": 3.737449753385636e-06, "loss": 0.5913, "step": 4373 }, { "epoch": 0.35523430520587995, "grad_norm": 3.7018730566328433, "learning_rate": 3.7368782997630093e-06, "loss": 0.7925, "step": 4374 }, { "epoch": 0.35531552018192153, "grad_norm": 3.8066706472807565, "learning_rate": 3.7363067605549515e-06, "loss": 0.5056, "step": 4375 }, { "epoch": 0.3553967351579631, "grad_norm": 2.709504723389777, "learning_rate": 3.7357351358010075e-06, "loss": 0.5972, "step": 4376 }, { "epoch": 0.3554779501340047, "grad_norm": 4.824489245992463, "learning_rate": 3.735163425540732e-06, "loss": 0.6907, "step": 4377 }, { "epoch": 0.3555591651100463, "grad_norm": 7.943391610774761, "learning_rate": 3.734591629813686e-06, "loss": 0.5539, "step": 4378 }, { "epoch": 0.3556403800860879, "grad_norm": 4.381215594871206, "learning_rate": 3.7340197486594315e-06, "loss": 0.5431, "step": 4379 }, { "epoch": 0.35572159506212947, "grad_norm": 4.946575395839248, "learning_rate": 3.7334477821175424e-06, "loss": 0.6081, "step": 4380 }, { "epoch": 0.35580281003817105, "grad_norm": 4.52740218848989, "learning_rate": 3.732875730227595e-06, "loss": 0.4524, "step": 4381 }, { "epoch": 0.3558840250142126, "grad_norm": 6.069550060126085, "learning_rate": 3.7323035930291706e-06, "loss": 0.5258, "step": 4382 }, { "epoch": 0.3559652399902542, "grad_norm": 5.631666918507212, "learning_rate": 3.731731370561861e-06, "loss": 0.8438, "step": 4383 }, { "epoch": 0.3560464549662958, "grad_norm": 8.095060386123581, "learning_rate": 3.7311590628652584e-06, "loss": 0.6436, "step": 4384 }, { "epoch": 0.35612766994233735, "grad_norm": 5.278732076516886, "learning_rate": 3.730586669978965e-06, "loss": 0.5972, "step": 4385 }, { "epoch": 0.35620888491837893, "grad_norm": 6.566634009856689, "learning_rate": 3.7300141919425865e-06, "loss": 0.4352, "step": 4386 }, { "epoch": 0.3562900998944205, "grad_norm": 4.301691821909096, "learning_rate": 3.729441628795736e-06, "loss": 0.4451, "step": 4387 }, { "epoch": 0.3563713148704621, "grad_norm": 3.538764460520504, "learning_rate": 3.728868980578031e-06, "loss": 0.4408, "step": 4388 }, { "epoch": 0.3564525298465037, "grad_norm": 8.009167633892602, "learning_rate": 3.7282962473290964e-06, "loss": 0.6824, "step": 4389 }, { "epoch": 0.3565337448225453, "grad_norm": 3.6954739187392414, "learning_rate": 3.727723429088562e-06, "loss": 0.6465, "step": 4390 }, { "epoch": 0.3566149597985869, "grad_norm": 5.835592705497657, "learning_rate": 3.7271505258960644e-06, "loss": 0.3879, "step": 4391 }, { "epoch": 0.35669617477462845, "grad_norm": 4.621090146943028, "learning_rate": 3.726577537791245e-06, "loss": 0.6456, "step": 4392 }, { "epoch": 0.35677738975067, "grad_norm": 7.345583344323817, "learning_rate": 3.726004464813752e-06, "loss": 0.5299, "step": 4393 }, { "epoch": 0.3568586047267116, "grad_norm": 5.887510819023688, "learning_rate": 3.725431307003238e-06, "loss": 0.4683, "step": 4394 }, { "epoch": 0.3569398197027532, "grad_norm": 4.4086817353864785, "learning_rate": 3.7248580643993625e-06, "loss": 0.5324, "step": 4395 }, { "epoch": 0.35702103467879476, "grad_norm": 5.875362001143908, "learning_rate": 3.724284737041792e-06, "loss": 0.5235, "step": 4396 }, { "epoch": 0.35710224965483633, "grad_norm": 4.84990196629755, "learning_rate": 3.723711324970197e-06, "loss": 0.4776, "step": 4397 }, { "epoch": 0.3571834646308779, "grad_norm": 4.446080183771416, "learning_rate": 3.723137828224255e-06, "loss": 0.5653, "step": 4398 }, { "epoch": 0.3572646796069195, "grad_norm": 11.963154592877663, "learning_rate": 3.722564246843648e-06, "loss": 0.6452, "step": 4399 }, { "epoch": 0.3573458945829611, "grad_norm": 5.038523322391007, "learning_rate": 3.7219905808680663e-06, "loss": 0.5563, "step": 4400 }, { "epoch": 0.3574271095590027, "grad_norm": 5.775055041451304, "learning_rate": 3.7214168303372033e-06, "loss": 0.4416, "step": 4401 }, { "epoch": 0.3575083245350443, "grad_norm": 13.159177334829804, "learning_rate": 3.72084299529076e-06, "loss": 0.6903, "step": 4402 }, { "epoch": 0.35758953951108585, "grad_norm": 6.4873380102452325, "learning_rate": 3.720269075768442e-06, "loss": 0.5353, "step": 4403 }, { "epoch": 0.35767075448712743, "grad_norm": 6.187953414055915, "learning_rate": 3.7196950718099636e-06, "loss": 0.5224, "step": 4404 }, { "epoch": 0.357751969463169, "grad_norm": 6.813848598601859, "learning_rate": 3.71912098345504e-06, "loss": 0.4981, "step": 4405 }, { "epoch": 0.3578331844392106, "grad_norm": 5.500592742588713, "learning_rate": 3.7185468107433966e-06, "loss": 0.6389, "step": 4406 }, { "epoch": 0.35791439941525216, "grad_norm": 4.764003153750818, "learning_rate": 3.7179725537147638e-06, "loss": 0.6802, "step": 4407 }, { "epoch": 0.35799561439129374, "grad_norm": 23.629219188415377, "learning_rate": 3.717398212408875e-06, "loss": 0.6526, "step": 4408 }, { "epoch": 0.3580768293673353, "grad_norm": 6.635582406761952, "learning_rate": 3.716823786865474e-06, "loss": 0.4644, "step": 4409 }, { "epoch": 0.3581580443433769, "grad_norm": 5.158135221215257, "learning_rate": 3.7162492771243068e-06, "loss": 0.5585, "step": 4410 }, { "epoch": 0.3582392593194185, "grad_norm": 3.8038224782647925, "learning_rate": 3.7156746832251266e-06, "loss": 0.6006, "step": 4411 }, { "epoch": 0.3583204742954601, "grad_norm": 3.935552912541937, "learning_rate": 3.7151000052076913e-06, "loss": 0.5972, "step": 4412 }, { "epoch": 0.3584016892715017, "grad_norm": 5.2873448761373885, "learning_rate": 3.7145252431117672e-06, "loss": 0.5808, "step": 4413 }, { "epoch": 0.35848290424754325, "grad_norm": 7.369629869130491, "learning_rate": 3.713950396977124e-06, "loss": 0.7058, "step": 4414 }, { "epoch": 0.35856411922358483, "grad_norm": 5.201267760709762, "learning_rate": 3.7133754668435377e-06, "loss": 0.8023, "step": 4415 }, { "epoch": 0.3586453341996264, "grad_norm": 3.1998164797424598, "learning_rate": 3.7128004527507916e-06, "loss": 0.596, "step": 4416 }, { "epoch": 0.358726549175668, "grad_norm": 5.052174245450653, "learning_rate": 3.712225354738672e-06, "loss": 0.6137, "step": 4417 }, { "epoch": 0.35880776415170956, "grad_norm": 4.804582850754989, "learning_rate": 3.7116501728469746e-06, "loss": 0.5863, "step": 4418 }, { "epoch": 0.35888897912775114, "grad_norm": 4.754942942838345, "learning_rate": 3.711074907115497e-06, "loss": 0.5453, "step": 4419 }, { "epoch": 0.3589701941037927, "grad_norm": 4.182528054308997, "learning_rate": 3.710499557584045e-06, "loss": 0.6604, "step": 4420 }, { "epoch": 0.3590514090798343, "grad_norm": 5.430420823976959, "learning_rate": 3.7099241242924306e-06, "loss": 0.6099, "step": 4421 }, { "epoch": 0.3591326240558759, "grad_norm": 6.298870190762428, "learning_rate": 3.7093486072804696e-06, "loss": 0.5275, "step": 4422 }, { "epoch": 0.3592138390319175, "grad_norm": 4.826705123382431, "learning_rate": 3.7087730065879862e-06, "loss": 0.7317, "step": 4423 }, { "epoch": 0.3592950540079591, "grad_norm": 13.171633048916624, "learning_rate": 3.708197322254807e-06, "loss": 0.6044, "step": 4424 }, { "epoch": 0.35937626898400066, "grad_norm": 4.774839704277342, "learning_rate": 3.7076215543207688e-06, "loss": 0.6298, "step": 4425 }, { "epoch": 0.35945748396004223, "grad_norm": 7.708019365223568, "learning_rate": 3.7070457028257095e-06, "loss": 0.4906, "step": 4426 }, { "epoch": 0.3595386989360838, "grad_norm": 6.0983843136049805, "learning_rate": 3.7064697678094765e-06, "loss": 0.459, "step": 4427 }, { "epoch": 0.3596199139121254, "grad_norm": 4.100219190551162, "learning_rate": 3.7058937493119195e-06, "loss": 0.553, "step": 4428 }, { "epoch": 0.35970112888816697, "grad_norm": 5.689077880893839, "learning_rate": 3.705317647372898e-06, "loss": 0.4724, "step": 4429 }, { "epoch": 0.35978234386420854, "grad_norm": 3.858145204321467, "learning_rate": 3.704741462032274e-06, "loss": 0.4586, "step": 4430 }, { "epoch": 0.3598635588402501, "grad_norm": 11.983671864011523, "learning_rate": 3.7041651933299167e-06, "loss": 0.5293, "step": 4431 }, { "epoch": 0.3599447738162917, "grad_norm": 5.870016078637099, "learning_rate": 3.703588841305702e-06, "loss": 0.6165, "step": 4432 }, { "epoch": 0.36002598879233333, "grad_norm": 11.720283579984393, "learning_rate": 3.7030124059995086e-06, "loss": 0.5869, "step": 4433 }, { "epoch": 0.3601072037683749, "grad_norm": 3.3466945140934072, "learning_rate": 3.7024358874512235e-06, "loss": 0.5978, "step": 4434 }, { "epoch": 0.3601884187444165, "grad_norm": 6.264649818329499, "learning_rate": 3.7018592857007386e-06, "loss": 0.5366, "step": 4435 }, { "epoch": 0.36026963372045806, "grad_norm": 5.505223118383528, "learning_rate": 3.701282600787952e-06, "loss": 0.5043, "step": 4436 }, { "epoch": 0.36035084869649964, "grad_norm": 3.9050434398042793, "learning_rate": 3.700705832752768e-06, "loss": 0.4023, "step": 4437 }, { "epoch": 0.3604320636725412, "grad_norm": 3.8486979131156818, "learning_rate": 3.700128981635094e-06, "loss": 0.7087, "step": 4438 }, { "epoch": 0.3605132786485828, "grad_norm": 4.97236792052597, "learning_rate": 3.6995520474748457e-06, "loss": 0.676, "step": 4439 }, { "epoch": 0.36059449362462437, "grad_norm": 5.54885373013609, "learning_rate": 3.698975030311946e-06, "loss": 0.4853, "step": 4440 }, { "epoch": 0.36067570860066595, "grad_norm": 4.00154383333082, "learning_rate": 3.6983979301863184e-06, "loss": 0.6239, "step": 4441 }, { "epoch": 0.3607569235767075, "grad_norm": 8.644737774833692, "learning_rate": 3.6978207471378965e-06, "loss": 0.648, "step": 4442 }, { "epoch": 0.3608381385527491, "grad_norm": 4.192845183053263, "learning_rate": 3.697243481206619e-06, "loss": 0.5129, "step": 4443 }, { "epoch": 0.36091935352879073, "grad_norm": 4.71407305163305, "learning_rate": 3.6966661324324278e-06, "loss": 0.6095, "step": 4444 }, { "epoch": 0.3610005685048323, "grad_norm": 7.220930209286327, "learning_rate": 3.6960887008552743e-06, "loss": 0.4677, "step": 4445 }, { "epoch": 0.3610817834808739, "grad_norm": 4.962625757719452, "learning_rate": 3.6955111865151127e-06, "loss": 0.5154, "step": 4446 }, { "epoch": 0.36116299845691546, "grad_norm": 10.29073366584436, "learning_rate": 3.6949335894519033e-06, "loss": 0.5977, "step": 4447 }, { "epoch": 0.36124421343295704, "grad_norm": 4.688756562689801, "learning_rate": 3.6943559097056155e-06, "loss": 0.4716, "step": 4448 }, { "epoch": 0.3613254284089986, "grad_norm": 5.833182973807545, "learning_rate": 3.6937781473162183e-06, "loss": 0.6092, "step": 4449 }, { "epoch": 0.3614066433850402, "grad_norm": 7.160318638424944, "learning_rate": 3.6932003023236916e-06, "loss": 0.6076, "step": 4450 }, { "epoch": 0.36148785836108177, "grad_norm": 9.311074485235293, "learning_rate": 3.692622374768019e-06, "loss": 0.5193, "step": 4451 }, { "epoch": 0.36156907333712335, "grad_norm": 12.351430012812038, "learning_rate": 3.69204436468919e-06, "loss": 0.4899, "step": 4452 }, { "epoch": 0.3616502883131649, "grad_norm": 5.254888121313631, "learning_rate": 3.6914662721272e-06, "loss": 0.5991, "step": 4453 }, { "epoch": 0.36173150328920656, "grad_norm": 4.86844411981734, "learning_rate": 3.6908880971220494e-06, "loss": 0.5256, "step": 4454 }, { "epoch": 0.36181271826524813, "grad_norm": 11.036443384726775, "learning_rate": 3.690309839713745e-06, "loss": 0.504, "step": 4455 }, { "epoch": 0.3618939332412897, "grad_norm": 3.81687688264698, "learning_rate": 3.6897314999423e-06, "loss": 0.4975, "step": 4456 }, { "epoch": 0.3619751482173313, "grad_norm": 3.12299941091288, "learning_rate": 3.6891530778477306e-06, "loss": 0.5774, "step": 4457 }, { "epoch": 0.36205636319337287, "grad_norm": 5.036876057872486, "learning_rate": 3.6885745734700628e-06, "loss": 0.4885, "step": 4458 }, { "epoch": 0.36213757816941444, "grad_norm": 8.62327137178916, "learning_rate": 3.687995986849325e-06, "loss": 0.6869, "step": 4459 }, { "epoch": 0.362218793145456, "grad_norm": 3.4708246203276443, "learning_rate": 3.687417318025551e-06, "loss": 0.5733, "step": 4460 }, { "epoch": 0.3623000081214976, "grad_norm": 4.973444884464531, "learning_rate": 3.686838567038784e-06, "loss": 0.6222, "step": 4461 }, { "epoch": 0.3623812230975392, "grad_norm": 5.667375423236751, "learning_rate": 3.68625973392907e-06, "loss": 0.5412, "step": 4462 }, { "epoch": 0.36246243807358075, "grad_norm": 6.103470863477208, "learning_rate": 3.6856808187364594e-06, "loss": 0.5223, "step": 4463 }, { "epoch": 0.3625436530496223, "grad_norm": 7.865435634910888, "learning_rate": 3.685101821501012e-06, "loss": 0.4664, "step": 4464 }, { "epoch": 0.36262486802566396, "grad_norm": 4.7253648152592715, "learning_rate": 3.6845227422627904e-06, "loss": 0.3839, "step": 4465 }, { "epoch": 0.36270608300170554, "grad_norm": 5.338042758405891, "learning_rate": 3.683943581061864e-06, "loss": 0.587, "step": 4466 }, { "epoch": 0.3627872979777471, "grad_norm": 5.179836245143301, "learning_rate": 3.683364337938308e-06, "loss": 0.5633, "step": 4467 }, { "epoch": 0.3628685129537887, "grad_norm": 6.353705665638407, "learning_rate": 3.6827850129322017e-06, "loss": 0.6154, "step": 4468 }, { "epoch": 0.36294972792983027, "grad_norm": 4.584366349679948, "learning_rate": 3.682205606083633e-06, "loss": 0.4864, "step": 4469 }, { "epoch": 0.36303094290587185, "grad_norm": 6.208825994136881, "learning_rate": 3.681626117432693e-06, "loss": 0.4577, "step": 4470 }, { "epoch": 0.3631121578819134, "grad_norm": 6.366010507681265, "learning_rate": 3.6810465470194796e-06, "loss": 0.4515, "step": 4471 }, { "epoch": 0.363193372857955, "grad_norm": 4.554722007028088, "learning_rate": 3.680466894884096e-06, "loss": 0.5824, "step": 4472 }, { "epoch": 0.3632745878339966, "grad_norm": 14.16578555542569, "learning_rate": 3.6798871610666497e-06, "loss": 0.5687, "step": 4473 }, { "epoch": 0.36335580281003815, "grad_norm": 6.940126197441687, "learning_rate": 3.679307345607257e-06, "loss": 0.5116, "step": 4474 }, { "epoch": 0.36343701778607973, "grad_norm": 11.501609688473003, "learning_rate": 3.6787274485460377e-06, "loss": 0.5931, "step": 4475 }, { "epoch": 0.36351823276212136, "grad_norm": 9.174827684495176, "learning_rate": 3.678147469923117e-06, "loss": 0.5515, "step": 4476 }, { "epoch": 0.36359944773816294, "grad_norm": 4.650571061924677, "learning_rate": 3.677567409778626e-06, "loss": 0.4882, "step": 4477 }, { "epoch": 0.3636806627142045, "grad_norm": 4.716688957959236, "learning_rate": 3.6769872681527036e-06, "loss": 0.5081, "step": 4478 }, { "epoch": 0.3637618776902461, "grad_norm": 6.347553656163701, "learning_rate": 3.6764070450854907e-06, "loss": 0.4636, "step": 4479 }, { "epoch": 0.36384309266628767, "grad_norm": 10.156227178827374, "learning_rate": 3.675826740617136e-06, "loss": 0.4511, "step": 4480 }, { "epoch": 0.36392430764232925, "grad_norm": 3.559954335727195, "learning_rate": 3.6752463547877946e-06, "loss": 0.6027, "step": 4481 }, { "epoch": 0.3640055226183708, "grad_norm": 4.72021021192598, "learning_rate": 3.674665887637625e-06, "loss": 0.5531, "step": 4482 }, { "epoch": 0.3640867375944124, "grad_norm": 11.263712709683006, "learning_rate": 3.6740853392067925e-06, "loss": 0.609, "step": 4483 }, { "epoch": 0.364167952570454, "grad_norm": 5.123832459796128, "learning_rate": 3.6735047095354693e-06, "loss": 0.458, "step": 4484 }, { "epoch": 0.36424916754649556, "grad_norm": 4.265471041437096, "learning_rate": 3.67292399866383e-06, "loss": 0.8243, "step": 4485 }, { "epoch": 0.36433038252253713, "grad_norm": 7.3136697189021405, "learning_rate": 3.6723432066320575e-06, "loss": 0.5279, "step": 4486 }, { "epoch": 0.36441159749857877, "grad_norm": 3.4708908613272618, "learning_rate": 3.67176233348034e-06, "loss": 0.5274, "step": 4487 }, { "epoch": 0.36449281247462034, "grad_norm": 3.2379003246205205, "learning_rate": 3.6711813792488706e-06, "loss": 0.4907, "step": 4488 }, { "epoch": 0.3645740274506619, "grad_norm": 4.61818923987606, "learning_rate": 3.6706003439778476e-06, "loss": 0.5109, "step": 4489 }, { "epoch": 0.3646552424267035, "grad_norm": 6.1238028357576555, "learning_rate": 3.6700192277074766e-06, "loss": 0.4814, "step": 4490 }, { "epoch": 0.3647364574027451, "grad_norm": 5.108669074454857, "learning_rate": 3.6694380304779676e-06, "loss": 0.4514, "step": 4491 }, { "epoch": 0.36481767237878665, "grad_norm": 4.555539206226715, "learning_rate": 3.6688567523295356e-06, "loss": 0.6225, "step": 4492 }, { "epoch": 0.3648988873548282, "grad_norm": 4.947835075771073, "learning_rate": 3.668275393302402e-06, "loss": 0.526, "step": 4493 }, { "epoch": 0.3649801023308698, "grad_norm": 4.987280419667079, "learning_rate": 3.667693953436795e-06, "loss": 0.5885, "step": 4494 }, { "epoch": 0.3650613173069114, "grad_norm": 5.33019974406346, "learning_rate": 3.6671124327729457e-06, "loss": 0.5906, "step": 4495 }, { "epoch": 0.36514253228295296, "grad_norm": 3.911550454532029, "learning_rate": 3.6665308313510927e-06, "loss": 0.4641, "step": 4496 }, { "epoch": 0.36522374725899454, "grad_norm": 7.327451479070638, "learning_rate": 3.665949149211481e-06, "loss": 0.5346, "step": 4497 }, { "epoch": 0.36530496223503617, "grad_norm": 6.679619506230374, "learning_rate": 3.6653673863943584e-06, "loss": 0.564, "step": 4498 }, { "epoch": 0.36538617721107775, "grad_norm": 7.979525255569825, "learning_rate": 3.6647855429399803e-06, "loss": 0.5412, "step": 4499 }, { "epoch": 0.3654673921871193, "grad_norm": 5.930675497170029, "learning_rate": 3.6642036188886072e-06, "loss": 0.5653, "step": 4500 }, { "epoch": 0.3655486071631609, "grad_norm": 4.872227933745589, "learning_rate": 3.663621614280505e-06, "loss": 0.6143, "step": 4501 }, { "epoch": 0.3656298221392025, "grad_norm": 4.942347208045127, "learning_rate": 3.663039529155945e-06, "loss": 0.5996, "step": 4502 }, { "epoch": 0.36571103711524405, "grad_norm": 3.9093013103451084, "learning_rate": 3.6624573635552056e-06, "loss": 0.6432, "step": 4503 }, { "epoch": 0.36579225209128563, "grad_norm": 6.258360673482065, "learning_rate": 3.6618751175185687e-06, "loss": 0.5461, "step": 4504 }, { "epoch": 0.3658734670673272, "grad_norm": 6.512518797476682, "learning_rate": 3.6612927910863235e-06, "loss": 0.4864, "step": 4505 }, { "epoch": 0.3659546820433688, "grad_norm": 4.622906130526112, "learning_rate": 3.660710384298762e-06, "loss": 0.6799, "step": 4506 }, { "epoch": 0.36603589701941036, "grad_norm": 3.157312017939539, "learning_rate": 3.6601278971961853e-06, "loss": 0.6479, "step": 4507 }, { "epoch": 0.36611711199545194, "grad_norm": 5.831757782243094, "learning_rate": 3.659545329818898e-06, "loss": 0.3765, "step": 4508 }, { "epoch": 0.36619832697149357, "grad_norm": 4.871188343403577, "learning_rate": 3.6589626822072105e-06, "loss": 0.5067, "step": 4509 }, { "epoch": 0.36627954194753515, "grad_norm": 3.3143048064685825, "learning_rate": 3.6583799544014397e-06, "loss": 0.7395, "step": 4510 }, { "epoch": 0.3663607569235767, "grad_norm": 6.5927342002006535, "learning_rate": 3.6577971464419064e-06, "loss": 0.4955, "step": 4511 }, { "epoch": 0.3664419718996183, "grad_norm": 5.678543552265716, "learning_rate": 3.6572142583689372e-06, "loss": 0.5946, "step": 4512 }, { "epoch": 0.3665231868756599, "grad_norm": 9.978179156767617, "learning_rate": 3.656631290222867e-06, "loss": 0.6531, "step": 4513 }, { "epoch": 0.36660440185170146, "grad_norm": 4.4736754108351215, "learning_rate": 3.656048242044033e-06, "loss": 0.545, "step": 4514 }, { "epoch": 0.36668561682774303, "grad_norm": 3.069124245403996, "learning_rate": 3.655465113872779e-06, "loss": 0.4614, "step": 4515 }, { "epoch": 0.3667668318037846, "grad_norm": 5.78003187791633, "learning_rate": 3.6548819057494533e-06, "loss": 0.5642, "step": 4516 }, { "epoch": 0.3668480467798262, "grad_norm": 6.651983645474769, "learning_rate": 3.6542986177144124e-06, "loss": 0.726, "step": 4517 }, { "epoch": 0.36692926175586776, "grad_norm": 5.61902839950427, "learning_rate": 3.6537152498080165e-06, "loss": 0.5679, "step": 4518 }, { "epoch": 0.36701047673190934, "grad_norm": 3.6610484670477597, "learning_rate": 3.653131802070631e-06, "loss": 0.6392, "step": 4519 }, { "epoch": 0.367091691707951, "grad_norm": 4.744064286312601, "learning_rate": 3.6525482745426277e-06, "loss": 0.5744, "step": 4520 }, { "epoch": 0.36717290668399255, "grad_norm": 5.713260181456036, "learning_rate": 3.6519646672643837e-06, "loss": 0.647, "step": 4521 }, { "epoch": 0.3672541216600341, "grad_norm": 4.789967162852778, "learning_rate": 3.6513809802762805e-06, "loss": 0.5129, "step": 4522 }, { "epoch": 0.3673353366360757, "grad_norm": 5.3194037700176215, "learning_rate": 3.6507972136187082e-06, "loss": 0.53, "step": 4523 }, { "epoch": 0.3674165516121173, "grad_norm": 4.077739067704118, "learning_rate": 3.650213367332059e-06, "loss": 0.4182, "step": 4524 }, { "epoch": 0.36749776658815886, "grad_norm": 5.963767765697606, "learning_rate": 3.6496294414567313e-06, "loss": 0.5525, "step": 4525 }, { "epoch": 0.36757898156420044, "grad_norm": 16.366327402411947, "learning_rate": 3.649045436033132e-06, "loss": 0.6207, "step": 4526 }, { "epoch": 0.367660196540242, "grad_norm": 4.600677095064222, "learning_rate": 3.6484613511016693e-06, "loss": 0.5597, "step": 4527 }, { "epoch": 0.3677414115162836, "grad_norm": 4.635569455138606, "learning_rate": 3.6478771867027585e-06, "loss": 0.5274, "step": 4528 }, { "epoch": 0.36782262649232517, "grad_norm": 13.8186155722183, "learning_rate": 3.647292942876822e-06, "loss": 0.3283, "step": 4529 }, { "epoch": 0.36790384146836674, "grad_norm": 4.921988696473463, "learning_rate": 3.646708619664286e-06, "loss": 0.666, "step": 4530 }, { "epoch": 0.3679850564444084, "grad_norm": 5.125112781896423, "learning_rate": 3.646124217105582e-06, "loss": 0.504, "step": 4531 }, { "epoch": 0.36806627142044995, "grad_norm": 4.160106735681602, "learning_rate": 3.645539735241148e-06, "loss": 0.4889, "step": 4532 }, { "epoch": 0.36814748639649153, "grad_norm": 11.018187691856639, "learning_rate": 3.6449551741114277e-06, "loss": 0.5853, "step": 4533 }, { "epoch": 0.3682287013725331, "grad_norm": 6.301041751261872, "learning_rate": 3.6443705337568683e-06, "loss": 0.5506, "step": 4534 }, { "epoch": 0.3683099163485747, "grad_norm": 2.9536104737688635, "learning_rate": 3.643785814217924e-06, "loss": 0.4832, "step": 4535 }, { "epoch": 0.36839113132461626, "grad_norm": 6.237830789303934, "learning_rate": 3.6432010155350556e-06, "loss": 0.5408, "step": 4536 }, { "epoch": 0.36847234630065784, "grad_norm": 5.326352917032812, "learning_rate": 3.642616137748727e-06, "loss": 0.5208, "step": 4537 }, { "epoch": 0.3685535612766994, "grad_norm": 6.248306452774462, "learning_rate": 3.6420311808994084e-06, "loss": 0.5739, "step": 4538 }, { "epoch": 0.368634776252741, "grad_norm": 4.147693647179564, "learning_rate": 3.641446145027577e-06, "loss": 0.6192, "step": 4539 }, { "epoch": 0.36871599122878257, "grad_norm": 6.469141494304793, "learning_rate": 3.640861030173713e-06, "loss": 0.5873, "step": 4540 }, { "epoch": 0.36879720620482415, "grad_norm": 7.33657703163369, "learning_rate": 3.6402758363783037e-06, "loss": 0.6911, "step": 4541 }, { "epoch": 0.3688784211808658, "grad_norm": 5.919960884683257, "learning_rate": 3.639690563681841e-06, "loss": 0.5873, "step": 4542 }, { "epoch": 0.36895963615690736, "grad_norm": 18.615597531360145, "learning_rate": 3.6391052121248233e-06, "loss": 0.5995, "step": 4543 }, { "epoch": 0.36904085113294893, "grad_norm": 7.146041859144082, "learning_rate": 3.6385197817477535e-06, "loss": 0.4588, "step": 4544 }, { "epoch": 0.3691220661089905, "grad_norm": 4.196856311260542, "learning_rate": 3.6379342725911402e-06, "loss": 0.4738, "step": 4545 }, { "epoch": 0.3692032810850321, "grad_norm": 4.917160222161063, "learning_rate": 3.637348684695498e-06, "loss": 0.6132, "step": 4546 }, { "epoch": 0.36928449606107366, "grad_norm": 3.5260428355619218, "learning_rate": 3.6367630181013457e-06, "loss": 0.3245, "step": 4547 }, { "epoch": 0.36936571103711524, "grad_norm": 5.486577536708969, "learning_rate": 3.6361772728492096e-06, "loss": 0.6407, "step": 4548 }, { "epoch": 0.3694469260131568, "grad_norm": 5.527101563539024, "learning_rate": 3.6355914489796185e-06, "loss": 0.645, "step": 4549 }, { "epoch": 0.3695281409891984, "grad_norm": 5.8535848869679405, "learning_rate": 3.6350055465331098e-06, "loss": 0.5124, "step": 4550 }, { "epoch": 0.36960935596523997, "grad_norm": 3.0247220854088592, "learning_rate": 3.6344195655502233e-06, "loss": 0.5123, "step": 4551 }, { "epoch": 0.36969057094128155, "grad_norm": 5.325222924482676, "learning_rate": 3.633833506071508e-06, "loss": 0.3802, "step": 4552 }, { "epoch": 0.3697717859173232, "grad_norm": 6.707191666111479, "learning_rate": 3.6332473681375146e-06, "loss": 0.4215, "step": 4553 }, { "epoch": 0.36985300089336476, "grad_norm": 6.3473511974694885, "learning_rate": 3.6326611517888e-06, "loss": 0.5247, "step": 4554 }, { "epoch": 0.36993421586940634, "grad_norm": 4.62624452294825, "learning_rate": 3.632074857065928e-06, "loss": 0.4702, "step": 4555 }, { "epoch": 0.3700154308454479, "grad_norm": 3.421641642355826, "learning_rate": 3.631488484009469e-06, "loss": 0.5228, "step": 4556 }, { "epoch": 0.3700966458214895, "grad_norm": 6.538600958475513, "learning_rate": 3.630902032659994e-06, "loss": 0.5604, "step": 4557 }, { "epoch": 0.37017786079753107, "grad_norm": 6.779583853613752, "learning_rate": 3.6303155030580834e-06, "loss": 0.5324, "step": 4558 }, { "epoch": 0.37025907577357264, "grad_norm": 4.786962125388907, "learning_rate": 3.629728895244323e-06, "loss": 0.4619, "step": 4559 }, { "epoch": 0.3703402907496142, "grad_norm": 5.970025695408968, "learning_rate": 3.6291422092593016e-06, "loss": 0.5009, "step": 4560 }, { "epoch": 0.3704215057256558, "grad_norm": 5.054283857837389, "learning_rate": 3.628555445143615e-06, "loss": 0.5614, "step": 4561 }, { "epoch": 0.3705027207016974, "grad_norm": 4.038161358633893, "learning_rate": 3.6279686029378646e-06, "loss": 0.5086, "step": 4562 }, { "epoch": 0.37058393567773895, "grad_norm": 4.169489051463636, "learning_rate": 3.6273816826826565e-06, "loss": 0.4921, "step": 4563 }, { "epoch": 0.3706651506537806, "grad_norm": 6.938462515442908, "learning_rate": 3.6267946844186023e-06, "loss": 0.5351, "step": 4564 }, { "epoch": 0.37074636562982216, "grad_norm": 4.177658193519644, "learning_rate": 3.6262076081863195e-06, "loss": 0.4798, "step": 4565 }, { "epoch": 0.37082758060586374, "grad_norm": 3.350413459828565, "learning_rate": 3.625620454026431e-06, "loss": 0.468, "step": 4566 }, { "epoch": 0.3709087955819053, "grad_norm": 5.601772436876186, "learning_rate": 3.625033221979564e-06, "loss": 0.5411, "step": 4567 }, { "epoch": 0.3709900105579469, "grad_norm": 5.46442194527407, "learning_rate": 3.624445912086352e-06, "loss": 0.6332, "step": 4568 }, { "epoch": 0.37107122553398847, "grad_norm": 5.8554252343208, "learning_rate": 3.6238585243874346e-06, "loss": 0.6209, "step": 4569 }, { "epoch": 0.37115244051003005, "grad_norm": 6.38111885179741, "learning_rate": 3.6232710589234556e-06, "loss": 0.5771, "step": 4570 }, { "epoch": 0.3712336554860716, "grad_norm": 4.123032307940846, "learning_rate": 3.6226835157350625e-06, "loss": 0.6513, "step": 4571 }, { "epoch": 0.3713148704621132, "grad_norm": 5.925335677806806, "learning_rate": 3.6220958948629137e-06, "loss": 0.6454, "step": 4572 }, { "epoch": 0.3713960854381548, "grad_norm": 4.123001819171552, "learning_rate": 3.621508196347667e-06, "loss": 0.5576, "step": 4573 }, { "epoch": 0.37147730041419635, "grad_norm": 6.967825569297834, "learning_rate": 3.6209204202299875e-06, "loss": 0.6256, "step": 4574 }, { "epoch": 0.371558515390238, "grad_norm": 6.347779650753475, "learning_rate": 3.6203325665505486e-06, "loss": 0.632, "step": 4575 }, { "epoch": 0.37163973036627956, "grad_norm": 3.8118384188679943, "learning_rate": 3.619744635350025e-06, "loss": 0.5919, "step": 4576 }, { "epoch": 0.37172094534232114, "grad_norm": 7.993997703219614, "learning_rate": 3.619156626669098e-06, "loss": 0.6338, "step": 4577 }, { "epoch": 0.3718021603183627, "grad_norm": 4.85057042384623, "learning_rate": 3.6185685405484566e-06, "loss": 0.5769, "step": 4578 }, { "epoch": 0.3718833752944043, "grad_norm": 4.65594156616089, "learning_rate": 3.6179803770287913e-06, "loss": 0.4109, "step": 4579 }, { "epoch": 0.37196459027044587, "grad_norm": 4.986423129791739, "learning_rate": 3.6173921361508012e-06, "loss": 0.3943, "step": 4580 }, { "epoch": 0.37204580524648745, "grad_norm": 6.284947322407295, "learning_rate": 3.616803817955189e-06, "loss": 0.5428, "step": 4581 }, { "epoch": 0.372127020222529, "grad_norm": 5.691836227913848, "learning_rate": 3.6162154224826627e-06, "loss": 0.5465, "step": 4582 }, { "epoch": 0.3722082351985706, "grad_norm": 8.95784788971493, "learning_rate": 3.615626949773937e-06, "loss": 0.4766, "step": 4583 }, { "epoch": 0.3722894501746122, "grad_norm": 4.7473375463729495, "learning_rate": 3.6150383998697315e-06, "loss": 0.5862, "step": 4584 }, { "epoch": 0.37237066515065376, "grad_norm": 5.4505240290945585, "learning_rate": 3.614449772810769e-06, "loss": 0.4044, "step": 4585 }, { "epoch": 0.3724518801266954, "grad_norm": 6.977930137629381, "learning_rate": 3.613861068637781e-06, "loss": 0.4798, "step": 4586 }, { "epoch": 0.37253309510273697, "grad_norm": 6.5403690266388335, "learning_rate": 3.6132722873915017e-06, "loss": 0.6262, "step": 4587 }, { "epoch": 0.37261431007877854, "grad_norm": 6.834395602891912, "learning_rate": 3.6126834291126724e-06, "loss": 0.4946, "step": 4588 }, { "epoch": 0.3726955250548201, "grad_norm": 3.6009773174215054, "learning_rate": 3.6120944938420384e-06, "loss": 0.552, "step": 4589 }, { "epoch": 0.3727767400308617, "grad_norm": 6.184936256815004, "learning_rate": 3.6115054816203504e-06, "loss": 0.4826, "step": 4590 }, { "epoch": 0.3728579550069033, "grad_norm": 4.521800781478745, "learning_rate": 3.6109163924883668e-06, "loss": 0.595, "step": 4591 }, { "epoch": 0.37293916998294485, "grad_norm": 5.129658876888231, "learning_rate": 3.6103272264868473e-06, "loss": 0.5459, "step": 4592 }, { "epoch": 0.37302038495898643, "grad_norm": 3.955425189400771, "learning_rate": 3.6097379836565604e-06, "loss": 0.7445, "step": 4593 }, { "epoch": 0.373101599935028, "grad_norm": 4.651648963857242, "learning_rate": 3.6091486640382785e-06, "loss": 0.5907, "step": 4594 }, { "epoch": 0.3731828149110696, "grad_norm": 3.3474693180043045, "learning_rate": 3.6085592676727786e-06, "loss": 0.332, "step": 4595 }, { "epoch": 0.37326402988711116, "grad_norm": 6.634033909104456, "learning_rate": 3.6079697946008453e-06, "loss": 0.4753, "step": 4596 }, { "epoch": 0.3733452448631528, "grad_norm": 5.83533093603698, "learning_rate": 3.607380244863265e-06, "loss": 0.5514, "step": 4597 }, { "epoch": 0.37342645983919437, "grad_norm": 4.575836973417688, "learning_rate": 3.6067906185008328e-06, "loss": 0.3765, "step": 4598 }, { "epoch": 0.37350767481523595, "grad_norm": 4.848076544705409, "learning_rate": 3.6062009155543483e-06, "loss": 0.5876, "step": 4599 }, { "epoch": 0.3735888897912775, "grad_norm": 5.836658897942606, "learning_rate": 3.6056111360646134e-06, "loss": 0.4459, "step": 4600 }, { "epoch": 0.3736701047673191, "grad_norm": 3.9899151337710306, "learning_rate": 3.6050212800724403e-06, "loss": 0.4275, "step": 4601 }, { "epoch": 0.3737513197433607, "grad_norm": 4.289228152113736, "learning_rate": 3.6044313476186433e-06, "loss": 0.5786, "step": 4602 }, { "epoch": 0.37383253471940225, "grad_norm": 5.434462312775084, "learning_rate": 3.603841338744041e-06, "loss": 0.4954, "step": 4603 }, { "epoch": 0.37391374969544383, "grad_norm": 3.5351327522854423, "learning_rate": 3.6032512534894597e-06, "loss": 0.6879, "step": 4604 }, { "epoch": 0.3739949646714854, "grad_norm": 5.549918211724535, "learning_rate": 3.602661091895732e-06, "loss": 0.43, "step": 4605 }, { "epoch": 0.374076179647527, "grad_norm": 4.083602872009912, "learning_rate": 3.602070854003692e-06, "loss": 0.5157, "step": 4606 }, { "epoch": 0.37415739462356856, "grad_norm": 7.5939379592765786, "learning_rate": 3.6014805398541815e-06, "loss": 0.5669, "step": 4607 }, { "epoch": 0.3742386095996102, "grad_norm": 3.8544939676204355, "learning_rate": 3.6008901494880467e-06, "loss": 0.4815, "step": 4608 }, { "epoch": 0.37431982457565177, "grad_norm": 6.619794981662382, "learning_rate": 3.60029968294614e-06, "loss": 0.8072, "step": 4609 }, { "epoch": 0.37440103955169335, "grad_norm": 9.067352524497199, "learning_rate": 3.599709140269319e-06, "loss": 0.409, "step": 4610 }, { "epoch": 0.3744822545277349, "grad_norm": 4.980681642925656, "learning_rate": 3.599118521498445e-06, "loss": 0.6207, "step": 4611 }, { "epoch": 0.3745634695037765, "grad_norm": 5.8795928027879585, "learning_rate": 3.598527826674387e-06, "loss": 0.4077, "step": 4612 }, { "epoch": 0.3746446844798181, "grad_norm": 6.664130906786667, "learning_rate": 3.597937055838017e-06, "loss": 0.4889, "step": 4613 }, { "epoch": 0.37472589945585966, "grad_norm": 3.512866216429364, "learning_rate": 3.5973462090302137e-06, "loss": 0.556, "step": 4614 }, { "epoch": 0.37480711443190123, "grad_norm": 7.915343003804626, "learning_rate": 3.5967552862918603e-06, "loss": 0.5955, "step": 4615 }, { "epoch": 0.3748883294079428, "grad_norm": 5.142753619848735, "learning_rate": 3.596164287663845e-06, "loss": 0.4933, "step": 4616 }, { "epoch": 0.3749695443839844, "grad_norm": 5.185440275795067, "learning_rate": 3.5955732131870626e-06, "loss": 0.9477, "step": 4617 }, { "epoch": 0.37505075936002596, "grad_norm": 12.990297623448031, "learning_rate": 3.594982062902412e-06, "loss": 0.6607, "step": 4618 }, { "epoch": 0.3751319743360676, "grad_norm": 4.182193438396625, "learning_rate": 3.5943908368507985e-06, "loss": 0.6018, "step": 4619 }, { "epoch": 0.3752131893121092, "grad_norm": 3.8071948876468733, "learning_rate": 3.59379953507313e-06, "loss": 0.5165, "step": 4620 }, { "epoch": 0.37529440428815075, "grad_norm": 4.47056756713465, "learning_rate": 3.593208157610324e-06, "loss": 0.4829, "step": 4621 }, { "epoch": 0.37537561926419233, "grad_norm": 4.861658029251169, "learning_rate": 3.592616704503298e-06, "loss": 0.5083, "step": 4622 }, { "epoch": 0.3754568342402339, "grad_norm": 4.194610134145657, "learning_rate": 3.5920251757929787e-06, "loss": 0.5095, "step": 4623 }, { "epoch": 0.3755380492162755, "grad_norm": 6.791531269079618, "learning_rate": 3.5914335715202976e-06, "loss": 0.4922, "step": 4624 }, { "epoch": 0.37561926419231706, "grad_norm": 5.2361522290402185, "learning_rate": 3.590841891726189e-06, "loss": 0.6261, "step": 4625 }, { "epoch": 0.37570047916835864, "grad_norm": 10.082840154908705, "learning_rate": 3.5902501364515945e-06, "loss": 0.5765, "step": 4626 }, { "epoch": 0.3757816941444002, "grad_norm": 6.544537456866241, "learning_rate": 3.5896583057374607e-06, "loss": 0.3993, "step": 4627 }, { "epoch": 0.3758629091204418, "grad_norm": 4.601861451010068, "learning_rate": 3.589066399624739e-06, "loss": 0.6202, "step": 4628 }, { "epoch": 0.37594412409648337, "grad_norm": 4.584192839631751, "learning_rate": 3.5884744181543868e-06, "loss": 0.5738, "step": 4629 }, { "epoch": 0.376025339072525, "grad_norm": 3.856688627858152, "learning_rate": 3.5878823613673652e-06, "loss": 0.4293, "step": 4630 }, { "epoch": 0.3761065540485666, "grad_norm": 3.3977632115418106, "learning_rate": 3.5872902293046417e-06, "loss": 0.4848, "step": 4631 }, { "epoch": 0.37618776902460815, "grad_norm": 5.9959616576129395, "learning_rate": 3.586698022007189e-06, "loss": 0.554, "step": 4632 }, { "epoch": 0.37626898400064973, "grad_norm": 4.593599550308795, "learning_rate": 3.5861057395159837e-06, "loss": 0.494, "step": 4633 }, { "epoch": 0.3763501989766913, "grad_norm": 3.9971449166055644, "learning_rate": 3.5855133818720106e-06, "loss": 0.3877, "step": 4634 }, { "epoch": 0.3764314139527329, "grad_norm": 10.899762207965088, "learning_rate": 3.5849209491162555e-06, "loss": 0.4489, "step": 4635 }, { "epoch": 0.37651262892877446, "grad_norm": 3.0450268530430598, "learning_rate": 3.5843284412897127e-06, "loss": 0.6891, "step": 4636 }, { "epoch": 0.37659384390481604, "grad_norm": 5.435757415343299, "learning_rate": 3.5837358584333814e-06, "loss": 0.6111, "step": 4637 }, { "epoch": 0.3766750588808576, "grad_norm": 4.283255422002926, "learning_rate": 3.583143200588263e-06, "loss": 0.4667, "step": 4638 }, { "epoch": 0.3767562738568992, "grad_norm": 2.6058528884262553, "learning_rate": 3.5825504677953684e-06, "loss": 0.5796, "step": 4639 }, { "epoch": 0.37683748883294077, "grad_norm": 5.535677617104462, "learning_rate": 3.581957660095711e-06, "loss": 0.579, "step": 4640 }, { "epoch": 0.3769187038089824, "grad_norm": 7.993394293530673, "learning_rate": 3.5813647775303084e-06, "loss": 0.531, "step": 4641 }, { "epoch": 0.376999918785024, "grad_norm": 6.421520761195438, "learning_rate": 3.580771820140187e-06, "loss": 0.5836, "step": 4642 }, { "epoch": 0.37708113376106556, "grad_norm": 4.4706924859775174, "learning_rate": 3.580178787966376e-06, "loss": 0.6717, "step": 4643 }, { "epoch": 0.37716234873710713, "grad_norm": 4.900856116172455, "learning_rate": 3.5795856810499085e-06, "loss": 0.5729, "step": 4644 }, { "epoch": 0.3772435637131487, "grad_norm": 3.182583890174541, "learning_rate": 3.5789924994318267e-06, "loss": 0.6078, "step": 4645 }, { "epoch": 0.3773247786891903, "grad_norm": 3.865947434731064, "learning_rate": 3.578399243153174e-06, "loss": 0.6606, "step": 4646 }, { "epoch": 0.37740599366523186, "grad_norm": 3.349782633519335, "learning_rate": 3.5778059122550007e-06, "loss": 0.544, "step": 4647 }, { "epoch": 0.37748720864127344, "grad_norm": 4.885217069177799, "learning_rate": 3.5772125067783624e-06, "loss": 0.5011, "step": 4648 }, { "epoch": 0.377568423617315, "grad_norm": 4.590378597766298, "learning_rate": 3.57661902676432e-06, "loss": 0.5884, "step": 4649 }, { "epoch": 0.3776496385933566, "grad_norm": 4.527590654884785, "learning_rate": 3.576025472253939e-06, "loss": 0.4238, "step": 4650 }, { "epoch": 0.3777308535693982, "grad_norm": 4.451704643586669, "learning_rate": 3.5754318432882907e-06, "loss": 0.4485, "step": 4651 }, { "epoch": 0.3778120685454398, "grad_norm": 5.927309052944885, "learning_rate": 3.5748381399084492e-06, "loss": 0.5697, "step": 4652 }, { "epoch": 0.3778932835214814, "grad_norm": 5.139872795558716, "learning_rate": 3.5742443621554977e-06, "loss": 0.5761, "step": 4653 }, { "epoch": 0.37797449849752296, "grad_norm": 6.387127931906058, "learning_rate": 3.5736505100705223e-06, "loss": 0.6974, "step": 4654 }, { "epoch": 0.37805571347356454, "grad_norm": 5.810815572808849, "learning_rate": 3.573056583694612e-06, "loss": 0.6572, "step": 4655 }, { "epoch": 0.3781369284496061, "grad_norm": 4.719558057872907, "learning_rate": 3.5724625830688667e-06, "loss": 0.5495, "step": 4656 }, { "epoch": 0.3782181434256477, "grad_norm": 6.208002944055565, "learning_rate": 3.571868508234386e-06, "loss": 0.4823, "step": 4657 }, { "epoch": 0.37829935840168927, "grad_norm": 3.68160438978963, "learning_rate": 3.5712743592322775e-06, "loss": 0.504, "step": 4658 }, { "epoch": 0.37838057337773084, "grad_norm": 4.064268236543746, "learning_rate": 3.570680136103653e-06, "loss": 0.571, "step": 4659 }, { "epoch": 0.3784617883537724, "grad_norm": 7.721913069819413, "learning_rate": 3.57008583888963e-06, "loss": 0.4338, "step": 4660 }, { "epoch": 0.378543003329814, "grad_norm": 8.025619823408007, "learning_rate": 3.569491467631329e-06, "loss": 0.4907, "step": 4661 }, { "epoch": 0.3786242183058556, "grad_norm": 4.4108267729746204, "learning_rate": 3.568897022369879e-06, "loss": 0.6222, "step": 4662 }, { "epoch": 0.3787054332818972, "grad_norm": 15.163395858330826, "learning_rate": 3.568302503146413e-06, "loss": 0.5193, "step": 4663 }, { "epoch": 0.3787866482579388, "grad_norm": 3.2649054015739525, "learning_rate": 3.567707910002068e-06, "loss": 0.5473, "step": 4664 }, { "epoch": 0.37886786323398036, "grad_norm": 5.082393091909739, "learning_rate": 3.5671132429779847e-06, "loss": 0.4679, "step": 4665 }, { "epoch": 0.37894907821002194, "grad_norm": 7.079599784559318, "learning_rate": 3.566518502115314e-06, "loss": 0.501, "step": 4666 }, { "epoch": 0.3790302931860635, "grad_norm": 6.472514959171711, "learning_rate": 3.565923687455207e-06, "loss": 0.6414, "step": 4667 }, { "epoch": 0.3791115081621051, "grad_norm": 6.442050896136885, "learning_rate": 3.565328799038822e-06, "loss": 0.4772, "step": 4668 }, { "epoch": 0.37919272313814667, "grad_norm": 6.208659634243696, "learning_rate": 3.5647338369073225e-06, "loss": 0.6315, "step": 4669 }, { "epoch": 0.37927393811418825, "grad_norm": 4.322414723790994, "learning_rate": 3.5641388011018764e-06, "loss": 0.4861, "step": 4670 }, { "epoch": 0.3793551530902298, "grad_norm": 6.68918224709947, "learning_rate": 3.563543691663657e-06, "loss": 0.6495, "step": 4671 }, { "epoch": 0.3794363680662714, "grad_norm": 6.412987095133581, "learning_rate": 3.5629485086338432e-06, "loss": 0.7778, "step": 4672 }, { "epoch": 0.379517583042313, "grad_norm": 4.2253670535048204, "learning_rate": 3.562353252053618e-06, "loss": 0.4884, "step": 4673 }, { "epoch": 0.3795987980183546, "grad_norm": 5.325608575778679, "learning_rate": 3.56175792196417e-06, "loss": 0.4152, "step": 4674 }, { "epoch": 0.3796800129943962, "grad_norm": 5.859119696052518, "learning_rate": 3.561162518406693e-06, "loss": 0.5208, "step": 4675 }, { "epoch": 0.37976122797043776, "grad_norm": 3.929599806942015, "learning_rate": 3.5605670414223866e-06, "loss": 0.6021, "step": 4676 }, { "epoch": 0.37984244294647934, "grad_norm": 10.494504954752514, "learning_rate": 3.559971491052453e-06, "loss": 0.6292, "step": 4677 }, { "epoch": 0.3799236579225209, "grad_norm": 4.64718386046636, "learning_rate": 3.559375867338103e-06, "loss": 0.4315, "step": 4678 }, { "epoch": 0.3800048728985625, "grad_norm": 4.74521085436876, "learning_rate": 3.5587801703205486e-06, "loss": 0.4433, "step": 4679 }, { "epoch": 0.3800860878746041, "grad_norm": 9.15577004594742, "learning_rate": 3.558184400041011e-06, "loss": 0.5353, "step": 4680 }, { "epoch": 0.38016730285064565, "grad_norm": 4.324557986539743, "learning_rate": 3.557588556540712e-06, "loss": 0.6834, "step": 4681 }, { "epoch": 0.3802485178266872, "grad_norm": 4.785865260255061, "learning_rate": 3.556992639860883e-06, "loss": 0.487, "step": 4682 }, { "epoch": 0.3803297328027288, "grad_norm": 7.647172576935408, "learning_rate": 3.5563966500427577e-06, "loss": 0.4949, "step": 4683 }, { "epoch": 0.3804109477787704, "grad_norm": 6.176240715797557, "learning_rate": 3.555800587127574e-06, "loss": 0.6732, "step": 4684 }, { "epoch": 0.380492162754812, "grad_norm": 2.9442581188680306, "learning_rate": 3.5552044511565783e-06, "loss": 0.6017, "step": 4685 }, { "epoch": 0.3805733777308536, "grad_norm": 3.6958273548877916, "learning_rate": 3.554608242171019e-06, "loss": 0.5588, "step": 4686 }, { "epoch": 0.38065459270689517, "grad_norm": 4.174711443180175, "learning_rate": 3.554011960212151e-06, "loss": 0.4675, "step": 4687 }, { "epoch": 0.38073580768293674, "grad_norm": 4.393276038211761, "learning_rate": 3.5534156053212333e-06, "loss": 0.5403, "step": 4688 }, { "epoch": 0.3808170226589783, "grad_norm": 4.053882314908021, "learning_rate": 3.5528191775395304e-06, "loss": 0.5421, "step": 4689 }, { "epoch": 0.3808982376350199, "grad_norm": 2.5408049268953437, "learning_rate": 3.552222676908313e-06, "loss": 0.4942, "step": 4690 }, { "epoch": 0.3809794526110615, "grad_norm": 10.13235114058035, "learning_rate": 3.5516261034688547e-06, "loss": 0.5421, "step": 4691 }, { "epoch": 0.38106066758710305, "grad_norm": 7.384145049649841, "learning_rate": 3.5510294572624358e-06, "loss": 0.4957, "step": 4692 }, { "epoch": 0.38114188256314463, "grad_norm": 11.958189835069302, "learning_rate": 3.5504327383303415e-06, "loss": 0.5927, "step": 4693 }, { "epoch": 0.3812230975391862, "grad_norm": 3.403027900822607, "learning_rate": 3.549835946713861e-06, "loss": 0.5608, "step": 4694 }, { "epoch": 0.3813043125152278, "grad_norm": 3.057095877975725, "learning_rate": 3.5492390824542887e-06, "loss": 0.5047, "step": 4695 }, { "epoch": 0.3813855274912694, "grad_norm": 6.33274087310348, "learning_rate": 3.5486421455929253e-06, "loss": 0.4971, "step": 4696 }, { "epoch": 0.381466742467311, "grad_norm": 4.1556831093555555, "learning_rate": 3.5480451361710744e-06, "loss": 0.6343, "step": 4697 }, { "epoch": 0.38154795744335257, "grad_norm": 3.1197068090047786, "learning_rate": 3.5474480542300475e-06, "loss": 0.6561, "step": 4698 }, { "epoch": 0.38162917241939415, "grad_norm": 4.160059988171047, "learning_rate": 3.5468508998111596e-06, "loss": 0.6627, "step": 4699 }, { "epoch": 0.3817103873954357, "grad_norm": 5.478519363239558, "learning_rate": 3.5462536729557284e-06, "loss": 0.6216, "step": 4700 }, { "epoch": 0.3817916023714773, "grad_norm": 6.8557884315946795, "learning_rate": 3.545656373705081e-06, "loss": 0.6758, "step": 4701 }, { "epoch": 0.3818728173475189, "grad_norm": 4.172187135110137, "learning_rate": 3.5450590021005465e-06, "loss": 0.4609, "step": 4702 }, { "epoch": 0.38195403232356046, "grad_norm": 7.593654425254116, "learning_rate": 3.5444615581834595e-06, "loss": 0.5303, "step": 4703 }, { "epoch": 0.38203524729960203, "grad_norm": 2.9965381248103466, "learning_rate": 3.5438640419951608e-06, "loss": 0.6348, "step": 4704 }, { "epoch": 0.3821164622756436, "grad_norm": 3.965887732610272, "learning_rate": 3.5432664535769952e-06, "loss": 0.7041, "step": 4705 }, { "epoch": 0.3821976772516852, "grad_norm": 10.865736295673367, "learning_rate": 3.5426687929703117e-06, "loss": 0.5335, "step": 4706 }, { "epoch": 0.3822788922277268, "grad_norm": 6.090855516494487, "learning_rate": 3.5420710602164665e-06, "loss": 0.6845, "step": 4707 }, { "epoch": 0.3823601072037684, "grad_norm": 3.356332999735362, "learning_rate": 3.5414732553568194e-06, "loss": 0.5373, "step": 4708 }, { "epoch": 0.38244132217981, "grad_norm": 6.469401787137639, "learning_rate": 3.5408753784327344e-06, "loss": 0.5448, "step": 4709 }, { "epoch": 0.38252253715585155, "grad_norm": 5.575865042285411, "learning_rate": 3.540277429485582e-06, "loss": 0.474, "step": 4710 }, { "epoch": 0.3826037521318931, "grad_norm": 5.15216848497321, "learning_rate": 3.539679408556737e-06, "loss": 0.4949, "step": 4711 }, { "epoch": 0.3826849671079347, "grad_norm": 4.447503975145806, "learning_rate": 3.5390813156875792e-06, "loss": 0.4684, "step": 4712 }, { "epoch": 0.3827661820839763, "grad_norm": 6.602421808698497, "learning_rate": 3.538483150919494e-06, "loss": 0.4992, "step": 4713 }, { "epoch": 0.38284739706001786, "grad_norm": 9.959981744936794, "learning_rate": 3.537884914293871e-06, "loss": 0.5644, "step": 4714 }, { "epoch": 0.38292861203605943, "grad_norm": 4.773698731849422, "learning_rate": 3.537286605852105e-06, "loss": 0.5316, "step": 4715 }, { "epoch": 0.383009827012101, "grad_norm": 5.852883004242007, "learning_rate": 3.536688225635595e-06, "loss": 0.4393, "step": 4716 }, { "epoch": 0.3830910419881426, "grad_norm": 3.5878863623629442, "learning_rate": 3.5360897736857464e-06, "loss": 0.7132, "step": 4717 }, { "epoch": 0.3831722569641842, "grad_norm": 5.620156517899439, "learning_rate": 3.5354912500439696e-06, "loss": 0.5248, "step": 4718 }, { "epoch": 0.3832534719402258, "grad_norm": 7.422807956148856, "learning_rate": 3.5348926547516783e-06, "loss": 0.5087, "step": 4719 }, { "epoch": 0.3833346869162674, "grad_norm": 5.38906725850609, "learning_rate": 3.534293987850291e-06, "loss": 0.7483, "step": 4720 }, { "epoch": 0.38341590189230895, "grad_norm": 4.767084586004944, "learning_rate": 3.5336952493812353e-06, "loss": 0.6177, "step": 4721 }, { "epoch": 0.38349711686835053, "grad_norm": 4.383751010173588, "learning_rate": 3.533096439385939e-06, "loss": 0.5982, "step": 4722 }, { "epoch": 0.3835783318443921, "grad_norm": 18.60580065511833, "learning_rate": 3.532497557905836e-06, "loss": 0.6254, "step": 4723 }, { "epoch": 0.3836595468204337, "grad_norm": 4.724647861890121, "learning_rate": 3.531898604982367e-06, "loss": 0.4813, "step": 4724 }, { "epoch": 0.38374076179647526, "grad_norm": 5.525218151959553, "learning_rate": 3.5312995806569754e-06, "loss": 0.604, "step": 4725 }, { "epoch": 0.38382197677251684, "grad_norm": 4.419767437379395, "learning_rate": 3.5307004849711114e-06, "loss": 0.6971, "step": 4726 }, { "epoch": 0.3839031917485584, "grad_norm": 4.847805495392348, "learning_rate": 3.530101317966228e-06, "loss": 0.4638, "step": 4727 }, { "epoch": 0.3839844067246, "grad_norm": 4.89939462212519, "learning_rate": 3.5295020796837854e-06, "loss": 0.503, "step": 4728 }, { "epoch": 0.3840656217006416, "grad_norm": 4.244557296775247, "learning_rate": 3.528902770165248e-06, "loss": 0.5719, "step": 4729 }, { "epoch": 0.3841468366766832, "grad_norm": 8.538711454427421, "learning_rate": 3.5283033894520836e-06, "loss": 0.4718, "step": 4730 }, { "epoch": 0.3842280516527248, "grad_norm": 3.6208666339316995, "learning_rate": 3.5277039375857677e-06, "loss": 0.5757, "step": 4731 }, { "epoch": 0.38430926662876636, "grad_norm": 6.751106985384788, "learning_rate": 3.5271044146077773e-06, "loss": 0.5931, "step": 4732 }, { "epoch": 0.38439048160480793, "grad_norm": 7.411379001602781, "learning_rate": 3.5265048205595976e-06, "loss": 0.4918, "step": 4733 }, { "epoch": 0.3844716965808495, "grad_norm": 4.0826091607694455, "learning_rate": 3.5259051554827175e-06, "loss": 0.4503, "step": 4734 }, { "epoch": 0.3845529115568911, "grad_norm": 4.479671138778023, "learning_rate": 3.5253054194186297e-06, "loss": 0.5551, "step": 4735 }, { "epoch": 0.38463412653293266, "grad_norm": 6.005050130154772, "learning_rate": 3.524705612408833e-06, "loss": 0.6683, "step": 4736 }, { "epoch": 0.38471534150897424, "grad_norm": 5.171375305373519, "learning_rate": 3.5241057344948317e-06, "loss": 0.5399, "step": 4737 }, { "epoch": 0.3847965564850158, "grad_norm": 10.084203556597553, "learning_rate": 3.523505785718133e-06, "loss": 0.6731, "step": 4738 }, { "epoch": 0.3848777714610574, "grad_norm": 6.878409019634097, "learning_rate": 3.5229057661202513e-06, "loss": 0.4689, "step": 4739 }, { "epoch": 0.384958986437099, "grad_norm": 5.680734133112059, "learning_rate": 3.5223056757427044e-06, "loss": 0.5154, "step": 4740 }, { "epoch": 0.3850402014131406, "grad_norm": 2.861081172022858, "learning_rate": 3.5217055146270144e-06, "loss": 0.5012, "step": 4741 }, { "epoch": 0.3851214163891822, "grad_norm": 7.512509192657636, "learning_rate": 3.5211052828147114e-06, "loss": 0.5743, "step": 4742 }, { "epoch": 0.38520263136522376, "grad_norm": 7.512493844062562, "learning_rate": 3.5205049803473257e-06, "loss": 0.5381, "step": 4743 }, { "epoch": 0.38528384634126533, "grad_norm": 4.916065054864969, "learning_rate": 3.5199046072663968e-06, "loss": 0.5969, "step": 4744 }, { "epoch": 0.3853650613173069, "grad_norm": 5.767505645700775, "learning_rate": 3.5193041636134673e-06, "loss": 0.896, "step": 4745 }, { "epoch": 0.3854462762933485, "grad_norm": 3.7412254239286566, "learning_rate": 3.518703649430083e-06, "loss": 0.6188, "step": 4746 }, { "epoch": 0.38552749126939007, "grad_norm": 5.626427073356312, "learning_rate": 3.518103064757798e-06, "loss": 0.4346, "step": 4747 }, { "epoch": 0.38560870624543164, "grad_norm": 5.8362139577169, "learning_rate": 3.51750240963817e-06, "loss": 0.5342, "step": 4748 }, { "epoch": 0.3856899212214732, "grad_norm": 4.982453547043835, "learning_rate": 3.516901684112759e-06, "loss": 0.6567, "step": 4749 }, { "epoch": 0.3857711361975148, "grad_norm": 4.093696003385388, "learning_rate": 3.5163008882231347e-06, "loss": 0.5791, "step": 4750 }, { "epoch": 0.38585235117355643, "grad_norm": 5.642536053815455, "learning_rate": 3.5157000220108674e-06, "loss": 0.4339, "step": 4751 }, { "epoch": 0.385933566149598, "grad_norm": 4.2134737972824, "learning_rate": 3.5150990855175337e-06, "loss": 0.6173, "step": 4752 }, { "epoch": 0.3860147811256396, "grad_norm": 7.7835339423428405, "learning_rate": 3.5144980787847155e-06, "loss": 0.7697, "step": 4753 }, { "epoch": 0.38609599610168116, "grad_norm": 4.445884072963403, "learning_rate": 3.5138970018539998e-06, "loss": 0.4681, "step": 4754 }, { "epoch": 0.38617721107772274, "grad_norm": 7.397822019738059, "learning_rate": 3.513295854766977e-06, "loss": 0.4439, "step": 4755 }, { "epoch": 0.3862584260537643, "grad_norm": 5.83757001159602, "learning_rate": 3.5126946375652443e-06, "loss": 0.5881, "step": 4756 }, { "epoch": 0.3863396410298059, "grad_norm": 5.649456450527899, "learning_rate": 3.512093350290402e-06, "loss": 0.4453, "step": 4757 }, { "epoch": 0.38642085600584747, "grad_norm": 4.735284890427706, "learning_rate": 3.511491992984057e-06, "loss": 0.6149, "step": 4758 }, { "epoch": 0.38650207098188905, "grad_norm": 9.984154759576063, "learning_rate": 3.510890565687818e-06, "loss": 0.4296, "step": 4759 }, { "epoch": 0.3865832859579306, "grad_norm": 5.043149999950322, "learning_rate": 3.5102890684433026e-06, "loss": 0.5894, "step": 4760 }, { "epoch": 0.3866645009339722, "grad_norm": 4.065716569234592, "learning_rate": 3.509687501292132e-06, "loss": 0.3679, "step": 4761 }, { "epoch": 0.38674571591001383, "grad_norm": 5.553031924098756, "learning_rate": 3.5090858642759273e-06, "loss": 0.4891, "step": 4762 }, { "epoch": 0.3868269308860554, "grad_norm": 9.139353955383072, "learning_rate": 3.5084841574363227e-06, "loss": 0.5457, "step": 4763 }, { "epoch": 0.386908145862097, "grad_norm": 5.211077508553389, "learning_rate": 3.507882380814952e-06, "loss": 0.7012, "step": 4764 }, { "epoch": 0.38698936083813856, "grad_norm": 3.9781818293321027, "learning_rate": 3.507280534453454e-06, "loss": 0.5159, "step": 4765 }, { "epoch": 0.38707057581418014, "grad_norm": 3.7855417874921455, "learning_rate": 3.5066786183934743e-06, "loss": 0.409, "step": 4766 }, { "epoch": 0.3871517907902217, "grad_norm": 5.51441031251894, "learning_rate": 3.5060766326766626e-06, "loss": 0.5835, "step": 4767 }, { "epoch": 0.3872330057662633, "grad_norm": 5.729949936654051, "learning_rate": 3.505474577344672e-06, "loss": 0.4933, "step": 4768 }, { "epoch": 0.38731422074230487, "grad_norm": 4.617510877977197, "learning_rate": 3.504872452439162e-06, "loss": 0.6594, "step": 4769 }, { "epoch": 0.38739543571834645, "grad_norm": 4.618436696346881, "learning_rate": 3.504270258001796e-06, "loss": 0.6615, "step": 4770 }, { "epoch": 0.387476650694388, "grad_norm": 3.771460379690749, "learning_rate": 3.503667994074244e-06, "loss": 0.2883, "step": 4771 }, { "epoch": 0.3875578656704296, "grad_norm": 6.252805853476455, "learning_rate": 3.5030656606981783e-06, "loss": 0.4846, "step": 4772 }, { "epoch": 0.38763908064647123, "grad_norm": 6.604659609809073, "learning_rate": 3.5024632579152775e-06, "loss": 0.4682, "step": 4773 }, { "epoch": 0.3877202956225128, "grad_norm": 8.26513746625276, "learning_rate": 3.501860785767225e-06, "loss": 0.5166, "step": 4774 }, { "epoch": 0.3878015105985544, "grad_norm": 4.4406417232924085, "learning_rate": 3.5012582442957077e-06, "loss": 0.7179, "step": 4775 }, { "epoch": 0.38788272557459597, "grad_norm": 13.64277952343109, "learning_rate": 3.5006556335424197e-06, "loss": 0.3686, "step": 4776 }, { "epoch": 0.38796394055063754, "grad_norm": 4.226503480824964, "learning_rate": 3.500052953549058e-06, "loss": 0.6139, "step": 4777 }, { "epoch": 0.3880451555266791, "grad_norm": 4.974191546797816, "learning_rate": 3.4994502043573237e-06, "loss": 0.7821, "step": 4778 }, { "epoch": 0.3881263705027207, "grad_norm": 6.7453632874753815, "learning_rate": 3.498847386008925e-06, "loss": 0.5492, "step": 4779 }, { "epoch": 0.3882075854787623, "grad_norm": 4.381360711445991, "learning_rate": 3.4982444985455744e-06, "loss": 0.5192, "step": 4780 }, { "epoch": 0.38828880045480385, "grad_norm": 4.91701938364453, "learning_rate": 3.4976415420089865e-06, "loss": 0.5407, "step": 4781 }, { "epoch": 0.38837001543084543, "grad_norm": 5.048580518545317, "learning_rate": 3.4970385164408837e-06, "loss": 0.466, "step": 4782 }, { "epoch": 0.388451230406887, "grad_norm": 4.973082626749845, "learning_rate": 3.496435421882994e-06, "loss": 0.7516, "step": 4783 }, { "epoch": 0.38853244538292864, "grad_norm": 5.743564392198817, "learning_rate": 3.4958322583770453e-06, "loss": 0.593, "step": 4784 }, { "epoch": 0.3886136603589702, "grad_norm": 4.979299976934781, "learning_rate": 3.495229025964775e-06, "loss": 0.4979, "step": 4785 }, { "epoch": 0.3886948753350118, "grad_norm": 12.613122878358682, "learning_rate": 3.494625724687923e-06, "loss": 0.6043, "step": 4786 }, { "epoch": 0.38877609031105337, "grad_norm": 3.85614729618497, "learning_rate": 3.494022354588235e-06, "loss": 0.7142, "step": 4787 }, { "epoch": 0.38885730528709495, "grad_norm": 6.046549551329485, "learning_rate": 3.493418915707461e-06, "loss": 0.4985, "step": 4788 }, { "epoch": 0.3889385202631365, "grad_norm": 5.004952219579293, "learning_rate": 3.4928154080873556e-06, "loss": 0.7017, "step": 4789 }, { "epoch": 0.3890197352391781, "grad_norm": 6.147470697065457, "learning_rate": 3.4922118317696785e-06, "loss": 0.5423, "step": 4790 }, { "epoch": 0.3891009502152197, "grad_norm": 6.876508550025277, "learning_rate": 3.491608186796193e-06, "loss": 0.6039, "step": 4791 }, { "epoch": 0.38918216519126125, "grad_norm": 10.41135603101845, "learning_rate": 3.49100447320867e-06, "loss": 0.5495, "step": 4792 }, { "epoch": 0.38926338016730283, "grad_norm": 3.362847757169651, "learning_rate": 3.4904006910488824e-06, "loss": 0.5267, "step": 4793 }, { "epoch": 0.3893445951433444, "grad_norm": 8.868539359623025, "learning_rate": 3.489796840358608e-06, "loss": 0.5027, "step": 4794 }, { "epoch": 0.38942581011938604, "grad_norm": 3.869570039313511, "learning_rate": 3.4891929211796303e-06, "loss": 0.6388, "step": 4795 }, { "epoch": 0.3895070250954276, "grad_norm": 18.46976613986865, "learning_rate": 3.488588933553739e-06, "loss": 0.7654, "step": 4796 }, { "epoch": 0.3895882400714692, "grad_norm": 4.462040282578044, "learning_rate": 3.4879848775227243e-06, "loss": 0.5233, "step": 4797 }, { "epoch": 0.38966945504751077, "grad_norm": 16.476898959715022, "learning_rate": 3.487380753128385e-06, "loss": 0.463, "step": 4798 }, { "epoch": 0.38975067002355235, "grad_norm": 4.641132700498576, "learning_rate": 3.4867765604125236e-06, "loss": 0.3472, "step": 4799 }, { "epoch": 0.3898318849995939, "grad_norm": 4.809948864229084, "learning_rate": 3.4861722994169466e-06, "loss": 0.4521, "step": 4800 }, { "epoch": 0.3899130999756355, "grad_norm": 4.6897556369620625, "learning_rate": 3.485567970183466e-06, "loss": 0.5308, "step": 4801 }, { "epoch": 0.3899943149516771, "grad_norm": 4.712889214741922, "learning_rate": 3.484963572753898e-06, "loss": 0.4595, "step": 4802 }, { "epoch": 0.39007552992771866, "grad_norm": 3.914385123227235, "learning_rate": 3.4843591071700627e-06, "loss": 0.6773, "step": 4803 }, { "epoch": 0.39015674490376023, "grad_norm": 3.7046943355774147, "learning_rate": 3.4837545734737877e-06, "loss": 0.4701, "step": 4804 }, { "epoch": 0.3902379598798018, "grad_norm": 5.273455209488689, "learning_rate": 3.483149971706902e-06, "loss": 0.5245, "step": 4805 }, { "epoch": 0.39031917485584344, "grad_norm": 5.259747071201792, "learning_rate": 3.482545301911242e-06, "loss": 0.58, "step": 4806 }, { "epoch": 0.390400389831885, "grad_norm": 3.865781563186537, "learning_rate": 3.4819405641286476e-06, "loss": 0.5728, "step": 4807 }, { "epoch": 0.3904816048079266, "grad_norm": 4.430734119933856, "learning_rate": 3.481335758400962e-06, "loss": 0.406, "step": 4808 }, { "epoch": 0.3905628197839682, "grad_norm": 3.1468197398544295, "learning_rate": 3.480730884770036e-06, "loss": 0.5959, "step": 4809 }, { "epoch": 0.39064403476000975, "grad_norm": 4.928724212153909, "learning_rate": 3.4801259432777236e-06, "loss": 0.534, "step": 4810 }, { "epoch": 0.3907252497360513, "grad_norm": 5.338771481365462, "learning_rate": 3.479520933965882e-06, "loss": 0.6451, "step": 4811 }, { "epoch": 0.3908064647120929, "grad_norm": 4.865233384961484, "learning_rate": 3.4789158568763777e-06, "loss": 0.6479, "step": 4812 }, { "epoch": 0.3908876796881345, "grad_norm": 6.708639239866096, "learning_rate": 3.4783107120510758e-06, "loss": 0.4542, "step": 4813 }, { "epoch": 0.39096889466417606, "grad_norm": 5.797570016494007, "learning_rate": 3.4777054995318493e-06, "loss": 0.6198, "step": 4814 }, { "epoch": 0.39105010964021764, "grad_norm": 3.723719927731895, "learning_rate": 3.4771002193605783e-06, "loss": 0.5544, "step": 4815 }, { "epoch": 0.3911313246162592, "grad_norm": 4.734496419671341, "learning_rate": 3.4764948715791425e-06, "loss": 0.5319, "step": 4816 }, { "epoch": 0.39121253959230085, "grad_norm": 3.483732371245711, "learning_rate": 3.47588945622943e-06, "loss": 0.6067, "step": 4817 }, { "epoch": 0.3912937545683424, "grad_norm": 3.7596387481376143, "learning_rate": 3.4752839733533315e-06, "loss": 0.7024, "step": 4818 }, { "epoch": 0.391374969544384, "grad_norm": 6.235595740178511, "learning_rate": 3.4746784229927445e-06, "loss": 0.5705, "step": 4819 }, { "epoch": 0.3914561845204256, "grad_norm": 4.914796686357341, "learning_rate": 3.4740728051895683e-06, "loss": 0.4127, "step": 4820 }, { "epoch": 0.39153739949646715, "grad_norm": 6.037035106034122, "learning_rate": 3.4734671199857093e-06, "loss": 0.461, "step": 4821 }, { "epoch": 0.39161861447250873, "grad_norm": 4.084676397060007, "learning_rate": 3.4728613674230777e-06, "loss": 0.4533, "step": 4822 }, { "epoch": 0.3916998294485503, "grad_norm": 5.820986362019269, "learning_rate": 3.472255547543589e-06, "loss": 0.4948, "step": 4823 }, { "epoch": 0.3917810444245919, "grad_norm": 6.625284347900165, "learning_rate": 3.4716496603891605e-06, "loss": 0.6856, "step": 4824 }, { "epoch": 0.39186225940063346, "grad_norm": 4.751431998722198, "learning_rate": 3.471043706001719e-06, "loss": 0.5442, "step": 4825 }, { "epoch": 0.39194347437667504, "grad_norm": 3.789894887233057, "learning_rate": 3.4704376844231922e-06, "loss": 0.5568, "step": 4826 }, { "epoch": 0.3920246893527166, "grad_norm": 3.666796443871811, "learning_rate": 3.4698315956955125e-06, "loss": 0.6599, "step": 4827 }, { "epoch": 0.39210590432875825, "grad_norm": 3.6434242436779973, "learning_rate": 3.46922543986062e-06, "loss": 0.4903, "step": 4828 }, { "epoch": 0.3921871193047998, "grad_norm": 7.08366419413291, "learning_rate": 3.468619216960457e-06, "loss": 0.6005, "step": 4829 }, { "epoch": 0.3922683342808414, "grad_norm": 4.495677796038025, "learning_rate": 3.46801292703697e-06, "loss": 0.5686, "step": 4830 }, { "epoch": 0.392349549256883, "grad_norm": 6.22783462766733, "learning_rate": 3.467406570132112e-06, "loss": 0.402, "step": 4831 }, { "epoch": 0.39243076423292456, "grad_norm": 6.8479413586138165, "learning_rate": 3.4668001462878386e-06, "loss": 0.4031, "step": 4832 }, { "epoch": 0.39251197920896613, "grad_norm": 11.848480365877366, "learning_rate": 3.466193655546112e-06, "loss": 0.4203, "step": 4833 }, { "epoch": 0.3925931941850077, "grad_norm": 4.1344833900535205, "learning_rate": 3.465587097948898e-06, "loss": 0.4155, "step": 4834 }, { "epoch": 0.3926744091610493, "grad_norm": 4.430449430971486, "learning_rate": 3.4649804735381675e-06, "loss": 0.6314, "step": 4835 }, { "epoch": 0.39275562413709086, "grad_norm": 5.347837303662017, "learning_rate": 3.4643737823558947e-06, "loss": 0.5447, "step": 4836 }, { "epoch": 0.39283683911313244, "grad_norm": 6.546391191124815, "learning_rate": 3.463767024444061e-06, "loss": 0.5371, "step": 4837 }, { "epoch": 0.392918054089174, "grad_norm": 4.80456942652959, "learning_rate": 3.4631601998446484e-06, "loss": 0.638, "step": 4838 }, { "epoch": 0.39299926906521565, "grad_norm": 5.280861827964533, "learning_rate": 3.4625533085996495e-06, "loss": 0.4545, "step": 4839 }, { "epoch": 0.3930804840412572, "grad_norm": 6.5672307967606995, "learning_rate": 3.4619463507510536e-06, "loss": 0.4733, "step": 4840 }, { "epoch": 0.3931616990172988, "grad_norm": 4.308991992577792, "learning_rate": 3.4613393263408625e-06, "loss": 0.5039, "step": 4841 }, { "epoch": 0.3932429139933404, "grad_norm": 5.784233850167831, "learning_rate": 3.4607322354110785e-06, "loss": 0.5651, "step": 4842 }, { "epoch": 0.39332412896938196, "grad_norm": 5.463637822033822, "learning_rate": 3.4601250780037064e-06, "loss": 0.6328, "step": 4843 }, { "epoch": 0.39340534394542354, "grad_norm": 6.116722144553743, "learning_rate": 3.4595178541607616e-06, "loss": 0.63, "step": 4844 }, { "epoch": 0.3934865589214651, "grad_norm": 5.568944318775315, "learning_rate": 3.45891056392426e-06, "loss": 0.4404, "step": 4845 }, { "epoch": 0.3935677738975067, "grad_norm": 5.00032757592686, "learning_rate": 3.4583032073362216e-06, "loss": 0.4921, "step": 4846 }, { "epoch": 0.39364898887354827, "grad_norm": 6.833348088070499, "learning_rate": 3.4576957844386728e-06, "loss": 0.5459, "step": 4847 }, { "epoch": 0.39373020384958984, "grad_norm": 3.622191745550694, "learning_rate": 3.4570882952736445e-06, "loss": 0.5338, "step": 4848 }, { "epoch": 0.3938114188256314, "grad_norm": 3.7561292429825617, "learning_rate": 3.4564807398831716e-06, "loss": 0.5432, "step": 4849 }, { "epoch": 0.39389263380167305, "grad_norm": 6.7697920687749145, "learning_rate": 3.4558731183092936e-06, "loss": 0.4724, "step": 4850 }, { "epoch": 0.39397384877771463, "grad_norm": 17.901118220763788, "learning_rate": 3.4552654305940546e-06, "loss": 0.4965, "step": 4851 }, { "epoch": 0.3940550637537562, "grad_norm": 5.591762564408774, "learning_rate": 3.4546576767795036e-06, "loss": 0.4596, "step": 4852 }, { "epoch": 0.3941362787297978, "grad_norm": 4.5991945379468975, "learning_rate": 3.4540498569076935e-06, "loss": 0.6449, "step": 4853 }, { "epoch": 0.39421749370583936, "grad_norm": 4.012140667999568, "learning_rate": 3.453441971020682e-06, "loss": 0.5522, "step": 4854 }, { "epoch": 0.39429870868188094, "grad_norm": 4.026685263806628, "learning_rate": 3.4528340191605336e-06, "loss": 0.5232, "step": 4855 }, { "epoch": 0.3943799236579225, "grad_norm": 3.9644370555022226, "learning_rate": 3.452226001369313e-06, "loss": 0.4068, "step": 4856 }, { "epoch": 0.3944611386339641, "grad_norm": 4.955057245622184, "learning_rate": 3.451617917689093e-06, "loss": 0.5034, "step": 4857 }, { "epoch": 0.39454235361000567, "grad_norm": 6.121308963517397, "learning_rate": 3.4510097681619497e-06, "loss": 0.5604, "step": 4858 }, { "epoch": 0.39462356858604725, "grad_norm": 3.955911011778945, "learning_rate": 3.4504015528299633e-06, "loss": 0.742, "step": 4859 }, { "epoch": 0.3947047835620888, "grad_norm": 3.416220952136702, "learning_rate": 3.449793271735219e-06, "loss": 0.4727, "step": 4860 }, { "epoch": 0.39478599853813046, "grad_norm": 4.42715063532882, "learning_rate": 3.4491849249198074e-06, "loss": 0.6083, "step": 4861 }, { "epoch": 0.39486721351417203, "grad_norm": 5.7756208575555466, "learning_rate": 3.4485765124258223e-06, "loss": 0.537, "step": 4862 }, { "epoch": 0.3949484284902136, "grad_norm": 5.659261574392475, "learning_rate": 3.4479680342953627e-06, "loss": 0.4879, "step": 4863 }, { "epoch": 0.3950296434662552, "grad_norm": 3.9294665757315625, "learning_rate": 3.4473594905705326e-06, "loss": 0.5727, "step": 4864 }, { "epoch": 0.39511085844229676, "grad_norm": 4.426724017012553, "learning_rate": 3.446750881293439e-06, "loss": 0.5823, "step": 4865 }, { "epoch": 0.39519207341833834, "grad_norm": 4.23549319138764, "learning_rate": 3.4461422065061957e-06, "loss": 0.594, "step": 4866 }, { "epoch": 0.3952732883943799, "grad_norm": 4.069048549480606, "learning_rate": 3.4455334662509186e-06, "loss": 0.5666, "step": 4867 }, { "epoch": 0.3953545033704215, "grad_norm": 9.920612867004065, "learning_rate": 3.44492466056973e-06, "loss": 0.5656, "step": 4868 }, { "epoch": 0.39543571834646307, "grad_norm": 7.365805382978001, "learning_rate": 3.4443157895047556e-06, "loss": 0.5565, "step": 4869 }, { "epoch": 0.39551693332250465, "grad_norm": 4.079639656340487, "learning_rate": 3.4437068530981266e-06, "loss": 0.5486, "step": 4870 }, { "epoch": 0.3955981482985462, "grad_norm": 5.1014095863826805, "learning_rate": 3.4430978513919777e-06, "loss": 0.4873, "step": 4871 }, { "epoch": 0.39567936327458786, "grad_norm": 6.57666055283231, "learning_rate": 3.4424887844284492e-06, "loss": 0.5425, "step": 4872 }, { "epoch": 0.39576057825062944, "grad_norm": 4.768382603290578, "learning_rate": 3.4418796522496845e-06, "loss": 0.3941, "step": 4873 }, { "epoch": 0.395841793226671, "grad_norm": 6.114180863563064, "learning_rate": 3.4412704548978326e-06, "loss": 0.7732, "step": 4874 }, { "epoch": 0.3959230082027126, "grad_norm": 3.93227698865023, "learning_rate": 3.4406611924150468e-06, "loss": 0.5043, "step": 4875 }, { "epoch": 0.39600422317875417, "grad_norm": 5.0863186506106794, "learning_rate": 3.440051864843485e-06, "loss": 0.4199, "step": 4876 }, { "epoch": 0.39608543815479574, "grad_norm": 8.186606675125352, "learning_rate": 3.4394424722253095e-06, "loss": 0.5594, "step": 4877 }, { "epoch": 0.3961666531308373, "grad_norm": 3.1309990632230185, "learning_rate": 3.4388330146026865e-06, "loss": 0.515, "step": 4878 }, { "epoch": 0.3962478681068789, "grad_norm": 3.0856049239044676, "learning_rate": 3.438223492017787e-06, "loss": 0.65, "step": 4879 }, { "epoch": 0.3963290830829205, "grad_norm": 5.758345655080891, "learning_rate": 3.4376139045127886e-06, "loss": 0.5401, "step": 4880 }, { "epoch": 0.39641029805896205, "grad_norm": 6.386676934781451, "learning_rate": 3.4370042521298697e-06, "loss": 0.63, "step": 4881 }, { "epoch": 0.39649151303500363, "grad_norm": 4.289237032932299, "learning_rate": 3.436394534911216e-06, "loss": 0.4628, "step": 4882 }, { "epoch": 0.39657272801104526, "grad_norm": 4.493221527603865, "learning_rate": 3.4357847528990157e-06, "loss": 0.4507, "step": 4883 }, { "epoch": 0.39665394298708684, "grad_norm": 5.931364030127042, "learning_rate": 3.4351749061354634e-06, "loss": 0.4214, "step": 4884 }, { "epoch": 0.3967351579631284, "grad_norm": 5.201076643919345, "learning_rate": 3.4345649946627567e-06, "loss": 0.4338, "step": 4885 }, { "epoch": 0.39681637293917, "grad_norm": 6.755107310201019, "learning_rate": 3.4339550185230985e-06, "loss": 0.633, "step": 4886 }, { "epoch": 0.39689758791521157, "grad_norm": 6.012302549365938, "learning_rate": 3.4333449777586957e-06, "loss": 0.4477, "step": 4887 }, { "epoch": 0.39697880289125315, "grad_norm": 5.404553280600934, "learning_rate": 3.432734872411761e-06, "loss": 0.555, "step": 4888 }, { "epoch": 0.3970600178672947, "grad_norm": 6.802478190345189, "learning_rate": 3.4321247025245084e-06, "loss": 0.56, "step": 4889 }, { "epoch": 0.3971412328433363, "grad_norm": 4.629306992412694, "learning_rate": 3.4315144681391604e-06, "loss": 0.5331, "step": 4890 }, { "epoch": 0.3972224478193779, "grad_norm": 4.785707649092083, "learning_rate": 3.430904169297941e-06, "loss": 0.9165, "step": 4891 }, { "epoch": 0.39730366279541945, "grad_norm": 7.873306295295412, "learning_rate": 3.4302938060430794e-06, "loss": 0.6152, "step": 4892 }, { "epoch": 0.39738487777146103, "grad_norm": 3.4652132108801195, "learning_rate": 3.429683378416811e-06, "loss": 0.4936, "step": 4893 }, { "epoch": 0.39746609274750266, "grad_norm": 4.301920992179896, "learning_rate": 3.429072886461372e-06, "loss": 0.5562, "step": 4894 }, { "epoch": 0.39754730772354424, "grad_norm": 5.010865178574669, "learning_rate": 3.428462330219007e-06, "loss": 0.5153, "step": 4895 }, { "epoch": 0.3976285226995858, "grad_norm": 6.077200433598698, "learning_rate": 3.4278517097319617e-06, "loss": 0.4894, "step": 4896 }, { "epoch": 0.3977097376756274, "grad_norm": 4.201260945415446, "learning_rate": 3.4272410250424893e-06, "loss": 0.5466, "step": 4897 }, { "epoch": 0.39779095265166897, "grad_norm": 4.064316394961935, "learning_rate": 3.4266302761928453e-06, "loss": 0.499, "step": 4898 }, { "epoch": 0.39787216762771055, "grad_norm": 10.729804958408987, "learning_rate": 3.4260194632252903e-06, "loss": 0.6487, "step": 4899 }, { "epoch": 0.3979533826037521, "grad_norm": 5.44190759303068, "learning_rate": 3.4254085861820895e-06, "loss": 0.4588, "step": 4900 }, { "epoch": 0.3980345975797937, "grad_norm": 6.394017389790841, "learning_rate": 3.424797645105512e-06, "loss": 0.5658, "step": 4901 }, { "epoch": 0.3981158125558353, "grad_norm": 5.684350734119721, "learning_rate": 3.4241866400378315e-06, "loss": 0.5158, "step": 4902 }, { "epoch": 0.39819702753187686, "grad_norm": 3.9864422041154275, "learning_rate": 3.423575571021327e-06, "loss": 0.4298, "step": 4903 }, { "epoch": 0.39827824250791843, "grad_norm": 5.696110565863003, "learning_rate": 3.4229644380982817e-06, "loss": 0.6485, "step": 4904 }, { "epoch": 0.39835945748396007, "grad_norm": 8.085273365710513, "learning_rate": 3.4223532413109807e-06, "loss": 0.5311, "step": 4905 }, { "epoch": 0.39844067246000164, "grad_norm": 21.38684299111223, "learning_rate": 3.4217419807017177e-06, "loss": 0.3467, "step": 4906 }, { "epoch": 0.3985218874360432, "grad_norm": 6.879258693117489, "learning_rate": 3.4211306563127876e-06, "loss": 0.3502, "step": 4907 }, { "epoch": 0.3986031024120848, "grad_norm": 5.751298879545744, "learning_rate": 3.4205192681864905e-06, "loss": 0.5344, "step": 4908 }, { "epoch": 0.3986843173881264, "grad_norm": 9.337940570841411, "learning_rate": 3.4199078163651335e-06, "loss": 0.5033, "step": 4909 }, { "epoch": 0.39876553236416795, "grad_norm": 4.8306108732275925, "learning_rate": 3.419296300891023e-06, "loss": 0.518, "step": 4910 }, { "epoch": 0.39884674734020953, "grad_norm": 7.933167629628174, "learning_rate": 3.418684721806474e-06, "loss": 0.4499, "step": 4911 }, { "epoch": 0.3989279623162511, "grad_norm": 6.472677681508684, "learning_rate": 3.418073079153804e-06, "loss": 0.534, "step": 4912 }, { "epoch": 0.3990091772922927, "grad_norm": 6.504622980241599, "learning_rate": 3.4174613729753364e-06, "loss": 0.569, "step": 4913 }, { "epoch": 0.39909039226833426, "grad_norm": 7.727056330599826, "learning_rate": 3.4168496033133968e-06, "loss": 0.5061, "step": 4914 }, { "epoch": 0.39917160724437584, "grad_norm": 4.697545157125827, "learning_rate": 3.416237770210317e-06, "loss": 0.4645, "step": 4915 }, { "epoch": 0.39925282222041747, "grad_norm": 4.809263186617258, "learning_rate": 3.415625873708433e-06, "loss": 0.4595, "step": 4916 }, { "epoch": 0.39933403719645905, "grad_norm": 10.22371082626964, "learning_rate": 3.4150139138500843e-06, "loss": 0.4436, "step": 4917 }, { "epoch": 0.3994152521725006, "grad_norm": 5.9377177585980245, "learning_rate": 3.4144018906776155e-06, "loss": 0.5012, "step": 4918 }, { "epoch": 0.3994964671485422, "grad_norm": 5.523225188806799, "learning_rate": 3.413789804233375e-06, "loss": 0.6356, "step": 4919 }, { "epoch": 0.3995776821245838, "grad_norm": 4.922432155658762, "learning_rate": 3.413177654559717e-06, "loss": 0.4785, "step": 4920 }, { "epoch": 0.39965889710062535, "grad_norm": 8.522209977514363, "learning_rate": 3.4125654416989975e-06, "loss": 0.4049, "step": 4921 }, { "epoch": 0.39974011207666693, "grad_norm": 4.166534006928631, "learning_rate": 3.411953165693579e-06, "loss": 0.5051, "step": 4922 }, { "epoch": 0.3998213270527085, "grad_norm": 7.556966122206325, "learning_rate": 3.4113408265858282e-06, "loss": 0.5134, "step": 4923 }, { "epoch": 0.3999025420287501, "grad_norm": 4.46432321290014, "learning_rate": 3.4107284244181154e-06, "loss": 0.5773, "step": 4924 }, { "epoch": 0.39998375700479166, "grad_norm": 3.3296185748575837, "learning_rate": 3.4101159592328148e-06, "loss": 0.492, "step": 4925 }, { "epoch": 0.40006497198083324, "grad_norm": 4.808058470791979, "learning_rate": 3.409503431072308e-06, "loss": 0.4053, "step": 4926 }, { "epoch": 0.40014618695687487, "grad_norm": 5.658421289602297, "learning_rate": 3.408890839978976e-06, "loss": 0.515, "step": 4927 }, { "epoch": 0.40022740193291645, "grad_norm": 6.896229712559321, "learning_rate": 3.4082781859952087e-06, "loss": 0.5547, "step": 4928 }, { "epoch": 0.400308616908958, "grad_norm": 5.824806238919424, "learning_rate": 3.407665469163398e-06, "loss": 0.6033, "step": 4929 }, { "epoch": 0.4003898318849996, "grad_norm": 4.135425915903601, "learning_rate": 3.4070526895259403e-06, "loss": 0.5083, "step": 4930 }, { "epoch": 0.4004710468610412, "grad_norm": 5.133129422170843, "learning_rate": 3.4064398471252367e-06, "loss": 0.5962, "step": 4931 }, { "epoch": 0.40055226183708276, "grad_norm": 5.46292327134982, "learning_rate": 3.4058269420036937e-06, "loss": 0.4848, "step": 4932 }, { "epoch": 0.40063347681312433, "grad_norm": 5.055147156646343, "learning_rate": 3.40521397420372e-06, "loss": 0.5223, "step": 4933 }, { "epoch": 0.4007146917891659, "grad_norm": 6.056907838015249, "learning_rate": 3.4046009437677296e-06, "loss": 0.658, "step": 4934 }, { "epoch": 0.4007959067652075, "grad_norm": 5.370500579188346, "learning_rate": 3.403987850738142e-06, "loss": 0.4064, "step": 4935 }, { "epoch": 0.40087712174124907, "grad_norm": 5.583756706162014, "learning_rate": 3.4033746951573797e-06, "loss": 0.4991, "step": 4936 }, { "epoch": 0.40095833671729064, "grad_norm": 7.016728588738892, "learning_rate": 3.4027614770678695e-06, "loss": 0.6236, "step": 4937 }, { "epoch": 0.4010395516933323, "grad_norm": 4.595088770001303, "learning_rate": 3.402148196512042e-06, "loss": 0.4602, "step": 4938 }, { "epoch": 0.40112076666937385, "grad_norm": 8.586997426840448, "learning_rate": 3.4015348535323344e-06, "loss": 0.582, "step": 4939 }, { "epoch": 0.40120198164541543, "grad_norm": 6.420810576653918, "learning_rate": 3.400921448171187e-06, "loss": 0.456, "step": 4940 }, { "epoch": 0.401283196621457, "grad_norm": 3.598889910541628, "learning_rate": 3.4003079804710414e-06, "loss": 0.5049, "step": 4941 }, { "epoch": 0.4013644115974986, "grad_norm": 4.85229866007985, "learning_rate": 3.39969445047435e-06, "loss": 0.5795, "step": 4942 }, { "epoch": 0.40144562657354016, "grad_norm": 5.046704044838436, "learning_rate": 3.399080858223564e-06, "loss": 0.4898, "step": 4943 }, { "epoch": 0.40152684154958174, "grad_norm": 4.201283039118606, "learning_rate": 3.3984672037611403e-06, "loss": 0.4583, "step": 4944 }, { "epoch": 0.4016080565256233, "grad_norm": 3.6245756239233855, "learning_rate": 3.3978534871295423e-06, "loss": 0.569, "step": 4945 }, { "epoch": 0.4016892715016649, "grad_norm": 5.943049720731405, "learning_rate": 3.3972397083712337e-06, "loss": 0.4635, "step": 4946 }, { "epoch": 0.40177048647770647, "grad_norm": 4.4825133089550615, "learning_rate": 3.3966258675286868e-06, "loss": 0.5545, "step": 4947 }, { "epoch": 0.40185170145374804, "grad_norm": 7.159533755424081, "learning_rate": 3.3960119646443743e-06, "loss": 0.5132, "step": 4948 }, { "epoch": 0.4019329164297897, "grad_norm": 5.668432680762583, "learning_rate": 3.395397999760777e-06, "loss": 0.4824, "step": 4949 }, { "epoch": 0.40201413140583125, "grad_norm": 7.74454321710248, "learning_rate": 3.394783972920376e-06, "loss": 0.5292, "step": 4950 }, { "epoch": 0.40209534638187283, "grad_norm": 3.9688138836277864, "learning_rate": 3.3941698841656594e-06, "loss": 0.5123, "step": 4951 }, { "epoch": 0.4021765613579144, "grad_norm": 3.6092591329939196, "learning_rate": 3.3935557335391194e-06, "loss": 0.5618, "step": 4952 }, { "epoch": 0.402257776333956, "grad_norm": 5.268354030573778, "learning_rate": 3.3929415210832526e-06, "loss": 0.4638, "step": 4953 }, { "epoch": 0.40233899130999756, "grad_norm": 4.725073559638671, "learning_rate": 3.392327246840558e-06, "loss": 0.5442, "step": 4954 }, { "epoch": 0.40242020628603914, "grad_norm": 5.0239266752483225, "learning_rate": 3.39171291085354e-06, "loss": 0.5132, "step": 4955 }, { "epoch": 0.4025014212620807, "grad_norm": 4.688205783968622, "learning_rate": 3.3910985131647077e-06, "loss": 0.6074, "step": 4956 }, { "epoch": 0.4025826362381223, "grad_norm": 4.306614565733134, "learning_rate": 3.3904840538165745e-06, "loss": 0.4564, "step": 4957 }, { "epoch": 0.40266385121416387, "grad_norm": 13.862109865488437, "learning_rate": 3.3898695328516585e-06, "loss": 0.536, "step": 4958 }, { "epoch": 0.40274506619020545, "grad_norm": 4.512610902634363, "learning_rate": 3.38925495031248e-06, "loss": 0.6559, "step": 4959 }, { "epoch": 0.4028262811662471, "grad_norm": 4.1778178120621225, "learning_rate": 3.3886403062415653e-06, "loss": 0.5741, "step": 4960 }, { "epoch": 0.40290749614228866, "grad_norm": 5.5923461249681194, "learning_rate": 3.3880256006814436e-06, "loss": 0.7565, "step": 4961 }, { "epoch": 0.40298871111833023, "grad_norm": 10.781234356503536, "learning_rate": 3.387410833674651e-06, "loss": 0.817, "step": 4962 }, { "epoch": 0.4030699260943718, "grad_norm": 7.255219164441723, "learning_rate": 3.386796005263725e-06, "loss": 0.4918, "step": 4963 }, { "epoch": 0.4031511410704134, "grad_norm": 3.7214218539504094, "learning_rate": 3.3861811154912085e-06, "loss": 0.5502, "step": 4964 }, { "epoch": 0.40323235604645496, "grad_norm": 3.7233458707878246, "learning_rate": 3.385566164399649e-06, "loss": 0.6731, "step": 4965 }, { "epoch": 0.40331357102249654, "grad_norm": 6.244845838241695, "learning_rate": 3.3849511520315986e-06, "loss": 0.426, "step": 4966 }, { "epoch": 0.4033947859985381, "grad_norm": 5.384785695599885, "learning_rate": 3.384336078429611e-06, "loss": 0.7065, "step": 4967 }, { "epoch": 0.4034760009745797, "grad_norm": 4.49252865805685, "learning_rate": 3.3837209436362473e-06, "loss": 0.477, "step": 4968 }, { "epoch": 0.4035572159506213, "grad_norm": 3.4933203948485643, "learning_rate": 3.3831057476940716e-06, "loss": 0.5459, "step": 4969 }, { "epoch": 0.40363843092666285, "grad_norm": 7.33852450348436, "learning_rate": 3.382490490645651e-06, "loss": 0.559, "step": 4970 }, { "epoch": 0.4037196459027045, "grad_norm": 5.565835641557815, "learning_rate": 3.3818751725335595e-06, "loss": 0.4383, "step": 4971 }, { "epoch": 0.40380086087874606, "grad_norm": 6.069976549417349, "learning_rate": 3.3812597934003746e-06, "loss": 0.6346, "step": 4972 }, { "epoch": 0.40388207585478764, "grad_norm": 5.41297727878491, "learning_rate": 3.3806443532886736e-06, "loss": 0.4902, "step": 4973 }, { "epoch": 0.4039632908308292, "grad_norm": 3.474757683944031, "learning_rate": 3.3800288522410464e-06, "loss": 0.6833, "step": 4974 }, { "epoch": 0.4040445058068708, "grad_norm": 18.494129306251725, "learning_rate": 3.3794132903000787e-06, "loss": 0.4717, "step": 4975 }, { "epoch": 0.40412572078291237, "grad_norm": 5.201629259993079, "learning_rate": 3.3787976675083657e-06, "loss": 0.4586, "step": 4976 }, { "epoch": 0.40420693575895394, "grad_norm": 3.067579639195292, "learning_rate": 3.3781819839085056e-06, "loss": 0.6093, "step": 4977 }, { "epoch": 0.4042881507349955, "grad_norm": 6.51098650482296, "learning_rate": 3.3775662395431e-06, "loss": 0.4642, "step": 4978 }, { "epoch": 0.4043693657110371, "grad_norm": 3.2528809931282057, "learning_rate": 3.376950434454754e-06, "loss": 0.6634, "step": 4979 }, { "epoch": 0.4044505806870787, "grad_norm": 6.255509398772904, "learning_rate": 3.37633456868608e-06, "loss": 0.5264, "step": 4980 }, { "epoch": 0.40453179566312025, "grad_norm": 6.628156482525939, "learning_rate": 3.3757186422796918e-06, "loss": 0.4249, "step": 4981 }, { "epoch": 0.4046130106391619, "grad_norm": 5.667448673111919, "learning_rate": 3.3751026552782085e-06, "loss": 0.5736, "step": 4982 }, { "epoch": 0.40469422561520346, "grad_norm": 4.987055006193434, "learning_rate": 3.3744866077242516e-06, "loss": 0.585, "step": 4983 }, { "epoch": 0.40477544059124504, "grad_norm": 5.82552976044451, "learning_rate": 3.3738704996604505e-06, "loss": 0.6341, "step": 4984 }, { "epoch": 0.4048566555672866, "grad_norm": 4.093717103910503, "learning_rate": 3.373254331129436e-06, "loss": 0.4046, "step": 4985 }, { "epoch": 0.4049378705433282, "grad_norm": 7.070638776876593, "learning_rate": 3.3726381021738426e-06, "loss": 0.4569, "step": 4986 }, { "epoch": 0.40501908551936977, "grad_norm": 4.327438597298289, "learning_rate": 3.372021812836311e-06, "loss": 0.522, "step": 4987 }, { "epoch": 0.40510030049541135, "grad_norm": 7.802424985063589, "learning_rate": 3.371405463159486e-06, "loss": 0.5055, "step": 4988 }, { "epoch": 0.4051815154714529, "grad_norm": 5.2484985718748405, "learning_rate": 3.3707890531860143e-06, "loss": 0.5691, "step": 4989 }, { "epoch": 0.4052627304474945, "grad_norm": 3.961595770162228, "learning_rate": 3.3701725829585484e-06, "loss": 0.5087, "step": 4990 }, { "epoch": 0.4053439454235361, "grad_norm": 5.115368545445615, "learning_rate": 3.369556052519746e-06, "loss": 0.5014, "step": 4991 }, { "epoch": 0.40542516039957766, "grad_norm": 9.264859588158524, "learning_rate": 3.3689394619122654e-06, "loss": 0.5217, "step": 4992 }, { "epoch": 0.4055063753756193, "grad_norm": 5.809302524376221, "learning_rate": 3.3683228111787738e-06, "loss": 0.5209, "step": 4993 }, { "epoch": 0.40558759035166086, "grad_norm": 6.347688148270995, "learning_rate": 3.367706100361939e-06, "loss": 0.5284, "step": 4994 }, { "epoch": 0.40566880532770244, "grad_norm": 5.2075723248409185, "learning_rate": 3.3670893295044344e-06, "loss": 0.6141, "step": 4995 }, { "epoch": 0.405750020303744, "grad_norm": 6.241751717955846, "learning_rate": 3.3664724986489368e-06, "loss": 0.539, "step": 4996 }, { "epoch": 0.4058312352797856, "grad_norm": 5.913411596718994, "learning_rate": 3.3658556078381283e-06, "loss": 0.4779, "step": 4997 }, { "epoch": 0.4059124502558272, "grad_norm": 5.987603839595508, "learning_rate": 3.3652386571146945e-06, "loss": 0.4415, "step": 4998 }, { "epoch": 0.40599366523186875, "grad_norm": 5.342255084971644, "learning_rate": 3.3646216465213245e-06, "loss": 0.5426, "step": 4999 }, { "epoch": 0.4060748802079103, "grad_norm": 5.881978781645484, "learning_rate": 3.364004576100712e-06, "loss": 0.5235, "step": 5000 }, { "epoch": 0.4061560951839519, "grad_norm": 5.907061959419017, "learning_rate": 3.3633874458955573e-06, "loss": 0.5061, "step": 5001 }, { "epoch": 0.4062373101599935, "grad_norm": 4.640005094027489, "learning_rate": 3.362770255948559e-06, "loss": 0.5892, "step": 5002 }, { "epoch": 0.40631852513603506, "grad_norm": 5.450649196042516, "learning_rate": 3.3621530063024257e-06, "loss": 0.6451, "step": 5003 }, { "epoch": 0.4063997401120767, "grad_norm": 12.30013823840977, "learning_rate": 3.3615356969998676e-06, "loss": 0.5278, "step": 5004 }, { "epoch": 0.40648095508811827, "grad_norm": 3.2285712876853156, "learning_rate": 3.360918328083598e-06, "loss": 0.6247, "step": 5005 }, { "epoch": 0.40656217006415984, "grad_norm": 4.8576215977033135, "learning_rate": 3.3603008995963373e-06, "loss": 0.527, "step": 5006 }, { "epoch": 0.4066433850402014, "grad_norm": 6.983405839371138, "learning_rate": 3.3596834115808074e-06, "loss": 0.5264, "step": 5007 }, { "epoch": 0.406724600016243, "grad_norm": 4.393414313571411, "learning_rate": 3.3590658640797346e-06, "loss": 0.6397, "step": 5008 }, { "epoch": 0.4068058149922846, "grad_norm": 5.416974817120511, "learning_rate": 3.3584482571358513e-06, "loss": 0.4797, "step": 5009 }, { "epoch": 0.40688702996832615, "grad_norm": 3.780442832500498, "learning_rate": 3.357830590791891e-06, "loss": 0.452, "step": 5010 }, { "epoch": 0.40696824494436773, "grad_norm": 4.203958087565441, "learning_rate": 3.3572128650905946e-06, "loss": 0.4962, "step": 5011 }, { "epoch": 0.4070494599204093, "grad_norm": 7.016929878457646, "learning_rate": 3.3565950800747038e-06, "loss": 0.5625, "step": 5012 }, { "epoch": 0.4071306748964509, "grad_norm": 5.779316536373502, "learning_rate": 3.355977235786968e-06, "loss": 0.544, "step": 5013 }, { "epoch": 0.40721188987249246, "grad_norm": 4.419872849655686, "learning_rate": 3.3553593322701374e-06, "loss": 0.5445, "step": 5014 }, { "epoch": 0.4072931048485341, "grad_norm": 19.583565435812158, "learning_rate": 3.3547413695669673e-06, "loss": 0.4479, "step": 5015 }, { "epoch": 0.40737431982457567, "grad_norm": 6.737000577753693, "learning_rate": 3.3541233477202184e-06, "loss": 0.6231, "step": 5016 }, { "epoch": 0.40745553480061725, "grad_norm": 5.651619729539659, "learning_rate": 3.3535052667726546e-06, "loss": 0.4791, "step": 5017 }, { "epoch": 0.4075367497766588, "grad_norm": 13.01736060108638, "learning_rate": 3.352887126767043e-06, "loss": 0.618, "step": 5018 }, { "epoch": 0.4076179647527004, "grad_norm": 7.913220552107575, "learning_rate": 3.352268927746156e-06, "loss": 0.5014, "step": 5019 }, { "epoch": 0.407699179728742, "grad_norm": 5.00487843260526, "learning_rate": 3.3516506697527706e-06, "loss": 0.7005, "step": 5020 }, { "epoch": 0.40778039470478356, "grad_norm": 4.0699036925411045, "learning_rate": 3.3510323528296656e-06, "loss": 0.5042, "step": 5021 }, { "epoch": 0.40786160968082513, "grad_norm": 3.9550635463784944, "learning_rate": 3.3504139770196252e-06, "loss": 0.5318, "step": 5022 }, { "epoch": 0.4079428246568667, "grad_norm": 3.9690809902318605, "learning_rate": 3.3497955423654395e-06, "loss": 0.5501, "step": 5023 }, { "epoch": 0.4080240396329083, "grad_norm": 5.238253417658213, "learning_rate": 3.349177048909899e-06, "loss": 0.4221, "step": 5024 }, { "epoch": 0.40810525460894986, "grad_norm": 6.31641058902131, "learning_rate": 3.3485584966958005e-06, "loss": 0.5599, "step": 5025 }, { "epoch": 0.4081864695849915, "grad_norm": 4.773613373392485, "learning_rate": 3.3479398857659464e-06, "loss": 0.672, "step": 5026 }, { "epoch": 0.4082676845610331, "grad_norm": 6.145123089078632, "learning_rate": 3.3473212161631385e-06, "loss": 0.4964, "step": 5027 }, { "epoch": 0.40834889953707465, "grad_norm": 4.502728069285821, "learning_rate": 3.3467024879301873e-06, "loss": 0.633, "step": 5028 }, { "epoch": 0.4084301145131162, "grad_norm": 4.232356275831273, "learning_rate": 3.346083701109905e-06, "loss": 0.7541, "step": 5029 }, { "epoch": 0.4085113294891578, "grad_norm": 6.5669331099437045, "learning_rate": 3.3454648557451087e-06, "loss": 0.6154, "step": 5030 }, { "epoch": 0.4085925444651994, "grad_norm": 3.585238962477474, "learning_rate": 3.3448459518786193e-06, "loss": 0.563, "step": 5031 }, { "epoch": 0.40867375944124096, "grad_norm": 7.113560531413456, "learning_rate": 3.3442269895532604e-06, "loss": 0.4665, "step": 5032 }, { "epoch": 0.40875497441728254, "grad_norm": 4.184427215416845, "learning_rate": 3.3436079688118618e-06, "loss": 0.4255, "step": 5033 }, { "epoch": 0.4088361893933241, "grad_norm": 3.690634979166768, "learning_rate": 3.3429888896972575e-06, "loss": 0.4262, "step": 5034 }, { "epoch": 0.4089174043693657, "grad_norm": 3.7301420071620424, "learning_rate": 3.3423697522522823e-06, "loss": 0.5172, "step": 5035 }, { "epoch": 0.40899861934540727, "grad_norm": 4.87720646565865, "learning_rate": 3.3417505565197794e-06, "loss": 0.6716, "step": 5036 }, { "epoch": 0.4090798343214489, "grad_norm": 3.6748482560950615, "learning_rate": 3.3411313025425927e-06, "loss": 0.5472, "step": 5037 }, { "epoch": 0.4091610492974905, "grad_norm": 5.7048262277305275, "learning_rate": 3.340511990363571e-06, "loss": 0.3746, "step": 5038 }, { "epoch": 0.40924226427353205, "grad_norm": 5.363584407358412, "learning_rate": 3.3398926200255684e-06, "loss": 0.3669, "step": 5039 }, { "epoch": 0.40932347924957363, "grad_norm": 4.185826585370702, "learning_rate": 3.3392731915714417e-06, "loss": 0.5765, "step": 5040 }, { "epoch": 0.4094046942256152, "grad_norm": 5.9673921212077286, "learning_rate": 3.338653705044051e-06, "loss": 0.574, "step": 5041 }, { "epoch": 0.4094859092016568, "grad_norm": 13.697669906869592, "learning_rate": 3.3380341604862633e-06, "loss": 0.4787, "step": 5042 }, { "epoch": 0.40956712417769836, "grad_norm": 18.101167067110552, "learning_rate": 3.3374145579409467e-06, "loss": 0.5793, "step": 5043 }, { "epoch": 0.40964833915373994, "grad_norm": 33.302750260887564, "learning_rate": 3.3367948974509743e-06, "loss": 0.4709, "step": 5044 }, { "epoch": 0.4097295541297815, "grad_norm": 5.565821515126126, "learning_rate": 3.336175179059224e-06, "loss": 0.4976, "step": 5045 }, { "epoch": 0.4098107691058231, "grad_norm": 4.92473685068305, "learning_rate": 3.335555402808577e-06, "loss": 0.5237, "step": 5046 }, { "epoch": 0.40989198408186467, "grad_norm": 3.6230723915327343, "learning_rate": 3.334935568741918e-06, "loss": 0.505, "step": 5047 }, { "epoch": 0.4099731990579063, "grad_norm": 4.073098074480497, "learning_rate": 3.3343156769021355e-06, "loss": 0.5275, "step": 5048 }, { "epoch": 0.4100544140339479, "grad_norm": 3.908868759278294, "learning_rate": 3.333695727332125e-06, "loss": 0.635, "step": 5049 }, { "epoch": 0.41013562900998946, "grad_norm": 4.898135196202766, "learning_rate": 3.3330757200747828e-06, "loss": 0.4958, "step": 5050 }, { "epoch": 0.41021684398603103, "grad_norm": 11.463379753224626, "learning_rate": 3.332455655173008e-06, "loss": 0.4647, "step": 5051 }, { "epoch": 0.4102980589620726, "grad_norm": 4.243407156064371, "learning_rate": 3.3318355326697093e-06, "loss": 0.6904, "step": 5052 }, { "epoch": 0.4103792739381142, "grad_norm": 3.5903815519776585, "learning_rate": 3.3312153526077933e-06, "loss": 0.5248, "step": 5053 }, { "epoch": 0.41046048891415576, "grad_norm": 5.5607811919006345, "learning_rate": 3.330595115030174e-06, "loss": 0.4837, "step": 5054 }, { "epoch": 0.41054170389019734, "grad_norm": 4.396444235590735, "learning_rate": 3.3299748199797686e-06, "loss": 0.4331, "step": 5055 }, { "epoch": 0.4106229188662389, "grad_norm": 5.6326755354518845, "learning_rate": 3.3293544674994987e-06, "loss": 0.5109, "step": 5056 }, { "epoch": 0.4107041338422805, "grad_norm": 6.458559629689074, "learning_rate": 3.328734057632289e-06, "loss": 0.5488, "step": 5057 }, { "epoch": 0.41078534881832207, "grad_norm": 6.259903972340533, "learning_rate": 3.328113590421068e-06, "loss": 0.787, "step": 5058 }, { "epoch": 0.4108665637943637, "grad_norm": 5.031847327165066, "learning_rate": 3.3274930659087694e-06, "loss": 0.6045, "step": 5059 }, { "epoch": 0.4109477787704053, "grad_norm": 12.757281627069613, "learning_rate": 3.3268724841383302e-06, "loss": 0.5007, "step": 5060 }, { "epoch": 0.41102899374644686, "grad_norm": 8.972711433026783, "learning_rate": 3.3262518451526916e-06, "loss": 0.6009, "step": 5061 }, { "epoch": 0.41111020872248843, "grad_norm": 3.924314034573167, "learning_rate": 3.3256311489947973e-06, "loss": 0.4718, "step": 5062 }, { "epoch": 0.41119142369853, "grad_norm": 3.8326582602133796, "learning_rate": 3.3250103957075987e-06, "loss": 0.6721, "step": 5063 }, { "epoch": 0.4112726386745716, "grad_norm": 5.858057317848575, "learning_rate": 3.3243895853340445e-06, "loss": 0.3982, "step": 5064 }, { "epoch": 0.41135385365061317, "grad_norm": 4.79530936974041, "learning_rate": 3.323768717917096e-06, "loss": 0.6906, "step": 5065 }, { "epoch": 0.41143506862665474, "grad_norm": 3.761887070675124, "learning_rate": 3.323147793499712e-06, "loss": 0.5835, "step": 5066 }, { "epoch": 0.4115162836026963, "grad_norm": 4.8103574968475735, "learning_rate": 3.3225268121248567e-06, "loss": 0.5361, "step": 5067 }, { "epoch": 0.4115974985787379, "grad_norm": 3.859720909374451, "learning_rate": 3.321905773835498e-06, "loss": 0.6076, "step": 5068 }, { "epoch": 0.4116787135547795, "grad_norm": 3.758573255947487, "learning_rate": 3.3212846786746113e-06, "loss": 0.4216, "step": 5069 }, { "epoch": 0.4117599285308211, "grad_norm": 7.220042667589109, "learning_rate": 3.3206635266851707e-06, "loss": 0.5195, "step": 5070 }, { "epoch": 0.4118411435068627, "grad_norm": 5.446095850430871, "learning_rate": 3.320042317910157e-06, "loss": 0.5241, "step": 5071 }, { "epoch": 0.41192235848290426, "grad_norm": 4.589008245862078, "learning_rate": 3.319421052392556e-06, "loss": 0.6736, "step": 5072 }, { "epoch": 0.41200357345894584, "grad_norm": 9.586999090184612, "learning_rate": 3.318799730175354e-06, "loss": 0.5927, "step": 5073 }, { "epoch": 0.4120847884349874, "grad_norm": 3.790231648461632, "learning_rate": 3.3181783513015443e-06, "loss": 0.5262, "step": 5074 }, { "epoch": 0.412166003411029, "grad_norm": 7.8339239381427435, "learning_rate": 3.317556915814123e-06, "loss": 0.5578, "step": 5075 }, { "epoch": 0.41224721838707057, "grad_norm": 4.189621669708876, "learning_rate": 3.31693542375609e-06, "loss": 0.4758, "step": 5076 }, { "epoch": 0.41232843336311215, "grad_norm": 3.23569345024193, "learning_rate": 3.316313875170449e-06, "loss": 0.5989, "step": 5077 }, { "epoch": 0.4124096483391537, "grad_norm": 5.092573629488183, "learning_rate": 3.3156922701002082e-06, "loss": 0.4451, "step": 5078 }, { "epoch": 0.4124908633151953, "grad_norm": 3.7634789724489774, "learning_rate": 3.3150706085883795e-06, "loss": 0.6206, "step": 5079 }, { "epoch": 0.4125720782912369, "grad_norm": 9.494128622068938, "learning_rate": 3.3144488906779775e-06, "loss": 0.5531, "step": 5080 }, { "epoch": 0.4126532932672785, "grad_norm": 5.01807369882204, "learning_rate": 3.3138271164120235e-06, "loss": 0.4668, "step": 5081 }, { "epoch": 0.4127345082433201, "grad_norm": 4.224562759189403, "learning_rate": 3.3132052858335405e-06, "loss": 0.471, "step": 5082 }, { "epoch": 0.41281572321936166, "grad_norm": 4.452581770491441, "learning_rate": 3.312583398985555e-06, "loss": 0.478, "step": 5083 }, { "epoch": 0.41289693819540324, "grad_norm": 6.547729750220411, "learning_rate": 3.3119614559110986e-06, "loss": 0.7315, "step": 5084 }, { "epoch": 0.4129781531714448, "grad_norm": 3.7189334139526307, "learning_rate": 3.3113394566532076e-06, "loss": 0.4778, "step": 5085 }, { "epoch": 0.4130593681474864, "grad_norm": 5.682620271914815, "learning_rate": 3.310717401254919e-06, "loss": 0.6956, "step": 5086 }, { "epoch": 0.41314058312352797, "grad_norm": 5.539361693130007, "learning_rate": 3.3100952897592774e-06, "loss": 0.4507, "step": 5087 }, { "epoch": 0.41322179809956955, "grad_norm": 5.822291658506241, "learning_rate": 3.3094731222093297e-06, "loss": 0.5914, "step": 5088 }, { "epoch": 0.4133030130756111, "grad_norm": 4.73471914454109, "learning_rate": 3.3088508986481256e-06, "loss": 0.4881, "step": 5089 }, { "epoch": 0.4133842280516527, "grad_norm": 6.22074010943859, "learning_rate": 3.30822861911872e-06, "loss": 0.603, "step": 5090 }, { "epoch": 0.4134654430276943, "grad_norm": 6.1051785076416385, "learning_rate": 3.3076062836641716e-06, "loss": 0.43, "step": 5091 }, { "epoch": 0.4135466580037359, "grad_norm": 4.391606365906377, "learning_rate": 3.306983892327542e-06, "loss": 0.4868, "step": 5092 }, { "epoch": 0.4136278729797775, "grad_norm": 3.796872062788742, "learning_rate": 3.306361445151899e-06, "loss": 0.5556, "step": 5093 }, { "epoch": 0.41370908795581907, "grad_norm": 5.3352051929174165, "learning_rate": 3.3057389421803104e-06, "loss": 0.5305, "step": 5094 }, { "epoch": 0.41379030293186064, "grad_norm": 5.507108563864587, "learning_rate": 3.305116383455852e-06, "loss": 0.8252, "step": 5095 }, { "epoch": 0.4138715179079022, "grad_norm": 3.239020837952215, "learning_rate": 3.304493769021601e-06, "loss": 0.5663, "step": 5096 }, { "epoch": 0.4139527328839438, "grad_norm": 5.771163705049402, "learning_rate": 3.3038710989206386e-06, "loss": 0.5291, "step": 5097 }, { "epoch": 0.4140339478599854, "grad_norm": 7.098970415020295, "learning_rate": 3.303248373196051e-06, "loss": 0.5858, "step": 5098 }, { "epoch": 0.41411516283602695, "grad_norm": 6.2763287339806535, "learning_rate": 3.3026255918909267e-06, "loss": 0.4381, "step": 5099 }, { "epoch": 0.41419637781206853, "grad_norm": 3.6575950544234708, "learning_rate": 3.302002755048359e-06, "loss": 0.7504, "step": 5100 }, { "epoch": 0.4142775927881101, "grad_norm": 4.90381745333394, "learning_rate": 3.3013798627114457e-06, "loss": 0.3803, "step": 5101 }, { "epoch": 0.4143588077641517, "grad_norm": 4.885461667100054, "learning_rate": 3.300756914923287e-06, "loss": 0.4982, "step": 5102 }, { "epoch": 0.4144400227401933, "grad_norm": 4.517776876693077, "learning_rate": 3.3001339117269883e-06, "loss": 0.6321, "step": 5103 }, { "epoch": 0.4145212377162349, "grad_norm": 4.594992279285769, "learning_rate": 3.2995108531656566e-06, "loss": 0.4652, "step": 5104 }, { "epoch": 0.41460245269227647, "grad_norm": 4.353302582897617, "learning_rate": 3.298887739282406e-06, "loss": 0.5871, "step": 5105 }, { "epoch": 0.41468366766831805, "grad_norm": 8.458802751943896, "learning_rate": 3.298264570120351e-06, "loss": 0.7833, "step": 5106 }, { "epoch": 0.4147648826443596, "grad_norm": 3.170384706895752, "learning_rate": 3.297641345722613e-06, "loss": 0.5037, "step": 5107 }, { "epoch": 0.4148460976204012, "grad_norm": 2.9676348563178094, "learning_rate": 3.2970180661323155e-06, "loss": 0.6279, "step": 5108 }, { "epoch": 0.4149273125964428, "grad_norm": 4.616218599976382, "learning_rate": 3.2963947313925857e-06, "loss": 0.559, "step": 5109 }, { "epoch": 0.41500852757248435, "grad_norm": 6.559911673941787, "learning_rate": 3.295771341546555e-06, "loss": 0.5066, "step": 5110 }, { "epoch": 0.41508974254852593, "grad_norm": 4.702456652104927, "learning_rate": 3.2951478966373602e-06, "loss": 0.4724, "step": 5111 }, { "epoch": 0.4151709575245675, "grad_norm": 6.625773908660462, "learning_rate": 3.2945243967081386e-06, "loss": 0.5304, "step": 5112 }, { "epoch": 0.4152521725006091, "grad_norm": 4.794677100016037, "learning_rate": 3.2939008418020334e-06, "loss": 0.5455, "step": 5113 }, { "epoch": 0.4153333874766507, "grad_norm": 4.228847169637584, "learning_rate": 3.293277231962192e-06, "loss": 0.4838, "step": 5114 }, { "epoch": 0.4154146024526923, "grad_norm": 5.780889261663269, "learning_rate": 3.292653567231765e-06, "loss": 0.5851, "step": 5115 }, { "epoch": 0.41549581742873387, "grad_norm": 3.5767019478443514, "learning_rate": 3.2920298476539047e-06, "loss": 0.4035, "step": 5116 }, { "epoch": 0.41557703240477545, "grad_norm": 3.173589475313961, "learning_rate": 3.2914060732717725e-06, "loss": 0.4834, "step": 5117 }, { "epoch": 0.415658247380817, "grad_norm": 5.702832664803554, "learning_rate": 3.290782244128527e-06, "loss": 0.4122, "step": 5118 }, { "epoch": 0.4157394623568586, "grad_norm": 4.350245123381758, "learning_rate": 3.290158360267336e-06, "loss": 0.7049, "step": 5119 }, { "epoch": 0.4158206773329002, "grad_norm": 3.6272992204161887, "learning_rate": 3.2895344217313683e-06, "loss": 0.5299, "step": 5120 }, { "epoch": 0.41590189230894176, "grad_norm": 3.9213077791760966, "learning_rate": 3.2889104285637967e-06, "loss": 0.5795, "step": 5121 }, { "epoch": 0.41598310728498333, "grad_norm": 4.304699561537021, "learning_rate": 3.2882863808077993e-06, "loss": 0.5066, "step": 5122 }, { "epoch": 0.4160643222610249, "grad_norm": 13.4372391747002, "learning_rate": 3.287662278506556e-06, "loss": 0.4601, "step": 5123 }, { "epoch": 0.4161455372370665, "grad_norm": 8.076930536538704, "learning_rate": 3.2870381217032522e-06, "loss": 0.4771, "step": 5124 }, { "epoch": 0.4162267522131081, "grad_norm": 5.712865061327913, "learning_rate": 3.2864139104410753e-06, "loss": 0.3819, "step": 5125 }, { "epoch": 0.4163079671891497, "grad_norm": 8.244135425876427, "learning_rate": 3.2857896447632174e-06, "loss": 0.5816, "step": 5126 }, { "epoch": 0.4163891821651913, "grad_norm": 3.9304755927131785, "learning_rate": 3.2851653247128755e-06, "loss": 0.5813, "step": 5127 }, { "epoch": 0.41647039714123285, "grad_norm": 3.476794981739987, "learning_rate": 3.2845409503332488e-06, "loss": 0.7026, "step": 5128 }, { "epoch": 0.41655161211727443, "grad_norm": 6.988919037400744, "learning_rate": 3.2839165216675396e-06, "loss": 0.3883, "step": 5129 }, { "epoch": 0.416632827093316, "grad_norm": 9.043393482730245, "learning_rate": 3.283292038758956e-06, "loss": 0.6782, "step": 5130 }, { "epoch": 0.4167140420693576, "grad_norm": 9.047180698274353, "learning_rate": 3.2826675016507094e-06, "loss": 0.3717, "step": 5131 }, { "epoch": 0.41679525704539916, "grad_norm": 4.352324808412342, "learning_rate": 3.2820429103860133e-06, "loss": 0.5285, "step": 5132 }, { "epoch": 0.41687647202144074, "grad_norm": 4.3428722591294155, "learning_rate": 3.281418265008087e-06, "loss": 0.571, "step": 5133 }, { "epoch": 0.4169576869974823, "grad_norm": 4.529875570698177, "learning_rate": 3.280793565560153e-06, "loss": 0.6332, "step": 5134 }, { "epoch": 0.41703890197352395, "grad_norm": 10.119978089195198, "learning_rate": 3.280168812085436e-06, "loss": 0.4528, "step": 5135 }, { "epoch": 0.4171201169495655, "grad_norm": 3.973556997891654, "learning_rate": 3.279544004627166e-06, "loss": 0.6027, "step": 5136 }, { "epoch": 0.4172013319256071, "grad_norm": 3.7856125466570814, "learning_rate": 3.2789191432285767e-06, "loss": 0.5312, "step": 5137 }, { "epoch": 0.4172825469016487, "grad_norm": 3.897046711276253, "learning_rate": 3.278294227932905e-06, "loss": 0.4928, "step": 5138 }, { "epoch": 0.41736376187769025, "grad_norm": 3.9807540904239715, "learning_rate": 3.277669258783391e-06, "loss": 0.4981, "step": 5139 }, { "epoch": 0.41744497685373183, "grad_norm": 4.193176888500685, "learning_rate": 3.277044235823281e-06, "loss": 0.5247, "step": 5140 }, { "epoch": 0.4175261918297734, "grad_norm": 4.102295880640859, "learning_rate": 3.2764191590958234e-06, "loss": 0.5928, "step": 5141 }, { "epoch": 0.417607406805815, "grad_norm": 4.4166185861935, "learning_rate": 3.2757940286442676e-06, "loss": 0.4061, "step": 5142 }, { "epoch": 0.41768862178185656, "grad_norm": 6.205963915441601, "learning_rate": 3.2751688445118705e-06, "loss": 0.5939, "step": 5143 }, { "epoch": 0.41776983675789814, "grad_norm": 9.375588558168431, "learning_rate": 3.2745436067418934e-06, "loss": 0.5445, "step": 5144 }, { "epoch": 0.4178510517339397, "grad_norm": 7.231124759113749, "learning_rate": 3.2739183153775964e-06, "loss": 0.3809, "step": 5145 }, { "epoch": 0.41793226670998135, "grad_norm": 5.36109001138212, "learning_rate": 3.2732929704622485e-06, "loss": 0.4421, "step": 5146 }, { "epoch": 0.4180134816860229, "grad_norm": 5.1614659053586545, "learning_rate": 3.2726675720391203e-06, "loss": 0.4925, "step": 5147 }, { "epoch": 0.4180946966620645, "grad_norm": 5.095668465921769, "learning_rate": 3.272042120151485e-06, "loss": 0.7233, "step": 5148 }, { "epoch": 0.4181759116381061, "grad_norm": 6.655912818912801, "learning_rate": 3.2714166148426204e-06, "loss": 0.4635, "step": 5149 }, { "epoch": 0.41825712661414766, "grad_norm": 5.621140760979749, "learning_rate": 3.27079105615581e-06, "loss": 0.5281, "step": 5150 }, { "epoch": 0.41833834159018923, "grad_norm": 5.0469957881501255, "learning_rate": 3.2701654441343365e-06, "loss": 0.5175, "step": 5151 }, { "epoch": 0.4184195565662308, "grad_norm": 4.710792152616251, "learning_rate": 3.269539778821491e-06, "loss": 0.4824, "step": 5152 }, { "epoch": 0.4185007715422724, "grad_norm": 5.248614874618813, "learning_rate": 3.268914060260565e-06, "loss": 0.6031, "step": 5153 }, { "epoch": 0.41858198651831396, "grad_norm": 3.7091517078378096, "learning_rate": 3.2682882884948557e-06, "loss": 0.445, "step": 5154 }, { "epoch": 0.41866320149435554, "grad_norm": 7.473085780555477, "learning_rate": 3.2676624635676637e-06, "loss": 0.4225, "step": 5155 }, { "epoch": 0.4187444164703971, "grad_norm": 5.984937157684743, "learning_rate": 3.267036585522291e-06, "loss": 0.3815, "step": 5156 }, { "epoch": 0.41882563144643875, "grad_norm": 7.910654251052096, "learning_rate": 3.2664106544020464e-06, "loss": 0.4595, "step": 5157 }, { "epoch": 0.41890684642248033, "grad_norm": 5.447690662144098, "learning_rate": 3.2657846702502404e-06, "loss": 0.5105, "step": 5158 }, { "epoch": 0.4189880613985219, "grad_norm": 6.61183045497076, "learning_rate": 3.2651586331101887e-06, "loss": 0.4271, "step": 5159 }, { "epoch": 0.4190692763745635, "grad_norm": 6.227375988300761, "learning_rate": 3.2645325430252096e-06, "loss": 0.5066, "step": 5160 }, { "epoch": 0.41915049135060506, "grad_norm": 3.5873779321139714, "learning_rate": 3.2639064000386236e-06, "loss": 0.5637, "step": 5161 }, { "epoch": 0.41923170632664664, "grad_norm": 3.2262674778585376, "learning_rate": 3.2632802041937574e-06, "loss": 0.4754, "step": 5162 }, { "epoch": 0.4193129213026882, "grad_norm": 4.976278018847007, "learning_rate": 3.262653955533942e-06, "loss": 0.4554, "step": 5163 }, { "epoch": 0.4193941362787298, "grad_norm": 7.7409127238618245, "learning_rate": 3.262027654102508e-06, "loss": 0.5582, "step": 5164 }, { "epoch": 0.41947535125477137, "grad_norm": 5.01210383268172, "learning_rate": 3.2614012999427934e-06, "loss": 0.4712, "step": 5165 }, { "epoch": 0.41955656623081294, "grad_norm": 4.527219081383319, "learning_rate": 3.26077489309814e-06, "loss": 0.7056, "step": 5166 }, { "epoch": 0.4196377812068545, "grad_norm": 4.790861705461241, "learning_rate": 3.2601484336118887e-06, "loss": 0.6324, "step": 5167 }, { "epoch": 0.41971899618289615, "grad_norm": 5.6189045494809875, "learning_rate": 3.2595219215273895e-06, "loss": 0.4605, "step": 5168 }, { "epoch": 0.41980021115893773, "grad_norm": 3.3926840995979206, "learning_rate": 3.258895356887993e-06, "loss": 0.4653, "step": 5169 }, { "epoch": 0.4198814261349793, "grad_norm": 4.7133012677803405, "learning_rate": 3.2582687397370538e-06, "loss": 0.3422, "step": 5170 }, { "epoch": 0.4199626411110209, "grad_norm": 6.7508413558662355, "learning_rate": 3.257642070117931e-06, "loss": 0.4689, "step": 5171 }, { "epoch": 0.42004385608706246, "grad_norm": 7.457776695287912, "learning_rate": 3.2570153480739867e-06, "loss": 0.5847, "step": 5172 }, { "epoch": 0.42012507106310404, "grad_norm": 5.26972034775471, "learning_rate": 3.2563885736485873e-06, "loss": 0.5858, "step": 5173 }, { "epoch": 0.4202062860391456, "grad_norm": 2.8327813340648325, "learning_rate": 3.255761746885101e-06, "loss": 0.437, "step": 5174 }, { "epoch": 0.4202875010151872, "grad_norm": 4.515606181218866, "learning_rate": 3.2551348678269023e-06, "loss": 0.6866, "step": 5175 }, { "epoch": 0.42036871599122877, "grad_norm": 4.52820690426378, "learning_rate": 3.2545079365173672e-06, "loss": 0.4457, "step": 5176 }, { "epoch": 0.42044993096727035, "grad_norm": 5.8094460720614505, "learning_rate": 3.253880952999876e-06, "loss": 0.501, "step": 5177 }, { "epoch": 0.4205311459433119, "grad_norm": 9.872169395018048, "learning_rate": 3.2532539173178125e-06, "loss": 0.6308, "step": 5178 }, { "epoch": 0.42061236091935356, "grad_norm": 8.660617050371519, "learning_rate": 3.2526268295145647e-06, "loss": 0.5709, "step": 5179 }, { "epoch": 0.42069357589539513, "grad_norm": 4.736443287957975, "learning_rate": 3.251999689633523e-06, "loss": 0.4353, "step": 5180 }, { "epoch": 0.4207747908714367, "grad_norm": 10.700216416621961, "learning_rate": 3.2513724977180828e-06, "loss": 0.623, "step": 5181 }, { "epoch": 0.4208560058474783, "grad_norm": 11.331063237198684, "learning_rate": 3.250745253811643e-06, "loss": 0.4766, "step": 5182 }, { "epoch": 0.42093722082351986, "grad_norm": 4.634568286716608, "learning_rate": 3.250117957957604e-06, "loss": 0.5879, "step": 5183 }, { "epoch": 0.42101843579956144, "grad_norm": 4.031580348842282, "learning_rate": 3.249490610199373e-06, "loss": 0.4788, "step": 5184 }, { "epoch": 0.421099650775603, "grad_norm": 5.071428970099418, "learning_rate": 3.248863210580358e-06, "loss": 0.4319, "step": 5185 }, { "epoch": 0.4211808657516446, "grad_norm": 4.271403990802303, "learning_rate": 3.248235759143972e-06, "loss": 0.4285, "step": 5186 }, { "epoch": 0.4212620807276862, "grad_norm": 5.620600198973997, "learning_rate": 3.247608255933632e-06, "loss": 0.5508, "step": 5187 }, { "epoch": 0.42134329570372775, "grad_norm": 3.5656379095963366, "learning_rate": 3.2469807009927568e-06, "loss": 0.5465, "step": 5188 }, { "epoch": 0.4214245106797693, "grad_norm": 4.267563683154685, "learning_rate": 3.2463530943647708e-06, "loss": 0.6901, "step": 5189 }, { "epoch": 0.42150572565581096, "grad_norm": 6.968445424375631, "learning_rate": 3.2457254360931013e-06, "loss": 0.6524, "step": 5190 }, { "epoch": 0.42158694063185254, "grad_norm": 6.404940362152012, "learning_rate": 3.245097726221177e-06, "loss": 0.4577, "step": 5191 }, { "epoch": 0.4216681556078941, "grad_norm": 10.422891227755217, "learning_rate": 3.244469964792434e-06, "loss": 0.487, "step": 5192 }, { "epoch": 0.4217493705839357, "grad_norm": 4.235778167208473, "learning_rate": 3.24384215185031e-06, "loss": 0.5221, "step": 5193 }, { "epoch": 0.42183058555997727, "grad_norm": 5.314122103787476, "learning_rate": 3.2432142874382442e-06, "loss": 0.5772, "step": 5194 }, { "epoch": 0.42191180053601884, "grad_norm": 34.115592182605965, "learning_rate": 3.2425863715996852e-06, "loss": 0.6579, "step": 5195 }, { "epoch": 0.4219930155120604, "grad_norm": 5.189198561105788, "learning_rate": 3.241958404378078e-06, "loss": 0.5213, "step": 5196 }, { "epoch": 0.422074230488102, "grad_norm": 4.933122374718706, "learning_rate": 3.2413303858168767e-06, "loss": 0.6707, "step": 5197 }, { "epoch": 0.4221554454641436, "grad_norm": 4.33962971377441, "learning_rate": 3.2407023159595356e-06, "loss": 0.4882, "step": 5198 }, { "epoch": 0.42223666044018515, "grad_norm": 6.399904760111607, "learning_rate": 3.2400741948495146e-06, "loss": 0.383, "step": 5199 }, { "epoch": 0.42231787541622673, "grad_norm": 11.60358668566535, "learning_rate": 3.239446022530276e-06, "loss": 0.6851, "step": 5200 }, { "epoch": 0.42239909039226836, "grad_norm": 10.376518506368765, "learning_rate": 3.2388177990452863e-06, "loss": 0.5153, "step": 5201 }, { "epoch": 0.42248030536830994, "grad_norm": 5.087706195580212, "learning_rate": 3.2381895244380146e-06, "loss": 0.4698, "step": 5202 }, { "epoch": 0.4225615203443515, "grad_norm": 3.447914836981935, "learning_rate": 3.237561198751935e-06, "loss": 0.5198, "step": 5203 }, { "epoch": 0.4226427353203931, "grad_norm": 7.767514821519397, "learning_rate": 3.2369328220305242e-06, "loss": 0.5854, "step": 5204 }, { "epoch": 0.42272395029643467, "grad_norm": 7.5527718662260295, "learning_rate": 3.2363043943172616e-06, "loss": 0.4495, "step": 5205 }, { "epoch": 0.42280516527247625, "grad_norm": 6.408787873539771, "learning_rate": 3.235675915655633e-06, "loss": 0.4103, "step": 5206 }, { "epoch": 0.4228863802485178, "grad_norm": 8.10134326274207, "learning_rate": 3.235047386089123e-06, "loss": 0.4829, "step": 5207 }, { "epoch": 0.4229675952245594, "grad_norm": 4.1969050205146425, "learning_rate": 3.2344188056612247e-06, "loss": 0.5541, "step": 5208 }, { "epoch": 0.423048810200601, "grad_norm": 4.954025585754433, "learning_rate": 3.233790174415432e-06, "loss": 0.4104, "step": 5209 }, { "epoch": 0.42313002517664255, "grad_norm": 5.191981721617316, "learning_rate": 3.2331614923952424e-06, "loss": 0.6211, "step": 5210 }, { "epoch": 0.42321124015268413, "grad_norm": 10.46456809201404, "learning_rate": 3.232532759644158e-06, "loss": 0.5638, "step": 5211 }, { "epoch": 0.42329245512872576, "grad_norm": 4.916424280031669, "learning_rate": 3.231903976205684e-06, "loss": 0.4139, "step": 5212 }, { "epoch": 0.42337367010476734, "grad_norm": 4.247004575276243, "learning_rate": 3.231275142123328e-06, "loss": 0.3899, "step": 5213 }, { "epoch": 0.4234548850808089, "grad_norm": 4.899674792955789, "learning_rate": 3.2306462574406024e-06, "loss": 0.697, "step": 5214 }, { "epoch": 0.4235361000568505, "grad_norm": 4.934275124900704, "learning_rate": 3.2300173222010225e-06, "loss": 0.6492, "step": 5215 }, { "epoch": 0.42361731503289207, "grad_norm": 4.326952640941383, "learning_rate": 3.229388336448107e-06, "loss": 0.5192, "step": 5216 }, { "epoch": 0.42369853000893365, "grad_norm": 6.478214535298061, "learning_rate": 3.22875930022538e-06, "loss": 0.5203, "step": 5217 }, { "epoch": 0.4237797449849752, "grad_norm": 3.7343533496893393, "learning_rate": 3.2281302135763655e-06, "loss": 0.5004, "step": 5218 }, { "epoch": 0.4238609599610168, "grad_norm": 4.189623891419984, "learning_rate": 3.227501076544594e-06, "loss": 0.5008, "step": 5219 }, { "epoch": 0.4239421749370584, "grad_norm": 9.421806026431668, "learning_rate": 3.2268718891735985e-06, "loss": 0.4428, "step": 5220 }, { "epoch": 0.42402338991309996, "grad_norm": 8.411484589237242, "learning_rate": 3.2262426515069144e-06, "loss": 0.5658, "step": 5221 }, { "epoch": 0.42410460488914153, "grad_norm": 5.809866223556941, "learning_rate": 3.225613363588084e-06, "loss": 0.5688, "step": 5222 }, { "epoch": 0.42418581986518317, "grad_norm": 4.3592044879636465, "learning_rate": 3.2249840254606474e-06, "loss": 0.5421, "step": 5223 }, { "epoch": 0.42426703484122474, "grad_norm": 2.8417643428931147, "learning_rate": 3.2243546371681535e-06, "loss": 0.5438, "step": 5224 }, { "epoch": 0.4243482498172663, "grad_norm": 4.321132502798912, "learning_rate": 3.2237251987541535e-06, "loss": 0.669, "step": 5225 }, { "epoch": 0.4244294647933079, "grad_norm": 17.616950980213982, "learning_rate": 3.223095710262199e-06, "loss": 0.5083, "step": 5226 }, { "epoch": 0.4245106797693495, "grad_norm": 6.679913112655923, "learning_rate": 3.2224661717358474e-06, "loss": 0.5919, "step": 5227 }, { "epoch": 0.42459189474539105, "grad_norm": 4.47571261524808, "learning_rate": 3.221836583218662e-06, "loss": 0.376, "step": 5228 }, { "epoch": 0.42467310972143263, "grad_norm": 8.501783709365114, "learning_rate": 3.221206944754205e-06, "loss": 0.4275, "step": 5229 }, { "epoch": 0.4247543246974742, "grad_norm": 4.892245082709042, "learning_rate": 3.220577256386043e-06, "loss": 0.6864, "step": 5230 }, { "epoch": 0.4248355396735158, "grad_norm": 5.054086864480611, "learning_rate": 3.21994751815775e-06, "loss": 0.5241, "step": 5231 }, { "epoch": 0.42491675464955736, "grad_norm": 3.2768063978983095, "learning_rate": 3.2193177301128985e-06, "loss": 0.5298, "step": 5232 }, { "epoch": 0.42499796962559894, "grad_norm": 4.649050909544704, "learning_rate": 3.2186878922950672e-06, "loss": 0.6465, "step": 5233 }, { "epoch": 0.42507918460164057, "grad_norm": 4.194611280947634, "learning_rate": 3.218058004747837e-06, "loss": 0.5813, "step": 5234 }, { "epoch": 0.42516039957768215, "grad_norm": 12.035010001320948, "learning_rate": 3.2174280675147933e-06, "loss": 0.444, "step": 5235 }, { "epoch": 0.4252416145537237, "grad_norm": 3.9279173401772285, "learning_rate": 3.2167980806395244e-06, "loss": 0.4574, "step": 5236 }, { "epoch": 0.4253228295297653, "grad_norm": 3.760359692561372, "learning_rate": 3.216168044165622e-06, "loss": 0.4997, "step": 5237 }, { "epoch": 0.4254040445058069, "grad_norm": 6.915480718748514, "learning_rate": 3.215537958136681e-06, "loss": 0.5207, "step": 5238 }, { "epoch": 0.42548525948184845, "grad_norm": 9.393926456415134, "learning_rate": 3.2149078225963e-06, "loss": 0.4996, "step": 5239 }, { "epoch": 0.42556647445789003, "grad_norm": 5.607193609498181, "learning_rate": 3.2142776375880814e-06, "loss": 0.6179, "step": 5240 }, { "epoch": 0.4256476894339316, "grad_norm": 7.777688950913407, "learning_rate": 3.213647403155631e-06, "loss": 0.4407, "step": 5241 }, { "epoch": 0.4257289044099732, "grad_norm": 5.406198429944911, "learning_rate": 3.213017119342557e-06, "loss": 0.4342, "step": 5242 }, { "epoch": 0.42581011938601476, "grad_norm": 4.332500644815715, "learning_rate": 3.2123867861924705e-06, "loss": 0.547, "step": 5243 }, { "epoch": 0.42589133436205634, "grad_norm": 4.158064015085327, "learning_rate": 3.211756403748991e-06, "loss": 0.6803, "step": 5244 }, { "epoch": 0.42597254933809797, "grad_norm": 4.670204213436716, "learning_rate": 3.211125972055734e-06, "loss": 0.3712, "step": 5245 }, { "epoch": 0.42605376431413955, "grad_norm": 3.1072025340370835, "learning_rate": 3.210495491156323e-06, "loss": 0.6592, "step": 5246 }, { "epoch": 0.4261349792901811, "grad_norm": 3.845276286103103, "learning_rate": 3.2098649610943855e-06, "loss": 0.3973, "step": 5247 }, { "epoch": 0.4262161942662227, "grad_norm": 5.263686826495678, "learning_rate": 3.2092343819135485e-06, "loss": 0.4361, "step": 5248 }, { "epoch": 0.4262974092422643, "grad_norm": 4.706614514686621, "learning_rate": 3.2086037536574467e-06, "loss": 0.5642, "step": 5249 }, { "epoch": 0.42637862421830586, "grad_norm": 7.9955278197924455, "learning_rate": 3.207973076369715e-06, "loss": 0.4473, "step": 5250 }, { "epoch": 0.42645983919434743, "grad_norm": 5.6591134035265505, "learning_rate": 3.2073423500939926e-06, "loss": 0.4853, "step": 5251 }, { "epoch": 0.426541054170389, "grad_norm": 8.612516192884062, "learning_rate": 3.206711574873924e-06, "loss": 0.462, "step": 5252 }, { "epoch": 0.4266222691464306, "grad_norm": 5.102810963488392, "learning_rate": 3.2060807507531545e-06, "loss": 0.5559, "step": 5253 }, { "epoch": 0.42670348412247217, "grad_norm": 7.341129074246249, "learning_rate": 3.2054498777753335e-06, "loss": 0.5183, "step": 5254 }, { "epoch": 0.42678469909851374, "grad_norm": 5.00922975998508, "learning_rate": 3.204818955984115e-06, "loss": 0.4557, "step": 5255 }, { "epoch": 0.4268659140745554, "grad_norm": 3.0225137055666087, "learning_rate": 3.2041879854231545e-06, "loss": 0.6202, "step": 5256 }, { "epoch": 0.42694712905059695, "grad_norm": 5.862930940720101, "learning_rate": 3.203556966136113e-06, "loss": 0.5674, "step": 5257 }, { "epoch": 0.42702834402663853, "grad_norm": 8.760401643981206, "learning_rate": 3.202925898166652e-06, "loss": 0.4603, "step": 5258 }, { "epoch": 0.4271095590026801, "grad_norm": 7.198009661677629, "learning_rate": 3.2022947815584393e-06, "loss": 0.4481, "step": 5259 }, { "epoch": 0.4271907739787217, "grad_norm": 4.513288458419195, "learning_rate": 3.2016636163551456e-06, "loss": 0.4721, "step": 5260 }, { "epoch": 0.42727198895476326, "grad_norm": 4.556711076123282, "learning_rate": 3.2010324026004425e-06, "loss": 0.5153, "step": 5261 }, { "epoch": 0.42735320393080484, "grad_norm": 4.403874321167366, "learning_rate": 3.200401140338007e-06, "loss": 0.5405, "step": 5262 }, { "epoch": 0.4274344189068464, "grad_norm": 15.464524857508628, "learning_rate": 3.1997698296115192e-06, "loss": 0.5812, "step": 5263 }, { "epoch": 0.427515633882888, "grad_norm": 6.763078104210325, "learning_rate": 3.1991384704646632e-06, "loss": 0.4366, "step": 5264 }, { "epoch": 0.42759684885892957, "grad_norm": 3.8083736623019377, "learning_rate": 3.198507062941125e-06, "loss": 0.7049, "step": 5265 }, { "epoch": 0.42767806383497114, "grad_norm": 4.411432581791518, "learning_rate": 3.197875607084595e-06, "loss": 0.5536, "step": 5266 }, { "epoch": 0.4277592788110128, "grad_norm": 5.209356808212503, "learning_rate": 3.1972441029387664e-06, "loss": 0.7173, "step": 5267 }, { "epoch": 0.42784049378705435, "grad_norm": 4.422812878298527, "learning_rate": 3.196612550547336e-06, "loss": 0.5126, "step": 5268 }, { "epoch": 0.42792170876309593, "grad_norm": 4.063375030795528, "learning_rate": 3.1959809499540033e-06, "loss": 0.4163, "step": 5269 }, { "epoch": 0.4280029237391375, "grad_norm": 6.439580078200473, "learning_rate": 3.1953493012024728e-06, "loss": 0.46, "step": 5270 }, { "epoch": 0.4280841387151791, "grad_norm": 6.644696648989759, "learning_rate": 3.1947176043364512e-06, "loss": 0.5559, "step": 5271 }, { "epoch": 0.42816535369122066, "grad_norm": 9.625730715608832, "learning_rate": 3.194085859399647e-06, "loss": 0.4542, "step": 5272 }, { "epoch": 0.42824656866726224, "grad_norm": 6.80723990351529, "learning_rate": 3.1934540664357756e-06, "loss": 0.3875, "step": 5273 }, { "epoch": 0.4283277836433038, "grad_norm": 4.063505294236836, "learning_rate": 3.1928222254885527e-06, "loss": 0.4263, "step": 5274 }, { "epoch": 0.4284089986193454, "grad_norm": 5.254426921833707, "learning_rate": 3.192190336601698e-06, "loss": 0.601, "step": 5275 }, { "epoch": 0.42849021359538697, "grad_norm": 3.3439818153250274, "learning_rate": 3.1915583998189365e-06, "loss": 0.8393, "step": 5276 }, { "epoch": 0.42857142857142855, "grad_norm": 4.423002542739268, "learning_rate": 3.190926415183993e-06, "loss": 0.4905, "step": 5277 }, { "epoch": 0.4286526435474702, "grad_norm": 4.969141698478439, "learning_rate": 3.190294382740598e-06, "loss": 0.4598, "step": 5278 }, { "epoch": 0.42873385852351176, "grad_norm": 5.329263080541811, "learning_rate": 3.189662302532486e-06, "loss": 0.5027, "step": 5279 }, { "epoch": 0.42881507349955333, "grad_norm": 8.465008482689347, "learning_rate": 3.1890301746033914e-06, "loss": 0.4451, "step": 5280 }, { "epoch": 0.4288962884755949, "grad_norm": 4.288959841432594, "learning_rate": 3.188397998997056e-06, "loss": 0.5379, "step": 5281 }, { "epoch": 0.4289775034516365, "grad_norm": 4.307272162236529, "learning_rate": 3.1877657757572223e-06, "loss": 0.3864, "step": 5282 }, { "epoch": 0.42905871842767807, "grad_norm": 4.853723061541425, "learning_rate": 3.187133504927637e-06, "loss": 0.5744, "step": 5283 }, { "epoch": 0.42913993340371964, "grad_norm": 9.888573881930817, "learning_rate": 3.18650118655205e-06, "loss": 0.4017, "step": 5284 }, { "epoch": 0.4292211483797612, "grad_norm": 6.195263958719191, "learning_rate": 3.1858688206742135e-06, "loss": 0.4613, "step": 5285 }, { "epoch": 0.4293023633558028, "grad_norm": 5.993567524504076, "learning_rate": 3.1852364073378845e-06, "loss": 0.5304, "step": 5286 }, { "epoch": 0.4293835783318444, "grad_norm": 6.246122494535498, "learning_rate": 3.1846039465868233e-06, "loss": 0.4809, "step": 5287 }, { "epoch": 0.42946479330788595, "grad_norm": 11.344295485158227, "learning_rate": 3.1839714384647914e-06, "loss": 0.416, "step": 5288 }, { "epoch": 0.4295460082839276, "grad_norm": 6.789926972632632, "learning_rate": 3.1833388830155564e-06, "loss": 0.5286, "step": 5289 }, { "epoch": 0.42962722325996916, "grad_norm": 4.731313984743315, "learning_rate": 3.1827062802828878e-06, "loss": 0.6368, "step": 5290 }, { "epoch": 0.42970843823601074, "grad_norm": 8.422362176653303, "learning_rate": 3.182073630310557e-06, "loss": 0.4566, "step": 5291 }, { "epoch": 0.4297896532120523, "grad_norm": 5.811200383485644, "learning_rate": 3.18144093314234e-06, "loss": 0.7081, "step": 5292 }, { "epoch": 0.4298708681880939, "grad_norm": 3.464910361407899, "learning_rate": 3.180808188822019e-06, "loss": 0.6889, "step": 5293 }, { "epoch": 0.42995208316413547, "grad_norm": 3.8106177663295115, "learning_rate": 3.180175397393373e-06, "loss": 0.6863, "step": 5294 }, { "epoch": 0.43003329814017704, "grad_norm": 8.071444985141785, "learning_rate": 3.1795425589001896e-06, "loss": 0.5488, "step": 5295 }, { "epoch": 0.4301145131162186, "grad_norm": 4.430290323012749, "learning_rate": 3.178909673386257e-06, "loss": 0.5592, "step": 5296 }, { "epoch": 0.4301957280922602, "grad_norm": 4.5330502482145505, "learning_rate": 3.178276740895369e-06, "loss": 0.5002, "step": 5297 }, { "epoch": 0.4302769430683018, "grad_norm": 7.279026055668376, "learning_rate": 3.1776437614713197e-06, "loss": 0.547, "step": 5298 }, { "epoch": 0.43035815804434335, "grad_norm": 5.6210852302334855, "learning_rate": 3.177010735157909e-06, "loss": 0.6157, "step": 5299 }, { "epoch": 0.430439373020385, "grad_norm": 6.112959552581171, "learning_rate": 3.1763776619989377e-06, "loss": 0.5323, "step": 5300 }, { "epoch": 0.43052058799642656, "grad_norm": 4.4197983016582745, "learning_rate": 3.175744542038212e-06, "loss": 0.525, "step": 5301 }, { "epoch": 0.43060180297246814, "grad_norm": 5.4564596091826, "learning_rate": 3.175111375319541e-06, "loss": 0.5436, "step": 5302 }, { "epoch": 0.4306830179485097, "grad_norm": 4.9689391465007615, "learning_rate": 3.174478161886736e-06, "loss": 0.4957, "step": 5303 }, { "epoch": 0.4307642329245513, "grad_norm": 9.04638437397121, "learning_rate": 3.1738449017836102e-06, "loss": 0.4726, "step": 5304 }, { "epoch": 0.43084544790059287, "grad_norm": 6.518271460980143, "learning_rate": 3.173211595053985e-06, "loss": 0.4154, "step": 5305 }, { "epoch": 0.43092666287663445, "grad_norm": 9.139018032286032, "learning_rate": 3.17257824174168e-06, "loss": 0.6605, "step": 5306 }, { "epoch": 0.431007877852676, "grad_norm": 5.893703080064373, "learning_rate": 3.17194484189052e-06, "loss": 0.5479, "step": 5307 }, { "epoch": 0.4310890928287176, "grad_norm": 6.561881133428007, "learning_rate": 3.171311395544333e-06, "loss": 0.5071, "step": 5308 }, { "epoch": 0.4311703078047592, "grad_norm": 5.394616296616023, "learning_rate": 3.170677902746951e-06, "loss": 0.4559, "step": 5309 }, { "epoch": 0.43125152278080076, "grad_norm": 6.001462406111083, "learning_rate": 3.170044363542207e-06, "loss": 0.4429, "step": 5310 }, { "epoch": 0.4313327377568424, "grad_norm": 5.400528463179634, "learning_rate": 3.1694107779739394e-06, "loss": 0.4939, "step": 5311 }, { "epoch": 0.43141395273288397, "grad_norm": 3.4971169628242347, "learning_rate": 3.1687771460859886e-06, "loss": 0.5207, "step": 5312 }, { "epoch": 0.43149516770892554, "grad_norm": 5.801443009456409, "learning_rate": 3.168143467922199e-06, "loss": 0.537, "step": 5313 }, { "epoch": 0.4315763826849671, "grad_norm": 3.877634512463603, "learning_rate": 3.1675097435264175e-06, "loss": 0.6002, "step": 5314 }, { "epoch": 0.4316575976610087, "grad_norm": 4.436235293442774, "learning_rate": 3.166875972942494e-06, "loss": 0.5651, "step": 5315 }, { "epoch": 0.4317388126370503, "grad_norm": 3.8720642376323857, "learning_rate": 3.166242156214283e-06, "loss": 0.5255, "step": 5316 }, { "epoch": 0.43182002761309185, "grad_norm": 5.096619815050037, "learning_rate": 3.1656082933856415e-06, "loss": 0.4563, "step": 5317 }, { "epoch": 0.4319012425891334, "grad_norm": 4.129271114889403, "learning_rate": 3.1649743845004275e-06, "loss": 0.4742, "step": 5318 }, { "epoch": 0.431982457565175, "grad_norm": 5.369554351025192, "learning_rate": 3.164340429602506e-06, "loss": 0.5313, "step": 5319 }, { "epoch": 0.4320636725412166, "grad_norm": 5.6537665065078615, "learning_rate": 3.1637064287357433e-06, "loss": 0.427, "step": 5320 }, { "epoch": 0.43214488751725816, "grad_norm": 5.909754479859293, "learning_rate": 3.1630723819440075e-06, "loss": 0.4343, "step": 5321 }, { "epoch": 0.4322261024932998, "grad_norm": 9.806257811882773, "learning_rate": 3.1624382892711724e-06, "loss": 0.4722, "step": 5322 }, { "epoch": 0.43230731746934137, "grad_norm": 9.73741516646952, "learning_rate": 3.161804150761114e-06, "loss": 0.5959, "step": 5323 }, { "epoch": 0.43238853244538294, "grad_norm": 4.940223216087869, "learning_rate": 3.16116996645771e-06, "loss": 0.5979, "step": 5324 }, { "epoch": 0.4324697474214245, "grad_norm": 16.649964044548284, "learning_rate": 3.1605357364048446e-06, "loss": 0.4802, "step": 5325 }, { "epoch": 0.4325509623974661, "grad_norm": 7.719840433898037, "learning_rate": 3.159901460646401e-06, "loss": 0.5291, "step": 5326 }, { "epoch": 0.4326321773735077, "grad_norm": 5.612098859952399, "learning_rate": 3.15926713922627e-06, "loss": 0.4709, "step": 5327 }, { "epoch": 0.43271339234954925, "grad_norm": 15.165120351544296, "learning_rate": 3.1586327721883416e-06, "loss": 0.4699, "step": 5328 }, { "epoch": 0.43279460732559083, "grad_norm": 7.373093805345169, "learning_rate": 3.1579983595765107e-06, "loss": 0.5246, "step": 5329 }, { "epoch": 0.4328758223016324, "grad_norm": 4.232276270918911, "learning_rate": 3.1573639014346756e-06, "loss": 0.413, "step": 5330 }, { "epoch": 0.432957037277674, "grad_norm": 5.475928131532231, "learning_rate": 3.1567293978067383e-06, "loss": 0.4798, "step": 5331 }, { "epoch": 0.43303825225371556, "grad_norm": 5.319170320864578, "learning_rate": 3.1560948487366016e-06, "loss": 0.5221, "step": 5332 }, { "epoch": 0.4331194672297572, "grad_norm": 4.7595654317237654, "learning_rate": 3.1554602542681746e-06, "loss": 0.5395, "step": 5333 }, { "epoch": 0.43320068220579877, "grad_norm": 5.163449251799242, "learning_rate": 3.154825614445366e-06, "loss": 0.5755, "step": 5334 }, { "epoch": 0.43328189718184035, "grad_norm": 4.2774589507058085, "learning_rate": 3.154190929312091e-06, "loss": 0.4156, "step": 5335 }, { "epoch": 0.4333631121578819, "grad_norm": 6.636509210502926, "learning_rate": 3.1535561989122667e-06, "loss": 0.3532, "step": 5336 }, { "epoch": 0.4334443271339235, "grad_norm": 4.679244393970052, "learning_rate": 3.152921423289811e-06, "loss": 0.5302, "step": 5337 }, { "epoch": 0.4335255421099651, "grad_norm": 4.600315390689705, "learning_rate": 3.1522866024886497e-06, "loss": 0.5156, "step": 5338 }, { "epoch": 0.43360675708600666, "grad_norm": 7.387419218755155, "learning_rate": 3.1516517365527064e-06, "loss": 0.6254, "step": 5339 }, { "epoch": 0.43368797206204823, "grad_norm": 8.038125663875935, "learning_rate": 3.151016825525912e-06, "loss": 0.4804, "step": 5340 }, { "epoch": 0.4337691870380898, "grad_norm": 5.664820632074068, "learning_rate": 3.1503818694521993e-06, "loss": 0.5997, "step": 5341 }, { "epoch": 0.4338504020141314, "grad_norm": 5.420884590250101, "learning_rate": 3.1497468683755027e-06, "loss": 0.5103, "step": 5342 }, { "epoch": 0.43393161699017296, "grad_norm": 4.344293142047334, "learning_rate": 3.1491118223397622e-06, "loss": 0.6567, "step": 5343 }, { "epoch": 0.4340128319662146, "grad_norm": 3.3509931709068184, "learning_rate": 3.1484767313889186e-06, "loss": 0.5217, "step": 5344 }, { "epoch": 0.4340940469422562, "grad_norm": 6.46488127239872, "learning_rate": 3.1478415955669174e-06, "loss": 0.5403, "step": 5345 }, { "epoch": 0.43417526191829775, "grad_norm": 28.243274098376233, "learning_rate": 3.1472064149177063e-06, "loss": 0.613, "step": 5346 }, { "epoch": 0.4342564768943393, "grad_norm": 6.961851378871304, "learning_rate": 3.1465711894852364e-06, "loss": 0.5199, "step": 5347 }, { "epoch": 0.4343376918703809, "grad_norm": 4.438081889262291, "learning_rate": 3.145935919313462e-06, "loss": 0.6373, "step": 5348 }, { "epoch": 0.4344189068464225, "grad_norm": 4.323565064610984, "learning_rate": 3.1453006044463417e-06, "loss": 0.5597, "step": 5349 }, { "epoch": 0.43450012182246406, "grad_norm": 3.0858340228520826, "learning_rate": 3.144665244927833e-06, "loss": 0.5194, "step": 5350 }, { "epoch": 0.43458133679850564, "grad_norm": 4.877755892925608, "learning_rate": 3.144029840801902e-06, "loss": 0.7601, "step": 5351 }, { "epoch": 0.4346625517745472, "grad_norm": 4.213210244527702, "learning_rate": 3.1433943921125154e-06, "loss": 0.5054, "step": 5352 }, { "epoch": 0.4347437667505888, "grad_norm": 4.009304731887304, "learning_rate": 3.1427588989036406e-06, "loss": 0.8653, "step": 5353 }, { "epoch": 0.43482498172663037, "grad_norm": 6.671797591935914, "learning_rate": 3.1421233612192527e-06, "loss": 0.5943, "step": 5354 }, { "epoch": 0.434906196702672, "grad_norm": 6.798919847330098, "learning_rate": 3.1414877791033267e-06, "loss": 0.5151, "step": 5355 }, { "epoch": 0.4349874116787136, "grad_norm": 5.3539572195583975, "learning_rate": 3.1408521525998403e-06, "loss": 0.4387, "step": 5356 }, { "epoch": 0.43506862665475515, "grad_norm": 4.194017130183013, "learning_rate": 3.1402164817527776e-06, "loss": 0.3866, "step": 5357 }, { "epoch": 0.43514984163079673, "grad_norm": 3.6695150698980354, "learning_rate": 3.1395807666061223e-06, "loss": 0.6504, "step": 5358 }, { "epoch": 0.4352310566068383, "grad_norm": 4.641952406086023, "learning_rate": 3.138945007203863e-06, "loss": 0.4588, "step": 5359 }, { "epoch": 0.4353122715828799, "grad_norm": 10.931096401774482, "learning_rate": 3.1383092035899903e-06, "loss": 0.5878, "step": 5360 }, { "epoch": 0.43539348655892146, "grad_norm": 5.762955444790604, "learning_rate": 3.1376733558084994e-06, "loss": 0.3661, "step": 5361 }, { "epoch": 0.43547470153496304, "grad_norm": 4.461787974097396, "learning_rate": 3.1370374639033876e-06, "loss": 0.5696, "step": 5362 }, { "epoch": 0.4355559165110046, "grad_norm": 3.0791414260530683, "learning_rate": 3.1364015279186537e-06, "loss": 0.6584, "step": 5363 }, { "epoch": 0.4356371314870462, "grad_norm": 5.145988507189497, "learning_rate": 3.1357655478983028e-06, "loss": 0.4157, "step": 5364 }, { "epoch": 0.43571834646308777, "grad_norm": 4.936564393357792, "learning_rate": 3.135129523886341e-06, "loss": 0.6098, "step": 5365 }, { "epoch": 0.4357995614391294, "grad_norm": 5.83447257075552, "learning_rate": 3.1344934559267763e-06, "loss": 0.4316, "step": 5366 }, { "epoch": 0.435880776415171, "grad_norm": 4.5475608942051196, "learning_rate": 3.1338573440636232e-06, "loss": 0.591, "step": 5367 }, { "epoch": 0.43596199139121256, "grad_norm": 6.710109555046167, "learning_rate": 3.133221188340897e-06, "loss": 0.5388, "step": 5368 }, { "epoch": 0.43604320636725413, "grad_norm": 6.706321579158362, "learning_rate": 3.132584988802615e-06, "loss": 0.5046, "step": 5369 }, { "epoch": 0.4361244213432957, "grad_norm": 5.443757710169265, "learning_rate": 3.1319487454928005e-06, "loss": 0.5206, "step": 5370 }, { "epoch": 0.4362056363193373, "grad_norm": 43.84951492925826, "learning_rate": 3.1313124584554772e-06, "loss": 0.601, "step": 5371 }, { "epoch": 0.43628685129537886, "grad_norm": 2.965181185591869, "learning_rate": 3.130676127734673e-06, "loss": 0.5649, "step": 5372 }, { "epoch": 0.43636806627142044, "grad_norm": 4.0688538065516955, "learning_rate": 3.1300397533744176e-06, "loss": 0.5637, "step": 5373 }, { "epoch": 0.436449281247462, "grad_norm": 4.285947029576996, "learning_rate": 3.129403335418747e-06, "loss": 0.6039, "step": 5374 }, { "epoch": 0.4365304962235036, "grad_norm": 9.05418272699839, "learning_rate": 3.128766873911696e-06, "loss": 0.504, "step": 5375 }, { "epoch": 0.43661171119954517, "grad_norm": 7.150334825744675, "learning_rate": 3.1281303688973054e-06, "loss": 0.437, "step": 5376 }, { "epoch": 0.4366929261755868, "grad_norm": 4.999179068937834, "learning_rate": 3.127493820419617e-06, "loss": 0.4809, "step": 5377 }, { "epoch": 0.4367741411516284, "grad_norm": 4.8473879562154565, "learning_rate": 3.1268572285226773e-06, "loss": 0.4894, "step": 5378 }, { "epoch": 0.43685535612766996, "grad_norm": 6.929506260339618, "learning_rate": 3.1262205932505353e-06, "loss": 0.4363, "step": 5379 }, { "epoch": 0.43693657110371154, "grad_norm": 5.5615771450834846, "learning_rate": 3.125583914647242e-06, "loss": 0.5205, "step": 5380 }, { "epoch": 0.4370177860797531, "grad_norm": 3.3462484960680072, "learning_rate": 3.124947192756853e-06, "loss": 0.6615, "step": 5381 }, { "epoch": 0.4370990010557947, "grad_norm": 6.634956155249173, "learning_rate": 3.124310427623426e-06, "loss": 0.4979, "step": 5382 }, { "epoch": 0.43718021603183627, "grad_norm": 5.3106857326366095, "learning_rate": 3.123673619291021e-06, "loss": 0.5231, "step": 5383 }, { "epoch": 0.43726143100787784, "grad_norm": 4.925146872849586, "learning_rate": 3.123036767803703e-06, "loss": 0.6155, "step": 5384 }, { "epoch": 0.4373426459839194, "grad_norm": 6.351273194254498, "learning_rate": 3.122399873205538e-06, "loss": 0.4055, "step": 5385 }, { "epoch": 0.437423860959961, "grad_norm": 7.439635575652295, "learning_rate": 3.121762935540595e-06, "loss": 0.4508, "step": 5386 }, { "epoch": 0.4375050759360026, "grad_norm": 11.221500508022611, "learning_rate": 3.121125954852948e-06, "loss": 0.4207, "step": 5387 }, { "epoch": 0.4375862909120442, "grad_norm": 15.28900713021598, "learning_rate": 3.120488931186672e-06, "loss": 0.4755, "step": 5388 }, { "epoch": 0.4376675058880858, "grad_norm": 3.903889410795801, "learning_rate": 3.1198518645858455e-06, "loss": 0.4798, "step": 5389 }, { "epoch": 0.43774872086412736, "grad_norm": 6.674485525301077, "learning_rate": 3.1192147550945517e-06, "loss": 0.6514, "step": 5390 }, { "epoch": 0.43782993584016894, "grad_norm": 14.258332144198288, "learning_rate": 3.118577602756873e-06, "loss": 0.4016, "step": 5391 }, { "epoch": 0.4379111508162105, "grad_norm": 4.922640216869197, "learning_rate": 3.1179404076168983e-06, "loss": 0.5567, "step": 5392 }, { "epoch": 0.4379923657922521, "grad_norm": 5.597787362812228, "learning_rate": 3.1173031697187178e-06, "loss": 0.6444, "step": 5393 }, { "epoch": 0.43807358076829367, "grad_norm": 3.8553056656955143, "learning_rate": 3.116665889106425e-06, "loss": 0.3967, "step": 5394 }, { "epoch": 0.43815479574433525, "grad_norm": 3.517771835764705, "learning_rate": 3.1160285658241157e-06, "loss": 0.4896, "step": 5395 }, { "epoch": 0.4382360107203768, "grad_norm": 3.6077248593647715, "learning_rate": 3.11539119991589e-06, "loss": 0.6903, "step": 5396 }, { "epoch": 0.4383172256964184, "grad_norm": 3.6023017655336056, "learning_rate": 3.1147537914258513e-06, "loss": 0.602, "step": 5397 }, { "epoch": 0.43839844067246, "grad_norm": 5.836048987707051, "learning_rate": 3.1141163403981033e-06, "loss": 0.4444, "step": 5398 }, { "epoch": 0.4384796556485016, "grad_norm": 4.215159805640962, "learning_rate": 3.113478846876754e-06, "loss": 0.6145, "step": 5399 }, { "epoch": 0.4385608706245432, "grad_norm": 7.095922534739042, "learning_rate": 3.1128413109059164e-06, "loss": 0.5145, "step": 5400 }, { "epoch": 0.43864208560058476, "grad_norm": 3.5538230920359077, "learning_rate": 3.1122037325297027e-06, "loss": 0.6229, "step": 5401 }, { "epoch": 0.43872330057662634, "grad_norm": 5.0267826629365455, "learning_rate": 3.1115661117922307e-06, "loss": 0.5481, "step": 5402 }, { "epoch": 0.4388045155526679, "grad_norm": 3.179251364264237, "learning_rate": 3.1109284487376213e-06, "loss": 0.5039, "step": 5403 }, { "epoch": 0.4388857305287095, "grad_norm": 3.2478475873654626, "learning_rate": 3.1102907434099962e-06, "loss": 0.5207, "step": 5404 }, { "epoch": 0.43896694550475107, "grad_norm": 7.568604033338787, "learning_rate": 3.1096529958534805e-06, "loss": 0.5009, "step": 5405 }, { "epoch": 0.43904816048079265, "grad_norm": 5.352814742876498, "learning_rate": 3.1090152061122053e-06, "loss": 0.4465, "step": 5406 }, { "epoch": 0.4391293754568342, "grad_norm": 6.7191876007979525, "learning_rate": 3.1083773742303003e-06, "loss": 0.5651, "step": 5407 }, { "epoch": 0.4392105904328758, "grad_norm": 4.940449250528321, "learning_rate": 3.1077395002519013e-06, "loss": 0.5716, "step": 5408 }, { "epoch": 0.4392918054089174, "grad_norm": 3.648123205660983, "learning_rate": 3.1071015842211447e-06, "loss": 0.9121, "step": 5409 }, { "epoch": 0.439373020384959, "grad_norm": 4.953572403071827, "learning_rate": 3.1064636261821716e-06, "loss": 0.6146, "step": 5410 }, { "epoch": 0.4394542353610006, "grad_norm": 6.9246508482882145, "learning_rate": 3.105825626179126e-06, "loss": 0.3964, "step": 5411 }, { "epoch": 0.43953545033704217, "grad_norm": 4.790951487900791, "learning_rate": 3.1051875842561523e-06, "loss": 0.6518, "step": 5412 }, { "epoch": 0.43961666531308374, "grad_norm": 4.736239149088806, "learning_rate": 3.1045495004574017e-06, "loss": 0.5212, "step": 5413 }, { "epoch": 0.4396978802891253, "grad_norm": 7.540621113454675, "learning_rate": 3.1039113748270248e-06, "loss": 0.5596, "step": 5414 }, { "epoch": 0.4397790952651669, "grad_norm": 3.720380110888026, "learning_rate": 3.1032732074091765e-06, "loss": 0.4385, "step": 5415 }, { "epoch": 0.4398603102412085, "grad_norm": 4.7658804425871235, "learning_rate": 3.1026349982480153e-06, "loss": 0.6445, "step": 5416 }, { "epoch": 0.43994152521725005, "grad_norm": 3.8751042373300963, "learning_rate": 3.101996747387702e-06, "loss": 0.5791, "step": 5417 }, { "epoch": 0.44002274019329163, "grad_norm": 4.852177339352803, "learning_rate": 3.101358454872399e-06, "loss": 0.5171, "step": 5418 }, { "epoch": 0.4401039551693332, "grad_norm": 4.223969934330754, "learning_rate": 3.1007201207462745e-06, "loss": 0.491, "step": 5419 }, { "epoch": 0.4401851701453748, "grad_norm": 6.43559751438046, "learning_rate": 3.1000817450534964e-06, "loss": 0.4736, "step": 5420 }, { "epoch": 0.4402663851214164, "grad_norm": 4.415584725720336, "learning_rate": 3.0994433278382374e-06, "loss": 0.5335, "step": 5421 }, { "epoch": 0.440347600097458, "grad_norm": 7.044600702022488, "learning_rate": 3.0988048691446733e-06, "loss": 0.4042, "step": 5422 }, { "epoch": 0.44042881507349957, "grad_norm": 5.958216205084174, "learning_rate": 3.0981663690169806e-06, "loss": 0.5884, "step": 5423 }, { "epoch": 0.44051003004954115, "grad_norm": 7.972188724833584, "learning_rate": 3.097527827499341e-06, "loss": 0.4748, "step": 5424 }, { "epoch": 0.4405912450255827, "grad_norm": 5.3560614787912195, "learning_rate": 3.0968892446359383e-06, "loss": 0.4807, "step": 5425 }, { "epoch": 0.4406724600016243, "grad_norm": 4.339200995867066, "learning_rate": 3.0962506204709587e-06, "loss": 0.5928, "step": 5426 }, { "epoch": 0.4407536749776659, "grad_norm": 5.4395379188651605, "learning_rate": 3.0956119550485925e-06, "loss": 0.4506, "step": 5427 }, { "epoch": 0.44083488995370745, "grad_norm": 6.180414632693948, "learning_rate": 3.09497324841303e-06, "loss": 0.4891, "step": 5428 }, { "epoch": 0.44091610492974903, "grad_norm": 4.088720798041899, "learning_rate": 3.0943345006084678e-06, "loss": 0.4157, "step": 5429 }, { "epoch": 0.4409973199057906, "grad_norm": 4.834350877095814, "learning_rate": 3.0936957116791048e-06, "loss": 0.4818, "step": 5430 }, { "epoch": 0.4410785348818322, "grad_norm": 6.729193046534626, "learning_rate": 3.0930568816691394e-06, "loss": 0.4463, "step": 5431 }, { "epoch": 0.4411597498578738, "grad_norm": 4.907909407785353, "learning_rate": 3.092418010622777e-06, "loss": 0.5464, "step": 5432 }, { "epoch": 0.4412409648339154, "grad_norm": 3.5985370572492146, "learning_rate": 3.091779098584224e-06, "loss": 0.5931, "step": 5433 }, { "epoch": 0.44132217980995697, "grad_norm": 3.530928584827468, "learning_rate": 3.0911401455976882e-06, "loss": 0.483, "step": 5434 }, { "epoch": 0.44140339478599855, "grad_norm": 4.3804647274752035, "learning_rate": 3.0905011517073834e-06, "loss": 0.4682, "step": 5435 }, { "epoch": 0.4414846097620401, "grad_norm": 8.630610987447541, "learning_rate": 3.089862116957525e-06, "loss": 0.5017, "step": 5436 }, { "epoch": 0.4415658247380817, "grad_norm": 4.490184934510654, "learning_rate": 3.089223041392329e-06, "loss": 0.4949, "step": 5437 }, { "epoch": 0.4416470397141233, "grad_norm": 4.853522689003282, "learning_rate": 3.0885839250560172e-06, "loss": 0.6344, "step": 5438 }, { "epoch": 0.44172825469016486, "grad_norm": 7.016095469900614, "learning_rate": 3.087944767992813e-06, "loss": 0.5162, "step": 5439 }, { "epoch": 0.44180946966620643, "grad_norm": 5.994420464445846, "learning_rate": 3.0873055702469416e-06, "loss": 0.6458, "step": 5440 }, { "epoch": 0.441890684642248, "grad_norm": 4.376884747350049, "learning_rate": 3.086666331862634e-06, "loss": 0.4059, "step": 5441 }, { "epoch": 0.4419718996182896, "grad_norm": 5.692559176862132, "learning_rate": 3.0860270528841208e-06, "loss": 0.4641, "step": 5442 }, { "epoch": 0.4420531145943312, "grad_norm": 4.8077643907663665, "learning_rate": 3.085387733355637e-06, "loss": 0.5179, "step": 5443 }, { "epoch": 0.4421343295703728, "grad_norm": 5.021716165337083, "learning_rate": 3.08474837332142e-06, "loss": 0.6234, "step": 5444 }, { "epoch": 0.4422155445464144, "grad_norm": 5.826683036908708, "learning_rate": 3.0841089728257108e-06, "loss": 0.5355, "step": 5445 }, { "epoch": 0.44229675952245595, "grad_norm": 4.024099529077247, "learning_rate": 3.0834695319127516e-06, "loss": 0.5641, "step": 5446 }, { "epoch": 0.44237797449849753, "grad_norm": 6.876907227052113, "learning_rate": 3.082830050626789e-06, "loss": 0.4901, "step": 5447 }, { "epoch": 0.4424591894745391, "grad_norm": 5.144105316785434, "learning_rate": 3.0821905290120712e-06, "loss": 0.3913, "step": 5448 }, { "epoch": 0.4425404044505807, "grad_norm": 3.975940531276462, "learning_rate": 3.0815509671128506e-06, "loss": 0.4229, "step": 5449 }, { "epoch": 0.44262161942662226, "grad_norm": 5.375633378244454, "learning_rate": 3.0809113649733803e-06, "loss": 0.5113, "step": 5450 }, { "epoch": 0.44270283440266384, "grad_norm": 12.651478454129343, "learning_rate": 3.0802717226379175e-06, "loss": 0.5075, "step": 5451 }, { "epoch": 0.4427840493787054, "grad_norm": 4.831174765479994, "learning_rate": 3.079632040150724e-06, "loss": 0.5238, "step": 5452 }, { "epoch": 0.442865264354747, "grad_norm": 5.064193661858532, "learning_rate": 3.07899231755606e-06, "loss": 0.5808, "step": 5453 }, { "epoch": 0.4429464793307886, "grad_norm": 5.352945242031246, "learning_rate": 3.0783525548981917e-06, "loss": 0.4835, "step": 5454 }, { "epoch": 0.4430276943068302, "grad_norm": 4.965261328987609, "learning_rate": 3.077712752221388e-06, "loss": 0.4251, "step": 5455 }, { "epoch": 0.4431089092828718, "grad_norm": 4.9254182519569465, "learning_rate": 3.0770729095699194e-06, "loss": 0.6029, "step": 5456 }, { "epoch": 0.44319012425891335, "grad_norm": 3.5617209008138992, "learning_rate": 3.0764330269880593e-06, "loss": 0.5718, "step": 5457 }, { "epoch": 0.44327133923495493, "grad_norm": 4.355025485119006, "learning_rate": 3.0757931045200844e-06, "loss": 0.6286, "step": 5458 }, { "epoch": 0.4433525542109965, "grad_norm": 6.239139168135333, "learning_rate": 3.075153142210274e-06, "loss": 0.5216, "step": 5459 }, { "epoch": 0.4434337691870381, "grad_norm": 4.784319073015132, "learning_rate": 3.0745131401029105e-06, "loss": 0.5841, "step": 5460 }, { "epoch": 0.44351498416307966, "grad_norm": 4.463298659345465, "learning_rate": 3.073873098242278e-06, "loss": 0.6327, "step": 5461 }, { "epoch": 0.44359619913912124, "grad_norm": 6.8742335301264585, "learning_rate": 3.0732330166726644e-06, "loss": 0.5043, "step": 5462 }, { "epoch": 0.4436774141151628, "grad_norm": 3.366707977466441, "learning_rate": 3.07259289543836e-06, "loss": 0.5506, "step": 5463 }, { "epoch": 0.4437586290912044, "grad_norm": 4.220402613518687, "learning_rate": 3.0719527345836568e-06, "loss": 0.4197, "step": 5464 }, { "epoch": 0.443839844067246, "grad_norm": 3.555095408073302, "learning_rate": 3.0713125341528527e-06, "loss": 0.4006, "step": 5465 }, { "epoch": 0.4439210590432876, "grad_norm": 5.959143884494173, "learning_rate": 3.0706722941902438e-06, "loss": 0.5243, "step": 5466 }, { "epoch": 0.4440022740193292, "grad_norm": 3.7731524034106707, "learning_rate": 3.0700320147401324e-06, "loss": 0.5004, "step": 5467 }, { "epoch": 0.44408348899537076, "grad_norm": 4.782660700197082, "learning_rate": 3.0693916958468236e-06, "loss": 0.5797, "step": 5468 }, { "epoch": 0.44416470397141233, "grad_norm": 4.37229758317183, "learning_rate": 3.0687513375546216e-06, "loss": 0.614, "step": 5469 }, { "epoch": 0.4442459189474539, "grad_norm": 2.828225215491486, "learning_rate": 3.0681109399078375e-06, "loss": 0.7196, "step": 5470 }, { "epoch": 0.4443271339234955, "grad_norm": 6.309177247959491, "learning_rate": 3.0674705029507833e-06, "loss": 0.476, "step": 5471 }, { "epoch": 0.44440834889953706, "grad_norm": 4.103834477568316, "learning_rate": 3.0668300267277735e-06, "loss": 0.6375, "step": 5472 }, { "epoch": 0.44448956387557864, "grad_norm": 6.093953734178574, "learning_rate": 3.066189511283126e-06, "loss": 0.4653, "step": 5473 }, { "epoch": 0.4445707788516202, "grad_norm": 5.225914425304106, "learning_rate": 3.0655489566611603e-06, "loss": 0.6781, "step": 5474 }, { "epoch": 0.4446519938276618, "grad_norm": 5.8819081687868335, "learning_rate": 3.0649083629062e-06, "loss": 0.5596, "step": 5475 }, { "epoch": 0.44473320880370343, "grad_norm": 7.739981641220584, "learning_rate": 3.0642677300625704e-06, "loss": 0.3823, "step": 5476 }, { "epoch": 0.444814423779745, "grad_norm": 5.131722265818073, "learning_rate": 3.063627058174601e-06, "loss": 0.4458, "step": 5477 }, { "epoch": 0.4448956387557866, "grad_norm": 3.2411651551006364, "learning_rate": 3.062986347286622e-06, "loss": 0.7122, "step": 5478 }, { "epoch": 0.44497685373182816, "grad_norm": 4.225618329584794, "learning_rate": 3.0623455974429677e-06, "loss": 0.4025, "step": 5479 }, { "epoch": 0.44505806870786974, "grad_norm": 7.665112766366524, "learning_rate": 3.061704808687973e-06, "loss": 0.6129, "step": 5480 }, { "epoch": 0.4451392836839113, "grad_norm": 9.07304807101099, "learning_rate": 3.061063981065979e-06, "loss": 0.5051, "step": 5481 }, { "epoch": 0.4452204986599529, "grad_norm": 4.554173085391561, "learning_rate": 3.0604231146213276e-06, "loss": 0.6264, "step": 5482 }, { "epoch": 0.44530171363599447, "grad_norm": 3.5414231796303515, "learning_rate": 3.0597822093983614e-06, "loss": 0.5022, "step": 5483 }, { "epoch": 0.44538292861203604, "grad_norm": 8.230686535869143, "learning_rate": 3.0591412654414297e-06, "loss": 0.6486, "step": 5484 }, { "epoch": 0.4454641435880776, "grad_norm": 5.579450710250304, "learning_rate": 3.058500282794882e-06, "loss": 0.5926, "step": 5485 }, { "epoch": 0.4455453585641192, "grad_norm": 6.0408658738191425, "learning_rate": 3.0578592615030693e-06, "loss": 0.3853, "step": 5486 }, { "epoch": 0.44562657354016083, "grad_norm": 4.852372309573421, "learning_rate": 3.057218201610349e-06, "loss": 0.4482, "step": 5487 }, { "epoch": 0.4457077885162024, "grad_norm": 5.300082736246172, "learning_rate": 3.056577103161078e-06, "loss": 0.5611, "step": 5488 }, { "epoch": 0.445789003492244, "grad_norm": 5.314374108815902, "learning_rate": 3.055935966199617e-06, "loss": 0.505, "step": 5489 }, { "epoch": 0.44587021846828556, "grad_norm": 3.462874288033993, "learning_rate": 3.0552947907703296e-06, "loss": 0.5165, "step": 5490 }, { "epoch": 0.44595143344432714, "grad_norm": 5.895017788703498, "learning_rate": 3.054653576917581e-06, "loss": 0.3758, "step": 5491 }, { "epoch": 0.4460326484203687, "grad_norm": 3.720905933238603, "learning_rate": 3.054012324685742e-06, "loss": 0.574, "step": 5492 }, { "epoch": 0.4461138633964103, "grad_norm": 6.446011438422685, "learning_rate": 3.05337103411918e-06, "loss": 0.6723, "step": 5493 }, { "epoch": 0.44619507837245187, "grad_norm": 4.27621399758535, "learning_rate": 3.0527297052622724e-06, "loss": 0.5515, "step": 5494 }, { "epoch": 0.44627629334849345, "grad_norm": 14.21365870200448, "learning_rate": 3.0520883381593945e-06, "loss": 0.4223, "step": 5495 }, { "epoch": 0.446357508324535, "grad_norm": 3.727959140755334, "learning_rate": 3.0514469328549244e-06, "loss": 0.7102, "step": 5496 }, { "epoch": 0.4464387233005766, "grad_norm": 4.155096776178402, "learning_rate": 3.050805489393246e-06, "loss": 0.6996, "step": 5497 }, { "epoch": 0.44651993827661823, "grad_norm": 3.376388149465253, "learning_rate": 3.0501640078187433e-06, "loss": 0.6374, "step": 5498 }, { "epoch": 0.4466011532526598, "grad_norm": 6.291374977431427, "learning_rate": 3.049522488175802e-06, "loss": 0.5527, "step": 5499 }, { "epoch": 0.4466823682287014, "grad_norm": 6.926029198647835, "learning_rate": 3.048880930508813e-06, "loss": 0.4501, "step": 5500 }, { "epoch": 0.44676358320474296, "grad_norm": 8.251812163196625, "learning_rate": 3.0482393348621686e-06, "loss": 0.4361, "step": 5501 }, { "epoch": 0.44684479818078454, "grad_norm": 4.312548651784504, "learning_rate": 3.0475977012802636e-06, "loss": 0.4884, "step": 5502 }, { "epoch": 0.4469260131568261, "grad_norm": 8.316308066336365, "learning_rate": 3.0469560298074963e-06, "loss": 0.4457, "step": 5503 }, { "epoch": 0.4470072281328677, "grad_norm": 5.919440004233395, "learning_rate": 3.046314320488266e-06, "loss": 0.5825, "step": 5504 }, { "epoch": 0.4470884431089093, "grad_norm": 3.520589949942169, "learning_rate": 3.045672573366976e-06, "loss": 0.7033, "step": 5505 }, { "epoch": 0.44716965808495085, "grad_norm": 5.061834295396976, "learning_rate": 3.045030788488032e-06, "loss": 0.5824, "step": 5506 }, { "epoch": 0.4472508730609924, "grad_norm": 5.285749057766182, "learning_rate": 3.0443889658958425e-06, "loss": 0.5358, "step": 5507 }, { "epoch": 0.447332088037034, "grad_norm": 6.668648467416671, "learning_rate": 3.043747105634817e-06, "loss": 0.4972, "step": 5508 }, { "epoch": 0.44741330301307564, "grad_norm": 5.315769012630028, "learning_rate": 3.0431052077493693e-06, "loss": 0.7238, "step": 5509 }, { "epoch": 0.4474945179891172, "grad_norm": 4.114487698288283, "learning_rate": 3.0424632722839164e-06, "loss": 0.7628, "step": 5510 }, { "epoch": 0.4475757329651588, "grad_norm": 5.636091284907231, "learning_rate": 3.041821299282876e-06, "loss": 0.6014, "step": 5511 }, { "epoch": 0.44765694794120037, "grad_norm": 5.436008606726467, "learning_rate": 3.0411792887906684e-06, "loss": 0.4414, "step": 5512 }, { "epoch": 0.44773816291724194, "grad_norm": 5.179891754796302, "learning_rate": 3.0405372408517187e-06, "loss": 0.477, "step": 5513 }, { "epoch": 0.4478193778932835, "grad_norm": 2.443055617371502, "learning_rate": 3.0398951555104528e-06, "loss": 0.6141, "step": 5514 }, { "epoch": 0.4479005928693251, "grad_norm": 3.1073744907433745, "learning_rate": 3.0392530328112997e-06, "loss": 0.4829, "step": 5515 }, { "epoch": 0.4479818078453667, "grad_norm": 10.2090255345531, "learning_rate": 3.0386108727986903e-06, "loss": 0.4541, "step": 5516 }, { "epoch": 0.44806302282140825, "grad_norm": 5.848825993848949, "learning_rate": 3.037968675517059e-06, "loss": 0.4629, "step": 5517 }, { "epoch": 0.44814423779744983, "grad_norm": 5.418590209098287, "learning_rate": 3.0373264410108422e-06, "loss": 0.512, "step": 5518 }, { "epoch": 0.4482254527734914, "grad_norm": 4.498788774198856, "learning_rate": 3.03668416932448e-06, "loss": 0.4333, "step": 5519 }, { "epoch": 0.44830666774953304, "grad_norm": 7.415061980682369, "learning_rate": 3.0360418605024134e-06, "loss": 0.4415, "step": 5520 }, { "epoch": 0.4483878827255746, "grad_norm": 4.0616647064171465, "learning_rate": 3.0353995145890868e-06, "loss": 0.6796, "step": 5521 }, { "epoch": 0.4484690977016162, "grad_norm": 5.786539246564089, "learning_rate": 3.0347571316289476e-06, "loss": 0.4574, "step": 5522 }, { "epoch": 0.44855031267765777, "grad_norm": 4.2099399908062445, "learning_rate": 3.0341147116664455e-06, "loss": 0.5367, "step": 5523 }, { "epoch": 0.44863152765369935, "grad_norm": 4.538423749900379, "learning_rate": 3.0334722547460317e-06, "loss": 0.374, "step": 5524 }, { "epoch": 0.4487127426297409, "grad_norm": 3.5387099780140137, "learning_rate": 3.032829760912161e-06, "loss": 0.4963, "step": 5525 }, { "epoch": 0.4487939576057825, "grad_norm": 5.798358992841778, "learning_rate": 3.032187230209291e-06, "loss": 0.5552, "step": 5526 }, { "epoch": 0.4488751725818241, "grad_norm": 4.773816159502634, "learning_rate": 3.0315446626818816e-06, "loss": 0.5549, "step": 5527 }, { "epoch": 0.44895638755786565, "grad_norm": 4.150570620974484, "learning_rate": 3.030902058374394e-06, "loss": 0.5485, "step": 5528 }, { "epoch": 0.44903760253390723, "grad_norm": 5.126013922801945, "learning_rate": 3.0302594173312937e-06, "loss": 0.4232, "step": 5529 }, { "epoch": 0.4491188175099488, "grad_norm": 3.268316271468154, "learning_rate": 3.0296167395970494e-06, "loss": 0.5323, "step": 5530 }, { "epoch": 0.44920003248599044, "grad_norm": 5.730548961152452, "learning_rate": 3.0289740252161288e-06, "loss": 0.6075, "step": 5531 }, { "epoch": 0.449281247462032, "grad_norm": 7.380827782342893, "learning_rate": 3.0283312742330044e-06, "loss": 0.5684, "step": 5532 }, { "epoch": 0.4493624624380736, "grad_norm": 3.0968839249512983, "learning_rate": 3.027688486692153e-06, "loss": 0.6726, "step": 5533 }, { "epoch": 0.4494436774141152, "grad_norm": 3.139400489756557, "learning_rate": 3.027045662638051e-06, "loss": 0.5246, "step": 5534 }, { "epoch": 0.44952489239015675, "grad_norm": 5.977877338701136, "learning_rate": 3.026402802115178e-06, "loss": 0.4677, "step": 5535 }, { "epoch": 0.4496061073661983, "grad_norm": 4.58739972834906, "learning_rate": 3.0257599051680175e-06, "loss": 0.4622, "step": 5536 }, { "epoch": 0.4496873223422399, "grad_norm": 6.184504437794764, "learning_rate": 3.025116971841054e-06, "loss": 0.6765, "step": 5537 }, { "epoch": 0.4497685373182815, "grad_norm": 8.256944628820975, "learning_rate": 3.0244740021787756e-06, "loss": 0.5273, "step": 5538 }, { "epoch": 0.44984975229432306, "grad_norm": 5.127185634205651, "learning_rate": 3.023830996225671e-06, "loss": 0.5202, "step": 5539 }, { "epoch": 0.44993096727036463, "grad_norm": 5.783243249616182, "learning_rate": 3.023187954026234e-06, "loss": 0.588, "step": 5540 }, { "epoch": 0.4500121822464062, "grad_norm": 11.669014789188214, "learning_rate": 3.0225448756249605e-06, "loss": 0.5103, "step": 5541 }, { "epoch": 0.45009339722244784, "grad_norm": 12.31667513372858, "learning_rate": 3.0219017610663466e-06, "loss": 0.6313, "step": 5542 }, { "epoch": 0.4501746121984894, "grad_norm": 4.705587679569699, "learning_rate": 3.0212586103948933e-06, "loss": 0.5984, "step": 5543 }, { "epoch": 0.450255827174531, "grad_norm": 7.657296950948716, "learning_rate": 3.020615423655102e-06, "loss": 0.4608, "step": 5544 }, { "epoch": 0.4503370421505726, "grad_norm": 4.260385063187866, "learning_rate": 3.0199722008914787e-06, "loss": 0.6098, "step": 5545 }, { "epoch": 0.45041825712661415, "grad_norm": 5.405032331240298, "learning_rate": 3.0193289421485317e-06, "loss": 0.3907, "step": 5546 }, { "epoch": 0.45049947210265573, "grad_norm": 5.755862088555348, "learning_rate": 3.0186856474707705e-06, "loss": 0.4768, "step": 5547 }, { "epoch": 0.4505806870786973, "grad_norm": 8.389991068275073, "learning_rate": 3.0180423169027067e-06, "loss": 0.4632, "step": 5548 }, { "epoch": 0.4506619020547389, "grad_norm": 7.221227795632766, "learning_rate": 3.0173989504888573e-06, "loss": 0.4662, "step": 5549 }, { "epoch": 0.45074311703078046, "grad_norm": 3.6157966531181467, "learning_rate": 3.0167555482737384e-06, "loss": 0.6383, "step": 5550 }, { "epoch": 0.45082433200682204, "grad_norm": 5.5152149011394185, "learning_rate": 3.01611211030187e-06, "loss": 0.3908, "step": 5551 }, { "epoch": 0.4509055469828636, "grad_norm": 4.18620086470904, "learning_rate": 3.0154686366177753e-06, "loss": 0.4462, "step": 5552 }, { "epoch": 0.45098676195890525, "grad_norm": 5.069886109587496, "learning_rate": 3.0148251272659795e-06, "loss": 0.4242, "step": 5553 }, { "epoch": 0.4510679769349468, "grad_norm": 4.107158289610819, "learning_rate": 3.0141815822910094e-06, "loss": 0.5341, "step": 5554 }, { "epoch": 0.4511491919109884, "grad_norm": 4.955579663758407, "learning_rate": 3.013538001737395e-06, "loss": 0.561, "step": 5555 }, { "epoch": 0.45123040688703, "grad_norm": 4.9599952397922635, "learning_rate": 3.0128943856496686e-06, "loss": 0.6114, "step": 5556 }, { "epoch": 0.45131162186307155, "grad_norm": 8.882975709525924, "learning_rate": 3.0122507340723656e-06, "loss": 0.6247, "step": 5557 }, { "epoch": 0.45139283683911313, "grad_norm": 5.7173513452837135, "learning_rate": 3.011607047050022e-06, "loss": 0.4825, "step": 5558 }, { "epoch": 0.4514740518151547, "grad_norm": 6.378003204165053, "learning_rate": 3.0109633246271783e-06, "loss": 0.4608, "step": 5559 }, { "epoch": 0.4515552667911963, "grad_norm": 8.215078154867541, "learning_rate": 3.0103195668483787e-06, "loss": 0.4853, "step": 5560 }, { "epoch": 0.45163648176723786, "grad_norm": 5.6937027402469305, "learning_rate": 3.009675773758164e-06, "loss": 0.5338, "step": 5561 }, { "epoch": 0.45171769674327944, "grad_norm": 9.181599736624747, "learning_rate": 3.009031945401084e-06, "loss": 0.4927, "step": 5562 }, { "epoch": 0.451798911719321, "grad_norm": 11.154194461173983, "learning_rate": 3.008388081821687e-06, "loss": 0.5629, "step": 5563 }, { "epoch": 0.45188012669536265, "grad_norm": 8.09877927664861, "learning_rate": 3.0077441830645256e-06, "loss": 0.6261, "step": 5564 }, { "epoch": 0.4519613416714042, "grad_norm": 6.075046325765384, "learning_rate": 3.0071002491741537e-06, "loss": 0.5261, "step": 5565 }, { "epoch": 0.4520425566474458, "grad_norm": 7.918146195582449, "learning_rate": 3.0064562801951286e-06, "loss": 0.4545, "step": 5566 }, { "epoch": 0.4521237716234874, "grad_norm": 4.644487714982916, "learning_rate": 3.005812276172009e-06, "loss": 0.5743, "step": 5567 }, { "epoch": 0.45220498659952896, "grad_norm": 3.5759922555457924, "learning_rate": 3.005168237149357e-06, "loss": 0.572, "step": 5568 }, { "epoch": 0.45228620157557053, "grad_norm": 6.775625726215614, "learning_rate": 3.0045241631717366e-06, "loss": 0.5287, "step": 5569 }, { "epoch": 0.4523674165516121, "grad_norm": 3.7500419034479457, "learning_rate": 3.0038800542837137e-06, "loss": 0.5277, "step": 5570 }, { "epoch": 0.4524486315276537, "grad_norm": 4.670720788070081, "learning_rate": 3.003235910529859e-06, "loss": 0.4724, "step": 5571 }, { "epoch": 0.45252984650369527, "grad_norm": 7.410732606803197, "learning_rate": 3.0025917319547417e-06, "loss": 0.6593, "step": 5572 }, { "epoch": 0.45261106147973684, "grad_norm": 7.469496813543684, "learning_rate": 3.001947518602937e-06, "loss": 0.4562, "step": 5573 }, { "epoch": 0.4526922764557784, "grad_norm": 4.482461581886912, "learning_rate": 3.0013032705190196e-06, "loss": 0.5644, "step": 5574 }, { "epoch": 0.45277349143182005, "grad_norm": 4.5986439128006, "learning_rate": 3.00065898774757e-06, "loss": 0.5426, "step": 5575 }, { "epoch": 0.45285470640786163, "grad_norm": 4.16197803163674, "learning_rate": 3.000014670333168e-06, "loss": 0.5278, "step": 5576 }, { "epoch": 0.4529359213839032, "grad_norm": 3.5560084963087646, "learning_rate": 2.9993703183203963e-06, "loss": 0.5385, "step": 5577 }, { "epoch": 0.4530171363599448, "grad_norm": 4.260872328467781, "learning_rate": 2.998725931753842e-06, "loss": 0.449, "step": 5578 }, { "epoch": 0.45309835133598636, "grad_norm": 4.8192798552612635, "learning_rate": 2.9980815106780937e-06, "loss": 0.605, "step": 5579 }, { "epoch": 0.45317956631202794, "grad_norm": 3.949873091272697, "learning_rate": 2.9974370551377396e-06, "loss": 0.5863, "step": 5580 }, { "epoch": 0.4532607812880695, "grad_norm": 8.728645349811258, "learning_rate": 2.9967925651773745e-06, "loss": 0.4891, "step": 5581 }, { "epoch": 0.4533419962641111, "grad_norm": 3.623472869472515, "learning_rate": 2.9961480408415926e-06, "loss": 0.6374, "step": 5582 }, { "epoch": 0.45342321124015267, "grad_norm": 4.651906826052307, "learning_rate": 2.995503482174993e-06, "loss": 0.4461, "step": 5583 }, { "epoch": 0.45350442621619425, "grad_norm": 4.256628817272692, "learning_rate": 2.9948588892221744e-06, "loss": 0.4862, "step": 5584 }, { "epoch": 0.4535856411922358, "grad_norm": 3.3916680603373797, "learning_rate": 2.9942142620277394e-06, "loss": 0.5934, "step": 5585 }, { "epoch": 0.45366685616827745, "grad_norm": 6.635234044978131, "learning_rate": 2.993569600636293e-06, "loss": 0.5009, "step": 5586 }, { "epoch": 0.45374807114431903, "grad_norm": 5.742703156297073, "learning_rate": 2.9929249050924424e-06, "loss": 0.4523, "step": 5587 }, { "epoch": 0.4538292861203606, "grad_norm": 4.30344798988703, "learning_rate": 2.992280175440797e-06, "loss": 0.5456, "step": 5588 }, { "epoch": 0.4539105010964022, "grad_norm": 4.517980897014619, "learning_rate": 2.99163541172597e-06, "loss": 0.4736, "step": 5589 }, { "epoch": 0.45399171607244376, "grad_norm": 5.224517211850113, "learning_rate": 2.990990613992573e-06, "loss": 0.6306, "step": 5590 }, { "epoch": 0.45407293104848534, "grad_norm": 4.522124261787193, "learning_rate": 2.990345782285225e-06, "loss": 0.5577, "step": 5591 }, { "epoch": 0.4541541460245269, "grad_norm": 4.777313229511017, "learning_rate": 2.989700916648544e-06, "loss": 0.4167, "step": 5592 }, { "epoch": 0.4542353610005685, "grad_norm": 6.094818732275617, "learning_rate": 2.989056017127151e-06, "loss": 0.4435, "step": 5593 }, { "epoch": 0.45431657597661007, "grad_norm": 7.062295268512343, "learning_rate": 2.988411083765669e-06, "loss": 0.5327, "step": 5594 }, { "epoch": 0.45439779095265165, "grad_norm": 5.878383502169189, "learning_rate": 2.9877661166087265e-06, "loss": 0.7565, "step": 5595 }, { "epoch": 0.4544790059286932, "grad_norm": 5.252077496243823, "learning_rate": 2.9871211157009496e-06, "loss": 0.3842, "step": 5596 }, { "epoch": 0.45456022090473486, "grad_norm": 4.8053252445722014, "learning_rate": 2.986476081086969e-06, "loss": 0.5382, "step": 5597 }, { "epoch": 0.45464143588077643, "grad_norm": 5.29723639288913, "learning_rate": 2.9858310128114187e-06, "loss": 0.5417, "step": 5598 }, { "epoch": 0.454722650856818, "grad_norm": 4.021237021928457, "learning_rate": 2.9851859109189335e-06, "loss": 0.54, "step": 5599 }, { "epoch": 0.4548038658328596, "grad_norm": 26.465640352460827, "learning_rate": 2.9845407754541513e-06, "loss": 0.4593, "step": 5600 }, { "epoch": 0.45488508080890117, "grad_norm": 4.300973423387741, "learning_rate": 2.9838956064617108e-06, "loss": 0.547, "step": 5601 }, { "epoch": 0.45496629578494274, "grad_norm": 4.972732875615428, "learning_rate": 2.9832504039862564e-06, "loss": 0.5216, "step": 5602 }, { "epoch": 0.4550475107609843, "grad_norm": 3.8151254239492074, "learning_rate": 2.982605168072431e-06, "loss": 0.5508, "step": 5603 }, { "epoch": 0.4551287257370259, "grad_norm": 4.251587340007891, "learning_rate": 2.981959898764882e-06, "loss": 0.4318, "step": 5604 }, { "epoch": 0.4552099407130675, "grad_norm": 4.436180317192108, "learning_rate": 2.9813145961082594e-06, "loss": 0.5097, "step": 5605 }, { "epoch": 0.45529115568910905, "grad_norm": 4.5049604397224785, "learning_rate": 2.9806692601472143e-06, "loss": 0.6497, "step": 5606 }, { "epoch": 0.4553723706651506, "grad_norm": 4.817837180058034, "learning_rate": 2.9800238909263994e-06, "loss": 0.6143, "step": 5607 }, { "epoch": 0.45545358564119226, "grad_norm": 3.4243930089969785, "learning_rate": 2.9793784884904733e-06, "loss": 0.5063, "step": 5608 }, { "epoch": 0.45553480061723384, "grad_norm": 5.925953178663676, "learning_rate": 2.9787330528840915e-06, "loss": 0.4992, "step": 5609 }, { "epoch": 0.4556160155932754, "grad_norm": 5.497205043778295, "learning_rate": 2.978087584151915e-06, "loss": 0.5372, "step": 5610 }, { "epoch": 0.455697230569317, "grad_norm": 8.33939404348421, "learning_rate": 2.9774420823386104e-06, "loss": 0.5619, "step": 5611 }, { "epoch": 0.45577844554535857, "grad_norm": 6.195975570900406, "learning_rate": 2.9767965474888395e-06, "loss": 0.5683, "step": 5612 }, { "epoch": 0.45585966052140015, "grad_norm": 14.703831195583096, "learning_rate": 2.9761509796472697e-06, "loss": 0.3429, "step": 5613 }, { "epoch": 0.4559408754974417, "grad_norm": 4.631555776984385, "learning_rate": 2.975505378858574e-06, "loss": 0.6409, "step": 5614 }, { "epoch": 0.4560220904734833, "grad_norm": 3.4809174977075803, "learning_rate": 2.974859745167422e-06, "loss": 0.769, "step": 5615 }, { "epoch": 0.4561033054495249, "grad_norm": 5.098517264596913, "learning_rate": 2.9742140786184885e-06, "loss": 0.5612, "step": 5616 }, { "epoch": 0.45618452042556645, "grad_norm": 6.00392072759688, "learning_rate": 2.9735683792564506e-06, "loss": 0.5576, "step": 5617 }, { "epoch": 0.45626573540160803, "grad_norm": 7.263834884088008, "learning_rate": 2.9729226471259877e-06, "loss": 0.6709, "step": 5618 }, { "epoch": 0.45634695037764966, "grad_norm": 3.554328673137164, "learning_rate": 2.9722768822717795e-06, "loss": 0.6453, "step": 5619 }, { "epoch": 0.45642816535369124, "grad_norm": 5.778288795426547, "learning_rate": 2.971631084738511e-06, "loss": 0.4381, "step": 5620 }, { "epoch": 0.4565093803297328, "grad_norm": 3.465143813973219, "learning_rate": 2.9709852545708677e-06, "loss": 0.5124, "step": 5621 }, { "epoch": 0.4565905953057744, "grad_norm": 4.670081929190185, "learning_rate": 2.9703393918135383e-06, "loss": 0.4459, "step": 5622 }, { "epoch": 0.45667181028181597, "grad_norm": 4.713194770845547, "learning_rate": 2.96969349651121e-06, "loss": 0.4847, "step": 5623 }, { "epoch": 0.45675302525785755, "grad_norm": 3.630426739892531, "learning_rate": 2.9690475687085795e-06, "loss": 0.4093, "step": 5624 }, { "epoch": 0.4568342402338991, "grad_norm": 5.486371575732508, "learning_rate": 2.968401608450339e-06, "loss": 0.6108, "step": 5625 }, { "epoch": 0.4569154552099407, "grad_norm": 8.575804166016505, "learning_rate": 2.967755615781186e-06, "loss": 0.45, "step": 5626 }, { "epoch": 0.4569966701859823, "grad_norm": 8.699853217920039, "learning_rate": 2.9671095907458203e-06, "loss": 0.5099, "step": 5627 }, { "epoch": 0.45707788516202386, "grad_norm": 3.801057038330998, "learning_rate": 2.966463533388943e-06, "loss": 0.5301, "step": 5628 }, { "epoch": 0.45715910013806543, "grad_norm": 6.044511448542259, "learning_rate": 2.9658174437552577e-06, "loss": 0.5236, "step": 5629 }, { "epoch": 0.45724031511410707, "grad_norm": 5.875396782939016, "learning_rate": 2.9651713218894706e-06, "loss": 0.6417, "step": 5630 }, { "epoch": 0.45732153009014864, "grad_norm": 4.000366715370684, "learning_rate": 2.96452516783629e-06, "loss": 0.5186, "step": 5631 }, { "epoch": 0.4574027450661902, "grad_norm": 4.69919331836541, "learning_rate": 2.9638789816404264e-06, "loss": 0.5162, "step": 5632 }, { "epoch": 0.4574839600422318, "grad_norm": 7.3603896088494185, "learning_rate": 2.9632327633465917e-06, "loss": 0.4416, "step": 5633 }, { "epoch": 0.4575651750182734, "grad_norm": 7.25776019771969, "learning_rate": 2.9625865129995023e-06, "loss": 0.5364, "step": 5634 }, { "epoch": 0.45764638999431495, "grad_norm": 5.174876153569623, "learning_rate": 2.9619402306438738e-06, "loss": 0.412, "step": 5635 }, { "epoch": 0.4577276049703565, "grad_norm": 5.4010502742692355, "learning_rate": 2.9612939163244266e-06, "loss": 0.4053, "step": 5636 }, { "epoch": 0.4578088199463981, "grad_norm": 3.835214497106897, "learning_rate": 2.960647570085881e-06, "loss": 0.4996, "step": 5637 }, { "epoch": 0.4578900349224397, "grad_norm": 6.055794052072197, "learning_rate": 2.960001191972963e-06, "loss": 0.7144, "step": 5638 }, { "epoch": 0.45797124989848126, "grad_norm": 4.158855522188884, "learning_rate": 2.9593547820303954e-06, "loss": 0.6862, "step": 5639 }, { "epoch": 0.45805246487452284, "grad_norm": 3.708348870716642, "learning_rate": 2.958708340302908e-06, "loss": 0.5555, "step": 5640 }, { "epoch": 0.45813367985056447, "grad_norm": 5.983420776955866, "learning_rate": 2.958061866835232e-06, "loss": 0.4188, "step": 5641 }, { "epoch": 0.45821489482660605, "grad_norm": 8.29550387074976, "learning_rate": 2.9574153616720986e-06, "loss": 0.5177, "step": 5642 }, { "epoch": 0.4582961098026476, "grad_norm": 5.444764698582431, "learning_rate": 2.9567688248582436e-06, "loss": 0.5237, "step": 5643 }, { "epoch": 0.4583773247786892, "grad_norm": 10.29419590630684, "learning_rate": 2.956122256438403e-06, "loss": 0.4966, "step": 5644 }, { "epoch": 0.4584585397547308, "grad_norm": 6.484747843668005, "learning_rate": 2.955475656457316e-06, "loss": 0.5255, "step": 5645 }, { "epoch": 0.45853975473077235, "grad_norm": 5.119025976889396, "learning_rate": 2.9548290249597246e-06, "loss": 0.5482, "step": 5646 }, { "epoch": 0.45862096970681393, "grad_norm": 6.402280776009877, "learning_rate": 2.9541823619903716e-06, "loss": 0.6823, "step": 5647 }, { "epoch": 0.4587021846828555, "grad_norm": 7.6633301689603375, "learning_rate": 2.9535356675940023e-06, "loss": 0.4238, "step": 5648 }, { "epoch": 0.4587833996588971, "grad_norm": 4.171651081928115, "learning_rate": 2.952888941815366e-06, "loss": 0.6524, "step": 5649 }, { "epoch": 0.45886461463493866, "grad_norm": 5.133593022664716, "learning_rate": 2.952242184699211e-06, "loss": 0.5472, "step": 5650 }, { "epoch": 0.45894582961098024, "grad_norm": 5.9594049521898205, "learning_rate": 2.9515953962902914e-06, "loss": 0.5447, "step": 5651 }, { "epoch": 0.45902704458702187, "grad_norm": 6.3931523146679785, "learning_rate": 2.950948576633359e-06, "loss": 0.5729, "step": 5652 }, { "epoch": 0.45910825956306345, "grad_norm": 5.8713619633879, "learning_rate": 2.9503017257731727e-06, "loss": 0.4821, "step": 5653 }, { "epoch": 0.459189474539105, "grad_norm": 3.2931039167346743, "learning_rate": 2.9496548437544905e-06, "loss": 0.4733, "step": 5654 }, { "epoch": 0.4592706895151466, "grad_norm": 5.1941445369327335, "learning_rate": 2.9490079306220714e-06, "loss": 0.3829, "step": 5655 }, { "epoch": 0.4593519044911882, "grad_norm": 4.360689715599596, "learning_rate": 2.9483609864206808e-06, "loss": 0.5581, "step": 5656 }, { "epoch": 0.45943311946722976, "grad_norm": 7.1909194878489195, "learning_rate": 2.9477140111950834e-06, "loss": 0.4832, "step": 5657 }, { "epoch": 0.45951433444327133, "grad_norm": 5.716565178813941, "learning_rate": 2.947067004990045e-06, "loss": 0.458, "step": 5658 }, { "epoch": 0.4595955494193129, "grad_norm": 5.12640741752529, "learning_rate": 2.9464199678503364e-06, "loss": 0.5096, "step": 5659 }, { "epoch": 0.4596767643953545, "grad_norm": 4.73475880030873, "learning_rate": 2.9457728998207286e-06, "loss": 0.46, "step": 5660 }, { "epoch": 0.45975797937139606, "grad_norm": 5.497100257560182, "learning_rate": 2.9451258009459947e-06, "loss": 0.4947, "step": 5661 }, { "epoch": 0.45983919434743764, "grad_norm": 3.188631639464191, "learning_rate": 2.9444786712709122e-06, "loss": 0.4633, "step": 5662 }, { "epoch": 0.4599204093234793, "grad_norm": 6.625940309443984, "learning_rate": 2.943831510840257e-06, "loss": 0.5155, "step": 5663 }, { "epoch": 0.46000162429952085, "grad_norm": 6.021653211612759, "learning_rate": 2.9431843196988107e-06, "loss": 0.4657, "step": 5664 }, { "epoch": 0.4600828392755624, "grad_norm": 4.027902151297978, "learning_rate": 2.942537097891355e-06, "loss": 0.5745, "step": 5665 }, { "epoch": 0.460164054251604, "grad_norm": 4.556130080376428, "learning_rate": 2.9418898454626744e-06, "loss": 0.7196, "step": 5666 }, { "epoch": 0.4602452692276456, "grad_norm": 7.887493386006087, "learning_rate": 2.9412425624575553e-06, "loss": 0.5629, "step": 5667 }, { "epoch": 0.46032648420368716, "grad_norm": 11.085374599142877, "learning_rate": 2.9405952489207858e-06, "loss": 0.4118, "step": 5668 }, { "epoch": 0.46040769917972874, "grad_norm": 5.492625782917754, "learning_rate": 2.9399479048971567e-06, "loss": 0.4832, "step": 5669 }, { "epoch": 0.4604889141557703, "grad_norm": 3.5056589438634216, "learning_rate": 2.939300530431462e-06, "loss": 0.5736, "step": 5670 }, { "epoch": 0.4605701291318119, "grad_norm": 4.465558544811287, "learning_rate": 2.9386531255684942e-06, "loss": 0.7238, "step": 5671 }, { "epoch": 0.46065134410785347, "grad_norm": 48.57316377462239, "learning_rate": 2.938005690353052e-06, "loss": 0.5249, "step": 5672 }, { "epoch": 0.46073255908389504, "grad_norm": 5.242926405538982, "learning_rate": 2.937358224829935e-06, "loss": 0.5339, "step": 5673 }, { "epoch": 0.4608137740599367, "grad_norm": 5.8937374649076695, "learning_rate": 2.936710729043943e-06, "loss": 0.6889, "step": 5674 }, { "epoch": 0.46089498903597825, "grad_norm": 5.462741459326538, "learning_rate": 2.936063203039879e-06, "loss": 0.5676, "step": 5675 }, { "epoch": 0.46097620401201983, "grad_norm": 6.268297125768787, "learning_rate": 2.93541564686255e-06, "loss": 0.526, "step": 5676 }, { "epoch": 0.4610574189880614, "grad_norm": 3.614454345003604, "learning_rate": 2.9347680605567624e-06, "loss": 0.4487, "step": 5677 }, { "epoch": 0.461138633964103, "grad_norm": 8.297993020626132, "learning_rate": 2.9341204441673267e-06, "loss": 0.4531, "step": 5678 }, { "epoch": 0.46121984894014456, "grad_norm": 4.364678727568973, "learning_rate": 2.9334727977390526e-06, "loss": 0.6027, "step": 5679 }, { "epoch": 0.46130106391618614, "grad_norm": 8.596844992148577, "learning_rate": 2.9328251213167557e-06, "loss": 0.5818, "step": 5680 }, { "epoch": 0.4613822788922277, "grad_norm": 4.6174846839314005, "learning_rate": 2.9321774149452507e-06, "loss": 0.4268, "step": 5681 }, { "epoch": 0.4614634938682693, "grad_norm": 10.308065553433666, "learning_rate": 2.9315296786693564e-06, "loss": 0.5332, "step": 5682 }, { "epoch": 0.46154470884431087, "grad_norm": 7.029864284315326, "learning_rate": 2.9308819125338923e-06, "loss": 0.6391, "step": 5683 }, { "epoch": 0.46162592382035245, "grad_norm": 6.141294742654424, "learning_rate": 2.9302341165836794e-06, "loss": 0.7152, "step": 5684 }, { "epoch": 0.4617071387963941, "grad_norm": 5.578226095205441, "learning_rate": 2.9295862908635436e-06, "loss": 0.5101, "step": 5685 }, { "epoch": 0.46178835377243566, "grad_norm": 5.083560208470007, "learning_rate": 2.92893843541831e-06, "loss": 0.5236, "step": 5686 }, { "epoch": 0.46186956874847723, "grad_norm": 3.7370396960027654, "learning_rate": 2.928290550292806e-06, "loss": 0.6088, "step": 5687 }, { "epoch": 0.4619507837245188, "grad_norm": 8.921482639240043, "learning_rate": 2.9276426355318625e-06, "loss": 0.4895, "step": 5688 }, { "epoch": 0.4620319987005604, "grad_norm": 3.2133566714283055, "learning_rate": 2.9269946911803134e-06, "loss": 0.5798, "step": 5689 }, { "epoch": 0.46211321367660196, "grad_norm": 6.204381262030037, "learning_rate": 2.92634671728299e-06, "loss": 0.4417, "step": 5690 }, { "epoch": 0.46219442865264354, "grad_norm": 5.691308583371654, "learning_rate": 2.9256987138847302e-06, "loss": 0.5512, "step": 5691 }, { "epoch": 0.4622756436286851, "grad_norm": 4.319078154150966, "learning_rate": 2.925050681030373e-06, "loss": 0.5221, "step": 5692 }, { "epoch": 0.4623568586047267, "grad_norm": 5.04058444742641, "learning_rate": 2.9244026187647584e-06, "loss": 0.647, "step": 5693 }, { "epoch": 0.46243807358076827, "grad_norm": 5.620845349439251, "learning_rate": 2.923754527132728e-06, "loss": 0.4942, "step": 5694 }, { "epoch": 0.46251928855680985, "grad_norm": 5.022513745576825, "learning_rate": 2.9231064061791277e-06, "loss": 0.3824, "step": 5695 }, { "epoch": 0.4626005035328515, "grad_norm": 4.5284930571538835, "learning_rate": 2.922458255948803e-06, "loss": 0.4568, "step": 5696 }, { "epoch": 0.46268171850889306, "grad_norm": 4.600857423481818, "learning_rate": 2.9218100764866025e-06, "loss": 0.6157, "step": 5697 }, { "epoch": 0.46276293348493464, "grad_norm": 3.265626698109701, "learning_rate": 2.9211618678373775e-06, "loss": 0.4392, "step": 5698 }, { "epoch": 0.4628441484609762, "grad_norm": 4.624889936593594, "learning_rate": 2.9205136300459803e-06, "loss": 0.463, "step": 5699 }, { "epoch": 0.4629253634370178, "grad_norm": 5.8554370873391655, "learning_rate": 2.919865363157265e-06, "loss": 0.6943, "step": 5700 }, { "epoch": 0.46300657841305937, "grad_norm": 10.564403475889055, "learning_rate": 2.9192170672160892e-06, "loss": 0.5849, "step": 5701 }, { "epoch": 0.46308779338910094, "grad_norm": 4.570004688540801, "learning_rate": 2.9185687422673103e-06, "loss": 0.5091, "step": 5702 }, { "epoch": 0.4631690083651425, "grad_norm": 7.370321832435026, "learning_rate": 2.917920388355791e-06, "loss": 0.4578, "step": 5703 }, { "epoch": 0.4632502233411841, "grad_norm": 4.488088245228176, "learning_rate": 2.9172720055263916e-06, "loss": 0.3637, "step": 5704 }, { "epoch": 0.4633314383172257, "grad_norm": 3.3278944590303086, "learning_rate": 2.9166235938239785e-06, "loss": 0.4415, "step": 5705 }, { "epoch": 0.46341265329326725, "grad_norm": 25.965744402127452, "learning_rate": 2.9159751532934165e-06, "loss": 0.4605, "step": 5706 }, { "epoch": 0.4634938682693089, "grad_norm": 5.292291969576549, "learning_rate": 2.9153266839795756e-06, "loss": 0.5707, "step": 5707 }, { "epoch": 0.46357508324535046, "grad_norm": 7.540226421886441, "learning_rate": 2.9146781859273276e-06, "loss": 0.4891, "step": 5708 }, { "epoch": 0.46365629822139204, "grad_norm": 3.896646180404308, "learning_rate": 2.9140296591815425e-06, "loss": 0.5333, "step": 5709 }, { "epoch": 0.4637375131974336, "grad_norm": 5.700465799524542, "learning_rate": 2.913381103787097e-06, "loss": 0.391, "step": 5710 }, { "epoch": 0.4638187281734752, "grad_norm": 4.798705492265267, "learning_rate": 2.9127325197888663e-06, "loss": 0.404, "step": 5711 }, { "epoch": 0.46389994314951677, "grad_norm": 3.5471558345306877, "learning_rate": 2.91208390723173e-06, "loss": 0.7544, "step": 5712 }, { "epoch": 0.46398115812555835, "grad_norm": 6.891850689361664, "learning_rate": 2.911435266160568e-06, "loss": 0.4801, "step": 5713 }, { "epoch": 0.4640623731015999, "grad_norm": 4.753158541053688, "learning_rate": 2.910786596620263e-06, "loss": 0.4848, "step": 5714 }, { "epoch": 0.4641435880776415, "grad_norm": 5.463566847148066, "learning_rate": 2.9101378986556996e-06, "loss": 0.4833, "step": 5715 }, { "epoch": 0.4642248030536831, "grad_norm": 3.542984250968721, "learning_rate": 2.909489172311765e-06, "loss": 0.7555, "step": 5716 }, { "epoch": 0.46430601802972465, "grad_norm": 4.081018198125554, "learning_rate": 2.9088404176333456e-06, "loss": 0.5655, "step": 5717 }, { "epoch": 0.4643872330057663, "grad_norm": 5.0004125127548, "learning_rate": 2.9081916346653333e-06, "loss": 0.6157, "step": 5718 }, { "epoch": 0.46446844798180786, "grad_norm": 6.309500154267012, "learning_rate": 2.9075428234526215e-06, "loss": 0.404, "step": 5719 }, { "epoch": 0.46454966295784944, "grad_norm": 5.199891325095606, "learning_rate": 2.9068939840401018e-06, "loss": 0.4213, "step": 5720 }, { "epoch": 0.464630877933891, "grad_norm": 3.3927116646573685, "learning_rate": 2.906245116472672e-06, "loss": 0.5366, "step": 5721 }, { "epoch": 0.4647120929099326, "grad_norm": 5.355679239024366, "learning_rate": 2.905596220795231e-06, "loss": 0.5221, "step": 5722 }, { "epoch": 0.46479330788597417, "grad_norm": 4.733947923123343, "learning_rate": 2.9049472970526777e-06, "loss": 0.49, "step": 5723 }, { "epoch": 0.46487452286201575, "grad_norm": 6.624090170841801, "learning_rate": 2.904298345289914e-06, "loss": 0.4846, "step": 5724 }, { "epoch": 0.4649557378380573, "grad_norm": 5.72885175850521, "learning_rate": 2.9036493655518456e-06, "loss": 0.4319, "step": 5725 }, { "epoch": 0.4650369528140989, "grad_norm": 4.332865010791888, "learning_rate": 2.9030003578833765e-06, "loss": 0.5975, "step": 5726 }, { "epoch": 0.4651181677901405, "grad_norm": 6.313800965150897, "learning_rate": 2.902351322329416e-06, "loss": 0.5159, "step": 5727 }, { "epoch": 0.46519938276618206, "grad_norm": 5.227382270145744, "learning_rate": 2.9017022589348733e-06, "loss": 0.393, "step": 5728 }, { "epoch": 0.4652805977422237, "grad_norm": 2.840558709344795, "learning_rate": 2.9010531677446602e-06, "loss": 0.52, "step": 5729 }, { "epoch": 0.46536181271826527, "grad_norm": 5.366773140432362, "learning_rate": 2.90040404880369e-06, "loss": 0.506, "step": 5730 }, { "epoch": 0.46544302769430684, "grad_norm": 3.2313831572145193, "learning_rate": 2.8997549021568792e-06, "loss": 0.6582, "step": 5731 }, { "epoch": 0.4655242426703484, "grad_norm": 5.926467612756465, "learning_rate": 2.899105727849145e-06, "loss": 0.4414, "step": 5732 }, { "epoch": 0.46560545764639, "grad_norm": 4.447511140460557, "learning_rate": 2.898456525925406e-06, "loss": 0.54, "step": 5733 }, { "epoch": 0.4656866726224316, "grad_norm": 5.303134033820951, "learning_rate": 2.8978072964305848e-06, "loss": 0.4579, "step": 5734 }, { "epoch": 0.46576788759847315, "grad_norm": 5.195062102266287, "learning_rate": 2.8971580394096043e-06, "loss": 0.4434, "step": 5735 }, { "epoch": 0.46584910257451473, "grad_norm": 5.8469679642289165, "learning_rate": 2.896508754907389e-06, "loss": 0.5036, "step": 5736 }, { "epoch": 0.4659303175505563, "grad_norm": 3.484649611013429, "learning_rate": 2.8958594429688656e-06, "loss": 0.483, "step": 5737 }, { "epoch": 0.4660115325265979, "grad_norm": 7.276763839964852, "learning_rate": 2.895210103638966e-06, "loss": 0.5142, "step": 5738 }, { "epoch": 0.46609274750263946, "grad_norm": 4.316468383561267, "learning_rate": 2.894560736962617e-06, "loss": 0.6067, "step": 5739 }, { "epoch": 0.4661739624786811, "grad_norm": 3.8962363928693184, "learning_rate": 2.893911342984754e-06, "loss": 0.7335, "step": 5740 }, { "epoch": 0.46625517745472267, "grad_norm": 4.124297053875989, "learning_rate": 2.89326192175031e-06, "loss": 0.4862, "step": 5741 }, { "epoch": 0.46633639243076425, "grad_norm": 5.37302630861064, "learning_rate": 2.8926124733042228e-06, "loss": 0.3562, "step": 5742 }, { "epoch": 0.4664176074068058, "grad_norm": 5.046203124192214, "learning_rate": 2.89196299769143e-06, "loss": 0.7901, "step": 5743 }, { "epoch": 0.4664988223828474, "grad_norm": 8.053971161102101, "learning_rate": 2.8913134949568726e-06, "loss": 0.5199, "step": 5744 }, { "epoch": 0.466580037358889, "grad_norm": 3.327749003210284, "learning_rate": 2.890663965145492e-06, "loss": 0.5486, "step": 5745 }, { "epoch": 0.46666125233493055, "grad_norm": 5.0353868986052674, "learning_rate": 2.890014408302233e-06, "loss": 0.6335, "step": 5746 }, { "epoch": 0.46674246731097213, "grad_norm": 6.139502965392141, "learning_rate": 2.8893648244720406e-06, "loss": 0.4263, "step": 5747 }, { "epoch": 0.4668236822870137, "grad_norm": 5.019570855865613, "learning_rate": 2.8887152136998644e-06, "loss": 0.5657, "step": 5748 }, { "epoch": 0.4669048972630553, "grad_norm": 16.504398142409077, "learning_rate": 2.8880655760306507e-06, "loss": 0.3836, "step": 5749 }, { "epoch": 0.46698611223909686, "grad_norm": 5.525848295027518, "learning_rate": 2.887415911509354e-06, "loss": 0.6306, "step": 5750 }, { "epoch": 0.4670673272151385, "grad_norm": 4.288458640459229, "learning_rate": 2.8867662201809266e-06, "loss": 0.6479, "step": 5751 }, { "epoch": 0.46714854219118007, "grad_norm": 8.844918615051897, "learning_rate": 2.8861165020903235e-06, "loss": 0.5476, "step": 5752 }, { "epoch": 0.46722975716722165, "grad_norm": 8.708089067131917, "learning_rate": 2.8854667572825013e-06, "loss": 0.4022, "step": 5753 }, { "epoch": 0.4673109721432632, "grad_norm": 3.9421995828081897, "learning_rate": 2.8848169858024206e-06, "loss": 0.4588, "step": 5754 }, { "epoch": 0.4673921871193048, "grad_norm": 3.8977256522289374, "learning_rate": 2.8841671876950404e-06, "loss": 0.4142, "step": 5755 }, { "epoch": 0.4674734020953464, "grad_norm": 8.084593014494807, "learning_rate": 2.8835173630053244e-06, "loss": 0.4468, "step": 5756 }, { "epoch": 0.46755461707138796, "grad_norm": 3.533484576984449, "learning_rate": 2.882867511778237e-06, "loss": 0.5784, "step": 5757 }, { "epoch": 0.46763583204742953, "grad_norm": 4.263459871167963, "learning_rate": 2.8822176340587434e-06, "loss": 0.6078, "step": 5758 }, { "epoch": 0.4677170470234711, "grad_norm": 3.9845148250837865, "learning_rate": 2.881567729891812e-06, "loss": 0.4972, "step": 5759 }, { "epoch": 0.4677982619995127, "grad_norm": 7.062828717835998, "learning_rate": 2.8809177993224143e-06, "loss": 0.5564, "step": 5760 }, { "epoch": 0.46787947697555426, "grad_norm": 4.435936639131757, "learning_rate": 2.88026784239552e-06, "loss": 0.573, "step": 5761 }, { "epoch": 0.4679606919515959, "grad_norm": 4.512414174366236, "learning_rate": 2.8796178591561035e-06, "loss": 0.4828, "step": 5762 }, { "epoch": 0.4680419069276375, "grad_norm": 5.094918568491417, "learning_rate": 2.8789678496491407e-06, "loss": 0.5475, "step": 5763 }, { "epoch": 0.46812312190367905, "grad_norm": 2.6728500983692514, "learning_rate": 2.878317813919608e-06, "loss": 0.5026, "step": 5764 }, { "epoch": 0.46820433687972063, "grad_norm": 4.7597544395019495, "learning_rate": 2.877667752012485e-06, "loss": 0.7266, "step": 5765 }, { "epoch": 0.4682855518557622, "grad_norm": 6.8641010734258, "learning_rate": 2.877017663972752e-06, "loss": 0.4941, "step": 5766 }, { "epoch": 0.4683667668318038, "grad_norm": 5.927018173381962, "learning_rate": 2.876367549845393e-06, "loss": 0.5093, "step": 5767 }, { "epoch": 0.46844798180784536, "grad_norm": 3.4289610085754383, "learning_rate": 2.875717409675391e-06, "loss": 0.5454, "step": 5768 }, { "epoch": 0.46852919678388694, "grad_norm": 7.9437461607693, "learning_rate": 2.875067243507732e-06, "loss": 0.4647, "step": 5769 }, { "epoch": 0.4686104117599285, "grad_norm": 3.7116341834744078, "learning_rate": 2.8744170513874054e-06, "loss": 0.4881, "step": 5770 }, { "epoch": 0.4686916267359701, "grad_norm": 4.2367362028156, "learning_rate": 2.8737668333594005e-06, "loss": 0.4672, "step": 5771 }, { "epoch": 0.46877284171201167, "grad_norm": 4.273237686231666, "learning_rate": 2.873116589468708e-06, "loss": 0.4611, "step": 5772 }, { "epoch": 0.4688540566880533, "grad_norm": 5.096615588091853, "learning_rate": 2.872466319760323e-06, "loss": 0.5954, "step": 5773 }, { "epoch": 0.4689352716640949, "grad_norm": 4.825210645748332, "learning_rate": 2.87181602427924e-06, "loss": 0.5718, "step": 5774 }, { "epoch": 0.46901648664013645, "grad_norm": 5.991461149755983, "learning_rate": 2.8711657030704553e-06, "loss": 0.4037, "step": 5775 }, { "epoch": 0.46909770161617803, "grad_norm": 3.4800097173343776, "learning_rate": 2.870515356178969e-06, "loss": 0.513, "step": 5776 }, { "epoch": 0.4691789165922196, "grad_norm": 6.256528064000865, "learning_rate": 2.8698649836497805e-06, "loss": 0.6655, "step": 5777 }, { "epoch": 0.4692601315682612, "grad_norm": 5.04158800857517, "learning_rate": 2.869214585527893e-06, "loss": 0.4901, "step": 5778 }, { "epoch": 0.46934134654430276, "grad_norm": 4.617317169571152, "learning_rate": 2.8685641618583098e-06, "loss": 0.4887, "step": 5779 }, { "epoch": 0.46942256152034434, "grad_norm": 3.9855464372411147, "learning_rate": 2.8679137126860373e-06, "loss": 0.647, "step": 5780 }, { "epoch": 0.4695037764963859, "grad_norm": 4.5232125393874965, "learning_rate": 2.867263238056084e-06, "loss": 0.5514, "step": 5781 }, { "epoch": 0.4695849914724275, "grad_norm": 6.069972056894211, "learning_rate": 2.866612738013457e-06, "loss": 0.6096, "step": 5782 }, { "epoch": 0.46966620644846907, "grad_norm": 4.326041253791683, "learning_rate": 2.8659622126031687e-06, "loss": 0.5519, "step": 5783 }, { "epoch": 0.4697474214245107, "grad_norm": 6.02872157760923, "learning_rate": 2.8653116618702338e-06, "loss": 0.5394, "step": 5784 }, { "epoch": 0.4698286364005523, "grad_norm": 12.764955573129805, "learning_rate": 2.8646610858596635e-06, "loss": 0.5991, "step": 5785 }, { "epoch": 0.46990985137659386, "grad_norm": 5.345811501835503, "learning_rate": 2.864010484616477e-06, "loss": 0.54, "step": 5786 }, { "epoch": 0.46999106635263543, "grad_norm": 4.1825362641291415, "learning_rate": 2.8633598581856915e-06, "loss": 0.515, "step": 5787 }, { "epoch": 0.470072281328677, "grad_norm": 4.283001560216305, "learning_rate": 2.8627092066123263e-06, "loss": 0.4081, "step": 5788 }, { "epoch": 0.4701534963047186, "grad_norm": 4.308589294597862, "learning_rate": 2.8620585299414038e-06, "loss": 0.5021, "step": 5789 }, { "epoch": 0.47023471128076016, "grad_norm": 8.01361742978466, "learning_rate": 2.861407828217947e-06, "loss": 0.4754, "step": 5790 }, { "epoch": 0.47031592625680174, "grad_norm": 5.074919316163974, "learning_rate": 2.8607571014869816e-06, "loss": 0.5081, "step": 5791 }, { "epoch": 0.4703971412328433, "grad_norm": 4.9471885543147325, "learning_rate": 2.860106349793534e-06, "loss": 0.6083, "step": 5792 }, { "epoch": 0.4704783562088849, "grad_norm": 5.520036465193203, "learning_rate": 2.859455573182632e-06, "loss": 0.5181, "step": 5793 }, { "epoch": 0.4705595711849265, "grad_norm": 8.857086777181047, "learning_rate": 2.8588047716993084e-06, "loss": 0.402, "step": 5794 }, { "epoch": 0.4706407861609681, "grad_norm": 5.525749548650858, "learning_rate": 2.858153945388592e-06, "loss": 0.4228, "step": 5795 }, { "epoch": 0.4707220011370097, "grad_norm": 4.365954557012976, "learning_rate": 2.8575030942955185e-06, "loss": 0.3335, "step": 5796 }, { "epoch": 0.47080321611305126, "grad_norm": 5.357132843700224, "learning_rate": 2.856852218465124e-06, "loss": 0.5517, "step": 5797 }, { "epoch": 0.47088443108909284, "grad_norm": 5.2190595114312694, "learning_rate": 2.856201317942443e-06, "loss": 0.4705, "step": 5798 }, { "epoch": 0.4709656460651344, "grad_norm": 3.424227721385115, "learning_rate": 2.8555503927725164e-06, "loss": 0.4894, "step": 5799 }, { "epoch": 0.471046861041176, "grad_norm": 5.0569787088343325, "learning_rate": 2.854899443000385e-06, "loss": 0.4604, "step": 5800 }, { "epoch": 0.47112807601721757, "grad_norm": 8.187150772848494, "learning_rate": 2.8542484686710896e-06, "loss": 0.5102, "step": 5801 }, { "epoch": 0.47120929099325914, "grad_norm": 6.207010087185806, "learning_rate": 2.8535974698296765e-06, "loss": 0.3528, "step": 5802 }, { "epoch": 0.4712905059693007, "grad_norm": 4.227888017544308, "learning_rate": 2.8529464465211886e-06, "loss": 0.6394, "step": 5803 }, { "epoch": 0.4713717209453423, "grad_norm": 4.665116991045132, "learning_rate": 2.852295398790675e-06, "loss": 0.6159, "step": 5804 }, { "epoch": 0.4714529359213839, "grad_norm": 3.8703424664055857, "learning_rate": 2.8516443266831837e-06, "loss": 0.673, "step": 5805 }, { "epoch": 0.4715341508974255, "grad_norm": 3.9676189612736388, "learning_rate": 2.8509932302437665e-06, "loss": 0.5117, "step": 5806 }, { "epoch": 0.4716153658734671, "grad_norm": 4.329273160183401, "learning_rate": 2.850342109517475e-06, "loss": 0.3692, "step": 5807 }, { "epoch": 0.47169658084950866, "grad_norm": 11.340360099111553, "learning_rate": 2.8496909645493642e-06, "loss": 0.4246, "step": 5808 }, { "epoch": 0.47177779582555024, "grad_norm": 3.8942256837186626, "learning_rate": 2.849039795384489e-06, "loss": 0.5581, "step": 5809 }, { "epoch": 0.4718590108015918, "grad_norm": 9.155900656113834, "learning_rate": 2.8483886020679075e-06, "loss": 0.4519, "step": 5810 }, { "epoch": 0.4719402257776334, "grad_norm": 6.0033962365746465, "learning_rate": 2.847737384644678e-06, "loss": 0.5612, "step": 5811 }, { "epoch": 0.47202144075367497, "grad_norm": 5.889887605386142, "learning_rate": 2.8470861431598623e-06, "loss": 0.6559, "step": 5812 }, { "epoch": 0.47210265572971655, "grad_norm": 4.599377492190549, "learning_rate": 2.8464348776585234e-06, "loss": 0.635, "step": 5813 }, { "epoch": 0.4721838707057581, "grad_norm": 10.76890029919621, "learning_rate": 2.8457835881857227e-06, "loss": 0.4829, "step": 5814 }, { "epoch": 0.4722650856817997, "grad_norm": 7.220464203607489, "learning_rate": 2.8451322747865286e-06, "loss": 0.5789, "step": 5815 }, { "epoch": 0.47234630065784133, "grad_norm": 4.1108394338985725, "learning_rate": 2.844480937506008e-06, "loss": 0.609, "step": 5816 }, { "epoch": 0.4724275156338829, "grad_norm": 9.04222984053817, "learning_rate": 2.843829576389229e-06, "loss": 0.4926, "step": 5817 }, { "epoch": 0.4725087306099245, "grad_norm": 4.528427577254921, "learning_rate": 2.843178191481263e-06, "loss": 0.6624, "step": 5818 }, { "epoch": 0.47258994558596606, "grad_norm": 3.3028903657064825, "learning_rate": 2.842526782827183e-06, "loss": 0.5929, "step": 5819 }, { "epoch": 0.47267116056200764, "grad_norm": 5.026903257750425, "learning_rate": 2.841875350472062e-06, "loss": 0.5268, "step": 5820 }, { "epoch": 0.4727523755380492, "grad_norm": 6.5104574690823975, "learning_rate": 2.841223894460976e-06, "loss": 0.6567, "step": 5821 }, { "epoch": 0.4728335905140908, "grad_norm": 4.314065979269395, "learning_rate": 2.8405724148390023e-06, "loss": 0.449, "step": 5822 }, { "epoch": 0.4729148054901324, "grad_norm": 3.312866780207277, "learning_rate": 2.8399209116512204e-06, "loss": 0.5964, "step": 5823 }, { "epoch": 0.47299602046617395, "grad_norm": 5.37935898304279, "learning_rate": 2.83926938494271e-06, "loss": 0.5702, "step": 5824 }, { "epoch": 0.4730772354422155, "grad_norm": 4.881465697522925, "learning_rate": 2.838617834758554e-06, "loss": 0.4697, "step": 5825 }, { "epoch": 0.4731584504182571, "grad_norm": 5.744126603133613, "learning_rate": 2.8379662611438356e-06, "loss": 0.4512, "step": 5826 }, { "epoch": 0.47323966539429874, "grad_norm": 5.291309755595148, "learning_rate": 2.8373146641436413e-06, "loss": 0.5276, "step": 5827 }, { "epoch": 0.4733208803703403, "grad_norm": 3.9537639542161407, "learning_rate": 2.836663043803057e-06, "loss": 0.5143, "step": 5828 }, { "epoch": 0.4734020953463819, "grad_norm": 7.745428674706201, "learning_rate": 2.8360114001671724e-06, "loss": 0.4687, "step": 5829 }, { "epoch": 0.47348331032242347, "grad_norm": 4.510709091006437, "learning_rate": 2.835359733281077e-06, "loss": 0.4782, "step": 5830 }, { "epoch": 0.47356452529846504, "grad_norm": 7.03651123334896, "learning_rate": 2.834708043189862e-06, "loss": 0.4816, "step": 5831 }, { "epoch": 0.4736457402745066, "grad_norm": 4.581264549358578, "learning_rate": 2.8340563299386226e-06, "loss": 0.3454, "step": 5832 }, { "epoch": 0.4737269552505482, "grad_norm": 3.39063784376629, "learning_rate": 2.833404593572453e-06, "loss": 0.3696, "step": 5833 }, { "epoch": 0.4738081702265898, "grad_norm": 5.734013115764664, "learning_rate": 2.832752834136449e-06, "loss": 0.4435, "step": 5834 }, { "epoch": 0.47388938520263135, "grad_norm": 4.343417166849715, "learning_rate": 2.832101051675712e-06, "loss": 0.6581, "step": 5835 }, { "epoch": 0.47397060017867293, "grad_norm": 3.661535617789465, "learning_rate": 2.8314492462353386e-06, "loss": 0.5384, "step": 5836 }, { "epoch": 0.4740518151547145, "grad_norm": 4.785958660433438, "learning_rate": 2.8307974178604312e-06, "loss": 0.5199, "step": 5837 }, { "epoch": 0.47413303013075614, "grad_norm": 7.777693529299737, "learning_rate": 2.830145566596094e-06, "loss": 0.5311, "step": 5838 }, { "epoch": 0.4742142451067977, "grad_norm": 6.275994532015221, "learning_rate": 2.8294936924874304e-06, "loss": 0.6261, "step": 5839 }, { "epoch": 0.4742954600828393, "grad_norm": 5.198840400244198, "learning_rate": 2.8288417955795476e-06, "loss": 0.5628, "step": 5840 }, { "epoch": 0.47437667505888087, "grad_norm": 3.7789894977053615, "learning_rate": 2.828189875917553e-06, "loss": 0.6039, "step": 5841 }, { "epoch": 0.47445789003492245, "grad_norm": 4.356995519309679, "learning_rate": 2.827537933546555e-06, "loss": 0.6307, "step": 5842 }, { "epoch": 0.474539105010964, "grad_norm": 4.052004440671419, "learning_rate": 2.8268859685116663e-06, "loss": 0.5372, "step": 5843 }, { "epoch": 0.4746203199870056, "grad_norm": 3.622092711482535, "learning_rate": 2.826233980857998e-06, "loss": 0.534, "step": 5844 }, { "epoch": 0.4747015349630472, "grad_norm": 6.265338779708121, "learning_rate": 2.8255819706306653e-06, "loss": 0.4377, "step": 5845 }, { "epoch": 0.47478274993908876, "grad_norm": 5.902804627865366, "learning_rate": 2.8249299378747833e-06, "loss": 0.3812, "step": 5846 }, { "epoch": 0.47486396491513033, "grad_norm": 7.988753927468825, "learning_rate": 2.824277882635469e-06, "loss": 0.5673, "step": 5847 }, { "epoch": 0.4749451798911719, "grad_norm": 10.87808274886899, "learning_rate": 2.8236258049578418e-06, "loss": 0.5389, "step": 5848 }, { "epoch": 0.47502639486721354, "grad_norm": 5.350053974435865, "learning_rate": 2.8229737048870216e-06, "loss": 0.4301, "step": 5849 }, { "epoch": 0.4751076098432551, "grad_norm": 5.918757588156862, "learning_rate": 2.8223215824681295e-06, "loss": 0.5695, "step": 5850 }, { "epoch": 0.4751888248192967, "grad_norm": 12.49565829789524, "learning_rate": 2.821669437746291e-06, "loss": 0.6008, "step": 5851 }, { "epoch": 0.4752700397953383, "grad_norm": 7.165842116453641, "learning_rate": 2.8210172707666296e-06, "loss": 0.5504, "step": 5852 }, { "epoch": 0.47535125477137985, "grad_norm": 5.786887937045634, "learning_rate": 2.820365081574271e-06, "loss": 0.5192, "step": 5853 }, { "epoch": 0.4754324697474214, "grad_norm": 6.157943347201467, "learning_rate": 2.819712870214345e-06, "loss": 0.5769, "step": 5854 }, { "epoch": 0.475513684723463, "grad_norm": 15.746368000393133, "learning_rate": 2.8190606367319806e-06, "loss": 0.4668, "step": 5855 }, { "epoch": 0.4755948996995046, "grad_norm": 7.868624147224612, "learning_rate": 2.8184083811723083e-06, "loss": 0.4084, "step": 5856 }, { "epoch": 0.47567611467554616, "grad_norm": 16.215616004236644, "learning_rate": 2.817756103580461e-06, "loss": 0.588, "step": 5857 }, { "epoch": 0.47575732965158773, "grad_norm": 4.747406090572309, "learning_rate": 2.8171038040015737e-06, "loss": 0.5907, "step": 5858 }, { "epoch": 0.4758385446276293, "grad_norm": 4.9449474953603305, "learning_rate": 2.8164514824807814e-06, "loss": 0.5015, "step": 5859 }, { "epoch": 0.47591975960367094, "grad_norm": 13.074384603626209, "learning_rate": 2.8157991390632206e-06, "loss": 0.6872, "step": 5860 }, { "epoch": 0.4760009745797125, "grad_norm": 6.893964042250645, "learning_rate": 2.8151467737940312e-06, "loss": 0.4646, "step": 5861 }, { "epoch": 0.4760821895557541, "grad_norm": 11.048526461851749, "learning_rate": 2.8144943867183535e-06, "loss": 0.6181, "step": 5862 }, { "epoch": 0.4761634045317957, "grad_norm": 5.148676616822644, "learning_rate": 2.8138419778813274e-06, "loss": 0.5295, "step": 5863 }, { "epoch": 0.47624461950783725, "grad_norm": 7.31376261027769, "learning_rate": 2.8131895473280985e-06, "loss": 0.6112, "step": 5864 }, { "epoch": 0.47632583448387883, "grad_norm": 8.847836919115894, "learning_rate": 2.81253709510381e-06, "loss": 0.5427, "step": 5865 }, { "epoch": 0.4764070494599204, "grad_norm": 3.549876129014136, "learning_rate": 2.811884621253608e-06, "loss": 0.5135, "step": 5866 }, { "epoch": 0.476488264435962, "grad_norm": 4.676010284588651, "learning_rate": 2.811232125822642e-06, "loss": 0.6499, "step": 5867 }, { "epoch": 0.47656947941200356, "grad_norm": 4.475545565711852, "learning_rate": 2.81057960885606e-06, "loss": 0.4845, "step": 5868 }, { "epoch": 0.47665069438804514, "grad_norm": 5.150829015821627, "learning_rate": 2.8099270703990124e-06, "loss": 0.4768, "step": 5869 }, { "epoch": 0.4767319093640867, "grad_norm": 6.981268549767182, "learning_rate": 2.8092745104966514e-06, "loss": 0.5269, "step": 5870 }, { "epoch": 0.47681312434012835, "grad_norm": 4.317381690472616, "learning_rate": 2.8086219291941314e-06, "loss": 0.5675, "step": 5871 }, { "epoch": 0.4768943393161699, "grad_norm": 6.190644029515262, "learning_rate": 2.807969326536607e-06, "loss": 0.4985, "step": 5872 }, { "epoch": 0.4769755542922115, "grad_norm": 3.4066085952901073, "learning_rate": 2.8073167025692354e-06, "loss": 0.5183, "step": 5873 }, { "epoch": 0.4770567692682531, "grad_norm": 5.692886429945919, "learning_rate": 2.8066640573371747e-06, "loss": 0.5322, "step": 5874 }, { "epoch": 0.47713798424429466, "grad_norm": 7.642876548549243, "learning_rate": 2.8060113908855847e-06, "loss": 0.6323, "step": 5875 }, { "epoch": 0.47721919922033623, "grad_norm": 17.72297787277452, "learning_rate": 2.805358703259624e-06, "loss": 0.5077, "step": 5876 }, { "epoch": 0.4773004141963778, "grad_norm": 5.581364142761338, "learning_rate": 2.8047059945044585e-06, "loss": 0.4381, "step": 5877 }, { "epoch": 0.4773816291724194, "grad_norm": 9.67091630066501, "learning_rate": 2.8040532646652515e-06, "loss": 0.4763, "step": 5878 }, { "epoch": 0.47746284414846096, "grad_norm": 4.1891971217435575, "learning_rate": 2.803400513787166e-06, "loss": 0.5316, "step": 5879 }, { "epoch": 0.47754405912450254, "grad_norm": 6.17817844727735, "learning_rate": 2.802747741915372e-06, "loss": 0.5111, "step": 5880 }, { "epoch": 0.4776252741005441, "grad_norm": 4.9847568946301015, "learning_rate": 2.8020949490950367e-06, "loss": 0.5109, "step": 5881 }, { "epoch": 0.47770648907658575, "grad_norm": 4.473462329646816, "learning_rate": 2.801442135371329e-06, "loss": 0.6185, "step": 5882 }, { "epoch": 0.4777877040526273, "grad_norm": 6.199201466381722, "learning_rate": 2.800789300789421e-06, "loss": 0.4709, "step": 5883 }, { "epoch": 0.4778689190286689, "grad_norm": 3.4927971670876157, "learning_rate": 2.8001364453944853e-06, "loss": 0.658, "step": 5884 }, { "epoch": 0.4779501340047105, "grad_norm": 9.252290801408948, "learning_rate": 2.799483569231696e-06, "loss": 0.3258, "step": 5885 }, { "epoch": 0.47803134898075206, "grad_norm": 5.159111959600434, "learning_rate": 2.798830672346229e-06, "loss": 0.7327, "step": 5886 }, { "epoch": 0.47811256395679363, "grad_norm": 4.997715592914455, "learning_rate": 2.7981777547832604e-06, "loss": 0.5373, "step": 5887 }, { "epoch": 0.4781937789328352, "grad_norm": 7.611885925164203, "learning_rate": 2.7975248165879697e-06, "loss": 0.4249, "step": 5888 }, { "epoch": 0.4782749939088768, "grad_norm": 8.402898696861591, "learning_rate": 2.7968718578055365e-06, "loss": 0.5686, "step": 5889 }, { "epoch": 0.47835620888491837, "grad_norm": 4.514667421391678, "learning_rate": 2.796218878481142e-06, "loss": 0.5125, "step": 5890 }, { "epoch": 0.47843742386095994, "grad_norm": 6.200592103587213, "learning_rate": 2.7955658786599688e-06, "loss": 0.4591, "step": 5891 }, { "epoch": 0.4785186388370015, "grad_norm": 3.9141827656240262, "learning_rate": 2.7949128583872e-06, "loss": 0.6035, "step": 5892 }, { "epoch": 0.47859985381304315, "grad_norm": 2.881200756727726, "learning_rate": 2.7942598177080233e-06, "loss": 0.5105, "step": 5893 }, { "epoch": 0.47868106878908473, "grad_norm": 4.876865531343913, "learning_rate": 2.7936067566676244e-06, "loss": 0.4788, "step": 5894 }, { "epoch": 0.4787622837651263, "grad_norm": 4.129321830180821, "learning_rate": 2.792953675311192e-06, "loss": 0.4933, "step": 5895 }, { "epoch": 0.4788434987411679, "grad_norm": 4.975168884090526, "learning_rate": 2.792300573683915e-06, "loss": 0.6777, "step": 5896 }, { "epoch": 0.47892471371720946, "grad_norm": 4.373706953851931, "learning_rate": 2.7916474518309854e-06, "loss": 0.6101, "step": 5897 }, { "epoch": 0.47900592869325104, "grad_norm": 6.787308432476194, "learning_rate": 2.790994309797596e-06, "loss": 0.5524, "step": 5898 }, { "epoch": 0.4790871436692926, "grad_norm": 3.130380915868563, "learning_rate": 2.79034114762894e-06, "loss": 0.5455, "step": 5899 }, { "epoch": 0.4791683586453342, "grad_norm": 5.155056558227334, "learning_rate": 2.789687965370214e-06, "loss": 0.405, "step": 5900 }, { "epoch": 0.47924957362137577, "grad_norm": 5.772687941455542, "learning_rate": 2.7890347630666135e-06, "loss": 0.498, "step": 5901 }, { "epoch": 0.47933078859741735, "grad_norm": 4.464910286196713, "learning_rate": 2.788381540763337e-06, "loss": 0.4895, "step": 5902 }, { "epoch": 0.4794120035734589, "grad_norm": 6.222904764201984, "learning_rate": 2.787728298505584e-06, "loss": 0.456, "step": 5903 }, { "epoch": 0.47949321854950055, "grad_norm": 30.493510617091204, "learning_rate": 2.787075036338556e-06, "loss": 0.4607, "step": 5904 }, { "epoch": 0.47957443352554213, "grad_norm": 12.321251377438697, "learning_rate": 2.7864217543074544e-06, "loss": 0.545, "step": 5905 }, { "epoch": 0.4796556485015837, "grad_norm": 5.80807114653115, "learning_rate": 2.7857684524574833e-06, "loss": 0.6029, "step": 5906 }, { "epoch": 0.4797368634776253, "grad_norm": 7.259661847105047, "learning_rate": 2.7851151308338483e-06, "loss": 0.5074, "step": 5907 }, { "epoch": 0.47981807845366686, "grad_norm": 5.490831475001269, "learning_rate": 2.784461789481754e-06, "loss": 0.4931, "step": 5908 }, { "epoch": 0.47989929342970844, "grad_norm": 10.451899945538207, "learning_rate": 2.7838084284464105e-06, "loss": 0.3872, "step": 5909 }, { "epoch": 0.47998050840575, "grad_norm": 7.293146517834257, "learning_rate": 2.7831550477730255e-06, "loss": 0.6436, "step": 5910 }, { "epoch": 0.4800617233817916, "grad_norm": 6.817978018689368, "learning_rate": 2.78250164750681e-06, "loss": 0.4786, "step": 5911 }, { "epoch": 0.48014293835783317, "grad_norm": 5.17758792887419, "learning_rate": 2.781848227692974e-06, "loss": 0.316, "step": 5912 }, { "epoch": 0.48022415333387475, "grad_norm": 9.128124210915745, "learning_rate": 2.7811947883767343e-06, "loss": 0.4606, "step": 5913 }, { "epoch": 0.4803053683099163, "grad_norm": 6.296126142033211, "learning_rate": 2.780541329603303e-06, "loss": 0.4745, "step": 5914 }, { "epoch": 0.48038658328595796, "grad_norm": 6.296563355774902, "learning_rate": 2.7798878514178955e-06, "loss": 0.5816, "step": 5915 }, { "epoch": 0.48046779826199953, "grad_norm": 4.08831948588252, "learning_rate": 2.779234353865731e-06, "loss": 0.4533, "step": 5916 }, { "epoch": 0.4805490132380411, "grad_norm": 6.198220168009877, "learning_rate": 2.7785808369920263e-06, "loss": 0.52, "step": 5917 }, { "epoch": 0.4806302282140827, "grad_norm": 7.348946572016875, "learning_rate": 2.777927300842003e-06, "loss": 0.4988, "step": 5918 }, { "epoch": 0.48071144319012427, "grad_norm": 7.4243657712578, "learning_rate": 2.7772737454608804e-06, "loss": 0.5427, "step": 5919 }, { "epoch": 0.48079265816616584, "grad_norm": 6.240520113282074, "learning_rate": 2.7766201708938823e-06, "loss": 0.6156, "step": 5920 }, { "epoch": 0.4808738731422074, "grad_norm": 4.092737052613775, "learning_rate": 2.7759665771862324e-06, "loss": 0.4755, "step": 5921 }, { "epoch": 0.480955088118249, "grad_norm": 4.543429446499594, "learning_rate": 2.775312964383156e-06, "loss": 0.4982, "step": 5922 }, { "epoch": 0.4810363030942906, "grad_norm": 4.139943817706591, "learning_rate": 2.77465933252988e-06, "loss": 0.6698, "step": 5923 }, { "epoch": 0.48111751807033215, "grad_norm": 4.420260702078683, "learning_rate": 2.7740056816716317e-06, "loss": 0.4612, "step": 5924 }, { "epoch": 0.48119873304637373, "grad_norm": 8.399576878392056, "learning_rate": 2.7733520118536395e-06, "loss": 0.4372, "step": 5925 }, { "epoch": 0.48127994802241536, "grad_norm": 8.529789641771623, "learning_rate": 2.772698323121135e-06, "loss": 0.4788, "step": 5926 }, { "epoch": 0.48136116299845694, "grad_norm": 6.033494247535142, "learning_rate": 2.7720446155193503e-06, "loss": 0.8274, "step": 5927 }, { "epoch": 0.4814423779744985, "grad_norm": 3.680362638542244, "learning_rate": 2.7713908890935177e-06, "loss": 0.5214, "step": 5928 }, { "epoch": 0.4815235929505401, "grad_norm": 7.379140973686685, "learning_rate": 2.770737143888872e-06, "loss": 0.7151, "step": 5929 }, { "epoch": 0.48160480792658167, "grad_norm": 3.6816364647280584, "learning_rate": 2.7700833799506487e-06, "loss": 0.553, "step": 5930 }, { "epoch": 0.48168602290262325, "grad_norm": 5.987951257678209, "learning_rate": 2.7694295973240848e-06, "loss": 0.4937, "step": 5931 }, { "epoch": 0.4817672378786648, "grad_norm": 4.956448565368726, "learning_rate": 2.7687757960544193e-06, "loss": 0.4982, "step": 5932 }, { "epoch": 0.4818484528547064, "grad_norm": 4.49940857003615, "learning_rate": 2.7681219761868905e-06, "loss": 0.6454, "step": 5933 }, { "epoch": 0.481929667830748, "grad_norm": 4.009481443453415, "learning_rate": 2.7674681377667403e-06, "loss": 0.5949, "step": 5934 }, { "epoch": 0.48201088280678955, "grad_norm": 6.276103431385119, "learning_rate": 2.7668142808392102e-06, "loss": 0.6751, "step": 5935 }, { "epoch": 0.48209209778283113, "grad_norm": 5.264109990138893, "learning_rate": 2.7661604054495447e-06, "loss": 0.5605, "step": 5936 }, { "epoch": 0.48217331275887276, "grad_norm": 6.623257155307936, "learning_rate": 2.765506511642987e-06, "loss": 0.5536, "step": 5937 }, { "epoch": 0.48225452773491434, "grad_norm": 8.180552462619701, "learning_rate": 2.764852599464784e-06, "loss": 0.5155, "step": 5938 }, { "epoch": 0.4823357427109559, "grad_norm": 3.6982416953033996, "learning_rate": 2.764198668960183e-06, "loss": 0.4975, "step": 5939 }, { "epoch": 0.4824169576869975, "grad_norm": 4.58146111767092, "learning_rate": 2.7635447201744324e-06, "loss": 0.6719, "step": 5940 }, { "epoch": 0.48249817266303907, "grad_norm": 4.792428868707111, "learning_rate": 2.7628907531527815e-06, "loss": 0.5106, "step": 5941 }, { "epoch": 0.48257938763908065, "grad_norm": 5.140027653654479, "learning_rate": 2.762236767940482e-06, "loss": 0.3998, "step": 5942 }, { "epoch": 0.4826606026151222, "grad_norm": 4.42487569925798, "learning_rate": 2.761582764582787e-06, "loss": 0.4726, "step": 5943 }, { "epoch": 0.4827418175911638, "grad_norm": 4.997518890904461, "learning_rate": 2.760928743124948e-06, "loss": 0.5601, "step": 5944 }, { "epoch": 0.4828230325672054, "grad_norm": 3.799079838934848, "learning_rate": 2.7602747036122213e-06, "loss": 0.6182, "step": 5945 }, { "epoch": 0.48290424754324696, "grad_norm": 10.353271040341017, "learning_rate": 2.759620646089863e-06, "loss": 0.476, "step": 5946 }, { "epoch": 0.48298546251928853, "grad_norm": 5.666756315530278, "learning_rate": 2.758966570603129e-06, "loss": 0.5052, "step": 5947 }, { "epoch": 0.48306667749533017, "grad_norm": 6.165341654780337, "learning_rate": 2.7583124771972797e-06, "loss": 0.4994, "step": 5948 }, { "epoch": 0.48314789247137174, "grad_norm": 4.569577065572749, "learning_rate": 2.7576583659175738e-06, "loss": 0.439, "step": 5949 }, { "epoch": 0.4832291074474133, "grad_norm": 3.098941821564416, "learning_rate": 2.7570042368092724e-06, "loss": 0.5527, "step": 5950 }, { "epoch": 0.4833103224234549, "grad_norm": 5.497608597420854, "learning_rate": 2.7563500899176383e-06, "loss": 0.4795, "step": 5951 }, { "epoch": 0.4833915373994965, "grad_norm": 5.7165228874758505, "learning_rate": 2.7556959252879345e-06, "loss": 0.4909, "step": 5952 }, { "epoch": 0.48347275237553805, "grad_norm": 5.874962862560617, "learning_rate": 2.755041742965426e-06, "loss": 0.4677, "step": 5953 }, { "epoch": 0.4835539673515796, "grad_norm": 4.095451834207825, "learning_rate": 2.7543875429953787e-06, "loss": 0.6461, "step": 5954 }, { "epoch": 0.4836351823276212, "grad_norm": 4.473317225148218, "learning_rate": 2.7537333254230596e-06, "loss": 0.4963, "step": 5955 }, { "epoch": 0.4837163973036628, "grad_norm": 5.773342053411397, "learning_rate": 2.7530790902937376e-06, "loss": 0.4534, "step": 5956 }, { "epoch": 0.48379761227970436, "grad_norm": 3.510346924849262, "learning_rate": 2.752424837652681e-06, "loss": 0.6875, "step": 5957 }, { "epoch": 0.48387882725574594, "grad_norm": 7.132250653049774, "learning_rate": 2.751770567545163e-06, "loss": 0.5412, "step": 5958 }, { "epoch": 0.48396004223178757, "grad_norm": 5.085327115862414, "learning_rate": 2.7511162800164536e-06, "loss": 0.7837, "step": 5959 }, { "epoch": 0.48404125720782915, "grad_norm": 7.899170324549878, "learning_rate": 2.7504619751118266e-06, "loss": 0.5815, "step": 5960 }, { "epoch": 0.4841224721838707, "grad_norm": 12.534688873820935, "learning_rate": 2.749807652876556e-06, "loss": 0.5867, "step": 5961 }, { "epoch": 0.4842036871599123, "grad_norm": 5.888585867450327, "learning_rate": 2.749153313355919e-06, "loss": 0.4844, "step": 5962 }, { "epoch": 0.4842849021359539, "grad_norm": 4.658146495411607, "learning_rate": 2.74849895659519e-06, "loss": 0.4221, "step": 5963 }, { "epoch": 0.48436611711199545, "grad_norm": 8.00901759996569, "learning_rate": 2.7478445826396495e-06, "loss": 0.3995, "step": 5964 }, { "epoch": 0.48444733208803703, "grad_norm": 4.722557156452973, "learning_rate": 2.747190191534575e-06, "loss": 0.5922, "step": 5965 }, { "epoch": 0.4845285470640786, "grad_norm": 4.907315907025934, "learning_rate": 2.7465357833252477e-06, "loss": 0.483, "step": 5966 }, { "epoch": 0.4846097620401202, "grad_norm": 3.780776893956639, "learning_rate": 2.7458813580569487e-06, "loss": 0.5136, "step": 5967 }, { "epoch": 0.48469097701616176, "grad_norm": 4.759267167903013, "learning_rate": 2.7452269157749614e-06, "loss": 0.6048, "step": 5968 }, { "epoch": 0.48477219199220334, "grad_norm": 5.977936036351052, "learning_rate": 2.744572456524569e-06, "loss": 0.5071, "step": 5969 }, { "epoch": 0.48485340696824497, "grad_norm": 5.801052897330955, "learning_rate": 2.7439179803510567e-06, "loss": 0.3259, "step": 5970 }, { "epoch": 0.48493462194428655, "grad_norm": 5.504537267230917, "learning_rate": 2.7432634872997123e-06, "loss": 0.454, "step": 5971 }, { "epoch": 0.4850158369203281, "grad_norm": 5.980973589761268, "learning_rate": 2.7426089774158217e-06, "loss": 0.5151, "step": 5972 }, { "epoch": 0.4850970518963697, "grad_norm": 6.090787296944255, "learning_rate": 2.7419544507446727e-06, "loss": 0.5434, "step": 5973 }, { "epoch": 0.4851782668724113, "grad_norm": 5.184785779288021, "learning_rate": 2.7412999073315567e-06, "loss": 0.6548, "step": 5974 }, { "epoch": 0.48525948184845286, "grad_norm": 3.168315464569746, "learning_rate": 2.7406453472217654e-06, "loss": 0.5838, "step": 5975 }, { "epoch": 0.48534069682449443, "grad_norm": 5.7401424667695204, "learning_rate": 2.7399907704605884e-06, "loss": 0.4268, "step": 5976 }, { "epoch": 0.485421911800536, "grad_norm": 8.886061000991853, "learning_rate": 2.7393361770933198e-06, "loss": 0.4986, "step": 5977 }, { "epoch": 0.4855031267765776, "grad_norm": 4.804673355461681, "learning_rate": 2.7386815671652556e-06, "loss": 0.6466, "step": 5978 }, { "epoch": 0.48558434175261916, "grad_norm": 5.321396681856843, "learning_rate": 2.7380269407216896e-06, "loss": 0.4684, "step": 5979 }, { "epoch": 0.48566555672866074, "grad_norm": 5.686891915170021, "learning_rate": 2.737372297807919e-06, "loss": 0.6947, "step": 5980 }, { "epoch": 0.4857467717047024, "grad_norm": 6.348245493300976, "learning_rate": 2.7367176384692425e-06, "loss": 0.4924, "step": 5981 }, { "epoch": 0.48582798668074395, "grad_norm": 6.0903339297311545, "learning_rate": 2.736062962750957e-06, "loss": 0.5165, "step": 5982 }, { "epoch": 0.4859092016567855, "grad_norm": 11.757378358765155, "learning_rate": 2.735408270698364e-06, "loss": 0.4462, "step": 5983 }, { "epoch": 0.4859904166328271, "grad_norm": 16.683533350645916, "learning_rate": 2.7347535623567656e-06, "loss": 0.6067, "step": 5984 }, { "epoch": 0.4860716316088687, "grad_norm": 7.944296108437706, "learning_rate": 2.734098837771462e-06, "loss": 0.5032, "step": 5985 }, { "epoch": 0.48615284658491026, "grad_norm": 5.943425981392983, "learning_rate": 2.7334440969877584e-06, "loss": 0.5087, "step": 5986 }, { "epoch": 0.48623406156095184, "grad_norm": 3.3987635620526686, "learning_rate": 2.7327893400509586e-06, "loss": 0.5818, "step": 5987 }, { "epoch": 0.4863152765369934, "grad_norm": 6.620696249187532, "learning_rate": 2.732134567006368e-06, "loss": 0.4364, "step": 5988 }, { "epoch": 0.486396491513035, "grad_norm": 5.572056490122121, "learning_rate": 2.731479777899295e-06, "loss": 0.4405, "step": 5989 }, { "epoch": 0.48647770648907657, "grad_norm": 6.354284008710806, "learning_rate": 2.730824972775045e-06, "loss": 0.6067, "step": 5990 }, { "epoch": 0.48655892146511814, "grad_norm": 5.802616735842974, "learning_rate": 2.7301701516789303e-06, "loss": 0.3966, "step": 5991 }, { "epoch": 0.4866401364411598, "grad_norm": 4.459624839521087, "learning_rate": 2.729515314656258e-06, "loss": 0.5996, "step": 5992 }, { "epoch": 0.48672135141720135, "grad_norm": 7.157939161361248, "learning_rate": 2.7288604617523405e-06, "loss": 0.5517, "step": 5993 }, { "epoch": 0.48680256639324293, "grad_norm": 5.292457922278461, "learning_rate": 2.728205593012491e-06, "loss": 0.4596, "step": 5994 }, { "epoch": 0.4868837813692845, "grad_norm": 5.048505829129327, "learning_rate": 2.7275507084820226e-06, "loss": 0.6614, "step": 5995 }, { "epoch": 0.4869649963453261, "grad_norm": 5.999484690389678, "learning_rate": 2.726895808206248e-06, "loss": 0.4645, "step": 5996 }, { "epoch": 0.48704621132136766, "grad_norm": 4.707719844104531, "learning_rate": 2.7262408922304857e-06, "loss": 0.5658, "step": 5997 }, { "epoch": 0.48712742629740924, "grad_norm": 9.302118644783453, "learning_rate": 2.72558596060005e-06, "loss": 0.4844, "step": 5998 }, { "epoch": 0.4872086412734508, "grad_norm": 6.41310409364449, "learning_rate": 2.72493101336026e-06, "loss": 0.4755, "step": 5999 }, { "epoch": 0.4872898562494924, "grad_norm": 7.372592645487124, "learning_rate": 2.7242760505564346e-06, "loss": 0.4443, "step": 6000 }, { "epoch": 0.48737107122553397, "grad_norm": 5.478164117803782, "learning_rate": 2.7236210722338936e-06, "loss": 0.6266, "step": 6001 }, { "epoch": 0.48745228620157555, "grad_norm": 4.451621100299445, "learning_rate": 2.7229660784379575e-06, "loss": 0.6028, "step": 6002 }, { "epoch": 0.4875335011776172, "grad_norm": 4.829301062695959, "learning_rate": 2.7223110692139487e-06, "loss": 0.3843, "step": 6003 }, { "epoch": 0.48761471615365876, "grad_norm": 8.161182349000478, "learning_rate": 2.7216560446071904e-06, "loss": 0.5373, "step": 6004 }, { "epoch": 0.48769593112970033, "grad_norm": 5.075301630204499, "learning_rate": 2.721001004663008e-06, "loss": 0.5209, "step": 6005 }, { "epoch": 0.4877771461057419, "grad_norm": 3.5392282378757094, "learning_rate": 2.7203459494267243e-06, "loss": 0.4714, "step": 6006 }, { "epoch": 0.4878583610817835, "grad_norm": 3.3493451655565147, "learning_rate": 2.719690878943668e-06, "loss": 0.6381, "step": 6007 }, { "epoch": 0.48793957605782506, "grad_norm": 11.279877740153177, "learning_rate": 2.7190357932591653e-06, "loss": 0.5869, "step": 6008 }, { "epoch": 0.48802079103386664, "grad_norm": 15.655656745532893, "learning_rate": 2.7183806924185447e-06, "loss": 0.4589, "step": 6009 }, { "epoch": 0.4881020060099082, "grad_norm": 7.534260268163045, "learning_rate": 2.717725576467136e-06, "loss": 0.5564, "step": 6010 }, { "epoch": 0.4881832209859498, "grad_norm": 8.16602240519616, "learning_rate": 2.71707044545027e-06, "loss": 0.4806, "step": 6011 }, { "epoch": 0.48826443596199137, "grad_norm": 5.065942666535822, "learning_rate": 2.716415299413278e-06, "loss": 0.4948, "step": 6012 }, { "epoch": 0.48834565093803295, "grad_norm": 6.1717583953297455, "learning_rate": 2.7157601384014927e-06, "loss": 0.5663, "step": 6013 }, { "epoch": 0.4884268659140746, "grad_norm": 5.9450854471553125, "learning_rate": 2.7151049624602473e-06, "loss": 0.6468, "step": 6014 }, { "epoch": 0.48850808089011616, "grad_norm": 6.376587879616875, "learning_rate": 2.714449771634877e-06, "loss": 0.4685, "step": 6015 }, { "epoch": 0.48858929586615774, "grad_norm": 4.341090257406891, "learning_rate": 2.713794565970718e-06, "loss": 0.438, "step": 6016 }, { "epoch": 0.4886705108421993, "grad_norm": 4.016157690853844, "learning_rate": 2.7131393455131057e-06, "loss": 0.6089, "step": 6017 }, { "epoch": 0.4887517258182409, "grad_norm": 2.638895965006909, "learning_rate": 2.7124841103073794e-06, "loss": 0.3652, "step": 6018 }, { "epoch": 0.48883294079428247, "grad_norm": 9.501098441788494, "learning_rate": 2.711828860398877e-06, "loss": 0.5685, "step": 6019 }, { "epoch": 0.48891415577032404, "grad_norm": 6.054200225549085, "learning_rate": 2.7111735958329383e-06, "loss": 0.3793, "step": 6020 }, { "epoch": 0.4889953707463656, "grad_norm": 5.616508580837288, "learning_rate": 2.7105183166549048e-06, "loss": 0.547, "step": 6021 }, { "epoch": 0.4890765857224072, "grad_norm": 6.361440708041633, "learning_rate": 2.7098630229101174e-06, "loss": 0.5794, "step": 6022 }, { "epoch": 0.4891578006984488, "grad_norm": 9.543869316221988, "learning_rate": 2.70920771464392e-06, "loss": 0.6857, "step": 6023 }, { "epoch": 0.48923901567449035, "grad_norm": 4.931811725830596, "learning_rate": 2.708552391901656e-06, "loss": 0.6798, "step": 6024 }, { "epoch": 0.489320230650532, "grad_norm": 7.691562606616547, "learning_rate": 2.70789705472867e-06, "loss": 0.6456, "step": 6025 }, { "epoch": 0.48940144562657356, "grad_norm": 3.9609368324412286, "learning_rate": 2.707241703170308e-06, "loss": 0.4363, "step": 6026 }, { "epoch": 0.48948266060261514, "grad_norm": 5.514465718417427, "learning_rate": 2.706586337271917e-06, "loss": 0.4752, "step": 6027 }, { "epoch": 0.4895638755786567, "grad_norm": 4.986568690370059, "learning_rate": 2.705930957078845e-06, "loss": 0.5089, "step": 6028 }, { "epoch": 0.4896450905546983, "grad_norm": 4.59026441450655, "learning_rate": 2.705275562636441e-06, "loss": 0.5348, "step": 6029 }, { "epoch": 0.48972630553073987, "grad_norm": 5.882838457335099, "learning_rate": 2.7046201539900537e-06, "loss": 0.5789, "step": 6030 }, { "epoch": 0.48980752050678145, "grad_norm": 6.273818536620701, "learning_rate": 2.7039647311850347e-06, "loss": 0.5857, "step": 6031 }, { "epoch": 0.489888735482823, "grad_norm": 4.353727277852687, "learning_rate": 2.7033092942667362e-06, "loss": 0.4136, "step": 6032 }, { "epoch": 0.4899699504588646, "grad_norm": 11.49874908610111, "learning_rate": 2.70265384328051e-06, "loss": 0.4196, "step": 6033 }, { "epoch": 0.4900511654349062, "grad_norm": 8.579866589240709, "learning_rate": 2.701998378271711e-06, "loss": 0.4894, "step": 6034 }, { "epoch": 0.49013238041094775, "grad_norm": 25.037657431765965, "learning_rate": 2.7013428992856925e-06, "loss": 0.5139, "step": 6035 }, { "epoch": 0.4902135953869894, "grad_norm": 11.228244377773777, "learning_rate": 2.700687406367812e-06, "loss": 0.6387, "step": 6036 }, { "epoch": 0.49029481036303096, "grad_norm": 4.944749953225672, "learning_rate": 2.700031899563425e-06, "loss": 0.5617, "step": 6037 }, { "epoch": 0.49037602533907254, "grad_norm": 11.318432666073637, "learning_rate": 2.6993763789178885e-06, "loss": 0.6037, "step": 6038 }, { "epoch": 0.4904572403151141, "grad_norm": 4.831316548854263, "learning_rate": 2.698720844476562e-06, "loss": 0.6919, "step": 6039 }, { "epoch": 0.4905384552911557, "grad_norm": 6.464718545938012, "learning_rate": 2.6980652962848055e-06, "loss": 0.5024, "step": 6040 }, { "epoch": 0.49061967026719727, "grad_norm": 8.179746403330494, "learning_rate": 2.697409734387978e-06, "loss": 0.4774, "step": 6041 }, { "epoch": 0.49070088524323885, "grad_norm": 6.753372733627089, "learning_rate": 2.6967541588314413e-06, "loss": 0.4216, "step": 6042 }, { "epoch": 0.4907821002192804, "grad_norm": 3.5059703839980707, "learning_rate": 2.6960985696605583e-06, "loss": 0.5735, "step": 6043 }, { "epoch": 0.490863315195322, "grad_norm": 6.5528726281723335, "learning_rate": 2.695442966920693e-06, "loss": 0.4922, "step": 6044 }, { "epoch": 0.4909445301713636, "grad_norm": 6.2801153498685744, "learning_rate": 2.6947873506572083e-06, "loss": 0.5194, "step": 6045 }, { "epoch": 0.49102574514740516, "grad_norm": 2.729788705777957, "learning_rate": 2.6941317209154694e-06, "loss": 0.7296, "step": 6046 }, { "epoch": 0.4911069601234468, "grad_norm": 5.110271057688403, "learning_rate": 2.693476077740843e-06, "loss": 0.4946, "step": 6047 }, { "epoch": 0.49118817509948837, "grad_norm": 5.133921081919526, "learning_rate": 2.6928204211786957e-06, "loss": 0.5102, "step": 6048 }, { "epoch": 0.49126939007552994, "grad_norm": 4.1327143511126, "learning_rate": 2.6921647512743963e-06, "loss": 0.4642, "step": 6049 }, { "epoch": 0.4913506050515715, "grad_norm": 13.617171548443578, "learning_rate": 2.691509068073313e-06, "loss": 0.356, "step": 6050 }, { "epoch": 0.4914318200276131, "grad_norm": 5.719645297157029, "learning_rate": 2.6908533716208157e-06, "loss": 0.5327, "step": 6051 }, { "epoch": 0.4915130350036547, "grad_norm": 3.9479135140894224, "learning_rate": 2.690197661962275e-06, "loss": 0.6186, "step": 6052 }, { "epoch": 0.49159424997969625, "grad_norm": 7.503761117502973, "learning_rate": 2.6895419391430635e-06, "loss": 0.3574, "step": 6053 }, { "epoch": 0.49167546495573783, "grad_norm": 4.597046243404988, "learning_rate": 2.688886203208552e-06, "loss": 0.4214, "step": 6054 }, { "epoch": 0.4917566799317794, "grad_norm": 19.152422694361487, "learning_rate": 2.6882304542041147e-06, "loss": 0.4696, "step": 6055 }, { "epoch": 0.491837894907821, "grad_norm": 6.515260679020386, "learning_rate": 2.687574692175127e-06, "loss": 0.4959, "step": 6056 }, { "epoch": 0.49191910988386256, "grad_norm": 4.2821690669001224, "learning_rate": 2.6869189171669637e-06, "loss": 0.3626, "step": 6057 }, { "epoch": 0.4920003248599042, "grad_norm": 7.192130700433451, "learning_rate": 2.686263129224999e-06, "loss": 0.5963, "step": 6058 }, { "epoch": 0.49208153983594577, "grad_norm": 7.6817361440233585, "learning_rate": 2.685607328394613e-06, "loss": 0.4168, "step": 6059 }, { "epoch": 0.49216275481198735, "grad_norm": 11.249409008964918, "learning_rate": 2.6849515147211814e-06, "loss": 0.4758, "step": 6060 }, { "epoch": 0.4922439697880289, "grad_norm": 3.9392524421599346, "learning_rate": 2.6842956882500843e-06, "loss": 0.445, "step": 6061 }, { "epoch": 0.4923251847640705, "grad_norm": 9.559526227459175, "learning_rate": 2.6836398490267006e-06, "loss": 0.5492, "step": 6062 }, { "epoch": 0.4924063997401121, "grad_norm": 3.917087406483406, "learning_rate": 2.6829839970964112e-06, "loss": 0.4452, "step": 6063 }, { "epoch": 0.49248761471615365, "grad_norm": 3.7273772258833566, "learning_rate": 2.682328132504598e-06, "loss": 0.4458, "step": 6064 }, { "epoch": 0.49256882969219523, "grad_norm": 4.9038911903534474, "learning_rate": 2.6816722552966423e-06, "loss": 0.5387, "step": 6065 }, { "epoch": 0.4926500446682368, "grad_norm": 5.697167255856347, "learning_rate": 2.6810163655179287e-06, "loss": 0.4933, "step": 6066 }, { "epoch": 0.4927312596442784, "grad_norm": 4.553361726145492, "learning_rate": 2.6803604632138403e-06, "loss": 0.381, "step": 6067 }, { "epoch": 0.49281247462031996, "grad_norm": 4.7949903203120305, "learning_rate": 2.6797045484297624e-06, "loss": 0.5172, "step": 6068 }, { "epoch": 0.4928936895963616, "grad_norm": 20.378872084713798, "learning_rate": 2.6790486212110812e-06, "loss": 0.4133, "step": 6069 }, { "epoch": 0.49297490457240317, "grad_norm": 6.870346383155372, "learning_rate": 2.678392681603183e-06, "loss": 0.4685, "step": 6070 }, { "epoch": 0.49305611954844475, "grad_norm": 4.53706289191984, "learning_rate": 2.6777367296514547e-06, "loss": 0.4881, "step": 6071 }, { "epoch": 0.4931373345244863, "grad_norm": 5.005808865354581, "learning_rate": 2.677080765401286e-06, "loss": 0.5477, "step": 6072 }, { "epoch": 0.4932185495005279, "grad_norm": 8.68217397091714, "learning_rate": 2.6764247888980654e-06, "loss": 0.4074, "step": 6073 }, { "epoch": 0.4932997644765695, "grad_norm": 6.058970447210371, "learning_rate": 2.675768800187182e-06, "loss": 0.5763, "step": 6074 }, { "epoch": 0.49338097945261106, "grad_norm": 2.9831711571626522, "learning_rate": 2.67511279931403e-06, "loss": 0.6046, "step": 6075 }, { "epoch": 0.49346219442865263, "grad_norm": 3.841362882001304, "learning_rate": 2.674456786323998e-06, "loss": 0.505, "step": 6076 }, { "epoch": 0.4935434094046942, "grad_norm": 5.296826365325529, "learning_rate": 2.6738007612624792e-06, "loss": 0.5466, "step": 6077 }, { "epoch": 0.4936246243807358, "grad_norm": 4.678775568920299, "learning_rate": 2.673144724174868e-06, "loss": 0.5204, "step": 6078 }, { "epoch": 0.49370583935677737, "grad_norm": 4.857667826957571, "learning_rate": 2.6724886751065584e-06, "loss": 0.5329, "step": 6079 }, { "epoch": 0.493787054332819, "grad_norm": 5.592492702134483, "learning_rate": 2.671832614102945e-06, "loss": 0.529, "step": 6080 }, { "epoch": 0.4938682693088606, "grad_norm": 61.655374457513666, "learning_rate": 2.671176541209424e-06, "loss": 0.5004, "step": 6081 }, { "epoch": 0.49394948428490215, "grad_norm": 6.490521162736462, "learning_rate": 2.6705204564713927e-06, "loss": 0.4508, "step": 6082 }, { "epoch": 0.49403069926094373, "grad_norm": 6.042977369262763, "learning_rate": 2.669864359934249e-06, "loss": 0.4754, "step": 6083 }, { "epoch": 0.4941119142369853, "grad_norm": 7.550162317594159, "learning_rate": 2.6692082516433886e-06, "loss": 0.4356, "step": 6084 }, { "epoch": 0.4941931292130269, "grad_norm": 3.954161538122901, "learning_rate": 2.668552131644214e-06, "loss": 0.6147, "step": 6085 }, { "epoch": 0.49427434418906846, "grad_norm": 8.837611857542369, "learning_rate": 2.667895999982124e-06, "loss": 0.4406, "step": 6086 }, { "epoch": 0.49435555916511004, "grad_norm": 5.060653912888538, "learning_rate": 2.6672398567025188e-06, "loss": 0.4472, "step": 6087 }, { "epoch": 0.4944367741411516, "grad_norm": 9.229957236777977, "learning_rate": 2.666583701850802e-06, "loss": 0.507, "step": 6088 }, { "epoch": 0.4945179891171932, "grad_norm": 4.035963295981821, "learning_rate": 2.6659275354723735e-06, "loss": 0.4867, "step": 6089 }, { "epoch": 0.49459920409323477, "grad_norm": 8.351371921914652, "learning_rate": 2.6652713576126376e-06, "loss": 0.6755, "step": 6090 }, { "epoch": 0.4946804190692764, "grad_norm": 5.78678933022576, "learning_rate": 2.6646151683169985e-06, "loss": 0.6439, "step": 6091 }, { "epoch": 0.494761634045318, "grad_norm": 4.566506684904742, "learning_rate": 2.6639589676308614e-06, "loss": 0.7211, "step": 6092 }, { "epoch": 0.49484284902135955, "grad_norm": 7.609049634242752, "learning_rate": 2.663302755599631e-06, "loss": 0.6008, "step": 6093 }, { "epoch": 0.49492406399740113, "grad_norm": 3.3025767017322893, "learning_rate": 2.6626465322687144e-06, "loss": 0.5058, "step": 6094 }, { "epoch": 0.4950052789734427, "grad_norm": 5.250107319379549, "learning_rate": 2.6619902976835187e-06, "loss": 0.3995, "step": 6095 }, { "epoch": 0.4950864939494843, "grad_norm": 4.264694509387339, "learning_rate": 2.6613340518894513e-06, "loss": 0.4428, "step": 6096 }, { "epoch": 0.49516770892552586, "grad_norm": 9.523848099068603, "learning_rate": 2.6606777949319217e-06, "loss": 0.4568, "step": 6097 }, { "epoch": 0.49524892390156744, "grad_norm": 4.645197872989053, "learning_rate": 2.6600215268563396e-06, "loss": 0.7066, "step": 6098 }, { "epoch": 0.495330138877609, "grad_norm": 6.723227949020278, "learning_rate": 2.6593652477081146e-06, "loss": 0.5934, "step": 6099 }, { "epoch": 0.4954113538536506, "grad_norm": 6.836596280031364, "learning_rate": 2.658708957532657e-06, "loss": 0.4197, "step": 6100 }, { "epoch": 0.49549256882969217, "grad_norm": 5.530454100104951, "learning_rate": 2.6580526563753794e-06, "loss": 0.5521, "step": 6101 }, { "epoch": 0.4955737838057338, "grad_norm": 8.014803649260598, "learning_rate": 2.6573963442816957e-06, "loss": 0.4716, "step": 6102 }, { "epoch": 0.4956549987817754, "grad_norm": 5.651578109021286, "learning_rate": 2.656740021297017e-06, "loss": 0.6337, "step": 6103 }, { "epoch": 0.49573621375781696, "grad_norm": 4.497707754410902, "learning_rate": 2.6560836874667584e-06, "loss": 0.4835, "step": 6104 }, { "epoch": 0.49581742873385853, "grad_norm": 6.748930438264841, "learning_rate": 2.6554273428363352e-06, "loss": 0.4341, "step": 6105 }, { "epoch": 0.4958986437099001, "grad_norm": 4.813991480179138, "learning_rate": 2.6547709874511622e-06, "loss": 0.4681, "step": 6106 }, { "epoch": 0.4959798586859417, "grad_norm": 4.661897396362085, "learning_rate": 2.654114621356656e-06, "loss": 0.4306, "step": 6107 }, { "epoch": 0.49606107366198327, "grad_norm": 6.397578426337775, "learning_rate": 2.6534582445982338e-06, "loss": 0.4603, "step": 6108 }, { "epoch": 0.49614228863802484, "grad_norm": 4.315053588094933, "learning_rate": 2.6528018572213133e-06, "loss": 0.6069, "step": 6109 }, { "epoch": 0.4962235036140664, "grad_norm": 3.5982681139239876, "learning_rate": 2.6521454592713125e-06, "loss": 0.4975, "step": 6110 }, { "epoch": 0.496304718590108, "grad_norm": 4.778399998810193, "learning_rate": 2.6514890507936515e-06, "loss": 0.4753, "step": 6111 }, { "epoch": 0.4963859335661496, "grad_norm": 5.875526293696564, "learning_rate": 2.6508326318337498e-06, "loss": 0.6111, "step": 6112 }, { "epoch": 0.4964671485421912, "grad_norm": 4.292823551355768, "learning_rate": 2.6501762024370283e-06, "loss": 0.3784, "step": 6113 }, { "epoch": 0.4965483635182328, "grad_norm": 8.401173587655329, "learning_rate": 2.6495197626489082e-06, "loss": 0.582, "step": 6114 }, { "epoch": 0.49662957849427436, "grad_norm": 4.321012772875746, "learning_rate": 2.6488633125148127e-06, "loss": 0.4502, "step": 6115 }, { "epoch": 0.49671079347031594, "grad_norm": 5.379767516829725, "learning_rate": 2.6482068520801625e-06, "loss": 0.5668, "step": 6116 }, { "epoch": 0.4967920084463575, "grad_norm": 5.218798786070938, "learning_rate": 2.647550381390383e-06, "loss": 0.4885, "step": 6117 }, { "epoch": 0.4968732234223991, "grad_norm": 6.280161135864912, "learning_rate": 2.6468939004908987e-06, "loss": 0.5571, "step": 6118 }, { "epoch": 0.49695443839844067, "grad_norm": 6.147745243094484, "learning_rate": 2.646237409427133e-06, "loss": 0.6213, "step": 6119 }, { "epoch": 0.49703565337448224, "grad_norm": 4.237997643901119, "learning_rate": 2.645580908244513e-06, "loss": 0.5256, "step": 6120 }, { "epoch": 0.4971168683505238, "grad_norm": 5.794983422357201, "learning_rate": 2.644924396988465e-06, "loss": 0.4717, "step": 6121 }, { "epoch": 0.4971980833265654, "grad_norm": 3.028956483672569, "learning_rate": 2.644267875704415e-06, "loss": 0.605, "step": 6122 }, { "epoch": 0.497279298302607, "grad_norm": 4.783447111346257, "learning_rate": 2.6436113444377916e-06, "loss": 0.3828, "step": 6123 }, { "epoch": 0.4973605132786486, "grad_norm": 4.634896048686429, "learning_rate": 2.6429548032340233e-06, "loss": 0.4624, "step": 6124 }, { "epoch": 0.4974417282546902, "grad_norm": 4.4180065168228575, "learning_rate": 2.642298252138539e-06, "loss": 0.471, "step": 6125 }, { "epoch": 0.49752294323073176, "grad_norm": 7.029597857740861, "learning_rate": 2.641641691196769e-06, "loss": 0.3827, "step": 6126 }, { "epoch": 0.49760415820677334, "grad_norm": 3.5588276162139922, "learning_rate": 2.6409851204541435e-06, "loss": 0.634, "step": 6127 }, { "epoch": 0.4976853731828149, "grad_norm": 5.229212123221044, "learning_rate": 2.640328539956094e-06, "loss": 0.4562, "step": 6128 }, { "epoch": 0.4977665881588565, "grad_norm": 4.266180122295215, "learning_rate": 2.639671949748052e-06, "loss": 0.5074, "step": 6129 }, { "epoch": 0.49784780313489807, "grad_norm": 5.993522414034609, "learning_rate": 2.6390153498754506e-06, "loss": 0.5987, "step": 6130 }, { "epoch": 0.49792901811093965, "grad_norm": 5.432626385109642, "learning_rate": 2.638358740383723e-06, "loss": 0.5943, "step": 6131 }, { "epoch": 0.4980102330869812, "grad_norm": 7.744059010319982, "learning_rate": 2.637702121318302e-06, "loss": 0.4533, "step": 6132 }, { "epoch": 0.4980914480630228, "grad_norm": 6.471183582152214, "learning_rate": 2.6370454927246237e-06, "loss": 0.4274, "step": 6133 }, { "epoch": 0.4981726630390644, "grad_norm": 3.457874851254734, "learning_rate": 2.6363888546481224e-06, "loss": 0.4676, "step": 6134 }, { "epoch": 0.498253878015106, "grad_norm": 8.481196069942994, "learning_rate": 2.635732207134234e-06, "loss": 0.4627, "step": 6135 }, { "epoch": 0.4983350929911476, "grad_norm": 5.660075404955752, "learning_rate": 2.635075550228395e-06, "loss": 0.4784, "step": 6136 }, { "epoch": 0.49841630796718916, "grad_norm": 4.313279838626725, "learning_rate": 2.634418883976043e-06, "loss": 0.4569, "step": 6137 }, { "epoch": 0.49849752294323074, "grad_norm": 3.362477204863664, "learning_rate": 2.6337622084226163e-06, "loss": 0.5727, "step": 6138 }, { "epoch": 0.4985787379192723, "grad_norm": 4.485740579944916, "learning_rate": 2.633105523613551e-06, "loss": 0.7045, "step": 6139 }, { "epoch": 0.4986599528953139, "grad_norm": 3.989314569645018, "learning_rate": 2.6324488295942897e-06, "loss": 0.693, "step": 6140 }, { "epoch": 0.4987411678713555, "grad_norm": 6.808212058806507, "learning_rate": 2.6317921264102697e-06, "loss": 0.4904, "step": 6141 }, { "epoch": 0.49882238284739705, "grad_norm": 4.01346195857619, "learning_rate": 2.6311354141069324e-06, "loss": 0.6021, "step": 6142 }, { "epoch": 0.4989035978234386, "grad_norm": 5.471950249286309, "learning_rate": 2.630478692729718e-06, "loss": 0.5009, "step": 6143 }, { "epoch": 0.4989848127994802, "grad_norm": 4.820722399207098, "learning_rate": 2.6298219623240685e-06, "loss": 0.3804, "step": 6144 }, { "epoch": 0.4990660277755218, "grad_norm": 3.164263910898727, "learning_rate": 2.6291652229354264e-06, "loss": 0.4665, "step": 6145 }, { "epoch": 0.4991472427515634, "grad_norm": 5.584269076148355, "learning_rate": 2.6285084746092347e-06, "loss": 0.5254, "step": 6146 }, { "epoch": 0.499228457727605, "grad_norm": 10.9678056140828, "learning_rate": 2.627851717390936e-06, "loss": 0.4869, "step": 6147 }, { "epoch": 0.49930967270364657, "grad_norm": 6.984083124060342, "learning_rate": 2.6271949513259764e-06, "loss": 0.5502, "step": 6148 }, { "epoch": 0.49939088767968814, "grad_norm": 4.28910430937024, "learning_rate": 2.626538176459798e-06, "loss": 0.5626, "step": 6149 }, { "epoch": 0.4994721026557297, "grad_norm": 5.809158832217803, "learning_rate": 2.625881392837849e-06, "loss": 0.5614, "step": 6150 }, { "epoch": 0.4995533176317713, "grad_norm": 4.895799235690263, "learning_rate": 2.6252246005055725e-06, "loss": 0.5047, "step": 6151 }, { "epoch": 0.4996345326078129, "grad_norm": 3.584830115023326, "learning_rate": 2.6245677995084163e-06, "loss": 0.555, "step": 6152 }, { "epoch": 0.49971574758385445, "grad_norm": 10.564765809267739, "learning_rate": 2.6239109898918286e-06, "loss": 0.4141, "step": 6153 }, { "epoch": 0.49979696255989603, "grad_norm": 3.816580959787303, "learning_rate": 2.6232541717012563e-06, "loss": 0.5562, "step": 6154 }, { "epoch": 0.4998781775359376, "grad_norm": 7.68222211219942, "learning_rate": 2.6225973449821468e-06, "loss": 0.4434, "step": 6155 }, { "epoch": 0.4999593925119792, "grad_norm": 4.677405643402403, "learning_rate": 2.6219405097799498e-06, "loss": 0.67, "step": 6156 }, { "epoch": 0.5000406074880208, "grad_norm": 6.735859677395967, "learning_rate": 2.6212836661401154e-06, "loss": 0.5044, "step": 6157 }, { "epoch": 0.5001218224640623, "grad_norm": 5.559869392207337, "learning_rate": 2.6206268141080924e-06, "loss": 0.4942, "step": 6158 }, { "epoch": 0.500203037440104, "grad_norm": 5.91589064626228, "learning_rate": 2.619969953729333e-06, "loss": 0.5946, "step": 6159 }, { "epoch": 0.5002842524161455, "grad_norm": 9.884104436935436, "learning_rate": 2.6193130850492876e-06, "loss": 0.6033, "step": 6160 }, { "epoch": 0.5003654673921871, "grad_norm": 5.191171578581443, "learning_rate": 2.618656208113408e-06, "loss": 0.4999, "step": 6161 }, { "epoch": 0.5004466823682288, "grad_norm": 4.869819153010546, "learning_rate": 2.6179993229671473e-06, "loss": 0.5341, "step": 6162 }, { "epoch": 0.5005278973442703, "grad_norm": 4.257762076221053, "learning_rate": 2.6173424296559575e-06, "loss": 0.5408, "step": 6163 }, { "epoch": 0.5006091123203119, "grad_norm": 4.96043740363745, "learning_rate": 2.6166855282252933e-06, "loss": 0.5863, "step": 6164 }, { "epoch": 0.5006903272963534, "grad_norm": 4.664873730144854, "learning_rate": 2.616028618720607e-06, "loss": 0.4936, "step": 6165 }, { "epoch": 0.5007715422723951, "grad_norm": 5.757723223582411, "learning_rate": 2.615371701187355e-06, "loss": 0.5604, "step": 6166 }, { "epoch": 0.5008527572484366, "grad_norm": 5.55920229153415, "learning_rate": 2.6147147756709925e-06, "loss": 0.5215, "step": 6167 }, { "epoch": 0.5009339722244782, "grad_norm": 3.916022531924741, "learning_rate": 2.614057842216973e-06, "loss": 0.6314, "step": 6168 }, { "epoch": 0.5010151872005197, "grad_norm": 3.748268469101728, "learning_rate": 2.6134009008707555e-06, "loss": 0.5058, "step": 6169 }, { "epoch": 0.5010964021765614, "grad_norm": 3.866469109499023, "learning_rate": 2.6127439516777956e-06, "loss": 0.6175, "step": 6170 }, { "epoch": 0.5011776171526029, "grad_norm": 9.784917821882507, "learning_rate": 2.6120869946835513e-06, "loss": 0.4369, "step": 6171 }, { "epoch": 0.5012588321286445, "grad_norm": 3.918198159806199, "learning_rate": 2.61143002993348e-06, "loss": 0.5966, "step": 6172 }, { "epoch": 0.5013400471046862, "grad_norm": 9.939958648361882, "learning_rate": 2.61077305747304e-06, "loss": 0.5353, "step": 6173 }, { "epoch": 0.5014212620807277, "grad_norm": 6.322410360361211, "learning_rate": 2.610116077347691e-06, "loss": 0.4513, "step": 6174 }, { "epoch": 0.5015024770567693, "grad_norm": 6.622717663567328, "learning_rate": 2.609459089602892e-06, "loss": 0.3957, "step": 6175 }, { "epoch": 0.5015836920328108, "grad_norm": 8.064816332776429, "learning_rate": 2.6088020942841034e-06, "loss": 0.3698, "step": 6176 }, { "epoch": 0.5016649070088525, "grad_norm": 7.086132865620177, "learning_rate": 2.6081450914367864e-06, "loss": 0.5031, "step": 6177 }, { "epoch": 0.501746121984894, "grad_norm": 3.7954526083234077, "learning_rate": 2.6074880811064003e-06, "loss": 0.5348, "step": 6178 }, { "epoch": 0.5018273369609356, "grad_norm": 5.020997847466201, "learning_rate": 2.606831063338408e-06, "loss": 0.4605, "step": 6179 }, { "epoch": 0.5019085519369771, "grad_norm": 4.560830832766969, "learning_rate": 2.6061740381782723e-06, "loss": 0.5455, "step": 6180 }, { "epoch": 0.5019897669130188, "grad_norm": 4.229120124206621, "learning_rate": 2.605517005671454e-06, "loss": 0.5157, "step": 6181 }, { "epoch": 0.5020709818890603, "grad_norm": 4.581886089334441, "learning_rate": 2.604859965863418e-06, "loss": 0.4667, "step": 6182 }, { "epoch": 0.5021521968651019, "grad_norm": 4.4186999593426055, "learning_rate": 2.6042029187996277e-06, "loss": 0.5935, "step": 6183 }, { "epoch": 0.5022334118411436, "grad_norm": 4.736522021215294, "learning_rate": 2.6035458645255467e-06, "loss": 0.5252, "step": 6184 }, { "epoch": 0.5023146268171851, "grad_norm": 4.668585962104124, "learning_rate": 2.602888803086639e-06, "loss": 0.5201, "step": 6185 }, { "epoch": 0.5023958417932267, "grad_norm": 9.66768445142635, "learning_rate": 2.602231734528372e-06, "loss": 0.4524, "step": 6186 }, { "epoch": 0.5024770567692682, "grad_norm": 4.15432156915695, "learning_rate": 2.601574658896209e-06, "loss": 0.6911, "step": 6187 }, { "epoch": 0.5025582717453099, "grad_norm": 5.036750129329255, "learning_rate": 2.6009175762356176e-06, "loss": 0.587, "step": 6188 }, { "epoch": 0.5026394867213514, "grad_norm": 9.244386485882309, "learning_rate": 2.6002604865920645e-06, "loss": 0.5369, "step": 6189 }, { "epoch": 0.502720701697393, "grad_norm": 4.712476893869534, "learning_rate": 2.5996033900110155e-06, "loss": 0.6125, "step": 6190 }, { "epoch": 0.5028019166734345, "grad_norm": 7.034161631409917, "learning_rate": 2.5989462865379394e-06, "loss": 0.6711, "step": 6191 }, { "epoch": 0.5028831316494762, "grad_norm": 2.537027536386944, "learning_rate": 2.598289176218304e-06, "loss": 0.5937, "step": 6192 }, { "epoch": 0.5029643466255177, "grad_norm": 3.394185226747455, "learning_rate": 2.597632059097577e-06, "loss": 0.6653, "step": 6193 }, { "epoch": 0.5030455616015593, "grad_norm": 4.376129626575339, "learning_rate": 2.5969749352212294e-06, "loss": 0.6337, "step": 6194 }, { "epoch": 0.503126776577601, "grad_norm": 4.748486742998477, "learning_rate": 2.5963178046347286e-06, "loss": 0.4952, "step": 6195 }, { "epoch": 0.5032079915536425, "grad_norm": 7.369302442783826, "learning_rate": 2.595660667383547e-06, "loss": 0.3893, "step": 6196 }, { "epoch": 0.5032892065296841, "grad_norm": 5.509513918719035, "learning_rate": 2.5950035235131515e-06, "loss": 0.5277, "step": 6197 }, { "epoch": 0.5033704215057256, "grad_norm": 5.829645133957113, "learning_rate": 2.594346373069016e-06, "loss": 0.4237, "step": 6198 }, { "epoch": 0.5034516364817673, "grad_norm": 5.626826411794362, "learning_rate": 2.593689216096611e-06, "loss": 0.4906, "step": 6199 }, { "epoch": 0.5035328514578088, "grad_norm": 6.977550708147046, "learning_rate": 2.5930320526414083e-06, "loss": 0.548, "step": 6200 }, { "epoch": 0.5036140664338504, "grad_norm": 7.209561596520351, "learning_rate": 2.592374882748879e-06, "loss": 0.4672, "step": 6201 }, { "epoch": 0.503695281409892, "grad_norm": 5.543565192935603, "learning_rate": 2.5917177064644974e-06, "loss": 0.5535, "step": 6202 }, { "epoch": 0.5037764963859336, "grad_norm": 5.366571236178294, "learning_rate": 2.5910605238337355e-06, "loss": 0.6972, "step": 6203 }, { "epoch": 0.5038577113619751, "grad_norm": 6.261566607697583, "learning_rate": 2.5904033349020675e-06, "loss": 0.4503, "step": 6204 }, { "epoch": 0.5039389263380167, "grad_norm": 5.805760768553691, "learning_rate": 2.589746139714967e-06, "loss": 0.4825, "step": 6205 }, { "epoch": 0.5040201413140584, "grad_norm": 8.328312111239873, "learning_rate": 2.5890889383179086e-06, "loss": 0.4764, "step": 6206 }, { "epoch": 0.5041013562900999, "grad_norm": 4.810122399520198, "learning_rate": 2.588431730756367e-06, "loss": 0.5084, "step": 6207 }, { "epoch": 0.5041825712661415, "grad_norm": 3.7065509135197545, "learning_rate": 2.5877745170758177e-06, "loss": 0.524, "step": 6208 }, { "epoch": 0.504263786242183, "grad_norm": 5.121308484330212, "learning_rate": 2.5871172973217367e-06, "loss": 0.5104, "step": 6209 }, { "epoch": 0.5043450012182247, "grad_norm": 5.928958710988731, "learning_rate": 2.5864600715396e-06, "loss": 0.5344, "step": 6210 }, { "epoch": 0.5044262161942662, "grad_norm": 5.887834972208363, "learning_rate": 2.585802839774883e-06, "loss": 0.4809, "step": 6211 }, { "epoch": 0.5045074311703078, "grad_norm": 4.367368393282637, "learning_rate": 2.5851456020730643e-06, "loss": 0.6071, "step": 6212 }, { "epoch": 0.5045886461463494, "grad_norm": 3.4461457206000374, "learning_rate": 2.584488358479621e-06, "loss": 0.6266, "step": 6213 }, { "epoch": 0.504669861122391, "grad_norm": 3.1291003154252004, "learning_rate": 2.5838311090400293e-06, "loss": 0.5782, "step": 6214 }, { "epoch": 0.5047510760984325, "grad_norm": 9.218149230525775, "learning_rate": 2.58317385379977e-06, "loss": 0.395, "step": 6215 }, { "epoch": 0.5048322910744741, "grad_norm": 7.375352110089412, "learning_rate": 2.582516592804319e-06, "loss": 0.4645, "step": 6216 }, { "epoch": 0.5049135060505158, "grad_norm": 6.15235163146789, "learning_rate": 2.5818593260991565e-06, "loss": 0.514, "step": 6217 }, { "epoch": 0.5049947210265573, "grad_norm": 16.29462482704463, "learning_rate": 2.581202053729762e-06, "loss": 0.5103, "step": 6218 }, { "epoch": 0.5050759360025989, "grad_norm": 4.3058949775554884, "learning_rate": 2.580544775741616e-06, "loss": 0.5023, "step": 6219 }, { "epoch": 0.5051571509786404, "grad_norm": 3.2866867580977606, "learning_rate": 2.579887492180197e-06, "loss": 0.6187, "step": 6220 }, { "epoch": 0.5052383659546821, "grad_norm": 5.219156744499874, "learning_rate": 2.579230203090986e-06, "loss": 0.3933, "step": 6221 }, { "epoch": 0.5053195809307236, "grad_norm": 4.147926897905404, "learning_rate": 2.578572908519465e-06, "loss": 0.506, "step": 6222 }, { "epoch": 0.5054007959067652, "grad_norm": 4.123405600563359, "learning_rate": 2.577915608511114e-06, "loss": 0.5677, "step": 6223 }, { "epoch": 0.5054820108828068, "grad_norm": 7.522333120828346, "learning_rate": 2.5772583031114157e-06, "loss": 0.4562, "step": 6224 }, { "epoch": 0.5055632258588484, "grad_norm": 6.046806353687886, "learning_rate": 2.5766009923658516e-06, "loss": 0.3849, "step": 6225 }, { "epoch": 0.5056444408348899, "grad_norm": 4.707865925528092, "learning_rate": 2.5759436763199047e-06, "loss": 0.4868, "step": 6226 }, { "epoch": 0.5057256558109315, "grad_norm": 4.715913190140127, "learning_rate": 2.575286355019056e-06, "loss": 0.587, "step": 6227 }, { "epoch": 0.5058068707869732, "grad_norm": 3.2497965234461264, "learning_rate": 2.5746290285087912e-06, "loss": 0.7003, "step": 6228 }, { "epoch": 0.5058880857630147, "grad_norm": 3.280084946712466, "learning_rate": 2.5739716968345922e-06, "loss": 0.6808, "step": 6229 }, { "epoch": 0.5059693007390563, "grad_norm": 5.724019303106518, "learning_rate": 2.573314360041943e-06, "loss": 0.4032, "step": 6230 }, { "epoch": 0.5060505157150978, "grad_norm": 4.723214741303482, "learning_rate": 2.5726570181763286e-06, "loss": 0.4832, "step": 6231 }, { "epoch": 0.5061317306911395, "grad_norm": 3.7300991619481363, "learning_rate": 2.571999671283233e-06, "loss": 0.4762, "step": 6232 }, { "epoch": 0.506212945667181, "grad_norm": 4.889315053935509, "learning_rate": 2.5713423194081404e-06, "loss": 0.5565, "step": 6233 }, { "epoch": 0.5062941606432226, "grad_norm": 7.67062134201034, "learning_rate": 2.570684962596538e-06, "loss": 0.5569, "step": 6234 }, { "epoch": 0.5063753756192642, "grad_norm": 4.100525163986654, "learning_rate": 2.5700276008939096e-06, "loss": 0.4171, "step": 6235 }, { "epoch": 0.5064565905953058, "grad_norm": 4.883509578878357, "learning_rate": 2.569370234345742e-06, "loss": 0.4534, "step": 6236 }, { "epoch": 0.5065378055713473, "grad_norm": 17.999174929712392, "learning_rate": 2.568712862997522e-06, "loss": 0.4791, "step": 6237 }, { "epoch": 0.5066190205473889, "grad_norm": 3.501500494032202, "learning_rate": 2.5680554868947346e-06, "loss": 0.3923, "step": 6238 }, { "epoch": 0.5067002355234306, "grad_norm": 5.490863237584197, "learning_rate": 2.5673981060828672e-06, "loss": 0.557, "step": 6239 }, { "epoch": 0.5067814504994721, "grad_norm": 4.017635335289608, "learning_rate": 2.5667407206074084e-06, "loss": 0.4779, "step": 6240 }, { "epoch": 0.5068626654755137, "grad_norm": 5.193790698559395, "learning_rate": 2.566083330513845e-06, "loss": 0.5404, "step": 6241 }, { "epoch": 0.5069438804515553, "grad_norm": 3.781621395266743, "learning_rate": 2.565425935847665e-06, "loss": 0.5054, "step": 6242 }, { "epoch": 0.5070250954275969, "grad_norm": 6.125227421255686, "learning_rate": 2.564768536654356e-06, "loss": 0.565, "step": 6243 }, { "epoch": 0.5071063104036384, "grad_norm": 5.159395575524112, "learning_rate": 2.564111132979407e-06, "loss": 0.5639, "step": 6244 }, { "epoch": 0.50718752537968, "grad_norm": 5.800011134638956, "learning_rate": 2.563453724868308e-06, "loss": 0.4247, "step": 6245 }, { "epoch": 0.5072687403557216, "grad_norm": 3.2153805898476606, "learning_rate": 2.5627963123665455e-06, "loss": 0.4201, "step": 6246 }, { "epoch": 0.5073499553317632, "grad_norm": 3.5092901295905405, "learning_rate": 2.5621388955196113e-06, "loss": 0.472, "step": 6247 }, { "epoch": 0.5074311703078047, "grad_norm": 8.695984681785939, "learning_rate": 2.561481474372995e-06, "loss": 0.4915, "step": 6248 }, { "epoch": 0.5075123852838463, "grad_norm": 4.710656984668509, "learning_rate": 2.560824048972185e-06, "loss": 0.4936, "step": 6249 }, { "epoch": 0.507593600259888, "grad_norm": 7.293645049422129, "learning_rate": 2.5601666193626735e-06, "loss": 0.6042, "step": 6250 }, { "epoch": 0.5076748152359295, "grad_norm": 5.2621850945387285, "learning_rate": 2.55950918558995e-06, "loss": 0.443, "step": 6251 }, { "epoch": 0.5077560302119711, "grad_norm": 3.4152519821072578, "learning_rate": 2.558851747699506e-06, "loss": 0.697, "step": 6252 }, { "epoch": 0.5078372451880127, "grad_norm": 5.202402703658923, "learning_rate": 2.5581943057368317e-06, "loss": 0.4773, "step": 6253 }, { "epoch": 0.5079184601640543, "grad_norm": 7.606949152034, "learning_rate": 2.5575368597474202e-06, "loss": 0.3156, "step": 6254 }, { "epoch": 0.5079996751400958, "grad_norm": 3.929776691209423, "learning_rate": 2.5568794097767624e-06, "loss": 0.4507, "step": 6255 }, { "epoch": 0.5080808901161374, "grad_norm": 6.335298963383283, "learning_rate": 2.5562219558703504e-06, "loss": 0.6335, "step": 6256 }, { "epoch": 0.508162105092179, "grad_norm": 7.8562095782475865, "learning_rate": 2.555564498073677e-06, "loss": 0.5108, "step": 6257 }, { "epoch": 0.5082433200682206, "grad_norm": 4.8650466475894145, "learning_rate": 2.554907036432235e-06, "loss": 0.5093, "step": 6258 }, { "epoch": 0.5083245350442621, "grad_norm": 5.229859161415607, "learning_rate": 2.554249570991515e-06, "loss": 0.5999, "step": 6259 }, { "epoch": 0.5084057500203037, "grad_norm": 4.359502656532959, "learning_rate": 2.5535921017970123e-06, "loss": 0.435, "step": 6260 }, { "epoch": 0.5084869649963454, "grad_norm": 15.539251038374521, "learning_rate": 2.5529346288942203e-06, "loss": 0.4546, "step": 6261 }, { "epoch": 0.5085681799723869, "grad_norm": 4.359760496582075, "learning_rate": 2.5522771523286317e-06, "loss": 0.4273, "step": 6262 }, { "epoch": 0.5086493949484285, "grad_norm": 5.6985722122365114, "learning_rate": 2.551619672145741e-06, "loss": 0.5175, "step": 6263 }, { "epoch": 0.5087306099244701, "grad_norm": 4.451602003719397, "learning_rate": 2.5509621883910424e-06, "loss": 0.5247, "step": 6264 }, { "epoch": 0.5088118249005117, "grad_norm": 6.12534073894256, "learning_rate": 2.55030470111003e-06, "loss": 0.4593, "step": 6265 }, { "epoch": 0.5088930398765532, "grad_norm": 5.010881145167957, "learning_rate": 2.5496472103481984e-06, "loss": 0.523, "step": 6266 }, { "epoch": 0.5089742548525948, "grad_norm": 6.63962229868484, "learning_rate": 2.5489897161510425e-06, "loss": 0.4728, "step": 6267 }, { "epoch": 0.5090554698286364, "grad_norm": 4.6745969311887485, "learning_rate": 2.5483322185640575e-06, "loss": 0.5094, "step": 6268 }, { "epoch": 0.509136684804678, "grad_norm": 7.671233857618601, "learning_rate": 2.547674717632739e-06, "loss": 0.5129, "step": 6269 }, { "epoch": 0.5092178997807195, "grad_norm": 4.60865861862504, "learning_rate": 2.547017213402582e-06, "loss": 0.4483, "step": 6270 }, { "epoch": 0.5092991147567612, "grad_norm": 6.2621430489481265, "learning_rate": 2.546359705919083e-06, "loss": 0.4052, "step": 6271 }, { "epoch": 0.5093803297328028, "grad_norm": 5.501701524912082, "learning_rate": 2.545702195227737e-06, "loss": 0.504, "step": 6272 }, { "epoch": 0.5094615447088443, "grad_norm": 9.43685395208808, "learning_rate": 2.545044681374042e-06, "loss": 0.7419, "step": 6273 }, { "epoch": 0.5095427596848859, "grad_norm": 4.655667182094343, "learning_rate": 2.544387164403493e-06, "loss": 0.6057, "step": 6274 }, { "epoch": 0.5096239746609275, "grad_norm": 6.207757916402813, "learning_rate": 2.543729644361587e-06, "loss": 0.3979, "step": 6275 }, { "epoch": 0.5097051896369691, "grad_norm": 6.082343128329472, "learning_rate": 2.5430721212938216e-06, "loss": 0.4088, "step": 6276 }, { "epoch": 0.5097864046130106, "grad_norm": 4.720976085216124, "learning_rate": 2.542414595245693e-06, "loss": 0.4561, "step": 6277 }, { "epoch": 0.5098676195890522, "grad_norm": 4.73426813490108, "learning_rate": 2.541757066262699e-06, "loss": 0.6201, "step": 6278 }, { "epoch": 0.5099488345650938, "grad_norm": 3.383613942551711, "learning_rate": 2.541099534390336e-06, "loss": 0.4821, "step": 6279 }, { "epoch": 0.5100300495411354, "grad_norm": 8.433408631314988, "learning_rate": 2.5404419996741042e-06, "loss": 0.4494, "step": 6280 }, { "epoch": 0.5101112645171769, "grad_norm": 5.815113828274007, "learning_rate": 2.5397844621594997e-06, "loss": 0.5385, "step": 6281 }, { "epoch": 0.5101924794932186, "grad_norm": 4.687103952333867, "learning_rate": 2.5391269218920202e-06, "loss": 0.4344, "step": 6282 }, { "epoch": 0.5102736944692602, "grad_norm": 5.089752144632809, "learning_rate": 2.5384693789171656e-06, "loss": 0.4559, "step": 6283 }, { "epoch": 0.5103549094453017, "grad_norm": 5.854876777674836, "learning_rate": 2.537811833280433e-06, "loss": 0.5045, "step": 6284 }, { "epoch": 0.5104361244213433, "grad_norm": 4.799615858367655, "learning_rate": 2.5371542850273224e-06, "loss": 0.4898, "step": 6285 }, { "epoch": 0.5105173393973849, "grad_norm": 5.42400764526346, "learning_rate": 2.5364967342033307e-06, "loss": 0.6302, "step": 6286 }, { "epoch": 0.5105985543734265, "grad_norm": 4.933143417479578, "learning_rate": 2.5358391808539597e-06, "loss": 0.4977, "step": 6287 }, { "epoch": 0.510679769349468, "grad_norm": 3.4223345719076113, "learning_rate": 2.535181625024706e-06, "loss": 0.4809, "step": 6288 }, { "epoch": 0.5107609843255096, "grad_norm": 9.061912369576826, "learning_rate": 2.53452406676107e-06, "loss": 0.3358, "step": 6289 }, { "epoch": 0.5108421993015512, "grad_norm": 10.525328047917656, "learning_rate": 2.5338665061085518e-06, "loss": 0.6054, "step": 6290 }, { "epoch": 0.5109234142775928, "grad_norm": 5.383474823413491, "learning_rate": 2.5332089431126504e-06, "loss": 0.6881, "step": 6291 }, { "epoch": 0.5110046292536343, "grad_norm": 11.602351673058207, "learning_rate": 2.532551377818866e-06, "loss": 0.6387, "step": 6292 }, { "epoch": 0.511085844229676, "grad_norm": 4.317046810663442, "learning_rate": 2.5318938102726985e-06, "loss": 0.6418, "step": 6293 }, { "epoch": 0.5111670592057176, "grad_norm": 4.588601931278866, "learning_rate": 2.5312362405196485e-06, "loss": 0.4955, "step": 6294 }, { "epoch": 0.5112482741817591, "grad_norm": 5.9618029800437204, "learning_rate": 2.530578668605215e-06, "loss": 0.4686, "step": 6295 }, { "epoch": 0.5113294891578007, "grad_norm": 4.309047195757413, "learning_rate": 2.5299210945749005e-06, "loss": 0.624, "step": 6296 }, { "epoch": 0.5114107041338423, "grad_norm": 6.062649085654284, "learning_rate": 2.529263518474204e-06, "loss": 0.5655, "step": 6297 }, { "epoch": 0.5114919191098839, "grad_norm": 8.101899866072504, "learning_rate": 2.5286059403486262e-06, "loss": 0.6048, "step": 6298 }, { "epoch": 0.5115731340859254, "grad_norm": 4.855939734927873, "learning_rate": 2.52794836024367e-06, "loss": 0.5202, "step": 6299 }, { "epoch": 0.511654349061967, "grad_norm": 4.1533242601287315, "learning_rate": 2.5272907782048343e-06, "loss": 0.6758, "step": 6300 }, { "epoch": 0.5117355640380086, "grad_norm": 44.854170715654014, "learning_rate": 2.526633194277622e-06, "loss": 0.4042, "step": 6301 }, { "epoch": 0.5118167790140502, "grad_norm": 4.477675057177277, "learning_rate": 2.5259756085075333e-06, "loss": 0.641, "step": 6302 }, { "epoch": 0.5118979939900917, "grad_norm": 5.00794940429514, "learning_rate": 2.5253180209400697e-06, "loss": 0.4153, "step": 6303 }, { "epoch": 0.5119792089661334, "grad_norm": 35.355057930038036, "learning_rate": 2.5246604316207327e-06, "loss": 0.3842, "step": 6304 }, { "epoch": 0.512060423942175, "grad_norm": 5.069001886917277, "learning_rate": 2.524002840595025e-06, "loss": 0.5766, "step": 6305 }, { "epoch": 0.5121416389182165, "grad_norm": 4.7801204519502845, "learning_rate": 2.523345247908448e-06, "loss": 0.5046, "step": 6306 }, { "epoch": 0.5122228538942581, "grad_norm": 13.094992788363395, "learning_rate": 2.522687653606503e-06, "loss": 0.4069, "step": 6307 }, { "epoch": 0.5123040688702997, "grad_norm": 3.4115277705733273, "learning_rate": 2.5220300577346925e-06, "loss": 0.6193, "step": 6308 }, { "epoch": 0.5123852838463413, "grad_norm": 4.934143576608908, "learning_rate": 2.521372460338518e-06, "loss": 0.4763, "step": 6309 }, { "epoch": 0.5124664988223828, "grad_norm": 4.955171281673901, "learning_rate": 2.5207148614634836e-06, "loss": 0.5961, "step": 6310 }, { "epoch": 0.5125477137984245, "grad_norm": 5.804504802010579, "learning_rate": 2.5200572611550893e-06, "loss": 0.4608, "step": 6311 }, { "epoch": 0.512628928774466, "grad_norm": 3.3522351499110346, "learning_rate": 2.5193996594588395e-06, "loss": 0.5245, "step": 6312 }, { "epoch": 0.5127101437505076, "grad_norm": 4.503464521908895, "learning_rate": 2.5187420564202357e-06, "loss": 0.6784, "step": 6313 }, { "epoch": 0.5127913587265491, "grad_norm": 4.619599612350149, "learning_rate": 2.518084452084781e-06, "loss": 0.5239, "step": 6314 }, { "epoch": 0.5128725737025908, "grad_norm": 6.2490361812009745, "learning_rate": 2.5174268464979775e-06, "loss": 0.4502, "step": 6315 }, { "epoch": 0.5129537886786324, "grad_norm": 6.493719400660229, "learning_rate": 2.516769239705328e-06, "loss": 0.45, "step": 6316 }, { "epoch": 0.5130350036546739, "grad_norm": 15.674054593332599, "learning_rate": 2.5161116317523367e-06, "loss": 0.5184, "step": 6317 }, { "epoch": 0.5131162186307155, "grad_norm": 5.072386439299523, "learning_rate": 2.5154540226845053e-06, "loss": 0.4148, "step": 6318 }, { "epoch": 0.5131974336067571, "grad_norm": 6.330463701552802, "learning_rate": 2.514796412547337e-06, "loss": 0.6593, "step": 6319 }, { "epoch": 0.5132786485827987, "grad_norm": 6.329569453939536, "learning_rate": 2.5141388013863366e-06, "loss": 0.6006, "step": 6320 }, { "epoch": 0.5133598635588402, "grad_norm": 4.995421040496941, "learning_rate": 2.5134811892470046e-06, "loss": 0.4651, "step": 6321 }, { "epoch": 0.5134410785348819, "grad_norm": 6.312161136312402, "learning_rate": 2.512823576174846e-06, "loss": 0.34, "step": 6322 }, { "epoch": 0.5135222935109234, "grad_norm": 4.377238702155564, "learning_rate": 2.5121659622153643e-06, "loss": 0.4297, "step": 6323 }, { "epoch": 0.513603508486965, "grad_norm": 4.928516077514282, "learning_rate": 2.511508347414062e-06, "loss": 0.5385, "step": 6324 }, { "epoch": 0.5136847234630065, "grad_norm": 7.762372366845343, "learning_rate": 2.510850731816443e-06, "loss": 0.3269, "step": 6325 }, { "epoch": 0.5137659384390482, "grad_norm": 5.504193583882282, "learning_rate": 2.510193115468011e-06, "loss": 0.3859, "step": 6326 }, { "epoch": 0.5138471534150898, "grad_norm": 6.152137635079484, "learning_rate": 2.5095354984142682e-06, "loss": 0.4028, "step": 6327 }, { "epoch": 0.5139283683911313, "grad_norm": 7.085948581559598, "learning_rate": 2.5088778807007203e-06, "loss": 0.3954, "step": 6328 }, { "epoch": 0.514009583367173, "grad_norm": 5.649925226999557, "learning_rate": 2.5082202623728707e-06, "loss": 0.4362, "step": 6329 }, { "epoch": 0.5140907983432145, "grad_norm": 7.520999779577865, "learning_rate": 2.507562643476222e-06, "loss": 0.7279, "step": 6330 }, { "epoch": 0.5141720133192561, "grad_norm": 3.4023713944158143, "learning_rate": 2.5069050240562782e-06, "loss": 0.745, "step": 6331 }, { "epoch": 0.5142532282952976, "grad_norm": 5.4405959398028125, "learning_rate": 2.5062474041585432e-06, "loss": 0.5726, "step": 6332 }, { "epoch": 0.5143344432713393, "grad_norm": 5.517222494769225, "learning_rate": 2.5055897838285207e-06, "loss": 0.5267, "step": 6333 }, { "epoch": 0.5144156582473808, "grad_norm": 4.726623734798838, "learning_rate": 2.504932163111715e-06, "loss": 0.5198, "step": 6334 }, { "epoch": 0.5144968732234224, "grad_norm": 4.440896263973417, "learning_rate": 2.5042745420536295e-06, "loss": 0.5967, "step": 6335 }, { "epoch": 0.5145780881994639, "grad_norm": 3.37064042601589, "learning_rate": 2.503616920699769e-06, "loss": 0.5971, "step": 6336 }, { "epoch": 0.5146593031755056, "grad_norm": 4.690221273319353, "learning_rate": 2.502959299095636e-06, "loss": 0.5007, "step": 6337 }, { "epoch": 0.5147405181515472, "grad_norm": 4.908136131112301, "learning_rate": 2.5023016772867353e-06, "loss": 0.5678, "step": 6338 }, { "epoch": 0.5148217331275887, "grad_norm": 5.443552173567539, "learning_rate": 2.5016440553185718e-06, "loss": 0.4118, "step": 6339 }, { "epoch": 0.5149029481036304, "grad_norm": 6.1838875143833665, "learning_rate": 2.5009864332366467e-06, "loss": 0.6241, "step": 6340 }, { "epoch": 0.5149841630796719, "grad_norm": 3.986184988106473, "learning_rate": 2.5003288110864664e-06, "loss": 0.5065, "step": 6341 }, { "epoch": 0.5150653780557135, "grad_norm": 4.231672066174447, "learning_rate": 2.4996711889135344e-06, "loss": 0.6497, "step": 6342 }, { "epoch": 0.515146593031755, "grad_norm": 5.815390035755676, "learning_rate": 2.499013566763354e-06, "loss": 0.4079, "step": 6343 }, { "epoch": 0.5152278080077967, "grad_norm": 7.9133098452043455, "learning_rate": 2.4983559446814295e-06, "loss": 0.438, "step": 6344 }, { "epoch": 0.5153090229838382, "grad_norm": 3.207812612582073, "learning_rate": 2.497698322713265e-06, "loss": 0.5876, "step": 6345 }, { "epoch": 0.5153902379598798, "grad_norm": 4.353200703794427, "learning_rate": 2.4970407009043646e-06, "loss": 0.5272, "step": 6346 }, { "epoch": 0.5154714529359213, "grad_norm": 5.892469802972017, "learning_rate": 2.4963830793002313e-06, "loss": 0.4392, "step": 6347 }, { "epoch": 0.515552667911963, "grad_norm": 7.87771121157062, "learning_rate": 2.495725457946371e-06, "loss": 0.326, "step": 6348 }, { "epoch": 0.5156338828880046, "grad_norm": 4.905801737169398, "learning_rate": 2.4950678368882863e-06, "loss": 0.5542, "step": 6349 }, { "epoch": 0.5157150978640461, "grad_norm": 3.2384008143696117, "learning_rate": 2.49441021617148e-06, "loss": 0.5257, "step": 6350 }, { "epoch": 0.5157963128400878, "grad_norm": 8.399266140485144, "learning_rate": 2.4937525958414576e-06, "loss": 0.6298, "step": 6351 }, { "epoch": 0.5158775278161293, "grad_norm": 6.628609352541173, "learning_rate": 2.4930949759437234e-06, "loss": 0.3812, "step": 6352 }, { "epoch": 0.5159587427921709, "grad_norm": 7.882154633725088, "learning_rate": 2.492437356523779e-06, "loss": 0.5552, "step": 6353 }, { "epoch": 0.5160399577682124, "grad_norm": 9.337887778597489, "learning_rate": 2.4917797376271297e-06, "loss": 0.5093, "step": 6354 }, { "epoch": 0.5161211727442541, "grad_norm": 7.376950318246357, "learning_rate": 2.49112211929928e-06, "loss": 0.5847, "step": 6355 }, { "epoch": 0.5162023877202956, "grad_norm": 6.482321048724295, "learning_rate": 2.4904645015857318e-06, "loss": 0.5784, "step": 6356 }, { "epoch": 0.5162836026963372, "grad_norm": 5.797801300962893, "learning_rate": 2.48980688453199e-06, "loss": 0.5229, "step": 6357 }, { "epoch": 0.5163648176723787, "grad_norm": 5.360257972509032, "learning_rate": 2.4891492681835584e-06, "loss": 0.4893, "step": 6358 }, { "epoch": 0.5164460326484204, "grad_norm": 5.202599723312153, "learning_rate": 2.4884916525859386e-06, "loss": 0.4775, "step": 6359 }, { "epoch": 0.516527247624462, "grad_norm": 6.783080832818119, "learning_rate": 2.4878340377846365e-06, "loss": 0.4474, "step": 6360 }, { "epoch": 0.5166084626005035, "grad_norm": 12.845276601042336, "learning_rate": 2.4871764238251547e-06, "loss": 0.4225, "step": 6361 }, { "epoch": 0.5166896775765452, "grad_norm": 7.494098434686925, "learning_rate": 2.4865188107529963e-06, "loss": 0.5043, "step": 6362 }, { "epoch": 0.5167708925525867, "grad_norm": 5.004627980302835, "learning_rate": 2.485861198613664e-06, "loss": 0.4393, "step": 6363 }, { "epoch": 0.5168521075286283, "grad_norm": 6.726551242092707, "learning_rate": 2.4852035874526632e-06, "loss": 0.5499, "step": 6364 }, { "epoch": 0.5169333225046698, "grad_norm": 4.666439480933289, "learning_rate": 2.4845459773154964e-06, "loss": 0.5416, "step": 6365 }, { "epoch": 0.5170145374807115, "grad_norm": 5.180749129277349, "learning_rate": 2.483888368247664e-06, "loss": 0.5082, "step": 6366 }, { "epoch": 0.517095752456753, "grad_norm": 4.494845006036604, "learning_rate": 2.4832307602946726e-06, "loss": 0.4328, "step": 6367 }, { "epoch": 0.5171769674327946, "grad_norm": 7.722256382211164, "learning_rate": 2.4825731535020242e-06, "loss": 0.4709, "step": 6368 }, { "epoch": 0.5172581824088361, "grad_norm": 5.336177066714621, "learning_rate": 2.48191554791522e-06, "loss": 0.5804, "step": 6369 }, { "epoch": 0.5173393973848778, "grad_norm": 6.222339347839942, "learning_rate": 2.481257943579765e-06, "loss": 0.4719, "step": 6370 }, { "epoch": 0.5174206123609194, "grad_norm": 7.613124790360803, "learning_rate": 2.4806003405411617e-06, "loss": 0.6593, "step": 6371 }, { "epoch": 0.5175018273369609, "grad_norm": 6.354106819523852, "learning_rate": 2.479942738844911e-06, "loss": 0.4365, "step": 6372 }, { "epoch": 0.5175830423130026, "grad_norm": 5.688366482514378, "learning_rate": 2.479285138536517e-06, "loss": 0.5308, "step": 6373 }, { "epoch": 0.5176642572890441, "grad_norm": 5.415154693434397, "learning_rate": 2.4786275396614823e-06, "loss": 0.4804, "step": 6374 }, { "epoch": 0.5177454722650857, "grad_norm": 28.82102125102892, "learning_rate": 2.477969942265308e-06, "loss": 0.3669, "step": 6375 }, { "epoch": 0.5178266872411272, "grad_norm": 6.810481547484077, "learning_rate": 2.4773123463934973e-06, "loss": 0.6299, "step": 6376 }, { "epoch": 0.5179079022171689, "grad_norm": 4.392949418938218, "learning_rate": 2.476654752091553e-06, "loss": 0.5736, "step": 6377 }, { "epoch": 0.5179891171932104, "grad_norm": 4.2704517486876234, "learning_rate": 2.4759971594049763e-06, "loss": 0.5811, "step": 6378 }, { "epoch": 0.518070332169252, "grad_norm": 6.692688448203209, "learning_rate": 2.4753395683792677e-06, "loss": 0.4761, "step": 6379 }, { "epoch": 0.5181515471452935, "grad_norm": 7.334413784638161, "learning_rate": 2.474681979059931e-06, "loss": 0.5694, "step": 6380 }, { "epoch": 0.5182327621213352, "grad_norm": 4.544276408270827, "learning_rate": 2.474024391492468e-06, "loss": 0.4231, "step": 6381 }, { "epoch": 0.5183139770973768, "grad_norm": 4.593897927457953, "learning_rate": 2.473366805722379e-06, "loss": 0.6074, "step": 6382 }, { "epoch": 0.5183951920734183, "grad_norm": 4.897073039683832, "learning_rate": 2.472709221795166e-06, "loss": 0.4831, "step": 6383 }, { "epoch": 0.51847640704946, "grad_norm": 4.507107061670829, "learning_rate": 2.4720516397563314e-06, "loss": 0.5052, "step": 6384 }, { "epoch": 0.5185576220255015, "grad_norm": 4.582461812162122, "learning_rate": 2.471394059651374e-06, "loss": 0.6067, "step": 6385 }, { "epoch": 0.5186388370015431, "grad_norm": 5.866771836678322, "learning_rate": 2.470736481525797e-06, "loss": 0.6531, "step": 6386 }, { "epoch": 0.5187200519775846, "grad_norm": 5.813106703432218, "learning_rate": 2.470078905425101e-06, "loss": 0.4097, "step": 6387 }, { "epoch": 0.5188012669536263, "grad_norm": 7.253453752131115, "learning_rate": 2.4694213313947855e-06, "loss": 0.4998, "step": 6388 }, { "epoch": 0.5188824819296678, "grad_norm": 3.081751300748125, "learning_rate": 2.4687637594803527e-06, "loss": 0.5751, "step": 6389 }, { "epoch": 0.5189636969057094, "grad_norm": 5.45405710599491, "learning_rate": 2.4681061897273028e-06, "loss": 0.4083, "step": 6390 }, { "epoch": 0.519044911881751, "grad_norm": 4.843125937608803, "learning_rate": 2.4674486221811345e-06, "loss": 0.4051, "step": 6391 }, { "epoch": 0.5191261268577926, "grad_norm": 6.00791556685988, "learning_rate": 2.46679105688735e-06, "loss": 0.5527, "step": 6392 }, { "epoch": 0.5192073418338342, "grad_norm": 3.547137678101548, "learning_rate": 2.466133493891449e-06, "loss": 0.4875, "step": 6393 }, { "epoch": 0.5192885568098757, "grad_norm": 8.66243792163446, "learning_rate": 2.46547593323893e-06, "loss": 0.496, "step": 6394 }, { "epoch": 0.5193697717859174, "grad_norm": 4.430513402849331, "learning_rate": 2.464818374975295e-06, "loss": 0.4364, "step": 6395 }, { "epoch": 0.5194509867619589, "grad_norm": 3.7975317298710625, "learning_rate": 2.4641608191460415e-06, "loss": 0.6302, "step": 6396 }, { "epoch": 0.5195322017380005, "grad_norm": 3.888846410063618, "learning_rate": 2.46350326579667e-06, "loss": 0.5352, "step": 6397 }, { "epoch": 0.519613416714042, "grad_norm": 4.412943996478983, "learning_rate": 2.462845714972679e-06, "loss": 0.4854, "step": 6398 }, { "epoch": 0.5196946316900837, "grad_norm": 3.706708426188841, "learning_rate": 2.4621881667195676e-06, "loss": 0.4375, "step": 6399 }, { "epoch": 0.5197758466661252, "grad_norm": 3.8449883186160787, "learning_rate": 2.4615306210828357e-06, "loss": 0.5516, "step": 6400 }, { "epoch": 0.5198570616421668, "grad_norm": 5.909942632055917, "learning_rate": 2.46087307810798e-06, "loss": 0.5237, "step": 6401 }, { "epoch": 0.5199382766182084, "grad_norm": 3.547279382037591, "learning_rate": 2.460215537840501e-06, "loss": 0.4175, "step": 6402 }, { "epoch": 0.52001949159425, "grad_norm": 4.882379724186628, "learning_rate": 2.459558000325897e-06, "loss": 0.6123, "step": 6403 }, { "epoch": 0.5201007065702916, "grad_norm": 4.050921637738143, "learning_rate": 2.458900465609664e-06, "loss": 0.5215, "step": 6404 }, { "epoch": 0.5201819215463331, "grad_norm": 3.382346078488962, "learning_rate": 2.4582429337373018e-06, "loss": 0.4617, "step": 6405 }, { "epoch": 0.5202631365223748, "grad_norm": 7.257748442971193, "learning_rate": 2.4575854047543082e-06, "loss": 0.4909, "step": 6406 }, { "epoch": 0.5203443514984163, "grad_norm": 6.459780012681047, "learning_rate": 2.456927878706179e-06, "loss": 0.4125, "step": 6407 }, { "epoch": 0.5204255664744579, "grad_norm": 3.8936939687249863, "learning_rate": 2.4562703556384136e-06, "loss": 0.5746, "step": 6408 }, { "epoch": 0.5205067814504994, "grad_norm": 5.650603779541892, "learning_rate": 2.4556128355965076e-06, "loss": 0.4194, "step": 6409 }, { "epoch": 0.5205879964265411, "grad_norm": 6.233819656387275, "learning_rate": 2.454955318625958e-06, "loss": 0.5296, "step": 6410 }, { "epoch": 0.5206692114025826, "grad_norm": 5.175873926721092, "learning_rate": 2.4542978047722633e-06, "loss": 0.5687, "step": 6411 }, { "epoch": 0.5207504263786242, "grad_norm": 7.441015494915213, "learning_rate": 2.453640294080918e-06, "loss": 0.7367, "step": 6412 }, { "epoch": 0.5208316413546658, "grad_norm": 7.282413948152037, "learning_rate": 2.452982786597419e-06, "loss": 0.5059, "step": 6413 }, { "epoch": 0.5209128563307074, "grad_norm": 5.063377741570186, "learning_rate": 2.452325282367262e-06, "loss": 0.4086, "step": 6414 }, { "epoch": 0.520994071306749, "grad_norm": 4.377231675145512, "learning_rate": 2.4516677814359434e-06, "loss": 0.5554, "step": 6415 }, { "epoch": 0.5210752862827905, "grad_norm": 5.111222932344912, "learning_rate": 2.4510102838489587e-06, "loss": 0.5666, "step": 6416 }, { "epoch": 0.5211565012588322, "grad_norm": 2.6572969128420363, "learning_rate": 2.4503527896518025e-06, "loss": 0.5856, "step": 6417 }, { "epoch": 0.5212377162348737, "grad_norm": 3.467977660333967, "learning_rate": 2.449695298889971e-06, "loss": 0.5549, "step": 6418 }, { "epoch": 0.5213189312109153, "grad_norm": 3.9586980008450547, "learning_rate": 2.449037811608959e-06, "loss": 0.5114, "step": 6419 }, { "epoch": 0.5214001461869568, "grad_norm": 6.6024044152724795, "learning_rate": 2.4483803278542594e-06, "loss": 0.4881, "step": 6420 }, { "epoch": 0.5214813611629985, "grad_norm": 3.872306998345536, "learning_rate": 2.447722847671369e-06, "loss": 0.4957, "step": 6421 }, { "epoch": 0.52156257613904, "grad_norm": 4.928256043695277, "learning_rate": 2.4470653711057805e-06, "loss": 0.5453, "step": 6422 }, { "epoch": 0.5216437911150816, "grad_norm": 5.00049623611133, "learning_rate": 2.446407898202988e-06, "loss": 0.4302, "step": 6423 }, { "epoch": 0.5217250060911232, "grad_norm": 6.210683915021455, "learning_rate": 2.445750429008486e-06, "loss": 0.4867, "step": 6424 }, { "epoch": 0.5218062210671648, "grad_norm": 4.733137071274775, "learning_rate": 2.4450929635677667e-06, "loss": 0.5342, "step": 6425 }, { "epoch": 0.5218874360432064, "grad_norm": 3.573213441447143, "learning_rate": 2.4444355019263235e-06, "loss": 0.5391, "step": 6426 }, { "epoch": 0.5219686510192479, "grad_norm": 3.9218556675383685, "learning_rate": 2.44377804412965e-06, "loss": 0.5641, "step": 6427 }, { "epoch": 0.5220498659952896, "grad_norm": 3.72842203066307, "learning_rate": 2.443120590223238e-06, "loss": 0.5727, "step": 6428 }, { "epoch": 0.5221310809713311, "grad_norm": 4.5179452250246355, "learning_rate": 2.4424631402525797e-06, "loss": 0.5027, "step": 6429 }, { "epoch": 0.5222122959473727, "grad_norm": 7.296740350963874, "learning_rate": 2.4418056942631687e-06, "loss": 0.4627, "step": 6430 }, { "epoch": 0.5222935109234143, "grad_norm": 4.733022729564796, "learning_rate": 2.4411482523004946e-06, "loss": 0.4634, "step": 6431 }, { "epoch": 0.5223747258994559, "grad_norm": 5.089595564180344, "learning_rate": 2.4404908144100513e-06, "loss": 0.3586, "step": 6432 }, { "epoch": 0.5224559408754974, "grad_norm": 5.358613043214558, "learning_rate": 2.4398333806373274e-06, "loss": 0.465, "step": 6433 }, { "epoch": 0.522537155851539, "grad_norm": 5.524190371189003, "learning_rate": 2.4391759510278153e-06, "loss": 0.5472, "step": 6434 }, { "epoch": 0.5226183708275806, "grad_norm": 4.103822972000649, "learning_rate": 2.438518525627006e-06, "loss": 0.5572, "step": 6435 }, { "epoch": 0.5226995858036222, "grad_norm": 5.959127088930798, "learning_rate": 2.4378611044803887e-06, "loss": 0.4209, "step": 6436 }, { "epoch": 0.5227808007796638, "grad_norm": 5.31529709254031, "learning_rate": 2.437203687633455e-06, "loss": 0.3897, "step": 6437 }, { "epoch": 0.5228620157557053, "grad_norm": 3.7128349527804656, "learning_rate": 2.436546275131693e-06, "loss": 0.4428, "step": 6438 }, { "epoch": 0.522943230731747, "grad_norm": 3.784355733006889, "learning_rate": 2.435888867020593e-06, "loss": 0.575, "step": 6439 }, { "epoch": 0.5230244457077885, "grad_norm": 4.907099811863354, "learning_rate": 2.435231463345645e-06, "loss": 0.5123, "step": 6440 }, { "epoch": 0.5231056606838301, "grad_norm": 3.094941647382156, "learning_rate": 2.4345740641523362e-06, "loss": 0.6211, "step": 6441 }, { "epoch": 0.5231868756598717, "grad_norm": 5.697788871531617, "learning_rate": 2.4339166694861553e-06, "loss": 0.4124, "step": 6442 }, { "epoch": 0.5232680906359133, "grad_norm": 5.825537615998602, "learning_rate": 2.433259279392592e-06, "loss": 0.4934, "step": 6443 }, { "epoch": 0.5233493056119548, "grad_norm": 4.894893357625769, "learning_rate": 2.432601893917133e-06, "loss": 0.5467, "step": 6444 }, { "epoch": 0.5234305205879964, "grad_norm": 7.012722371867207, "learning_rate": 2.431944513105266e-06, "loss": 0.5173, "step": 6445 }, { "epoch": 0.523511735564038, "grad_norm": 3.8914614811391144, "learning_rate": 2.4312871370024794e-06, "loss": 0.5101, "step": 6446 }, { "epoch": 0.5235929505400796, "grad_norm": 4.15063424291914, "learning_rate": 2.4306297656542584e-06, "loss": 0.4239, "step": 6447 }, { "epoch": 0.5236741655161212, "grad_norm": 26.114252256627633, "learning_rate": 2.4299723991060904e-06, "loss": 0.5355, "step": 6448 }, { "epoch": 0.5237553804921627, "grad_norm": 5.921632572418434, "learning_rate": 2.4293150374034625e-06, "loss": 0.3273, "step": 6449 }, { "epoch": 0.5238365954682044, "grad_norm": 5.16163949915224, "learning_rate": 2.4286576805918604e-06, "loss": 0.4013, "step": 6450 }, { "epoch": 0.5239178104442459, "grad_norm": 3.8705275409101643, "learning_rate": 2.4280003287167684e-06, "loss": 0.4692, "step": 6451 }, { "epoch": 0.5239990254202875, "grad_norm": 4.713434827452213, "learning_rate": 2.427342981823672e-06, "loss": 0.4827, "step": 6452 }, { "epoch": 0.5240802403963291, "grad_norm": 9.847453330969838, "learning_rate": 2.426685639958058e-06, "loss": 0.597, "step": 6453 }, { "epoch": 0.5241614553723707, "grad_norm": 14.084536911041138, "learning_rate": 2.426028303165409e-06, "loss": 0.7119, "step": 6454 }, { "epoch": 0.5242426703484122, "grad_norm": 4.162924811617758, "learning_rate": 2.425370971491209e-06, "loss": 0.5353, "step": 6455 }, { "epoch": 0.5243238853244538, "grad_norm": 5.310608964267923, "learning_rate": 2.424713644980945e-06, "loss": 0.4668, "step": 6456 }, { "epoch": 0.5244051003004954, "grad_norm": 3.485581874385866, "learning_rate": 2.424056323680097e-06, "loss": 0.5376, "step": 6457 }, { "epoch": 0.524486315276537, "grad_norm": 4.466249110985916, "learning_rate": 2.423399007634149e-06, "loss": 0.5362, "step": 6458 }, { "epoch": 0.5245675302525786, "grad_norm": 15.507085752685619, "learning_rate": 2.422741696888585e-06, "loss": 0.4823, "step": 6459 }, { "epoch": 0.5246487452286202, "grad_norm": 3.42283227124507, "learning_rate": 2.4220843914888865e-06, "loss": 0.5139, "step": 6460 }, { "epoch": 0.5247299602046618, "grad_norm": 4.471153348445109, "learning_rate": 2.4214270914805353e-06, "loss": 0.6101, "step": 6461 }, { "epoch": 0.5248111751807033, "grad_norm": 4.937962838447406, "learning_rate": 2.4207697969090145e-06, "loss": 0.5725, "step": 6462 }, { "epoch": 0.5248923901567449, "grad_norm": 3.4301025114527355, "learning_rate": 2.420112507819804e-06, "loss": 0.6678, "step": 6463 }, { "epoch": 0.5249736051327865, "grad_norm": 9.810088340742176, "learning_rate": 2.4194552242583845e-06, "loss": 0.4788, "step": 6464 }, { "epoch": 0.5250548201088281, "grad_norm": 5.247524236071172, "learning_rate": 2.4187979462702382e-06, "loss": 0.588, "step": 6465 }, { "epoch": 0.5251360350848696, "grad_norm": 3.8501987558213133, "learning_rate": 2.4181406739008443e-06, "loss": 0.5239, "step": 6466 }, { "epoch": 0.5252172500609112, "grad_norm": 4.286565115822655, "learning_rate": 2.417483407195682e-06, "loss": 0.4764, "step": 6467 }, { "epoch": 0.5252984650369528, "grad_norm": 2.9196698091729347, "learning_rate": 2.416826146200231e-06, "loss": 0.5093, "step": 6468 }, { "epoch": 0.5253796800129944, "grad_norm": 4.6923054622476, "learning_rate": 2.4161688909599715e-06, "loss": 0.6884, "step": 6469 }, { "epoch": 0.525460894989036, "grad_norm": 3.966646784959062, "learning_rate": 2.4155116415203804e-06, "loss": 0.5132, "step": 6470 }, { "epoch": 0.5255421099650776, "grad_norm": 3.6916733502842956, "learning_rate": 2.4148543979269357e-06, "loss": 0.5949, "step": 6471 }, { "epoch": 0.5256233249411192, "grad_norm": 4.986568652074796, "learning_rate": 2.4141971602251176e-06, "loss": 0.5262, "step": 6472 }, { "epoch": 0.5257045399171607, "grad_norm": 7.761776773353264, "learning_rate": 2.4135399284604012e-06, "loss": 0.3962, "step": 6473 }, { "epoch": 0.5257857548932023, "grad_norm": 28.513128146924068, "learning_rate": 2.4128827026782633e-06, "loss": 0.655, "step": 6474 }, { "epoch": 0.5258669698692439, "grad_norm": 5.030638393005198, "learning_rate": 2.4122254829241827e-06, "loss": 0.6157, "step": 6475 }, { "epoch": 0.5259481848452855, "grad_norm": 3.759316711837708, "learning_rate": 2.4115682692436337e-06, "loss": 0.4513, "step": 6476 }, { "epoch": 0.526029399821327, "grad_norm": 5.1999339488745395, "learning_rate": 2.4109110616820918e-06, "loss": 0.427, "step": 6477 }, { "epoch": 0.5261106147973686, "grad_norm": 9.081659712327916, "learning_rate": 2.4102538602850337e-06, "loss": 0.4293, "step": 6478 }, { "epoch": 0.5261918297734102, "grad_norm": 3.9469459629155574, "learning_rate": 2.4095966650979342e-06, "loss": 0.5247, "step": 6479 }, { "epoch": 0.5262730447494518, "grad_norm": 4.830126815254179, "learning_rate": 2.4089394761662653e-06, "loss": 0.5248, "step": 6480 }, { "epoch": 0.5263542597254934, "grad_norm": 5.427859891508367, "learning_rate": 2.4082822935355035e-06, "loss": 0.4429, "step": 6481 }, { "epoch": 0.526435474701535, "grad_norm": 5.032571069469282, "learning_rate": 2.4076251172511224e-06, "loss": 0.4578, "step": 6482 }, { "epoch": 0.5265166896775766, "grad_norm": 3.8795340568732364, "learning_rate": 2.4069679473585925e-06, "loss": 0.5519, "step": 6483 }, { "epoch": 0.5265979046536181, "grad_norm": 3.9342593469573877, "learning_rate": 2.4063107839033894e-06, "loss": 0.5532, "step": 6484 }, { "epoch": 0.5266791196296597, "grad_norm": 6.08364360680254, "learning_rate": 2.4056536269309847e-06, "loss": 0.5749, "step": 6485 }, { "epoch": 0.5267603346057013, "grad_norm": 6.140920876184235, "learning_rate": 2.4049964764868493e-06, "loss": 0.3915, "step": 6486 }, { "epoch": 0.5268415495817429, "grad_norm": 4.532054993744361, "learning_rate": 2.4043393326164536e-06, "loss": 0.4485, "step": 6487 }, { "epoch": 0.5269227645577844, "grad_norm": 4.418973334061362, "learning_rate": 2.403682195365272e-06, "loss": 0.4327, "step": 6488 }, { "epoch": 0.527003979533826, "grad_norm": 4.167489894614658, "learning_rate": 2.4030250647787714e-06, "loss": 0.545, "step": 6489 }, { "epoch": 0.5270851945098676, "grad_norm": 5.917681962851331, "learning_rate": 2.402367940902423e-06, "loss": 0.5363, "step": 6490 }, { "epoch": 0.5271664094859092, "grad_norm": 10.913996246318028, "learning_rate": 2.401710823781697e-06, "loss": 0.3902, "step": 6491 }, { "epoch": 0.5272476244619508, "grad_norm": 25.430239522056482, "learning_rate": 2.4010537134620614e-06, "loss": 0.4407, "step": 6492 }, { "epoch": 0.5273288394379924, "grad_norm": 9.226096785828776, "learning_rate": 2.400396609988985e-06, "loss": 0.5414, "step": 6493 }, { "epoch": 0.527410054414034, "grad_norm": 4.553011519720875, "learning_rate": 2.3997395134079367e-06, "loss": 0.5125, "step": 6494 }, { "epoch": 0.5274912693900755, "grad_norm": 3.85392516623142, "learning_rate": 2.399082423764383e-06, "loss": 0.5804, "step": 6495 }, { "epoch": 0.5275724843661171, "grad_norm": 6.43430352311751, "learning_rate": 2.3984253411037913e-06, "loss": 0.4332, "step": 6496 }, { "epoch": 0.5276536993421587, "grad_norm": 5.348598603512892, "learning_rate": 2.397768265471629e-06, "loss": 0.4464, "step": 6497 }, { "epoch": 0.5277349143182003, "grad_norm": 5.503883803363463, "learning_rate": 2.397111196913362e-06, "loss": 0.5333, "step": 6498 }, { "epoch": 0.5278161292942418, "grad_norm": 5.084027665154841, "learning_rate": 2.396454135474454e-06, "loss": 0.5963, "step": 6499 }, { "epoch": 0.5278973442702835, "grad_norm": 4.104843883398135, "learning_rate": 2.3957970812003727e-06, "loss": 0.6582, "step": 6500 }, { "epoch": 0.527978559246325, "grad_norm": 6.334197801454462, "learning_rate": 2.3951400341365827e-06, "loss": 0.5699, "step": 6501 }, { "epoch": 0.5280597742223666, "grad_norm": 6.9437969081416435, "learning_rate": 2.394482994328546e-06, "loss": 0.4565, "step": 6502 }, { "epoch": 0.5281409891984082, "grad_norm": 3.941363956260876, "learning_rate": 2.393825961821728e-06, "loss": 0.705, "step": 6503 }, { "epoch": 0.5282222041744498, "grad_norm": 3.7273474058501974, "learning_rate": 2.3931689366615926e-06, "loss": 0.488, "step": 6504 }, { "epoch": 0.5283034191504914, "grad_norm": 5.084976289181063, "learning_rate": 2.392511918893601e-06, "loss": 0.5142, "step": 6505 }, { "epoch": 0.5283846341265329, "grad_norm": 6.4658935525713295, "learning_rate": 2.3918549085632145e-06, "loss": 0.4072, "step": 6506 }, { "epoch": 0.5284658491025745, "grad_norm": 3.651970957929354, "learning_rate": 2.3911979057158974e-06, "loss": 0.4404, "step": 6507 }, { "epoch": 0.5285470640786161, "grad_norm": 3.5446244109539045, "learning_rate": 2.3905409103971096e-06, "loss": 0.5651, "step": 6508 }, { "epoch": 0.5286282790546577, "grad_norm": 3.9057802685762217, "learning_rate": 2.38988392265231e-06, "loss": 0.4663, "step": 6509 }, { "epoch": 0.5287094940306992, "grad_norm": 2.871351243716771, "learning_rate": 2.389226942526961e-06, "loss": 0.5126, "step": 6510 }, { "epoch": 0.5287907090067409, "grad_norm": 6.264655385586473, "learning_rate": 2.3885699700665217e-06, "loss": 0.463, "step": 6511 }, { "epoch": 0.5288719239827824, "grad_norm": 6.176994725304146, "learning_rate": 2.3879130053164495e-06, "loss": 0.3438, "step": 6512 }, { "epoch": 0.528953138958824, "grad_norm": 3.841956377687345, "learning_rate": 2.3872560483222048e-06, "loss": 0.6078, "step": 6513 }, { "epoch": 0.5290343539348656, "grad_norm": 3.946093406576217, "learning_rate": 2.3865990991292458e-06, "loss": 0.49, "step": 6514 }, { "epoch": 0.5291155689109072, "grad_norm": 5.815463605481333, "learning_rate": 2.3859421577830276e-06, "loss": 0.5979, "step": 6515 }, { "epoch": 0.5291967838869488, "grad_norm": 7.2849496983935875, "learning_rate": 2.385285224329009e-06, "loss": 0.4522, "step": 6516 }, { "epoch": 0.5292779988629903, "grad_norm": 6.703027017567743, "learning_rate": 2.384628298812646e-06, "loss": 0.4208, "step": 6517 }, { "epoch": 0.529359213839032, "grad_norm": 4.068730154097392, "learning_rate": 2.383971381279393e-06, "loss": 0.649, "step": 6518 }, { "epoch": 0.5294404288150735, "grad_norm": 3.9699040583000667, "learning_rate": 2.383314471774707e-06, "loss": 0.7164, "step": 6519 }, { "epoch": 0.5295216437911151, "grad_norm": 11.734590811153446, "learning_rate": 2.382657570344043e-06, "loss": 0.5432, "step": 6520 }, { "epoch": 0.5296028587671566, "grad_norm": 5.270660282246951, "learning_rate": 2.382000677032854e-06, "loss": 0.5122, "step": 6521 }, { "epoch": 0.5296840737431983, "grad_norm": 4.145643825845797, "learning_rate": 2.3813437918865925e-06, "loss": 0.5513, "step": 6522 }, { "epoch": 0.5297652887192398, "grad_norm": 5.695234292809091, "learning_rate": 2.380686914950713e-06, "loss": 0.5945, "step": 6523 }, { "epoch": 0.5298465036952814, "grad_norm": 5.187560295102548, "learning_rate": 2.380030046270668e-06, "loss": 0.5533, "step": 6524 }, { "epoch": 0.529927718671323, "grad_norm": 5.2980757209337215, "learning_rate": 2.379373185891908e-06, "loss": 0.5801, "step": 6525 }, { "epoch": 0.5300089336473646, "grad_norm": 4.379144750425471, "learning_rate": 2.3787163338598854e-06, "loss": 0.6238, "step": 6526 }, { "epoch": 0.5300901486234062, "grad_norm": 5.906641818249719, "learning_rate": 2.3780594902200515e-06, "loss": 0.5211, "step": 6527 }, { "epoch": 0.5301713635994477, "grad_norm": 5.654605677438833, "learning_rate": 2.377402655017854e-06, "loss": 0.4426, "step": 6528 }, { "epoch": 0.5302525785754894, "grad_norm": 9.600316549753119, "learning_rate": 2.376745828298745e-06, "loss": 0.4189, "step": 6529 }, { "epoch": 0.5303337935515309, "grad_norm": 4.652507037908199, "learning_rate": 2.376089010108172e-06, "loss": 0.5635, "step": 6530 }, { "epoch": 0.5304150085275725, "grad_norm": 6.872480563300781, "learning_rate": 2.3754322004915837e-06, "loss": 0.452, "step": 6531 }, { "epoch": 0.530496223503614, "grad_norm": 6.133795123377434, "learning_rate": 2.3747753994944283e-06, "loss": 0.5212, "step": 6532 }, { "epoch": 0.5305774384796557, "grad_norm": 5.688402297871484, "learning_rate": 2.3741186071621523e-06, "loss": 0.5803, "step": 6533 }, { "epoch": 0.5306586534556972, "grad_norm": 4.147307899835006, "learning_rate": 2.373461823540202e-06, "loss": 0.4938, "step": 6534 }, { "epoch": 0.5307398684317388, "grad_norm": 4.16411315568013, "learning_rate": 2.3728050486740244e-06, "loss": 0.4921, "step": 6535 }, { "epoch": 0.5308210834077804, "grad_norm": 4.86479630808414, "learning_rate": 2.3721482826090643e-06, "loss": 0.5561, "step": 6536 }, { "epoch": 0.530902298383822, "grad_norm": 4.549881463792877, "learning_rate": 2.3714915253907657e-06, "loss": 0.4727, "step": 6537 }, { "epoch": 0.5309835133598636, "grad_norm": 3.1960496743376745, "learning_rate": 2.370834777064574e-06, "loss": 0.5856, "step": 6538 }, { "epoch": 0.5310647283359051, "grad_norm": 6.183831960146142, "learning_rate": 2.3701780376759323e-06, "loss": 0.4186, "step": 6539 }, { "epoch": 0.5311459433119468, "grad_norm": 6.1626582028588075, "learning_rate": 2.3695213072702834e-06, "loss": 0.4421, "step": 6540 }, { "epoch": 0.5312271582879883, "grad_norm": 4.399409989452963, "learning_rate": 2.368864585893069e-06, "loss": 0.5147, "step": 6541 }, { "epoch": 0.5313083732640299, "grad_norm": 4.73410977261382, "learning_rate": 2.368207873589731e-06, "loss": 0.56, "step": 6542 }, { "epoch": 0.5313895882400714, "grad_norm": 4.575239688507527, "learning_rate": 2.3675511704057115e-06, "loss": 0.5333, "step": 6543 }, { "epoch": 0.5314708032161131, "grad_norm": 7.035858437872166, "learning_rate": 2.3668944763864486e-06, "loss": 0.5508, "step": 6544 }, { "epoch": 0.5315520181921546, "grad_norm": 5.241353250145363, "learning_rate": 2.3662377915773845e-06, "loss": 0.4836, "step": 6545 }, { "epoch": 0.5316332331681962, "grad_norm": 7.382490118024742, "learning_rate": 2.365581116023958e-06, "loss": 0.4395, "step": 6546 }, { "epoch": 0.5317144481442379, "grad_norm": 5.3401428892737455, "learning_rate": 2.364924449771605e-06, "loss": 0.427, "step": 6547 }, { "epoch": 0.5317956631202794, "grad_norm": 3.7061230832301066, "learning_rate": 2.364267792865767e-06, "loss": 0.8927, "step": 6548 }, { "epoch": 0.531876878096321, "grad_norm": 4.420036708733726, "learning_rate": 2.363611145351879e-06, "loss": 0.5167, "step": 6549 }, { "epoch": 0.5319580930723625, "grad_norm": 4.783438045653566, "learning_rate": 2.3629545072753767e-06, "loss": 0.5279, "step": 6550 }, { "epoch": 0.5320393080484042, "grad_norm": 4.749522841051883, "learning_rate": 2.3622978786816984e-06, "loss": 0.4024, "step": 6551 }, { "epoch": 0.5321205230244457, "grad_norm": 7.7640179721652, "learning_rate": 2.361641259616278e-06, "loss": 0.5393, "step": 6552 }, { "epoch": 0.5322017380004873, "grad_norm": 3.6547210717885865, "learning_rate": 2.3609846501245494e-06, "loss": 0.4325, "step": 6553 }, { "epoch": 0.5322829529765288, "grad_norm": 7.3797843232620455, "learning_rate": 2.3603280502519482e-06, "loss": 0.5021, "step": 6554 }, { "epoch": 0.5323641679525705, "grad_norm": 3.694146153232864, "learning_rate": 2.3596714600439062e-06, "loss": 0.5116, "step": 6555 }, { "epoch": 0.532445382928612, "grad_norm": 5.0985701151843354, "learning_rate": 2.3590148795458577e-06, "loss": 0.4945, "step": 6556 }, { "epoch": 0.5325265979046536, "grad_norm": 5.131669229194393, "learning_rate": 2.3583583088032313e-06, "loss": 0.5813, "step": 6557 }, { "epoch": 0.5326078128806953, "grad_norm": 4.005833213164181, "learning_rate": 2.3577017478614613e-06, "loss": 0.5053, "step": 6558 }, { "epoch": 0.5326890278567368, "grad_norm": 14.883116766398912, "learning_rate": 2.357045196765978e-06, "loss": 0.6758, "step": 6559 }, { "epoch": 0.5327702428327784, "grad_norm": 4.9450151525987955, "learning_rate": 2.3563886555622093e-06, "loss": 0.561, "step": 6560 }, { "epoch": 0.5328514578088199, "grad_norm": 9.194980096203347, "learning_rate": 2.355732124295586e-06, "loss": 0.5195, "step": 6561 }, { "epoch": 0.5329326727848616, "grad_norm": 5.200787441007623, "learning_rate": 2.3550756030115364e-06, "loss": 0.6133, "step": 6562 }, { "epoch": 0.5330138877609031, "grad_norm": 4.874445965778859, "learning_rate": 2.3544190917554875e-06, "loss": 0.456, "step": 6563 }, { "epoch": 0.5330951027369447, "grad_norm": 4.734097340063761, "learning_rate": 2.3537625905728677e-06, "loss": 0.5303, "step": 6564 }, { "epoch": 0.5331763177129862, "grad_norm": 5.317506693611895, "learning_rate": 2.3531060995091026e-06, "loss": 0.6195, "step": 6565 }, { "epoch": 0.5332575326890279, "grad_norm": 5.085875518247843, "learning_rate": 2.352449618609617e-06, "loss": 0.6773, "step": 6566 }, { "epoch": 0.5333387476650694, "grad_norm": 5.216473307627287, "learning_rate": 2.3517931479198383e-06, "loss": 0.4075, "step": 6567 }, { "epoch": 0.533419962641111, "grad_norm": 4.73626486723172, "learning_rate": 2.3511366874851885e-06, "loss": 0.5928, "step": 6568 }, { "epoch": 0.5335011776171527, "grad_norm": 4.068709628111098, "learning_rate": 2.350480237351092e-06, "loss": 0.5746, "step": 6569 }, { "epoch": 0.5335823925931942, "grad_norm": 7.162771272344813, "learning_rate": 2.3498237975629726e-06, "loss": 0.6726, "step": 6570 }, { "epoch": 0.5336636075692358, "grad_norm": 8.523429545622452, "learning_rate": 2.349167368166251e-06, "loss": 0.4304, "step": 6571 }, { "epoch": 0.5337448225452773, "grad_norm": 5.274081883984625, "learning_rate": 2.348510949206349e-06, "loss": 0.5222, "step": 6572 }, { "epoch": 0.533826037521319, "grad_norm": 3.8799868155561406, "learning_rate": 2.3478545407286883e-06, "loss": 0.6353, "step": 6573 }, { "epoch": 0.5339072524973605, "grad_norm": 3.6165853886009844, "learning_rate": 2.3471981427786875e-06, "loss": 0.5822, "step": 6574 }, { "epoch": 0.5339884674734021, "grad_norm": 6.353757597164967, "learning_rate": 2.3465417554017675e-06, "loss": 0.4707, "step": 6575 }, { "epoch": 0.5340696824494436, "grad_norm": 5.10414117668738, "learning_rate": 2.3458853786433444e-06, "loss": 0.5357, "step": 6576 }, { "epoch": 0.5341508974254853, "grad_norm": 3.3925342392484317, "learning_rate": 2.345229012548838e-06, "loss": 0.6855, "step": 6577 }, { "epoch": 0.5342321124015268, "grad_norm": 3.3042399836502776, "learning_rate": 2.3445726571636656e-06, "loss": 0.4581, "step": 6578 }, { "epoch": 0.5343133273775684, "grad_norm": 4.792803765102656, "learning_rate": 2.3439163125332415e-06, "loss": 0.4442, "step": 6579 }, { "epoch": 0.5343945423536101, "grad_norm": 53.09136619996998, "learning_rate": 2.343259978702984e-06, "loss": 0.5999, "step": 6580 }, { "epoch": 0.5344757573296516, "grad_norm": 4.366575308474977, "learning_rate": 2.3426036557183056e-06, "loss": 0.55, "step": 6581 }, { "epoch": 0.5345569723056932, "grad_norm": 4.219747570498485, "learning_rate": 2.3419473436246206e-06, "loss": 0.6209, "step": 6582 }, { "epoch": 0.5346381872817347, "grad_norm": 5.505458655811916, "learning_rate": 2.341291042467344e-06, "loss": 0.5682, "step": 6583 }, { "epoch": 0.5347194022577764, "grad_norm": 5.438215231835009, "learning_rate": 2.3406347522918866e-06, "loss": 0.4587, "step": 6584 }, { "epoch": 0.5348006172338179, "grad_norm": 9.221582577454688, "learning_rate": 2.339978473143661e-06, "loss": 0.6507, "step": 6585 }, { "epoch": 0.5348818322098595, "grad_norm": 6.264732038581897, "learning_rate": 2.3393222050680788e-06, "loss": 0.5201, "step": 6586 }, { "epoch": 0.534963047185901, "grad_norm": 5.801333658841605, "learning_rate": 2.338665948110549e-06, "loss": 0.5507, "step": 6587 }, { "epoch": 0.5350442621619427, "grad_norm": 6.810336490229261, "learning_rate": 2.3380097023164813e-06, "loss": 0.67, "step": 6588 }, { "epoch": 0.5351254771379842, "grad_norm": 5.781581076596003, "learning_rate": 2.337353467731286e-06, "loss": 0.391, "step": 6589 }, { "epoch": 0.5352066921140258, "grad_norm": 4.370970902980655, "learning_rate": 2.3366972444003698e-06, "loss": 0.5351, "step": 6590 }, { "epoch": 0.5352879070900675, "grad_norm": 12.469579377782821, "learning_rate": 2.3360410323691386e-06, "loss": 0.4929, "step": 6591 }, { "epoch": 0.535369122066109, "grad_norm": 10.008332152388622, "learning_rate": 2.335384831683002e-06, "loss": 0.4752, "step": 6592 }, { "epoch": 0.5354503370421506, "grad_norm": 9.797876571328075, "learning_rate": 2.334728642387363e-06, "loss": 0.4668, "step": 6593 }, { "epoch": 0.5355315520181921, "grad_norm": 5.754796284833849, "learning_rate": 2.334072464527628e-06, "loss": 0.3572, "step": 6594 }, { "epoch": 0.5356127669942338, "grad_norm": 4.316813249696164, "learning_rate": 2.333416298149199e-06, "loss": 0.4839, "step": 6595 }, { "epoch": 0.5356939819702753, "grad_norm": 4.974375179617703, "learning_rate": 2.3327601432974817e-06, "loss": 0.3869, "step": 6596 }, { "epoch": 0.5357751969463169, "grad_norm": 3.985538587500043, "learning_rate": 2.332104000017877e-06, "loss": 0.4175, "step": 6597 }, { "epoch": 0.5358564119223584, "grad_norm": 3.2055836403047064, "learning_rate": 2.3314478683557863e-06, "loss": 0.5612, "step": 6598 }, { "epoch": 0.5359376268984001, "grad_norm": 5.214849251275779, "learning_rate": 2.330791748356612e-06, "loss": 0.6605, "step": 6599 }, { "epoch": 0.5360188418744416, "grad_norm": 5.888955286356365, "learning_rate": 2.3301356400657527e-06, "loss": 0.4432, "step": 6600 }, { "epoch": 0.5361000568504832, "grad_norm": 4.505062527177114, "learning_rate": 2.3294795435286073e-06, "loss": 0.6324, "step": 6601 }, { "epoch": 0.5361812718265249, "grad_norm": 79.62251681114921, "learning_rate": 2.3288234587905767e-06, "loss": 0.4921, "step": 6602 }, { "epoch": 0.5362624868025664, "grad_norm": 7.158021669325846, "learning_rate": 2.328167385897056e-06, "loss": 0.4798, "step": 6603 }, { "epoch": 0.536343701778608, "grad_norm": 5.336534392507409, "learning_rate": 2.327511324893442e-06, "loss": 0.6895, "step": 6604 }, { "epoch": 0.5364249167546495, "grad_norm": 5.969891236348079, "learning_rate": 2.3268552758251327e-06, "loss": 0.5348, "step": 6605 }, { "epoch": 0.5365061317306912, "grad_norm": 5.339767902188447, "learning_rate": 2.3261992387375216e-06, "loss": 0.4408, "step": 6606 }, { "epoch": 0.5365873467067327, "grad_norm": 6.745067120318976, "learning_rate": 2.3255432136760026e-06, "loss": 0.5484, "step": 6607 }, { "epoch": 0.5366685616827743, "grad_norm": 6.071978645783486, "learning_rate": 2.324887200685971e-06, "loss": 0.5867, "step": 6608 }, { "epoch": 0.5367497766588158, "grad_norm": 6.2394195972445665, "learning_rate": 2.3242311998128182e-06, "loss": 0.5295, "step": 6609 }, { "epoch": 0.5368309916348575, "grad_norm": 5.667495576556501, "learning_rate": 2.3235752111019362e-06, "loss": 0.4169, "step": 6610 }, { "epoch": 0.536912206610899, "grad_norm": 7.823798452635792, "learning_rate": 2.3229192345987146e-06, "loss": 0.6219, "step": 6611 }, { "epoch": 0.5369934215869406, "grad_norm": 6.983866314486278, "learning_rate": 2.322263270348546e-06, "loss": 0.5665, "step": 6612 }, { "epoch": 0.5370746365629823, "grad_norm": 6.526144378674829, "learning_rate": 2.3216073183968184e-06, "loss": 0.3971, "step": 6613 }, { "epoch": 0.5371558515390238, "grad_norm": 10.243690491824887, "learning_rate": 2.320951378788919e-06, "loss": 0.6378, "step": 6614 }, { "epoch": 0.5372370665150654, "grad_norm": 4.954484713664851, "learning_rate": 2.3202954515702384e-06, "loss": 0.548, "step": 6615 }, { "epoch": 0.5373182814911069, "grad_norm": 5.184321630416869, "learning_rate": 2.3196395367861605e-06, "loss": 0.5172, "step": 6616 }, { "epoch": 0.5373994964671486, "grad_norm": 5.640278703192089, "learning_rate": 2.3189836344820717e-06, "loss": 0.4252, "step": 6617 }, { "epoch": 0.5374807114431901, "grad_norm": 6.756103044876508, "learning_rate": 2.318327744703358e-06, "loss": 0.5939, "step": 6618 }, { "epoch": 0.5375619264192317, "grad_norm": 4.587869437489592, "learning_rate": 2.317671867495403e-06, "loss": 0.5293, "step": 6619 }, { "epoch": 0.5376431413952732, "grad_norm": 5.744181498371175, "learning_rate": 2.317016002903589e-06, "loss": 0.4909, "step": 6620 }, { "epoch": 0.5377243563713149, "grad_norm": 5.53015069217161, "learning_rate": 2.3163601509733e-06, "loss": 0.52, "step": 6621 }, { "epoch": 0.5378055713473564, "grad_norm": 6.7221739959300555, "learning_rate": 2.3157043117499174e-06, "loss": 0.4972, "step": 6622 }, { "epoch": 0.537886786323398, "grad_norm": 4.038968186465272, "learning_rate": 2.3150484852788186e-06, "loss": 0.4686, "step": 6623 }, { "epoch": 0.5379680012994397, "grad_norm": 5.334775479299188, "learning_rate": 2.3143926716053876e-06, "loss": 0.4335, "step": 6624 }, { "epoch": 0.5380492162754812, "grad_norm": 8.59183543983034, "learning_rate": 2.3137368707750018e-06, "loss": 0.6309, "step": 6625 }, { "epoch": 0.5381304312515228, "grad_norm": 4.578718128339019, "learning_rate": 2.3130810828330375e-06, "loss": 0.6889, "step": 6626 }, { "epoch": 0.5382116462275643, "grad_norm": 7.819423595541969, "learning_rate": 2.3124253078248734e-06, "loss": 0.43, "step": 6627 }, { "epoch": 0.538292861203606, "grad_norm": 3.229990826836947, "learning_rate": 2.3117695457958857e-06, "loss": 0.7549, "step": 6628 }, { "epoch": 0.5383740761796475, "grad_norm": 4.562845996906024, "learning_rate": 2.3111137967914492e-06, "loss": 0.434, "step": 6629 }, { "epoch": 0.5384552911556891, "grad_norm": 5.915615501905179, "learning_rate": 2.310458060856937e-06, "loss": 0.3945, "step": 6630 }, { "epoch": 0.5385365061317307, "grad_norm": 10.277625736537166, "learning_rate": 2.3098023380377257e-06, "loss": 0.542, "step": 6631 }, { "epoch": 0.5386177211077723, "grad_norm": 3.9982941253903843, "learning_rate": 2.309146628379185e-06, "loss": 0.4956, "step": 6632 }, { "epoch": 0.5386989360838138, "grad_norm": 5.587736755912178, "learning_rate": 2.308490931926687e-06, "loss": 0.3932, "step": 6633 }, { "epoch": 0.5387801510598554, "grad_norm": 6.249822940854045, "learning_rate": 2.3078352487256045e-06, "loss": 0.551, "step": 6634 }, { "epoch": 0.5388613660358971, "grad_norm": 4.330260149960894, "learning_rate": 2.3071795788213047e-06, "loss": 0.419, "step": 6635 }, { "epoch": 0.5389425810119386, "grad_norm": 3.9213426854190554, "learning_rate": 2.3065239222591574e-06, "loss": 0.912, "step": 6636 }, { "epoch": 0.5390237959879802, "grad_norm": 5.204419720367973, "learning_rate": 2.3058682790845314e-06, "loss": 0.582, "step": 6637 }, { "epoch": 0.5391050109640217, "grad_norm": 6.739029726775385, "learning_rate": 2.3052126493427934e-06, "loss": 0.6209, "step": 6638 }, { "epoch": 0.5391862259400634, "grad_norm": 4.96555228670431, "learning_rate": 2.304557033079308e-06, "loss": 0.431, "step": 6639 }, { "epoch": 0.5392674409161049, "grad_norm": 6.401695121326637, "learning_rate": 2.303901430339442e-06, "loss": 0.3933, "step": 6640 }, { "epoch": 0.5393486558921465, "grad_norm": 6.539736744319487, "learning_rate": 2.30324584116856e-06, "loss": 0.5304, "step": 6641 }, { "epoch": 0.539429870868188, "grad_norm": 4.494895959996267, "learning_rate": 2.302590265612023e-06, "loss": 0.4851, "step": 6642 }, { "epoch": 0.5395110858442297, "grad_norm": 5.068900846135241, "learning_rate": 2.301934703715196e-06, "loss": 0.542, "step": 6643 }, { "epoch": 0.5395923008202712, "grad_norm": 3.670185354744509, "learning_rate": 2.301279155523439e-06, "loss": 0.4716, "step": 6644 }, { "epoch": 0.5396735157963128, "grad_norm": 3.9777847287273436, "learning_rate": 2.3006236210821127e-06, "loss": 0.4095, "step": 6645 }, { "epoch": 0.5397547307723545, "grad_norm": 8.621397414880517, "learning_rate": 2.2999681004365755e-06, "loss": 0.4291, "step": 6646 }, { "epoch": 0.539835945748396, "grad_norm": 4.348209634682363, "learning_rate": 2.299312593632189e-06, "loss": 0.5603, "step": 6647 }, { "epoch": 0.5399171607244376, "grad_norm": 5.453003219785878, "learning_rate": 2.298657100714308e-06, "loss": 0.4495, "step": 6648 }, { "epoch": 0.5399983757004791, "grad_norm": 6.222935801462418, "learning_rate": 2.2980016217282892e-06, "loss": 0.5602, "step": 6649 }, { "epoch": 0.5400795906765208, "grad_norm": 6.456779266203264, "learning_rate": 2.2973461567194903e-06, "loss": 0.6055, "step": 6650 }, { "epoch": 0.5401608056525623, "grad_norm": 2.812177928800366, "learning_rate": 2.296690705733265e-06, "loss": 0.5931, "step": 6651 }, { "epoch": 0.5402420206286039, "grad_norm": 5.628644054902416, "learning_rate": 2.2960352688149657e-06, "loss": 0.5323, "step": 6652 }, { "epoch": 0.5403232356046455, "grad_norm": 6.158902153206404, "learning_rate": 2.295379846009947e-06, "loss": 0.6064, "step": 6653 }, { "epoch": 0.5404044505806871, "grad_norm": 4.596838238746014, "learning_rate": 2.2947244373635608e-06, "loss": 0.5383, "step": 6654 }, { "epoch": 0.5404856655567286, "grad_norm": 10.141289999607887, "learning_rate": 2.294069042921156e-06, "loss": 0.4802, "step": 6655 }, { "epoch": 0.5405668805327702, "grad_norm": 3.56367135718309, "learning_rate": 2.2934136627280834e-06, "loss": 0.5922, "step": 6656 }, { "epoch": 0.5406480955088119, "grad_norm": 4.957818406371747, "learning_rate": 2.292758296829693e-06, "loss": 0.7018, "step": 6657 }, { "epoch": 0.5407293104848534, "grad_norm": 4.552129764687744, "learning_rate": 2.2921029452713305e-06, "loss": 0.5328, "step": 6658 }, { "epoch": 0.540810525460895, "grad_norm": 5.962108877745079, "learning_rate": 2.291447608098345e-06, "loss": 0.5459, "step": 6659 }, { "epoch": 0.5408917404369366, "grad_norm": 3.831361014930282, "learning_rate": 2.290792285356081e-06, "loss": 0.445, "step": 6660 }, { "epoch": 0.5409729554129782, "grad_norm": 4.039508905590606, "learning_rate": 2.290136977089883e-06, "loss": 0.6447, "step": 6661 }, { "epoch": 0.5410541703890197, "grad_norm": 4.701839781470267, "learning_rate": 2.289481683345096e-06, "loss": 0.4856, "step": 6662 }, { "epoch": 0.5411353853650613, "grad_norm": 5.28893125359557, "learning_rate": 2.2888264041670625e-06, "loss": 0.3886, "step": 6663 }, { "epoch": 0.5412166003411029, "grad_norm": 4.226127643770201, "learning_rate": 2.288171139601124e-06, "loss": 0.451, "step": 6664 }, { "epoch": 0.5412978153171445, "grad_norm": 4.150106558822198, "learning_rate": 2.287515889692621e-06, "loss": 0.585, "step": 6665 }, { "epoch": 0.541379030293186, "grad_norm": 7.149187546807345, "learning_rate": 2.2868606544868947e-06, "loss": 0.5057, "step": 6666 }, { "epoch": 0.5414602452692276, "grad_norm": 4.941704759283249, "learning_rate": 2.2862054340292835e-06, "loss": 0.5628, "step": 6667 }, { "epoch": 0.5415414602452693, "grad_norm": 6.843536039362226, "learning_rate": 2.2855502283651238e-06, "loss": 0.5538, "step": 6668 }, { "epoch": 0.5416226752213108, "grad_norm": 6.747892070688352, "learning_rate": 2.284895037539753e-06, "loss": 0.5538, "step": 6669 }, { "epoch": 0.5417038901973524, "grad_norm": 4.985721031141658, "learning_rate": 2.2842398615985086e-06, "loss": 0.5889, "step": 6670 }, { "epoch": 0.541785105173394, "grad_norm": 3.9696526712453686, "learning_rate": 2.283584700586723e-06, "loss": 0.4235, "step": 6671 }, { "epoch": 0.5418663201494356, "grad_norm": 4.370310826153604, "learning_rate": 2.2829295545497304e-06, "loss": 0.565, "step": 6672 }, { "epoch": 0.5419475351254771, "grad_norm": 7.303076408805493, "learning_rate": 2.282274423532865e-06, "loss": 0.4592, "step": 6673 }, { "epoch": 0.5420287501015187, "grad_norm": 5.82847333716156, "learning_rate": 2.2816193075814557e-06, "loss": 0.519, "step": 6674 }, { "epoch": 0.5421099650775603, "grad_norm": 3.1507733594386704, "learning_rate": 2.280964206740835e-06, "loss": 0.5885, "step": 6675 }, { "epoch": 0.5421911800536019, "grad_norm": 5.983802017759874, "learning_rate": 2.280309121056333e-06, "loss": 0.4888, "step": 6676 }, { "epoch": 0.5422723950296434, "grad_norm": 3.19378590139564, "learning_rate": 2.279654050573276e-06, "loss": 0.4794, "step": 6677 }, { "epoch": 0.542353610005685, "grad_norm": 4.270799718455597, "learning_rate": 2.2789989953369924e-06, "loss": 0.5408, "step": 6678 }, { "epoch": 0.5424348249817267, "grad_norm": 5.277701316589272, "learning_rate": 2.27834395539281e-06, "loss": 0.4466, "step": 6679 }, { "epoch": 0.5425160399577682, "grad_norm": 5.962388380985894, "learning_rate": 2.2776889307860513e-06, "loss": 0.4871, "step": 6680 }, { "epoch": 0.5425972549338098, "grad_norm": 4.937531617716085, "learning_rate": 2.2770339215620433e-06, "loss": 0.4022, "step": 6681 }, { "epoch": 0.5426784699098514, "grad_norm": 4.989377028856358, "learning_rate": 2.2763789277661077e-06, "loss": 0.4676, "step": 6682 }, { "epoch": 0.542759684885893, "grad_norm": 3.814875085144808, "learning_rate": 2.2757239494435666e-06, "loss": 0.4234, "step": 6683 }, { "epoch": 0.5428408998619345, "grad_norm": 6.832068792784659, "learning_rate": 2.2750689866397407e-06, "loss": 0.6319, "step": 6684 }, { "epoch": 0.5429221148379761, "grad_norm": 26.304115965543602, "learning_rate": 2.2744140393999507e-06, "loss": 0.3713, "step": 6685 }, { "epoch": 0.5430033298140177, "grad_norm": 7.2769279119321135, "learning_rate": 2.273759107769516e-06, "loss": 0.6063, "step": 6686 }, { "epoch": 0.5430845447900593, "grad_norm": 4.819897180382021, "learning_rate": 2.2731041917937524e-06, "loss": 0.5373, "step": 6687 }, { "epoch": 0.5431657597661008, "grad_norm": 5.898798059857416, "learning_rate": 2.2724492915179787e-06, "loss": 0.5572, "step": 6688 }, { "epoch": 0.5432469747421425, "grad_norm": 4.441345383933851, "learning_rate": 2.27179440698751e-06, "loss": 0.6711, "step": 6689 }, { "epoch": 0.5433281897181841, "grad_norm": 3.8157128193761185, "learning_rate": 2.2711395382476595e-06, "loss": 0.6712, "step": 6690 }, { "epoch": 0.5434094046942256, "grad_norm": 4.388902512240195, "learning_rate": 2.2704846853437424e-06, "loss": 0.5042, "step": 6691 }, { "epoch": 0.5434906196702672, "grad_norm": 7.873275069114776, "learning_rate": 2.269829848321071e-06, "loss": 0.4374, "step": 6692 }, { "epoch": 0.5435718346463088, "grad_norm": 5.506546269665942, "learning_rate": 2.2691750272249545e-06, "loss": 0.3936, "step": 6693 }, { "epoch": 0.5436530496223504, "grad_norm": 4.315922170242298, "learning_rate": 2.2685202221007057e-06, "loss": 0.66, "step": 6694 }, { "epoch": 0.5437342645983919, "grad_norm": 41.128866529656484, "learning_rate": 2.2678654329936322e-06, "loss": 0.4566, "step": 6695 }, { "epoch": 0.5438154795744335, "grad_norm": 4.547047318098447, "learning_rate": 2.267210659949042e-06, "loss": 0.5562, "step": 6696 }, { "epoch": 0.5438966945504751, "grad_norm": 5.996426610029105, "learning_rate": 2.2665559030122424e-06, "loss": 0.4198, "step": 6697 }, { "epoch": 0.5439779095265167, "grad_norm": 4.366420114566318, "learning_rate": 2.2659011622285383e-06, "loss": 0.468, "step": 6698 }, { "epoch": 0.5440591245025582, "grad_norm": 4.742561220461673, "learning_rate": 2.265246437643236e-06, "loss": 0.4206, "step": 6699 }, { "epoch": 0.5441403394785999, "grad_norm": 5.981908620139316, "learning_rate": 2.2645917293016363e-06, "loss": 0.6577, "step": 6700 }, { "epoch": 0.5442215544546415, "grad_norm": 3.9854122807772723, "learning_rate": 2.2639370372490434e-06, "loss": 0.442, "step": 6701 }, { "epoch": 0.544302769430683, "grad_norm": 4.317869994988899, "learning_rate": 2.263282361530759e-06, "loss": 0.6317, "step": 6702 }, { "epoch": 0.5443839844067246, "grad_norm": 3.2344627144715874, "learning_rate": 2.2626277021920813e-06, "loss": 0.4615, "step": 6703 }, { "epoch": 0.5444651993827662, "grad_norm": 6.42718836422913, "learning_rate": 2.2619730592783108e-06, "loss": 0.5471, "step": 6704 }, { "epoch": 0.5445464143588078, "grad_norm": 4.488283000655936, "learning_rate": 2.2613184328347453e-06, "loss": 0.4092, "step": 6705 }, { "epoch": 0.5446276293348493, "grad_norm": 6.305967356057076, "learning_rate": 2.2606638229066802e-06, "loss": 0.6374, "step": 6706 }, { "epoch": 0.544708844310891, "grad_norm": 4.2258432165081565, "learning_rate": 2.2600092295394125e-06, "loss": 0.5745, "step": 6707 }, { "epoch": 0.5447900592869325, "grad_norm": 4.384272018008662, "learning_rate": 2.2593546527782362e-06, "loss": 0.433, "step": 6708 }, { "epoch": 0.5448712742629741, "grad_norm": 6.576525934865906, "learning_rate": 2.2587000926684432e-06, "loss": 0.5326, "step": 6709 }, { "epoch": 0.5449524892390156, "grad_norm": 5.555291917382588, "learning_rate": 2.258045549255328e-06, "loss": 0.4881, "step": 6710 }, { "epoch": 0.5450337042150573, "grad_norm": 5.546197578764206, "learning_rate": 2.25739102258418e-06, "loss": 0.479, "step": 6711 }, { "epoch": 0.5451149191910989, "grad_norm": 6.1439652570444325, "learning_rate": 2.256736512700288e-06, "loss": 0.4543, "step": 6712 }, { "epoch": 0.5451961341671404, "grad_norm": 5.653213757078279, "learning_rate": 2.2560820196489437e-06, "loss": 0.5768, "step": 6713 }, { "epoch": 0.545277349143182, "grad_norm": 4.21691379783433, "learning_rate": 2.255427543475432e-06, "loss": 0.4927, "step": 6714 }, { "epoch": 0.5453585641192236, "grad_norm": 6.798777933865642, "learning_rate": 2.254773084225039e-06, "loss": 0.4237, "step": 6715 }, { "epoch": 0.5454397790952652, "grad_norm": 4.819909494343674, "learning_rate": 2.254118641943052e-06, "loss": 0.4288, "step": 6716 }, { "epoch": 0.5455209940713067, "grad_norm": 7.439931633022933, "learning_rate": 2.253464216674753e-06, "loss": 0.482, "step": 6717 }, { "epoch": 0.5456022090473484, "grad_norm": 7.792402778020558, "learning_rate": 2.2528098084654262e-06, "loss": 0.614, "step": 6718 }, { "epoch": 0.5456834240233899, "grad_norm": 4.495435217507245, "learning_rate": 2.2521554173603513e-06, "loss": 0.4562, "step": 6719 }, { "epoch": 0.5457646389994315, "grad_norm": 4.419124752755961, "learning_rate": 2.25150104340481e-06, "loss": 0.5417, "step": 6720 }, { "epoch": 0.545845853975473, "grad_norm": 6.332131149232858, "learning_rate": 2.2508466866440824e-06, "loss": 0.4154, "step": 6721 }, { "epoch": 0.5459270689515147, "grad_norm": 6.610876686409083, "learning_rate": 2.2501923471234444e-06, "loss": 0.49, "step": 6722 }, { "epoch": 0.5460082839275563, "grad_norm": 17.017885967526706, "learning_rate": 2.249538024888174e-06, "loss": 0.4733, "step": 6723 }, { "epoch": 0.5460894989035978, "grad_norm": 7.28235162312456, "learning_rate": 2.2488837199835477e-06, "loss": 0.6985, "step": 6724 }, { "epoch": 0.5461707138796394, "grad_norm": 6.165256861279543, "learning_rate": 2.2482294324548376e-06, "loss": 0.5424, "step": 6725 }, { "epoch": 0.546251928855681, "grad_norm": 7.302972322656833, "learning_rate": 2.2475751623473193e-06, "loss": 0.6049, "step": 6726 }, { "epoch": 0.5463331438317226, "grad_norm": 4.71990096240607, "learning_rate": 2.2469209097062637e-06, "loss": 0.4342, "step": 6727 }, { "epoch": 0.5464143588077641, "grad_norm": 6.900667478651783, "learning_rate": 2.246266674576941e-06, "loss": 0.4572, "step": 6728 }, { "epoch": 0.5464955737838058, "grad_norm": 2.982802060269296, "learning_rate": 2.245612457004622e-06, "loss": 0.4723, "step": 6729 }, { "epoch": 0.5465767887598473, "grad_norm": 4.176653744332102, "learning_rate": 2.244958257034575e-06, "loss": 0.4191, "step": 6730 }, { "epoch": 0.5466580037358889, "grad_norm": 4.511011865460297, "learning_rate": 2.244304074712066e-06, "loss": 0.4973, "step": 6731 }, { "epoch": 0.5467392187119304, "grad_norm": 4.202369107086116, "learning_rate": 2.243649910082363e-06, "loss": 0.6125, "step": 6732 }, { "epoch": 0.5468204336879721, "grad_norm": 7.339054515036435, "learning_rate": 2.2429957631907285e-06, "loss": 0.444, "step": 6733 }, { "epoch": 0.5469016486640137, "grad_norm": 4.854339006065371, "learning_rate": 2.2423416340824266e-06, "loss": 0.4205, "step": 6734 }, { "epoch": 0.5469828636400552, "grad_norm": 5.693035824826316, "learning_rate": 2.241687522802721e-06, "loss": 0.5089, "step": 6735 }, { "epoch": 0.5470640786160968, "grad_norm": 4.855926081415383, "learning_rate": 2.2410334293968716e-06, "loss": 0.43, "step": 6736 }, { "epoch": 0.5471452935921384, "grad_norm": 4.266917082773724, "learning_rate": 2.2403793539101387e-06, "loss": 0.4145, "step": 6737 }, { "epoch": 0.54722650856818, "grad_norm": 5.411690627496338, "learning_rate": 2.2397252963877795e-06, "loss": 0.5073, "step": 6738 }, { "epoch": 0.5473077235442215, "grad_norm": 2.6098814011197096, "learning_rate": 2.239071256875053e-06, "loss": 0.5423, "step": 6739 }, { "epoch": 0.5473889385202632, "grad_norm": 5.3581580919616885, "learning_rate": 2.238417235417214e-06, "loss": 0.4069, "step": 6740 }, { "epoch": 0.5474701534963047, "grad_norm": 4.544117591382317, "learning_rate": 2.237763232059518e-06, "loss": 0.4756, "step": 6741 }, { "epoch": 0.5475513684723463, "grad_norm": 18.0511878541294, "learning_rate": 2.2371092468472193e-06, "loss": 0.5058, "step": 6742 }, { "epoch": 0.5476325834483878, "grad_norm": 6.268300319177437, "learning_rate": 2.236455279825569e-06, "loss": 0.4712, "step": 6743 }, { "epoch": 0.5477137984244295, "grad_norm": 3.663452778757879, "learning_rate": 2.2358013310398174e-06, "loss": 0.5031, "step": 6744 }, { "epoch": 0.5477950134004711, "grad_norm": 4.936451070405641, "learning_rate": 2.235147400535217e-06, "loss": 0.4914, "step": 6745 }, { "epoch": 0.5478762283765126, "grad_norm": 5.976409510815185, "learning_rate": 2.2344934883570143e-06, "loss": 0.5417, "step": 6746 }, { "epoch": 0.5479574433525543, "grad_norm": 7.332839588724573, "learning_rate": 2.2338395945504557e-06, "loss": 0.525, "step": 6747 }, { "epoch": 0.5480386583285958, "grad_norm": 5.026062475739817, "learning_rate": 2.23318571916079e-06, "loss": 0.6906, "step": 6748 }, { "epoch": 0.5481198733046374, "grad_norm": 5.070833605866744, "learning_rate": 2.2325318622332606e-06, "loss": 0.4563, "step": 6749 }, { "epoch": 0.5482010882806789, "grad_norm": 4.9673547801455955, "learning_rate": 2.2318780238131095e-06, "loss": 0.5338, "step": 6750 }, { "epoch": 0.5482823032567206, "grad_norm": 3.9729119791581744, "learning_rate": 2.2312242039455816e-06, "loss": 0.503, "step": 6751 }, { "epoch": 0.5483635182327621, "grad_norm": 3.9917626185437878, "learning_rate": 2.230570402675916e-06, "loss": 0.3857, "step": 6752 }, { "epoch": 0.5484447332088037, "grad_norm": 4.014300920446785, "learning_rate": 2.2299166200493526e-06, "loss": 0.4657, "step": 6753 }, { "epoch": 0.5485259481848452, "grad_norm": 5.148556943279063, "learning_rate": 2.2292628561111285e-06, "loss": 0.4921, "step": 6754 }, { "epoch": 0.5486071631608869, "grad_norm": 3.2886345422623715, "learning_rate": 2.228609110906483e-06, "loss": 0.4815, "step": 6755 }, { "epoch": 0.5486883781369285, "grad_norm": 9.805600602297103, "learning_rate": 2.2279553844806506e-06, "loss": 0.5324, "step": 6756 }, { "epoch": 0.54876959311297, "grad_norm": 7.9338566902011385, "learning_rate": 2.2273016768788653e-06, "loss": 0.4963, "step": 6757 }, { "epoch": 0.5488508080890117, "grad_norm": 4.352389013230028, "learning_rate": 2.2266479881463614e-06, "loss": 0.4975, "step": 6758 }, { "epoch": 0.5489320230650532, "grad_norm": 5.035317373817535, "learning_rate": 2.2259943183283696e-06, "loss": 0.5316, "step": 6759 }, { "epoch": 0.5490132380410948, "grad_norm": 6.329671082512638, "learning_rate": 2.2253406674701206e-06, "loss": 0.4758, "step": 6760 }, { "epoch": 0.5490944530171363, "grad_norm": 4.350812102992751, "learning_rate": 2.2246870356168447e-06, "loss": 0.3731, "step": 6761 }, { "epoch": 0.549175667993178, "grad_norm": 4.411297111261324, "learning_rate": 2.224033422813768e-06, "loss": 0.4601, "step": 6762 }, { "epoch": 0.5492568829692195, "grad_norm": 3.3546196364460172, "learning_rate": 2.2233798291061177e-06, "loss": 0.5048, "step": 6763 }, { "epoch": 0.5493380979452611, "grad_norm": 5.75969034058185, "learning_rate": 2.2227262545391204e-06, "loss": 0.5135, "step": 6764 }, { "epoch": 0.5494193129213026, "grad_norm": 4.676293581342606, "learning_rate": 2.222072699157998e-06, "loss": 0.382, "step": 6765 }, { "epoch": 0.5495005278973443, "grad_norm": 4.301771743349808, "learning_rate": 2.2214191630079733e-06, "loss": 0.4019, "step": 6766 }, { "epoch": 0.5495817428733859, "grad_norm": 8.356786267103574, "learning_rate": 2.2207656461342696e-06, "loss": 0.486, "step": 6767 }, { "epoch": 0.5496629578494274, "grad_norm": 5.187735340758213, "learning_rate": 2.2201121485821053e-06, "loss": 0.5106, "step": 6768 }, { "epoch": 0.5497441728254691, "grad_norm": 14.002790250484441, "learning_rate": 2.2194586703966976e-06, "loss": 0.5786, "step": 6769 }, { "epoch": 0.5498253878015106, "grad_norm": 4.598645970086092, "learning_rate": 2.218805211623266e-06, "loss": 0.6202, "step": 6770 }, { "epoch": 0.5499066027775522, "grad_norm": 7.035714823525913, "learning_rate": 2.2181517723070263e-06, "loss": 0.3767, "step": 6771 }, { "epoch": 0.5499878177535937, "grad_norm": 3.789629383739064, "learning_rate": 2.2174983524931916e-06, "loss": 0.4665, "step": 6772 }, { "epoch": 0.5500690327296354, "grad_norm": 5.153278702932309, "learning_rate": 2.216844952226975e-06, "loss": 0.5535, "step": 6773 }, { "epoch": 0.5501502477056769, "grad_norm": 5.090968399667655, "learning_rate": 2.2161915715535903e-06, "loss": 0.5416, "step": 6774 }, { "epoch": 0.5502314626817185, "grad_norm": 4.880285899644429, "learning_rate": 2.2155382105182462e-06, "loss": 0.528, "step": 6775 }, { "epoch": 0.55031267765776, "grad_norm": 7.133831222495926, "learning_rate": 2.214884869166152e-06, "loss": 0.4645, "step": 6776 }, { "epoch": 0.5503938926338017, "grad_norm": 5.111553296411102, "learning_rate": 2.214231547542517e-06, "loss": 0.5628, "step": 6777 }, { "epoch": 0.5504751076098433, "grad_norm": 4.076454306491572, "learning_rate": 2.213578245692546e-06, "loss": 0.4838, "step": 6778 }, { "epoch": 0.5505563225858848, "grad_norm": 5.211719936927261, "learning_rate": 2.2129249636614443e-06, "loss": 0.563, "step": 6779 }, { "epoch": 0.5506375375619265, "grad_norm": 12.302787876796701, "learning_rate": 2.2122717014944167e-06, "loss": 0.4786, "step": 6780 }, { "epoch": 0.550718752537968, "grad_norm": 4.330340043687264, "learning_rate": 2.2116184592366643e-06, "loss": 0.5747, "step": 6781 }, { "epoch": 0.5507999675140096, "grad_norm": 5.034937065269623, "learning_rate": 2.2109652369333873e-06, "loss": 0.4252, "step": 6782 }, { "epoch": 0.5508811824900511, "grad_norm": 7.614248716990825, "learning_rate": 2.2103120346297864e-06, "loss": 0.4404, "step": 6783 }, { "epoch": 0.5509623974660928, "grad_norm": 7.1625705175917425, "learning_rate": 2.2096588523710606e-06, "loss": 0.5373, "step": 6784 }, { "epoch": 0.5510436124421343, "grad_norm": 4.064726317144616, "learning_rate": 2.2090056902024045e-06, "loss": 0.4773, "step": 6785 }, { "epoch": 0.5511248274181759, "grad_norm": 4.1799707253524065, "learning_rate": 2.208352548169015e-06, "loss": 0.5596, "step": 6786 }, { "epoch": 0.5512060423942174, "grad_norm": 5.2430508668568425, "learning_rate": 2.2076994263160863e-06, "loss": 0.5455, "step": 6787 }, { "epoch": 0.5512872573702591, "grad_norm": 5.514510034389823, "learning_rate": 2.2070463246888094e-06, "loss": 0.4733, "step": 6788 }, { "epoch": 0.5513684723463007, "grad_norm": 4.50738218564209, "learning_rate": 2.206393243332376e-06, "loss": 0.5448, "step": 6789 }, { "epoch": 0.5514496873223422, "grad_norm": 4.34724451667795, "learning_rate": 2.2057401822919775e-06, "loss": 0.5013, "step": 6790 }, { "epoch": 0.5515309022983839, "grad_norm": 5.578815799717539, "learning_rate": 2.2050871416128005e-06, "loss": 0.5814, "step": 6791 }, { "epoch": 0.5516121172744254, "grad_norm": 6.701219984709083, "learning_rate": 2.204434121340032e-06, "loss": 0.3959, "step": 6792 }, { "epoch": 0.551693332250467, "grad_norm": 3.6134666388120404, "learning_rate": 2.203781121518859e-06, "loss": 0.536, "step": 6793 }, { "epoch": 0.5517745472265085, "grad_norm": 6.117283840596765, "learning_rate": 2.2031281421944643e-06, "loss": 0.4515, "step": 6794 }, { "epoch": 0.5518557622025502, "grad_norm": 3.9112049782621003, "learning_rate": 2.2024751834120302e-06, "loss": 0.4622, "step": 6795 }, { "epoch": 0.5519369771785917, "grad_norm": 3.8079103420465437, "learning_rate": 2.20182224521674e-06, "loss": 0.5821, "step": 6796 }, { "epoch": 0.5520181921546333, "grad_norm": 5.019201884297034, "learning_rate": 2.2011693276537722e-06, "loss": 0.6276, "step": 6797 }, { "epoch": 0.5520994071306748, "grad_norm": 5.4407005403339825, "learning_rate": 2.2005164307683047e-06, "loss": 0.4406, "step": 6798 }, { "epoch": 0.5521806221067165, "grad_norm": 4.301332089064263, "learning_rate": 2.199863554605515e-06, "loss": 0.5167, "step": 6799 }, { "epoch": 0.5522618370827581, "grad_norm": 7.493043226544131, "learning_rate": 2.19921069921058e-06, "loss": 0.5069, "step": 6800 }, { "epoch": 0.5523430520587996, "grad_norm": 3.8437035341706967, "learning_rate": 2.1985578646286717e-06, "loss": 0.6306, "step": 6801 }, { "epoch": 0.5524242670348413, "grad_norm": 3.805644122254566, "learning_rate": 2.197905050904964e-06, "loss": 0.4094, "step": 6802 }, { "epoch": 0.5525054820108828, "grad_norm": 6.899378255822541, "learning_rate": 2.197252258084629e-06, "loss": 0.4034, "step": 6803 }, { "epoch": 0.5525866969869244, "grad_norm": 7.710934181517784, "learning_rate": 2.196599486212834e-06, "loss": 0.5314, "step": 6804 }, { "epoch": 0.5526679119629659, "grad_norm": 3.3575643225535856, "learning_rate": 2.1959467353347494e-06, "loss": 0.4281, "step": 6805 }, { "epoch": 0.5527491269390076, "grad_norm": 3.714838991436428, "learning_rate": 2.195294005495542e-06, "loss": 0.5778, "step": 6806 }, { "epoch": 0.5528303419150491, "grad_norm": 6.171375505356923, "learning_rate": 2.1946412967403763e-06, "loss": 0.5032, "step": 6807 }, { "epoch": 0.5529115568910907, "grad_norm": 5.843750427655172, "learning_rate": 2.1939886091144165e-06, "loss": 0.4943, "step": 6808 }, { "epoch": 0.5529927718671322, "grad_norm": 3.222741223173242, "learning_rate": 2.193335942662826e-06, "loss": 0.4707, "step": 6809 }, { "epoch": 0.5530739868431739, "grad_norm": 3.916188730239071, "learning_rate": 2.192683297430766e-06, "loss": 0.4572, "step": 6810 }, { "epoch": 0.5531552018192155, "grad_norm": 4.698273133823479, "learning_rate": 2.1920306734633932e-06, "loss": 0.4173, "step": 6811 }, { "epoch": 0.553236416795257, "grad_norm": 5.974971528842556, "learning_rate": 2.1913780708058694e-06, "loss": 0.5129, "step": 6812 }, { "epoch": 0.5533176317712987, "grad_norm": 4.899970058534168, "learning_rate": 2.19072548950335e-06, "loss": 0.4309, "step": 6813 }, { "epoch": 0.5533988467473402, "grad_norm": 4.224538543746903, "learning_rate": 2.190072929600989e-06, "loss": 0.6603, "step": 6814 }, { "epoch": 0.5534800617233818, "grad_norm": 9.421465257051333, "learning_rate": 2.189420391143941e-06, "loss": 0.4734, "step": 6815 }, { "epoch": 0.5535612766994233, "grad_norm": 6.665428434932173, "learning_rate": 2.1887678741773592e-06, "loss": 0.5195, "step": 6816 }, { "epoch": 0.553642491675465, "grad_norm": 2.89387484029611, "learning_rate": 2.188115378746392e-06, "loss": 0.4763, "step": 6817 }, { "epoch": 0.5537237066515065, "grad_norm": 6.823773735416532, "learning_rate": 2.1874629048961904e-06, "loss": 0.5321, "step": 6818 }, { "epoch": 0.5538049216275481, "grad_norm": 4.521778809799006, "learning_rate": 2.1868104526719023e-06, "loss": 0.5816, "step": 6819 }, { "epoch": 0.5538861366035897, "grad_norm": 3.3396433922902906, "learning_rate": 2.1861580221186726e-06, "loss": 0.4168, "step": 6820 }, { "epoch": 0.5539673515796313, "grad_norm": 3.3374583977198036, "learning_rate": 2.185505613281647e-06, "loss": 0.4614, "step": 6821 }, { "epoch": 0.5540485665556729, "grad_norm": 6.068612637720992, "learning_rate": 2.1848532262059696e-06, "loss": 0.5797, "step": 6822 }, { "epoch": 0.5541297815317144, "grad_norm": 5.0334311376181144, "learning_rate": 2.1842008609367794e-06, "loss": 0.6781, "step": 6823 }, { "epoch": 0.5542109965077561, "grad_norm": 4.81150612829465, "learning_rate": 2.183548517519219e-06, "loss": 0.4113, "step": 6824 }, { "epoch": 0.5542922114837976, "grad_norm": 7.137103277777565, "learning_rate": 2.1828961959984267e-06, "loss": 0.502, "step": 6825 }, { "epoch": 0.5543734264598392, "grad_norm": 4.93103788302294, "learning_rate": 2.18224389641954e-06, "loss": 0.3781, "step": 6826 }, { "epoch": 0.5544546414358807, "grad_norm": 4.17706299919582, "learning_rate": 2.1815916188276925e-06, "loss": 0.5585, "step": 6827 }, { "epoch": 0.5545358564119224, "grad_norm": 6.194948711383427, "learning_rate": 2.18093936326802e-06, "loss": 0.5219, "step": 6828 }, { "epoch": 0.5546170713879639, "grad_norm": 7.761708704297234, "learning_rate": 2.180287129785656e-06, "loss": 0.4665, "step": 6829 }, { "epoch": 0.5546982863640055, "grad_norm": 5.591231620591249, "learning_rate": 2.1796349184257294e-06, "loss": 0.4973, "step": 6830 }, { "epoch": 0.554779501340047, "grad_norm": 3.9216643074294595, "learning_rate": 2.1789827292333717e-06, "loss": 0.3763, "step": 6831 }, { "epoch": 0.5548607163160887, "grad_norm": 4.444201370705435, "learning_rate": 2.1783305622537106e-06, "loss": 0.522, "step": 6832 }, { "epoch": 0.5549419312921303, "grad_norm": 3.5234577542993972, "learning_rate": 2.1776784175318705e-06, "loss": 0.6173, "step": 6833 }, { "epoch": 0.5550231462681718, "grad_norm": 5.522357961927791, "learning_rate": 2.1770262951129792e-06, "loss": 0.4304, "step": 6834 }, { "epoch": 0.5551043612442135, "grad_norm": 2.753675801015695, "learning_rate": 2.1763741950421595e-06, "loss": 0.5176, "step": 6835 }, { "epoch": 0.555185576220255, "grad_norm": 5.5643466943030715, "learning_rate": 2.175722117364531e-06, "loss": 0.6027, "step": 6836 }, { "epoch": 0.5552667911962966, "grad_norm": 2.6499029128692957, "learning_rate": 2.175070062125217e-06, "loss": 0.472, "step": 6837 }, { "epoch": 0.5553480061723381, "grad_norm": 11.241112599357884, "learning_rate": 2.1744180293693355e-06, "loss": 0.5573, "step": 6838 }, { "epoch": 0.5554292211483798, "grad_norm": 5.094638009416633, "learning_rate": 2.173766019142002e-06, "loss": 0.4747, "step": 6839 }, { "epoch": 0.5555104361244213, "grad_norm": 6.191994268871611, "learning_rate": 2.1731140314883346e-06, "loss": 0.5245, "step": 6840 }, { "epoch": 0.5555916511004629, "grad_norm": 4.2743419408023495, "learning_rate": 2.1724620664534453e-06, "loss": 0.508, "step": 6841 }, { "epoch": 0.5556728660765045, "grad_norm": 12.369287470212933, "learning_rate": 2.1718101240824485e-06, "loss": 0.6018, "step": 6842 }, { "epoch": 0.5557540810525461, "grad_norm": 6.34277452834017, "learning_rate": 2.171158204420453e-06, "loss": 0.5897, "step": 6843 }, { "epoch": 0.5558352960285877, "grad_norm": 3.6842271055521763, "learning_rate": 2.17050630751257e-06, "loss": 0.3811, "step": 6844 }, { "epoch": 0.5559165110046292, "grad_norm": 4.1774312445120545, "learning_rate": 2.169854433403907e-06, "loss": 0.7328, "step": 6845 }, { "epoch": 0.5559977259806709, "grad_norm": 4.702814140349739, "learning_rate": 2.169202582139569e-06, "loss": 0.5899, "step": 6846 }, { "epoch": 0.5560789409567124, "grad_norm": 5.095589215614225, "learning_rate": 2.1685507537646622e-06, "loss": 0.4751, "step": 6847 }, { "epoch": 0.556160155932754, "grad_norm": 5.703171345481269, "learning_rate": 2.1678989483242896e-06, "loss": 0.4403, "step": 6848 }, { "epoch": 0.5562413709087956, "grad_norm": 4.75054750690896, "learning_rate": 2.1672471658635506e-06, "loss": 0.5135, "step": 6849 }, { "epoch": 0.5563225858848372, "grad_norm": 3.3902339048047305, "learning_rate": 2.166595406427548e-06, "loss": 0.6971, "step": 6850 }, { "epoch": 0.5564038008608787, "grad_norm": 5.852734969187135, "learning_rate": 2.1659436700613787e-06, "loss": 0.4691, "step": 6851 }, { "epoch": 0.5564850158369203, "grad_norm": 3.727932064595546, "learning_rate": 2.1652919568101386e-06, "loss": 0.5747, "step": 6852 }, { "epoch": 0.5565662308129619, "grad_norm": 5.153139343207781, "learning_rate": 2.1646402667189245e-06, "loss": 0.4892, "step": 6853 }, { "epoch": 0.5566474457890035, "grad_norm": 3.8440190223031125, "learning_rate": 2.1639885998328293e-06, "loss": 0.4627, "step": 6854 }, { "epoch": 0.5567286607650451, "grad_norm": 8.606115530142445, "learning_rate": 2.1633369561969435e-06, "loss": 0.4342, "step": 6855 }, { "epoch": 0.5568098757410866, "grad_norm": 3.6671273924850407, "learning_rate": 2.1626853358563595e-06, "loss": 0.5365, "step": 6856 }, { "epoch": 0.5568910907171283, "grad_norm": 3.835808189282639, "learning_rate": 2.162033738856165e-06, "loss": 0.491, "step": 6857 }, { "epoch": 0.5569723056931698, "grad_norm": 6.132790993906602, "learning_rate": 2.161382165241446e-06, "loss": 0.4935, "step": 6858 }, { "epoch": 0.5570535206692114, "grad_norm": 4.4718395237709485, "learning_rate": 2.1607306150572905e-06, "loss": 0.5225, "step": 6859 }, { "epoch": 0.557134735645253, "grad_norm": 5.4291132456188445, "learning_rate": 2.1600790883487805e-06, "loss": 0.4914, "step": 6860 }, { "epoch": 0.5572159506212946, "grad_norm": 3.396009857568952, "learning_rate": 2.159427585160999e-06, "loss": 0.6867, "step": 6861 }, { "epoch": 0.5572971655973361, "grad_norm": 6.2945750984242945, "learning_rate": 2.1587761055390247e-06, "loss": 0.3675, "step": 6862 }, { "epoch": 0.5573783805733777, "grad_norm": 10.344490630116207, "learning_rate": 2.1581246495279388e-06, "loss": 0.4676, "step": 6863 }, { "epoch": 0.5574595955494193, "grad_norm": 4.03264581660929, "learning_rate": 2.1574732171728187e-06, "loss": 0.5014, "step": 6864 }, { "epoch": 0.5575408105254609, "grad_norm": 5.595299416089842, "learning_rate": 2.1568218085187375e-06, "loss": 0.5567, "step": 6865 }, { "epoch": 0.5576220255015025, "grad_norm": 4.615982346308546, "learning_rate": 2.1561704236107715e-06, "loss": 0.4441, "step": 6866 }, { "epoch": 0.557703240477544, "grad_norm": 3.371649845113746, "learning_rate": 2.1555190624939933e-06, "loss": 0.4734, "step": 6867 }, { "epoch": 0.5577844554535857, "grad_norm": 4.993213905650965, "learning_rate": 2.154867725213472e-06, "loss": 0.5791, "step": 6868 }, { "epoch": 0.5578656704296272, "grad_norm": 5.433984223606404, "learning_rate": 2.154216411814278e-06, "loss": 0.6413, "step": 6869 }, { "epoch": 0.5579468854056688, "grad_norm": 4.1089167750307505, "learning_rate": 2.1535651223414783e-06, "loss": 0.4065, "step": 6870 }, { "epoch": 0.5580281003817104, "grad_norm": 6.410732794666789, "learning_rate": 2.1529138568401377e-06, "loss": 0.4132, "step": 6871 }, { "epoch": 0.558109315357752, "grad_norm": 2.952240951187637, "learning_rate": 2.1522626153553224e-06, "loss": 0.5569, "step": 6872 }, { "epoch": 0.5581905303337935, "grad_norm": 5.778680431168145, "learning_rate": 2.1516113979320937e-06, "loss": 0.615, "step": 6873 }, { "epoch": 0.5582717453098351, "grad_norm": 5.40543144487348, "learning_rate": 2.150960204615511e-06, "loss": 0.4005, "step": 6874 }, { "epoch": 0.5583529602858767, "grad_norm": 5.360050799209055, "learning_rate": 2.1503090354506366e-06, "loss": 0.4878, "step": 6875 }, { "epoch": 0.5584341752619183, "grad_norm": 4.768908334280203, "learning_rate": 2.1496578904825253e-06, "loss": 0.606, "step": 6876 }, { "epoch": 0.5585153902379599, "grad_norm": 4.8420372746526255, "learning_rate": 2.149006769756234e-06, "loss": 0.3823, "step": 6877 }, { "epoch": 0.5585966052140015, "grad_norm": 7.2191793614780835, "learning_rate": 2.148355673316817e-06, "loss": 0.4933, "step": 6878 }, { "epoch": 0.5586778201900431, "grad_norm": 5.635609934888895, "learning_rate": 2.1477046012093263e-06, "loss": 0.4284, "step": 6879 }, { "epoch": 0.5587590351660846, "grad_norm": 4.860255565023494, "learning_rate": 2.147053553478813e-06, "loss": 0.4215, "step": 6880 }, { "epoch": 0.5588402501421262, "grad_norm": 6.326189480768318, "learning_rate": 2.1464025301703243e-06, "loss": 0.3612, "step": 6881 }, { "epoch": 0.5589214651181678, "grad_norm": 10.398795327314401, "learning_rate": 2.145751531328911e-06, "loss": 0.638, "step": 6882 }, { "epoch": 0.5590026800942094, "grad_norm": 5.9840259751672455, "learning_rate": 2.1451005569996157e-06, "loss": 0.4752, "step": 6883 }, { "epoch": 0.5590838950702509, "grad_norm": 6.729449032368889, "learning_rate": 2.144449607227483e-06, "loss": 0.4235, "step": 6884 }, { "epoch": 0.5591651100462925, "grad_norm": 4.99524830446243, "learning_rate": 2.143798682057558e-06, "loss": 0.565, "step": 6885 }, { "epoch": 0.5592463250223341, "grad_norm": 11.826585338715425, "learning_rate": 2.1431477815348775e-06, "loss": 0.644, "step": 6886 }, { "epoch": 0.5593275399983757, "grad_norm": 4.772181141872521, "learning_rate": 2.1424969057044815e-06, "loss": 0.6231, "step": 6887 }, { "epoch": 0.5594087549744173, "grad_norm": 5.569287892562355, "learning_rate": 2.1418460546114087e-06, "loss": 0.4135, "step": 6888 }, { "epoch": 0.5594899699504589, "grad_norm": 3.637243154196817, "learning_rate": 2.141195228300693e-06, "loss": 0.4882, "step": 6889 }, { "epoch": 0.5595711849265005, "grad_norm": 4.970081667441746, "learning_rate": 2.140544426817368e-06, "loss": 0.4451, "step": 6890 }, { "epoch": 0.559652399902542, "grad_norm": 6.0593249594190715, "learning_rate": 2.139893650206467e-06, "loss": 0.4589, "step": 6891 }, { "epoch": 0.5597336148785836, "grad_norm": 4.272966835485669, "learning_rate": 2.1392428985130192e-06, "loss": 0.4929, "step": 6892 }, { "epoch": 0.5598148298546252, "grad_norm": 4.935701907357544, "learning_rate": 2.138592171782053e-06, "loss": 0.6183, "step": 6893 }, { "epoch": 0.5598960448306668, "grad_norm": 4.965836504024703, "learning_rate": 2.137941470058597e-06, "loss": 0.5274, "step": 6894 }, { "epoch": 0.5599772598067083, "grad_norm": 3.8763705524472245, "learning_rate": 2.1372907933876745e-06, "loss": 0.6235, "step": 6895 }, { "epoch": 0.56005847478275, "grad_norm": 3.7922687516844387, "learning_rate": 2.13664014181431e-06, "loss": 0.4554, "step": 6896 }, { "epoch": 0.5601396897587915, "grad_norm": 5.143643713764753, "learning_rate": 2.1359895153835235e-06, "loss": 0.7668, "step": 6897 }, { "epoch": 0.5602209047348331, "grad_norm": 3.9929186611253686, "learning_rate": 2.1353389141403373e-06, "loss": 0.5105, "step": 6898 }, { "epoch": 0.5603021197108747, "grad_norm": 7.4139111080640605, "learning_rate": 2.134688338129768e-06, "loss": 0.3908, "step": 6899 }, { "epoch": 0.5603833346869163, "grad_norm": 4.298081345320913, "learning_rate": 2.1340377873968313e-06, "loss": 0.3907, "step": 6900 }, { "epoch": 0.5604645496629579, "grad_norm": 3.6665218877774404, "learning_rate": 2.133387261986544e-06, "loss": 0.4315, "step": 6901 }, { "epoch": 0.5605457646389994, "grad_norm": 3.345078936648102, "learning_rate": 2.132736761943917e-06, "loss": 0.4787, "step": 6902 }, { "epoch": 0.560626979615041, "grad_norm": 5.603195213119475, "learning_rate": 2.1320862873139627e-06, "loss": 0.517, "step": 6903 }, { "epoch": 0.5607081945910826, "grad_norm": 4.582210135280795, "learning_rate": 2.1314358381416906e-06, "loss": 0.4446, "step": 6904 }, { "epoch": 0.5607894095671242, "grad_norm": 5.520444524716127, "learning_rate": 2.130785414472108e-06, "loss": 0.5447, "step": 6905 }, { "epoch": 0.5608706245431657, "grad_norm": 3.594858666968153, "learning_rate": 2.1301350163502194e-06, "loss": 0.4709, "step": 6906 }, { "epoch": 0.5609518395192074, "grad_norm": 5.775944364817623, "learning_rate": 2.1294846438210316e-06, "loss": 0.5108, "step": 6907 }, { "epoch": 0.5610330544952489, "grad_norm": 4.911293017998235, "learning_rate": 2.128834296929545e-06, "loss": 0.458, "step": 6908 }, { "epoch": 0.5611142694712905, "grad_norm": 5.313549191131658, "learning_rate": 2.12818397572076e-06, "loss": 0.5297, "step": 6909 }, { "epoch": 0.5611954844473321, "grad_norm": 5.235033497622972, "learning_rate": 2.1275336802396775e-06, "loss": 0.6024, "step": 6910 }, { "epoch": 0.5612766994233737, "grad_norm": 8.31540179839902, "learning_rate": 2.1268834105312926e-06, "loss": 0.4589, "step": 6911 }, { "epoch": 0.5613579143994153, "grad_norm": 5.844285401664914, "learning_rate": 2.1262331666406003e-06, "loss": 0.687, "step": 6912 }, { "epoch": 0.5614391293754568, "grad_norm": 5.506426123033813, "learning_rate": 2.125582948612595e-06, "loss": 0.3788, "step": 6913 }, { "epoch": 0.5615203443514984, "grad_norm": 4.938418738241158, "learning_rate": 2.124932756492269e-06, "loss": 0.4786, "step": 6914 }, { "epoch": 0.56160155932754, "grad_norm": 5.011490627219829, "learning_rate": 2.1242825903246104e-06, "loss": 0.5104, "step": 6915 }, { "epoch": 0.5616827743035816, "grad_norm": 4.130351403264876, "learning_rate": 2.1236324501546073e-06, "loss": 0.5, "step": 6916 }, { "epoch": 0.5617639892796231, "grad_norm": 5.131299996818871, "learning_rate": 2.1229823360272483e-06, "loss": 0.5647, "step": 6917 }, { "epoch": 0.5618452042556648, "grad_norm": 4.930401157312547, "learning_rate": 2.1223322479875157e-06, "loss": 0.6934, "step": 6918 }, { "epoch": 0.5619264192317063, "grad_norm": 4.218515382201296, "learning_rate": 2.1216821860803922e-06, "loss": 0.5194, "step": 6919 }, { "epoch": 0.5620076342077479, "grad_norm": 4.955917407462902, "learning_rate": 2.12103215035086e-06, "loss": 0.548, "step": 6920 }, { "epoch": 0.5620888491837895, "grad_norm": 7.097785428246493, "learning_rate": 2.1203821408438973e-06, "loss": 0.3326, "step": 6921 }, { "epoch": 0.5621700641598311, "grad_norm": 5.512144520285594, "learning_rate": 2.1197321576044803e-06, "loss": 0.5434, "step": 6922 }, { "epoch": 0.5622512791358727, "grad_norm": 4.559962704037825, "learning_rate": 2.119082200677587e-06, "loss": 0.706, "step": 6923 }, { "epoch": 0.5623324941119142, "grad_norm": 3.716157164787584, "learning_rate": 2.1184322701081884e-06, "loss": 0.4711, "step": 6924 }, { "epoch": 0.5624137090879558, "grad_norm": 5.005561099415801, "learning_rate": 2.117782365941257e-06, "loss": 0.6163, "step": 6925 }, { "epoch": 0.5624949240639974, "grad_norm": 3.3652327554641808, "learning_rate": 2.1171324882217644e-06, "loss": 0.5811, "step": 6926 }, { "epoch": 0.562576139040039, "grad_norm": 3.4936659274070485, "learning_rate": 2.116482636994677e-06, "loss": 0.4976, "step": 6927 }, { "epoch": 0.5626573540160805, "grad_norm": 15.606156521919054, "learning_rate": 2.11583281230496e-06, "loss": 0.3923, "step": 6928 }, { "epoch": 0.5627385689921222, "grad_norm": 8.796650533681337, "learning_rate": 2.11518301419758e-06, "loss": 0.5009, "step": 6929 }, { "epoch": 0.5628197839681637, "grad_norm": 5.043513216497029, "learning_rate": 2.1145332427174995e-06, "loss": 0.3586, "step": 6930 }, { "epoch": 0.5629009989442053, "grad_norm": 5.302763968985061, "learning_rate": 2.1138834979096778e-06, "loss": 0.4416, "step": 6931 }, { "epoch": 0.5629822139202469, "grad_norm": 4.618030423336223, "learning_rate": 2.1132337798190743e-06, "loss": 0.5591, "step": 6932 }, { "epoch": 0.5630634288962885, "grad_norm": 5.305923379739884, "learning_rate": 2.112584088490647e-06, "loss": 0.6132, "step": 6933 }, { "epoch": 0.5631446438723301, "grad_norm": 8.514031471933345, "learning_rate": 2.11193442396935e-06, "loss": 0.4011, "step": 6934 }, { "epoch": 0.5632258588483716, "grad_norm": 4.100042747142503, "learning_rate": 2.111284786300137e-06, "loss": 0.6595, "step": 6935 }, { "epoch": 0.5633070738244133, "grad_norm": 6.623608750925487, "learning_rate": 2.11063517552796e-06, "loss": 0.4761, "step": 6936 }, { "epoch": 0.5633882888004548, "grad_norm": 4.043561752629549, "learning_rate": 2.1099855916977676e-06, "loss": 0.4361, "step": 6937 }, { "epoch": 0.5634695037764964, "grad_norm": 3.492107617173054, "learning_rate": 2.109336034854508e-06, "loss": 0.6809, "step": 6938 }, { "epoch": 0.5635507187525379, "grad_norm": 5.163084090773521, "learning_rate": 2.1086865050431283e-06, "loss": 0.4603, "step": 6939 }, { "epoch": 0.5636319337285796, "grad_norm": 4.691574509899781, "learning_rate": 2.1080370023085713e-06, "loss": 0.6142, "step": 6940 }, { "epoch": 0.5637131487046211, "grad_norm": 4.128014703538365, "learning_rate": 2.107387526695778e-06, "loss": 0.4824, "step": 6941 }, { "epoch": 0.5637943636806627, "grad_norm": 11.05696154296604, "learning_rate": 2.106738078249691e-06, "loss": 0.5197, "step": 6942 }, { "epoch": 0.5638755786567043, "grad_norm": 5.222620839074155, "learning_rate": 2.1060886570152477e-06, "loss": 0.4981, "step": 6943 }, { "epoch": 0.5639567936327459, "grad_norm": 4.697414322539678, "learning_rate": 2.105439263037384e-06, "loss": 0.4387, "step": 6944 }, { "epoch": 0.5640380086087875, "grad_norm": 3.8406258019033004, "learning_rate": 2.1047898963610354e-06, "loss": 0.4677, "step": 6945 }, { "epoch": 0.564119223584829, "grad_norm": 5.2714849848498115, "learning_rate": 2.1041405570311348e-06, "loss": 0.3368, "step": 6946 }, { "epoch": 0.5642004385608707, "grad_norm": 7.429954267821541, "learning_rate": 2.1034912450926114e-06, "loss": 0.4286, "step": 6947 }, { "epoch": 0.5642816535369122, "grad_norm": 4.461747197120303, "learning_rate": 2.102841960590396e-06, "loss": 0.4246, "step": 6948 }, { "epoch": 0.5643628685129538, "grad_norm": 6.852938483133885, "learning_rate": 2.102192703569416e-06, "loss": 0.4819, "step": 6949 }, { "epoch": 0.5644440834889953, "grad_norm": 4.945485942357694, "learning_rate": 2.1015434740745944e-06, "loss": 0.5224, "step": 6950 }, { "epoch": 0.564525298465037, "grad_norm": 5.1220985098046095, "learning_rate": 2.1008942721508553e-06, "loss": 0.4534, "step": 6951 }, { "epoch": 0.5646065134410785, "grad_norm": 4.645911138798781, "learning_rate": 2.1002450978431216e-06, "loss": 0.4874, "step": 6952 }, { "epoch": 0.5646877284171201, "grad_norm": 4.138063372246911, "learning_rate": 2.099595951196311e-06, "loss": 0.4335, "step": 6953 }, { "epoch": 0.5647689433931617, "grad_norm": 5.760613534796062, "learning_rate": 2.09894683225534e-06, "loss": 0.5575, "step": 6954 }, { "epoch": 0.5648501583692033, "grad_norm": 5.719720985157515, "learning_rate": 2.0982977410651276e-06, "loss": 0.5573, "step": 6955 }, { "epoch": 0.5649313733452449, "grad_norm": 5.21317528533535, "learning_rate": 2.0976486776705853e-06, "loss": 0.4523, "step": 6956 }, { "epoch": 0.5650125883212864, "grad_norm": 6.455893264664072, "learning_rate": 2.0969996421166243e-06, "loss": 0.5827, "step": 6957 }, { "epoch": 0.5650938032973281, "grad_norm": 3.152544873395565, "learning_rate": 2.0963506344481556e-06, "loss": 0.4638, "step": 6958 }, { "epoch": 0.5651750182733696, "grad_norm": 3.711902575933437, "learning_rate": 2.0957016547100867e-06, "loss": 0.5174, "step": 6959 }, { "epoch": 0.5652562332494112, "grad_norm": 3.1937905928416774, "learning_rate": 2.095052702947323e-06, "loss": 0.5568, "step": 6960 }, { "epoch": 0.5653374482254527, "grad_norm": 3.8506487142937336, "learning_rate": 2.09440377920477e-06, "loss": 0.6765, "step": 6961 }, { "epoch": 0.5654186632014944, "grad_norm": 4.161307544084725, "learning_rate": 2.0937548835273285e-06, "loss": 0.4279, "step": 6962 }, { "epoch": 0.5654998781775359, "grad_norm": 5.783786158584384, "learning_rate": 2.0931060159598986e-06, "loss": 0.483, "step": 6963 }, { "epoch": 0.5655810931535775, "grad_norm": 4.391669807993531, "learning_rate": 2.0924571765473793e-06, "loss": 0.5543, "step": 6964 }, { "epoch": 0.5656623081296192, "grad_norm": 11.934622866843036, "learning_rate": 2.091808365334667e-06, "loss": 0.5861, "step": 6965 }, { "epoch": 0.5657435231056607, "grad_norm": 6.272947824575495, "learning_rate": 2.091159582366655e-06, "loss": 0.4361, "step": 6966 }, { "epoch": 0.5658247380817023, "grad_norm": 2.896511468026234, "learning_rate": 2.0905108276882356e-06, "loss": 0.4904, "step": 6967 }, { "epoch": 0.5659059530577438, "grad_norm": 4.7888166758801205, "learning_rate": 2.089862101344301e-06, "loss": 0.4592, "step": 6968 }, { "epoch": 0.5659871680337855, "grad_norm": 10.0912728353721, "learning_rate": 2.0892134033797383e-06, "loss": 0.4861, "step": 6969 }, { "epoch": 0.566068383009827, "grad_norm": 2.573138862813564, "learning_rate": 2.088564733839433e-06, "loss": 0.3952, "step": 6970 }, { "epoch": 0.5661495979858686, "grad_norm": 2.924408214469329, "learning_rate": 2.087916092768271e-06, "loss": 0.4989, "step": 6971 }, { "epoch": 0.5662308129619101, "grad_norm": 5.175327347055258, "learning_rate": 2.087267480211135e-06, "loss": 0.593, "step": 6972 }, { "epoch": 0.5663120279379518, "grad_norm": 5.327737397653494, "learning_rate": 2.086618896212904e-06, "loss": 0.4824, "step": 6973 }, { "epoch": 0.5663932429139933, "grad_norm": 3.5981451480047344, "learning_rate": 2.0859703408184583e-06, "loss": 0.569, "step": 6974 }, { "epoch": 0.5664744578900349, "grad_norm": 3.98952188483392, "learning_rate": 2.085321814072674e-06, "loss": 0.5055, "step": 6975 }, { "epoch": 0.5665556728660766, "grad_norm": 4.760263855647576, "learning_rate": 2.0846733160204244e-06, "loss": 0.5048, "step": 6976 }, { "epoch": 0.5666368878421181, "grad_norm": 4.040158060702142, "learning_rate": 2.084024846706584e-06, "loss": 0.474, "step": 6977 }, { "epoch": 0.5667181028181597, "grad_norm": 3.3443500978774394, "learning_rate": 2.083376406176023e-06, "loss": 0.6342, "step": 6978 }, { "epoch": 0.5667993177942012, "grad_norm": 6.809697122943693, "learning_rate": 2.082727994473609e-06, "loss": 0.4007, "step": 6979 }, { "epoch": 0.5668805327702429, "grad_norm": 4.007751918991223, "learning_rate": 2.08207961164421e-06, "loss": 0.3879, "step": 6980 }, { "epoch": 0.5669617477462844, "grad_norm": 4.107776861520255, "learning_rate": 2.08143125773269e-06, "loss": 0.4471, "step": 6981 }, { "epoch": 0.567042962722326, "grad_norm": 3.806832164663562, "learning_rate": 2.080782932783911e-06, "loss": 0.4513, "step": 6982 }, { "epoch": 0.5671241776983675, "grad_norm": 6.942546340030462, "learning_rate": 2.0801346368427356e-06, "loss": 0.6031, "step": 6983 }, { "epoch": 0.5672053926744092, "grad_norm": 3.225972794340873, "learning_rate": 2.0794863699540206e-06, "loss": 0.6745, "step": 6984 }, { "epoch": 0.5672866076504507, "grad_norm": 4.7018910207675955, "learning_rate": 2.0788381321626237e-06, "loss": 0.531, "step": 6985 }, { "epoch": 0.5673678226264923, "grad_norm": 3.88377602984253, "learning_rate": 2.0781899235133984e-06, "loss": 0.6038, "step": 6986 }, { "epoch": 0.567449037602534, "grad_norm": 4.743493354886736, "learning_rate": 2.077541744051198e-06, "loss": 0.5044, "step": 6987 }, { "epoch": 0.5675302525785755, "grad_norm": 3.725682821827385, "learning_rate": 2.0768935938208735e-06, "loss": 0.545, "step": 6988 }, { "epoch": 0.5676114675546171, "grad_norm": 4.23366272227734, "learning_rate": 2.0762454728672727e-06, "loss": 0.6513, "step": 6989 }, { "epoch": 0.5676926825306586, "grad_norm": 3.219731697269269, "learning_rate": 2.0755973812352424e-06, "loss": 0.5489, "step": 6990 }, { "epoch": 0.5677738975067003, "grad_norm": 5.426598112663618, "learning_rate": 2.074949318969628e-06, "loss": 0.5292, "step": 6991 }, { "epoch": 0.5678551124827418, "grad_norm": 5.501771039935805, "learning_rate": 2.07430128611527e-06, "loss": 0.4586, "step": 6992 }, { "epoch": 0.5679363274587834, "grad_norm": 4.038443635513911, "learning_rate": 2.0736532827170107e-06, "loss": 0.414, "step": 6993 }, { "epoch": 0.5680175424348249, "grad_norm": 5.939660501414922, "learning_rate": 2.0730053088196883e-06, "loss": 0.4727, "step": 6994 }, { "epoch": 0.5680987574108666, "grad_norm": 4.71954105859692, "learning_rate": 2.072357364468138e-06, "loss": 0.4108, "step": 6995 }, { "epoch": 0.5681799723869081, "grad_norm": 12.024015711126426, "learning_rate": 2.0717094497071945e-06, "loss": 0.4805, "step": 6996 }, { "epoch": 0.5682611873629497, "grad_norm": 4.1133512422429455, "learning_rate": 2.0710615645816913e-06, "loss": 0.4311, "step": 6997 }, { "epoch": 0.5683424023389914, "grad_norm": 3.993669100784519, "learning_rate": 2.0704137091364568e-06, "loss": 0.5012, "step": 6998 }, { "epoch": 0.5684236173150329, "grad_norm": 5.302686271074888, "learning_rate": 2.069765883416321e-06, "loss": 0.43, "step": 6999 }, { "epoch": 0.5685048322910745, "grad_norm": 4.755390559997945, "learning_rate": 2.0691180874661086e-06, "loss": 0.5289, "step": 7000 }, { "epoch": 0.568586047267116, "grad_norm": 2.9673265351316704, "learning_rate": 2.0684703213306435e-06, "loss": 0.4691, "step": 7001 }, { "epoch": 0.5686672622431577, "grad_norm": 3.9211031513326486, "learning_rate": 2.0678225850547497e-06, "loss": 0.5234, "step": 7002 }, { "epoch": 0.5687484772191992, "grad_norm": 4.581102578387739, "learning_rate": 2.0671748786832447e-06, "loss": 0.5117, "step": 7003 }, { "epoch": 0.5688296921952408, "grad_norm": 6.219007379440202, "learning_rate": 2.0665272022609482e-06, "loss": 0.5086, "step": 7004 }, { "epoch": 0.5689109071712823, "grad_norm": 6.0686864713771485, "learning_rate": 2.0658795558326745e-06, "loss": 0.4348, "step": 7005 }, { "epoch": 0.568992122147324, "grad_norm": 3.6037676191639885, "learning_rate": 2.065231939443238e-06, "loss": 0.4322, "step": 7006 }, { "epoch": 0.5690733371233655, "grad_norm": 3.160547290178154, "learning_rate": 2.064584353137451e-06, "loss": 0.4967, "step": 7007 }, { "epoch": 0.5691545520994071, "grad_norm": 4.373900459536156, "learning_rate": 2.0639367969601215e-06, "loss": 0.4513, "step": 7008 }, { "epoch": 0.5692357670754488, "grad_norm": 10.5295783218965, "learning_rate": 2.063289270956058e-06, "loss": 0.4657, "step": 7009 }, { "epoch": 0.5693169820514903, "grad_norm": 6.441791257822959, "learning_rate": 2.0626417751700664e-06, "loss": 0.3973, "step": 7010 }, { "epoch": 0.5693981970275319, "grad_norm": 7.65548642747038, "learning_rate": 2.0619943096469484e-06, "loss": 0.4463, "step": 7011 }, { "epoch": 0.5694794120035734, "grad_norm": 2.7981688902513864, "learning_rate": 2.061346874431507e-06, "loss": 0.5198, "step": 7012 }, { "epoch": 0.5695606269796151, "grad_norm": 3.8831757918226586, "learning_rate": 2.0606994695685396e-06, "loss": 0.3686, "step": 7013 }, { "epoch": 0.5696418419556566, "grad_norm": 3.8701501681776156, "learning_rate": 2.0600520951028437e-06, "loss": 0.5688, "step": 7014 }, { "epoch": 0.5697230569316982, "grad_norm": 3.537194498397044, "learning_rate": 2.059404751079215e-06, "loss": 0.4693, "step": 7015 }, { "epoch": 0.5698042719077397, "grad_norm": 4.5238431030094794, "learning_rate": 2.0587574375424456e-06, "loss": 0.4932, "step": 7016 }, { "epoch": 0.5698854868837814, "grad_norm": 6.314721658579622, "learning_rate": 2.0581101545373255e-06, "loss": 0.5723, "step": 7017 }, { "epoch": 0.5699667018598229, "grad_norm": 4.543665245055623, "learning_rate": 2.057462902108645e-06, "loss": 0.6108, "step": 7018 }, { "epoch": 0.5700479168358645, "grad_norm": 4.326213404795697, "learning_rate": 2.0568156803011897e-06, "loss": 0.5064, "step": 7019 }, { "epoch": 0.5701291318119062, "grad_norm": 6.344731487919164, "learning_rate": 2.056168489159744e-06, "loss": 0.4286, "step": 7020 }, { "epoch": 0.5702103467879477, "grad_norm": 6.407501842611541, "learning_rate": 2.0555213287290886e-06, "loss": 0.4281, "step": 7021 }, { "epoch": 0.5702915617639893, "grad_norm": 3.9016889868541873, "learning_rate": 2.0548741990540057e-06, "loss": 0.5229, "step": 7022 }, { "epoch": 0.5703727767400308, "grad_norm": 4.485329708138932, "learning_rate": 2.0542271001792726e-06, "loss": 0.4988, "step": 7023 }, { "epoch": 0.5704539917160725, "grad_norm": 7.011271727948778, "learning_rate": 2.0535800321496645e-06, "loss": 0.4174, "step": 7024 }, { "epoch": 0.570535206692114, "grad_norm": 21.104012028716472, "learning_rate": 2.0529329950099554e-06, "loss": 0.566, "step": 7025 }, { "epoch": 0.5706164216681556, "grad_norm": 5.5743586700224, "learning_rate": 2.052285988804918e-06, "loss": 0.4475, "step": 7026 }, { "epoch": 0.5706976366441971, "grad_norm": 4.416468163895723, "learning_rate": 2.0516390135793192e-06, "loss": 0.3382, "step": 7027 }, { "epoch": 0.5707788516202388, "grad_norm": 4.986468294717369, "learning_rate": 2.050992069377929e-06, "loss": 0.3999, "step": 7028 }, { "epoch": 0.5708600665962803, "grad_norm": 9.4114543685177, "learning_rate": 2.050345156245511e-06, "loss": 0.4653, "step": 7029 }, { "epoch": 0.5709412815723219, "grad_norm": 4.613503552061298, "learning_rate": 2.0496982742268273e-06, "loss": 0.3624, "step": 7030 }, { "epoch": 0.5710224965483636, "grad_norm": 6.062105080879825, "learning_rate": 2.0490514233666413e-06, "loss": 0.5138, "step": 7031 }, { "epoch": 0.5711037115244051, "grad_norm": 3.9841008788202132, "learning_rate": 2.04840460370971e-06, "loss": 0.4635, "step": 7032 }, { "epoch": 0.5711849265004467, "grad_norm": 5.48691904012669, "learning_rate": 2.0477578153007887e-06, "loss": 0.4516, "step": 7033 }, { "epoch": 0.5712661414764882, "grad_norm": 3.3836486599021636, "learning_rate": 2.047111058184635e-06, "loss": 0.5242, "step": 7034 }, { "epoch": 0.5713473564525299, "grad_norm": 3.8672222024944856, "learning_rate": 2.046464332405998e-06, "loss": 0.5653, "step": 7035 }, { "epoch": 0.5714285714285714, "grad_norm": 4.314382703074949, "learning_rate": 2.045817638009629e-06, "loss": 0.5402, "step": 7036 }, { "epoch": 0.571509786404613, "grad_norm": 3.567072195815995, "learning_rate": 2.045170975040276e-06, "loss": 0.4213, "step": 7037 }, { "epoch": 0.5715910013806546, "grad_norm": 4.48375869912088, "learning_rate": 2.0445243435426847e-06, "loss": 0.6564, "step": 7038 }, { "epoch": 0.5716722163566962, "grad_norm": 3.552656399851541, "learning_rate": 2.043877743561598e-06, "loss": 0.5135, "step": 7039 }, { "epoch": 0.5717534313327377, "grad_norm": 3.8377915597339873, "learning_rate": 2.0432311751417568e-06, "loss": 0.6527, "step": 7040 }, { "epoch": 0.5718346463087793, "grad_norm": 6.426418479233554, "learning_rate": 2.042584638327902e-06, "loss": 0.3723, "step": 7041 }, { "epoch": 0.571915861284821, "grad_norm": 4.241032012671777, "learning_rate": 2.0419381331647687e-06, "loss": 0.4957, "step": 7042 }, { "epoch": 0.5719970762608625, "grad_norm": 3.735539966685467, "learning_rate": 2.0412916596970918e-06, "loss": 0.566, "step": 7043 }, { "epoch": 0.5720782912369041, "grad_norm": 3.8956911217710917, "learning_rate": 2.040645217969606e-06, "loss": 0.4935, "step": 7044 }, { "epoch": 0.5721595062129456, "grad_norm": 20.493625970905985, "learning_rate": 2.0399988080270384e-06, "loss": 0.4629, "step": 7045 }, { "epoch": 0.5722407211889873, "grad_norm": 29.937743604580216, "learning_rate": 2.039352429914119e-06, "loss": 0.4411, "step": 7046 }, { "epoch": 0.5723219361650288, "grad_norm": 6.109372993647007, "learning_rate": 2.038706083675574e-06, "loss": 0.4187, "step": 7047 }, { "epoch": 0.5724031511410704, "grad_norm": 4.3015816031863086, "learning_rate": 2.038059769356127e-06, "loss": 0.5831, "step": 7048 }, { "epoch": 0.572484366117112, "grad_norm": 4.347753553845655, "learning_rate": 2.037413487000498e-06, "loss": 0.4969, "step": 7049 }, { "epoch": 0.5725655810931536, "grad_norm": 5.01369388436399, "learning_rate": 2.0367672366534087e-06, "loss": 0.4838, "step": 7050 }, { "epoch": 0.5726467960691951, "grad_norm": 6.956995413047726, "learning_rate": 2.036121018359574e-06, "loss": 0.4995, "step": 7051 }, { "epoch": 0.5727280110452367, "grad_norm": 7.014322634335082, "learning_rate": 2.03547483216371e-06, "loss": 0.5013, "step": 7052 }, { "epoch": 0.5728092260212784, "grad_norm": 6.622483650279607, "learning_rate": 2.0348286781105302e-06, "loss": 0.4124, "step": 7053 }, { "epoch": 0.5728904409973199, "grad_norm": 4.159914158121862, "learning_rate": 2.0341825562447427e-06, "loss": 0.6059, "step": 7054 }, { "epoch": 0.5729716559733615, "grad_norm": 5.011801450523911, "learning_rate": 2.0335364666110572e-06, "loss": 0.4856, "step": 7055 }, { "epoch": 0.573052870949403, "grad_norm": 4.952363770440026, "learning_rate": 2.03289040925418e-06, "loss": 0.4473, "step": 7056 }, { "epoch": 0.5731340859254447, "grad_norm": 7.83403745870332, "learning_rate": 2.032244384218815e-06, "loss": 0.6729, "step": 7057 }, { "epoch": 0.5732153009014862, "grad_norm": 6.993413026032686, "learning_rate": 2.031598391549662e-06, "loss": 0.5622, "step": 7058 }, { "epoch": 0.5732965158775278, "grad_norm": 6.260135249773573, "learning_rate": 2.030952431291421e-06, "loss": 0.5174, "step": 7059 }, { "epoch": 0.5733777308535694, "grad_norm": 4.075720359030366, "learning_rate": 2.0303065034887904e-06, "loss": 0.5199, "step": 7060 }, { "epoch": 0.573458945829611, "grad_norm": 3.9202159123881475, "learning_rate": 2.0296606081864634e-06, "loss": 0.5269, "step": 7061 }, { "epoch": 0.5735401608056525, "grad_norm": 4.8268240982250274, "learning_rate": 2.0290147454291323e-06, "loss": 0.4606, "step": 7062 }, { "epoch": 0.5736213757816941, "grad_norm": 5.603064677109073, "learning_rate": 2.0283689152614896e-06, "loss": 0.6205, "step": 7063 }, { "epoch": 0.5737025907577358, "grad_norm": 4.633814523131622, "learning_rate": 2.0277231177282213e-06, "loss": 0.5806, "step": 7064 }, { "epoch": 0.5737838057337773, "grad_norm": 11.093950775524117, "learning_rate": 2.0270773528740127e-06, "loss": 0.5424, "step": 7065 }, { "epoch": 0.5738650207098189, "grad_norm": 4.963416503789594, "learning_rate": 2.02643162074355e-06, "loss": 0.5403, "step": 7066 }, { "epoch": 0.5739462356858605, "grad_norm": 7.269597973886369, "learning_rate": 2.0257859213815123e-06, "loss": 0.649, "step": 7067 }, { "epoch": 0.5740274506619021, "grad_norm": 4.685273966619034, "learning_rate": 2.0251402548325783e-06, "loss": 0.6087, "step": 7068 }, { "epoch": 0.5741086656379436, "grad_norm": 4.846673111295444, "learning_rate": 2.0244946211414267e-06, "loss": 0.5899, "step": 7069 }, { "epoch": 0.5741898806139852, "grad_norm": 3.9278515943561625, "learning_rate": 2.0238490203527307e-06, "loss": 0.5397, "step": 7070 }, { "epoch": 0.5742710955900268, "grad_norm": 4.034403858762683, "learning_rate": 2.0232034525111617e-06, "loss": 0.5535, "step": 7071 }, { "epoch": 0.5743523105660684, "grad_norm": 6.517927780743827, "learning_rate": 2.0225579176613905e-06, "loss": 0.4494, "step": 7072 }, { "epoch": 0.5744335255421099, "grad_norm": 3.068515611615937, "learning_rate": 2.0219124158480853e-06, "loss": 0.3643, "step": 7073 }, { "epoch": 0.5745147405181515, "grad_norm": 3.6556023426111084, "learning_rate": 2.0212669471159098e-06, "loss": 0.4402, "step": 7074 }, { "epoch": 0.5745959554941932, "grad_norm": 3.202736868542401, "learning_rate": 2.020621511509528e-06, "loss": 0.5183, "step": 7075 }, { "epoch": 0.5746771704702347, "grad_norm": 5.842839385752603, "learning_rate": 2.019976109073601e-06, "loss": 0.477, "step": 7076 }, { "epoch": 0.5747583854462763, "grad_norm": 12.847781405058441, "learning_rate": 2.0193307398527865e-06, "loss": 0.4842, "step": 7077 }, { "epoch": 0.5748396004223179, "grad_norm": 21.13344792621275, "learning_rate": 2.0186854038917405e-06, "loss": 0.4538, "step": 7078 }, { "epoch": 0.5749208153983595, "grad_norm": 10.50726009998217, "learning_rate": 2.0180401012351182e-06, "loss": 0.4578, "step": 7079 }, { "epoch": 0.575002030374401, "grad_norm": 5.987355841136969, "learning_rate": 2.0173948319275696e-06, "loss": 0.5223, "step": 7080 }, { "epoch": 0.5750832453504426, "grad_norm": 5.722234729191682, "learning_rate": 2.016749596013744e-06, "loss": 0.4965, "step": 7081 }, { "epoch": 0.5751644603264842, "grad_norm": 4.664393900667678, "learning_rate": 2.0161043935382897e-06, "loss": 0.4677, "step": 7082 }, { "epoch": 0.5752456753025258, "grad_norm": 7.498350619883631, "learning_rate": 2.0154592245458504e-06, "loss": 0.5619, "step": 7083 }, { "epoch": 0.5753268902785673, "grad_norm": 5.775102597864225, "learning_rate": 2.014814089081067e-06, "loss": 0.5719, "step": 7084 }, { "epoch": 0.575408105254609, "grad_norm": 5.1155695309255025, "learning_rate": 2.014168987188582e-06, "loss": 0.7247, "step": 7085 }, { "epoch": 0.5754893202306506, "grad_norm": 4.2278332646738175, "learning_rate": 2.0135239189130325e-06, "loss": 0.6624, "step": 7086 }, { "epoch": 0.5755705352066921, "grad_norm": 3.7678111182822334, "learning_rate": 2.0128788842990516e-06, "loss": 0.7632, "step": 7087 }, { "epoch": 0.5756517501827337, "grad_norm": 11.009075470970467, "learning_rate": 2.0122338833912743e-06, "loss": 0.4833, "step": 7088 }, { "epoch": 0.5757329651587753, "grad_norm": 4.5002280122486535, "learning_rate": 2.0115889162343316e-06, "loss": 0.4961, "step": 7089 }, { "epoch": 0.5758141801348169, "grad_norm": 4.760191730901538, "learning_rate": 2.01094398287285e-06, "loss": 0.475, "step": 7090 }, { "epoch": 0.5758953951108584, "grad_norm": 3.118348188458208, "learning_rate": 2.010299083351457e-06, "loss": 0.353, "step": 7091 }, { "epoch": 0.5759766100869, "grad_norm": 6.734823400251969, "learning_rate": 2.009654217714776e-06, "loss": 0.5105, "step": 7092 }, { "epoch": 0.5760578250629416, "grad_norm": 5.003848469090975, "learning_rate": 2.0090093860074273e-06, "loss": 0.5659, "step": 7093 }, { "epoch": 0.5761390400389832, "grad_norm": 3.690892599491099, "learning_rate": 2.008364588274031e-06, "loss": 0.5335, "step": 7094 }, { "epoch": 0.5762202550150247, "grad_norm": 5.097697074216427, "learning_rate": 2.0077198245592033e-06, "loss": 0.5175, "step": 7095 }, { "epoch": 0.5763014699910664, "grad_norm": 3.9972301907638133, "learning_rate": 2.0070750949075584e-06, "loss": 0.5674, "step": 7096 }, { "epoch": 0.576382684967108, "grad_norm": 3.765506245804216, "learning_rate": 2.0064303993637073e-06, "loss": 0.4085, "step": 7097 }, { "epoch": 0.5764638999431495, "grad_norm": 5.010605186017302, "learning_rate": 2.005785737972262e-06, "loss": 0.5775, "step": 7098 }, { "epoch": 0.5765451149191911, "grad_norm": 3.786228217501147, "learning_rate": 2.0051411107778273e-06, "loss": 0.5185, "step": 7099 }, { "epoch": 0.5766263298952327, "grad_norm": 9.0650925529505, "learning_rate": 2.004496517825008e-06, "loss": 0.4735, "step": 7100 }, { "epoch": 0.5767075448712743, "grad_norm": 5.189793367646856, "learning_rate": 2.0038519591584078e-06, "loss": 0.5995, "step": 7101 }, { "epoch": 0.5767887598473158, "grad_norm": 4.700410500709092, "learning_rate": 2.0032074348226268e-06, "loss": 0.5761, "step": 7102 }, { "epoch": 0.5768699748233574, "grad_norm": 4.234749790705928, "learning_rate": 2.002562944862261e-06, "loss": 0.5744, "step": 7103 }, { "epoch": 0.576951189799399, "grad_norm": 3.629545684362975, "learning_rate": 2.0019184893219076e-06, "loss": 0.5361, "step": 7104 }, { "epoch": 0.5770324047754406, "grad_norm": 5.227066105381711, "learning_rate": 2.0012740682461585e-06, "loss": 0.424, "step": 7105 }, { "epoch": 0.5771136197514821, "grad_norm": 8.03973120733085, "learning_rate": 2.0006296816796037e-06, "loss": 0.4179, "step": 7106 }, { "epoch": 0.5771948347275238, "grad_norm": 3.3013104736748575, "learning_rate": 1.9999853296668326e-06, "loss": 0.657, "step": 7107 }, { "epoch": 0.5772760497035654, "grad_norm": 4.319311268255759, "learning_rate": 1.999341012252431e-06, "loss": 0.4993, "step": 7108 }, { "epoch": 0.5773572646796069, "grad_norm": 3.6319991209865945, "learning_rate": 1.9986967294809804e-06, "loss": 0.474, "step": 7109 }, { "epoch": 0.5774384796556485, "grad_norm": 4.960374042694389, "learning_rate": 1.9980524813970635e-06, "loss": 0.5666, "step": 7110 }, { "epoch": 0.5775196946316901, "grad_norm": 2.9962262052676314, "learning_rate": 1.997408268045259e-06, "loss": 0.6228, "step": 7111 }, { "epoch": 0.5776009096077317, "grad_norm": 5.668495286840983, "learning_rate": 1.9967640894701424e-06, "loss": 0.5147, "step": 7112 }, { "epoch": 0.5776821245837732, "grad_norm": 4.163547326343956, "learning_rate": 1.9961199457162867e-06, "loss": 0.4194, "step": 7113 }, { "epoch": 0.5777633395598148, "grad_norm": 3.4932620061985555, "learning_rate": 1.995475836828264e-06, "loss": 0.5275, "step": 7114 }, { "epoch": 0.5778445545358564, "grad_norm": 4.368032739037839, "learning_rate": 1.9948317628506444e-06, "loss": 0.4277, "step": 7115 }, { "epoch": 0.577925769511898, "grad_norm": 3.876730464227753, "learning_rate": 1.994187723827992e-06, "loss": 0.4987, "step": 7116 }, { "epoch": 0.5780069844879395, "grad_norm": 3.2062830745638506, "learning_rate": 1.9935437198048722e-06, "loss": 0.5308, "step": 7117 }, { "epoch": 0.5780881994639812, "grad_norm": 7.356711618544573, "learning_rate": 1.9928997508258475e-06, "loss": 0.464, "step": 7118 }, { "epoch": 0.5781694144400228, "grad_norm": 5.484693237704757, "learning_rate": 1.9922558169354752e-06, "loss": 0.4484, "step": 7119 }, { "epoch": 0.5782506294160643, "grad_norm": 6.681361631875744, "learning_rate": 1.9916119181783135e-06, "loss": 0.4322, "step": 7120 }, { "epoch": 0.5783318443921059, "grad_norm": 4.438508741670482, "learning_rate": 1.9909680545989175e-06, "loss": 0.567, "step": 7121 }, { "epoch": 0.5784130593681475, "grad_norm": 5.678949037298267, "learning_rate": 1.9903242262418366e-06, "loss": 0.5764, "step": 7122 }, { "epoch": 0.5784942743441891, "grad_norm": 4.194676378904089, "learning_rate": 1.989680433151622e-06, "loss": 0.7083, "step": 7123 }, { "epoch": 0.5785754893202306, "grad_norm": 8.002991142166685, "learning_rate": 1.989036675372822e-06, "loss": 0.5588, "step": 7124 }, { "epoch": 0.5786567042962723, "grad_norm": 3.764738047010275, "learning_rate": 1.988392952949978e-06, "loss": 0.5561, "step": 7125 }, { "epoch": 0.5787379192723138, "grad_norm": 7.746991321514468, "learning_rate": 1.9877492659276353e-06, "loss": 0.473, "step": 7126 }, { "epoch": 0.5788191342483554, "grad_norm": 5.541041864204196, "learning_rate": 1.9871056143503322e-06, "loss": 0.417, "step": 7127 }, { "epoch": 0.5789003492243969, "grad_norm": 3.424180901190808, "learning_rate": 1.9864619982626064e-06, "loss": 0.743, "step": 7128 }, { "epoch": 0.5789815642004386, "grad_norm": 16.31331645341726, "learning_rate": 1.9858184177089915e-06, "loss": 0.6825, "step": 7129 }, { "epoch": 0.5790627791764802, "grad_norm": 3.593693975763408, "learning_rate": 1.9851748727340214e-06, "loss": 0.5404, "step": 7130 }, { "epoch": 0.5791439941525217, "grad_norm": 6.678160792934489, "learning_rate": 1.9845313633822255e-06, "loss": 0.5591, "step": 7131 }, { "epoch": 0.5792252091285633, "grad_norm": 4.456993833787158, "learning_rate": 1.9838878896981303e-06, "loss": 0.5188, "step": 7132 }, { "epoch": 0.5793064241046049, "grad_norm": 4.230178294928199, "learning_rate": 1.9832444517262625e-06, "loss": 0.3748, "step": 7133 }, { "epoch": 0.5793876390806465, "grad_norm": 4.218687976196768, "learning_rate": 1.982601049511144e-06, "loss": 0.4318, "step": 7134 }, { "epoch": 0.579468854056688, "grad_norm": 3.9051525132078386, "learning_rate": 1.9819576830972938e-06, "loss": 0.5105, "step": 7135 }, { "epoch": 0.5795500690327297, "grad_norm": 5.953375128637914, "learning_rate": 1.9813143525292304e-06, "loss": 0.5147, "step": 7136 }, { "epoch": 0.5796312840087712, "grad_norm": 5.299949856256123, "learning_rate": 1.980671057851469e-06, "loss": 0.5801, "step": 7137 }, { "epoch": 0.5797124989848128, "grad_norm": 5.575356667344601, "learning_rate": 1.9800277991085217e-06, "loss": 0.484, "step": 7138 }, { "epoch": 0.5797937139608543, "grad_norm": 4.75965558776345, "learning_rate": 1.9793845763448987e-06, "loss": 0.3816, "step": 7139 }, { "epoch": 0.579874928936896, "grad_norm": 3.335625778162441, "learning_rate": 1.9787413896051084e-06, "loss": 0.4796, "step": 7140 }, { "epoch": 0.5799561439129376, "grad_norm": 4.108846663125449, "learning_rate": 1.978098238933654e-06, "loss": 0.4803, "step": 7141 }, { "epoch": 0.5800373588889791, "grad_norm": 4.357715753076788, "learning_rate": 1.9774551243750403e-06, "loss": 0.3848, "step": 7142 }, { "epoch": 0.5801185738650207, "grad_norm": 4.9420482058322435, "learning_rate": 1.9768120459737663e-06, "loss": 0.4834, "step": 7143 }, { "epoch": 0.5801997888410623, "grad_norm": 3.610962949080674, "learning_rate": 1.9761690037743293e-06, "loss": 0.4072, "step": 7144 }, { "epoch": 0.5802810038171039, "grad_norm": 4.466463892907367, "learning_rate": 1.9755259978212253e-06, "loss": 0.5826, "step": 7145 }, { "epoch": 0.5803622187931454, "grad_norm": 5.2380939327824345, "learning_rate": 1.9748830281589464e-06, "loss": 0.6036, "step": 7146 }, { "epoch": 0.5804434337691871, "grad_norm": 3.3553525016104704, "learning_rate": 1.9742400948319838e-06, "loss": 0.6842, "step": 7147 }, { "epoch": 0.5805246487452286, "grad_norm": 5.736174896340501, "learning_rate": 1.9735971978848224e-06, "loss": 0.5493, "step": 7148 }, { "epoch": 0.5806058637212702, "grad_norm": 3.703679606497901, "learning_rate": 1.9729543373619497e-06, "loss": 0.4979, "step": 7149 }, { "epoch": 0.5806870786973117, "grad_norm": 9.728903722310404, "learning_rate": 1.972311513307848e-06, "loss": 0.45, "step": 7150 }, { "epoch": 0.5807682936733534, "grad_norm": 5.187888574328153, "learning_rate": 1.971668725766996e-06, "loss": 0.7753, "step": 7151 }, { "epoch": 0.580849508649395, "grad_norm": 3.3139229056956396, "learning_rate": 1.971025974783872e-06, "loss": 0.675, "step": 7152 }, { "epoch": 0.5809307236254365, "grad_norm": 9.118516211139692, "learning_rate": 1.9703832604029523e-06, "loss": 0.5264, "step": 7153 }, { "epoch": 0.5810119386014782, "grad_norm": 4.415066688147641, "learning_rate": 1.9697405826687063e-06, "loss": 0.5034, "step": 7154 }, { "epoch": 0.5810931535775197, "grad_norm": 5.533151171983695, "learning_rate": 1.9690979416256062e-06, "loss": 0.5316, "step": 7155 }, { "epoch": 0.5811743685535613, "grad_norm": 3.4203284332256274, "learning_rate": 1.9684553373181197e-06, "loss": 0.4886, "step": 7156 }, { "epoch": 0.5812555835296028, "grad_norm": 7.5906819086747435, "learning_rate": 1.967812769790709e-06, "loss": 0.5435, "step": 7157 }, { "epoch": 0.5813367985056445, "grad_norm": 5.6092591367647815, "learning_rate": 1.9671702390878396e-06, "loss": 0.4718, "step": 7158 }, { "epoch": 0.581418013481686, "grad_norm": 4.25266168413679, "learning_rate": 1.9665277452539696e-06, "loss": 0.5233, "step": 7159 }, { "epoch": 0.5814992284577276, "grad_norm": 3.843541994359124, "learning_rate": 1.965885288333555e-06, "loss": 0.4787, "step": 7160 }, { "epoch": 0.5815804434337691, "grad_norm": 4.549147478757539, "learning_rate": 1.965242868371053e-06, "loss": 0.6732, "step": 7161 }, { "epoch": 0.5816616584098108, "grad_norm": 7.3370587404827345, "learning_rate": 1.9646004854109136e-06, "loss": 0.3952, "step": 7162 }, { "epoch": 0.5817428733858524, "grad_norm": 4.832489166222658, "learning_rate": 1.963958139497588e-06, "loss": 0.5612, "step": 7163 }, { "epoch": 0.5818240883618939, "grad_norm": 4.612979610999288, "learning_rate": 1.9633158306755206e-06, "loss": 0.4501, "step": 7164 }, { "epoch": 0.5819053033379356, "grad_norm": 3.842889295187542, "learning_rate": 1.962673558989158e-06, "loss": 0.479, "step": 7165 }, { "epoch": 0.5819865183139771, "grad_norm": 3.615074737738207, "learning_rate": 1.9620313244829423e-06, "loss": 0.5962, "step": 7166 }, { "epoch": 0.5820677332900187, "grad_norm": 8.385548343766278, "learning_rate": 1.961389127201311e-06, "loss": 0.5366, "step": 7167 }, { "epoch": 0.5821489482660602, "grad_norm": 4.0857765219914945, "learning_rate": 1.9607469671887015e-06, "loss": 0.3576, "step": 7168 }, { "epoch": 0.5822301632421019, "grad_norm": 4.7701105433833995, "learning_rate": 1.960104844489548e-06, "loss": 0.5, "step": 7169 }, { "epoch": 0.5823113782181434, "grad_norm": 4.079111419609443, "learning_rate": 1.9594627591482817e-06, "loss": 0.6516, "step": 7170 }, { "epoch": 0.582392593194185, "grad_norm": 5.075085483225634, "learning_rate": 1.9588207112093324e-06, "loss": 0.5357, "step": 7171 }, { "epoch": 0.5824738081702265, "grad_norm": 3.9201086563815624, "learning_rate": 1.958178700717125e-06, "loss": 0.4631, "step": 7172 }, { "epoch": 0.5825550231462682, "grad_norm": 4.27616202832829, "learning_rate": 1.957536727716084e-06, "loss": 0.4648, "step": 7173 }, { "epoch": 0.5826362381223098, "grad_norm": 4.882524062302524, "learning_rate": 1.956894792250631e-06, "loss": 0.5446, "step": 7174 }, { "epoch": 0.5827174530983513, "grad_norm": 3.8190132907051435, "learning_rate": 1.9562528943651837e-06, "loss": 0.54, "step": 7175 }, { "epoch": 0.582798668074393, "grad_norm": 6.140657105349779, "learning_rate": 1.955611034104158e-06, "loss": 0.4492, "step": 7176 }, { "epoch": 0.5828798830504345, "grad_norm": 4.341900527053686, "learning_rate": 1.9549692115119685e-06, "loss": 0.5207, "step": 7177 }, { "epoch": 0.5829610980264761, "grad_norm": 3.7483344786953685, "learning_rate": 1.9543274266330244e-06, "loss": 0.4006, "step": 7178 }, { "epoch": 0.5830423130025176, "grad_norm": 3.3504281004368854, "learning_rate": 1.9536856795117344e-06, "loss": 0.5293, "step": 7179 }, { "epoch": 0.5831235279785593, "grad_norm": 6.428753807471014, "learning_rate": 1.9530439701925046e-06, "loss": 0.6898, "step": 7180 }, { "epoch": 0.5832047429546008, "grad_norm": 9.364262509671228, "learning_rate": 1.952402298719737e-06, "loss": 0.3292, "step": 7181 }, { "epoch": 0.5832859579306424, "grad_norm": 3.942276858330493, "learning_rate": 1.951760665137832e-06, "loss": 0.4671, "step": 7182 }, { "epoch": 0.5833671729066839, "grad_norm": 6.118096625229616, "learning_rate": 1.9511190694911875e-06, "loss": 0.4731, "step": 7183 }, { "epoch": 0.5834483878827256, "grad_norm": 4.555831582312394, "learning_rate": 1.9504775118241987e-06, "loss": 0.4638, "step": 7184 }, { "epoch": 0.5835296028587672, "grad_norm": 3.4499740673047214, "learning_rate": 1.9498359921812583e-06, "loss": 0.6663, "step": 7185 }, { "epoch": 0.5836108178348087, "grad_norm": 6.8178554531301705, "learning_rate": 1.9491945106067544e-06, "loss": 0.4266, "step": 7186 }, { "epoch": 0.5836920328108504, "grad_norm": 4.578346378430701, "learning_rate": 1.948553067145076e-06, "loss": 0.5886, "step": 7187 }, { "epoch": 0.5837732477868919, "grad_norm": 3.0536821725267718, "learning_rate": 1.947911661840607e-06, "loss": 0.3962, "step": 7188 }, { "epoch": 0.5838544627629335, "grad_norm": 10.782933310564555, "learning_rate": 1.947270294737728e-06, "loss": 0.3495, "step": 7189 }, { "epoch": 0.583935677738975, "grad_norm": 4.129900561498041, "learning_rate": 1.9466289658808207e-06, "loss": 0.4919, "step": 7190 }, { "epoch": 0.5840168927150167, "grad_norm": 6.249402586479176, "learning_rate": 1.9459876753142593e-06, "loss": 0.541, "step": 7191 }, { "epoch": 0.5840981076910582, "grad_norm": 2.926784134616971, "learning_rate": 1.9453464230824186e-06, "loss": 0.4936, "step": 7192 }, { "epoch": 0.5841793226670998, "grad_norm": 3.6484600396561433, "learning_rate": 1.9447052092296712e-06, "loss": 0.5192, "step": 7193 }, { "epoch": 0.5842605376431413, "grad_norm": 3.898125434059147, "learning_rate": 1.9440640338003835e-06, "loss": 0.4092, "step": 7194 }, { "epoch": 0.584341752619183, "grad_norm": 3.683502965273959, "learning_rate": 1.943422896838922e-06, "loss": 0.5507, "step": 7195 }, { "epoch": 0.5844229675952246, "grad_norm": 5.288308403487764, "learning_rate": 1.9427817983896518e-06, "loss": 0.4068, "step": 7196 }, { "epoch": 0.5845041825712661, "grad_norm": 5.445064431182992, "learning_rate": 1.942140738496931e-06, "loss": 0.7837, "step": 7197 }, { "epoch": 0.5845853975473078, "grad_norm": 6.058709142095907, "learning_rate": 1.9414997172051184e-06, "loss": 0.3739, "step": 7198 }, { "epoch": 0.5846666125233493, "grad_norm": 3.970322164486651, "learning_rate": 1.9408587345585707e-06, "loss": 0.5194, "step": 7199 }, { "epoch": 0.5847478274993909, "grad_norm": 5.3016872496159495, "learning_rate": 1.9402177906016395e-06, "loss": 0.4401, "step": 7200 }, { "epoch": 0.5848290424754324, "grad_norm": 4.775951466036485, "learning_rate": 1.939576885378674e-06, "loss": 0.398, "step": 7201 }, { "epoch": 0.5849102574514741, "grad_norm": 6.899462009323301, "learning_rate": 1.9389360189340213e-06, "loss": 0.5067, "step": 7202 }, { "epoch": 0.5849914724275156, "grad_norm": 3.2063799922509464, "learning_rate": 1.9382951913120276e-06, "loss": 0.5829, "step": 7203 }, { "epoch": 0.5850726874035572, "grad_norm": 9.321053496169446, "learning_rate": 1.937654402557034e-06, "loss": 0.4999, "step": 7204 }, { "epoch": 0.5851539023795987, "grad_norm": 5.677431258954087, "learning_rate": 1.937013652713378e-06, "loss": 0.2643, "step": 7205 }, { "epoch": 0.5852351173556404, "grad_norm": 3.3598751265364446, "learning_rate": 1.9363729418253995e-06, "loss": 0.7468, "step": 7206 }, { "epoch": 0.585316332331682, "grad_norm": 4.156902235894881, "learning_rate": 1.93573226993743e-06, "loss": 0.5862, "step": 7207 }, { "epoch": 0.5853975473077235, "grad_norm": 4.5056414028942084, "learning_rate": 1.9350916370938004e-06, "loss": 0.6588, "step": 7208 }, { "epoch": 0.5854787622837652, "grad_norm": 5.759552128771596, "learning_rate": 1.9344510433388405e-06, "loss": 0.8105, "step": 7209 }, { "epoch": 0.5855599772598067, "grad_norm": 4.3224511149017255, "learning_rate": 1.9338104887168753e-06, "loss": 0.5087, "step": 7210 }, { "epoch": 0.5856411922358483, "grad_norm": 4.7674728901987455, "learning_rate": 1.933169973272227e-06, "loss": 0.4611, "step": 7211 }, { "epoch": 0.5857224072118898, "grad_norm": 5.687185959055942, "learning_rate": 1.932529497049217e-06, "loss": 0.5603, "step": 7212 }, { "epoch": 0.5858036221879315, "grad_norm": 6.09715437191108, "learning_rate": 1.9318890600921638e-06, "loss": 0.6074, "step": 7213 }, { "epoch": 0.585884837163973, "grad_norm": 5.020060984695505, "learning_rate": 1.9312486624453783e-06, "loss": 0.6328, "step": 7214 }, { "epoch": 0.5859660521400146, "grad_norm": 4.477953926588951, "learning_rate": 1.9306083041531773e-06, "loss": 0.5313, "step": 7215 }, { "epoch": 0.5860472671160561, "grad_norm": 3.519455685248619, "learning_rate": 1.9299679852598684e-06, "loss": 0.6649, "step": 7216 }, { "epoch": 0.5861284820920978, "grad_norm": 3.6704238824484325, "learning_rate": 1.929327705809757e-06, "loss": 0.4758, "step": 7217 }, { "epoch": 0.5862096970681394, "grad_norm": 6.283905565847572, "learning_rate": 1.928687465847148e-06, "loss": 0.4217, "step": 7218 }, { "epoch": 0.5862909120441809, "grad_norm": 6.945906876806998, "learning_rate": 1.9280472654163436e-06, "loss": 0.4945, "step": 7219 }, { "epoch": 0.5863721270202226, "grad_norm": 8.370884932771967, "learning_rate": 1.927407104561641e-06, "loss": 0.4238, "step": 7220 }, { "epoch": 0.5864533419962641, "grad_norm": 4.383870501450878, "learning_rate": 1.926766983327336e-06, "loss": 0.5634, "step": 7221 }, { "epoch": 0.5865345569723057, "grad_norm": 9.241632141950406, "learning_rate": 1.9261269017577228e-06, "loss": 0.634, "step": 7222 }, { "epoch": 0.5866157719483472, "grad_norm": 3.7586621345897853, "learning_rate": 1.9254868598970904e-06, "loss": 0.5618, "step": 7223 }, { "epoch": 0.5866969869243889, "grad_norm": 4.209074416276335, "learning_rate": 1.924846857789726e-06, "loss": 0.6119, "step": 7224 }, { "epoch": 0.5867782019004304, "grad_norm": 4.546889702288765, "learning_rate": 1.924206895479916e-06, "loss": 0.4853, "step": 7225 }, { "epoch": 0.586859416876472, "grad_norm": 3.7823061514718423, "learning_rate": 1.9235669730119415e-06, "loss": 0.3448, "step": 7226 }, { "epoch": 0.5869406318525136, "grad_norm": 4.256499430588995, "learning_rate": 1.922927090430081e-06, "loss": 0.5403, "step": 7227 }, { "epoch": 0.5870218468285552, "grad_norm": 44.81613726842311, "learning_rate": 1.9222872477786124e-06, "loss": 0.4421, "step": 7228 }, { "epoch": 0.5871030618045968, "grad_norm": 4.765715400723174, "learning_rate": 1.921647445101809e-06, "loss": 0.4627, "step": 7229 }, { "epoch": 0.5871842767806383, "grad_norm": 20.740588315180986, "learning_rate": 1.921007682443941e-06, "loss": 0.4778, "step": 7230 }, { "epoch": 0.58726549175668, "grad_norm": 5.271002543096211, "learning_rate": 1.920367959849277e-06, "loss": 0.4699, "step": 7231 }, { "epoch": 0.5873467067327215, "grad_norm": 4.633497702253883, "learning_rate": 1.919728277362083e-06, "loss": 0.6411, "step": 7232 }, { "epoch": 0.5874279217087631, "grad_norm": 2.739394598878589, "learning_rate": 1.91908863502662e-06, "loss": 0.3667, "step": 7233 }, { "epoch": 0.5875091366848046, "grad_norm": 5.224714666393266, "learning_rate": 1.9184490328871502e-06, "loss": 0.5021, "step": 7234 }, { "epoch": 0.5875903516608463, "grad_norm": 7.019642199719357, "learning_rate": 1.9178094709879296e-06, "loss": 0.4818, "step": 7235 }, { "epoch": 0.5876715666368878, "grad_norm": 5.128939865748304, "learning_rate": 1.9171699493732122e-06, "loss": 0.455, "step": 7236 }, { "epoch": 0.5877527816129294, "grad_norm": 7.227653257937603, "learning_rate": 1.916530468087249e-06, "loss": 0.4862, "step": 7237 }, { "epoch": 0.587833996588971, "grad_norm": 5.527188153857498, "learning_rate": 1.9158910271742905e-06, "loss": 0.5265, "step": 7238 }, { "epoch": 0.5879152115650126, "grad_norm": 2.6827965014779593, "learning_rate": 1.9152516266785807e-06, "loss": 0.5964, "step": 7239 }, { "epoch": 0.5879964265410542, "grad_norm": 5.211112335596169, "learning_rate": 1.9146122666443635e-06, "loss": 0.5542, "step": 7240 }, { "epoch": 0.5880776415170957, "grad_norm": 4.03076289953267, "learning_rate": 1.91397294711588e-06, "loss": 0.4605, "step": 7241 }, { "epoch": 0.5881588564931374, "grad_norm": 5.996591886378709, "learning_rate": 1.9133336681373673e-06, "loss": 0.5431, "step": 7242 }, { "epoch": 0.5882400714691789, "grad_norm": 4.7750369105364845, "learning_rate": 1.912694429753059e-06, "loss": 0.4664, "step": 7243 }, { "epoch": 0.5883212864452205, "grad_norm": 4.118454697868038, "learning_rate": 1.912055232007188e-06, "loss": 0.4708, "step": 7244 }, { "epoch": 0.588402501421262, "grad_norm": 5.287731670886094, "learning_rate": 1.911416074943984e-06, "loss": 0.4099, "step": 7245 }, { "epoch": 0.5884837163973037, "grad_norm": 3.463709326164506, "learning_rate": 1.9107769586076716e-06, "loss": 0.4555, "step": 7246 }, { "epoch": 0.5885649313733452, "grad_norm": 3.9158367444108855, "learning_rate": 1.9101378830424758e-06, "loss": 0.5739, "step": 7247 }, { "epoch": 0.5886461463493868, "grad_norm": 4.558330070983507, "learning_rate": 1.909498848292617e-06, "loss": 0.561, "step": 7248 }, { "epoch": 0.5887273613254284, "grad_norm": 5.495600846510651, "learning_rate": 1.9088598544023118e-06, "loss": 0.3776, "step": 7249 }, { "epoch": 0.58880857630147, "grad_norm": 4.632815620841959, "learning_rate": 1.908220901415777e-06, "loss": 0.4483, "step": 7250 }, { "epoch": 0.5888897912775116, "grad_norm": 4.978619608664077, "learning_rate": 1.907581989377224e-06, "loss": 0.5724, "step": 7251 }, { "epoch": 0.5889710062535531, "grad_norm": 5.346540069308749, "learning_rate": 1.9069431183308615e-06, "loss": 0.5294, "step": 7252 }, { "epoch": 0.5890522212295948, "grad_norm": 4.59400323772315, "learning_rate": 1.906304288320896e-06, "loss": 0.6616, "step": 7253 }, { "epoch": 0.5891334362056363, "grad_norm": 6.3719239881505665, "learning_rate": 1.9056654993915326e-06, "loss": 0.6185, "step": 7254 }, { "epoch": 0.5892146511816779, "grad_norm": 6.682687810031172, "learning_rate": 1.9050267515869709e-06, "loss": 0.4781, "step": 7255 }, { "epoch": 0.5892958661577195, "grad_norm": 3.7501427973660397, "learning_rate": 1.9043880449514085e-06, "loss": 0.8261, "step": 7256 }, { "epoch": 0.5893770811337611, "grad_norm": 5.558288691226038, "learning_rate": 1.9037493795290421e-06, "loss": 0.4972, "step": 7257 }, { "epoch": 0.5894582961098026, "grad_norm": 5.421642176266866, "learning_rate": 1.9031107553640632e-06, "loss": 0.4625, "step": 7258 }, { "epoch": 0.5895395110858442, "grad_norm": 5.221069697025148, "learning_rate": 1.9024721725006598e-06, "loss": 0.3844, "step": 7259 }, { "epoch": 0.5896207260618858, "grad_norm": 7.976877536523762, "learning_rate": 1.9018336309830202e-06, "loss": 0.3608, "step": 7260 }, { "epoch": 0.5897019410379274, "grad_norm": 9.70876511343524, "learning_rate": 1.9011951308553284e-06, "loss": 0.5597, "step": 7261 }, { "epoch": 0.589783156013969, "grad_norm": 4.2405529559362405, "learning_rate": 1.900556672161763e-06, "loss": 0.5158, "step": 7262 }, { "epoch": 0.5898643709900105, "grad_norm": 6.1774490371345845, "learning_rate": 1.899918254946504e-06, "loss": 0.5368, "step": 7263 }, { "epoch": 0.5899455859660522, "grad_norm": 5.486539007392727, "learning_rate": 1.8992798792537265e-06, "loss": 0.6, "step": 7264 }, { "epoch": 0.5900268009420937, "grad_norm": 4.8768276862477995, "learning_rate": 1.898641545127601e-06, "loss": 0.3974, "step": 7265 }, { "epoch": 0.5901080159181353, "grad_norm": 5.691046376635863, "learning_rate": 1.8980032526122985e-06, "loss": 0.5479, "step": 7266 }, { "epoch": 0.5901892308941769, "grad_norm": 5.4416215817890325, "learning_rate": 1.8973650017519855e-06, "loss": 0.4471, "step": 7267 }, { "epoch": 0.5902704458702185, "grad_norm": 3.7432268661179324, "learning_rate": 1.8967267925908237e-06, "loss": 0.5173, "step": 7268 }, { "epoch": 0.59035166084626, "grad_norm": 8.040626967946537, "learning_rate": 1.8960886251729756e-06, "loss": 0.4184, "step": 7269 }, { "epoch": 0.5904328758223016, "grad_norm": 6.080428319987492, "learning_rate": 1.8954504995425994e-06, "loss": 0.4146, "step": 7270 }, { "epoch": 0.5905140907983432, "grad_norm": 4.115370064107381, "learning_rate": 1.8948124157438485e-06, "loss": 0.4314, "step": 7271 }, { "epoch": 0.5905953057743848, "grad_norm": 4.654941886199049, "learning_rate": 1.8941743738208752e-06, "loss": 0.4831, "step": 7272 }, { "epoch": 0.5906765207504264, "grad_norm": 6.074230738214155, "learning_rate": 1.8935363738178288e-06, "loss": 0.5219, "step": 7273 }, { "epoch": 0.590757735726468, "grad_norm": 4.209033114032256, "learning_rate": 1.8928984157788565e-06, "loss": 0.6185, "step": 7274 }, { "epoch": 0.5908389507025096, "grad_norm": 6.854702739452237, "learning_rate": 1.8922604997480998e-06, "loss": 0.5069, "step": 7275 }, { "epoch": 0.5909201656785511, "grad_norm": 5.0281604588869975, "learning_rate": 1.8916226257697004e-06, "loss": 0.4144, "step": 7276 }, { "epoch": 0.5910013806545927, "grad_norm": 3.904693540082544, "learning_rate": 1.8909847938877962e-06, "loss": 0.697, "step": 7277 }, { "epoch": 0.5910825956306343, "grad_norm": 4.638587360552655, "learning_rate": 1.89034700414652e-06, "loss": 0.3997, "step": 7278 }, { "epoch": 0.5911638106066759, "grad_norm": 4.955343175267275, "learning_rate": 1.8897092565900048e-06, "loss": 0.5474, "step": 7279 }, { "epoch": 0.5912450255827174, "grad_norm": 4.586670514862452, "learning_rate": 1.8890715512623802e-06, "loss": 0.5885, "step": 7280 }, { "epoch": 0.591326240558759, "grad_norm": 6.135861467307459, "learning_rate": 1.8884338882077697e-06, "loss": 0.4193, "step": 7281 }, { "epoch": 0.5914074555348006, "grad_norm": 8.91363405049214, "learning_rate": 1.8877962674702977e-06, "loss": 0.4749, "step": 7282 }, { "epoch": 0.5914886705108422, "grad_norm": 3.5396738186174317, "learning_rate": 1.8871586890940847e-06, "loss": 0.4059, "step": 7283 }, { "epoch": 0.5915698854868838, "grad_norm": 4.623629413772576, "learning_rate": 1.886521153123246e-06, "loss": 0.4456, "step": 7284 }, { "epoch": 0.5916511004629254, "grad_norm": 4.952672810545965, "learning_rate": 1.8858836596018973e-06, "loss": 0.4957, "step": 7285 }, { "epoch": 0.591732315438967, "grad_norm": 4.741145039545014, "learning_rate": 1.8852462085741497e-06, "loss": 0.5374, "step": 7286 }, { "epoch": 0.5918135304150085, "grad_norm": 8.418571766778543, "learning_rate": 1.8846088000841096e-06, "loss": 0.558, "step": 7287 }, { "epoch": 0.5918947453910501, "grad_norm": 3.602706463619327, "learning_rate": 1.8839714341758847e-06, "loss": 0.5848, "step": 7288 }, { "epoch": 0.5919759603670917, "grad_norm": 10.651397817035003, "learning_rate": 1.883334110893576e-06, "loss": 0.5071, "step": 7289 }, { "epoch": 0.5920571753431333, "grad_norm": 4.361973439215471, "learning_rate": 1.8826968302812837e-06, "loss": 0.4555, "step": 7290 }, { "epoch": 0.5921383903191748, "grad_norm": 6.18398302417084, "learning_rate": 1.8820595923831025e-06, "loss": 0.6062, "step": 7291 }, { "epoch": 0.5922196052952164, "grad_norm": 8.248248546443998, "learning_rate": 1.8814223972431276e-06, "loss": 0.5944, "step": 7292 }, { "epoch": 0.592300820271258, "grad_norm": 7.741858864310114, "learning_rate": 1.8807852449054497e-06, "loss": 0.5043, "step": 7293 }, { "epoch": 0.5923820352472996, "grad_norm": 3.9704756953268565, "learning_rate": 1.8801481354141547e-06, "loss": 0.5163, "step": 7294 }, { "epoch": 0.5924632502233412, "grad_norm": 4.432871358786943, "learning_rate": 1.8795110688133283e-06, "loss": 0.4582, "step": 7295 }, { "epoch": 0.5925444651993828, "grad_norm": 4.035829611958314, "learning_rate": 1.878874045147053e-06, "loss": 0.4393, "step": 7296 }, { "epoch": 0.5926256801754244, "grad_norm": 3.2780411870082284, "learning_rate": 1.8782370644594055e-06, "loss": 0.62, "step": 7297 }, { "epoch": 0.5927068951514659, "grad_norm": 5.278700539156712, "learning_rate": 1.8776001267944628e-06, "loss": 0.5785, "step": 7298 }, { "epoch": 0.5927881101275075, "grad_norm": 3.827532458925898, "learning_rate": 1.876963232196298e-06, "loss": 0.57, "step": 7299 }, { "epoch": 0.5928693251035491, "grad_norm": 4.0256389154851595, "learning_rate": 1.876326380708979e-06, "loss": 0.609, "step": 7300 }, { "epoch": 0.5929505400795907, "grad_norm": 8.391342343121048, "learning_rate": 1.8756895723765747e-06, "loss": 0.5095, "step": 7301 }, { "epoch": 0.5930317550556322, "grad_norm": 3.698568278989539, "learning_rate": 1.8750528072431477e-06, "loss": 0.5275, "step": 7302 }, { "epoch": 0.5931129700316738, "grad_norm": 7.336981984750687, "learning_rate": 1.8744160853527579e-06, "loss": 0.6133, "step": 7303 }, { "epoch": 0.5931941850077154, "grad_norm": 4.166581863576458, "learning_rate": 1.8737794067494656e-06, "loss": 0.4626, "step": 7304 }, { "epoch": 0.593275399983757, "grad_norm": 4.274646132254657, "learning_rate": 1.8731427714773233e-06, "loss": 0.5315, "step": 7305 }, { "epoch": 0.5933566149597986, "grad_norm": 3.911879665279761, "learning_rate": 1.8725061795803846e-06, "loss": 0.4508, "step": 7306 }, { "epoch": 0.5934378299358402, "grad_norm": 2.642462457610074, "learning_rate": 1.8718696311026956e-06, "loss": 0.5147, "step": 7307 }, { "epoch": 0.5935190449118818, "grad_norm": 4.675230745750343, "learning_rate": 1.871233126088305e-06, "loss": 0.3801, "step": 7308 }, { "epoch": 0.5936002598879233, "grad_norm": 10.255898660318655, "learning_rate": 1.8705966645812544e-06, "loss": 0.5329, "step": 7309 }, { "epoch": 0.5936814748639649, "grad_norm": 4.954517136307957, "learning_rate": 1.8699602466255828e-06, "loss": 0.4131, "step": 7310 }, { "epoch": 0.5937626898400065, "grad_norm": 4.751776132607896, "learning_rate": 1.8693238722653278e-06, "loss": 0.4266, "step": 7311 }, { "epoch": 0.5938439048160481, "grad_norm": 3.047587348683506, "learning_rate": 1.8686875415445238e-06, "loss": 0.4889, "step": 7312 }, { "epoch": 0.5939251197920896, "grad_norm": 12.990954504278774, "learning_rate": 1.8680512545071999e-06, "loss": 0.4731, "step": 7313 }, { "epoch": 0.5940063347681313, "grad_norm": 4.070689985920183, "learning_rate": 1.8674150111973854e-06, "loss": 0.6631, "step": 7314 }, { "epoch": 0.5940875497441728, "grad_norm": 2.864000413306198, "learning_rate": 1.866778811659104e-06, "loss": 0.8279, "step": 7315 }, { "epoch": 0.5941687647202144, "grad_norm": 3.763015690605599, "learning_rate": 1.8661426559363768e-06, "loss": 0.6624, "step": 7316 }, { "epoch": 0.594249979696256, "grad_norm": 6.349440373441788, "learning_rate": 1.8655065440732243e-06, "loss": 0.5768, "step": 7317 }, { "epoch": 0.5943311946722976, "grad_norm": 3.739887750298035, "learning_rate": 1.8648704761136604e-06, "loss": 0.54, "step": 7318 }, { "epoch": 0.5944124096483392, "grad_norm": 3.8465470522434977, "learning_rate": 1.8642344521016974e-06, "loss": 0.4265, "step": 7319 }, { "epoch": 0.5944936246243807, "grad_norm": 5.442222857316663, "learning_rate": 1.8635984720813471e-06, "loss": 0.5427, "step": 7320 }, { "epoch": 0.5945748396004223, "grad_norm": 5.036275208595901, "learning_rate": 1.8629625360966137e-06, "loss": 0.4966, "step": 7321 }, { "epoch": 0.5946560545764639, "grad_norm": 4.325139913021066, "learning_rate": 1.8623266441915006e-06, "loss": 0.4192, "step": 7322 }, { "epoch": 0.5947372695525055, "grad_norm": 3.7229174258197437, "learning_rate": 1.86169079641001e-06, "loss": 0.4411, "step": 7323 }, { "epoch": 0.594818484528547, "grad_norm": 4.325040819420997, "learning_rate": 1.861054992796138e-06, "loss": 0.5971, "step": 7324 }, { "epoch": 0.5948996995045887, "grad_norm": 5.429278035591401, "learning_rate": 1.860419233393879e-06, "loss": 0.5484, "step": 7325 }, { "epoch": 0.5949809144806302, "grad_norm": 4.384346726547922, "learning_rate": 1.859783518247223e-06, "loss": 0.5676, "step": 7326 }, { "epoch": 0.5950621294566718, "grad_norm": 7.391929682556335, "learning_rate": 1.8591478474001601e-06, "loss": 0.4677, "step": 7327 }, { "epoch": 0.5951433444327134, "grad_norm": 5.680231306618248, "learning_rate": 1.858512220896675e-06, "loss": 0.562, "step": 7328 }, { "epoch": 0.595224559408755, "grad_norm": 4.0720446605312395, "learning_rate": 1.857876638780748e-06, "loss": 0.4176, "step": 7329 }, { "epoch": 0.5953057743847966, "grad_norm": 4.5916594217289575, "learning_rate": 1.85724110109636e-06, "loss": 0.5309, "step": 7330 }, { "epoch": 0.5953869893608381, "grad_norm": 4.6780280469096365, "learning_rate": 1.8566056078874858e-06, "loss": 0.4923, "step": 7331 }, { "epoch": 0.5954682043368797, "grad_norm": 3.6599980641941734, "learning_rate": 1.8559701591980977e-06, "loss": 0.5152, "step": 7332 }, { "epoch": 0.5955494193129213, "grad_norm": 2.978800119941686, "learning_rate": 1.8553347550721672e-06, "loss": 0.4539, "step": 7333 }, { "epoch": 0.5956306342889629, "grad_norm": 3.411250854476356, "learning_rate": 1.8546993955536597e-06, "loss": 0.64, "step": 7334 }, { "epoch": 0.5957118492650044, "grad_norm": 4.069278334737661, "learning_rate": 1.8540640806865379e-06, "loss": 0.5401, "step": 7335 }, { "epoch": 0.595793064241046, "grad_norm": 7.8976077553265505, "learning_rate": 1.8534288105147644e-06, "loss": 0.4868, "step": 7336 }, { "epoch": 0.5958742792170876, "grad_norm": 3.08341050427021, "learning_rate": 1.8527935850822947e-06, "loss": 0.4693, "step": 7337 }, { "epoch": 0.5959554941931292, "grad_norm": 4.0472231931258955, "learning_rate": 1.8521584044330832e-06, "loss": 0.5314, "step": 7338 }, { "epoch": 0.5960367091691708, "grad_norm": 7.289070610294687, "learning_rate": 1.851523268611082e-06, "loss": 0.6015, "step": 7339 }, { "epoch": 0.5961179241452124, "grad_norm": 5.48979102747638, "learning_rate": 1.8508881776602386e-06, "loss": 0.4459, "step": 7340 }, { "epoch": 0.596199139121254, "grad_norm": 15.0231623389192, "learning_rate": 1.850253131624497e-06, "loss": 0.5274, "step": 7341 }, { "epoch": 0.5962803540972955, "grad_norm": 4.569897243481455, "learning_rate": 1.8496181305478014e-06, "loss": 0.7543, "step": 7342 }, { "epoch": 0.5963615690733371, "grad_norm": 5.818717995667628, "learning_rate": 1.8489831744740887e-06, "loss": 0.4809, "step": 7343 }, { "epoch": 0.5964427840493787, "grad_norm": 7.4916301022433505, "learning_rate": 1.8483482634472948e-06, "loss": 0.4216, "step": 7344 }, { "epoch": 0.5965239990254203, "grad_norm": 2.952985348960789, "learning_rate": 1.8477133975113516e-06, "loss": 0.4636, "step": 7345 }, { "epoch": 0.5966052140014618, "grad_norm": 5.3497885917720875, "learning_rate": 1.8470785767101898e-06, "loss": 0.5137, "step": 7346 }, { "epoch": 0.5966864289775035, "grad_norm": 4.056347500018793, "learning_rate": 1.8464438010877348e-06, "loss": 0.6904, "step": 7347 }, { "epoch": 0.596767643953545, "grad_norm": 3.334261926906368, "learning_rate": 1.845809070687909e-06, "loss": 0.5172, "step": 7348 }, { "epoch": 0.5968488589295866, "grad_norm": 8.367386875652794, "learning_rate": 1.8451743855546345e-06, "loss": 0.434, "step": 7349 }, { "epoch": 0.5969300739056282, "grad_norm": 6.559459547573149, "learning_rate": 1.8445397457318265e-06, "loss": 0.5824, "step": 7350 }, { "epoch": 0.5970112888816698, "grad_norm": 6.094611931889219, "learning_rate": 1.8439051512633984e-06, "loss": 0.5153, "step": 7351 }, { "epoch": 0.5970925038577114, "grad_norm": 25.80816640442915, "learning_rate": 1.8432706021932627e-06, "loss": 0.6593, "step": 7352 }, { "epoch": 0.5971737188337529, "grad_norm": 4.977546000712229, "learning_rate": 1.8426360985653248e-06, "loss": 0.6459, "step": 7353 }, { "epoch": 0.5972549338097946, "grad_norm": 5.821123685581579, "learning_rate": 1.8420016404234897e-06, "loss": 0.5861, "step": 7354 }, { "epoch": 0.5973361487858361, "grad_norm": 6.3086144237425525, "learning_rate": 1.8413672278116595e-06, "loss": 0.4389, "step": 7355 }, { "epoch": 0.5974173637618777, "grad_norm": 5.044008936737754, "learning_rate": 1.840732860773731e-06, "loss": 0.4021, "step": 7356 }, { "epoch": 0.5974985787379192, "grad_norm": 4.6734719943119005, "learning_rate": 1.8400985393535986e-06, "loss": 0.4824, "step": 7357 }, { "epoch": 0.5975797937139609, "grad_norm": 3.603622687404737, "learning_rate": 1.8394642635951563e-06, "loss": 0.5207, "step": 7358 }, { "epoch": 0.5976610086900024, "grad_norm": 3.7989831364132, "learning_rate": 1.838830033542291e-06, "loss": 0.6815, "step": 7359 }, { "epoch": 0.597742223666044, "grad_norm": 2.3766163305666153, "learning_rate": 1.8381958492388873e-06, "loss": 0.4749, "step": 7360 }, { "epoch": 0.5978234386420856, "grad_norm": 3.0966906185261993, "learning_rate": 1.837561710728828e-06, "loss": 0.5948, "step": 7361 }, { "epoch": 0.5979046536181272, "grad_norm": 8.468628755593905, "learning_rate": 1.8369276180559933e-06, "loss": 0.3566, "step": 7362 }, { "epoch": 0.5979858685941688, "grad_norm": 9.11735062017391, "learning_rate": 1.836293571264258e-06, "loss": 0.4737, "step": 7363 }, { "epoch": 0.5980670835702103, "grad_norm": 5.041492074022004, "learning_rate": 1.835659570397494e-06, "loss": 0.4476, "step": 7364 }, { "epoch": 0.598148298546252, "grad_norm": 3.99736329645453, "learning_rate": 1.8350256154995733e-06, "loss": 0.6365, "step": 7365 }, { "epoch": 0.5982295135222935, "grad_norm": 8.810624359575181, "learning_rate": 1.8343917066143597e-06, "loss": 0.4232, "step": 7366 }, { "epoch": 0.5983107284983351, "grad_norm": 4.925256449200388, "learning_rate": 1.8337578437857169e-06, "loss": 0.4499, "step": 7367 }, { "epoch": 0.5983919434743766, "grad_norm": 3.5049195280397565, "learning_rate": 1.8331240270575062e-06, "loss": 0.6362, "step": 7368 }, { "epoch": 0.5984731584504183, "grad_norm": 8.10123639728082, "learning_rate": 1.8324902564735834e-06, "loss": 0.4814, "step": 7369 }, { "epoch": 0.5985543734264598, "grad_norm": 3.656092093713114, "learning_rate": 1.831856532077801e-06, "loss": 0.6061, "step": 7370 }, { "epoch": 0.5986355884025014, "grad_norm": 5.372488168989093, "learning_rate": 1.831222853914012e-06, "loss": 0.5401, "step": 7371 }, { "epoch": 0.598716803378543, "grad_norm": 5.279583180258407, "learning_rate": 1.830589222026062e-06, "loss": 0.4336, "step": 7372 }, { "epoch": 0.5987980183545846, "grad_norm": 8.322203353642944, "learning_rate": 1.8299556364577936e-06, "loss": 0.6477, "step": 7373 }, { "epoch": 0.5988792333306262, "grad_norm": 4.359529690940866, "learning_rate": 1.8293220972530498e-06, "loss": 0.5287, "step": 7374 }, { "epoch": 0.5989604483066677, "grad_norm": 4.599497389802597, "learning_rate": 1.8286886044556678e-06, "loss": 0.4167, "step": 7375 }, { "epoch": 0.5990416632827094, "grad_norm": 4.944453273477391, "learning_rate": 1.8280551581094808e-06, "loss": 0.4743, "step": 7376 }, { "epoch": 0.5991228782587509, "grad_norm": 6.818339130298646, "learning_rate": 1.8274217582583207e-06, "loss": 0.6592, "step": 7377 }, { "epoch": 0.5992040932347925, "grad_norm": 7.668165030066059, "learning_rate": 1.826788404946016e-06, "loss": 0.5305, "step": 7378 }, { "epoch": 0.599285308210834, "grad_norm": 4.13273227160322, "learning_rate": 1.8261550982163904e-06, "loss": 0.5133, "step": 7379 }, { "epoch": 0.5993665231868757, "grad_norm": 2.7993562519365196, "learning_rate": 1.825521838113265e-06, "loss": 0.5664, "step": 7380 }, { "epoch": 0.5994477381629172, "grad_norm": 9.626646869624484, "learning_rate": 1.8248886246804598e-06, "loss": 0.4671, "step": 7381 }, { "epoch": 0.5995289531389588, "grad_norm": 7.444176905061055, "learning_rate": 1.8242554579617883e-06, "loss": 0.5715, "step": 7382 }, { "epoch": 0.5996101681150005, "grad_norm": 9.588196848439587, "learning_rate": 1.8236223380010625e-06, "loss": 0.4073, "step": 7383 }, { "epoch": 0.599691383091042, "grad_norm": 6.866343301362604, "learning_rate": 1.8229892648420922e-06, "loss": 0.6084, "step": 7384 }, { "epoch": 0.5997725980670836, "grad_norm": 5.01419995093504, "learning_rate": 1.8223562385286809e-06, "loss": 0.4774, "step": 7385 }, { "epoch": 0.5998538130431251, "grad_norm": 5.956266278463153, "learning_rate": 1.8217232591046313e-06, "loss": 0.5421, "step": 7386 }, { "epoch": 0.5999350280191668, "grad_norm": 5.610744689095254, "learning_rate": 1.8210903266137434e-06, "loss": 0.7168, "step": 7387 }, { "epoch": 0.6000162429952083, "grad_norm": 4.657987631786978, "learning_rate": 1.8204574410998119e-06, "loss": 0.5227, "step": 7388 }, { "epoch": 0.6000974579712499, "grad_norm": 4.434514515163585, "learning_rate": 1.8198246026066279e-06, "loss": 0.4822, "step": 7389 }, { "epoch": 0.6001786729472914, "grad_norm": 5.183467175096997, "learning_rate": 1.819191811177982e-06, "loss": 0.579, "step": 7390 }, { "epoch": 0.6002598879233331, "grad_norm": 4.579031661588377, "learning_rate": 1.8185590668576602e-06, "loss": 0.6555, "step": 7391 }, { "epoch": 0.6003411028993746, "grad_norm": 5.1765990102478066, "learning_rate": 1.817926369689444e-06, "loss": 0.541, "step": 7392 }, { "epoch": 0.6004223178754162, "grad_norm": 3.4939919762553977, "learning_rate": 1.817293719717113e-06, "loss": 0.5579, "step": 7393 }, { "epoch": 0.6005035328514579, "grad_norm": 4.962538720225719, "learning_rate": 1.8166611169844444e-06, "loss": 0.3529, "step": 7394 }, { "epoch": 0.6005847478274994, "grad_norm": 5.042600514532005, "learning_rate": 1.8160285615352092e-06, "loss": 0.4768, "step": 7395 }, { "epoch": 0.600665962803541, "grad_norm": 3.8637771148189097, "learning_rate": 1.8153960534131774e-06, "loss": 0.6365, "step": 7396 }, { "epoch": 0.6007471777795825, "grad_norm": 4.22885301921691, "learning_rate": 1.8147635926621162e-06, "loss": 0.5797, "step": 7397 }, { "epoch": 0.6008283927556242, "grad_norm": 3.6962740741884095, "learning_rate": 1.8141311793257876e-06, "loss": 0.3554, "step": 7398 }, { "epoch": 0.6009096077316657, "grad_norm": 3.758699623360753, "learning_rate": 1.813498813447951e-06, "loss": 0.596, "step": 7399 }, { "epoch": 0.6009908227077073, "grad_norm": 3.201807519185477, "learning_rate": 1.812866495072364e-06, "loss": 0.5367, "step": 7400 }, { "epoch": 0.6010720376837488, "grad_norm": 4.447710834904164, "learning_rate": 1.812234224242779e-06, "loss": 0.4886, "step": 7401 }, { "epoch": 0.6011532526597905, "grad_norm": 4.642639006126207, "learning_rate": 1.8116020010029448e-06, "loss": 0.5678, "step": 7402 }, { "epoch": 0.601234467635832, "grad_norm": 3.848966652372118, "learning_rate": 1.8109698253966092e-06, "loss": 0.4742, "step": 7403 }, { "epoch": 0.6013156826118736, "grad_norm": 4.125486033292355, "learning_rate": 1.8103376974675157e-06, "loss": 0.4872, "step": 7404 }, { "epoch": 0.6013968975879153, "grad_norm": 5.747061637911401, "learning_rate": 1.8097056172594023e-06, "loss": 0.4748, "step": 7405 }, { "epoch": 0.6014781125639568, "grad_norm": 4.074457822433304, "learning_rate": 1.8090735848160079e-06, "loss": 0.3921, "step": 7406 }, { "epoch": 0.6015593275399984, "grad_norm": 9.807247879545486, "learning_rate": 1.808441600181065e-06, "loss": 0.5085, "step": 7407 }, { "epoch": 0.6016405425160399, "grad_norm": 5.7128146988760475, "learning_rate": 1.8078096633983023e-06, "loss": 0.431, "step": 7408 }, { "epoch": 0.6017217574920816, "grad_norm": 21.8207651189527, "learning_rate": 1.8071777745114477e-06, "loss": 0.5174, "step": 7409 }, { "epoch": 0.6018029724681231, "grad_norm": 9.425825620792596, "learning_rate": 1.8065459335642254e-06, "loss": 0.4276, "step": 7410 }, { "epoch": 0.6018841874441647, "grad_norm": 4.023515489113687, "learning_rate": 1.8059141406003532e-06, "loss": 0.4419, "step": 7411 }, { "epoch": 0.6019654024202062, "grad_norm": 3.615004223601, "learning_rate": 1.8052823956635496e-06, "loss": 0.5893, "step": 7412 }, { "epoch": 0.6020466173962479, "grad_norm": 5.159465901388711, "learning_rate": 1.8046506987975278e-06, "loss": 0.429, "step": 7413 }, { "epoch": 0.6021278323722894, "grad_norm": 3.103825361755657, "learning_rate": 1.804019050045998e-06, "loss": 0.6653, "step": 7414 }, { "epoch": 0.602209047348331, "grad_norm": 5.198427303718621, "learning_rate": 1.8033874494526646e-06, "loss": 0.5391, "step": 7415 }, { "epoch": 0.6022902623243727, "grad_norm": 7.073819537725478, "learning_rate": 1.8027558970612347e-06, "loss": 0.7089, "step": 7416 }, { "epoch": 0.6023714773004142, "grad_norm": 3.801629807198999, "learning_rate": 1.8021243929154063e-06, "loss": 0.3996, "step": 7417 }, { "epoch": 0.6024526922764558, "grad_norm": 24.873498159563677, "learning_rate": 1.8014929370588757e-06, "loss": 0.5828, "step": 7418 }, { "epoch": 0.6025339072524973, "grad_norm": 5.485907374005129, "learning_rate": 1.8008615295353376e-06, "loss": 0.5204, "step": 7419 }, { "epoch": 0.602615122228539, "grad_norm": 3.9473453351114705, "learning_rate": 1.8002301703884816e-06, "loss": 0.4032, "step": 7420 }, { "epoch": 0.6026963372045805, "grad_norm": 4.470083941751573, "learning_rate": 1.799598859661994e-06, "loss": 0.4807, "step": 7421 }, { "epoch": 0.6027775521806221, "grad_norm": 5.031697404411579, "learning_rate": 1.7989675973995585e-06, "loss": 0.4221, "step": 7422 }, { "epoch": 0.6028587671566636, "grad_norm": 6.979484374974144, "learning_rate": 1.7983363836448559e-06, "loss": 0.3792, "step": 7423 }, { "epoch": 0.6029399821327053, "grad_norm": 5.457913677824293, "learning_rate": 1.7977052184415606e-06, "loss": 0.3426, "step": 7424 }, { "epoch": 0.6030211971087468, "grad_norm": 4.4056991242013455, "learning_rate": 1.7970741018333482e-06, "loss": 0.5527, "step": 7425 }, { "epoch": 0.6031024120847884, "grad_norm": 9.887330375392434, "learning_rate": 1.7964430338638883e-06, "loss": 0.392, "step": 7426 }, { "epoch": 0.6031836270608301, "grad_norm": 4.068954581333052, "learning_rate": 1.7958120145768457e-06, "loss": 0.4971, "step": 7427 }, { "epoch": 0.6032648420368716, "grad_norm": 4.434746948784985, "learning_rate": 1.7951810440158853e-06, "loss": 0.503, "step": 7428 }, { "epoch": 0.6033460570129132, "grad_norm": 4.082130196766388, "learning_rate": 1.7945501222246673e-06, "loss": 0.4411, "step": 7429 }, { "epoch": 0.6034272719889547, "grad_norm": 10.979428713512291, "learning_rate": 1.793919249246846e-06, "loss": 0.4922, "step": 7430 }, { "epoch": 0.6035084869649964, "grad_norm": 4.413393240856426, "learning_rate": 1.7932884251260767e-06, "loss": 0.5367, "step": 7431 }, { "epoch": 0.6035897019410379, "grad_norm": 3.815015398191507, "learning_rate": 1.7926576499060078e-06, "loss": 0.5533, "step": 7432 }, { "epoch": 0.6036709169170795, "grad_norm": 4.993930990302795, "learning_rate": 1.7920269236302868e-06, "loss": 0.4204, "step": 7433 }, { "epoch": 0.603752131893121, "grad_norm": 4.907619542220186, "learning_rate": 1.7913962463425544e-06, "loss": 0.5547, "step": 7434 }, { "epoch": 0.6038333468691627, "grad_norm": 4.135750795376068, "learning_rate": 1.7907656180864519e-06, "loss": 0.6044, "step": 7435 }, { "epoch": 0.6039145618452042, "grad_norm": 15.259932160511177, "learning_rate": 1.790135038905616e-06, "loss": 0.5229, "step": 7436 }, { "epoch": 0.6039957768212458, "grad_norm": 5.126091226363883, "learning_rate": 1.7895045088436772e-06, "loss": 0.4935, "step": 7437 }, { "epoch": 0.6040769917972875, "grad_norm": 4.347192454488315, "learning_rate": 1.7888740279442669e-06, "loss": 0.4883, "step": 7438 }, { "epoch": 0.604158206773329, "grad_norm": 8.459728931856809, "learning_rate": 1.7882435962510102e-06, "loss": 0.6231, "step": 7439 }, { "epoch": 0.6042394217493706, "grad_norm": 4.357956562088014, "learning_rate": 1.7876132138075292e-06, "loss": 0.6246, "step": 7440 }, { "epoch": 0.6043206367254121, "grad_norm": 5.116492859940842, "learning_rate": 1.786982880657444e-06, "loss": 0.3546, "step": 7441 }, { "epoch": 0.6044018517014538, "grad_norm": 4.281674880545698, "learning_rate": 1.7863525968443705e-06, "loss": 0.3998, "step": 7442 }, { "epoch": 0.6044830666774953, "grad_norm": 4.231714096569343, "learning_rate": 1.785722362411919e-06, "loss": 0.434, "step": 7443 }, { "epoch": 0.6045642816535369, "grad_norm": 4.3628558475147265, "learning_rate": 1.7850921774037012e-06, "loss": 0.4972, "step": 7444 }, { "epoch": 0.6046454966295784, "grad_norm": 3.8871791003489866, "learning_rate": 1.7844620418633202e-06, "loss": 0.5716, "step": 7445 }, { "epoch": 0.6047267116056201, "grad_norm": 5.078543576155098, "learning_rate": 1.7838319558343786e-06, "loss": 0.5441, "step": 7446 }, { "epoch": 0.6048079265816616, "grad_norm": 11.062361952259796, "learning_rate": 1.7832019193604767e-06, "loss": 0.5189, "step": 7447 }, { "epoch": 0.6048891415577032, "grad_norm": 5.15437180400868, "learning_rate": 1.7825719324852075e-06, "loss": 0.5409, "step": 7448 }, { "epoch": 0.6049703565337449, "grad_norm": 6.299357609539531, "learning_rate": 1.7819419952521645e-06, "loss": 0.4818, "step": 7449 }, { "epoch": 0.6050515715097864, "grad_norm": 3.001544405666737, "learning_rate": 1.7813121077049336e-06, "loss": 0.5361, "step": 7450 }, { "epoch": 0.605132786485828, "grad_norm": 4.256801862178172, "learning_rate": 1.7806822698871022e-06, "loss": 0.6223, "step": 7451 }, { "epoch": 0.6052140014618695, "grad_norm": 3.963672219119554, "learning_rate": 1.780052481842251e-06, "loss": 0.4201, "step": 7452 }, { "epoch": 0.6052952164379112, "grad_norm": 3.995116455512054, "learning_rate": 1.7794227436139569e-06, "loss": 0.5345, "step": 7453 }, { "epoch": 0.6053764314139527, "grad_norm": 6.799955515987135, "learning_rate": 1.778793055245796e-06, "loss": 0.5772, "step": 7454 }, { "epoch": 0.6054576463899943, "grad_norm": 5.467231768671727, "learning_rate": 1.7781634167813388e-06, "loss": 0.6087, "step": 7455 }, { "epoch": 0.6055388613660359, "grad_norm": 5.970835497003089, "learning_rate": 1.7775338282641525e-06, "loss": 0.4178, "step": 7456 }, { "epoch": 0.6056200763420775, "grad_norm": 4.908729638196646, "learning_rate": 1.776904289737802e-06, "loss": 0.5252, "step": 7457 }, { "epoch": 0.605701291318119, "grad_norm": 8.614648749798091, "learning_rate": 1.7762748012458481e-06, "loss": 0.527, "step": 7458 }, { "epoch": 0.6057825062941606, "grad_norm": 5.933581667639831, "learning_rate": 1.7756453628318465e-06, "loss": 0.4565, "step": 7459 }, { "epoch": 0.6058637212702023, "grad_norm": 3.8154357222510034, "learning_rate": 1.7750159745393536e-06, "loss": 0.4551, "step": 7460 }, { "epoch": 0.6059449362462438, "grad_norm": 7.8464108786714055, "learning_rate": 1.7743866364119175e-06, "loss": 0.473, "step": 7461 }, { "epoch": 0.6060261512222854, "grad_norm": 4.514141406557562, "learning_rate": 1.7737573484930853e-06, "loss": 0.6192, "step": 7462 }, { "epoch": 0.606107366198327, "grad_norm": 4.9804191071501585, "learning_rate": 1.7731281108264025e-06, "loss": 0.5783, "step": 7463 }, { "epoch": 0.6061885811743686, "grad_norm": 3.8086232760432837, "learning_rate": 1.7724989234554068e-06, "loss": 0.4875, "step": 7464 }, { "epoch": 0.6062697961504101, "grad_norm": 4.453392068019001, "learning_rate": 1.7718697864236344e-06, "loss": 0.4404, "step": 7465 }, { "epoch": 0.6063510111264517, "grad_norm": 6.631970307210538, "learning_rate": 1.771240699774621e-06, "loss": 0.3888, "step": 7466 }, { "epoch": 0.6064322261024933, "grad_norm": 3.3473485905433815, "learning_rate": 1.7706116635518933e-06, "loss": 0.5312, "step": 7467 }, { "epoch": 0.6065134410785349, "grad_norm": 8.74164772322533, "learning_rate": 1.7699826777989788e-06, "loss": 0.4054, "step": 7468 }, { "epoch": 0.6065946560545764, "grad_norm": 4.144159190764816, "learning_rate": 1.7693537425593984e-06, "loss": 0.5257, "step": 7469 }, { "epoch": 0.606675871030618, "grad_norm": 4.098313315005926, "learning_rate": 1.7687248578766727e-06, "loss": 0.3604, "step": 7470 }, { "epoch": 0.6067570860066597, "grad_norm": 6.610586143194924, "learning_rate": 1.7680960237943174e-06, "loss": 0.6578, "step": 7471 }, { "epoch": 0.6068383009827012, "grad_norm": 4.968803137717695, "learning_rate": 1.7674672403558421e-06, "loss": 0.4375, "step": 7472 }, { "epoch": 0.6069195159587428, "grad_norm": 7.6507424354276, "learning_rate": 1.7668385076047584e-06, "loss": 0.4943, "step": 7473 }, { "epoch": 0.6070007309347843, "grad_norm": 5.452374844312675, "learning_rate": 1.7662098255845689e-06, "loss": 0.6319, "step": 7474 }, { "epoch": 0.607081945910826, "grad_norm": 10.145687214568056, "learning_rate": 1.7655811943387758e-06, "loss": 0.4869, "step": 7475 }, { "epoch": 0.6071631608868675, "grad_norm": 4.614412678031245, "learning_rate": 1.764952613910878e-06, "loss": 0.5788, "step": 7476 }, { "epoch": 0.6072443758629091, "grad_norm": 7.111983931763767, "learning_rate": 1.7643240843443686e-06, "loss": 0.505, "step": 7477 }, { "epoch": 0.6073255908389507, "grad_norm": 8.127180595467415, "learning_rate": 1.7636956056827384e-06, "loss": 0.6297, "step": 7478 }, { "epoch": 0.6074068058149923, "grad_norm": 6.123717134281918, "learning_rate": 1.7630671779694768e-06, "loss": 0.5801, "step": 7479 }, { "epoch": 0.6074880207910338, "grad_norm": 5.652896769901682, "learning_rate": 1.7624388012480656e-06, "loss": 0.4418, "step": 7480 }, { "epoch": 0.6075692357670754, "grad_norm": 5.923671663679729, "learning_rate": 1.7618104755619852e-06, "loss": 0.4421, "step": 7481 }, { "epoch": 0.6076504507431171, "grad_norm": 5.6081581939098495, "learning_rate": 1.7611822009547143e-06, "loss": 0.7917, "step": 7482 }, { "epoch": 0.6077316657191586, "grad_norm": 5.284990419872968, "learning_rate": 1.7605539774697244e-06, "loss": 0.567, "step": 7483 }, { "epoch": 0.6078128806952002, "grad_norm": 7.660780757932287, "learning_rate": 1.7599258051504856e-06, "loss": 0.5853, "step": 7484 }, { "epoch": 0.6078940956712418, "grad_norm": 4.687240171351279, "learning_rate": 1.7592976840404652e-06, "loss": 0.4899, "step": 7485 }, { "epoch": 0.6079753106472834, "grad_norm": 4.470994352643192, "learning_rate": 1.7586696141831242e-06, "loss": 0.4768, "step": 7486 }, { "epoch": 0.6080565256233249, "grad_norm": 5.24599057514584, "learning_rate": 1.7580415956219229e-06, "loss": 0.3794, "step": 7487 }, { "epoch": 0.6081377405993665, "grad_norm": 4.900299331302958, "learning_rate": 1.7574136284003158e-06, "loss": 0.5302, "step": 7488 }, { "epoch": 0.6082189555754081, "grad_norm": 4.555248134756306, "learning_rate": 1.756785712561756e-06, "loss": 0.4212, "step": 7489 }, { "epoch": 0.6083001705514497, "grad_norm": 7.786367658307978, "learning_rate": 1.7561578481496917e-06, "loss": 0.4278, "step": 7490 }, { "epoch": 0.6083813855274912, "grad_norm": 4.5786486381903515, "learning_rate": 1.7555300352075662e-06, "loss": 0.6145, "step": 7491 }, { "epoch": 0.6084626005035328, "grad_norm": 7.245725994286198, "learning_rate": 1.7549022737788241e-06, "loss": 0.4126, "step": 7492 }, { "epoch": 0.6085438154795745, "grad_norm": 3.67634983056109, "learning_rate": 1.7542745639069004e-06, "loss": 0.5058, "step": 7493 }, { "epoch": 0.608625030455616, "grad_norm": 9.370780365887464, "learning_rate": 1.7536469056352296e-06, "loss": 0.3767, "step": 7494 }, { "epoch": 0.6087062454316576, "grad_norm": 2.966365312799223, "learning_rate": 1.7530192990072436e-06, "loss": 0.5582, "step": 7495 }, { "epoch": 0.6087874604076992, "grad_norm": 5.219714565139399, "learning_rate": 1.7523917440663687e-06, "loss": 0.409, "step": 7496 }, { "epoch": 0.6088686753837408, "grad_norm": 7.275963784070534, "learning_rate": 1.7517642408560278e-06, "loss": 0.516, "step": 7497 }, { "epoch": 0.6089498903597823, "grad_norm": 5.53725602534592, "learning_rate": 1.7511367894196426e-06, "loss": 0.5271, "step": 7498 }, { "epoch": 0.6090311053358239, "grad_norm": 6.420027579961015, "learning_rate": 1.7505093898006275e-06, "loss": 0.4234, "step": 7499 }, { "epoch": 0.6091123203118655, "grad_norm": 5.134669425660005, "learning_rate": 1.749882042042396e-06, "loss": 0.5215, "step": 7500 }, { "epoch": 0.6091935352879071, "grad_norm": 3.969108933592243, "learning_rate": 1.749254746188358e-06, "loss": 0.4116, "step": 7501 }, { "epoch": 0.6092747502639486, "grad_norm": 9.697561858550591, "learning_rate": 1.7486275022819183e-06, "loss": 0.4249, "step": 7502 }, { "epoch": 0.6093559652399902, "grad_norm": 6.498690544614313, "learning_rate": 1.748000310366478e-06, "loss": 0.5176, "step": 7503 }, { "epoch": 0.6094371802160319, "grad_norm": 3.9324281115521202, "learning_rate": 1.7473731704854363e-06, "loss": 0.566, "step": 7504 }, { "epoch": 0.6095183951920734, "grad_norm": 8.158895435967443, "learning_rate": 1.7467460826821885e-06, "loss": 0.4352, "step": 7505 }, { "epoch": 0.609599610168115, "grad_norm": 6.125377617707914, "learning_rate": 1.7461190470001252e-06, "loss": 0.4992, "step": 7506 }, { "epoch": 0.6096808251441566, "grad_norm": 4.208156614795222, "learning_rate": 1.7454920634826334e-06, "loss": 0.4183, "step": 7507 }, { "epoch": 0.6097620401201982, "grad_norm": 11.55413920993596, "learning_rate": 1.7448651321730985e-06, "loss": 0.4898, "step": 7508 }, { "epoch": 0.6098432550962397, "grad_norm": 4.906701536886049, "learning_rate": 1.7442382531148993e-06, "loss": 0.4855, "step": 7509 }, { "epoch": 0.6099244700722813, "grad_norm": 4.012360366132999, "learning_rate": 1.743611426351413e-06, "loss": 0.4437, "step": 7510 }, { "epoch": 0.6100056850483229, "grad_norm": 4.507122998130638, "learning_rate": 1.7429846519260139e-06, "loss": 0.4269, "step": 7511 }, { "epoch": 0.6100869000243645, "grad_norm": 7.688714435664871, "learning_rate": 1.7423579298820698e-06, "loss": 0.4297, "step": 7512 }, { "epoch": 0.610168115000406, "grad_norm": 4.5702359534664145, "learning_rate": 1.7417312602629466e-06, "loss": 0.4187, "step": 7513 }, { "epoch": 0.6102493299764477, "grad_norm": 7.677278210562309, "learning_rate": 1.7411046431120082e-06, "loss": 0.4983, "step": 7514 }, { "epoch": 0.6103305449524893, "grad_norm": 6.1185795533491545, "learning_rate": 1.7404780784726113e-06, "loss": 0.6269, "step": 7515 }, { "epoch": 0.6104117599285308, "grad_norm": 14.544008771708137, "learning_rate": 1.7398515663881117e-06, "loss": 0.595, "step": 7516 }, { "epoch": 0.6104929749045724, "grad_norm": 10.989546459503224, "learning_rate": 1.7392251069018612e-06, "loss": 0.6077, "step": 7517 }, { "epoch": 0.610574189880614, "grad_norm": 6.706531652553325, "learning_rate": 1.7385987000572072e-06, "loss": 0.558, "step": 7518 }, { "epoch": 0.6106554048566556, "grad_norm": 3.482810028814553, "learning_rate": 1.7379723458974923e-06, "loss": 0.4935, "step": 7519 }, { "epoch": 0.6107366198326971, "grad_norm": 4.216977832082727, "learning_rate": 1.737346044466059e-06, "loss": 0.5477, "step": 7520 }, { "epoch": 0.6108178348087387, "grad_norm": 5.407693817919376, "learning_rate": 1.7367197958062432e-06, "loss": 0.561, "step": 7521 }, { "epoch": 0.6108990497847803, "grad_norm": 7.6500405315944215, "learning_rate": 1.7360935999613777e-06, "loss": 0.7106, "step": 7522 }, { "epoch": 0.6109802647608219, "grad_norm": 8.727840725862155, "learning_rate": 1.7354674569747914e-06, "loss": 0.4518, "step": 7523 }, { "epoch": 0.6110614797368634, "grad_norm": 6.081313481209265, "learning_rate": 1.7348413668898124e-06, "loss": 0.4906, "step": 7524 }, { "epoch": 0.611142694712905, "grad_norm": 4.575030134738263, "learning_rate": 1.73421532974976e-06, "loss": 0.5392, "step": 7525 }, { "epoch": 0.6112239096889467, "grad_norm": 3.827031756918253, "learning_rate": 1.7335893455979538e-06, "loss": 0.5111, "step": 7526 }, { "epoch": 0.6113051246649882, "grad_norm": 4.399973290074878, "learning_rate": 1.7329634144777097e-06, "loss": 0.7858, "step": 7527 }, { "epoch": 0.6113863396410298, "grad_norm": 6.275043877297012, "learning_rate": 1.7323375364323374e-06, "loss": 0.4, "step": 7528 }, { "epoch": 0.6114675546170714, "grad_norm": 3.7917580182798964, "learning_rate": 1.731711711505144e-06, "loss": 0.5784, "step": 7529 }, { "epoch": 0.611548769593113, "grad_norm": 5.449393065833403, "learning_rate": 1.7310859397394356e-06, "loss": 0.4097, "step": 7530 }, { "epoch": 0.6116299845691545, "grad_norm": 9.087763572815122, "learning_rate": 1.7304602211785105e-06, "loss": 0.4269, "step": 7531 }, { "epoch": 0.6117111995451961, "grad_norm": 3.9239408883458196, "learning_rate": 1.7298345558656643e-06, "loss": 0.4189, "step": 7532 }, { "epoch": 0.6117924145212377, "grad_norm": 3.9512250145217327, "learning_rate": 1.7292089438441912e-06, "loss": 0.5171, "step": 7533 }, { "epoch": 0.6118736294972793, "grad_norm": 5.225768936804722, "learning_rate": 1.7285833851573802e-06, "loss": 0.5148, "step": 7534 }, { "epoch": 0.6119548444733209, "grad_norm": 8.877862953830741, "learning_rate": 1.727957879848516e-06, "loss": 0.6574, "step": 7535 }, { "epoch": 0.6120360594493625, "grad_norm": 3.7513803990699457, "learning_rate": 1.72733242796088e-06, "loss": 0.4649, "step": 7536 }, { "epoch": 0.6121172744254041, "grad_norm": 6.526118406530345, "learning_rate": 1.7267070295377519e-06, "loss": 0.5629, "step": 7537 }, { "epoch": 0.6121984894014456, "grad_norm": 8.83748922556791, "learning_rate": 1.726081684622404e-06, "loss": 0.483, "step": 7538 }, { "epoch": 0.6122797043774872, "grad_norm": 4.032292821062164, "learning_rate": 1.7254563932581072e-06, "loss": 0.6093, "step": 7539 }, { "epoch": 0.6123609193535288, "grad_norm": 19.261864290456085, "learning_rate": 1.7248311554881297e-06, "loss": 0.3226, "step": 7540 }, { "epoch": 0.6124421343295704, "grad_norm": 6.178514406690231, "learning_rate": 1.7242059713557336e-06, "loss": 0.4116, "step": 7541 }, { "epoch": 0.6125233493056119, "grad_norm": 6.272920000009786, "learning_rate": 1.7235808409041775e-06, "loss": 0.514, "step": 7542 }, { "epoch": 0.6126045642816536, "grad_norm": 3.700775958812784, "learning_rate": 1.7229557641767191e-06, "loss": 0.3956, "step": 7543 }, { "epoch": 0.6126857792576951, "grad_norm": 5.184180318926046, "learning_rate": 1.7223307412166097e-06, "loss": 0.5573, "step": 7544 }, { "epoch": 0.6127669942337367, "grad_norm": 7.17746460216716, "learning_rate": 1.7217057720670955e-06, "loss": 0.4188, "step": 7545 }, { "epoch": 0.6128482092097783, "grad_norm": 5.19044747938764, "learning_rate": 1.7210808567714244e-06, "loss": 0.6008, "step": 7546 }, { "epoch": 0.6129294241858199, "grad_norm": 6.637207987486905, "learning_rate": 1.7204559953728355e-06, "loss": 0.4163, "step": 7547 }, { "epoch": 0.6130106391618615, "grad_norm": 4.123118258455451, "learning_rate": 1.7198311879145652e-06, "loss": 0.4583, "step": 7548 }, { "epoch": 0.613091854137903, "grad_norm": 4.634690421136184, "learning_rate": 1.719206434439848e-06, "loss": 0.6119, "step": 7549 }, { "epoch": 0.6131730691139446, "grad_norm": 5.330581154155666, "learning_rate": 1.7185817349919137e-06, "loss": 0.679, "step": 7550 }, { "epoch": 0.6132542840899862, "grad_norm": 4.733672202157837, "learning_rate": 1.7179570896139869e-06, "loss": 0.5463, "step": 7551 }, { "epoch": 0.6133354990660278, "grad_norm": 3.417972262827162, "learning_rate": 1.7173324983492912e-06, "loss": 0.5625, "step": 7552 }, { "epoch": 0.6134167140420693, "grad_norm": 5.5606216442015155, "learning_rate": 1.7167079612410448e-06, "loss": 0.541, "step": 7553 }, { "epoch": 0.613497929018111, "grad_norm": 7.366074745822286, "learning_rate": 1.7160834783324608e-06, "loss": 0.5007, "step": 7554 }, { "epoch": 0.6135791439941525, "grad_norm": 4.4511321208599925, "learning_rate": 1.7154590496667523e-06, "loss": 0.4106, "step": 7555 }, { "epoch": 0.6136603589701941, "grad_norm": 4.380414416940069, "learning_rate": 1.7148346752871253e-06, "loss": 0.5297, "step": 7556 }, { "epoch": 0.6137415739462357, "grad_norm": 5.789276347920089, "learning_rate": 1.7142103552367834e-06, "loss": 0.4956, "step": 7557 }, { "epoch": 0.6138227889222773, "grad_norm": 5.435165522246494, "learning_rate": 1.713586089558925e-06, "loss": 0.5016, "step": 7558 }, { "epoch": 0.6139040038983189, "grad_norm": 3.8853370254417796, "learning_rate": 1.7129618782967488e-06, "loss": 0.4796, "step": 7559 }, { "epoch": 0.6139852188743604, "grad_norm": 6.1117435269535765, "learning_rate": 1.712337721493445e-06, "loss": 0.3512, "step": 7560 }, { "epoch": 0.614066433850402, "grad_norm": 5.140667780868619, "learning_rate": 1.7117136191922013e-06, "loss": 0.383, "step": 7561 }, { "epoch": 0.6141476488264436, "grad_norm": 5.239177233517926, "learning_rate": 1.7110895714362035e-06, "loss": 0.4811, "step": 7562 }, { "epoch": 0.6142288638024852, "grad_norm": 5.786445992631991, "learning_rate": 1.710465578268633e-06, "loss": 0.5776, "step": 7563 }, { "epoch": 0.6143100787785267, "grad_norm": 4.264725420672594, "learning_rate": 1.7098416397326647e-06, "loss": 0.548, "step": 7564 }, { "epoch": 0.6143912937545684, "grad_norm": 5.098547500018314, "learning_rate": 1.7092177558714735e-06, "loss": 0.4055, "step": 7565 }, { "epoch": 0.6144725087306099, "grad_norm": 3.972712496253238, "learning_rate": 1.7085939267282292e-06, "loss": 0.5024, "step": 7566 }, { "epoch": 0.6145537237066515, "grad_norm": 6.4566948580335835, "learning_rate": 1.7079701523460957e-06, "loss": 0.3837, "step": 7567 }, { "epoch": 0.6146349386826931, "grad_norm": 4.440870142659438, "learning_rate": 1.707346432768236e-06, "loss": 0.5221, "step": 7568 }, { "epoch": 0.6147161536587347, "grad_norm": 4.191872882758366, "learning_rate": 1.706722768037809e-06, "loss": 0.5073, "step": 7569 }, { "epoch": 0.6147973686347763, "grad_norm": 5.091945928993117, "learning_rate": 1.7060991581979668e-06, "loss": 0.4276, "step": 7570 }, { "epoch": 0.6148785836108178, "grad_norm": 4.128925636949817, "learning_rate": 1.7054756032918619e-06, "loss": 0.5327, "step": 7571 }, { "epoch": 0.6149597985868595, "grad_norm": 4.6497677030599265, "learning_rate": 1.7048521033626406e-06, "loss": 0.4233, "step": 7572 }, { "epoch": 0.615041013562901, "grad_norm": 5.259086945381799, "learning_rate": 1.7042286584534446e-06, "loss": 0.5063, "step": 7573 }, { "epoch": 0.6151222285389426, "grad_norm": 9.344094684965398, "learning_rate": 1.703605268607415e-06, "loss": 0.4642, "step": 7574 }, { "epoch": 0.6152034435149841, "grad_norm": 6.059064695851907, "learning_rate": 1.7029819338676851e-06, "loss": 0.5159, "step": 7575 }, { "epoch": 0.6152846584910258, "grad_norm": 7.950438579454252, "learning_rate": 1.702358654277388e-06, "loss": 0.4775, "step": 7576 }, { "epoch": 0.6153658734670673, "grad_norm": 5.6577857752289695, "learning_rate": 1.7017354298796495e-06, "loss": 0.4521, "step": 7577 }, { "epoch": 0.6154470884431089, "grad_norm": 6.694027039647233, "learning_rate": 1.701112260717595e-06, "loss": 0.5675, "step": 7578 }, { "epoch": 0.6155283034191505, "grad_norm": 7.207274495499145, "learning_rate": 1.7004891468343445e-06, "loss": 0.4767, "step": 7579 }, { "epoch": 0.6156095183951921, "grad_norm": 10.710533616510515, "learning_rate": 1.6998660882730127e-06, "loss": 0.4575, "step": 7580 }, { "epoch": 0.6156907333712337, "grad_norm": 9.051943184654498, "learning_rate": 1.6992430850767133e-06, "loss": 0.4646, "step": 7581 }, { "epoch": 0.6157719483472752, "grad_norm": 6.318629144815555, "learning_rate": 1.6986201372885551e-06, "loss": 0.5112, "step": 7582 }, { "epoch": 0.6158531633233169, "grad_norm": 9.374156721440105, "learning_rate": 1.6979972449516414e-06, "loss": 0.5938, "step": 7583 }, { "epoch": 0.6159343782993584, "grad_norm": 5.954834061127918, "learning_rate": 1.6973744081090737e-06, "loss": 0.5195, "step": 7584 }, { "epoch": 0.6160155932754, "grad_norm": 4.686975998201248, "learning_rate": 1.6967516268039502e-06, "loss": 0.5192, "step": 7585 }, { "epoch": 0.6160968082514415, "grad_norm": 5.255217548237502, "learning_rate": 1.696128901079362e-06, "loss": 0.5776, "step": 7586 }, { "epoch": 0.6161780232274832, "grad_norm": 3.9588121129361915, "learning_rate": 1.6955062309783993e-06, "loss": 0.4449, "step": 7587 }, { "epoch": 0.6162592382035247, "grad_norm": 2.910107855866022, "learning_rate": 1.6948836165441487e-06, "loss": 0.6015, "step": 7588 }, { "epoch": 0.6163404531795663, "grad_norm": 4.0433138018943735, "learning_rate": 1.6942610578196898e-06, "loss": 0.4804, "step": 7589 }, { "epoch": 0.616421668155608, "grad_norm": 12.510674806813109, "learning_rate": 1.6936385548481022e-06, "loss": 0.3454, "step": 7590 }, { "epoch": 0.6165028831316495, "grad_norm": 4.596018618309467, "learning_rate": 1.6930161076724586e-06, "loss": 0.4479, "step": 7591 }, { "epoch": 0.6165840981076911, "grad_norm": 4.171966658619839, "learning_rate": 1.69239371633583e-06, "loss": 0.5211, "step": 7592 }, { "epoch": 0.6166653130837326, "grad_norm": 8.771836978846274, "learning_rate": 1.6917713808812808e-06, "loss": 0.4352, "step": 7593 }, { "epoch": 0.6167465280597743, "grad_norm": 4.139812780948147, "learning_rate": 1.6911491013518752e-06, "loss": 0.5914, "step": 7594 }, { "epoch": 0.6168277430358158, "grad_norm": 5.236782993816778, "learning_rate": 1.6905268777906713e-06, "loss": 0.5286, "step": 7595 }, { "epoch": 0.6169089580118574, "grad_norm": 5.408332820716584, "learning_rate": 1.6899047102407228e-06, "loss": 0.3983, "step": 7596 }, { "epoch": 0.6169901729878989, "grad_norm": 3.729915257776433, "learning_rate": 1.6892825987450811e-06, "loss": 0.5177, "step": 7597 }, { "epoch": 0.6170713879639406, "grad_norm": 4.845431694776476, "learning_rate": 1.6886605433467937e-06, "loss": 0.5994, "step": 7598 }, { "epoch": 0.6171526029399821, "grad_norm": 4.28204183668613, "learning_rate": 1.6880385440889016e-06, "loss": 0.4965, "step": 7599 }, { "epoch": 0.6172338179160237, "grad_norm": 3.81321254807878, "learning_rate": 1.6874166010144454e-06, "loss": 0.5212, "step": 7600 }, { "epoch": 0.6173150328920654, "grad_norm": 4.863898183735516, "learning_rate": 1.6867947141664606e-06, "loss": 0.6291, "step": 7601 }, { "epoch": 0.6173962478681069, "grad_norm": 3.908697856611701, "learning_rate": 1.6861728835879764e-06, "loss": 0.5934, "step": 7602 }, { "epoch": 0.6174774628441485, "grad_norm": 4.930386518701432, "learning_rate": 1.685551109322023e-06, "loss": 0.4434, "step": 7603 }, { "epoch": 0.61755867782019, "grad_norm": 3.7606290186734586, "learning_rate": 1.6849293914116215e-06, "loss": 0.5494, "step": 7604 }, { "epoch": 0.6176398927962317, "grad_norm": 4.342048831757059, "learning_rate": 1.6843077298997924e-06, "loss": 0.5169, "step": 7605 }, { "epoch": 0.6177211077722732, "grad_norm": 5.829444571059767, "learning_rate": 1.6836861248295522e-06, "loss": 0.5501, "step": 7606 }, { "epoch": 0.6178023227483148, "grad_norm": 4.7706445198897915, "learning_rate": 1.6830645762439113e-06, "loss": 0.4783, "step": 7607 }, { "epoch": 0.6178835377243563, "grad_norm": 8.032579213522444, "learning_rate": 1.6824430841858773e-06, "loss": 0.4552, "step": 7608 }, { "epoch": 0.617964752700398, "grad_norm": 6.670715215785492, "learning_rate": 1.6818216486984565e-06, "loss": 0.4983, "step": 7609 }, { "epoch": 0.6180459676764395, "grad_norm": 6.128265762260737, "learning_rate": 1.6812002698246468e-06, "loss": 0.4088, "step": 7610 }, { "epoch": 0.6181271826524811, "grad_norm": 5.814013730186966, "learning_rate": 1.6805789476074457e-06, "loss": 0.5589, "step": 7611 }, { "epoch": 0.6182083976285228, "grad_norm": 6.00192645217164, "learning_rate": 1.6799576820898433e-06, "loss": 0.4175, "step": 7612 }, { "epoch": 0.6182896126045643, "grad_norm": 4.073976353754178, "learning_rate": 1.6793364733148299e-06, "loss": 0.3606, "step": 7613 }, { "epoch": 0.6183708275806059, "grad_norm": 4.36296820738616, "learning_rate": 1.67871532132539e-06, "loss": 0.6092, "step": 7614 }, { "epoch": 0.6184520425566474, "grad_norm": 6.684933223291455, "learning_rate": 1.6780942261645022e-06, "loss": 0.4535, "step": 7615 }, { "epoch": 0.6185332575326891, "grad_norm": 4.124907550937014, "learning_rate": 1.6774731878751443e-06, "loss": 0.5198, "step": 7616 }, { "epoch": 0.6186144725087306, "grad_norm": 2.736304544029034, "learning_rate": 1.6768522065002895e-06, "loss": 0.6509, "step": 7617 }, { "epoch": 0.6186956874847722, "grad_norm": 3.054312384173202, "learning_rate": 1.676231282082904e-06, "loss": 0.5161, "step": 7618 }, { "epoch": 0.6187769024608137, "grad_norm": 8.402180136218686, "learning_rate": 1.6756104146659557e-06, "loss": 0.394, "step": 7619 }, { "epoch": 0.6188581174368554, "grad_norm": 4.6791235959713315, "learning_rate": 1.674989604292403e-06, "loss": 0.6243, "step": 7620 }, { "epoch": 0.6189393324128969, "grad_norm": 9.604477306580446, "learning_rate": 1.6743688510052025e-06, "loss": 0.4754, "step": 7621 }, { "epoch": 0.6190205473889385, "grad_norm": 9.89281790814999, "learning_rate": 1.6737481548473094e-06, "loss": 0.4893, "step": 7622 }, { "epoch": 0.6191017623649802, "grad_norm": 3.3991457520981383, "learning_rate": 1.6731275158616706e-06, "loss": 0.457, "step": 7623 }, { "epoch": 0.6191829773410217, "grad_norm": 2.80593016197633, "learning_rate": 1.6725069340912306e-06, "loss": 0.6278, "step": 7624 }, { "epoch": 0.6192641923170633, "grad_norm": 5.418301042196, "learning_rate": 1.6718864095789328e-06, "loss": 0.4187, "step": 7625 }, { "epoch": 0.6193454072931048, "grad_norm": 13.06513067077772, "learning_rate": 1.671265942367712e-06, "loss": 0.4663, "step": 7626 }, { "epoch": 0.6194266222691465, "grad_norm": 4.625063932915541, "learning_rate": 1.6706455325005022e-06, "loss": 0.4295, "step": 7627 }, { "epoch": 0.619507837245188, "grad_norm": 3.5014064055098175, "learning_rate": 1.6700251800202316e-06, "loss": 0.4579, "step": 7628 }, { "epoch": 0.6195890522212296, "grad_norm": 6.56770323902514, "learning_rate": 1.6694048849698262e-06, "loss": 0.6269, "step": 7629 }, { "epoch": 0.6196702671972711, "grad_norm": 4.07016650700423, "learning_rate": 1.668784647392208e-06, "loss": 0.6907, "step": 7630 }, { "epoch": 0.6197514821733128, "grad_norm": 5.113480533421879, "learning_rate": 1.6681644673302915e-06, "loss": 0.5111, "step": 7631 }, { "epoch": 0.6198326971493543, "grad_norm": 4.254910794569682, "learning_rate": 1.6675443448269924e-06, "loss": 0.5239, "step": 7632 }, { "epoch": 0.6199139121253959, "grad_norm": 5.408424083847769, "learning_rate": 1.666924279925219e-06, "loss": 0.5857, "step": 7633 }, { "epoch": 0.6199951271014376, "grad_norm": 3.2011054128203766, "learning_rate": 1.6663042726678752e-06, "loss": 0.5784, "step": 7634 }, { "epoch": 0.6200763420774791, "grad_norm": 5.235602080991919, "learning_rate": 1.6656843230978647e-06, "loss": 0.3975, "step": 7635 }, { "epoch": 0.6201575570535207, "grad_norm": 5.165250298024449, "learning_rate": 1.6650644312580833e-06, "loss": 0.5924, "step": 7636 }, { "epoch": 0.6202387720295622, "grad_norm": 4.340672727279216, "learning_rate": 1.6644445971914235e-06, "loss": 0.5221, "step": 7637 }, { "epoch": 0.6203199870056039, "grad_norm": 6.841912089854011, "learning_rate": 1.6638248209407767e-06, "loss": 0.443, "step": 7638 }, { "epoch": 0.6204012019816454, "grad_norm": 4.081953440344846, "learning_rate": 1.6632051025490265e-06, "loss": 0.3495, "step": 7639 }, { "epoch": 0.620482416957687, "grad_norm": 3.7772718364769142, "learning_rate": 1.6625854420590538e-06, "loss": 0.5034, "step": 7640 }, { "epoch": 0.6205636319337285, "grad_norm": 5.210380997814291, "learning_rate": 1.6619658395137375e-06, "loss": 0.6397, "step": 7641 }, { "epoch": 0.6206448469097702, "grad_norm": 6.577887175557655, "learning_rate": 1.6613462949559494e-06, "loss": 0.4802, "step": 7642 }, { "epoch": 0.6207260618858117, "grad_norm": 3.7841172613723657, "learning_rate": 1.6607268084285587e-06, "loss": 0.7127, "step": 7643 }, { "epoch": 0.6208072768618533, "grad_norm": 8.73460364920805, "learning_rate": 1.6601073799744322e-06, "loss": 0.4822, "step": 7644 }, { "epoch": 0.620888491837895, "grad_norm": 5.904973312911216, "learning_rate": 1.6594880096364302e-06, "loss": 0.5566, "step": 7645 }, { "epoch": 0.6209697068139365, "grad_norm": 4.941626996148981, "learning_rate": 1.6588686974574086e-06, "loss": 0.3796, "step": 7646 }, { "epoch": 0.6210509217899781, "grad_norm": 5.8291085465722325, "learning_rate": 1.658249443480221e-06, "loss": 0.4666, "step": 7647 }, { "epoch": 0.6211321367660196, "grad_norm": 5.909142689212799, "learning_rate": 1.6576302477477185e-06, "loss": 0.4453, "step": 7648 }, { "epoch": 0.6212133517420613, "grad_norm": 5.951228309961236, "learning_rate": 1.6570111103027436e-06, "loss": 0.3766, "step": 7649 }, { "epoch": 0.6212945667181028, "grad_norm": 6.681405040737811, "learning_rate": 1.6563920311881382e-06, "loss": 0.3992, "step": 7650 }, { "epoch": 0.6213757816941444, "grad_norm": 5.5260582323770135, "learning_rate": 1.6557730104467407e-06, "loss": 0.4259, "step": 7651 }, { "epoch": 0.621456996670186, "grad_norm": 5.076655366887786, "learning_rate": 1.6551540481213817e-06, "loss": 0.5468, "step": 7652 }, { "epoch": 0.6215382116462276, "grad_norm": 5.8919929585227155, "learning_rate": 1.6545351442548915e-06, "loss": 0.4871, "step": 7653 }, { "epoch": 0.6216194266222691, "grad_norm": 4.042380796567633, "learning_rate": 1.6539162988900952e-06, "loss": 0.4153, "step": 7654 }, { "epoch": 0.6217006415983107, "grad_norm": 6.784897682604056, "learning_rate": 1.6532975120698133e-06, "loss": 0.4534, "step": 7655 }, { "epoch": 0.6217818565743524, "grad_norm": 9.65234999480218, "learning_rate": 1.6526787838368616e-06, "loss": 0.4155, "step": 7656 }, { "epoch": 0.6218630715503939, "grad_norm": 17.08200480716379, "learning_rate": 1.6520601142340549e-06, "loss": 0.5921, "step": 7657 }, { "epoch": 0.6219442865264355, "grad_norm": 5.175377372780213, "learning_rate": 1.6514415033041997e-06, "loss": 0.4526, "step": 7658 }, { "epoch": 0.622025501502477, "grad_norm": 3.458058541785411, "learning_rate": 1.6508229510901013e-06, "loss": 0.4555, "step": 7659 }, { "epoch": 0.6221067164785187, "grad_norm": 10.604824315531577, "learning_rate": 1.6502044576345614e-06, "loss": 0.452, "step": 7660 }, { "epoch": 0.6221879314545602, "grad_norm": 5.411425272158323, "learning_rate": 1.6495860229803756e-06, "loss": 0.6222, "step": 7661 }, { "epoch": 0.6222691464306018, "grad_norm": 5.1740005505908995, "learning_rate": 1.6489676471703352e-06, "loss": 0.4145, "step": 7662 }, { "epoch": 0.6223503614066433, "grad_norm": 8.085499523946917, "learning_rate": 1.6483493302472302e-06, "loss": 0.513, "step": 7663 }, { "epoch": 0.622431576382685, "grad_norm": 10.292791376029198, "learning_rate": 1.6477310722538447e-06, "loss": 0.6126, "step": 7664 }, { "epoch": 0.6225127913587265, "grad_norm": 6.0268201310361995, "learning_rate": 1.6471128732329579e-06, "loss": 0.4292, "step": 7665 }, { "epoch": 0.6225940063347681, "grad_norm": 5.022456342536074, "learning_rate": 1.6464947332273459e-06, "loss": 0.6152, "step": 7666 }, { "epoch": 0.6226752213108098, "grad_norm": 8.543190424818972, "learning_rate": 1.6458766522797822e-06, "loss": 0.6125, "step": 7667 }, { "epoch": 0.6227564362868513, "grad_norm": 4.1156125031275135, "learning_rate": 1.6452586304330333e-06, "loss": 0.462, "step": 7668 }, { "epoch": 0.6228376512628929, "grad_norm": 7.81665689239009, "learning_rate": 1.6446406677298632e-06, "loss": 0.4678, "step": 7669 }, { "epoch": 0.6229188662389344, "grad_norm": 6.949544264134675, "learning_rate": 1.644022764213033e-06, "loss": 0.4414, "step": 7670 }, { "epoch": 0.6230000812149761, "grad_norm": 8.196141138164219, "learning_rate": 1.6434049199252966e-06, "loss": 0.4455, "step": 7671 }, { "epoch": 0.6230812961910176, "grad_norm": 6.110724030611538, "learning_rate": 1.6427871349094058e-06, "loss": 0.3828, "step": 7672 }, { "epoch": 0.6231625111670592, "grad_norm": 4.134192350462065, "learning_rate": 1.6421694092081097e-06, "loss": 0.5077, "step": 7673 }, { "epoch": 0.6232437261431008, "grad_norm": 4.263913374724854, "learning_rate": 1.6415517428641504e-06, "loss": 0.6757, "step": 7674 }, { "epoch": 0.6233249411191424, "grad_norm": 6.728733308443024, "learning_rate": 1.640934135920266e-06, "loss": 0.485, "step": 7675 }, { "epoch": 0.6234061560951839, "grad_norm": 6.332399518498263, "learning_rate": 1.6403165884191935e-06, "loss": 0.4495, "step": 7676 }, { "epoch": 0.6234873710712255, "grad_norm": 5.132785407111167, "learning_rate": 1.6396991004036638e-06, "loss": 0.4718, "step": 7677 }, { "epoch": 0.6235685860472672, "grad_norm": 4.651566891974008, "learning_rate": 1.6390816719164022e-06, "loss": 0.5763, "step": 7678 }, { "epoch": 0.6236498010233087, "grad_norm": 4.413484601105079, "learning_rate": 1.6384643030001333e-06, "loss": 0.722, "step": 7679 }, { "epoch": 0.6237310159993503, "grad_norm": 8.71808254189496, "learning_rate": 1.6378469936975752e-06, "loss": 0.5068, "step": 7680 }, { "epoch": 0.6238122309753918, "grad_norm": 6.479840818255055, "learning_rate": 1.6372297440514417e-06, "loss": 0.4299, "step": 7681 }, { "epoch": 0.6238934459514335, "grad_norm": 3.683453972916311, "learning_rate": 1.6366125541044435e-06, "loss": 0.545, "step": 7682 }, { "epoch": 0.623974660927475, "grad_norm": 9.69410298079771, "learning_rate": 1.6359954238992882e-06, "loss": 0.3748, "step": 7683 }, { "epoch": 0.6240558759035166, "grad_norm": 4.97119687493066, "learning_rate": 1.6353783534786763e-06, "loss": 0.4015, "step": 7684 }, { "epoch": 0.6241370908795582, "grad_norm": 7.7545114750163, "learning_rate": 1.6347613428853059e-06, "loss": 0.4756, "step": 7685 }, { "epoch": 0.6242183058555998, "grad_norm": 3.5468434651588976, "learning_rate": 1.634144392161872e-06, "loss": 0.5798, "step": 7686 }, { "epoch": 0.6242995208316413, "grad_norm": 4.706606608660131, "learning_rate": 1.6335275013510638e-06, "loss": 0.5285, "step": 7687 }, { "epoch": 0.6243807358076829, "grad_norm": 6.147626373688174, "learning_rate": 1.632910670495566e-06, "loss": 0.3421, "step": 7688 }, { "epoch": 0.6244619507837246, "grad_norm": 15.303934361158852, "learning_rate": 1.6322938996380617e-06, "loss": 0.46, "step": 7689 }, { "epoch": 0.6245431657597661, "grad_norm": 14.350203161237948, "learning_rate": 1.6316771888212275e-06, "loss": 0.4245, "step": 7690 }, { "epoch": 0.6246243807358077, "grad_norm": 4.190438584869176, "learning_rate": 1.631060538087735e-06, "loss": 0.566, "step": 7691 }, { "epoch": 0.6247055957118492, "grad_norm": 3.82815155289053, "learning_rate": 1.6304439474802554e-06, "loss": 0.4528, "step": 7692 }, { "epoch": 0.6247868106878909, "grad_norm": 6.351971066286823, "learning_rate": 1.6298274170414524e-06, "loss": 0.5233, "step": 7693 }, { "epoch": 0.6248680256639324, "grad_norm": 4.36724324752686, "learning_rate": 1.6292109468139863e-06, "loss": 0.4774, "step": 7694 }, { "epoch": 0.624949240639974, "grad_norm": 6.655112537446748, "learning_rate": 1.6285945368405146e-06, "loss": 0.5333, "step": 7695 }, { "epoch": 0.6250304556160156, "grad_norm": 6.018071010184688, "learning_rate": 1.6279781871636896e-06, "loss": 0.4432, "step": 7696 }, { "epoch": 0.6251116705920572, "grad_norm": 5.306711764250664, "learning_rate": 1.6273618978261576e-06, "loss": 0.5138, "step": 7697 }, { "epoch": 0.6251928855680987, "grad_norm": 3.9883932484916325, "learning_rate": 1.6267456688705647e-06, "loss": 0.5547, "step": 7698 }, { "epoch": 0.6252741005441403, "grad_norm": 6.6197356790826625, "learning_rate": 1.6261295003395506e-06, "loss": 0.4235, "step": 7699 }, { "epoch": 0.625355315520182, "grad_norm": 3.6076841841661667, "learning_rate": 1.6255133922757493e-06, "loss": 0.4933, "step": 7700 }, { "epoch": 0.6254365304962235, "grad_norm": 4.929421788606065, "learning_rate": 1.6248973447217926e-06, "loss": 0.5483, "step": 7701 }, { "epoch": 0.6255177454722651, "grad_norm": 4.242914786049469, "learning_rate": 1.6242813577203093e-06, "loss": 0.4914, "step": 7702 }, { "epoch": 0.6255989604483067, "grad_norm": 4.388993067736639, "learning_rate": 1.6236654313139213e-06, "loss": 0.428, "step": 7703 }, { "epoch": 0.6256801754243483, "grad_norm": 6.684554645774749, "learning_rate": 1.6230495655452466e-06, "loss": 0.5199, "step": 7704 }, { "epoch": 0.6257613904003898, "grad_norm": 4.683366955828708, "learning_rate": 1.6224337604569012e-06, "loss": 0.4485, "step": 7705 }, { "epoch": 0.6258426053764314, "grad_norm": 4.786816490822921, "learning_rate": 1.6218180160914959e-06, "loss": 0.4682, "step": 7706 }, { "epoch": 0.625923820352473, "grad_norm": 5.411731210706411, "learning_rate": 1.6212023324916349e-06, "loss": 0.423, "step": 7707 }, { "epoch": 0.6260050353285146, "grad_norm": 4.143891933212883, "learning_rate": 1.620586709699922e-06, "loss": 0.7241, "step": 7708 }, { "epoch": 0.6260862503045561, "grad_norm": 5.695424075779182, "learning_rate": 1.6199711477589553e-06, "loss": 0.4617, "step": 7709 }, { "epoch": 0.6261674652805977, "grad_norm": 7.132319497435146, "learning_rate": 1.6193556467113264e-06, "loss": 0.3636, "step": 7710 }, { "epoch": 0.6262486802566394, "grad_norm": 7.5139687849034225, "learning_rate": 1.6187402065996267e-06, "loss": 0.4507, "step": 7711 }, { "epoch": 0.6263298952326809, "grad_norm": 3.7215878058767675, "learning_rate": 1.6181248274664413e-06, "loss": 0.5443, "step": 7712 }, { "epoch": 0.6264111102087225, "grad_norm": 3.861480100387473, "learning_rate": 1.617509509354349e-06, "loss": 0.4875, "step": 7713 }, { "epoch": 0.626492325184764, "grad_norm": 5.924413160412586, "learning_rate": 1.616894252305929e-06, "loss": 0.5528, "step": 7714 }, { "epoch": 0.6265735401608057, "grad_norm": 6.36026377881963, "learning_rate": 1.6162790563637538e-06, "loss": 0.4045, "step": 7715 }, { "epoch": 0.6266547551368472, "grad_norm": 4.4919168460790475, "learning_rate": 1.6156639215703896e-06, "loss": 0.4531, "step": 7716 }, { "epoch": 0.6267359701128888, "grad_norm": 3.3752966345250583, "learning_rate": 1.6150488479684022e-06, "loss": 0.4291, "step": 7717 }, { "epoch": 0.6268171850889304, "grad_norm": 4.683411418815497, "learning_rate": 1.6144338356003513e-06, "loss": 0.5847, "step": 7718 }, { "epoch": 0.626898400064972, "grad_norm": 6.26290414561336, "learning_rate": 1.6138188845087926e-06, "loss": 0.4307, "step": 7719 }, { "epoch": 0.6269796150410135, "grad_norm": 7.127187742803709, "learning_rate": 1.613203994736276e-06, "loss": 0.6484, "step": 7720 }, { "epoch": 0.6270608300170551, "grad_norm": 4.465338617099092, "learning_rate": 1.61258916632535e-06, "loss": 0.4218, "step": 7721 }, { "epoch": 0.6271420449930968, "grad_norm": 6.635578415699874, "learning_rate": 1.6119743993185574e-06, "loss": 0.5503, "step": 7722 }, { "epoch": 0.6272232599691383, "grad_norm": 9.293378886349172, "learning_rate": 1.6113596937584358e-06, "loss": 0.4743, "step": 7723 }, { "epoch": 0.6273044749451799, "grad_norm": 3.9282722976253432, "learning_rate": 1.610745049687521e-06, "loss": 0.4391, "step": 7724 }, { "epoch": 0.6273856899212215, "grad_norm": 5.900142462141413, "learning_rate": 1.6101304671483425e-06, "loss": 0.4308, "step": 7725 }, { "epoch": 0.6274669048972631, "grad_norm": 5.538553572045406, "learning_rate": 1.6095159461834252e-06, "loss": 0.3723, "step": 7726 }, { "epoch": 0.6275481198733046, "grad_norm": 5.156039599008567, "learning_rate": 1.6089014868352925e-06, "loss": 0.5306, "step": 7727 }, { "epoch": 0.6276293348493462, "grad_norm": 5.634533236318975, "learning_rate": 1.608287089146461e-06, "loss": 0.525, "step": 7728 }, { "epoch": 0.6277105498253878, "grad_norm": 5.598968838519203, "learning_rate": 1.6076727531594428e-06, "loss": 0.3932, "step": 7729 }, { "epoch": 0.6277917648014294, "grad_norm": 5.654175409392677, "learning_rate": 1.607058478916748e-06, "loss": 0.4358, "step": 7730 }, { "epoch": 0.6278729797774709, "grad_norm": 7.282317038390533, "learning_rate": 1.6064442664608808e-06, "loss": 0.6522, "step": 7731 }, { "epoch": 0.6279541947535126, "grad_norm": 6.6301535369071924, "learning_rate": 1.6058301158343408e-06, "loss": 0.4913, "step": 7732 }, { "epoch": 0.6280354097295542, "grad_norm": 4.56677620849299, "learning_rate": 1.6052160270796252e-06, "loss": 0.4066, "step": 7733 }, { "epoch": 0.6281166247055957, "grad_norm": 6.930735350423143, "learning_rate": 1.6046020002392242e-06, "loss": 0.6191, "step": 7734 }, { "epoch": 0.6281978396816373, "grad_norm": 3.3385261614929584, "learning_rate": 1.603988035355627e-06, "loss": 0.4239, "step": 7735 }, { "epoch": 0.6282790546576789, "grad_norm": 6.108614723656081, "learning_rate": 1.6033741324713143e-06, "loss": 0.5148, "step": 7736 }, { "epoch": 0.6283602696337205, "grad_norm": 5.083243850769626, "learning_rate": 1.6027602916287665e-06, "loss": 0.5385, "step": 7737 }, { "epoch": 0.628441484609762, "grad_norm": 4.3659844044456735, "learning_rate": 1.6021465128704592e-06, "loss": 0.4434, "step": 7738 }, { "epoch": 0.6285226995858036, "grad_norm": 3.246650947537016, "learning_rate": 1.60153279623886e-06, "loss": 0.5304, "step": 7739 }, { "epoch": 0.6286039145618452, "grad_norm": 6.965030627490836, "learning_rate": 1.6009191417764366e-06, "loss": 0.4603, "step": 7740 }, { "epoch": 0.6286851295378868, "grad_norm": 4.3257443108941835, "learning_rate": 1.600305549525651e-06, "loss": 0.4716, "step": 7741 }, { "epoch": 0.6287663445139283, "grad_norm": 5.777339582650028, "learning_rate": 1.5996920195289586e-06, "loss": 0.5073, "step": 7742 }, { "epoch": 0.62884755948997, "grad_norm": 11.440189242706712, "learning_rate": 1.5990785518288144e-06, "loss": 0.6117, "step": 7743 }, { "epoch": 0.6289287744660116, "grad_norm": 5.114208771227363, "learning_rate": 1.5984651464676664e-06, "loss": 0.4889, "step": 7744 }, { "epoch": 0.6290099894420531, "grad_norm": 5.705508333315021, "learning_rate": 1.5978518034879583e-06, "loss": 0.3973, "step": 7745 }, { "epoch": 0.6290912044180947, "grad_norm": 8.471787190410229, "learning_rate": 1.5972385229321313e-06, "loss": 0.5272, "step": 7746 }, { "epoch": 0.6291724193941363, "grad_norm": 4.82503142826085, "learning_rate": 1.5966253048426212e-06, "loss": 0.6467, "step": 7747 }, { "epoch": 0.6292536343701779, "grad_norm": 4.760084685878624, "learning_rate": 1.596012149261858e-06, "loss": 0.544, "step": 7748 }, { "epoch": 0.6293348493462194, "grad_norm": 4.772517813956784, "learning_rate": 1.5953990562322708e-06, "loss": 0.4272, "step": 7749 }, { "epoch": 0.629416064322261, "grad_norm": 8.327385517646334, "learning_rate": 1.5947860257962808e-06, "loss": 0.4509, "step": 7750 }, { "epoch": 0.6294972792983026, "grad_norm": 4.076666918311391, "learning_rate": 1.5941730579963065e-06, "loss": 0.4959, "step": 7751 }, { "epoch": 0.6295784942743442, "grad_norm": 6.179747838755071, "learning_rate": 1.5935601528747635e-06, "loss": 0.4375, "step": 7752 }, { "epoch": 0.6296597092503857, "grad_norm": 12.93986219429753, "learning_rate": 1.5929473104740605e-06, "loss": 0.5545, "step": 7753 }, { "epoch": 0.6297409242264274, "grad_norm": 3.6144579855996346, "learning_rate": 1.5923345308366033e-06, "loss": 0.4124, "step": 7754 }, { "epoch": 0.629822139202469, "grad_norm": 3.832036350634625, "learning_rate": 1.591721814004792e-06, "loss": 0.6357, "step": 7755 }, { "epoch": 0.6299033541785105, "grad_norm": 3.254583154769827, "learning_rate": 1.5911091600210243e-06, "loss": 0.582, "step": 7756 }, { "epoch": 0.6299845691545521, "grad_norm": 4.7973970186134505, "learning_rate": 1.5904965689276935e-06, "loss": 0.809, "step": 7757 }, { "epoch": 0.6300657841305937, "grad_norm": 4.07832937486864, "learning_rate": 1.5898840407671854e-06, "loss": 0.4779, "step": 7758 }, { "epoch": 0.6301469991066353, "grad_norm": 11.649929406013856, "learning_rate": 1.5892715755818855e-06, "loss": 0.4955, "step": 7759 }, { "epoch": 0.6302282140826768, "grad_norm": 7.298974738788142, "learning_rate": 1.588659173414173e-06, "loss": 0.3887, "step": 7760 }, { "epoch": 0.6303094290587185, "grad_norm": 6.5690578488061115, "learning_rate": 1.5880468343064215e-06, "loss": 0.5319, "step": 7761 }, { "epoch": 0.63039064403476, "grad_norm": 4.501194184210136, "learning_rate": 1.5874345583010038e-06, "loss": 0.5715, "step": 7762 }, { "epoch": 0.6304718590108016, "grad_norm": 4.823495001127094, "learning_rate": 1.5868223454402842e-06, "loss": 0.5115, "step": 7763 }, { "epoch": 0.6305530739868431, "grad_norm": 4.765174608509368, "learning_rate": 1.5862101957666251e-06, "loss": 0.4864, "step": 7764 }, { "epoch": 0.6306342889628848, "grad_norm": 8.316397458431478, "learning_rate": 1.5855981093223851e-06, "loss": 0.5312, "step": 7765 }, { "epoch": 0.6307155039389264, "grad_norm": 3.4813538772732096, "learning_rate": 1.5849860861499161e-06, "loss": 0.5063, "step": 7766 }, { "epoch": 0.6307967189149679, "grad_norm": 5.709151262434003, "learning_rate": 1.584374126291567e-06, "loss": 0.6373, "step": 7767 }, { "epoch": 0.6308779338910095, "grad_norm": 6.071499004825764, "learning_rate": 1.5837622297896832e-06, "loss": 0.4196, "step": 7768 }, { "epoch": 0.6309591488670511, "grad_norm": 4.152234565584312, "learning_rate": 1.5831503966866038e-06, "loss": 0.4568, "step": 7769 }, { "epoch": 0.6310403638430927, "grad_norm": 5.124374169356126, "learning_rate": 1.5825386270246649e-06, "loss": 0.5482, "step": 7770 }, { "epoch": 0.6311215788191342, "grad_norm": 6.603032548895071, "learning_rate": 1.5819269208461962e-06, "loss": 0.4628, "step": 7771 }, { "epoch": 0.6312027937951759, "grad_norm": 3.929016081455086, "learning_rate": 1.5813152781935264e-06, "loss": 0.4962, "step": 7772 }, { "epoch": 0.6312840087712174, "grad_norm": 5.248036747944997, "learning_rate": 1.5807036991089781e-06, "loss": 0.5938, "step": 7773 }, { "epoch": 0.631365223747259, "grad_norm": 4.602869975600319, "learning_rate": 1.5800921836348671e-06, "loss": 0.4762, "step": 7774 }, { "epoch": 0.6314464387233005, "grad_norm": 3.998867630823634, "learning_rate": 1.5794807318135097e-06, "loss": 0.4337, "step": 7775 }, { "epoch": 0.6315276536993422, "grad_norm": 4.511923050001, "learning_rate": 1.5788693436872132e-06, "loss": 0.5222, "step": 7776 }, { "epoch": 0.6316088686753838, "grad_norm": 4.732480730008836, "learning_rate": 1.5782580192982827e-06, "loss": 0.5321, "step": 7777 }, { "epoch": 0.6316900836514253, "grad_norm": 6.209675234125353, "learning_rate": 1.57764675868902e-06, "loss": 0.4472, "step": 7778 }, { "epoch": 0.631771298627467, "grad_norm": 4.538684305840299, "learning_rate": 1.5770355619017198e-06, "loss": 0.5362, "step": 7779 }, { "epoch": 0.6318525136035085, "grad_norm": 3.509225017297568, "learning_rate": 1.5764244289786728e-06, "loss": 0.4968, "step": 7780 }, { "epoch": 0.6319337285795501, "grad_norm": 6.521031887625569, "learning_rate": 1.575813359962169e-06, "loss": 0.3996, "step": 7781 }, { "epoch": 0.6320149435555916, "grad_norm": 12.929846426438688, "learning_rate": 1.5752023548944889e-06, "loss": 0.4625, "step": 7782 }, { "epoch": 0.6320961585316333, "grad_norm": 9.52281730299787, "learning_rate": 1.574591413817911e-06, "loss": 0.5613, "step": 7783 }, { "epoch": 0.6321773735076748, "grad_norm": 3.924489403653561, "learning_rate": 1.57398053677471e-06, "loss": 0.5386, "step": 7784 }, { "epoch": 0.6322585884837164, "grad_norm": 3.8784061814789137, "learning_rate": 1.5733697238071553e-06, "loss": 0.4747, "step": 7785 }, { "epoch": 0.6323398034597579, "grad_norm": 4.881789075504623, "learning_rate": 1.5727589749575107e-06, "loss": 0.5493, "step": 7786 }, { "epoch": 0.6324210184357996, "grad_norm": 7.261971143003088, "learning_rate": 1.5721482902680385e-06, "loss": 0.5279, "step": 7787 }, { "epoch": 0.6325022334118412, "grad_norm": 6.077300198937425, "learning_rate": 1.5715376697809937e-06, "loss": 0.5173, "step": 7788 }, { "epoch": 0.6325834483878827, "grad_norm": 5.975876289320817, "learning_rate": 1.570927113538629e-06, "loss": 0.5638, "step": 7789 }, { "epoch": 0.6326646633639244, "grad_norm": 3.670648216770045, "learning_rate": 1.5703166215831899e-06, "loss": 0.4479, "step": 7790 }, { "epoch": 0.6327458783399659, "grad_norm": 3.6145233680649254, "learning_rate": 1.5697061939569214e-06, "loss": 0.5388, "step": 7791 }, { "epoch": 0.6328270933160075, "grad_norm": 4.822710807501337, "learning_rate": 1.56909583070206e-06, "loss": 0.6051, "step": 7792 }, { "epoch": 0.632908308292049, "grad_norm": 6.545220131213222, "learning_rate": 1.56848553186084e-06, "loss": 0.4538, "step": 7793 }, { "epoch": 0.6329895232680907, "grad_norm": 6.080696544546954, "learning_rate": 1.567875297475492e-06, "loss": 0.4151, "step": 7794 }, { "epoch": 0.6330707382441322, "grad_norm": 9.899327382181253, "learning_rate": 1.56726512758824e-06, "loss": 0.5212, "step": 7795 }, { "epoch": 0.6331519532201738, "grad_norm": 4.273991889208243, "learning_rate": 1.566655022241304e-06, "loss": 0.5173, "step": 7796 }, { "epoch": 0.6332331681962153, "grad_norm": 5.170717612782167, "learning_rate": 1.5660449814769021e-06, "loss": 0.4797, "step": 7797 }, { "epoch": 0.633314383172257, "grad_norm": 5.7053519794438685, "learning_rate": 1.5654350053372442e-06, "loss": 0.4382, "step": 7798 }, { "epoch": 0.6333955981482986, "grad_norm": 3.5081020926912667, "learning_rate": 1.564825093864537e-06, "loss": 0.4524, "step": 7799 }, { "epoch": 0.6334768131243401, "grad_norm": 9.01478070880551, "learning_rate": 1.5642152471009849e-06, "loss": 0.4047, "step": 7800 }, { "epoch": 0.6335580281003818, "grad_norm": 5.177815198718995, "learning_rate": 1.563605465088785e-06, "loss": 0.4582, "step": 7801 }, { "epoch": 0.6336392430764233, "grad_norm": 8.412137360840807, "learning_rate": 1.5629957478701303e-06, "loss": 0.4636, "step": 7802 }, { "epoch": 0.6337204580524649, "grad_norm": 5.581728871746328, "learning_rate": 1.5623860954872116e-06, "loss": 0.5674, "step": 7803 }, { "epoch": 0.6338016730285064, "grad_norm": 4.471083148690759, "learning_rate": 1.5617765079822133e-06, "loss": 0.4592, "step": 7804 }, { "epoch": 0.6338828880045481, "grad_norm": 3.475348192814086, "learning_rate": 1.5611669853973141e-06, "loss": 0.5273, "step": 7805 }, { "epoch": 0.6339641029805896, "grad_norm": 5.120001622959669, "learning_rate": 1.5605575277746912e-06, "loss": 0.4363, "step": 7806 }, { "epoch": 0.6340453179566312, "grad_norm": 4.543060760794114, "learning_rate": 1.559948135156516e-06, "loss": 0.4075, "step": 7807 }, { "epoch": 0.6341265329326727, "grad_norm": 4.1227313596641, "learning_rate": 1.559338807584954e-06, "loss": 0.4653, "step": 7808 }, { "epoch": 0.6342077479087144, "grad_norm": 4.831713571856866, "learning_rate": 1.5587295451021678e-06, "loss": 0.5367, "step": 7809 }, { "epoch": 0.634288962884756, "grad_norm": 4.2655332482022414, "learning_rate": 1.5581203477503166e-06, "loss": 0.6405, "step": 7810 }, { "epoch": 0.6343701778607975, "grad_norm": 8.517022361410643, "learning_rate": 1.5575112155715516e-06, "loss": 0.408, "step": 7811 }, { "epoch": 0.6344513928368392, "grad_norm": 4.14581190989536, "learning_rate": 1.5569021486080223e-06, "loss": 0.4273, "step": 7812 }, { "epoch": 0.6345326078128807, "grad_norm": 4.406741970957099, "learning_rate": 1.5562931469018738e-06, "loss": 0.688, "step": 7813 }, { "epoch": 0.6346138227889223, "grad_norm": 4.483314144569286, "learning_rate": 1.555684210495245e-06, "loss": 0.5208, "step": 7814 }, { "epoch": 0.6346950377649638, "grad_norm": 4.337838645295809, "learning_rate": 1.5550753394302702e-06, "loss": 0.5329, "step": 7815 }, { "epoch": 0.6347762527410055, "grad_norm": 6.548340782272945, "learning_rate": 1.5544665337490822e-06, "loss": 0.5038, "step": 7816 }, { "epoch": 0.634857467717047, "grad_norm": 26.26368001727583, "learning_rate": 1.5538577934938051e-06, "loss": 0.415, "step": 7817 }, { "epoch": 0.6349386826930886, "grad_norm": 6.56905135630574, "learning_rate": 1.5532491187065607e-06, "loss": 0.4125, "step": 7818 }, { "epoch": 0.6350198976691301, "grad_norm": 4.494023935708008, "learning_rate": 1.5526405094294682e-06, "loss": 0.4559, "step": 7819 }, { "epoch": 0.6351011126451718, "grad_norm": 7.264576312866752, "learning_rate": 1.5520319657046384e-06, "loss": 0.4749, "step": 7820 }, { "epoch": 0.6351823276212134, "grad_norm": 3.8422635582509037, "learning_rate": 1.5514234875741785e-06, "loss": 0.5266, "step": 7821 }, { "epoch": 0.6352635425972549, "grad_norm": 4.7814474085953815, "learning_rate": 1.550815075080193e-06, "loss": 0.5509, "step": 7822 }, { "epoch": 0.6353447575732966, "grad_norm": 14.680542072418953, "learning_rate": 1.5502067282647821e-06, "loss": 0.4687, "step": 7823 }, { "epoch": 0.6354259725493381, "grad_norm": 5.102982342242324, "learning_rate": 1.5495984471700382e-06, "loss": 0.5625, "step": 7824 }, { "epoch": 0.6355071875253797, "grad_norm": 6.340239866996015, "learning_rate": 1.5489902318380512e-06, "loss": 0.3983, "step": 7825 }, { "epoch": 0.6355884025014212, "grad_norm": 5.566203991765062, "learning_rate": 1.5483820823109078e-06, "loss": 0.5833, "step": 7826 }, { "epoch": 0.6356696174774629, "grad_norm": 4.137320108495999, "learning_rate": 1.5477739986306878e-06, "loss": 0.6481, "step": 7827 }, { "epoch": 0.6357508324535044, "grad_norm": 4.613431700216779, "learning_rate": 1.5471659808394669e-06, "loss": 0.4265, "step": 7828 }, { "epoch": 0.635832047429546, "grad_norm": 3.7043285271670507, "learning_rate": 1.546558028979318e-06, "loss": 0.3759, "step": 7829 }, { "epoch": 0.6359132624055875, "grad_norm": 5.051376359459655, "learning_rate": 1.5459501430923073e-06, "loss": 0.4925, "step": 7830 }, { "epoch": 0.6359944773816292, "grad_norm": 4.08262450327924, "learning_rate": 1.5453423232204968e-06, "loss": 0.4753, "step": 7831 }, { "epoch": 0.6360756923576708, "grad_norm": 4.998294324944461, "learning_rate": 1.5447345694059462e-06, "loss": 0.4227, "step": 7832 }, { "epoch": 0.6361569073337123, "grad_norm": 13.638204885625212, "learning_rate": 1.5441268816907077e-06, "loss": 0.5512, "step": 7833 }, { "epoch": 0.636238122309754, "grad_norm": 22.49883181989276, "learning_rate": 1.5435192601168293e-06, "loss": 0.5912, "step": 7834 }, { "epoch": 0.6363193372857955, "grad_norm": 5.137209289732004, "learning_rate": 1.542911704726356e-06, "loss": 0.425, "step": 7835 }, { "epoch": 0.6364005522618371, "grad_norm": 5.554841196878051, "learning_rate": 1.5423042155613283e-06, "loss": 0.3496, "step": 7836 }, { "epoch": 0.6364817672378786, "grad_norm": 6.808818397785924, "learning_rate": 1.5416967926637793e-06, "loss": 0.3677, "step": 7837 }, { "epoch": 0.6365629822139203, "grad_norm": 10.986138436222886, "learning_rate": 1.5410894360757408e-06, "loss": 0.5055, "step": 7838 }, { "epoch": 0.6366441971899618, "grad_norm": 7.0748129231448695, "learning_rate": 1.540482145839239e-06, "loss": 0.4956, "step": 7839 }, { "epoch": 0.6367254121660034, "grad_norm": 5.517324400634849, "learning_rate": 1.5398749219962935e-06, "loss": 0.5188, "step": 7840 }, { "epoch": 0.636806627142045, "grad_norm": 7.369254822891894, "learning_rate": 1.5392677645889225e-06, "loss": 0.4918, "step": 7841 }, { "epoch": 0.6368878421180866, "grad_norm": 4.443831510239287, "learning_rate": 1.5386606736591381e-06, "loss": 0.5523, "step": 7842 }, { "epoch": 0.6369690570941282, "grad_norm": 5.484153254041072, "learning_rate": 1.5380536492489468e-06, "loss": 0.4526, "step": 7843 }, { "epoch": 0.6370502720701697, "grad_norm": 3.6793655308252364, "learning_rate": 1.5374466914003516e-06, "loss": 0.5204, "step": 7844 }, { "epoch": 0.6371314870462114, "grad_norm": 5.6124428134045, "learning_rate": 1.536839800155352e-06, "loss": 0.5365, "step": 7845 }, { "epoch": 0.6372127020222529, "grad_norm": 4.415081736558317, "learning_rate": 1.5362329755559402e-06, "loss": 0.4503, "step": 7846 }, { "epoch": 0.6372939169982945, "grad_norm": 8.36645209753681, "learning_rate": 1.5356262176441051e-06, "loss": 0.5873, "step": 7847 }, { "epoch": 0.637375131974336, "grad_norm": 4.410444200701278, "learning_rate": 1.5350195264618333e-06, "loss": 0.4172, "step": 7848 }, { "epoch": 0.6374563469503777, "grad_norm": 5.471308320004208, "learning_rate": 1.5344129020511029e-06, "loss": 0.378, "step": 7849 }, { "epoch": 0.6375375619264192, "grad_norm": 2.964980011323985, "learning_rate": 1.5338063444538887e-06, "loss": 0.4858, "step": 7850 }, { "epoch": 0.6376187769024608, "grad_norm": 4.034123659647055, "learning_rate": 1.533199853712162e-06, "loss": 0.4601, "step": 7851 }, { "epoch": 0.6376999918785023, "grad_norm": 5.360248478803197, "learning_rate": 1.5325934298678896e-06, "loss": 0.5198, "step": 7852 }, { "epoch": 0.637781206854544, "grad_norm": 4.128851671530124, "learning_rate": 1.5319870729630303e-06, "loss": 0.5347, "step": 7853 }, { "epoch": 0.6378624218305856, "grad_norm": 5.837367955901182, "learning_rate": 1.5313807830395437e-06, "loss": 0.4249, "step": 7854 }, { "epoch": 0.6379436368066271, "grad_norm": 4.420374712053163, "learning_rate": 1.5307745601393808e-06, "loss": 0.5663, "step": 7855 }, { "epoch": 0.6380248517826688, "grad_norm": 8.34561480709057, "learning_rate": 1.5301684043044875e-06, "loss": 0.569, "step": 7856 }, { "epoch": 0.6381060667587103, "grad_norm": 11.076659493261435, "learning_rate": 1.5295623155768086e-06, "loss": 0.4581, "step": 7857 }, { "epoch": 0.6381872817347519, "grad_norm": 5.281957951201278, "learning_rate": 1.5289562939982822e-06, "loss": 0.5524, "step": 7858 }, { "epoch": 0.6382684967107934, "grad_norm": 4.075328659102114, "learning_rate": 1.5283503396108401e-06, "loss": 0.5083, "step": 7859 }, { "epoch": 0.6383497116868351, "grad_norm": 4.739091105300953, "learning_rate": 1.5277444524564117e-06, "loss": 0.524, "step": 7860 }, { "epoch": 0.6384309266628766, "grad_norm": 12.502865910338423, "learning_rate": 1.5271386325769227e-06, "loss": 0.2867, "step": 7861 }, { "epoch": 0.6385121416389182, "grad_norm": 4.833851282125769, "learning_rate": 1.526532880014292e-06, "loss": 0.6359, "step": 7862 }, { "epoch": 0.6385933566149598, "grad_norm": 4.157660884304162, "learning_rate": 1.5259271948104323e-06, "loss": 0.4139, "step": 7863 }, { "epoch": 0.6386745715910014, "grad_norm": 4.699011951945198, "learning_rate": 1.5253215770072564e-06, "loss": 0.4898, "step": 7864 }, { "epoch": 0.638755786567043, "grad_norm": 5.911530191898309, "learning_rate": 1.5247160266466693e-06, "loss": 0.5325, "step": 7865 }, { "epoch": 0.6388370015430845, "grad_norm": 3.680767366819412, "learning_rate": 1.5241105437705706e-06, "loss": 0.5321, "step": 7866 }, { "epoch": 0.6389182165191262, "grad_norm": 6.398260678627941, "learning_rate": 1.523505128420858e-06, "loss": 0.4228, "step": 7867 }, { "epoch": 0.6389994314951677, "grad_norm": 3.398702627971208, "learning_rate": 1.522899780639423e-06, "loss": 0.5505, "step": 7868 }, { "epoch": 0.6390806464712093, "grad_norm": 6.202752332409573, "learning_rate": 1.5222945004681504e-06, "loss": 0.7697, "step": 7869 }, { "epoch": 0.6391618614472508, "grad_norm": 5.874268780003323, "learning_rate": 1.5216892879489253e-06, "loss": 0.4696, "step": 7870 }, { "epoch": 0.6392430764232925, "grad_norm": 4.404120702182698, "learning_rate": 1.521084143123624e-06, "loss": 0.6132, "step": 7871 }, { "epoch": 0.639324291399334, "grad_norm": 4.8946588555368695, "learning_rate": 1.5204790660341178e-06, "loss": 0.4298, "step": 7872 }, { "epoch": 0.6394055063753756, "grad_norm": 4.660366909329026, "learning_rate": 1.519874056722277e-06, "loss": 0.6014, "step": 7873 }, { "epoch": 0.6394867213514172, "grad_norm": 9.926138073456057, "learning_rate": 1.5192691152299649e-06, "loss": 0.5607, "step": 7874 }, { "epoch": 0.6395679363274588, "grad_norm": 6.862534435729142, "learning_rate": 1.5186642415990382e-06, "loss": 0.4831, "step": 7875 }, { "epoch": 0.6396491513035004, "grad_norm": 5.109846150321177, "learning_rate": 1.518059435871353e-06, "loss": 0.3118, "step": 7876 }, { "epoch": 0.6397303662795419, "grad_norm": 4.824987683441751, "learning_rate": 1.5174546980887585e-06, "loss": 0.407, "step": 7877 }, { "epoch": 0.6398115812555836, "grad_norm": 3.088737083022684, "learning_rate": 1.516850028293099e-06, "loss": 0.6486, "step": 7878 }, { "epoch": 0.6398927962316251, "grad_norm": 4.687914435447199, "learning_rate": 1.516245426526213e-06, "loss": 0.6395, "step": 7879 }, { "epoch": 0.6399740112076667, "grad_norm": 6.45233418054512, "learning_rate": 1.5156408928299377e-06, "loss": 0.5865, "step": 7880 }, { "epoch": 0.6400552261837082, "grad_norm": 4.4953810242052, "learning_rate": 1.5150364272461035e-06, "loss": 0.4263, "step": 7881 }, { "epoch": 0.6401364411597499, "grad_norm": 5.174417200023911, "learning_rate": 1.5144320298165346e-06, "loss": 0.4651, "step": 7882 }, { "epoch": 0.6402176561357914, "grad_norm": 8.999793348428359, "learning_rate": 1.5138277005830538e-06, "loss": 0.4134, "step": 7883 }, { "epoch": 0.640298871111833, "grad_norm": 4.942872220787506, "learning_rate": 1.5132234395874773e-06, "loss": 0.5613, "step": 7884 }, { "epoch": 0.6403800860878746, "grad_norm": 3.7848669467000433, "learning_rate": 1.5126192468716152e-06, "loss": 0.4878, "step": 7885 }, { "epoch": 0.6404613010639162, "grad_norm": 4.408963540747303, "learning_rate": 1.5120151224772765e-06, "loss": 0.6558, "step": 7886 }, { "epoch": 0.6405425160399578, "grad_norm": 6.167053391776367, "learning_rate": 1.5114110664462624e-06, "loss": 0.5179, "step": 7887 }, { "epoch": 0.6406237310159993, "grad_norm": 4.691809504965685, "learning_rate": 1.5108070788203699e-06, "loss": 0.5723, "step": 7888 }, { "epoch": 0.640704945992041, "grad_norm": 4.8601836373152, "learning_rate": 1.5102031596413927e-06, "loss": 0.5001, "step": 7889 }, { "epoch": 0.6407861609680825, "grad_norm": 4.771276325593275, "learning_rate": 1.509599308951119e-06, "loss": 0.4755, "step": 7890 }, { "epoch": 0.6408673759441241, "grad_norm": 7.6294635480914135, "learning_rate": 1.5089955267913303e-06, "loss": 0.3362, "step": 7891 }, { "epoch": 0.6409485909201657, "grad_norm": 4.020044219945879, "learning_rate": 1.5083918132038072e-06, "loss": 0.3609, "step": 7892 }, { "epoch": 0.6410298058962073, "grad_norm": 8.75122403334399, "learning_rate": 1.5077881682303225e-06, "loss": 0.4598, "step": 7893 }, { "epoch": 0.6411110208722488, "grad_norm": 12.260107016009812, "learning_rate": 1.5071845919126448e-06, "loss": 0.5443, "step": 7894 }, { "epoch": 0.6411922358482904, "grad_norm": 6.260976119003239, "learning_rate": 1.5065810842925399e-06, "loss": 0.456, "step": 7895 }, { "epoch": 0.641273450824332, "grad_norm": 4.832778705163437, "learning_rate": 1.5059776454117658e-06, "loss": 0.4861, "step": 7896 }, { "epoch": 0.6413546658003736, "grad_norm": 4.117223333471925, "learning_rate": 1.505374275312078e-06, "loss": 0.4146, "step": 7897 }, { "epoch": 0.6414358807764152, "grad_norm": 7.715748444314561, "learning_rate": 1.504770974035226e-06, "loss": 0.4328, "step": 7898 }, { "epoch": 0.6415170957524567, "grad_norm": 3.698130972907306, "learning_rate": 1.5041677416229556e-06, "loss": 0.5428, "step": 7899 }, { "epoch": 0.6415983107284984, "grad_norm": 8.80799380441798, "learning_rate": 1.5035645781170078e-06, "loss": 0.5696, "step": 7900 }, { "epoch": 0.6416795257045399, "grad_norm": 5.951094606026148, "learning_rate": 1.502961483559116e-06, "loss": 0.6523, "step": 7901 }, { "epoch": 0.6417607406805815, "grad_norm": 5.956077850717039, "learning_rate": 1.502358457991014e-06, "loss": 0.4707, "step": 7902 }, { "epoch": 0.641841955656623, "grad_norm": 17.140276946196106, "learning_rate": 1.5017555014544273e-06, "loss": 0.4322, "step": 7903 }, { "epoch": 0.6419231706326647, "grad_norm": 4.87844418439407, "learning_rate": 1.5011526139910754e-06, "loss": 0.523, "step": 7904 }, { "epoch": 0.6420043856087062, "grad_norm": 4.800050176008073, "learning_rate": 1.5005497956426773e-06, "loss": 0.4496, "step": 7905 }, { "epoch": 0.6420856005847478, "grad_norm": 15.944468140204993, "learning_rate": 1.4999470464509432e-06, "loss": 0.4226, "step": 7906 }, { "epoch": 0.6421668155607894, "grad_norm": 10.898961363880764, "learning_rate": 1.4993443664575807e-06, "loss": 0.4787, "step": 7907 }, { "epoch": 0.642248030536831, "grad_norm": 3.386355939222327, "learning_rate": 1.4987417557042928e-06, "loss": 0.4948, "step": 7908 }, { "epoch": 0.6423292455128726, "grad_norm": 4.044784764443736, "learning_rate": 1.4981392142327761e-06, "loss": 0.504, "step": 7909 }, { "epoch": 0.6424104604889141, "grad_norm": 7.7128980212700915, "learning_rate": 1.4975367420847225e-06, "loss": 0.5462, "step": 7910 }, { "epoch": 0.6424916754649558, "grad_norm": 7.797675381291463, "learning_rate": 1.4969343393018224e-06, "loss": 0.4488, "step": 7911 }, { "epoch": 0.6425728904409973, "grad_norm": 3.7094574496346198, "learning_rate": 1.4963320059257565e-06, "loss": 0.5137, "step": 7912 }, { "epoch": 0.6426541054170389, "grad_norm": 6.476581873676165, "learning_rate": 1.4957297419982047e-06, "loss": 0.4704, "step": 7913 }, { "epoch": 0.6427353203930805, "grad_norm": 4.73377261927438, "learning_rate": 1.4951275475608387e-06, "loss": 0.463, "step": 7914 }, { "epoch": 0.6428165353691221, "grad_norm": 3.1667300398717386, "learning_rate": 1.4945254226553288e-06, "loss": 0.5192, "step": 7915 }, { "epoch": 0.6428977503451636, "grad_norm": 4.733929299465015, "learning_rate": 1.4939233673233387e-06, "loss": 0.6137, "step": 7916 }, { "epoch": 0.6429789653212052, "grad_norm": 5.618667686017774, "learning_rate": 1.4933213816065257e-06, "loss": 0.4966, "step": 7917 }, { "epoch": 0.6430601802972468, "grad_norm": 3.847952094017611, "learning_rate": 1.492719465546546e-06, "loss": 0.4784, "step": 7918 }, { "epoch": 0.6431413952732884, "grad_norm": 4.697213023502698, "learning_rate": 1.492117619185049e-06, "loss": 0.4471, "step": 7919 }, { "epoch": 0.64322261024933, "grad_norm": 5.479689495939854, "learning_rate": 1.4915158425636772e-06, "loss": 0.5378, "step": 7920 }, { "epoch": 0.6433038252253716, "grad_norm": 3.3937052015697367, "learning_rate": 1.4909141357240731e-06, "loss": 0.4669, "step": 7921 }, { "epoch": 0.6433850402014132, "grad_norm": 4.013372087007401, "learning_rate": 1.4903124987078698e-06, "loss": 0.4361, "step": 7922 }, { "epoch": 0.6434662551774547, "grad_norm": 3.746362491526315, "learning_rate": 1.4897109315566974e-06, "loss": 0.4443, "step": 7923 }, { "epoch": 0.6435474701534963, "grad_norm": 4.102308375643804, "learning_rate": 1.4891094343121827e-06, "loss": 0.446, "step": 7924 }, { "epoch": 0.6436286851295379, "grad_norm": 3.8787498068648634, "learning_rate": 1.488508007015944e-06, "loss": 0.7058, "step": 7925 }, { "epoch": 0.6437099001055795, "grad_norm": 7.193908220842526, "learning_rate": 1.487906649709598e-06, "loss": 0.4578, "step": 7926 }, { "epoch": 0.643791115081621, "grad_norm": 7.588715391068836, "learning_rate": 1.4873053624347567e-06, "loss": 0.5096, "step": 7927 }, { "epoch": 0.6438723300576626, "grad_norm": 5.508197098569508, "learning_rate": 1.4867041452330238e-06, "loss": 0.5221, "step": 7928 }, { "epoch": 0.6439535450337042, "grad_norm": 3.6948692770323928, "learning_rate": 1.4861029981460007e-06, "loss": 0.4898, "step": 7929 }, { "epoch": 0.6440347600097458, "grad_norm": 3.770725561207981, "learning_rate": 1.4855019212152852e-06, "loss": 0.4907, "step": 7930 }, { "epoch": 0.6441159749857874, "grad_norm": 6.044600128479535, "learning_rate": 1.484900914482467e-06, "loss": 0.3815, "step": 7931 }, { "epoch": 0.644197189961829, "grad_norm": 5.141487441925867, "learning_rate": 1.484299977989134e-06, "loss": 0.5212, "step": 7932 }, { "epoch": 0.6442784049378706, "grad_norm": 6.722063710271542, "learning_rate": 1.4836991117768657e-06, "loss": 0.465, "step": 7933 }, { "epoch": 0.6443596199139121, "grad_norm": 6.60995189007181, "learning_rate": 1.4830983158872414e-06, "loss": 0.3926, "step": 7934 }, { "epoch": 0.6444408348899537, "grad_norm": 5.159161950257783, "learning_rate": 1.482497590361831e-06, "loss": 0.5679, "step": 7935 }, { "epoch": 0.6445220498659953, "grad_norm": 4.722697499573461, "learning_rate": 1.4818969352422018e-06, "loss": 0.4301, "step": 7936 }, { "epoch": 0.6446032648420369, "grad_norm": 5.60413330763897, "learning_rate": 1.4812963505699179e-06, "loss": 0.5284, "step": 7937 }, { "epoch": 0.6446844798180784, "grad_norm": 4.906758714354853, "learning_rate": 1.4806958363865342e-06, "loss": 0.4881, "step": 7938 }, { "epoch": 0.64476569479412, "grad_norm": 5.416581495053993, "learning_rate": 1.4800953927336036e-06, "loss": 0.4121, "step": 7939 }, { "epoch": 0.6448469097701616, "grad_norm": 3.1479449081387605, "learning_rate": 1.4794950196526753e-06, "loss": 0.4134, "step": 7940 }, { "epoch": 0.6449281247462032, "grad_norm": 4.49719543896871, "learning_rate": 1.4788947171852899e-06, "loss": 0.3979, "step": 7941 }, { "epoch": 0.6450093397222448, "grad_norm": 5.0629449929653205, "learning_rate": 1.4782944853729856e-06, "loss": 0.4008, "step": 7942 }, { "epoch": 0.6450905546982864, "grad_norm": 4.398126234430731, "learning_rate": 1.4776943242572966e-06, "loss": 0.6205, "step": 7943 }, { "epoch": 0.645171769674328, "grad_norm": 4.1699063676723425, "learning_rate": 1.4770942338797491e-06, "loss": 0.5619, "step": 7944 }, { "epoch": 0.6452529846503695, "grad_norm": 83.64698952954932, "learning_rate": 1.4764942142818667e-06, "loss": 0.4946, "step": 7945 }, { "epoch": 0.6453341996264111, "grad_norm": 5.2534053694927625, "learning_rate": 1.475894265505169e-06, "loss": 0.6233, "step": 7946 }, { "epoch": 0.6454154146024527, "grad_norm": 3.941627067484926, "learning_rate": 1.4752943875911673e-06, "loss": 0.4209, "step": 7947 }, { "epoch": 0.6454966295784943, "grad_norm": 9.922976413484275, "learning_rate": 1.4746945805813707e-06, "loss": 0.5124, "step": 7948 }, { "epoch": 0.6455778445545358, "grad_norm": 6.9619380350835085, "learning_rate": 1.4740948445172834e-06, "loss": 0.3986, "step": 7949 }, { "epoch": 0.6456590595305775, "grad_norm": 4.2939089133962325, "learning_rate": 1.4734951794404035e-06, "loss": 0.4537, "step": 7950 }, { "epoch": 0.645740274506619, "grad_norm": 3.1416028655227293, "learning_rate": 1.4728955853922238e-06, "loss": 0.5554, "step": 7951 }, { "epoch": 0.6458214894826606, "grad_norm": 17.016801772420305, "learning_rate": 1.4722960624142336e-06, "loss": 0.4172, "step": 7952 }, { "epoch": 0.6459027044587022, "grad_norm": 5.911786929784765, "learning_rate": 1.4716966105479175e-06, "loss": 0.3805, "step": 7953 }, { "epoch": 0.6459839194347438, "grad_norm": 4.538417255604256, "learning_rate": 1.471097229834753e-06, "loss": 0.4062, "step": 7954 }, { "epoch": 0.6460651344107854, "grad_norm": 7.061375321676026, "learning_rate": 1.4704979203162148e-06, "loss": 0.4817, "step": 7955 }, { "epoch": 0.6461463493868269, "grad_norm": 8.396993615373008, "learning_rate": 1.4698986820337729e-06, "loss": 0.3514, "step": 7956 }, { "epoch": 0.6462275643628685, "grad_norm": 5.11600701972491, "learning_rate": 1.4692995150288896e-06, "loss": 0.4232, "step": 7957 }, { "epoch": 0.6463087793389101, "grad_norm": 3.9201591191720446, "learning_rate": 1.4687004193430248e-06, "loss": 0.6237, "step": 7958 }, { "epoch": 0.6463899943149517, "grad_norm": 4.347166743753607, "learning_rate": 1.4681013950176338e-06, "loss": 0.4223, "step": 7959 }, { "epoch": 0.6464712092909932, "grad_norm": 4.5755798327245625, "learning_rate": 1.4675024420941643e-06, "loss": 0.4167, "step": 7960 }, { "epoch": 0.6465524242670349, "grad_norm": 3.768261457483465, "learning_rate": 1.4669035606140613e-06, "loss": 0.531, "step": 7961 }, { "epoch": 0.6466336392430764, "grad_norm": 4.347537189926498, "learning_rate": 1.4663047506187649e-06, "loss": 0.4719, "step": 7962 }, { "epoch": 0.646714854219118, "grad_norm": 4.256186132283668, "learning_rate": 1.4657060121497095e-06, "loss": 0.55, "step": 7963 }, { "epoch": 0.6467960691951596, "grad_norm": 4.071283516303743, "learning_rate": 1.4651073452483228e-06, "loss": 0.762, "step": 7964 }, { "epoch": 0.6468772841712012, "grad_norm": 10.936892803098575, "learning_rate": 1.4645087499560313e-06, "loss": 0.5069, "step": 7965 }, { "epoch": 0.6469584991472428, "grad_norm": 4.622669916018039, "learning_rate": 1.4639102263142546e-06, "loss": 0.4825, "step": 7966 }, { "epoch": 0.6470397141232843, "grad_norm": 5.995534947389931, "learning_rate": 1.463311774364406e-06, "loss": 0.5098, "step": 7967 }, { "epoch": 0.647120929099326, "grad_norm": 81.25869393551474, "learning_rate": 1.4627133941478958e-06, "loss": 0.6188, "step": 7968 }, { "epoch": 0.6472021440753675, "grad_norm": 5.489822227742221, "learning_rate": 1.46211508570613e-06, "loss": 0.4715, "step": 7969 }, { "epoch": 0.6472833590514091, "grad_norm": 5.211788919279321, "learning_rate": 1.4615168490805066e-06, "loss": 0.3925, "step": 7970 }, { "epoch": 0.6473645740274506, "grad_norm": 4.441097732454784, "learning_rate": 1.4609186843124208e-06, "loss": 0.5827, "step": 7971 }, { "epoch": 0.6474457890034923, "grad_norm": 4.918274629630136, "learning_rate": 1.4603205914432638e-06, "loss": 0.5865, "step": 7972 }, { "epoch": 0.6475270039795338, "grad_norm": 5.381182030506248, "learning_rate": 1.4597225705144189e-06, "loss": 0.427, "step": 7973 }, { "epoch": 0.6476082189555754, "grad_norm": 4.47373690879082, "learning_rate": 1.459124621567266e-06, "loss": 0.5468, "step": 7974 }, { "epoch": 0.647689433931617, "grad_norm": 4.853763535922953, "learning_rate": 1.4585267446431817e-06, "loss": 0.4893, "step": 7975 }, { "epoch": 0.6477706489076586, "grad_norm": 12.67331094044437, "learning_rate": 1.4579289397835344e-06, "loss": 0.6503, "step": 7976 }, { "epoch": 0.6478518638837002, "grad_norm": 3.5376573305444725, "learning_rate": 1.4573312070296885e-06, "loss": 0.48, "step": 7977 }, { "epoch": 0.6479330788597417, "grad_norm": 6.986996329380094, "learning_rate": 1.4567335464230062e-06, "loss": 0.4793, "step": 7978 }, { "epoch": 0.6480142938357834, "grad_norm": 6.098834362052517, "learning_rate": 1.4561359580048394e-06, "loss": 0.567, "step": 7979 }, { "epoch": 0.6480955088118249, "grad_norm": 5.728827122709186, "learning_rate": 1.4555384418165405e-06, "loss": 0.5839, "step": 7980 }, { "epoch": 0.6481767237878665, "grad_norm": 6.108126965155767, "learning_rate": 1.4549409978994543e-06, "loss": 0.721, "step": 7981 }, { "epoch": 0.648257938763908, "grad_norm": 12.175353754173727, "learning_rate": 1.45434362629492e-06, "loss": 0.3989, "step": 7982 }, { "epoch": 0.6483391537399497, "grad_norm": 5.726270390332232, "learning_rate": 1.453746327044272e-06, "loss": 0.5943, "step": 7983 }, { "epoch": 0.6484203687159912, "grad_norm": 8.228427503586767, "learning_rate": 1.4531491001888421e-06, "loss": 0.3702, "step": 7984 }, { "epoch": 0.6485015836920328, "grad_norm": 6.718803197230006, "learning_rate": 1.4525519457699527e-06, "loss": 0.4701, "step": 7985 }, { "epoch": 0.6485827986680744, "grad_norm": 6.288459331865035, "learning_rate": 1.451954863828926e-06, "loss": 0.454, "step": 7986 }, { "epoch": 0.648664013644116, "grad_norm": 5.184071201153414, "learning_rate": 1.4513578544070753e-06, "loss": 0.4614, "step": 7987 }, { "epoch": 0.6487452286201576, "grad_norm": 6.12187597162872, "learning_rate": 1.4507609175457121e-06, "loss": 0.4276, "step": 7988 }, { "epoch": 0.6488264435961991, "grad_norm": 6.050006695871517, "learning_rate": 1.4501640532861405e-06, "loss": 0.4831, "step": 7989 }, { "epoch": 0.6489076585722408, "grad_norm": 6.399026819639625, "learning_rate": 1.4495672616696594e-06, "loss": 0.3898, "step": 7990 }, { "epoch": 0.6489888735482823, "grad_norm": 5.876464671442483, "learning_rate": 1.448970542737565e-06, "loss": 0.5073, "step": 7991 }, { "epoch": 0.6490700885243239, "grad_norm": 4.813231306921125, "learning_rate": 1.4483738965311455e-06, "loss": 0.5262, "step": 7992 }, { "epoch": 0.6491513035003654, "grad_norm": 3.7301648640363716, "learning_rate": 1.4477773230916872e-06, "loss": 0.4813, "step": 7993 }, { "epoch": 0.6492325184764071, "grad_norm": 5.647909285913068, "learning_rate": 1.44718082246047e-06, "loss": 0.442, "step": 7994 }, { "epoch": 0.6493137334524486, "grad_norm": 5.414289909475496, "learning_rate": 1.4465843946787683e-06, "loss": 0.3569, "step": 7995 }, { "epoch": 0.6493949484284902, "grad_norm": 7.022409413689571, "learning_rate": 1.44598803978785e-06, "loss": 0.485, "step": 7996 }, { "epoch": 0.6494761634045318, "grad_norm": 5.325732469754769, "learning_rate": 1.4453917578289823e-06, "loss": 0.6726, "step": 7997 }, { "epoch": 0.6495573783805734, "grad_norm": 6.073037858190773, "learning_rate": 1.4447955488434223e-06, "loss": 0.5305, "step": 7998 }, { "epoch": 0.649638593356615, "grad_norm": 7.0964517666533515, "learning_rate": 1.4441994128724258e-06, "loss": 0.4777, "step": 7999 }, { "epoch": 0.6497198083326565, "grad_norm": 5.760466887618525, "learning_rate": 1.443603349957243e-06, "loss": 0.5041, "step": 8000 }, { "epoch": 0.6498010233086982, "grad_norm": 4.591683516314596, "learning_rate": 1.4430073601391175e-06, "loss": 0.5213, "step": 8001 }, { "epoch": 0.6498822382847397, "grad_norm": 5.711693932834374, "learning_rate": 1.442411443459289e-06, "loss": 0.5668, "step": 8002 }, { "epoch": 0.6499634532607813, "grad_norm": 5.284351713643368, "learning_rate": 1.44181559995899e-06, "loss": 0.6332, "step": 8003 }, { "epoch": 0.6500446682368228, "grad_norm": 7.634523671563961, "learning_rate": 1.4412198296794516e-06, "loss": 0.4598, "step": 8004 }, { "epoch": 0.6501258832128645, "grad_norm": 4.421291562696544, "learning_rate": 1.4406241326618981e-06, "loss": 0.585, "step": 8005 }, { "epoch": 0.650207098188906, "grad_norm": 4.721051742091286, "learning_rate": 1.4400285089475468e-06, "loss": 0.7653, "step": 8006 }, { "epoch": 0.6502883131649476, "grad_norm": 5.351925711601577, "learning_rate": 1.4394329585776143e-06, "loss": 0.3979, "step": 8007 }, { "epoch": 0.6503695281409893, "grad_norm": 4.447327868885781, "learning_rate": 1.4388374815933078e-06, "loss": 0.4846, "step": 8008 }, { "epoch": 0.6504507431170308, "grad_norm": 4.016961136441612, "learning_rate": 1.4382420780358306e-06, "loss": 0.4739, "step": 8009 }, { "epoch": 0.6505319580930724, "grad_norm": 5.414485646946576, "learning_rate": 1.4376467479463832e-06, "loss": 0.3597, "step": 8010 }, { "epoch": 0.6506131730691139, "grad_norm": 11.24686014696604, "learning_rate": 1.4370514913661576e-06, "loss": 0.5058, "step": 8011 }, { "epoch": 0.6506943880451556, "grad_norm": 3.574284491275758, "learning_rate": 1.436456308336343e-06, "loss": 0.6707, "step": 8012 }, { "epoch": 0.6507756030211971, "grad_norm": 4.818858068808228, "learning_rate": 1.4358611988981242e-06, "loss": 0.4094, "step": 8013 }, { "epoch": 0.6508568179972387, "grad_norm": 4.129816073543707, "learning_rate": 1.4352661630926783e-06, "loss": 0.5239, "step": 8014 }, { "epoch": 0.6509380329732802, "grad_norm": 7.2250869844226155, "learning_rate": 1.4346712009611786e-06, "loss": 0.4763, "step": 8015 }, { "epoch": 0.6510192479493219, "grad_norm": 6.519568020534992, "learning_rate": 1.434076312544794e-06, "loss": 0.4563, "step": 8016 }, { "epoch": 0.6511004629253634, "grad_norm": 5.982613565742739, "learning_rate": 1.4334814978846863e-06, "loss": 0.4024, "step": 8017 }, { "epoch": 0.651181677901405, "grad_norm": 6.189908706083811, "learning_rate": 1.4328867570220148e-06, "loss": 0.4997, "step": 8018 }, { "epoch": 0.6512628928774467, "grad_norm": 6.213197790148053, "learning_rate": 1.4322920899979327e-06, "loss": 0.5354, "step": 8019 }, { "epoch": 0.6513441078534882, "grad_norm": 4.184480691904144, "learning_rate": 1.4316974968535873e-06, "loss": 0.3249, "step": 8020 }, { "epoch": 0.6514253228295298, "grad_norm": 4.810772139069584, "learning_rate": 1.4311029776301216e-06, "loss": 0.4938, "step": 8021 }, { "epoch": 0.6515065378055713, "grad_norm": 6.385985541045838, "learning_rate": 1.4305085323686714e-06, "loss": 0.4544, "step": 8022 }, { "epoch": 0.651587752781613, "grad_norm": 4.1470201913398155, "learning_rate": 1.4299141611103717e-06, "loss": 0.5221, "step": 8023 }, { "epoch": 0.6516689677576545, "grad_norm": 4.404511419426839, "learning_rate": 1.4293198638963476e-06, "loss": 0.5142, "step": 8024 }, { "epoch": 0.6517501827336961, "grad_norm": 4.771427690672761, "learning_rate": 1.4287256407677225e-06, "loss": 0.6298, "step": 8025 }, { "epoch": 0.6518313977097376, "grad_norm": 5.110969640062156, "learning_rate": 1.4281314917656144e-06, "loss": 0.4597, "step": 8026 }, { "epoch": 0.6519126126857793, "grad_norm": 4.07743528288326, "learning_rate": 1.4275374169311345e-06, "loss": 0.5317, "step": 8027 }, { "epoch": 0.6519938276618208, "grad_norm": 3.381461916822027, "learning_rate": 1.426943416305388e-06, "loss": 0.6096, "step": 8028 }, { "epoch": 0.6520750426378624, "grad_norm": 6.120291184514056, "learning_rate": 1.4263494899294794e-06, "loss": 0.3527, "step": 8029 }, { "epoch": 0.6521562576139041, "grad_norm": 6.654443944669531, "learning_rate": 1.4257556378445025e-06, "loss": 0.479, "step": 8030 }, { "epoch": 0.6522374725899456, "grad_norm": 5.964757640753549, "learning_rate": 1.4251618600915503e-06, "loss": 0.4986, "step": 8031 }, { "epoch": 0.6523186875659872, "grad_norm": 4.555104801980313, "learning_rate": 1.4245681567117097e-06, "loss": 0.4979, "step": 8032 }, { "epoch": 0.6523999025420287, "grad_norm": 4.9235533317425, "learning_rate": 1.4239745277460614e-06, "loss": 0.4747, "step": 8033 }, { "epoch": 0.6524811175180704, "grad_norm": 6.579534318663932, "learning_rate": 1.4233809732356798e-06, "loss": 0.5299, "step": 8034 }, { "epoch": 0.6525623324941119, "grad_norm": 5.083100688498042, "learning_rate": 1.4227874932216378e-06, "loss": 0.3675, "step": 8035 }, { "epoch": 0.6526435474701535, "grad_norm": 23.029068662665217, "learning_rate": 1.4221940877450006e-06, "loss": 0.5651, "step": 8036 }, { "epoch": 0.652724762446195, "grad_norm": 5.851390054577819, "learning_rate": 1.4216007568468272e-06, "loss": 0.52, "step": 8037 }, { "epoch": 0.6528059774222367, "grad_norm": 3.2181868045877704, "learning_rate": 1.4210075005681737e-06, "loss": 0.5162, "step": 8038 }, { "epoch": 0.6528871923982782, "grad_norm": 7.483188427836994, "learning_rate": 1.420414318950092e-06, "loss": 0.4435, "step": 8039 }, { "epoch": 0.6529684073743198, "grad_norm": 4.4784108288995315, "learning_rate": 1.4198212120336255e-06, "loss": 0.441, "step": 8040 }, { "epoch": 0.6530496223503615, "grad_norm": 6.391569173443781, "learning_rate": 1.4192281798598133e-06, "loss": 0.492, "step": 8041 }, { "epoch": 0.653130837326403, "grad_norm": 5.75063446601935, "learning_rate": 1.4186352224696926e-06, "loss": 0.5992, "step": 8042 }, { "epoch": 0.6532120523024446, "grad_norm": 6.211326632664685, "learning_rate": 1.4180423399042902e-06, "loss": 0.4642, "step": 8043 }, { "epoch": 0.6532932672784861, "grad_norm": 11.997397806821628, "learning_rate": 1.4174495322046316e-06, "loss": 0.5696, "step": 8044 }, { "epoch": 0.6533744822545278, "grad_norm": 7.193039419958394, "learning_rate": 1.4168567994117375e-06, "loss": 0.4744, "step": 8045 }, { "epoch": 0.6534556972305693, "grad_norm": 4.494283159785974, "learning_rate": 1.41626414156662e-06, "loss": 0.3842, "step": 8046 }, { "epoch": 0.6535369122066109, "grad_norm": 5.857275012139722, "learning_rate": 1.4156715587102875e-06, "loss": 0.4835, "step": 8047 }, { "epoch": 0.6536181271826524, "grad_norm": 6.4200372823639675, "learning_rate": 1.4150790508837453e-06, "loss": 0.4392, "step": 8048 }, { "epoch": 0.6536993421586941, "grad_norm": 3.22921759731994, "learning_rate": 1.4144866181279908e-06, "loss": 0.429, "step": 8049 }, { "epoch": 0.6537805571347356, "grad_norm": 4.691847016227414, "learning_rate": 1.4138942604840167e-06, "loss": 0.4329, "step": 8050 }, { "epoch": 0.6538617721107772, "grad_norm": 5.045037115274969, "learning_rate": 1.4133019779928115e-06, "loss": 0.4094, "step": 8051 }, { "epoch": 0.6539429870868189, "grad_norm": 3.3537113111805317, "learning_rate": 1.4127097706953591e-06, "loss": 0.6154, "step": 8052 }, { "epoch": 0.6540242020628604, "grad_norm": 6.660882673507178, "learning_rate": 1.4121176386326352e-06, "loss": 0.3856, "step": 8053 }, { "epoch": 0.654105417038902, "grad_norm": 4.011343252532216, "learning_rate": 1.4115255818456138e-06, "loss": 0.5311, "step": 8054 }, { "epoch": 0.6541866320149435, "grad_norm": 3.3532310150002513, "learning_rate": 1.4109336003752619e-06, "loss": 0.4813, "step": 8055 }, { "epoch": 0.6542678469909852, "grad_norm": 4.86729352164219, "learning_rate": 1.4103416942625397e-06, "loss": 0.4928, "step": 8056 }, { "epoch": 0.6543490619670267, "grad_norm": 3.96263482227693, "learning_rate": 1.4097498635484057e-06, "loss": 0.447, "step": 8057 }, { "epoch": 0.6544302769430683, "grad_norm": 5.204864204379556, "learning_rate": 1.4091581082738122e-06, "loss": 0.3607, "step": 8058 }, { "epoch": 0.6545114919191098, "grad_norm": 4.292415805341833, "learning_rate": 1.4085664284797041e-06, "loss": 0.448, "step": 8059 }, { "epoch": 0.6545927068951515, "grad_norm": 7.880826154296216, "learning_rate": 1.407974824207022e-06, "loss": 0.6328, "step": 8060 }, { "epoch": 0.654673921871193, "grad_norm": 5.310250315901305, "learning_rate": 1.4073832954967032e-06, "loss": 0.4029, "step": 8061 }, { "epoch": 0.6547551368472346, "grad_norm": 6.728553995913191, "learning_rate": 1.406791842389677e-06, "loss": 0.5177, "step": 8062 }, { "epoch": 0.6548363518232763, "grad_norm": 5.981311174156778, "learning_rate": 1.4062004649268696e-06, "loss": 0.4751, "step": 8063 }, { "epoch": 0.6549175667993178, "grad_norm": 6.024059197477494, "learning_rate": 1.405609163149202e-06, "loss": 0.4305, "step": 8064 }, { "epoch": 0.6549987817753594, "grad_norm": 7.798448146594784, "learning_rate": 1.4050179370975886e-06, "loss": 0.4975, "step": 8065 }, { "epoch": 0.6550799967514009, "grad_norm": 4.4502104987754105, "learning_rate": 1.4044267868129374e-06, "loss": 0.4111, "step": 8066 }, { "epoch": 0.6551612117274426, "grad_norm": 3.807078310048636, "learning_rate": 1.4038357123361556e-06, "loss": 0.5769, "step": 8067 }, { "epoch": 0.6552424267034841, "grad_norm": 4.154767587614258, "learning_rate": 1.4032447137081414e-06, "loss": 0.5461, "step": 8068 }, { "epoch": 0.6553236416795257, "grad_norm": 5.8170559826468935, "learning_rate": 1.4026537909697873e-06, "loss": 0.5508, "step": 8069 }, { "epoch": 0.6554048566555672, "grad_norm": 6.732343822655561, "learning_rate": 1.4020629441619831e-06, "loss": 0.4174, "step": 8070 }, { "epoch": 0.6554860716316089, "grad_norm": 15.814471928424329, "learning_rate": 1.4014721733256137e-06, "loss": 0.3694, "step": 8071 }, { "epoch": 0.6555672866076504, "grad_norm": 5.728680709044823, "learning_rate": 1.4008814785015548e-06, "loss": 0.5148, "step": 8072 }, { "epoch": 0.655648501583692, "grad_norm": 5.735426011218967, "learning_rate": 1.4002908597306817e-06, "loss": 0.3364, "step": 8073 }, { "epoch": 0.6557297165597337, "grad_norm": 3.8279642749863916, "learning_rate": 1.3997003170538608e-06, "loss": 0.4943, "step": 8074 }, { "epoch": 0.6558109315357752, "grad_norm": 8.191610658404883, "learning_rate": 1.3991098505119537e-06, "loss": 0.6124, "step": 8075 }, { "epoch": 0.6558921465118168, "grad_norm": 22.562198626943964, "learning_rate": 1.3985194601458192e-06, "loss": 0.6547, "step": 8076 }, { "epoch": 0.6559733614878583, "grad_norm": 5.841079008969222, "learning_rate": 1.3979291459963087e-06, "loss": 0.2968, "step": 8077 }, { "epoch": 0.6560545764639, "grad_norm": 3.714067743777654, "learning_rate": 1.397338908104269e-06, "loss": 0.43, "step": 8078 }, { "epoch": 0.6561357914399415, "grad_norm": 4.366280468082913, "learning_rate": 1.3967487465105401e-06, "loss": 0.643, "step": 8079 }, { "epoch": 0.6562170064159831, "grad_norm": 4.043858317470031, "learning_rate": 1.3961586612559602e-06, "loss": 0.6495, "step": 8080 }, { "epoch": 0.6562982213920246, "grad_norm": 4.59900089958901, "learning_rate": 1.3955686523813588e-06, "loss": 0.3702, "step": 8081 }, { "epoch": 0.6563794363680663, "grad_norm": 6.449912994740261, "learning_rate": 1.3949787199275606e-06, "loss": 0.5347, "step": 8082 }, { "epoch": 0.6564606513441078, "grad_norm": 5.373213623100858, "learning_rate": 1.3943888639353866e-06, "loss": 0.4742, "step": 8083 }, { "epoch": 0.6565418663201494, "grad_norm": 5.296870374063085, "learning_rate": 1.3937990844456528e-06, "loss": 0.4213, "step": 8084 }, { "epoch": 0.6566230812961911, "grad_norm": 3.8879241039555734, "learning_rate": 1.393209381499167e-06, "loss": 0.3738, "step": 8085 }, { "epoch": 0.6567042962722326, "grad_norm": 4.289208191857055, "learning_rate": 1.3926197551367355e-06, "loss": 0.4136, "step": 8086 }, { "epoch": 0.6567855112482742, "grad_norm": 5.234771066049035, "learning_rate": 1.3920302053991564e-06, "loss": 0.3542, "step": 8087 }, { "epoch": 0.6568667262243157, "grad_norm": 5.571643681394787, "learning_rate": 1.3914407323272216e-06, "loss": 0.4513, "step": 8088 }, { "epoch": 0.6569479412003574, "grad_norm": 7.213207108596739, "learning_rate": 1.3908513359617217e-06, "loss": 0.5061, "step": 8089 }, { "epoch": 0.6570291561763989, "grad_norm": 4.780947638090955, "learning_rate": 1.39026201634344e-06, "loss": 0.3634, "step": 8090 }, { "epoch": 0.6571103711524405, "grad_norm": 3.0166870867114053, "learning_rate": 1.3896727735131538e-06, "loss": 0.6613, "step": 8091 }, { "epoch": 0.657191586128482, "grad_norm": 3.8519737800090974, "learning_rate": 1.3890836075116343e-06, "loss": 0.4642, "step": 8092 }, { "epoch": 0.6572728011045237, "grad_norm": 4.334939110181824, "learning_rate": 1.3884945183796505e-06, "loss": 0.5912, "step": 8093 }, { "epoch": 0.6573540160805652, "grad_norm": 6.294374675194089, "learning_rate": 1.3879055061579635e-06, "loss": 0.7303, "step": 8094 }, { "epoch": 0.6574352310566068, "grad_norm": 3.7598306069040546, "learning_rate": 1.3873165708873286e-06, "loss": 0.5085, "step": 8095 }, { "epoch": 0.6575164460326485, "grad_norm": 6.863418465636099, "learning_rate": 1.3867277126084989e-06, "loss": 0.5744, "step": 8096 }, { "epoch": 0.65759766100869, "grad_norm": 7.054233776542322, "learning_rate": 1.3861389313622197e-06, "loss": 0.5576, "step": 8097 }, { "epoch": 0.6576788759847316, "grad_norm": 4.4365820824808555, "learning_rate": 1.3855502271892313e-06, "loss": 0.423, "step": 8098 }, { "epoch": 0.6577600909607731, "grad_norm": 6.862928291636716, "learning_rate": 1.3849616001302696e-06, "loss": 0.4621, "step": 8099 }, { "epoch": 0.6578413059368148, "grad_norm": 5.667668412588925, "learning_rate": 1.3843730502260639e-06, "loss": 0.4926, "step": 8100 }, { "epoch": 0.6579225209128563, "grad_norm": 4.227345644846646, "learning_rate": 1.3837845775173375e-06, "loss": 0.6139, "step": 8101 }, { "epoch": 0.6580037358888979, "grad_norm": 4.744805542732661, "learning_rate": 1.383196182044811e-06, "loss": 0.5144, "step": 8102 }, { "epoch": 0.6580849508649395, "grad_norm": 3.999861099199686, "learning_rate": 1.3826078638491994e-06, "loss": 0.561, "step": 8103 }, { "epoch": 0.6581661658409811, "grad_norm": 4.070333113027189, "learning_rate": 1.3820196229712085e-06, "loss": 0.4381, "step": 8104 }, { "epoch": 0.6582473808170226, "grad_norm": 6.884502103701524, "learning_rate": 1.3814314594515443e-06, "loss": 0.6255, "step": 8105 }, { "epoch": 0.6583285957930642, "grad_norm": 4.918735322058878, "learning_rate": 1.3808433733309028e-06, "loss": 0.548, "step": 8106 }, { "epoch": 0.6584098107691059, "grad_norm": 16.27622530232076, "learning_rate": 1.380255364649976e-06, "loss": 0.3939, "step": 8107 }, { "epoch": 0.6584910257451474, "grad_norm": 10.25631338164915, "learning_rate": 1.3796674334494529e-06, "loss": 0.581, "step": 8108 }, { "epoch": 0.658572240721189, "grad_norm": 5.903422438123663, "learning_rate": 1.3790795797700129e-06, "loss": 0.5879, "step": 8109 }, { "epoch": 0.6586534556972305, "grad_norm": 5.425671652540845, "learning_rate": 1.3784918036523346e-06, "loss": 0.4939, "step": 8110 }, { "epoch": 0.6587346706732722, "grad_norm": 6.259094931538714, "learning_rate": 1.377904105137087e-06, "loss": 0.5689, "step": 8111 }, { "epoch": 0.6588158856493137, "grad_norm": 5.861839266316476, "learning_rate": 1.3773164842649377e-06, "loss": 0.4444, "step": 8112 }, { "epoch": 0.6588971006253553, "grad_norm": 6.629921332417931, "learning_rate": 1.376728941076546e-06, "loss": 0.3919, "step": 8113 }, { "epoch": 0.6589783156013969, "grad_norm": 6.496651649704183, "learning_rate": 1.3761414756125658e-06, "loss": 0.5354, "step": 8114 }, { "epoch": 0.6590595305774385, "grad_norm": 4.3858502738087655, "learning_rate": 1.3755540879136474e-06, "loss": 0.5213, "step": 8115 }, { "epoch": 0.65914074555348, "grad_norm": 5.705183500815472, "learning_rate": 1.3749667780204365e-06, "loss": 0.5628, "step": 8116 }, { "epoch": 0.6592219605295216, "grad_norm": 3.5828316864822236, "learning_rate": 1.3743795459735692e-06, "loss": 0.518, "step": 8117 }, { "epoch": 0.6593031755055633, "grad_norm": 4.1843405148298025, "learning_rate": 1.373792391813681e-06, "loss": 0.6037, "step": 8118 }, { "epoch": 0.6593843904816048, "grad_norm": 5.543134932829188, "learning_rate": 1.3732053155813987e-06, "loss": 0.3632, "step": 8119 }, { "epoch": 0.6594656054576464, "grad_norm": 3.9215981353205938, "learning_rate": 1.3726183173173441e-06, "loss": 0.4956, "step": 8120 }, { "epoch": 0.659546820433688, "grad_norm": 6.56518519854498, "learning_rate": 1.3720313970621369e-06, "loss": 0.479, "step": 8121 }, { "epoch": 0.6596280354097296, "grad_norm": 8.721756176365712, "learning_rate": 1.3714445548563856e-06, "loss": 0.5228, "step": 8122 }, { "epoch": 0.6597092503857711, "grad_norm": 6.414976174126078, "learning_rate": 1.3708577907406988e-06, "loss": 0.5035, "step": 8123 }, { "epoch": 0.6597904653618127, "grad_norm": 5.440110547817854, "learning_rate": 1.3702711047556777e-06, "loss": 0.4776, "step": 8124 }, { "epoch": 0.6598716803378543, "grad_norm": 5.614130528625265, "learning_rate": 1.3696844969419174e-06, "loss": 0.3457, "step": 8125 }, { "epoch": 0.6599528953138959, "grad_norm": 4.022888633337103, "learning_rate": 1.3690979673400067e-06, "loss": 0.445, "step": 8126 }, { "epoch": 0.6600341102899374, "grad_norm": 3.741552498785602, "learning_rate": 1.3685115159905325e-06, "loss": 0.3523, "step": 8127 }, { "epoch": 0.660115325265979, "grad_norm": 5.950815202545946, "learning_rate": 1.3679251429340717e-06, "loss": 0.738, "step": 8128 }, { "epoch": 0.6601965402420207, "grad_norm": 9.51105805660359, "learning_rate": 1.367338848211201e-06, "loss": 0.4257, "step": 8129 }, { "epoch": 0.6602777552180622, "grad_norm": 2.9131542456623314, "learning_rate": 1.3667526318624862e-06, "loss": 0.4203, "step": 8130 }, { "epoch": 0.6603589701941038, "grad_norm": 6.754300006832807, "learning_rate": 1.366166493928493e-06, "loss": 0.6132, "step": 8131 }, { "epoch": 0.6604401851701454, "grad_norm": 11.724662577057464, "learning_rate": 1.3655804344497775e-06, "loss": 0.4595, "step": 8132 }, { "epoch": 0.660521400146187, "grad_norm": 5.387659360154549, "learning_rate": 1.364994453466891e-06, "loss": 0.4081, "step": 8133 }, { "epoch": 0.6606026151222285, "grad_norm": 5.107977084416256, "learning_rate": 1.3644085510203813e-06, "loss": 0.4898, "step": 8134 }, { "epoch": 0.6606838300982701, "grad_norm": 3.8833833190727183, "learning_rate": 1.363822727150791e-06, "loss": 0.5417, "step": 8135 }, { "epoch": 0.6607650450743117, "grad_norm": 7.8141538363601795, "learning_rate": 1.363236981898654e-06, "loss": 0.4968, "step": 8136 }, { "epoch": 0.6608462600503533, "grad_norm": 3.590897276111095, "learning_rate": 1.3626513153045024e-06, "loss": 0.5024, "step": 8137 }, { "epoch": 0.6609274750263948, "grad_norm": 3.612475255145642, "learning_rate": 1.3620657274088606e-06, "loss": 0.4954, "step": 8138 }, { "epoch": 0.6610086900024364, "grad_norm": 14.567400918143866, "learning_rate": 1.3614802182522469e-06, "loss": 0.668, "step": 8139 }, { "epoch": 0.6610899049784781, "grad_norm": 11.017893005217958, "learning_rate": 1.3608947878751777e-06, "loss": 0.4346, "step": 8140 }, { "epoch": 0.6611711199545196, "grad_norm": 3.7950075970163617, "learning_rate": 1.3603094363181596e-06, "loss": 0.5231, "step": 8141 }, { "epoch": 0.6612523349305612, "grad_norm": 7.66686320728164, "learning_rate": 1.3597241636216965e-06, "loss": 0.6696, "step": 8142 }, { "epoch": 0.6613335499066028, "grad_norm": 5.608044387753557, "learning_rate": 1.3591389698262875e-06, "loss": 0.5376, "step": 8143 }, { "epoch": 0.6614147648826444, "grad_norm": 4.03444037697512, "learning_rate": 1.3585538549724242e-06, "loss": 0.526, "step": 8144 }, { "epoch": 0.6614959798586859, "grad_norm": 5.082198135850972, "learning_rate": 1.3579688191005926e-06, "loss": 0.5056, "step": 8145 }, { "epoch": 0.6615771948347275, "grad_norm": 3.3444467710171346, "learning_rate": 1.3573838622512743e-06, "loss": 0.6148, "step": 8146 }, { "epoch": 0.6616584098107691, "grad_norm": 4.212117525236555, "learning_rate": 1.3567989844649448e-06, "loss": 0.5947, "step": 8147 }, { "epoch": 0.6617396247868107, "grad_norm": 5.234823635180376, "learning_rate": 1.3562141857820765e-06, "loss": 0.5863, "step": 8148 }, { "epoch": 0.6618208397628522, "grad_norm": 9.716406215450048, "learning_rate": 1.3556294662431325e-06, "loss": 0.4493, "step": 8149 }, { "epoch": 0.6619020547388939, "grad_norm": 5.722283978029896, "learning_rate": 1.3550448258885734e-06, "loss": 0.4364, "step": 8150 }, { "epoch": 0.6619832697149355, "grad_norm": 4.763814608798664, "learning_rate": 1.3544602647588528e-06, "loss": 0.742, "step": 8151 }, { "epoch": 0.662064484690977, "grad_norm": 5.80232988441981, "learning_rate": 1.3538757828944188e-06, "loss": 0.4423, "step": 8152 }, { "epoch": 0.6621456996670186, "grad_norm": 7.5957193433728145, "learning_rate": 1.353291380335715e-06, "loss": 0.5588, "step": 8153 }, { "epoch": 0.6622269146430602, "grad_norm": 4.336543165787254, "learning_rate": 1.3527070571231786e-06, "loss": 0.5969, "step": 8154 }, { "epoch": 0.6623081296191018, "grad_norm": 4.034182275364077, "learning_rate": 1.3521228132972414e-06, "loss": 0.4803, "step": 8155 }, { "epoch": 0.6623893445951433, "grad_norm": 6.6167843827609865, "learning_rate": 1.3515386488983317e-06, "loss": 0.4521, "step": 8156 }, { "epoch": 0.662470559571185, "grad_norm": 7.161780111206926, "learning_rate": 1.3509545639668691e-06, "loss": 0.3106, "step": 8157 }, { "epoch": 0.6625517745472265, "grad_norm": 5.4096275250827945, "learning_rate": 1.3503705585432687e-06, "loss": 0.4119, "step": 8158 }, { "epoch": 0.6626329895232681, "grad_norm": 7.137614931386881, "learning_rate": 1.349786632667942e-06, "loss": 0.6203, "step": 8159 }, { "epoch": 0.6627142044993096, "grad_norm": 4.038451931034832, "learning_rate": 1.3492027863812924e-06, "loss": 0.749, "step": 8160 }, { "epoch": 0.6627954194753513, "grad_norm": 4.101256157779943, "learning_rate": 1.3486190197237189e-06, "loss": 0.5766, "step": 8161 }, { "epoch": 0.6628766344513929, "grad_norm": 3.827717281308639, "learning_rate": 1.348035332735617e-06, "loss": 0.3621, "step": 8162 }, { "epoch": 0.6629578494274344, "grad_norm": 3.2797754851269856, "learning_rate": 1.3474517254573731e-06, "loss": 0.6257, "step": 8163 }, { "epoch": 0.663039064403476, "grad_norm": 7.6698914171017245, "learning_rate": 1.3468681979293702e-06, "loss": 0.413, "step": 8164 }, { "epoch": 0.6631202793795176, "grad_norm": 6.602624984901768, "learning_rate": 1.3462847501919843e-06, "loss": 0.3934, "step": 8165 }, { "epoch": 0.6632014943555592, "grad_norm": 7.1560351882255215, "learning_rate": 1.3457013822855886e-06, "loss": 0.7646, "step": 8166 }, { "epoch": 0.6632827093316007, "grad_norm": 6.274238416078604, "learning_rate": 1.345118094250547e-06, "loss": 0.408, "step": 8167 }, { "epoch": 0.6633639243076423, "grad_norm": 4.400441017779448, "learning_rate": 1.3445348861272217e-06, "loss": 0.4495, "step": 8168 }, { "epoch": 0.6634451392836839, "grad_norm": 4.941239230227576, "learning_rate": 1.3439517579559675e-06, "loss": 0.3428, "step": 8169 }, { "epoch": 0.6635263542597255, "grad_norm": 6.964343826754793, "learning_rate": 1.3433687097771337e-06, "loss": 0.4209, "step": 8170 }, { "epoch": 0.663607569235767, "grad_norm": 5.512709601692857, "learning_rate": 1.3427857416310626e-06, "loss": 0.5785, "step": 8171 }, { "epoch": 0.6636887842118087, "grad_norm": 4.294567753627923, "learning_rate": 1.3422028535580947e-06, "loss": 0.4546, "step": 8172 }, { "epoch": 0.6637699991878503, "grad_norm": 4.0701024523531135, "learning_rate": 1.3416200455985607e-06, "loss": 0.8187, "step": 8173 }, { "epoch": 0.6638512141638918, "grad_norm": 7.655445454850961, "learning_rate": 1.3410373177927893e-06, "loss": 0.4059, "step": 8174 }, { "epoch": 0.6639324291399334, "grad_norm": 2.7798324043491838, "learning_rate": 1.3404546701811022e-06, "loss": 0.5913, "step": 8175 }, { "epoch": 0.664013644115975, "grad_norm": 3.646058875181944, "learning_rate": 1.3398721028038155e-06, "loss": 0.5026, "step": 8176 }, { "epoch": 0.6640948590920166, "grad_norm": 4.496311517930436, "learning_rate": 1.3392896157012386e-06, "loss": 0.6419, "step": 8177 }, { "epoch": 0.6641760740680581, "grad_norm": 17.84153072314891, "learning_rate": 1.3387072089136776e-06, "loss": 0.5958, "step": 8178 }, { "epoch": 0.6642572890440998, "grad_norm": 3.8720341467833834, "learning_rate": 1.3381248824814326e-06, "loss": 0.5721, "step": 8179 }, { "epoch": 0.6643385040201413, "grad_norm": 4.7847332539123935, "learning_rate": 1.337542636444795e-06, "loss": 0.44, "step": 8180 }, { "epoch": 0.6644197189961829, "grad_norm": 6.021015620083501, "learning_rate": 1.3369604708440548e-06, "loss": 0.3977, "step": 8181 }, { "epoch": 0.6645009339722244, "grad_norm": 11.600335989741517, "learning_rate": 1.3363783857194957e-06, "loss": 0.469, "step": 8182 }, { "epoch": 0.6645821489482661, "grad_norm": 3.859966985311204, "learning_rate": 1.3357963811113938e-06, "loss": 0.4263, "step": 8183 }, { "epoch": 0.6646633639243077, "grad_norm": 18.782716428504518, "learning_rate": 1.3352144570600203e-06, "loss": 0.6374, "step": 8184 }, { "epoch": 0.6647445789003492, "grad_norm": 3.387091226384912, "learning_rate": 1.3346326136056425e-06, "loss": 0.4308, "step": 8185 }, { "epoch": 0.6648257938763908, "grad_norm": 4.252596910764206, "learning_rate": 1.3340508507885194e-06, "loss": 0.5107, "step": 8186 }, { "epoch": 0.6649070088524324, "grad_norm": 7.6038955526056995, "learning_rate": 1.3334691686489064e-06, "loss": 0.4946, "step": 8187 }, { "epoch": 0.664988223828474, "grad_norm": 4.976169303513984, "learning_rate": 1.3328875672270547e-06, "loss": 0.4381, "step": 8188 }, { "epoch": 0.6650694388045155, "grad_norm": 4.667554378601018, "learning_rate": 1.332306046563206e-06, "loss": 0.6222, "step": 8189 }, { "epoch": 0.6651506537805572, "grad_norm": 4.379049488125857, "learning_rate": 1.3317246066975981e-06, "loss": 0.6358, "step": 8190 }, { "epoch": 0.6652318687565987, "grad_norm": 3.794618143261633, "learning_rate": 1.3311432476704655e-06, "loss": 0.4243, "step": 8191 }, { "epoch": 0.6653130837326403, "grad_norm": 5.072184639847683, "learning_rate": 1.3305619695220332e-06, "loss": 0.5623, "step": 8192 }, { "epoch": 0.6653942987086818, "grad_norm": 7.59051423468418, "learning_rate": 1.3299807722925231e-06, "loss": 0.4993, "step": 8193 }, { "epoch": 0.6654755136847235, "grad_norm": 8.819021574798183, "learning_rate": 1.3293996560221526e-06, "loss": 0.4178, "step": 8194 }, { "epoch": 0.6655567286607651, "grad_norm": 6.287522877063349, "learning_rate": 1.3288186207511303e-06, "loss": 0.3488, "step": 8195 }, { "epoch": 0.6656379436368066, "grad_norm": 5.563934142204648, "learning_rate": 1.3282376665196603e-06, "loss": 0.4812, "step": 8196 }, { "epoch": 0.6657191586128482, "grad_norm": 3.033804351653404, "learning_rate": 1.327656793367943e-06, "loss": 0.4407, "step": 8197 }, { "epoch": 0.6658003735888898, "grad_norm": 3.998217455418668, "learning_rate": 1.3270760013361713e-06, "loss": 0.4281, "step": 8198 }, { "epoch": 0.6658815885649314, "grad_norm": 4.64706121281191, "learning_rate": 1.3264952904645317e-06, "loss": 0.6847, "step": 8199 }, { "epoch": 0.6659628035409729, "grad_norm": 6.436231964294674, "learning_rate": 1.325914660793207e-06, "loss": 0.4931, "step": 8200 }, { "epoch": 0.6660440185170146, "grad_norm": 3.707584274007003, "learning_rate": 1.3253341123623756e-06, "loss": 0.5129, "step": 8201 }, { "epoch": 0.6661252334930561, "grad_norm": 6.262622978288038, "learning_rate": 1.3247536452122064e-06, "loss": 0.5491, "step": 8202 }, { "epoch": 0.6662064484690977, "grad_norm": 4.982037820278029, "learning_rate": 1.3241732593828644e-06, "loss": 0.4305, "step": 8203 }, { "epoch": 0.6662876634451392, "grad_norm": 6.381634581129901, "learning_rate": 1.3235929549145105e-06, "loss": 0.4896, "step": 8204 }, { "epoch": 0.6663688784211809, "grad_norm": 7.969785119936784, "learning_rate": 1.3230127318472972e-06, "loss": 0.5373, "step": 8205 }, { "epoch": 0.6664500933972225, "grad_norm": 6.0137267480338945, "learning_rate": 1.3224325902213736e-06, "loss": 0.476, "step": 8206 }, { "epoch": 0.666531308373264, "grad_norm": 6.68986461696678, "learning_rate": 1.3218525300768837e-06, "loss": 0.4782, "step": 8207 }, { "epoch": 0.6666125233493057, "grad_norm": 6.316034448019237, "learning_rate": 1.3212725514539635e-06, "loss": 0.4462, "step": 8208 }, { "epoch": 0.6666937383253472, "grad_norm": 5.228047836489257, "learning_rate": 1.3206926543927435e-06, "loss": 0.3812, "step": 8209 }, { "epoch": 0.6667749533013888, "grad_norm": 6.369850283074102, "learning_rate": 1.320112838933351e-06, "loss": 0.469, "step": 8210 }, { "epoch": 0.6668561682774303, "grad_norm": 4.859894104230745, "learning_rate": 1.3195331051159058e-06, "loss": 0.4444, "step": 8211 }, { "epoch": 0.666937383253472, "grad_norm": 4.0624444842962735, "learning_rate": 1.3189534529805212e-06, "loss": 0.4554, "step": 8212 }, { "epoch": 0.6670185982295135, "grad_norm": 3.4817004946398287, "learning_rate": 1.318373882567307e-06, "loss": 0.5201, "step": 8213 }, { "epoch": 0.6670998132055551, "grad_norm": 4.704982903669908, "learning_rate": 1.3177943939163677e-06, "loss": 0.4444, "step": 8214 }, { "epoch": 0.6671810281815966, "grad_norm": 5.794219397597491, "learning_rate": 1.3172149870677985e-06, "loss": 0.4768, "step": 8215 }, { "epoch": 0.6672622431576383, "grad_norm": 6.208905832820019, "learning_rate": 1.3166356620616932e-06, "loss": 0.5479, "step": 8216 }, { "epoch": 0.6673434581336799, "grad_norm": 8.198766362247154, "learning_rate": 1.3160564189381376e-06, "loss": 0.4532, "step": 8217 }, { "epoch": 0.6674246731097214, "grad_norm": 4.331763449279288, "learning_rate": 1.3154772577372104e-06, "loss": 0.4955, "step": 8218 }, { "epoch": 0.667505888085763, "grad_norm": 5.10837125089982, "learning_rate": 1.3148981784989884e-06, "loss": 0.4523, "step": 8219 }, { "epoch": 0.6675871030618046, "grad_norm": 3.5163841336361807, "learning_rate": 1.3143191812635408e-06, "loss": 0.6125, "step": 8220 }, { "epoch": 0.6676683180378462, "grad_norm": 11.844210364989475, "learning_rate": 1.3137402660709314e-06, "loss": 0.4425, "step": 8221 }, { "epoch": 0.6677495330138877, "grad_norm": 7.996601522252169, "learning_rate": 1.3131614329612158e-06, "loss": 0.6123, "step": 8222 }, { "epoch": 0.6678307479899294, "grad_norm": 6.114487257718311, "learning_rate": 1.3125826819744493e-06, "loss": 0.5059, "step": 8223 }, { "epoch": 0.6679119629659709, "grad_norm": 4.43142805188028, "learning_rate": 1.3120040131506767e-06, "loss": 0.4303, "step": 8224 }, { "epoch": 0.6679931779420125, "grad_norm": 5.687962653675712, "learning_rate": 1.3114254265299379e-06, "loss": 0.4586, "step": 8225 }, { "epoch": 0.668074392918054, "grad_norm": 4.1918238610968475, "learning_rate": 1.310846922152269e-06, "loss": 0.5445, "step": 8226 }, { "epoch": 0.6681556078940957, "grad_norm": 5.179655298264121, "learning_rate": 1.310268500057701e-06, "loss": 0.5172, "step": 8227 }, { "epoch": 0.6682368228701373, "grad_norm": 7.779574674479231, "learning_rate": 1.309690160286255e-06, "loss": 0.5864, "step": 8228 }, { "epoch": 0.6683180378461788, "grad_norm": 6.122052156829049, "learning_rate": 1.3091119028779514e-06, "loss": 0.548, "step": 8229 }, { "epoch": 0.6683992528222205, "grad_norm": 4.789126863334509, "learning_rate": 1.308533727872801e-06, "loss": 0.4313, "step": 8230 }, { "epoch": 0.668480467798262, "grad_norm": 5.766521551851652, "learning_rate": 1.3079556353108106e-06, "loss": 0.532, "step": 8231 }, { "epoch": 0.6685616827743036, "grad_norm": 6.05863385781591, "learning_rate": 1.307377625231981e-06, "loss": 0.443, "step": 8232 }, { "epoch": 0.6686428977503451, "grad_norm": 4.181674552131915, "learning_rate": 1.3067996976763086e-06, "loss": 0.5801, "step": 8233 }, { "epoch": 0.6687241127263868, "grad_norm": 3.206551391730835, "learning_rate": 1.3062218526837828e-06, "loss": 0.5704, "step": 8234 }, { "epoch": 0.6688053277024283, "grad_norm": 10.62932913375947, "learning_rate": 1.3056440902943856e-06, "loss": 0.4123, "step": 8235 }, { "epoch": 0.6688865426784699, "grad_norm": 4.544980879348178, "learning_rate": 1.305066410548097e-06, "loss": 0.5647, "step": 8236 }, { "epoch": 0.6689677576545114, "grad_norm": 5.011004929662773, "learning_rate": 1.304488813484889e-06, "loss": 0.5001, "step": 8237 }, { "epoch": 0.6690489726305531, "grad_norm": 5.610021787019349, "learning_rate": 1.303911299144727e-06, "loss": 0.4763, "step": 8238 }, { "epoch": 0.6691301876065947, "grad_norm": 87.36388987621034, "learning_rate": 1.3033338675675726e-06, "loss": 0.4336, "step": 8239 }, { "epoch": 0.6692114025826362, "grad_norm": 3.9673278123195566, "learning_rate": 1.3027565187933828e-06, "loss": 0.4406, "step": 8240 }, { "epoch": 0.6692926175586779, "grad_norm": 4.0150982480618165, "learning_rate": 1.3021792528621041e-06, "loss": 0.4505, "step": 8241 }, { "epoch": 0.6693738325347194, "grad_norm": 3.8387858911160473, "learning_rate": 1.3016020698136827e-06, "loss": 0.4101, "step": 8242 }, { "epoch": 0.669455047510761, "grad_norm": 3.2719359158581245, "learning_rate": 1.3010249696880558e-06, "loss": 0.5057, "step": 8243 }, { "epoch": 0.6695362624868025, "grad_norm": 8.689632620129073, "learning_rate": 1.3004479525251545e-06, "loss": 0.3984, "step": 8244 }, { "epoch": 0.6696174774628442, "grad_norm": 4.334649919352921, "learning_rate": 1.2998710183649066e-06, "loss": 0.48, "step": 8245 }, { "epoch": 0.6696986924388857, "grad_norm": 3.604389609859541, "learning_rate": 1.2992941672472332e-06, "loss": 0.4935, "step": 8246 }, { "epoch": 0.6697799074149273, "grad_norm": 4.775702711045651, "learning_rate": 1.2987173992120478e-06, "loss": 0.4603, "step": 8247 }, { "epoch": 0.6698611223909688, "grad_norm": 5.179644996464377, "learning_rate": 1.2981407142992618e-06, "loss": 0.4972, "step": 8248 }, { "epoch": 0.6699423373670105, "grad_norm": 4.832968999586265, "learning_rate": 1.2975641125487777e-06, "loss": 0.5587, "step": 8249 }, { "epoch": 0.6700235523430521, "grad_norm": 4.3612144381710065, "learning_rate": 1.2969875940004923e-06, "loss": 0.335, "step": 8250 }, { "epoch": 0.6701047673190936, "grad_norm": 6.055517461311078, "learning_rate": 1.2964111586942996e-06, "loss": 0.6306, "step": 8251 }, { "epoch": 0.6701859822951353, "grad_norm": 3.996269903879433, "learning_rate": 1.2958348066700833e-06, "loss": 0.5962, "step": 8252 }, { "epoch": 0.6702671972711768, "grad_norm": 6.099178781598084, "learning_rate": 1.2952585379677268e-06, "loss": 0.453, "step": 8253 }, { "epoch": 0.6703484122472184, "grad_norm": 6.116289571948397, "learning_rate": 1.2946823526271023e-06, "loss": 0.4299, "step": 8254 }, { "epoch": 0.6704296272232599, "grad_norm": 5.092428454455709, "learning_rate": 1.2941062506880811e-06, "loss": 0.5551, "step": 8255 }, { "epoch": 0.6705108421993016, "grad_norm": 5.682161639107994, "learning_rate": 1.2935302321905252e-06, "loss": 0.6384, "step": 8256 }, { "epoch": 0.6705920571753431, "grad_norm": 4.3807459106680575, "learning_rate": 1.292954297174291e-06, "loss": 0.6603, "step": 8257 }, { "epoch": 0.6706732721513847, "grad_norm": 3.7416878890359517, "learning_rate": 1.2923784456792314e-06, "loss": 0.4831, "step": 8258 }, { "epoch": 0.6707544871274262, "grad_norm": 3.7742295359579874, "learning_rate": 1.291802677745193e-06, "loss": 0.4317, "step": 8259 }, { "epoch": 0.6708357021034679, "grad_norm": 4.136648173116798, "learning_rate": 1.2912269934120142e-06, "loss": 0.5397, "step": 8260 }, { "epoch": 0.6709169170795095, "grad_norm": 10.177523748838187, "learning_rate": 1.2906513927195308e-06, "loss": 0.4942, "step": 8261 }, { "epoch": 0.670998132055551, "grad_norm": 5.041065114037551, "learning_rate": 1.290075875707571e-06, "loss": 0.3985, "step": 8262 }, { "epoch": 0.6710793470315927, "grad_norm": 6.204016855298799, "learning_rate": 1.2895004424159557e-06, "loss": 0.4884, "step": 8263 }, { "epoch": 0.6711605620076342, "grad_norm": 6.451950713200213, "learning_rate": 1.2889250928845038e-06, "loss": 0.3397, "step": 8264 }, { "epoch": 0.6712417769836758, "grad_norm": 5.789167573594882, "learning_rate": 1.2883498271530265e-06, "loss": 0.5267, "step": 8265 }, { "epoch": 0.6713229919597173, "grad_norm": 3.020900821630963, "learning_rate": 1.2877746452613277e-06, "loss": 0.5771, "step": 8266 }, { "epoch": 0.671404206935759, "grad_norm": 7.136575144733493, "learning_rate": 1.2871995472492088e-06, "loss": 0.4145, "step": 8267 }, { "epoch": 0.6714854219118005, "grad_norm": 6.86909884452662, "learning_rate": 1.2866245331564627e-06, "loss": 0.4899, "step": 8268 }, { "epoch": 0.6715666368878421, "grad_norm": 4.392832664582703, "learning_rate": 1.2860496030228763e-06, "loss": 0.7517, "step": 8269 }, { "epoch": 0.6716478518638836, "grad_norm": 4.036722831992974, "learning_rate": 1.2854747568882336e-06, "loss": 0.4853, "step": 8270 }, { "epoch": 0.6717290668399253, "grad_norm": 4.039548813778698, "learning_rate": 1.2848999947923089e-06, "loss": 0.3944, "step": 8271 }, { "epoch": 0.6718102818159669, "grad_norm": 4.0410979346726625, "learning_rate": 1.2843253167748745e-06, "loss": 0.4949, "step": 8272 }, { "epoch": 0.6718914967920084, "grad_norm": 4.170268917322463, "learning_rate": 1.2837507228756934e-06, "loss": 0.7473, "step": 8273 }, { "epoch": 0.6719727117680501, "grad_norm": 5.339351144770748, "learning_rate": 1.2831762131345265e-06, "loss": 0.3525, "step": 8274 }, { "epoch": 0.6720539267440916, "grad_norm": 4.558326943954262, "learning_rate": 1.2826017875911257e-06, "loss": 0.5092, "step": 8275 }, { "epoch": 0.6721351417201332, "grad_norm": 6.121469512209196, "learning_rate": 1.2820274462852373e-06, "loss": 0.3959, "step": 8276 }, { "epoch": 0.6722163566961747, "grad_norm": 6.247553742372245, "learning_rate": 1.2814531892566034e-06, "loss": 0.4574, "step": 8277 }, { "epoch": 0.6722975716722164, "grad_norm": 3.743110767471212, "learning_rate": 1.2808790165449609e-06, "loss": 0.4817, "step": 8278 }, { "epoch": 0.6723787866482579, "grad_norm": 3.847254853806929, "learning_rate": 1.280304928190037e-06, "loss": 0.4439, "step": 8279 }, { "epoch": 0.6724600016242995, "grad_norm": 5.111654340531649, "learning_rate": 1.2797309242315584e-06, "loss": 0.5166, "step": 8280 }, { "epoch": 0.672541216600341, "grad_norm": 5.402468457889438, "learning_rate": 1.2791570047092413e-06, "loss": 0.4379, "step": 8281 }, { "epoch": 0.6726224315763827, "grad_norm": 4.64148894732247, "learning_rate": 1.2785831696627975e-06, "loss": 0.4985, "step": 8282 }, { "epoch": 0.6727036465524243, "grad_norm": 4.977759194515003, "learning_rate": 1.2780094191319348e-06, "loss": 0.3378, "step": 8283 }, { "epoch": 0.6727848615284658, "grad_norm": 5.750546485168828, "learning_rate": 1.2774357531563522e-06, "loss": 0.5315, "step": 8284 }, { "epoch": 0.6728660765045075, "grad_norm": 5.462788945858097, "learning_rate": 1.276862171775745e-06, "loss": 0.611, "step": 8285 }, { "epoch": 0.672947291480549, "grad_norm": 9.040739646165454, "learning_rate": 1.2762886750298033e-06, "loss": 0.4435, "step": 8286 }, { "epoch": 0.6730285064565906, "grad_norm": 5.419042990213366, "learning_rate": 1.275715262958209e-06, "loss": 0.585, "step": 8287 }, { "epoch": 0.6731097214326321, "grad_norm": 8.622423592062997, "learning_rate": 1.275141935600639e-06, "loss": 0.4452, "step": 8288 }, { "epoch": 0.6731909364086738, "grad_norm": 3.39665996833734, "learning_rate": 1.2745686929967632e-06, "loss": 0.5745, "step": 8289 }, { "epoch": 0.6732721513847153, "grad_norm": 9.640446040935274, "learning_rate": 1.2739955351862488e-06, "loss": 0.4325, "step": 8290 }, { "epoch": 0.6733533663607569, "grad_norm": 3.375245810027421, "learning_rate": 1.2734224622087556e-06, "loss": 0.742, "step": 8291 }, { "epoch": 0.6734345813367985, "grad_norm": 6.070864539301688, "learning_rate": 1.2728494741039354e-06, "loss": 0.4282, "step": 8292 }, { "epoch": 0.6735157963128401, "grad_norm": 3.1473742664954956, "learning_rate": 1.2722765709114382e-06, "loss": 0.5101, "step": 8293 }, { "epoch": 0.6735970112888817, "grad_norm": 5.797541760307167, "learning_rate": 1.2717037526709048e-06, "loss": 0.4927, "step": 8294 }, { "epoch": 0.6736782262649232, "grad_norm": 3.9968403294119192, "learning_rate": 1.2711310194219695e-06, "loss": 0.5272, "step": 8295 }, { "epoch": 0.6737594412409649, "grad_norm": 8.686594608534724, "learning_rate": 1.2705583712042654e-06, "loss": 0.3865, "step": 8296 }, { "epoch": 0.6738406562170064, "grad_norm": 5.299840930228158, "learning_rate": 1.2699858080574141e-06, "loss": 0.6049, "step": 8297 }, { "epoch": 0.673921871193048, "grad_norm": 5.383383957246399, "learning_rate": 1.2694133300210354e-06, "loss": 0.5769, "step": 8298 }, { "epoch": 0.6740030861690895, "grad_norm": 5.48276722761316, "learning_rate": 1.2688409371347422e-06, "loss": 0.5609, "step": 8299 }, { "epoch": 0.6740843011451312, "grad_norm": 4.890248642899915, "learning_rate": 1.2682686294381403e-06, "loss": 0.5895, "step": 8300 }, { "epoch": 0.6741655161211727, "grad_norm": 5.286027918182194, "learning_rate": 1.2676964069708294e-06, "loss": 0.406, "step": 8301 }, { "epoch": 0.6742467310972143, "grad_norm": 4.144062037889405, "learning_rate": 1.2671242697724061e-06, "loss": 0.6805, "step": 8302 }, { "epoch": 0.6743279460732559, "grad_norm": 3.10497422112379, "learning_rate": 1.266552217882458e-06, "loss": 0.4172, "step": 8303 }, { "epoch": 0.6744091610492975, "grad_norm": 4.7159885122704495, "learning_rate": 1.265980251340568e-06, "loss": 0.3766, "step": 8304 }, { "epoch": 0.6744903760253391, "grad_norm": 6.029787369464984, "learning_rate": 1.265408370186315e-06, "loss": 0.5068, "step": 8305 }, { "epoch": 0.6745715910013806, "grad_norm": 9.477273387771245, "learning_rate": 1.2648365744592683e-06, "loss": 0.5335, "step": 8306 }, { "epoch": 0.6746528059774223, "grad_norm": 5.274931921378115, "learning_rate": 1.264264864198994e-06, "loss": 0.3893, "step": 8307 }, { "epoch": 0.6747340209534638, "grad_norm": 6.694925532405975, "learning_rate": 1.2636932394450502e-06, "loss": 0.3245, "step": 8308 }, { "epoch": 0.6748152359295054, "grad_norm": 3.4329112146537994, "learning_rate": 1.2631217002369917e-06, "loss": 0.5724, "step": 8309 }, { "epoch": 0.674896450905547, "grad_norm": 5.211887506168933, "learning_rate": 1.2625502466143646e-06, "loss": 0.6977, "step": 8310 }, { "epoch": 0.6749776658815886, "grad_norm": 3.3614053784547258, "learning_rate": 1.2619788786167113e-06, "loss": 0.4674, "step": 8311 }, { "epoch": 0.6750588808576301, "grad_norm": 3.329679930838226, "learning_rate": 1.2614075962835688e-06, "loss": 0.4857, "step": 8312 }, { "epoch": 0.6751400958336717, "grad_norm": 5.737443755358583, "learning_rate": 1.2608363996544654e-06, "loss": 0.5106, "step": 8313 }, { "epoch": 0.6752213108097133, "grad_norm": 6.125567742013193, "learning_rate": 1.2602652887689237e-06, "loss": 0.4443, "step": 8314 }, { "epoch": 0.6753025257857549, "grad_norm": 9.237383534994807, "learning_rate": 1.2596942636664638e-06, "loss": 0.4483, "step": 8315 }, { "epoch": 0.6753837407617965, "grad_norm": 5.106127682199849, "learning_rate": 1.2591233243865958e-06, "loss": 0.4114, "step": 8316 }, { "epoch": 0.675464955737838, "grad_norm": 6.956467222846337, "learning_rate": 1.2585524709688268e-06, "loss": 0.5567, "step": 8317 }, { "epoch": 0.6755461707138797, "grad_norm": 6.165264566742239, "learning_rate": 1.257981703452657e-06, "loss": 0.4624, "step": 8318 }, { "epoch": 0.6756273856899212, "grad_norm": 3.908854775644603, "learning_rate": 1.2574110218775804e-06, "loss": 0.479, "step": 8319 }, { "epoch": 0.6757086006659628, "grad_norm": 5.299167034989398, "learning_rate": 1.2568404262830836e-06, "loss": 0.4723, "step": 8320 }, { "epoch": 0.6757898156420044, "grad_norm": 5.113207491776007, "learning_rate": 1.256269916708651e-06, "loss": 0.4358, "step": 8321 }, { "epoch": 0.675871030618046, "grad_norm": 6.490258586098834, "learning_rate": 1.2556994931937565e-06, "loss": 0.4825, "step": 8322 }, { "epoch": 0.6759522455940875, "grad_norm": 5.544338315054307, "learning_rate": 1.2551291557778721e-06, "loss": 0.4548, "step": 8323 }, { "epoch": 0.6760334605701291, "grad_norm": 4.16562191240731, "learning_rate": 1.2545589045004627e-06, "loss": 0.5272, "step": 8324 }, { "epoch": 0.6761146755461707, "grad_norm": 8.317648547442525, "learning_rate": 1.2539887394009855e-06, "loss": 0.5807, "step": 8325 }, { "epoch": 0.6761958905222123, "grad_norm": 4.863432965134022, "learning_rate": 1.2534186605188933e-06, "loss": 0.4716, "step": 8326 }, { "epoch": 0.6762771054982539, "grad_norm": 4.473812223631319, "learning_rate": 1.2528486678936313e-06, "loss": 0.3962, "step": 8327 }, { "epoch": 0.6763583204742954, "grad_norm": 7.282308721393281, "learning_rate": 1.2522787615646421e-06, "loss": 0.4288, "step": 8328 }, { "epoch": 0.6764395354503371, "grad_norm": 6.136375474933964, "learning_rate": 1.251708941571358e-06, "loss": 0.4208, "step": 8329 }, { "epoch": 0.6765207504263786, "grad_norm": 8.250435251721406, "learning_rate": 1.2511392079532087e-06, "loss": 0.3731, "step": 8330 }, { "epoch": 0.6766019654024202, "grad_norm": 8.8618136486422, "learning_rate": 1.2505695607496176e-06, "loss": 0.4431, "step": 8331 }, { "epoch": 0.6766831803784618, "grad_norm": 6.809802423773869, "learning_rate": 1.2500000000000007e-06, "loss": 0.437, "step": 8332 }, { "epoch": 0.6767643953545034, "grad_norm": 4.412176686499365, "learning_rate": 1.2494305257437669e-06, "loss": 0.6663, "step": 8333 }, { "epoch": 0.6768456103305449, "grad_norm": 8.259279196472452, "learning_rate": 1.2488611380203234e-06, "loss": 0.5233, "step": 8334 }, { "epoch": 0.6769268253065865, "grad_norm": 5.681125220205126, "learning_rate": 1.2482918368690666e-06, "loss": 0.4985, "step": 8335 }, { "epoch": 0.6770080402826281, "grad_norm": 8.25956808119689, "learning_rate": 1.24772262232939e-06, "loss": 0.5764, "step": 8336 }, { "epoch": 0.6770892552586697, "grad_norm": 5.608911686082642, "learning_rate": 1.2471534944406813e-06, "loss": 0.5972, "step": 8337 }, { "epoch": 0.6771704702347113, "grad_norm": 3.3213708352263054, "learning_rate": 1.2465844532423201e-06, "loss": 0.4957, "step": 8338 }, { "epoch": 0.6772516852107529, "grad_norm": 7.167255021326034, "learning_rate": 1.2460154987736806e-06, "loss": 0.5549, "step": 8339 }, { "epoch": 0.6773329001867945, "grad_norm": 4.442760200949274, "learning_rate": 1.2454466310741326e-06, "loss": 0.427, "step": 8340 }, { "epoch": 0.677414115162836, "grad_norm": 5.768478872426691, "learning_rate": 1.244877850183038e-06, "loss": 0.4285, "step": 8341 }, { "epoch": 0.6774953301388776, "grad_norm": 4.225339833775357, "learning_rate": 1.2443091561397527e-06, "loss": 0.5469, "step": 8342 }, { "epoch": 0.6775765451149192, "grad_norm": 6.322105186178602, "learning_rate": 1.2437405489836282e-06, "loss": 0.4678, "step": 8343 }, { "epoch": 0.6776577600909608, "grad_norm": 12.80035724783234, "learning_rate": 1.2431720287540097e-06, "loss": 0.4633, "step": 8344 }, { "epoch": 0.6777389750670023, "grad_norm": 6.865099951483512, "learning_rate": 1.2426035954902356e-06, "loss": 0.5027, "step": 8345 }, { "epoch": 0.677820190043044, "grad_norm": 4.2665462756693975, "learning_rate": 1.2420352492316368e-06, "loss": 0.4114, "step": 8346 }, { "epoch": 0.6779014050190855, "grad_norm": 4.207313035828698, "learning_rate": 1.2414669900175423e-06, "loss": 0.4601, "step": 8347 }, { "epoch": 0.6779826199951271, "grad_norm": 5.802065214743892, "learning_rate": 1.2408988178872699e-06, "loss": 0.3487, "step": 8348 }, { "epoch": 0.6780638349711687, "grad_norm": 5.071217203345165, "learning_rate": 1.240330732880136e-06, "loss": 0.4018, "step": 8349 }, { "epoch": 0.6781450499472103, "grad_norm": 4.658634987558885, "learning_rate": 1.2397627350354494e-06, "loss": 0.5006, "step": 8350 }, { "epoch": 0.6782262649232519, "grad_norm": 5.989526021591514, "learning_rate": 1.2391948243925119e-06, "loss": 0.3748, "step": 8351 }, { "epoch": 0.6783074798992934, "grad_norm": 4.543912419314957, "learning_rate": 1.238627000990619e-06, "loss": 0.3414, "step": 8352 }, { "epoch": 0.678388694875335, "grad_norm": 4.935095050500174, "learning_rate": 1.2380592648690629e-06, "loss": 0.4755, "step": 8353 }, { "epoch": 0.6784699098513766, "grad_norm": 3.9622208310484024, "learning_rate": 1.2374916160671268e-06, "loss": 0.4053, "step": 8354 }, { "epoch": 0.6785511248274182, "grad_norm": 4.65626323141216, "learning_rate": 1.2369240546240881e-06, "loss": 0.5236, "step": 8355 }, { "epoch": 0.6786323398034597, "grad_norm": 5.707169042711556, "learning_rate": 1.2363565805792202e-06, "loss": 0.4848, "step": 8356 }, { "epoch": 0.6787135547795013, "grad_norm": 5.695511088930679, "learning_rate": 1.2357891939717903e-06, "loss": 0.4024, "step": 8357 }, { "epoch": 0.6787947697555429, "grad_norm": 5.28913545779317, "learning_rate": 1.2352218948410563e-06, "loss": 0.4305, "step": 8358 }, { "epoch": 0.6788759847315845, "grad_norm": 3.680524773252283, "learning_rate": 1.2346546832262743e-06, "loss": 0.5757, "step": 8359 }, { "epoch": 0.6789571997076261, "grad_norm": 4.937519409498853, "learning_rate": 1.2340875591666917e-06, "loss": 0.5091, "step": 8360 }, { "epoch": 0.6790384146836677, "grad_norm": 6.252444692643531, "learning_rate": 1.2335205227015494e-06, "loss": 0.3707, "step": 8361 }, { "epoch": 0.6791196296597093, "grad_norm": 6.749516651304727, "learning_rate": 1.2329535738700838e-06, "loss": 0.4521, "step": 8362 }, { "epoch": 0.6792008446357508, "grad_norm": 6.554192643461585, "learning_rate": 1.232386712711526e-06, "loss": 0.4651, "step": 8363 }, { "epoch": 0.6792820596117924, "grad_norm": 6.372502999598231, "learning_rate": 1.2318199392650993e-06, "loss": 0.5727, "step": 8364 }, { "epoch": 0.679363274587834, "grad_norm": 7.402271446804732, "learning_rate": 1.23125325357002e-06, "loss": 0.3813, "step": 8365 }, { "epoch": 0.6794444895638756, "grad_norm": 4.56010296625159, "learning_rate": 1.2306866556655016e-06, "loss": 0.3856, "step": 8366 }, { "epoch": 0.6795257045399171, "grad_norm": 5.070138865153647, "learning_rate": 1.2301201455907492e-06, "loss": 0.3799, "step": 8367 }, { "epoch": 0.6796069195159588, "grad_norm": 7.907043621920382, "learning_rate": 1.2295537233849608e-06, "loss": 0.4336, "step": 8368 }, { "epoch": 0.6796881344920003, "grad_norm": 4.083402933054175, "learning_rate": 1.2289873890873311e-06, "loss": 0.4805, "step": 8369 }, { "epoch": 0.6797693494680419, "grad_norm": 3.3225519411749134, "learning_rate": 1.2284211427370483e-06, "loss": 0.5801, "step": 8370 }, { "epoch": 0.6798505644440835, "grad_norm": 5.893743990272938, "learning_rate": 1.2278549843732915e-06, "loss": 0.4576, "step": 8371 }, { "epoch": 0.6799317794201251, "grad_norm": 4.351833235603589, "learning_rate": 1.2272889140352382e-06, "loss": 0.5446, "step": 8372 }, { "epoch": 0.6800129943961667, "grad_norm": 6.1661761775892945, "learning_rate": 1.2267229317620564e-06, "loss": 0.4169, "step": 8373 }, { "epoch": 0.6800942093722082, "grad_norm": 3.8116371519348404, "learning_rate": 1.2261570375929077e-06, "loss": 0.4341, "step": 8374 }, { "epoch": 0.6801754243482498, "grad_norm": 8.305006268325108, "learning_rate": 1.2255912315669507e-06, "loss": 0.5365, "step": 8375 }, { "epoch": 0.6802566393242914, "grad_norm": 3.87785735314886, "learning_rate": 1.2250255137233363e-06, "loss": 0.3122, "step": 8376 }, { "epoch": 0.680337854300333, "grad_norm": 5.364139835040124, "learning_rate": 1.224459884101209e-06, "loss": 0.4254, "step": 8377 }, { "epoch": 0.6804190692763745, "grad_norm": 5.505941463868022, "learning_rate": 1.2238943427397059e-06, "loss": 0.5725, "step": 8378 }, { "epoch": 0.6805002842524162, "grad_norm": 3.2670918728153495, "learning_rate": 1.2233288896779617e-06, "loss": 0.4677, "step": 8379 }, { "epoch": 0.6805814992284577, "grad_norm": 5.650997362758408, "learning_rate": 1.2227635249551014e-06, "loss": 0.4548, "step": 8380 }, { "epoch": 0.6806627142044993, "grad_norm": 4.85029252716426, "learning_rate": 1.2221982486102446e-06, "loss": 0.533, "step": 8381 }, { "epoch": 0.6807439291805409, "grad_norm": 6.520961416314167, "learning_rate": 1.2216330606825063e-06, "loss": 0.5872, "step": 8382 }, { "epoch": 0.6808251441565825, "grad_norm": 4.358523321449324, "learning_rate": 1.2210679612109957e-06, "loss": 0.463, "step": 8383 }, { "epoch": 0.6809063591326241, "grad_norm": 3.3319401584526256, "learning_rate": 1.2205029502348123e-06, "loss": 0.4915, "step": 8384 }, { "epoch": 0.6809875741086656, "grad_norm": 5.916515103224009, "learning_rate": 1.2199380277930542e-06, "loss": 0.3504, "step": 8385 }, { "epoch": 0.6810687890847072, "grad_norm": 3.720584300258901, "learning_rate": 1.2193731939248098e-06, "loss": 0.5314, "step": 8386 }, { "epoch": 0.6811500040607488, "grad_norm": 6.706752035654295, "learning_rate": 1.218808448669162e-06, "loss": 0.4826, "step": 8387 }, { "epoch": 0.6812312190367904, "grad_norm": 7.136044672639504, "learning_rate": 1.218243792065189e-06, "loss": 0.3701, "step": 8388 }, { "epoch": 0.6813124340128319, "grad_norm": 15.62272184000503, "learning_rate": 1.2176792241519628e-06, "loss": 0.511, "step": 8389 }, { "epoch": 0.6813936489888736, "grad_norm": 5.7775925878468275, "learning_rate": 1.2171147449685469e-06, "loss": 0.4626, "step": 8390 }, { "epoch": 0.6814748639649151, "grad_norm": 8.76813533021635, "learning_rate": 1.2165503545540017e-06, "loss": 0.3721, "step": 8391 }, { "epoch": 0.6815560789409567, "grad_norm": 2.7821535648469307, "learning_rate": 1.2159860529473796e-06, "loss": 0.5439, "step": 8392 }, { "epoch": 0.6816372939169983, "grad_norm": 8.296103570527087, "learning_rate": 1.2154218401877263e-06, "loss": 0.4945, "step": 8393 }, { "epoch": 0.6817185088930399, "grad_norm": 3.680639071610561, "learning_rate": 1.214857716314083e-06, "loss": 0.5059, "step": 8394 }, { "epoch": 0.6817997238690815, "grad_norm": 3.5629979371942997, "learning_rate": 1.2142936813654848e-06, "loss": 0.4787, "step": 8395 }, { "epoch": 0.681880938845123, "grad_norm": 13.02646445886847, "learning_rate": 1.21372973538096e-06, "loss": 0.5225, "step": 8396 }, { "epoch": 0.6819621538211647, "grad_norm": 6.585522603186978, "learning_rate": 1.2131658783995285e-06, "loss": 0.3445, "step": 8397 }, { "epoch": 0.6820433687972062, "grad_norm": 4.999188751543709, "learning_rate": 1.212602110460209e-06, "loss": 0.6751, "step": 8398 }, { "epoch": 0.6821245837732478, "grad_norm": 5.137382890011477, "learning_rate": 1.2120384316020098e-06, "loss": 0.464, "step": 8399 }, { "epoch": 0.6822057987492893, "grad_norm": 3.57144580039995, "learning_rate": 1.2114748418639339e-06, "loss": 0.3924, "step": 8400 }, { "epoch": 0.682287013725331, "grad_norm": 6.448424519995378, "learning_rate": 1.2109113412849792e-06, "loss": 0.5204, "step": 8401 }, { "epoch": 0.6823682287013725, "grad_norm": 4.80310339792193, "learning_rate": 1.2103479299041388e-06, "loss": 0.5435, "step": 8402 }, { "epoch": 0.6824494436774141, "grad_norm": 11.6534555123514, "learning_rate": 1.209784607760395e-06, "loss": 0.514, "step": 8403 }, { "epoch": 0.6825306586534557, "grad_norm": 14.006283811833011, "learning_rate": 1.209221374892729e-06, "loss": 0.5202, "step": 8404 }, { "epoch": 0.6826118736294973, "grad_norm": 3.410546012906801, "learning_rate": 1.2086582313401125e-06, "loss": 0.6359, "step": 8405 }, { "epoch": 0.6826930886055389, "grad_norm": 4.650396850410136, "learning_rate": 1.208095177141511e-06, "loss": 0.6307, "step": 8406 }, { "epoch": 0.6827743035815804, "grad_norm": 4.067175768365543, "learning_rate": 1.2075322123358857e-06, "loss": 0.5884, "step": 8407 }, { "epoch": 0.682855518557622, "grad_norm": 7.834411232708035, "learning_rate": 1.2069693369621924e-06, "loss": 0.4241, "step": 8408 }, { "epoch": 0.6829367335336636, "grad_norm": 5.025485826761079, "learning_rate": 1.2064065510593765e-06, "loss": 0.4524, "step": 8409 }, { "epoch": 0.6830179485097052, "grad_norm": 7.200545827802387, "learning_rate": 1.205843854666382e-06, "loss": 0.6125, "step": 8410 }, { "epoch": 0.6830991634857467, "grad_norm": 5.911187736116562, "learning_rate": 1.2052812478221437e-06, "loss": 0.6645, "step": 8411 }, { "epoch": 0.6831803784617884, "grad_norm": 7.850988389705168, "learning_rate": 1.2047187305655898e-06, "loss": 0.4591, "step": 8412 }, { "epoch": 0.6832615934378299, "grad_norm": 4.511296495523972, "learning_rate": 1.2041563029356454e-06, "loss": 0.6595, "step": 8413 }, { "epoch": 0.6833428084138715, "grad_norm": 5.3954091188162145, "learning_rate": 1.203593964971226e-06, "loss": 0.5137, "step": 8414 }, { "epoch": 0.6834240233899131, "grad_norm": 3.303858814227981, "learning_rate": 1.2030317167112438e-06, "loss": 0.5769, "step": 8415 }, { "epoch": 0.6835052383659547, "grad_norm": 3.763109647115813, "learning_rate": 1.2024695581946016e-06, "loss": 0.4826, "step": 8416 }, { "epoch": 0.6835864533419963, "grad_norm": 17.92152213202901, "learning_rate": 1.2019074894602005e-06, "loss": 0.3603, "step": 8417 }, { "epoch": 0.6836676683180378, "grad_norm": 6.268708930822519, "learning_rate": 1.2013455105469304e-06, "loss": 0.4264, "step": 8418 }, { "epoch": 0.6837488832940795, "grad_norm": 4.52113050826669, "learning_rate": 1.2007836214936773e-06, "loss": 0.5712, "step": 8419 }, { "epoch": 0.683830098270121, "grad_norm": 5.948974609805699, "learning_rate": 1.2002218223393213e-06, "loss": 0.5343, "step": 8420 }, { "epoch": 0.6839113132461626, "grad_norm": 5.13305028563167, "learning_rate": 1.1996601131227376e-06, "loss": 0.6497, "step": 8421 }, { "epoch": 0.6839925282222041, "grad_norm": 6.071521774254982, "learning_rate": 1.1990984938827907e-06, "loss": 0.6081, "step": 8422 }, { "epoch": 0.6840737431982458, "grad_norm": 4.917477348975693, "learning_rate": 1.1985369646583442e-06, "loss": 0.5464, "step": 8423 }, { "epoch": 0.6841549581742873, "grad_norm": 4.304996098971284, "learning_rate": 1.1979755254882519e-06, "loss": 0.4677, "step": 8424 }, { "epoch": 0.6842361731503289, "grad_norm": 7.848305808110954, "learning_rate": 1.1974141764113617e-06, "loss": 0.3471, "step": 8425 }, { "epoch": 0.6843173881263706, "grad_norm": 5.645856868747358, "learning_rate": 1.1968529174665173e-06, "loss": 0.4901, "step": 8426 }, { "epoch": 0.6843986031024121, "grad_norm": 5.273055048309659, "learning_rate": 1.1962917486925532e-06, "loss": 0.5054, "step": 8427 }, { "epoch": 0.6844798180784537, "grad_norm": 4.090778363630613, "learning_rate": 1.1957306701283002e-06, "loss": 0.3776, "step": 8428 }, { "epoch": 0.6845610330544952, "grad_norm": 6.031487669992356, "learning_rate": 1.1951696818125835e-06, "loss": 0.5705, "step": 8429 }, { "epoch": 0.6846422480305369, "grad_norm": 5.241073571056581, "learning_rate": 1.1946087837842188e-06, "loss": 0.3882, "step": 8430 }, { "epoch": 0.6847234630065784, "grad_norm": 6.683258623229494, "learning_rate": 1.1940479760820177e-06, "loss": 0.486, "step": 8431 }, { "epoch": 0.68480467798262, "grad_norm": 5.092498342825334, "learning_rate": 1.1934872587447838e-06, "loss": 0.4291, "step": 8432 }, { "epoch": 0.6848858929586615, "grad_norm": 2.9158897378436746, "learning_rate": 1.1929266318113172e-06, "loss": 0.5162, "step": 8433 }, { "epoch": 0.6849671079347032, "grad_norm": 6.199252123306311, "learning_rate": 1.192366095320411e-06, "loss": 0.4291, "step": 8434 }, { "epoch": 0.6850483229107447, "grad_norm": 6.833595326483498, "learning_rate": 1.1918056493108493e-06, "loss": 0.398, "step": 8435 }, { "epoch": 0.6851295378867863, "grad_norm": 4.488586364488279, "learning_rate": 1.1912452938214142e-06, "loss": 0.5889, "step": 8436 }, { "epoch": 0.685210752862828, "grad_norm": 7.767681900407471, "learning_rate": 1.1906850288908783e-06, "loss": 0.5559, "step": 8437 }, { "epoch": 0.6852919678388695, "grad_norm": 4.340043418686966, "learning_rate": 1.1901248545580082e-06, "loss": 0.4359, "step": 8438 }, { "epoch": 0.6853731828149111, "grad_norm": 4.93465575158808, "learning_rate": 1.1895647708615665e-06, "loss": 0.4433, "step": 8439 }, { "epoch": 0.6854543977909526, "grad_norm": 4.964788137277078, "learning_rate": 1.1890047778403063e-06, "loss": 0.4608, "step": 8440 }, { "epoch": 0.6855356127669943, "grad_norm": 4.5014832518222905, "learning_rate": 1.1884448755329772e-06, "loss": 0.6192, "step": 8441 }, { "epoch": 0.6856168277430358, "grad_norm": 3.814073027261063, "learning_rate": 1.1878850639783224e-06, "loss": 0.65, "step": 8442 }, { "epoch": 0.6856980427190774, "grad_norm": 4.698487957966528, "learning_rate": 1.1873253432150769e-06, "loss": 0.4156, "step": 8443 }, { "epoch": 0.6857792576951189, "grad_norm": 2.6979949526342173, "learning_rate": 1.1867657132819693e-06, "loss": 0.4989, "step": 8444 }, { "epoch": 0.6858604726711606, "grad_norm": 4.663840811404981, "learning_rate": 1.1862061742177253e-06, "loss": 0.3499, "step": 8445 }, { "epoch": 0.6859416876472021, "grad_norm": 4.603946365527777, "learning_rate": 1.1856467260610597e-06, "loss": 0.4283, "step": 8446 }, { "epoch": 0.6860229026232437, "grad_norm": 6.004017761597916, "learning_rate": 1.1850873688506847e-06, "loss": 0.4741, "step": 8447 }, { "epoch": 0.6861041175992854, "grad_norm": 4.91811982386808, "learning_rate": 1.1845281026253055e-06, "loss": 0.4488, "step": 8448 }, { "epoch": 0.6861853325753269, "grad_norm": 4.124631666496995, "learning_rate": 1.1839689274236197e-06, "loss": 0.4576, "step": 8449 }, { "epoch": 0.6862665475513685, "grad_norm": 4.967951805934966, "learning_rate": 1.183409843284319e-06, "loss": 0.4204, "step": 8450 }, { "epoch": 0.68634776252741, "grad_norm": 3.6946567527289083, "learning_rate": 1.1828508502460884e-06, "loss": 0.501, "step": 8451 }, { "epoch": 0.6864289775034517, "grad_norm": 4.3257591185306214, "learning_rate": 1.1822919483476089e-06, "loss": 0.5587, "step": 8452 }, { "epoch": 0.6865101924794932, "grad_norm": 13.53355986373059, "learning_rate": 1.1817331376275518e-06, "loss": 0.3784, "step": 8453 }, { "epoch": 0.6865914074555348, "grad_norm": 4.686471250690919, "learning_rate": 1.181174418124585e-06, "loss": 0.529, "step": 8454 }, { "epoch": 0.6866726224315763, "grad_norm": 13.019567449891584, "learning_rate": 1.1806157898773694e-06, "loss": 0.521, "step": 8455 }, { "epoch": 0.686753837407618, "grad_norm": 5.024504582924606, "learning_rate": 1.1800572529245581e-06, "loss": 0.4579, "step": 8456 }, { "epoch": 0.6868350523836595, "grad_norm": 5.622740484481374, "learning_rate": 1.1794988073047986e-06, "loss": 0.4946, "step": 8457 }, { "epoch": 0.6869162673597011, "grad_norm": 3.383472516465761, "learning_rate": 1.1789404530567338e-06, "loss": 0.454, "step": 8458 }, { "epoch": 0.6869974823357428, "grad_norm": 39.06057548040847, "learning_rate": 1.178382190218997e-06, "loss": 0.4916, "step": 8459 }, { "epoch": 0.6870786973117843, "grad_norm": 5.13942966732467, "learning_rate": 1.1778240188302181e-06, "loss": 0.5374, "step": 8460 }, { "epoch": 0.6871599122878259, "grad_norm": 5.2491128634756805, "learning_rate": 1.177265938929021e-06, "loss": 0.4787, "step": 8461 }, { "epoch": 0.6872411272638674, "grad_norm": 3.883090224748362, "learning_rate": 1.1767079505540198e-06, "loss": 0.4962, "step": 8462 }, { "epoch": 0.6873223422399091, "grad_norm": 4.459207504966209, "learning_rate": 1.1761500537438246e-06, "loss": 0.4901, "step": 8463 }, { "epoch": 0.6874035572159506, "grad_norm": 5.526598548112002, "learning_rate": 1.1755922485370397e-06, "loss": 0.6049, "step": 8464 }, { "epoch": 0.6874847721919922, "grad_norm": 3.9962235924628, "learning_rate": 1.1750345349722611e-06, "loss": 0.6439, "step": 8465 }, { "epoch": 0.6875659871680337, "grad_norm": 5.430777327111154, "learning_rate": 1.1744769130880814e-06, "loss": 0.5233, "step": 8466 }, { "epoch": 0.6876472021440754, "grad_norm": 5.478225172555752, "learning_rate": 1.1739193829230833e-06, "loss": 0.4564, "step": 8467 }, { "epoch": 0.6877284171201169, "grad_norm": 14.175270639083902, "learning_rate": 1.1733619445158465e-06, "loss": 0.5561, "step": 8468 }, { "epoch": 0.6878096320961585, "grad_norm": 4.726662335713709, "learning_rate": 1.1728045979049421e-06, "loss": 0.439, "step": 8469 }, { "epoch": 0.6878908470722002, "grad_norm": 6.585567352748735, "learning_rate": 1.1722473431289344e-06, "loss": 0.4297, "step": 8470 }, { "epoch": 0.6879720620482417, "grad_norm": 7.151511500781442, "learning_rate": 1.1716901802263845e-06, "loss": 0.4433, "step": 8471 }, { "epoch": 0.6880532770242833, "grad_norm": 13.543001844242685, "learning_rate": 1.171133109235843e-06, "loss": 0.3943, "step": 8472 }, { "epoch": 0.6881344920003248, "grad_norm": 6.360919578390468, "learning_rate": 1.1705761301958576e-06, "loss": 0.3842, "step": 8473 }, { "epoch": 0.6882157069763665, "grad_norm": 5.289828859594367, "learning_rate": 1.170019243144969e-06, "loss": 0.3475, "step": 8474 }, { "epoch": 0.688296921952408, "grad_norm": 6.436765945910618, "learning_rate": 1.16946244812171e-06, "loss": 0.5738, "step": 8475 }, { "epoch": 0.6883781369284496, "grad_norm": 9.265057636256202, "learning_rate": 1.1689057451646072e-06, "loss": 0.5015, "step": 8476 }, { "epoch": 0.6884593519044911, "grad_norm": 5.62906939860262, "learning_rate": 1.1683491343121825e-06, "loss": 0.6312, "step": 8477 }, { "epoch": 0.6885405668805328, "grad_norm": 3.608605687615943, "learning_rate": 1.1677926156029495e-06, "loss": 0.5705, "step": 8478 }, { "epoch": 0.6886217818565743, "grad_norm": 7.326218578037191, "learning_rate": 1.1672361890754165e-06, "loss": 0.481, "step": 8479 }, { "epoch": 0.6887029968326159, "grad_norm": 3.101458446740605, "learning_rate": 1.1666798547680871e-06, "loss": 0.5597, "step": 8480 }, { "epoch": 0.6887842118086576, "grad_norm": 4.57887880856938, "learning_rate": 1.166123612719455e-06, "loss": 0.6213, "step": 8481 }, { "epoch": 0.6888654267846991, "grad_norm": 10.086781507059271, "learning_rate": 1.1655674629680083e-06, "loss": 0.5009, "step": 8482 }, { "epoch": 0.6889466417607407, "grad_norm": 5.0242271976636514, "learning_rate": 1.165011405552232e-06, "loss": 0.7435, "step": 8483 }, { "epoch": 0.6890278567367822, "grad_norm": 7.2774357651661274, "learning_rate": 1.164455440510601e-06, "loss": 0.4607, "step": 8484 }, { "epoch": 0.6891090717128239, "grad_norm": 4.136550631258544, "learning_rate": 1.1638995678815843e-06, "loss": 0.5113, "step": 8485 }, { "epoch": 0.6891902866888654, "grad_norm": 5.117305452537372, "learning_rate": 1.1633437877036462e-06, "loss": 0.4338, "step": 8486 }, { "epoch": 0.689271501664907, "grad_norm": 3.714646489723773, "learning_rate": 1.162788100015245e-06, "loss": 0.5338, "step": 8487 }, { "epoch": 0.6893527166409485, "grad_norm": 3.9854040499315877, "learning_rate": 1.1622325048548303e-06, "loss": 0.6151, "step": 8488 }, { "epoch": 0.6894339316169902, "grad_norm": 4.8147850700984804, "learning_rate": 1.1616770022608447e-06, "loss": 0.5104, "step": 8489 }, { "epoch": 0.6895151465930317, "grad_norm": 12.49347709660076, "learning_rate": 1.161121592271729e-06, "loss": 0.4588, "step": 8490 }, { "epoch": 0.6895963615690733, "grad_norm": 4.382600842190844, "learning_rate": 1.1605662749259123e-06, "loss": 0.5458, "step": 8491 }, { "epoch": 0.689677576545115, "grad_norm": 3.4100489659901823, "learning_rate": 1.1600110502618204e-06, "loss": 0.5746, "step": 8492 }, { "epoch": 0.6897587915211565, "grad_norm": 5.056416537418844, "learning_rate": 1.1594559183178727e-06, "loss": 0.4043, "step": 8493 }, { "epoch": 0.6898400064971981, "grad_norm": 4.3441729287251105, "learning_rate": 1.158900879132481e-06, "loss": 0.5067, "step": 8494 }, { "epoch": 0.6899212214732396, "grad_norm": 3.6975358584627447, "learning_rate": 1.1583459327440496e-06, "loss": 0.6198, "step": 8495 }, { "epoch": 0.6900024364492813, "grad_norm": 5.765909988508904, "learning_rate": 1.1577910791909802e-06, "loss": 0.557, "step": 8496 }, { "epoch": 0.6900836514253228, "grad_norm": 6.346348844345966, "learning_rate": 1.1572363185116648e-06, "loss": 0.5423, "step": 8497 }, { "epoch": 0.6901648664013644, "grad_norm": 6.761851681144281, "learning_rate": 1.1566816507444884e-06, "loss": 0.7113, "step": 8498 }, { "epoch": 0.690246081377406, "grad_norm": 4.9827548056260484, "learning_rate": 1.1561270759278326e-06, "loss": 0.4832, "step": 8499 }, { "epoch": 0.6903272963534476, "grad_norm": 7.290496908601368, "learning_rate": 1.1555725941000715e-06, "loss": 0.4773, "step": 8500 }, { "epoch": 0.6904085113294891, "grad_norm": 4.93347902991029, "learning_rate": 1.1550182052995706e-06, "loss": 0.4249, "step": 8501 }, { "epoch": 0.6904897263055307, "grad_norm": 4.857856203684281, "learning_rate": 1.154463909564693e-06, "loss": 0.4705, "step": 8502 }, { "epoch": 0.6905709412815724, "grad_norm": 16.432653414970492, "learning_rate": 1.1539097069337913e-06, "loss": 0.4833, "step": 8503 }, { "epoch": 0.6906521562576139, "grad_norm": 4.517576424813998, "learning_rate": 1.1533555974452128e-06, "loss": 0.5461, "step": 8504 }, { "epoch": 0.6907333712336555, "grad_norm": 6.387090563981134, "learning_rate": 1.1528015811373004e-06, "loss": 0.44, "step": 8505 }, { "epoch": 0.690814586209697, "grad_norm": 7.168256261694638, "learning_rate": 1.1522476580483893e-06, "loss": 0.4357, "step": 8506 }, { "epoch": 0.6908958011857387, "grad_norm": 5.267349807599555, "learning_rate": 1.1516938282168074e-06, "loss": 0.5764, "step": 8507 }, { "epoch": 0.6909770161617802, "grad_norm": 5.446388451950695, "learning_rate": 1.151140091680876e-06, "loss": 0.4762, "step": 8508 }, { "epoch": 0.6910582311378218, "grad_norm": 7.776144758103636, "learning_rate": 1.1505864484789122e-06, "loss": 0.4635, "step": 8509 }, { "epoch": 0.6911394461138634, "grad_norm": 4.652490283523999, "learning_rate": 1.1500328986492246e-06, "loss": 0.4744, "step": 8510 }, { "epoch": 0.691220661089905, "grad_norm": 5.281761321834011, "learning_rate": 1.149479442230115e-06, "loss": 0.5802, "step": 8511 }, { "epoch": 0.6913018760659465, "grad_norm": 4.121830278672146, "learning_rate": 1.1489260792598803e-06, "loss": 0.4812, "step": 8512 }, { "epoch": 0.6913830910419881, "grad_norm": 4.197838163994761, "learning_rate": 1.1483728097768116e-06, "loss": 0.4427, "step": 8513 }, { "epoch": 0.6914643060180298, "grad_norm": 4.757540075293028, "learning_rate": 1.14781963381919e-06, "loss": 0.4649, "step": 8514 }, { "epoch": 0.6915455209940713, "grad_norm": 4.228790147362145, "learning_rate": 1.1472665514252943e-06, "loss": 0.4501, "step": 8515 }, { "epoch": 0.6916267359701129, "grad_norm": 3.198795799968073, "learning_rate": 1.146713562633394e-06, "loss": 0.4265, "step": 8516 }, { "epoch": 0.6917079509461544, "grad_norm": 3.293645191890074, "learning_rate": 1.1461606674817518e-06, "loss": 0.5258, "step": 8517 }, { "epoch": 0.6917891659221961, "grad_norm": 4.377855525825729, "learning_rate": 1.1456078660086266e-06, "loss": 0.4102, "step": 8518 }, { "epoch": 0.6918703808982376, "grad_norm": 9.573511509775727, "learning_rate": 1.1450551582522702e-06, "loss": 0.3486, "step": 8519 }, { "epoch": 0.6919515958742792, "grad_norm": 3.560198474639802, "learning_rate": 1.1445025442509258e-06, "loss": 0.4863, "step": 8520 }, { "epoch": 0.6920328108503208, "grad_norm": 6.303365456673007, "learning_rate": 1.1439500240428304e-06, "loss": 0.5363, "step": 8521 }, { "epoch": 0.6921140258263624, "grad_norm": 5.438200231090006, "learning_rate": 1.1433975976662172e-06, "loss": 0.5908, "step": 8522 }, { "epoch": 0.6921952408024039, "grad_norm": 7.217503219294745, "learning_rate": 1.1428452651593102e-06, "loss": 0.5535, "step": 8523 }, { "epoch": 0.6922764557784455, "grad_norm": 3.113332201559408, "learning_rate": 1.142293026560328e-06, "loss": 0.4252, "step": 8524 }, { "epoch": 0.6923576707544872, "grad_norm": 3.7670638203585396, "learning_rate": 1.1417408819074835e-06, "loss": 0.5357, "step": 8525 }, { "epoch": 0.6924388857305287, "grad_norm": 9.156807084218489, "learning_rate": 1.1411888312389815e-06, "loss": 0.5814, "step": 8526 }, { "epoch": 0.6925201007065703, "grad_norm": 7.477340025472298, "learning_rate": 1.1406368745930201e-06, "loss": 0.5775, "step": 8527 }, { "epoch": 0.6926013156826119, "grad_norm": 5.777723223269901, "learning_rate": 1.140085012007794e-06, "loss": 0.4531, "step": 8528 }, { "epoch": 0.6926825306586535, "grad_norm": 17.932544473465622, "learning_rate": 1.1395332435214873e-06, "loss": 0.5404, "step": 8529 }, { "epoch": 0.692763745634695, "grad_norm": 6.420149798086578, "learning_rate": 1.138981569172279e-06, "loss": 0.4261, "step": 8530 }, { "epoch": 0.6928449606107366, "grad_norm": 9.184826721296508, "learning_rate": 1.1384299889983432e-06, "loss": 0.4474, "step": 8531 }, { "epoch": 0.6929261755867782, "grad_norm": 8.376036553312167, "learning_rate": 1.1378785030378473e-06, "loss": 0.5644, "step": 8532 }, { "epoch": 0.6930073905628198, "grad_norm": 4.928837681497556, "learning_rate": 1.137327111328949e-06, "loss": 0.3391, "step": 8533 }, { "epoch": 0.6930886055388613, "grad_norm": 4.407213199355116, "learning_rate": 1.1367758139098037e-06, "loss": 0.453, "step": 8534 }, { "epoch": 0.693169820514903, "grad_norm": 4.884167064220795, "learning_rate": 1.1362246108185571e-06, "loss": 0.4148, "step": 8535 }, { "epoch": 0.6932510354909446, "grad_norm": 5.275613682866477, "learning_rate": 1.135673502093349e-06, "loss": 0.6555, "step": 8536 }, { "epoch": 0.6933322504669861, "grad_norm": 4.72395706382707, "learning_rate": 1.1351224877723137e-06, "loss": 0.4664, "step": 8537 }, { "epoch": 0.6934134654430277, "grad_norm": 6.683252761194182, "learning_rate": 1.1345715678935802e-06, "loss": 0.4335, "step": 8538 }, { "epoch": 0.6934946804190693, "grad_norm": 3.6056858456464704, "learning_rate": 1.1340207424952673e-06, "loss": 0.4784, "step": 8539 }, { "epoch": 0.6935758953951109, "grad_norm": 7.002949745318934, "learning_rate": 1.133470011615489e-06, "loss": 0.5434, "step": 8540 }, { "epoch": 0.6936571103711524, "grad_norm": 6.659133719702008, "learning_rate": 1.1329193752923543e-06, "loss": 0.5708, "step": 8541 }, { "epoch": 0.693738325347194, "grad_norm": 12.35767990560376, "learning_rate": 1.1323688335639637e-06, "loss": 0.4835, "step": 8542 }, { "epoch": 0.6938195403232356, "grad_norm": 3.77158348926073, "learning_rate": 1.131818386468411e-06, "loss": 0.4171, "step": 8543 }, { "epoch": 0.6939007552992772, "grad_norm": 5.1758054265529205, "learning_rate": 1.1312680340437848e-06, "loss": 0.4684, "step": 8544 }, { "epoch": 0.6939819702753187, "grad_norm": 4.915722528576737, "learning_rate": 1.130717776328168e-06, "loss": 0.2615, "step": 8545 }, { "epoch": 0.6940631852513603, "grad_norm": 3.1585274055113914, "learning_rate": 1.130167613359633e-06, "loss": 0.5899, "step": 8546 }, { "epoch": 0.694144400227402, "grad_norm": 4.7926238667126055, "learning_rate": 1.1296175451762504e-06, "loss": 0.3803, "step": 8547 }, { "epoch": 0.6942256152034435, "grad_norm": 4.742004176848653, "learning_rate": 1.129067571816081e-06, "loss": 0.5899, "step": 8548 }, { "epoch": 0.6943068301794851, "grad_norm": 4.740522772470059, "learning_rate": 1.128517693317179e-06, "loss": 0.5, "step": 8549 }, { "epoch": 0.6943880451555267, "grad_norm": 3.0796548456527457, "learning_rate": 1.1279679097175944e-06, "loss": 0.4379, "step": 8550 }, { "epoch": 0.6944692601315683, "grad_norm": 4.872361456956494, "learning_rate": 1.12741822105537e-06, "loss": 0.4375, "step": 8551 }, { "epoch": 0.6945504751076098, "grad_norm": 3.8126089904616625, "learning_rate": 1.1268686273685391e-06, "loss": 0.41, "step": 8552 }, { "epoch": 0.6946316900836514, "grad_norm": 4.918083532251475, "learning_rate": 1.1263191286951333e-06, "loss": 0.7237, "step": 8553 }, { "epoch": 0.694712905059693, "grad_norm": 4.980898147720527, "learning_rate": 1.1257697250731735e-06, "loss": 0.5379, "step": 8554 }, { "epoch": 0.6947941200357346, "grad_norm": 3.3037282568689204, "learning_rate": 1.1252204165406753e-06, "loss": 0.5215, "step": 8555 }, { "epoch": 0.6948753350117761, "grad_norm": 4.960611865059338, "learning_rate": 1.1246712031356486e-06, "loss": 0.5758, "step": 8556 }, { "epoch": 0.6949565499878178, "grad_norm": 7.319288707757544, "learning_rate": 1.1241220848960952e-06, "loss": 0.4719, "step": 8557 }, { "epoch": 0.6950377649638594, "grad_norm": 3.819399015745177, "learning_rate": 1.1235730618600126e-06, "loss": 0.4936, "step": 8558 }, { "epoch": 0.6951189799399009, "grad_norm": 3.538376081354946, "learning_rate": 1.1230241340653888e-06, "loss": 0.5413, "step": 8559 }, { "epoch": 0.6952001949159425, "grad_norm": 6.249841085120328, "learning_rate": 1.122475301550208e-06, "loss": 0.5515, "step": 8560 }, { "epoch": 0.6952814098919841, "grad_norm": 8.110595035089235, "learning_rate": 1.121926564352446e-06, "loss": 0.6036, "step": 8561 }, { "epoch": 0.6953626248680257, "grad_norm": 4.635874973816129, "learning_rate": 1.1213779225100715e-06, "loss": 0.7324, "step": 8562 }, { "epoch": 0.6954438398440672, "grad_norm": 6.322869921153947, "learning_rate": 1.1208293760610486e-06, "loss": 0.52, "step": 8563 }, { "epoch": 0.6955250548201088, "grad_norm": 7.6067666576253545, "learning_rate": 1.1202809250433345e-06, "loss": 0.3976, "step": 8564 }, { "epoch": 0.6956062697961504, "grad_norm": 4.351083583151557, "learning_rate": 1.1197325694948774e-06, "loss": 0.5236, "step": 8565 }, { "epoch": 0.695687484772192, "grad_norm": 5.090734435411529, "learning_rate": 1.1191843094536225e-06, "loss": 0.4614, "step": 8566 }, { "epoch": 0.6957686997482335, "grad_norm": 6.001665213669127, "learning_rate": 1.1186361449575055e-06, "loss": 0.3699, "step": 8567 }, { "epoch": 0.6958499147242752, "grad_norm": 6.350310371588752, "learning_rate": 1.1180880760444558e-06, "loss": 0.6122, "step": 8568 }, { "epoch": 0.6959311297003168, "grad_norm": 4.918916151218767, "learning_rate": 1.117540102752398e-06, "loss": 0.4498, "step": 8569 }, { "epoch": 0.6960123446763583, "grad_norm": 4.2337365050726286, "learning_rate": 1.116992225119248e-06, "loss": 0.3585, "step": 8570 }, { "epoch": 0.6960935596523999, "grad_norm": 4.164412182555689, "learning_rate": 1.1164444431829163e-06, "loss": 0.5052, "step": 8571 }, { "epoch": 0.6961747746284415, "grad_norm": 4.915037193176119, "learning_rate": 1.1158967569813079e-06, "loss": 0.4742, "step": 8572 }, { "epoch": 0.6962559896044831, "grad_norm": 7.126882606950106, "learning_rate": 1.1153491665523186e-06, "loss": 0.5764, "step": 8573 }, { "epoch": 0.6963372045805246, "grad_norm": 6.194568824850657, "learning_rate": 1.1148016719338387e-06, "loss": 0.3955, "step": 8574 }, { "epoch": 0.6964184195565662, "grad_norm": 4.582251362036855, "learning_rate": 1.1142542731637513e-06, "loss": 0.2946, "step": 8575 }, { "epoch": 0.6964996345326078, "grad_norm": 4.565291424432009, "learning_rate": 1.1137069702799341e-06, "loss": 0.5785, "step": 8576 }, { "epoch": 0.6965808495086494, "grad_norm": 6.268014709041092, "learning_rate": 1.1131597633202587e-06, "loss": 0.5502, "step": 8577 }, { "epoch": 0.6966620644846909, "grad_norm": 4.232016340106684, "learning_rate": 1.1126126523225869e-06, "loss": 0.4602, "step": 8578 }, { "epoch": 0.6967432794607326, "grad_norm": 5.324130939463538, "learning_rate": 1.112065637324778e-06, "loss": 0.4341, "step": 8579 }, { "epoch": 0.6968244944367742, "grad_norm": 3.6701229318957034, "learning_rate": 1.1115187183646814e-06, "loss": 0.5489, "step": 8580 }, { "epoch": 0.6969057094128157, "grad_norm": 4.1075695508244365, "learning_rate": 1.1109718954801398e-06, "loss": 0.5336, "step": 8581 }, { "epoch": 0.6969869243888573, "grad_norm": 4.14721530411858, "learning_rate": 1.110425168708993e-06, "loss": 0.5781, "step": 8582 }, { "epoch": 0.6970681393648989, "grad_norm": 5.832775679972837, "learning_rate": 1.1098785380890696e-06, "loss": 0.427, "step": 8583 }, { "epoch": 0.6971493543409405, "grad_norm": 3.33845605592306, "learning_rate": 1.1093320036581936e-06, "loss": 0.4442, "step": 8584 }, { "epoch": 0.697230569316982, "grad_norm": 5.037481188891807, "learning_rate": 1.1087855654541843e-06, "loss": 0.368, "step": 8585 }, { "epoch": 0.6973117842930237, "grad_norm": 4.499749726051026, "learning_rate": 1.1082392235148509e-06, "loss": 0.4573, "step": 8586 }, { "epoch": 0.6973929992690652, "grad_norm": 4.154523490167273, "learning_rate": 1.1076929778779965e-06, "loss": 0.4741, "step": 8587 }, { "epoch": 0.6974742142451068, "grad_norm": 6.620859889971063, "learning_rate": 1.1071468285814201e-06, "loss": 0.5117, "step": 8588 }, { "epoch": 0.6975554292211483, "grad_norm": 5.322737847031269, "learning_rate": 1.106600775662911e-06, "loss": 0.428, "step": 8589 }, { "epoch": 0.69763664419719, "grad_norm": 6.543366704108589, "learning_rate": 1.1060548191602535e-06, "loss": 0.5052, "step": 8590 }, { "epoch": 0.6977178591732316, "grad_norm": 3.978052554444944, "learning_rate": 1.105508959111226e-06, "loss": 0.4051, "step": 8591 }, { "epoch": 0.6977990741492731, "grad_norm": 4.013135595680058, "learning_rate": 1.1049631955535985e-06, "loss": 0.5263, "step": 8592 }, { "epoch": 0.6978802891253147, "grad_norm": 7.931439768063147, "learning_rate": 1.1044175285251348e-06, "loss": 0.4994, "step": 8593 }, { "epoch": 0.6979615041013563, "grad_norm": 7.9626002926512465, "learning_rate": 1.1038719580635913e-06, "loss": 0.5575, "step": 8594 }, { "epoch": 0.6980427190773979, "grad_norm": 5.399616750795122, "learning_rate": 1.103326484206719e-06, "loss": 0.492, "step": 8595 }, { "epoch": 0.6981239340534394, "grad_norm": 5.8829565085948765, "learning_rate": 1.1027811069922634e-06, "loss": 0.5324, "step": 8596 }, { "epoch": 0.698205149029481, "grad_norm": 4.044823741608967, "learning_rate": 1.1022358264579593e-06, "loss": 0.4975, "step": 8597 }, { "epoch": 0.6982863640055226, "grad_norm": 5.039936723673592, "learning_rate": 1.1016906426415397e-06, "loss": 0.6072, "step": 8598 }, { "epoch": 0.6983675789815642, "grad_norm": 12.620558241769995, "learning_rate": 1.1011455555807272e-06, "loss": 0.4333, "step": 8599 }, { "epoch": 0.6984487939576057, "grad_norm": 4.360858885346072, "learning_rate": 1.1006005653132376e-06, "loss": 0.4666, "step": 8600 }, { "epoch": 0.6985300089336474, "grad_norm": 4.573558742038945, "learning_rate": 1.100055671876784e-06, "loss": 0.5247, "step": 8601 }, { "epoch": 0.698611223909689, "grad_norm": 5.213935252357303, "learning_rate": 1.0995108753090677e-06, "loss": 0.4746, "step": 8602 }, { "epoch": 0.6986924388857305, "grad_norm": 5.309900569340087, "learning_rate": 1.0989661756477869e-06, "loss": 0.4982, "step": 8603 }, { "epoch": 0.6987736538617721, "grad_norm": 6.957485365514494, "learning_rate": 1.0984215729306328e-06, "loss": 0.6216, "step": 8604 }, { "epoch": 0.6988548688378137, "grad_norm": 4.191127343883717, "learning_rate": 1.097877067195288e-06, "loss": 0.5468, "step": 8605 }, { "epoch": 0.6989360838138553, "grad_norm": 4.214027815939813, "learning_rate": 1.0973326584794286e-06, "loss": 0.5753, "step": 8606 }, { "epoch": 0.6990172987898968, "grad_norm": 5.627295117665019, "learning_rate": 1.0967883468207265e-06, "loss": 0.3873, "step": 8607 }, { "epoch": 0.6990985137659385, "grad_norm": 5.494066620160367, "learning_rate": 1.0962441322568437e-06, "loss": 0.4377, "step": 8608 }, { "epoch": 0.69917972874198, "grad_norm": 4.933052728810609, "learning_rate": 1.0957000148254387e-06, "loss": 0.4346, "step": 8609 }, { "epoch": 0.6992609437180216, "grad_norm": 4.67963560490583, "learning_rate": 1.0951559945641592e-06, "loss": 0.4196, "step": 8610 }, { "epoch": 0.6993421586940631, "grad_norm": 4.555476078942722, "learning_rate": 1.094612071510651e-06, "loss": 0.5331, "step": 8611 }, { "epoch": 0.6994233736701048, "grad_norm": 4.542359966546809, "learning_rate": 1.0940682457025498e-06, "loss": 0.3957, "step": 8612 }, { "epoch": 0.6995045886461464, "grad_norm": 4.4445515035232495, "learning_rate": 1.0935245171774842e-06, "loss": 0.6409, "step": 8613 }, { "epoch": 0.6995858036221879, "grad_norm": 5.499967666697894, "learning_rate": 1.092980885973079e-06, "loss": 0.5317, "step": 8614 }, { "epoch": 0.6996670185982296, "grad_norm": 4.090307339205739, "learning_rate": 1.0924373521269492e-06, "loss": 0.3992, "step": 8615 }, { "epoch": 0.6997482335742711, "grad_norm": 5.094031142055631, "learning_rate": 1.091893915676705e-06, "loss": 0.8069, "step": 8616 }, { "epoch": 0.6998294485503127, "grad_norm": 5.102016167710116, "learning_rate": 1.0913505766599506e-06, "loss": 0.5136, "step": 8617 }, { "epoch": 0.6999106635263542, "grad_norm": 6.840823358662633, "learning_rate": 1.090807335114281e-06, "loss": 0.4567, "step": 8618 }, { "epoch": 0.6999918785023959, "grad_norm": 6.604394180582011, "learning_rate": 1.0902641910772852e-06, "loss": 0.411, "step": 8619 }, { "epoch": 0.7000730934784374, "grad_norm": 5.660730236893839, "learning_rate": 1.0897211445865472e-06, "loss": 0.432, "step": 8620 }, { "epoch": 0.700154308454479, "grad_norm": 4.702707757036113, "learning_rate": 1.089178195679641e-06, "loss": 0.3665, "step": 8621 }, { "epoch": 0.7002355234305205, "grad_norm": 4.96553057734693, "learning_rate": 1.0886353443941373e-06, "loss": 0.4387, "step": 8622 }, { "epoch": 0.7003167384065622, "grad_norm": 8.264862786380114, "learning_rate": 1.088092590767599e-06, "loss": 0.3632, "step": 8623 }, { "epoch": 0.7003979533826038, "grad_norm": 6.893091911108661, "learning_rate": 1.0875499348375812e-06, "loss": 0.5675, "step": 8624 }, { "epoch": 0.7004791683586453, "grad_norm": 4.387426391179371, "learning_rate": 1.0870073766416315e-06, "loss": 0.4464, "step": 8625 }, { "epoch": 0.700560383334687, "grad_norm": 5.027871465737512, "learning_rate": 1.0864649162172941e-06, "loss": 0.6075, "step": 8626 }, { "epoch": 0.7006415983107285, "grad_norm": 6.143259232572587, "learning_rate": 1.0859225536021034e-06, "loss": 0.5228, "step": 8627 }, { "epoch": 0.7007228132867701, "grad_norm": 6.464771014837394, "learning_rate": 1.0853802888335874e-06, "loss": 0.6185, "step": 8628 }, { "epoch": 0.7008040282628116, "grad_norm": 4.205829517254945, "learning_rate": 1.0848381219492684e-06, "loss": 0.498, "step": 8629 }, { "epoch": 0.7008852432388533, "grad_norm": 3.9072366994873855, "learning_rate": 1.0842960529866627e-06, "loss": 0.4887, "step": 8630 }, { "epoch": 0.7009664582148948, "grad_norm": 5.509854679596712, "learning_rate": 1.0837540819832779e-06, "loss": 0.4346, "step": 8631 }, { "epoch": 0.7010476731909364, "grad_norm": 4.3762550261485, "learning_rate": 1.0832122089766143e-06, "loss": 0.4646, "step": 8632 }, { "epoch": 0.7011288881669779, "grad_norm": 4.21313805277909, "learning_rate": 1.082670434004168e-06, "loss": 0.605, "step": 8633 }, { "epoch": 0.7012101031430196, "grad_norm": 5.0290295451057245, "learning_rate": 1.0821287571034261e-06, "loss": 0.5164, "step": 8634 }, { "epoch": 0.7012913181190612, "grad_norm": 3.852504953053529, "learning_rate": 1.0815871783118701e-06, "loss": 0.5563, "step": 8635 }, { "epoch": 0.7013725330951027, "grad_norm": 4.4031006382832265, "learning_rate": 1.0810456976669753e-06, "loss": 0.4816, "step": 8636 }, { "epoch": 0.7014537480711444, "grad_norm": 5.020999898610326, "learning_rate": 1.0805043152062086e-06, "loss": 0.5762, "step": 8637 }, { "epoch": 0.7015349630471859, "grad_norm": 12.438678796300241, "learning_rate": 1.07996303096703e-06, "loss": 0.4189, "step": 8638 }, { "epoch": 0.7016161780232275, "grad_norm": 3.8964414970182455, "learning_rate": 1.0794218449868948e-06, "loss": 0.4803, "step": 8639 }, { "epoch": 0.701697392999269, "grad_norm": 6.66037981965594, "learning_rate": 1.07888075730325e-06, "loss": 0.4304, "step": 8640 }, { "epoch": 0.7017786079753107, "grad_norm": 9.601550182923392, "learning_rate": 1.0783397679535343e-06, "loss": 0.5567, "step": 8641 }, { "epoch": 0.7018598229513522, "grad_norm": 5.837088176132076, "learning_rate": 1.077798876975183e-06, "loss": 0.3856, "step": 8642 }, { "epoch": 0.7019410379273938, "grad_norm": 4.127084436289391, "learning_rate": 1.0772580844056232e-06, "loss": 0.5247, "step": 8643 }, { "epoch": 0.7020222529034353, "grad_norm": 3.613384254118512, "learning_rate": 1.0767173902822733e-06, "loss": 0.3272, "step": 8644 }, { "epoch": 0.702103467879477, "grad_norm": 2.5823659536071637, "learning_rate": 1.0761767946425482e-06, "loss": 0.5089, "step": 8645 }, { "epoch": 0.7021846828555186, "grad_norm": 4.879797467260154, "learning_rate": 1.0756362975238539e-06, "loss": 0.8247, "step": 8646 }, { "epoch": 0.7022658978315601, "grad_norm": 6.519914984541137, "learning_rate": 1.0750958989635879e-06, "loss": 0.4224, "step": 8647 }, { "epoch": 0.7023471128076018, "grad_norm": 3.9501597340899903, "learning_rate": 1.074555598999145e-06, "loss": 0.571, "step": 8648 }, { "epoch": 0.7024283277836433, "grad_norm": 6.770723225323952, "learning_rate": 1.0740153976679114e-06, "loss": 0.4328, "step": 8649 }, { "epoch": 0.7025095427596849, "grad_norm": 13.76466570186757, "learning_rate": 1.073475295007265e-06, "loss": 0.424, "step": 8650 }, { "epoch": 0.7025907577357264, "grad_norm": 5.062420028751364, "learning_rate": 1.0729352910545779e-06, "loss": 0.5359, "step": 8651 }, { "epoch": 0.7026719727117681, "grad_norm": 8.4588766125872, "learning_rate": 1.0723953858472167e-06, "loss": 0.4192, "step": 8652 }, { "epoch": 0.7027531876878096, "grad_norm": 4.594691832051315, "learning_rate": 1.0718555794225385e-06, "loss": 0.4356, "step": 8653 }, { "epoch": 0.7028344026638512, "grad_norm": 5.910635422121678, "learning_rate": 1.071315871817896e-06, "loss": 0.4946, "step": 8654 }, { "epoch": 0.7029156176398927, "grad_norm": 7.200371807778343, "learning_rate": 1.0707762630706345e-06, "loss": 0.6541, "step": 8655 }, { "epoch": 0.7029968326159344, "grad_norm": 4.327245345198459, "learning_rate": 1.0702367532180919e-06, "loss": 0.5585, "step": 8656 }, { "epoch": 0.703078047591976, "grad_norm": 5.70898727396328, "learning_rate": 1.0696973422975978e-06, "loss": 0.4336, "step": 8657 }, { "epoch": 0.7031592625680175, "grad_norm": 3.3122955544251, "learning_rate": 1.0691580303464791e-06, "loss": 0.7113, "step": 8658 }, { "epoch": 0.7032404775440592, "grad_norm": 6.853811044628905, "learning_rate": 1.068618817402052e-06, "loss": 0.5942, "step": 8659 }, { "epoch": 0.7033216925201007, "grad_norm": 6.076640488626101, "learning_rate": 1.0680797035016264e-06, "loss": 0.657, "step": 8660 }, { "epoch": 0.7034029074961423, "grad_norm": 6.2346543123925136, "learning_rate": 1.0675406886825065e-06, "loss": 0.5554, "step": 8661 }, { "epoch": 0.7034841224721838, "grad_norm": 4.087802951829346, "learning_rate": 1.0670017729819911e-06, "loss": 0.6035, "step": 8662 }, { "epoch": 0.7035653374482255, "grad_norm": 4.8340326949666, "learning_rate": 1.066462956437369e-06, "loss": 0.4762, "step": 8663 }, { "epoch": 0.703646552424267, "grad_norm": 5.486314431686029, "learning_rate": 1.0659242390859224e-06, "loss": 0.474, "step": 8664 }, { "epoch": 0.7037277674003086, "grad_norm": 5.968816148722602, "learning_rate": 1.0653856209649297e-06, "loss": 0.4181, "step": 8665 }, { "epoch": 0.7038089823763501, "grad_norm": 4.467813005237297, "learning_rate": 1.0648471021116584e-06, "loss": 0.5303, "step": 8666 }, { "epoch": 0.7038901973523918, "grad_norm": 3.4998502366425757, "learning_rate": 1.0643086825633723e-06, "loss": 0.5941, "step": 8667 }, { "epoch": 0.7039714123284334, "grad_norm": 3.138792298225771, "learning_rate": 1.0637703623573278e-06, "loss": 0.6115, "step": 8668 }, { "epoch": 0.7040526273044749, "grad_norm": 4.420119383519892, "learning_rate": 1.0632321415307734e-06, "loss": 0.4483, "step": 8669 }, { "epoch": 0.7041338422805166, "grad_norm": 5.331801055403152, "learning_rate": 1.0626940201209497e-06, "loss": 0.3817, "step": 8670 }, { "epoch": 0.7042150572565581, "grad_norm": 5.750793644056218, "learning_rate": 1.062155998165094e-06, "loss": 0.5817, "step": 8671 }, { "epoch": 0.7042962722325997, "grad_norm": 3.4756270753695104, "learning_rate": 1.0616180757004333e-06, "loss": 0.6701, "step": 8672 }, { "epoch": 0.7043774872086412, "grad_norm": 6.270791519856763, "learning_rate": 1.0610802527641883e-06, "loss": 0.4016, "step": 8673 }, { "epoch": 0.7044587021846829, "grad_norm": 5.4889815502837775, "learning_rate": 1.0605425293935748e-06, "loss": 0.431, "step": 8674 }, { "epoch": 0.7045399171607244, "grad_norm": 4.255319581514907, "learning_rate": 1.0600049056258008e-06, "loss": 0.623, "step": 8675 }, { "epoch": 0.704621132136766, "grad_norm": 4.619912538008321, "learning_rate": 1.0594673814980652e-06, "loss": 0.4224, "step": 8676 }, { "epoch": 0.7047023471128075, "grad_norm": 4.4624099848947445, "learning_rate": 1.058929957047564e-06, "loss": 0.4773, "step": 8677 }, { "epoch": 0.7047835620888492, "grad_norm": 3.568435548062833, "learning_rate": 1.0583926323114829e-06, "loss": 0.5004, "step": 8678 }, { "epoch": 0.7048647770648908, "grad_norm": 5.199277112036462, "learning_rate": 1.057855407327001e-06, "loss": 0.5205, "step": 8679 }, { "epoch": 0.7049459920409323, "grad_norm": 4.806067495687931, "learning_rate": 1.0573182821312927e-06, "loss": 0.4348, "step": 8680 }, { "epoch": 0.705027207016974, "grad_norm": 4.581743151795791, "learning_rate": 1.056781256761525e-06, "loss": 0.5282, "step": 8681 }, { "epoch": 0.7051084219930155, "grad_norm": 4.187102501676517, "learning_rate": 1.0562443312548558e-06, "loss": 0.3811, "step": 8682 }, { "epoch": 0.7051896369690571, "grad_norm": 10.94900482763677, "learning_rate": 1.0557075056484373e-06, "loss": 0.4645, "step": 8683 }, { "epoch": 0.7052708519450986, "grad_norm": 3.286189799041891, "learning_rate": 1.0551707799794164e-06, "loss": 0.515, "step": 8684 }, { "epoch": 0.7053520669211403, "grad_norm": 5.1687388035544535, "learning_rate": 1.054634154284931e-06, "loss": 0.3609, "step": 8685 }, { "epoch": 0.7054332818971818, "grad_norm": 6.765257196850052, "learning_rate": 1.0540976286021115e-06, "loss": 0.5426, "step": 8686 }, { "epoch": 0.7055144968732234, "grad_norm": 5.1893140007432885, "learning_rate": 1.053561202968084e-06, "loss": 0.4086, "step": 8687 }, { "epoch": 0.705595711849265, "grad_norm": 4.075572539458918, "learning_rate": 1.053024877419967e-06, "loss": 0.4706, "step": 8688 }, { "epoch": 0.7056769268253066, "grad_norm": 5.198722303974706, "learning_rate": 1.0524886519948693e-06, "loss": 0.5246, "step": 8689 }, { "epoch": 0.7057581418013482, "grad_norm": 5.742252264604989, "learning_rate": 1.0519525267298972e-06, "loss": 0.3365, "step": 8690 }, { "epoch": 0.7058393567773897, "grad_norm": 4.08553333166632, "learning_rate": 1.0514165016621464e-06, "loss": 0.6085, "step": 8691 }, { "epoch": 0.7059205717534314, "grad_norm": 5.7021732693041995, "learning_rate": 1.0508805768287061e-06, "loss": 0.5561, "step": 8692 }, { "epoch": 0.7060017867294729, "grad_norm": 11.500144742392935, "learning_rate": 1.050344752266661e-06, "loss": 0.4041, "step": 8693 }, { "epoch": 0.7060830017055145, "grad_norm": 5.728188208611933, "learning_rate": 1.0498090280130873e-06, "loss": 0.5148, "step": 8694 }, { "epoch": 0.706164216681556, "grad_norm": 3.8194604173660442, "learning_rate": 1.0492734041050532e-06, "loss": 0.5706, "step": 8695 }, { "epoch": 0.7062454316575977, "grad_norm": 4.238106988426028, "learning_rate": 1.0487378805796225e-06, "loss": 0.4247, "step": 8696 }, { "epoch": 0.7063266466336392, "grad_norm": 4.981104011286187, "learning_rate": 1.0482024574738498e-06, "loss": 0.4802, "step": 8697 }, { "epoch": 0.7064078616096808, "grad_norm": 5.353614712971508, "learning_rate": 1.0476671348247834e-06, "loss": 0.3993, "step": 8698 }, { "epoch": 0.7064890765857224, "grad_norm": 5.533743790772518, "learning_rate": 1.047131912669464e-06, "loss": 0.391, "step": 8699 }, { "epoch": 0.706570291561764, "grad_norm": 5.99441767333963, "learning_rate": 1.0465967910449274e-06, "loss": 0.4343, "step": 8700 }, { "epoch": 0.7066515065378056, "grad_norm": 10.472893307184739, "learning_rate": 1.046061769988201e-06, "loss": 0.3472, "step": 8701 }, { "epoch": 0.7067327215138471, "grad_norm": 6.614917885036994, "learning_rate": 1.045526849536305e-06, "loss": 0.3806, "step": 8702 }, { "epoch": 0.7068139364898888, "grad_norm": 6.161723653557723, "learning_rate": 1.0449920297262542e-06, "loss": 0.5681, "step": 8703 }, { "epoch": 0.7068951514659303, "grad_norm": 4.341342853033297, "learning_rate": 1.0444573105950543e-06, "loss": 0.4967, "step": 8704 }, { "epoch": 0.7069763664419719, "grad_norm": 3.964088413604789, "learning_rate": 1.0439226921797042e-06, "loss": 0.5641, "step": 8705 }, { "epoch": 0.7070575814180134, "grad_norm": 7.436857002923769, "learning_rate": 1.0433881745171976e-06, "loss": 0.3931, "step": 8706 }, { "epoch": 0.7071387963940551, "grad_norm": 21.38007720377836, "learning_rate": 1.042853757644521e-06, "loss": 0.5358, "step": 8707 }, { "epoch": 0.7072200113700966, "grad_norm": 9.571044595510465, "learning_rate": 1.0423194415986518e-06, "loss": 0.394, "step": 8708 }, { "epoch": 0.7073012263461382, "grad_norm": 3.4522432755903045, "learning_rate": 1.0417852264165637e-06, "loss": 0.7711, "step": 8709 }, { "epoch": 0.7073824413221798, "grad_norm": 4.437337860525526, "learning_rate": 1.0412511121352201e-06, "loss": 0.5022, "step": 8710 }, { "epoch": 0.7074636562982214, "grad_norm": 4.832102307573993, "learning_rate": 1.0407170987915786e-06, "loss": 0.3741, "step": 8711 }, { "epoch": 0.707544871274263, "grad_norm": 4.424143419021889, "learning_rate": 1.0401831864225915e-06, "loss": 0.4477, "step": 8712 }, { "epoch": 0.7076260862503045, "grad_norm": 4.167265583846372, "learning_rate": 1.0396493750652008e-06, "loss": 0.6794, "step": 8713 }, { "epoch": 0.7077073012263462, "grad_norm": 5.174243065313732, "learning_rate": 1.039115664756345e-06, "loss": 0.4642, "step": 8714 }, { "epoch": 0.7077885162023877, "grad_norm": 4.436027938464365, "learning_rate": 1.0385820555329543e-06, "loss": 0.5378, "step": 8715 }, { "epoch": 0.7078697311784293, "grad_norm": 6.048625139053751, "learning_rate": 1.0380485474319507e-06, "loss": 0.4807, "step": 8716 }, { "epoch": 0.7079509461544709, "grad_norm": 4.409054430942545, "learning_rate": 1.0375151404902507e-06, "loss": 0.4596, "step": 8717 }, { "epoch": 0.7080321611305125, "grad_norm": 6.231874069745442, "learning_rate": 1.0369818347447617e-06, "loss": 0.396, "step": 8718 }, { "epoch": 0.708113376106554, "grad_norm": 6.571192019178439, "learning_rate": 1.0364486302323868e-06, "loss": 0.6371, "step": 8719 }, { "epoch": 0.7081945910825956, "grad_norm": 6.2588184473383555, "learning_rate": 1.035915526990022e-06, "loss": 0.4857, "step": 8720 }, { "epoch": 0.7082758060586372, "grad_norm": 4.086683961249212, "learning_rate": 1.0353825250545533e-06, "loss": 0.4409, "step": 8721 }, { "epoch": 0.7083570210346788, "grad_norm": 4.896811289532637, "learning_rate": 1.0348496244628633e-06, "loss": 0.4143, "step": 8722 }, { "epoch": 0.7084382360107204, "grad_norm": 6.411254993382916, "learning_rate": 1.0343168252518252e-06, "loss": 0.4197, "step": 8723 }, { "epoch": 0.708519450986762, "grad_norm": 6.5354079259733915, "learning_rate": 1.0337841274583046e-06, "loss": 0.6165, "step": 8724 }, { "epoch": 0.7086006659628036, "grad_norm": 5.167176814080369, "learning_rate": 1.0332515311191627e-06, "loss": 0.5354, "step": 8725 }, { "epoch": 0.7086818809388451, "grad_norm": 7.4442577463893835, "learning_rate": 1.032719036271253e-06, "loss": 0.5054, "step": 8726 }, { "epoch": 0.7087630959148867, "grad_norm": 4.039848430781059, "learning_rate": 1.0321866429514199e-06, "loss": 0.5241, "step": 8727 }, { "epoch": 0.7088443108909283, "grad_norm": 3.2336915820436736, "learning_rate": 1.0316543511965035e-06, "loss": 0.5754, "step": 8728 }, { "epoch": 0.7089255258669699, "grad_norm": 4.7161410228484195, "learning_rate": 1.031122161043335e-06, "loss": 0.377, "step": 8729 }, { "epoch": 0.7090067408430114, "grad_norm": 3.6485307474977624, "learning_rate": 1.030590072528738e-06, "loss": 0.4437, "step": 8730 }, { "epoch": 0.709087955819053, "grad_norm": 4.868323212713634, "learning_rate": 1.030058085689532e-06, "loss": 0.4561, "step": 8731 }, { "epoch": 0.7091691707950946, "grad_norm": 6.225865954562011, "learning_rate": 1.0295262005625262e-06, "loss": 0.413, "step": 8732 }, { "epoch": 0.7092503857711362, "grad_norm": 4.393355867203619, "learning_rate": 1.028994417184525e-06, "loss": 0.5137, "step": 8733 }, { "epoch": 0.7093316007471778, "grad_norm": 4.03858492787502, "learning_rate": 1.0284627355923257e-06, "loss": 0.5499, "step": 8734 }, { "epoch": 0.7094128157232193, "grad_norm": 7.2081095740889225, "learning_rate": 1.0279311558227174e-06, "loss": 0.4145, "step": 8735 }, { "epoch": 0.709494030699261, "grad_norm": 4.52684696449661, "learning_rate": 1.027399677912482e-06, "loss": 0.4802, "step": 8736 }, { "epoch": 0.7095752456753025, "grad_norm": 4.627236682606212, "learning_rate": 1.0268683018983944e-06, "loss": 0.4124, "step": 8737 }, { "epoch": 0.7096564606513441, "grad_norm": 4.60387331373005, "learning_rate": 1.026337027817224e-06, "loss": 0.4463, "step": 8738 }, { "epoch": 0.7097376756273857, "grad_norm": 4.049679102453057, "learning_rate": 1.0258058557057328e-06, "loss": 0.4391, "step": 8739 }, { "epoch": 0.7098188906034273, "grad_norm": 3.932889064119171, "learning_rate": 1.0252747856006735e-06, "loss": 0.4598, "step": 8740 }, { "epoch": 0.7099001055794688, "grad_norm": 5.445177988596398, "learning_rate": 1.0247438175387946e-06, "loss": 0.3975, "step": 8741 }, { "epoch": 0.7099813205555104, "grad_norm": 4.336033272668492, "learning_rate": 1.0242129515568364e-06, "loss": 0.6375, "step": 8742 }, { "epoch": 0.710062535531552, "grad_norm": 5.942395150551147, "learning_rate": 1.0236821876915303e-06, "loss": 0.5762, "step": 8743 }, { "epoch": 0.7101437505075936, "grad_norm": 7.955186448658773, "learning_rate": 1.0231515259796046e-06, "loss": 0.4297, "step": 8744 }, { "epoch": 0.7102249654836352, "grad_norm": 4.589645135923486, "learning_rate": 1.022620966457776e-06, "loss": 0.5287, "step": 8745 }, { "epoch": 0.7103061804596768, "grad_norm": 10.369934951756198, "learning_rate": 1.0220905091627581e-06, "loss": 0.3939, "step": 8746 }, { "epoch": 0.7103873954357184, "grad_norm": 4.830026444654436, "learning_rate": 1.0215601541312556e-06, "loss": 0.4922, "step": 8747 }, { "epoch": 0.7104686104117599, "grad_norm": 4.89390413686288, "learning_rate": 1.0210299013999662e-06, "loss": 0.4854, "step": 8748 }, { "epoch": 0.7105498253878015, "grad_norm": 5.6081171633262015, "learning_rate": 1.0204997510055793e-06, "loss": 0.4383, "step": 8749 }, { "epoch": 0.7106310403638431, "grad_norm": 7.598002540248723, "learning_rate": 1.0199697029847804e-06, "loss": 0.5754, "step": 8750 }, { "epoch": 0.7107122553398847, "grad_norm": 6.496257353872819, "learning_rate": 1.0194397573742442e-06, "loss": 0.3061, "step": 8751 }, { "epoch": 0.7107934703159262, "grad_norm": 7.92240606328125, "learning_rate": 1.0189099142106421e-06, "loss": 0.4262, "step": 8752 }, { "epoch": 0.7108746852919678, "grad_norm": 27.70458593561067, "learning_rate": 1.0183801735306342e-06, "loss": 0.4334, "step": 8753 }, { "epoch": 0.7109559002680094, "grad_norm": 4.449530450041586, "learning_rate": 1.0178505353708779e-06, "loss": 0.5682, "step": 8754 }, { "epoch": 0.711037115244051, "grad_norm": 5.978122767767535, "learning_rate": 1.0173209997680203e-06, "loss": 0.5643, "step": 8755 }, { "epoch": 0.7111183302200926, "grad_norm": 4.850468611110746, "learning_rate": 1.0167915667587019e-06, "loss": 0.47, "step": 8756 }, { "epoch": 0.7111995451961342, "grad_norm": 4.3524338310305835, "learning_rate": 1.016262236379558e-06, "loss": 0.5099, "step": 8757 }, { "epoch": 0.7112807601721758, "grad_norm": 6.685288002285286, "learning_rate": 1.015733008667214e-06, "loss": 0.6591, "step": 8758 }, { "epoch": 0.7113619751482173, "grad_norm": 7.509285220548554, "learning_rate": 1.0152038836582903e-06, "loss": 0.5526, "step": 8759 }, { "epoch": 0.7114431901242589, "grad_norm": 3.6292963710683366, "learning_rate": 1.0146748613894005e-06, "loss": 0.4786, "step": 8760 }, { "epoch": 0.7115244051003005, "grad_norm": 4.730030440457719, "learning_rate": 1.0141459418971496e-06, "loss": 0.409, "step": 8761 }, { "epoch": 0.7116056200763421, "grad_norm": 3.5510572572185977, "learning_rate": 1.0136171252181348e-06, "loss": 0.5859, "step": 8762 }, { "epoch": 0.7116868350523836, "grad_norm": 7.350261219519113, "learning_rate": 1.0130884113889491e-06, "loss": 0.4996, "step": 8763 }, { "epoch": 0.7117680500284252, "grad_norm": 4.1865654744906635, "learning_rate": 1.0125598004461752e-06, "loss": 0.395, "step": 8764 }, { "epoch": 0.7118492650044668, "grad_norm": 8.196706389670227, "learning_rate": 1.012031292426391e-06, "loss": 0.4005, "step": 8765 }, { "epoch": 0.7119304799805084, "grad_norm": 5.9136835481706855, "learning_rate": 1.011502887366167e-06, "loss": 0.4292, "step": 8766 }, { "epoch": 0.71201169495655, "grad_norm": 5.09214562271127, "learning_rate": 1.0109745853020655e-06, "loss": 0.4259, "step": 8767 }, { "epoch": 0.7120929099325916, "grad_norm": 8.352854444325088, "learning_rate": 1.0104463862706414e-06, "loss": 0.3907, "step": 8768 }, { "epoch": 0.7121741249086332, "grad_norm": 4.539279498681904, "learning_rate": 1.0099182903084448e-06, "loss": 0.5434, "step": 8769 }, { "epoch": 0.7122553398846747, "grad_norm": 6.108837746410746, "learning_rate": 1.0093902974520165e-06, "loss": 0.4994, "step": 8770 }, { "epoch": 0.7123365548607163, "grad_norm": 6.827549453211144, "learning_rate": 1.0088624077378897e-06, "loss": 0.4736, "step": 8771 }, { "epoch": 0.7124177698367579, "grad_norm": 5.0626039477295315, "learning_rate": 1.0083346212025923e-06, "loss": 0.6442, "step": 8772 }, { "epoch": 0.7124989848127995, "grad_norm": 4.104258023545044, "learning_rate": 1.0078069378826458e-06, "loss": 0.579, "step": 8773 }, { "epoch": 0.712580199788841, "grad_norm": 3.7706839195991297, "learning_rate": 1.0072793578145618e-06, "loss": 0.5327, "step": 8774 }, { "epoch": 0.7126614147648827, "grad_norm": 7.155111652149057, "learning_rate": 1.0067518810348453e-06, "loss": 0.5282, "step": 8775 }, { "epoch": 0.7127426297409242, "grad_norm": 7.637395079270046, "learning_rate": 1.0062245075799966e-06, "loss": 0.5045, "step": 8776 }, { "epoch": 0.7128238447169658, "grad_norm": 6.413393628245962, "learning_rate": 1.0056972374865054e-06, "loss": 0.6264, "step": 8777 }, { "epoch": 0.7129050596930074, "grad_norm": 5.7064526408846135, "learning_rate": 1.0051700707908569e-06, "loss": 0.7116, "step": 8778 }, { "epoch": 0.712986274669049, "grad_norm": 5.902226546762958, "learning_rate": 1.0046430075295287e-06, "loss": 0.6247, "step": 8779 }, { "epoch": 0.7130674896450906, "grad_norm": 5.831122076718099, "learning_rate": 1.0041160477389909e-06, "loss": 0.4892, "step": 8780 }, { "epoch": 0.7131487046211321, "grad_norm": 5.215468338547022, "learning_rate": 1.0035891914557044e-06, "loss": 0.5625, "step": 8781 }, { "epoch": 0.7132299195971737, "grad_norm": 7.447326637422664, "learning_rate": 1.0030624387161273e-06, "loss": 0.4898, "step": 8782 }, { "epoch": 0.7133111345732153, "grad_norm": 4.541675365615177, "learning_rate": 1.002535789556707e-06, "loss": 0.5263, "step": 8783 }, { "epoch": 0.7133923495492569, "grad_norm": 5.008687031433551, "learning_rate": 1.0020092440138833e-06, "loss": 0.57, "step": 8784 }, { "epoch": 0.7134735645252984, "grad_norm": 4.218282129108528, "learning_rate": 1.0014828021240932e-06, "loss": 0.3873, "step": 8785 }, { "epoch": 0.71355477950134, "grad_norm": 7.9910976481847875, "learning_rate": 1.0009564639237627e-06, "loss": 0.3644, "step": 8786 }, { "epoch": 0.7136359944773816, "grad_norm": 4.348847062904255, "learning_rate": 1.0004302294493104e-06, "loss": 0.4983, "step": 8787 }, { "epoch": 0.7137172094534232, "grad_norm": 6.4121947565646815, "learning_rate": 9.999040987371505e-07, "loss": 0.4464, "step": 8788 }, { "epoch": 0.7137984244294648, "grad_norm": 4.829469885568656, "learning_rate": 9.993780718236882e-07, "loss": 0.3603, "step": 8789 }, { "epoch": 0.7138796394055064, "grad_norm": 3.668444891519201, "learning_rate": 9.988521487453203e-07, "loss": 0.5219, "step": 8790 }, { "epoch": 0.713960854381548, "grad_norm": 4.4085220444412245, "learning_rate": 9.98326329538439e-07, "loss": 0.48, "step": 8791 }, { "epoch": 0.7140420693575895, "grad_norm": 6.942288338011259, "learning_rate": 9.978006142394292e-07, "loss": 0.6356, "step": 8792 }, { "epoch": 0.7141232843336311, "grad_norm": 3.9011079567183353, "learning_rate": 9.972750028846665e-07, "loss": 0.4649, "step": 8793 }, { "epoch": 0.7142044993096727, "grad_norm": 5.691178624558273, "learning_rate": 9.967494955105197e-07, "loss": 0.5606, "step": 8794 }, { "epoch": 0.7142857142857143, "grad_norm": 4.779174640506254, "learning_rate": 9.962240921533528e-07, "loss": 0.4898, "step": 8795 }, { "epoch": 0.7143669292617558, "grad_norm": 5.8645704612877445, "learning_rate": 9.956987928495193e-07, "loss": 0.4236, "step": 8796 }, { "epoch": 0.7144481442377975, "grad_norm": 4.402062224686781, "learning_rate": 9.951735976353677e-07, "loss": 0.4852, "step": 8797 }, { "epoch": 0.714529359213839, "grad_norm": 5.0026387695604875, "learning_rate": 9.946485065472402e-07, "loss": 0.357, "step": 8798 }, { "epoch": 0.7146105741898806, "grad_norm": 7.498003830377879, "learning_rate": 9.941235196214687e-07, "loss": 0.5172, "step": 8799 }, { "epoch": 0.7146917891659222, "grad_norm": 3.3478530645853675, "learning_rate": 9.935986368943796e-07, "loss": 0.4808, "step": 8800 }, { "epoch": 0.7147730041419638, "grad_norm": 6.779585993646702, "learning_rate": 9.930738584022925e-07, "loss": 0.5855, "step": 8801 }, { "epoch": 0.7148542191180054, "grad_norm": 4.424567878048419, "learning_rate": 9.925491841815197e-07, "loss": 0.4775, "step": 8802 }, { "epoch": 0.7149354340940469, "grad_norm": 4.168929836677361, "learning_rate": 9.92024614268364e-07, "loss": 0.611, "step": 8803 }, { "epoch": 0.7150166490700886, "grad_norm": 4.398671369510578, "learning_rate": 9.915001486991243e-07, "loss": 0.6159, "step": 8804 }, { "epoch": 0.7150978640461301, "grad_norm": 4.041777750287028, "learning_rate": 9.909757875100914e-07, "loss": 0.4175, "step": 8805 }, { "epoch": 0.7151790790221717, "grad_norm": 7.847660489045761, "learning_rate": 9.904515307375478e-07, "loss": 0.5913, "step": 8806 }, { "epoch": 0.7152602939982132, "grad_norm": 4.250630000320927, "learning_rate": 9.899273784177681e-07, "loss": 0.3363, "step": 8807 }, { "epoch": 0.7153415089742549, "grad_norm": 8.197363212404545, "learning_rate": 9.894033305870229e-07, "loss": 0.4725, "step": 8808 }, { "epoch": 0.7154227239502964, "grad_norm": 3.91482587309766, "learning_rate": 9.888793872815716e-07, "loss": 0.4924, "step": 8809 }, { "epoch": 0.715503938926338, "grad_norm": 4.933671181822034, "learning_rate": 9.883555485376688e-07, "loss": 0.6437, "step": 8810 }, { "epoch": 0.7155851539023796, "grad_norm": 5.499179017080881, "learning_rate": 9.878318143915633e-07, "loss": 0.3706, "step": 8811 }, { "epoch": 0.7156663688784212, "grad_norm": 7.743811775235963, "learning_rate": 9.873081848794926e-07, "loss": 0.4138, "step": 8812 }, { "epoch": 0.7157475838544628, "grad_norm": 4.332868158341539, "learning_rate": 9.867846600376892e-07, "loss": 0.5267, "step": 8813 }, { "epoch": 0.7158287988305043, "grad_norm": 4.072077480048703, "learning_rate": 9.862612399023797e-07, "loss": 0.6121, "step": 8814 }, { "epoch": 0.715910013806546, "grad_norm": 6.275203078249558, "learning_rate": 9.85737924509781e-07, "loss": 0.4842, "step": 8815 }, { "epoch": 0.7159912287825875, "grad_norm": 6.229912521186827, "learning_rate": 9.852147138961026e-07, "loss": 0.3927, "step": 8816 }, { "epoch": 0.7160724437586291, "grad_norm": 5.84959994620228, "learning_rate": 9.846916080975493e-07, "loss": 0.4515, "step": 8817 }, { "epoch": 0.7161536587346706, "grad_norm": 8.565273476837879, "learning_rate": 9.841686071503178e-07, "loss": 0.4579, "step": 8818 }, { "epoch": 0.7162348737107123, "grad_norm": 5.7709811538417854, "learning_rate": 9.836457110905956e-07, "loss": 0.7484, "step": 8819 }, { "epoch": 0.7163160886867538, "grad_norm": 5.3290551767188035, "learning_rate": 9.831229199545659e-07, "loss": 0.3563, "step": 8820 }, { "epoch": 0.7163973036627954, "grad_norm": 6.90902242367057, "learning_rate": 9.82600233778402e-07, "loss": 0.5667, "step": 8821 }, { "epoch": 0.716478518638837, "grad_norm": 5.0511931716125105, "learning_rate": 9.820776525982703e-07, "loss": 0.4209, "step": 8822 }, { "epoch": 0.7165597336148786, "grad_norm": 4.8369460147576095, "learning_rate": 9.815551764503317e-07, "loss": 0.4668, "step": 8823 }, { "epoch": 0.7166409485909202, "grad_norm": 4.486543984473168, "learning_rate": 9.810328053707394e-07, "loss": 0.8039, "step": 8824 }, { "epoch": 0.7167221635669617, "grad_norm": 4.014813431510159, "learning_rate": 9.805105393956378e-07, "loss": 0.471, "step": 8825 }, { "epoch": 0.7168033785430034, "grad_norm": 7.519778869225568, "learning_rate": 9.799883785611647e-07, "loss": 0.4735, "step": 8826 }, { "epoch": 0.7168845935190449, "grad_norm": 5.5537814970796315, "learning_rate": 9.794663229034518e-07, "loss": 0.4381, "step": 8827 }, { "epoch": 0.7169658084950865, "grad_norm": 19.344716592490663, "learning_rate": 9.78944372458622e-07, "loss": 0.4718, "step": 8828 }, { "epoch": 0.717047023471128, "grad_norm": 5.884182711703903, "learning_rate": 9.784225272627908e-07, "loss": 0.5988, "step": 8829 }, { "epoch": 0.7171282384471697, "grad_norm": 4.891986811172294, "learning_rate": 9.77900787352068e-07, "loss": 0.3812, "step": 8830 }, { "epoch": 0.7172094534232112, "grad_norm": 10.276801237301445, "learning_rate": 9.773791527625557e-07, "loss": 0.6522, "step": 8831 }, { "epoch": 0.7172906683992528, "grad_norm": 6.1468764794545505, "learning_rate": 9.76857623530347e-07, "loss": 0.5199, "step": 8832 }, { "epoch": 0.7173718833752945, "grad_norm": 3.8035905037114097, "learning_rate": 9.763361996915302e-07, "loss": 0.5275, "step": 8833 }, { "epoch": 0.717453098351336, "grad_norm": 6.897927489006048, "learning_rate": 9.75814881282185e-07, "loss": 0.5616, "step": 8834 }, { "epoch": 0.7175343133273776, "grad_norm": 3.453903520221075, "learning_rate": 9.752936683383822e-07, "loss": 0.4339, "step": 8835 }, { "epoch": 0.7176155283034191, "grad_norm": 4.99189702123876, "learning_rate": 9.747725608961881e-07, "loss": 0.5881, "step": 8836 }, { "epoch": 0.7176967432794608, "grad_norm": 5.00035982299125, "learning_rate": 9.742515589916615e-07, "loss": 0.4863, "step": 8837 }, { "epoch": 0.7177779582555023, "grad_norm": 4.906966447423704, "learning_rate": 9.737306626608514e-07, "loss": 0.5093, "step": 8838 }, { "epoch": 0.7178591732315439, "grad_norm": 5.968171309664626, "learning_rate": 9.732098719398025e-07, "loss": 0.5323, "step": 8839 }, { "epoch": 0.7179403882075854, "grad_norm": 4.720388953914238, "learning_rate": 9.726891868645502e-07, "loss": 0.4468, "step": 8840 }, { "epoch": 0.7180216031836271, "grad_norm": 3.842805699155025, "learning_rate": 9.721686074711228e-07, "loss": 0.5385, "step": 8841 }, { "epoch": 0.7181028181596686, "grad_norm": 9.473267130229601, "learning_rate": 9.716481337955411e-07, "loss": 0.4456, "step": 8842 }, { "epoch": 0.7181840331357102, "grad_norm": 5.317653998758775, "learning_rate": 9.711277658738197e-07, "loss": 0.6716, "step": 8843 }, { "epoch": 0.7182652481117519, "grad_norm": 4.085741117429908, "learning_rate": 9.706075037419666e-07, "loss": 0.553, "step": 8844 }, { "epoch": 0.7183464630877934, "grad_norm": 21.53419743501485, "learning_rate": 9.700873474359786e-07, "loss": 0.4472, "step": 8845 }, { "epoch": 0.718427678063835, "grad_norm": 6.994061607271889, "learning_rate": 9.695672969918508e-07, "loss": 0.4094, "step": 8846 }, { "epoch": 0.7185088930398765, "grad_norm": 4.554212460035874, "learning_rate": 9.69047352445566e-07, "loss": 0.4225, "step": 8847 }, { "epoch": 0.7185901080159182, "grad_norm": 7.821491253661923, "learning_rate": 9.68527513833101e-07, "loss": 0.4877, "step": 8848 }, { "epoch": 0.7186713229919597, "grad_norm": 3.1283429984031783, "learning_rate": 9.68007781190427e-07, "loss": 0.3561, "step": 8849 }, { "epoch": 0.7187525379680013, "grad_norm": 5.628658133939152, "learning_rate": 9.674881545535073e-07, "loss": 0.5745, "step": 8850 }, { "epoch": 0.7188337529440428, "grad_norm": 11.726251049043716, "learning_rate": 9.669686339582959e-07, "loss": 0.6452, "step": 8851 }, { "epoch": 0.7189149679200845, "grad_norm": 4.111507695592982, "learning_rate": 9.664492194407425e-07, "loss": 0.559, "step": 8852 }, { "epoch": 0.718996182896126, "grad_norm": 6.507482959837951, "learning_rate": 9.659299110367868e-07, "loss": 0.4079, "step": 8853 }, { "epoch": 0.7190773978721676, "grad_norm": 6.32505374062163, "learning_rate": 9.654107087823613e-07, "loss": 0.6939, "step": 8854 }, { "epoch": 0.7191586128482093, "grad_norm": 7.74320454513088, "learning_rate": 9.64891612713393e-07, "loss": 0.6168, "step": 8855 }, { "epoch": 0.7192398278242508, "grad_norm": 9.529588740791155, "learning_rate": 9.643726228658017e-07, "loss": 0.4821, "step": 8856 }, { "epoch": 0.7193210428002924, "grad_norm": 5.534280027657455, "learning_rate": 9.638537392754968e-07, "loss": 0.4415, "step": 8857 }, { "epoch": 0.7194022577763339, "grad_norm": 5.287392525426979, "learning_rate": 9.63334961978384e-07, "loss": 0.4648, "step": 8858 }, { "epoch": 0.7194834727523756, "grad_norm": 5.411950885514694, "learning_rate": 9.628162910103595e-07, "loss": 0.4528, "step": 8859 }, { "epoch": 0.7195646877284171, "grad_norm": 5.234971772019722, "learning_rate": 9.62297726407312e-07, "loss": 0.5357, "step": 8860 }, { "epoch": 0.7196459027044587, "grad_norm": 3.5786127468446147, "learning_rate": 9.617792682051228e-07, "loss": 0.4252, "step": 8861 }, { "epoch": 0.7197271176805002, "grad_norm": 4.004340303264486, "learning_rate": 9.612609164396672e-07, "loss": 0.4677, "step": 8862 }, { "epoch": 0.7198083326565419, "grad_norm": 5.758524558297638, "learning_rate": 9.607426711468135e-07, "loss": 0.4476, "step": 8863 }, { "epoch": 0.7198895476325834, "grad_norm": 8.535570347156773, "learning_rate": 9.602245323624195e-07, "loss": 0.4546, "step": 8864 }, { "epoch": 0.719970762608625, "grad_norm": 4.001330283022169, "learning_rate": 9.597065001223397e-07, "loss": 0.4607, "step": 8865 }, { "epoch": 0.7200519775846667, "grad_norm": 4.377728948837983, "learning_rate": 9.591885744624183e-07, "loss": 0.3572, "step": 8866 }, { "epoch": 0.7201331925607082, "grad_norm": 3.8588903316964442, "learning_rate": 9.586707554184918e-07, "loss": 0.4552, "step": 8867 }, { "epoch": 0.7202144075367498, "grad_norm": 6.457542883089068, "learning_rate": 9.581530430263919e-07, "loss": 0.4036, "step": 8868 }, { "epoch": 0.7202956225127913, "grad_norm": 6.505717474355659, "learning_rate": 9.57635437321942e-07, "loss": 0.2751, "step": 8869 }, { "epoch": 0.720376837488833, "grad_norm": 7.655308168808172, "learning_rate": 9.571179383409561e-07, "loss": 0.4116, "step": 8870 }, { "epoch": 0.7204580524648745, "grad_norm": 3.1031621847917634, "learning_rate": 9.566005461192444e-07, "loss": 0.4976, "step": 8871 }, { "epoch": 0.7205392674409161, "grad_norm": 6.088083430309052, "learning_rate": 9.560832606926064e-07, "loss": 0.4635, "step": 8872 }, { "epoch": 0.7206204824169576, "grad_norm": 6.004449008237206, "learning_rate": 9.55566082096835e-07, "loss": 0.5112, "step": 8873 }, { "epoch": 0.7207016973929993, "grad_norm": 9.024846945792124, "learning_rate": 9.550490103677176e-07, "loss": 0.3386, "step": 8874 }, { "epoch": 0.7207829123690408, "grad_norm": 4.2699585973289915, "learning_rate": 9.54532045541031e-07, "loss": 0.4822, "step": 8875 }, { "epoch": 0.7208641273450824, "grad_norm": 4.631080829221704, "learning_rate": 9.54015187652548e-07, "loss": 0.5532, "step": 8876 }, { "epoch": 0.7209453423211241, "grad_norm": 6.8091971004552585, "learning_rate": 9.534984367380329e-07, "loss": 0.5322, "step": 8877 }, { "epoch": 0.7210265572971656, "grad_norm": 3.589150990709112, "learning_rate": 9.529817928332411e-07, "loss": 0.523, "step": 8878 }, { "epoch": 0.7211077722732072, "grad_norm": 8.421640744360598, "learning_rate": 9.524652559739217e-07, "loss": 0.3827, "step": 8879 }, { "epoch": 0.7211889872492487, "grad_norm": 5.462634830370036, "learning_rate": 9.519488261958157e-07, "loss": 0.4339, "step": 8880 }, { "epoch": 0.7212702022252904, "grad_norm": 6.652542616373296, "learning_rate": 9.514325035346577e-07, "loss": 0.5582, "step": 8881 }, { "epoch": 0.7213514172013319, "grad_norm": 11.329982918292163, "learning_rate": 9.509162880261757e-07, "loss": 0.4287, "step": 8882 }, { "epoch": 0.7214326321773735, "grad_norm": 4.346970981846385, "learning_rate": 9.504001797060875e-07, "loss": 0.3969, "step": 8883 }, { "epoch": 0.721513847153415, "grad_norm": 10.533676550458823, "learning_rate": 9.498841786101065e-07, "loss": 0.3896, "step": 8884 }, { "epoch": 0.7215950621294567, "grad_norm": 6.522166801681026, "learning_rate": 9.493682847739363e-07, "loss": 0.4312, "step": 8885 }, { "epoch": 0.7216762771054982, "grad_norm": 15.683352653559364, "learning_rate": 9.488524982332734e-07, "loss": 0.4908, "step": 8886 }, { "epoch": 0.7217574920815398, "grad_norm": 4.885082924071923, "learning_rate": 9.483368190238093e-07, "loss": 0.604, "step": 8887 }, { "epoch": 0.7218387070575815, "grad_norm": 6.215187035651405, "learning_rate": 9.478212471812242e-07, "loss": 0.6215, "step": 8888 }, { "epoch": 0.721919922033623, "grad_norm": 6.199409295607022, "learning_rate": 9.473057827411941e-07, "loss": 0.46, "step": 8889 }, { "epoch": 0.7220011370096646, "grad_norm": 5.210688836874202, "learning_rate": 9.467904257393873e-07, "loss": 0.4037, "step": 8890 }, { "epoch": 0.7220823519857061, "grad_norm": 5.580114042556584, "learning_rate": 9.462751762114625e-07, "loss": 0.5964, "step": 8891 }, { "epoch": 0.7221635669617478, "grad_norm": 5.621455969727306, "learning_rate": 9.45760034193072e-07, "loss": 0.5682, "step": 8892 }, { "epoch": 0.7222447819377893, "grad_norm": 11.91702063843931, "learning_rate": 9.45244999719862e-07, "loss": 0.3895, "step": 8893 }, { "epoch": 0.7223259969138309, "grad_norm": 4.231827622581398, "learning_rate": 9.447300728274689e-07, "loss": 0.3848, "step": 8894 }, { "epoch": 0.7224072118898724, "grad_norm": 4.914560874009442, "learning_rate": 9.442152535515245e-07, "loss": 0.4818, "step": 8895 }, { "epoch": 0.7224884268659141, "grad_norm": 3.3941269423081386, "learning_rate": 9.437005419276496e-07, "loss": 0.5125, "step": 8896 }, { "epoch": 0.7225696418419557, "grad_norm": 7.1114894868029515, "learning_rate": 9.431859379914615e-07, "loss": 0.5125, "step": 8897 }, { "epoch": 0.7226508568179972, "grad_norm": 7.590278377377921, "learning_rate": 9.426714417785673e-07, "loss": 0.4268, "step": 8898 }, { "epoch": 0.7227320717940389, "grad_norm": 8.118555760980742, "learning_rate": 9.421570533245663e-07, "loss": 0.6289, "step": 8899 }, { "epoch": 0.7228132867700804, "grad_norm": 4.8344990060798025, "learning_rate": 9.416427726650535e-07, "loss": 0.4768, "step": 8900 }, { "epoch": 0.722894501746122, "grad_norm": 3.812357829292625, "learning_rate": 9.411285998356124e-07, "loss": 0.588, "step": 8901 }, { "epoch": 0.7229757167221635, "grad_norm": 4.3624783009325805, "learning_rate": 9.406145348718218e-07, "loss": 0.6037, "step": 8902 }, { "epoch": 0.7230569316982052, "grad_norm": 7.735840837751049, "learning_rate": 9.401005778092537e-07, "loss": 0.5546, "step": 8903 }, { "epoch": 0.7231381466742467, "grad_norm": 4.2176648353846184, "learning_rate": 9.395867286834695e-07, "loss": 0.4084, "step": 8904 }, { "epoch": 0.7232193616502883, "grad_norm": 10.500833125072006, "learning_rate": 9.390729875300247e-07, "loss": 0.3893, "step": 8905 }, { "epoch": 0.7233005766263298, "grad_norm": 5.323683099575961, "learning_rate": 9.38559354384469e-07, "loss": 0.48, "step": 8906 }, { "epoch": 0.7233817916023715, "grad_norm": 4.623752481785489, "learning_rate": 9.38045829282341e-07, "loss": 0.6171, "step": 8907 }, { "epoch": 0.7234630065784131, "grad_norm": 7.855386690391926, "learning_rate": 9.375324122591753e-07, "loss": 0.5059, "step": 8908 }, { "epoch": 0.7235442215544546, "grad_norm": 4.500533084765437, "learning_rate": 9.370191033504982e-07, "loss": 0.5279, "step": 8909 }, { "epoch": 0.7236254365304963, "grad_norm": 7.958925600183833, "learning_rate": 9.365059025918274e-07, "loss": 0.4437, "step": 8910 }, { "epoch": 0.7237066515065378, "grad_norm": 3.95735207574078, "learning_rate": 9.359928100186724e-07, "loss": 0.5215, "step": 8911 }, { "epoch": 0.7237878664825794, "grad_norm": 11.501156203158045, "learning_rate": 9.354798256665384e-07, "loss": 0.474, "step": 8912 }, { "epoch": 0.7238690814586209, "grad_norm": 4.487504865241946, "learning_rate": 9.349669495709208e-07, "loss": 0.3921, "step": 8913 }, { "epoch": 0.7239502964346626, "grad_norm": 8.160345585558959, "learning_rate": 9.344541817673061e-07, "loss": 0.5789, "step": 8914 }, { "epoch": 0.7240315114107041, "grad_norm": 5.808899053157705, "learning_rate": 9.339415222911766e-07, "loss": 0.5123, "step": 8915 }, { "epoch": 0.7241127263867457, "grad_norm": 3.2770410237558165, "learning_rate": 9.334289711780062e-07, "loss": 0.7287, "step": 8916 }, { "epoch": 0.7241939413627873, "grad_norm": 6.106039871460762, "learning_rate": 9.329165284632602e-07, "loss": 0.4535, "step": 8917 }, { "epoch": 0.7242751563388289, "grad_norm": 4.5914206885164, "learning_rate": 9.324041941823961e-07, "loss": 0.4845, "step": 8918 }, { "epoch": 0.7243563713148705, "grad_norm": 8.623594737117811, "learning_rate": 9.318919683708661e-07, "loss": 0.5425, "step": 8919 }, { "epoch": 0.724437586290912, "grad_norm": 5.503813897391588, "learning_rate": 9.313798510641117e-07, "loss": 0.507, "step": 8920 }, { "epoch": 0.7245188012669537, "grad_norm": 27.071015075508583, "learning_rate": 9.308678422975701e-07, "loss": 0.4499, "step": 8921 }, { "epoch": 0.7246000162429952, "grad_norm": 5.264809337126668, "learning_rate": 9.303559421066699e-07, "loss": 0.4108, "step": 8922 }, { "epoch": 0.7246812312190368, "grad_norm": 4.739116238844419, "learning_rate": 9.298441505268316e-07, "loss": 0.459, "step": 8923 }, { "epoch": 0.7247624461950783, "grad_norm": 7.57503395698494, "learning_rate": 9.29332467593467e-07, "loss": 0.4787, "step": 8924 }, { "epoch": 0.72484366117112, "grad_norm": 7.320416604446665, "learning_rate": 9.28820893341984e-07, "loss": 0.4384, "step": 8925 }, { "epoch": 0.7249248761471615, "grad_norm": 8.330410195117771, "learning_rate": 9.28309427807779e-07, "loss": 0.5397, "step": 8926 }, { "epoch": 0.7250060911232031, "grad_norm": 8.653115313732869, "learning_rate": 9.277980710262432e-07, "loss": 0.5418, "step": 8927 }, { "epoch": 0.7250873060992447, "grad_norm": 5.712763483431208, "learning_rate": 9.272868230327614e-07, "loss": 0.4397, "step": 8928 }, { "epoch": 0.7251685210752863, "grad_norm": 7.20946070132443, "learning_rate": 9.267756838627079e-07, "loss": 0.619, "step": 8929 }, { "epoch": 0.7252497360513279, "grad_norm": 3.836932313032167, "learning_rate": 9.262646535514499e-07, "loss": 0.5411, "step": 8930 }, { "epoch": 0.7253309510273694, "grad_norm": 7.397925686014042, "learning_rate": 9.257537321343499e-07, "loss": 0.5804, "step": 8931 }, { "epoch": 0.7254121660034111, "grad_norm": 7.414052669733408, "learning_rate": 9.252429196467603e-07, "loss": 0.5053, "step": 8932 }, { "epoch": 0.7254933809794526, "grad_norm": 6.780735171626088, "learning_rate": 9.247322161240252e-07, "loss": 0.4038, "step": 8933 }, { "epoch": 0.7255745959554942, "grad_norm": 4.08445905007205, "learning_rate": 9.242216216014838e-07, "loss": 0.6769, "step": 8934 }, { "epoch": 0.7256558109315357, "grad_norm": 5.6918275065058275, "learning_rate": 9.237111361144674e-07, "loss": 0.3345, "step": 8935 }, { "epoch": 0.7257370259075774, "grad_norm": 5.320211088421992, "learning_rate": 9.232007596982978e-07, "loss": 0.5265, "step": 8936 }, { "epoch": 0.7258182408836189, "grad_norm": 5.5679637549611165, "learning_rate": 9.226904923882901e-07, "loss": 0.5231, "step": 8937 }, { "epoch": 0.7258994558596605, "grad_norm": 9.0059520761274, "learning_rate": 9.22180334219753e-07, "loss": 0.4251, "step": 8938 }, { "epoch": 0.7259806708357021, "grad_norm": 7.189840208919893, "learning_rate": 9.216702852279857e-07, "loss": 0.5836, "step": 8939 }, { "epoch": 0.7260618858117437, "grad_norm": 8.24631932538056, "learning_rate": 9.211603454482812e-07, "loss": 0.5427, "step": 8940 }, { "epoch": 0.7261431007877853, "grad_norm": 11.91801174885722, "learning_rate": 9.206505149159259e-07, "loss": 0.5369, "step": 8941 }, { "epoch": 0.7262243157638268, "grad_norm": 4.788846104575723, "learning_rate": 9.201407936661963e-07, "loss": 0.6227, "step": 8942 }, { "epoch": 0.7263055307398685, "grad_norm": 4.572117460437812, "learning_rate": 9.196311817343618e-07, "loss": 0.4761, "step": 8943 }, { "epoch": 0.72638674571591, "grad_norm": 6.432401948520424, "learning_rate": 9.191216791556864e-07, "loss": 0.494, "step": 8944 }, { "epoch": 0.7264679606919516, "grad_norm": 3.666688290179581, "learning_rate": 9.18612285965424e-07, "loss": 0.4724, "step": 8945 }, { "epoch": 0.7265491756679932, "grad_norm": 3.558045855054741, "learning_rate": 9.18103002198821e-07, "loss": 0.6309, "step": 8946 }, { "epoch": 0.7266303906440348, "grad_norm": 5.292568438307435, "learning_rate": 9.175938278911184e-07, "loss": 0.5463, "step": 8947 }, { "epoch": 0.7267116056200763, "grad_norm": 8.253942436158207, "learning_rate": 9.170847630775489e-07, "loss": 0.4128, "step": 8948 }, { "epoch": 0.7267928205961179, "grad_norm": 12.693589172331533, "learning_rate": 9.165758077933365e-07, "loss": 0.5094, "step": 8949 }, { "epoch": 0.7268740355721595, "grad_norm": 4.3211523742949876, "learning_rate": 9.160669620736973e-07, "loss": 0.5676, "step": 8950 }, { "epoch": 0.7269552505482011, "grad_norm": 11.192017775213207, "learning_rate": 9.15558225953842e-07, "loss": 0.5242, "step": 8951 }, { "epoch": 0.7270364655242427, "grad_norm": 25.015456026993192, "learning_rate": 9.150495994689712e-07, "loss": 0.4027, "step": 8952 }, { "epoch": 0.7271176805002842, "grad_norm": 5.152075474730206, "learning_rate": 9.145410826542797e-07, "loss": 0.4545, "step": 8953 }, { "epoch": 0.7271988954763259, "grad_norm": 6.684848268615437, "learning_rate": 9.140326755449555e-07, "loss": 0.4933, "step": 8954 }, { "epoch": 0.7272801104523674, "grad_norm": 5.088644961893354, "learning_rate": 9.135243781761763e-07, "loss": 0.4792, "step": 8955 }, { "epoch": 0.727361325428409, "grad_norm": 3.810037495766052, "learning_rate": 9.130161905831131e-07, "loss": 0.5671, "step": 8956 }, { "epoch": 0.7274425404044506, "grad_norm": 8.122117526021382, "learning_rate": 9.125081128009314e-07, "loss": 0.4428, "step": 8957 }, { "epoch": 0.7275237553804922, "grad_norm": 6.6487819565990325, "learning_rate": 9.120001448647867e-07, "loss": 0.4813, "step": 8958 }, { "epoch": 0.7276049703565337, "grad_norm": 6.992838856608024, "learning_rate": 9.114922868098267e-07, "loss": 0.4352, "step": 8959 }, { "epoch": 0.7276861853325753, "grad_norm": 4.483263285402361, "learning_rate": 9.109845386711932e-07, "loss": 0.5618, "step": 8960 }, { "epoch": 0.7277674003086169, "grad_norm": 4.2451531883975235, "learning_rate": 9.104769004840208e-07, "loss": 0.5705, "step": 8961 }, { "epoch": 0.7278486152846585, "grad_norm": 3.5682877453830186, "learning_rate": 9.099693722834336e-07, "loss": 0.3645, "step": 8962 }, { "epoch": 0.7279298302607001, "grad_norm": 4.835617927031836, "learning_rate": 9.094619541045516e-07, "loss": 0.6062, "step": 8963 }, { "epoch": 0.7280110452367416, "grad_norm": 4.723308760739116, "learning_rate": 9.089546459824846e-07, "loss": 0.441, "step": 8964 }, { "epoch": 0.7280922602127833, "grad_norm": 5.594812767897018, "learning_rate": 9.084474479523347e-07, "loss": 0.4843, "step": 8965 }, { "epoch": 0.7281734751888248, "grad_norm": 5.966924171312079, "learning_rate": 9.079403600491982e-07, "loss": 0.4151, "step": 8966 }, { "epoch": 0.7282546901648664, "grad_norm": 4.336193418701047, "learning_rate": 9.074333823081638e-07, "loss": 0.5172, "step": 8967 }, { "epoch": 0.728335905140908, "grad_norm": 6.063173388232329, "learning_rate": 9.069265147643109e-07, "loss": 0.4559, "step": 8968 }, { "epoch": 0.7284171201169496, "grad_norm": 4.628275139282452, "learning_rate": 9.064197574527112e-07, "loss": 0.5929, "step": 8969 }, { "epoch": 0.7284983350929911, "grad_norm": 3.7566474590811447, "learning_rate": 9.059131104084309e-07, "loss": 0.6199, "step": 8970 }, { "epoch": 0.7285795500690327, "grad_norm": 5.357753313412571, "learning_rate": 9.054065736665268e-07, "loss": 0.519, "step": 8971 }, { "epoch": 0.7286607650450743, "grad_norm": 5.601299513674894, "learning_rate": 9.049001472620481e-07, "loss": 0.6267, "step": 8972 }, { "epoch": 0.7287419800211159, "grad_norm": 6.85651492063141, "learning_rate": 9.043938312300368e-07, "loss": 0.4893, "step": 8973 }, { "epoch": 0.7288231949971575, "grad_norm": 6.764798797003058, "learning_rate": 9.038876256055288e-07, "loss": 0.3582, "step": 8974 }, { "epoch": 0.728904409973199, "grad_norm": 6.409476783815426, "learning_rate": 9.033815304235488e-07, "loss": 0.412, "step": 8975 }, { "epoch": 0.7289856249492407, "grad_norm": 18.649132110530104, "learning_rate": 9.028755457191179e-07, "loss": 0.3983, "step": 8976 }, { "epoch": 0.7290668399252822, "grad_norm": 6.513807711538606, "learning_rate": 9.023696715272468e-07, "loss": 0.3894, "step": 8977 }, { "epoch": 0.7291480549013238, "grad_norm": 6.299055569156435, "learning_rate": 9.018639078829378e-07, "loss": 0.4124, "step": 8978 }, { "epoch": 0.7292292698773654, "grad_norm": 3.514477375819592, "learning_rate": 9.013582548211885e-07, "loss": 0.568, "step": 8979 }, { "epoch": 0.729310484853407, "grad_norm": 3.3460958135923318, "learning_rate": 9.008527123769883e-07, "loss": 0.4293, "step": 8980 }, { "epoch": 0.7293916998294485, "grad_norm": 4.436979913495132, "learning_rate": 9.003472805853161e-07, "loss": 0.3699, "step": 8981 }, { "epoch": 0.7294729148054901, "grad_norm": 5.276006150098665, "learning_rate": 8.998419594811467e-07, "loss": 0.5529, "step": 8982 }, { "epoch": 0.7295541297815317, "grad_norm": 4.525908461414848, "learning_rate": 8.993367490994451e-07, "loss": 0.4417, "step": 8983 }, { "epoch": 0.7296353447575733, "grad_norm": 6.180441338511432, "learning_rate": 8.988316494751683e-07, "loss": 0.4299, "step": 8984 }, { "epoch": 0.7297165597336149, "grad_norm": 3.4284667198540983, "learning_rate": 8.983266606432672e-07, "loss": 0.5197, "step": 8985 }, { "epoch": 0.7297977747096565, "grad_norm": 4.57248481351034, "learning_rate": 8.978217826386853e-07, "loss": 0.5724, "step": 8986 }, { "epoch": 0.7298789896856981, "grad_norm": 3.3815176907013984, "learning_rate": 8.973170154963567e-07, "loss": 0.5855, "step": 8987 }, { "epoch": 0.7299602046617396, "grad_norm": 4.911910066001664, "learning_rate": 8.968123592512076e-07, "loss": 0.5791, "step": 8988 }, { "epoch": 0.7300414196377812, "grad_norm": 6.250038591037473, "learning_rate": 8.963078139381595e-07, "loss": 0.4979, "step": 8989 }, { "epoch": 0.7301226346138228, "grad_norm": 4.472456064967235, "learning_rate": 8.958033795921231e-07, "loss": 0.4881, "step": 8990 }, { "epoch": 0.7302038495898644, "grad_norm": 6.163691073085615, "learning_rate": 8.952990562480021e-07, "loss": 0.3618, "step": 8991 }, { "epoch": 0.7302850645659059, "grad_norm": 6.473412100775378, "learning_rate": 8.947948439406934e-07, "loss": 0.6202, "step": 8992 }, { "epoch": 0.7303662795419475, "grad_norm": 6.060384183610972, "learning_rate": 8.94290742705087e-07, "loss": 0.4327, "step": 8993 }, { "epoch": 0.7304474945179891, "grad_norm": 7.291283308794519, "learning_rate": 8.937867525760622e-07, "loss": 0.6409, "step": 8994 }, { "epoch": 0.7305287094940307, "grad_norm": 4.822306787639834, "learning_rate": 8.932828735884944e-07, "loss": 0.6233, "step": 8995 }, { "epoch": 0.7306099244700723, "grad_norm": 4.5942352939506055, "learning_rate": 8.927791057772481e-07, "loss": 0.4074, "step": 8996 }, { "epoch": 0.7306911394461139, "grad_norm": 5.662519945223263, "learning_rate": 8.922754491771807e-07, "loss": 0.4292, "step": 8997 }, { "epoch": 0.7307723544221555, "grad_norm": 5.270491463874796, "learning_rate": 8.917719038231437e-07, "loss": 0.5461, "step": 8998 }, { "epoch": 0.730853569398197, "grad_norm": 4.428969136605445, "learning_rate": 8.912684697499801e-07, "loss": 0.4651, "step": 8999 }, { "epoch": 0.7309347843742386, "grad_norm": 6.961674772191147, "learning_rate": 8.907651469925236e-07, "loss": 0.4143, "step": 9000 }, { "epoch": 0.7310159993502802, "grad_norm": 4.726617406201288, "learning_rate": 8.902619355856032e-07, "loss": 0.4823, "step": 9001 }, { "epoch": 0.7310972143263218, "grad_norm": 5.037522415851766, "learning_rate": 8.897588355640371e-07, "loss": 0.4849, "step": 9002 }, { "epoch": 0.7311784293023633, "grad_norm": 4.702839968633091, "learning_rate": 8.892558469626375e-07, "loss": 0.4989, "step": 9003 }, { "epoch": 0.731259644278405, "grad_norm": 4.121133343260713, "learning_rate": 8.887529698162079e-07, "loss": 0.4176, "step": 9004 }, { "epoch": 0.7313408592544465, "grad_norm": 5.799397892585551, "learning_rate": 8.882502041595454e-07, "loss": 0.4902, "step": 9005 }, { "epoch": 0.7314220742304881, "grad_norm": 10.129599048058408, "learning_rate": 8.877475500274393e-07, "loss": 0.5977, "step": 9006 }, { "epoch": 0.7315032892065297, "grad_norm": 6.254376447594514, "learning_rate": 8.872450074546696e-07, "loss": 0.4533, "step": 9007 }, { "epoch": 0.7315845041825713, "grad_norm": 7.497553888308231, "learning_rate": 8.867425764760104e-07, "loss": 0.4203, "step": 9008 }, { "epoch": 0.7316657191586129, "grad_norm": 7.377753478494686, "learning_rate": 8.862402571262272e-07, "loss": 0.5435, "step": 9009 }, { "epoch": 0.7317469341346544, "grad_norm": 3.964499330804677, "learning_rate": 8.857380494400764e-07, "loss": 0.6699, "step": 9010 }, { "epoch": 0.731828149110696, "grad_norm": 4.556955979542769, "learning_rate": 8.852359534523091e-07, "loss": 0.4287, "step": 9011 }, { "epoch": 0.7319093640867376, "grad_norm": 5.283260885622843, "learning_rate": 8.847339691976689e-07, "loss": 0.6336, "step": 9012 }, { "epoch": 0.7319905790627792, "grad_norm": 8.239303453615642, "learning_rate": 8.842320967108886e-07, "loss": 0.5459, "step": 9013 }, { "epoch": 0.7320717940388207, "grad_norm": 9.181392373672194, "learning_rate": 8.837303360266966e-07, "loss": 0.4328, "step": 9014 }, { "epoch": 0.7321530090148624, "grad_norm": 9.020007347941661, "learning_rate": 8.832286871798113e-07, "loss": 0.4695, "step": 9015 }, { "epoch": 0.7322342239909039, "grad_norm": 11.695346831333792, "learning_rate": 8.827271502049434e-07, "loss": 0.4455, "step": 9016 }, { "epoch": 0.7323154389669455, "grad_norm": 5.166457906877146, "learning_rate": 8.822257251367983e-07, "loss": 0.4783, "step": 9017 }, { "epoch": 0.7323966539429871, "grad_norm": 3.9452559803067575, "learning_rate": 8.817244120100702e-07, "loss": 0.4167, "step": 9018 }, { "epoch": 0.7324778689190287, "grad_norm": 5.030514569697522, "learning_rate": 8.812232108594482e-07, "loss": 0.3934, "step": 9019 }, { "epoch": 0.7325590838950703, "grad_norm": 10.567800558944477, "learning_rate": 8.807221217196135e-07, "loss": 0.5891, "step": 9020 }, { "epoch": 0.7326402988711118, "grad_norm": 7.700337486933208, "learning_rate": 8.802211446252379e-07, "loss": 0.5597, "step": 9021 }, { "epoch": 0.7327215138471534, "grad_norm": 11.713465789679601, "learning_rate": 8.797202796109869e-07, "loss": 0.5172, "step": 9022 }, { "epoch": 0.732802728823195, "grad_norm": 7.145291394301389, "learning_rate": 8.792195267115163e-07, "loss": 0.4569, "step": 9023 }, { "epoch": 0.7328839437992366, "grad_norm": 4.598031815516408, "learning_rate": 8.787188859614768e-07, "loss": 0.6056, "step": 9024 }, { "epoch": 0.7329651587752781, "grad_norm": 5.275503679314213, "learning_rate": 8.782183573955105e-07, "loss": 0.4652, "step": 9025 }, { "epoch": 0.7330463737513198, "grad_norm": 6.295243362632932, "learning_rate": 8.777179410482498e-07, "loss": 0.6455, "step": 9026 }, { "epoch": 0.7331275887273613, "grad_norm": 4.4474547984782395, "learning_rate": 8.772176369543229e-07, "loss": 0.3385, "step": 9027 }, { "epoch": 0.7332088037034029, "grad_norm": 3.172334169434078, "learning_rate": 8.767174451483468e-07, "loss": 0.5206, "step": 9028 }, { "epoch": 0.7332900186794445, "grad_norm": 4.6456662676555815, "learning_rate": 8.762173656649317e-07, "loss": 0.6199, "step": 9029 }, { "epoch": 0.7333712336554861, "grad_norm": 7.265881502649945, "learning_rate": 8.757173985386819e-07, "loss": 0.4957, "step": 9030 }, { "epoch": 0.7334524486315277, "grad_norm": 3.4982261171160833, "learning_rate": 8.752175438041908e-07, "loss": 0.5664, "step": 9031 }, { "epoch": 0.7335336636075692, "grad_norm": 3.6827798694601754, "learning_rate": 8.747178014960467e-07, "loss": 0.4881, "step": 9032 }, { "epoch": 0.7336148785836109, "grad_norm": 7.960332905031216, "learning_rate": 8.742181716488302e-07, "loss": 0.3276, "step": 9033 }, { "epoch": 0.7336960935596524, "grad_norm": 4.4974178890676075, "learning_rate": 8.737186542971115e-07, "loss": 0.4636, "step": 9034 }, { "epoch": 0.733777308535694, "grad_norm": 4.122814660625721, "learning_rate": 8.732192494754541e-07, "loss": 0.3476, "step": 9035 }, { "epoch": 0.7338585235117355, "grad_norm": 4.213370810352235, "learning_rate": 8.727199572184161e-07, "loss": 0.5909, "step": 9036 }, { "epoch": 0.7339397384877772, "grad_norm": 10.14479503528053, "learning_rate": 8.722207775605437e-07, "loss": 0.4764, "step": 9037 }, { "epoch": 0.7340209534638187, "grad_norm": 4.624804246473112, "learning_rate": 8.717217105363798e-07, "loss": 0.5015, "step": 9038 }, { "epoch": 0.7341021684398603, "grad_norm": 5.183928674417439, "learning_rate": 8.712227561804548e-07, "loss": 0.4768, "step": 9039 }, { "epoch": 0.734183383415902, "grad_norm": 7.897979278065682, "learning_rate": 8.707239145272958e-07, "loss": 0.5025, "step": 9040 }, { "epoch": 0.7342645983919435, "grad_norm": 16.857325758216245, "learning_rate": 8.702251856114191e-07, "loss": 0.4793, "step": 9041 }, { "epoch": 0.7343458133679851, "grad_norm": 6.305732473052533, "learning_rate": 8.697265694673334e-07, "loss": 0.4738, "step": 9042 }, { "epoch": 0.7344270283440266, "grad_norm": 4.796761988404685, "learning_rate": 8.692280661295419e-07, "loss": 0.334, "step": 9043 }, { "epoch": 0.7345082433200683, "grad_norm": 5.71378461104089, "learning_rate": 8.687296756325364e-07, "loss": 0.5107, "step": 9044 }, { "epoch": 0.7345894582961098, "grad_norm": 10.227432767121579, "learning_rate": 8.68231398010804e-07, "loss": 0.6141, "step": 9045 }, { "epoch": 0.7346706732721514, "grad_norm": 6.276424519094259, "learning_rate": 8.677332332988236e-07, "loss": 0.4475, "step": 9046 }, { "epoch": 0.7347518882481929, "grad_norm": 5.843493313646698, "learning_rate": 8.672351815310651e-07, "loss": 0.4045, "step": 9047 }, { "epoch": 0.7348331032242346, "grad_norm": 7.797951199210248, "learning_rate": 8.667372427419895e-07, "loss": 0.4015, "step": 9048 }, { "epoch": 0.7349143182002761, "grad_norm": 7.633315491295807, "learning_rate": 8.66239416966054e-07, "loss": 0.4516, "step": 9049 }, { "epoch": 0.7349955331763177, "grad_norm": 5.0774721644669025, "learning_rate": 8.657417042377034e-07, "loss": 0.5822, "step": 9050 }, { "epoch": 0.7350767481523593, "grad_norm": 3.3621612548870967, "learning_rate": 8.652441045913775e-07, "loss": 0.5267, "step": 9051 }, { "epoch": 0.7351579631284009, "grad_norm": 6.871554047086195, "learning_rate": 8.647466180615085e-07, "loss": 0.5536, "step": 9052 }, { "epoch": 0.7352391781044425, "grad_norm": 6.572776544165536, "learning_rate": 8.642492446825193e-07, "loss": 0.5135, "step": 9053 }, { "epoch": 0.735320393080484, "grad_norm": 5.770321723227472, "learning_rate": 8.637519844888245e-07, "loss": 0.4804, "step": 9054 }, { "epoch": 0.7354016080565257, "grad_norm": 10.272898706445313, "learning_rate": 8.632548375148333e-07, "loss": 0.5766, "step": 9055 }, { "epoch": 0.7354828230325672, "grad_norm": 7.89177634147062, "learning_rate": 8.627578037949441e-07, "loss": 0.6254, "step": 9056 }, { "epoch": 0.7355640380086088, "grad_norm": 6.509795135122351, "learning_rate": 8.62260883363551e-07, "loss": 0.4276, "step": 9057 }, { "epoch": 0.7356452529846503, "grad_norm": 5.327797962449124, "learning_rate": 8.617640762550361e-07, "loss": 0.4104, "step": 9058 }, { "epoch": 0.735726467960692, "grad_norm": 10.83596502585155, "learning_rate": 8.612673825037776e-07, "loss": 0.6383, "step": 9059 }, { "epoch": 0.7358076829367335, "grad_norm": 3.2930515840707257, "learning_rate": 8.607708021441436e-07, "loss": 0.5538, "step": 9060 }, { "epoch": 0.7358888979127751, "grad_norm": 6.123300825914036, "learning_rate": 8.602743352104936e-07, "loss": 0.5416, "step": 9061 }, { "epoch": 0.7359701128888168, "grad_norm": 4.126241878471722, "learning_rate": 8.597779817371824e-07, "loss": 0.4344, "step": 9062 }, { "epoch": 0.7360513278648583, "grad_norm": 4.001896575407721, "learning_rate": 8.592817417585534e-07, "loss": 0.5276, "step": 9063 }, { "epoch": 0.7361325428408999, "grad_norm": 4.410919527265607, "learning_rate": 8.587856153089444e-07, "loss": 0.5128, "step": 9064 }, { "epoch": 0.7362137578169414, "grad_norm": 4.6300581291782885, "learning_rate": 8.582896024226855e-07, "loss": 0.5539, "step": 9065 }, { "epoch": 0.7362949727929831, "grad_norm": 5.804508870075955, "learning_rate": 8.577937031340975e-07, "loss": 0.4621, "step": 9066 }, { "epoch": 0.7363761877690246, "grad_norm": 3.9641442301684653, "learning_rate": 8.572979174774934e-07, "loss": 0.4759, "step": 9067 }, { "epoch": 0.7364574027450662, "grad_norm": 3.935697034995573, "learning_rate": 8.568022454871802e-07, "loss": 0.5874, "step": 9068 }, { "epoch": 0.7365386177211077, "grad_norm": 5.777703937802822, "learning_rate": 8.563066871974543e-07, "loss": 0.4736, "step": 9069 }, { "epoch": 0.7366198326971494, "grad_norm": 3.7700387674835176, "learning_rate": 8.558112426426062e-07, "loss": 0.4482, "step": 9070 }, { "epoch": 0.7367010476731909, "grad_norm": 4.042064333038825, "learning_rate": 8.553159118569196e-07, "loss": 0.3638, "step": 9071 }, { "epoch": 0.7367822626492325, "grad_norm": 5.379208558888325, "learning_rate": 8.548206948746673e-07, "loss": 0.359, "step": 9072 }, { "epoch": 0.7368634776252742, "grad_norm": 5.80342467828285, "learning_rate": 8.543255917301163e-07, "loss": 0.5691, "step": 9073 }, { "epoch": 0.7369446926013157, "grad_norm": 9.897018766300208, "learning_rate": 8.538306024575235e-07, "loss": 0.5253, "step": 9074 }, { "epoch": 0.7370259075773573, "grad_norm": 5.1025049749314055, "learning_rate": 8.533357270911419e-07, "loss": 0.4075, "step": 9075 }, { "epoch": 0.7371071225533988, "grad_norm": 8.535318151291682, "learning_rate": 8.52840965665212e-07, "loss": 0.4842, "step": 9076 }, { "epoch": 0.7371883375294405, "grad_norm": 8.277801039325004, "learning_rate": 8.523463182139699e-07, "loss": 0.4553, "step": 9077 }, { "epoch": 0.737269552505482, "grad_norm": 4.521612840769156, "learning_rate": 8.518517847716435e-07, "loss": 0.627, "step": 9078 }, { "epoch": 0.7373507674815236, "grad_norm": 3.8960485335075603, "learning_rate": 8.513573653724508e-07, "loss": 0.5943, "step": 9079 }, { "epoch": 0.7374319824575651, "grad_norm": 4.606452564353779, "learning_rate": 8.508630600506021e-07, "loss": 0.6701, "step": 9080 }, { "epoch": 0.7375131974336068, "grad_norm": 7.052763751543803, "learning_rate": 8.503688688403028e-07, "loss": 0.5393, "step": 9081 }, { "epoch": 0.7375944124096483, "grad_norm": 4.538323480054813, "learning_rate": 8.498747917757464e-07, "loss": 0.434, "step": 9082 }, { "epoch": 0.7376756273856899, "grad_norm": 3.679103857972724, "learning_rate": 8.49380828891121e-07, "loss": 0.561, "step": 9083 }, { "epoch": 0.7377568423617316, "grad_norm": 9.599527757034656, "learning_rate": 8.488869802206073e-07, "loss": 0.4462, "step": 9084 }, { "epoch": 0.7378380573377731, "grad_norm": 5.51962161938916, "learning_rate": 8.483932457983765e-07, "loss": 0.5055, "step": 9085 }, { "epoch": 0.7379192723138147, "grad_norm": 7.720325892853574, "learning_rate": 8.478996256585909e-07, "loss": 0.3904, "step": 9086 }, { "epoch": 0.7380004872898562, "grad_norm": 4.8150352416660125, "learning_rate": 8.474061198354086e-07, "loss": 0.3425, "step": 9087 }, { "epoch": 0.7380817022658979, "grad_norm": 4.123061421099957, "learning_rate": 8.469127283629766e-07, "loss": 0.4812, "step": 9088 }, { "epoch": 0.7381629172419394, "grad_norm": 4.2839585199281975, "learning_rate": 8.464194512754339e-07, "loss": 0.6729, "step": 9089 }, { "epoch": 0.738244132217981, "grad_norm": 10.07664977406758, "learning_rate": 8.459262886069139e-07, "loss": 0.4227, "step": 9090 }, { "epoch": 0.7383253471940225, "grad_norm": 5.667705109796953, "learning_rate": 8.454332403915416e-07, "loss": 0.4117, "step": 9091 }, { "epoch": 0.7384065621700642, "grad_norm": 5.360148423288729, "learning_rate": 8.44940306663432e-07, "loss": 0.4129, "step": 9092 }, { "epoch": 0.7384877771461057, "grad_norm": 4.840577845593679, "learning_rate": 8.444474874566935e-07, "loss": 0.43, "step": 9093 }, { "epoch": 0.7385689921221473, "grad_norm": 5.228586184130316, "learning_rate": 8.439547828054276e-07, "loss": 0.5745, "step": 9094 }, { "epoch": 0.738650207098189, "grad_norm": 5.804073223258344, "learning_rate": 8.434621927437253e-07, "loss": 0.6014, "step": 9095 }, { "epoch": 0.7387314220742305, "grad_norm": 6.711353565911964, "learning_rate": 8.429697173056726e-07, "loss": 0.4756, "step": 9096 }, { "epoch": 0.7388126370502721, "grad_norm": 5.514320352974961, "learning_rate": 8.42477356525346e-07, "loss": 0.5374, "step": 9097 }, { "epoch": 0.7388938520263136, "grad_norm": 3.835666081477301, "learning_rate": 8.419851104368143e-07, "loss": 0.3875, "step": 9098 }, { "epoch": 0.7389750670023553, "grad_norm": 12.210910906950325, "learning_rate": 8.414929790741371e-07, "loss": 0.4524, "step": 9099 }, { "epoch": 0.7390562819783968, "grad_norm": 5.875608947253129, "learning_rate": 8.410009624713691e-07, "loss": 0.5712, "step": 9100 }, { "epoch": 0.7391374969544384, "grad_norm": 4.500274285625198, "learning_rate": 8.405090606625547e-07, "loss": 0.6175, "step": 9101 }, { "epoch": 0.7392187119304799, "grad_norm": 7.944159885650859, "learning_rate": 8.400172736817294e-07, "loss": 0.5565, "step": 9102 }, { "epoch": 0.7392999269065216, "grad_norm": 10.979049183670785, "learning_rate": 8.395256015629233e-07, "loss": 0.4571, "step": 9103 }, { "epoch": 0.7393811418825631, "grad_norm": 6.5837339811285815, "learning_rate": 8.390340443401588e-07, "loss": 0.4318, "step": 9104 }, { "epoch": 0.7394623568586047, "grad_norm": 6.045591348963159, "learning_rate": 8.385426020474468e-07, "loss": 0.4703, "step": 9105 }, { "epoch": 0.7395435718346464, "grad_norm": 11.087294536746336, "learning_rate": 8.380512747187944e-07, "loss": 0.3586, "step": 9106 }, { "epoch": 0.7396247868106879, "grad_norm": 5.179365997405158, "learning_rate": 8.375600623881983e-07, "loss": 0.3442, "step": 9107 }, { "epoch": 0.7397060017867295, "grad_norm": 5.5937991392764355, "learning_rate": 8.370689650896465e-07, "loss": 0.4803, "step": 9108 }, { "epoch": 0.739787216762771, "grad_norm": 5.345681951321185, "learning_rate": 8.365779828571214e-07, "loss": 0.5371, "step": 9109 }, { "epoch": 0.7398684317388127, "grad_norm": 10.438164567037536, "learning_rate": 8.360871157245973e-07, "loss": 0.4857, "step": 9110 }, { "epoch": 0.7399496467148542, "grad_norm": 12.570837392968885, "learning_rate": 8.355963637260387e-07, "loss": 0.4761, "step": 9111 }, { "epoch": 0.7400308616908958, "grad_norm": 5.293881936064758, "learning_rate": 8.351057268954019e-07, "loss": 0.6403, "step": 9112 }, { "epoch": 0.7401120766669373, "grad_norm": 8.074420587248548, "learning_rate": 8.346152052666385e-07, "loss": 0.4348, "step": 9113 }, { "epoch": 0.740193291642979, "grad_norm": 4.833671703517281, "learning_rate": 8.341247988736889e-07, "loss": 0.5021, "step": 9114 }, { "epoch": 0.7402745066190205, "grad_norm": 5.173257511917773, "learning_rate": 8.336345077504851e-07, "loss": 0.4408, "step": 9115 }, { "epoch": 0.7403557215950621, "grad_norm": 4.696108390642895, "learning_rate": 8.331443319309557e-07, "loss": 0.4978, "step": 9116 }, { "epoch": 0.7404369365711038, "grad_norm": 6.333906802407053, "learning_rate": 8.326542714490172e-07, "loss": 0.4354, "step": 9117 }, { "epoch": 0.7405181515471453, "grad_norm": 4.361831325871427, "learning_rate": 8.321643263385776e-07, "loss": 0.4368, "step": 9118 }, { "epoch": 0.7405993665231869, "grad_norm": 4.6714291210784085, "learning_rate": 8.316744966335408e-07, "loss": 0.6296, "step": 9119 }, { "epoch": 0.7406805814992284, "grad_norm": 4.933127806189035, "learning_rate": 8.31184782367799e-07, "loss": 0.5809, "step": 9120 }, { "epoch": 0.7407617964752701, "grad_norm": 4.025591077863814, "learning_rate": 8.306951835752378e-07, "loss": 0.6013, "step": 9121 }, { "epoch": 0.7408430114513116, "grad_norm": 11.41144080285039, "learning_rate": 8.302057002897349e-07, "loss": 0.5021, "step": 9122 }, { "epoch": 0.7409242264273532, "grad_norm": 4.627955700774956, "learning_rate": 8.297163325451612e-07, "loss": 0.5904, "step": 9123 }, { "epoch": 0.7410054414033947, "grad_norm": 7.878246043105683, "learning_rate": 8.292270803753765e-07, "loss": 0.457, "step": 9124 }, { "epoch": 0.7410866563794364, "grad_norm": 4.436045720835732, "learning_rate": 8.287379438142365e-07, "loss": 0.5311, "step": 9125 }, { "epoch": 0.7411678713554779, "grad_norm": 5.747604125886795, "learning_rate": 8.282489228955856e-07, "loss": 0.4093, "step": 9126 }, { "epoch": 0.7412490863315195, "grad_norm": 26.14780023838032, "learning_rate": 8.277600176532608e-07, "loss": 0.4506, "step": 9127 }, { "epoch": 0.7413303013075612, "grad_norm": 5.408380773620181, "learning_rate": 8.272712281210926e-07, "loss": 0.5179, "step": 9128 }, { "epoch": 0.7414115162836027, "grad_norm": 3.881322907472951, "learning_rate": 8.267825543329033e-07, "loss": 0.5395, "step": 9129 }, { "epoch": 0.7414927312596443, "grad_norm": 4.612767077216341, "learning_rate": 8.262939963225058e-07, "loss": 0.4454, "step": 9130 }, { "epoch": 0.7415739462356858, "grad_norm": 20.881362818292562, "learning_rate": 8.258055541237054e-07, "loss": 0.4422, "step": 9131 }, { "epoch": 0.7416551612117275, "grad_norm": 10.331638585893913, "learning_rate": 8.253172277703006e-07, "loss": 0.4213, "step": 9132 }, { "epoch": 0.741736376187769, "grad_norm": 7.471565014697083, "learning_rate": 8.248290172960804e-07, "loss": 0.4839, "step": 9133 }, { "epoch": 0.7418175911638106, "grad_norm": 6.969777149983467, "learning_rate": 8.24340922734826e-07, "loss": 0.4021, "step": 9134 }, { "epoch": 0.7418988061398522, "grad_norm": 6.91819887817937, "learning_rate": 8.238529441203111e-07, "loss": 0.3838, "step": 9135 }, { "epoch": 0.7419800211158938, "grad_norm": 3.8948651109688703, "learning_rate": 8.233650814863026e-07, "loss": 0.4054, "step": 9136 }, { "epoch": 0.7420612360919353, "grad_norm": 4.018293826197548, "learning_rate": 8.228773348665561e-07, "loss": 0.4144, "step": 9137 }, { "epoch": 0.7421424510679769, "grad_norm": 3.9662330274627355, "learning_rate": 8.223897042948228e-07, "loss": 0.4907, "step": 9138 }, { "epoch": 0.7422236660440186, "grad_norm": 10.723781510737455, "learning_rate": 8.219021898048435e-07, "loss": 0.5779, "step": 9139 }, { "epoch": 0.7423048810200601, "grad_norm": 4.79787389560584, "learning_rate": 8.214147914303505e-07, "loss": 0.4803, "step": 9140 }, { "epoch": 0.7423860959961017, "grad_norm": 3.222506085754893, "learning_rate": 8.209275092050701e-07, "loss": 0.5073, "step": 9141 }, { "epoch": 0.7424673109721432, "grad_norm": 7.127778601731678, "learning_rate": 8.204403431627206e-07, "loss": 0.4199, "step": 9142 }, { "epoch": 0.7425485259481849, "grad_norm": 6.285963377503724, "learning_rate": 8.199532933370094e-07, "loss": 0.3786, "step": 9143 }, { "epoch": 0.7426297409242264, "grad_norm": 8.21223183631982, "learning_rate": 8.194663597616398e-07, "loss": 0.5224, "step": 9144 }, { "epoch": 0.742710955900268, "grad_norm": 9.976782891147915, "learning_rate": 8.18979542470304e-07, "loss": 0.5302, "step": 9145 }, { "epoch": 0.7427921708763096, "grad_norm": 6.028309491334264, "learning_rate": 8.184928414966873e-07, "loss": 0.4914, "step": 9146 }, { "epoch": 0.7428733858523512, "grad_norm": 13.676857941967034, "learning_rate": 8.180062568744657e-07, "loss": 0.5282, "step": 9147 }, { "epoch": 0.7429546008283927, "grad_norm": 17.032146668354567, "learning_rate": 8.175197886373093e-07, "loss": 0.4902, "step": 9148 }, { "epoch": 0.7430358158044343, "grad_norm": 5.4622799815547936, "learning_rate": 8.170334368188798e-07, "loss": 0.3934, "step": 9149 }, { "epoch": 0.743117030780476, "grad_norm": 8.517487460268729, "learning_rate": 8.16547201452829e-07, "loss": 0.5146, "step": 9150 }, { "epoch": 0.7431982457565175, "grad_norm": 5.038501260406377, "learning_rate": 8.160610825728029e-07, "loss": 0.5049, "step": 9151 }, { "epoch": 0.7432794607325591, "grad_norm": 4.455377373168075, "learning_rate": 8.155750802124379e-07, "loss": 0.5832, "step": 9152 }, { "epoch": 0.7433606757086006, "grad_norm": 3.9339441762573863, "learning_rate": 8.150891944053615e-07, "loss": 0.4169, "step": 9153 }, { "epoch": 0.7434418906846423, "grad_norm": 3.517549583636786, "learning_rate": 8.146034251851959e-07, "loss": 0.5344, "step": 9154 }, { "epoch": 0.7435231056606838, "grad_norm": 4.5372086466348085, "learning_rate": 8.141177725855543e-07, "loss": 0.5208, "step": 9155 }, { "epoch": 0.7436043206367254, "grad_norm": 4.458967507222066, "learning_rate": 8.136322366400396e-07, "loss": 0.4537, "step": 9156 }, { "epoch": 0.743685535612767, "grad_norm": 3.5505929046873104, "learning_rate": 8.131468173822499e-07, "loss": 0.3824, "step": 9157 }, { "epoch": 0.7437667505888086, "grad_norm": 4.049572608845729, "learning_rate": 8.126615148457728e-07, "loss": 0.4316, "step": 9158 }, { "epoch": 0.7438479655648501, "grad_norm": 6.329253089634774, "learning_rate": 8.121763290641879e-07, "loss": 0.3844, "step": 9159 }, { "epoch": 0.7439291805408917, "grad_norm": 5.7289617458205235, "learning_rate": 8.116912600710694e-07, "loss": 0.5157, "step": 9160 }, { "epoch": 0.7440103955169334, "grad_norm": 7.461558435663731, "learning_rate": 8.112063078999794e-07, "loss": 0.4108, "step": 9161 }, { "epoch": 0.7440916104929749, "grad_norm": 6.260601884198078, "learning_rate": 8.107214725844753e-07, "loss": 0.4063, "step": 9162 }, { "epoch": 0.7441728254690165, "grad_norm": 7.096380076386157, "learning_rate": 8.102367541581055e-07, "loss": 0.375, "step": 9163 }, { "epoch": 0.744254040445058, "grad_norm": 4.583892134072468, "learning_rate": 8.097521526544094e-07, "loss": 0.5004, "step": 9164 }, { "epoch": 0.7443352554210997, "grad_norm": 4.285669547146368, "learning_rate": 8.092676681069189e-07, "loss": 0.4534, "step": 9165 }, { "epoch": 0.7444164703971412, "grad_norm": 5.84102693467306, "learning_rate": 8.087833005491568e-07, "loss": 0.6221, "step": 9166 }, { "epoch": 0.7444976853731828, "grad_norm": 4.581219331241057, "learning_rate": 8.082990500146398e-07, "loss": 0.5261, "step": 9167 }, { "epoch": 0.7445789003492244, "grad_norm": 6.253344718728393, "learning_rate": 8.078149165368762e-07, "loss": 0.4019, "step": 9168 }, { "epoch": 0.744660115325266, "grad_norm": 4.961021707376709, "learning_rate": 8.073309001493637e-07, "loss": 0.5145, "step": 9169 }, { "epoch": 0.7447413303013075, "grad_norm": 60.59463933382603, "learning_rate": 8.068470008855953e-07, "loss": 0.4394, "step": 9170 }, { "epoch": 0.7448225452773491, "grad_norm": 5.570185363264054, "learning_rate": 8.063632187790538e-07, "loss": 0.4144, "step": 9171 }, { "epoch": 0.7449037602533908, "grad_norm": 6.346150984764303, "learning_rate": 8.05879553863213e-07, "loss": 0.4612, "step": 9172 }, { "epoch": 0.7449849752294323, "grad_norm": 5.7858397897115745, "learning_rate": 8.053960061715421e-07, "loss": 0.5154, "step": 9173 }, { "epoch": 0.7450661902054739, "grad_norm": 9.04581065503539, "learning_rate": 8.049125757374978e-07, "loss": 0.4543, "step": 9174 }, { "epoch": 0.7451474051815155, "grad_norm": 4.5831119885995575, "learning_rate": 8.044292625945327e-07, "loss": 0.5, "step": 9175 }, { "epoch": 0.7452286201575571, "grad_norm": 4.73034396532834, "learning_rate": 8.039460667760892e-07, "loss": 0.6158, "step": 9176 }, { "epoch": 0.7453098351335986, "grad_norm": 7.297415511486316, "learning_rate": 8.034629883156019e-07, "loss": 0.4434, "step": 9177 }, { "epoch": 0.7453910501096402, "grad_norm": 7.087502286901482, "learning_rate": 8.029800272464963e-07, "loss": 0.3851, "step": 9178 }, { "epoch": 0.7454722650856818, "grad_norm": 21.28250462127868, "learning_rate": 8.024971836021922e-07, "loss": 0.3312, "step": 9179 }, { "epoch": 0.7455534800617234, "grad_norm": 4.028786793175537, "learning_rate": 8.020144574160984e-07, "loss": 0.5862, "step": 9180 }, { "epoch": 0.7456346950377649, "grad_norm": 8.089625182959997, "learning_rate": 8.015318487216184e-07, "loss": 0.4179, "step": 9181 }, { "epoch": 0.7457159100138065, "grad_norm": 10.703076445335201, "learning_rate": 8.010493575521444e-07, "loss": 0.5247, "step": 9182 }, { "epoch": 0.7457971249898482, "grad_norm": 5.996668534107049, "learning_rate": 8.005669839410643e-07, "loss": 0.6692, "step": 9183 }, { "epoch": 0.7458783399658897, "grad_norm": 4.264762756411719, "learning_rate": 8.00084727921755e-07, "loss": 0.7248, "step": 9184 }, { "epoch": 0.7459595549419313, "grad_norm": 6.337959117205082, "learning_rate": 7.996025895275846e-07, "loss": 0.4624, "step": 9185 }, { "epoch": 0.7460407699179729, "grad_norm": 4.73694838657318, "learning_rate": 7.991205687919163e-07, "loss": 0.5874, "step": 9186 }, { "epoch": 0.7461219848940145, "grad_norm": 4.444846677665557, "learning_rate": 7.986386657481032e-07, "loss": 0.4305, "step": 9187 }, { "epoch": 0.746203199870056, "grad_norm": 4.371046177977482, "learning_rate": 7.981568804294895e-07, "loss": 0.4158, "step": 9188 }, { "epoch": 0.7462844148460976, "grad_norm": 5.23612698457551, "learning_rate": 7.976752128694134e-07, "loss": 0.5297, "step": 9189 }, { "epoch": 0.7463656298221392, "grad_norm": 5.6064882601787325, "learning_rate": 7.971936631012033e-07, "loss": 0.4, "step": 9190 }, { "epoch": 0.7464468447981808, "grad_norm": 4.736891493880583, "learning_rate": 7.96712231158179e-07, "loss": 0.465, "step": 9191 }, { "epoch": 0.7465280597742223, "grad_norm": 16.20683819170043, "learning_rate": 7.962309170736546e-07, "loss": 0.538, "step": 9192 }, { "epoch": 0.746609274750264, "grad_norm": 4.072174973984312, "learning_rate": 7.957497208809328e-07, "loss": 0.5302, "step": 9193 }, { "epoch": 0.7466904897263056, "grad_norm": 3.329371490211698, "learning_rate": 7.952686426133105e-07, "loss": 0.5949, "step": 9194 }, { "epoch": 0.7467717047023471, "grad_norm": 10.181237849562704, "learning_rate": 7.947876823040771e-07, "loss": 0.5202, "step": 9195 }, { "epoch": 0.7468529196783887, "grad_norm": 4.700711476652391, "learning_rate": 7.943068399865111e-07, "loss": 0.3624, "step": 9196 }, { "epoch": 0.7469341346544303, "grad_norm": 4.764991430944277, "learning_rate": 7.93826115693884e-07, "loss": 0.3774, "step": 9197 }, { "epoch": 0.7470153496304719, "grad_norm": 7.85688314418302, "learning_rate": 7.933455094594602e-07, "loss": 0.3546, "step": 9198 }, { "epoch": 0.7470965646065134, "grad_norm": 4.120541672766695, "learning_rate": 7.928650213164945e-07, "loss": 0.6123, "step": 9199 }, { "epoch": 0.747177779582555, "grad_norm": 7.327958848564803, "learning_rate": 7.92384651298235e-07, "loss": 0.5695, "step": 9200 }, { "epoch": 0.7472589945585966, "grad_norm": 5.719769810682618, "learning_rate": 7.919043994379194e-07, "loss": 0.4224, "step": 9201 }, { "epoch": 0.7473402095346382, "grad_norm": 4.094688049104328, "learning_rate": 7.914242657687804e-07, "loss": 0.5687, "step": 9202 }, { "epoch": 0.7474214245106797, "grad_norm": 8.405367942793522, "learning_rate": 7.909442503240395e-07, "loss": 0.429, "step": 9203 }, { "epoch": 0.7475026394867214, "grad_norm": 6.01950206469752, "learning_rate": 7.904643531369108e-07, "loss": 0.4256, "step": 9204 }, { "epoch": 0.747583854462763, "grad_norm": 7.930996919339839, "learning_rate": 7.899845742406017e-07, "loss": 0.4707, "step": 9205 }, { "epoch": 0.7476650694388045, "grad_norm": 16.262367161009937, "learning_rate": 7.895049136683095e-07, "loss": 0.411, "step": 9206 }, { "epoch": 0.7477462844148461, "grad_norm": 3.3605717819153527, "learning_rate": 7.890253714532245e-07, "loss": 0.4796, "step": 9207 }, { "epoch": 0.7478274993908877, "grad_norm": 4.04997390894807, "learning_rate": 7.885459476285292e-07, "loss": 0.5579, "step": 9208 }, { "epoch": 0.7479087143669293, "grad_norm": 4.435766819763639, "learning_rate": 7.880666422273969e-07, "loss": 0.3457, "step": 9209 }, { "epoch": 0.7479899293429708, "grad_norm": 5.1365435166693505, "learning_rate": 7.875874552829918e-07, "loss": 0.5797, "step": 9210 }, { "epoch": 0.7480711443190124, "grad_norm": 5.591130735882937, "learning_rate": 7.871083868284726e-07, "loss": 0.5462, "step": 9211 }, { "epoch": 0.748152359295054, "grad_norm": 8.16910174430336, "learning_rate": 7.866294368969871e-07, "loss": 0.4643, "step": 9212 }, { "epoch": 0.7482335742710956, "grad_norm": 7.038360149078053, "learning_rate": 7.861506055216764e-07, "loss": 0.5222, "step": 9213 }, { "epoch": 0.7483147892471371, "grad_norm": 5.659068513220842, "learning_rate": 7.856718927356743e-07, "loss": 0.5721, "step": 9214 }, { "epoch": 0.7483960042231788, "grad_norm": 7.479766446282722, "learning_rate": 7.851932985721042e-07, "loss": 0.5484, "step": 9215 }, { "epoch": 0.7484772191992204, "grad_norm": 7.0753754777583575, "learning_rate": 7.847148230640825e-07, "loss": 0.3887, "step": 9216 }, { "epoch": 0.7485584341752619, "grad_norm": 6.933072652515654, "learning_rate": 7.842364662447161e-07, "loss": 0.4933, "step": 9217 }, { "epoch": 0.7486396491513035, "grad_norm": 4.082184303346161, "learning_rate": 7.837582281471065e-07, "loss": 0.4314, "step": 9218 }, { "epoch": 0.7487208641273451, "grad_norm": 5.120311127792642, "learning_rate": 7.832801088043438e-07, "loss": 0.3486, "step": 9219 }, { "epoch": 0.7488020791033867, "grad_norm": 5.389253627233221, "learning_rate": 7.828021082495118e-07, "loss": 0.5207, "step": 9220 }, { "epoch": 0.7488832940794282, "grad_norm": 5.865053171467202, "learning_rate": 7.823242265156866e-07, "loss": 0.3661, "step": 9221 }, { "epoch": 0.7489645090554699, "grad_norm": 4.518682135966551, "learning_rate": 7.818464636359344e-07, "loss": 0.6257, "step": 9222 }, { "epoch": 0.7490457240315114, "grad_norm": 5.19395467966571, "learning_rate": 7.813688196433125e-07, "loss": 0.4796, "step": 9223 }, { "epoch": 0.749126939007553, "grad_norm": 4.850339059401898, "learning_rate": 7.808912945708738e-07, "loss": 0.4585, "step": 9224 }, { "epoch": 0.7492081539835945, "grad_norm": 5.013059286403891, "learning_rate": 7.804138884516583e-07, "loss": 0.3884, "step": 9225 }, { "epoch": 0.7492893689596362, "grad_norm": 4.841130061593439, "learning_rate": 7.799366013187007e-07, "loss": 0.4484, "step": 9226 }, { "epoch": 0.7493705839356778, "grad_norm": 5.230772273048009, "learning_rate": 7.794594332050282e-07, "loss": 0.4274, "step": 9227 }, { "epoch": 0.7494517989117193, "grad_norm": 7.075947384663193, "learning_rate": 7.789823841436567e-07, "loss": 0.4516, "step": 9228 }, { "epoch": 0.749533013887761, "grad_norm": 6.992483714956555, "learning_rate": 7.785054541675954e-07, "loss": 0.3975, "step": 9229 }, { "epoch": 0.7496142288638025, "grad_norm": 4.561345669683036, "learning_rate": 7.780286433098464e-07, "loss": 0.7948, "step": 9230 }, { "epoch": 0.7496954438398441, "grad_norm": 4.478198203586605, "learning_rate": 7.775519516034019e-07, "loss": 0.4752, "step": 9231 }, { "epoch": 0.7497766588158856, "grad_norm": 3.9360754902947894, "learning_rate": 7.770753790812455e-07, "loss": 0.5631, "step": 9232 }, { "epoch": 0.7498578737919273, "grad_norm": 6.792888436524307, "learning_rate": 7.765989257763545e-07, "loss": 0.4384, "step": 9233 }, { "epoch": 0.7499390887679688, "grad_norm": 5.737542594203367, "learning_rate": 7.761225917216978e-07, "loss": 0.4428, "step": 9234 }, { "epoch": 0.7500203037440104, "grad_norm": 10.689284036969283, "learning_rate": 7.75646376950234e-07, "loss": 0.4604, "step": 9235 }, { "epoch": 0.7501015187200519, "grad_norm": 22.015199504327047, "learning_rate": 7.751702814949145e-07, "loss": 0.5013, "step": 9236 }, { "epoch": 0.7501827336960936, "grad_norm": 9.731543225761502, "learning_rate": 7.746943053886835e-07, "loss": 0.4218, "step": 9237 }, { "epoch": 0.7502639486721352, "grad_norm": 6.82059578534053, "learning_rate": 7.742184486644746e-07, "loss": 0.5271, "step": 9238 }, { "epoch": 0.7503451636481767, "grad_norm": 4.580942490166404, "learning_rate": 7.737427113552157e-07, "loss": 0.5683, "step": 9239 }, { "epoch": 0.7504263786242183, "grad_norm": 3.654932263444829, "learning_rate": 7.732670934938257e-07, "loss": 0.4838, "step": 9240 }, { "epoch": 0.7505075936002599, "grad_norm": 5.579207786127818, "learning_rate": 7.727915951132145e-07, "loss": 0.478, "step": 9241 }, { "epoch": 0.7505888085763015, "grad_norm": 4.861880759749431, "learning_rate": 7.723162162462827e-07, "loss": 0.5107, "step": 9242 }, { "epoch": 0.750670023552343, "grad_norm": 3.336615456803476, "learning_rate": 7.718409569259261e-07, "loss": 0.439, "step": 9243 }, { "epoch": 0.7507512385283847, "grad_norm": 3.594363219165978, "learning_rate": 7.713658171850289e-07, "loss": 0.5717, "step": 9244 }, { "epoch": 0.7508324535044262, "grad_norm": 6.352249538029256, "learning_rate": 7.708907970564672e-07, "loss": 0.4124, "step": 9245 }, { "epoch": 0.7509136684804678, "grad_norm": 5.815211228408834, "learning_rate": 7.704158965731126e-07, "loss": 0.6313, "step": 9246 }, { "epoch": 0.7509948834565093, "grad_norm": 4.251212567683483, "learning_rate": 7.699411157678241e-07, "loss": 0.4166, "step": 9247 }, { "epoch": 0.751076098432551, "grad_norm": 4.650261805037805, "learning_rate": 7.694664546734534e-07, "loss": 0.5619, "step": 9248 }, { "epoch": 0.7511573134085926, "grad_norm": 7.169544455959594, "learning_rate": 7.689919133228462e-07, "loss": 0.3819, "step": 9249 }, { "epoch": 0.7512385283846341, "grad_norm": 5.930269929305272, "learning_rate": 7.685174917488375e-07, "loss": 0.45, "step": 9250 }, { "epoch": 0.7513197433606758, "grad_norm": 4.069763452953068, "learning_rate": 7.680431899842538e-07, "loss": 0.4904, "step": 9251 }, { "epoch": 0.7514009583367173, "grad_norm": 6.375436578366889, "learning_rate": 7.67569008061915e-07, "loss": 0.42, "step": 9252 }, { "epoch": 0.7514821733127589, "grad_norm": 4.507854006209749, "learning_rate": 7.670949460146329e-07, "loss": 0.5423, "step": 9253 }, { "epoch": 0.7515633882888004, "grad_norm": 4.6919308246764615, "learning_rate": 7.666210038752092e-07, "loss": 0.4196, "step": 9254 }, { "epoch": 0.7516446032648421, "grad_norm": 7.409437183391873, "learning_rate": 7.661471816764377e-07, "loss": 0.5427, "step": 9255 }, { "epoch": 0.7517258182408836, "grad_norm": 3.4419438252999446, "learning_rate": 7.656734794511056e-07, "loss": 0.4405, "step": 9256 }, { "epoch": 0.7518070332169252, "grad_norm": 12.815965237950453, "learning_rate": 7.65199897231989e-07, "loss": 0.4739, "step": 9257 }, { "epoch": 0.7518882481929667, "grad_norm": 6.654081291463507, "learning_rate": 7.647264350518582e-07, "loss": 0.4545, "step": 9258 }, { "epoch": 0.7519694631690084, "grad_norm": 4.719867564770338, "learning_rate": 7.642530929434752e-07, "loss": 0.4012, "step": 9259 }, { "epoch": 0.75205067814505, "grad_norm": 5.37752756201268, "learning_rate": 7.637798709395919e-07, "loss": 0.5197, "step": 9260 }, { "epoch": 0.7521318931210915, "grad_norm": 5.47385742990871, "learning_rate": 7.633067690729517e-07, "loss": 0.367, "step": 9261 }, { "epoch": 0.7522131080971332, "grad_norm": 4.4733368861841045, "learning_rate": 7.628337873762928e-07, "loss": 0.4252, "step": 9262 }, { "epoch": 0.7522943230731747, "grad_norm": 6.510631608383568, "learning_rate": 7.62360925882342e-07, "loss": 0.4848, "step": 9263 }, { "epoch": 0.7523755380492163, "grad_norm": 8.2985790641125, "learning_rate": 7.618881846238177e-07, "loss": 0.3856, "step": 9264 }, { "epoch": 0.7524567530252578, "grad_norm": 8.29108898122942, "learning_rate": 7.614155636334325e-07, "loss": 0.284, "step": 9265 }, { "epoch": 0.7525379680012995, "grad_norm": 14.447143227318355, "learning_rate": 7.609430629438896e-07, "loss": 0.4909, "step": 9266 }, { "epoch": 0.752619182977341, "grad_norm": 9.786187786821417, "learning_rate": 7.604706825878822e-07, "loss": 0.5262, "step": 9267 }, { "epoch": 0.7527003979533826, "grad_norm": 6.314038643222093, "learning_rate": 7.59998422598098e-07, "loss": 0.4241, "step": 9268 }, { "epoch": 0.7527816129294241, "grad_norm": 3.701696525893301, "learning_rate": 7.595262830072142e-07, "loss": 0.4264, "step": 9269 }, { "epoch": 0.7528628279054658, "grad_norm": 6.147998028522049, "learning_rate": 7.590542638478992e-07, "loss": 0.5874, "step": 9270 }, { "epoch": 0.7529440428815074, "grad_norm": 5.173315947912089, "learning_rate": 7.585823651528157e-07, "loss": 0.5389, "step": 9271 }, { "epoch": 0.7530252578575489, "grad_norm": 5.039146817437555, "learning_rate": 7.581105869546168e-07, "loss": 0.3099, "step": 9272 }, { "epoch": 0.7531064728335906, "grad_norm": 6.7461838507335505, "learning_rate": 7.576389292859465e-07, "loss": 0.5686, "step": 9273 }, { "epoch": 0.7531876878096321, "grad_norm": 4.776535888289513, "learning_rate": 7.5716739217944e-07, "loss": 0.5206, "step": 9274 }, { "epoch": 0.7532689027856737, "grad_norm": 5.586476353122101, "learning_rate": 7.566959756677272e-07, "loss": 0.4071, "step": 9275 }, { "epoch": 0.7533501177617152, "grad_norm": 6.574615954432533, "learning_rate": 7.562246797834266e-07, "loss": 0.4455, "step": 9276 }, { "epoch": 0.7534313327377569, "grad_norm": 3.8731896502314855, "learning_rate": 7.557535045591485e-07, "loss": 0.5776, "step": 9277 }, { "epoch": 0.7535125477137984, "grad_norm": 4.1035501788561906, "learning_rate": 7.552824500274963e-07, "loss": 0.4607, "step": 9278 }, { "epoch": 0.75359376268984, "grad_norm": 3.37747957490476, "learning_rate": 7.548115162210659e-07, "loss": 0.5089, "step": 9279 }, { "epoch": 0.7536749776658815, "grad_norm": 5.307633133304405, "learning_rate": 7.543407031724415e-07, "loss": 0.7355, "step": 9280 }, { "epoch": 0.7537561926419232, "grad_norm": 5.0482613355790695, "learning_rate": 7.538700109142022e-07, "loss": 0.5037, "step": 9281 }, { "epoch": 0.7538374076179648, "grad_norm": 5.458953154438223, "learning_rate": 7.533994394789171e-07, "loss": 0.4972, "step": 9282 }, { "epoch": 0.7539186225940063, "grad_norm": 8.54249426936385, "learning_rate": 7.529289888991462e-07, "loss": 0.5581, "step": 9283 }, { "epoch": 0.753999837570048, "grad_norm": 7.083450524943488, "learning_rate": 7.524586592074432e-07, "loss": 0.4335, "step": 9284 }, { "epoch": 0.7540810525460895, "grad_norm": 3.984907663593172, "learning_rate": 7.519884504363525e-07, "loss": 0.5737, "step": 9285 }, { "epoch": 0.7541622675221311, "grad_norm": 11.241450055916163, "learning_rate": 7.515183626184095e-07, "loss": 0.4297, "step": 9286 }, { "epoch": 0.7542434824981726, "grad_norm": 5.7339323931383035, "learning_rate": 7.510483957861428e-07, "loss": 0.5938, "step": 9287 }, { "epoch": 0.7543246974742143, "grad_norm": 4.59002005480556, "learning_rate": 7.505785499720708e-07, "loss": 0.3925, "step": 9288 }, { "epoch": 0.7544059124502558, "grad_norm": 4.295589622295897, "learning_rate": 7.501088252087046e-07, "loss": 0.5777, "step": 9289 }, { "epoch": 0.7544871274262974, "grad_norm": 6.677871787585794, "learning_rate": 7.496392215285456e-07, "loss": 0.5826, "step": 9290 }, { "epoch": 0.7545683424023389, "grad_norm": 6.4999364694050366, "learning_rate": 7.49169738964089e-07, "loss": 0.6462, "step": 9291 }, { "epoch": 0.7546495573783806, "grad_norm": 4.2437533825373706, "learning_rate": 7.487003775478208e-07, "loss": 0.4204, "step": 9292 }, { "epoch": 0.7547307723544222, "grad_norm": 6.9886624820729475, "learning_rate": 7.482311373122173e-07, "loss": 0.4936, "step": 9293 }, { "epoch": 0.7548119873304637, "grad_norm": 9.509767514085462, "learning_rate": 7.477620182897485e-07, "loss": 0.5231, "step": 9294 }, { "epoch": 0.7548932023065054, "grad_norm": 6.97430223457955, "learning_rate": 7.472930205128748e-07, "loss": 0.614, "step": 9295 }, { "epoch": 0.7549744172825469, "grad_norm": 6.802220975306286, "learning_rate": 7.46824144014047e-07, "loss": 0.4198, "step": 9296 }, { "epoch": 0.7550556322585885, "grad_norm": 5.419177843717326, "learning_rate": 7.4635538882571e-07, "loss": 0.5168, "step": 9297 }, { "epoch": 0.75513684723463, "grad_norm": 4.5756834213715365, "learning_rate": 7.458867549802998e-07, "loss": 0.4739, "step": 9298 }, { "epoch": 0.7552180622106717, "grad_norm": 10.932740692189409, "learning_rate": 7.454182425102418e-07, "loss": 0.3756, "step": 9299 }, { "epoch": 0.7552992771867132, "grad_norm": 7.458052877323758, "learning_rate": 7.449498514479564e-07, "loss": 0.4295, "step": 9300 }, { "epoch": 0.7553804921627548, "grad_norm": 7.572226076657359, "learning_rate": 7.444815818258527e-07, "loss": 0.5706, "step": 9301 }, { "epoch": 0.7554617071387963, "grad_norm": 7.895090561106518, "learning_rate": 7.440134336763316e-07, "loss": 0.7025, "step": 9302 }, { "epoch": 0.755542922114838, "grad_norm": 7.629608085828636, "learning_rate": 7.435454070317885e-07, "loss": 0.4952, "step": 9303 }, { "epoch": 0.7556241370908796, "grad_norm": 5.105832653252446, "learning_rate": 7.430775019246064e-07, "loss": 0.3872, "step": 9304 }, { "epoch": 0.7557053520669211, "grad_norm": 5.491484533703616, "learning_rate": 7.426097183871636e-07, "loss": 0.3978, "step": 9305 }, { "epoch": 0.7557865670429628, "grad_norm": 6.946802048002631, "learning_rate": 7.421420564518267e-07, "loss": 0.6585, "step": 9306 }, { "epoch": 0.7558677820190043, "grad_norm": 3.9872313781539206, "learning_rate": 7.41674516150957e-07, "loss": 0.6047, "step": 9307 }, { "epoch": 0.7559489969950459, "grad_norm": 7.213117113503657, "learning_rate": 7.412070975169047e-07, "loss": 0.583, "step": 9308 }, { "epoch": 0.7560302119710874, "grad_norm": 5.24742138382295, "learning_rate": 7.407398005820123e-07, "loss": 0.4034, "step": 9309 }, { "epoch": 0.7561114269471291, "grad_norm": 4.211946251001222, "learning_rate": 7.402726253786152e-07, "loss": 0.4058, "step": 9310 }, { "epoch": 0.7561926419231706, "grad_norm": 4.415557038084326, "learning_rate": 7.398055719390399e-07, "loss": 0.5176, "step": 9311 }, { "epoch": 0.7562738568992122, "grad_norm": 4.229473804773957, "learning_rate": 7.39338640295602e-07, "loss": 0.533, "step": 9312 }, { "epoch": 0.7563550718752537, "grad_norm": 6.390722780847675, "learning_rate": 7.388718304806133e-07, "loss": 0.4943, "step": 9313 }, { "epoch": 0.7564362868512954, "grad_norm": 3.9879443116206224, "learning_rate": 7.384051425263733e-07, "loss": 0.6664, "step": 9314 }, { "epoch": 0.756517501827337, "grad_norm": 4.396256081895415, "learning_rate": 7.379385764651737e-07, "loss": 0.3858, "step": 9315 }, { "epoch": 0.7565987168033785, "grad_norm": 9.650686792105313, "learning_rate": 7.374721323292985e-07, "loss": 0.6082, "step": 9316 }, { "epoch": 0.7566799317794202, "grad_norm": 4.52159603222765, "learning_rate": 7.370058101510249e-07, "loss": 0.4969, "step": 9317 }, { "epoch": 0.7567611467554617, "grad_norm": 4.592670810869141, "learning_rate": 7.365396099626176e-07, "loss": 0.6284, "step": 9318 }, { "epoch": 0.7568423617315033, "grad_norm": 14.987779633029737, "learning_rate": 7.360735317963374e-07, "loss": 0.4282, "step": 9319 }, { "epoch": 0.7569235767075448, "grad_norm": 5.108452215084867, "learning_rate": 7.356075756844333e-07, "loss": 0.5101, "step": 9320 }, { "epoch": 0.7570047916835865, "grad_norm": 6.336779399554521, "learning_rate": 7.351417416591461e-07, "loss": 0.4218, "step": 9321 }, { "epoch": 0.757086006659628, "grad_norm": 3.2420955010959633, "learning_rate": 7.346760297527109e-07, "loss": 0.4596, "step": 9322 }, { "epoch": 0.7571672216356696, "grad_norm": 14.31442060248585, "learning_rate": 7.342104399973507e-07, "loss": 0.6222, "step": 9323 }, { "epoch": 0.7572484366117112, "grad_norm": 4.841998000446461, "learning_rate": 7.337449724252837e-07, "loss": 0.3906, "step": 9324 }, { "epoch": 0.7573296515877528, "grad_norm": 6.189602667818374, "learning_rate": 7.332796270687159e-07, "loss": 0.5206, "step": 9325 }, { "epoch": 0.7574108665637944, "grad_norm": 3.4330814806419747, "learning_rate": 7.328144039598487e-07, "loss": 0.4608, "step": 9326 }, { "epoch": 0.7574920815398359, "grad_norm": 8.246364139369895, "learning_rate": 7.323493031308718e-07, "loss": 0.3708, "step": 9327 }, { "epoch": 0.7575732965158776, "grad_norm": 11.977800366186257, "learning_rate": 7.318843246139673e-07, "loss": 0.4251, "step": 9328 }, { "epoch": 0.7576545114919191, "grad_norm": 4.480080892757191, "learning_rate": 7.314194684413098e-07, "loss": 0.5398, "step": 9329 }, { "epoch": 0.7577357264679607, "grad_norm": 5.761110269898293, "learning_rate": 7.309547346450658e-07, "loss": 0.486, "step": 9330 }, { "epoch": 0.7578169414440022, "grad_norm": 4.670162793343675, "learning_rate": 7.304901232573908e-07, "loss": 0.4607, "step": 9331 }, { "epoch": 0.7578981564200439, "grad_norm": 7.987770600523061, "learning_rate": 7.300256343104351e-07, "loss": 0.4614, "step": 9332 }, { "epoch": 0.7579793713960854, "grad_norm": 5.479915469263117, "learning_rate": 7.295612678363382e-07, "loss": 0.3639, "step": 9333 }, { "epoch": 0.758060586372127, "grad_norm": 25.580608798332342, "learning_rate": 7.290970238672307e-07, "loss": 0.3886, "step": 9334 }, { "epoch": 0.7581418013481686, "grad_norm": 3.80358687651955, "learning_rate": 7.286329024352376e-07, "loss": 0.5083, "step": 9335 }, { "epoch": 0.7582230163242102, "grad_norm": 3.6188064823034747, "learning_rate": 7.281689035724718e-07, "loss": 0.4203, "step": 9336 }, { "epoch": 0.7583042313002518, "grad_norm": 6.662801100557623, "learning_rate": 7.277050273110408e-07, "loss": 0.428, "step": 9337 }, { "epoch": 0.7583854462762933, "grad_norm": 4.198195373303361, "learning_rate": 7.272412736830431e-07, "loss": 0.5499, "step": 9338 }, { "epoch": 0.758466661252335, "grad_norm": 7.26119245955303, "learning_rate": 7.26777642720567e-07, "loss": 0.5308, "step": 9339 }, { "epoch": 0.7585478762283765, "grad_norm": 6.6134602398961295, "learning_rate": 7.263141344556924e-07, "loss": 0.4514, "step": 9340 }, { "epoch": 0.7586290912044181, "grad_norm": 3.9208951218117427, "learning_rate": 7.258507489204935e-07, "loss": 0.3762, "step": 9341 }, { "epoch": 0.7587103061804596, "grad_norm": 7.76615893471226, "learning_rate": 7.253874861470325e-07, "loss": 0.5326, "step": 9342 }, { "epoch": 0.7587915211565013, "grad_norm": 7.24244627028924, "learning_rate": 7.24924346167366e-07, "loss": 0.4045, "step": 9343 }, { "epoch": 0.7588727361325428, "grad_norm": 3.373479639826732, "learning_rate": 7.244613290135396e-07, "loss": 0.4882, "step": 9344 }, { "epoch": 0.7589539511085844, "grad_norm": 6.628941975131967, "learning_rate": 7.239984347175932e-07, "loss": 0.3442, "step": 9345 }, { "epoch": 0.759035166084626, "grad_norm": 3.784372823794077, "learning_rate": 7.235356633115559e-07, "loss": 0.5905, "step": 9346 }, { "epoch": 0.7591163810606676, "grad_norm": 4.27156618640782, "learning_rate": 7.230730148274478e-07, "loss": 0.3702, "step": 9347 }, { "epoch": 0.7591975960367092, "grad_norm": 3.9436102310547327, "learning_rate": 7.226104892972838e-07, "loss": 0.3701, "step": 9348 }, { "epoch": 0.7592788110127507, "grad_norm": 3.9798514443021196, "learning_rate": 7.221480867530664e-07, "loss": 0.4805, "step": 9349 }, { "epoch": 0.7593600259887924, "grad_norm": 3.87348251530302, "learning_rate": 7.216858072267924e-07, "loss": 0.5473, "step": 9350 }, { "epoch": 0.7594412409648339, "grad_norm": 5.915769976523819, "learning_rate": 7.212236507504494e-07, "loss": 0.5054, "step": 9351 }, { "epoch": 0.7595224559408755, "grad_norm": 3.8732533424148605, "learning_rate": 7.207616173560158e-07, "loss": 0.4806, "step": 9352 }, { "epoch": 0.759603670916917, "grad_norm": 6.11681123799206, "learning_rate": 7.202997070754613e-07, "loss": 0.46, "step": 9353 }, { "epoch": 0.7596848858929587, "grad_norm": 8.365104395894681, "learning_rate": 7.198379199407488e-07, "loss": 0.5624, "step": 9354 }, { "epoch": 0.7597661008690002, "grad_norm": 6.4866744161399685, "learning_rate": 7.193762559838299e-07, "loss": 0.5142, "step": 9355 }, { "epoch": 0.7598473158450418, "grad_norm": 6.4484042472139675, "learning_rate": 7.189147152366504e-07, "loss": 0.3683, "step": 9356 }, { "epoch": 0.7599285308210834, "grad_norm": 4.5783151146826935, "learning_rate": 7.184532977311471e-07, "loss": 0.4517, "step": 9357 }, { "epoch": 0.760009745797125, "grad_norm": 4.142951500808283, "learning_rate": 7.179920034992469e-07, "loss": 0.4783, "step": 9358 }, { "epoch": 0.7600909607731666, "grad_norm": 3.327558136006068, "learning_rate": 7.175308325728689e-07, "loss": 0.6403, "step": 9359 }, { "epoch": 0.7601721757492081, "grad_norm": 7.732587779509069, "learning_rate": 7.170697849839229e-07, "loss": 0.4863, "step": 9360 }, { "epoch": 0.7602533907252498, "grad_norm": 3.4261727551662866, "learning_rate": 7.166088607643123e-07, "loss": 0.4994, "step": 9361 }, { "epoch": 0.7603346057012913, "grad_norm": 7.420461816144346, "learning_rate": 7.161480599459297e-07, "loss": 0.3715, "step": 9362 }, { "epoch": 0.7604158206773329, "grad_norm": 4.177142639059444, "learning_rate": 7.156873825606603e-07, "loss": 0.5635, "step": 9363 }, { "epoch": 0.7604970356533745, "grad_norm": 6.465161714985024, "learning_rate": 7.152268286403813e-07, "loss": 0.4047, "step": 9364 }, { "epoch": 0.7605782506294161, "grad_norm": 6.173359562536338, "learning_rate": 7.147663982169601e-07, "loss": 0.4028, "step": 9365 }, { "epoch": 0.7606594656054576, "grad_norm": 3.9455176413170583, "learning_rate": 7.143060913222552e-07, "loss": 0.6058, "step": 9366 }, { "epoch": 0.7607406805814992, "grad_norm": 10.642099423017335, "learning_rate": 7.138459079881188e-07, "loss": 0.706, "step": 9367 }, { "epoch": 0.7608218955575408, "grad_norm": 4.19417453033392, "learning_rate": 7.133858482463918e-07, "loss": 0.6098, "step": 9368 }, { "epoch": 0.7609031105335824, "grad_norm": 4.253649965044964, "learning_rate": 7.129259121289086e-07, "loss": 0.4453, "step": 9369 }, { "epoch": 0.760984325509624, "grad_norm": 3.668621601717135, "learning_rate": 7.124660996674951e-07, "loss": 0.5733, "step": 9370 }, { "epoch": 0.7610655404856655, "grad_norm": 6.048356569124348, "learning_rate": 7.12006410893967e-07, "loss": 0.4285, "step": 9371 }, { "epoch": 0.7611467554617072, "grad_norm": 4.317558436319294, "learning_rate": 7.115468458401317e-07, "loss": 0.6383, "step": 9372 }, { "epoch": 0.7612279704377487, "grad_norm": 7.296251737717159, "learning_rate": 7.110874045377902e-07, "loss": 0.551, "step": 9373 }, { "epoch": 0.7613091854137903, "grad_norm": 5.66808374273542, "learning_rate": 7.106280870187326e-07, "loss": 0.4665, "step": 9374 }, { "epoch": 0.7613904003898319, "grad_norm": 3.897913983532991, "learning_rate": 7.101688933147397e-07, "loss": 0.5898, "step": 9375 }, { "epoch": 0.7614716153658735, "grad_norm": 5.913213010761257, "learning_rate": 7.097098234575883e-07, "loss": 0.379, "step": 9376 }, { "epoch": 0.761552830341915, "grad_norm": 8.586921010693551, "learning_rate": 7.092508774790424e-07, "loss": 0.3769, "step": 9377 }, { "epoch": 0.7616340453179566, "grad_norm": 6.837790290025689, "learning_rate": 7.087920554108582e-07, "loss": 0.5177, "step": 9378 }, { "epoch": 0.7617152602939982, "grad_norm": 4.809392070744878, "learning_rate": 7.083333572847831e-07, "loss": 0.4528, "step": 9379 }, { "epoch": 0.7617964752700398, "grad_norm": 3.467857419045606, "learning_rate": 7.078747831325583e-07, "loss": 0.5457, "step": 9380 }, { "epoch": 0.7618776902460814, "grad_norm": 4.495338193171126, "learning_rate": 7.074163329859129e-07, "loss": 0.3799, "step": 9381 }, { "epoch": 0.761958905222123, "grad_norm": 4.00025930868917, "learning_rate": 7.069580068765702e-07, "loss": 0.3478, "step": 9382 }, { "epoch": 0.7620401201981646, "grad_norm": 10.827446952884547, "learning_rate": 7.064998048362448e-07, "loss": 0.4943, "step": 9383 }, { "epoch": 0.7621213351742061, "grad_norm": 5.064306842835891, "learning_rate": 7.060417268966408e-07, "loss": 0.4528, "step": 9384 }, { "epoch": 0.7622025501502477, "grad_norm": 4.468211696665271, "learning_rate": 7.055837730894541e-07, "loss": 0.4532, "step": 9385 }, { "epoch": 0.7622837651262893, "grad_norm": 6.800829039851487, "learning_rate": 7.051259434463745e-07, "loss": 0.347, "step": 9386 }, { "epoch": 0.7623649801023309, "grad_norm": 6.208378537952931, "learning_rate": 7.046682379990794e-07, "loss": 0.4928, "step": 9387 }, { "epoch": 0.7624461950783724, "grad_norm": 4.2620464174120425, "learning_rate": 7.042106567792406e-07, "loss": 0.5339, "step": 9388 }, { "epoch": 0.762527410054414, "grad_norm": 5.564383633638113, "learning_rate": 7.03753199818521e-07, "loss": 0.4415, "step": 9389 }, { "epoch": 0.7626086250304556, "grad_norm": 6.375756161119263, "learning_rate": 7.032958671485734e-07, "loss": 0.4055, "step": 9390 }, { "epoch": 0.7626898400064972, "grad_norm": 6.484850680260223, "learning_rate": 7.028386588010421e-07, "loss": 0.5106, "step": 9391 }, { "epoch": 0.7627710549825388, "grad_norm": 6.127381595989031, "learning_rate": 7.023815748075651e-07, "loss": 0.324, "step": 9392 }, { "epoch": 0.7628522699585804, "grad_norm": 4.069418782141938, "learning_rate": 7.019246151997694e-07, "loss": 0.4221, "step": 9393 }, { "epoch": 0.762933484934622, "grad_norm": 3.966681143690813, "learning_rate": 7.014677800092734e-07, "loss": 0.5576, "step": 9394 }, { "epoch": 0.7630146999106635, "grad_norm": 6.497751818986687, "learning_rate": 7.010110692676886e-07, "loss": 0.4702, "step": 9395 }, { "epoch": 0.7630959148867051, "grad_norm": 6.139900453454705, "learning_rate": 7.005544830066172e-07, "loss": 0.3902, "step": 9396 }, { "epoch": 0.7631771298627467, "grad_norm": 5.005470766044911, "learning_rate": 7.000980212576522e-07, "loss": 0.4043, "step": 9397 }, { "epoch": 0.7632583448387883, "grad_norm": 13.852454716609795, "learning_rate": 6.996416840523776e-07, "loss": 0.5992, "step": 9398 }, { "epoch": 0.7633395598148298, "grad_norm": 4.487243506708304, "learning_rate": 6.991854714223711e-07, "loss": 0.4657, "step": 9399 }, { "epoch": 0.7634207747908714, "grad_norm": 5.202920899583729, "learning_rate": 6.987293833991984e-07, "loss": 0.6265, "step": 9400 }, { "epoch": 0.763501989766913, "grad_norm": 17.066516911339072, "learning_rate": 6.982734200144192e-07, "loss": 0.4925, "step": 9401 }, { "epoch": 0.7635832047429546, "grad_norm": 3.6601744793382305, "learning_rate": 6.978175812995847e-07, "loss": 0.4923, "step": 9402 }, { "epoch": 0.7636644197189962, "grad_norm": 6.162261867859467, "learning_rate": 6.973618672862357e-07, "loss": 0.3402, "step": 9403 }, { "epoch": 0.7637456346950378, "grad_norm": 3.383805533605749, "learning_rate": 6.969062780059041e-07, "loss": 0.4892, "step": 9404 }, { "epoch": 0.7638268496710794, "grad_norm": 5.034659251576166, "learning_rate": 6.964508134901162e-07, "loss": 0.4553, "step": 9405 }, { "epoch": 0.7639080646471209, "grad_norm": 4.219393214106526, "learning_rate": 6.959954737703872e-07, "loss": 0.3549, "step": 9406 }, { "epoch": 0.7639892796231625, "grad_norm": 7.125686847574229, "learning_rate": 6.955402588782229e-07, "loss": 0.4162, "step": 9407 }, { "epoch": 0.7640704945992041, "grad_norm": 4.00897406634947, "learning_rate": 6.950851688451224e-07, "loss": 0.4606, "step": 9408 }, { "epoch": 0.7641517095752457, "grad_norm": 5.368532207744499, "learning_rate": 6.94630203702577e-07, "loss": 0.6832, "step": 9409 }, { "epoch": 0.7642329245512872, "grad_norm": 4.61391210253197, "learning_rate": 6.941753634820658e-07, "loss": 0.4458, "step": 9410 }, { "epoch": 0.7643141395273289, "grad_norm": 10.571864997786388, "learning_rate": 6.93720648215063e-07, "loss": 0.3812, "step": 9411 }, { "epoch": 0.7643953545033704, "grad_norm": 6.071699040314471, "learning_rate": 6.932660579330317e-07, "loss": 0.3526, "step": 9412 }, { "epoch": 0.764476569479412, "grad_norm": 5.121163071457247, "learning_rate": 6.928115926674265e-07, "loss": 0.5468, "step": 9413 }, { "epoch": 0.7645577844554536, "grad_norm": 4.5789958968091025, "learning_rate": 6.923572524496946e-07, "loss": 0.4688, "step": 9414 }, { "epoch": 0.7646389994314952, "grad_norm": 4.61496141221473, "learning_rate": 6.919030373112748e-07, "loss": 0.538, "step": 9415 }, { "epoch": 0.7647202144075368, "grad_norm": 5.530010793903644, "learning_rate": 6.914489472835959e-07, "loss": 0.3921, "step": 9416 }, { "epoch": 0.7648014293835783, "grad_norm": 4.053580386916616, "learning_rate": 6.909949823980772e-07, "loss": 0.4411, "step": 9417 }, { "epoch": 0.76488264435962, "grad_norm": 5.361420424547812, "learning_rate": 6.905411426861322e-07, "loss": 0.4674, "step": 9418 }, { "epoch": 0.7649638593356615, "grad_norm": 6.357458261429573, "learning_rate": 6.900874281791639e-07, "loss": 0.4666, "step": 9419 }, { "epoch": 0.7650450743117031, "grad_norm": 6.195545553218025, "learning_rate": 6.89633838908566e-07, "loss": 0.4103, "step": 9420 }, { "epoch": 0.7651262892877446, "grad_norm": 5.87439782429603, "learning_rate": 6.891803749057255e-07, "loss": 0.415, "step": 9421 }, { "epoch": 0.7652075042637863, "grad_norm": 3.5630192542232497, "learning_rate": 6.887270362020199e-07, "loss": 0.4371, "step": 9422 }, { "epoch": 0.7652887192398278, "grad_norm": 6.836068046454834, "learning_rate": 6.882738228288166e-07, "loss": 0.3986, "step": 9423 }, { "epoch": 0.7653699342158694, "grad_norm": 6.446997290383441, "learning_rate": 6.87820734817477e-07, "loss": 0.4896, "step": 9424 }, { "epoch": 0.765451149191911, "grad_norm": 4.358114968607592, "learning_rate": 6.873677721993518e-07, "loss": 0.514, "step": 9425 }, { "epoch": 0.7655323641679526, "grad_norm": 8.732219383382438, "learning_rate": 6.86914935005783e-07, "loss": 0.5265, "step": 9426 }, { "epoch": 0.7656135791439942, "grad_norm": 4.310805179732659, "learning_rate": 6.864622232681048e-07, "loss": 0.4076, "step": 9427 }, { "epoch": 0.7656947941200357, "grad_norm": 4.045216285940855, "learning_rate": 6.860096370176436e-07, "loss": 0.5051, "step": 9428 }, { "epoch": 0.7657760090960773, "grad_norm": 7.3004627003203435, "learning_rate": 6.855571762857144e-07, "loss": 0.5137, "step": 9429 }, { "epoch": 0.7658572240721189, "grad_norm": 4.355642197018315, "learning_rate": 6.851048411036265e-07, "loss": 0.4596, "step": 9430 }, { "epoch": 0.7659384390481605, "grad_norm": 4.107096289312408, "learning_rate": 6.846526315026783e-07, "loss": 0.5929, "step": 9431 }, { "epoch": 0.766019654024202, "grad_norm": 4.55847735238035, "learning_rate": 6.842005475141606e-07, "loss": 0.5062, "step": 9432 }, { "epoch": 0.7661008690002437, "grad_norm": 4.084279279942593, "learning_rate": 6.837485891693541e-07, "loss": 0.5043, "step": 9433 }, { "epoch": 0.7661820839762852, "grad_norm": 13.22441226336881, "learning_rate": 6.83296756499533e-07, "loss": 0.4094, "step": 9434 }, { "epoch": 0.7662632989523268, "grad_norm": 4.334505119567936, "learning_rate": 6.828450495359623e-07, "loss": 0.4747, "step": 9435 }, { "epoch": 0.7663445139283684, "grad_norm": 5.930691768024282, "learning_rate": 6.823934683098963e-07, "loss": 0.6257, "step": 9436 }, { "epoch": 0.76642572890441, "grad_norm": 3.3704120793280197, "learning_rate": 6.819420128525834e-07, "loss": 0.3181, "step": 9437 }, { "epoch": 0.7665069438804516, "grad_norm": 3.842942498154423, "learning_rate": 6.814906831952611e-07, "loss": 0.4773, "step": 9438 }, { "epoch": 0.7665881588564931, "grad_norm": 8.895568429556429, "learning_rate": 6.810394793691585e-07, "loss": 0.3918, "step": 9439 }, { "epoch": 0.7666693738325348, "grad_norm": 6.8916662279772565, "learning_rate": 6.805884014054975e-07, "loss": 0.4543, "step": 9440 }, { "epoch": 0.7667505888085763, "grad_norm": 4.932759072758281, "learning_rate": 6.801374493354907e-07, "loss": 0.555, "step": 9441 }, { "epoch": 0.7668318037846179, "grad_norm": 3.165718589290986, "learning_rate": 6.796866231903402e-07, "loss": 0.5693, "step": 9442 }, { "epoch": 0.7669130187606594, "grad_norm": 5.661881562952782, "learning_rate": 6.792359230012418e-07, "loss": 0.3955, "step": 9443 }, { "epoch": 0.7669942337367011, "grad_norm": 5.857023328763498, "learning_rate": 6.787853487993817e-07, "loss": 0.2898, "step": 9444 }, { "epoch": 0.7670754487127426, "grad_norm": 6.46410699371152, "learning_rate": 6.783349006159359e-07, "loss": 0.326, "step": 9445 }, { "epoch": 0.7671566636887842, "grad_norm": 6.9335177221214614, "learning_rate": 6.778845784820739e-07, "loss": 0.4844, "step": 9446 }, { "epoch": 0.7672378786648258, "grad_norm": 5.439529483084923, "learning_rate": 6.774343824289567e-07, "loss": 0.6114, "step": 9447 }, { "epoch": 0.7673190936408674, "grad_norm": 5.645470096440396, "learning_rate": 6.769843124877343e-07, "loss": 0.5204, "step": 9448 }, { "epoch": 0.767400308616909, "grad_norm": 24.655262909769625, "learning_rate": 6.765343686895484e-07, "loss": 0.4941, "step": 9449 }, { "epoch": 0.7674815235929505, "grad_norm": 6.241765051887523, "learning_rate": 6.760845510655345e-07, "loss": 0.3861, "step": 9450 }, { "epoch": 0.7675627385689922, "grad_norm": 8.035908170373448, "learning_rate": 6.756348596468168e-07, "loss": 0.5332, "step": 9451 }, { "epoch": 0.7676439535450337, "grad_norm": 4.6392581562003405, "learning_rate": 6.751852944645107e-07, "loss": 0.4359, "step": 9452 }, { "epoch": 0.7677251685210753, "grad_norm": 6.355510758018541, "learning_rate": 6.747358555497244e-07, "loss": 0.4006, "step": 9453 }, { "epoch": 0.7678063834971168, "grad_norm": 5.191185254070647, "learning_rate": 6.742865429335576e-07, "loss": 0.6034, "step": 9454 }, { "epoch": 0.7678875984731585, "grad_norm": 4.336515041136279, "learning_rate": 6.738373566470991e-07, "loss": 0.3565, "step": 9455 }, { "epoch": 0.7679688134492, "grad_norm": 7.028294785077958, "learning_rate": 6.733882967214312e-07, "loss": 0.4001, "step": 9456 }, { "epoch": 0.7680500284252416, "grad_norm": 3.345393934309023, "learning_rate": 6.729393631876257e-07, "loss": 0.4984, "step": 9457 }, { "epoch": 0.7681312434012832, "grad_norm": 4.618093066401069, "learning_rate": 6.724905560767464e-07, "loss": 0.3027, "step": 9458 }, { "epoch": 0.7682124583773248, "grad_norm": 5.511844154772235, "learning_rate": 6.720418754198485e-07, "loss": 0.3492, "step": 9459 }, { "epoch": 0.7682936733533664, "grad_norm": 4.512587186638037, "learning_rate": 6.715933212479791e-07, "loss": 0.3126, "step": 9460 }, { "epoch": 0.7683748883294079, "grad_norm": 3.8768216178706996, "learning_rate": 6.711448935921744e-07, "loss": 0.4618, "step": 9461 }, { "epoch": 0.7684561033054496, "grad_norm": 4.849694261108834, "learning_rate": 6.706965924834649e-07, "loss": 0.3315, "step": 9462 }, { "epoch": 0.7685373182814911, "grad_norm": 4.801051258082802, "learning_rate": 6.702484179528699e-07, "loss": 0.5456, "step": 9463 }, { "epoch": 0.7686185332575327, "grad_norm": 9.352209783795086, "learning_rate": 6.698003700313993e-07, "loss": 0.427, "step": 9464 }, { "epoch": 0.7686997482335742, "grad_norm": 7.643828934266029, "learning_rate": 6.69352448750058e-07, "loss": 0.7298, "step": 9465 }, { "epoch": 0.7687809632096159, "grad_norm": 5.286073745430146, "learning_rate": 6.689046541398378e-07, "loss": 0.5492, "step": 9466 }, { "epoch": 0.7688621781856574, "grad_norm": 7.104403560909407, "learning_rate": 6.684569862317255e-07, "loss": 0.4082, "step": 9467 }, { "epoch": 0.768943393161699, "grad_norm": 21.80499840324673, "learning_rate": 6.680094450566957e-07, "loss": 0.4536, "step": 9468 }, { "epoch": 0.7690246081377407, "grad_norm": 9.81351894485181, "learning_rate": 6.675620306457172e-07, "loss": 0.4041, "step": 9469 }, { "epoch": 0.7691058231137822, "grad_norm": 5.220882981566686, "learning_rate": 6.671147430297481e-07, "loss": 0.4332, "step": 9470 }, { "epoch": 0.7691870380898238, "grad_norm": 5.429857205093562, "learning_rate": 6.666675822397378e-07, "loss": 0.4061, "step": 9471 }, { "epoch": 0.7692682530658653, "grad_norm": 8.052028153479819, "learning_rate": 6.662205483066281e-07, "loss": 0.326, "step": 9472 }, { "epoch": 0.769349468041907, "grad_norm": 4.655903640053554, "learning_rate": 6.65773641261352e-07, "loss": 0.469, "step": 9473 }, { "epoch": 0.7694306830179485, "grad_norm": 6.279574614073089, "learning_rate": 6.653268611348315e-07, "loss": 0.3736, "step": 9474 }, { "epoch": 0.7695118979939901, "grad_norm": 4.349805162366064, "learning_rate": 6.64880207957983e-07, "loss": 0.4281, "step": 9475 }, { "epoch": 0.7695931129700316, "grad_norm": 4.485659687329684, "learning_rate": 6.644336817617122e-07, "loss": 0.4795, "step": 9476 }, { "epoch": 0.7696743279460733, "grad_norm": 5.1817758651078885, "learning_rate": 6.63987282576915e-07, "loss": 0.4077, "step": 9477 }, { "epoch": 0.7697555429221148, "grad_norm": 3.6533998487921298, "learning_rate": 6.635410104344819e-07, "loss": 0.5877, "step": 9478 }, { "epoch": 0.7698367578981564, "grad_norm": 5.823071091152919, "learning_rate": 6.630948653652905e-07, "loss": 0.4759, "step": 9479 }, { "epoch": 0.769917972874198, "grad_norm": 6.866234496364005, "learning_rate": 6.62648847400213e-07, "loss": 0.4428, "step": 9480 }, { "epoch": 0.7699991878502396, "grad_norm": 5.054351987364898, "learning_rate": 6.622029565701118e-07, "loss": 0.4417, "step": 9481 }, { "epoch": 0.7700804028262812, "grad_norm": 4.79929266732889, "learning_rate": 6.617571929058397e-07, "loss": 0.4682, "step": 9482 }, { "epoch": 0.7701616178023227, "grad_norm": 4.783027204950755, "learning_rate": 6.613115564382403e-07, "loss": 0.402, "step": 9483 }, { "epoch": 0.7702428327783644, "grad_norm": 11.930675421175312, "learning_rate": 6.608660471981509e-07, "loss": 0.4791, "step": 9484 }, { "epoch": 0.7703240477544059, "grad_norm": 6.054291067646692, "learning_rate": 6.604206652163967e-07, "loss": 0.4235, "step": 9485 }, { "epoch": 0.7704052627304475, "grad_norm": 7.228203275275898, "learning_rate": 6.599754105237974e-07, "loss": 0.4832, "step": 9486 }, { "epoch": 0.770486477706489, "grad_norm": 3.7391869318420285, "learning_rate": 6.595302831511607e-07, "loss": 0.537, "step": 9487 }, { "epoch": 0.7705676926825307, "grad_norm": 5.637019086673706, "learning_rate": 6.590852831292885e-07, "loss": 0.6264, "step": 9488 }, { "epoch": 0.7706489076585722, "grad_norm": 7.91711166378353, "learning_rate": 6.586404104889721e-07, "loss": 0.477, "step": 9489 }, { "epoch": 0.7707301226346138, "grad_norm": 5.980422311755343, "learning_rate": 6.58195665260993e-07, "loss": 0.433, "step": 9490 }, { "epoch": 0.7708113376106555, "grad_norm": 10.886812979952378, "learning_rate": 6.577510474761272e-07, "loss": 0.4323, "step": 9491 }, { "epoch": 0.770892552586697, "grad_norm": 5.242170156688564, "learning_rate": 6.573065571651383e-07, "loss": 0.3652, "step": 9492 }, { "epoch": 0.7709737675627386, "grad_norm": 5.032479849055025, "learning_rate": 6.56862194358783e-07, "loss": 0.4055, "step": 9493 }, { "epoch": 0.7710549825387801, "grad_norm": 9.525750208818708, "learning_rate": 6.5641795908781e-07, "loss": 0.5108, "step": 9494 }, { "epoch": 0.7711361975148218, "grad_norm": 4.492371142698525, "learning_rate": 6.559738513829572e-07, "loss": 0.679, "step": 9495 }, { "epoch": 0.7712174124908633, "grad_norm": 5.27190852592138, "learning_rate": 6.555298712749538e-07, "loss": 0.5479, "step": 9496 }, { "epoch": 0.7712986274669049, "grad_norm": 5.955473036514189, "learning_rate": 6.550860187945227e-07, "loss": 0.4585, "step": 9497 }, { "epoch": 0.7713798424429464, "grad_norm": 5.382341493930171, "learning_rate": 6.546422939723738e-07, "loss": 0.4537, "step": 9498 }, { "epoch": 0.7714610574189881, "grad_norm": 5.842094824500748, "learning_rate": 6.541986968392119e-07, "loss": 0.4458, "step": 9499 }, { "epoch": 0.7715422723950296, "grad_norm": 4.8906020204774165, "learning_rate": 6.537552274257322e-07, "loss": 0.4932, "step": 9500 }, { "epoch": 0.7716234873710712, "grad_norm": 8.934213855225288, "learning_rate": 6.533118857626194e-07, "loss": 0.5223, "step": 9501 }, { "epoch": 0.7717047023471129, "grad_norm": 4.746748520348421, "learning_rate": 6.52868671880551e-07, "loss": 0.5565, "step": 9502 }, { "epoch": 0.7717859173231544, "grad_norm": 8.532705058823186, "learning_rate": 6.524255858101938e-07, "loss": 0.5495, "step": 9503 }, { "epoch": 0.771867132299196, "grad_norm": 10.865485946925974, "learning_rate": 6.519826275822086e-07, "loss": 0.5132, "step": 9504 }, { "epoch": 0.7719483472752375, "grad_norm": 3.470727903343789, "learning_rate": 6.515397972272444e-07, "loss": 0.4055, "step": 9505 }, { "epoch": 0.7720295622512792, "grad_norm": 4.190361234303148, "learning_rate": 6.510970947759434e-07, "loss": 0.4745, "step": 9506 }, { "epoch": 0.7721107772273207, "grad_norm": 4.1269789461455515, "learning_rate": 6.50654520258939e-07, "loss": 0.4198, "step": 9507 }, { "epoch": 0.7721919922033623, "grad_norm": 5.796517123286705, "learning_rate": 6.502120737068543e-07, "loss": 0.5517, "step": 9508 }, { "epoch": 0.7722732071794038, "grad_norm": 3.9798948730265002, "learning_rate": 6.497697551503032e-07, "loss": 0.4771, "step": 9509 }, { "epoch": 0.7723544221554455, "grad_norm": 3.3670043032275445, "learning_rate": 6.493275646198941e-07, "loss": 0.507, "step": 9510 }, { "epoch": 0.772435637131487, "grad_norm": 4.789052775696503, "learning_rate": 6.488855021462218e-07, "loss": 0.651, "step": 9511 }, { "epoch": 0.7725168521075286, "grad_norm": 4.574449211149501, "learning_rate": 6.484435677598761e-07, "loss": 0.3833, "step": 9512 }, { "epoch": 0.7725980670835703, "grad_norm": 5.744708779873541, "learning_rate": 6.480017614914369e-07, "loss": 0.52, "step": 9513 }, { "epoch": 0.7726792820596118, "grad_norm": 6.185709557761946, "learning_rate": 6.475600833714743e-07, "loss": 0.5064, "step": 9514 }, { "epoch": 0.7727604970356534, "grad_norm": 6.093796766128401, "learning_rate": 6.471185334305491e-07, "loss": 0.5049, "step": 9515 }, { "epoch": 0.7728417120116949, "grad_norm": 5.325643999422, "learning_rate": 6.466771116992162e-07, "loss": 0.4234, "step": 9516 }, { "epoch": 0.7729229269877366, "grad_norm": 5.854284079602375, "learning_rate": 6.462358182080175e-07, "loss": 0.504, "step": 9517 }, { "epoch": 0.7730041419637781, "grad_norm": 6.5341398235283865, "learning_rate": 6.457946529874895e-07, "loss": 0.6357, "step": 9518 }, { "epoch": 0.7730853569398197, "grad_norm": 3.7271101675803027, "learning_rate": 6.453536160681592e-07, "loss": 0.484, "step": 9519 }, { "epoch": 0.7731665719158612, "grad_norm": 3.610155546754207, "learning_rate": 6.449127074805428e-07, "loss": 0.464, "step": 9520 }, { "epoch": 0.7732477868919029, "grad_norm": 4.384516099481292, "learning_rate": 6.444719272551491e-07, "loss": 0.4131, "step": 9521 }, { "epoch": 0.7733290018679444, "grad_norm": 7.395867107582111, "learning_rate": 6.440312754224773e-07, "loss": 0.489, "step": 9522 }, { "epoch": 0.773410216843986, "grad_norm": 3.52191839672315, "learning_rate": 6.435907520130191e-07, "loss": 0.3974, "step": 9523 }, { "epoch": 0.7734914318200277, "grad_norm": 5.735438136851437, "learning_rate": 6.431503570572554e-07, "loss": 0.3681, "step": 9524 }, { "epoch": 0.7735726467960692, "grad_norm": 4.138098595663441, "learning_rate": 6.427100905856598e-07, "loss": 0.46, "step": 9525 }, { "epoch": 0.7736538617721108, "grad_norm": 4.627439119735813, "learning_rate": 6.422699526286969e-07, "loss": 0.5792, "step": 9526 }, { "epoch": 0.7737350767481523, "grad_norm": 4.326175948208715, "learning_rate": 6.418299432168215e-07, "loss": 0.6043, "step": 9527 }, { "epoch": 0.773816291724194, "grad_norm": 4.7243209100130334, "learning_rate": 6.413900623804792e-07, "loss": 0.3815, "step": 9528 }, { "epoch": 0.7738975067002355, "grad_norm": 6.9628262078532535, "learning_rate": 6.409503101501086e-07, "loss": 0.5016, "step": 9529 }, { "epoch": 0.7739787216762771, "grad_norm": 3.781446710484677, "learning_rate": 6.405106865561367e-07, "loss": 0.6054, "step": 9530 }, { "epoch": 0.7740599366523186, "grad_norm": 8.30986430715804, "learning_rate": 6.400711916289846e-07, "loss": 0.443, "step": 9531 }, { "epoch": 0.7741411516283603, "grad_norm": 4.838062384790877, "learning_rate": 6.396318253990628e-07, "loss": 0.4772, "step": 9532 }, { "epoch": 0.7742223666044018, "grad_norm": 6.597678306183141, "learning_rate": 6.391925878967728e-07, "loss": 0.4941, "step": 9533 }, { "epoch": 0.7743035815804434, "grad_norm": 4.6205393788485845, "learning_rate": 6.387534791525072e-07, "loss": 0.5037, "step": 9534 }, { "epoch": 0.7743847965564851, "grad_norm": 4.269745258902035, "learning_rate": 6.383144991966508e-07, "loss": 0.7124, "step": 9535 }, { "epoch": 0.7744660115325266, "grad_norm": 7.2941195850877145, "learning_rate": 6.378756480595782e-07, "loss": 0.4697, "step": 9536 }, { "epoch": 0.7745472265085682, "grad_norm": 8.30813779565925, "learning_rate": 6.374369257716548e-07, "loss": 0.4272, "step": 9537 }, { "epoch": 0.7746284414846097, "grad_norm": 5.268713325301926, "learning_rate": 6.369983323632389e-07, "loss": 0.3696, "step": 9538 }, { "epoch": 0.7747096564606514, "grad_norm": 6.353077785001505, "learning_rate": 6.365598678646793e-07, "loss": 0.499, "step": 9539 }, { "epoch": 0.7747908714366929, "grad_norm": 7.787690788411613, "learning_rate": 6.361215323063144e-07, "loss": 0.517, "step": 9540 }, { "epoch": 0.7748720864127345, "grad_norm": 6.712726861513404, "learning_rate": 6.356833257184747e-07, "loss": 0.3541, "step": 9541 }, { "epoch": 0.774953301388776, "grad_norm": 7.036101207768536, "learning_rate": 6.352452481314825e-07, "loss": 0.5433, "step": 9542 }, { "epoch": 0.7750345163648177, "grad_norm": 4.440795097972874, "learning_rate": 6.348072995756497e-07, "loss": 0.4672, "step": 9543 }, { "epoch": 0.7751157313408592, "grad_norm": 6.623356794323855, "learning_rate": 6.3436948008128e-07, "loss": 0.479, "step": 9544 }, { "epoch": 0.7751969463169008, "grad_norm": 3.870234328714397, "learning_rate": 6.339317896786693e-07, "loss": 0.5124, "step": 9545 }, { "epoch": 0.7752781612929425, "grad_norm": 7.935205106400605, "learning_rate": 6.33494228398103e-07, "loss": 0.5108, "step": 9546 }, { "epoch": 0.775359376268984, "grad_norm": 4.477009003593662, "learning_rate": 6.33056796269857e-07, "loss": 0.5893, "step": 9547 }, { "epoch": 0.7754405912450256, "grad_norm": 6.221717457872238, "learning_rate": 6.326194933242006e-07, "loss": 0.5316, "step": 9548 }, { "epoch": 0.7755218062210671, "grad_norm": 5.507373178005721, "learning_rate": 6.321823195913924e-07, "loss": 0.5283, "step": 9549 }, { "epoch": 0.7756030211971088, "grad_norm": 5.749251136653241, "learning_rate": 6.317452751016815e-07, "loss": 0.5322, "step": 9550 }, { "epoch": 0.7756842361731503, "grad_norm": 4.194072441757716, "learning_rate": 6.313083598853101e-07, "loss": 0.5895, "step": 9551 }, { "epoch": 0.7757654511491919, "grad_norm": 5.534135119543315, "learning_rate": 6.308715739725108e-07, "loss": 0.3903, "step": 9552 }, { "epoch": 0.7758466661252335, "grad_norm": 5.890217163199354, "learning_rate": 6.30434917393506e-07, "loss": 0.4641, "step": 9553 }, { "epoch": 0.7759278811012751, "grad_norm": 4.610182903829491, "learning_rate": 6.299983901785109e-07, "loss": 0.5238, "step": 9554 }, { "epoch": 0.7760090960773166, "grad_norm": 5.475473491313324, "learning_rate": 6.295619923577303e-07, "loss": 0.4373, "step": 9555 }, { "epoch": 0.7760903110533582, "grad_norm": 3.7085653093083364, "learning_rate": 6.291257239613599e-07, "loss": 0.5475, "step": 9556 }, { "epoch": 0.7761715260293999, "grad_norm": 5.431007075610515, "learning_rate": 6.286895850195882e-07, "loss": 0.521, "step": 9557 }, { "epoch": 0.7762527410054414, "grad_norm": 7.326528629894263, "learning_rate": 6.28253575562594e-07, "loss": 0.5119, "step": 9558 }, { "epoch": 0.776333955981483, "grad_norm": 6.69328051954586, "learning_rate": 6.278176956205462e-07, "loss": 0.468, "step": 9559 }, { "epoch": 0.7764151709575245, "grad_norm": 7.6181081748087776, "learning_rate": 6.273819452236049e-07, "loss": 0.4013, "step": 9560 }, { "epoch": 0.7764963859335662, "grad_norm": 5.517368857197591, "learning_rate": 6.269463244019231e-07, "loss": 0.4919, "step": 9561 }, { "epoch": 0.7765776009096077, "grad_norm": 5.2478286059509145, "learning_rate": 6.265108331856423e-07, "loss": 0.4883, "step": 9562 }, { "epoch": 0.7766588158856493, "grad_norm": 8.241994509786712, "learning_rate": 6.260754716048961e-07, "loss": 0.4527, "step": 9563 }, { "epoch": 0.7767400308616909, "grad_norm": 5.796864584164602, "learning_rate": 6.256402396898095e-07, "loss": 0.4505, "step": 9564 }, { "epoch": 0.7768212458377325, "grad_norm": 5.288449361519552, "learning_rate": 6.252051374704992e-07, "loss": 0.5593, "step": 9565 }, { "epoch": 0.776902460813774, "grad_norm": 4.215256696565076, "learning_rate": 6.247701649770707e-07, "loss": 0.4833, "step": 9566 }, { "epoch": 0.7769836757898156, "grad_norm": 4.625763848409002, "learning_rate": 6.243353222396229e-07, "loss": 0.4169, "step": 9567 }, { "epoch": 0.7770648907658573, "grad_norm": 7.386761076675003, "learning_rate": 6.239006092882438e-07, "loss": 0.5572, "step": 9568 }, { "epoch": 0.7771461057418988, "grad_norm": 3.960429753595598, "learning_rate": 6.234660261530126e-07, "loss": 0.448, "step": 9569 }, { "epoch": 0.7772273207179404, "grad_norm": 6.218816667980998, "learning_rate": 6.23031572864001e-07, "loss": 0.4399, "step": 9570 }, { "epoch": 0.777308535693982, "grad_norm": 11.37451355769962, "learning_rate": 6.225972494512719e-07, "loss": 0.4474, "step": 9571 }, { "epoch": 0.7773897506700236, "grad_norm": 4.63095111539401, "learning_rate": 6.22163055944876e-07, "loss": 0.4599, "step": 9572 }, { "epoch": 0.7774709656460651, "grad_norm": 4.637853908423072, "learning_rate": 6.217289923748592e-07, "loss": 0.3144, "step": 9573 }, { "epoch": 0.7775521806221067, "grad_norm": 7.081685491650372, "learning_rate": 6.212950587712557e-07, "loss": 0.6146, "step": 9574 }, { "epoch": 0.7776333955981483, "grad_norm": 4.122348761014315, "learning_rate": 6.20861255164091e-07, "loss": 0.5314, "step": 9575 }, { "epoch": 0.7777146105741899, "grad_norm": 5.74857146041133, "learning_rate": 6.204275815833807e-07, "loss": 0.4767, "step": 9576 }, { "epoch": 0.7777958255502314, "grad_norm": 5.874273219932277, "learning_rate": 6.19994038059136e-07, "loss": 0.4707, "step": 9577 }, { "epoch": 0.777877040526273, "grad_norm": 6.134515006712459, "learning_rate": 6.19560624621354e-07, "loss": 0.5144, "step": 9578 }, { "epoch": 0.7779582555023147, "grad_norm": 5.019484047549089, "learning_rate": 6.191273413000237e-07, "loss": 0.3622, "step": 9579 }, { "epoch": 0.7780394704783562, "grad_norm": 4.813176127586792, "learning_rate": 6.186941881251279e-07, "loss": 0.3609, "step": 9580 }, { "epoch": 0.7781206854543978, "grad_norm": 6.957964558660779, "learning_rate": 6.182611651266376e-07, "loss": 0.4625, "step": 9581 }, { "epoch": 0.7782019004304394, "grad_norm": 4.786938388125974, "learning_rate": 6.17828272334515e-07, "loss": 0.4196, "step": 9582 }, { "epoch": 0.778283115406481, "grad_norm": 14.455541969389369, "learning_rate": 6.173955097787149e-07, "loss": 0.3105, "step": 9583 }, { "epoch": 0.7783643303825225, "grad_norm": 6.015474094967977, "learning_rate": 6.169628774891826e-07, "loss": 0.6127, "step": 9584 }, { "epoch": 0.7784455453585641, "grad_norm": 7.69183168267593, "learning_rate": 6.165303754958524e-07, "loss": 0.5329, "step": 9585 }, { "epoch": 0.7785267603346057, "grad_norm": 7.494666404519941, "learning_rate": 6.160980038286529e-07, "loss": 0.3892, "step": 9586 }, { "epoch": 0.7786079753106473, "grad_norm": 3.6352807891152508, "learning_rate": 6.156657625175011e-07, "loss": 0.4718, "step": 9587 }, { "epoch": 0.7786891902866888, "grad_norm": 3.6374040934237066, "learning_rate": 6.152336515923052e-07, "loss": 0.49, "step": 9588 }, { "epoch": 0.7787704052627304, "grad_norm": 4.790151735557977, "learning_rate": 6.148016710829654e-07, "loss": 0.5964, "step": 9589 }, { "epoch": 0.7788516202387721, "grad_norm": 6.6959826045900614, "learning_rate": 6.143698210193738e-07, "loss": 0.7207, "step": 9590 }, { "epoch": 0.7789328352148136, "grad_norm": 5.021956045573845, "learning_rate": 6.139381014314108e-07, "loss": 0.4336, "step": 9591 }, { "epoch": 0.7790140501908552, "grad_norm": 8.574991586907696, "learning_rate": 6.135065123489486e-07, "loss": 0.4282, "step": 9592 }, { "epoch": 0.7790952651668968, "grad_norm": 4.869738833833407, "learning_rate": 6.130750538018524e-07, "loss": 0.5189, "step": 9593 }, { "epoch": 0.7791764801429384, "grad_norm": 5.876692632158238, "learning_rate": 6.12643725819976e-07, "loss": 0.4978, "step": 9594 }, { "epoch": 0.7792576951189799, "grad_norm": 7.302404610138221, "learning_rate": 6.122125284331646e-07, "loss": 0.5031, "step": 9595 }, { "epoch": 0.7793389100950215, "grad_norm": 6.239496444525358, "learning_rate": 6.117814616712548e-07, "loss": 0.4399, "step": 9596 }, { "epoch": 0.7794201250710631, "grad_norm": 4.341820954785043, "learning_rate": 6.113505255640756e-07, "loss": 0.4661, "step": 9597 }, { "epoch": 0.7795013400471047, "grad_norm": 4.82168987167205, "learning_rate": 6.109197201414438e-07, "loss": 0.4285, "step": 9598 }, { "epoch": 0.7795825550231462, "grad_norm": 5.620360996597726, "learning_rate": 6.104890454331702e-07, "loss": 0.6044, "step": 9599 }, { "epoch": 0.7796637699991879, "grad_norm": 4.809738135444571, "learning_rate": 6.100585014690547e-07, "loss": 0.6426, "step": 9600 }, { "epoch": 0.7797449849752295, "grad_norm": 3.621342250605208, "learning_rate": 6.096280882788874e-07, "loss": 0.5404, "step": 9601 }, { "epoch": 0.779826199951271, "grad_norm": 8.468527887913863, "learning_rate": 6.091978058924522e-07, "loss": 0.3656, "step": 9602 }, { "epoch": 0.7799074149273126, "grad_norm": 3.464489359474417, "learning_rate": 6.087676543395224e-07, "loss": 0.6374, "step": 9603 }, { "epoch": 0.7799886299033542, "grad_norm": 5.245897271466931, "learning_rate": 6.083376336498608e-07, "loss": 0.5772, "step": 9604 }, { "epoch": 0.7800698448793958, "grad_norm": 6.62332037882836, "learning_rate": 6.079077438532246e-07, "loss": 0.5533, "step": 9605 }, { "epoch": 0.7801510598554373, "grad_norm": 5.690296444200083, "learning_rate": 6.074779849793585e-07, "loss": 0.5408, "step": 9606 }, { "epoch": 0.780232274831479, "grad_norm": 4.804271179305116, "learning_rate": 6.07048357057999e-07, "loss": 0.558, "step": 9607 }, { "epoch": 0.7803134898075205, "grad_norm": 6.074710914942616, "learning_rate": 6.066188601188757e-07, "loss": 0.5146, "step": 9608 }, { "epoch": 0.7803947047835621, "grad_norm": 6.506228847573674, "learning_rate": 6.061894941917062e-07, "loss": 0.5551, "step": 9609 }, { "epoch": 0.7804759197596036, "grad_norm": 5.035663800312012, "learning_rate": 6.057602593062015e-07, "loss": 0.4042, "step": 9610 }, { "epoch": 0.7805571347356453, "grad_norm": 3.7689993752107234, "learning_rate": 6.053311554920607e-07, "loss": 0.4688, "step": 9611 }, { "epoch": 0.7806383497116869, "grad_norm": 6.095711004984003, "learning_rate": 6.049021827789774e-07, "loss": 0.3031, "step": 9612 }, { "epoch": 0.7807195646877284, "grad_norm": 5.01460034158621, "learning_rate": 6.044733411966336e-07, "loss": 0.4798, "step": 9613 }, { "epoch": 0.78080077966377, "grad_norm": 8.19175212779431, "learning_rate": 6.040446307747019e-07, "loss": 0.5674, "step": 9614 }, { "epoch": 0.7808819946398116, "grad_norm": 4.961605893602875, "learning_rate": 6.036160515428475e-07, "loss": 0.4449, "step": 9615 }, { "epoch": 0.7809632096158532, "grad_norm": 6.915675610043426, "learning_rate": 6.031876035307263e-07, "loss": 0.4569, "step": 9616 }, { "epoch": 0.7810444245918947, "grad_norm": 5.568377529756666, "learning_rate": 6.027592867679838e-07, "loss": 0.3962, "step": 9617 }, { "epoch": 0.7811256395679363, "grad_norm": 6.129156630011155, "learning_rate": 6.023311012842581e-07, "loss": 0.5745, "step": 9618 }, { "epoch": 0.7812068545439779, "grad_norm": 5.108160511033742, "learning_rate": 6.019030471091772e-07, "loss": 0.3951, "step": 9619 }, { "epoch": 0.7812880695200195, "grad_norm": 5.561892142462487, "learning_rate": 6.014751242723591e-07, "loss": 0.5265, "step": 9620 }, { "epoch": 0.781369284496061, "grad_norm": 3.7511127265919244, "learning_rate": 6.010473328034153e-07, "loss": 0.5185, "step": 9621 }, { "epoch": 0.7814504994721027, "grad_norm": 5.167066450794399, "learning_rate": 6.006196727319452e-07, "loss": 0.4383, "step": 9622 }, { "epoch": 0.7815317144481443, "grad_norm": 7.96179114023694, "learning_rate": 6.001921440875414e-07, "loss": 0.3846, "step": 9623 }, { "epoch": 0.7816129294241858, "grad_norm": 5.371452420693181, "learning_rate": 5.997647468997875e-07, "loss": 0.6281, "step": 9624 }, { "epoch": 0.7816941444002274, "grad_norm": 5.8732462009916, "learning_rate": 5.99337481198256e-07, "loss": 0.5173, "step": 9625 }, { "epoch": 0.781775359376269, "grad_norm": 5.56845044238569, "learning_rate": 5.989103470125113e-07, "loss": 0.5523, "step": 9626 }, { "epoch": 0.7818565743523106, "grad_norm": 6.424489397897517, "learning_rate": 5.984833443721097e-07, "loss": 0.3735, "step": 9627 }, { "epoch": 0.7819377893283521, "grad_norm": 4.1718433726191275, "learning_rate": 5.980564733065963e-07, "loss": 0.4501, "step": 9628 }, { "epoch": 0.7820190043043938, "grad_norm": 5.012432986771707, "learning_rate": 5.976297338455101e-07, "loss": 0.5626, "step": 9629 }, { "epoch": 0.7821002192804353, "grad_norm": 13.705889019793014, "learning_rate": 5.972031260183772e-07, "loss": 0.5116, "step": 9630 }, { "epoch": 0.7821814342564769, "grad_norm": 5.262747778591087, "learning_rate": 5.967766498547181e-07, "loss": 0.4009, "step": 9631 }, { "epoch": 0.7822626492325184, "grad_norm": 4.146136341129552, "learning_rate": 5.963503053840425e-07, "loss": 0.4744, "step": 9632 }, { "epoch": 0.7823438642085601, "grad_norm": 6.971385385680695, "learning_rate": 5.959240926358501e-07, "loss": 0.4348, "step": 9633 }, { "epoch": 0.7824250791846017, "grad_norm": 4.565877029510119, "learning_rate": 5.954980116396336e-07, "loss": 0.6681, "step": 9634 }, { "epoch": 0.7825062941606432, "grad_norm": 4.089174310097666, "learning_rate": 5.950720624248749e-07, "loss": 0.484, "step": 9635 }, { "epoch": 0.7825875091366848, "grad_norm": 5.870091842126987, "learning_rate": 5.946462450210477e-07, "loss": 0.4509, "step": 9636 }, { "epoch": 0.7826687241127264, "grad_norm": 17.233940029392116, "learning_rate": 5.942205594576173e-07, "loss": 0.5734, "step": 9637 }, { "epoch": 0.782749939088768, "grad_norm": 12.113335333152698, "learning_rate": 5.937950057640376e-07, "loss": 0.4828, "step": 9638 }, { "epoch": 0.7828311540648095, "grad_norm": 7.210552071495601, "learning_rate": 5.933695839697548e-07, "loss": 0.4928, "step": 9639 }, { "epoch": 0.7829123690408512, "grad_norm": 4.338185170282578, "learning_rate": 5.929442941042066e-07, "loss": 0.578, "step": 9640 }, { "epoch": 0.7829935840168927, "grad_norm": 6.223280812892159, "learning_rate": 5.925191361968194e-07, "loss": 0.4616, "step": 9641 }, { "epoch": 0.7830747989929343, "grad_norm": 3.9906953099976397, "learning_rate": 5.920941102770128e-07, "loss": 0.539, "step": 9642 }, { "epoch": 0.7831560139689758, "grad_norm": 4.895008165041246, "learning_rate": 5.916692163741972e-07, "loss": 0.5437, "step": 9643 }, { "epoch": 0.7832372289450175, "grad_norm": 7.001022857461772, "learning_rate": 5.91244454517772e-07, "loss": 0.4387, "step": 9644 }, { "epoch": 0.7833184439210591, "grad_norm": 8.865782780347201, "learning_rate": 5.908198247371289e-07, "loss": 0.4938, "step": 9645 }, { "epoch": 0.7833996588971006, "grad_norm": 3.951135802126336, "learning_rate": 5.903953270616486e-07, "loss": 0.4084, "step": 9646 }, { "epoch": 0.7834808738731422, "grad_norm": 6.755133249113319, "learning_rate": 5.899709615207055e-07, "loss": 0.5784, "step": 9647 }, { "epoch": 0.7835620888491838, "grad_norm": 7.946028938098305, "learning_rate": 5.895467281436637e-07, "loss": 0.5064, "step": 9648 }, { "epoch": 0.7836433038252254, "grad_norm": 5.014225943190868, "learning_rate": 5.891226269598768e-07, "loss": 0.6636, "step": 9649 }, { "epoch": 0.7837245188012669, "grad_norm": 5.034789109316721, "learning_rate": 5.886986579986917e-07, "loss": 0.4543, "step": 9650 }, { "epoch": 0.7838057337773086, "grad_norm": 5.031634092156704, "learning_rate": 5.882748212894441e-07, "loss": 0.5141, "step": 9651 }, { "epoch": 0.7838869487533501, "grad_norm": 3.651635786477034, "learning_rate": 5.878511168614601e-07, "loss": 0.5764, "step": 9652 }, { "epoch": 0.7839681637293917, "grad_norm": 4.577152340868677, "learning_rate": 5.874275447440599e-07, "loss": 0.4339, "step": 9653 }, { "epoch": 0.7840493787054332, "grad_norm": 6.2520933191135875, "learning_rate": 5.870041049665507e-07, "loss": 0.5523, "step": 9654 }, { "epoch": 0.7841305936814749, "grad_norm": 4.385286574382671, "learning_rate": 5.86580797558233e-07, "loss": 0.4017, "step": 9655 }, { "epoch": 0.7842118086575165, "grad_norm": 3.6357368477226233, "learning_rate": 5.861576225483984e-07, "loss": 0.4594, "step": 9656 }, { "epoch": 0.784293023633558, "grad_norm": 5.476827776987912, "learning_rate": 5.857345799663272e-07, "loss": 0.3793, "step": 9657 }, { "epoch": 0.7843742386095996, "grad_norm": 5.500782599304935, "learning_rate": 5.853116698412913e-07, "loss": 0.4516, "step": 9658 }, { "epoch": 0.7844554535856412, "grad_norm": 4.882146338011125, "learning_rate": 5.848888922025553e-07, "loss": 0.5161, "step": 9659 }, { "epoch": 0.7845366685616828, "grad_norm": 5.737418334120377, "learning_rate": 5.844662470793716e-07, "loss": 0.4623, "step": 9660 }, { "epoch": 0.7846178835377243, "grad_norm": 5.3911366434680685, "learning_rate": 5.840437345009859e-07, "loss": 0.5734, "step": 9661 }, { "epoch": 0.784699098513766, "grad_norm": 4.290906281542307, "learning_rate": 5.83621354496634e-07, "loss": 0.5312, "step": 9662 }, { "epoch": 0.7847803134898075, "grad_norm": 5.746160856248301, "learning_rate": 5.831991070955426e-07, "loss": 0.4219, "step": 9663 }, { "epoch": 0.7848615284658491, "grad_norm": 5.04902095963468, "learning_rate": 5.827769923269283e-07, "loss": 0.5044, "step": 9664 }, { "epoch": 0.7849427434418906, "grad_norm": 10.70114453686239, "learning_rate": 5.823550102199985e-07, "loss": 0.4453, "step": 9665 }, { "epoch": 0.7850239584179323, "grad_norm": 4.150840098492769, "learning_rate": 5.819331608039538e-07, "loss": 0.4819, "step": 9666 }, { "epoch": 0.7851051733939739, "grad_norm": 7.526924883890027, "learning_rate": 5.815114441079825e-07, "loss": 0.5268, "step": 9667 }, { "epoch": 0.7851863883700154, "grad_norm": 6.803056853476825, "learning_rate": 5.810898601612657e-07, "loss": 0.6169, "step": 9668 }, { "epoch": 0.785267603346057, "grad_norm": 6.656336422437745, "learning_rate": 5.806684089929756e-07, "loss": 0.4574, "step": 9669 }, { "epoch": 0.7853488183220986, "grad_norm": 4.099463565565569, "learning_rate": 5.802470906322738e-07, "loss": 0.5343, "step": 9670 }, { "epoch": 0.7854300332981402, "grad_norm": 9.804778473842848, "learning_rate": 5.798259051083124e-07, "loss": 0.4658, "step": 9671 }, { "epoch": 0.7855112482741817, "grad_norm": 5.831632381423531, "learning_rate": 5.794048524502366e-07, "loss": 0.3633, "step": 9672 }, { "epoch": 0.7855924632502234, "grad_norm": 4.512592207772697, "learning_rate": 5.789839326871799e-07, "loss": 0.5674, "step": 9673 }, { "epoch": 0.7856736782262649, "grad_norm": 7.058643124685738, "learning_rate": 5.785631458482679e-07, "loss": 0.4566, "step": 9674 }, { "epoch": 0.7857548932023065, "grad_norm": 9.677711342457231, "learning_rate": 5.781424919626183e-07, "loss": 0.5513, "step": 9675 }, { "epoch": 0.785836108178348, "grad_norm": 7.654772369572434, "learning_rate": 5.777219710593365e-07, "loss": 0.4374, "step": 9676 }, { "epoch": 0.7859173231543897, "grad_norm": 5.412971281189123, "learning_rate": 5.773015831675204e-07, "loss": 0.518, "step": 9677 }, { "epoch": 0.7859985381304313, "grad_norm": 7.042856826106027, "learning_rate": 5.768813283162597e-07, "loss": 0.4422, "step": 9678 }, { "epoch": 0.7860797531064728, "grad_norm": 8.697761416414153, "learning_rate": 5.764612065346328e-07, "loss": 0.3727, "step": 9679 }, { "epoch": 0.7861609680825145, "grad_norm": 11.951299771554737, "learning_rate": 5.760412178517099e-07, "loss": 0.3518, "step": 9680 }, { "epoch": 0.786242183058556, "grad_norm": 5.170910615421707, "learning_rate": 5.75621362296552e-07, "loss": 0.7013, "step": 9681 }, { "epoch": 0.7863233980345976, "grad_norm": 15.391790408942455, "learning_rate": 5.752016398982122e-07, "loss": 0.4973, "step": 9682 }, { "epoch": 0.7864046130106391, "grad_norm": 6.4992455503524145, "learning_rate": 5.747820506857318e-07, "loss": 0.5486, "step": 9683 }, { "epoch": 0.7864858279866808, "grad_norm": 5.514244182216503, "learning_rate": 5.74362594688144e-07, "loss": 0.434, "step": 9684 }, { "epoch": 0.7865670429627223, "grad_norm": 4.203294166143777, "learning_rate": 5.739432719344737e-07, "loss": 0.4048, "step": 9685 }, { "epoch": 0.7866482579387639, "grad_norm": 5.401361035345732, "learning_rate": 5.73524082453735e-07, "loss": 0.4279, "step": 9686 }, { "epoch": 0.7867294729148054, "grad_norm": 4.158513777957124, "learning_rate": 5.731050262749341e-07, "loss": 0.4841, "step": 9687 }, { "epoch": 0.7868106878908471, "grad_norm": 7.879580004180549, "learning_rate": 5.726861034270681e-07, "loss": 0.3681, "step": 9688 }, { "epoch": 0.7868919028668887, "grad_norm": 3.9835235897565107, "learning_rate": 5.722673139391236e-07, "loss": 0.4581, "step": 9689 }, { "epoch": 0.7869731178429302, "grad_norm": 5.506433493815765, "learning_rate": 5.718486578400775e-07, "loss": 0.4241, "step": 9690 }, { "epoch": 0.7870543328189719, "grad_norm": 6.423378795375947, "learning_rate": 5.714301351589008e-07, "loss": 0.6751, "step": 9691 }, { "epoch": 0.7871355477950134, "grad_norm": 4.6037563920362095, "learning_rate": 5.710117459245518e-07, "loss": 0.6218, "step": 9692 }, { "epoch": 0.787216762771055, "grad_norm": 5.535444733765645, "learning_rate": 5.705934901659804e-07, "loss": 0.609, "step": 9693 }, { "epoch": 0.7872979777470965, "grad_norm": 6.072040161253787, "learning_rate": 5.70175367912128e-07, "loss": 0.3255, "step": 9694 }, { "epoch": 0.7873791927231382, "grad_norm": 3.930200276565603, "learning_rate": 5.697573791919275e-07, "loss": 0.3883, "step": 9695 }, { "epoch": 0.7874604076991797, "grad_norm": 13.405961625131877, "learning_rate": 5.693395240343e-07, "loss": 0.469, "step": 9696 }, { "epoch": 0.7875416226752213, "grad_norm": 3.8674362850461357, "learning_rate": 5.689218024681603e-07, "loss": 0.4996, "step": 9697 }, { "epoch": 0.7876228376512628, "grad_norm": 6.411118676070839, "learning_rate": 5.685042145224118e-07, "loss": 0.3813, "step": 9698 }, { "epoch": 0.7877040526273045, "grad_norm": 3.36119530692789, "learning_rate": 5.680867602259485e-07, "loss": 0.449, "step": 9699 }, { "epoch": 0.7877852676033461, "grad_norm": 5.637861341121901, "learning_rate": 5.676694396076568e-07, "loss": 0.5444, "step": 9700 }, { "epoch": 0.7878664825793876, "grad_norm": 8.011959392935337, "learning_rate": 5.672522526964141e-07, "loss": 0.4141, "step": 9701 }, { "epoch": 0.7879476975554293, "grad_norm": 4.254934260559165, "learning_rate": 5.668351995210866e-07, "loss": 0.5489, "step": 9702 }, { "epoch": 0.7880289125314708, "grad_norm": 4.2266389347699995, "learning_rate": 5.664182801105314e-07, "loss": 0.4893, "step": 9703 }, { "epoch": 0.7881101275075124, "grad_norm": 9.12782592389075, "learning_rate": 5.660014944935985e-07, "loss": 0.5584, "step": 9704 }, { "epoch": 0.7881913424835539, "grad_norm": 4.637756752610576, "learning_rate": 5.655848426991267e-07, "loss": 0.4832, "step": 9705 }, { "epoch": 0.7882725574595956, "grad_norm": 5.685502111243551, "learning_rate": 5.651683247559445e-07, "loss": 0.3528, "step": 9706 }, { "epoch": 0.7883537724356371, "grad_norm": 6.879039707986046, "learning_rate": 5.647519406928758e-07, "loss": 0.3939, "step": 9707 }, { "epoch": 0.7884349874116787, "grad_norm": 4.090096342166239, "learning_rate": 5.643356905387307e-07, "loss": 0.6541, "step": 9708 }, { "epoch": 0.7885162023877202, "grad_norm": 5.705199569403093, "learning_rate": 5.639195743223105e-07, "loss": 0.4791, "step": 9709 }, { "epoch": 0.7885974173637619, "grad_norm": 6.503352654339194, "learning_rate": 5.635035920724102e-07, "loss": 0.4197, "step": 9710 }, { "epoch": 0.7886786323398035, "grad_norm": 3.9110515788245204, "learning_rate": 5.630877438178126e-07, "loss": 0.6805, "step": 9711 }, { "epoch": 0.788759847315845, "grad_norm": 6.1493240468982275, "learning_rate": 5.626720295872911e-07, "loss": 0.4074, "step": 9712 }, { "epoch": 0.7888410622918867, "grad_norm": 3.943608994312866, "learning_rate": 5.622564494096122e-07, "loss": 0.4565, "step": 9713 }, { "epoch": 0.7889222772679282, "grad_norm": 7.779376580705911, "learning_rate": 5.618410033135325e-07, "loss": 0.5792, "step": 9714 }, { "epoch": 0.7890034922439698, "grad_norm": 3.3797672800758134, "learning_rate": 5.614256913277968e-07, "loss": 0.3237, "step": 9715 }, { "epoch": 0.7890847072200113, "grad_norm": 5.330674659642353, "learning_rate": 5.610105134811444e-07, "loss": 0.4209, "step": 9716 }, { "epoch": 0.789165922196053, "grad_norm": 4.322221219959529, "learning_rate": 5.605954698023023e-07, "loss": 0.4035, "step": 9717 }, { "epoch": 0.7892471371720945, "grad_norm": 9.488626438377107, "learning_rate": 5.601805603199889e-07, "loss": 0.3763, "step": 9718 }, { "epoch": 0.7893283521481361, "grad_norm": 5.681079603298455, "learning_rate": 5.597657850629145e-07, "loss": 0.5129, "step": 9719 }, { "epoch": 0.7894095671241776, "grad_norm": 4.177871819811436, "learning_rate": 5.593511440597799e-07, "loss": 0.4432, "step": 9720 }, { "epoch": 0.7894907821002193, "grad_norm": 7.924561057142348, "learning_rate": 5.589366373392754e-07, "loss": 0.4561, "step": 9721 }, { "epoch": 0.7895719970762609, "grad_norm": 3.32290128813931, "learning_rate": 5.58522264930082e-07, "loss": 0.5691, "step": 9722 }, { "epoch": 0.7896532120523024, "grad_norm": 7.189607322217405, "learning_rate": 5.581080268608733e-07, "loss": 0.5209, "step": 9723 }, { "epoch": 0.7897344270283441, "grad_norm": 6.847085179945522, "learning_rate": 5.576939231603118e-07, "loss": 0.533, "step": 9724 }, { "epoch": 0.7898156420043856, "grad_norm": 5.441399177870424, "learning_rate": 5.572799538570506e-07, "loss": 0.3767, "step": 9725 }, { "epoch": 0.7898968569804272, "grad_norm": 5.540677770063936, "learning_rate": 5.56866118979735e-07, "loss": 0.5119, "step": 9726 }, { "epoch": 0.7899780719564687, "grad_norm": 8.524810246072764, "learning_rate": 5.564524185570008e-07, "loss": 0.5606, "step": 9727 }, { "epoch": 0.7900592869325104, "grad_norm": 3.4073724271339327, "learning_rate": 5.560388526174723e-07, "loss": 0.6541, "step": 9728 }, { "epoch": 0.7901405019085519, "grad_norm": 4.045724757251306, "learning_rate": 5.556254211897677e-07, "loss": 0.481, "step": 9729 }, { "epoch": 0.7902217168845935, "grad_norm": 6.886529740016336, "learning_rate": 5.552121243024935e-07, "loss": 0.456, "step": 9730 }, { "epoch": 0.790302931860635, "grad_norm": 4.136880017146439, "learning_rate": 5.54798961984247e-07, "loss": 0.4138, "step": 9731 }, { "epoch": 0.7903841468366767, "grad_norm": 5.315316660423206, "learning_rate": 5.543859342636177e-07, "loss": 0.4179, "step": 9732 }, { "epoch": 0.7904653618127183, "grad_norm": 9.808582227554735, "learning_rate": 5.539730411691851e-07, "loss": 0.4608, "step": 9733 }, { "epoch": 0.7905465767887598, "grad_norm": 3.7605530791600725, "learning_rate": 5.535602827295189e-07, "loss": 0.4465, "step": 9734 }, { "epoch": 0.7906277917648015, "grad_norm": 5.688896710047547, "learning_rate": 5.53147658973179e-07, "loss": 0.3428, "step": 9735 }, { "epoch": 0.790709006740843, "grad_norm": 11.37494567313138, "learning_rate": 5.527351699287184e-07, "loss": 0.4684, "step": 9736 }, { "epoch": 0.7907902217168846, "grad_norm": 6.335210866254778, "learning_rate": 5.523228156246782e-07, "loss": 0.5216, "step": 9737 }, { "epoch": 0.7908714366929261, "grad_norm": 3.543537249807746, "learning_rate": 5.519105960895904e-07, "loss": 0.6721, "step": 9738 }, { "epoch": 0.7909526516689678, "grad_norm": 3.746923394479714, "learning_rate": 5.514985113519794e-07, "loss": 0.5059, "step": 9739 }, { "epoch": 0.7910338666450093, "grad_norm": 3.243317942178259, "learning_rate": 5.510865614403599e-07, "loss": 0.5028, "step": 9740 }, { "epoch": 0.7911150816210509, "grad_norm": 5.231091301479268, "learning_rate": 5.506747463832348e-07, "loss": 0.506, "step": 9741 }, { "epoch": 0.7911962965970925, "grad_norm": 3.037936622624663, "learning_rate": 5.502630662091016e-07, "loss": 0.4508, "step": 9742 }, { "epoch": 0.7912775115731341, "grad_norm": 5.011070638322092, "learning_rate": 5.498515209464453e-07, "loss": 0.5612, "step": 9743 }, { "epoch": 0.7913587265491757, "grad_norm": 57.93538906369309, "learning_rate": 5.49440110623742e-07, "loss": 0.4245, "step": 9744 }, { "epoch": 0.7914399415252172, "grad_norm": 4.391572013936799, "learning_rate": 5.490288352694598e-07, "loss": 0.4482, "step": 9745 }, { "epoch": 0.7915211565012589, "grad_norm": 4.558873733467807, "learning_rate": 5.486176949120575e-07, "loss": 0.6039, "step": 9746 }, { "epoch": 0.7916023714773004, "grad_norm": 7.119700732402204, "learning_rate": 5.482066895799825e-07, "loss": 0.467, "step": 9747 }, { "epoch": 0.791683586453342, "grad_norm": 9.302916508581008, "learning_rate": 5.477958193016758e-07, "loss": 0.589, "step": 9748 }, { "epoch": 0.7917648014293835, "grad_norm": 3.810258704896879, "learning_rate": 5.473850841055664e-07, "loss": 0.5026, "step": 9749 }, { "epoch": 0.7918460164054252, "grad_norm": 3.970336831775221, "learning_rate": 5.469744840200741e-07, "loss": 0.4808, "step": 9750 }, { "epoch": 0.7919272313814667, "grad_norm": 4.344838984297026, "learning_rate": 5.465640190736124e-07, "loss": 0.5038, "step": 9751 }, { "epoch": 0.7920084463575083, "grad_norm": 4.912118996595724, "learning_rate": 5.461536892945812e-07, "loss": 0.4581, "step": 9752 }, { "epoch": 0.7920896613335499, "grad_norm": 5.652749892181998, "learning_rate": 5.457434947113749e-07, "loss": 0.4009, "step": 9753 }, { "epoch": 0.7921708763095915, "grad_norm": 17.57147989196119, "learning_rate": 5.453334353523754e-07, "loss": 0.4287, "step": 9754 }, { "epoch": 0.7922520912856331, "grad_norm": 3.898615950581783, "learning_rate": 5.449235112459577e-07, "loss": 0.4907, "step": 9755 }, { "epoch": 0.7923333062616746, "grad_norm": 6.661559536211671, "learning_rate": 5.445137224204861e-07, "loss": 0.6232, "step": 9756 }, { "epoch": 0.7924145212377163, "grad_norm": 4.6703720755130265, "learning_rate": 5.441040689043148e-07, "loss": 0.3485, "step": 9757 }, { "epoch": 0.7924957362137578, "grad_norm": 7.3100697341525995, "learning_rate": 5.436945507257907e-07, "loss": 0.485, "step": 9758 }, { "epoch": 0.7925769511897994, "grad_norm": 5.877076789096071, "learning_rate": 5.432851679132506e-07, "loss": 0.5487, "step": 9759 }, { "epoch": 0.792658166165841, "grad_norm": 6.67706951439948, "learning_rate": 5.428759204950204e-07, "loss": 0.8112, "step": 9760 }, { "epoch": 0.7927393811418826, "grad_norm": 5.439821280458846, "learning_rate": 5.424668084994195e-07, "loss": 0.5156, "step": 9761 }, { "epoch": 0.7928205961179241, "grad_norm": 4.572295130779405, "learning_rate": 5.420578319547551e-07, "loss": 0.3297, "step": 9762 }, { "epoch": 0.7929018110939657, "grad_norm": 5.618034848926441, "learning_rate": 5.416489908893258e-07, "loss": 0.5123, "step": 9763 }, { "epoch": 0.7929830260700073, "grad_norm": 5.221377474566307, "learning_rate": 5.412402853314227e-07, "loss": 0.3917, "step": 9764 }, { "epoch": 0.7930642410460489, "grad_norm": 10.356233466887792, "learning_rate": 5.408317153093245e-07, "loss": 0.6027, "step": 9765 }, { "epoch": 0.7931454560220905, "grad_norm": 4.822402093871006, "learning_rate": 5.404232808513027e-07, "loss": 0.5062, "step": 9766 }, { "epoch": 0.793226670998132, "grad_norm": 4.342762770694148, "learning_rate": 5.400149819856199e-07, "loss": 0.4948, "step": 9767 }, { "epoch": 0.7933078859741737, "grad_norm": 5.561331791979438, "learning_rate": 5.396068187405273e-07, "loss": 0.4032, "step": 9768 }, { "epoch": 0.7933891009502152, "grad_norm": 8.670840918114324, "learning_rate": 5.391987911442667e-07, "loss": 0.613, "step": 9769 }, { "epoch": 0.7934703159262568, "grad_norm": 4.613030257259587, "learning_rate": 5.387908992250731e-07, "loss": 0.3999, "step": 9770 }, { "epoch": 0.7935515309022984, "grad_norm": 5.329616352899454, "learning_rate": 5.383831430111691e-07, "loss": 0.4021, "step": 9771 }, { "epoch": 0.79363274587834, "grad_norm": 5.051235158011703, "learning_rate": 5.379755225307707e-07, "loss": 0.6117, "step": 9772 }, { "epoch": 0.7937139608543815, "grad_norm": 4.393658723431004, "learning_rate": 5.375680378120812e-07, "loss": 0.5466, "step": 9773 }, { "epoch": 0.7937951758304231, "grad_norm": 8.092758743122241, "learning_rate": 5.371606888832984e-07, "loss": 0.4675, "step": 9774 }, { "epoch": 0.7938763908064647, "grad_norm": 4.672078958128614, "learning_rate": 5.367534757726079e-07, "loss": 0.5704, "step": 9775 }, { "epoch": 0.7939576057825063, "grad_norm": 5.017160554885112, "learning_rate": 5.363463985081854e-07, "loss": 0.5743, "step": 9776 }, { "epoch": 0.7940388207585479, "grad_norm": 3.292901108068312, "learning_rate": 5.359394571182e-07, "loss": 0.5656, "step": 9777 }, { "epoch": 0.7941200357345894, "grad_norm": 5.959333091031521, "learning_rate": 5.355326516308102e-07, "loss": 0.484, "step": 9778 }, { "epoch": 0.7942012507106311, "grad_norm": 3.555927641182116, "learning_rate": 5.351259820741633e-07, "loss": 0.5468, "step": 9779 }, { "epoch": 0.7942824656866726, "grad_norm": 9.33015466014825, "learning_rate": 5.347194484764001e-07, "loss": 0.5981, "step": 9780 }, { "epoch": 0.7943636806627142, "grad_norm": 5.675486153355801, "learning_rate": 5.343130508656502e-07, "loss": 0.5224, "step": 9781 }, { "epoch": 0.7944448956387558, "grad_norm": 4.362949089225925, "learning_rate": 5.339067892700331e-07, "loss": 0.4188, "step": 9782 }, { "epoch": 0.7945261106147974, "grad_norm": 4.529916181314969, "learning_rate": 5.335006637176612e-07, "loss": 0.4768, "step": 9783 }, { "epoch": 0.7946073255908389, "grad_norm": 6.84665053039972, "learning_rate": 5.330946742366356e-07, "loss": 0.4384, "step": 9784 }, { "epoch": 0.7946885405668805, "grad_norm": 5.58529418435716, "learning_rate": 5.326888208550485e-07, "loss": 0.3712, "step": 9785 }, { "epoch": 0.7947697555429221, "grad_norm": 5.963815630595506, "learning_rate": 5.32283103600984e-07, "loss": 0.5674, "step": 9786 }, { "epoch": 0.7948509705189637, "grad_norm": 5.054597490597483, "learning_rate": 5.318775225025147e-07, "loss": 0.5324, "step": 9787 }, { "epoch": 0.7949321854950053, "grad_norm": 5.080349599593879, "learning_rate": 5.314720775877046e-07, "loss": 0.4342, "step": 9788 }, { "epoch": 0.7950134004710468, "grad_norm": 4.048899362069842, "learning_rate": 5.31066768884608e-07, "loss": 0.3672, "step": 9789 }, { "epoch": 0.7950946154470885, "grad_norm": 33.20617438804645, "learning_rate": 5.306615964212705e-07, "loss": 0.5476, "step": 9790 }, { "epoch": 0.79517583042313, "grad_norm": 7.450168497762338, "learning_rate": 5.302565602257285e-07, "loss": 0.6486, "step": 9791 }, { "epoch": 0.7952570453991716, "grad_norm": 7.689105107124658, "learning_rate": 5.298516603260071e-07, "loss": 0.3788, "step": 9792 }, { "epoch": 0.7953382603752132, "grad_norm": 4.276239675857061, "learning_rate": 5.294468967501248e-07, "loss": 0.4255, "step": 9793 }, { "epoch": 0.7954194753512548, "grad_norm": 5.366407144652406, "learning_rate": 5.29042269526088e-07, "loss": 0.5787, "step": 9794 }, { "epoch": 0.7955006903272963, "grad_norm": 3.8340051916962503, "learning_rate": 5.286377786818944e-07, "loss": 0.6228, "step": 9795 }, { "epoch": 0.7955819053033379, "grad_norm": 4.50626750141108, "learning_rate": 5.282334242455339e-07, "loss": 0.5358, "step": 9796 }, { "epoch": 0.7956631202793795, "grad_norm": 6.044552408083624, "learning_rate": 5.278292062449844e-07, "loss": 0.5658, "step": 9797 }, { "epoch": 0.7957443352554211, "grad_norm": 4.404044522106774, "learning_rate": 5.274251247082163e-07, "loss": 0.5208, "step": 9798 }, { "epoch": 0.7958255502314627, "grad_norm": 7.333443652190031, "learning_rate": 5.270211796631905e-07, "loss": 0.3324, "step": 9799 }, { "epoch": 0.7959067652075043, "grad_norm": 4.223546024424326, "learning_rate": 5.266173711378572e-07, "loss": 0.5734, "step": 9800 }, { "epoch": 0.7959879801835459, "grad_norm": 5.341487545794119, "learning_rate": 5.262136991601572e-07, "loss": 0.385, "step": 9801 }, { "epoch": 0.7960691951595874, "grad_norm": 4.803777217199065, "learning_rate": 5.258101637580238e-07, "loss": 0.4812, "step": 9802 }, { "epoch": 0.796150410135629, "grad_norm": 6.650426970575839, "learning_rate": 5.254067649593781e-07, "loss": 0.4336, "step": 9803 }, { "epoch": 0.7962316251116706, "grad_norm": 5.441981893530827, "learning_rate": 5.250035027921338e-07, "loss": 0.3946, "step": 9804 }, { "epoch": 0.7963128400877122, "grad_norm": 5.107486747720455, "learning_rate": 5.246003772841953e-07, "loss": 0.4776, "step": 9805 }, { "epoch": 0.7963940550637537, "grad_norm": 3.5626105502835497, "learning_rate": 5.24197388463456e-07, "loss": 0.328, "step": 9806 }, { "epoch": 0.7964752700397953, "grad_norm": 6.074463700483569, "learning_rate": 5.237945363578006e-07, "loss": 0.4275, "step": 9807 }, { "epoch": 0.7965564850158369, "grad_norm": 12.182536694266675, "learning_rate": 5.233918209951039e-07, "loss": 0.3317, "step": 9808 }, { "epoch": 0.7966376999918785, "grad_norm": 6.106295220054585, "learning_rate": 5.229892424032326e-07, "loss": 0.4446, "step": 9809 }, { "epoch": 0.7967189149679201, "grad_norm": 4.589038843159026, "learning_rate": 5.225868006100421e-07, "loss": 0.4815, "step": 9810 }, { "epoch": 0.7968001299439617, "grad_norm": 13.701424714841433, "learning_rate": 5.221844956433794e-07, "loss": 0.3585, "step": 9811 }, { "epoch": 0.7968813449200033, "grad_norm": 4.331772567167185, "learning_rate": 5.21782327531083e-07, "loss": 0.426, "step": 9812 }, { "epoch": 0.7969625598960448, "grad_norm": 5.814768311032915, "learning_rate": 5.213802963009798e-07, "loss": 0.5894, "step": 9813 }, { "epoch": 0.7970437748720864, "grad_norm": 4.7885091116888505, "learning_rate": 5.209784019808877e-07, "loss": 0.401, "step": 9814 }, { "epoch": 0.797124989848128, "grad_norm": 3.7476316440320403, "learning_rate": 5.205766445986174e-07, "loss": 0.4015, "step": 9815 }, { "epoch": 0.7972062048241696, "grad_norm": 4.916810203123699, "learning_rate": 5.201750241819664e-07, "loss": 0.3485, "step": 9816 }, { "epoch": 0.7972874198002111, "grad_norm": 4.506048343346604, "learning_rate": 5.197735407587257e-07, "loss": 0.4305, "step": 9817 }, { "epoch": 0.7973686347762527, "grad_norm": 3.516529711458559, "learning_rate": 5.193721943566762e-07, "loss": 0.5148, "step": 9818 }, { "epoch": 0.7974498497522943, "grad_norm": 4.681381707890613, "learning_rate": 5.189709850035887e-07, "loss": 0.4342, "step": 9819 }, { "epoch": 0.7975310647283359, "grad_norm": 5.7757941641398665, "learning_rate": 5.185699127272243e-07, "loss": 0.3261, "step": 9820 }, { "epoch": 0.7976122797043775, "grad_norm": 3.0498451518722196, "learning_rate": 5.181689775553355e-07, "loss": 0.5548, "step": 9821 }, { "epoch": 0.7976934946804191, "grad_norm": 7.950302605138208, "learning_rate": 5.17768179515665e-07, "loss": 0.3438, "step": 9822 }, { "epoch": 0.7977747096564607, "grad_norm": 6.18464348555484, "learning_rate": 5.173675186359451e-07, "loss": 0.4751, "step": 9823 }, { "epoch": 0.7978559246325022, "grad_norm": 7.611226667202139, "learning_rate": 5.169669949438996e-07, "loss": 0.3245, "step": 9824 }, { "epoch": 0.7979371396085438, "grad_norm": 6.064790144719224, "learning_rate": 5.165666084672439e-07, "loss": 0.4096, "step": 9825 }, { "epoch": 0.7980183545845854, "grad_norm": 4.770411960870846, "learning_rate": 5.161663592336815e-07, "loss": 0.6826, "step": 9826 }, { "epoch": 0.798099569560627, "grad_norm": 7.597577840704956, "learning_rate": 5.157662472709075e-07, "loss": 0.5609, "step": 9827 }, { "epoch": 0.7981807845366685, "grad_norm": 8.243596223621436, "learning_rate": 5.153662726066083e-07, "loss": 0.4814, "step": 9828 }, { "epoch": 0.7982619995127102, "grad_norm": 9.676688199336681, "learning_rate": 5.149664352684586e-07, "loss": 0.5136, "step": 9829 }, { "epoch": 0.7983432144887517, "grad_norm": 5.795108480675768, "learning_rate": 5.14566735284126e-07, "loss": 0.3747, "step": 9830 }, { "epoch": 0.7984244294647933, "grad_norm": 6.471676661405976, "learning_rate": 5.141671726812683e-07, "loss": 0.5808, "step": 9831 }, { "epoch": 0.7985056444408349, "grad_norm": 4.438123959791211, "learning_rate": 5.137677474875324e-07, "loss": 0.5517, "step": 9832 }, { "epoch": 0.7985868594168765, "grad_norm": 7.424549295374417, "learning_rate": 5.133684597305557e-07, "loss": 0.3537, "step": 9833 }, { "epoch": 0.7986680743929181, "grad_norm": 5.275736853110713, "learning_rate": 5.129693094379684e-07, "loss": 0.3372, "step": 9834 }, { "epoch": 0.7987492893689596, "grad_norm": 3.2450613994463056, "learning_rate": 5.125702966373883e-07, "loss": 0.3272, "step": 9835 }, { "epoch": 0.7988305043450012, "grad_norm": 12.566077839285736, "learning_rate": 5.121714213564249e-07, "loss": 0.4256, "step": 9836 }, { "epoch": 0.7989117193210428, "grad_norm": 3.457305067918944, "learning_rate": 5.117726836226786e-07, "loss": 0.4961, "step": 9837 }, { "epoch": 0.7989929342970844, "grad_norm": 5.900089910314316, "learning_rate": 5.113740834637407e-07, "loss": 0.3549, "step": 9838 }, { "epoch": 0.7990741492731259, "grad_norm": 8.54709344532498, "learning_rate": 5.109756209071908e-07, "loss": 0.469, "step": 9839 }, { "epoch": 0.7991553642491676, "grad_norm": 3.7791884771682147, "learning_rate": 5.105772959806021e-07, "loss": 0.5095, "step": 9840 }, { "epoch": 0.7992365792252091, "grad_norm": 6.060813178732931, "learning_rate": 5.101791087115354e-07, "loss": 0.5591, "step": 9841 }, { "epoch": 0.7993177942012507, "grad_norm": 3.1018063396354605, "learning_rate": 5.097810591275429e-07, "loss": 0.3985, "step": 9842 }, { "epoch": 0.7993990091772923, "grad_norm": 5.327968286749991, "learning_rate": 5.093831472561681e-07, "loss": 0.4773, "step": 9843 }, { "epoch": 0.7994802241533339, "grad_norm": 7.284888223378119, "learning_rate": 5.089853731249448e-07, "loss": 0.4411, "step": 9844 }, { "epoch": 0.7995614391293755, "grad_norm": 9.03698387870564, "learning_rate": 5.085877367613964e-07, "loss": 0.4432, "step": 9845 }, { "epoch": 0.799642654105417, "grad_norm": 4.655253921964928, "learning_rate": 5.081902381930365e-07, "loss": 0.5223, "step": 9846 }, { "epoch": 0.7997238690814586, "grad_norm": 6.893829661308056, "learning_rate": 5.077928774473714e-07, "loss": 0.3228, "step": 9847 }, { "epoch": 0.7998050840575002, "grad_norm": 4.268542033494732, "learning_rate": 5.073956545518949e-07, "loss": 0.4231, "step": 9848 }, { "epoch": 0.7998862990335418, "grad_norm": 14.574424300264582, "learning_rate": 5.069985695340931e-07, "loss": 0.4657, "step": 9849 }, { "epoch": 0.7999675140095833, "grad_norm": 5.388314959320381, "learning_rate": 5.066016224214435e-07, "loss": 0.6228, "step": 9850 }, { "epoch": 0.800048728985625, "grad_norm": 5.9704183797863575, "learning_rate": 5.062048132414116e-07, "loss": 0.4077, "step": 9851 }, { "epoch": 0.8001299439616665, "grad_norm": 7.054787471974678, "learning_rate": 5.058081420214538e-07, "loss": 0.394, "step": 9852 }, { "epoch": 0.8002111589377081, "grad_norm": 9.477648614150743, "learning_rate": 5.054116087890196e-07, "loss": 0.5202, "step": 9853 }, { "epoch": 0.8002923739137497, "grad_norm": 5.337615316923885, "learning_rate": 5.050152135715453e-07, "loss": 0.4712, "step": 9854 }, { "epoch": 0.8003735888897913, "grad_norm": 4.045630006380179, "learning_rate": 5.046189563964595e-07, "loss": 0.4629, "step": 9855 }, { "epoch": 0.8004548038658329, "grad_norm": 4.540887627720557, "learning_rate": 5.042228372911815e-07, "loss": 0.4685, "step": 9856 }, { "epoch": 0.8005360188418744, "grad_norm": 5.474722039970322, "learning_rate": 5.038268562831214e-07, "loss": 0.4574, "step": 9857 }, { "epoch": 0.800617233817916, "grad_norm": 4.262273306813277, "learning_rate": 5.034310133996772e-07, "loss": 0.6693, "step": 9858 }, { "epoch": 0.8006984487939576, "grad_norm": 4.349294849404306, "learning_rate": 5.030353086682413e-07, "loss": 0.4987, "step": 9859 }, { "epoch": 0.8007796637699992, "grad_norm": 3.749623526864484, "learning_rate": 5.02639742116193e-07, "loss": 0.5335, "step": 9860 }, { "epoch": 0.8008608787460407, "grad_norm": 3.6511910926652926, "learning_rate": 5.022443137709032e-07, "loss": 0.5024, "step": 9861 }, { "epoch": 0.8009420937220824, "grad_norm": 7.167220329931862, "learning_rate": 5.018490236597337e-07, "loss": 0.7358, "step": 9862 }, { "epoch": 0.8010233086981239, "grad_norm": 4.748703110219761, "learning_rate": 5.014538718100373e-07, "loss": 0.5198, "step": 9863 }, { "epoch": 0.8011045236741655, "grad_norm": 6.53063347027075, "learning_rate": 5.01058858249156e-07, "loss": 0.5499, "step": 9864 }, { "epoch": 0.8011857386502071, "grad_norm": 4.349589259638556, "learning_rate": 5.006639830044219e-07, "loss": 0.4342, "step": 9865 }, { "epoch": 0.8012669536262487, "grad_norm": 4.143371415401599, "learning_rate": 5.002692461031591e-07, "loss": 0.5895, "step": 9866 }, { "epoch": 0.8013481686022903, "grad_norm": 7.7254724395274055, "learning_rate": 4.998746475726815e-07, "loss": 0.5336, "step": 9867 }, { "epoch": 0.8014293835783318, "grad_norm": 5.175670084157273, "learning_rate": 4.994801874402918e-07, "loss": 0.6204, "step": 9868 }, { "epoch": 0.8015105985543735, "grad_norm": 6.735892978669596, "learning_rate": 4.990858657332856e-07, "loss": 0.3911, "step": 9869 }, { "epoch": 0.801591813530415, "grad_norm": 5.104643492256398, "learning_rate": 4.986916824789484e-07, "loss": 0.5035, "step": 9870 }, { "epoch": 0.8016730285064566, "grad_norm": 3.5531328145211476, "learning_rate": 4.982976377045546e-07, "loss": 0.4066, "step": 9871 }, { "epoch": 0.8017542434824981, "grad_norm": 4.292790999663058, "learning_rate": 4.979037314373708e-07, "loss": 0.7253, "step": 9872 }, { "epoch": 0.8018354584585398, "grad_norm": 3.9126112690178934, "learning_rate": 4.975099637046529e-07, "loss": 0.5173, "step": 9873 }, { "epoch": 0.8019166734345813, "grad_norm": 4.952119344921789, "learning_rate": 4.971163345336469e-07, "loss": 0.546, "step": 9874 }, { "epoch": 0.8019978884106229, "grad_norm": 4.007491175094282, "learning_rate": 4.967228439515903e-07, "loss": 0.5014, "step": 9875 }, { "epoch": 0.8020791033866645, "grad_norm": 7.4996807066549405, "learning_rate": 4.963294919857115e-07, "loss": 0.4807, "step": 9876 }, { "epoch": 0.8021603183627061, "grad_norm": 3.8718307958429308, "learning_rate": 4.959362786632274e-07, "loss": 0.3412, "step": 9877 }, { "epoch": 0.8022415333387477, "grad_norm": 14.309366650475116, "learning_rate": 4.955432040113459e-07, "loss": 0.4632, "step": 9878 }, { "epoch": 0.8023227483147892, "grad_norm": 6.072346159252271, "learning_rate": 4.95150268057267e-07, "loss": 0.3375, "step": 9879 }, { "epoch": 0.8024039632908309, "grad_norm": 3.9060655122044956, "learning_rate": 4.947574708281788e-07, "loss": 0.4548, "step": 9880 }, { "epoch": 0.8024851782668724, "grad_norm": 4.543113575494821, "learning_rate": 4.943648123512607e-07, "loss": 0.5799, "step": 9881 }, { "epoch": 0.802566393242914, "grad_norm": 8.154618850928907, "learning_rate": 4.939722926536825e-07, "loss": 0.3684, "step": 9882 }, { "epoch": 0.8026476082189555, "grad_norm": 3.3302480665945673, "learning_rate": 4.935799117626058e-07, "loss": 0.5121, "step": 9883 }, { "epoch": 0.8027288231949972, "grad_norm": 5.010095824925431, "learning_rate": 4.931876697051797e-07, "loss": 0.4584, "step": 9884 }, { "epoch": 0.8028100381710387, "grad_norm": 8.290730780611321, "learning_rate": 4.927955665085466e-07, "loss": 0.5904, "step": 9885 }, { "epoch": 0.8028912531470803, "grad_norm": 6.530421557967707, "learning_rate": 4.924036021998372e-07, "loss": 0.6501, "step": 9886 }, { "epoch": 0.802972468123122, "grad_norm": 3.933371390397304, "learning_rate": 4.92011776806173e-07, "loss": 0.4631, "step": 9887 }, { "epoch": 0.8030536830991635, "grad_norm": 4.630936993671216, "learning_rate": 4.916200903546664e-07, "loss": 0.4694, "step": 9888 }, { "epoch": 0.8031348980752051, "grad_norm": 8.920541461832672, "learning_rate": 4.912285428724214e-07, "loss": 0.4291, "step": 9889 }, { "epoch": 0.8032161130512466, "grad_norm": 4.683558801065991, "learning_rate": 4.908371343865289e-07, "loss": 0.5715, "step": 9890 }, { "epoch": 0.8032973280272883, "grad_norm": 4.064081828276734, "learning_rate": 4.904458649240742e-07, "loss": 0.6187, "step": 9891 }, { "epoch": 0.8033785430033298, "grad_norm": 13.844068111838718, "learning_rate": 4.900547345121304e-07, "loss": 0.4852, "step": 9892 }, { "epoch": 0.8034597579793714, "grad_norm": 7.334487582827237, "learning_rate": 4.896637431777607e-07, "loss": 0.5241, "step": 9893 }, { "epoch": 0.8035409729554129, "grad_norm": 4.075794059330378, "learning_rate": 4.89272890948021e-07, "loss": 0.4231, "step": 9894 }, { "epoch": 0.8036221879314546, "grad_norm": 3.1839585809732913, "learning_rate": 4.88882177849955e-07, "loss": 0.3873, "step": 9895 }, { "epoch": 0.8037034029074961, "grad_norm": 3.9421773033440233, "learning_rate": 4.884916039105994e-07, "loss": 0.4194, "step": 9896 }, { "epoch": 0.8037846178835377, "grad_norm": 4.937157672903017, "learning_rate": 4.881011691569781e-07, "loss": 0.3529, "step": 9897 }, { "epoch": 0.8038658328595794, "grad_norm": 9.136038936527344, "learning_rate": 4.877108736161091e-07, "loss": 0.5727, "step": 9898 }, { "epoch": 0.8039470478356209, "grad_norm": 4.5845637025408, "learning_rate": 4.873207173149974e-07, "loss": 0.608, "step": 9899 }, { "epoch": 0.8040282628116625, "grad_norm": 5.642715408471168, "learning_rate": 4.869307002806397e-07, "loss": 0.471, "step": 9900 }, { "epoch": 0.804109477787704, "grad_norm": 6.2233329502733294, "learning_rate": 4.865408225400234e-07, "loss": 0.51, "step": 9901 }, { "epoch": 0.8041906927637457, "grad_norm": 7.993507725893066, "learning_rate": 4.861510841201266e-07, "loss": 0.3892, "step": 9902 }, { "epoch": 0.8042719077397872, "grad_norm": 4.96226956095677, "learning_rate": 4.857614850479161e-07, "loss": 0.5246, "step": 9903 }, { "epoch": 0.8043531227158288, "grad_norm": 11.603584675552659, "learning_rate": 4.853720253503514e-07, "loss": 0.4751, "step": 9904 }, { "epoch": 0.8044343376918703, "grad_norm": 4.342808900661657, "learning_rate": 4.849827050543801e-07, "loss": 0.5307, "step": 9905 }, { "epoch": 0.804515552667912, "grad_norm": 6.60599795238213, "learning_rate": 4.845935241869409e-07, "loss": 0.4751, "step": 9906 }, { "epoch": 0.8045967676439535, "grad_norm": 4.155426698257137, "learning_rate": 4.842044827749632e-07, "loss": 0.511, "step": 9907 }, { "epoch": 0.8046779826199951, "grad_norm": 5.569956854051911, "learning_rate": 4.838155808453676e-07, "loss": 0.6412, "step": 9908 }, { "epoch": 0.8047591975960368, "grad_norm": 8.778391998701604, "learning_rate": 4.834268184250626e-07, "loss": 0.3876, "step": 9909 }, { "epoch": 0.8048404125720783, "grad_norm": 6.809131934976907, "learning_rate": 4.830381955409497e-07, "loss": 0.438, "step": 9910 }, { "epoch": 0.8049216275481199, "grad_norm": 6.2072199104772885, "learning_rate": 4.826497122199191e-07, "loss": 0.5043, "step": 9911 }, { "epoch": 0.8050028425241614, "grad_norm": 9.779692232356227, "learning_rate": 4.822613684888519e-07, "loss": 0.4799, "step": 9912 }, { "epoch": 0.8050840575002031, "grad_norm": 5.12858071171813, "learning_rate": 4.818731643746186e-07, "loss": 0.5888, "step": 9913 }, { "epoch": 0.8051652724762446, "grad_norm": 3.86454745630488, "learning_rate": 4.814850999040816e-07, "loss": 0.551, "step": 9914 }, { "epoch": 0.8052464874522862, "grad_norm": 4.189470752357414, "learning_rate": 4.810971751040932e-07, "loss": 0.4416, "step": 9915 }, { "epoch": 0.8053277024283277, "grad_norm": 8.768793347385872, "learning_rate": 4.80709390001495e-07, "loss": 0.3583, "step": 9916 }, { "epoch": 0.8054089174043694, "grad_norm": 5.814244672682726, "learning_rate": 4.803217446231206e-07, "loss": 0.5113, "step": 9917 }, { "epoch": 0.8054901323804109, "grad_norm": 4.585923445910094, "learning_rate": 4.799342389957925e-07, "loss": 0.4861, "step": 9918 }, { "epoch": 0.8055713473564525, "grad_norm": 5.831696175264736, "learning_rate": 4.795468731463232e-07, "loss": 0.5333, "step": 9919 }, { "epoch": 0.8056525623324942, "grad_norm": 4.199244928525552, "learning_rate": 4.791596471015175e-07, "loss": 0.5757, "step": 9920 }, { "epoch": 0.8057337773085357, "grad_norm": 4.550642371506264, "learning_rate": 4.787725608881694e-07, "loss": 0.5091, "step": 9921 }, { "epoch": 0.8058149922845773, "grad_norm": 6.904305095897743, "learning_rate": 4.783856145330624e-07, "loss": 0.3853, "step": 9922 }, { "epoch": 0.8058962072606188, "grad_norm": 5.386276895757226, "learning_rate": 4.779988080629722e-07, "loss": 0.547, "step": 9923 }, { "epoch": 0.8059774222366605, "grad_norm": 5.592802881169542, "learning_rate": 4.776121415046634e-07, "loss": 0.6014, "step": 9924 }, { "epoch": 0.806058637212702, "grad_norm": 6.5204999954371825, "learning_rate": 4.772256148848903e-07, "loss": 0.5031, "step": 9925 }, { "epoch": 0.8061398521887436, "grad_norm": 4.641348135597619, "learning_rate": 4.768392282303999e-07, "loss": 0.4754, "step": 9926 }, { "epoch": 0.8062210671647851, "grad_norm": 6.611205228762905, "learning_rate": 4.7645298156792667e-07, "loss": 0.6335, "step": 9927 }, { "epoch": 0.8063022821408268, "grad_norm": 4.080391885424033, "learning_rate": 4.7606687492419785e-07, "loss": 0.5242, "step": 9928 }, { "epoch": 0.8063834971168683, "grad_norm": 12.801912551363564, "learning_rate": 4.7568090832593033e-07, "loss": 0.4403, "step": 9929 }, { "epoch": 0.8064647120929099, "grad_norm": 6.118657536304109, "learning_rate": 4.752950817998303e-07, "loss": 0.505, "step": 9930 }, { "epoch": 0.8065459270689516, "grad_norm": 5.362943206753638, "learning_rate": 4.7490939537259527e-07, "loss": 0.4619, "step": 9931 }, { "epoch": 0.8066271420449931, "grad_norm": 6.975033167992144, "learning_rate": 4.745238490709117e-07, "loss": 0.3267, "step": 9932 }, { "epoch": 0.8067083570210347, "grad_norm": 4.258954432113065, "learning_rate": 4.741384429214579e-07, "loss": 0.4316, "step": 9933 }, { "epoch": 0.8067895719970762, "grad_norm": 3.8337908412938972, "learning_rate": 4.7375317695090295e-07, "loss": 0.5411, "step": 9934 }, { "epoch": 0.8068707869731179, "grad_norm": 3.8911807697106213, "learning_rate": 4.7336805118590375e-07, "loss": 0.4551, "step": 9935 }, { "epoch": 0.8069520019491594, "grad_norm": 3.9612535233371764, "learning_rate": 4.729830656531101e-07, "loss": 0.5383, "step": 9936 }, { "epoch": 0.807033216925201, "grad_norm": 4.951917020944689, "learning_rate": 4.725982203791607e-07, "loss": 0.6442, "step": 9937 }, { "epoch": 0.8071144319012425, "grad_norm": 9.313306419072807, "learning_rate": 4.7221351539068374e-07, "loss": 0.4088, "step": 9938 }, { "epoch": 0.8071956468772842, "grad_norm": 5.792989399998622, "learning_rate": 4.7182895071430036e-07, "loss": 0.4883, "step": 9939 }, { "epoch": 0.8072768618533257, "grad_norm": 6.612743860989568, "learning_rate": 4.7144452637661875e-07, "loss": 0.522, "step": 9940 }, { "epoch": 0.8073580768293673, "grad_norm": 5.3380622522214916, "learning_rate": 4.7106024240424014e-07, "loss": 0.5033, "step": 9941 }, { "epoch": 0.807439291805409, "grad_norm": 10.643012463139256, "learning_rate": 4.706760988237555e-07, "loss": 0.4336, "step": 9942 }, { "epoch": 0.8075205067814505, "grad_norm": 6.794117107098659, "learning_rate": 4.702920956617446e-07, "loss": 0.4817, "step": 9943 }, { "epoch": 0.8076017217574921, "grad_norm": 10.634630209323328, "learning_rate": 4.6990823294477795e-07, "loss": 0.3929, "step": 9944 }, { "epoch": 0.8076829367335336, "grad_norm": 6.387003003213154, "learning_rate": 4.695245106994181e-07, "loss": 0.4227, "step": 9945 }, { "epoch": 0.8077641517095753, "grad_norm": 5.885749845817612, "learning_rate": 4.691409289522156e-07, "loss": 0.4674, "step": 9946 }, { "epoch": 0.8078453666856168, "grad_norm": 5.744010803243097, "learning_rate": 4.6875748772971244e-07, "loss": 0.6015, "step": 9947 }, { "epoch": 0.8079265816616584, "grad_norm": 3.2133616441041704, "learning_rate": 4.683741870584413e-07, "loss": 0.4788, "step": 9948 }, { "epoch": 0.8080077966377, "grad_norm": 6.807257172917265, "learning_rate": 4.679910269649246e-07, "loss": 0.5901, "step": 9949 }, { "epoch": 0.8080890116137416, "grad_norm": 3.9285075156101823, "learning_rate": 4.676080074756745e-07, "loss": 0.5877, "step": 9950 }, { "epoch": 0.8081702265897831, "grad_norm": 5.276676978089501, "learning_rate": 4.6722512861719304e-07, "loss": 0.5472, "step": 9951 }, { "epoch": 0.8082514415658247, "grad_norm": 5.815263402068283, "learning_rate": 4.6684239041597524e-07, "loss": 0.6121, "step": 9952 }, { "epoch": 0.8083326565418664, "grad_norm": 8.366540950724946, "learning_rate": 4.6645979289850316e-07, "loss": 0.3672, "step": 9953 }, { "epoch": 0.8084138715179079, "grad_norm": 4.915630857560634, "learning_rate": 4.66077336091251e-07, "loss": 0.4365, "step": 9954 }, { "epoch": 0.8084950864939495, "grad_norm": 14.896314047238715, "learning_rate": 4.6569502002068336e-07, "loss": 0.3652, "step": 9955 }, { "epoch": 0.808576301469991, "grad_norm": 5.45782242106025, "learning_rate": 4.6531284471325375e-07, "loss": 0.4285, "step": 9956 }, { "epoch": 0.8086575164460327, "grad_norm": 4.550708998274069, "learning_rate": 4.649308101954064e-07, "loss": 0.5137, "step": 9957 }, { "epoch": 0.8087387314220742, "grad_norm": 4.5179592494433125, "learning_rate": 4.645489164935774e-07, "loss": 0.4994, "step": 9958 }, { "epoch": 0.8088199463981158, "grad_norm": 8.186932272267061, "learning_rate": 4.641671636341899e-07, "loss": 0.4727, "step": 9959 }, { "epoch": 0.8089011613741574, "grad_norm": 7.7788457747580635, "learning_rate": 4.637855516436604e-07, "loss": 0.4331, "step": 9960 }, { "epoch": 0.808982376350199, "grad_norm": 9.523483750496569, "learning_rate": 4.634040805483947e-07, "loss": 0.4042, "step": 9961 }, { "epoch": 0.8090635913262405, "grad_norm": 4.262182822939964, "learning_rate": 4.6302275037478804e-07, "loss": 0.6089, "step": 9962 }, { "epoch": 0.8091448063022821, "grad_norm": 5.01145075636782, "learning_rate": 4.6264156114922605e-07, "loss": 0.6344, "step": 9963 }, { "epoch": 0.8092260212783238, "grad_norm": 6.573486459689827, "learning_rate": 4.622605128980862e-07, "loss": 0.3984, "step": 9964 }, { "epoch": 0.8093072362543653, "grad_norm": 6.380655374815469, "learning_rate": 4.61879605647734e-07, "loss": 0.3601, "step": 9965 }, { "epoch": 0.8093884512304069, "grad_norm": 11.839284886283753, "learning_rate": 4.6149883942452595e-07, "loss": 0.626, "step": 9966 }, { "epoch": 0.8094696662064484, "grad_norm": 6.373110585879932, "learning_rate": 4.6111821425480956e-07, "loss": 0.7225, "step": 9967 }, { "epoch": 0.8095508811824901, "grad_norm": 7.7580649386331455, "learning_rate": 4.6073773016492267e-07, "loss": 0.5471, "step": 9968 }, { "epoch": 0.8096320961585316, "grad_norm": 4.824098525076564, "learning_rate": 4.603573871811923e-07, "loss": 0.5152, "step": 9969 }, { "epoch": 0.8097133111345732, "grad_norm": 2.881153799640226, "learning_rate": 4.5997718532993535e-07, "loss": 0.5088, "step": 9970 }, { "epoch": 0.8097945261106148, "grad_norm": 6.514585014658824, "learning_rate": 4.5959712463746144e-07, "loss": 0.4321, "step": 9971 }, { "epoch": 0.8098757410866564, "grad_norm": 7.85731483603247, "learning_rate": 4.5921720513006697e-07, "loss": 0.3917, "step": 9972 }, { "epoch": 0.8099569560626979, "grad_norm": 5.1598024248620495, "learning_rate": 4.588374268340412e-07, "loss": 0.5726, "step": 9973 }, { "epoch": 0.8100381710387395, "grad_norm": 4.512666934916995, "learning_rate": 4.584577897756634e-07, "loss": 0.5283, "step": 9974 }, { "epoch": 0.8101193860147812, "grad_norm": 8.411566127602082, "learning_rate": 4.58078293981202e-07, "loss": 0.4524, "step": 9975 }, { "epoch": 0.8102006009908227, "grad_norm": 8.15739766766053, "learning_rate": 4.5769893947691517e-07, "loss": 0.4705, "step": 9976 }, { "epoch": 0.8102818159668643, "grad_norm": 6.8035924289379235, "learning_rate": 4.5731972628905357e-07, "loss": 0.4252, "step": 9977 }, { "epoch": 0.8103630309429058, "grad_norm": 4.227519769293889, "learning_rate": 4.5694065444385564e-07, "loss": 0.5061, "step": 9978 }, { "epoch": 0.8104442459189475, "grad_norm": 6.581692909390787, "learning_rate": 4.5656172396755156e-07, "loss": 0.4785, "step": 9979 }, { "epoch": 0.810525460894989, "grad_norm": 4.606311339058521, "learning_rate": 4.561829348863622e-07, "loss": 0.5214, "step": 9980 }, { "epoch": 0.8106066758710306, "grad_norm": 6.277335985489802, "learning_rate": 4.55804287226497e-07, "loss": 0.2991, "step": 9981 }, { "epoch": 0.8106878908470722, "grad_norm": 5.904104493248545, "learning_rate": 4.5542578101415576e-07, "loss": 0.4301, "step": 9982 }, { "epoch": 0.8107691058231138, "grad_norm": 3.728461827087679, "learning_rate": 4.550474162755303e-07, "loss": 0.6069, "step": 9983 }, { "epoch": 0.8108503207991553, "grad_norm": 4.995689618250689, "learning_rate": 4.546691930368008e-07, "loss": 0.4381, "step": 9984 }, { "epoch": 0.8109315357751969, "grad_norm": 6.939274659064955, "learning_rate": 4.5429111132413773e-07, "loss": 0.5178, "step": 9985 }, { "epoch": 0.8110127507512386, "grad_norm": 6.822025362372183, "learning_rate": 4.539131711637032e-07, "loss": 0.3963, "step": 9986 }, { "epoch": 0.8110939657272801, "grad_norm": 6.644364425158026, "learning_rate": 4.535353725816488e-07, "loss": 0.504, "step": 9987 }, { "epoch": 0.8111751807033217, "grad_norm": 5.480802203291707, "learning_rate": 4.5315771560411617e-07, "loss": 0.4743, "step": 9988 }, { "epoch": 0.8112563956793633, "grad_norm": 5.912237232094158, "learning_rate": 4.5278020025723596e-07, "loss": 0.4656, "step": 9989 }, { "epoch": 0.8113376106554049, "grad_norm": 5.0776992221582935, "learning_rate": 4.524028265671318e-07, "loss": 0.4281, "step": 9990 }, { "epoch": 0.8114188256314464, "grad_norm": 5.3270257232616025, "learning_rate": 4.5202559455991473e-07, "loss": 0.4505, "step": 9991 }, { "epoch": 0.811500040607488, "grad_norm": 6.190730407880878, "learning_rate": 4.516485042616878e-07, "loss": 0.4437, "step": 9992 }, { "epoch": 0.8115812555835296, "grad_norm": 5.542928858101199, "learning_rate": 4.512715556985442e-07, "loss": 0.4011, "step": 9993 }, { "epoch": 0.8116624705595712, "grad_norm": 7.642739194941673, "learning_rate": 4.508947488965662e-07, "loss": 0.4349, "step": 9994 }, { "epoch": 0.8117436855356127, "grad_norm": 7.8072925191375395, "learning_rate": 4.505180838818263e-07, "loss": 0.5984, "step": 9995 }, { "epoch": 0.8118249005116543, "grad_norm": 4.656078397528526, "learning_rate": 4.501415606803888e-07, "loss": 0.4646, "step": 9996 }, { "epoch": 0.811906115487696, "grad_norm": 4.525381591463549, "learning_rate": 4.4976517931830637e-07, "loss": 0.4135, "step": 9997 }, { "epoch": 0.8119873304637375, "grad_norm": 5.890129159202458, "learning_rate": 4.4938893982162253e-07, "loss": 0.5532, "step": 9998 }, { "epoch": 0.8120685454397791, "grad_norm": 6.903081683256655, "learning_rate": 4.4901284221637113e-07, "loss": 0.5935, "step": 9999 }, { "epoch": 0.8121497604158207, "grad_norm": 19.18119241326282, "learning_rate": 4.48636886528577e-07, "loss": 0.4778, "step": 10000 }, { "epoch": 0.8122309753918623, "grad_norm": 6.157895091798137, "learning_rate": 4.482610727842532e-07, "loss": 0.69, "step": 10001 }, { "epoch": 0.8123121903679038, "grad_norm": 5.599909239938712, "learning_rate": 4.47885401009405e-07, "loss": 0.4685, "step": 10002 }, { "epoch": 0.8123934053439454, "grad_norm": 6.484452032525161, "learning_rate": 4.475098712300263e-07, "loss": 0.5125, "step": 10003 }, { "epoch": 0.812474620319987, "grad_norm": 8.535162150811436, "learning_rate": 4.4713448347210114e-07, "loss": 0.3792, "step": 10004 }, { "epoch": 0.8125558352960286, "grad_norm": 13.187357588897884, "learning_rate": 4.4675923776160533e-07, "loss": 0.4935, "step": 10005 }, { "epoch": 0.8126370502720701, "grad_norm": 7.720879346905562, "learning_rate": 4.463841341245043e-07, "loss": 0.6142, "step": 10006 }, { "epoch": 0.8127182652481117, "grad_norm": 4.332760377433552, "learning_rate": 4.460091725867524e-07, "loss": 0.436, "step": 10007 }, { "epoch": 0.8127994802241534, "grad_norm": 6.514614871896753, "learning_rate": 4.456343531742946e-07, "loss": 0.4141, "step": 10008 }, { "epoch": 0.8128806952001949, "grad_norm": 5.3062919238625605, "learning_rate": 4.4525967591306757e-07, "loss": 0.4301, "step": 10009 }, { "epoch": 0.8129619101762365, "grad_norm": 9.134817655594835, "learning_rate": 4.448851408289964e-07, "loss": 0.5738, "step": 10010 }, { "epoch": 0.8130431251522781, "grad_norm": 4.873683627050256, "learning_rate": 4.4451074794799627e-07, "loss": 0.304, "step": 10011 }, { "epoch": 0.8131243401283197, "grad_norm": 3.4718602879812437, "learning_rate": 4.4413649729597386e-07, "loss": 0.3923, "step": 10012 }, { "epoch": 0.8132055551043612, "grad_norm": 4.570485265053012, "learning_rate": 4.43762388898826e-07, "loss": 0.4372, "step": 10013 }, { "epoch": 0.8132867700804028, "grad_norm": 3.9431671288156664, "learning_rate": 4.4338842278243784e-07, "loss": 0.4041, "step": 10014 }, { "epoch": 0.8133679850564444, "grad_norm": 9.625205916829707, "learning_rate": 4.4301459897268695e-07, "loss": 0.4428, "step": 10015 }, { "epoch": 0.813449200032486, "grad_norm": 7.135399774793007, "learning_rate": 4.426409174954391e-07, "loss": 0.5457, "step": 10016 }, { "epoch": 0.8135304150085275, "grad_norm": 7.241634226766735, "learning_rate": 4.4226737837655106e-07, "loss": 0.4478, "step": 10017 }, { "epoch": 0.8136116299845692, "grad_norm": 5.474906261114699, "learning_rate": 4.418939816418699e-07, "loss": 0.4813, "step": 10018 }, { "epoch": 0.8136928449606108, "grad_norm": 7.248288071920974, "learning_rate": 4.4152072731723336e-07, "loss": 0.5229, "step": 10019 }, { "epoch": 0.8137740599366523, "grad_norm": 4.68601406629194, "learning_rate": 4.411476154284683e-07, "loss": 0.4383, "step": 10020 }, { "epoch": 0.8138552749126939, "grad_norm": 8.548985633110162, "learning_rate": 4.407746460013912e-07, "loss": 0.434, "step": 10021 }, { "epoch": 0.8139364898887355, "grad_norm": 3.598258827038912, "learning_rate": 4.404018190618109e-07, "loss": 0.6293, "step": 10022 }, { "epoch": 0.8140177048647771, "grad_norm": 4.9162560687761525, "learning_rate": 4.4002913463552457e-07, "loss": 0.6338, "step": 10023 }, { "epoch": 0.8140989198408186, "grad_norm": 5.176333776815743, "learning_rate": 4.39656592748319e-07, "loss": 0.4134, "step": 10024 }, { "epoch": 0.8141801348168602, "grad_norm": 5.309773965001737, "learning_rate": 4.392841934259731e-07, "loss": 0.4122, "step": 10025 }, { "epoch": 0.8142613497929018, "grad_norm": 4.204661866235209, "learning_rate": 4.3891193669425567e-07, "loss": 0.5778, "step": 10026 }, { "epoch": 0.8143425647689434, "grad_norm": 5.654465245751199, "learning_rate": 4.3853982257892335e-07, "loss": 0.5514, "step": 10027 }, { "epoch": 0.8144237797449849, "grad_norm": 4.506512234014459, "learning_rate": 4.3816785110572554e-07, "loss": 0.537, "step": 10028 }, { "epoch": 0.8145049947210266, "grad_norm": 3.728172094291727, "learning_rate": 4.3779602230040075e-07, "loss": 0.4989, "step": 10029 }, { "epoch": 0.8145862096970682, "grad_norm": 7.112854813373821, "learning_rate": 4.3742433618867623e-07, "loss": 0.4539, "step": 10030 }, { "epoch": 0.8146674246731097, "grad_norm": 4.9529582527943745, "learning_rate": 4.370527927962717e-07, "loss": 0.6694, "step": 10031 }, { "epoch": 0.8147486396491513, "grad_norm": 5.619899087759572, "learning_rate": 4.366813921488966e-07, "loss": 0.5285, "step": 10032 }, { "epoch": 0.8148298546251929, "grad_norm": 5.707386464605084, "learning_rate": 4.363101342722484e-07, "loss": 0.4412, "step": 10033 }, { "epoch": 0.8149110696012345, "grad_norm": 4.86389548489138, "learning_rate": 4.359390191920176e-07, "loss": 0.5749, "step": 10034 }, { "epoch": 0.814992284577276, "grad_norm": 5.994879397108235, "learning_rate": 4.35568046933883e-07, "loss": 0.3602, "step": 10035 }, { "epoch": 0.8150734995533176, "grad_norm": 3.0944128092213523, "learning_rate": 4.3519721752351305e-07, "loss": 0.448, "step": 10036 }, { "epoch": 0.8151547145293592, "grad_norm": 6.293183554065846, "learning_rate": 4.3482653098656764e-07, "loss": 0.5155, "step": 10037 }, { "epoch": 0.8152359295054008, "grad_norm": 6.726079260676008, "learning_rate": 4.3445598734869725e-07, "loss": 0.5044, "step": 10038 }, { "epoch": 0.8153171444814423, "grad_norm": 5.929353123479193, "learning_rate": 4.340855866355409e-07, "loss": 0.3846, "step": 10039 }, { "epoch": 0.815398359457484, "grad_norm": 4.346247885330788, "learning_rate": 4.3371532887272747e-07, "loss": 0.4431, "step": 10040 }, { "epoch": 0.8154795744335256, "grad_norm": 6.564210241083292, "learning_rate": 4.333452140858782e-07, "loss": 0.4179, "step": 10041 }, { "epoch": 0.8155607894095671, "grad_norm": 6.160516432704893, "learning_rate": 4.3297524230060257e-07, "loss": 0.4109, "step": 10042 }, { "epoch": 0.8156420043856087, "grad_norm": 4.046212520398475, "learning_rate": 4.326054135425001e-07, "loss": 0.4727, "step": 10043 }, { "epoch": 0.8157232193616503, "grad_norm": 4.21989184407862, "learning_rate": 4.322357278371614e-07, "loss": 0.5261, "step": 10044 }, { "epoch": 0.8158044343376919, "grad_norm": 5.065472356667533, "learning_rate": 4.3186618521016745e-07, "loss": 0.4124, "step": 10045 }, { "epoch": 0.8158856493137334, "grad_norm": 6.317472489620383, "learning_rate": 4.314967856870872e-07, "loss": 0.439, "step": 10046 }, { "epoch": 0.815966864289775, "grad_norm": 6.622990894266353, "learning_rate": 4.31127529293483e-07, "loss": 0.2964, "step": 10047 }, { "epoch": 0.8160480792658166, "grad_norm": 5.343326462409636, "learning_rate": 4.3075841605490414e-07, "loss": 0.4928, "step": 10048 }, { "epoch": 0.8161292942418582, "grad_norm": 7.146378891333632, "learning_rate": 4.3038944599689105e-07, "loss": 0.3974, "step": 10049 }, { "epoch": 0.8162105092178997, "grad_norm": 4.603775445290206, "learning_rate": 4.300206191449749e-07, "loss": 0.3021, "step": 10050 }, { "epoch": 0.8162917241939414, "grad_norm": 6.382432355081491, "learning_rate": 4.2965193552467753e-07, "loss": 0.4456, "step": 10051 }, { "epoch": 0.816372939169983, "grad_norm": 5.191619470887934, "learning_rate": 4.292833951615083e-07, "loss": 0.3823, "step": 10052 }, { "epoch": 0.8164541541460245, "grad_norm": 3.9888610071010224, "learning_rate": 4.289149980809698e-07, "loss": 0.6601, "step": 10053 }, { "epoch": 0.8165353691220661, "grad_norm": 8.500802989835256, "learning_rate": 4.2854674430855224e-07, "loss": 0.4692, "step": 10054 }, { "epoch": 0.8166165840981077, "grad_norm": 4.251250937241956, "learning_rate": 4.281786338697369e-07, "loss": 0.4196, "step": 10055 }, { "epoch": 0.8166977990741493, "grad_norm": 5.93357022678489, "learning_rate": 4.278106667899945e-07, "loss": 0.7008, "step": 10056 }, { "epoch": 0.8167790140501908, "grad_norm": 4.588729939434148, "learning_rate": 4.274428430947872e-07, "loss": 0.4552, "step": 10057 }, { "epoch": 0.8168602290262325, "grad_norm": 6.368147028406131, "learning_rate": 4.270751628095668e-07, "loss": 0.3992, "step": 10058 }, { "epoch": 0.816941444002274, "grad_norm": 6.6517461490094565, "learning_rate": 4.2670762595977356e-07, "loss": 0.4615, "step": 10059 }, { "epoch": 0.8170226589783156, "grad_norm": 8.295784988993061, "learning_rate": 4.2634023257084074e-07, "loss": 0.5174, "step": 10060 }, { "epoch": 0.8171038739543571, "grad_norm": 5.523298295622445, "learning_rate": 4.259729826681891e-07, "loss": 0.3388, "step": 10061 }, { "epoch": 0.8171850889303988, "grad_norm": 3.8441813036836137, "learning_rate": 4.2560587627722973e-07, "loss": 0.4553, "step": 10062 }, { "epoch": 0.8172663039064404, "grad_norm": 6.951945297996189, "learning_rate": 4.2523891342336506e-07, "loss": 0.5312, "step": 10063 }, { "epoch": 0.8173475188824819, "grad_norm": 4.871582562337795, "learning_rate": 4.2487209413198784e-07, "loss": 0.3208, "step": 10064 }, { "epoch": 0.8174287338585235, "grad_norm": 7.346570602660871, "learning_rate": 4.245054184284786e-07, "loss": 0.4115, "step": 10065 }, { "epoch": 0.8175099488345651, "grad_norm": 4.392641843655781, "learning_rate": 4.2413888633821064e-07, "loss": 0.5406, "step": 10066 }, { "epoch": 0.8175911638106067, "grad_norm": 7.23578723028704, "learning_rate": 4.237724978865454e-07, "loss": 0.4598, "step": 10067 }, { "epoch": 0.8176723787866482, "grad_norm": 3.553356832932114, "learning_rate": 4.234062530988342e-07, "loss": 0.5109, "step": 10068 }, { "epoch": 0.8177535937626899, "grad_norm": 5.0655040941198255, "learning_rate": 4.2304015200042095e-07, "loss": 0.6022, "step": 10069 }, { "epoch": 0.8178348087387314, "grad_norm": 4.3230856397717226, "learning_rate": 4.2267419461663626e-07, "loss": 0.4315, "step": 10070 }, { "epoch": 0.817916023714773, "grad_norm": 4.6434028931530715, "learning_rate": 4.223083809728032e-07, "loss": 0.489, "step": 10071 }, { "epoch": 0.8179972386908145, "grad_norm": 3.9428152943512673, "learning_rate": 4.219427110942348e-07, "loss": 0.5609, "step": 10072 }, { "epoch": 0.8180784536668562, "grad_norm": 3.9140488822139243, "learning_rate": 4.215771850062328e-07, "loss": 0.5247, "step": 10073 }, { "epoch": 0.8181596686428978, "grad_norm": 5.179289360939204, "learning_rate": 4.2121180273408976e-07, "loss": 0.4456, "step": 10074 }, { "epoch": 0.8182408836189393, "grad_norm": 6.433424694858643, "learning_rate": 4.2084656430308765e-07, "loss": 0.4248, "step": 10075 }, { "epoch": 0.818322098594981, "grad_norm": 3.365422337217589, "learning_rate": 4.204814697384993e-07, "loss": 0.4796, "step": 10076 }, { "epoch": 0.8184033135710225, "grad_norm": 7.511774642841241, "learning_rate": 4.2011651906558815e-07, "loss": 0.6421, "step": 10077 }, { "epoch": 0.8184845285470641, "grad_norm": 8.546191005512076, "learning_rate": 4.1975171230960563e-07, "loss": 0.3865, "step": 10078 }, { "epoch": 0.8185657435231056, "grad_norm": 4.417370933628543, "learning_rate": 4.193870494957958e-07, "loss": 0.453, "step": 10079 }, { "epoch": 0.8186469584991473, "grad_norm": 6.377398869498588, "learning_rate": 4.190225306493906e-07, "loss": 0.4526, "step": 10080 }, { "epoch": 0.8187281734751888, "grad_norm": 4.132468182015889, "learning_rate": 4.186581557956124e-07, "loss": 0.5708, "step": 10081 }, { "epoch": 0.8188093884512304, "grad_norm": 5.42105998479533, "learning_rate": 4.1829392495967485e-07, "loss": 0.5129, "step": 10082 }, { "epoch": 0.8188906034272719, "grad_norm": 5.495216782158462, "learning_rate": 4.1792983816677987e-07, "loss": 0.5831, "step": 10083 }, { "epoch": 0.8189718184033136, "grad_norm": 3.395888728224807, "learning_rate": 4.175658954421208e-07, "loss": 0.5072, "step": 10084 }, { "epoch": 0.8190530333793552, "grad_norm": 5.212070130995405, "learning_rate": 4.172020968108814e-07, "loss": 0.5714, "step": 10085 }, { "epoch": 0.8191342483553967, "grad_norm": 5.41209526251742, "learning_rate": 4.168384422982338e-07, "loss": 0.5516, "step": 10086 }, { "epoch": 0.8192154633314384, "grad_norm": 5.766740901549195, "learning_rate": 4.164749319293404e-07, "loss": 0.54, "step": 10087 }, { "epoch": 0.8192966783074799, "grad_norm": 12.522728877358098, "learning_rate": 4.1611156572935545e-07, "loss": 0.5872, "step": 10088 }, { "epoch": 0.8193778932835215, "grad_norm": 7.055171169509863, "learning_rate": 4.1574834372342053e-07, "loss": 0.3578, "step": 10089 }, { "epoch": 0.819459108259563, "grad_norm": 10.127142685814183, "learning_rate": 4.153852659366697e-07, "loss": 0.4256, "step": 10090 }, { "epoch": 0.8195403232356047, "grad_norm": 6.833361098639403, "learning_rate": 4.1502233239422624e-07, "loss": 0.5271, "step": 10091 }, { "epoch": 0.8196215382116462, "grad_norm": 3.7527822726714057, "learning_rate": 4.14659543121203e-07, "loss": 0.4952, "step": 10092 }, { "epoch": 0.8197027531876878, "grad_norm": 8.853385017010872, "learning_rate": 4.1429689814270284e-07, "loss": 0.4493, "step": 10093 }, { "epoch": 0.8197839681637293, "grad_norm": 4.804086833503905, "learning_rate": 4.139343974838181e-07, "loss": 0.4145, "step": 10094 }, { "epoch": 0.819865183139771, "grad_norm": 2.9666099986165473, "learning_rate": 4.135720411696334e-07, "loss": 0.3967, "step": 10095 }, { "epoch": 0.8199463981158126, "grad_norm": 6.498314301540783, "learning_rate": 4.132098292252204e-07, "loss": 0.3838, "step": 10096 }, { "epoch": 0.8200276130918541, "grad_norm": 5.829495180353205, "learning_rate": 4.128477616756432e-07, "loss": 0.4869, "step": 10097 }, { "epoch": 0.8201088280678958, "grad_norm": 5.33074506215202, "learning_rate": 4.124858385459554e-07, "loss": 0.4165, "step": 10098 }, { "epoch": 0.8201900430439373, "grad_norm": 4.833962665019804, "learning_rate": 4.1212405986119975e-07, "loss": 0.546, "step": 10099 }, { "epoch": 0.8202712580199789, "grad_norm": 4.555452441900101, "learning_rate": 4.117624256464084e-07, "loss": 0.4589, "step": 10100 }, { "epoch": 0.8203524729960204, "grad_norm": 3.2866233085274623, "learning_rate": 4.114009359266061e-07, "loss": 0.482, "step": 10101 }, { "epoch": 0.8204336879720621, "grad_norm": 5.633628638636074, "learning_rate": 4.1103959072680446e-07, "loss": 0.3467, "step": 10102 }, { "epoch": 0.8205149029481036, "grad_norm": 2.6695213124316703, "learning_rate": 4.106783900720074e-07, "loss": 0.5319, "step": 10103 }, { "epoch": 0.8205961179241452, "grad_norm": 6.398738658758001, "learning_rate": 4.1031733398720906e-07, "loss": 0.4174, "step": 10104 }, { "epoch": 0.8206773329001867, "grad_norm": 10.457002673876866, "learning_rate": 4.099564224973915e-07, "loss": 0.4107, "step": 10105 }, { "epoch": 0.8207585478762284, "grad_norm": 3.806681811308465, "learning_rate": 4.0959565562752767e-07, "loss": 0.5535, "step": 10106 }, { "epoch": 0.82083976285227, "grad_norm": 3.11228162293575, "learning_rate": 4.092350334025816e-07, "loss": 0.5205, "step": 10107 }, { "epoch": 0.8209209778283115, "grad_norm": 6.448242144302385, "learning_rate": 4.0887455584750547e-07, "loss": 0.5066, "step": 10108 }, { "epoch": 0.8210021928043532, "grad_norm": 4.566144679361284, "learning_rate": 4.0851422298724354e-07, "loss": 0.6172, "step": 10109 }, { "epoch": 0.8210834077803947, "grad_norm": 5.487995399366438, "learning_rate": 4.081540348467278e-07, "loss": 0.4025, "step": 10110 }, { "epoch": 0.8211646227564363, "grad_norm": 5.156550511751783, "learning_rate": 4.0779399145088247e-07, "loss": 0.4171, "step": 10111 }, { "epoch": 0.8212458377324778, "grad_norm": 5.693851091075354, "learning_rate": 4.074340928246201e-07, "loss": 0.4858, "step": 10112 }, { "epoch": 0.8213270527085195, "grad_norm": 10.430588612793592, "learning_rate": 4.0707433899284333e-07, "loss": 0.4029, "step": 10113 }, { "epoch": 0.821408267684561, "grad_norm": 6.3587031868115975, "learning_rate": 4.067147299804458e-07, "loss": 0.4363, "step": 10114 }, { "epoch": 0.8214894826606026, "grad_norm": 6.255964758565154, "learning_rate": 4.063552658123102e-07, "loss": 0.5734, "step": 10115 }, { "epoch": 0.8215706976366441, "grad_norm": 5.605733474613555, "learning_rate": 4.0599594651330956e-07, "loss": 0.5963, "step": 10116 }, { "epoch": 0.8216519126126858, "grad_norm": 4.079641751194994, "learning_rate": 4.0563677210830763e-07, "loss": 0.5508, "step": 10117 }, { "epoch": 0.8217331275887274, "grad_norm": 6.640081496102607, "learning_rate": 4.0527774262215687e-07, "loss": 0.3412, "step": 10118 }, { "epoch": 0.8218143425647689, "grad_norm": 5.1973586672338845, "learning_rate": 4.049188580796995e-07, "loss": 0.3926, "step": 10119 }, { "epoch": 0.8218955575408106, "grad_norm": 6.292394610949972, "learning_rate": 4.0456011850576985e-07, "loss": 0.3678, "step": 10120 }, { "epoch": 0.8219767725168521, "grad_norm": 7.307142444437324, "learning_rate": 4.0420152392518926e-07, "loss": 0.4533, "step": 10121 }, { "epoch": 0.8220579874928937, "grad_norm": 4.1068923254830345, "learning_rate": 4.038430743627714e-07, "loss": 0.535, "step": 10122 }, { "epoch": 0.8221392024689352, "grad_norm": 10.080633399311306, "learning_rate": 4.0348476984331977e-07, "loss": 0.5241, "step": 10123 }, { "epoch": 0.8222204174449769, "grad_norm": 5.11781610289055, "learning_rate": 4.031266103916262e-07, "loss": 0.6826, "step": 10124 }, { "epoch": 0.8223016324210184, "grad_norm": 10.246403450559914, "learning_rate": 4.0276859603247317e-07, "loss": 0.5244, "step": 10125 }, { "epoch": 0.82238284739706, "grad_norm": 5.888899275267595, "learning_rate": 4.0241072679063437e-07, "loss": 0.5099, "step": 10126 }, { "epoch": 0.8224640623731015, "grad_norm": 4.555118993832531, "learning_rate": 4.02053002690872e-07, "loss": 0.4915, "step": 10127 }, { "epoch": 0.8225452773491432, "grad_norm": 6.444960720915754, "learning_rate": 4.016954237579382e-07, "loss": 0.6126, "step": 10128 }, { "epoch": 0.8226264923251848, "grad_norm": 5.028533921984391, "learning_rate": 4.013379900165756e-07, "loss": 0.4286, "step": 10129 }, { "epoch": 0.8227077073012263, "grad_norm": 9.57264830043398, "learning_rate": 4.009807014915179e-07, "loss": 0.634, "step": 10130 }, { "epoch": 0.822788922277268, "grad_norm": 3.535035189189592, "learning_rate": 4.006235582074866e-07, "loss": 0.519, "step": 10131 }, { "epoch": 0.8228701372533095, "grad_norm": 5.20695512400172, "learning_rate": 4.002665601891939e-07, "loss": 0.4938, "step": 10132 }, { "epoch": 0.8229513522293511, "grad_norm": 4.97816750794527, "learning_rate": 3.9990970746134283e-07, "loss": 0.3337, "step": 10133 }, { "epoch": 0.8230325672053926, "grad_norm": 4.2871354698006945, "learning_rate": 3.99553000048625e-07, "loss": 0.4538, "step": 10134 }, { "epoch": 0.8231137821814343, "grad_norm": 17.838493735891614, "learning_rate": 3.991964379757232e-07, "loss": 0.5731, "step": 10135 }, { "epoch": 0.8231949971574758, "grad_norm": 5.170687504793261, "learning_rate": 3.988400212673099e-07, "loss": 0.5173, "step": 10136 }, { "epoch": 0.8232762121335174, "grad_norm": 6.426083270641829, "learning_rate": 3.9848374994804734e-07, "loss": 0.5226, "step": 10137 }, { "epoch": 0.823357427109559, "grad_norm": 3.7713552353157476, "learning_rate": 3.9812762404258605e-07, "loss": 0.4277, "step": 10138 }, { "epoch": 0.8234386420856006, "grad_norm": 6.423516094367269, "learning_rate": 3.977716435755702e-07, "loss": 0.4386, "step": 10139 }, { "epoch": 0.8235198570616422, "grad_norm": 5.7394083306414325, "learning_rate": 3.9741580857163036e-07, "loss": 0.38, "step": 10140 }, { "epoch": 0.8236010720376837, "grad_norm": 5.863968524558383, "learning_rate": 3.9706011905538827e-07, "loss": 0.3539, "step": 10141 }, { "epoch": 0.8236822870137254, "grad_norm": 7.197622618993387, "learning_rate": 3.9670457505145643e-07, "loss": 0.6156, "step": 10142 }, { "epoch": 0.8237635019897669, "grad_norm": 9.307547582177493, "learning_rate": 3.963491765844371e-07, "loss": 0.5406, "step": 10143 }, { "epoch": 0.8238447169658085, "grad_norm": 3.8307158333938185, "learning_rate": 3.959939236789212e-07, "loss": 0.431, "step": 10144 }, { "epoch": 0.82392593194185, "grad_norm": 5.112333570814876, "learning_rate": 3.9563881635948984e-07, "loss": 0.4988, "step": 10145 }, { "epoch": 0.8240071469178917, "grad_norm": 5.863847427204812, "learning_rate": 3.9528385465071594e-07, "loss": 0.6043, "step": 10146 }, { "epoch": 0.8240883618939332, "grad_norm": 5.626678027585737, "learning_rate": 3.949290385771595e-07, "loss": 0.431, "step": 10147 }, { "epoch": 0.8241695768699748, "grad_norm": 4.081512795131598, "learning_rate": 3.945743681633729e-07, "loss": 0.5401, "step": 10148 }, { "epoch": 0.8242507918460164, "grad_norm": 4.533747792384772, "learning_rate": 3.9421984343389756e-07, "loss": 0.5523, "step": 10149 }, { "epoch": 0.824332006822058, "grad_norm": 3.743850075034431, "learning_rate": 3.9386546441326444e-07, "loss": 0.4706, "step": 10150 }, { "epoch": 0.8244132217980996, "grad_norm": 23.913229954006653, "learning_rate": 3.9351123112599393e-07, "loss": 0.6842, "step": 10151 }, { "epoch": 0.8244944367741411, "grad_norm": 7.100866725720028, "learning_rate": 3.931571435965986e-07, "loss": 0.377, "step": 10152 }, { "epoch": 0.8245756517501828, "grad_norm": 8.290123785692964, "learning_rate": 3.9280320184957864e-07, "loss": 0.5232, "step": 10153 }, { "epoch": 0.8246568667262243, "grad_norm": 4.89062234331338, "learning_rate": 3.9244940590942413e-07, "loss": 0.4422, "step": 10154 }, { "epoch": 0.8247380817022659, "grad_norm": 7.550251223417641, "learning_rate": 3.9209575580061663e-07, "loss": 0.2946, "step": 10155 }, { "epoch": 0.8248192966783074, "grad_norm": 10.05450235709694, "learning_rate": 3.9174225154762766e-07, "loss": 0.6482, "step": 10156 }, { "epoch": 0.8249005116543491, "grad_norm": 5.365029681588859, "learning_rate": 3.9138889317491656e-07, "loss": 0.5071, "step": 10157 }, { "epoch": 0.8249817266303906, "grad_norm": 5.609914610484028, "learning_rate": 3.9103568070693485e-07, "loss": 0.5451, "step": 10158 }, { "epoch": 0.8250629416064322, "grad_norm": 4.514680074383672, "learning_rate": 3.906826141681225e-07, "loss": 0.532, "step": 10159 }, { "epoch": 0.8251441565824738, "grad_norm": 10.206347105991203, "learning_rate": 3.903296935829093e-07, "loss": 0.5132, "step": 10160 }, { "epoch": 0.8252253715585154, "grad_norm": 6.858018717895062, "learning_rate": 3.8997691897571577e-07, "loss": 0.4462, "step": 10161 }, { "epoch": 0.825306586534557, "grad_norm": 5.173227278123777, "learning_rate": 3.896242903709532e-07, "loss": 0.5436, "step": 10162 }, { "epoch": 0.8253878015105985, "grad_norm": 11.450687404556978, "learning_rate": 3.8927180779302076e-07, "loss": 0.4657, "step": 10163 }, { "epoch": 0.8254690164866402, "grad_norm": 4.609261708122115, "learning_rate": 3.889194712663075e-07, "loss": 0.4993, "step": 10164 }, { "epoch": 0.8255502314626817, "grad_norm": 3.5084080535675026, "learning_rate": 3.885672808151947e-07, "loss": 0.4571, "step": 10165 }, { "epoch": 0.8256314464387233, "grad_norm": 21.30482869821724, "learning_rate": 3.882152364640518e-07, "loss": 0.3259, "step": 10166 }, { "epoch": 0.8257126614147648, "grad_norm": 4.848731610369329, "learning_rate": 3.878633382372371e-07, "loss": 0.5129, "step": 10167 }, { "epoch": 0.8257938763908065, "grad_norm": 4.713993311348796, "learning_rate": 3.875115861591014e-07, "loss": 0.4037, "step": 10168 }, { "epoch": 0.825875091366848, "grad_norm": 6.885554383643488, "learning_rate": 3.871599802539841e-07, "loss": 0.2991, "step": 10169 }, { "epoch": 0.8259563063428896, "grad_norm": 11.560653980725935, "learning_rate": 3.868085205462135e-07, "loss": 0.4264, "step": 10170 }, { "epoch": 0.8260375213189312, "grad_norm": 18.94305982548915, "learning_rate": 3.8645720706010997e-07, "loss": 0.4572, "step": 10171 }, { "epoch": 0.8261187362949728, "grad_norm": 4.3868517626424754, "learning_rate": 3.8610603981998204e-07, "loss": 0.4054, "step": 10172 }, { "epoch": 0.8261999512710144, "grad_norm": 10.379799445814022, "learning_rate": 3.85755018850128e-07, "loss": 0.4782, "step": 10173 }, { "epoch": 0.8262811662470559, "grad_norm": 5.464482264230086, "learning_rate": 3.854041441748371e-07, "loss": 0.4246, "step": 10174 }, { "epoch": 0.8263623812230976, "grad_norm": 5.259022216933499, "learning_rate": 3.8505341581838854e-07, "loss": 0.4936, "step": 10175 }, { "epoch": 0.8264435961991391, "grad_norm": 3.834344996242316, "learning_rate": 3.8470283380504987e-07, "loss": 0.47, "step": 10176 }, { "epoch": 0.8265248111751807, "grad_norm": 9.45059948818576, "learning_rate": 3.8435239815908077e-07, "loss": 0.4509, "step": 10177 }, { "epoch": 0.8266060261512223, "grad_norm": 4.5939757403211585, "learning_rate": 3.8400210890472883e-07, "loss": 0.3479, "step": 10178 }, { "epoch": 0.8266872411272639, "grad_norm": 5.823737388475345, "learning_rate": 3.836519660662313e-07, "loss": 0.3906, "step": 10179 }, { "epoch": 0.8267684561033054, "grad_norm": 4.6299433684789095, "learning_rate": 3.8330196966781723e-07, "loss": 0.5363, "step": 10180 }, { "epoch": 0.826849671079347, "grad_norm": 6.306388306208667, "learning_rate": 3.829521197337052e-07, "loss": 0.433, "step": 10181 }, { "epoch": 0.8269308860553886, "grad_norm": 5.650157408413531, "learning_rate": 3.8260241628810203e-07, "loss": 0.5346, "step": 10182 }, { "epoch": 0.8270121010314302, "grad_norm": 4.139343726577053, "learning_rate": 3.8225285935520493e-07, "loss": 0.5034, "step": 10183 }, { "epoch": 0.8270933160074718, "grad_norm": 4.27842292241429, "learning_rate": 3.8190344895920246e-07, "loss": 0.4205, "step": 10184 }, { "epoch": 0.8271745309835133, "grad_norm": 4.60689072938466, "learning_rate": 3.815541851242713e-07, "loss": 0.5121, "step": 10185 }, { "epoch": 0.827255745959555, "grad_norm": 14.92261155989774, "learning_rate": 3.812050678745785e-07, "loss": 0.3954, "step": 10186 }, { "epoch": 0.8273369609355965, "grad_norm": 4.53044644344942, "learning_rate": 3.808560972342812e-07, "loss": 0.5392, "step": 10187 }, { "epoch": 0.8274181759116381, "grad_norm": 5.540716412331475, "learning_rate": 3.8050727322752726e-07, "loss": 0.5574, "step": 10188 }, { "epoch": 0.8274993908876797, "grad_norm": 8.417967136559637, "learning_rate": 3.8015859587845233e-07, "loss": 0.3885, "step": 10189 }, { "epoch": 0.8275806058637213, "grad_norm": 53.802360428112415, "learning_rate": 3.798100652111839e-07, "loss": 0.4015, "step": 10190 }, { "epoch": 0.8276618208397628, "grad_norm": 4.877120364803925, "learning_rate": 3.7946168124983776e-07, "loss": 0.3768, "step": 10191 }, { "epoch": 0.8277430358158044, "grad_norm": 9.495267916803392, "learning_rate": 3.791134440185201e-07, "loss": 0.5635, "step": 10192 }, { "epoch": 0.827824250791846, "grad_norm": 5.733855636295819, "learning_rate": 3.787653535413277e-07, "loss": 0.4829, "step": 10193 }, { "epoch": 0.8279054657678876, "grad_norm": 5.873682615843861, "learning_rate": 3.784174098423465e-07, "loss": 0.5228, "step": 10194 }, { "epoch": 0.8279866807439292, "grad_norm": 4.833822246575063, "learning_rate": 3.780696129456521e-07, "loss": 0.5421, "step": 10195 }, { "epoch": 0.8280678957199707, "grad_norm": 5.208263826542225, "learning_rate": 3.7772196287531066e-07, "loss": 0.5473, "step": 10196 }, { "epoch": 0.8281491106960124, "grad_norm": 8.378333393225612, "learning_rate": 3.773744596553774e-07, "loss": 0.4702, "step": 10197 }, { "epoch": 0.8282303256720539, "grad_norm": 3.4663578157064583, "learning_rate": 3.7702710330989765e-07, "loss": 0.5652, "step": 10198 }, { "epoch": 0.8283115406480955, "grad_norm": 6.827267196321197, "learning_rate": 3.766798938629063e-07, "loss": 0.4027, "step": 10199 }, { "epoch": 0.8283927556241371, "grad_norm": 9.444190295092335, "learning_rate": 3.7633283133842845e-07, "loss": 0.5462, "step": 10200 }, { "epoch": 0.8284739706001787, "grad_norm": 6.929524668677438, "learning_rate": 3.7598591576048e-07, "loss": 0.5488, "step": 10201 }, { "epoch": 0.8285551855762202, "grad_norm": 5.6038151431290935, "learning_rate": 3.756391471530646e-07, "loss": 0.556, "step": 10202 }, { "epoch": 0.8286364005522618, "grad_norm": 4.185345966650429, "learning_rate": 3.7529252554017765e-07, "loss": 0.5079, "step": 10203 }, { "epoch": 0.8287176155283034, "grad_norm": 6.844019960274571, "learning_rate": 3.7494605094580305e-07, "loss": 0.427, "step": 10204 }, { "epoch": 0.828798830504345, "grad_norm": 5.36417941344362, "learning_rate": 3.7459972339391445e-07, "loss": 0.797, "step": 10205 }, { "epoch": 0.8288800454803866, "grad_norm": 5.328404842131845, "learning_rate": 3.742535429084765e-07, "loss": 0.3947, "step": 10206 }, { "epoch": 0.8289612604564282, "grad_norm": 5.223058995984122, "learning_rate": 3.739075095134437e-07, "loss": 0.4567, "step": 10207 }, { "epoch": 0.8290424754324698, "grad_norm": 4.706033163133879, "learning_rate": 3.735616232327582e-07, "loss": 0.4547, "step": 10208 }, { "epoch": 0.8291236904085113, "grad_norm": 5.7396745415353845, "learning_rate": 3.732158840903552e-07, "loss": 0.5567, "step": 10209 }, { "epoch": 0.8292049053845529, "grad_norm": 4.958530850319359, "learning_rate": 3.728702921101571e-07, "loss": 0.5436, "step": 10210 }, { "epoch": 0.8292861203605945, "grad_norm": 4.861206028038226, "learning_rate": 3.725248473160764e-07, "loss": 0.5996, "step": 10211 }, { "epoch": 0.8293673353366361, "grad_norm": 4.682688015412765, "learning_rate": 3.721795497320174e-07, "loss": 0.5646, "step": 10212 }, { "epoch": 0.8294485503126776, "grad_norm": 5.262659812480152, "learning_rate": 3.718343993818718e-07, "loss": 0.5358, "step": 10213 }, { "epoch": 0.8295297652887192, "grad_norm": 9.851823169517866, "learning_rate": 3.7148939628952246e-07, "loss": 0.4423, "step": 10214 }, { "epoch": 0.8296109802647608, "grad_norm": 5.776223337431074, "learning_rate": 3.7114454047884247e-07, "loss": 0.3799, "step": 10215 }, { "epoch": 0.8296921952408024, "grad_norm": 8.26924554342392, "learning_rate": 3.707998319736936e-07, "loss": 0.4953, "step": 10216 }, { "epoch": 0.829773410216844, "grad_norm": 5.982564098065872, "learning_rate": 3.7045527079792753e-07, "loss": 0.593, "step": 10217 }, { "epoch": 0.8298546251928856, "grad_norm": 5.7895130886280635, "learning_rate": 3.7011085697538587e-07, "loss": 0.4198, "step": 10218 }, { "epoch": 0.8299358401689272, "grad_norm": 4.460178263466723, "learning_rate": 3.6976659052990056e-07, "loss": 0.6089, "step": 10219 }, { "epoch": 0.8300170551449687, "grad_norm": 5.700291972867315, "learning_rate": 3.694224714852937e-07, "loss": 0.405, "step": 10220 }, { "epoch": 0.8300982701210103, "grad_norm": 7.327954495107345, "learning_rate": 3.6907849986537516e-07, "loss": 0.4625, "step": 10221 }, { "epoch": 0.8301794850970519, "grad_norm": 5.605599034700318, "learning_rate": 3.687346756939475e-07, "loss": 0.4958, "step": 10222 }, { "epoch": 0.8302607000730935, "grad_norm": 4.754692475561762, "learning_rate": 3.6839099899480033e-07, "loss": 0.3742, "step": 10223 }, { "epoch": 0.830341915049135, "grad_norm": 3.9081424392387327, "learning_rate": 3.680474697917144e-07, "loss": 0.6079, "step": 10224 }, { "epoch": 0.8304231300251766, "grad_norm": 4.983983156364833, "learning_rate": 3.677040881084609e-07, "loss": 0.4596, "step": 10225 }, { "epoch": 0.8305043450012182, "grad_norm": 6.779953311673761, "learning_rate": 3.6736085396879896e-07, "loss": 0.368, "step": 10226 }, { "epoch": 0.8305855599772598, "grad_norm": 3.867084328558502, "learning_rate": 3.6701776739647893e-07, "loss": 0.5043, "step": 10227 }, { "epoch": 0.8306667749533014, "grad_norm": 5.763370281929799, "learning_rate": 3.666748284152413e-07, "loss": 0.6709, "step": 10228 }, { "epoch": 0.830747989929343, "grad_norm": 5.039857359159814, "learning_rate": 3.663320370488152e-07, "loss": 0.4427, "step": 10229 }, { "epoch": 0.8308292049053846, "grad_norm": 3.9366691516556678, "learning_rate": 3.659893933209191e-07, "loss": 0.5353, "step": 10230 }, { "epoch": 0.8309104198814261, "grad_norm": 16.823744196747114, "learning_rate": 3.6564689725526377e-07, "loss": 0.455, "step": 10231 }, { "epoch": 0.8309916348574677, "grad_norm": 3.5430059865653996, "learning_rate": 3.6530454887554636e-07, "loss": 0.6269, "step": 10232 }, { "epoch": 0.8310728498335093, "grad_norm": 4.922946722027889, "learning_rate": 3.649623482054565e-07, "loss": 0.4448, "step": 10233 }, { "epoch": 0.8311540648095509, "grad_norm": 5.418331879921831, "learning_rate": 3.6462029526867335e-07, "loss": 0.3769, "step": 10234 }, { "epoch": 0.8312352797855924, "grad_norm": 6.912181302002352, "learning_rate": 3.642783900888644e-07, "loss": 0.4543, "step": 10235 }, { "epoch": 0.831316494761634, "grad_norm": 7.405882641111188, "learning_rate": 3.639366326896876e-07, "loss": 0.4988, "step": 10236 }, { "epoch": 0.8313977097376756, "grad_norm": 6.548258048190026, "learning_rate": 3.635950230947902e-07, "loss": 0.4483, "step": 10237 }, { "epoch": 0.8314789247137172, "grad_norm": 7.4684554933443925, "learning_rate": 3.632535613278107e-07, "loss": 0.4406, "step": 10238 }, { "epoch": 0.8315601396897588, "grad_norm": 3.8512394389587703, "learning_rate": 3.629122474123767e-07, "loss": 0.4396, "step": 10239 }, { "epoch": 0.8316413546658004, "grad_norm": 6.464656562967151, "learning_rate": 3.6257108137210396e-07, "loss": 0.37, "step": 10240 }, { "epoch": 0.831722569641842, "grad_norm": 5.940372384619711, "learning_rate": 3.622300632306011e-07, "loss": 0.3829, "step": 10241 }, { "epoch": 0.8318037846178835, "grad_norm": 7.830539757818605, "learning_rate": 3.6188919301146375e-07, "loss": 0.4114, "step": 10242 }, { "epoch": 0.8318849995939251, "grad_norm": 4.839330335943199, "learning_rate": 3.615484707382777e-07, "loss": 0.5915, "step": 10243 }, { "epoch": 0.8319662145699667, "grad_norm": 3.688765962182601, "learning_rate": 3.6120789643462053e-07, "loss": 0.5409, "step": 10244 }, { "epoch": 0.8320474295460083, "grad_norm": 6.537020216802781, "learning_rate": 3.608674701240572e-07, "loss": 0.3567, "step": 10245 }, { "epoch": 0.8321286445220498, "grad_norm": 4.589211449298252, "learning_rate": 3.605271918301434e-07, "loss": 0.5638, "step": 10246 }, { "epoch": 0.8322098594980915, "grad_norm": 6.29504528521005, "learning_rate": 3.601870615764258e-07, "loss": 0.5068, "step": 10247 }, { "epoch": 0.832291074474133, "grad_norm": 4.262119801386439, "learning_rate": 3.5984707938643864e-07, "loss": 0.4094, "step": 10248 }, { "epoch": 0.8323722894501746, "grad_norm": 4.841197784154483, "learning_rate": 3.5950724528370615e-07, "loss": 0.4699, "step": 10249 }, { "epoch": 0.8324535044262162, "grad_norm": 5.42937376505632, "learning_rate": 3.591675592917449e-07, "loss": 0.4166, "step": 10250 }, { "epoch": 0.8325347194022578, "grad_norm": 6.296856237155061, "learning_rate": 3.5882802143405755e-07, "loss": 0.449, "step": 10251 }, { "epoch": 0.8326159343782994, "grad_norm": 7.636038816244131, "learning_rate": 3.584886317341396e-07, "loss": 0.4694, "step": 10252 }, { "epoch": 0.8326971493543409, "grad_norm": 21.61523367073273, "learning_rate": 3.58149390215474e-07, "loss": 0.5688, "step": 10253 }, { "epoch": 0.8327783643303825, "grad_norm": 5.877406518099175, "learning_rate": 3.5781029690153567e-07, "loss": 0.5595, "step": 10254 }, { "epoch": 0.8328595793064241, "grad_norm": 4.889479170900449, "learning_rate": 3.574713518157874e-07, "loss": 0.5255, "step": 10255 }, { "epoch": 0.8329407942824657, "grad_norm": 6.633095888789909, "learning_rate": 3.571325549816818e-07, "loss": 0.3982, "step": 10256 }, { "epoch": 0.8330220092585072, "grad_norm": 4.06737934876213, "learning_rate": 3.56793906422663e-07, "loss": 0.5486, "step": 10257 }, { "epoch": 0.8331032242345489, "grad_norm": 4.0549895036371, "learning_rate": 3.564554061621625e-07, "loss": 0.4597, "step": 10258 }, { "epoch": 0.8331844392105904, "grad_norm": 8.500590717560147, "learning_rate": 3.5611705422360335e-07, "loss": 0.491, "step": 10259 }, { "epoch": 0.833265654186632, "grad_norm": 5.841348510557204, "learning_rate": 3.557788506303986e-07, "loss": 0.3571, "step": 10260 }, { "epoch": 0.8333468691626736, "grad_norm": 8.428998076590059, "learning_rate": 3.5544079540594884e-07, "loss": 0.5252, "step": 10261 }, { "epoch": 0.8334280841387152, "grad_norm": 6.0161944380779255, "learning_rate": 3.551028885736457e-07, "loss": 0.5061, "step": 10262 }, { "epoch": 0.8335092991147568, "grad_norm": 26.06934398233002, "learning_rate": 3.5476513015687136e-07, "loss": 0.4221, "step": 10263 }, { "epoch": 0.8335905140907983, "grad_norm": 6.725770937675891, "learning_rate": 3.5442752017899625e-07, "loss": 0.5557, "step": 10264 }, { "epoch": 0.83367172906684, "grad_norm": 5.63216646803754, "learning_rate": 3.5409005866338134e-07, "loss": 0.4044, "step": 10265 }, { "epoch": 0.8337529440428815, "grad_norm": 5.540976364008194, "learning_rate": 3.537527456333778e-07, "loss": 0.4748, "step": 10266 }, { "epoch": 0.8338341590189231, "grad_norm": 4.118260128090111, "learning_rate": 3.5341558111232547e-07, "loss": 0.5088, "step": 10267 }, { "epoch": 0.8339153739949646, "grad_norm": 4.454821659286079, "learning_rate": 3.5307856512355354e-07, "loss": 0.408, "step": 10268 }, { "epoch": 0.8339965889710063, "grad_norm": 7.906091232376149, "learning_rate": 3.527416976903833e-07, "loss": 0.4369, "step": 10269 }, { "epoch": 0.8340778039470479, "grad_norm": 4.946330213556263, "learning_rate": 3.5240497883612333e-07, "loss": 0.5383, "step": 10270 }, { "epoch": 0.8341590189230894, "grad_norm": 3.921389705656694, "learning_rate": 3.5206840858407225e-07, "loss": 0.5159, "step": 10271 }, { "epoch": 0.834240233899131, "grad_norm": 9.466775431263171, "learning_rate": 3.517319869575195e-07, "loss": 0.3524, "step": 10272 }, { "epoch": 0.8343214488751726, "grad_norm": 7.781642113894127, "learning_rate": 3.5139571397974416e-07, "loss": 0.5091, "step": 10273 }, { "epoch": 0.8344026638512142, "grad_norm": 5.031639260527779, "learning_rate": 3.5105958967401404e-07, "loss": 0.3777, "step": 10274 }, { "epoch": 0.8344838788272557, "grad_norm": 4.936172284543063, "learning_rate": 3.5072361406358696e-07, "loss": 0.5142, "step": 10275 }, { "epoch": 0.8345650938032974, "grad_norm": 5.62972700250256, "learning_rate": 3.5038778717171123e-07, "loss": 0.5169, "step": 10276 }, { "epoch": 0.8346463087793389, "grad_norm": 4.876718013101768, "learning_rate": 3.500521090216233e-07, "loss": 0.5267, "step": 10277 }, { "epoch": 0.8347275237553805, "grad_norm": 5.004111684696393, "learning_rate": 3.497165796365512e-07, "loss": 0.6079, "step": 10278 }, { "epoch": 0.834808738731422, "grad_norm": 5.227016363738208, "learning_rate": 3.4938119903971195e-07, "loss": 0.3386, "step": 10279 }, { "epoch": 0.8348899537074637, "grad_norm": 4.70841750512664, "learning_rate": 3.49045967254312e-07, "loss": 0.4524, "step": 10280 }, { "epoch": 0.8349711686835053, "grad_norm": 4.398665081558391, "learning_rate": 3.487108843035467e-07, "loss": 0.4655, "step": 10281 }, { "epoch": 0.8350523836595468, "grad_norm": 4.351497987088346, "learning_rate": 3.4837595021060296e-07, "loss": 0.4796, "step": 10282 }, { "epoch": 0.8351335986355884, "grad_norm": 4.803405743264782, "learning_rate": 3.480411649986565e-07, "loss": 0.3933, "step": 10283 }, { "epoch": 0.83521481361163, "grad_norm": 4.927789971086148, "learning_rate": 3.477065286908715e-07, "loss": 0.407, "step": 10284 }, { "epoch": 0.8352960285876716, "grad_norm": 9.580044334289088, "learning_rate": 3.4737204131040397e-07, "loss": 0.5534, "step": 10285 }, { "epoch": 0.8353772435637131, "grad_norm": 7.407853324575902, "learning_rate": 3.470377028803992e-07, "loss": 0.4651, "step": 10286 }, { "epoch": 0.8354584585397548, "grad_norm": 4.484117272427251, "learning_rate": 3.46703513423991e-07, "loss": 0.5232, "step": 10287 }, { "epoch": 0.8355396735157963, "grad_norm": 4.330327720269298, "learning_rate": 3.4636947296430274e-07, "loss": 0.4451, "step": 10288 }, { "epoch": 0.8356208884918379, "grad_norm": 5.965597862522399, "learning_rate": 3.460355815244498e-07, "loss": 0.5906, "step": 10289 }, { "epoch": 0.8357021034678794, "grad_norm": 8.22500497243567, "learning_rate": 3.457018391275341e-07, "loss": 0.3985, "step": 10290 }, { "epoch": 0.8357833184439211, "grad_norm": 5.544151542569365, "learning_rate": 3.4536824579665007e-07, "loss": 0.4722, "step": 10291 }, { "epoch": 0.8358645334199627, "grad_norm": 5.097926053416765, "learning_rate": 3.4503480155488044e-07, "loss": 0.5343, "step": 10292 }, { "epoch": 0.8359457483960042, "grad_norm": 11.549757808229108, "learning_rate": 3.447015064252976e-07, "loss": 0.3995, "step": 10293 }, { "epoch": 0.8360269633720459, "grad_norm": 4.072804987437322, "learning_rate": 3.443683604309633e-07, "loss": 0.4749, "step": 10294 }, { "epoch": 0.8361081783480874, "grad_norm": 5.09382553179545, "learning_rate": 3.4403536359493034e-07, "loss": 0.4529, "step": 10295 }, { "epoch": 0.836189393324129, "grad_norm": 4.720065781209744, "learning_rate": 3.437025159402399e-07, "loss": 0.58, "step": 10296 }, { "epoch": 0.8362706083001705, "grad_norm": 4.694987719383441, "learning_rate": 3.43369817489923e-07, "loss": 0.4097, "step": 10297 }, { "epoch": 0.8363518232762122, "grad_norm": 5.4566621726960145, "learning_rate": 3.430372682670008e-07, "loss": 0.4231, "step": 10298 }, { "epoch": 0.8364330382522537, "grad_norm": 8.027749216651072, "learning_rate": 3.4270486829448476e-07, "loss": 0.4286, "step": 10299 }, { "epoch": 0.8365142532282953, "grad_norm": 5.684585620616569, "learning_rate": 3.423726175953737e-07, "loss": 0.4205, "step": 10300 }, { "epoch": 0.8365954682043368, "grad_norm": 3.9316108403026178, "learning_rate": 3.4204051619265905e-07, "loss": 0.4315, "step": 10301 }, { "epoch": 0.8366766831803785, "grad_norm": 5.5134529395223915, "learning_rate": 3.4170856410931986e-07, "loss": 0.3852, "step": 10302 }, { "epoch": 0.8367578981564201, "grad_norm": 6.0566636394615285, "learning_rate": 3.41376761368325e-07, "loss": 0.544, "step": 10303 }, { "epoch": 0.8368391131324616, "grad_norm": 5.68024482168099, "learning_rate": 3.4104510799263356e-07, "loss": 0.4424, "step": 10304 }, { "epoch": 0.8369203281085033, "grad_norm": 8.931124333462172, "learning_rate": 3.407136040051953e-07, "loss": 0.4358, "step": 10305 }, { "epoch": 0.8370015430845448, "grad_norm": 6.202340802104501, "learning_rate": 3.40382249428948e-07, "loss": 0.6703, "step": 10306 }, { "epoch": 0.8370827580605864, "grad_norm": 5.36293568403821, "learning_rate": 3.400510442868185e-07, "loss": 0.5796, "step": 10307 }, { "epoch": 0.8371639730366279, "grad_norm": 5.206542300226407, "learning_rate": 3.3971998860172605e-07, "loss": 0.5902, "step": 10308 }, { "epoch": 0.8372451880126696, "grad_norm": 3.560373786193406, "learning_rate": 3.393890823965768e-07, "loss": 0.4523, "step": 10309 }, { "epoch": 0.8373264029887111, "grad_norm": 5.239048746596322, "learning_rate": 3.390583256942681e-07, "loss": 0.5946, "step": 10310 }, { "epoch": 0.8374076179647527, "grad_norm": 4.838192804018576, "learning_rate": 3.3872771851768737e-07, "loss": 0.4644, "step": 10311 }, { "epoch": 0.8374888329407942, "grad_norm": 7.90289534921277, "learning_rate": 3.383972608897099e-07, "loss": 0.4878, "step": 10312 }, { "epoch": 0.8375700479168359, "grad_norm": 3.8504253330468092, "learning_rate": 3.3806695283320145e-07, "loss": 0.3547, "step": 10313 }, { "epoch": 0.8376512628928775, "grad_norm": 11.90866871558082, "learning_rate": 3.377367943710183e-07, "loss": 0.4281, "step": 10314 }, { "epoch": 0.837732477868919, "grad_norm": 4.797876474078643, "learning_rate": 3.374067855260055e-07, "loss": 0.4421, "step": 10315 }, { "epoch": 0.8378136928449607, "grad_norm": 4.661745082017848, "learning_rate": 3.370769263209975e-07, "loss": 0.3877, "step": 10316 }, { "epoch": 0.8378949078210022, "grad_norm": 4.545258248925443, "learning_rate": 3.3674721677881853e-07, "loss": 0.5798, "step": 10317 }, { "epoch": 0.8379761227970438, "grad_norm": 6.305655433495236, "learning_rate": 3.364176569222843e-07, "loss": 0.5197, "step": 10318 }, { "epoch": 0.8380573377730853, "grad_norm": 5.2525205813121945, "learning_rate": 3.360882467741969e-07, "loss": 0.4127, "step": 10319 }, { "epoch": 0.838138552749127, "grad_norm": 4.370136047848057, "learning_rate": 3.35758986357351e-07, "loss": 0.6077, "step": 10320 }, { "epoch": 0.8382197677251685, "grad_norm": 10.028481598001118, "learning_rate": 3.354298756945293e-07, "loss": 0.3249, "step": 10321 }, { "epoch": 0.8383009827012101, "grad_norm": 4.972555184812446, "learning_rate": 3.351009148085038e-07, "loss": 0.5867, "step": 10322 }, { "epoch": 0.8383821976772516, "grad_norm": 3.9492347620594224, "learning_rate": 3.347721037220372e-07, "loss": 0.4575, "step": 10323 }, { "epoch": 0.8384634126532933, "grad_norm": 24.137740084826742, "learning_rate": 3.344434424578824e-07, "loss": 0.5544, "step": 10324 }, { "epoch": 0.8385446276293349, "grad_norm": 7.000852410859473, "learning_rate": 3.3411493103878036e-07, "loss": 0.4401, "step": 10325 }, { "epoch": 0.8386258426053764, "grad_norm": 6.577988762799303, "learning_rate": 3.3378656948746176e-07, "loss": 0.4975, "step": 10326 }, { "epoch": 0.8387070575814181, "grad_norm": 5.455565928416991, "learning_rate": 3.334583578266487e-07, "loss": 0.489, "step": 10327 }, { "epoch": 0.8387882725574596, "grad_norm": 4.806945898095566, "learning_rate": 3.3313029607905087e-07, "loss": 0.394, "step": 10328 }, { "epoch": 0.8388694875335012, "grad_norm": 5.601127834993915, "learning_rate": 3.328023842673678e-07, "loss": 0.5171, "step": 10329 }, { "epoch": 0.8389507025095427, "grad_norm": 8.23233143842366, "learning_rate": 3.324746224142902e-07, "loss": 0.3461, "step": 10330 }, { "epoch": 0.8390319174855844, "grad_norm": 5.534926193999312, "learning_rate": 3.321470105424979e-07, "loss": 0.4056, "step": 10331 }, { "epoch": 0.8391131324616259, "grad_norm": 4.70029893588722, "learning_rate": 3.3181954867465864e-07, "loss": 0.5831, "step": 10332 }, { "epoch": 0.8391943474376675, "grad_norm": 4.310642438062851, "learning_rate": 3.314922368334322e-07, "loss": 0.4913, "step": 10333 }, { "epoch": 0.839275562413709, "grad_norm": 3.3683032390188012, "learning_rate": 3.3116507504146633e-07, "loss": 0.5738, "step": 10334 }, { "epoch": 0.8393567773897507, "grad_norm": 4.726658134420679, "learning_rate": 3.3083806332139837e-07, "loss": 0.3906, "step": 10335 }, { "epoch": 0.8394379923657923, "grad_norm": 4.793027997624646, "learning_rate": 3.305112016958562e-07, "loss": 0.3441, "step": 10336 }, { "epoch": 0.8395192073418338, "grad_norm": 15.53691157648671, "learning_rate": 3.3018449018745765e-07, "loss": 0.4705, "step": 10337 }, { "epoch": 0.8396004223178755, "grad_norm": 5.292837088334354, "learning_rate": 3.298579288188081e-07, "loss": 0.5973, "step": 10338 }, { "epoch": 0.839681637293917, "grad_norm": 7.13356115130351, "learning_rate": 3.2953151761250526e-07, "loss": 0.4081, "step": 10339 }, { "epoch": 0.8397628522699586, "grad_norm": 4.657968605497615, "learning_rate": 3.292052565911344e-07, "loss": 0.3476, "step": 10340 }, { "epoch": 0.8398440672460001, "grad_norm": 5.343433901408026, "learning_rate": 3.288791457772708e-07, "loss": 0.4242, "step": 10341 }, { "epoch": 0.8399252822220418, "grad_norm": 7.943551058264865, "learning_rate": 3.2855318519347924e-07, "loss": 0.3954, "step": 10342 }, { "epoch": 0.8400064971980833, "grad_norm": 4.253480461006163, "learning_rate": 3.282273748623152e-07, "loss": 0.6146, "step": 10343 }, { "epoch": 0.8400877121741249, "grad_norm": 3.649658017480646, "learning_rate": 3.279017148063235e-07, "loss": 0.4316, "step": 10344 }, { "epoch": 0.8401689271501664, "grad_norm": 6.247382983100074, "learning_rate": 3.275762050480369e-07, "loss": 0.6303, "step": 10345 }, { "epoch": 0.8402501421262081, "grad_norm": 5.081757914347488, "learning_rate": 3.272508456099799e-07, "loss": 0.3951, "step": 10346 }, { "epoch": 0.8403313571022497, "grad_norm": 4.2849275181266515, "learning_rate": 3.269256365146653e-07, "loss": 0.4885, "step": 10347 }, { "epoch": 0.8404125720782912, "grad_norm": 5.408392743532504, "learning_rate": 3.2660057778459513e-07, "loss": 0.5593, "step": 10348 }, { "epoch": 0.8404937870543329, "grad_norm": 16.566283426785315, "learning_rate": 3.262756694422628e-07, "loss": 0.4464, "step": 10349 }, { "epoch": 0.8405750020303744, "grad_norm": 9.235201670008626, "learning_rate": 3.2595091151015e-07, "loss": 0.4379, "step": 10350 }, { "epoch": 0.840656217006416, "grad_norm": 3.706535071305654, "learning_rate": 3.2562630401072796e-07, "loss": 0.3997, "step": 10351 }, { "epoch": 0.8407374319824575, "grad_norm": 6.291131527025278, "learning_rate": 3.2530184696645846e-07, "loss": 0.3929, "step": 10352 }, { "epoch": 0.8408186469584992, "grad_norm": 6.059066502473035, "learning_rate": 3.249775403997915e-07, "loss": 0.5271, "step": 10353 }, { "epoch": 0.8408998619345407, "grad_norm": 5.828503261223924, "learning_rate": 3.24653384333167e-07, "loss": 0.4112, "step": 10354 }, { "epoch": 0.8409810769105823, "grad_norm": 5.607819748297354, "learning_rate": 3.243293787890162e-07, "loss": 0.5535, "step": 10355 }, { "epoch": 0.8410622918866238, "grad_norm": 4.947445738957033, "learning_rate": 3.2400552378975744e-07, "loss": 0.5297, "step": 10356 }, { "epoch": 0.8411435068626655, "grad_norm": 12.539916176915135, "learning_rate": 3.236818193577998e-07, "loss": 0.5609, "step": 10357 }, { "epoch": 0.8412247218387071, "grad_norm": 7.269085157428185, "learning_rate": 3.233582655155429e-07, "loss": 0.4367, "step": 10358 }, { "epoch": 0.8413059368147486, "grad_norm": 7.3313861075084725, "learning_rate": 3.2303486228537436e-07, "loss": 0.5678, "step": 10359 }, { "epoch": 0.8413871517907903, "grad_norm": 6.798734073899363, "learning_rate": 3.227116096896718e-07, "loss": 0.4094, "step": 10360 }, { "epoch": 0.8414683667668318, "grad_norm": 5.446235513662214, "learning_rate": 3.223885077508024e-07, "loss": 0.5598, "step": 10361 }, { "epoch": 0.8415495817428734, "grad_norm": 5.464432211318365, "learning_rate": 3.220655564911232e-07, "loss": 0.5195, "step": 10362 }, { "epoch": 0.8416307967189149, "grad_norm": 3.4734065266272767, "learning_rate": 3.217427559329814e-07, "loss": 0.4573, "step": 10363 }, { "epoch": 0.8417120116949566, "grad_norm": 4.612719279186468, "learning_rate": 3.2142010609871236e-07, "loss": 0.4274, "step": 10364 }, { "epoch": 0.8417932266709981, "grad_norm": 5.548612820561558, "learning_rate": 3.2109760701064227e-07, "loss": 0.4346, "step": 10365 }, { "epoch": 0.8418744416470397, "grad_norm": 7.436228501727278, "learning_rate": 3.207752586910862e-07, "loss": 0.4898, "step": 10366 }, { "epoch": 0.8419556566230812, "grad_norm": 7.589666228758146, "learning_rate": 3.2045306116234824e-07, "loss": 0.5607, "step": 10367 }, { "epoch": 0.8420368715991229, "grad_norm": 5.181080777503049, "learning_rate": 3.2013101444672345e-07, "loss": 0.4397, "step": 10368 }, { "epoch": 0.8421180865751645, "grad_norm": 7.40645836332802, "learning_rate": 3.198091185664964e-07, "loss": 0.5543, "step": 10369 }, { "epoch": 0.842199301551206, "grad_norm": 4.3886565655110035, "learning_rate": 3.194873735439391e-07, "loss": 0.394, "step": 10370 }, { "epoch": 0.8422805165272477, "grad_norm": 5.3852633366108655, "learning_rate": 3.1916577940131585e-07, "loss": 0.6436, "step": 10371 }, { "epoch": 0.8423617315032892, "grad_norm": 4.336066931433411, "learning_rate": 3.188443361608787e-07, "loss": 0.5485, "step": 10372 }, { "epoch": 0.8424429464793308, "grad_norm": 7.090559367854015, "learning_rate": 3.185230438448694e-07, "loss": 0.5496, "step": 10373 }, { "epoch": 0.8425241614553723, "grad_norm": 5.001322891880096, "learning_rate": 3.182019024755209e-07, "loss": 0.5178, "step": 10374 }, { "epoch": 0.842605376431414, "grad_norm": 8.355308016625518, "learning_rate": 3.1788091207505285e-07, "loss": 0.5462, "step": 10375 }, { "epoch": 0.8426865914074555, "grad_norm": 5.941932747298044, "learning_rate": 3.175600726656772e-07, "loss": 0.3143, "step": 10376 }, { "epoch": 0.8427678063834971, "grad_norm": 13.808992644334168, "learning_rate": 3.172393842695948e-07, "loss": 0.5841, "step": 10377 }, { "epoch": 0.8428490213595387, "grad_norm": 4.737960132276757, "learning_rate": 3.169188469089945e-07, "loss": 0.6052, "step": 10378 }, { "epoch": 0.8429302363355803, "grad_norm": 6.490398809975363, "learning_rate": 3.165984606060565e-07, "loss": 0.4312, "step": 10379 }, { "epoch": 0.8430114513116219, "grad_norm": 6.469336726754756, "learning_rate": 3.1627822538294883e-07, "loss": 0.381, "step": 10380 }, { "epoch": 0.8430926662876634, "grad_norm": 4.644733440027643, "learning_rate": 3.159581412618309e-07, "loss": 0.488, "step": 10381 }, { "epoch": 0.8431738812637051, "grad_norm": 5.02222613398405, "learning_rate": 3.1563820826485127e-07, "loss": 0.4021, "step": 10382 }, { "epoch": 0.8432550962397466, "grad_norm": 5.6081490476021285, "learning_rate": 3.153184264141465e-07, "loss": 0.4034, "step": 10383 }, { "epoch": 0.8433363112157882, "grad_norm": 12.62989950175683, "learning_rate": 3.1499879573184486e-07, "loss": 0.4546, "step": 10384 }, { "epoch": 0.8434175261918297, "grad_norm": 4.275321566511599, "learning_rate": 3.146793162400627e-07, "loss": 0.3961, "step": 10385 }, { "epoch": 0.8434987411678714, "grad_norm": 5.910594667196357, "learning_rate": 3.143599879609055e-07, "loss": 0.4436, "step": 10386 }, { "epoch": 0.8435799561439129, "grad_norm": 4.218477387154232, "learning_rate": 3.1404081091647027e-07, "loss": 0.5959, "step": 10387 }, { "epoch": 0.8436611711199545, "grad_norm": 7.34082449004889, "learning_rate": 3.1372178512884154e-07, "loss": 0.6363, "step": 10388 }, { "epoch": 0.843742386095996, "grad_norm": 5.370383548087424, "learning_rate": 3.1340291062009446e-07, "loss": 0.5443, "step": 10389 }, { "epoch": 0.8438236010720377, "grad_norm": 3.265960200811652, "learning_rate": 3.130841874122942e-07, "loss": 0.4581, "step": 10390 }, { "epoch": 0.8439048160480793, "grad_norm": 8.155289410608034, "learning_rate": 3.1276561552749415e-07, "loss": 0.5283, "step": 10391 }, { "epoch": 0.8439860310241208, "grad_norm": 4.910342387320026, "learning_rate": 3.1244719498773693e-07, "loss": 0.4737, "step": 10392 }, { "epoch": 0.8440672460001625, "grad_norm": 7.838611981148996, "learning_rate": 3.1212892581505697e-07, "loss": 0.5164, "step": 10393 }, { "epoch": 0.844148460976204, "grad_norm": 3.137926778602015, "learning_rate": 3.118108080314758e-07, "loss": 0.6133, "step": 10394 }, { "epoch": 0.8442296759522456, "grad_norm": 6.8077151459524, "learning_rate": 3.1149284165900627e-07, "loss": 0.3985, "step": 10395 }, { "epoch": 0.8443108909282871, "grad_norm": 4.575748337907764, "learning_rate": 3.111750267196492e-07, "loss": 0.6132, "step": 10396 }, { "epoch": 0.8443921059043288, "grad_norm": 3.4885659741848833, "learning_rate": 3.1085736323539647e-07, "loss": 0.584, "step": 10397 }, { "epoch": 0.8444733208803703, "grad_norm": 5.967423076508702, "learning_rate": 3.1053985122822844e-07, "loss": 0.6819, "step": 10398 }, { "epoch": 0.8445545358564119, "grad_norm": 4.9263444027598124, "learning_rate": 3.1022249072011455e-07, "loss": 0.5207, "step": 10399 }, { "epoch": 0.8446357508324535, "grad_norm": 7.840385643435142, "learning_rate": 3.0990528173301557e-07, "loss": 0.4711, "step": 10400 }, { "epoch": 0.8447169658084951, "grad_norm": 6.013199743602192, "learning_rate": 3.095882242888795e-07, "loss": 0.4817, "step": 10401 }, { "epoch": 0.8447981807845367, "grad_norm": 6.574555556543331, "learning_rate": 3.09271318409646e-07, "loss": 0.4581, "step": 10402 }, { "epoch": 0.8448793957605782, "grad_norm": 3.106518820205016, "learning_rate": 3.089545641172434e-07, "loss": 0.549, "step": 10403 }, { "epoch": 0.8449606107366199, "grad_norm": 4.4853372930685715, "learning_rate": 3.086379614335891e-07, "loss": 0.3649, "step": 10404 }, { "epoch": 0.8450418257126614, "grad_norm": 6.692195853008346, "learning_rate": 3.083215103805895e-07, "loss": 0.5595, "step": 10405 }, { "epoch": 0.845123040688703, "grad_norm": 3.961593687618352, "learning_rate": 3.080052109801429e-07, "loss": 0.4905, "step": 10406 }, { "epoch": 0.8452042556647446, "grad_norm": 5.657024124857219, "learning_rate": 3.0768906325413404e-07, "loss": 0.304, "step": 10407 }, { "epoch": 0.8452854706407862, "grad_norm": 4.947735022977894, "learning_rate": 3.073730672244393e-07, "loss": 0.5874, "step": 10408 }, { "epoch": 0.8453666856168277, "grad_norm": 4.804603939591969, "learning_rate": 3.0705722291292457e-07, "loss": 0.5255, "step": 10409 }, { "epoch": 0.8454479005928693, "grad_norm": 8.27849842792289, "learning_rate": 3.067415303414442e-07, "loss": 0.5203, "step": 10410 }, { "epoch": 0.8455291155689109, "grad_norm": 10.32641648689632, "learning_rate": 3.0642598953184164e-07, "loss": 0.4933, "step": 10411 }, { "epoch": 0.8456103305449525, "grad_norm": 3.764527737605284, "learning_rate": 3.0611060050595166e-07, "loss": 0.5453, "step": 10412 }, { "epoch": 0.8456915455209941, "grad_norm": 5.190043324766475, "learning_rate": 3.057953632855973e-07, "loss": 0.5636, "step": 10413 }, { "epoch": 0.8457727604970356, "grad_norm": 4.262101482407405, "learning_rate": 3.0548027789259057e-07, "loss": 0.5088, "step": 10414 }, { "epoch": 0.8458539754730773, "grad_norm": 4.496146141780251, "learning_rate": 3.05165344348734e-07, "loss": 0.4839, "step": 10415 }, { "epoch": 0.8459351904491188, "grad_norm": 5.9108085665989645, "learning_rate": 3.0485056267582054e-07, "loss": 0.6495, "step": 10416 }, { "epoch": 0.8460164054251604, "grad_norm": 5.4963646098218355, "learning_rate": 3.0453593289563015e-07, "loss": 0.4643, "step": 10417 }, { "epoch": 0.846097620401202, "grad_norm": 11.437063436049456, "learning_rate": 3.0422145502993355e-07, "loss": 0.4108, "step": 10418 }, { "epoch": 0.8461788353772436, "grad_norm": 6.788003957398547, "learning_rate": 3.0390712910049166e-07, "loss": 0.3901, "step": 10419 }, { "epoch": 0.8462600503532851, "grad_norm": 4.716129840076831, "learning_rate": 3.035929551290534e-07, "loss": 0.3811, "step": 10420 }, { "epoch": 0.8463412653293267, "grad_norm": 4.429133558600282, "learning_rate": 3.0327893313735814e-07, "loss": 0.3569, "step": 10421 }, { "epoch": 0.8464224803053683, "grad_norm": 5.2938219799131865, "learning_rate": 3.0296506314713534e-07, "loss": 0.4368, "step": 10422 }, { "epoch": 0.8465036952814099, "grad_norm": 4.19248350579702, "learning_rate": 3.0265134518010274e-07, "loss": 0.5749, "step": 10423 }, { "epoch": 0.8465849102574515, "grad_norm": 8.957100181001996, "learning_rate": 3.0233777925796683e-07, "loss": 0.4146, "step": 10424 }, { "epoch": 0.846666125233493, "grad_norm": 4.999603068660096, "learning_rate": 3.020243654024266e-07, "loss": 0.3864, "step": 10425 }, { "epoch": 0.8467473402095347, "grad_norm": 18.067549039426147, "learning_rate": 3.017111036351672e-07, "loss": 0.4653, "step": 10426 }, { "epoch": 0.8468285551855762, "grad_norm": 6.599994188483718, "learning_rate": 3.01397993977865e-07, "loss": 0.4479, "step": 10427 }, { "epoch": 0.8469097701616178, "grad_norm": 5.582107888533948, "learning_rate": 3.010850364521853e-07, "loss": 0.5203, "step": 10428 }, { "epoch": 0.8469909851376594, "grad_norm": 4.2550759235692555, "learning_rate": 3.007722310797842e-07, "loss": 0.463, "step": 10429 }, { "epoch": 0.847072200113701, "grad_norm": 4.496695788396444, "learning_rate": 3.004595778823055e-07, "loss": 0.6158, "step": 10430 }, { "epoch": 0.8471534150897425, "grad_norm": 4.611503155703553, "learning_rate": 3.0014707688138244e-07, "loss": 0.504, "step": 10431 }, { "epoch": 0.8472346300657841, "grad_norm": 5.19185166511966, "learning_rate": 2.9983472809863996e-07, "loss": 0.5022, "step": 10432 }, { "epoch": 0.8473158450418257, "grad_norm": 4.311886074274771, "learning_rate": 2.995225315556891e-07, "loss": 0.5097, "step": 10433 }, { "epoch": 0.8473970600178673, "grad_norm": 5.401400848063594, "learning_rate": 2.992104872741336e-07, "loss": 0.4966, "step": 10434 }, { "epoch": 0.8474782749939089, "grad_norm": 4.369156116773576, "learning_rate": 2.9889859527556517e-07, "loss": 0.5447, "step": 10435 }, { "epoch": 0.8475594899699505, "grad_norm": 4.166723822178224, "learning_rate": 2.985868555815646e-07, "loss": 0.5355, "step": 10436 }, { "epoch": 0.8476407049459921, "grad_norm": 6.75124539957616, "learning_rate": 2.9827526821370274e-07, "loss": 0.6095, "step": 10437 }, { "epoch": 0.8477219199220336, "grad_norm": 31.475529417596338, "learning_rate": 2.9796383319353997e-07, "loss": 0.4393, "step": 10438 }, { "epoch": 0.8478031348980752, "grad_norm": 5.167153691651564, "learning_rate": 2.976525505426253e-07, "loss": 0.546, "step": 10439 }, { "epoch": 0.8478843498741168, "grad_norm": 5.678318484729858, "learning_rate": 2.9734142028249867e-07, "loss": 0.5173, "step": 10440 }, { "epoch": 0.8479655648501584, "grad_norm": 3.293870862181777, "learning_rate": 2.970304424346887e-07, "loss": 0.4497, "step": 10441 }, { "epoch": 0.8480467798261999, "grad_norm": 7.221626978308798, "learning_rate": 2.9671961702071314e-07, "loss": 0.4407, "step": 10442 }, { "epoch": 0.8481279948022415, "grad_norm": 17.363810182049495, "learning_rate": 2.9640894406207875e-07, "loss": 0.5855, "step": 10443 }, { "epoch": 0.8482092097782831, "grad_norm": 74.21346545661238, "learning_rate": 2.960984235802836e-07, "loss": 0.5255, "step": 10444 }, { "epoch": 0.8482904247543247, "grad_norm": 5.175532691743031, "learning_rate": 2.957880555968137e-07, "loss": 0.6829, "step": 10445 }, { "epoch": 0.8483716397303663, "grad_norm": 6.761383142028467, "learning_rate": 2.95477840133144e-07, "loss": 0.4693, "step": 10446 }, { "epoch": 0.8484528547064079, "grad_norm": 3.230874015786307, "learning_rate": 2.951677772107406e-07, "loss": 0.4897, "step": 10447 }, { "epoch": 0.8485340696824495, "grad_norm": 4.35547207648812, "learning_rate": 2.9485786685105876e-07, "loss": 0.3499, "step": 10448 }, { "epoch": 0.848615284658491, "grad_norm": 4.106258252858656, "learning_rate": 2.945481090755417e-07, "loss": 0.5062, "step": 10449 }, { "epoch": 0.8486964996345326, "grad_norm": 7.391399136928735, "learning_rate": 2.942385039056231e-07, "loss": 0.5064, "step": 10450 }, { "epoch": 0.8487777146105742, "grad_norm": 4.053993280844301, "learning_rate": 2.939290513627266e-07, "loss": 0.5534, "step": 10451 }, { "epoch": 0.8488589295866158, "grad_norm": 7.209839450243032, "learning_rate": 2.936197514682637e-07, "loss": 0.5145, "step": 10452 }, { "epoch": 0.8489401445626573, "grad_norm": 4.068133193825577, "learning_rate": 2.933106042436368e-07, "loss": 0.5653, "step": 10453 }, { "epoch": 0.849021359538699, "grad_norm": 7.446752715397537, "learning_rate": 2.930016097102378e-07, "loss": 0.4394, "step": 10454 }, { "epoch": 0.8491025745147405, "grad_norm": 8.151515842737028, "learning_rate": 2.9269276788944726e-07, "loss": 0.4768, "step": 10455 }, { "epoch": 0.8491837894907821, "grad_norm": 4.210590324590446, "learning_rate": 2.923840788026347e-07, "loss": 0.5837, "step": 10456 }, { "epoch": 0.8492650044668237, "grad_norm": 4.153598656728726, "learning_rate": 2.9207554247116047e-07, "loss": 0.4728, "step": 10457 }, { "epoch": 0.8493462194428653, "grad_norm": 4.033479881839166, "learning_rate": 2.917671589163737e-07, "loss": 0.5434, "step": 10458 }, { "epoch": 0.8494274344189069, "grad_norm": 4.9531557828779045, "learning_rate": 2.9145892815961194e-07, "loss": 0.3945, "step": 10459 }, { "epoch": 0.8495086493949484, "grad_norm": 5.232476687979792, "learning_rate": 2.911508502222041e-07, "loss": 0.6322, "step": 10460 }, { "epoch": 0.84958986437099, "grad_norm": 2.92678651363929, "learning_rate": 2.908429251254674e-07, "loss": 0.4652, "step": 10461 }, { "epoch": 0.8496710793470316, "grad_norm": 6.325678937933874, "learning_rate": 2.90535152890708e-07, "loss": 0.4732, "step": 10462 }, { "epoch": 0.8497522943230732, "grad_norm": 5.685417644326699, "learning_rate": 2.902275335392232e-07, "loss": 0.6602, "step": 10463 }, { "epoch": 0.8498335092991147, "grad_norm": 4.163035284682951, "learning_rate": 2.8992006709229803e-07, "loss": 0.3988, "step": 10464 }, { "epoch": 0.8499147242751564, "grad_norm": 4.57874278542473, "learning_rate": 2.8961275357120704e-07, "loss": 0.4362, "step": 10465 }, { "epoch": 0.8499959392511979, "grad_norm": 5.572554269055086, "learning_rate": 2.893055929972152e-07, "loss": 0.4934, "step": 10466 }, { "epoch": 0.8500771542272395, "grad_norm": 10.65165020002116, "learning_rate": 2.8899858539157694e-07, "loss": 0.5137, "step": 10467 }, { "epoch": 0.8501583692032811, "grad_norm": 9.776998737199031, "learning_rate": 2.886917307755349e-07, "loss": 0.4341, "step": 10468 }, { "epoch": 0.8502395841793227, "grad_norm": 6.586810382710384, "learning_rate": 2.8838502917032136e-07, "loss": 0.4922, "step": 10469 }, { "epoch": 0.8503207991553643, "grad_norm": 5.154039748490755, "learning_rate": 2.880784805971595e-07, "loss": 0.3545, "step": 10470 }, { "epoch": 0.8504020141314058, "grad_norm": 3.9568608557183516, "learning_rate": 2.8777208507726056e-07, "loss": 0.4876, "step": 10471 }, { "epoch": 0.8504832291074474, "grad_norm": 4.288857280998075, "learning_rate": 2.874658426318244e-07, "loss": 0.4853, "step": 10472 }, { "epoch": 0.850564444083489, "grad_norm": 4.368402834061044, "learning_rate": 2.871597532820425e-07, "loss": 0.4401, "step": 10473 }, { "epoch": 0.8506456590595306, "grad_norm": 4.033924407915728, "learning_rate": 2.86853817049095e-07, "loss": 0.5619, "step": 10474 }, { "epoch": 0.8507268740355721, "grad_norm": 5.6815820407123585, "learning_rate": 2.865480339541496e-07, "loss": 0.4642, "step": 10475 }, { "epoch": 0.8508080890116138, "grad_norm": 3.948873040109591, "learning_rate": 2.8624240401836647e-07, "loss": 0.4468, "step": 10476 }, { "epoch": 0.8508893039876553, "grad_norm": 3.2353533973872084, "learning_rate": 2.859369272628928e-07, "loss": 0.532, "step": 10477 }, { "epoch": 0.8509705189636969, "grad_norm": 3.308823715003122, "learning_rate": 2.856316037088655e-07, "loss": 0.4157, "step": 10478 }, { "epoch": 0.8510517339397385, "grad_norm": 13.811883074908797, "learning_rate": 2.8532643337741195e-07, "loss": 0.4894, "step": 10479 }, { "epoch": 0.8511329489157801, "grad_norm": 3.2224568567146945, "learning_rate": 2.8502141628964836e-07, "loss": 0.4306, "step": 10480 }, { "epoch": 0.8512141638918217, "grad_norm": 3.9763076127585313, "learning_rate": 2.8471655246668007e-07, "loss": 0.5575, "step": 10481 }, { "epoch": 0.8512953788678632, "grad_norm": 6.740981385096001, "learning_rate": 2.844118419296024e-07, "loss": 0.4055, "step": 10482 }, { "epoch": 0.8513765938439048, "grad_norm": 20.280874148042784, "learning_rate": 2.841072846994994e-07, "loss": 0.3875, "step": 10483 }, { "epoch": 0.8514578088199464, "grad_norm": 9.735322905143065, "learning_rate": 2.8380288079744494e-07, "loss": 0.3133, "step": 10484 }, { "epoch": 0.851539023795988, "grad_norm": 4.313636812248141, "learning_rate": 2.8349863024450143e-07, "loss": 0.4564, "step": 10485 }, { "epoch": 0.8516202387720295, "grad_norm": 6.764374322762755, "learning_rate": 2.8319453306172225e-07, "loss": 0.396, "step": 10486 }, { "epoch": 0.8517014537480712, "grad_norm": 5.159578175212937, "learning_rate": 2.8289058927014944e-07, "loss": 0.5508, "step": 10487 }, { "epoch": 0.8517826687241127, "grad_norm": 9.297209132292254, "learning_rate": 2.8258679889081346e-07, "loss": 0.4523, "step": 10488 }, { "epoch": 0.8518638837001543, "grad_norm": 3.958399617586093, "learning_rate": 2.8228316194473607e-07, "loss": 0.4515, "step": 10489 }, { "epoch": 0.8519450986761959, "grad_norm": 6.059112536774565, "learning_rate": 2.8197967845292687e-07, "loss": 0.5989, "step": 10490 }, { "epoch": 0.8520263136522375, "grad_norm": 7.811507918796633, "learning_rate": 2.8167634843638434e-07, "loss": 0.3805, "step": 10491 }, { "epoch": 0.8521075286282791, "grad_norm": 6.346071562908303, "learning_rate": 2.8137317191609864e-07, "loss": 0.3511, "step": 10492 }, { "epoch": 0.8521887436043206, "grad_norm": 8.889395270642837, "learning_rate": 2.810701489130477e-07, "loss": 0.3602, "step": 10493 }, { "epoch": 0.8522699585803623, "grad_norm": 5.77337789424289, "learning_rate": 2.807672794481986e-07, "loss": 0.3249, "step": 10494 }, { "epoch": 0.8523511735564038, "grad_norm": 5.856871874864799, "learning_rate": 2.804645635425091e-07, "loss": 0.4549, "step": 10495 }, { "epoch": 0.8524323885324454, "grad_norm": 3.9708905797147924, "learning_rate": 2.801620012169251e-07, "loss": 0.4752, "step": 10496 }, { "epoch": 0.8525136035084869, "grad_norm": 5.58347667715412, "learning_rate": 2.7985959249238165e-07, "loss": 0.5097, "step": 10497 }, { "epoch": 0.8525948184845286, "grad_norm": 5.356550864085213, "learning_rate": 2.7955733738980443e-07, "loss": 0.3949, "step": 10498 }, { "epoch": 0.8526760334605701, "grad_norm": 5.227735405567097, "learning_rate": 2.792552359301087e-07, "loss": 0.4579, "step": 10499 }, { "epoch": 0.8527572484366117, "grad_norm": 10.79645556327445, "learning_rate": 2.789532881341969e-07, "loss": 0.5133, "step": 10500 }, { "epoch": 0.8528384634126533, "grad_norm": 9.986241964497967, "learning_rate": 2.786514940229634e-07, "loss": 0.4597, "step": 10501 }, { "epoch": 0.8529196783886949, "grad_norm": 5.528139225360127, "learning_rate": 2.7834985361728987e-07, "loss": 0.312, "step": 10502 }, { "epoch": 0.8530008933647365, "grad_norm": 5.4346809701316685, "learning_rate": 2.7804836693804905e-07, "loss": 0.6169, "step": 10503 }, { "epoch": 0.853082108340778, "grad_norm": 4.421996352718119, "learning_rate": 2.7774703400610086e-07, "loss": 0.5074, "step": 10504 }, { "epoch": 0.8531633233168197, "grad_norm": 12.764502226614324, "learning_rate": 2.7744585484229674e-07, "loss": 0.2709, "step": 10505 }, { "epoch": 0.8532445382928612, "grad_norm": 5.85671022027329, "learning_rate": 2.771448294674775e-07, "loss": 0.4455, "step": 10506 }, { "epoch": 0.8533257532689028, "grad_norm": 8.599669196683427, "learning_rate": 2.768439579024712e-07, "loss": 0.5653, "step": 10507 }, { "epoch": 0.8534069682449443, "grad_norm": 6.732667207212168, "learning_rate": 2.7654324016809757e-07, "loss": 0.4152, "step": 10508 }, { "epoch": 0.853488183220986, "grad_norm": 4.779828616945185, "learning_rate": 2.7624267628516445e-07, "loss": 0.4589, "step": 10509 }, { "epoch": 0.8535693981970275, "grad_norm": 11.113748713116502, "learning_rate": 2.759422662744682e-07, "loss": 0.3779, "step": 10510 }, { "epoch": 0.8536506131730691, "grad_norm": 4.481185872009129, "learning_rate": 2.7564201015679664e-07, "loss": 0.4627, "step": 10511 }, { "epoch": 0.8537318281491107, "grad_norm": 4.723745824336893, "learning_rate": 2.7534190795292626e-07, "loss": 0.5168, "step": 10512 }, { "epoch": 0.8538130431251523, "grad_norm": 4.5772924241630895, "learning_rate": 2.750419596836215e-07, "loss": 0.5351, "step": 10513 }, { "epoch": 0.8538942581011939, "grad_norm": 4.890888601625891, "learning_rate": 2.7474216536963803e-07, "loss": 0.358, "step": 10514 }, { "epoch": 0.8539754730772354, "grad_norm": 9.34358416833541, "learning_rate": 2.744425250317201e-07, "loss": 0.3687, "step": 10515 }, { "epoch": 0.8540566880532771, "grad_norm": 4.739733132295761, "learning_rate": 2.7414303869059994e-07, "loss": 0.5659, "step": 10516 }, { "epoch": 0.8541379030293186, "grad_norm": 4.5314061539867145, "learning_rate": 2.7384370636700187e-07, "loss": 0.4997, "step": 10517 }, { "epoch": 0.8542191180053602, "grad_norm": 10.166709743253572, "learning_rate": 2.735445280816373e-07, "loss": 0.3698, "step": 10518 }, { "epoch": 0.8543003329814017, "grad_norm": 6.233844850352306, "learning_rate": 2.7324550385520844e-07, "loss": 0.4164, "step": 10519 }, { "epoch": 0.8543815479574434, "grad_norm": 4.1121635178190665, "learning_rate": 2.72946633708405e-07, "loss": 0.5892, "step": 10520 }, { "epoch": 0.8544627629334849, "grad_norm": 10.454864690447497, "learning_rate": 2.726479176619087e-07, "loss": 0.5313, "step": 10521 }, { "epoch": 0.8545439779095265, "grad_norm": 3.298486331376736, "learning_rate": 2.723493557363885e-07, "loss": 0.6448, "step": 10522 }, { "epoch": 0.8546251928855682, "grad_norm": 4.677659267745939, "learning_rate": 2.720509479525027e-07, "loss": 0.5786, "step": 10523 }, { "epoch": 0.8547064078616097, "grad_norm": 4.621472290562395, "learning_rate": 2.7175269433089984e-07, "loss": 0.4443, "step": 10524 }, { "epoch": 0.8547876228376513, "grad_norm": 4.075558864406144, "learning_rate": 2.7145459489221845e-07, "loss": 0.4766, "step": 10525 }, { "epoch": 0.8548688378136928, "grad_norm": 3.9376155282925427, "learning_rate": 2.7115664965708387e-07, "loss": 0.5365, "step": 10526 }, { "epoch": 0.8549500527897345, "grad_norm": 4.9988126907241135, "learning_rate": 2.708588586461139e-07, "loss": 0.5849, "step": 10527 }, { "epoch": 0.855031267765776, "grad_norm": 4.260322647357954, "learning_rate": 2.7056122187991306e-07, "loss": 0.5862, "step": 10528 }, { "epoch": 0.8551124827418176, "grad_norm": 4.653669694516675, "learning_rate": 2.7026373937907636e-07, "loss": 0.6337, "step": 10529 }, { "epoch": 0.8551936977178591, "grad_norm": 16.425837953856785, "learning_rate": 2.6996641116418863e-07, "loss": 0.3838, "step": 10530 }, { "epoch": 0.8552749126939008, "grad_norm": 8.7333194311729, "learning_rate": 2.696692372558224e-07, "loss": 0.3394, "step": 10531 }, { "epoch": 0.8553561276699423, "grad_norm": 6.271008188155812, "learning_rate": 2.6937221767454086e-07, "loss": 0.4557, "step": 10532 }, { "epoch": 0.8554373426459839, "grad_norm": 4.33027681898429, "learning_rate": 2.690753524408973e-07, "loss": 0.4263, "step": 10533 }, { "epoch": 0.8555185576220256, "grad_norm": 8.869308357187672, "learning_rate": 2.6877864157543204e-07, "loss": 0.4403, "step": 10534 }, { "epoch": 0.8555997725980671, "grad_norm": 2.978988832273135, "learning_rate": 2.684820850986758e-07, "loss": 0.5313, "step": 10535 }, { "epoch": 0.8556809875741087, "grad_norm": 8.360644599601576, "learning_rate": 2.6818568303114967e-07, "loss": 0.422, "step": 10536 }, { "epoch": 0.8557622025501502, "grad_norm": 11.386127329062182, "learning_rate": 2.67889435393362e-07, "loss": 0.3573, "step": 10537 }, { "epoch": 0.8558434175261919, "grad_norm": 5.2062587599165475, "learning_rate": 2.6759334220581273e-07, "loss": 0.6315, "step": 10538 }, { "epoch": 0.8559246325022334, "grad_norm": 4.445713015076414, "learning_rate": 2.6729740348898886e-07, "loss": 0.4583, "step": 10539 }, { "epoch": 0.856005847478275, "grad_norm": 5.948969244528773, "learning_rate": 2.670016192633687e-07, "loss": 0.4175, "step": 10540 }, { "epoch": 0.8560870624543165, "grad_norm": 7.049516828972635, "learning_rate": 2.667059895494184e-07, "loss": 0.4651, "step": 10541 }, { "epoch": 0.8561682774303582, "grad_norm": 12.341170101279898, "learning_rate": 2.6641051436759353e-07, "loss": 0.397, "step": 10542 }, { "epoch": 0.8562494924063997, "grad_norm": 4.024117199833696, "learning_rate": 2.6611519373834076e-07, "loss": 0.5794, "step": 10543 }, { "epoch": 0.8563307073824413, "grad_norm": 9.173352219765476, "learning_rate": 2.6582002768209326e-07, "loss": 0.4269, "step": 10544 }, { "epoch": 0.856411922358483, "grad_norm": 6.424827613404485, "learning_rate": 2.6552501621927544e-07, "loss": 0.5124, "step": 10545 }, { "epoch": 0.8564931373345245, "grad_norm": 4.601328438451911, "learning_rate": 2.6523015937030136e-07, "loss": 0.4117, "step": 10546 }, { "epoch": 0.8565743523105661, "grad_norm": 5.439497421746022, "learning_rate": 2.649354571555729e-07, "loss": 0.3709, "step": 10547 }, { "epoch": 0.8566555672866076, "grad_norm": 5.6866827995608755, "learning_rate": 2.6464090959548135e-07, "loss": 0.4981, "step": 10548 }, { "epoch": 0.8567367822626493, "grad_norm": 8.576641000068408, "learning_rate": 2.6434651671040894e-07, "loss": 0.4026, "step": 10549 }, { "epoch": 0.8568179972386908, "grad_norm": 4.32299331991155, "learning_rate": 2.6405227852072504e-07, "loss": 0.5568, "step": 10550 }, { "epoch": 0.8568992122147324, "grad_norm": 6.515668343832238, "learning_rate": 2.637581950467896e-07, "loss": 0.7008, "step": 10551 }, { "epoch": 0.8569804271907739, "grad_norm": 4.115866997613363, "learning_rate": 2.634642663089529e-07, "loss": 0.5308, "step": 10552 }, { "epoch": 0.8570616421668156, "grad_norm": 5.714701058196075, "learning_rate": 2.6317049232755185e-07, "loss": 0.4151, "step": 10553 }, { "epoch": 0.8571428571428571, "grad_norm": 6.258820706442769, "learning_rate": 2.628768731229142e-07, "loss": 0.3542, "step": 10554 }, { "epoch": 0.8572240721188987, "grad_norm": 7.447014078813493, "learning_rate": 2.6258340871535753e-07, "loss": 0.3555, "step": 10555 }, { "epoch": 0.8573052870949404, "grad_norm": 4.687623344122876, "learning_rate": 2.6229009912518754e-07, "loss": 0.3792, "step": 10556 }, { "epoch": 0.8573865020709819, "grad_norm": 4.765486433369644, "learning_rate": 2.619969443726994e-07, "loss": 0.5262, "step": 10557 }, { "epoch": 0.8574677170470235, "grad_norm": 4.429572901564648, "learning_rate": 2.6170394447817824e-07, "loss": 0.3549, "step": 10558 }, { "epoch": 0.857548932023065, "grad_norm": 5.054699713881342, "learning_rate": 2.6141109946189874e-07, "loss": 0.6322, "step": 10559 }, { "epoch": 0.8576301469991067, "grad_norm": 5.352387291113756, "learning_rate": 2.611184093441232e-07, "loss": 0.4139, "step": 10560 }, { "epoch": 0.8577113619751482, "grad_norm": 7.045189522256243, "learning_rate": 2.608258741451045e-07, "loss": 0.6806, "step": 10561 }, { "epoch": 0.8577925769511898, "grad_norm": 4.4588250538525935, "learning_rate": 2.605334938850851e-07, "loss": 0.5799, "step": 10562 }, { "epoch": 0.8578737919272313, "grad_norm": 4.741068158876179, "learning_rate": 2.6024126858429503e-07, "loss": 0.4641, "step": 10563 }, { "epoch": 0.857955006903273, "grad_norm": 6.387856164555808, "learning_rate": 2.599491982629554e-07, "loss": 0.4024, "step": 10564 }, { "epoch": 0.8580362218793145, "grad_norm": 11.111320718396847, "learning_rate": 2.596572829412766e-07, "loss": 0.5682, "step": 10565 }, { "epoch": 0.8581174368553561, "grad_norm": 3.9432874147413366, "learning_rate": 2.59365522639457e-07, "loss": 0.4614, "step": 10566 }, { "epoch": 0.8581986518313978, "grad_norm": 4.644099435339523, "learning_rate": 2.590739173776841e-07, "loss": 0.5058, "step": 10567 }, { "epoch": 0.8582798668074393, "grad_norm": 6.2728581984877865, "learning_rate": 2.5878246717613684e-07, "loss": 0.3661, "step": 10568 }, { "epoch": 0.8583610817834809, "grad_norm": 4.322705528890997, "learning_rate": 2.5849117205498096e-07, "loss": 0.6458, "step": 10569 }, { "epoch": 0.8584422967595224, "grad_norm": 3.749986780079194, "learning_rate": 2.582000320343728e-07, "loss": 0.6983, "step": 10570 }, { "epoch": 0.8585235117355641, "grad_norm": 7.654907749368039, "learning_rate": 2.579090471344584e-07, "loss": 0.4619, "step": 10571 }, { "epoch": 0.8586047267116056, "grad_norm": 7.384217550856351, "learning_rate": 2.576182173753719e-07, "loss": 0.4562, "step": 10572 }, { "epoch": 0.8586859416876472, "grad_norm": 6.675039979352513, "learning_rate": 2.5732754277723703e-07, "loss": 0.5025, "step": 10573 }, { "epoch": 0.8587671566636887, "grad_norm": 4.025631310651602, "learning_rate": 2.5703702336016654e-07, "loss": 0.4889, "step": 10574 }, { "epoch": 0.8588483716397304, "grad_norm": 4.195667201107953, "learning_rate": 2.567466591442638e-07, "loss": 0.3889, "step": 10575 }, { "epoch": 0.8589295866157719, "grad_norm": 5.444485183310965, "learning_rate": 2.5645645014961947e-07, "loss": 0.4681, "step": 10576 }, { "epoch": 0.8590108015918135, "grad_norm": 4.670632142056175, "learning_rate": 2.561663963963151e-07, "loss": 0.558, "step": 10577 }, { "epoch": 0.8590920165678552, "grad_norm": 5.0298060033672645, "learning_rate": 2.558764979044212e-07, "loss": 0.5758, "step": 10578 }, { "epoch": 0.8591732315438967, "grad_norm": 3.3846837426199845, "learning_rate": 2.555867546939969e-07, "loss": 0.4184, "step": 10579 }, { "epoch": 0.8592544465199383, "grad_norm": 3.2790619896944957, "learning_rate": 2.5529716678509007e-07, "loss": 0.3676, "step": 10580 }, { "epoch": 0.8593356614959798, "grad_norm": 8.215528066018926, "learning_rate": 2.5500773419774e-07, "loss": 0.5089, "step": 10581 }, { "epoch": 0.8594168764720215, "grad_norm": 7.985001045047634, "learning_rate": 2.547184569519728e-07, "loss": 0.5952, "step": 10582 }, { "epoch": 0.859498091448063, "grad_norm": 9.533391213718112, "learning_rate": 2.5442933506780536e-07, "loss": 0.4201, "step": 10583 }, { "epoch": 0.8595793064241046, "grad_norm": 3.854806058643972, "learning_rate": 2.541403685652438e-07, "loss": 0.5651, "step": 10584 }, { "epoch": 0.8596605214001461, "grad_norm": 10.475802887731723, "learning_rate": 2.53851557464283e-07, "loss": 0.4844, "step": 10585 }, { "epoch": 0.8597417363761878, "grad_norm": 6.07171775635679, "learning_rate": 2.535629017849062e-07, "loss": 0.4356, "step": 10586 }, { "epoch": 0.8598229513522293, "grad_norm": 9.000523149669734, "learning_rate": 2.532744015470878e-07, "loss": 0.3976, "step": 10587 }, { "epoch": 0.8599041663282709, "grad_norm": 3.9919734198421164, "learning_rate": 2.529860567707904e-07, "loss": 0.4646, "step": 10588 }, { "epoch": 0.8599853813043126, "grad_norm": 5.113963141533617, "learning_rate": 2.5269786747596504e-07, "loss": 0.5287, "step": 10589 }, { "epoch": 0.8600665962803541, "grad_norm": 6.0753816648158345, "learning_rate": 2.5240983368255365e-07, "loss": 0.3916, "step": 10590 }, { "epoch": 0.8601478112563957, "grad_norm": 5.777553433923641, "learning_rate": 2.52121955410487e-07, "loss": 0.7061, "step": 10591 }, { "epoch": 0.8602290262324372, "grad_norm": 6.304746787186769, "learning_rate": 2.518342326796844e-07, "loss": 0.4157, "step": 10592 }, { "epoch": 0.8603102412084789, "grad_norm": 8.184175586338531, "learning_rate": 2.515466655100543e-07, "loss": 0.4792, "step": 10593 }, { "epoch": 0.8603914561845204, "grad_norm": 7.804180717198223, "learning_rate": 2.5125925392149533e-07, "loss": 0.4817, "step": 10594 }, { "epoch": 0.860472671160562, "grad_norm": 7.390682075003671, "learning_rate": 2.5097199793389456e-07, "loss": 0.3667, "step": 10595 }, { "epoch": 0.8605538861366036, "grad_norm": 4.844008089075363, "learning_rate": 2.506848975671283e-07, "loss": 0.4751, "step": 10596 }, { "epoch": 0.8606351011126452, "grad_norm": 15.0313162783177, "learning_rate": 2.5039795284106354e-07, "loss": 0.47, "step": 10597 }, { "epoch": 0.8607163160886867, "grad_norm": 7.827029414244254, "learning_rate": 2.5011116377555463e-07, "loss": 0.3705, "step": 10598 }, { "epoch": 0.8607975310647283, "grad_norm": 4.165034621143684, "learning_rate": 2.4982453039044536e-07, "loss": 0.5141, "step": 10599 }, { "epoch": 0.86087874604077, "grad_norm": 9.403450320448428, "learning_rate": 2.495380527055699e-07, "loss": 0.5018, "step": 10600 }, { "epoch": 0.8609599610168115, "grad_norm": 3.669900811182216, "learning_rate": 2.49251730740751e-07, "loss": 0.5024, "step": 10601 }, { "epoch": 0.8610411759928531, "grad_norm": 8.666914052435182, "learning_rate": 2.4896556451579985e-07, "loss": 0.3806, "step": 10602 }, { "epoch": 0.8611223909688946, "grad_norm": 6.5382282595398955, "learning_rate": 2.4867955405051826e-07, "loss": 0.436, "step": 10603 }, { "epoch": 0.8612036059449363, "grad_norm": 6.652949951974725, "learning_rate": 2.483936993646971e-07, "loss": 0.4528, "step": 10604 }, { "epoch": 0.8612848209209778, "grad_norm": 7.183747958159308, "learning_rate": 2.48108000478115e-07, "loss": 0.4787, "step": 10605 }, { "epoch": 0.8613660358970194, "grad_norm": 10.226892509366555, "learning_rate": 2.4782245741054175e-07, "loss": 0.5123, "step": 10606 }, { "epoch": 0.861447250873061, "grad_norm": 6.451808047428951, "learning_rate": 2.475370701817348e-07, "loss": 0.3932, "step": 10607 }, { "epoch": 0.8615284658491026, "grad_norm": 4.898759701665131, "learning_rate": 2.4725183881144114e-07, "loss": 0.5491, "step": 10608 }, { "epoch": 0.8616096808251441, "grad_norm": 31.142791051653955, "learning_rate": 2.4696676331939786e-07, "loss": 0.4651, "step": 10609 }, { "epoch": 0.8616908958011857, "grad_norm": 3.5669616787142697, "learning_rate": 2.46681843725331e-07, "loss": 0.5049, "step": 10610 }, { "epoch": 0.8617721107772274, "grad_norm": 5.205643471968962, "learning_rate": 2.4639708004895515e-07, "loss": 0.4391, "step": 10611 }, { "epoch": 0.8618533257532689, "grad_norm": 5.483632769288808, "learning_rate": 2.4611247230997366e-07, "loss": 0.5294, "step": 10612 }, { "epoch": 0.8619345407293105, "grad_norm": 7.848858229369638, "learning_rate": 2.458280205280811e-07, "loss": 0.4576, "step": 10613 }, { "epoch": 0.862015755705352, "grad_norm": 4.862179433393447, "learning_rate": 2.455437247229595e-07, "loss": 0.5552, "step": 10614 }, { "epoch": 0.8620969706813937, "grad_norm": 7.004075796989462, "learning_rate": 2.4525958491428026e-07, "loss": 0.573, "step": 10615 }, { "epoch": 0.8621781856574352, "grad_norm": 27.404691438629225, "learning_rate": 2.4497560112170444e-07, "loss": 0.5041, "step": 10616 }, { "epoch": 0.8622594006334768, "grad_norm": 3.9710080611622955, "learning_rate": 2.446917733648829e-07, "loss": 0.6183, "step": 10617 }, { "epoch": 0.8623406156095184, "grad_norm": 7.736011644339195, "learning_rate": 2.444081016634545e-07, "loss": 0.6269, "step": 10618 }, { "epoch": 0.86242183058556, "grad_norm": 3.296357608833365, "learning_rate": 2.4412458603704806e-07, "loss": 0.6128, "step": 10619 }, { "epoch": 0.8625030455616015, "grad_norm": 4.26650357660672, "learning_rate": 2.438412265052814e-07, "loss": 0.4254, "step": 10620 }, { "epoch": 0.8625842605376431, "grad_norm": 7.332608647465775, "learning_rate": 2.4355802308776073e-07, "loss": 0.4871, "step": 10621 }, { "epoch": 0.8626654755136848, "grad_norm": 4.755644735563533, "learning_rate": 2.4327497580408285e-07, "loss": 0.5566, "step": 10622 }, { "epoch": 0.8627466904897263, "grad_norm": 3.8321180175370824, "learning_rate": 2.4299208467383347e-07, "loss": 0.4417, "step": 10623 }, { "epoch": 0.8628279054657679, "grad_norm": 4.133812851532381, "learning_rate": 2.427093497165864e-07, "loss": 0.4393, "step": 10624 }, { "epoch": 0.8629091204418095, "grad_norm": 4.292765424503046, "learning_rate": 2.4242677095190623e-07, "loss": 0.5653, "step": 10625 }, { "epoch": 0.8629903354178511, "grad_norm": 5.735964641093858, "learning_rate": 2.4214434839934545e-07, "loss": 0.3592, "step": 10626 }, { "epoch": 0.8630715503938926, "grad_norm": 8.285033867612887, "learning_rate": 2.418620820784462e-07, "loss": 0.5062, "step": 10627 }, { "epoch": 0.8631527653699342, "grad_norm": 4.512095996980739, "learning_rate": 2.4157997200873945e-07, "loss": 0.3959, "step": 10628 }, { "epoch": 0.8632339803459758, "grad_norm": 5.106558948145591, "learning_rate": 2.4129801820974604e-07, "loss": 0.4257, "step": 10629 }, { "epoch": 0.8633151953220174, "grad_norm": 8.542776714887058, "learning_rate": 2.410162207009761e-07, "loss": 0.4211, "step": 10630 }, { "epoch": 0.8633964102980589, "grad_norm": 4.654592367436493, "learning_rate": 2.4073457950192806e-07, "loss": 0.4189, "step": 10631 }, { "epoch": 0.8634776252741005, "grad_norm": 6.844838480590244, "learning_rate": 2.404530946320904e-07, "loss": 0.5268, "step": 10632 }, { "epoch": 0.8635588402501422, "grad_norm": 5.39856003020653, "learning_rate": 2.401717661109401e-07, "loss": 0.5319, "step": 10633 }, { "epoch": 0.8636400552261837, "grad_norm": 6.044733492723436, "learning_rate": 2.398905939579432e-07, "loss": 0.4763, "step": 10634 }, { "epoch": 0.8637212702022253, "grad_norm": 3.216826620946339, "learning_rate": 2.396095781925556e-07, "loss": 0.4624, "step": 10635 }, { "epoch": 0.8638024851782669, "grad_norm": 4.322010048301707, "learning_rate": 2.3932871883422276e-07, "loss": 0.3858, "step": 10636 }, { "epoch": 0.8638837001543085, "grad_norm": 6.581640577707594, "learning_rate": 2.3904801590237783e-07, "loss": 0.3858, "step": 10637 }, { "epoch": 0.86396491513035, "grad_norm": 5.430902885102804, "learning_rate": 2.3876746941644464e-07, "loss": 0.547, "step": 10638 }, { "epoch": 0.8640461301063916, "grad_norm": 9.539650370477867, "learning_rate": 2.384870793958349e-07, "loss": 0.4549, "step": 10639 }, { "epoch": 0.8641273450824332, "grad_norm": 6.254836784455941, "learning_rate": 2.3820684585995012e-07, "loss": 0.5427, "step": 10640 }, { "epoch": 0.8642085600584748, "grad_norm": 7.936489779390335, "learning_rate": 2.379267688281814e-07, "loss": 0.394, "step": 10641 }, { "epoch": 0.8642897750345163, "grad_norm": 4.9403538410304, "learning_rate": 2.3764684831990874e-07, "loss": 0.5883, "step": 10642 }, { "epoch": 0.864370990010558, "grad_norm": 5.718951894452898, "learning_rate": 2.3736708435450033e-07, "loss": 0.4907, "step": 10643 }, { "epoch": 0.8644522049865996, "grad_norm": 4.730922288439977, "learning_rate": 2.370874769513154e-07, "loss": 0.4677, "step": 10644 }, { "epoch": 0.8645334199626411, "grad_norm": 7.22516165081707, "learning_rate": 2.3680802612970068e-07, "loss": 0.4901, "step": 10645 }, { "epoch": 0.8646146349386827, "grad_norm": 5.535751530022896, "learning_rate": 2.365287319089929e-07, "loss": 0.4482, "step": 10646 }, { "epoch": 0.8646958499147243, "grad_norm": 4.330430490684841, "learning_rate": 2.362495943085172e-07, "loss": 0.3961, "step": 10647 }, { "epoch": 0.8647770648907659, "grad_norm": 5.262984242628663, "learning_rate": 2.3597061334758864e-07, "loss": 0.4269, "step": 10648 }, { "epoch": 0.8648582798668074, "grad_norm": 3.5730441803712014, "learning_rate": 2.3569178904551181e-07, "loss": 0.4865, "step": 10649 }, { "epoch": 0.864939494842849, "grad_norm": 4.35982646863664, "learning_rate": 2.3541312142157934e-07, "loss": 0.62, "step": 10650 }, { "epoch": 0.8650207098188906, "grad_norm": 5.501060251786152, "learning_rate": 2.3513461049507385e-07, "loss": 0.309, "step": 10651 }, { "epoch": 0.8651019247949322, "grad_norm": 18.5509951879782, "learning_rate": 2.3485625628526688e-07, "loss": 0.3867, "step": 10652 }, { "epoch": 0.8651831397709737, "grad_norm": 4.778384806825011, "learning_rate": 2.3457805881141854e-07, "loss": 0.4658, "step": 10653 }, { "epoch": 0.8652643547470154, "grad_norm": 7.32649293368322, "learning_rate": 2.3430001809277873e-07, "loss": 0.483, "step": 10654 }, { "epoch": 0.865345569723057, "grad_norm": 6.199897652234001, "learning_rate": 2.340221341485871e-07, "loss": 0.435, "step": 10655 }, { "epoch": 0.8654267846990985, "grad_norm": 4.754389895318495, "learning_rate": 2.3374440699807072e-07, "loss": 0.5084, "step": 10656 }, { "epoch": 0.8655079996751401, "grad_norm": 4.649740673308111, "learning_rate": 2.334668366604481e-07, "loss": 0.5881, "step": 10657 }, { "epoch": 0.8655892146511817, "grad_norm": 6.638508068101479, "learning_rate": 2.3318942315492477e-07, "loss": 0.38, "step": 10658 }, { "epoch": 0.8656704296272233, "grad_norm": 3.952860937759617, "learning_rate": 2.3291216650069587e-07, "loss": 0.4626, "step": 10659 }, { "epoch": 0.8657516446032648, "grad_norm": 6.6956489482132, "learning_rate": 2.3263506671694747e-07, "loss": 0.4982, "step": 10660 }, { "epoch": 0.8658328595793064, "grad_norm": 3.478866841645943, "learning_rate": 2.323581238228517e-07, "loss": 0.4391, "step": 10661 }, { "epoch": 0.865914074555348, "grad_norm": 4.525751483245147, "learning_rate": 2.3208133783757302e-07, "loss": 0.5142, "step": 10662 }, { "epoch": 0.8659952895313896, "grad_norm": 4.621330522104981, "learning_rate": 2.3180470878026275e-07, "loss": 0.5449, "step": 10663 }, { "epoch": 0.8660765045074311, "grad_norm": 4.973654556392699, "learning_rate": 2.3152823667006248e-07, "loss": 0.4468, "step": 10664 }, { "epoch": 0.8661577194834728, "grad_norm": 5.396466731566874, "learning_rate": 2.3125192152610277e-07, "loss": 0.633, "step": 10665 }, { "epoch": 0.8662389344595144, "grad_norm": 4.918829274592111, "learning_rate": 2.3097576336750248e-07, "loss": 0.4153, "step": 10666 }, { "epoch": 0.8663201494355559, "grad_norm": 6.0092199166872, "learning_rate": 2.3069976221337054e-07, "loss": 0.7968, "step": 10667 }, { "epoch": 0.8664013644115975, "grad_norm": 13.224667725694793, "learning_rate": 2.304239180828055e-07, "loss": 0.6541, "step": 10668 }, { "epoch": 0.8664825793876391, "grad_norm": 4.332528390591684, "learning_rate": 2.3014823099489326e-07, "loss": 0.4286, "step": 10669 }, { "epoch": 0.8665637943636807, "grad_norm": 8.678328469216439, "learning_rate": 2.2987270096871072e-07, "loss": 0.457, "step": 10670 }, { "epoch": 0.8666450093397222, "grad_norm": 6.543536578014087, "learning_rate": 2.2959732802332296e-07, "loss": 0.3446, "step": 10671 }, { "epoch": 0.8667262243157638, "grad_norm": 5.10238287168412, "learning_rate": 2.2932211217778388e-07, "loss": 0.6514, "step": 10672 }, { "epoch": 0.8668074392918054, "grad_norm": 4.902163867519704, "learning_rate": 2.2904705345113743e-07, "loss": 0.6377, "step": 10673 }, { "epoch": 0.866888654267847, "grad_norm": 4.914208356693128, "learning_rate": 2.287721518624156e-07, "loss": 0.5384, "step": 10674 }, { "epoch": 0.8669698692438885, "grad_norm": 4.695162916684552, "learning_rate": 2.2849740743064063e-07, "loss": 0.3763, "step": 10675 }, { "epoch": 0.8670510842199302, "grad_norm": 5.516447784018673, "learning_rate": 2.282228201748238e-07, "loss": 0.4784, "step": 10676 }, { "epoch": 0.8671322991959718, "grad_norm": 4.784027348664785, "learning_rate": 2.2794839011396453e-07, "loss": 0.3715, "step": 10677 }, { "epoch": 0.8672135141720133, "grad_norm": 4.524507243360301, "learning_rate": 2.2767411726705157e-07, "loss": 0.66, "step": 10678 }, { "epoch": 0.8672947291480549, "grad_norm": 6.2607763304203194, "learning_rate": 2.2740000165306393e-07, "loss": 0.4065, "step": 10679 }, { "epoch": 0.8673759441240965, "grad_norm": 6.045389943291753, "learning_rate": 2.2712604329096833e-07, "loss": 0.5403, "step": 10680 }, { "epoch": 0.8674571591001381, "grad_norm": 4.945546166899713, "learning_rate": 2.2685224219972185e-07, "loss": 0.6084, "step": 10681 }, { "epoch": 0.8675383740761796, "grad_norm": 7.020025507043163, "learning_rate": 2.2657859839826934e-07, "loss": 0.4117, "step": 10682 }, { "epoch": 0.8676195890522213, "grad_norm": 3.378493580413376, "learning_rate": 2.2630511190554621e-07, "loss": 0.3219, "step": 10683 }, { "epoch": 0.8677008040282628, "grad_norm": 14.896543415086516, "learning_rate": 2.260317827404762e-07, "loss": 0.3889, "step": 10684 }, { "epoch": 0.8677820190043044, "grad_norm": 4.531796982257481, "learning_rate": 2.2575861092197143e-07, "loss": 0.6512, "step": 10685 }, { "epoch": 0.8678632339803459, "grad_norm": 7.006910779412939, "learning_rate": 2.254855964689351e-07, "loss": 0.5622, "step": 10686 }, { "epoch": 0.8679444489563876, "grad_norm": 5.778562835427389, "learning_rate": 2.2521273940025705e-07, "loss": 0.5571, "step": 10687 }, { "epoch": 0.8680256639324292, "grad_norm": 5.550998523968033, "learning_rate": 2.2494003973481864e-07, "loss": 0.481, "step": 10688 }, { "epoch": 0.8681068789084707, "grad_norm": 5.8427464651275836, "learning_rate": 2.2466749749148919e-07, "loss": 0.4879, "step": 10689 }, { "epoch": 0.8681880938845123, "grad_norm": 4.136202285236531, "learning_rate": 2.2439511268912666e-07, "loss": 0.3003, "step": 10690 }, { "epoch": 0.8682693088605539, "grad_norm": 7.410829723701502, "learning_rate": 2.2412288534657878e-07, "loss": 0.556, "step": 10691 }, { "epoch": 0.8683505238365955, "grad_norm": 5.668425487445851, "learning_rate": 2.2385081548268268e-07, "loss": 0.5302, "step": 10692 }, { "epoch": 0.868431738812637, "grad_norm": 5.0755641679869505, "learning_rate": 2.2357890311626328e-07, "loss": 0.4406, "step": 10693 }, { "epoch": 0.8685129537886787, "grad_norm": 4.6801666968241795, "learning_rate": 2.2330714826613586e-07, "loss": 0.3976, "step": 10694 }, { "epoch": 0.8685941687647202, "grad_norm": 5.777100294603406, "learning_rate": 2.2303555095110507e-07, "loss": 0.4059, "step": 10695 }, { "epoch": 0.8686753837407618, "grad_norm": 5.314872055548993, "learning_rate": 2.2276411118996366e-07, "loss": 0.4719, "step": 10696 }, { "epoch": 0.8687565987168033, "grad_norm": 4.410426094256012, "learning_rate": 2.22492829001493e-07, "loss": 0.4898, "step": 10697 }, { "epoch": 0.868837813692845, "grad_norm": 4.7733912494094755, "learning_rate": 2.2222170440446557e-07, "loss": 0.4248, "step": 10698 }, { "epoch": 0.8689190286688866, "grad_norm": 6.898790830573995, "learning_rate": 2.219507374176408e-07, "loss": 0.4269, "step": 10699 }, { "epoch": 0.8690002436449281, "grad_norm": 5.2231893468594635, "learning_rate": 2.2167992805976896e-07, "loss": 0.5232, "step": 10700 }, { "epoch": 0.8690814586209697, "grad_norm": 7.520502969621975, "learning_rate": 2.2140927634958788e-07, "loss": 0.3991, "step": 10701 }, { "epoch": 0.8691626735970113, "grad_norm": 4.652004187169794, "learning_rate": 2.2113878230582615e-07, "loss": 0.5095, "step": 10702 }, { "epoch": 0.8692438885730529, "grad_norm": 3.840996235904483, "learning_rate": 2.2086844594719993e-07, "loss": 0.5209, "step": 10703 }, { "epoch": 0.8693251035490944, "grad_norm": 5.428257976832266, "learning_rate": 2.205982672924145e-07, "loss": 0.4342, "step": 10704 }, { "epoch": 0.8694063185251361, "grad_norm": 12.646126359342446, "learning_rate": 2.203282463601661e-07, "loss": 0.5407, "step": 10705 }, { "epoch": 0.8694875335011776, "grad_norm": 8.151893101320535, "learning_rate": 2.2005838316913746e-07, "loss": 0.3985, "step": 10706 }, { "epoch": 0.8695687484772192, "grad_norm": 3.6426553923017337, "learning_rate": 2.1978867773800205e-07, "loss": 0.6897, "step": 10707 }, { "epoch": 0.8696499634532607, "grad_norm": 5.392872165186957, "learning_rate": 2.1951913008542297e-07, "loss": 0.318, "step": 10708 }, { "epoch": 0.8697311784293024, "grad_norm": 6.652109485767294, "learning_rate": 2.1924974023005086e-07, "loss": 0.396, "step": 10709 }, { "epoch": 0.869812393405344, "grad_norm": 6.329696488288848, "learning_rate": 2.189805081905255e-07, "loss": 0.3915, "step": 10710 }, { "epoch": 0.8698936083813855, "grad_norm": 3.238598078513282, "learning_rate": 2.1871143398547735e-07, "loss": 0.5392, "step": 10711 }, { "epoch": 0.8699748233574272, "grad_norm": 6.202895386014103, "learning_rate": 2.184425176335239e-07, "loss": 0.535, "step": 10712 }, { "epoch": 0.8700560383334687, "grad_norm": 7.1313443357236075, "learning_rate": 2.1817375915327342e-07, "loss": 0.3804, "step": 10713 }, { "epoch": 0.8701372533095103, "grad_norm": 4.686543627354921, "learning_rate": 2.1790515856332268e-07, "loss": 0.485, "step": 10714 }, { "epoch": 0.8702184682855518, "grad_norm": 4.348632743659829, "learning_rate": 2.1763671588225705e-07, "loss": 0.4805, "step": 10715 }, { "epoch": 0.8702996832615935, "grad_norm": 4.845358239649171, "learning_rate": 2.173684311286517e-07, "loss": 0.4082, "step": 10716 }, { "epoch": 0.870380898237635, "grad_norm": 5.562307404701293, "learning_rate": 2.1710030432106982e-07, "loss": 0.5136, "step": 10717 }, { "epoch": 0.8704621132136766, "grad_norm": 15.50221360580663, "learning_rate": 2.1683233547806494e-07, "loss": 0.4594, "step": 10718 }, { "epoch": 0.8705433281897181, "grad_norm": 8.184235968979692, "learning_rate": 2.1656452461817883e-07, "loss": 0.5134, "step": 10719 }, { "epoch": 0.8706245431657598, "grad_norm": 5.086338805312825, "learning_rate": 2.162968717599423e-07, "loss": 0.6206, "step": 10720 }, { "epoch": 0.8707057581418014, "grad_norm": 6.866907411125338, "learning_rate": 2.1602937692187685e-07, "loss": 0.3373, "step": 10721 }, { "epoch": 0.8707869731178429, "grad_norm": 6.472579080212052, "learning_rate": 2.1576204012249053e-07, "loss": 0.4828, "step": 10722 }, { "epoch": 0.8708681880938846, "grad_norm": 6.9868588515529915, "learning_rate": 2.1549486138028125e-07, "loss": 0.537, "step": 10723 }, { "epoch": 0.8709494030699261, "grad_norm": 4.648588890106703, "learning_rate": 2.152278407137376e-07, "loss": 0.4446, "step": 10724 }, { "epoch": 0.8710306180459677, "grad_norm": 5.2378357215552915, "learning_rate": 2.1496097814133503e-07, "loss": 0.4319, "step": 10725 }, { "epoch": 0.8711118330220092, "grad_norm": 4.349980838222139, "learning_rate": 2.146942736815391e-07, "loss": 0.3548, "step": 10726 }, { "epoch": 0.8711930479980509, "grad_norm": 3.1974318652567026, "learning_rate": 2.1442772735280532e-07, "loss": 0.574, "step": 10727 }, { "epoch": 0.8712742629740924, "grad_norm": 6.948883443858628, "learning_rate": 2.1416133917357668e-07, "loss": 0.5418, "step": 10728 }, { "epoch": 0.871355477950134, "grad_norm": 7.741555115120562, "learning_rate": 2.1389510916228513e-07, "loss": 0.5892, "step": 10729 }, { "epoch": 0.8714366929261755, "grad_norm": 4.815397769745156, "learning_rate": 2.136290373373534e-07, "loss": 0.6456, "step": 10730 }, { "epoch": 0.8715179079022172, "grad_norm": 4.733327642778111, "learning_rate": 2.1336312371719182e-07, "loss": 0.5199, "step": 10731 }, { "epoch": 0.8715991228782588, "grad_norm": 6.863949439219225, "learning_rate": 2.130973683201998e-07, "loss": 0.489, "step": 10732 }, { "epoch": 0.8716803378543003, "grad_norm": 6.500756310540661, "learning_rate": 2.128317711647665e-07, "loss": 0.4344, "step": 10733 }, { "epoch": 0.871761552830342, "grad_norm": 7.69706976013433, "learning_rate": 2.125663322692706e-07, "loss": 0.5088, "step": 10734 }, { "epoch": 0.8718427678063835, "grad_norm": 6.331349228694531, "learning_rate": 2.1230105165207848e-07, "loss": 0.6559, "step": 10735 }, { "epoch": 0.8719239827824251, "grad_norm": 5.041214163229692, "learning_rate": 2.120359293315455e-07, "loss": 0.6072, "step": 10736 }, { "epoch": 0.8720051977584666, "grad_norm": 5.582839372557744, "learning_rate": 2.1177096532601777e-07, "loss": 0.4588, "step": 10737 }, { "epoch": 0.8720864127345083, "grad_norm": 7.814578266112515, "learning_rate": 2.115061596538287e-07, "loss": 0.4815, "step": 10738 }, { "epoch": 0.8721676277105498, "grad_norm": 5.583906428631362, "learning_rate": 2.112415123333014e-07, "loss": 0.5404, "step": 10739 }, { "epoch": 0.8722488426865914, "grad_norm": 7.942670374026383, "learning_rate": 2.1097702338274907e-07, "loss": 0.4854, "step": 10740 }, { "epoch": 0.8723300576626329, "grad_norm": 4.655699223151669, "learning_rate": 2.1071269282047196e-07, "loss": 0.6385, "step": 10741 }, { "epoch": 0.8724112726386746, "grad_norm": 5.347652457276179, "learning_rate": 2.1044852066476052e-07, "loss": 0.5986, "step": 10742 }, { "epoch": 0.8724924876147162, "grad_norm": 6.78105342737064, "learning_rate": 2.1018450693389452e-07, "loss": 0.4435, "step": 10743 }, { "epoch": 0.8725737025907577, "grad_norm": 4.350453860498677, "learning_rate": 2.099206516461419e-07, "loss": 0.5274, "step": 10744 }, { "epoch": 0.8726549175667994, "grad_norm": 4.768332070430383, "learning_rate": 2.096569548197594e-07, "loss": 0.551, "step": 10745 }, { "epoch": 0.8727361325428409, "grad_norm": 6.078372849255497, "learning_rate": 2.0939341647299437e-07, "loss": 0.4384, "step": 10746 }, { "epoch": 0.8728173475188825, "grad_norm": 11.608495384238454, "learning_rate": 2.0913003662408254e-07, "loss": 0.6075, "step": 10747 }, { "epoch": 0.872898562494924, "grad_norm": 8.027806223001297, "learning_rate": 2.0886681529124765e-07, "loss": 0.4888, "step": 10748 }, { "epoch": 0.8729797774709657, "grad_norm": 4.082351604317076, "learning_rate": 2.086037524927037e-07, "loss": 0.5281, "step": 10749 }, { "epoch": 0.8730609924470072, "grad_norm": 5.82956001643256, "learning_rate": 2.0834084824665314e-07, "loss": 0.5854, "step": 10750 }, { "epoch": 0.8731422074230488, "grad_norm": 9.953252455414237, "learning_rate": 2.0807810257128692e-07, "loss": 0.4345, "step": 10751 }, { "epoch": 0.8732234223990903, "grad_norm": 5.897867057523133, "learning_rate": 2.0781551548478607e-07, "loss": 0.5252, "step": 10752 }, { "epoch": 0.873304637375132, "grad_norm": 5.203890577661351, "learning_rate": 2.0755308700532077e-07, "loss": 0.3069, "step": 10753 }, { "epoch": 0.8733858523511736, "grad_norm": 9.388507828923037, "learning_rate": 2.0729081715104958e-07, "loss": 0.3058, "step": 10754 }, { "epoch": 0.8734670673272151, "grad_norm": 4.665994646948376, "learning_rate": 2.070287059401191e-07, "loss": 0.4658, "step": 10755 }, { "epoch": 0.8735482823032568, "grad_norm": 4.509183704578975, "learning_rate": 2.0676675339066726e-07, "loss": 0.4733, "step": 10756 }, { "epoch": 0.8736294972792983, "grad_norm": 4.982775473415814, "learning_rate": 2.0650495952081935e-07, "loss": 0.388, "step": 10757 }, { "epoch": 0.8737107122553399, "grad_norm": 3.915834558563621, "learning_rate": 2.062433243486897e-07, "loss": 0.4357, "step": 10758 }, { "epoch": 0.8737919272313814, "grad_norm": 5.779273705619936, "learning_rate": 2.059818478923825e-07, "loss": 0.4962, "step": 10759 }, { "epoch": 0.8738731422074231, "grad_norm": 7.124975275931914, "learning_rate": 2.0572053016999079e-07, "loss": 0.4915, "step": 10760 }, { "epoch": 0.8739543571834646, "grad_norm": 4.688739212653256, "learning_rate": 2.0545937119959557e-07, "loss": 0.5523, "step": 10761 }, { "epoch": 0.8740355721595062, "grad_norm": 6.512142297147514, "learning_rate": 2.0519837099926888e-07, "loss": 0.4864, "step": 10762 }, { "epoch": 0.8741167871355477, "grad_norm": 3.968204310846973, "learning_rate": 2.0493752958706982e-07, "loss": 0.4545, "step": 10763 }, { "epoch": 0.8741980021115894, "grad_norm": 5.589051586917989, "learning_rate": 2.0467684698104674e-07, "loss": 0.3359, "step": 10764 }, { "epoch": 0.874279217087631, "grad_norm": 9.371561135244747, "learning_rate": 2.0441632319923798e-07, "loss": 0.3767, "step": 10765 }, { "epoch": 0.8743604320636725, "grad_norm": 6.212854338465917, "learning_rate": 2.0415595825967084e-07, "loss": 0.5425, "step": 10766 }, { "epoch": 0.8744416470397142, "grad_norm": 4.1144373279605135, "learning_rate": 2.0389575218036057e-07, "loss": 0.6139, "step": 10767 }, { "epoch": 0.8745228620157557, "grad_norm": 4.023569708161643, "learning_rate": 2.0363570497931252e-07, "loss": 0.4479, "step": 10768 }, { "epoch": 0.8746040769917973, "grad_norm": 3.8449587292031686, "learning_rate": 2.0337581667452034e-07, "loss": 0.5775, "step": 10769 }, { "epoch": 0.8746852919678388, "grad_norm": 6.240516560084869, "learning_rate": 2.0311608728396658e-07, "loss": 0.5565, "step": 10770 }, { "epoch": 0.8747665069438805, "grad_norm": 7.754381694503205, "learning_rate": 2.0285651682562357e-07, "loss": 0.4689, "step": 10771 }, { "epoch": 0.874847721919922, "grad_norm": 5.711700208395236, "learning_rate": 2.0259710531745247e-07, "loss": 0.4805, "step": 10772 }, { "epoch": 0.8749289368959636, "grad_norm": 5.1893458798213326, "learning_rate": 2.023378527774028e-07, "loss": 0.4845, "step": 10773 }, { "epoch": 0.8750101518720051, "grad_norm": 7.133320338283911, "learning_rate": 2.020787592234133e-07, "loss": 0.4978, "step": 10774 }, { "epoch": 0.8750913668480468, "grad_norm": 22.519170871083364, "learning_rate": 2.0181982467341238e-07, "loss": 0.4116, "step": 10775 }, { "epoch": 0.8751725818240884, "grad_norm": 9.287767664431676, "learning_rate": 2.0156104914531656e-07, "loss": 0.4641, "step": 10776 }, { "epoch": 0.8752537968001299, "grad_norm": 4.893583526059918, "learning_rate": 2.0130243265703148e-07, "loss": 0.481, "step": 10777 }, { "epoch": 0.8753350117761716, "grad_norm": 5.320206234300975, "learning_rate": 2.010439752264523e-07, "loss": 0.3727, "step": 10778 }, { "epoch": 0.8754162267522131, "grad_norm": 4.677489372940032, "learning_rate": 2.0078567687146333e-07, "loss": 0.447, "step": 10779 }, { "epoch": 0.8754974417282547, "grad_norm": 4.0249196656831465, "learning_rate": 2.0052753760993693e-07, "loss": 0.4569, "step": 10780 }, { "epoch": 0.8755786567042962, "grad_norm": 6.529303245559503, "learning_rate": 2.002695574597352e-07, "loss": 0.3912, "step": 10781 }, { "epoch": 0.8756598716803379, "grad_norm": 5.418332568962745, "learning_rate": 2.0001173643870915e-07, "loss": 0.5883, "step": 10782 }, { "epoch": 0.8757410866563794, "grad_norm": 5.33583924657257, "learning_rate": 1.9975407456469808e-07, "loss": 0.4482, "step": 10783 }, { "epoch": 0.875822301632421, "grad_norm": 5.518754450917286, "learning_rate": 1.9949657185553113e-07, "loss": 0.4846, "step": 10784 }, { "epoch": 0.8759035166084626, "grad_norm": 4.466728527582831, "learning_rate": 1.992392283290265e-07, "loss": 0.4897, "step": 10785 }, { "epoch": 0.8759847315845042, "grad_norm": 6.062215529147515, "learning_rate": 1.9898204400299021e-07, "loss": 0.4688, "step": 10786 }, { "epoch": 0.8760659465605458, "grad_norm": 6.196645398811499, "learning_rate": 1.9872501889521916e-07, "loss": 0.4591, "step": 10787 }, { "epoch": 0.8761471615365873, "grad_norm": 5.38690541555504, "learning_rate": 1.984681530234972e-07, "loss": 0.3928, "step": 10788 }, { "epoch": 0.876228376512629, "grad_norm": 9.821582116837323, "learning_rate": 1.9821144640559842e-07, "loss": 0.4919, "step": 10789 }, { "epoch": 0.8763095914886705, "grad_norm": 7.245646513525235, "learning_rate": 1.9795489905928527e-07, "loss": 0.459, "step": 10790 }, { "epoch": 0.8763908064647121, "grad_norm": 7.6962316221213385, "learning_rate": 1.976985110023094e-07, "loss": 0.4777, "step": 10791 }, { "epoch": 0.8764720214407536, "grad_norm": 4.354060604688436, "learning_rate": 1.9744228225241248e-07, "loss": 0.4522, "step": 10792 }, { "epoch": 0.8765532364167953, "grad_norm": 4.660763544694212, "learning_rate": 1.9718621282732302e-07, "loss": 0.4305, "step": 10793 }, { "epoch": 0.8766344513928368, "grad_norm": 6.796070237851726, "learning_rate": 1.9693030274476054e-07, "loss": 0.4301, "step": 10794 }, { "epoch": 0.8767156663688784, "grad_norm": 6.8550929057120635, "learning_rate": 1.9667455202243223e-07, "loss": 0.4308, "step": 10795 }, { "epoch": 0.87679688134492, "grad_norm": 10.526313953549316, "learning_rate": 1.9641896067803452e-07, "loss": 0.4715, "step": 10796 }, { "epoch": 0.8768780963209616, "grad_norm": 4.6047250304925615, "learning_rate": 1.9616352872925293e-07, "loss": 0.529, "step": 10797 }, { "epoch": 0.8769593112970032, "grad_norm": 6.880938367850307, "learning_rate": 1.959082561937628e-07, "loss": 0.5294, "step": 10798 }, { "epoch": 0.8770405262730447, "grad_norm": 3.8531909306263272, "learning_rate": 1.9565314308922666e-07, "loss": 0.4086, "step": 10799 }, { "epoch": 0.8771217412490864, "grad_norm": 5.451866169993255, "learning_rate": 1.9539818943329792e-07, "loss": 0.4557, "step": 10800 }, { "epoch": 0.8772029562251279, "grad_norm": 6.999546249788755, "learning_rate": 1.9514339524361742e-07, "loss": 0.4339, "step": 10801 }, { "epoch": 0.8772841712011695, "grad_norm": 5.662909641489243, "learning_rate": 1.9488876053781552e-07, "loss": 0.4453, "step": 10802 }, { "epoch": 0.877365386177211, "grad_norm": 4.898755716797307, "learning_rate": 1.9463428533351202e-07, "loss": 0.6062, "step": 10803 }, { "epoch": 0.8774466011532527, "grad_norm": 4.183435356460552, "learning_rate": 1.943799696483145e-07, "loss": 0.4585, "step": 10804 }, { "epoch": 0.8775278161292942, "grad_norm": 4.396794358678346, "learning_rate": 1.9412581349982113e-07, "loss": 0.4212, "step": 10805 }, { "epoch": 0.8776090311053358, "grad_norm": 4.627929173347972, "learning_rate": 1.938718169056175e-07, "loss": 0.5645, "step": 10806 }, { "epoch": 0.8776902460813774, "grad_norm": 4.99764782498259, "learning_rate": 1.9361797988327961e-07, "loss": 0.6663, "step": 10807 }, { "epoch": 0.877771461057419, "grad_norm": 3.9890801417363937, "learning_rate": 1.933643024503712e-07, "loss": 0.5455, "step": 10808 }, { "epoch": 0.8778526760334606, "grad_norm": 5.065903978646239, "learning_rate": 1.9311078462444484e-07, "loss": 0.633, "step": 10809 }, { "epoch": 0.8779338910095021, "grad_norm": 7.184247053486219, "learning_rate": 1.928574264230429e-07, "loss": 0.3581, "step": 10810 }, { "epoch": 0.8780151059855438, "grad_norm": 7.648845912445987, "learning_rate": 1.9260422786369747e-07, "loss": 0.3958, "step": 10811 }, { "epoch": 0.8780963209615853, "grad_norm": 2.778368867161788, "learning_rate": 1.9235118896392706e-07, "loss": 0.3277, "step": 10812 }, { "epoch": 0.8781775359376269, "grad_norm": 4.8097717739494685, "learning_rate": 1.9209830974124183e-07, "loss": 0.4247, "step": 10813 }, { "epoch": 0.8782587509136685, "grad_norm": 10.137967528923353, "learning_rate": 1.9184559021313914e-07, "loss": 0.4587, "step": 10814 }, { "epoch": 0.8783399658897101, "grad_norm": 4.796708634400599, "learning_rate": 1.9159303039710558e-07, "loss": 0.4826, "step": 10815 }, { "epoch": 0.8784211808657516, "grad_norm": 9.314388410278989, "learning_rate": 1.9134063031061744e-07, "loss": 0.516, "step": 10816 }, { "epoch": 0.8785023958417932, "grad_norm": 7.232628959274363, "learning_rate": 1.910883899711391e-07, "loss": 0.5164, "step": 10817 }, { "epoch": 0.8785836108178348, "grad_norm": 3.9022829908795322, "learning_rate": 1.9083630939612407e-07, "loss": 0.5444, "step": 10818 }, { "epoch": 0.8786648257938764, "grad_norm": 3.8942957114269627, "learning_rate": 1.9058438860301621e-07, "loss": 0.3569, "step": 10819 }, { "epoch": 0.878746040769918, "grad_norm": 3.166404623353523, "learning_rate": 1.9033262760924598e-07, "loss": 0.4069, "step": 10820 }, { "epoch": 0.8788272557459595, "grad_norm": 5.035265769137359, "learning_rate": 1.900810264322339e-07, "loss": 0.3724, "step": 10821 }, { "epoch": 0.8789084707220012, "grad_norm": 6.137216581765903, "learning_rate": 1.8982958508938998e-07, "loss": 0.4983, "step": 10822 }, { "epoch": 0.8789896856980427, "grad_norm": 6.449621491427784, "learning_rate": 1.895783035981119e-07, "loss": 0.5643, "step": 10823 }, { "epoch": 0.8790709006740843, "grad_norm": 7.017513815097841, "learning_rate": 1.8932718197578802e-07, "loss": 0.4371, "step": 10824 }, { "epoch": 0.8791521156501259, "grad_norm": 4.915420085875843, "learning_rate": 1.890762202397936e-07, "loss": 0.7439, "step": 10825 }, { "epoch": 0.8792333306261675, "grad_norm": 5.057095171528424, "learning_rate": 1.8882541840749475e-07, "loss": 0.5297, "step": 10826 }, { "epoch": 0.879314545602209, "grad_norm": 4.992472857392407, "learning_rate": 1.8857477649624533e-07, "loss": 0.5945, "step": 10827 }, { "epoch": 0.8793957605782506, "grad_norm": 8.570114515507747, "learning_rate": 1.883242945233879e-07, "loss": 0.5583, "step": 10828 }, { "epoch": 0.8794769755542922, "grad_norm": 4.881453778874329, "learning_rate": 1.8807397250625497e-07, "loss": 0.3574, "step": 10829 }, { "epoch": 0.8795581905303338, "grad_norm": 9.774498600782247, "learning_rate": 1.878238104621677e-07, "loss": 0.2625, "step": 10830 }, { "epoch": 0.8796394055063754, "grad_norm": 5.93121202645404, "learning_rate": 1.8757380840843526e-07, "loss": 0.4918, "step": 10831 }, { "epoch": 0.879720620482417, "grad_norm": 3.681222565715722, "learning_rate": 1.8732396636235744e-07, "loss": 0.6349, "step": 10832 }, { "epoch": 0.8798018354584586, "grad_norm": 6.108142266361864, "learning_rate": 1.8707428434122155e-07, "loss": 0.576, "step": 10833 }, { "epoch": 0.8798830504345001, "grad_norm": 6.054085118997518, "learning_rate": 1.8682476236230372e-07, "loss": 0.4904, "step": 10834 }, { "epoch": 0.8799642654105417, "grad_norm": 7.843292571737394, "learning_rate": 1.8657540044287047e-07, "loss": 0.3619, "step": 10835 }, { "epoch": 0.8800454803865833, "grad_norm": 3.6574632924680044, "learning_rate": 1.8632619860017547e-07, "loss": 0.3475, "step": 10836 }, { "epoch": 0.8801266953626249, "grad_norm": 5.74599899447566, "learning_rate": 1.8607715685146244e-07, "loss": 0.463, "step": 10837 }, { "epoch": 0.8802079103386664, "grad_norm": 4.130892724817378, "learning_rate": 1.8582827521396453e-07, "loss": 0.5069, "step": 10838 }, { "epoch": 0.880289125314708, "grad_norm": 6.536393003461804, "learning_rate": 1.855795537049021e-07, "loss": 0.4378, "step": 10839 }, { "epoch": 0.8803703402907496, "grad_norm": 5.520938271169849, "learning_rate": 1.853309923414856e-07, "loss": 0.481, "step": 10840 }, { "epoch": 0.8804515552667912, "grad_norm": 13.77624681570704, "learning_rate": 1.8508259114091432e-07, "loss": 0.5875, "step": 10841 }, { "epoch": 0.8805327702428328, "grad_norm": 4.241071474476681, "learning_rate": 1.8483435012037587e-07, "loss": 0.3521, "step": 10842 }, { "epoch": 0.8806139852188744, "grad_norm": 8.981810629766965, "learning_rate": 1.8458626929704821e-07, "loss": 0.4218, "step": 10843 }, { "epoch": 0.880695200194916, "grad_norm": 4.1379268820071236, "learning_rate": 1.843383486880959e-07, "loss": 0.6531, "step": 10844 }, { "epoch": 0.8807764151709575, "grad_norm": 4.30762221426034, "learning_rate": 1.840905883106747e-07, "loss": 0.6161, "step": 10845 }, { "epoch": 0.8808576301469991, "grad_norm": 6.207428929843423, "learning_rate": 1.8384298818192814e-07, "loss": 0.4597, "step": 10846 }, { "epoch": 0.8809388451230407, "grad_norm": 6.194628295310705, "learning_rate": 1.835955483189883e-07, "loss": 0.4932, "step": 10847 }, { "epoch": 0.8810200600990823, "grad_norm": 4.398170364958908, "learning_rate": 1.833482687389776e-07, "loss": 0.4016, "step": 10848 }, { "epoch": 0.8811012750751238, "grad_norm": 6.570347236455752, "learning_rate": 1.831011494590054e-07, "loss": 0.4765, "step": 10849 }, { "epoch": 0.8811824900511654, "grad_norm": 5.039000129540002, "learning_rate": 1.828541904961717e-07, "loss": 0.4721, "step": 10850 }, { "epoch": 0.881263705027207, "grad_norm": 6.470581102201411, "learning_rate": 1.8260739186756527e-07, "loss": 0.7508, "step": 10851 }, { "epoch": 0.8813449200032486, "grad_norm": 4.439815787899128, "learning_rate": 1.8236075359026246e-07, "loss": 0.4522, "step": 10852 }, { "epoch": 0.8814261349792902, "grad_norm": 6.875926190964903, "learning_rate": 1.8211427568132932e-07, "loss": 0.4571, "step": 10853 }, { "epoch": 0.8815073499553318, "grad_norm": 5.042487513250472, "learning_rate": 1.8186795815782143e-07, "loss": 0.4715, "step": 10854 }, { "epoch": 0.8815885649313734, "grad_norm": 6.319914060990893, "learning_rate": 1.8162180103678177e-07, "loss": 0.357, "step": 10855 }, { "epoch": 0.8816697799074149, "grad_norm": 5.98665836166037, "learning_rate": 1.813758043352437e-07, "loss": 0.6732, "step": 10856 }, { "epoch": 0.8817509948834565, "grad_norm": 4.90779966317718, "learning_rate": 1.8112996807022943e-07, "loss": 0.4064, "step": 10857 }, { "epoch": 0.8818322098594981, "grad_norm": 4.954078590442837, "learning_rate": 1.8088429225874865e-07, "loss": 0.5962, "step": 10858 }, { "epoch": 0.8819134248355397, "grad_norm": 7.208628926964807, "learning_rate": 1.8063877691780114e-07, "loss": 0.4485, "step": 10859 }, { "epoch": 0.8819946398115812, "grad_norm": 5.399228378212703, "learning_rate": 1.8039342206437494e-07, "loss": 0.6773, "step": 10860 }, { "epoch": 0.8820758547876228, "grad_norm": 6.733238765318234, "learning_rate": 1.8014822771544787e-07, "loss": 0.5487, "step": 10861 }, { "epoch": 0.8821570697636644, "grad_norm": 6.204508713784901, "learning_rate": 1.7990319388798527e-07, "loss": 0.379, "step": 10862 }, { "epoch": 0.882238284739706, "grad_norm": 4.9418711968437545, "learning_rate": 1.79658320598943e-07, "loss": 0.4973, "step": 10863 }, { "epoch": 0.8823194997157476, "grad_norm": 7.6609348179930805, "learning_rate": 1.79413607865265e-07, "loss": 0.5288, "step": 10864 }, { "epoch": 0.8824007146917892, "grad_norm": 5.302913931640557, "learning_rate": 1.7916905570388387e-07, "loss": 0.4702, "step": 10865 }, { "epoch": 0.8824819296678308, "grad_norm": 5.228383412229427, "learning_rate": 1.7892466413172076e-07, "loss": 0.3797, "step": 10866 }, { "epoch": 0.8825631446438723, "grad_norm": 4.948041110181531, "learning_rate": 1.7868043316568718e-07, "loss": 0.3931, "step": 10867 }, { "epoch": 0.8826443596199139, "grad_norm": 8.069840917352794, "learning_rate": 1.784363628226818e-07, "loss": 0.4122, "step": 10868 }, { "epoch": 0.8827255745959555, "grad_norm": 5.771520601557648, "learning_rate": 1.781924531195933e-07, "loss": 0.4682, "step": 10869 }, { "epoch": 0.8828067895719971, "grad_norm": 5.964777339901852, "learning_rate": 1.7794870407329968e-07, "loss": 0.3881, "step": 10870 }, { "epoch": 0.8828880045480386, "grad_norm": 17.63419248026244, "learning_rate": 1.7770511570066622e-07, "loss": 0.4282, "step": 10871 }, { "epoch": 0.8829692195240803, "grad_norm": 4.937868582682188, "learning_rate": 1.7746168801854786e-07, "loss": 0.4074, "step": 10872 }, { "epoch": 0.8830504345001218, "grad_norm": 7.45687522217002, "learning_rate": 1.772184210437894e-07, "loss": 0.3702, "step": 10873 }, { "epoch": 0.8831316494761634, "grad_norm": 4.456345546727624, "learning_rate": 1.7697531479322294e-07, "loss": 0.4297, "step": 10874 }, { "epoch": 0.883212864452205, "grad_norm": 4.869647315752035, "learning_rate": 1.7673236928366976e-07, "loss": 0.4171, "step": 10875 }, { "epoch": 0.8832940794282466, "grad_norm": 4.695411262956829, "learning_rate": 1.7648958453194086e-07, "loss": 0.4707, "step": 10876 }, { "epoch": 0.8833752944042882, "grad_norm": 7.882022541168915, "learning_rate": 1.7624696055483643e-07, "loss": 0.4886, "step": 10877 }, { "epoch": 0.8834565093803297, "grad_norm": 7.071815658627316, "learning_rate": 1.7600449736914384e-07, "loss": 0.4957, "step": 10878 }, { "epoch": 0.8835377243563713, "grad_norm": 5.21615020327953, "learning_rate": 1.7576219499163995e-07, "loss": 0.395, "step": 10879 }, { "epoch": 0.8836189393324129, "grad_norm": 3.7677226715193015, "learning_rate": 1.7552005343909162e-07, "loss": 0.6283, "step": 10880 }, { "epoch": 0.8837001543084545, "grad_norm": 6.221136511822739, "learning_rate": 1.7527807272825326e-07, "loss": 0.4107, "step": 10881 }, { "epoch": 0.883781369284496, "grad_norm": 4.483843051687907, "learning_rate": 1.7503625287586896e-07, "loss": 0.4108, "step": 10882 }, { "epoch": 0.8838625842605377, "grad_norm": 5.371900604040972, "learning_rate": 1.7479459389867141e-07, "loss": 0.463, "step": 10883 }, { "epoch": 0.8839437992365792, "grad_norm": 4.729136507795324, "learning_rate": 1.7455309581338204e-07, "loss": 0.5657, "step": 10884 }, { "epoch": 0.8840250142126208, "grad_norm": 5.952224993059543, "learning_rate": 1.7431175863671102e-07, "loss": 0.462, "step": 10885 }, { "epoch": 0.8841062291886624, "grad_norm": 5.332245081683401, "learning_rate": 1.740705823853578e-07, "loss": 0.5023, "step": 10886 }, { "epoch": 0.884187444164704, "grad_norm": 4.301376869737246, "learning_rate": 1.7382956707601068e-07, "loss": 0.6023, "step": 10887 }, { "epoch": 0.8842686591407456, "grad_norm": 7.399999113034231, "learning_rate": 1.7358871272534604e-07, "loss": 0.383, "step": 10888 }, { "epoch": 0.8843498741167871, "grad_norm": 6.01935069429763, "learning_rate": 1.7334801935003003e-07, "loss": 0.3966, "step": 10889 }, { "epoch": 0.8844310890928287, "grad_norm": 12.267510974208244, "learning_rate": 1.7310748696671791e-07, "loss": 0.4748, "step": 10890 }, { "epoch": 0.8845123040688703, "grad_norm": 4.471683226123735, "learning_rate": 1.728671155920525e-07, "loss": 0.557, "step": 10891 }, { "epoch": 0.8845935190449119, "grad_norm": 4.5797052007927626, "learning_rate": 1.7262690524266658e-07, "loss": 0.3825, "step": 10892 }, { "epoch": 0.8846747340209534, "grad_norm": 5.819985449696891, "learning_rate": 1.7238685593518157e-07, "loss": 0.4803, "step": 10893 }, { "epoch": 0.8847559489969951, "grad_norm": 12.610582423388232, "learning_rate": 1.7214696768620699e-07, "loss": 0.4021, "step": 10894 }, { "epoch": 0.8848371639730366, "grad_norm": 5.295182105134932, "learning_rate": 1.719072405123423e-07, "loss": 0.4256, "step": 10895 }, { "epoch": 0.8849183789490782, "grad_norm": 4.518250999036192, "learning_rate": 1.7166767443017567e-07, "loss": 0.4549, "step": 10896 }, { "epoch": 0.8849995939251198, "grad_norm": 3.165277818453232, "learning_rate": 1.7142826945628353e-07, "loss": 0.7135, "step": 10897 }, { "epoch": 0.8850808089011614, "grad_norm": 14.285562335378861, "learning_rate": 1.7118902560723072e-07, "loss": 0.5167, "step": 10898 }, { "epoch": 0.885162023877203, "grad_norm": 5.520164322684935, "learning_rate": 1.7094994289957285e-07, "loss": 0.4811, "step": 10899 }, { "epoch": 0.8852432388532445, "grad_norm": 6.172181533567202, "learning_rate": 1.7071102134985224e-07, "loss": 0.4762, "step": 10900 }, { "epoch": 0.8853244538292862, "grad_norm": 4.846940632513523, "learning_rate": 1.7047226097460123e-07, "loss": 0.3914, "step": 10901 }, { "epoch": 0.8854056688053277, "grad_norm": 3.9531646427451927, "learning_rate": 1.7023366179034135e-07, "loss": 0.4715, "step": 10902 }, { "epoch": 0.8854868837813693, "grad_norm": 7.419094636905311, "learning_rate": 1.6999522381358187e-07, "loss": 0.3049, "step": 10903 }, { "epoch": 0.8855680987574108, "grad_norm": 8.695140393852423, "learning_rate": 1.6975694706082125e-07, "loss": 0.4374, "step": 10904 }, { "epoch": 0.8856493137334525, "grad_norm": 4.688891686843388, "learning_rate": 1.6951883154854771e-07, "loss": 0.3473, "step": 10905 }, { "epoch": 0.885730528709494, "grad_norm": 9.242824312960995, "learning_rate": 1.6928087729323695e-07, "loss": 0.4753, "step": 10906 }, { "epoch": 0.8858117436855356, "grad_norm": 21.34205228899008, "learning_rate": 1.6904308431135414e-07, "loss": 0.4638, "step": 10907 }, { "epoch": 0.8858929586615772, "grad_norm": 7.234575319695479, "learning_rate": 1.6880545261935333e-07, "loss": 0.4495, "step": 10908 }, { "epoch": 0.8859741736376188, "grad_norm": 4.584768372230779, "learning_rate": 1.6856798223367777e-07, "loss": 0.4903, "step": 10909 }, { "epoch": 0.8860553886136604, "grad_norm": 3.8405465855706966, "learning_rate": 1.6833067317075875e-07, "loss": 0.5838, "step": 10910 }, { "epoch": 0.8861366035897019, "grad_norm": 11.336953000753638, "learning_rate": 1.680935254470173e-07, "loss": 0.3647, "step": 10911 }, { "epoch": 0.8862178185657436, "grad_norm": 3.4138677524491783, "learning_rate": 1.6785653907886251e-07, "loss": 0.4593, "step": 10912 }, { "epoch": 0.8862990335417851, "grad_norm": 7.281119584083047, "learning_rate": 1.6761971408269184e-07, "loss": 0.4414, "step": 10913 }, { "epoch": 0.8863802485178267, "grad_norm": 8.096587754790656, "learning_rate": 1.673830504748933e-07, "loss": 0.5625, "step": 10914 }, { "epoch": 0.8864614634938682, "grad_norm": 4.3613469827373725, "learning_rate": 1.6714654827184263e-07, "loss": 0.6731, "step": 10915 }, { "epoch": 0.8865426784699099, "grad_norm": 36.26226393417871, "learning_rate": 1.6691020748990455e-07, "loss": 0.4585, "step": 10916 }, { "epoch": 0.8866238934459514, "grad_norm": 4.750194227102958, "learning_rate": 1.6667402814543209e-07, "loss": 0.5141, "step": 10917 }, { "epoch": 0.886705108421993, "grad_norm": 7.5959663569614495, "learning_rate": 1.66438010254768e-07, "loss": 0.6047, "step": 10918 }, { "epoch": 0.8867863233980346, "grad_norm": 6.985478995191811, "learning_rate": 1.662021538342437e-07, "loss": 0.5965, "step": 10919 }, { "epoch": 0.8868675383740762, "grad_norm": 4.967674400774454, "learning_rate": 1.6596645890017832e-07, "loss": 0.4358, "step": 10920 }, { "epoch": 0.8869487533501178, "grad_norm": 4.562428672417342, "learning_rate": 1.6573092546888132e-07, "loss": 0.4619, "step": 10921 }, { "epoch": 0.8870299683261593, "grad_norm": 4.147461101360203, "learning_rate": 1.6549555355665076e-07, "loss": 0.452, "step": 10922 }, { "epoch": 0.887111183302201, "grad_norm": 3.695271801611688, "learning_rate": 1.6526034317977225e-07, "loss": 0.5393, "step": 10923 }, { "epoch": 0.8871923982782425, "grad_norm": 4.837405408033375, "learning_rate": 1.650252943545222e-07, "loss": 0.4885, "step": 10924 }, { "epoch": 0.8872736132542841, "grad_norm": 4.704508929048984, "learning_rate": 1.647904070971637e-07, "loss": 0.5001, "step": 10925 }, { "epoch": 0.8873548282303256, "grad_norm": 3.9115802054782427, "learning_rate": 1.645556814239499e-07, "loss": 0.4963, "step": 10926 }, { "epoch": 0.8874360432063673, "grad_norm": 7.938004047544957, "learning_rate": 1.6432111735112277e-07, "loss": 0.4367, "step": 10927 }, { "epoch": 0.8875172581824088, "grad_norm": 12.387873573548433, "learning_rate": 1.6408671489491323e-07, "loss": 0.3794, "step": 10928 }, { "epoch": 0.8875984731584504, "grad_norm": 7.148993736986469, "learning_rate": 1.6385247407154025e-07, "loss": 0.4554, "step": 10929 }, { "epoch": 0.887679688134492, "grad_norm": 6.143994601377806, "learning_rate": 1.6361839489721227e-07, "loss": 0.46, "step": 10930 }, { "epoch": 0.8877609031105336, "grad_norm": 21.816971684053996, "learning_rate": 1.6338447738812628e-07, "loss": 0.6431, "step": 10931 }, { "epoch": 0.8878421180865752, "grad_norm": 6.378758988903642, "learning_rate": 1.631507215604683e-07, "loss": 0.4062, "step": 10932 }, { "epoch": 0.8879233330626167, "grad_norm": 8.818910417308302, "learning_rate": 1.6291712743041226e-07, "loss": 0.4481, "step": 10933 }, { "epoch": 0.8880045480386584, "grad_norm": 3.3592851595714537, "learning_rate": 1.6268369501412195e-07, "loss": 0.6451, "step": 10934 }, { "epoch": 0.8880857630146999, "grad_norm": 3.537250723511631, "learning_rate": 1.6245042432775054e-07, "loss": 0.4928, "step": 10935 }, { "epoch": 0.8881669779907415, "grad_norm": 6.147519517852104, "learning_rate": 1.622173153874379e-07, "loss": 0.5083, "step": 10936 }, { "epoch": 0.888248192966783, "grad_norm": 3.472301797973062, "learning_rate": 1.61984368209315e-07, "loss": 0.4876, "step": 10937 }, { "epoch": 0.8883294079428247, "grad_norm": 4.755539554685603, "learning_rate": 1.617515828095001e-07, "loss": 0.3319, "step": 10938 }, { "epoch": 0.8884106229188662, "grad_norm": 9.123690619357854, "learning_rate": 1.615189592041e-07, "loss": 0.5498, "step": 10939 }, { "epoch": 0.8884918378949078, "grad_norm": 5.947150990803091, "learning_rate": 1.6128649740921182e-07, "loss": 0.4218, "step": 10940 }, { "epoch": 0.8885730528709495, "grad_norm": 6.785328375370574, "learning_rate": 1.6105419744092105e-07, "loss": 0.5246, "step": 10941 }, { "epoch": 0.888654267846991, "grad_norm": 7.518002211907693, "learning_rate": 1.6082205931530064e-07, "loss": 0.4103, "step": 10942 }, { "epoch": 0.8887354828230326, "grad_norm": 5.241816204201098, "learning_rate": 1.6059008304841417e-07, "loss": 0.422, "step": 10943 }, { "epoch": 0.8888166977990741, "grad_norm": 4.9351607203741175, "learning_rate": 1.6035826865631292e-07, "loss": 0.4762, "step": 10944 }, { "epoch": 0.8888979127751158, "grad_norm": 19.003799619323694, "learning_rate": 1.601266161550366e-07, "loss": 0.6662, "step": 10945 }, { "epoch": 0.8889791277511573, "grad_norm": 3.8825556090436786, "learning_rate": 1.5989512556061516e-07, "loss": 0.6389, "step": 10946 }, { "epoch": 0.8890603427271989, "grad_norm": 7.947300455523199, "learning_rate": 1.5966379688906576e-07, "loss": 0.4424, "step": 10947 }, { "epoch": 0.8891415577032404, "grad_norm": 5.795613017876368, "learning_rate": 1.5943263015639614e-07, "loss": 0.4964, "step": 10948 }, { "epoch": 0.8892227726792821, "grad_norm": 6.8505646268664515, "learning_rate": 1.592016253786008e-07, "loss": 0.6119, "step": 10949 }, { "epoch": 0.8893039876553236, "grad_norm": 8.889358889295613, "learning_rate": 1.5897078257166492e-07, "loss": 0.4562, "step": 10950 }, { "epoch": 0.8893852026313652, "grad_norm": 5.6468959215119225, "learning_rate": 1.5874010175156106e-07, "loss": 0.4509, "step": 10951 }, { "epoch": 0.8894664176074069, "grad_norm": 5.671002281059167, "learning_rate": 1.585095829342509e-07, "loss": 0.3751, "step": 10952 }, { "epoch": 0.8895476325834484, "grad_norm": 7.946535661783076, "learning_rate": 1.5827922613568524e-07, "loss": 0.4659, "step": 10953 }, { "epoch": 0.88962884755949, "grad_norm": 5.9522004926872025, "learning_rate": 1.5804903137180415e-07, "loss": 0.3708, "step": 10954 }, { "epoch": 0.8897100625355315, "grad_norm": 10.178045190705534, "learning_rate": 1.5781899865853544e-07, "loss": 0.5587, "step": 10955 }, { "epoch": 0.8897912775115732, "grad_norm": 4.253031697214437, "learning_rate": 1.5758912801179637e-07, "loss": 0.4611, "step": 10956 }, { "epoch": 0.8898724924876147, "grad_norm": 6.692152756992295, "learning_rate": 1.5735941944749255e-07, "loss": 0.4811, "step": 10957 }, { "epoch": 0.8899537074636563, "grad_norm": 4.694878917526122, "learning_rate": 1.571298729815182e-07, "loss": 0.3912, "step": 10958 }, { "epoch": 0.8900349224396978, "grad_norm": 5.740301752503594, "learning_rate": 1.569004886297576e-07, "loss": 0.5173, "step": 10959 }, { "epoch": 0.8901161374157395, "grad_norm": 4.500280402580769, "learning_rate": 1.5667126640808216e-07, "loss": 0.5058, "step": 10960 }, { "epoch": 0.890197352391781, "grad_norm": 9.359821537794131, "learning_rate": 1.564422063323534e-07, "loss": 0.4839, "step": 10961 }, { "epoch": 0.8902785673678226, "grad_norm": 11.943572724510128, "learning_rate": 1.5621330841842086e-07, "loss": 0.3788, "step": 10962 }, { "epoch": 0.8903597823438643, "grad_norm": 5.141666870810457, "learning_rate": 1.5598457268212353e-07, "loss": 0.4711, "step": 10963 }, { "epoch": 0.8904409973199058, "grad_norm": 3.330373351694768, "learning_rate": 1.5575599913928735e-07, "loss": 0.3897, "step": 10964 }, { "epoch": 0.8905222122959474, "grad_norm": 3.825856052673269, "learning_rate": 1.5552758780572995e-07, "loss": 0.4227, "step": 10965 }, { "epoch": 0.8906034272719889, "grad_norm": 5.142764433332306, "learning_rate": 1.552993386972551e-07, "loss": 0.4816, "step": 10966 }, { "epoch": 0.8906846422480306, "grad_norm": 6.3600498260391785, "learning_rate": 1.5507125182965737e-07, "loss": 0.684, "step": 10967 }, { "epoch": 0.8907658572240721, "grad_norm": 8.817725414443457, "learning_rate": 1.5484332721871804e-07, "loss": 0.5193, "step": 10968 }, { "epoch": 0.8908470722001137, "grad_norm": 7.292802786468984, "learning_rate": 1.5461556488020945e-07, "loss": 0.4265, "step": 10969 }, { "epoch": 0.8909282871761552, "grad_norm": 18.816459578152827, "learning_rate": 1.5438796482989072e-07, "loss": 0.4049, "step": 10970 }, { "epoch": 0.8910095021521969, "grad_norm": 4.741507710988063, "learning_rate": 1.541605270835106e-07, "loss": 0.444, "step": 10971 }, { "epoch": 0.8910907171282384, "grad_norm": 11.630609215202053, "learning_rate": 1.5393325165680707e-07, "loss": 0.4207, "step": 10972 }, { "epoch": 0.89117193210428, "grad_norm": 3.301339255185129, "learning_rate": 1.5370613856550615e-07, "loss": 0.5021, "step": 10973 }, { "epoch": 0.8912531470803217, "grad_norm": 4.4372396364921105, "learning_rate": 1.534791878253228e-07, "loss": 0.3981, "step": 10974 }, { "epoch": 0.8913343620563632, "grad_norm": 8.507200043869714, "learning_rate": 1.5325239945196108e-07, "loss": 0.4659, "step": 10975 }, { "epoch": 0.8914155770324048, "grad_norm": 6.152072516412701, "learning_rate": 1.530257734611132e-07, "loss": 0.4745, "step": 10976 }, { "epoch": 0.8914967920084463, "grad_norm": 3.5425159922458582, "learning_rate": 1.5279930986846047e-07, "loss": 0.6653, "step": 10977 }, { "epoch": 0.891578006984488, "grad_norm": 5.233121096684976, "learning_rate": 1.5257300868967344e-07, "loss": 0.5044, "step": 10978 }, { "epoch": 0.8916592219605295, "grad_norm": 6.291818310904449, "learning_rate": 1.5234686994041016e-07, "loss": 0.4987, "step": 10979 }, { "epoch": 0.8917404369365711, "grad_norm": 5.098504129650742, "learning_rate": 1.521208936363186e-07, "loss": 0.3563, "step": 10980 }, { "epoch": 0.8918216519126126, "grad_norm": 4.803655200270107, "learning_rate": 1.5189507979303575e-07, "loss": 0.7008, "step": 10981 }, { "epoch": 0.8919028668886543, "grad_norm": 4.05749114440078, "learning_rate": 1.5166942842618632e-07, "loss": 0.3977, "step": 10982 }, { "epoch": 0.8919840818646958, "grad_norm": 6.222213392758151, "learning_rate": 1.5144393955138336e-07, "loss": 0.4221, "step": 10983 }, { "epoch": 0.8920652968407374, "grad_norm": 3.854822685859984, "learning_rate": 1.512186131842308e-07, "loss": 0.4855, "step": 10984 }, { "epoch": 0.8921465118167791, "grad_norm": 7.555916028857482, "learning_rate": 1.5099344934031923e-07, "loss": 0.4516, "step": 10985 }, { "epoch": 0.8922277267928206, "grad_norm": 8.539386703282604, "learning_rate": 1.507684480352292e-07, "loss": 0.415, "step": 10986 }, { "epoch": 0.8923089417688622, "grad_norm": 4.964839789056717, "learning_rate": 1.5054360928452915e-07, "loss": 0.4186, "step": 10987 }, { "epoch": 0.8923901567449037, "grad_norm": 6.084627536003772, "learning_rate": 1.5031893310377716e-07, "loss": 0.5064, "step": 10988 }, { "epoch": 0.8924713717209454, "grad_norm": 3.940604600092638, "learning_rate": 1.5009441950851965e-07, "loss": 0.5235, "step": 10989 }, { "epoch": 0.8925525866969869, "grad_norm": 5.3695438746577375, "learning_rate": 1.4987006851429147e-07, "loss": 0.3932, "step": 10990 }, { "epoch": 0.8926338016730285, "grad_norm": 9.145034230965127, "learning_rate": 1.4964588013661657e-07, "loss": 0.3775, "step": 10991 }, { "epoch": 0.89271501664907, "grad_norm": 7.658853077669966, "learning_rate": 1.4942185439100753e-07, "loss": 0.4791, "step": 10992 }, { "epoch": 0.8927962316251117, "grad_norm": 7.168128470178995, "learning_rate": 1.4919799129296615e-07, "loss": 0.3937, "step": 10993 }, { "epoch": 0.8928774466011532, "grad_norm": 5.628929909843431, "learning_rate": 1.489742908579822e-07, "loss": 0.4437, "step": 10994 }, { "epoch": 0.8929586615771948, "grad_norm": 6.054942900449149, "learning_rate": 1.4875075310153504e-07, "loss": 0.4699, "step": 10995 }, { "epoch": 0.8930398765532365, "grad_norm": 5.835599987062635, "learning_rate": 1.4852737803909167e-07, "loss": 0.6048, "step": 10996 }, { "epoch": 0.893121091529278, "grad_norm": 7.175331718017556, "learning_rate": 1.4830416568610893e-07, "loss": 0.5113, "step": 10997 }, { "epoch": 0.8932023065053196, "grad_norm": 8.62919238424243, "learning_rate": 1.4808111605803117e-07, "loss": 0.4906, "step": 10998 }, { "epoch": 0.8932835214813611, "grad_norm": 11.718705876342495, "learning_rate": 1.4785822917029318e-07, "loss": 0.415, "step": 10999 }, { "epoch": 0.8933647364574028, "grad_norm": 6.786219876737145, "learning_rate": 1.476355050383174e-07, "loss": 0.483, "step": 11000 }, { "epoch": 0.8934459514334443, "grad_norm": 4.549779356360936, "learning_rate": 1.4741294367751484e-07, "loss": 0.5062, "step": 11001 }, { "epoch": 0.8935271664094859, "grad_norm": 4.594436506496313, "learning_rate": 1.4719054510328595e-07, "loss": 0.3839, "step": 11002 }, { "epoch": 0.8936083813855275, "grad_norm": 3.87676469142472, "learning_rate": 1.4696830933101868e-07, "loss": 0.4757, "step": 11003 }, { "epoch": 0.8936895963615691, "grad_norm": 4.757027256089799, "learning_rate": 1.467462363760916e-07, "loss": 0.5542, "step": 11004 }, { "epoch": 0.8937708113376106, "grad_norm": 3.9669910321207587, "learning_rate": 1.4652432625387013e-07, "loss": 0.4679, "step": 11005 }, { "epoch": 0.8938520263136522, "grad_norm": 5.6200804103086295, "learning_rate": 1.4630257897970985e-07, "loss": 0.4068, "step": 11006 }, { "epoch": 0.8939332412896939, "grad_norm": 3.8084388168574894, "learning_rate": 1.4608099456895452e-07, "loss": 0.4993, "step": 11007 }, { "epoch": 0.8940144562657354, "grad_norm": 6.108741515365576, "learning_rate": 1.4585957303693664e-07, "loss": 0.4431, "step": 11008 }, { "epoch": 0.894095671241777, "grad_norm": 5.264223037773271, "learning_rate": 1.4563831439897647e-07, "loss": 0.5353, "step": 11009 }, { "epoch": 0.8941768862178185, "grad_norm": 5.540285793593693, "learning_rate": 1.4541721867038532e-07, "loss": 0.4353, "step": 11010 }, { "epoch": 0.8942581011938602, "grad_norm": 21.662383239919144, "learning_rate": 1.4519628586646073e-07, "loss": 0.4418, "step": 11011 }, { "epoch": 0.8943393161699017, "grad_norm": 7.463645051609318, "learning_rate": 1.4497551600249044e-07, "loss": 0.4228, "step": 11012 }, { "epoch": 0.8944205311459433, "grad_norm": 28.77597330752949, "learning_rate": 1.447549090937511e-07, "loss": 0.5227, "step": 11013 }, { "epoch": 0.8945017461219849, "grad_norm": 3.957758324457693, "learning_rate": 1.4453446515550724e-07, "loss": 0.4077, "step": 11014 }, { "epoch": 0.8945829610980265, "grad_norm": 7.878758982064296, "learning_rate": 1.4431418420301157e-07, "loss": 0.5104, "step": 11015 }, { "epoch": 0.894664176074068, "grad_norm": 5.62814185101261, "learning_rate": 1.440940662515075e-07, "loss": 0.6012, "step": 11016 }, { "epoch": 0.8947453910501096, "grad_norm": 5.517645951231921, "learning_rate": 1.4387411131622592e-07, "loss": 0.4324, "step": 11017 }, { "epoch": 0.8948266060261513, "grad_norm": 4.575883407647924, "learning_rate": 1.4365431941238544e-07, "loss": 0.4455, "step": 11018 }, { "epoch": 0.8949078210021928, "grad_norm": 4.43419935645762, "learning_rate": 1.434346905551956e-07, "loss": 0.3927, "step": 11019 }, { "epoch": 0.8949890359782344, "grad_norm": 5.32989054760345, "learning_rate": 1.432152247598534e-07, "loss": 0.456, "step": 11020 }, { "epoch": 0.895070250954276, "grad_norm": 4.528118555469795, "learning_rate": 1.4299592204154445e-07, "loss": 0.4092, "step": 11021 }, { "epoch": 0.8951514659303176, "grad_norm": 5.024689885658865, "learning_rate": 1.4277678241544328e-07, "loss": 0.3977, "step": 11022 }, { "epoch": 0.8952326809063591, "grad_norm": 6.316835150350028, "learning_rate": 1.4255780589671337e-07, "loss": 0.4807, "step": 11023 }, { "epoch": 0.8953138958824007, "grad_norm": 5.3616028545471845, "learning_rate": 1.423389925005067e-07, "loss": 0.4474, "step": 11024 }, { "epoch": 0.8953951108584423, "grad_norm": 3.2813886037054543, "learning_rate": 1.421203422419637e-07, "loss": 0.488, "step": 11025 }, { "epoch": 0.8954763258344839, "grad_norm": 8.181117262341452, "learning_rate": 1.4190185513621473e-07, "loss": 0.4516, "step": 11026 }, { "epoch": 0.8955575408105254, "grad_norm": 5.938608632171009, "learning_rate": 1.416835311983772e-07, "loss": 0.486, "step": 11027 }, { "epoch": 0.895638755786567, "grad_norm": 9.269220540893665, "learning_rate": 1.4146537044355785e-07, "loss": 0.4106, "step": 11028 }, { "epoch": 0.8957199707626087, "grad_norm": 18.999226973217382, "learning_rate": 1.412473728868527e-07, "loss": 0.4675, "step": 11029 }, { "epoch": 0.8958011857386502, "grad_norm": 3.9421317838103103, "learning_rate": 1.410295385433455e-07, "loss": 0.4947, "step": 11030 }, { "epoch": 0.8958824007146918, "grad_norm": 4.337954956064237, "learning_rate": 1.4081186742810948e-07, "loss": 0.4467, "step": 11031 }, { "epoch": 0.8959636156907334, "grad_norm": 5.626043047717378, "learning_rate": 1.4059435955620704e-07, "loss": 0.5096, "step": 11032 }, { "epoch": 0.896044830666775, "grad_norm": 4.67727289541783, "learning_rate": 1.403770149426878e-07, "loss": 0.4278, "step": 11033 }, { "epoch": 0.8961260456428165, "grad_norm": 10.738068197611934, "learning_rate": 1.4015983360259055e-07, "loss": 0.5335, "step": 11034 }, { "epoch": 0.8962072606188581, "grad_norm": 4.8693579129133235, "learning_rate": 1.3994281555094386e-07, "loss": 0.3716, "step": 11035 }, { "epoch": 0.8962884755948997, "grad_norm": 2.9347782845843136, "learning_rate": 1.3972596080276402e-07, "loss": 0.7287, "step": 11036 }, { "epoch": 0.8963696905709413, "grad_norm": 4.942121031320022, "learning_rate": 1.395092693730557e-07, "loss": 0.3883, "step": 11037 }, { "epoch": 0.8964509055469828, "grad_norm": 5.046562171098466, "learning_rate": 1.3929274127681303e-07, "loss": 0.4705, "step": 11038 }, { "epoch": 0.8965321205230244, "grad_norm": 3.9477589888855906, "learning_rate": 1.3907637652901957e-07, "loss": 0.5613, "step": 11039 }, { "epoch": 0.8966133354990661, "grad_norm": 4.192994245868831, "learning_rate": 1.3886017514464555e-07, "loss": 0.39, "step": 11040 }, { "epoch": 0.8966945504751076, "grad_norm": 4.637113922674406, "learning_rate": 1.3864413713865098e-07, "loss": 0.4266, "step": 11041 }, { "epoch": 0.8967757654511492, "grad_norm": 5.8016593697423975, "learning_rate": 1.38428262525985e-07, "loss": 0.5181, "step": 11042 }, { "epoch": 0.8968569804271908, "grad_norm": 10.961817792927185, "learning_rate": 1.3821255132158456e-07, "loss": 0.4214, "step": 11043 }, { "epoch": 0.8969381954032324, "grad_norm": 6.98872156389094, "learning_rate": 1.3799700354037605e-07, "loss": 0.3722, "step": 11044 }, { "epoch": 0.8970194103792739, "grad_norm": 6.014898811863191, "learning_rate": 1.3778161919727472e-07, "loss": 0.4123, "step": 11045 }, { "epoch": 0.8971006253553155, "grad_norm": 4.681509306562985, "learning_rate": 1.3756639830718316e-07, "loss": 0.4154, "step": 11046 }, { "epoch": 0.8971818403313571, "grad_norm": 5.1158065640859425, "learning_rate": 1.373513408849936e-07, "loss": 0.4988, "step": 11047 }, { "epoch": 0.8972630553073987, "grad_norm": 5.355479142787348, "learning_rate": 1.3713644694558742e-07, "loss": 0.447, "step": 11048 }, { "epoch": 0.8973442702834402, "grad_norm": 5.532841373770518, "learning_rate": 1.369217165038339e-07, "loss": 0.4586, "step": 11049 }, { "epoch": 0.8974254852594818, "grad_norm": 14.185804561707538, "learning_rate": 1.367071495745906e-07, "loss": 0.4633, "step": 11050 }, { "epoch": 0.8975067002355235, "grad_norm": 5.325288616133615, "learning_rate": 1.3649274617270531e-07, "loss": 0.539, "step": 11051 }, { "epoch": 0.897587915211565, "grad_norm": 3.940700522997248, "learning_rate": 1.3627850631301344e-07, "loss": 0.4839, "step": 11052 }, { "epoch": 0.8976691301876066, "grad_norm": 4.124491341239392, "learning_rate": 1.3606443001033864e-07, "loss": 0.4044, "step": 11053 }, { "epoch": 0.8977503451636482, "grad_norm": 8.104468950407128, "learning_rate": 1.3585051727949494e-07, "loss": 0.4023, "step": 11054 }, { "epoch": 0.8978315601396898, "grad_norm": 9.103539674813327, "learning_rate": 1.3563676813528325e-07, "loss": 0.3839, "step": 11055 }, { "epoch": 0.8979127751157313, "grad_norm": 4.227081794199914, "learning_rate": 1.354231825924937e-07, "loss": 0.5269, "step": 11056 }, { "epoch": 0.8979939900917729, "grad_norm": 4.986647735979627, "learning_rate": 1.3520976066590557e-07, "loss": 0.3988, "step": 11057 }, { "epoch": 0.8980752050678145, "grad_norm": 4.88191309684984, "learning_rate": 1.3499650237028677e-07, "loss": 0.3746, "step": 11058 }, { "epoch": 0.8981564200438561, "grad_norm": 8.27159257192499, "learning_rate": 1.3478340772039328e-07, "loss": 0.5004, "step": 11059 }, { "epoch": 0.8982376350198976, "grad_norm": 7.866676776482577, "learning_rate": 1.3457047673097024e-07, "loss": 0.3978, "step": 11060 }, { "epoch": 0.8983188499959393, "grad_norm": 6.3754067681303805, "learning_rate": 1.343577094167514e-07, "loss": 0.4521, "step": 11061 }, { "epoch": 0.8984000649719809, "grad_norm": 3.9619701694967744, "learning_rate": 1.341451057924592e-07, "loss": 0.545, "step": 11062 }, { "epoch": 0.8984812799480224, "grad_norm": 4.982039565280355, "learning_rate": 1.3393266587280434e-07, "loss": 0.8769, "step": 11063 }, { "epoch": 0.898562494924064, "grad_norm": 4.163415192247937, "learning_rate": 1.3372038967248647e-07, "loss": 0.5124, "step": 11064 }, { "epoch": 0.8986437099001056, "grad_norm": 5.048906567605772, "learning_rate": 1.335082772061949e-07, "loss": 0.5187, "step": 11065 }, { "epoch": 0.8987249248761472, "grad_norm": 6.207042181936884, "learning_rate": 1.3329632848860545e-07, "loss": 0.6352, "step": 11066 }, { "epoch": 0.8988061398521887, "grad_norm": 7.073497034721812, "learning_rate": 1.33084543534385e-07, "loss": 0.5995, "step": 11067 }, { "epoch": 0.8988873548282303, "grad_norm": 9.491444143304246, "learning_rate": 1.3287292235818732e-07, "loss": 0.4552, "step": 11068 }, { "epoch": 0.8989685698042719, "grad_norm": 3.2830020992185185, "learning_rate": 1.326614649746555e-07, "loss": 0.3663, "step": 11069 }, { "epoch": 0.8990497847803135, "grad_norm": 4.692919809966446, "learning_rate": 1.324501713984211e-07, "loss": 0.443, "step": 11070 }, { "epoch": 0.899130999756355, "grad_norm": 6.01529901714284, "learning_rate": 1.3223904164410494e-07, "loss": 0.3483, "step": 11071 }, { "epoch": 0.8992122147323967, "grad_norm": 8.122333020006561, "learning_rate": 1.3202807572631564e-07, "loss": 0.5003, "step": 11072 }, { "epoch": 0.8992934297084383, "grad_norm": 8.111867294079145, "learning_rate": 1.318172736596518e-07, "loss": 0.4683, "step": 11073 }, { "epoch": 0.8993746446844798, "grad_norm": 9.069724308334372, "learning_rate": 1.3160663545869896e-07, "loss": 0.4858, "step": 11074 }, { "epoch": 0.8994558596605214, "grad_norm": 7.1277675224613075, "learning_rate": 1.3139616113803238e-07, "loss": 0.6833, "step": 11075 }, { "epoch": 0.899537074636563, "grad_norm": 4.1361872264740285, "learning_rate": 1.3118585071221546e-07, "loss": 0.6296, "step": 11076 }, { "epoch": 0.8996182896126046, "grad_norm": 4.28970175862567, "learning_rate": 1.3097570419580096e-07, "loss": 0.5692, "step": 11077 }, { "epoch": 0.8996995045886461, "grad_norm": 8.454970827458345, "learning_rate": 1.3076572160333007e-07, "loss": 0.431, "step": 11078 }, { "epoch": 0.8997807195646877, "grad_norm": 9.012725656230002, "learning_rate": 1.3055590294933196e-07, "loss": 0.5454, "step": 11079 }, { "epoch": 0.8998619345407293, "grad_norm": 5.94575386452439, "learning_rate": 1.303462482483256e-07, "loss": 0.4561, "step": 11080 }, { "epoch": 0.8999431495167709, "grad_norm": 5.789262579104675, "learning_rate": 1.301367575148177e-07, "loss": 0.4424, "step": 11081 }, { "epoch": 0.9000243644928124, "grad_norm": 38.814186598162095, "learning_rate": 1.299274307633036e-07, "loss": 0.5148, "step": 11082 }, { "epoch": 0.9001055794688541, "grad_norm": 9.66072321360139, "learning_rate": 1.297182680082676e-07, "loss": 0.5525, "step": 11083 }, { "epoch": 0.9001867944448957, "grad_norm": 4.4257331245909555, "learning_rate": 1.2950926926418362e-07, "loss": 0.5021, "step": 11084 }, { "epoch": 0.9002680094209372, "grad_norm": 3.0817044530551954, "learning_rate": 1.2930043454551178e-07, "loss": 0.468, "step": 11085 }, { "epoch": 0.9003492243969788, "grad_norm": 5.56969653241233, "learning_rate": 1.2909176386670385e-07, "loss": 0.3354, "step": 11086 }, { "epoch": 0.9004304393730204, "grad_norm": 4.504308093649935, "learning_rate": 1.2888325724219775e-07, "loss": 0.5716, "step": 11087 }, { "epoch": 0.900511654349062, "grad_norm": 4.663416416080067, "learning_rate": 1.2867491468642106e-07, "loss": 0.4541, "step": 11088 }, { "epoch": 0.9005928693251035, "grad_norm": 4.148002537213163, "learning_rate": 1.2846673621379035e-07, "loss": 0.514, "step": 11089 }, { "epoch": 0.9006740843011452, "grad_norm": 4.589279907686427, "learning_rate": 1.282587218387102e-07, "loss": 0.5123, "step": 11090 }, { "epoch": 0.9007552992771867, "grad_norm": 3.974192487774201, "learning_rate": 1.2805087157557434e-07, "loss": 0.5089, "step": 11091 }, { "epoch": 0.9008365142532283, "grad_norm": 5.602158178066266, "learning_rate": 1.2784318543876463e-07, "loss": 0.4654, "step": 11092 }, { "epoch": 0.9009177292292698, "grad_norm": 8.541653052285914, "learning_rate": 1.276356634426526e-07, "loss": 0.4957, "step": 11093 }, { "epoch": 0.9009989442053115, "grad_norm": 3.690935781414827, "learning_rate": 1.274283056015968e-07, "loss": 0.6248, "step": 11094 }, { "epoch": 0.9010801591813531, "grad_norm": 4.875190589469171, "learning_rate": 1.272211119299452e-07, "loss": 0.4652, "step": 11095 }, { "epoch": 0.9011613741573946, "grad_norm": 5.525144142434985, "learning_rate": 1.270140824420349e-07, "loss": 0.4016, "step": 11096 }, { "epoch": 0.9012425891334362, "grad_norm": 3.8356059525271387, "learning_rate": 1.2680721715219168e-07, "loss": 0.4344, "step": 11097 }, { "epoch": 0.9013238041094778, "grad_norm": 4.807969276574533, "learning_rate": 1.2660051607472885e-07, "loss": 0.6449, "step": 11098 }, { "epoch": 0.9014050190855194, "grad_norm": 5.802173378442104, "learning_rate": 1.2639397922394963e-07, "loss": 0.5645, "step": 11099 }, { "epoch": 0.9014862340615609, "grad_norm": 6.895498171533333, "learning_rate": 1.261876066141446e-07, "loss": 0.499, "step": 11100 }, { "epoch": 0.9015674490376026, "grad_norm": 7.515564655019646, "learning_rate": 1.2598139825959393e-07, "loss": 0.4524, "step": 11101 }, { "epoch": 0.9016486640136441, "grad_norm": 10.60238190605972, "learning_rate": 1.2577535417456599e-07, "loss": 0.3594, "step": 11102 }, { "epoch": 0.9017298789896857, "grad_norm": 5.744098794749828, "learning_rate": 1.255694743733185e-07, "loss": 0.3964, "step": 11103 }, { "epoch": 0.9018110939657272, "grad_norm": 3.9046557359930865, "learning_rate": 1.253637588700965e-07, "loss": 0.4082, "step": 11104 }, { "epoch": 0.9018923089417689, "grad_norm": 5.1997775870318765, "learning_rate": 1.251582076791352e-07, "loss": 0.5643, "step": 11105 }, { "epoch": 0.9019735239178105, "grad_norm": 8.139019581385137, "learning_rate": 1.2495282081465747e-07, "loss": 0.5973, "step": 11106 }, { "epoch": 0.902054738893852, "grad_norm": 6.919395171153576, "learning_rate": 1.2474759829087413e-07, "loss": 0.5088, "step": 11107 }, { "epoch": 0.9021359538698936, "grad_norm": 7.322279551123576, "learning_rate": 1.2454254012198657e-07, "loss": 0.5294, "step": 11108 }, { "epoch": 0.9022171688459352, "grad_norm": 4.725685255377581, "learning_rate": 1.2433764632218293e-07, "loss": 0.4372, "step": 11109 }, { "epoch": 0.9022983838219768, "grad_norm": 14.72704680471624, "learning_rate": 1.2413291690564154e-07, "loss": 0.4079, "step": 11110 }, { "epoch": 0.9023795987980183, "grad_norm": 5.242560339794683, "learning_rate": 1.239283518865278e-07, "loss": 0.5958, "step": 11111 }, { "epoch": 0.90246081377406, "grad_norm": 3.152784855668021, "learning_rate": 1.2372395127899728e-07, "loss": 0.4431, "step": 11112 }, { "epoch": 0.9025420287501015, "grad_norm": 12.381085280329925, "learning_rate": 1.2351971509719312e-07, "loss": 0.3836, "step": 11113 }, { "epoch": 0.9026232437261431, "grad_norm": 6.282293943148033, "learning_rate": 1.233156433552471e-07, "loss": 0.4327, "step": 11114 }, { "epoch": 0.9027044587021846, "grad_norm": 9.80917559710566, "learning_rate": 1.2311173606727982e-07, "loss": 0.5872, "step": 11115 }, { "epoch": 0.9027856736782263, "grad_norm": 4.868387447251448, "learning_rate": 1.2290799324740144e-07, "loss": 0.604, "step": 11116 }, { "epoch": 0.9028668886542679, "grad_norm": 4.469666914330099, "learning_rate": 1.2270441490970897e-07, "loss": 0.4203, "step": 11117 }, { "epoch": 0.9029481036303094, "grad_norm": 5.0365987552230935, "learning_rate": 1.2250100106828978e-07, "loss": 0.4397, "step": 11118 }, { "epoch": 0.903029318606351, "grad_norm": 4.266209336176858, "learning_rate": 1.222977517372184e-07, "loss": 0.8365, "step": 11119 }, { "epoch": 0.9031105335823926, "grad_norm": 5.161680402157924, "learning_rate": 1.2209466693055867e-07, "loss": 0.5762, "step": 11120 }, { "epoch": 0.9031917485584342, "grad_norm": 4.148007256784386, "learning_rate": 1.2189174666236314e-07, "loss": 0.4376, "step": 11121 }, { "epoch": 0.9032729635344757, "grad_norm": 5.331795822073706, "learning_rate": 1.2168899094667257e-07, "loss": 0.6871, "step": 11122 }, { "epoch": 0.9033541785105174, "grad_norm": 4.452121593101198, "learning_rate": 1.2148639979751686e-07, "loss": 0.4611, "step": 11123 }, { "epoch": 0.9034353934865589, "grad_norm": 5.092994028700953, "learning_rate": 1.212839732289145e-07, "loss": 0.3484, "step": 11124 }, { "epoch": 0.9035166084626005, "grad_norm": 4.802229400487032, "learning_rate": 1.2108171125487177e-07, "loss": 0.3757, "step": 11125 }, { "epoch": 0.903597823438642, "grad_norm": 5.17753286975903, "learning_rate": 1.2087961388938473e-07, "loss": 0.5139, "step": 11126 }, { "epoch": 0.9036790384146837, "grad_norm": 4.289775333983445, "learning_rate": 1.2067768114643635e-07, "loss": 0.5869, "step": 11127 }, { "epoch": 0.9037602533907253, "grad_norm": 5.723264204379845, "learning_rate": 1.2047591304000044e-07, "loss": 0.4739, "step": 11128 }, { "epoch": 0.9038414683667668, "grad_norm": 10.15998509588337, "learning_rate": 1.2027430958403808e-07, "loss": 0.4689, "step": 11129 }, { "epoch": 0.9039226833428085, "grad_norm": 4.420651137514857, "learning_rate": 1.2007287079249863e-07, "loss": 0.4664, "step": 11130 }, { "epoch": 0.90400389831885, "grad_norm": 4.274612039891857, "learning_rate": 1.1987159667932124e-07, "loss": 0.5313, "step": 11131 }, { "epoch": 0.9040851132948916, "grad_norm": 7.759017785750078, "learning_rate": 1.1967048725843256e-07, "loss": 0.4139, "step": 11132 }, { "epoch": 0.9041663282709331, "grad_norm": 6.646277040961836, "learning_rate": 1.1946954254374838e-07, "loss": 0.5634, "step": 11133 }, { "epoch": 0.9042475432469748, "grad_norm": 4.609209324854486, "learning_rate": 1.1926876254917314e-07, "loss": 0.4933, "step": 11134 }, { "epoch": 0.9043287582230163, "grad_norm": 4.565133981153573, "learning_rate": 1.190681472885996e-07, "loss": 0.4394, "step": 11135 }, { "epoch": 0.9044099731990579, "grad_norm": 8.18111120557696, "learning_rate": 1.188676967759092e-07, "loss": 0.5565, "step": 11136 }, { "epoch": 0.9044911881750994, "grad_norm": 5.013373635726462, "learning_rate": 1.1866741102497275e-07, "loss": 0.5724, "step": 11137 }, { "epoch": 0.9045724031511411, "grad_norm": 4.362396063750319, "learning_rate": 1.1846729004964835e-07, "loss": 0.4643, "step": 11138 }, { "epoch": 0.9046536181271827, "grad_norm": 5.980174069654119, "learning_rate": 1.1826733386378297e-07, "loss": 0.4188, "step": 11139 }, { "epoch": 0.9047348331032242, "grad_norm": 4.951897983742022, "learning_rate": 1.1806754248121333e-07, "loss": 0.4468, "step": 11140 }, { "epoch": 0.9048160480792659, "grad_norm": 4.454466679854169, "learning_rate": 1.1786791591576307e-07, "loss": 0.5959, "step": 11141 }, { "epoch": 0.9048972630553074, "grad_norm": 4.311779144133112, "learning_rate": 1.176684541812459e-07, "loss": 0.7119, "step": 11142 }, { "epoch": 0.904978478031349, "grad_norm": 5.216361658357243, "learning_rate": 1.174691572914638e-07, "loss": 0.5982, "step": 11143 }, { "epoch": 0.9050596930073905, "grad_norm": 4.598499041200991, "learning_rate": 1.1727002526020631e-07, "loss": 0.5352, "step": 11144 }, { "epoch": 0.9051409079834322, "grad_norm": 3.192274398240333, "learning_rate": 1.1707105810125297e-07, "loss": 0.5892, "step": 11145 }, { "epoch": 0.9052221229594737, "grad_norm": 11.971740880860915, "learning_rate": 1.1687225582837052e-07, "loss": 0.3351, "step": 11146 }, { "epoch": 0.9053033379355153, "grad_norm": 8.973707498145792, "learning_rate": 1.1667361845531578e-07, "loss": 0.368, "step": 11147 }, { "epoch": 0.9053845529115568, "grad_norm": 4.2462620886968825, "learning_rate": 1.164751459958327e-07, "loss": 0.3972, "step": 11148 }, { "epoch": 0.9054657678875985, "grad_norm": 4.590797394683434, "learning_rate": 1.1627683846365478e-07, "loss": 0.548, "step": 11149 }, { "epoch": 0.9055469828636401, "grad_norm": 7.826708434408468, "learning_rate": 1.1607869587250464e-07, "loss": 0.4099, "step": 11150 }, { "epoch": 0.9056281978396816, "grad_norm": 4.643007677952196, "learning_rate": 1.1588071823609159e-07, "loss": 0.6002, "step": 11151 }, { "epoch": 0.9057094128157233, "grad_norm": 6.195712206260984, "learning_rate": 1.1568290556811495e-07, "loss": 0.5826, "step": 11152 }, { "epoch": 0.9057906277917648, "grad_norm": 5.225028473648337, "learning_rate": 1.1548525788226267e-07, "loss": 0.5314, "step": 11153 }, { "epoch": 0.9058718427678064, "grad_norm": 9.685757941498554, "learning_rate": 1.1528777519221046e-07, "loss": 0.5205, "step": 11154 }, { "epoch": 0.9059530577438479, "grad_norm": 3.9456543583410526, "learning_rate": 1.1509045751162324e-07, "loss": 0.4093, "step": 11155 }, { "epoch": 0.9060342727198896, "grad_norm": 5.960099778891394, "learning_rate": 1.1489330485415479e-07, "loss": 0.3512, "step": 11156 }, { "epoch": 0.9061154876959311, "grad_norm": 5.908029100963964, "learning_rate": 1.1469631723344671e-07, "loss": 0.4409, "step": 11157 }, { "epoch": 0.9061967026719727, "grad_norm": 4.606484674584257, "learning_rate": 1.1449949466312893e-07, "loss": 0.4521, "step": 11158 }, { "epoch": 0.9062779176480142, "grad_norm": 5.177826704703417, "learning_rate": 1.1430283715682139e-07, "loss": 0.5358, "step": 11159 }, { "epoch": 0.9063591326240559, "grad_norm": 4.966905493065145, "learning_rate": 1.1410634472813098e-07, "loss": 0.4514, "step": 11160 }, { "epoch": 0.9064403476000975, "grad_norm": 5.793696903665879, "learning_rate": 1.1391001739065432e-07, "loss": 0.5214, "step": 11161 }, { "epoch": 0.906521562576139, "grad_norm": 5.420921312113236, "learning_rate": 1.1371385515797695e-07, "loss": 0.4365, "step": 11162 }, { "epoch": 0.9066027775521807, "grad_norm": 3.588449960902482, "learning_rate": 1.1351785804367105e-07, "loss": 0.5352, "step": 11163 }, { "epoch": 0.9066839925282222, "grad_norm": 5.796738656501365, "learning_rate": 1.1332202606129938e-07, "loss": 0.4415, "step": 11164 }, { "epoch": 0.9067652075042638, "grad_norm": 5.625515444116812, "learning_rate": 1.1312635922441195e-07, "loss": 0.4904, "step": 11165 }, { "epoch": 0.9068464224803053, "grad_norm": 4.830094151436432, "learning_rate": 1.129308575465482e-07, "loss": 0.6403, "step": 11166 }, { "epoch": 0.906927637456347, "grad_norm": 4.562587755650938, "learning_rate": 1.1273552104123564e-07, "loss": 0.4543, "step": 11167 }, { "epoch": 0.9070088524323885, "grad_norm": 7.282911765724218, "learning_rate": 1.125403497219904e-07, "loss": 0.4899, "step": 11168 }, { "epoch": 0.9070900674084301, "grad_norm": 4.877190705118263, "learning_rate": 1.123453436023178e-07, "loss": 0.3675, "step": 11169 }, { "epoch": 0.9071712823844716, "grad_norm": 6.299432110607247, "learning_rate": 1.121505026957112e-07, "loss": 0.4837, "step": 11170 }, { "epoch": 0.9072524973605133, "grad_norm": 5.585090356105831, "learning_rate": 1.1195582701565177e-07, "loss": 0.5353, "step": 11171 }, { "epoch": 0.9073337123365549, "grad_norm": 5.1062027169899435, "learning_rate": 1.1176131657561095e-07, "loss": 0.5091, "step": 11172 }, { "epoch": 0.9074149273125964, "grad_norm": 3.970272507111287, "learning_rate": 1.1156697138904715e-07, "loss": 0.73, "step": 11173 }, { "epoch": 0.9074961422886381, "grad_norm": 4.9753615625859196, "learning_rate": 1.1137279146940821e-07, "loss": 0.3436, "step": 11174 }, { "epoch": 0.9075773572646796, "grad_norm": 4.2196215333931, "learning_rate": 1.111787768301309e-07, "loss": 0.5066, "step": 11175 }, { "epoch": 0.9076585722407212, "grad_norm": 9.64461171338353, "learning_rate": 1.1098492748463945e-07, "loss": 0.4379, "step": 11176 }, { "epoch": 0.9077397872167627, "grad_norm": 7.251627691507904, "learning_rate": 1.1079124344634707e-07, "loss": 0.4515, "step": 11177 }, { "epoch": 0.9078210021928044, "grad_norm": 4.400388440896803, "learning_rate": 1.1059772472865632e-07, "loss": 0.408, "step": 11178 }, { "epoch": 0.9079022171688459, "grad_norm": 4.694092152410516, "learning_rate": 1.1040437134495708e-07, "loss": 0.4494, "step": 11179 }, { "epoch": 0.9079834321448875, "grad_norm": 4.510297602747996, "learning_rate": 1.1021118330862835e-07, "loss": 0.455, "step": 11180 }, { "epoch": 0.908064647120929, "grad_norm": 3.8454716633958546, "learning_rate": 1.1001816063303805e-07, "loss": 0.5094, "step": 11181 }, { "epoch": 0.9081458620969707, "grad_norm": 7.989031039920255, "learning_rate": 1.0982530333154245e-07, "loss": 0.6907, "step": 11182 }, { "epoch": 0.9082270770730123, "grad_norm": 5.9615319524726695, "learning_rate": 1.0963261141748616e-07, "loss": 0.3874, "step": 11183 }, { "epoch": 0.9083082920490538, "grad_norm": 4.85080597593589, "learning_rate": 1.0944008490420183e-07, "loss": 0.6927, "step": 11184 }, { "epoch": 0.9083895070250955, "grad_norm": 12.043245022297707, "learning_rate": 1.0924772380501215e-07, "loss": 0.4814, "step": 11185 }, { "epoch": 0.908470722001137, "grad_norm": 3.70058521293832, "learning_rate": 1.0905552813322701e-07, "loss": 0.43, "step": 11186 }, { "epoch": 0.9085519369771786, "grad_norm": 5.677961162801236, "learning_rate": 1.0886349790214495e-07, "loss": 0.4644, "step": 11187 }, { "epoch": 0.9086331519532201, "grad_norm": 5.958066806913473, "learning_rate": 1.0867163312505452e-07, "loss": 0.5697, "step": 11188 }, { "epoch": 0.9087143669292618, "grad_norm": 8.122121992747614, "learning_rate": 1.084799338152312e-07, "loss": 0.4075, "step": 11189 }, { "epoch": 0.9087955819053033, "grad_norm": 3.68707963935213, "learning_rate": 1.082883999859391e-07, "loss": 0.61, "step": 11190 }, { "epoch": 0.9088767968813449, "grad_norm": 6.469919999736764, "learning_rate": 1.0809703165043206e-07, "loss": 0.4208, "step": 11191 }, { "epoch": 0.9089580118573864, "grad_norm": 13.337941491052447, "learning_rate": 1.0790582882195172e-07, "loss": 0.3323, "step": 11192 }, { "epoch": 0.9090392268334281, "grad_norm": 5.027652600451383, "learning_rate": 1.0771479151372749e-07, "loss": 0.3798, "step": 11193 }, { "epoch": 0.9091204418094697, "grad_norm": 19.702276711092185, "learning_rate": 1.0752391973897852e-07, "loss": 0.4353, "step": 11194 }, { "epoch": 0.9092016567855112, "grad_norm": 6.736698974070218, "learning_rate": 1.0733321351091286e-07, "loss": 0.4836, "step": 11195 }, { "epoch": 0.9092828717615529, "grad_norm": 8.31372301888567, "learning_rate": 1.071426728427255e-07, "loss": 0.4187, "step": 11196 }, { "epoch": 0.9093640867375944, "grad_norm": 5.818025708926807, "learning_rate": 1.0695229774760147e-07, "loss": 0.4106, "step": 11197 }, { "epoch": 0.909445301713636, "grad_norm": 4.4508095887650105, "learning_rate": 1.0676208823871326e-07, "loss": 0.545, "step": 11198 }, { "epoch": 0.9095265166896775, "grad_norm": 8.123684613091736, "learning_rate": 1.065720443292223e-07, "loss": 0.3934, "step": 11199 }, { "epoch": 0.9096077316657192, "grad_norm": 5.968329057688115, "learning_rate": 1.0638216603227892e-07, "loss": 0.5218, "step": 11200 }, { "epoch": 0.9096889466417607, "grad_norm": 8.367605133725334, "learning_rate": 1.0619245336102174e-07, "loss": 0.4462, "step": 11201 }, { "epoch": 0.9097701616178023, "grad_norm": 7.432108062190521, "learning_rate": 1.060029063285778e-07, "loss": 0.5753, "step": 11202 }, { "epoch": 0.9098513765938439, "grad_norm": 5.076717044768981, "learning_rate": 1.0581352494806241e-07, "loss": 0.3382, "step": 11203 }, { "epoch": 0.9099325915698855, "grad_norm": 6.177656873247412, "learning_rate": 1.0562430923258037e-07, "loss": 0.5211, "step": 11204 }, { "epoch": 0.9100138065459271, "grad_norm": 6.778475917464361, "learning_rate": 1.0543525919522401e-07, "loss": 0.4251, "step": 11205 }, { "epoch": 0.9100950215219686, "grad_norm": 6.121707124021081, "learning_rate": 1.0524637484907424e-07, "loss": 0.3896, "step": 11206 }, { "epoch": 0.9101762364980103, "grad_norm": 6.136173762658393, "learning_rate": 1.0505765620720143e-07, "loss": 0.6354, "step": 11207 }, { "epoch": 0.9102574514740518, "grad_norm": 7.9613881203228525, "learning_rate": 1.0486910328266403e-07, "loss": 0.3988, "step": 11208 }, { "epoch": 0.9103386664500934, "grad_norm": 5.222762969931984, "learning_rate": 1.0468071608850827e-07, "loss": 0.537, "step": 11209 }, { "epoch": 0.910419881426135, "grad_norm": 4.889505356591718, "learning_rate": 1.0449249463777039e-07, "loss": 0.41, "step": 11210 }, { "epoch": 0.9105010964021766, "grad_norm": 5.393977337842649, "learning_rate": 1.0430443894347358e-07, "loss": 0.8052, "step": 11211 }, { "epoch": 0.9105823113782181, "grad_norm": 4.927676914766112, "learning_rate": 1.041165490186305e-07, "loss": 0.4754, "step": 11212 }, { "epoch": 0.9106635263542597, "grad_norm": 4.738799420453396, "learning_rate": 1.0392882487624212e-07, "loss": 0.5263, "step": 11213 }, { "epoch": 0.9107447413303013, "grad_norm": 3.4587373103230474, "learning_rate": 1.0374126652929805e-07, "loss": 0.3904, "step": 11214 }, { "epoch": 0.9108259563063429, "grad_norm": 4.320689046060023, "learning_rate": 1.0355387399077627e-07, "loss": 0.5351, "step": 11215 }, { "epoch": 0.9109071712823845, "grad_norm": 6.811312905006839, "learning_rate": 1.033666472736436e-07, "loss": 0.4286, "step": 11216 }, { "epoch": 0.910988386258426, "grad_norm": 3.4936475176998845, "learning_rate": 1.0317958639085524e-07, "loss": 0.5945, "step": 11217 }, { "epoch": 0.9110696012344677, "grad_norm": 4.327867349353077, "learning_rate": 1.0299269135535416e-07, "loss": 0.3791, "step": 11218 }, { "epoch": 0.9111508162105092, "grad_norm": 7.91773166597089, "learning_rate": 1.0280596218007254e-07, "loss": 0.4511, "step": 11219 }, { "epoch": 0.9112320311865508, "grad_norm": 5.982009386221902, "learning_rate": 1.0261939887793143e-07, "loss": 0.4298, "step": 11220 }, { "epoch": 0.9113132461625923, "grad_norm": 9.681222480637057, "learning_rate": 1.0243300146184048e-07, "loss": 0.5457, "step": 11221 }, { "epoch": 0.911394461138634, "grad_norm": 8.645718538389323, "learning_rate": 1.0224676994469635e-07, "loss": 0.4311, "step": 11222 }, { "epoch": 0.9114756761146755, "grad_norm": 6.818193527467812, "learning_rate": 1.020607043393862e-07, "loss": 0.4284, "step": 11223 }, { "epoch": 0.9115568910907171, "grad_norm": 4.876516812885544, "learning_rate": 1.0187480465878418e-07, "loss": 0.4452, "step": 11224 }, { "epoch": 0.9116381060667587, "grad_norm": 4.240926353476526, "learning_rate": 1.0168907091575364e-07, "loss": 0.5541, "step": 11225 }, { "epoch": 0.9117193210428003, "grad_norm": 5.868350340007438, "learning_rate": 1.015035031231465e-07, "loss": 0.4558, "step": 11226 }, { "epoch": 0.9118005360188419, "grad_norm": 3.823767663089306, "learning_rate": 1.0131810129380332e-07, "loss": 0.6506, "step": 11227 }, { "epoch": 0.9118817509948834, "grad_norm": 7.147187550366974, "learning_rate": 1.0113286544055245e-07, "loss": 0.4312, "step": 11228 }, { "epoch": 0.9119629659709251, "grad_norm": 50.85174427248752, "learning_rate": 1.0094779557621171e-07, "loss": 0.4037, "step": 11229 }, { "epoch": 0.9120441809469666, "grad_norm": 10.89692418155859, "learning_rate": 1.0076289171358695e-07, "loss": 0.5095, "step": 11230 }, { "epoch": 0.9121253959230082, "grad_norm": 6.406674309261182, "learning_rate": 1.0057815386547181e-07, "loss": 0.6489, "step": 11231 }, { "epoch": 0.9122066108990498, "grad_norm": 4.4242464166233955, "learning_rate": 1.0039358204464943e-07, "loss": 0.5091, "step": 11232 }, { "epoch": 0.9122878258750914, "grad_norm": 4.743762192538487, "learning_rate": 1.0020917626389209e-07, "loss": 0.4469, "step": 11233 }, { "epoch": 0.9123690408511329, "grad_norm": 4.14370710399411, "learning_rate": 1.0002493653595902e-07, "loss": 0.39, "step": 11234 }, { "epoch": 0.9124502558271745, "grad_norm": 14.280052275876619, "learning_rate": 9.984086287359806e-08, "loss": 0.5105, "step": 11235 }, { "epoch": 0.9125314708032161, "grad_norm": 6.244441893382094, "learning_rate": 9.965695528954711e-08, "loss": 0.357, "step": 11236 }, { "epoch": 0.9126126857792577, "grad_norm": 7.898334173798384, "learning_rate": 9.947321379653152e-08, "loss": 0.4236, "step": 11237 }, { "epoch": 0.9126939007552993, "grad_norm": 4.788515846324651, "learning_rate": 9.928963840726418e-08, "loss": 0.5023, "step": 11238 }, { "epoch": 0.9127751157313408, "grad_norm": 25.04518107661059, "learning_rate": 9.910622913444856e-08, "loss": 0.5852, "step": 11239 }, { "epoch": 0.9128563307073825, "grad_norm": 4.841830459052142, "learning_rate": 9.89229859907756e-08, "loss": 0.4615, "step": 11240 }, { "epoch": 0.912937545683424, "grad_norm": 7.939078029749193, "learning_rate": 9.873990898892405e-08, "loss": 0.4802, "step": 11241 }, { "epoch": 0.9130187606594656, "grad_norm": 9.609761146624544, "learning_rate": 9.855699814156266e-08, "loss": 0.4418, "step": 11242 }, { "epoch": 0.9130999756355072, "grad_norm": 4.6087214575599145, "learning_rate": 9.837425346134771e-08, "loss": 0.5425, "step": 11243 }, { "epoch": 0.9131811906115488, "grad_norm": 5.893925469137472, "learning_rate": 9.819167496092352e-08, "loss": 0.4984, "step": 11244 }, { "epoch": 0.9132624055875903, "grad_norm": 5.730428639464196, "learning_rate": 9.800926265292415e-08, "loss": 0.4071, "step": 11245 }, { "epoch": 0.9133436205636319, "grad_norm": 3.5844043953410853, "learning_rate": 9.782701654997145e-08, "loss": 0.5314, "step": 11246 }, { "epoch": 0.9134248355396735, "grad_norm": 4.657341875734708, "learning_rate": 9.764493666467589e-08, "loss": 0.5415, "step": 11247 }, { "epoch": 0.9135060505157151, "grad_norm": 5.362707479330835, "learning_rate": 9.746302300963656e-08, "loss": 0.5306, "step": 11248 }, { "epoch": 0.9135872654917567, "grad_norm": 5.144669261464033, "learning_rate": 9.728127559744089e-08, "loss": 0.4642, "step": 11249 }, { "epoch": 0.9136684804677982, "grad_norm": 8.491237111181297, "learning_rate": 9.709969444066436e-08, "loss": 0.4926, "step": 11250 }, { "epoch": 0.9137496954438399, "grad_norm": 4.176442395340095, "learning_rate": 9.691827955187222e-08, "loss": 0.6502, "step": 11251 }, { "epoch": 0.9138309104198814, "grad_norm": 9.913453531194941, "learning_rate": 9.673703094361664e-08, "loss": 0.609, "step": 11252 }, { "epoch": 0.913912125395923, "grad_norm": 4.875302448335454, "learning_rate": 9.655594862843953e-08, "loss": 0.3666, "step": 11253 }, { "epoch": 0.9139933403719646, "grad_norm": 4.516782702007909, "learning_rate": 9.63750326188706e-08, "loss": 0.4876, "step": 11254 }, { "epoch": 0.9140745553480062, "grad_norm": 9.6025358191127, "learning_rate": 9.619428292742872e-08, "loss": 0.4328, "step": 11255 }, { "epoch": 0.9141557703240477, "grad_norm": 6.319994193854443, "learning_rate": 9.601369956662054e-08, "loss": 0.5117, "step": 11256 }, { "epoch": 0.9142369853000893, "grad_norm": 5.66199645711789, "learning_rate": 9.583328254894109e-08, "loss": 0.657, "step": 11257 }, { "epoch": 0.9143182002761309, "grad_norm": 3.0411540989807317, "learning_rate": 9.565303188687453e-08, "loss": 0.6704, "step": 11258 }, { "epoch": 0.9143994152521725, "grad_norm": 4.937602813989695, "learning_rate": 9.547294759289366e-08, "loss": 0.3464, "step": 11259 }, { "epoch": 0.9144806302282141, "grad_norm": 4.5620361618138885, "learning_rate": 9.52930296794588e-08, "loss": 0.4903, "step": 11260 }, { "epoch": 0.9145618452042557, "grad_norm": 4.986561186017303, "learning_rate": 9.511327815902e-08, "loss": 0.5204, "step": 11261 }, { "epoch": 0.9146430601802973, "grad_norm": 3.2596608136193366, "learning_rate": 9.493369304401423e-08, "loss": 0.3049, "step": 11262 }, { "epoch": 0.9147242751563388, "grad_norm": 3.798410117128246, "learning_rate": 9.475427434686824e-08, "loss": 0.5748, "step": 11263 }, { "epoch": 0.9148054901323804, "grad_norm": 6.954981893846417, "learning_rate": 9.457502207999736e-08, "loss": 0.5748, "step": 11264 }, { "epoch": 0.914886705108422, "grad_norm": 3.8501120004641143, "learning_rate": 9.43959362558039e-08, "loss": 0.525, "step": 11265 }, { "epoch": 0.9149679200844636, "grad_norm": 4.333034848842079, "learning_rate": 9.421701688668017e-08, "loss": 0.4368, "step": 11266 }, { "epoch": 0.9150491350605051, "grad_norm": 5.609179824260918, "learning_rate": 9.403826398500654e-08, "loss": 0.4947, "step": 11267 }, { "epoch": 0.9151303500365467, "grad_norm": 4.516842058122665, "learning_rate": 9.385967756315201e-08, "loss": 0.5003, "step": 11268 }, { "epoch": 0.9152115650125883, "grad_norm": 5.024303496834808, "learning_rate": 9.368125763347336e-08, "loss": 0.502, "step": 11269 }, { "epoch": 0.9152927799886299, "grad_norm": 3.8890995639637858, "learning_rate": 9.350300420831599e-08, "loss": 0.4421, "step": 11270 }, { "epoch": 0.9153739949646715, "grad_norm": 11.469385591277504, "learning_rate": 9.332491730001448e-08, "loss": 0.6776, "step": 11271 }, { "epoch": 0.915455209940713, "grad_norm": 8.781322680809653, "learning_rate": 9.314699692089202e-08, "loss": 0.5468, "step": 11272 }, { "epoch": 0.9155364249167547, "grad_norm": 5.122536771418473, "learning_rate": 9.296924308325905e-08, "loss": 0.5422, "step": 11273 }, { "epoch": 0.9156176398927962, "grad_norm": 5.107338953835333, "learning_rate": 9.279165579941546e-08, "loss": 0.5195, "step": 11274 }, { "epoch": 0.9156988548688378, "grad_norm": 7.603688810286707, "learning_rate": 9.261423508164947e-08, "loss": 0.4056, "step": 11275 }, { "epoch": 0.9157800698448794, "grad_norm": 3.231334009016635, "learning_rate": 9.243698094223735e-08, "loss": 0.3926, "step": 11276 }, { "epoch": 0.915861284820921, "grad_norm": 4.19277721624568, "learning_rate": 9.225989339344432e-08, "loss": 0.5653, "step": 11277 }, { "epoch": 0.9159424997969625, "grad_norm": 4.727861994059561, "learning_rate": 9.208297244752362e-08, "loss": 0.6066, "step": 11278 }, { "epoch": 0.9160237147730041, "grad_norm": 4.796597984182049, "learning_rate": 9.190621811671769e-08, "loss": 0.4402, "step": 11279 }, { "epoch": 0.9161049297490457, "grad_norm": 17.756697315100613, "learning_rate": 9.1729630413257e-08, "loss": 0.4518, "step": 11280 }, { "epoch": 0.9161861447250873, "grad_norm": 6.095353870840864, "learning_rate": 9.155320934936041e-08, "loss": 0.5031, "step": 11281 }, { "epoch": 0.9162673597011289, "grad_norm": 6.9639308227226, "learning_rate": 9.137695493723481e-08, "loss": 0.4176, "step": 11282 }, { "epoch": 0.9163485746771705, "grad_norm": 4.5604405756876485, "learning_rate": 9.120086718907657e-08, "loss": 0.4873, "step": 11283 }, { "epoch": 0.9164297896532121, "grad_norm": 5.467910329584174, "learning_rate": 9.10249461170698e-08, "loss": 0.3554, "step": 11284 }, { "epoch": 0.9165110046292536, "grad_norm": 5.007337212383075, "learning_rate": 9.084919173338758e-08, "loss": 0.499, "step": 11285 }, { "epoch": 0.9165922196052952, "grad_norm": 9.584147277340062, "learning_rate": 9.067360405019099e-08, "loss": 0.4328, "step": 11286 }, { "epoch": 0.9166734345813368, "grad_norm": 4.7954029212995355, "learning_rate": 9.049818307963004e-08, "loss": 0.4119, "step": 11287 }, { "epoch": 0.9167546495573784, "grad_norm": 7.375196976362717, "learning_rate": 9.03229288338428e-08, "loss": 0.4901, "step": 11288 }, { "epoch": 0.9168358645334199, "grad_norm": 5.606112983529189, "learning_rate": 9.014784132495542e-08, "loss": 0.5983, "step": 11289 }, { "epoch": 0.9169170795094616, "grad_norm": 5.297505673484214, "learning_rate": 8.997292056508372e-08, "loss": 0.3459, "step": 11290 }, { "epoch": 0.9169982944855031, "grad_norm": 9.227347098755619, "learning_rate": 8.979816656633084e-08, "loss": 0.4612, "step": 11291 }, { "epoch": 0.9170795094615447, "grad_norm": 2.966819498823606, "learning_rate": 8.962357934078874e-08, "loss": 0.5246, "step": 11292 }, { "epoch": 0.9171607244375863, "grad_norm": 6.824208201533333, "learning_rate": 8.944915890053891e-08, "loss": 0.4562, "step": 11293 }, { "epoch": 0.9172419394136279, "grad_norm": 5.376195914843917, "learning_rate": 8.927490525764942e-08, "loss": 0.6399, "step": 11294 }, { "epoch": 0.9173231543896695, "grad_norm": 4.79660630950142, "learning_rate": 8.910081842417761e-08, "loss": 0.5716, "step": 11295 }, { "epoch": 0.917404369365711, "grad_norm": 4.642397495613151, "learning_rate": 8.892689841216995e-08, "loss": 0.5504, "step": 11296 }, { "epoch": 0.9174855843417526, "grad_norm": 3.4372668067850913, "learning_rate": 8.875314523366014e-08, "loss": 0.5241, "step": 11297 }, { "epoch": 0.9175667993177942, "grad_norm": 5.952139631988675, "learning_rate": 8.857955890067132e-08, "loss": 0.4511, "step": 11298 }, { "epoch": 0.9176480142938358, "grad_norm": 6.811625003043075, "learning_rate": 8.840613942521503e-08, "loss": 0.4252, "step": 11299 }, { "epoch": 0.9177292292698773, "grad_norm": 5.451600879549154, "learning_rate": 8.823288681929082e-08, "loss": 0.4875, "step": 11300 }, { "epoch": 0.917810444245919, "grad_norm": 4.68192691825503, "learning_rate": 8.80598010948866e-08, "loss": 0.6457, "step": 11301 }, { "epoch": 0.9178916592219605, "grad_norm": 4.012662107572786, "learning_rate": 8.788688226397917e-08, "loss": 0.4899, "step": 11302 }, { "epoch": 0.9179728741980021, "grad_norm": 6.570602578436052, "learning_rate": 8.771413033853343e-08, "loss": 0.5515, "step": 11303 }, { "epoch": 0.9180540891740437, "grad_norm": 7.278974590840044, "learning_rate": 8.754154533050285e-08, "loss": 0.3594, "step": 11304 }, { "epoch": 0.9181353041500853, "grad_norm": 5.819321823620186, "learning_rate": 8.736912725182983e-08, "loss": 0.4738, "step": 11305 }, { "epoch": 0.9182165191261269, "grad_norm": 4.523374291477625, "learning_rate": 8.719687611444483e-08, "loss": 0.3854, "step": 11306 }, { "epoch": 0.9182977341021684, "grad_norm": 3.277263217530252, "learning_rate": 8.702479193026608e-08, "loss": 0.4129, "step": 11307 }, { "epoch": 0.91837894907821, "grad_norm": 4.810473743314004, "learning_rate": 8.68528747112013e-08, "loss": 0.3563, "step": 11308 }, { "epoch": 0.9184601640542516, "grad_norm": 4.361251259939714, "learning_rate": 8.668112446914622e-08, "loss": 0.4657, "step": 11309 }, { "epoch": 0.9185413790302932, "grad_norm": 5.725837124786675, "learning_rate": 8.650954121598471e-08, "loss": 0.4554, "step": 11310 }, { "epoch": 0.9186225940063347, "grad_norm": 7.948527051871499, "learning_rate": 8.633812496358973e-08, "loss": 0.4677, "step": 11311 }, { "epoch": 0.9187038089823764, "grad_norm": 8.490193251568819, "learning_rate": 8.616687572382293e-08, "loss": 0.4757, "step": 11312 }, { "epoch": 0.9187850239584179, "grad_norm": 10.643140675212129, "learning_rate": 8.599579350853288e-08, "loss": 0.5886, "step": 11313 }, { "epoch": 0.9188662389344595, "grad_norm": 5.12172366794015, "learning_rate": 8.582487832955788e-08, "loss": 0.4511, "step": 11314 }, { "epoch": 0.9189474539105011, "grad_norm": 4.267431188352919, "learning_rate": 8.565413019872488e-08, "loss": 0.4457, "step": 11315 }, { "epoch": 0.9190286688865427, "grad_norm": 6.46937742703463, "learning_rate": 8.548354912784801e-08, "loss": 0.5193, "step": 11316 }, { "epoch": 0.9191098838625843, "grad_norm": 10.178148390235016, "learning_rate": 8.531313512873063e-08, "loss": 0.4415, "step": 11317 }, { "epoch": 0.9191910988386258, "grad_norm": 5.783583498365241, "learning_rate": 8.514288821316524e-08, "loss": 0.3627, "step": 11318 }, { "epoch": 0.9192723138146675, "grad_norm": 4.717466038627756, "learning_rate": 8.497280839293159e-08, "loss": 0.4085, "step": 11319 }, { "epoch": 0.919353528790709, "grad_norm": 4.386734491499693, "learning_rate": 8.480289567979776e-08, "loss": 0.5006, "step": 11320 }, { "epoch": 0.9194347437667506, "grad_norm": 4.236143189731592, "learning_rate": 8.463315008552158e-08, "loss": 0.5219, "step": 11321 }, { "epoch": 0.9195159587427921, "grad_norm": 6.377920595401447, "learning_rate": 8.446357162184838e-08, "loss": 0.5412, "step": 11322 }, { "epoch": 0.9195971737188338, "grad_norm": 4.393949522713649, "learning_rate": 8.429416030051179e-08, "loss": 0.6075, "step": 11323 }, { "epoch": 0.9196783886948753, "grad_norm": 3.2693096226160097, "learning_rate": 8.412491613323415e-08, "loss": 0.4949, "step": 11324 }, { "epoch": 0.9197596036709169, "grad_norm": 5.690081161082877, "learning_rate": 8.39558391317269e-08, "loss": 0.333, "step": 11325 }, { "epoch": 0.9198408186469585, "grad_norm": 4.591799853216551, "learning_rate": 8.378692930768873e-08, "loss": 0.5179, "step": 11326 }, { "epoch": 0.9199220336230001, "grad_norm": 3.4431361012922017, "learning_rate": 8.361818667280724e-08, "loss": 0.429, "step": 11327 }, { "epoch": 0.9200032485990417, "grad_norm": 7.374506756234914, "learning_rate": 8.344961123875895e-08, "loss": 0.4514, "step": 11328 }, { "epoch": 0.9200844635750832, "grad_norm": 3.5074173112977345, "learning_rate": 8.328120301720783e-08, "loss": 0.5645, "step": 11329 }, { "epoch": 0.9201656785511249, "grad_norm": 4.89262703046508, "learning_rate": 8.311296201980734e-08, "loss": 0.6032, "step": 11330 }, { "epoch": 0.9202468935271664, "grad_norm": 3.5016861133769526, "learning_rate": 8.294488825819875e-08, "loss": 0.9195, "step": 11331 }, { "epoch": 0.920328108503208, "grad_norm": 5.597223977106012, "learning_rate": 8.277698174401189e-08, "loss": 0.644, "step": 11332 }, { "epoch": 0.9204093234792495, "grad_norm": 4.477628857500898, "learning_rate": 8.260924248886471e-08, "loss": 0.5054, "step": 11333 }, { "epoch": 0.9204905384552912, "grad_norm": 5.7562752348647885, "learning_rate": 8.244167050436402e-08, "loss": 0.4682, "step": 11334 }, { "epoch": 0.9205717534313327, "grad_norm": 5.880650217377458, "learning_rate": 8.22742658021053e-08, "loss": 0.6894, "step": 11335 }, { "epoch": 0.9206529684073743, "grad_norm": 5.322917229821901, "learning_rate": 8.210702839367146e-08, "loss": 0.4346, "step": 11336 }, { "epoch": 0.920734183383416, "grad_norm": 4.796792390299345, "learning_rate": 8.193995829063467e-08, "loss": 0.4673, "step": 11337 }, { "epoch": 0.9208153983594575, "grad_norm": 8.87203052295575, "learning_rate": 8.177305550455566e-08, "loss": 0.444, "step": 11338 }, { "epoch": 0.9208966133354991, "grad_norm": 4.841733687058009, "learning_rate": 8.160632004698271e-08, "loss": 0.4699, "step": 11339 }, { "epoch": 0.9209778283115406, "grad_norm": 4.075654039554347, "learning_rate": 8.143975192945325e-08, "loss": 0.7124, "step": 11340 }, { "epoch": 0.9210590432875823, "grad_norm": 4.698000621909273, "learning_rate": 8.127335116349305e-08, "loss": 0.4646, "step": 11341 }, { "epoch": 0.9211402582636238, "grad_norm": 4.494572377526055, "learning_rate": 8.110711776061597e-08, "loss": 0.4942, "step": 11342 }, { "epoch": 0.9212214732396654, "grad_norm": 7.279761243226819, "learning_rate": 8.09410517323242e-08, "loss": 0.4229, "step": 11343 }, { "epoch": 0.9213026882157069, "grad_norm": 4.863071295173902, "learning_rate": 8.077515309010936e-08, "loss": 0.3941, "step": 11344 }, { "epoch": 0.9213839031917486, "grad_norm": 5.892354030279855, "learning_rate": 8.060942184545034e-08, "loss": 0.5919, "step": 11345 }, { "epoch": 0.9214651181677901, "grad_norm": 4.708709314273925, "learning_rate": 8.044385800981464e-08, "loss": 0.3786, "step": 11346 }, { "epoch": 0.9215463331438317, "grad_norm": 8.147418646635652, "learning_rate": 8.02784615946589e-08, "loss": 0.5678, "step": 11347 }, { "epoch": 0.9216275481198734, "grad_norm": 5.376975658563418, "learning_rate": 8.011323261142734e-08, "loss": 0.4418, "step": 11348 }, { "epoch": 0.9217087630959149, "grad_norm": 5.665831071686028, "learning_rate": 7.994817107155301e-08, "loss": 0.4363, "step": 11349 }, { "epoch": 0.9217899780719565, "grad_norm": 4.582354055457941, "learning_rate": 7.978327698645705e-08, "loss": 0.4622, "step": 11350 }, { "epoch": 0.921871193047998, "grad_norm": 7.540612277290582, "learning_rate": 7.961855036754978e-08, "loss": 0.3925, "step": 11351 }, { "epoch": 0.9219524080240397, "grad_norm": 4.845886462847071, "learning_rate": 7.945399122622904e-08, "loss": 0.494, "step": 11352 }, { "epoch": 0.9220336230000812, "grad_norm": 3.6709111199072013, "learning_rate": 7.928959957388154e-08, "loss": 0.625, "step": 11353 }, { "epoch": 0.9221148379761228, "grad_norm": 4.457865490095833, "learning_rate": 7.912537542188264e-08, "loss": 0.4271, "step": 11354 }, { "epoch": 0.9221960529521643, "grad_norm": 6.9852450558056365, "learning_rate": 7.89613187815949e-08, "loss": 0.5399, "step": 11355 }, { "epoch": 0.922277267928206, "grad_norm": 5.777646011684341, "learning_rate": 7.879742966437092e-08, "loss": 0.4892, "step": 11356 }, { "epoch": 0.9223584829042475, "grad_norm": 3.543496182377441, "learning_rate": 7.86337080815508e-08, "loss": 0.5512, "step": 11357 }, { "epoch": 0.9224396978802891, "grad_norm": 4.258648901552076, "learning_rate": 7.847015404446352e-08, "loss": 0.3128, "step": 11358 }, { "epoch": 0.9225209128563308, "grad_norm": 5.717748386729547, "learning_rate": 7.830676756442529e-08, "loss": 0.5077, "step": 11359 }, { "epoch": 0.9226021278323723, "grad_norm": 5.465873996372227, "learning_rate": 7.814354865274237e-08, "loss": 0.4398, "step": 11360 }, { "epoch": 0.9226833428084139, "grad_norm": 9.321969343150906, "learning_rate": 7.798049732070822e-08, "loss": 0.5184, "step": 11361 }, { "epoch": 0.9227645577844554, "grad_norm": 6.551159045640905, "learning_rate": 7.78176135796052e-08, "loss": 0.4808, "step": 11362 }, { "epoch": 0.9228457727604971, "grad_norm": 6.18279810098398, "learning_rate": 7.765489744070459e-08, "loss": 0.495, "step": 11363 }, { "epoch": 0.9229269877365386, "grad_norm": 4.489976108293694, "learning_rate": 7.749234891526486e-08, "loss": 0.5977, "step": 11364 }, { "epoch": 0.9230082027125802, "grad_norm": 4.0322329291396475, "learning_rate": 7.732996801453313e-08, "loss": 0.7121, "step": 11365 }, { "epoch": 0.9230894176886217, "grad_norm": 7.911313584451032, "learning_rate": 7.716775474974625e-08, "loss": 0.3647, "step": 11366 }, { "epoch": 0.9231706326646634, "grad_norm": 4.246785998863406, "learning_rate": 7.70057091321283e-08, "loss": 0.3889, "step": 11367 }, { "epoch": 0.9232518476407049, "grad_norm": 21.982620189789714, "learning_rate": 7.684383117289141e-08, "loss": 0.5681, "step": 11368 }, { "epoch": 0.9233330626167465, "grad_norm": 16.563097433915587, "learning_rate": 7.66821208832369e-08, "loss": 0.4741, "step": 11369 }, { "epoch": 0.9234142775927882, "grad_norm": 4.738887055673006, "learning_rate": 7.652057827435444e-08, "loss": 0.5937, "step": 11370 }, { "epoch": 0.9234954925688297, "grad_norm": 5.91709828726191, "learning_rate": 7.635920335742203e-08, "loss": 0.458, "step": 11371 }, { "epoch": 0.9235767075448713, "grad_norm": 4.025266759061047, "learning_rate": 7.619799614360573e-08, "loss": 0.5648, "step": 11372 }, { "epoch": 0.9236579225209128, "grad_norm": 5.402057661799795, "learning_rate": 7.603695664406053e-08, "loss": 0.4054, "step": 11373 }, { "epoch": 0.9237391374969545, "grad_norm": 5.575279520689622, "learning_rate": 7.587608486992915e-08, "loss": 0.4741, "step": 11374 }, { "epoch": 0.923820352472996, "grad_norm": 3.7480482517554115, "learning_rate": 7.571538083234298e-08, "loss": 0.4311, "step": 11375 }, { "epoch": 0.9239015674490376, "grad_norm": 3.873434245465527, "learning_rate": 7.555484454242229e-08, "loss": 0.5094, "step": 11376 }, { "epoch": 0.9239827824250791, "grad_norm": 4.404320153846423, "learning_rate": 7.539447601127542e-08, "loss": 0.4184, "step": 11377 }, { "epoch": 0.9240639974011208, "grad_norm": 4.253052401961252, "learning_rate": 7.523427524999822e-08, "loss": 0.4376, "step": 11378 }, { "epoch": 0.9241452123771623, "grad_norm": 7.182394888454542, "learning_rate": 7.507424226967681e-08, "loss": 0.4695, "step": 11379 }, { "epoch": 0.9242264273532039, "grad_norm": 6.303359290206109, "learning_rate": 7.491437708138372e-08, "loss": 0.5214, "step": 11380 }, { "epoch": 0.9243076423292456, "grad_norm": 4.296067574108232, "learning_rate": 7.475467969618122e-08, "loss": 0.4966, "step": 11381 }, { "epoch": 0.9243888573052871, "grad_norm": 3.9154299069883187, "learning_rate": 7.459515012511937e-08, "loss": 0.6921, "step": 11382 }, { "epoch": 0.9244700722813287, "grad_norm": 3.5610883326964538, "learning_rate": 7.443578837923709e-08, "loss": 0.4532, "step": 11383 }, { "epoch": 0.9245512872573702, "grad_norm": 4.0069328169869625, "learning_rate": 7.427659446956087e-08, "loss": 0.6126, "step": 11384 }, { "epoch": 0.9246325022334119, "grad_norm": 3.4747848677201554, "learning_rate": 7.41175684071066e-08, "loss": 0.5281, "step": 11385 }, { "epoch": 0.9247137172094534, "grad_norm": 4.640308832890803, "learning_rate": 7.39587102028777e-08, "loss": 0.448, "step": 11386 }, { "epoch": 0.924794932185495, "grad_norm": 4.482953706151723, "learning_rate": 7.38000198678665e-08, "loss": 0.5122, "step": 11387 }, { "epoch": 0.9248761471615365, "grad_norm": 7.944047198700474, "learning_rate": 7.36414974130531e-08, "loss": 0.4532, "step": 11388 }, { "epoch": 0.9249573621375782, "grad_norm": 3.2439380546377583, "learning_rate": 7.348314284940706e-08, "loss": 0.4904, "step": 11389 }, { "epoch": 0.9250385771136197, "grad_norm": 4.141858638205531, "learning_rate": 7.332495618788516e-08, "loss": 0.5272, "step": 11390 }, { "epoch": 0.9251197920896613, "grad_norm": 3.3315239217657773, "learning_rate": 7.316693743943364e-08, "loss": 0.4314, "step": 11391 }, { "epoch": 0.925201007065703, "grad_norm": 3.9642024980963573, "learning_rate": 7.300908661498602e-08, "loss": 0.545, "step": 11392 }, { "epoch": 0.9252822220417445, "grad_norm": 6.271707028230578, "learning_rate": 7.28514037254649e-08, "loss": 0.4979, "step": 11393 }, { "epoch": 0.9253634370177861, "grad_norm": 6.887788740548668, "learning_rate": 7.26938887817813e-08, "loss": 0.5243, "step": 11394 }, { "epoch": 0.9254446519938276, "grad_norm": 4.351861411172992, "learning_rate": 7.2536541794834e-08, "loss": 0.563, "step": 11395 }, { "epoch": 0.9255258669698693, "grad_norm": 13.306179850830969, "learning_rate": 7.237936277551095e-08, "loss": 0.4579, "step": 11396 }, { "epoch": 0.9256070819459108, "grad_norm": 8.741284990101086, "learning_rate": 7.22223517346879e-08, "loss": 0.4368, "step": 11397 }, { "epoch": 0.9256882969219524, "grad_norm": 4.724782847856221, "learning_rate": 7.206550868322947e-08, "loss": 0.5499, "step": 11398 }, { "epoch": 0.925769511897994, "grad_norm": 4.443264413967751, "learning_rate": 7.190883363198815e-08, "loss": 0.4174, "step": 11399 }, { "epoch": 0.9258507268740356, "grad_norm": 10.053412856400966, "learning_rate": 7.175232659180492e-08, "loss": 0.6192, "step": 11400 }, { "epoch": 0.9259319418500771, "grad_norm": 8.327053855258349, "learning_rate": 7.159598757350922e-08, "loss": 0.4117, "step": 11401 }, { "epoch": 0.9260131568261187, "grad_norm": 4.431594842283679, "learning_rate": 7.143981658791933e-08, "loss": 0.4699, "step": 11402 }, { "epoch": 0.9260943718021604, "grad_norm": 3.250722005433043, "learning_rate": 7.128381364584075e-08, "loss": 0.5342, "step": 11403 }, { "epoch": 0.9261755867782019, "grad_norm": 2.957886228743401, "learning_rate": 7.112797875806904e-08, "loss": 0.4175, "step": 11404 }, { "epoch": 0.9262568017542435, "grad_norm": 6.59101582446327, "learning_rate": 7.09723119353864e-08, "loss": 0.4268, "step": 11405 }, { "epoch": 0.926338016730285, "grad_norm": 4.053080793203663, "learning_rate": 7.081681318856392e-08, "loss": 0.4142, "step": 11406 }, { "epoch": 0.9264192317063267, "grad_norm": 48.79556971471609, "learning_rate": 7.066148252836219e-08, "loss": 0.4385, "step": 11407 }, { "epoch": 0.9265004466823682, "grad_norm": 4.586752487959576, "learning_rate": 7.050631996552842e-08, "loss": 0.4813, "step": 11408 }, { "epoch": 0.9265816616584098, "grad_norm": 5.714365886095958, "learning_rate": 7.035132551079932e-08, "loss": 0.3704, "step": 11409 }, { "epoch": 0.9266628766344513, "grad_norm": 8.579523295772237, "learning_rate": 7.019649917490018e-08, "loss": 0.3965, "step": 11410 }, { "epoch": 0.926744091610493, "grad_norm": 3.0236689032912842, "learning_rate": 7.004184096854356e-08, "loss": 0.572, "step": 11411 }, { "epoch": 0.9268253065865345, "grad_norm": 6.812565797682661, "learning_rate": 6.988735090243142e-08, "loss": 0.4516, "step": 11412 }, { "epoch": 0.9269065215625761, "grad_norm": 7.700307129522538, "learning_rate": 6.973302898725303e-08, "loss": 0.4734, "step": 11413 }, { "epoch": 0.9269877365386178, "grad_norm": 4.263009068805309, "learning_rate": 6.957887523368678e-08, "loss": 0.4478, "step": 11414 }, { "epoch": 0.9270689515146593, "grad_norm": 5.759822571490719, "learning_rate": 6.942488965240024e-08, "loss": 0.4749, "step": 11415 }, { "epoch": 0.9271501664907009, "grad_norm": 7.781687526540662, "learning_rate": 6.92710722540471e-08, "loss": 0.398, "step": 11416 }, { "epoch": 0.9272313814667424, "grad_norm": 6.317228165360641, "learning_rate": 6.911742304927166e-08, "loss": 0.3865, "step": 11417 }, { "epoch": 0.9273125964427841, "grad_norm": 6.2711857470132095, "learning_rate": 6.896394204870538e-08, "loss": 0.4675, "step": 11418 }, { "epoch": 0.9273938114188256, "grad_norm": 4.60041184087192, "learning_rate": 6.881062926296783e-08, "loss": 0.5341, "step": 11419 }, { "epoch": 0.9274750263948672, "grad_norm": 5.061233791327525, "learning_rate": 6.865748470266803e-08, "loss": 0.5314, "step": 11420 }, { "epoch": 0.9275562413709088, "grad_norm": 3.7105712181882007, "learning_rate": 6.85045083784025e-08, "loss": 0.3024, "step": 11421 }, { "epoch": 0.9276374563469504, "grad_norm": 12.81065039260799, "learning_rate": 6.835170030075638e-08, "loss": 0.4527, "step": 11422 }, { "epoch": 0.9277186713229919, "grad_norm": 6.058599141728342, "learning_rate": 6.819906048030345e-08, "loss": 0.4631, "step": 11423 }, { "epoch": 0.9277998862990335, "grad_norm": 6.482547478325477, "learning_rate": 6.804658892760552e-08, "loss": 0.4521, "step": 11424 }, { "epoch": 0.9278811012750752, "grad_norm": 14.54413960358652, "learning_rate": 6.789428565321249e-08, "loss": 0.3769, "step": 11425 }, { "epoch": 0.9279623162511167, "grad_norm": 5.183303848496903, "learning_rate": 6.774215066766344e-08, "loss": 0.535, "step": 11426 }, { "epoch": 0.9280435312271583, "grad_norm": 4.383546792589708, "learning_rate": 6.759018398148464e-08, "loss": 0.3777, "step": 11427 }, { "epoch": 0.9281247462031998, "grad_norm": 14.039870214824262, "learning_rate": 6.743838560519189e-08, "loss": 0.508, "step": 11428 }, { "epoch": 0.9282059611792415, "grad_norm": 6.608844886900423, "learning_rate": 6.728675554928898e-08, "loss": 0.3648, "step": 11429 }, { "epoch": 0.928287176155283, "grad_norm": 4.663438602760716, "learning_rate": 6.713529382426726e-08, "loss": 0.3972, "step": 11430 }, { "epoch": 0.9283683911313246, "grad_norm": 3.8710006305176226, "learning_rate": 6.698400044060777e-08, "loss": 0.4688, "step": 11431 }, { "epoch": 0.9284496061073662, "grad_norm": 3.7449549283487324, "learning_rate": 6.683287540877853e-08, "loss": 0.539, "step": 11432 }, { "epoch": 0.9285308210834078, "grad_norm": 4.662838653734012, "learning_rate": 6.668191873923701e-08, "loss": 0.5233, "step": 11433 }, { "epoch": 0.9286120360594493, "grad_norm": 4.3917574513396955, "learning_rate": 6.653113044242904e-08, "loss": 0.5587, "step": 11434 }, { "epoch": 0.9286932510354909, "grad_norm": 9.052354440821007, "learning_rate": 6.638051052878736e-08, "loss": 0.5373, "step": 11435 }, { "epoch": 0.9287744660115326, "grad_norm": 3.219724415933315, "learning_rate": 6.623005900873474e-08, "loss": 0.4253, "step": 11436 }, { "epoch": 0.9288556809875741, "grad_norm": 3.5215886145233584, "learning_rate": 6.607977589268177e-08, "loss": 0.447, "step": 11437 }, { "epoch": 0.9289368959636157, "grad_norm": 6.121975994953795, "learning_rate": 6.59296611910265e-08, "loss": 0.4416, "step": 11438 }, { "epoch": 0.9290181109396572, "grad_norm": 7.404459895688475, "learning_rate": 6.577971491415674e-08, "loss": 0.4477, "step": 11439 }, { "epoch": 0.9290993259156989, "grad_norm": 9.284787320280776, "learning_rate": 6.56299370724478e-08, "loss": 0.5049, "step": 11440 }, { "epoch": 0.9291805408917404, "grad_norm": 7.115335886705074, "learning_rate": 6.548032767626333e-08, "loss": 0.4512, "step": 11441 }, { "epoch": 0.929261755867782, "grad_norm": 6.530284682735083, "learning_rate": 6.533088673595589e-08, "loss": 0.3873, "step": 11442 }, { "epoch": 0.9293429708438236, "grad_norm": 5.05350752987569, "learning_rate": 6.51816142618658e-08, "loss": 0.5982, "step": 11443 }, { "epoch": 0.9294241858198652, "grad_norm": 8.402220447701428, "learning_rate": 6.503251026432179e-08, "loss": 0.517, "step": 11444 }, { "epoch": 0.9295054007959067, "grad_norm": 5.712552445076794, "learning_rate": 6.48835747536411e-08, "loss": 0.4236, "step": 11445 }, { "epoch": 0.9295866157719483, "grad_norm": 8.723406889128468, "learning_rate": 6.473480774012941e-08, "loss": 0.4614, "step": 11446 }, { "epoch": 0.92966783074799, "grad_norm": 3.518539374576921, "learning_rate": 6.458620923408044e-08, "loss": 0.697, "step": 11447 }, { "epoch": 0.9297490457240315, "grad_norm": 4.996661458005611, "learning_rate": 6.443777924577676e-08, "loss": 0.7767, "step": 11448 }, { "epoch": 0.9298302607000731, "grad_norm": 6.029619825429544, "learning_rate": 6.428951778548881e-08, "loss": 0.5338, "step": 11449 }, { "epoch": 0.9299114756761147, "grad_norm": 6.072175958874135, "learning_rate": 6.414142486347557e-08, "loss": 0.53, "step": 11450 }, { "epoch": 0.9299926906521563, "grad_norm": 4.106779337844846, "learning_rate": 6.39935004899836e-08, "loss": 0.5014, "step": 11451 }, { "epoch": 0.9300739056281978, "grad_norm": 5.028928964661914, "learning_rate": 6.38457446752494e-08, "loss": 0.3896, "step": 11452 }, { "epoch": 0.9301551206042394, "grad_norm": 3.916926797126131, "learning_rate": 6.36981574294962e-08, "loss": 0.4988, "step": 11453 }, { "epoch": 0.930236335580281, "grad_norm": 11.573875343354155, "learning_rate": 6.355073876293638e-08, "loss": 0.4861, "step": 11454 }, { "epoch": 0.9303175505563226, "grad_norm": 2.953512192237326, "learning_rate": 6.340348868577123e-08, "loss": 0.4556, "step": 11455 }, { "epoch": 0.9303987655323641, "grad_norm": 5.593041199500861, "learning_rate": 6.325640720818899e-08, "loss": 0.4905, "step": 11456 }, { "epoch": 0.9304799805084057, "grad_norm": 5.055425216937131, "learning_rate": 6.310949434036707e-08, "loss": 0.5173, "step": 11457 }, { "epoch": 0.9305611954844474, "grad_norm": 5.280470361998423, "learning_rate": 6.296275009247121e-08, "loss": 0.4225, "step": 11458 }, { "epoch": 0.9306424104604889, "grad_norm": 3.6832709176499785, "learning_rate": 6.28161744746547e-08, "loss": 0.453, "step": 11459 }, { "epoch": 0.9307236254365305, "grad_norm": 7.653920651385478, "learning_rate": 6.266976749706055e-08, "loss": 0.4662, "step": 11460 }, { "epoch": 0.930804840412572, "grad_norm": 5.873778228921937, "learning_rate": 6.252352916981924e-08, "loss": 0.4553, "step": 11461 }, { "epoch": 0.9308860553886137, "grad_norm": 4.554092544444619, "learning_rate": 6.237745950304963e-08, "loss": 0.6166, "step": 11462 }, { "epoch": 0.9309672703646552, "grad_norm": 5.080682795743635, "learning_rate": 6.223155850685864e-08, "loss": 0.5027, "step": 11463 }, { "epoch": 0.9310484853406968, "grad_norm": 16.051986684845595, "learning_rate": 6.208582619134234e-08, "loss": 0.5546, "step": 11464 }, { "epoch": 0.9311297003167384, "grad_norm": 7.21160759933066, "learning_rate": 6.194026256658437e-08, "loss": 0.412, "step": 11465 }, { "epoch": 0.93121091529278, "grad_norm": 7.017201794326772, "learning_rate": 6.179486764265663e-08, "loss": 0.5635, "step": 11466 }, { "epoch": 0.9312921302688215, "grad_norm": 6.919895696187786, "learning_rate": 6.164964142962027e-08, "loss": 0.4289, "step": 11467 }, { "epoch": 0.9313733452448631, "grad_norm": 6.03620856801876, "learning_rate": 6.15045839375239e-08, "loss": 0.5504, "step": 11468 }, { "epoch": 0.9314545602209048, "grad_norm": 6.279814421491574, "learning_rate": 6.135969517640506e-08, "loss": 0.3537, "step": 11469 }, { "epoch": 0.9315357751969463, "grad_norm": 9.154160836609854, "learning_rate": 6.12149751562885e-08, "loss": 0.6001, "step": 11470 }, { "epoch": 0.9316169901729879, "grad_norm": 6.53932524076683, "learning_rate": 6.107042388718898e-08, "loss": 0.3881, "step": 11471 }, { "epoch": 0.9316982051490295, "grad_norm": 6.633007691995465, "learning_rate": 6.092604137910768e-08, "loss": 0.5065, "step": 11472 }, { "epoch": 0.9317794201250711, "grad_norm": 5.199512015394908, "learning_rate": 6.078182764203605e-08, "loss": 0.5226, "step": 11473 }, { "epoch": 0.9318606351011126, "grad_norm": 7.046681958143693, "learning_rate": 6.063778268595278e-08, "loss": 0.468, "step": 11474 }, { "epoch": 0.9319418500771542, "grad_norm": 7.741942160235145, "learning_rate": 6.04939065208246e-08, "loss": 0.4611, "step": 11475 }, { "epoch": 0.9320230650531958, "grad_norm": 6.7144122182681745, "learning_rate": 6.035019915660717e-08, "loss": 0.4898, "step": 11476 }, { "epoch": 0.9321042800292374, "grad_norm": 4.677019846346351, "learning_rate": 6.020666060324448e-08, "loss": 0.4645, "step": 11477 }, { "epoch": 0.9321854950052789, "grad_norm": 4.133353885157457, "learning_rate": 6.006329087066831e-08, "loss": 0.36, "step": 11478 }, { "epoch": 0.9322667099813206, "grad_norm": 6.172109607468845, "learning_rate": 5.992008996879906e-08, "loss": 0.4392, "step": 11479 }, { "epoch": 0.9323479249573622, "grad_norm": 4.757665364741502, "learning_rate": 5.977705790754546e-08, "loss": 0.5154, "step": 11480 }, { "epoch": 0.9324291399334037, "grad_norm": 9.820132371604062, "learning_rate": 5.963419469680543e-08, "loss": 0.4637, "step": 11481 }, { "epoch": 0.9325103549094453, "grad_norm": 4.899371695926488, "learning_rate": 5.9491500346463005e-08, "loss": 0.5009, "step": 11482 }, { "epoch": 0.9325915698854869, "grad_norm": 3.7801882505750366, "learning_rate": 5.934897486639307e-08, "loss": 0.4272, "step": 11483 }, { "epoch": 0.9326727848615285, "grad_norm": 7.561614245206794, "learning_rate": 5.9206618266456904e-08, "loss": 0.5861, "step": 11484 }, { "epoch": 0.93275399983757, "grad_norm": 4.538003207030416, "learning_rate": 5.906443055650496e-08, "loss": 0.6691, "step": 11485 }, { "epoch": 0.9328352148136116, "grad_norm": 5.474716210455667, "learning_rate": 5.892241174637575e-08, "loss": 0.3982, "step": 11486 }, { "epoch": 0.9329164297896532, "grad_norm": 8.069204894922803, "learning_rate": 5.8780561845896697e-08, "loss": 0.52, "step": 11487 }, { "epoch": 0.9329976447656948, "grad_norm": 5.907171852729622, "learning_rate": 5.863888086488301e-08, "loss": 0.5574, "step": 11488 }, { "epoch": 0.9330788597417363, "grad_norm": 6.090121506693037, "learning_rate": 5.849736881313767e-08, "loss": 0.4025, "step": 11489 }, { "epoch": 0.933160074717778, "grad_norm": 8.516034082485882, "learning_rate": 5.835602570045312e-08, "loss": 0.4993, "step": 11490 }, { "epoch": 0.9332412896938196, "grad_norm": 6.277551920267633, "learning_rate": 5.8214851536609326e-08, "loss": 0.4205, "step": 11491 }, { "epoch": 0.9333225046698611, "grad_norm": 8.065629651818462, "learning_rate": 5.807384633137459e-08, "loss": 0.4213, "step": 11492 }, { "epoch": 0.9334037196459027, "grad_norm": 4.95647438456384, "learning_rate": 5.793301009450636e-08, "loss": 0.4459, "step": 11493 }, { "epoch": 0.9334849346219443, "grad_norm": 4.983399216729912, "learning_rate": 5.779234283574936e-08, "loss": 0.5453, "step": 11494 }, { "epoch": 0.9335661495979859, "grad_norm": 5.591248101931953, "learning_rate": 5.765184456483664e-08, "loss": 0.453, "step": 11495 }, { "epoch": 0.9336473645740274, "grad_norm": 5.18268156667367, "learning_rate": 5.7511515291490686e-08, "loss": 0.4371, "step": 11496 }, { "epoch": 0.933728579550069, "grad_norm": 6.395159139697958, "learning_rate": 5.737135502542124e-08, "loss": 0.5132, "step": 11497 }, { "epoch": 0.9338097945261106, "grad_norm": 5.026018785036258, "learning_rate": 5.7231363776326096e-08, "loss": 0.4138, "step": 11498 }, { "epoch": 0.9338910095021522, "grad_norm": 4.941851075904659, "learning_rate": 5.709154155389279e-08, "loss": 0.5284, "step": 11499 }, { "epoch": 0.9339722244781937, "grad_norm": 5.637616770839022, "learning_rate": 5.6951888367795804e-08, "loss": 0.5964, "step": 11500 }, { "epoch": 0.9340534394542354, "grad_norm": 6.004225890183753, "learning_rate": 5.681240422769879e-08, "loss": 0.4451, "step": 11501 }, { "epoch": 0.934134654430277, "grad_norm": 4.723000628789615, "learning_rate": 5.6673089143252646e-08, "loss": 0.3078, "step": 11502 }, { "epoch": 0.9342158694063185, "grad_norm": 6.077929375109591, "learning_rate": 5.653394312409771e-08, "loss": 0.6627, "step": 11503 }, { "epoch": 0.9342970843823601, "grad_norm": 6.472316849971879, "learning_rate": 5.639496617986184e-08, "loss": 0.4792, "step": 11504 }, { "epoch": 0.9343782993584017, "grad_norm": 6.293958361792907, "learning_rate": 5.625615832016179e-08, "loss": 0.438, "step": 11505 }, { "epoch": 0.9344595143344433, "grad_norm": 5.09268953575727, "learning_rate": 5.6117519554602375e-08, "loss": 0.5255, "step": 11506 }, { "epoch": 0.9345407293104848, "grad_norm": 5.054483428815471, "learning_rate": 5.597904989277675e-08, "loss": 0.437, "step": 11507 }, { "epoch": 0.9346219442865265, "grad_norm": 3.9858596078394943, "learning_rate": 5.584074934426559e-08, "loss": 0.5096, "step": 11508 }, { "epoch": 0.934703159262568, "grad_norm": 8.170161084504295, "learning_rate": 5.570261791863957e-08, "loss": 0.4268, "step": 11509 }, { "epoch": 0.9347843742386096, "grad_norm": 5.199339732120409, "learning_rate": 5.5564655625455766e-08, "loss": 0.7232, "step": 11510 }, { "epoch": 0.9348655892146511, "grad_norm": 4.643135079968665, "learning_rate": 5.5426862474260986e-08, "loss": 0.4998, "step": 11511 }, { "epoch": 0.9349468041906928, "grad_norm": 4.163342887629166, "learning_rate": 5.528923847458928e-08, "loss": 0.4594, "step": 11512 }, { "epoch": 0.9350280191667344, "grad_norm": 5.970898841399132, "learning_rate": 5.5151783635964126e-08, "loss": 0.46, "step": 11513 }, { "epoch": 0.9351092341427759, "grad_norm": 5.478270806254457, "learning_rate": 5.5014497967896266e-08, "loss": 0.4887, "step": 11514 }, { "epoch": 0.9351904491188175, "grad_norm": 4.460190714960654, "learning_rate": 5.4877381479885307e-08, "loss": 0.5598, "step": 11515 }, { "epoch": 0.9352716640948591, "grad_norm": 4.1338050878154355, "learning_rate": 5.4740434181418945e-08, "loss": 0.47, "step": 11516 }, { "epoch": 0.9353528790709007, "grad_norm": 6.435871850940438, "learning_rate": 5.460365608197293e-08, "loss": 0.4158, "step": 11517 }, { "epoch": 0.9354340940469422, "grad_norm": 7.643223731934275, "learning_rate": 5.4467047191011924e-08, "loss": 0.3233, "step": 11518 }, { "epoch": 0.9355153090229839, "grad_norm": 5.4260804495525825, "learning_rate": 5.4330607517988635e-08, "loss": 0.5388, "step": 11519 }, { "epoch": 0.9355965239990254, "grad_norm": 4.953162854169758, "learning_rate": 5.419433707234356e-08, "loss": 0.4487, "step": 11520 }, { "epoch": 0.935677738975067, "grad_norm": 3.6992673910297293, "learning_rate": 5.4058235863506116e-08, "loss": 0.5167, "step": 11521 }, { "epoch": 0.9357589539511085, "grad_norm": 4.996367276417503, "learning_rate": 5.392230390089404e-08, "loss": 0.5104, "step": 11522 }, { "epoch": 0.9358401689271502, "grad_norm": 4.62381128315659, "learning_rate": 5.3786541193912854e-08, "loss": 0.5088, "step": 11523 }, { "epoch": 0.9359213839031918, "grad_norm": 7.587560626305793, "learning_rate": 5.3650947751956174e-08, "loss": 0.3284, "step": 11524 }, { "epoch": 0.9360025988792333, "grad_norm": 5.005872247413698, "learning_rate": 5.351552358440704e-08, "loss": 0.4261, "step": 11525 }, { "epoch": 0.936083813855275, "grad_norm": 5.695031699056669, "learning_rate": 5.3380268700636006e-08, "loss": 0.4862, "step": 11526 }, { "epoch": 0.9361650288313165, "grad_norm": 5.066704771887348, "learning_rate": 5.324518311000143e-08, "loss": 0.5233, "step": 11527 }, { "epoch": 0.9362462438073581, "grad_norm": 3.891689553812398, "learning_rate": 5.311026682185139e-08, "loss": 0.4053, "step": 11528 }, { "epoch": 0.9363274587833996, "grad_norm": 4.149090685885997, "learning_rate": 5.297551984552063e-08, "loss": 0.5201, "step": 11529 }, { "epoch": 0.9364086737594413, "grad_norm": 4.452851895114947, "learning_rate": 5.2840942190333086e-08, "loss": 0.4647, "step": 11530 }, { "epoch": 0.9364898887354828, "grad_norm": 5.464100401282328, "learning_rate": 5.270653386560104e-08, "loss": 0.4444, "step": 11531 }, { "epoch": 0.9365711037115244, "grad_norm": 7.585338070087896, "learning_rate": 5.257229488062482e-08, "loss": 0.4242, "step": 11532 }, { "epoch": 0.9366523186875659, "grad_norm": 9.756341519461458, "learning_rate": 5.243822524469283e-08, "loss": 0.3792, "step": 11533 }, { "epoch": 0.9367335336636076, "grad_norm": 6.0305409951018225, "learning_rate": 5.23043249670821e-08, "loss": 0.4986, "step": 11534 }, { "epoch": 0.9368147486396492, "grad_norm": 8.933422659020032, "learning_rate": 5.2170594057058264e-08, "loss": 0.647, "step": 11535 }, { "epoch": 0.9368959636156907, "grad_norm": 7.945666217999835, "learning_rate": 5.2037032523873654e-08, "loss": 0.4578, "step": 11536 }, { "epoch": 0.9369771785917324, "grad_norm": 8.210836981044634, "learning_rate": 5.190364037677142e-08, "loss": 0.4933, "step": 11537 }, { "epoch": 0.9370583935677739, "grad_norm": 4.671044337470819, "learning_rate": 5.1770417624980306e-08, "loss": 0.4541, "step": 11538 }, { "epoch": 0.9371396085438155, "grad_norm": 6.043521315334802, "learning_rate": 5.1637364277719595e-08, "loss": 0.5181, "step": 11539 }, { "epoch": 0.937220823519857, "grad_norm": 5.1011629271022185, "learning_rate": 5.150448034419525e-08, "loss": 0.5324, "step": 11540 }, { "epoch": 0.9373020384958987, "grad_norm": 3.8201585871136734, "learning_rate": 5.1371765833602703e-08, "loss": 0.4803, "step": 11541 }, { "epoch": 0.9373832534719402, "grad_norm": 5.758818308718375, "learning_rate": 5.123922075512461e-08, "loss": 0.3395, "step": 11542 }, { "epoch": 0.9374644684479818, "grad_norm": 3.435971230625552, "learning_rate": 5.110684511793251e-08, "loss": 0.6348, "step": 11543 }, { "epoch": 0.9375456834240233, "grad_norm": 7.13731160639292, "learning_rate": 5.0974638931186036e-08, "loss": 0.3566, "step": 11544 }, { "epoch": 0.937626898400065, "grad_norm": 9.350076135939414, "learning_rate": 5.084260220403342e-08, "loss": 0.4923, "step": 11545 }, { "epoch": 0.9377081133761066, "grad_norm": 4.901127986761642, "learning_rate": 5.0710734945610686e-08, "loss": 0.4283, "step": 11546 }, { "epoch": 0.9377893283521481, "grad_norm": 4.685757580589546, "learning_rate": 5.057903716504248e-08, "loss": 0.491, "step": 11547 }, { "epoch": 0.9378705433281898, "grad_norm": 5.048648910312251, "learning_rate": 5.044750887144151e-08, "loss": 0.2839, "step": 11548 }, { "epoch": 0.9379517583042313, "grad_norm": 6.1324537629824345, "learning_rate": 5.0316150073908555e-08, "loss": 0.4318, "step": 11549 }, { "epoch": 0.9380329732802729, "grad_norm": 5.300490311250102, "learning_rate": 5.0184960781533844e-08, "loss": 0.3291, "step": 11550 }, { "epoch": 0.9381141882563144, "grad_norm": 4.481525918571603, "learning_rate": 5.005394100339373e-08, "loss": 0.6305, "step": 11551 }, { "epoch": 0.9381954032323561, "grad_norm": 5.628772875047986, "learning_rate": 4.992309074855484e-08, "loss": 0.4441, "step": 11552 }, { "epoch": 0.9382766182083976, "grad_norm": 5.631338094522123, "learning_rate": 4.97924100260716e-08, "loss": 0.362, "step": 11553 }, { "epoch": 0.9383578331844392, "grad_norm": 4.450334044578378, "learning_rate": 4.966189884498596e-08, "loss": 0.4233, "step": 11554 }, { "epoch": 0.9384390481604807, "grad_norm": 4.584301414669546, "learning_rate": 4.953155721432873e-08, "loss": 0.4751, "step": 11555 }, { "epoch": 0.9385202631365224, "grad_norm": 4.749314594480385, "learning_rate": 4.940138514311854e-08, "loss": 0.5722, "step": 11556 }, { "epoch": 0.938601478112564, "grad_norm": 4.863986365737658, "learning_rate": 4.927138264036291e-08, "loss": 0.4222, "step": 11557 }, { "epoch": 0.9386826930886055, "grad_norm": 5.649739794167962, "learning_rate": 4.9141549715057415e-08, "loss": 0.4626, "step": 11558 }, { "epoch": 0.9387639080646472, "grad_norm": 3.4200650534492807, "learning_rate": 4.90118863761857e-08, "loss": 0.412, "step": 11559 }, { "epoch": 0.9388451230406887, "grad_norm": 8.38158690698774, "learning_rate": 4.888239263271977e-08, "loss": 0.5341, "step": 11560 }, { "epoch": 0.9389263380167303, "grad_norm": 3.2373374116022893, "learning_rate": 4.875306849361994e-08, "loss": 0.4586, "step": 11561 }, { "epoch": 0.9390075529927718, "grad_norm": 4.301803597627936, "learning_rate": 4.862391396783461e-08, "loss": 0.3589, "step": 11562 }, { "epoch": 0.9390887679688135, "grad_norm": 5.310812898753721, "learning_rate": 4.849492906430081e-08, "loss": 0.4699, "step": 11563 }, { "epoch": 0.939169982944855, "grad_norm": 5.309327329673865, "learning_rate": 4.836611379194334e-08, "loss": 0.5494, "step": 11564 }, { "epoch": 0.9392511979208966, "grad_norm": 8.887102738638378, "learning_rate": 4.8237468159675896e-08, "loss": 0.3454, "step": 11565 }, { "epoch": 0.9393324128969381, "grad_norm": 4.344793062039761, "learning_rate": 4.810899217639997e-08, "loss": 0.3838, "step": 11566 }, { "epoch": 0.9394136278729798, "grad_norm": 3.970983027129392, "learning_rate": 4.798068585100513e-08, "loss": 0.463, "step": 11567 }, { "epoch": 0.9394948428490214, "grad_norm": 6.57362417638814, "learning_rate": 4.785254919236954e-08, "loss": 0.4031, "step": 11568 }, { "epoch": 0.9395760578250629, "grad_norm": 3.7805553967871552, "learning_rate": 4.772458220936027e-08, "loss": 0.4996, "step": 11569 }, { "epoch": 0.9396572728011046, "grad_norm": 5.108413544198211, "learning_rate": 4.7596784910830804e-08, "loss": 0.4602, "step": 11570 }, { "epoch": 0.9397384877771461, "grad_norm": 4.497648219050411, "learning_rate": 4.74691573056249e-08, "loss": 0.3966, "step": 11571 }, { "epoch": 0.9398197027531877, "grad_norm": 4.1059390155620115, "learning_rate": 4.7341699402573546e-08, "loss": 0.3698, "step": 11572 }, { "epoch": 0.9399009177292292, "grad_norm": 4.091197560447974, "learning_rate": 4.721441121049608e-08, "loss": 0.4798, "step": 11573 }, { "epoch": 0.9399821327052709, "grad_norm": 5.006114532781862, "learning_rate": 4.7087292738200454e-08, "loss": 0.3846, "step": 11574 }, { "epoch": 0.9400633476813124, "grad_norm": 7.010800653937726, "learning_rate": 4.696034399448185e-08, "loss": 0.3729, "step": 11575 }, { "epoch": 0.940144562657354, "grad_norm": 5.687679281302264, "learning_rate": 4.6833564988124914e-08, "loss": 0.4974, "step": 11576 }, { "epoch": 0.9402257776333955, "grad_norm": 5.740323370278037, "learning_rate": 4.670695572790235e-08, "loss": 0.4069, "step": 11577 }, { "epoch": 0.9403069926094372, "grad_norm": 8.241228285955831, "learning_rate": 4.658051622257437e-08, "loss": 0.6067, "step": 11578 }, { "epoch": 0.9403882075854788, "grad_norm": 4.495511178750625, "learning_rate": 4.6454246480890084e-08, "loss": 0.5117, "step": 11579 }, { "epoch": 0.9404694225615203, "grad_norm": 6.551988717623826, "learning_rate": 4.632814651158696e-08, "loss": 0.4359, "step": 11580 }, { "epoch": 0.940550637537562, "grad_norm": 9.683649036003146, "learning_rate": 4.620221632338995e-08, "loss": 0.4789, "step": 11581 }, { "epoch": 0.9406318525136035, "grad_norm": 5.298725894305153, "learning_rate": 4.607645592501347e-08, "loss": 0.4102, "step": 11582 }, { "epoch": 0.9407130674896451, "grad_norm": 5.2892762046601165, "learning_rate": 4.5950865325158636e-08, "loss": 0.5595, "step": 11583 }, { "epoch": 0.9407942824656866, "grad_norm": 6.532711062097573, "learning_rate": 4.582544453251597e-08, "loss": 0.3798, "step": 11584 }, { "epoch": 0.9408754974417283, "grad_norm": 7.025731896892658, "learning_rate": 4.57001935557641e-08, "loss": 0.444, "step": 11585 }, { "epoch": 0.9409567124177698, "grad_norm": 4.380484134636201, "learning_rate": 4.5575112403569985e-08, "loss": 0.4212, "step": 11586 }, { "epoch": 0.9410379273938114, "grad_norm": 78.3339524286403, "learning_rate": 4.545020108458781e-08, "loss": 0.428, "step": 11587 }, { "epoch": 0.941119142369853, "grad_norm": 4.997755103806688, "learning_rate": 4.5325459607461485e-08, "loss": 0.374, "step": 11588 }, { "epoch": 0.9412003573458946, "grad_norm": 4.4071049437250505, "learning_rate": 4.5200887980821897e-08, "loss": 0.6705, "step": 11589 }, { "epoch": 0.9412815723219362, "grad_norm": 4.496642306863949, "learning_rate": 4.5076486213289086e-08, "loss": 0.4301, "step": 11590 }, { "epoch": 0.9413627872979777, "grad_norm": 6.033449529408409, "learning_rate": 4.495225431347089e-08, "loss": 0.4866, "step": 11591 }, { "epoch": 0.9414440022740194, "grad_norm": 8.981792164965075, "learning_rate": 4.482819228996377e-08, "loss": 0.6155, "step": 11592 }, { "epoch": 0.9415252172500609, "grad_norm": 6.80461573911351, "learning_rate": 4.470430015135197e-08, "loss": 0.5014, "step": 11593 }, { "epoch": 0.9416064322261025, "grad_norm": 3.2551970239032495, "learning_rate": 4.458057790620779e-08, "loss": 0.4495, "step": 11594 }, { "epoch": 0.941687647202144, "grad_norm": 6.288968375149101, "learning_rate": 4.4457025563092724e-08, "loss": 0.4876, "step": 11595 }, { "epoch": 0.9417688621781857, "grad_norm": 5.114911602404954, "learning_rate": 4.433364313055549e-08, "loss": 0.6019, "step": 11596 }, { "epoch": 0.9418500771542272, "grad_norm": 10.285824379728702, "learning_rate": 4.42104306171337e-08, "loss": 0.3655, "step": 11597 }, { "epoch": 0.9419312921302688, "grad_norm": 4.250288445214354, "learning_rate": 4.4087388031353316e-08, "loss": 0.5237, "step": 11598 }, { "epoch": 0.9420125071063103, "grad_norm": 4.607829438175469, "learning_rate": 4.39645153817278e-08, "loss": 0.5283, "step": 11599 }, { "epoch": 0.942093722082352, "grad_norm": 7.915466583935277, "learning_rate": 4.384181267675952e-08, "loss": 0.4855, "step": 11600 }, { "epoch": 0.9421749370583936, "grad_norm": 5.46136708291508, "learning_rate": 4.3719279924938626e-08, "loss": 0.4598, "step": 11601 }, { "epoch": 0.9422561520344351, "grad_norm": 9.165200607430856, "learning_rate": 4.35969171347439e-08, "loss": 0.6132, "step": 11602 }, { "epoch": 0.9423373670104768, "grad_norm": 5.415018652981148, "learning_rate": 4.347472431464217e-08, "loss": 0.5896, "step": 11603 }, { "epoch": 0.9424185819865183, "grad_norm": 7.296700648217176, "learning_rate": 4.335270147308862e-08, "loss": 0.378, "step": 11604 }, { "epoch": 0.9424997969625599, "grad_norm": 4.175947964206813, "learning_rate": 4.32308486185265e-08, "loss": 0.4858, "step": 11605 }, { "epoch": 0.9425810119386014, "grad_norm": 5.139558164610286, "learning_rate": 4.3109165759387115e-08, "loss": 0.3888, "step": 11606 }, { "epoch": 0.9426622269146431, "grad_norm": 4.109603857789413, "learning_rate": 4.298765290409096e-08, "loss": 0.5018, "step": 11607 }, { "epoch": 0.9427434418906846, "grad_norm": 5.23991591103504, "learning_rate": 4.286631006104547e-08, "loss": 0.523, "step": 11608 }, { "epoch": 0.9428246568667262, "grad_norm": 5.661548234950453, "learning_rate": 4.2745137238646984e-08, "loss": 0.4094, "step": 11609 }, { "epoch": 0.9429058718427678, "grad_norm": 5.499414318975432, "learning_rate": 4.2624134445280186e-08, "loss": 0.6268, "step": 11610 }, { "epoch": 0.9429870868188094, "grad_norm": 4.495827244130177, "learning_rate": 4.25033016893181e-08, "loss": 0.548, "step": 11611 }, { "epoch": 0.943068301794851, "grad_norm": 3.8103963823755094, "learning_rate": 4.238263897912126e-08, "loss": 0.7681, "step": 11612 }, { "epoch": 0.9431495167708925, "grad_norm": 9.357203074745977, "learning_rate": 4.22621463230391e-08, "loss": 0.3692, "step": 11613 }, { "epoch": 0.9432307317469342, "grad_norm": 3.6750136809793412, "learning_rate": 4.214182372940884e-08, "loss": 0.601, "step": 11614 }, { "epoch": 0.9433119467229757, "grad_norm": 7.385444469733557, "learning_rate": 4.202167120655631e-08, "loss": 0.4761, "step": 11615 }, { "epoch": 0.9433931616990173, "grad_norm": 5.303774677219012, "learning_rate": 4.190168876279571e-08, "loss": 0.4371, "step": 11616 }, { "epoch": 0.9434743766750588, "grad_norm": 4.763056521648861, "learning_rate": 4.1781876406428725e-08, "loss": 0.4442, "step": 11617 }, { "epoch": 0.9435555916511005, "grad_norm": 5.484993623084503, "learning_rate": 4.1662234145746214e-08, "loss": 0.5426, "step": 11618 }, { "epoch": 0.943636806627142, "grad_norm": 4.303398425305994, "learning_rate": 4.154276198902629e-08, "loss": 0.5846, "step": 11619 }, { "epoch": 0.9437180216031836, "grad_norm": 5.295687518325545, "learning_rate": 4.1423459944536224e-08, "loss": 0.6349, "step": 11620 }, { "epoch": 0.9437992365792252, "grad_norm": 4.737076827144997, "learning_rate": 4.1304328020530804e-08, "loss": 0.4874, "step": 11621 }, { "epoch": 0.9438804515552668, "grad_norm": 15.165489379351301, "learning_rate": 4.118536622525315e-08, "loss": 0.3641, "step": 11622 }, { "epoch": 0.9439616665313084, "grad_norm": 3.5716624018017877, "learning_rate": 4.10665745669353e-08, "loss": 0.5441, "step": 11623 }, { "epoch": 0.9440428815073499, "grad_norm": 10.121245968679512, "learning_rate": 4.094795305379679e-08, "loss": 0.4997, "step": 11624 }, { "epoch": 0.9441240964833916, "grad_norm": 7.249621116373978, "learning_rate": 4.082950169404548e-08, "loss": 0.4358, "step": 11625 }, { "epoch": 0.9442053114594331, "grad_norm": 7.423045226596472, "learning_rate": 4.071122049587789e-08, "loss": 0.4766, "step": 11626 }, { "epoch": 0.9442865264354747, "grad_norm": 7.43889332245654, "learning_rate": 4.059310946747802e-08, "loss": 0.3625, "step": 11627 }, { "epoch": 0.9443677414115162, "grad_norm": 5.999420084859953, "learning_rate": 4.047516861701878e-08, "loss": 0.4215, "step": 11628 }, { "epoch": 0.9444489563875579, "grad_norm": 3.9495688216293363, "learning_rate": 4.035739795266086e-08, "loss": 0.5308, "step": 11629 }, { "epoch": 0.9445301713635994, "grad_norm": 8.225837590539006, "learning_rate": 4.0239797482553856e-08, "loss": 0.4483, "step": 11630 }, { "epoch": 0.944611386339641, "grad_norm": 4.507895023066098, "learning_rate": 4.012236721483487e-08, "loss": 0.5678, "step": 11631 }, { "epoch": 0.9446926013156827, "grad_norm": 3.795653600393925, "learning_rate": 4.0005107157628786e-08, "loss": 0.5971, "step": 11632 }, { "epoch": 0.9447738162917242, "grad_norm": 5.829598757301327, "learning_rate": 3.988801731905051e-08, "loss": 0.4818, "step": 11633 }, { "epoch": 0.9448550312677658, "grad_norm": 6.893765169454947, "learning_rate": 3.9771097707201056e-08, "loss": 0.3942, "step": 11634 }, { "epoch": 0.9449362462438073, "grad_norm": 7.0587960070765075, "learning_rate": 3.965434833017118e-08, "loss": 0.5054, "step": 11635 }, { "epoch": 0.945017461219849, "grad_norm": 8.051189199148112, "learning_rate": 3.9537769196039134e-08, "loss": 0.3643, "step": 11636 }, { "epoch": 0.9450986761958905, "grad_norm": 4.735681936699135, "learning_rate": 3.9421360312871804e-08, "loss": 0.6727, "step": 11637 }, { "epoch": 0.9451798911719321, "grad_norm": 3.9449239206953925, "learning_rate": 3.9305121688723855e-08, "loss": 0.5668, "step": 11638 }, { "epoch": 0.9452611061479737, "grad_norm": 6.946560704332956, "learning_rate": 3.918905333163858e-08, "loss": 0.6188, "step": 11639 }, { "epoch": 0.9453423211240153, "grad_norm": 8.04163254796676, "learning_rate": 3.9073155249647055e-08, "loss": 0.6601, "step": 11640 }, { "epoch": 0.9454235361000568, "grad_norm": 7.0432614921051675, "learning_rate": 3.895742745076869e-08, "loss": 0.4597, "step": 11641 }, { "epoch": 0.9455047510760984, "grad_norm": 5.740385711050734, "learning_rate": 3.8841869943011534e-08, "loss": 0.4694, "step": 11642 }, { "epoch": 0.9455859660521401, "grad_norm": 3.826367448009286, "learning_rate": 3.872648273437168e-08, "loss": 0.4556, "step": 11643 }, { "epoch": 0.9456671810281816, "grad_norm": 3.153187096292459, "learning_rate": 3.861126583283303e-08, "loss": 0.3736, "step": 11644 }, { "epoch": 0.9457483960042232, "grad_norm": 5.786888401113132, "learning_rate": 3.849621924636809e-08, "loss": 0.5045, "step": 11645 }, { "epoch": 0.9458296109802647, "grad_norm": 5.70036944601648, "learning_rate": 3.838134298293744e-08, "loss": 0.3993, "step": 11646 }, { "epoch": 0.9459108259563064, "grad_norm": 5.14700753120722, "learning_rate": 3.8266637050489716e-08, "loss": 0.6705, "step": 11647 }, { "epoch": 0.9459920409323479, "grad_norm": 5.028518558037041, "learning_rate": 3.815210145696219e-08, "loss": 0.5472, "step": 11648 }, { "epoch": 0.9460732559083895, "grad_norm": 5.08541011233092, "learning_rate": 3.803773621028045e-08, "loss": 0.4755, "step": 11649 }, { "epoch": 0.946154470884431, "grad_norm": 5.674196216201576, "learning_rate": 3.792354131835735e-08, "loss": 0.4256, "step": 11650 }, { "epoch": 0.9462356858604727, "grad_norm": 5.863265288069775, "learning_rate": 3.780951678909489e-08, "loss": 0.5258, "step": 11651 }, { "epoch": 0.9463169008365142, "grad_norm": 3.5058013583786476, "learning_rate": 3.769566263038288e-08, "loss": 0.534, "step": 11652 }, { "epoch": 0.9463981158125558, "grad_norm": 12.342326134411538, "learning_rate": 3.7581978850099456e-08, "loss": 0.5344, "step": 11653 }, { "epoch": 0.9464793307885975, "grad_norm": 4.819965937928789, "learning_rate": 3.7468465456110825e-08, "loss": 0.4811, "step": 11654 }, { "epoch": 0.946560545764639, "grad_norm": 7.405750445686644, "learning_rate": 3.735512245627182e-08, "loss": 0.3502, "step": 11655 }, { "epoch": 0.9466417607406806, "grad_norm": 9.446771351873101, "learning_rate": 3.7241949858424777e-08, "loss": 0.4258, "step": 11656 }, { "epoch": 0.9467229757167221, "grad_norm": 6.936102522174853, "learning_rate": 3.712894767040093e-08, "loss": 0.5589, "step": 11657 }, { "epoch": 0.9468041906927638, "grad_norm": 4.033630186749134, "learning_rate": 3.7016115900019575e-08, "loss": 0.4372, "step": 11658 }, { "epoch": 0.9468854056688053, "grad_norm": 5.941857393690612, "learning_rate": 3.690345455508754e-08, "loss": 0.5026, "step": 11659 }, { "epoch": 0.9469666206448469, "grad_norm": 9.14091774177606, "learning_rate": 3.679096364340079e-08, "loss": 0.4604, "step": 11660 }, { "epoch": 0.9470478356208885, "grad_norm": 4.223097671499599, "learning_rate": 3.6678643172742836e-08, "loss": 0.4393, "step": 11661 }, { "epoch": 0.9471290505969301, "grad_norm": 3.9468831332746643, "learning_rate": 3.656649315088606e-08, "loss": 0.5762, "step": 11662 }, { "epoch": 0.9472102655729716, "grad_norm": 4.743190373097494, "learning_rate": 3.6454513585590376e-08, "loss": 0.4932, "step": 11663 }, { "epoch": 0.9472914805490132, "grad_norm": 3.896032850566264, "learning_rate": 3.634270448460403e-08, "loss": 0.5384, "step": 11664 }, { "epoch": 0.9473726955250549, "grad_norm": 4.452680361174725, "learning_rate": 3.623106585566388e-08, "loss": 0.3909, "step": 11665 }, { "epoch": 0.9474539105010964, "grad_norm": 6.087481905533953, "learning_rate": 3.611959770649487e-08, "loss": 0.4937, "step": 11666 }, { "epoch": 0.947535125477138, "grad_norm": 4.040614567901036, "learning_rate": 3.600830004480943e-08, "loss": 0.5005, "step": 11667 }, { "epoch": 0.9476163404531796, "grad_norm": 4.44420665256172, "learning_rate": 3.589717287830946e-08, "loss": 0.4575, "step": 11668 }, { "epoch": 0.9476975554292212, "grad_norm": 5.616498527932605, "learning_rate": 3.578621621468381e-08, "loss": 0.5212, "step": 11669 }, { "epoch": 0.9477787704052627, "grad_norm": 5.201502802582349, "learning_rate": 3.567543006161051e-08, "loss": 0.4217, "step": 11670 }, { "epoch": 0.9478599853813043, "grad_norm": 16.378504345168306, "learning_rate": 3.556481442675508e-08, "loss": 0.4972, "step": 11671 }, { "epoch": 0.9479412003573459, "grad_norm": 5.083869322555315, "learning_rate": 3.5454369317771686e-08, "loss": 0.4814, "step": 11672 }, { "epoch": 0.9480224153333875, "grad_norm": 7.335908845803885, "learning_rate": 3.534409474230255e-08, "loss": 0.4003, "step": 11673 }, { "epoch": 0.948103630309429, "grad_norm": 10.162528148573347, "learning_rate": 3.523399070797795e-08, "loss": 0.4699, "step": 11674 }, { "epoch": 0.9481848452854706, "grad_norm": 4.64890275823902, "learning_rate": 3.512405722241652e-08, "loss": 0.4235, "step": 11675 }, { "epoch": 0.9482660602615123, "grad_norm": 5.016364672410445, "learning_rate": 3.501429429322522e-08, "loss": 0.5031, "step": 11676 }, { "epoch": 0.9483472752375538, "grad_norm": 5.858582608768708, "learning_rate": 3.4904701927999385e-08, "loss": 0.4388, "step": 11677 }, { "epoch": 0.9484284902135954, "grad_norm": 4.81391584316998, "learning_rate": 3.479528013432154e-08, "loss": 0.4828, "step": 11678 }, { "epoch": 0.948509705189637, "grad_norm": 8.006015757453834, "learning_rate": 3.468602891976314e-08, "loss": 0.5023, "step": 11679 }, { "epoch": 0.9485909201656786, "grad_norm": 13.29141782498199, "learning_rate": 3.457694829188452e-08, "loss": 0.4342, "step": 11680 }, { "epoch": 0.9486721351417201, "grad_norm": 10.20237535145513, "learning_rate": 3.446803825823269e-08, "loss": 0.3647, "step": 11681 }, { "epoch": 0.9487533501177617, "grad_norm": 5.890040192499068, "learning_rate": 3.435929882634415e-08, "loss": 0.3275, "step": 11682 }, { "epoch": 0.9488345650938033, "grad_norm": 10.83456434309621, "learning_rate": 3.425073000374257e-08, "loss": 0.4327, "step": 11683 }, { "epoch": 0.9489157800698449, "grad_norm": 4.338209138202084, "learning_rate": 3.4142331797940855e-08, "loss": 0.5753, "step": 11684 }, { "epoch": 0.9489969950458864, "grad_norm": 4.739057059307357, "learning_rate": 3.4034104216439655e-08, "loss": 0.5673, "step": 11685 }, { "epoch": 0.949078210021928, "grad_norm": 7.495421522605058, "learning_rate": 3.3926047266727155e-08, "loss": 0.3894, "step": 11686 }, { "epoch": 0.9491594249979697, "grad_norm": 7.2165806929599166, "learning_rate": 3.381816095628071e-08, "loss": 0.5159, "step": 11687 }, { "epoch": 0.9492406399740112, "grad_norm": 3.9257600126354624, "learning_rate": 3.371044529256573e-08, "loss": 0.389, "step": 11688 }, { "epoch": 0.9493218549500528, "grad_norm": 9.018221626405468, "learning_rate": 3.360290028303487e-08, "loss": 0.4364, "step": 11689 }, { "epoch": 0.9494030699260944, "grad_norm": 4.246553322453479, "learning_rate": 3.34955259351305e-08, "loss": 0.5857, "step": 11690 }, { "epoch": 0.949484284902136, "grad_norm": 8.521074043393677, "learning_rate": 3.3388322256281694e-08, "loss": 0.4385, "step": 11691 }, { "epoch": 0.9495654998781775, "grad_norm": 8.037563989138466, "learning_rate": 3.328128925390667e-08, "loss": 0.5179, "step": 11692 }, { "epoch": 0.9496467148542191, "grad_norm": 5.337969212336468, "learning_rate": 3.317442693541145e-08, "loss": 0.5983, "step": 11693 }, { "epoch": 0.9497279298302607, "grad_norm": 5.550986004035013, "learning_rate": 3.306773530819041e-08, "loss": 0.5231, "step": 11694 }, { "epoch": 0.9498091448063023, "grad_norm": 5.0987699618930975, "learning_rate": 3.296121437962624e-08, "loss": 0.6063, "step": 11695 }, { "epoch": 0.9498903597823438, "grad_norm": 6.3006147509879105, "learning_rate": 3.2854864157089164e-08, "loss": 0.3196, "step": 11696 }, { "epoch": 0.9499715747583855, "grad_norm": 4.720216946251891, "learning_rate": 3.2748684647938564e-08, "loss": 0.5137, "step": 11697 }, { "epoch": 0.9500527897344271, "grad_norm": 17.425444481965904, "learning_rate": 3.264267585952108e-08, "loss": 0.5927, "step": 11698 }, { "epoch": 0.9501340047104686, "grad_norm": 6.961626143865487, "learning_rate": 3.253683779917194e-08, "loss": 0.4547, "step": 11699 }, { "epoch": 0.9502152196865102, "grad_norm": 7.6904422344104955, "learning_rate": 3.243117047421501e-08, "loss": 0.5546, "step": 11700 }, { "epoch": 0.9502964346625518, "grad_norm": 5.910333731484439, "learning_rate": 3.2325673891961394e-08, "loss": 0.641, "step": 11701 }, { "epoch": 0.9503776496385934, "grad_norm": 7.209494401473691, "learning_rate": 3.222034805971136e-08, "loss": 0.4257, "step": 11702 }, { "epoch": 0.9504588646146349, "grad_norm": 5.915799539490019, "learning_rate": 3.2115192984752684e-08, "loss": 0.3847, "step": 11703 }, { "epoch": 0.9505400795906765, "grad_norm": 8.657367196517342, "learning_rate": 3.2010208674361774e-08, "loss": 0.6126, "step": 11704 }, { "epoch": 0.9506212945667181, "grad_norm": 5.775245158747669, "learning_rate": 3.190539513580226e-08, "loss": 0.4298, "step": 11705 }, { "epoch": 0.9507025095427597, "grad_norm": 5.310693978477437, "learning_rate": 3.1800752376327515e-08, "loss": 0.5534, "step": 11706 }, { "epoch": 0.9507837245188012, "grad_norm": 4.560752491636321, "learning_rate": 3.169628040317785e-08, "loss": 0.3459, "step": 11707 }, { "epoch": 0.9508649394948429, "grad_norm": 3.5975956386159296, "learning_rate": 3.15919792235822e-08, "loss": 0.4466, "step": 11708 }, { "epoch": 0.9509461544708845, "grad_norm": 4.190834372560731, "learning_rate": 3.1487848844757865e-08, "loss": 0.5566, "step": 11709 }, { "epoch": 0.951027369446926, "grad_norm": 4.688075724324062, "learning_rate": 3.138388927391017e-08, "loss": 0.5138, "step": 11710 }, { "epoch": 0.9511085844229676, "grad_norm": 6.558872976862139, "learning_rate": 3.1280100518231994e-08, "loss": 0.4949, "step": 11711 }, { "epoch": 0.9511897993990092, "grad_norm": 6.1291704390663515, "learning_rate": 3.1176482584905356e-08, "loss": 0.6129, "step": 11712 }, { "epoch": 0.9512710143750508, "grad_norm": 4.609364781861959, "learning_rate": 3.107303548110008e-08, "loss": 0.5307, "step": 11713 }, { "epoch": 0.9513522293510923, "grad_norm": 4.543991780518672, "learning_rate": 3.0969759213974324e-08, "loss": 0.4039, "step": 11714 }, { "epoch": 0.951433444327134, "grad_norm": 5.065242928777515, "learning_rate": 3.086665379067405e-08, "loss": 0.4611, "step": 11715 }, { "epoch": 0.9515146593031755, "grad_norm": 7.643230789019235, "learning_rate": 3.0763719218333545e-08, "loss": 0.4837, "step": 11716 }, { "epoch": 0.9515958742792171, "grad_norm": 3.926885842127398, "learning_rate": 3.066095550407544e-08, "loss": 0.4451, "step": 11717 }, { "epoch": 0.9516770892552586, "grad_norm": 5.510721045832869, "learning_rate": 3.0558362655010443e-08, "loss": 0.5284, "step": 11718 }, { "epoch": 0.9517583042313003, "grad_norm": 4.815199357131891, "learning_rate": 3.045594067823704e-08, "loss": 0.4785, "step": 11719 }, { "epoch": 0.9518395192073419, "grad_norm": 9.394045320573086, "learning_rate": 3.0353689580843174e-08, "loss": 0.5275, "step": 11720 }, { "epoch": 0.9519207341833834, "grad_norm": 7.043341703763583, "learning_rate": 3.025160936990318e-08, "loss": 0.4034, "step": 11721 }, { "epoch": 0.952001949159425, "grad_norm": 5.105234133065711, "learning_rate": 3.0149700052481135e-08, "loss": 0.4658, "step": 11722 }, { "epoch": 0.9520831641354666, "grad_norm": 20.843773029070668, "learning_rate": 3.004796163562834e-08, "loss": 0.4, "step": 11723 }, { "epoch": 0.9521643791115082, "grad_norm": 4.834619084091482, "learning_rate": 2.994639412638445e-08, "loss": 0.4094, "step": 11724 }, { "epoch": 0.9522455940875497, "grad_norm": 5.958564630244604, "learning_rate": 2.984499753177772e-08, "loss": 0.4347, "step": 11725 }, { "epoch": 0.9523268090635914, "grad_norm": 7.525314088951139, "learning_rate": 2.9743771858823657e-08, "loss": 0.3909, "step": 11726 }, { "epoch": 0.9524080240396329, "grad_norm": 4.660331555963379, "learning_rate": 2.9642717114527208e-08, "loss": 0.4202, "step": 11727 }, { "epoch": 0.9524892390156745, "grad_norm": 4.986110534638319, "learning_rate": 2.9541833305880287e-08, "loss": 0.3993, "step": 11728 }, { "epoch": 0.952570453991716, "grad_norm": 3.464614961858637, "learning_rate": 2.9441120439864246e-08, "loss": 0.5475, "step": 11729 }, { "epoch": 0.9526516689677577, "grad_norm": 21.551152924357776, "learning_rate": 2.9340578523447127e-08, "loss": 0.4057, "step": 11730 }, { "epoch": 0.9527328839437993, "grad_norm": 3.6579610094520283, "learning_rate": 2.9240207563586142e-08, "loss": 0.5294, "step": 11731 }, { "epoch": 0.9528140989198408, "grad_norm": 5.768773438976465, "learning_rate": 2.914000756722657e-08, "loss": 0.4105, "step": 11732 }, { "epoch": 0.9528953138958824, "grad_norm": 6.3584167885272285, "learning_rate": 2.903997854130147e-08, "loss": 0.5214, "step": 11733 }, { "epoch": 0.952976528871924, "grad_norm": 7.692904252140693, "learning_rate": 2.8940120492732537e-08, "loss": 0.4954, "step": 11734 }, { "epoch": 0.9530577438479656, "grad_norm": 5.655701924850993, "learning_rate": 2.8840433428429514e-08, "loss": 0.7178, "step": 11735 }, { "epoch": 0.9531389588240071, "grad_norm": 4.970188795620924, "learning_rate": 2.8740917355290222e-08, "loss": 0.5701, "step": 11736 }, { "epoch": 0.9532201738000488, "grad_norm": 5.290922267712118, "learning_rate": 2.864157228019998e-08, "loss": 0.4525, "step": 11737 }, { "epoch": 0.9533013887760903, "grad_norm": 4.627627408383265, "learning_rate": 2.854239821003385e-08, "loss": 0.4818, "step": 11738 }, { "epoch": 0.9533826037521319, "grad_norm": 7.8256164051009325, "learning_rate": 2.8443395151653562e-08, "loss": 0.5673, "step": 11739 }, { "epoch": 0.9534638187281734, "grad_norm": 6.249505047993975, "learning_rate": 2.834456311190975e-08, "loss": 0.3818, "step": 11740 }, { "epoch": 0.9535450337042151, "grad_norm": 6.005374198159657, "learning_rate": 2.8245902097641388e-08, "loss": 0.6104, "step": 11741 }, { "epoch": 0.9536262486802567, "grad_norm": 5.110767892723842, "learning_rate": 2.8147412115674955e-08, "loss": 0.4915, "step": 11742 }, { "epoch": 0.9537074636562982, "grad_norm": 4.5731007153331955, "learning_rate": 2.8049093172825282e-08, "loss": 0.4256, "step": 11743 }, { "epoch": 0.9537886786323398, "grad_norm": 10.106582363128767, "learning_rate": 2.795094527589609e-08, "loss": 0.4245, "step": 11744 }, { "epoch": 0.9538698936083814, "grad_norm": 4.273795431273181, "learning_rate": 2.7852968431678064e-08, "loss": 0.4379, "step": 11745 }, { "epoch": 0.953951108584423, "grad_norm": 4.307573087745445, "learning_rate": 2.7755162646950773e-08, "loss": 0.6883, "step": 11746 }, { "epoch": 0.9540323235604645, "grad_norm": 5.5704933532695105, "learning_rate": 2.7657527928482418e-08, "loss": 0.503, "step": 11747 }, { "epoch": 0.9541135385365062, "grad_norm": 5.927129889565711, "learning_rate": 2.756006428302843e-08, "loss": 0.5112, "step": 11748 }, { "epoch": 0.9541947535125477, "grad_norm": 7.739005076724821, "learning_rate": 2.746277171733258e-08, "loss": 0.4949, "step": 11749 }, { "epoch": 0.9542759684885893, "grad_norm": 4.857345101026582, "learning_rate": 2.736565023812754e-08, "loss": 0.4114, "step": 11750 }, { "epoch": 0.9543571834646308, "grad_norm": 5.671784265799197, "learning_rate": 2.726869985213293e-08, "loss": 0.4542, "step": 11751 }, { "epoch": 0.9544383984406725, "grad_norm": 4.542817164476761, "learning_rate": 2.717192056605783e-08, "loss": 0.4281, "step": 11752 }, { "epoch": 0.9545196134167141, "grad_norm": 4.657810531667513, "learning_rate": 2.7075312386598274e-08, "loss": 0.5609, "step": 11753 }, { "epoch": 0.9546008283927556, "grad_norm": 4.663673614763122, "learning_rate": 2.697887532043947e-08, "loss": 0.4775, "step": 11754 }, { "epoch": 0.9546820433687973, "grad_norm": 5.2321773718184765, "learning_rate": 2.688260937425413e-08, "loss": 0.5095, "step": 11755 }, { "epoch": 0.9547632583448388, "grad_norm": 4.5112988407221835, "learning_rate": 2.67865145547036e-08, "loss": 0.4052, "step": 11756 }, { "epoch": 0.9548444733208804, "grad_norm": 4.994296884071479, "learning_rate": 2.6690590868436728e-08, "loss": 0.5502, "step": 11757 }, { "epoch": 0.9549256882969219, "grad_norm": 5.907644955879376, "learning_rate": 2.6594838322091255e-08, "loss": 0.5292, "step": 11758 }, { "epoch": 0.9550069032729636, "grad_norm": 4.76086278575601, "learning_rate": 2.6499256922292715e-08, "loss": 0.532, "step": 11759 }, { "epoch": 0.9550881182490051, "grad_norm": 10.420115891569926, "learning_rate": 2.640384667565471e-08, "loss": 0.5241, "step": 11760 }, { "epoch": 0.9551693332250467, "grad_norm": 8.326204967482434, "learning_rate": 2.6308607588779177e-08, "loss": 0.3722, "step": 11761 }, { "epoch": 0.9552505482010882, "grad_norm": 4.358771745047045, "learning_rate": 2.6213539668256126e-08, "loss": 0.4494, "step": 11762 }, { "epoch": 0.9553317631771299, "grad_norm": 3.6696692614668676, "learning_rate": 2.6118642920663906e-08, "loss": 0.4823, "step": 11763 }, { "epoch": 0.9554129781531715, "grad_norm": 3.432418349481183, "learning_rate": 2.6023917352568652e-08, "loss": 0.4218, "step": 11764 }, { "epoch": 0.955494193129213, "grad_norm": 5.4984450992011515, "learning_rate": 2.592936297052512e-08, "loss": 0.5265, "step": 11765 }, { "epoch": 0.9555754081052547, "grad_norm": 4.899194056779067, "learning_rate": 2.5834979781075854e-08, "loss": 0.3908, "step": 11766 }, { "epoch": 0.9556566230812962, "grad_norm": 3.7039379030025317, "learning_rate": 2.5740767790751463e-08, "loss": 0.3868, "step": 11767 }, { "epoch": 0.9557378380573378, "grad_norm": 7.5200178859973725, "learning_rate": 2.5646727006071182e-08, "loss": 0.4753, "step": 11768 }, { "epoch": 0.9558190530333793, "grad_norm": 5.533797272476705, "learning_rate": 2.55528574335423e-08, "loss": 0.5694, "step": 11769 }, { "epoch": 0.955900268009421, "grad_norm": 4.804791486887786, "learning_rate": 2.5459159079659625e-08, "loss": 0.4299, "step": 11770 }, { "epoch": 0.9559814829854625, "grad_norm": 5.494381293218291, "learning_rate": 2.5365631950906856e-08, "loss": 0.2743, "step": 11771 }, { "epoch": 0.9560626979615041, "grad_norm": 5.394100187529947, "learning_rate": 2.5272276053755207e-08, "loss": 0.5166, "step": 11772 }, { "epoch": 0.9561439129375456, "grad_norm": 3.979565293315754, "learning_rate": 2.5179091394665346e-08, "loss": 0.3647, "step": 11773 }, { "epoch": 0.9562251279135873, "grad_norm": 5.9411168688277245, "learning_rate": 2.5086077980084057e-08, "loss": 0.4242, "step": 11774 }, { "epoch": 0.9563063428896289, "grad_norm": 6.4302270705636975, "learning_rate": 2.4993235816448136e-08, "loss": 0.4753, "step": 11775 }, { "epoch": 0.9563875578656704, "grad_norm": 4.533132653222595, "learning_rate": 2.4900564910181334e-08, "loss": 0.4724, "step": 11776 }, { "epoch": 0.9564687728417121, "grad_norm": 6.793870761327284, "learning_rate": 2.4808065267696303e-08, "loss": 0.3712, "step": 11777 }, { "epoch": 0.9565499878177536, "grad_norm": 6.118222175954092, "learning_rate": 2.4715736895393195e-08, "loss": 0.4916, "step": 11778 }, { "epoch": 0.9566312027937952, "grad_norm": 6.201932383609763, "learning_rate": 2.462357979966107e-08, "loss": 0.4406, "step": 11779 }, { "epoch": 0.9567124177698367, "grad_norm": 5.730634163072851, "learning_rate": 2.453159398687649e-08, "loss": 0.47, "step": 11780 }, { "epoch": 0.9567936327458784, "grad_norm": 5.240490008736156, "learning_rate": 2.443977946340409e-08, "loss": 0.5144, "step": 11781 }, { "epoch": 0.9568748477219199, "grad_norm": 2.6161093483379783, "learning_rate": 2.4348136235597398e-08, "loss": 0.4603, "step": 11782 }, { "epoch": 0.9569560626979615, "grad_norm": 4.15867662957049, "learning_rate": 2.425666430979773e-08, "loss": 0.5706, "step": 11783 }, { "epoch": 0.957037277674003, "grad_norm": 9.450975853907856, "learning_rate": 2.416536369233391e-08, "loss": 0.477, "step": 11784 }, { "epoch": 0.9571184926500447, "grad_norm": 7.347938995622321, "learning_rate": 2.4074234389523665e-08, "loss": 0.502, "step": 11785 }, { "epoch": 0.9571997076260863, "grad_norm": 4.253055407962608, "learning_rate": 2.3983276407672784e-08, "loss": 0.4742, "step": 11786 }, { "epoch": 0.9572809226021278, "grad_norm": 3.0805970942274135, "learning_rate": 2.389248975307512e-08, "loss": 0.6164, "step": 11787 }, { "epoch": 0.9573621375781695, "grad_norm": 5.749655956883042, "learning_rate": 2.3801874432012594e-08, "loss": 0.5219, "step": 11788 }, { "epoch": 0.957443352554211, "grad_norm": 4.8340592635662345, "learning_rate": 2.371143045075519e-08, "loss": 0.5919, "step": 11789 }, { "epoch": 0.9575245675302526, "grad_norm": 3.8977820934124066, "learning_rate": 2.3621157815561237e-08, "loss": 0.6845, "step": 11790 }, { "epoch": 0.9576057825062941, "grad_norm": 9.678166602329553, "learning_rate": 2.3531056532677122e-08, "loss": 0.5546, "step": 11791 }, { "epoch": 0.9576869974823358, "grad_norm": 7.305637570872725, "learning_rate": 2.3441126608337304e-08, "loss": 0.455, "step": 11792 }, { "epoch": 0.9577682124583773, "grad_norm": 3.8733070193102384, "learning_rate": 2.335136804876459e-08, "loss": 0.5323, "step": 11793 }, { "epoch": 0.9578494274344189, "grad_norm": 3.9403704900752636, "learning_rate": 2.3261780860169558e-08, "loss": 0.4325, "step": 11794 }, { "epoch": 0.9579306424104604, "grad_norm": 5.6576542673173575, "learning_rate": 2.31723650487517e-08, "loss": 0.4517, "step": 11795 }, { "epoch": 0.9580118573865021, "grad_norm": 6.1828309207281755, "learning_rate": 2.3083120620697453e-08, "loss": 0.3581, "step": 11796 }, { "epoch": 0.9580930723625437, "grad_norm": 4.945073828835356, "learning_rate": 2.2994047582182433e-08, "loss": 0.3434, "step": 11797 }, { "epoch": 0.9581742873385852, "grad_norm": 6.8876285655205285, "learning_rate": 2.2905145939369765e-08, "loss": 0.4171, "step": 11798 }, { "epoch": 0.9582555023146269, "grad_norm": 5.814280241290536, "learning_rate": 2.2816415698411475e-08, "loss": 0.518, "step": 11799 }, { "epoch": 0.9583367172906684, "grad_norm": 6.165510069966149, "learning_rate": 2.272785686544682e-08, "loss": 0.4784, "step": 11800 }, { "epoch": 0.95841793226671, "grad_norm": 5.755892898880744, "learning_rate": 2.263946944660367e-08, "loss": 0.4269, "step": 11801 }, { "epoch": 0.9584991472427515, "grad_norm": 10.987936416240586, "learning_rate": 2.2551253447997968e-08, "loss": 0.3444, "step": 11802 }, { "epoch": 0.9585803622187932, "grad_norm": 15.074930100573221, "learning_rate": 2.2463208875733723e-08, "loss": 0.5635, "step": 11803 }, { "epoch": 0.9586615771948347, "grad_norm": 4.1773061107164935, "learning_rate": 2.237533573590328e-08, "loss": 0.5389, "step": 11804 }, { "epoch": 0.9587427921708763, "grad_norm": 4.63238528075123, "learning_rate": 2.228763403458706e-08, "loss": 0.6207, "step": 11805 }, { "epoch": 0.9588240071469178, "grad_norm": 5.9729583458173945, "learning_rate": 2.2200103777853255e-08, "loss": 0.5594, "step": 11806 }, { "epoch": 0.9589052221229595, "grad_norm": 9.831624315329341, "learning_rate": 2.211274497175897e-08, "loss": 0.58, "step": 11807 }, { "epoch": 0.9589864370990011, "grad_norm": 6.030561079423977, "learning_rate": 2.2025557622348537e-08, "loss": 0.5508, "step": 11808 }, { "epoch": 0.9590676520750426, "grad_norm": 5.732610023291474, "learning_rate": 2.1938541735655183e-08, "loss": 0.5854, "step": 11809 }, { "epoch": 0.9591488670510843, "grad_norm": 16.34752832829806, "learning_rate": 2.1851697317699373e-08, "loss": 0.3753, "step": 11810 }, { "epoch": 0.9592300820271258, "grad_norm": 8.516533326876681, "learning_rate": 2.1765024374491018e-08, "loss": 0.4773, "step": 11811 }, { "epoch": 0.9593112970031674, "grad_norm": 4.534008961064627, "learning_rate": 2.1678522912026988e-08, "loss": 0.4447, "step": 11812 }, { "epoch": 0.9593925119792089, "grad_norm": 5.048729203766718, "learning_rate": 2.1592192936292777e-08, "loss": 0.4867, "step": 11813 }, { "epoch": 0.9594737269552506, "grad_norm": 5.401071455659973, "learning_rate": 2.1506034453262214e-08, "loss": 0.4589, "step": 11814 }, { "epoch": 0.9595549419312921, "grad_norm": 4.550782933767591, "learning_rate": 2.142004746889692e-08, "loss": 0.4399, "step": 11815 }, { "epoch": 0.9596361569073337, "grad_norm": 7.657034015101817, "learning_rate": 2.1334231989146304e-08, "loss": 0.5894, "step": 11816 }, { "epoch": 0.9597173718833752, "grad_norm": 4.68873295706115, "learning_rate": 2.124858801994867e-08, "loss": 0.5836, "step": 11817 }, { "epoch": 0.9597985868594169, "grad_norm": 5.932018957670042, "learning_rate": 2.1163115567230386e-08, "loss": 0.6573, "step": 11818 }, { "epoch": 0.9598798018354585, "grad_norm": 6.610287249201689, "learning_rate": 2.1077814636905337e-08, "loss": 0.5133, "step": 11819 }, { "epoch": 0.9599610168115, "grad_norm": 4.576678178324454, "learning_rate": 2.099268523487602e-08, "loss": 0.406, "step": 11820 }, { "epoch": 0.9600422317875417, "grad_norm": 5.480905722183299, "learning_rate": 2.0907727367033005e-08, "loss": 0.5465, "step": 11821 }, { "epoch": 0.9601234467635832, "grad_norm": 10.7296131406781, "learning_rate": 2.0822941039254642e-08, "loss": 0.5529, "step": 11822 }, { "epoch": 0.9602046617396248, "grad_norm": 4.499843737247379, "learning_rate": 2.0738326257407904e-08, "loss": 0.4012, "step": 11823 }, { "epoch": 0.9602858767156663, "grad_norm": 5.585081499968746, "learning_rate": 2.0653883027347832e-08, "loss": 0.4078, "step": 11824 }, { "epoch": 0.960367091691708, "grad_norm": 5.781778186861071, "learning_rate": 2.056961135491725e-08, "loss": 0.4095, "step": 11825 }, { "epoch": 0.9604483066677495, "grad_norm": 5.2006226427597415, "learning_rate": 2.048551124594733e-08, "loss": 0.5062, "step": 11826 }, { "epoch": 0.9605295216437911, "grad_norm": 9.000238161766775, "learning_rate": 2.0401582706257304e-08, "loss": 0.4405, "step": 11827 }, { "epoch": 0.9606107366198327, "grad_norm": 5.406010740569412, "learning_rate": 2.031782574165475e-08, "loss": 0.5036, "step": 11828 }, { "epoch": 0.9606919515958743, "grad_norm": 5.004999912991991, "learning_rate": 2.0234240357935032e-08, "loss": 0.4662, "step": 11829 }, { "epoch": 0.9607731665719159, "grad_norm": 4.837946844168023, "learning_rate": 2.015082656088213e-08, "loss": 0.496, "step": 11830 }, { "epoch": 0.9608543815479574, "grad_norm": 6.627370947037891, "learning_rate": 2.0067584356267545e-08, "loss": 0.375, "step": 11831 }, { "epoch": 0.9609355965239991, "grad_norm": 10.674671755581318, "learning_rate": 1.998451374985111e-08, "loss": 0.5067, "step": 11832 }, { "epoch": 0.9610168115000406, "grad_norm": 17.13229424192217, "learning_rate": 1.9901614747381004e-08, "loss": 0.3741, "step": 11833 }, { "epoch": 0.9610980264760822, "grad_norm": 6.635134277566872, "learning_rate": 1.981888735459375e-08, "loss": 0.5692, "step": 11834 }, { "epoch": 0.9611792414521237, "grad_norm": 3.8371011025366055, "learning_rate": 1.973633157721283e-08, "loss": 0.5985, "step": 11835 }, { "epoch": 0.9612604564281654, "grad_norm": 5.13228830947514, "learning_rate": 1.9653947420951448e-08, "loss": 0.492, "step": 11836 }, { "epoch": 0.9613416714042069, "grad_norm": 6.733892581752105, "learning_rate": 1.9571734891509763e-08, "loss": 0.4515, "step": 11837 }, { "epoch": 0.9614228863802485, "grad_norm": 5.22909116303276, "learning_rate": 1.9489693994576563e-08, "loss": 0.6163, "step": 11838 }, { "epoch": 0.96150410135629, "grad_norm": 4.302408834520267, "learning_rate": 1.9407824735828696e-08, "loss": 0.4538, "step": 11839 }, { "epoch": 0.9615853163323317, "grad_norm": 5.043172467524965, "learning_rate": 1.932612712093107e-08, "loss": 0.4024, "step": 11840 }, { "epoch": 0.9616665313083733, "grad_norm": 8.816715558441784, "learning_rate": 1.9244601155536392e-08, "loss": 0.4981, "step": 11841 }, { "epoch": 0.9617477462844148, "grad_norm": 4.8079654103051475, "learning_rate": 1.9163246845286253e-08, "loss": 0.4698, "step": 11842 }, { "epoch": 0.9618289612604565, "grad_norm": 6.163595049432749, "learning_rate": 1.908206419580977e-08, "loss": 0.5798, "step": 11843 }, { "epoch": 0.961910176236498, "grad_norm": 4.866986542895558, "learning_rate": 1.9001053212724387e-08, "loss": 0.5856, "step": 11844 }, { "epoch": 0.9619913912125396, "grad_norm": 5.782563129780745, "learning_rate": 1.892021390163562e-08, "loss": 0.4907, "step": 11845 }, { "epoch": 0.9620726061885811, "grad_norm": 5.712857730829221, "learning_rate": 1.8839546268137054e-08, "loss": 0.4423, "step": 11846 }, { "epoch": 0.9621538211646228, "grad_norm": 4.414806524066914, "learning_rate": 1.8759050317810612e-08, "loss": 0.4499, "step": 11847 }, { "epoch": 0.9622350361406643, "grad_norm": 4.949387208955272, "learning_rate": 1.8678726056226004e-08, "loss": 0.4297, "step": 11848 }, { "epoch": 0.9623162511167059, "grad_norm": 4.396280286867585, "learning_rate": 1.8598573488941285e-08, "loss": 0.5549, "step": 11849 }, { "epoch": 0.9623974660927475, "grad_norm": 6.050149072377, "learning_rate": 1.8518592621502852e-08, "loss": 0.5692, "step": 11850 }, { "epoch": 0.9624786810687891, "grad_norm": 5.0620158081409246, "learning_rate": 1.8438783459444608e-08, "loss": 0.5378, "step": 11851 }, { "epoch": 0.9625598960448307, "grad_norm": 3.929260770758667, "learning_rate": 1.8359146008289087e-08, "loss": 0.5249, "step": 11852 }, { "epoch": 0.9626411110208722, "grad_norm": 6.751642731964542, "learning_rate": 1.8279680273546874e-08, "loss": 0.3583, "step": 11853 }, { "epoch": 0.9627223259969139, "grad_norm": 7.566915713105148, "learning_rate": 1.8200386260716352e-08, "loss": 0.5411, "step": 11854 }, { "epoch": 0.9628035409729554, "grad_norm": 4.154405022428572, "learning_rate": 1.812126397528452e-08, "loss": 0.5512, "step": 11855 }, { "epoch": 0.962884755948997, "grad_norm": 7.42761465533202, "learning_rate": 1.804231342272589e-08, "loss": 0.5403, "step": 11856 }, { "epoch": 0.9629659709250386, "grad_norm": 7.694593764622218, "learning_rate": 1.796353460850331e-08, "loss": 0.6331, "step": 11857 }, { "epoch": 0.9630471859010802, "grad_norm": 16.086802457999514, "learning_rate": 1.7884927538068532e-08, "loss": 0.4567, "step": 11858 }, { "epoch": 0.9631284008771217, "grad_norm": 6.812124405495696, "learning_rate": 1.7806492216860537e-08, "loss": 0.5277, "step": 11859 }, { "epoch": 0.9632096158531633, "grad_norm": 6.335913061374134, "learning_rate": 1.77282286503061e-08, "loss": 0.4104, "step": 11860 }, { "epoch": 0.9632908308292049, "grad_norm": 5.290742984147669, "learning_rate": 1.7650136843821163e-08, "loss": 0.5395, "step": 11861 }, { "epoch": 0.9633720458052465, "grad_norm": 7.397405822219676, "learning_rate": 1.7572216802808907e-08, "loss": 0.5077, "step": 11862 }, { "epoch": 0.9634532607812881, "grad_norm": 5.393822284388817, "learning_rate": 1.74944685326614e-08, "loss": 0.447, "step": 11863 }, { "epoch": 0.9635344757573296, "grad_norm": 4.001728592267749, "learning_rate": 1.741689203875796e-08, "loss": 0.5923, "step": 11864 }, { "epoch": 0.9636156907333713, "grad_norm": 6.112171929702209, "learning_rate": 1.7339487326466787e-08, "loss": 0.5556, "step": 11865 }, { "epoch": 0.9636969057094128, "grad_norm": 4.7616632757082575, "learning_rate": 1.7262254401143873e-08, "loss": 0.5483, "step": 11866 }, { "epoch": 0.9637781206854544, "grad_norm": 5.833975112122491, "learning_rate": 1.7185193268133282e-08, "loss": 0.4426, "step": 11867 }, { "epoch": 0.963859335661496, "grad_norm": 3.698744760296142, "learning_rate": 1.7108303932767135e-08, "loss": 0.4634, "step": 11868 }, { "epoch": 0.9639405506375376, "grad_norm": 6.198330496132483, "learning_rate": 1.7031586400365895e-08, "loss": 0.4089, "step": 11869 }, { "epoch": 0.9640217656135791, "grad_norm": 4.92658425213048, "learning_rate": 1.695504067623782e-08, "loss": 0.571, "step": 11870 }, { "epoch": 0.9641029805896207, "grad_norm": 4.570520680219551, "learning_rate": 1.6878666765679507e-08, "loss": 0.4378, "step": 11871 }, { "epoch": 0.9641841955656623, "grad_norm": 4.130327563075009, "learning_rate": 1.6802464673975893e-08, "loss": 0.4161, "step": 11872 }, { "epoch": 0.9642654105417039, "grad_norm": 5.2911892010001775, "learning_rate": 1.6726434406399704e-08, "loss": 0.4437, "step": 11873 }, { "epoch": 0.9643466255177455, "grad_norm": 7.92651230061654, "learning_rate": 1.6650575968211458e-08, "loss": 0.3474, "step": 11874 }, { "epoch": 0.964427840493787, "grad_norm": 4.955382254039267, "learning_rate": 1.6574889364660564e-08, "loss": 0.4664, "step": 11875 }, { "epoch": 0.9645090554698287, "grad_norm": 5.462298377112138, "learning_rate": 1.6499374600983943e-08, "loss": 0.4326, "step": 11876 }, { "epoch": 0.9645902704458702, "grad_norm": 3.395717603402873, "learning_rate": 1.642403168240686e-08, "loss": 0.5458, "step": 11877 }, { "epoch": 0.9646714854219118, "grad_norm": 6.599864821668275, "learning_rate": 1.6348860614142646e-08, "loss": 0.5629, "step": 11878 }, { "epoch": 0.9647527003979534, "grad_norm": 3.790645168309616, "learning_rate": 1.62738614013927e-08, "loss": 0.511, "step": 11879 }, { "epoch": 0.964833915373995, "grad_norm": 2.6841332503368918, "learning_rate": 1.6199034049346474e-08, "loss": 0.3197, "step": 11880 }, { "epoch": 0.9649151303500365, "grad_norm": 8.916212568990264, "learning_rate": 1.6124378563182053e-08, "loss": 0.4072, "step": 11881 }, { "epoch": 0.9649963453260781, "grad_norm": 6.5493673229576945, "learning_rate": 1.6049894948064748e-08, "loss": 0.5502, "step": 11882 }, { "epoch": 0.9650775603021197, "grad_norm": 3.681237380060799, "learning_rate": 1.597558320914849e-08, "loss": 0.5055, "step": 11883 }, { "epoch": 0.9651587752781613, "grad_norm": 5.868912226870429, "learning_rate": 1.5901443351575563e-08, "loss": 0.4587, "step": 11884 }, { "epoch": 0.9652399902542029, "grad_norm": 3.216888608216173, "learning_rate": 1.5827475380475744e-08, "loss": 0.4016, "step": 11885 }, { "epoch": 0.9653212052302445, "grad_norm": 6.658602355248441, "learning_rate": 1.575367930096716e-08, "loss": 0.6319, "step": 11886 }, { "epoch": 0.9654024202062861, "grad_norm": 7.62706341624403, "learning_rate": 1.5680055118156566e-08, "loss": 0.4729, "step": 11887 }, { "epoch": 0.9654836351823276, "grad_norm": 5.573341730209802, "learning_rate": 1.5606602837137942e-08, "loss": 0.3682, "step": 11888 }, { "epoch": 0.9655648501583692, "grad_norm": 9.836559250337336, "learning_rate": 1.5533322462993884e-08, "loss": 0.3449, "step": 11889 }, { "epoch": 0.9656460651344108, "grad_norm": 5.722705721907483, "learning_rate": 1.546021400079506e-08, "loss": 0.5415, "step": 11890 }, { "epoch": 0.9657272801104524, "grad_norm": 13.038255642017731, "learning_rate": 1.538727745560048e-08, "loss": 0.5975, "step": 11891 }, { "epoch": 0.9658084950864939, "grad_norm": 4.658075474008751, "learning_rate": 1.5314512832456385e-08, "loss": 0.3905, "step": 11892 }, { "epoch": 0.9658897100625355, "grad_norm": 3.609827209694475, "learning_rate": 1.5241920136397913e-08, "loss": 0.6306, "step": 11893 }, { "epoch": 0.9659709250385771, "grad_norm": 4.924247089415893, "learning_rate": 1.516949937244827e-08, "loss": 0.6758, "step": 11894 }, { "epoch": 0.9660521400146187, "grad_norm": 7.844480475751513, "learning_rate": 1.5097250545618447e-08, "loss": 0.5907, "step": 11895 }, { "epoch": 0.9661333549906603, "grad_norm": 7.5845268596408655, "learning_rate": 1.5025173660907776e-08, "loss": 0.3675, "step": 11896 }, { "epoch": 0.9662145699667019, "grad_norm": 6.1316891281247425, "learning_rate": 1.495326872330366e-08, "loss": 0.5117, "step": 11897 }, { "epoch": 0.9662957849427435, "grad_norm": 13.323521108000474, "learning_rate": 1.4881535737781282e-08, "loss": 0.2842, "step": 11898 }, { "epoch": 0.966376999918785, "grad_norm": 6.657166105699875, "learning_rate": 1.4809974709304176e-08, "loss": 0.4965, "step": 11899 }, { "epoch": 0.9664582148948266, "grad_norm": 4.572946832553962, "learning_rate": 1.4738585642824488e-08, "loss": 0.5393, "step": 11900 }, { "epoch": 0.9665394298708682, "grad_norm": 5.179202728578802, "learning_rate": 1.4667368543281324e-08, "loss": 0.3911, "step": 11901 }, { "epoch": 0.9666206448469098, "grad_norm": 3.397470952294324, "learning_rate": 1.4596323415602965e-08, "loss": 0.5457, "step": 11902 }, { "epoch": 0.9667018598229513, "grad_norm": 5.014658354565728, "learning_rate": 1.4525450264705198e-08, "loss": 0.495, "step": 11903 }, { "epoch": 0.966783074798993, "grad_norm": 4.876907477649273, "learning_rate": 1.4454749095491883e-08, "loss": 0.386, "step": 11904 }, { "epoch": 0.9668642897750345, "grad_norm": 6.641652702622383, "learning_rate": 1.438421991285549e-08, "loss": 0.5017, "step": 11905 }, { "epoch": 0.9669455047510761, "grad_norm": 6.852642511727459, "learning_rate": 1.4313862721676285e-08, "loss": 0.5132, "step": 11906 }, { "epoch": 0.9670267197271177, "grad_norm": 4.757471303632329, "learning_rate": 1.4243677526822319e-08, "loss": 0.5419, "step": 11907 }, { "epoch": 0.9671079347031593, "grad_norm": 7.349401986982847, "learning_rate": 1.4173664333149983e-08, "loss": 0.4691, "step": 11908 }, { "epoch": 0.9671891496792009, "grad_norm": 4.683264212562743, "learning_rate": 1.4103823145504292e-08, "loss": 0.396, "step": 11909 }, { "epoch": 0.9672703646552424, "grad_norm": 8.004000185912112, "learning_rate": 1.4034153968717768e-08, "loss": 0.5567, "step": 11910 }, { "epoch": 0.967351579631284, "grad_norm": 9.131569052331146, "learning_rate": 1.3964656807610721e-08, "loss": 0.4341, "step": 11911 }, { "epoch": 0.9674327946073256, "grad_norm": 4.584784368488165, "learning_rate": 1.3895331666992361e-08, "loss": 0.5043, "step": 11912 }, { "epoch": 0.9675140095833672, "grad_norm": 5.0665939824517165, "learning_rate": 1.3826178551659686e-08, "loss": 0.4195, "step": 11913 }, { "epoch": 0.9675952245594087, "grad_norm": 4.105167424178751, "learning_rate": 1.37571974663972e-08, "loss": 0.4746, "step": 11914 }, { "epoch": 0.9676764395354504, "grad_norm": 7.389787805986678, "learning_rate": 1.3688388415978581e-08, "loss": 0.5362, "step": 11915 }, { "epoch": 0.9677576545114919, "grad_norm": 5.680461317779846, "learning_rate": 1.361975140516475e-08, "loss": 0.5635, "step": 11916 }, { "epoch": 0.9678388694875335, "grad_norm": 3.8294537161607476, "learning_rate": 1.3551286438705513e-08, "loss": 0.517, "step": 11917 }, { "epoch": 0.9679200844635751, "grad_norm": 5.093597645059693, "learning_rate": 1.3482993521337362e-08, "loss": 0.4311, "step": 11918 }, { "epoch": 0.9680012994396167, "grad_norm": 4.15007656176377, "learning_rate": 1.3414872657786793e-08, "loss": 0.5585, "step": 11919 }, { "epoch": 0.9680825144156583, "grad_norm": 5.618091315100742, "learning_rate": 1.3346923852766702e-08, "loss": 0.4277, "step": 11920 }, { "epoch": 0.9681637293916998, "grad_norm": 3.77740666889434, "learning_rate": 1.3279147110979163e-08, "loss": 0.4445, "step": 11921 }, { "epoch": 0.9682449443677414, "grad_norm": 14.059774837034583, "learning_rate": 1.3211542437113755e-08, "loss": 0.4987, "step": 11922 }, { "epoch": 0.968326159343783, "grad_norm": 5.451347013670637, "learning_rate": 1.3144109835848685e-08, "loss": 0.4612, "step": 11923 }, { "epoch": 0.9684073743198246, "grad_norm": 9.206132420636997, "learning_rate": 1.3076849311849382e-08, "loss": 0.5038, "step": 11924 }, { "epoch": 0.9684885892958661, "grad_norm": 11.483754762302455, "learning_rate": 1.300976086977046e-08, "loss": 0.4655, "step": 11925 }, { "epoch": 0.9685698042719078, "grad_norm": 6.743288245848484, "learning_rate": 1.2942844514254038e-08, "loss": 0.5264, "step": 11926 }, { "epoch": 0.9686510192479493, "grad_norm": 6.053479065590689, "learning_rate": 1.2876100249930024e-08, "loss": 0.4499, "step": 11927 }, { "epoch": 0.9687322342239909, "grad_norm": 3.6207902961878853, "learning_rate": 1.2809528081416667e-08, "loss": 0.536, "step": 11928 }, { "epoch": 0.9688134492000325, "grad_norm": 6.866156397584753, "learning_rate": 1.2743128013321115e-08, "loss": 0.3062, "step": 11929 }, { "epoch": 0.9688946641760741, "grad_norm": 5.924869562979613, "learning_rate": 1.2676900050237472e-08, "loss": 0.4935, "step": 11930 }, { "epoch": 0.9689758791521157, "grad_norm": 5.999357098508331, "learning_rate": 1.2610844196748184e-08, "loss": 0.3896, "step": 11931 }, { "epoch": 0.9690570941281572, "grad_norm": 24.091678245998175, "learning_rate": 1.2544960457424316e-08, "loss": 0.4804, "step": 11932 }, { "epoch": 0.9691383091041988, "grad_norm": 4.839750186365689, "learning_rate": 1.2479248836824165e-08, "loss": 0.4291, "step": 11933 }, { "epoch": 0.9692195240802404, "grad_norm": 5.821546388018869, "learning_rate": 1.2413709339495205e-08, "loss": 0.3908, "step": 11934 }, { "epoch": 0.969300739056282, "grad_norm": 4.807004678084896, "learning_rate": 1.2348341969972143e-08, "loss": 0.461, "step": 11935 }, { "epoch": 0.9693819540323235, "grad_norm": 11.290614300170317, "learning_rate": 1.2283146732778306e-08, "loss": 0.498, "step": 11936 }, { "epoch": 0.9694631690083652, "grad_norm": 4.900177210666115, "learning_rate": 1.2218123632424527e-08, "loss": 0.5445, "step": 11937 }, { "epoch": 0.9695443839844067, "grad_norm": 4.9843232783971665, "learning_rate": 1.2153272673409989e-08, "loss": 0.4344, "step": 11938 }, { "epoch": 0.9696255989604483, "grad_norm": 9.329861274666198, "learning_rate": 1.2088593860222487e-08, "loss": 0.523, "step": 11939 }, { "epoch": 0.9697068139364899, "grad_norm": 5.949707334731747, "learning_rate": 1.2024087197337053e-08, "loss": 0.4087, "step": 11940 }, { "epoch": 0.9697880289125315, "grad_norm": 3.2312999306605934, "learning_rate": 1.1959752689217342e-08, "loss": 0.4933, "step": 11941 }, { "epoch": 0.9698692438885731, "grad_norm": 4.579568524123334, "learning_rate": 1.1895590340315343e-08, "loss": 0.5973, "step": 11942 }, { "epoch": 0.9699504588646146, "grad_norm": 4.04721463907932, "learning_rate": 1.183160015507001e-08, "loss": 0.5191, "step": 11943 }, { "epoch": 0.9700316738406563, "grad_norm": 3.4507628169869564, "learning_rate": 1.1767782137909467e-08, "loss": 0.4671, "step": 11944 }, { "epoch": 0.9701128888166978, "grad_norm": 4.199682308907727, "learning_rate": 1.17041362932499e-08, "loss": 0.5631, "step": 11945 }, { "epoch": 0.9701941037927394, "grad_norm": 4.406904013569409, "learning_rate": 1.1640662625494737e-08, "loss": 0.5799, "step": 11946 }, { "epoch": 0.9702753187687809, "grad_norm": 6.977075217955481, "learning_rate": 1.1577361139036292e-08, "loss": 0.5053, "step": 11947 }, { "epoch": 0.9703565337448226, "grad_norm": 5.4502036515579295, "learning_rate": 1.1514231838254674e-08, "loss": 0.4786, "step": 11948 }, { "epoch": 0.9704377487208641, "grad_norm": 11.48385762088185, "learning_rate": 1.1451274727518058e-08, "loss": 0.4975, "step": 11949 }, { "epoch": 0.9705189636969057, "grad_norm": 4.167108685353742, "learning_rate": 1.1388489811182957e-08, "loss": 0.5192, "step": 11950 }, { "epoch": 0.9706001786729473, "grad_norm": 5.2318458897159115, "learning_rate": 1.1325877093593396e-08, "loss": 0.5431, "step": 11951 }, { "epoch": 0.9706813936489889, "grad_norm": 6.6175704623771505, "learning_rate": 1.1263436579082022e-08, "loss": 0.6257, "step": 11952 }, { "epoch": 0.9707626086250305, "grad_norm": 3.4493770443599465, "learning_rate": 1.1201168271969266e-08, "loss": 0.5008, "step": 11953 }, { "epoch": 0.970843823601072, "grad_norm": 9.823049761761046, "learning_rate": 1.1139072176564181e-08, "loss": 0.6227, "step": 11954 }, { "epoch": 0.9709250385771137, "grad_norm": 12.037056297654933, "learning_rate": 1.1077148297163053e-08, "loss": 0.4436, "step": 11955 }, { "epoch": 0.9710062535531552, "grad_norm": 47.76636023695995, "learning_rate": 1.101539663805079e-08, "loss": 0.44, "step": 11956 }, { "epoch": 0.9710874685291968, "grad_norm": 5.334949326289944, "learning_rate": 1.0953817203500084e-08, "loss": 0.4165, "step": 11957 }, { "epoch": 0.9711686835052383, "grad_norm": 9.47687334833281, "learning_rate": 1.0892409997772524e-08, "loss": 0.4979, "step": 11958 }, { "epoch": 0.97124989848128, "grad_norm": 5.061927562596902, "learning_rate": 1.0831175025116658e-08, "loss": 0.4791, "step": 11959 }, { "epoch": 0.9713311134573215, "grad_norm": 8.5596022285608, "learning_rate": 1.0770112289769653e-08, "loss": 0.5829, "step": 11960 }, { "epoch": 0.9714123284333631, "grad_norm": 4.508772275279344, "learning_rate": 1.0709221795956738e-08, "loss": 0.5874, "step": 11961 }, { "epoch": 0.9714935434094047, "grad_norm": 4.701424170250187, "learning_rate": 1.0648503547891487e-08, "loss": 0.4767, "step": 11962 }, { "epoch": 0.9715747583854463, "grad_norm": 3.965969498170774, "learning_rate": 1.0587957549774986e-08, "loss": 0.5583, "step": 11963 }, { "epoch": 0.9716559733614879, "grad_norm": 7.964781868944689, "learning_rate": 1.052758380579666e-08, "loss": 0.5368, "step": 11964 }, { "epoch": 0.9717371883375294, "grad_norm": 5.249845918760695, "learning_rate": 1.0467382320134279e-08, "loss": 0.4113, "step": 11965 }, { "epoch": 0.971818403313571, "grad_norm": 5.825530015816844, "learning_rate": 1.0407353096953398e-08, "loss": 0.3504, "step": 11966 }, { "epoch": 0.9718996182896126, "grad_norm": 9.06281122515128, "learning_rate": 1.034749614040792e-08, "loss": 0.3699, "step": 11967 }, { "epoch": 0.9719808332656542, "grad_norm": 7.498622251328044, "learning_rate": 1.0287811454639252e-08, "loss": 0.4268, "step": 11968 }, { "epoch": 0.9720620482416957, "grad_norm": 6.263774598272872, "learning_rate": 1.0228299043777146e-08, "loss": 0.4959, "step": 11969 }, { "epoch": 0.9721432632177374, "grad_norm": 5.061985215032839, "learning_rate": 1.0168958911939975e-08, "loss": 0.5099, "step": 11970 }, { "epoch": 0.9722244781937789, "grad_norm": 8.537598436187121, "learning_rate": 1.0109791063233898e-08, "loss": 0.3745, "step": 11971 }, { "epoch": 0.9723056931698205, "grad_norm": 3.710316240876283, "learning_rate": 1.0050795501752309e-08, "loss": 0.462, "step": 11972 }, { "epoch": 0.9723869081458621, "grad_norm": 4.819345845034754, "learning_rate": 9.991972231577774e-09, "loss": 0.568, "step": 11973 }, { "epoch": 0.9724681231219037, "grad_norm": 7.126395810841205, "learning_rate": 9.933321256780925e-09, "loss": 0.4745, "step": 11974 }, { "epoch": 0.9725493380979453, "grad_norm": 4.57445118125883, "learning_rate": 9.874842581419631e-09, "loss": 0.3644, "step": 11975 }, { "epoch": 0.9726305530739868, "grad_norm": 8.751285985895777, "learning_rate": 9.816536209540373e-09, "loss": 0.3933, "step": 11976 }, { "epoch": 0.9727117680500285, "grad_norm": 6.666232456269325, "learning_rate": 9.758402145177703e-09, "loss": 0.499, "step": 11977 }, { "epoch": 0.97279298302607, "grad_norm": 7.576271556752231, "learning_rate": 9.70044039235396e-09, "loss": 0.4075, "step": 11978 }, { "epoch": 0.9728741980021116, "grad_norm": 4.487672059560348, "learning_rate": 9.642650955080379e-09, "loss": 0.3907, "step": 11979 }, { "epoch": 0.9729554129781531, "grad_norm": 3.6526471218089864, "learning_rate": 9.585033837355151e-09, "loss": 0.5327, "step": 11980 }, { "epoch": 0.9730366279541948, "grad_norm": 4.83213034119808, "learning_rate": 9.527589043165086e-09, "loss": 0.5155, "step": 11981 }, { "epoch": 0.9731178429302363, "grad_norm": 4.370525558131413, "learning_rate": 9.470316576485616e-09, "loss": 0.4436, "step": 11982 }, { "epoch": 0.9731990579062779, "grad_norm": 6.8962180265204065, "learning_rate": 9.41321644127885e-09, "loss": 0.4835, "step": 11983 }, { "epoch": 0.9732802728823196, "grad_norm": 5.928084396311675, "learning_rate": 9.356288641496624e-09, "loss": 0.3822, "step": 11984 }, { "epoch": 0.9733614878583611, "grad_norm": 5.61471404924446, "learning_rate": 9.299533181077458e-09, "loss": 0.36, "step": 11985 }, { "epoch": 0.9734427028344027, "grad_norm": 4.928900628567097, "learning_rate": 9.242950063948763e-09, "loss": 0.4844, "step": 11986 }, { "epoch": 0.9735239178104442, "grad_norm": 6.227465912824045, "learning_rate": 9.18653929402602e-09, "loss": 0.6315, "step": 11987 }, { "epoch": 0.9736051327864859, "grad_norm": 5.256061365992016, "learning_rate": 9.13030087521194e-09, "loss": 0.5286, "step": 11988 }, { "epoch": 0.9736863477625274, "grad_norm": 5.4504099987714465, "learning_rate": 9.074234811398408e-09, "loss": 0.4854, "step": 11989 }, { "epoch": 0.973767562738569, "grad_norm": 6.539725065060604, "learning_rate": 9.018341106464823e-09, "loss": 0.4206, "step": 11990 }, { "epoch": 0.9738487777146105, "grad_norm": 4.947085140988941, "learning_rate": 8.962619764278923e-09, "loss": 0.4808, "step": 11991 }, { "epoch": 0.9739299926906522, "grad_norm": 6.855737833780214, "learning_rate": 8.907070788695681e-09, "loss": 0.3113, "step": 11992 }, { "epoch": 0.9740112076666937, "grad_norm": 5.986387589137889, "learning_rate": 8.851694183559523e-09, "loss": 0.3221, "step": 11993 }, { "epoch": 0.9740924226427353, "grad_norm": 6.396980239435106, "learning_rate": 8.796489952701825e-09, "loss": 0.5371, "step": 11994 }, { "epoch": 0.974173637618777, "grad_norm": 4.469906653866564, "learning_rate": 8.741458099942313e-09, "loss": 0.5594, "step": 11995 }, { "epoch": 0.9742548525948185, "grad_norm": 6.102062048541683, "learning_rate": 8.686598629089326e-09, "loss": 0.382, "step": 11996 }, { "epoch": 0.9743360675708601, "grad_norm": 4.275407491240544, "learning_rate": 8.63191154393872e-09, "loss": 0.4869, "step": 11997 }, { "epoch": 0.9744172825469016, "grad_norm": 6.27848767223886, "learning_rate": 8.577396848274134e-09, "loss": 0.4735, "step": 11998 }, { "epoch": 0.9744984975229433, "grad_norm": 6.216106726624828, "learning_rate": 8.523054545868381e-09, "loss": 0.5145, "step": 11999 }, { "epoch": 0.9745797124989848, "grad_norm": 6.83477643803915, "learning_rate": 8.468884640480956e-09, "loss": 0.4235, "step": 12000 }, { "epoch": 0.9746609274750264, "grad_norm": 12.60083591448807, "learning_rate": 8.414887135860528e-09, "loss": 0.3258, "step": 12001 }, { "epoch": 0.9747421424510679, "grad_norm": 4.645778183923307, "learning_rate": 8.36106203574355e-09, "loss": 0.5243, "step": 12002 }, { "epoch": 0.9748233574271096, "grad_norm": 6.124524623697457, "learning_rate": 8.307409343854267e-09, "loss": 0.599, "step": 12003 }, { "epoch": 0.9749045724031511, "grad_norm": 21.473022616832324, "learning_rate": 8.253929063904986e-09, "loss": 0.382, "step": 12004 }, { "epoch": 0.9749857873791927, "grad_norm": 4.661852786954021, "learning_rate": 8.200621199596359e-09, "loss": 0.6825, "step": 12005 }, { "epoch": 0.9750670023552344, "grad_norm": 3.538632513896874, "learning_rate": 8.147485754617379e-09, "loss": 0.4224, "step": 12006 }, { "epoch": 0.9751482173312759, "grad_norm": 4.889222428461826, "learning_rate": 8.094522732644272e-09, "loss": 0.5328, "step": 12007 }, { "epoch": 0.9752294323073175, "grad_norm": 8.835696874530043, "learning_rate": 8.041732137341885e-09, "loss": 0.3872, "step": 12008 }, { "epoch": 0.975310647283359, "grad_norm": 5.49476829680145, "learning_rate": 7.989113972363406e-09, "loss": 0.6145, "step": 12009 }, { "epoch": 0.9753918622594007, "grad_norm": 5.131999174481017, "learning_rate": 7.936668241349255e-09, "loss": 0.5043, "step": 12010 }, { "epoch": 0.9754730772354422, "grad_norm": 4.572903995548805, "learning_rate": 7.884394947928476e-09, "loss": 0.5749, "step": 12011 }, { "epoch": 0.9755542922114838, "grad_norm": 4.3477540547148825, "learning_rate": 7.832294095718452e-09, "loss": 0.4026, "step": 12012 }, { "epoch": 0.9756355071875253, "grad_norm": 7.454961091377597, "learning_rate": 7.780365688323798e-09, "loss": 0.3078, "step": 12013 }, { "epoch": 0.975716722163567, "grad_norm": 4.170795190072248, "learning_rate": 7.72860972933831e-09, "loss": 0.5571, "step": 12014 }, { "epoch": 0.9757979371396085, "grad_norm": 7.707305765402371, "learning_rate": 7.677026222342454e-09, "loss": 0.3938, "step": 12015 }, { "epoch": 0.9758791521156501, "grad_norm": 4.686956759781069, "learning_rate": 7.625615170906153e-09, "loss": 0.6371, "step": 12016 }, { "epoch": 0.9759603670916918, "grad_norm": 6.526544319588936, "learning_rate": 7.57437657858684e-09, "loss": 0.4409, "step": 12017 }, { "epoch": 0.9760415820677333, "grad_norm": 5.749185856694462, "learning_rate": 7.523310448929178e-09, "loss": 0.2862, "step": 12018 }, { "epoch": 0.9761227970437749, "grad_norm": 6.413191599955744, "learning_rate": 7.472416785467563e-09, "loss": 0.5205, "step": 12019 }, { "epoch": 0.9762040120198164, "grad_norm": 4.702110106071757, "learning_rate": 7.421695591723066e-09, "loss": 0.4963, "step": 12020 }, { "epoch": 0.9762852269958581, "grad_norm": 4.552204118507779, "learning_rate": 7.371146871205381e-09, "loss": 0.5284, "step": 12021 }, { "epoch": 0.9763664419718996, "grad_norm": 5.639485771969086, "learning_rate": 7.320770627412543e-09, "loss": 0.5295, "step": 12022 }, { "epoch": 0.9764476569479412, "grad_norm": 6.8637890745131145, "learning_rate": 7.27056686382982e-09, "loss": 0.4764, "step": 12023 }, { "epoch": 0.9765288719239827, "grad_norm": 5.248727371974205, "learning_rate": 7.220535583931099e-09, "loss": 0.4443, "step": 12024 }, { "epoch": 0.9766100869000244, "grad_norm": 5.140443422767862, "learning_rate": 7.17067679117861e-09, "loss": 0.5761, "step": 12025 }, { "epoch": 0.9766913018760659, "grad_norm": 4.603187702423827, "learning_rate": 7.120990489022373e-09, "loss": 0.4652, "step": 12026 }, { "epoch": 0.9767725168521075, "grad_norm": 5.4771501562832245, "learning_rate": 7.071476680900191e-09, "loss": 0.3577, "step": 12027 }, { "epoch": 0.9768537318281492, "grad_norm": 9.457901018585092, "learning_rate": 7.022135370237937e-09, "loss": 0.5396, "step": 12028 }, { "epoch": 0.9769349468041907, "grad_norm": 6.7496814308010995, "learning_rate": 6.972966560450101e-09, "loss": 0.4375, "step": 12029 }, { "epoch": 0.9770161617802323, "grad_norm": 9.618733980000252, "learning_rate": 6.923970254938961e-09, "loss": 0.463, "step": 12030 }, { "epoch": 0.9770973767562738, "grad_norm": 3.6437791716015155, "learning_rate": 6.875146457094583e-09, "loss": 0.466, "step": 12031 }, { "epoch": 0.9771785917323155, "grad_norm": 4.087276926389463, "learning_rate": 6.8264951702951e-09, "loss": 0.3838, "step": 12032 }, { "epoch": 0.977259806708357, "grad_norm": 7.648508375870743, "learning_rate": 6.778016397907539e-09, "loss": 0.3517, "step": 12033 }, { "epoch": 0.9773410216843986, "grad_norm": 6.478817726316786, "learning_rate": 6.729710143286161e-09, "loss": 0.3296, "step": 12034 }, { "epoch": 0.9774222366604401, "grad_norm": 4.785457296804816, "learning_rate": 6.681576409773016e-09, "loss": 0.5188, "step": 12035 }, { "epoch": 0.9775034516364818, "grad_norm": 7.296274375209841, "learning_rate": 6.633615200699328e-09, "loss": 0.6136, "step": 12036 }, { "epoch": 0.9775846666125233, "grad_norm": 4.160049221454699, "learning_rate": 6.5858265193835536e-09, "loss": 0.514, "step": 12037 }, { "epoch": 0.9776658815885649, "grad_norm": 5.830756185483621, "learning_rate": 6.538210369132214e-09, "loss": 0.4866, "step": 12038 }, { "epoch": 0.9777470965646066, "grad_norm": 5.407452911485265, "learning_rate": 6.490766753240174e-09, "loss": 0.6008, "step": 12039 }, { "epoch": 0.9778283115406481, "grad_norm": 5.873928827822523, "learning_rate": 6.443495674990641e-09, "loss": 0.5819, "step": 12040 }, { "epoch": 0.9779095265166897, "grad_norm": 6.00288359521902, "learning_rate": 6.396397137654054e-09, "loss": 0.3921, "step": 12041 }, { "epoch": 0.9779907414927312, "grad_norm": 7.297473039045752, "learning_rate": 6.3494711444897495e-09, "loss": 0.4012, "step": 12042 }, { "epoch": 0.9780719564687729, "grad_norm": 4.082939335655915, "learning_rate": 6.302717698744298e-09, "loss": 0.4805, "step": 12043 }, { "epoch": 0.9781531714448144, "grad_norm": 9.09294805829235, "learning_rate": 6.2561368036531676e-09, "loss": 0.5639, "step": 12044 }, { "epoch": 0.978234386420856, "grad_norm": 6.290896132053336, "learning_rate": 6.209728462439613e-09, "loss": 0.5002, "step": 12045 }, { "epoch": 0.9783156013968975, "grad_norm": 8.382100561674394, "learning_rate": 6.1634926783143975e-09, "loss": 0.4165, "step": 12046 }, { "epoch": 0.9783968163729392, "grad_norm": 5.891844640379527, "learning_rate": 6.117429454477186e-09, "loss": 0.3735, "step": 12047 }, { "epoch": 0.9784780313489807, "grad_norm": 5.2181840686707766, "learning_rate": 6.071538794115151e-09, "loss": 0.5358, "step": 12048 }, { "epoch": 0.9785592463250223, "grad_norm": 5.170788538583498, "learning_rate": 6.025820700403529e-09, "loss": 0.5949, "step": 12049 }, { "epoch": 0.978640461301064, "grad_norm": 9.342590625760383, "learning_rate": 5.9802751765061785e-09, "loss": 0.553, "step": 12050 }, { "epoch": 0.9787216762771055, "grad_norm": 6.856775571985887, "learning_rate": 5.9349022255741905e-09, "loss": 0.5531, "step": 12051 }, { "epoch": 0.9788028912531471, "grad_norm": 4.222463014742318, "learning_rate": 5.889701850747276e-09, "loss": 0.5136, "step": 12052 }, { "epoch": 0.9788841062291886, "grad_norm": 6.316498496538767, "learning_rate": 5.844674055153487e-09, "loss": 0.4999, "step": 12053 }, { "epoch": 0.9789653212052303, "grad_norm": 5.951917290095282, "learning_rate": 5.799818841907556e-09, "loss": 0.4719, "step": 12054 }, { "epoch": 0.9790465361812718, "grad_norm": 10.835272741624479, "learning_rate": 5.7551362141142205e-09, "loss": 0.3861, "step": 12055 }, { "epoch": 0.9791277511573134, "grad_norm": 6.226477036302456, "learning_rate": 5.71062617486462e-09, "loss": 0.5043, "step": 12056 }, { "epoch": 0.979208966133355, "grad_norm": 9.360240176470688, "learning_rate": 5.666288727239066e-09, "loss": 0.4041, "step": 12057 }, { "epoch": 0.9792901811093966, "grad_norm": 8.608220193471967, "learning_rate": 5.622123874305108e-09, "loss": 0.3628, "step": 12058 }, { "epoch": 0.9793713960854381, "grad_norm": 5.828438095145518, "learning_rate": 5.578131619118909e-09, "loss": 0.3776, "step": 12059 }, { "epoch": 0.9794526110614797, "grad_norm": 4.52241346057629, "learning_rate": 5.534311964724426e-09, "loss": 0.4952, "step": 12060 }, { "epoch": 0.9795338260375214, "grad_norm": 3.8196975497917736, "learning_rate": 5.490664914153676e-09, "loss": 0.4629, "step": 12061 }, { "epoch": 0.9796150410135629, "grad_norm": 4.37711626897485, "learning_rate": 5.447190470427022e-09, "loss": 0.4722, "step": 12062 }, { "epoch": 0.9796962559896045, "grad_norm": 3.977099684985591, "learning_rate": 5.4038886365523346e-09, "loss": 0.5534, "step": 12063 }, { "epoch": 0.979777470965646, "grad_norm": 3.64657798618061, "learning_rate": 5.360759415526385e-09, "loss": 0.4216, "step": 12064 }, { "epoch": 0.9798586859416877, "grad_norm": 5.625694324125192, "learning_rate": 5.3178028103331725e-09, "loss": 0.4299, "step": 12065 }, { "epoch": 0.9799399009177292, "grad_norm": 3.7515462457345823, "learning_rate": 5.275018823945044e-09, "loss": 0.4106, "step": 12066 }, { "epoch": 0.9800211158937708, "grad_norm": 4.423453741648708, "learning_rate": 5.232407459322408e-09, "loss": 0.3944, "step": 12067 }, { "epoch": 0.9801023308698124, "grad_norm": 5.1224553290352635, "learning_rate": 5.189968719413741e-09, "loss": 0.379, "step": 12068 }, { "epoch": 0.980183545845854, "grad_norm": 7.181016487036675, "learning_rate": 5.14770260715558e-09, "loss": 0.4096, "step": 12069 }, { "epoch": 0.9802647608218955, "grad_norm": 5.486473028790214, "learning_rate": 5.10560912547281e-09, "loss": 0.5059, "step": 12070 }, { "epoch": 0.9803459757979371, "grad_norm": 4.843785469577583, "learning_rate": 5.063688277277545e-09, "loss": 0.444, "step": 12071 }, { "epoch": 0.9804271907739788, "grad_norm": 12.938046238864795, "learning_rate": 5.021940065471076e-09, "loss": 0.4866, "step": 12072 }, { "epoch": 0.9805084057500203, "grad_norm": 10.093029172739994, "learning_rate": 4.980364492941924e-09, "loss": 0.4679, "step": 12073 }, { "epoch": 0.9805896207260619, "grad_norm": 7.273329994281363, "learning_rate": 4.938961562566402e-09, "loss": 0.3914, "step": 12074 }, { "epoch": 0.9806708357021034, "grad_norm": 3.3725698256307437, "learning_rate": 4.8977312772102715e-09, "loss": 0.5183, "step": 12075 }, { "epoch": 0.9807520506781451, "grad_norm": 5.1217541737470835, "learning_rate": 4.856673639725695e-09, "loss": 0.498, "step": 12076 }, { "epoch": 0.9808332656541866, "grad_norm": 3.8730107181355535, "learning_rate": 4.815788652954012e-09, "loss": 0.4615, "step": 12077 }, { "epoch": 0.9809144806302282, "grad_norm": 8.294253938971737, "learning_rate": 4.775076319724348e-09, "loss": 0.5011, "step": 12078 }, { "epoch": 0.9809956956062698, "grad_norm": 3.478676476091108, "learning_rate": 4.734536642853338e-09, "loss": 0.5997, "step": 12079 }, { "epoch": 0.9810769105823114, "grad_norm": 4.181120233195193, "learning_rate": 4.6941696251465165e-09, "loss": 0.6451, "step": 12080 }, { "epoch": 0.9811581255583529, "grad_norm": 3.6134990519276466, "learning_rate": 4.6539752693969265e-09, "loss": 0.4854, "step": 12081 }, { "epoch": 0.9812393405343945, "grad_norm": 11.700469427119463, "learning_rate": 4.613953578385954e-09, "loss": 0.3663, "step": 12082 }, { "epoch": 0.9813205555104362, "grad_norm": 4.369290046988292, "learning_rate": 4.574104554882497e-09, "loss": 0.7481, "step": 12083 }, { "epoch": 0.9814017704864777, "grad_norm": 5.401987698332693, "learning_rate": 4.534428201644348e-09, "loss": 0.423, "step": 12084 }, { "epoch": 0.9814829854625193, "grad_norm": 9.430350487972829, "learning_rate": 4.494924521416533e-09, "loss": 0.4895, "step": 12085 }, { "epoch": 0.9815642004385609, "grad_norm": 3.7707234936613565, "learning_rate": 4.455593516932699e-09, "loss": 0.5178, "step": 12086 }, { "epoch": 0.9816454154146025, "grad_norm": 6.985985247371312, "learning_rate": 4.4164351909142815e-09, "loss": 0.3872, "step": 12087 }, { "epoch": 0.981726630390644, "grad_norm": 5.1330117460195215, "learning_rate": 4.377449546071055e-09, "loss": 0.4994, "step": 12088 }, { "epoch": 0.9818078453666856, "grad_norm": 6.234609568273921, "learning_rate": 4.338636585100309e-09, "loss": 0.5062, "step": 12089 }, { "epoch": 0.9818890603427272, "grad_norm": 7.581218288677617, "learning_rate": 4.299996310687671e-09, "loss": 0.5828, "step": 12090 }, { "epoch": 0.9819702753187688, "grad_norm": 5.845752308546046, "learning_rate": 4.261528725507113e-09, "loss": 0.472, "step": 12091 }, { "epoch": 0.9820514902948103, "grad_norm": 7.146820683688464, "learning_rate": 4.223233832220397e-09, "loss": 0.4632, "step": 12092 }, { "epoch": 0.982132705270852, "grad_norm": 3.2381775735767935, "learning_rate": 4.18511163347679e-09, "loss": 0.6537, "step": 12093 }, { "epoch": 0.9822139202468936, "grad_norm": 5.730027453232988, "learning_rate": 4.147162131914739e-09, "loss": 0.3759, "step": 12094 }, { "epoch": 0.9822951352229351, "grad_norm": 3.5984616089033827, "learning_rate": 4.109385330159921e-09, "loss": 0.5452, "step": 12095 }, { "epoch": 0.9823763501989767, "grad_norm": 6.769530777534317, "learning_rate": 4.071781230826355e-09, "loss": 0.4365, "step": 12096 }, { "epoch": 0.9824575651750183, "grad_norm": 5.819933789150477, "learning_rate": 4.034349836516127e-09, "loss": 0.4541, "step": 12097 }, { "epoch": 0.9825387801510599, "grad_norm": 5.491397739648694, "learning_rate": 3.99709114981911e-09, "loss": 0.3554, "step": 12098 }, { "epoch": 0.9826199951271014, "grad_norm": 5.29270938282257, "learning_rate": 3.960005173313519e-09, "loss": 0.3631, "step": 12099 }, { "epoch": 0.982701210103143, "grad_norm": 5.1898426034045615, "learning_rate": 3.923091909565357e-09, "loss": 0.4277, "step": 12100 }, { "epoch": 0.9827824250791846, "grad_norm": 4.856901331420813, "learning_rate": 3.88635136112897e-09, "loss": 0.4089, "step": 12101 }, { "epoch": 0.9828636400552262, "grad_norm": 3.4872568138932762, "learning_rate": 3.8497835305464915e-09, "loss": 0.6127, "step": 12102 }, { "epoch": 0.9829448550312677, "grad_norm": 13.086059636580629, "learning_rate": 3.813388420348396e-09, "loss": 0.4843, "step": 12103 }, { "epoch": 0.9830260700073093, "grad_norm": 5.395590802101605, "learning_rate": 3.777166033052948e-09, "loss": 0.4887, "step": 12104 }, { "epoch": 0.983107284983351, "grad_norm": 4.809183849119693, "learning_rate": 3.741116371166476e-09, "loss": 0.5797, "step": 12105 }, { "epoch": 0.9831884999593925, "grad_norm": 7.000518609996093, "learning_rate": 3.705239437183372e-09, "loss": 0.6074, "step": 12106 }, { "epoch": 0.9832697149354341, "grad_norm": 7.749862840244079, "learning_rate": 3.6695352335863745e-09, "loss": 0.3449, "step": 12107 }, { "epoch": 0.9833509299114757, "grad_norm": 5.2466883835337494, "learning_rate": 3.6340037628460057e-09, "loss": 0.4941, "step": 12108 }, { "epoch": 0.9834321448875173, "grad_norm": 20.385619121552477, "learning_rate": 3.5986450274205776e-09, "loss": 0.4583, "step": 12109 }, { "epoch": 0.9835133598635588, "grad_norm": 4.0593617565984035, "learning_rate": 3.5634590297570215e-09, "loss": 0.4225, "step": 12110 }, { "epoch": 0.9835945748396004, "grad_norm": 6.465413464659027, "learning_rate": 3.528445772289779e-09, "loss": 0.5616, "step": 12111 }, { "epoch": 0.983675789815642, "grad_norm": 8.991522570276532, "learning_rate": 3.4936052574416345e-09, "loss": 0.3596, "step": 12112 }, { "epoch": 0.9837570047916836, "grad_norm": 4.818729279084131, "learning_rate": 3.458937487623437e-09, "loss": 0.4936, "step": 12113 }, { "epoch": 0.9838382197677251, "grad_norm": 6.112039073938186, "learning_rate": 3.424442465234101e-09, "loss": 0.4451, "step": 12114 }, { "epoch": 0.9839194347437668, "grad_norm": 6.296027842842914, "learning_rate": 3.3901201926606063e-09, "loss": 0.5395, "step": 12115 }, { "epoch": 0.9840006497198084, "grad_norm": 5.253597703880921, "learning_rate": 3.3559706722774423e-09, "loss": 0.6561, "step": 12116 }, { "epoch": 0.9840818646958499, "grad_norm": 5.551682606805065, "learning_rate": 3.3219939064477182e-09, "loss": 0.3641, "step": 12117 }, { "epoch": 0.9841630796718915, "grad_norm": 7.465132500098328, "learning_rate": 3.288189897522609e-09, "loss": 0.4635, "step": 12118 }, { "epoch": 0.9842442946479331, "grad_norm": 8.173920927315894, "learning_rate": 3.254558647841077e-09, "loss": 0.5209, "step": 12119 }, { "epoch": 0.9843255096239747, "grad_norm": 4.788329870369596, "learning_rate": 3.2211001597304283e-09, "loss": 0.5823, "step": 12120 }, { "epoch": 0.9844067246000162, "grad_norm": 6.8587617655903355, "learning_rate": 3.187814435505199e-09, "loss": 0.4951, "step": 12121 }, { "epoch": 0.9844879395760578, "grad_norm": 5.481814887439901, "learning_rate": 3.1547014774693797e-09, "loss": 0.3174, "step": 12122 }, { "epoch": 0.9845691545520994, "grad_norm": 8.885860015107669, "learning_rate": 3.1217612879139158e-09, "loss": 0.4645, "step": 12123 }, { "epoch": 0.984650369528141, "grad_norm": 4.284808548816115, "learning_rate": 3.088993869117818e-09, "loss": 0.4884, "step": 12124 }, { "epoch": 0.9847315845041825, "grad_norm": 4.896143196735318, "learning_rate": 3.056399223348716e-09, "loss": 0.3905, "step": 12125 }, { "epoch": 0.9848127994802242, "grad_norm": 4.786981373349127, "learning_rate": 3.023977352861751e-09, "loss": 0.6363, "step": 12126 }, { "epoch": 0.9848940144562658, "grad_norm": 5.440565139296447, "learning_rate": 2.991728259900684e-09, "loss": 0.4777, "step": 12127 }, { "epoch": 0.9849752294323073, "grad_norm": 4.432486814000843, "learning_rate": 2.959651946696507e-09, "loss": 0.4696, "step": 12128 }, { "epoch": 0.9850564444083489, "grad_norm": 4.593612284410864, "learning_rate": 2.927748415469389e-09, "loss": 0.4412, "step": 12129 }, { "epoch": 0.9851376593843905, "grad_norm": 11.349530667232669, "learning_rate": 2.8960176684261767e-09, "loss": 0.5133, "step": 12130 }, { "epoch": 0.9852188743604321, "grad_norm": 7.977641323262256, "learning_rate": 2.86445970776289e-09, "loss": 0.4928, "step": 12131 }, { "epoch": 0.9853000893364736, "grad_norm": 4.0771155412598805, "learning_rate": 2.833074535663338e-09, "loss": 0.4539, "step": 12132 }, { "epoch": 0.9853813043125152, "grad_norm": 4.941422589211022, "learning_rate": 2.8018621542988402e-09, "loss": 0.4797, "step": 12133 }, { "epoch": 0.9854625192885568, "grad_norm": 8.421332670522345, "learning_rate": 2.7708225658290566e-09, "loss": 0.5164, "step": 12134 }, { "epoch": 0.9855437342645984, "grad_norm": 7.591462565867755, "learning_rate": 2.739955772401992e-09, "loss": 0.621, "step": 12135 }, { "epoch": 0.9856249492406399, "grad_norm": 5.772643166679314, "learning_rate": 2.709261776153438e-09, "loss": 0.4788, "step": 12136 }, { "epoch": 0.9857061642166816, "grad_norm": 6.132821992629694, "learning_rate": 2.6787405792072507e-09, "loss": 0.4194, "step": 12137 }, { "epoch": 0.9857873791927232, "grad_norm": 5.663712178693497, "learning_rate": 2.6483921836753525e-09, "loss": 0.4067, "step": 12138 }, { "epoch": 0.9858685941687647, "grad_norm": 5.534184126963757, "learning_rate": 2.6182165916577295e-09, "loss": 0.547, "step": 12139 }, { "epoch": 0.9859498091448063, "grad_norm": 4.047838425287923, "learning_rate": 2.5882138052421567e-09, "loss": 0.4595, "step": 12140 }, { "epoch": 0.9860310241208479, "grad_norm": 3.5712541301580667, "learning_rate": 2.5583838265050286e-09, "loss": 0.4067, "step": 12141 }, { "epoch": 0.9861122390968895, "grad_norm": 5.1536702438204065, "learning_rate": 2.52872665751025e-09, "loss": 0.5706, "step": 12142 }, { "epoch": 0.986193454072931, "grad_norm": 16.616810366998326, "learning_rate": 2.4992423003095124e-09, "loss": 0.5131, "step": 12143 }, { "epoch": 0.9862746690489727, "grad_norm": 5.2196354867906685, "learning_rate": 2.4699307569436835e-09, "loss": 0.4475, "step": 12144 }, { "epoch": 0.9863558840250142, "grad_norm": 4.94188443326249, "learning_rate": 2.4407920294405864e-09, "loss": 0.381, "step": 12145 }, { "epoch": 0.9864370990010558, "grad_norm": 6.982635794914591, "learning_rate": 2.4118261198166625e-09, "loss": 0.4683, "step": 12146 }, { "epoch": 0.9865183139770973, "grad_norm": 8.68074477826504, "learning_rate": 2.383033030075865e-09, "loss": 0.4486, "step": 12147 }, { "epoch": 0.986599528953139, "grad_norm": 4.488613369870494, "learning_rate": 2.354412762210767e-09, "loss": 0.5145, "step": 12148 }, { "epoch": 0.9866807439291806, "grad_norm": 8.342369544568935, "learning_rate": 2.325965318201728e-09, "loss": 0.4742, "step": 12149 }, { "epoch": 0.9867619589052221, "grad_norm": 4.473725413880338, "learning_rate": 2.2976907000171743e-09, "loss": 0.4517, "step": 12150 }, { "epoch": 0.9868431738812637, "grad_norm": 5.732059155908079, "learning_rate": 2.2695889096133184e-09, "loss": 0.5598, "step": 12151 }, { "epoch": 0.9869243888573053, "grad_norm": 6.627352082908765, "learning_rate": 2.2416599489349933e-09, "loss": 0.4432, "step": 12152 }, { "epoch": 0.9870056038333469, "grad_norm": 6.4462903139711525, "learning_rate": 2.2139038199145424e-09, "loss": 0.641, "step": 12153 }, { "epoch": 0.9870868188093884, "grad_norm": 4.235134074339075, "learning_rate": 2.1863205244726514e-09, "loss": 0.3059, "step": 12154 }, { "epoch": 0.98716803378543, "grad_norm": 10.34898874466498, "learning_rate": 2.1589100645180715e-09, "loss": 0.4217, "step": 12155 }, { "epoch": 0.9872492487614716, "grad_norm": 7.083106400400486, "learning_rate": 2.1316724419470637e-09, "loss": 0.4262, "step": 12156 }, { "epoch": 0.9873304637375132, "grad_norm": 4.880842030469988, "learning_rate": 2.1046076586445084e-09, "loss": 0.4115, "step": 12157 }, { "epoch": 0.9874116787135547, "grad_norm": 5.428807688681806, "learning_rate": 2.077715716483353e-09, "loss": 0.4031, "step": 12158 }, { "epoch": 0.9874928936895964, "grad_norm": 3.5615581342610594, "learning_rate": 2.0509966173240524e-09, "loss": 0.4478, "step": 12159 }, { "epoch": 0.987574108665638, "grad_norm": 4.9802028956802245, "learning_rate": 2.0244503630154066e-09, "loss": 0.566, "step": 12160 }, { "epoch": 0.9876553236416795, "grad_norm": 3.88910810121843, "learning_rate": 1.9980769553948344e-09, "loss": 0.3933, "step": 12161 }, { "epoch": 0.9877365386177211, "grad_norm": 4.551525974114428, "learning_rate": 1.9718763962867094e-09, "loss": 0.4622, "step": 12162 }, { "epoch": 0.9878177535937627, "grad_norm": 6.650520927884947, "learning_rate": 1.945848687504026e-09, "loss": 0.5204, "step": 12163 }, { "epoch": 0.9878989685698043, "grad_norm": 5.106322645005586, "learning_rate": 1.919993830847844e-09, "loss": 0.3685, "step": 12164 }, { "epoch": 0.9879801835458458, "grad_norm": 5.293912542968066, "learning_rate": 1.8943118281070095e-09, "loss": 0.4464, "step": 12165 }, { "epoch": 0.9880613985218875, "grad_norm": 4.885311151399439, "learning_rate": 1.86880268105899e-09, "loss": 0.5068, "step": 12166 }, { "epoch": 0.988142613497929, "grad_norm": 5.1380448467955295, "learning_rate": 1.8434663914687623e-09, "loss": 0.4485, "step": 12167 }, { "epoch": 0.9882238284739706, "grad_norm": 4.9192525339798925, "learning_rate": 1.8183029610890912e-09, "loss": 0.5346, "step": 12168 }, { "epoch": 0.9883050434500121, "grad_norm": 4.9277595849293565, "learning_rate": 1.7933123916613614e-09, "loss": 0.4065, "step": 12169 }, { "epoch": 0.9883862584260538, "grad_norm": 5.8993559652023215, "learning_rate": 1.7684946849150232e-09, "loss": 0.459, "step": 12170 }, { "epoch": 0.9884674734020954, "grad_norm": 4.936973153328386, "learning_rate": 1.7438498425673135e-09, "loss": 0.5248, "step": 12171 }, { "epoch": 0.9885486883781369, "grad_norm": 4.4227762528236125, "learning_rate": 1.7193778663229799e-09, "loss": 0.5324, "step": 12172 }, { "epoch": 0.9886299033541786, "grad_norm": 4.957741580907209, "learning_rate": 1.6950787578759453e-09, "loss": 0.5095, "step": 12173 }, { "epoch": 0.9887111183302201, "grad_norm": 6.64145740938836, "learning_rate": 1.6709525189073649e-09, "loss": 0.4332, "step": 12174 }, { "epoch": 0.9887923333062617, "grad_norm": 6.145921162498026, "learning_rate": 1.646999151086459e-09, "loss": 0.3923, "step": 12175 }, { "epoch": 0.9888735482823032, "grad_norm": 5.254183282644911, "learning_rate": 1.6232186560710684e-09, "loss": 0.4596, "step": 12176 }, { "epoch": 0.9889547632583449, "grad_norm": 17.99187941281074, "learning_rate": 1.599611035506543e-09, "loss": 0.6023, "step": 12177 }, { "epoch": 0.9890359782343864, "grad_norm": 10.290258462437267, "learning_rate": 1.5761762910260214e-09, "loss": 0.5383, "step": 12178 }, { "epoch": 0.989117193210428, "grad_norm": 6.646644372837433, "learning_rate": 1.5529144242518167e-09, "loss": 0.5216, "step": 12179 }, { "epoch": 0.9891984081864695, "grad_norm": 4.4766333642428675, "learning_rate": 1.5298254367926424e-09, "loss": 0.4035, "step": 12180 }, { "epoch": 0.9892796231625112, "grad_norm": 9.914986537758677, "learning_rate": 1.5069093302469418e-09, "loss": 0.4324, "step": 12181 }, { "epoch": 0.9893608381385528, "grad_norm": 4.176667675043893, "learning_rate": 1.4841661061998358e-09, "loss": 0.4502, "step": 12182 }, { "epoch": 0.9894420531145943, "grad_norm": 3.795368320920534, "learning_rate": 1.4615957662250657e-09, "loss": 0.4701, "step": 12183 }, { "epoch": 0.989523268090636, "grad_norm": 5.32972800464418, "learning_rate": 1.4391983118847152e-09, "loss": 0.605, "step": 12184 }, { "epoch": 0.9896044830666775, "grad_norm": 6.008043856391167, "learning_rate": 1.4169737447283782e-09, "loss": 0.3262, "step": 12185 }, { "epoch": 0.9896856980427191, "grad_norm": 7.84003745927447, "learning_rate": 1.394922066293991e-09, "loss": 0.5486, "step": 12186 }, { "epoch": 0.9897669130187606, "grad_norm": 6.066959827307744, "learning_rate": 1.3730432781070002e-09, "loss": 0.4331, "step": 12187 }, { "epoch": 0.9898481279948023, "grad_norm": 17.157061321750632, "learning_rate": 1.3513373816820274e-09, "loss": 0.642, "step": 12188 }, { "epoch": 0.9899293429708438, "grad_norm": 3.932173196085788, "learning_rate": 1.3298043785203718e-09, "loss": 0.4469, "step": 12189 }, { "epoch": 0.9900105579468854, "grad_norm": 8.805868997589002, "learning_rate": 1.30844427011223e-09, "loss": 0.485, "step": 12190 }, { "epoch": 0.9900917729229269, "grad_norm": 4.027618081932802, "learning_rate": 1.287257057935587e-09, "loss": 0.4275, "step": 12191 }, { "epoch": 0.9901729878989686, "grad_norm": 8.085780436510987, "learning_rate": 1.2662427434564916e-09, "loss": 0.5487, "step": 12192 }, { "epoch": 0.9902542028750102, "grad_norm": 6.6055275665447875, "learning_rate": 1.2454013281290589e-09, "loss": 0.4155, "step": 12193 }, { "epoch": 0.9903354178510517, "grad_norm": 5.7796904505260365, "learning_rate": 1.2247328133954683e-09, "loss": 0.4356, "step": 12194 }, { "epoch": 0.9904166328270934, "grad_norm": 4.4243001082018445, "learning_rate": 1.2042372006856873e-09, "loss": 0.4525, "step": 12195 }, { "epoch": 0.9904978478031349, "grad_norm": 5.706811651819559, "learning_rate": 1.1839144914180256e-09, "loss": 0.3391, "step": 12196 }, { "epoch": 0.9905790627791765, "grad_norm": 5.337953154660965, "learning_rate": 1.1637646869985809e-09, "loss": 0.4905, "step": 12197 }, { "epoch": 0.990660277755218, "grad_norm": 7.132197947930379, "learning_rate": 1.143787788821793e-09, "loss": 0.4736, "step": 12198 }, { "epoch": 0.9907414927312597, "grad_norm": 8.488570779233747, "learning_rate": 1.1239837982698898e-09, "loss": 0.4538, "step": 12199 }, { "epoch": 0.9908227077073012, "grad_norm": 8.566390662400726, "learning_rate": 1.104352716713164e-09, "loss": 0.5289, "step": 12200 }, { "epoch": 0.9909039226833428, "grad_norm": 7.713592201697891, "learning_rate": 1.0848945455099734e-09, "loss": 0.4059, "step": 12201 }, { "epoch": 0.9909851376593843, "grad_norm": 8.262715346834725, "learning_rate": 1.0656092860067413e-09, "loss": 0.367, "step": 12202 }, { "epoch": 0.991066352635426, "grad_norm": 5.432388233712127, "learning_rate": 1.046496939538233e-09, "loss": 0.3928, "step": 12203 }, { "epoch": 0.9911475676114676, "grad_norm": 5.147447230251187, "learning_rate": 1.027557507426169e-09, "loss": 0.5214, "step": 12204 }, { "epoch": 0.9912287825875091, "grad_norm": 5.0721034321084515, "learning_rate": 1.0087909909817228e-09, "loss": 0.3809, "step": 12205 }, { "epoch": 0.9913099975635508, "grad_norm": 3.1443588440586963, "learning_rate": 9.901973915033004e-10, "loss": 0.5828, "step": 12206 }, { "epoch": 0.9913912125395923, "grad_norm": 7.70458840955126, "learning_rate": 9.717767102770947e-10, "loss": 0.3964, "step": 12207 }, { "epoch": 0.9914724275156339, "grad_norm": 5.013950694674269, "learning_rate": 9.535289485781973e-10, "loss": 0.5808, "step": 12208 }, { "epoch": 0.9915536424916754, "grad_norm": 5.467259982799362, "learning_rate": 9.354541076692092e-10, "loss": 0.3382, "step": 12209 }, { "epoch": 0.9916348574677171, "grad_norm": 3.502450749974113, "learning_rate": 9.17552188800519e-10, "loss": 0.4828, "step": 12210 }, { "epoch": 0.9917160724437586, "grad_norm": 4.810040707477097, "learning_rate": 8.998231932108581e-10, "loss": 0.4266, "step": 12211 }, { "epoch": 0.9917972874198002, "grad_norm": 4.375638079472969, "learning_rate": 8.822671221273005e-10, "loss": 0.5797, "step": 12212 }, { "epoch": 0.9918785023958417, "grad_norm": 9.293670755342289, "learning_rate": 8.648839767644302e-10, "loss": 0.621, "step": 12213 }, { "epoch": 0.9919597173718834, "grad_norm": 6.6261486011288335, "learning_rate": 8.476737583251737e-10, "loss": 0.4257, "step": 12214 }, { "epoch": 0.992040932347925, "grad_norm": 4.4991871759075375, "learning_rate": 8.306364680002454e-10, "loss": 0.522, "step": 12215 }, { "epoch": 0.9921221473239665, "grad_norm": 4.881228997280039, "learning_rate": 8.137721069687021e-10, "loss": 0.526, "step": 12216 }, { "epoch": 0.9922033623000082, "grad_norm": 3.6297898118275964, "learning_rate": 7.970806763973882e-10, "loss": 0.4575, "step": 12217 }, { "epoch": 0.9922845772760497, "grad_norm": 6.246461103137779, "learning_rate": 7.805621774409356e-10, "loss": 0.3662, "step": 12218 }, { "epoch": 0.9923657922520913, "grad_norm": 5.111388107129699, "learning_rate": 7.642166112428739e-10, "loss": 0.5867, "step": 12219 }, { "epoch": 0.9924470072281328, "grad_norm": 6.103417103656217, "learning_rate": 7.480439789339655e-10, "loss": 0.489, "step": 12220 }, { "epoch": 0.9925282222041745, "grad_norm": 5.872345642365211, "learning_rate": 7.320442816333151e-10, "loss": 0.4852, "step": 12221 }, { "epoch": 0.992609437180216, "grad_norm": 4.896384077479555, "learning_rate": 7.162175204480926e-10, "loss": 0.415, "step": 12222 }, { "epoch": 0.9926906521562576, "grad_norm": 11.24619397959855, "learning_rate": 7.005636964732554e-10, "loss": 0.5892, "step": 12223 }, { "epoch": 0.9927718671322991, "grad_norm": 6.417486574394487, "learning_rate": 6.850828107921037e-10, "loss": 0.4542, "step": 12224 }, { "epoch": 0.9928530821083408, "grad_norm": 5.669079347074385, "learning_rate": 6.697748644757252e-10, "loss": 0.4736, "step": 12225 }, { "epoch": 0.9929342970843824, "grad_norm": 5.1508249817000475, "learning_rate": 6.546398585832725e-10, "loss": 0.3881, "step": 12226 }, { "epoch": 0.9930155120604239, "grad_norm": 6.284781187211057, "learning_rate": 6.396777941622412e-10, "loss": 0.5204, "step": 12227 }, { "epoch": 0.9930967270364656, "grad_norm": 5.867023818166328, "learning_rate": 6.248886722479142e-10, "loss": 0.3868, "step": 12228 }, { "epoch": 0.9931779420125071, "grad_norm": 7.865759003313058, "learning_rate": 6.10272493863362e-10, "loss": 0.4652, "step": 12229 }, { "epoch": 0.9932591569885487, "grad_norm": 5.771846652668555, "learning_rate": 5.958292600202753e-10, "loss": 0.4368, "step": 12230 }, { "epoch": 0.9933403719645902, "grad_norm": 5.555877566179887, "learning_rate": 5.81558971717855e-10, "loss": 0.4511, "step": 12231 }, { "epoch": 0.9934215869406319, "grad_norm": 7.602271947993169, "learning_rate": 5.674616299436441e-10, "loss": 0.3559, "step": 12232 }, { "epoch": 0.9935028019166734, "grad_norm": 3.168714257439046, "learning_rate": 5.53537235672974e-10, "loss": 0.6782, "step": 12233 }, { "epoch": 0.993584016892715, "grad_norm": 8.0565315209248, "learning_rate": 5.397857898692404e-10, "loss": 0.4928, "step": 12234 }, { "epoch": 0.9936652318687565, "grad_norm": 6.7281592050291525, "learning_rate": 5.262072934841822e-10, "loss": 0.4097, "step": 12235 }, { "epoch": 0.9937464468447982, "grad_norm": 5.03523479370938, "learning_rate": 5.128017474573254e-10, "loss": 0.5035, "step": 12236 }, { "epoch": 0.9938276618208398, "grad_norm": 5.581578649372743, "learning_rate": 4.995691527162616e-10, "loss": 0.6748, "step": 12237 }, { "epoch": 0.9939088767968813, "grad_norm": 5.1959863608312755, "learning_rate": 4.86509510176647e-10, "loss": 0.325, "step": 12238 }, { "epoch": 0.993990091772923, "grad_norm": 5.218818979766321, "learning_rate": 4.736228207419258e-10, "loss": 0.5546, "step": 12239 }, { "epoch": 0.9940713067489645, "grad_norm": 6.565937064015396, "learning_rate": 4.60909085304162e-10, "loss": 0.4902, "step": 12240 }, { "epoch": 0.9941525217250061, "grad_norm": 4.111169296382956, "learning_rate": 4.4836830474265235e-10, "loss": 0.4279, "step": 12241 }, { "epoch": 0.9942337367010476, "grad_norm": 4.28308564761494, "learning_rate": 4.3600047992559124e-10, "loss": 0.5878, "step": 12242 }, { "epoch": 0.9943149516770893, "grad_norm": 7.57733637051035, "learning_rate": 4.2380561170840553e-10, "loss": 0.7041, "step": 12243 }, { "epoch": 0.9943961666531308, "grad_norm": 6.372067043265081, "learning_rate": 4.1178370093486463e-10, "loss": 0.4418, "step": 12244 }, { "epoch": 0.9944773816291724, "grad_norm": 3.884746049499681, "learning_rate": 3.9993474843735837e-10, "loss": 0.4731, "step": 12245 }, { "epoch": 0.994558596605214, "grad_norm": 8.66268673867393, "learning_rate": 3.882587550349537e-10, "loss": 0.4845, "step": 12246 }, { "epoch": 0.9946398115812556, "grad_norm": 4.113892528273869, "learning_rate": 3.7675572153644814e-10, "loss": 0.4564, "step": 12247 }, { "epoch": 0.9947210265572972, "grad_norm": 4.958163424799454, "learning_rate": 3.6542564873731645e-10, "loss": 0.4016, "step": 12248 }, { "epoch": 0.9948022415333387, "grad_norm": 3.6688498786954162, "learning_rate": 3.5426853742137613e-10, "loss": 0.6589, "step": 12249 }, { "epoch": 0.9948834565093804, "grad_norm": 10.717003613916313, "learning_rate": 3.432843883610648e-10, "loss": 0.4469, "step": 12250 }, { "epoch": 0.9949646714854219, "grad_norm": 4.462573986139782, "learning_rate": 3.3247320231605265e-10, "loss": 0.5045, "step": 12251 }, { "epoch": 0.9950458864614635, "grad_norm": 5.007934033669896, "learning_rate": 3.218349800346299e-10, "loss": 0.4665, "step": 12252 }, { "epoch": 0.995127101437505, "grad_norm": 3.873104868996485, "learning_rate": 3.1136972225315197e-10, "loss": 0.5173, "step": 12253 }, { "epoch": 0.9952083164135467, "grad_norm": 3.3876902753690796, "learning_rate": 3.0107742969520683e-10, "loss": 0.5485, "step": 12254 }, { "epoch": 0.9952895313895882, "grad_norm": 6.699091681394484, "learning_rate": 2.9095810307328e-10, "loss": 0.367, "step": 12255 }, { "epoch": 0.9953707463656298, "grad_norm": 6.171401454308493, "learning_rate": 2.810117430873671e-10, "loss": 0.3962, "step": 12256 }, { "epoch": 0.9954519613416714, "grad_norm": 4.923317857251655, "learning_rate": 2.71238350426084e-10, "loss": 0.4938, "step": 12257 }, { "epoch": 0.995533176317713, "grad_norm": 5.816170040992201, "learning_rate": 2.61637925765279e-10, "loss": 0.3092, "step": 12258 }, { "epoch": 0.9956143912937546, "grad_norm": 4.783682438235011, "learning_rate": 2.522104697696981e-10, "loss": 0.4861, "step": 12259 }, { "epoch": 0.9956956062697961, "grad_norm": 6.542965672565677, "learning_rate": 2.4295598309131973e-10, "loss": 0.4059, "step": 12260 }, { "epoch": 0.9957768212458378, "grad_norm": 5.290685230558423, "learning_rate": 2.3387446637046506e-10, "loss": 0.433, "step": 12261 }, { "epoch": 0.9958580362218793, "grad_norm": 4.569512260893426, "learning_rate": 2.2496592023579789e-10, "loss": 0.5698, "step": 12262 }, { "epoch": 0.9959392511979209, "grad_norm": 13.1705343773864, "learning_rate": 2.1623034530349197e-10, "loss": 0.4349, "step": 12263 }, { "epoch": 0.9960204661739624, "grad_norm": 11.027548037308877, "learning_rate": 2.076677421783413e-10, "loss": 0.3677, "step": 12264 }, { "epoch": 0.9961016811500041, "grad_norm": 7.761548829661015, "learning_rate": 1.992781114523723e-10, "loss": 0.4086, "step": 12265 }, { "epoch": 0.9961828961260456, "grad_norm": 4.496137975811519, "learning_rate": 1.910614537065092e-10, "loss": 0.523, "step": 12266 }, { "epoch": 0.9962641111020872, "grad_norm": 4.01367922274939, "learning_rate": 1.8301776950918615e-10, "loss": 0.4518, "step": 12267 }, { "epoch": 0.9963453260781288, "grad_norm": 11.271093585704282, "learning_rate": 1.7514705941690247e-10, "loss": 0.4174, "step": 12268 }, { "epoch": 0.9964265410541704, "grad_norm": 5.748127242285962, "learning_rate": 1.6744932397422254e-10, "loss": 0.5133, "step": 12269 }, { "epoch": 0.996507756030212, "grad_norm": 5.1019116955334916, "learning_rate": 1.5992456371377584e-10, "loss": 0.3742, "step": 12270 }, { "epoch": 0.9965889710062535, "grad_norm": 6.408098676761947, "learning_rate": 1.5257277915653458e-10, "loss": 0.4672, "step": 12271 }, { "epoch": 0.9966701859822952, "grad_norm": 6.484656082022969, "learning_rate": 1.4539397081070328e-10, "loss": 0.3491, "step": 12272 }, { "epoch": 0.9967514009583367, "grad_norm": 3.999338555705399, "learning_rate": 1.3838813917366188e-10, "loss": 0.5493, "step": 12273 }, { "epoch": 0.9968326159343783, "grad_norm": 5.4475266405766565, "learning_rate": 1.3155528472974523e-10, "loss": 0.3952, "step": 12274 }, { "epoch": 0.9969138309104199, "grad_norm": 4.872599475892686, "learning_rate": 1.2489540795163068e-10, "loss": 0.5474, "step": 12275 }, { "epoch": 0.9969950458864615, "grad_norm": 3.670731259776475, "learning_rate": 1.18408509300616e-10, "loss": 0.5644, "step": 12276 }, { "epoch": 0.997076260862503, "grad_norm": 6.2602727752630045, "learning_rate": 1.1209458922495365e-10, "loss": 0.5351, "step": 12277 }, { "epoch": 0.9971574758385446, "grad_norm": 4.875410969826889, "learning_rate": 1.0595364816207155e-10, "loss": 0.5048, "step": 12278 }, { "epoch": 0.9972386908145862, "grad_norm": 4.9678315360142395, "learning_rate": 9.998568653690754e-11, "loss": 0.5401, "step": 12279 }, { "epoch": 0.9973199057906278, "grad_norm": 4.51671927225658, "learning_rate": 9.41907047619095e-11, "loss": 0.516, "step": 12280 }, { "epoch": 0.9974011207666694, "grad_norm": 4.930864987720356, "learning_rate": 8.856870323842304e-11, "loss": 0.5275, "step": 12281 }, { "epoch": 0.997482335742711, "grad_norm": 6.047707965039309, "learning_rate": 8.311968235530376e-11, "loss": 0.4833, "step": 12282 }, { "epoch": 0.9975635507187526, "grad_norm": 7.174138199102842, "learning_rate": 7.784364248974996e-11, "loss": 0.5413, "step": 12283 }, { "epoch": 0.9976447656947941, "grad_norm": 6.751919593589131, "learning_rate": 7.274058400674744e-11, "loss": 0.4525, "step": 12284 }, { "epoch": 0.9977259806708357, "grad_norm": 5.318816576138242, "learning_rate": 6.781050725962468e-11, "loss": 0.4845, "step": 12285 }, { "epoch": 0.9978071956468773, "grad_norm": 4.612324143619252, "learning_rate": 6.30534125889426e-11, "loss": 0.5208, "step": 12286 }, { "epoch": 0.9978884106229189, "grad_norm": 6.7018444490349065, "learning_rate": 5.846930032443743e-11, "loss": 0.4859, "step": 12287 }, { "epoch": 0.9979696255989604, "grad_norm": 7.423300865438458, "learning_rate": 5.4058170783077845e-11, "loss": 0.3708, "step": 12288 }, { "epoch": 0.998050840575002, "grad_norm": 6.668113758835644, "learning_rate": 4.982002427017518e-11, "loss": 0.5712, "step": 12289 }, { "epoch": 0.9981320555510436, "grad_norm": 7.202885642026081, "learning_rate": 4.5754861078828314e-11, "loss": 0.4483, "step": 12290 }, { "epoch": 0.9982132705270852, "grad_norm": 8.215415290405625, "learning_rate": 4.186268149047879e-11, "loss": 0.4036, "step": 12291 }, { "epoch": 0.9982944855031268, "grad_norm": 3.4403988676406008, "learning_rate": 3.814348577435567e-11, "loss": 0.4738, "step": 12292 }, { "epoch": 0.9983757004791683, "grad_norm": 4.536469686726864, "learning_rate": 3.4597274187753163e-11, "loss": 0.5479, "step": 12293 }, { "epoch": 0.99845691545521, "grad_norm": 4.166942290927568, "learning_rate": 3.122404697603054e-11, "loss": 0.4988, "step": 12294 }, { "epoch": 0.9985381304312515, "grad_norm": 3.821490393417459, "learning_rate": 2.8023804372889762e-11, "loss": 0.6298, "step": 12295 }, { "epoch": 0.9986193454072931, "grad_norm": 4.579385987949685, "learning_rate": 2.499654659954276e-11, "loss": 0.4335, "step": 12296 }, { "epoch": 0.9987005603833347, "grad_norm": 3.5660288116414067, "learning_rate": 2.214227386554413e-11, "loss": 0.4252, "step": 12297 }, { "epoch": 0.9987817753593763, "grad_norm": 3.8887534314797567, "learning_rate": 1.9460986368513568e-11, "loss": 0.5141, "step": 12298 }, { "epoch": 0.9988629903354178, "grad_norm": 9.174267597596703, "learning_rate": 1.6952684293580767e-11, "loss": 0.4097, "step": 12299 }, { "epoch": 0.9989442053114594, "grad_norm": 4.342933188190943, "learning_rate": 1.4617367814495632e-11, "loss": 0.498, "step": 12300 }, { "epoch": 0.999025420287501, "grad_norm": 5.079424217042842, "learning_rate": 1.2455037093073163e-11, "loss": 0.5183, "step": 12301 }, { "epoch": 0.9991066352635426, "grad_norm": 8.135085893159895, "learning_rate": 1.0465692278638361e-11, "loss": 0.5642, "step": 12302 }, { "epoch": 0.9991878502395842, "grad_norm": 30.1448448129456, "learning_rate": 8.649333509136438e-12, "loss": 0.4073, "step": 12303 }, { "epoch": 0.9992690652156258, "grad_norm": 3.5750066680144217, "learning_rate": 7.005960910022591e-12, "loss": 0.4412, "step": 12304 }, { "epoch": 0.9993502801916674, "grad_norm": 13.396727140828338, "learning_rate": 5.535574594817128e-12, "loss": 0.3997, "step": 12305 }, { "epoch": 0.9994314951677089, "grad_norm": 6.163825951561884, "learning_rate": 4.238174665938122e-12, "loss": 0.3945, "step": 12306 }, { "epoch": 0.9995127101437505, "grad_norm": 5.7285348007196975, "learning_rate": 3.11376121248097e-12, "loss": 0.4673, "step": 12307 }, { "epoch": 0.9995939251197921, "grad_norm": 4.103396802719272, "learning_rate": 2.1623343124388405e-12, "loss": 0.4399, "step": 12308 }, { "epoch": 0.9996751400958337, "grad_norm": 4.630608713223542, "learning_rate": 1.3838940318700034e-12, "loss": 0.4208, "step": 12309 }, { "epoch": 0.9997563550718752, "grad_norm": 5.594155269906072, "learning_rate": 7.784404243427191e-13, "loss": 0.5584, "step": 12310 }, { "epoch": 0.9998375700479168, "grad_norm": 7.544818372955951, "learning_rate": 3.4597353176790696e-13, "loss": 0.4048, "step": 12311 }, { "epoch": 0.9999187850239584, "grad_norm": 4.9742349717120895, "learning_rate": 8.649338439914445e-14, "loss": 0.5781, "step": 12312 }, { "epoch": 1.0, "grad_norm": 4.419415245809011, "learning_rate": 0.0, "loss": 0.4966, "step": 12313 }, { "epoch": 1.0, "step": 12313, "total_flos": 582090100039680.0, "train_loss": 0.5386334688529102, "train_runtime": 33230.6995, "train_samples_per_second": 11.857, "train_steps_per_second": 0.371 } ], "logging_steps": 1.0, "max_steps": 12313, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 4101, "total_flos": 582090100039680.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }