diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0116929653611617, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001005846482680581, + "grad_norm": 12582.778327747239, + "learning_rate": 0.0, + "loss": 18.2821, + "step": 1 + }, + { + "epoch": 0.002011692965361162, + "grad_norm": 8175.9099112773565, + "learning_rate": 1.1164325195357284e-06, + "loss": 9.792, + "step": 2 + }, + { + "epoch": 0.0030175394480417427, + "grad_norm": 1649.5623048840205, + "learning_rate": 1.7695036780497693e-06, + "loss": 10.0444, + "step": 3 + }, + { + "epoch": 0.004023385930722324, + "grad_norm": 1686.6179804241558, + "learning_rate": 2.2328650390714567e-06, + "loss": 9.5109, + "step": 4 + }, + { + "epoch": 0.005029232413402905, + "grad_norm": 5483.012027070882, + "learning_rate": 2.5922760331558917e-06, + "loss": 8.2383, + "step": 5 + }, + { + "epoch": 0.006035078896083485, + "grad_norm": 1172.816476755199, + "learning_rate": 2.8859361975854976e-06, + "loss": 7.1834, + "step": 6 + }, + { + "epoch": 0.007040925378764066, + "grad_norm": 657.0193977991333, + "learning_rate": 3.1342223288637992e-06, + "loss": 5.4344, + "step": 7 + }, + { + "epoch": 0.008046771861444648, + "grad_norm": 372.67224237745364, + "learning_rate": 3.349297558607185e-06, + "loss": 4.6949, + "step": 8 + }, + { + "epoch": 0.009052618344125228, + "grad_norm": 380.45025528740604, + "learning_rate": 3.5390073560995386e-06, + "loss": 4.5428, + "step": 9 + }, + { + "epoch": 0.01005846482680581, + "grad_norm": 773.3075300312582, + "learning_rate": 3.7087085526916205e-06, + "loss": 4.2198, + "step": 10 + }, + { + "epoch": 0.01106431130948639, + "grad_norm": 208.01920169269192, + "learning_rate": 3.862221958156801e-06, + "loss": 4.3316, + "step": 11 + }, + { + "epoch": 0.01207015779216697, + "grad_norm": 188.8951453419855, + "learning_rate": 4.0023687171212264e-06, + "loss": 4.0229, + "step": 12 + }, + { + "epoch": 0.013076004274847551, + "grad_norm": 846.972851780641, + "learning_rate": 4.131291237914341e-06, + "loss": 3.7635, + "step": 13 + }, + { + "epoch": 0.014081850757528132, + "grad_norm": 252.04772708907407, + "learning_rate": 4.250654848399527e-06, + "loss": 3.6946, + "step": 14 + }, + { + "epoch": 0.015087697240208713, + "grad_norm": 177.74605915923814, + "learning_rate": 4.3617797112056605e-06, + "loss": 3.539, + "step": 15 + }, + { + "epoch": 0.016093543722889295, + "grad_norm": 157.7895649733611, + "learning_rate": 4.4657300781429134e-06, + "loss": 3.5069, + "step": 16 + }, + { + "epoch": 0.017099390205569876, + "grad_norm": 3471.9413534733717, + "learning_rate": 4.563376438365783e-06, + "loss": 3.457, + "step": 17 + }, + { + "epoch": 0.018105236688250457, + "grad_norm": 9271.52283588444, + "learning_rate": 4.6554398756352665e-06, + "loss": 3.4839, + "step": 18 + }, + { + "epoch": 0.019111083170931038, + "grad_norm": 1707.1301369245853, + "learning_rate": 4.742524416638964e-06, + "loss": 3.4175, + "step": 19 + }, + { + "epoch": 0.02011692965361162, + "grad_norm": 380.3932734821994, + "learning_rate": 4.825141072227348e-06, + "loss": 3.413, + "step": 20 + }, + { + "epoch": 0.0211227761362922, + "grad_norm": 412.42448179438645, + "learning_rate": 4.903726006913569e-06, + "loss": 3.3572, + "step": 21 + }, + { + "epoch": 0.02212862261897278, + "grad_norm": 155.79462929869592, + "learning_rate": 4.97865447769253e-06, + "loss": 3.3687, + "step": 22 + }, + { + "epoch": 0.02313446910165336, + "grad_norm": 146.80182332367627, + "learning_rate": 5.050251671876699e-06, + "loss": 3.2905, + "step": 23 + }, + { + "epoch": 0.02414031558433394, + "grad_norm": 152.70556523236607, + "learning_rate": 5.118801236656955e-06, + "loss": 3.3167, + "step": 24 + }, + { + "epoch": 0.025146162067014522, + "grad_norm": 145.35719764915638, + "learning_rate": 5.184552066311783e-06, + "loss": 3.2275, + "step": 25 + }, + { + "epoch": 0.026152008549695103, + "grad_norm": 142.30381750870376, + "learning_rate": 5.247723757450069e-06, + "loss": 3.2108, + "step": 26 + }, + { + "epoch": 0.027157855032375684, + "grad_norm": 190.7780665632596, + "learning_rate": 5.3085110341493074e-06, + "loss": 3.1127, + "step": 27 + }, + { + "epoch": 0.028163701515056264, + "grad_norm": 140.71233699510987, + "learning_rate": 5.367087367935257e-06, + "loss": 3.2016, + "step": 28 + }, + { + "epoch": 0.029169547997736845, + "grad_norm": 144.31455234163855, + "learning_rate": 5.423607962246961e-06, + "loss": 3.1428, + "step": 29 + }, + { + "epoch": 0.030175394480417426, + "grad_norm": 166.83275617027454, + "learning_rate": 5.47821223074139e-06, + "loss": 3.166, + "step": 30 + }, + { + "epoch": 0.031181240963098007, + "grad_norm": 150.84029040740148, + "learning_rate": 5.531025869079829e-06, + "loss": 3.0718, + "step": 31 + }, + { + "epoch": 0.03218708744577859, + "grad_norm": 167.4297289699058, + "learning_rate": 5.582162597678642e-06, + "loss": 3.0989, + "step": 32 + }, + { + "epoch": 0.03319293392845917, + "grad_norm": 196.37733263631793, + "learning_rate": 5.631725636206569e-06, + "loss": 3.0453, + "step": 33 + }, + { + "epoch": 0.03419878041113975, + "grad_norm": 167.0654826804254, + "learning_rate": 5.679808957901513e-06, + "loss": 3.0685, + "step": 34 + }, + { + "epoch": 0.03520462689382033, + "grad_norm": 157.79329205975614, + "learning_rate": 5.726498362019691e-06, + "loss": 3.0637, + "step": 35 + }, + { + "epoch": 0.036210473376500914, + "grad_norm": 145.63751069723227, + "learning_rate": 5.771872395170995e-06, + "loss": 3.1047, + "step": 36 + }, + { + "epoch": 0.037216319859181494, + "grad_norm": 138.30989338843392, + "learning_rate": 5.816003146393009e-06, + "loss": 3.0059, + "step": 37 + }, + { + "epoch": 0.038222166341862075, + "grad_norm": 141.08916496720514, + "learning_rate": 5.858956936174693e-06, + "loss": 3.0426, + "step": 38 + }, + { + "epoch": 0.039228012824542656, + "grad_norm": 150.01969317858723, + "learning_rate": 5.900794915964109e-06, + "loss": 3.0102, + "step": 39 + }, + { + "epoch": 0.04023385930722324, + "grad_norm": 145.60366402885353, + "learning_rate": 5.941573591763077e-06, + "loss": 2.9662, + "step": 40 + }, + { + "epoch": 0.04123970578990382, + "grad_norm": 256.030725034778, + "learning_rate": 5.98134528305946e-06, + "loss": 3.015, + "step": 41 + }, + { + "epoch": 0.0422455522725844, + "grad_norm": 148.18303193116637, + "learning_rate": 6.020158526449297e-06, + "loss": 3.0394, + "step": 42 + }, + { + "epoch": 0.04325139875526498, + "grad_norm": 401.9892406324886, + "learning_rate": 6.058058431759984e-06, + "loss": 2.9915, + "step": 43 + }, + { + "epoch": 0.04425724523794556, + "grad_norm": 138.76017347757087, + "learning_rate": 6.095086997228258e-06, + "loss": 2.9636, + "step": 44 + }, + { + "epoch": 0.04526309172062614, + "grad_norm": 151.8736160799114, + "learning_rate": 6.13128338925543e-06, + "loss": 3.0144, + "step": 45 + }, + { + "epoch": 0.04626893820330672, + "grad_norm": 776.4808078486112, + "learning_rate": 6.166684191412428e-06, + "loss": 2.9346, + "step": 46 + }, + { + "epoch": 0.0472747846859873, + "grad_norm": 148.56518730989325, + "learning_rate": 6.201323626663534e-06, + "loss": 2.9401, + "step": 47 + }, + { + "epoch": 0.04828063116866788, + "grad_norm": 140.9261131197792, + "learning_rate": 6.235233756192683e-06, + "loss": 2.9184, + "step": 48 + }, + { + "epoch": 0.04928647765134846, + "grad_norm": 140.86675687418395, + "learning_rate": 6.2684446577275984e-06, + "loss": 2.96, + "step": 49 + }, + { + "epoch": 0.050292324134029044, + "grad_norm": 133.56755452224644, + "learning_rate": 6.300984585847511e-06, + "loss": 2.9206, + "step": 50 + }, + { + "epoch": 0.051298170616709625, + "grad_norm": 132.93056724366744, + "learning_rate": 6.332880116415553e-06, + "loss": 2.8758, + "step": 51 + }, + { + "epoch": 0.052304017099390206, + "grad_norm": 138.26029172258959, + "learning_rate": 6.3641562769857975e-06, + "loss": 2.8796, + "step": 52 + }, + { + "epoch": 0.053309863582070786, + "grad_norm": 123.99660733499722, + "learning_rate": 6.394836664788228e-06, + "loss": 2.9349, + "step": 53 + }, + { + "epoch": 0.05431571006475137, + "grad_norm": 113.95184567154406, + "learning_rate": 6.424943553685035e-06, + "loss": 2.8738, + "step": 54 + }, + { + "epoch": 0.05532155654743195, + "grad_norm": 119.01280095728684, + "learning_rate": 6.454497991312694e-06, + "loss": 2.8592, + "step": 55 + }, + { + "epoch": 0.05632740303011253, + "grad_norm": 106.28111012983979, + "learning_rate": 6.4835198874709856e-06, + "loss": 2.8986, + "step": 56 + }, + { + "epoch": 0.05733324951279311, + "grad_norm": 105.28212563495431, + "learning_rate": 6.512028094688734e-06, + "loss": 2.8513, + "step": 57 + }, + { + "epoch": 0.05833909599547369, + "grad_norm": 102.82251214996867, + "learning_rate": 6.540040481782689e-06, + "loss": 2.8233, + "step": 58 + }, + { + "epoch": 0.05934494247815427, + "grad_norm": 88.93066733289517, + "learning_rate": 6.567574001128382e-06, + "loss": 2.9266, + "step": 59 + }, + { + "epoch": 0.06035078896083485, + "grad_norm": 82.42620087801528, + "learning_rate": 6.594644750277117e-06, + "loss": 2.8909, + "step": 60 + }, + { + "epoch": 0.06135663544351543, + "grad_norm": 74.29787586543553, + "learning_rate": 6.621268028479951e-06, + "loss": 2.8756, + "step": 61 + }, + { + "epoch": 0.06236248192619601, + "grad_norm": 68.17633312180041, + "learning_rate": 6.647458388615557e-06, + "loss": 2.8298, + "step": 62 + }, + { + "epoch": 0.0633683284088766, + "grad_norm": 61.94312969580986, + "learning_rate": 6.673229684963337e-06, + "loss": 2.9027, + "step": 63 + }, + { + "epoch": 0.06437417489155718, + "grad_norm": 54.35466540831732, + "learning_rate": 6.69859511721437e-06, + "loss": 2.8293, + "step": 64 + }, + { + "epoch": 0.06538002137423776, + "grad_norm": 44.173923656121296, + "learning_rate": 6.723567271070232e-06, + "loss": 2.8335, + "step": 65 + }, + { + "epoch": 0.06638586785691834, + "grad_norm": 41.577563236699476, + "learning_rate": 6.748158155742298e-06, + "loss": 2.8218, + "step": 66 + }, + { + "epoch": 0.06739171433959892, + "grad_norm": 37.7162349842147, + "learning_rate": 6.7723792386312174e-06, + "loss": 2.7834, + "step": 67 + }, + { + "epoch": 0.0683975608222795, + "grad_norm": 30.862561284051413, + "learning_rate": 6.796241477437241e-06, + "loss": 2.8077, + "step": 68 + }, + { + "epoch": 0.06940340730496009, + "grad_norm": 27.233229248978024, + "learning_rate": 6.8197553499264694e-06, + "loss": 2.7832, + "step": 69 + }, + { + "epoch": 0.07040925378764067, + "grad_norm": 32.95846432991116, + "learning_rate": 6.8429308815554205e-06, + "loss": 2.8153, + "step": 70 + }, + { + "epoch": 0.07141510027032125, + "grad_norm": 34.002333788304036, + "learning_rate": 6.865777671136202e-06, + "loss": 2.8491, + "step": 71 + }, + { + "epoch": 0.07242094675300183, + "grad_norm": 26.769475736430746, + "learning_rate": 6.888304914706724e-06, + "loss": 2.865, + "step": 72 + }, + { + "epoch": 0.07342679323568241, + "grad_norm": 23.983053033180372, + "learning_rate": 6.910521427754546e-06, + "loss": 2.8012, + "step": 73 + }, + { + "epoch": 0.07443263971836299, + "grad_norm": 24.466547342637735, + "learning_rate": 6.932435665928737e-06, + "loss": 2.7997, + "step": 74 + }, + { + "epoch": 0.07543848620104357, + "grad_norm": 25.705189713492953, + "learning_rate": 6.954055744361552e-06, + "loss": 2.7941, + "step": 75 + }, + { + "epoch": 0.07644433268372415, + "grad_norm": 21.916294810974563, + "learning_rate": 6.975389455710421e-06, + "loss": 2.8076, + "step": 76 + }, + { + "epoch": 0.07745017916640473, + "grad_norm": 24.763966899310763, + "learning_rate": 6.9964442870206006e-06, + "loss": 2.7578, + "step": 77 + }, + { + "epoch": 0.07845602564908531, + "grad_norm": 22.805095513442417, + "learning_rate": 7.017227435499838e-06, + "loss": 2.72, + "step": 78 + }, + { + "epoch": 0.07946187213176589, + "grad_norm": 23.044520510111933, + "learning_rate": 7.037745823288182e-06, + "loss": 2.7863, + "step": 79 + }, + { + "epoch": 0.08046771861444647, + "grad_norm": 26.56601715759224, + "learning_rate": 7.058006111298805e-06, + "loss": 2.7645, + "step": 80 + }, + { + "epoch": 0.08147356509712705, + "grad_norm": 23.353217623203157, + "learning_rate": 7.078014712199077e-06, + "loss": 2.8215, + "step": 81 + }, + { + "epoch": 0.08247941157980763, + "grad_norm": 26.391603597567094, + "learning_rate": 7.097777802595188e-06, + "loss": 2.7263, + "step": 82 + }, + { + "epoch": 0.08348525806248822, + "grad_norm": 23.916408410799217, + "learning_rate": 7.117301334478265e-06, + "loss": 2.7403, + "step": 83 + }, + { + "epoch": 0.0844911045451688, + "grad_norm": 20.688015864866205, + "learning_rate": 7.136591045985025e-06, + "loss": 2.7666, + "step": 84 + }, + { + "epoch": 0.08549695102784938, + "grad_norm": 24.511263727618857, + "learning_rate": 7.155652471521676e-06, + "loss": 2.6841, + "step": 85 + }, + { + "epoch": 0.08650279751052996, + "grad_norm": 17.920694446601882, + "learning_rate": 7.174490951295713e-06, + "loss": 2.7248, + "step": 86 + }, + { + "epoch": 0.08750864399321054, + "grad_norm": 18.754958778905763, + "learning_rate": 7.19311164029673e-06, + "loss": 2.7673, + "step": 87 + }, + { + "epoch": 0.08851449047589112, + "grad_norm": 22.81015763762571, + "learning_rate": 7.211519516763988e-06, + "loss": 2.7238, + "step": 88 + }, + { + "epoch": 0.0895203369585717, + "grad_norm": 21.31760582506125, + "learning_rate": 7.229719390175563e-06, + "loss": 2.7277, + "step": 89 + }, + { + "epoch": 0.09052618344125228, + "grad_norm": 19.51350169549616, + "learning_rate": 7.247715908791158e-06, + "loss": 2.7123, + "step": 90 + }, + { + "epoch": 0.09153202992393286, + "grad_norm": 21.820529035616033, + "learning_rate": 7.26551356677814e-06, + "loss": 2.7569, + "step": 91 + }, + { + "epoch": 0.09253787640661344, + "grad_norm": 20.728510611672373, + "learning_rate": 7.283116710948156e-06, + "loss": 2.7776, + "step": 92 + }, + { + "epoch": 0.09354372288929402, + "grad_norm": 21.275335070097242, + "learning_rate": 7.300529547129599e-06, + "loss": 2.6706, + "step": 93 + }, + { + "epoch": 0.0945495693719746, + "grad_norm": 21.20201924741282, + "learning_rate": 7.3177561461992615e-06, + "loss": 2.8248, + "step": 94 + }, + { + "epoch": 0.09555541585465518, + "grad_norm": 19.354596760258314, + "learning_rate": 7.334800449794856e-06, + "loss": 2.657, + "step": 95 + }, + { + "epoch": 0.09656126233733577, + "grad_norm": 26.504627668771203, + "learning_rate": 7.351666275728411e-06, + "loss": 2.7424, + "step": 96 + }, + { + "epoch": 0.09756710882001635, + "grad_norm": 24.920434718375468, + "learning_rate": 7.368357323119185e-06, + "loss": 2.7804, + "step": 97 + }, + { + "epoch": 0.09857295530269693, + "grad_norm": 24.734865665675084, + "learning_rate": 7.384877177263328e-06, + "loss": 2.6857, + "step": 98 + }, + { + "epoch": 0.09957880178537751, + "grad_norm": 20.65028932666736, + "learning_rate": 7.40122931425634e-06, + "loss": 2.6713, + "step": 99 + }, + { + "epoch": 0.10058464826805809, + "grad_norm": 25.244491519043113, + "learning_rate": 7.417417105383241e-06, + "loss": 2.7438, + "step": 100 + }, + { + "epoch": 0.10159049475073867, + "grad_norm": 20.297890638350957, + "learning_rate": 7.433443821290305e-06, + "loss": 2.6464, + "step": 101 + }, + { + "epoch": 0.10259634123341925, + "grad_norm": 25.51714016339381, + "learning_rate": 7.44931263595128e-06, + "loss": 2.7024, + "step": 102 + }, + { + "epoch": 0.10360218771609983, + "grad_norm": 24.087279291502345, + "learning_rate": 7.465026630440138e-06, + "loss": 2.6897, + "step": 103 + }, + { + "epoch": 0.10460803419878041, + "grad_norm": 31.639294543052827, + "learning_rate": 7.480588796521525e-06, + "loss": 2.6928, + "step": 104 + }, + { + "epoch": 0.10561388068146099, + "grad_norm": 21.50022167620154, + "learning_rate": 7.49600204006946e-06, + "loss": 2.6821, + "step": 105 + }, + { + "epoch": 0.10661972716414157, + "grad_norm": 27.095230272732117, + "learning_rate": 7.511269184323955e-06, + "loss": 2.6596, + "step": 106 + }, + { + "epoch": 0.10762557364682215, + "grad_norm": 24.733709994175797, + "learning_rate": 7.526392972994766e-06, + "loss": 2.7506, + "step": 107 + }, + { + "epoch": 0.10863142012950273, + "grad_norm": 25.901406399951554, + "learning_rate": 7.541376073220765e-06, + "loss": 2.7288, + "step": 108 + }, + { + "epoch": 0.10963726661218332, + "grad_norm": 22.812181794557098, + "learning_rate": 7.556221078392927e-06, + "loss": 2.7115, + "step": 109 + }, + { + "epoch": 0.1106431130948639, + "grad_norm": 22.08834703071957, + "learning_rate": 7.570930510848422e-06, + "loss": 2.7194, + "step": 110 + }, + { + "epoch": 0.11164895957754448, + "grad_norm": 19.83989319695105, + "learning_rate": 7.585506824442778e-06, + "loss": 2.7086, + "step": 111 + }, + { + "epoch": 0.11265480606022506, + "grad_norm": 21.193284488575998, + "learning_rate": 7.599952407006712e-06, + "loss": 2.7459, + "step": 112 + }, + { + "epoch": 0.11366065254290564, + "grad_norm": 20.482415173444885, + "learning_rate": 7.614269582693758e-06, + "loss": 2.7904, + "step": 113 + }, + { + "epoch": 0.11466649902558622, + "grad_norm": 22.947534201659618, + "learning_rate": 7.628460614224462e-06, + "loss": 2.7392, + "step": 114 + }, + { + "epoch": 0.1156723455082668, + "grad_norm": 23.010915124299302, + "learning_rate": 7.64252770503259e-06, + "loss": 2.6972, + "step": 115 + }, + { + "epoch": 0.11667819199094738, + "grad_norm": 19.538353567555912, + "learning_rate": 7.656473001318417e-06, + "loss": 2.7021, + "step": 116 + }, + { + "epoch": 0.11768403847362796, + "grad_norm": 24.36272861265073, + "learning_rate": 7.670298594013878e-06, + "loss": 2.6831, + "step": 117 + }, + { + "epoch": 0.11868988495630854, + "grad_norm": 21.199891028702496, + "learning_rate": 7.68400652066411e-06, + "loss": 2.7558, + "step": 118 + }, + { + "epoch": 0.11969573143898912, + "grad_norm": 19.670104942454504, + "learning_rate": 7.697598767229584e-06, + "loss": 2.672, + "step": 119 + }, + { + "epoch": 0.1207015779216697, + "grad_norm": 24.51878373945769, + "learning_rate": 7.711077269812845e-06, + "loss": 2.6764, + "step": 120 + }, + { + "epoch": 0.12170742440435028, + "grad_norm": 21.921013999351562, + "learning_rate": 7.724443916313603e-06, + "loss": 2.6924, + "step": 121 + }, + { + "epoch": 0.12271327088703086, + "grad_norm": 19.16037093255755, + "learning_rate": 7.737700548015679e-06, + "loss": 2.7193, + "step": 122 + }, + { + "epoch": 0.12371911736971145, + "grad_norm": 18.980265808721434, + "learning_rate": 7.750848961109229e-06, + "loss": 2.7391, + "step": 123 + }, + { + "epoch": 0.12472496385239203, + "grad_norm": 18.756909530996253, + "learning_rate": 7.763890908151285e-06, + "loss": 2.7491, + "step": 124 + }, + { + "epoch": 0.1257308103350726, + "grad_norm": 18.846050730646436, + "learning_rate": 7.776828099467677e-06, + "loss": 2.5827, + "step": 125 + }, + { + "epoch": 0.1267366568177532, + "grad_norm": 19.381867511680255, + "learning_rate": 7.789662204499067e-06, + "loss": 2.6621, + "step": 126 + }, + { + "epoch": 0.12774250330043377, + "grad_norm": 18.915258610203843, + "learning_rate": 7.802394853093812e-06, + "loss": 2.6954, + "step": 127 + }, + { + "epoch": 0.12874834978311436, + "grad_norm": 17.902322301001714, + "learning_rate": 7.815027636750099e-06, + "loss": 2.7114, + "step": 128 + }, + { + "epoch": 0.12975419626579493, + "grad_norm": 19.86048307331956, + "learning_rate": 7.827562109809753e-06, + "loss": 2.7168, + "step": 129 + }, + { + "epoch": 0.13076004274847552, + "grad_norm": 16.13146552524454, + "learning_rate": 7.83999979060596e-06, + "loss": 2.673, + "step": 130 + }, + { + "epoch": 0.1317658892311561, + "grad_norm": 16.823542205733474, + "learning_rate": 7.852342162567001e-06, + "loss": 2.6572, + "step": 131 + }, + { + "epoch": 0.13277173571383669, + "grad_norm": 17.565391271847194, + "learning_rate": 7.864590675278027e-06, + "loss": 2.603, + "step": 132 + }, + { + "epoch": 0.13377758219651725, + "grad_norm": 19.184953207489116, + "learning_rate": 7.876746745502763e-06, + "loss": 2.667, + "step": 133 + }, + { + "epoch": 0.13478342867919785, + "grad_norm": 18.7606191056195, + "learning_rate": 7.888811758166946e-06, + "loss": 2.6638, + "step": 134 + }, + { + "epoch": 0.13578927516187841, + "grad_norm": 21.533928798672243, + "learning_rate": 7.9007870673052e-06, + "loss": 2.6812, + "step": 135 + }, + { + "epoch": 0.136795121644559, + "grad_norm": 21.969885173153724, + "learning_rate": 7.912673996972969e-06, + "loss": 2.6645, + "step": 136 + }, + { + "epoch": 0.13780096812723958, + "grad_norm": 23.026383962425232, + "learning_rate": 7.924473842125055e-06, + "loss": 2.6692, + "step": 137 + }, + { + "epoch": 0.13880681460992017, + "grad_norm": 25.308590349511185, + "learning_rate": 7.936187869462198e-06, + "loss": 2.7056, + "step": 138 + }, + { + "epoch": 0.13981266109260074, + "grad_norm": 21.063550016437297, + "learning_rate": 7.947817318247087e-06, + "loss": 2.6697, + "step": 139 + }, + { + "epoch": 0.14081850757528133, + "grad_norm": 24.867208465313563, + "learning_rate": 7.959363401091148e-06, + "loss": 2.6561, + "step": 140 + }, + { + "epoch": 0.1418243540579619, + "grad_norm": 19.72568818674115, + "learning_rate": 7.970827304713302e-06, + "loss": 2.6553, + "step": 141 + }, + { + "epoch": 0.1428302005406425, + "grad_norm": 23.770176802545095, + "learning_rate": 7.98221019067193e-06, + "loss": 2.6591, + "step": 142 + }, + { + "epoch": 0.14383604702332306, + "grad_norm": 20.067524140862375, + "learning_rate": 7.99351319607114e-06, + "loss": 2.6719, + "step": 143 + }, + { + "epoch": 0.14484189350600365, + "grad_norm": 31.768261766580125, + "learning_rate": 8.004737434242453e-06, + "loss": 2.6622, + "step": 144 + }, + { + "epoch": 0.14584773998868422, + "grad_norm": 25.556982955160176, + "learning_rate": 8.015883995402853e-06, + "loss": 2.6526, + "step": 145 + }, + { + "epoch": 0.14685358647136482, + "grad_norm": 23.847177634398342, + "learning_rate": 8.026953947290275e-06, + "loss": 2.6424, + "step": 146 + }, + { + "epoch": 0.14785943295404538, + "grad_norm": 27.152595480642916, + "learning_rate": 8.037948335777368e-06, + "loss": 2.6902, + "step": 147 + }, + { + "epoch": 0.14886527943672598, + "grad_norm": 18.022064635980716, + "learning_rate": 8.048868185464465e-06, + "loss": 2.5969, + "step": 148 + }, + { + "epoch": 0.14987112591940654, + "grad_norm": 27.024734522745767, + "learning_rate": 8.059714500252588e-06, + "loss": 2.6461, + "step": 149 + }, + { + "epoch": 0.15087697240208714, + "grad_norm": 16.732763939745368, + "learning_rate": 8.070488263897281e-06, + "loss": 2.6885, + "step": 150 + }, + { + "epoch": 0.1518828188847677, + "grad_norm": 24.51054903035366, + "learning_rate": 8.081190440544056e-06, + "loss": 2.6653, + "step": 151 + }, + { + "epoch": 0.1528886653674483, + "grad_norm": 21.596701273369327, + "learning_rate": 8.09182197524615e-06, + "loss": 2.6377, + "step": 152 + }, + { + "epoch": 0.15389451185012887, + "grad_norm": 25.587667640660342, + "learning_rate": 8.102383794465321e-06, + "loss": 2.6287, + "step": 153 + }, + { + "epoch": 0.15490035833280946, + "grad_norm": 28.92637297373861, + "learning_rate": 8.112876806556328e-06, + "loss": 2.6187, + "step": 154 + }, + { + "epoch": 0.15590620481549003, + "grad_norm": 20.936164888279848, + "learning_rate": 8.123301902235721e-06, + "loss": 2.634, + "step": 155 + }, + { + "epoch": 0.15691205129817062, + "grad_norm": 23.557901999266143, + "learning_rate": 8.133659955035566e-06, + "loss": 2.587, + "step": 156 + }, + { + "epoch": 0.1579178977808512, + "grad_norm": 21.159821995258497, + "learning_rate": 8.143951821742681e-06, + "loss": 2.6368, + "step": 157 + }, + { + "epoch": 0.15892374426353179, + "grad_norm": 24.642078265141432, + "learning_rate": 8.154178342823911e-06, + "loss": 2.6336, + "step": 158 + }, + { + "epoch": 0.15992959074621235, + "grad_norm": 21.423623973524016, + "learning_rate": 8.164340342837997e-06, + "loss": 2.6378, + "step": 159 + }, + { + "epoch": 0.16093543722889295, + "grad_norm": 20.788847198646504, + "learning_rate": 8.174438630834533e-06, + "loss": 2.5842, + "step": 160 + }, + { + "epoch": 0.1619412837115735, + "grad_norm": 20.628954050610897, + "learning_rate": 8.184474000740498e-06, + "loss": 2.6697, + "step": 161 + }, + { + "epoch": 0.1629471301942541, + "grad_norm": 22.39808575634474, + "learning_rate": 8.194447231734805e-06, + "loss": 2.6348, + "step": 162 + }, + { + "epoch": 0.16395297667693468, + "grad_norm": 24.746404372242566, + "learning_rate": 8.204359088611344e-06, + "loss": 2.6288, + "step": 163 + }, + { + "epoch": 0.16495882315961527, + "grad_norm": 23.61401081832116, + "learning_rate": 8.214210322130917e-06, + "loss": 2.6606, + "step": 164 + }, + { + "epoch": 0.16596466964229584, + "grad_norm": 22.15134362505235, + "learning_rate": 8.224001669362461e-06, + "loss": 2.6394, + "step": 165 + }, + { + "epoch": 0.16697051612497643, + "grad_norm": 24.75566935763776, + "learning_rate": 8.233733854013994e-06, + "loss": 2.6418, + "step": 166 + }, + { + "epoch": 0.167976362607657, + "grad_norm": 20.403196739056543, + "learning_rate": 8.24340758675358e-06, + "loss": 2.6979, + "step": 167 + }, + { + "epoch": 0.1689822090903376, + "grad_norm": 21.928902144665866, + "learning_rate": 8.253023565520753e-06, + "loss": 2.6535, + "step": 168 + }, + { + "epoch": 0.16998805557301816, + "grad_norm": 21.038785162406096, + "learning_rate": 8.262582475828682e-06, + "loss": 2.6418, + "step": 169 + }, + { + "epoch": 0.17099390205569875, + "grad_norm": 15.673934043236647, + "learning_rate": 8.272084991057405e-06, + "loss": 2.6811, + "step": 170 + }, + { + "epoch": 0.17199974853837932, + "grad_norm": 22.43245971157093, + "learning_rate": 8.281531772738503e-06, + "loss": 2.6596, + "step": 171 + }, + { + "epoch": 0.17300559502105992, + "grad_norm": 18.71520704132205, + "learning_rate": 8.290923470831441e-06, + "loss": 2.6451, + "step": 172 + }, + { + "epoch": 0.17401144150374048, + "grad_norm": 20.616928637039162, + "learning_rate": 8.300260723991916e-06, + "loss": 2.6314, + "step": 173 + }, + { + "epoch": 0.17501728798642108, + "grad_norm": 18.91356082247342, + "learning_rate": 8.309544159832458e-06, + "loss": 2.6167, + "step": 174 + }, + { + "epoch": 0.17602313446910164, + "grad_norm": 18.162533499067138, + "learning_rate": 8.318774395175584e-06, + "loss": 2.6243, + "step": 175 + }, + { + "epoch": 0.17702898095178224, + "grad_norm": 20.50907483416954, + "learning_rate": 8.327952036299713e-06, + "loss": 2.5896, + "step": 176 + }, + { + "epoch": 0.1780348274344628, + "grad_norm": 19.102928768874477, + "learning_rate": 8.33707767917815e-06, + "loss": 2.6691, + "step": 177 + }, + { + "epoch": 0.1790406739171434, + "grad_norm": 17.220346981660366, + "learning_rate": 8.34615190971129e-06, + "loss": 2.5311, + "step": 178 + }, + { + "epoch": 0.18004652039982397, + "grad_norm": 18.526421388087602, + "learning_rate": 8.35517530395237e-06, + "loss": 2.6687, + "step": 179 + }, + { + "epoch": 0.18105236688250456, + "grad_norm": 18.68714814663516, + "learning_rate": 8.364148428326887e-06, + "loss": 2.6537, + "step": 180 + }, + { + "epoch": 0.18205821336518513, + "grad_norm": 19.07870561029967, + "learning_rate": 8.373071839845973e-06, + "loss": 2.5946, + "step": 181 + }, + { + "epoch": 0.18306405984786572, + "grad_norm": 16.902568714349183, + "learning_rate": 8.381946086313868e-06, + "loss": 2.6438, + "step": 182 + }, + { + "epoch": 0.1840699063305463, + "grad_norm": 19.86628523072272, + "learning_rate": 8.39077170652972e-06, + "loss": 2.6064, + "step": 183 + }, + { + "epoch": 0.18507575281322688, + "grad_norm": 17.50543397408068, + "learning_rate": 8.399549230483884e-06, + "loss": 2.6389, + "step": 184 + }, + { + "epoch": 0.18608159929590745, + "grad_norm": 21.975997332922287, + "learning_rate": 8.408279179548899e-06, + "loss": 2.6116, + "step": 185 + }, + { + "epoch": 0.18708744577858805, + "grad_norm": 20.239413306373145, + "learning_rate": 8.416962066665327e-06, + "loss": 2.6954, + "step": 186 + }, + { + "epoch": 0.1880932922612686, + "grad_norm": 21.38873332374349, + "learning_rate": 8.425598396522585e-06, + "loss": 2.6121, + "step": 187 + }, + { + "epoch": 0.1890991387439492, + "grad_norm": 21.590789079297604, + "learning_rate": 8.43418866573499e-06, + "loss": 2.6493, + "step": 188 + }, + { + "epoch": 0.19010498522662977, + "grad_norm": 20.836744203370017, + "learning_rate": 8.442733363013107e-06, + "loss": 2.627, + "step": 189 + }, + { + "epoch": 0.19111083170931037, + "grad_norm": 22.441001841694202, + "learning_rate": 8.451232969330584e-06, + "loss": 2.6559, + "step": 190 + }, + { + "epoch": 0.19211667819199094, + "grad_norm": 23.400210842885595, + "learning_rate": 8.459687958086613e-06, + "loss": 2.6221, + "step": 191 + }, + { + "epoch": 0.19312252467467153, + "grad_norm": 17.126955017293337, + "learning_rate": 8.468098795264139e-06, + "loss": 2.6686, + "step": 192 + }, + { + "epoch": 0.1941283711573521, + "grad_norm": 22.286487375080497, + "learning_rate": 8.476465939583975e-06, + "loss": 2.5953, + "step": 193 + }, + { + "epoch": 0.1951342176400327, + "grad_norm": 22.623393456148232, + "learning_rate": 8.484789842654914e-06, + "loss": 2.645, + "step": 194 + }, + { + "epoch": 0.19614006412271326, + "grad_norm": 20.01494469131131, + "learning_rate": 8.493070949120002e-06, + "loss": 2.6935, + "step": 195 + }, + { + "epoch": 0.19714591060539385, + "grad_norm": 20.652229445798564, + "learning_rate": 8.501309696799054e-06, + "loss": 2.5595, + "step": 196 + }, + { + "epoch": 0.19815175708807442, + "grad_norm": 19.356436859627642, + "learning_rate": 8.509506516827565e-06, + "loss": 2.6037, + "step": 197 + }, + { + "epoch": 0.19915760357075502, + "grad_norm": 19.375388210507882, + "learning_rate": 8.517661833792069e-06, + "loss": 2.6151, + "step": 198 + }, + { + "epoch": 0.20016345005343558, + "grad_norm": 19.58164413054047, + "learning_rate": 8.52577606586212e-06, + "loss": 2.646, + "step": 199 + }, + { + "epoch": 0.20116929653611618, + "grad_norm": 19.53007585706627, + "learning_rate": 8.533849624918969e-06, + "loss": 2.6153, + "step": 200 + }, + { + "epoch": 0.20217514301879674, + "grad_norm": 18.412307623077677, + "learning_rate": 8.541882916680986e-06, + "loss": 2.6147, + "step": 201 + }, + { + "epoch": 0.20318098950147734, + "grad_norm": 20.96726260777965, + "learning_rate": 8.549876340826033e-06, + "loss": 2.6719, + "step": 202 + }, + { + "epoch": 0.2041868359841579, + "grad_norm": 15.753376282394372, + "learning_rate": 8.55783029111076e-06, + "loss": 2.5969, + "step": 203 + }, + { + "epoch": 0.2051926824668385, + "grad_norm": 19.19212639218277, + "learning_rate": 8.565745155487009e-06, + "loss": 2.6292, + "step": 204 + }, + { + "epoch": 0.20619852894951907, + "grad_norm": 17.130226605076096, + "learning_rate": 8.573621316215352e-06, + "loss": 2.5878, + "step": 205 + }, + { + "epoch": 0.20720437543219966, + "grad_norm": 20.09366506036711, + "learning_rate": 8.581459149975866e-06, + "loss": 2.628, + "step": 206 + }, + { + "epoch": 0.20821022191488023, + "grad_norm": 18.051831505165058, + "learning_rate": 8.589259027976237e-06, + "loss": 2.6775, + "step": 207 + }, + { + "epoch": 0.20921606839756082, + "grad_norm": 20.88839122977681, + "learning_rate": 8.597021316057254e-06, + "loss": 2.5629, + "step": 208 + }, + { + "epoch": 0.2102219148802414, + "grad_norm": 20.052445051755896, + "learning_rate": 8.604746374795765e-06, + "loss": 2.6581, + "step": 209 + }, + { + "epoch": 0.21122776136292198, + "grad_norm": 20.308993784102455, + "learning_rate": 8.612434559605189e-06, + "loss": 2.6295, + "step": 210 + }, + { + "epoch": 0.21223360784560258, + "grad_norm": 18.750434945903855, + "learning_rate": 8.620086220833631e-06, + "loss": 2.6288, + "step": 211 + }, + { + "epoch": 0.21323945432828315, + "grad_norm": 17.71083780989216, + "learning_rate": 8.627701703859685e-06, + "loss": 2.6408, + "step": 212 + }, + { + "epoch": 0.21424530081096374, + "grad_norm": 18.24620720661975, + "learning_rate": 8.63528134918597e-06, + "loss": 2.6134, + "step": 213 + }, + { + "epoch": 0.2152511472936443, + "grad_norm": 16.69848128026314, + "learning_rate": 8.642825492530494e-06, + "loss": 2.6553, + "step": 214 + }, + { + "epoch": 0.2162569937763249, + "grad_norm": 17.731749852066827, + "learning_rate": 8.650334464915875e-06, + "loss": 2.6056, + "step": 215 + }, + { + "epoch": 0.21726284025900547, + "grad_norm": 15.675950483577644, + "learning_rate": 8.657808592756493e-06, + "loss": 2.5621, + "step": 216 + }, + { + "epoch": 0.21826868674168606, + "grad_norm": 19.14882202694866, + "learning_rate": 8.665248197943628e-06, + "loss": 2.6483, + "step": 217 + }, + { + "epoch": 0.21927453322436663, + "grad_norm": 18.258290501797685, + "learning_rate": 8.672653597928656e-06, + "loss": 2.6001, + "step": 218 + }, + { + "epoch": 0.22028037970704722, + "grad_norm": 17.825922963762647, + "learning_rate": 8.680025105804317e-06, + "loss": 2.6576, + "step": 219 + }, + { + "epoch": 0.2212862261897278, + "grad_norm": 18.489862697446604, + "learning_rate": 8.68736303038415e-06, + "loss": 2.6701, + "step": 220 + }, + { + "epoch": 0.2222920726724084, + "grad_norm": 20.623353777687228, + "learning_rate": 8.694667676280122e-06, + "loss": 2.66, + "step": 221 + }, + { + "epoch": 0.22329791915508895, + "grad_norm": 20.140706964863128, + "learning_rate": 8.701939343978507e-06, + "loss": 2.6114, + "step": 222 + }, + { + "epoch": 0.22430376563776955, + "grad_norm": 21.084508483111602, + "learning_rate": 8.709178329914038e-06, + "loss": 2.6374, + "step": 223 + }, + { + "epoch": 0.22530961212045011, + "grad_norm": 21.044763301093216, + "learning_rate": 8.716384926542442e-06, + "loss": 2.5886, + "step": 224 + }, + { + "epoch": 0.2263154586031307, + "grad_norm": 18.074736561415794, + "learning_rate": 8.723559422411321e-06, + "loss": 2.6019, + "step": 225 + }, + { + "epoch": 0.22732130508581128, + "grad_norm": 18.34158694872784, + "learning_rate": 8.730702102229487e-06, + "loss": 2.6103, + "step": 226 + }, + { + "epoch": 0.22832715156849187, + "grad_norm": 20.820577289047332, + "learning_rate": 8.737813246934741e-06, + "loss": 2.6454, + "step": 227 + }, + { + "epoch": 0.22933299805117244, + "grad_norm": 21.668929247510405, + "learning_rate": 8.744893133760191e-06, + "loss": 2.6261, + "step": 228 + }, + { + "epoch": 0.23033884453385303, + "grad_norm": 19.292963044451643, + "learning_rate": 8.751942036299099e-06, + "loss": 2.5645, + "step": 229 + }, + { + "epoch": 0.2313446910165336, + "grad_norm": 21.315973049999645, + "learning_rate": 8.758960224568318e-06, + "loss": 2.5653, + "step": 230 + }, + { + "epoch": 0.2323505374992142, + "grad_norm": 18.390322171060433, + "learning_rate": 8.765947965070369e-06, + "loss": 2.6106, + "step": 231 + }, + { + "epoch": 0.23335638398189476, + "grad_norm": 18.76067528787121, + "learning_rate": 8.772905520854146e-06, + "loss": 2.6171, + "step": 232 + }, + { + "epoch": 0.23436223046457536, + "grad_norm": 24.814248745806943, + "learning_rate": 8.779833151574344e-06, + "loss": 2.6466, + "step": 233 + }, + { + "epoch": 0.23536807694725592, + "grad_norm": 22.919276078065515, + "learning_rate": 8.786731113549606e-06, + "loss": 2.6033, + "step": 234 + }, + { + "epoch": 0.23637392342993652, + "grad_norm": 18.520466660426873, + "learning_rate": 8.793599659819425e-06, + "loss": 2.606, + "step": 235 + }, + { + "epoch": 0.23737976991261708, + "grad_norm": 22.348067724347846, + "learning_rate": 8.800439040199838e-06, + "loss": 2.5912, + "step": 236 + }, + { + "epoch": 0.23838561639529768, + "grad_norm": 19.173263256977286, + "learning_rate": 8.807249501337953e-06, + "loss": 2.6346, + "step": 237 + }, + { + "epoch": 0.23939146287797824, + "grad_norm": 23.383354298666408, + "learning_rate": 8.814031286765312e-06, + "loss": 2.6386, + "step": 238 + }, + { + "epoch": 0.24039730936065884, + "grad_norm": 25.729808734175805, + "learning_rate": 8.820784636950157e-06, + "loss": 2.625, + "step": 239 + }, + { + "epoch": 0.2414031558433394, + "grad_norm": 18.210794583054227, + "learning_rate": 8.827509789348575e-06, + "loss": 2.5976, + "step": 240 + }, + { + "epoch": 0.24240900232602, + "grad_norm": 17.672902674151604, + "learning_rate": 8.834206978454614e-06, + "loss": 2.627, + "step": 241 + }, + { + "epoch": 0.24341484880870057, + "grad_norm": 17.827338301226895, + "learning_rate": 8.84087643584933e-06, + "loss": 2.588, + "step": 242 + }, + { + "epoch": 0.24442069529138116, + "grad_norm": 17.06456531549323, + "learning_rate": 8.847518390248845e-06, + "loss": 2.6316, + "step": 243 + }, + { + "epoch": 0.24542654177406173, + "grad_norm": 20.713440314623234, + "learning_rate": 8.854133067551409e-06, + "loss": 2.5697, + "step": 244 + }, + { + "epoch": 0.24643238825674232, + "grad_norm": 19.6435903200954, + "learning_rate": 8.860720690883492e-06, + "loss": 2.5903, + "step": 245 + }, + { + "epoch": 0.2474382347394229, + "grad_norm": 18.399526685396268, + "learning_rate": 8.867281480644957e-06, + "loss": 2.6169, + "step": 246 + }, + { + "epoch": 0.24844408122210349, + "grad_norm": 19.663511314871595, + "learning_rate": 8.873815654553305e-06, + "loss": 2.5982, + "step": 247 + }, + { + "epoch": 0.24944992770478405, + "grad_norm": 22.569466086155717, + "learning_rate": 8.880323427687015e-06, + "loss": 2.5776, + "step": 248 + }, + { + "epoch": 0.2504557741874646, + "grad_norm": 22.250217638563107, + "learning_rate": 8.886805012528034e-06, + "loss": 2.612, + "step": 249 + }, + { + "epoch": 0.2514616206701452, + "grad_norm": 20.751848059953005, + "learning_rate": 8.893260619003403e-06, + "loss": 2.5945, + "step": 250 + }, + { + "epoch": 0.2524674671528258, + "grad_norm": 21.836021029599017, + "learning_rate": 8.899690454526055e-06, + "loss": 2.6152, + "step": 251 + }, + { + "epoch": 0.2534733136355064, + "grad_norm": 17.93363953654319, + "learning_rate": 8.906094724034795e-06, + "loss": 2.5886, + "step": 252 + }, + { + "epoch": 0.25447916011818694, + "grad_norm": 20.885718013721352, + "learning_rate": 8.9124736300335e-06, + "loss": 2.5972, + "step": 253 + }, + { + "epoch": 0.25548500660086754, + "grad_norm": 21.796601989759413, + "learning_rate": 8.91882737262954e-06, + "loss": 2.5547, + "step": 254 + }, + { + "epoch": 0.25649085308354813, + "grad_norm": 17.58817706954125, + "learning_rate": 8.925156149571445e-06, + "loss": 2.5832, + "step": 255 + }, + { + "epoch": 0.2574966995662287, + "grad_norm": 21.59816615398544, + "learning_rate": 8.931460156285827e-06, + "loss": 2.5833, + "step": 256 + }, + { + "epoch": 0.25850254604890927, + "grad_norm": 18.077149818844823, + "learning_rate": 8.937739585913602e-06, + "loss": 2.5771, + "step": 257 + }, + { + "epoch": 0.25950839253158986, + "grad_norm": 20.454974748323327, + "learning_rate": 8.943994629345481e-06, + "loss": 2.6646, + "step": 258 + }, + { + "epoch": 0.26051423901427045, + "grad_norm": 19.680650671359444, + "learning_rate": 8.950225475256808e-06, + "loss": 2.6518, + "step": 259 + }, + { + "epoch": 0.26152008549695105, + "grad_norm": 17.07370495402799, + "learning_rate": 8.956432310141688e-06, + "loss": 2.6054, + "step": 260 + }, + { + "epoch": 0.2625259319796316, + "grad_norm": 20.44556881953542, + "learning_rate": 8.962615318346499e-06, + "loss": 2.487, + "step": 261 + }, + { + "epoch": 0.2635317784623122, + "grad_norm": 17.12917889025369, + "learning_rate": 8.968774682102729e-06, + "loss": 2.6147, + "step": 262 + }, + { + "epoch": 0.2645376249449928, + "grad_norm": 19.621476667862606, + "learning_rate": 8.974910581559217e-06, + "loss": 2.5783, + "step": 263 + }, + { + "epoch": 0.26554347142767337, + "grad_norm": 16.290476284537352, + "learning_rate": 8.981023194813755e-06, + "loss": 2.666, + "step": 264 + }, + { + "epoch": 0.2665493179103539, + "grad_norm": 21.622288460717243, + "learning_rate": 8.987112697944119e-06, + "loss": 2.6843, + "step": 265 + }, + { + "epoch": 0.2675551643930345, + "grad_norm": 17.770496455788983, + "learning_rate": 8.993179265038493e-06, + "loss": 2.6217, + "step": 266 + }, + { + "epoch": 0.2685610108757151, + "grad_norm": 18.814542670464103, + "learning_rate": 8.999223068225332e-06, + "loss": 2.5714, + "step": 267 + }, + { + "epoch": 0.2695668573583957, + "grad_norm": 18.786521072518283, + "learning_rate": 9.005244277702674e-06, + "loss": 2.5553, + "step": 268 + }, + { + "epoch": 0.27057270384107623, + "grad_norm": 21.070488274527378, + "learning_rate": 9.011243061766895e-06, + "loss": 2.6352, + "step": 269 + }, + { + "epoch": 0.27157855032375683, + "grad_norm": 19.816693607724073, + "learning_rate": 9.017219586840929e-06, + "loss": 2.6066, + "step": 270 + }, + { + "epoch": 0.2725843968064374, + "grad_norm": 18.96107953875644, + "learning_rate": 9.023174017501975e-06, + "loss": 2.559, + "step": 271 + }, + { + "epoch": 0.273590243289118, + "grad_norm": 26.54474932655252, + "learning_rate": 9.029106516508698e-06, + "loss": 2.5407, + "step": 272 + }, + { + "epoch": 0.27459608977179856, + "grad_norm": 18.8523610186751, + "learning_rate": 9.03501724482791e-06, + "loss": 2.5851, + "step": 273 + }, + { + "epoch": 0.27560193625447915, + "grad_norm": 19.597045879097127, + "learning_rate": 9.040906361660785e-06, + "loss": 2.5512, + "step": 274 + }, + { + "epoch": 0.27660778273715975, + "grad_norm": 23.74080554612753, + "learning_rate": 9.046774024468585e-06, + "loss": 2.6118, + "step": 275 + }, + { + "epoch": 0.27761362921984034, + "grad_norm": 22.714669056716087, + "learning_rate": 9.052620388997924e-06, + "loss": 2.5784, + "step": 276 + }, + { + "epoch": 0.2786194757025209, + "grad_norm": 23.248188625995795, + "learning_rate": 9.058445609305574e-06, + "loss": 2.6503, + "step": 277 + }, + { + "epoch": 0.2796253221852015, + "grad_norm": 22.625705056604087, + "learning_rate": 9.064249837782815e-06, + "loss": 2.5583, + "step": 278 + }, + { + "epoch": 0.28063116866788207, + "grad_norm": 24.60212715825136, + "learning_rate": 9.070033225179367e-06, + "loss": 2.6041, + "step": 279 + }, + { + "epoch": 0.28163701515056266, + "grad_norm": 22.07266263058576, + "learning_rate": 9.075795920626876e-06, + "loss": 2.5903, + "step": 280 + }, + { + "epoch": 0.2826428616332432, + "grad_norm": 21.88899389143494, + "learning_rate": 9.081538071661991e-06, + "loss": 2.5693, + "step": 281 + }, + { + "epoch": 0.2836487081159238, + "grad_norm": 20.285327057873072, + "learning_rate": 9.087259824249031e-06, + "loss": 2.5804, + "step": 282 + }, + { + "epoch": 0.2846545545986044, + "grad_norm": 17.86840355259839, + "learning_rate": 9.092961322802238e-06, + "loss": 2.5799, + "step": 283 + }, + { + "epoch": 0.285660401081285, + "grad_norm": 16.179377763205867, + "learning_rate": 9.098642710207657e-06, + "loss": 2.6577, + "step": 284 + }, + { + "epoch": 0.2866662475639655, + "grad_norm": 18.881043699779436, + "learning_rate": 9.104304127844625e-06, + "loss": 2.5943, + "step": 285 + }, + { + "epoch": 0.2876720940466461, + "grad_norm": 19.32116500725666, + "learning_rate": 9.10994571560687e-06, + "loss": 2.5656, + "step": 286 + }, + { + "epoch": 0.2886779405293267, + "grad_norm": 17.056212076430626, + "learning_rate": 9.11556761192326e-06, + "loss": 2.5439, + "step": 287 + }, + { + "epoch": 0.2896837870120073, + "grad_norm": 17.50398402272584, + "learning_rate": 9.12116995377818e-06, + "loss": 2.5772, + "step": 288 + }, + { + "epoch": 0.29068963349468785, + "grad_norm": 19.554296082869655, + "learning_rate": 9.126752876731566e-06, + "loss": 2.5929, + "step": 289 + }, + { + "epoch": 0.29169547997736844, + "grad_norm": 21.68217435347834, + "learning_rate": 9.13231651493858e-06, + "loss": 2.5923, + "step": 290 + }, + { + "epoch": 0.29270132646004904, + "grad_norm": 20.905430617187555, + "learning_rate": 9.137861001168956e-06, + "loss": 2.5802, + "step": 291 + }, + { + "epoch": 0.29370717294272963, + "grad_norm": 20.198543571392612, + "learning_rate": 9.143386466826003e-06, + "loss": 2.5822, + "step": 292 + }, + { + "epoch": 0.29471301942541017, + "grad_norm": 23.77871648988695, + "learning_rate": 9.148893041965311e-06, + "loss": 2.608, + "step": 293 + }, + { + "epoch": 0.29571886590809077, + "grad_norm": 20.48204598393149, + "learning_rate": 9.154380855313096e-06, + "loss": 2.586, + "step": 294 + }, + { + "epoch": 0.29672471239077136, + "grad_norm": 18.098438212679774, + "learning_rate": 9.159850034284274e-06, + "loss": 2.6228, + "step": 295 + }, + { + "epoch": 0.29773055887345196, + "grad_norm": 18.784691263545824, + "learning_rate": 9.165300705000193e-06, + "loss": 2.5802, + "step": 296 + }, + { + "epoch": 0.2987364053561325, + "grad_norm": 18.5638121249419, + "learning_rate": 9.170732992306109e-06, + "loss": 2.5466, + "step": 297 + }, + { + "epoch": 0.2997422518388131, + "grad_norm": 18.754954662020456, + "learning_rate": 9.176147019788316e-06, + "loss": 2.6204, + "step": 298 + }, + { + "epoch": 0.3007480983214937, + "grad_norm": 19.634341646921474, + "learning_rate": 9.18154290979104e-06, + "loss": 2.569, + "step": 299 + }, + { + "epoch": 0.3017539448041743, + "grad_norm": 16.960368671694702, + "learning_rate": 9.18692078343301e-06, + "loss": 2.5591, + "step": 300 + }, + { + "epoch": 0.3027597912868548, + "grad_norm": 17.789077674900664, + "learning_rate": 9.192280760623784e-06, + "loss": 2.6259, + "step": 301 + }, + { + "epoch": 0.3037656377695354, + "grad_norm": 17.01187285822207, + "learning_rate": 9.197622960079784e-06, + "loss": 2.6004, + "step": 302 + }, + { + "epoch": 0.304771484252216, + "grad_norm": 20.164202493945783, + "learning_rate": 9.202947499340073e-06, + "loss": 2.6163, + "step": 303 + }, + { + "epoch": 0.3057773307348966, + "grad_norm": 17.737721757957956, + "learning_rate": 9.208254494781877e-06, + "loss": 2.5653, + "step": 304 + }, + { + "epoch": 0.30678317721757714, + "grad_norm": 18.472587797259994, + "learning_rate": 9.213544061635843e-06, + "loss": 2.586, + "step": 305 + }, + { + "epoch": 0.30778902370025774, + "grad_norm": 15.17012695048484, + "learning_rate": 9.21881631400105e-06, + "loss": 2.5346, + "step": 306 + }, + { + "epoch": 0.30879487018293833, + "grad_norm": 19.95709941623259, + "learning_rate": 9.224071364859784e-06, + "loss": 2.6546, + "step": 307 + }, + { + "epoch": 0.3098007166656189, + "grad_norm": 20.924787917082103, + "learning_rate": 9.229309326092056e-06, + "loss": 2.5815, + "step": 308 + }, + { + "epoch": 0.31080656314829946, + "grad_norm": 17.65260316523874, + "learning_rate": 9.234530308489906e-06, + "loss": 2.4879, + "step": 309 + }, + { + "epoch": 0.31181240963098006, + "grad_norm": 17.954780984714258, + "learning_rate": 9.239734421771449e-06, + "loss": 2.5751, + "step": 310 + }, + { + "epoch": 0.31281825611366065, + "grad_norm": 20.309850645532567, + "learning_rate": 9.244921774594723e-06, + "loss": 2.6018, + "step": 311 + }, + { + "epoch": 0.31382410259634125, + "grad_norm": 19.4937688848126, + "learning_rate": 9.250092474571294e-06, + "loss": 2.5942, + "step": 312 + }, + { + "epoch": 0.3148299490790218, + "grad_norm": 19.8969123554459, + "learning_rate": 9.255246628279656e-06, + "loss": 2.6084, + "step": 313 + }, + { + "epoch": 0.3158357955617024, + "grad_norm": 21.311968170601972, + "learning_rate": 9.260384341278409e-06, + "loss": 2.5642, + "step": 314 + }, + { + "epoch": 0.316841642044383, + "grad_norm": 19.417179511250875, + "learning_rate": 9.26550571811923e-06, + "loss": 2.6014, + "step": 315 + }, + { + "epoch": 0.31784748852706357, + "grad_norm": 18.3517298148521, + "learning_rate": 9.270610862359639e-06, + "loss": 2.5491, + "step": 316 + }, + { + "epoch": 0.31885333500974417, + "grad_norm": 18.48869905606377, + "learning_rate": 9.275699876575568e-06, + "loss": 2.6088, + "step": 317 + }, + { + "epoch": 0.3198591814924247, + "grad_norm": 18.715905721710747, + "learning_rate": 9.280772862373725e-06, + "loss": 2.5831, + "step": 318 + }, + { + "epoch": 0.3208650279751053, + "grad_norm": 15.934927854149723, + "learning_rate": 9.285829920403762e-06, + "loss": 2.5761, + "step": 319 + }, + { + "epoch": 0.3218708744577859, + "grad_norm": 20.177416493199058, + "learning_rate": 9.290871150370263e-06, + "loss": 2.5826, + "step": 320 + }, + { + "epoch": 0.3228767209404665, + "grad_norm": 22.03439353905269, + "learning_rate": 9.295896651044535e-06, + "loss": 2.5901, + "step": 321 + }, + { + "epoch": 0.323882567423147, + "grad_norm": 21.417786500881242, + "learning_rate": 9.300906520276228e-06, + "loss": 2.5614, + "step": 322 + }, + { + "epoch": 0.3248884139058276, + "grad_norm": 17.202029957803592, + "learning_rate": 9.305900855004747e-06, + "loss": 2.6167, + "step": 323 + }, + { + "epoch": 0.3258942603885082, + "grad_norm": 22.29615093251031, + "learning_rate": 9.310879751270533e-06, + "loss": 2.5662, + "step": 324 + }, + { + "epoch": 0.3269001068711888, + "grad_norm": 17.64208925311755, + "learning_rate": 9.315843304226122e-06, + "loss": 2.5522, + "step": 325 + }, + { + "epoch": 0.32790595335386935, + "grad_norm": 19.98320678509978, + "learning_rate": 9.320791608147074e-06, + "loss": 2.6221, + "step": 326 + }, + { + "epoch": 0.32891179983654995, + "grad_norm": 16.13553457377954, + "learning_rate": 9.325724756442696e-06, + "loss": 2.5761, + "step": 327 + }, + { + "epoch": 0.32991764631923054, + "grad_norm": 17.31786699950618, + "learning_rate": 9.330642841666646e-06, + "loss": 2.5816, + "step": 328 + }, + { + "epoch": 0.33092349280191113, + "grad_norm": 18.712111337036895, + "learning_rate": 9.335545955527333e-06, + "loss": 2.6028, + "step": 329 + }, + { + "epoch": 0.3319293392845917, + "grad_norm": 17.48530200895543, + "learning_rate": 9.340434188898189e-06, + "loss": 2.6095, + "step": 330 + }, + { + "epoch": 0.33293518576727227, + "grad_norm": 20.06501341317573, + "learning_rate": 9.345307631827775e-06, + "loss": 2.5803, + "step": 331 + }, + { + "epoch": 0.33394103224995286, + "grad_norm": 18.81437868466945, + "learning_rate": 9.35016637354972e-06, + "loss": 2.5395, + "step": 332 + }, + { + "epoch": 0.33494687873263346, + "grad_norm": 20.253454715167823, + "learning_rate": 9.355010502492547e-06, + "loss": 2.5588, + "step": 333 + }, + { + "epoch": 0.335952725215314, + "grad_norm": 20.145901278398018, + "learning_rate": 9.359840106289308e-06, + "loss": 2.6307, + "step": 334 + }, + { + "epoch": 0.3369585716979946, + "grad_norm": 18.35310873936949, + "learning_rate": 9.36465527178711e-06, + "loss": 2.5903, + "step": 335 + }, + { + "epoch": 0.3379644181806752, + "grad_norm": 19.993529185424425, + "learning_rate": 9.369456085056482e-06, + "loss": 2.5692, + "step": 336 + }, + { + "epoch": 0.3389702646633558, + "grad_norm": 22.69340117991585, + "learning_rate": 9.374242631400604e-06, + "loss": 2.6053, + "step": 337 + }, + { + "epoch": 0.3399761111460363, + "grad_norm": 21.68690669161774, + "learning_rate": 9.37901499536441e-06, + "loss": 2.5805, + "step": 338 + }, + { + "epoch": 0.3409819576287169, + "grad_norm": 17.746874404228528, + "learning_rate": 9.383773260743527e-06, + "loss": 2.5855, + "step": 339 + }, + { + "epoch": 0.3419878041113975, + "grad_norm": 22.807692121973822, + "learning_rate": 9.388517510593132e-06, + "loss": 2.6305, + "step": 340 + }, + { + "epoch": 0.3429936505940781, + "grad_norm": 19.00067460688236, + "learning_rate": 9.39324782723663e-06, + "loss": 2.5656, + "step": 341 + }, + { + "epoch": 0.34399949707675864, + "grad_norm": 20.121711729728055, + "learning_rate": 9.39796429227423e-06, + "loss": 2.5423, + "step": 342 + }, + { + "epoch": 0.34500534355943924, + "grad_norm": 20.64153176834041, + "learning_rate": 9.402666986591398e-06, + "loss": 2.6087, + "step": 343 + }, + { + "epoch": 0.34601119004211983, + "grad_norm": 20.27675989420653, + "learning_rate": 9.407355990367169e-06, + "loss": 2.5557, + "step": 344 + }, + { + "epoch": 0.3470170365248004, + "grad_norm": 17.450991828252263, + "learning_rate": 9.41203138308236e-06, + "loss": 2.5362, + "step": 345 + }, + { + "epoch": 0.34802288300748097, + "grad_norm": 17.74999896509276, + "learning_rate": 9.416693243527644e-06, + "loss": 2.5383, + "step": 346 + }, + { + "epoch": 0.34902872949016156, + "grad_norm": 22.025427737463747, + "learning_rate": 9.421341649811525e-06, + "loss": 2.5713, + "step": 347 + }, + { + "epoch": 0.35003457597284215, + "grad_norm": 21.459704360516515, + "learning_rate": 9.425976679368188e-06, + "loss": 2.5564, + "step": 348 + }, + { + "epoch": 0.35104042245552275, + "grad_norm": 19.988619711523008, + "learning_rate": 9.43059840896523e-06, + "loss": 2.6236, + "step": 349 + }, + { + "epoch": 0.3520462689382033, + "grad_norm": 18.045344386817177, + "learning_rate": 9.43520691471131e-06, + "loss": 2.5872, + "step": 350 + }, + { + "epoch": 0.3530521154208839, + "grad_norm": 22.81555309347901, + "learning_rate": 9.439802272063646e-06, + "loss": 2.5601, + "step": 351 + }, + { + "epoch": 0.3540579619035645, + "grad_norm": 22.381498108516144, + "learning_rate": 9.444384555835443e-06, + "loss": 2.582, + "step": 352 + }, + { + "epoch": 0.35506380838624507, + "grad_norm": 19.824711666359768, + "learning_rate": 9.44895384020319e-06, + "loss": 2.5217, + "step": 353 + }, + { + "epoch": 0.3560696548689256, + "grad_norm": 21.63777305031021, + "learning_rate": 9.453510198713878e-06, + "loss": 2.582, + "step": 354 + }, + { + "epoch": 0.3570755013516062, + "grad_norm": 18.452715228073657, + "learning_rate": 9.458053704292093e-06, + "loss": 2.5703, + "step": 355 + }, + { + "epoch": 0.3580813478342868, + "grad_norm": 16.645525833264035, + "learning_rate": 9.462584429247019e-06, + "loss": 2.6185, + "step": 356 + }, + { + "epoch": 0.3590871943169674, + "grad_norm": 17.993668104121088, + "learning_rate": 9.467102445279352e-06, + "loss": 2.5823, + "step": 357 + }, + { + "epoch": 0.36009304079964793, + "grad_norm": 16.501004168798346, + "learning_rate": 9.471607823488098e-06, + "loss": 2.5876, + "step": 358 + }, + { + "epoch": 0.36109888728232853, + "grad_norm": 20.814369689080046, + "learning_rate": 9.476100634377292e-06, + "loss": 2.5592, + "step": 359 + }, + { + "epoch": 0.3621047337650091, + "grad_norm": 17.443045103109935, + "learning_rate": 9.480580947862615e-06, + "loss": 2.5701, + "step": 360 + }, + { + "epoch": 0.3631105802476897, + "grad_norm": 21.11025374969233, + "learning_rate": 9.485048833277928e-06, + "loss": 2.6155, + "step": 361 + }, + { + "epoch": 0.36411642673037026, + "grad_norm": 17.98585113382355, + "learning_rate": 9.489504359381702e-06, + "loss": 2.5601, + "step": 362 + }, + { + "epoch": 0.36512227321305085, + "grad_norm": 17.523295432105954, + "learning_rate": 9.49394759436337e-06, + "loss": 2.5736, + "step": 363 + }, + { + "epoch": 0.36612811969573145, + "grad_norm": 21.831805903399736, + "learning_rate": 9.498378605849596e-06, + "loss": 2.5353, + "step": 364 + }, + { + "epoch": 0.36713396617841204, + "grad_norm": 19.70767544694675, + "learning_rate": 9.502797460910437e-06, + "loss": 2.609, + "step": 365 + }, + { + "epoch": 0.3681398126610926, + "grad_norm": 23.989716952193756, + "learning_rate": 9.507204226065449e-06, + "loss": 2.5952, + "step": 366 + }, + { + "epoch": 0.3691456591437732, + "grad_norm": 20.218287412951227, + "learning_rate": 9.511598967289681e-06, + "loss": 2.5784, + "step": 367 + }, + { + "epoch": 0.37015150562645377, + "grad_norm": 20.932848929843594, + "learning_rate": 9.515981750019612e-06, + "loss": 2.5482, + "step": 368 + }, + { + "epoch": 0.37115735210913436, + "grad_norm": 24.64683126624515, + "learning_rate": 9.520352639158998e-06, + "loss": 2.5726, + "step": 369 + }, + { + "epoch": 0.3721631985918149, + "grad_norm": 20.307454841997078, + "learning_rate": 9.524711699084629e-06, + "loss": 2.6273, + "step": 370 + }, + { + "epoch": 0.3731690450744955, + "grad_norm": 24.994119900117653, + "learning_rate": 9.529058993652026e-06, + "loss": 2.5826, + "step": 371 + }, + { + "epoch": 0.3741748915571761, + "grad_norm": 21.779145120794347, + "learning_rate": 9.533394586201055e-06, + "loss": 2.5547, + "step": 372 + }, + { + "epoch": 0.3751807380398567, + "grad_norm": 21.300796801514867, + "learning_rate": 9.537718539561456e-06, + "loss": 2.5621, + "step": 373 + }, + { + "epoch": 0.3761865845225372, + "grad_norm": 24.15121491274878, + "learning_rate": 9.542030916058314e-06, + "loss": 2.5851, + "step": 374 + }, + { + "epoch": 0.3771924310052178, + "grad_norm": 15.35693222917725, + "learning_rate": 9.546331777517445e-06, + "loss": 2.5435, + "step": 375 + }, + { + "epoch": 0.3781982774878984, + "grad_norm": 24.412432857133343, + "learning_rate": 9.550621185270719e-06, + "loss": 2.6019, + "step": 376 + }, + { + "epoch": 0.379204123970579, + "grad_norm": 25.613210763882368, + "learning_rate": 9.5548992001613e-06, + "loss": 2.5155, + "step": 377 + }, + { + "epoch": 0.38020997045325955, + "grad_norm": 22.095030230113984, + "learning_rate": 9.559165882548835e-06, + "loss": 2.5302, + "step": 378 + }, + { + "epoch": 0.38121581693594014, + "grad_norm": 24.99054102758044, + "learning_rate": 9.563421292314553e-06, + "loss": 2.5662, + "step": 379 + }, + { + "epoch": 0.38222166341862074, + "grad_norm": 18.606776693809763, + "learning_rate": 9.567665488866313e-06, + "loss": 2.5297, + "step": 380 + }, + { + "epoch": 0.38322750990130133, + "grad_norm": 20.35605026910687, + "learning_rate": 9.571898531143582e-06, + "loss": 2.5562, + "step": 381 + }, + { + "epoch": 0.38423335638398187, + "grad_norm": 26.209068443572, + "learning_rate": 9.576120477622342e-06, + "loss": 2.5664, + "step": 382 + }, + { + "epoch": 0.38523920286666247, + "grad_norm": 19.305248037487228, + "learning_rate": 9.580331386319938e-06, + "loss": 2.5937, + "step": 383 + }, + { + "epoch": 0.38624504934934306, + "grad_norm": 25.685627098810052, + "learning_rate": 9.584531314799869e-06, + "loss": 2.5336, + "step": 384 + }, + { + "epoch": 0.38725089583202366, + "grad_norm": 20.289828015945567, + "learning_rate": 9.588720320176494e-06, + "loss": 2.5348, + "step": 385 + }, + { + "epoch": 0.3882567423147042, + "grad_norm": 21.9547365407109, + "learning_rate": 9.592898459119703e-06, + "loss": 2.5505, + "step": 386 + }, + { + "epoch": 0.3892625887973848, + "grad_norm": 23.474353452783447, + "learning_rate": 9.597065787859523e-06, + "loss": 2.5506, + "step": 387 + }, + { + "epoch": 0.3902684352800654, + "grad_norm": 19.100670325198177, + "learning_rate": 9.601222362190642e-06, + "loss": 2.5846, + "step": 388 + }, + { + "epoch": 0.391274281762746, + "grad_norm": 21.84151290352025, + "learning_rate": 9.605368237476904e-06, + "loss": 2.5172, + "step": 389 + }, + { + "epoch": 0.3922801282454265, + "grad_norm": 17.584086365684314, + "learning_rate": 9.60950346865573e-06, + "loss": 2.5059, + "step": 390 + }, + { + "epoch": 0.3932859747281071, + "grad_norm": 20.781615247760833, + "learning_rate": 9.613628110242482e-06, + "loss": 2.5477, + "step": 391 + }, + { + "epoch": 0.3942918212107877, + "grad_norm": 19.38108993879063, + "learning_rate": 9.617742216334784e-06, + "loss": 2.5573, + "step": 392 + }, + { + "epoch": 0.3952976676934683, + "grad_norm": 19.846965607767764, + "learning_rate": 9.62184584061677e-06, + "loss": 2.5904, + "step": 393 + }, + { + "epoch": 0.39630351417614884, + "grad_norm": 18.299830732785676, + "learning_rate": 9.625939036363294e-06, + "loss": 2.5537, + "step": 394 + }, + { + "epoch": 0.39730936065882944, + "grad_norm": 18.937189256347306, + "learning_rate": 9.630021856444075e-06, + "loss": 2.5381, + "step": 395 + }, + { + "epoch": 0.39831520714151003, + "grad_norm": 15.860685957888077, + "learning_rate": 9.634094353327797e-06, + "loss": 2.525, + "step": 396 + }, + { + "epoch": 0.3993210536241906, + "grad_norm": 17.579941984547318, + "learning_rate": 9.638156579086155e-06, + "loss": 2.5499, + "step": 397 + }, + { + "epoch": 0.40032690010687116, + "grad_norm": 19.948100443223687, + "learning_rate": 9.64220858539785e-06, + "loss": 2.5226, + "step": 398 + }, + { + "epoch": 0.40133274658955176, + "grad_norm": 17.47822858548496, + "learning_rate": 9.646250423552533e-06, + "loss": 2.5239, + "step": 399 + }, + { + "epoch": 0.40233859307223235, + "grad_norm": 20.927008574523924, + "learning_rate": 9.650282144454697e-06, + "loss": 2.5573, + "step": 400 + }, + { + "epoch": 0.40334443955491295, + "grad_norm": 19.98864807675079, + "learning_rate": 9.654303798627532e-06, + "loss": 2.6445, + "step": 401 + }, + { + "epoch": 0.4043502860375935, + "grad_norm": 18.49557219174862, + "learning_rate": 9.658315436216716e-06, + "loss": 2.5924, + "step": 402 + }, + { + "epoch": 0.4053561325202741, + "grad_norm": 20.118177651660535, + "learning_rate": 9.662317106994168e-06, + "loss": 2.5399, + "step": 403 + }, + { + "epoch": 0.4063619790029547, + "grad_norm": 18.653675069722883, + "learning_rate": 9.666308860361762e-06, + "loss": 2.6146, + "step": 404 + }, + { + "epoch": 0.40736782548563527, + "grad_norm": 18.01802568279985, + "learning_rate": 9.670290745354967e-06, + "loss": 2.5572, + "step": 405 + }, + { + "epoch": 0.4083736719683158, + "grad_norm": 21.29046463917139, + "learning_rate": 9.674262810646488e-06, + "loss": 2.5068, + "step": 406 + }, + { + "epoch": 0.4093795184509964, + "grad_norm": 18.24276490157925, + "learning_rate": 9.678225104549809e-06, + "loss": 2.6318, + "step": 407 + }, + { + "epoch": 0.410385364933677, + "grad_norm": 21.252960660783703, + "learning_rate": 9.682177675022738e-06, + "loss": 2.5443, + "step": 408 + }, + { + "epoch": 0.4113912114163576, + "grad_norm": 18.70132923614444, + "learning_rate": 9.68612056967088e-06, + "loss": 2.5363, + "step": 409 + }, + { + "epoch": 0.41239705789903813, + "grad_norm": 16.132340101807507, + "learning_rate": 9.69005383575108e-06, + "loss": 2.5217, + "step": 410 + }, + { + "epoch": 0.4134029043817187, + "grad_norm": 21.346280277371964, + "learning_rate": 9.693977520174825e-06, + "loss": 2.5621, + "step": 411 + }, + { + "epoch": 0.4144087508643993, + "grad_norm": 16.940635635285602, + "learning_rate": 9.697891669511594e-06, + "loss": 2.5802, + "step": 412 + }, + { + "epoch": 0.4154145973470799, + "grad_norm": 19.26324991277955, + "learning_rate": 9.701796329992181e-06, + "loss": 2.533, + "step": 413 + }, + { + "epoch": 0.41642044382976046, + "grad_norm": 22.835729739016102, + "learning_rate": 9.705691547511965e-06, + "loss": 2.5402, + "step": 414 + }, + { + "epoch": 0.41742629031244105, + "grad_norm": 21.002625565526287, + "learning_rate": 9.709577367634156e-06, + "loss": 2.5658, + "step": 415 + }, + { + "epoch": 0.41843213679512165, + "grad_norm": 24.50852088816323, + "learning_rate": 9.713453835592982e-06, + "loss": 2.5428, + "step": 416 + }, + { + "epoch": 0.41943798327780224, + "grad_norm": 21.492809059871774, + "learning_rate": 9.717320996296857e-06, + "loss": 2.5388, + "step": 417 + }, + { + "epoch": 0.4204438297604828, + "grad_norm": 19.50678145510213, + "learning_rate": 9.721178894331493e-06, + "loss": 2.6011, + "step": 418 + }, + { + "epoch": 0.4214496762431634, + "grad_norm": 20.37546943411706, + "learning_rate": 9.725027573963e-06, + "loss": 2.5422, + "step": 419 + }, + { + "epoch": 0.42245552272584397, + "grad_norm": 19.437883948580463, + "learning_rate": 9.728867079140916e-06, + "loss": 2.557, + "step": 420 + }, + { + "epoch": 0.42346136920852456, + "grad_norm": 19.911077577378947, + "learning_rate": 9.732697453501233e-06, + "loss": 2.5821, + "step": 421 + }, + { + "epoch": 0.42446721569120516, + "grad_norm": 20.36935689145355, + "learning_rate": 9.736518740369361e-06, + "loss": 2.586, + "step": 422 + }, + { + "epoch": 0.4254730621738857, + "grad_norm": 17.840560424560636, + "learning_rate": 9.740330982763071e-06, + "loss": 2.6046, + "step": 423 + }, + { + "epoch": 0.4264789086565663, + "grad_norm": 19.87276506803033, + "learning_rate": 9.744134223395413e-06, + "loss": 2.5743, + "step": 424 + }, + { + "epoch": 0.4274847551392469, + "grad_norm": 17.6454935838629, + "learning_rate": 9.747928504677567e-06, + "loss": 2.5709, + "step": 425 + }, + { + "epoch": 0.4284906016219275, + "grad_norm": 17.696687945509535, + "learning_rate": 9.7517138687217e-06, + "loss": 2.6172, + "step": 426 + }, + { + "epoch": 0.429496448104608, + "grad_norm": 18.402222368633442, + "learning_rate": 9.75549035734375e-06, + "loss": 2.5621, + "step": 427 + }, + { + "epoch": 0.4305022945872886, + "grad_norm": 20.64172248569233, + "learning_rate": 9.759258012066223e-06, + "loss": 2.5456, + "step": 428 + }, + { + "epoch": 0.4315081410699692, + "grad_norm": 19.821918707901084, + "learning_rate": 9.76301687412091e-06, + "loss": 2.5792, + "step": 429 + }, + { + "epoch": 0.4325139875526498, + "grad_norm": 17.51180178485302, + "learning_rate": 9.766766984451605e-06, + "loss": 2.5254, + "step": 430 + }, + { + "epoch": 0.43351983403533034, + "grad_norm": 18.21037361364691, + "learning_rate": 9.770508383716777e-06, + "loss": 2.5228, + "step": 431 + }, + { + "epoch": 0.43452568051801094, + "grad_norm": 22.038929359068945, + "learning_rate": 9.774241112292223e-06, + "loss": 2.6225, + "step": 432 + }, + { + "epoch": 0.43553152700069153, + "grad_norm": 18.73170725911393, + "learning_rate": 9.777965210273664e-06, + "loss": 2.5835, + "step": 433 + }, + { + "epoch": 0.4365373734833721, + "grad_norm": 23.510705394520055, + "learning_rate": 9.781680717479356e-06, + "loss": 2.5343, + "step": 434 + }, + { + "epoch": 0.43754321996605267, + "grad_norm": 23.066316139714914, + "learning_rate": 9.785387673452622e-06, + "loss": 2.5647, + "step": 435 + }, + { + "epoch": 0.43854906644873326, + "grad_norm": 19.07782833604606, + "learning_rate": 9.789086117464384e-06, + "loss": 2.5615, + "step": 436 + }, + { + "epoch": 0.43955491293141385, + "grad_norm": 16.751538869640772, + "learning_rate": 9.792776088515663e-06, + "loss": 2.6065, + "step": 437 + }, + { + "epoch": 0.44056075941409445, + "grad_norm": 20.60303276845087, + "learning_rate": 9.796457625340045e-06, + "loss": 2.5397, + "step": 438 + }, + { + "epoch": 0.441566605896775, + "grad_norm": 18.45324982339736, + "learning_rate": 9.800130766406115e-06, + "loss": 2.6101, + "step": 439 + }, + { + "epoch": 0.4425724523794556, + "grad_norm": 20.238472969240057, + "learning_rate": 9.803795549919879e-06, + "loss": 2.5714, + "step": 440 + }, + { + "epoch": 0.4435782988621362, + "grad_norm": 21.366888626724553, + "learning_rate": 9.807452013827138e-06, + "loss": 2.5863, + "step": 441 + }, + { + "epoch": 0.4445841453448168, + "grad_norm": 23.870758183159623, + "learning_rate": 9.811100195815852e-06, + "loss": 2.5177, + "step": 442 + }, + { + "epoch": 0.4455899918274973, + "grad_norm": 18.602411032505366, + "learning_rate": 9.814740133318472e-06, + "loss": 2.5838, + "step": 443 + }, + { + "epoch": 0.4465958383101779, + "grad_norm": 21.016219269082764, + "learning_rate": 9.818371863514235e-06, + "loss": 2.6025, + "step": 444 + }, + { + "epoch": 0.4476016847928585, + "grad_norm": 17.77671667492051, + "learning_rate": 9.821995423331454e-06, + "loss": 2.5537, + "step": 445 + }, + { + "epoch": 0.4486075312755391, + "grad_norm": 20.10732838634908, + "learning_rate": 9.825610849449766e-06, + "loss": 2.5838, + "step": 446 + }, + { + "epoch": 0.44961337775821963, + "grad_norm": 19.0181000365809, + "learning_rate": 9.829218178302358e-06, + "loss": 2.4954, + "step": 447 + }, + { + "epoch": 0.45061922424090023, + "grad_norm": 19.33953885797201, + "learning_rate": 9.83281744607817e-06, + "loss": 2.5118, + "step": 448 + }, + { + "epoch": 0.4516250707235808, + "grad_norm": 19.85434731351852, + "learning_rate": 9.83640868872408e-06, + "loss": 2.6326, + "step": 449 + }, + { + "epoch": 0.4526309172062614, + "grad_norm": 22.249007362186386, + "learning_rate": 9.83999194194705e-06, + "loss": 2.5417, + "step": 450 + }, + { + "epoch": 0.45363676368894196, + "grad_norm": 22.301275249798355, + "learning_rate": 9.84356724121626e-06, + "loss": 2.6442, + "step": 451 + }, + { + "epoch": 0.45464261017162255, + "grad_norm": 20.462177665673316, + "learning_rate": 9.847134621765215e-06, + "loss": 2.5007, + "step": 452 + }, + { + "epoch": 0.45564845665430315, + "grad_norm": 22.380040927423096, + "learning_rate": 9.850694118593826e-06, + "loss": 2.5468, + "step": 453 + }, + { + "epoch": 0.45665430313698374, + "grad_norm": 19.651208291746624, + "learning_rate": 9.854245766470469e-06, + "loss": 2.5567, + "step": 454 + }, + { + "epoch": 0.4576601496196643, + "grad_norm": 22.186410502702007, + "learning_rate": 9.857789599934032e-06, + "loss": 2.4932, + "step": 455 + }, + { + "epoch": 0.4586659961023449, + "grad_norm": 22.549669207667595, + "learning_rate": 9.861325653295919e-06, + "loss": 2.5198, + "step": 456 + }, + { + "epoch": 0.45967184258502547, + "grad_norm": 18.811742186281815, + "learning_rate": 9.864853960642048e-06, + "loss": 2.5091, + "step": 457 + }, + { + "epoch": 0.46067768906770606, + "grad_norm": 22.424509982770353, + "learning_rate": 9.868374555834827e-06, + "loss": 2.4765, + "step": 458 + }, + { + "epoch": 0.4616835355503866, + "grad_norm": 22.679824568488968, + "learning_rate": 9.871887472515092e-06, + "loss": 2.5785, + "step": 459 + }, + { + "epoch": 0.4626893820330672, + "grad_norm": 21.332414608524278, + "learning_rate": 9.875392744104048e-06, + "loss": 2.5537, + "step": 460 + }, + { + "epoch": 0.4636952285157478, + "grad_norm": 21.78085628200312, + "learning_rate": 9.878890403805172e-06, + "loss": 2.5728, + "step": 461 + }, + { + "epoch": 0.4647010749984284, + "grad_norm": 22.86034758007963, + "learning_rate": 9.882380484606098e-06, + "loss": 2.6021, + "step": 462 + }, + { + "epoch": 0.4657069214811089, + "grad_norm": 22.35062353253106, + "learning_rate": 9.885863019280488e-06, + "loss": 2.5692, + "step": 463 + }, + { + "epoch": 0.4667127679637895, + "grad_norm": 20.634601317147265, + "learning_rate": 9.889338040389874e-06, + "loss": 2.5236, + "step": 464 + }, + { + "epoch": 0.4677186144464701, + "grad_norm": 17.922619313281967, + "learning_rate": 9.892805580285489e-06, + "loss": 2.5819, + "step": 465 + }, + { + "epoch": 0.4687244609291507, + "grad_norm": 21.482681336740185, + "learning_rate": 9.896265671110072e-06, + "loss": 2.5266, + "step": 466 + }, + { + "epoch": 0.46973030741183125, + "grad_norm": 17.049968410688997, + "learning_rate": 9.899718344799657e-06, + "loss": 2.566, + "step": 467 + }, + { + "epoch": 0.47073615389451184, + "grad_norm": 21.62677597122425, + "learning_rate": 9.903163633085336e-06, + "loss": 2.5515, + "step": 468 + }, + { + "epoch": 0.47174200037719244, + "grad_norm": 16.57841641405861, + "learning_rate": 9.906601567495018e-06, + "loss": 2.584, + "step": 469 + }, + { + "epoch": 0.47274784685987303, + "grad_norm": 18.926113122865047, + "learning_rate": 9.910032179355153e-06, + "loss": 2.4928, + "step": 470 + }, + { + "epoch": 0.4737536933425536, + "grad_norm": 19.220992348367954, + "learning_rate": 9.91345549979245e-06, + "loss": 2.5615, + "step": 471 + }, + { + "epoch": 0.47475953982523417, + "grad_norm": 18.388374619911023, + "learning_rate": 9.916871559735566e-06, + "loss": 2.6263, + "step": 472 + }, + { + "epoch": 0.47576538630791476, + "grad_norm": 20.662610639360757, + "learning_rate": 9.920280389916785e-06, + "loss": 2.5093, + "step": 473 + }, + { + "epoch": 0.47677123279059536, + "grad_norm": 19.96841955129098, + "learning_rate": 9.92368202087368e-06, + "loss": 2.5829, + "step": 474 + }, + { + "epoch": 0.4777770792732759, + "grad_norm": 19.651541678361166, + "learning_rate": 9.927076482950749e-06, + "loss": 2.5313, + "step": 475 + }, + { + "epoch": 0.4787829257559565, + "grad_norm": 20.768470047845234, + "learning_rate": 9.93046380630104e-06, + "loss": 2.5703, + "step": 476 + }, + { + "epoch": 0.4797887722386371, + "grad_norm": 19.380117148544503, + "learning_rate": 9.933844020887766e-06, + "loss": 2.5438, + "step": 477 + }, + { + "epoch": 0.4807946187213177, + "grad_norm": 17.478281054077645, + "learning_rate": 9.937217156485885e-06, + "loss": 2.5358, + "step": 478 + }, + { + "epoch": 0.4818004652039982, + "grad_norm": 16.628643607719937, + "learning_rate": 9.940583242683675e-06, + "loss": 2.5197, + "step": 479 + }, + { + "epoch": 0.4828063116866788, + "grad_norm": 21.77545239003663, + "learning_rate": 9.943942308884303e-06, + "loss": 2.5079, + "step": 480 + }, + { + "epoch": 0.4838121581693594, + "grad_norm": 21.324959520122704, + "learning_rate": 9.947294384307348e-06, + "loss": 2.4906, + "step": 481 + }, + { + "epoch": 0.48481800465204, + "grad_norm": 16.398520699639988, + "learning_rate": 9.950639497990342e-06, + "loss": 2.4824, + "step": 482 + }, + { + "epoch": 0.48582385113472054, + "grad_norm": 23.604836816022345, + "learning_rate": 9.953977678790266e-06, + "loss": 2.5571, + "step": 483 + }, + { + "epoch": 0.48682969761740114, + "grad_norm": 18.0895610543458, + "learning_rate": 9.95730895538506e-06, + "loss": 2.5574, + "step": 484 + }, + { + "epoch": 0.48783554410008173, + "grad_norm": 19.596954645717, + "learning_rate": 9.960633356275078e-06, + "loss": 2.5706, + "step": 485 + }, + { + "epoch": 0.4888413905827623, + "grad_norm": 17.69568825924647, + "learning_rate": 9.963950909784575e-06, + "loss": 2.5282, + "step": 486 + }, + { + "epoch": 0.48984723706544286, + "grad_norm": 18.315031047887874, + "learning_rate": 9.96726164406314e-06, + "loss": 2.5438, + "step": 487 + }, + { + "epoch": 0.49085308354812346, + "grad_norm": 21.065742691153716, + "learning_rate": 9.970565587087136e-06, + "loss": 2.5552, + "step": 488 + }, + { + "epoch": 0.49185893003080405, + "grad_norm": 17.188059594385642, + "learning_rate": 9.973862766661114e-06, + "loss": 2.5923, + "step": 489 + }, + { + "epoch": 0.49286477651348465, + "grad_norm": 21.1108953164887, + "learning_rate": 9.977153210419218e-06, + "loss": 2.6044, + "step": 490 + }, + { + "epoch": 0.4938706229961652, + "grad_norm": 21.36747102982352, + "learning_rate": 9.980436945826581e-06, + "loss": 2.5927, + "step": 491 + }, + { + "epoch": 0.4948764694788458, + "grad_norm": 19.868155776423375, + "learning_rate": 9.983714000180685e-06, + "loss": 2.5641, + "step": 492 + }, + { + "epoch": 0.4958823159615264, + "grad_norm": 22.741938569039043, + "learning_rate": 9.986984400612744e-06, + "loss": 2.602, + "step": 493 + }, + { + "epoch": 0.49688816244420697, + "grad_norm": 20.68752819403972, + "learning_rate": 9.990248174089033e-06, + "loss": 2.579, + "step": 494 + }, + { + "epoch": 0.4978940089268875, + "grad_norm": 20.135950916639047, + "learning_rate": 9.99350534741223e-06, + "loss": 2.552, + "step": 495 + }, + { + "epoch": 0.4988998554095681, + "grad_norm": 18.623416371913574, + "learning_rate": 9.996755947222743e-06, + "loss": 2.5241, + "step": 496 + }, + { + "epoch": 0.4999057018922487, + "grad_norm": 18.983653734389193, + "learning_rate": 1e-05, + "loss": 2.5411, + "step": 497 + }, + { + "epoch": 0.5009115483749292, + "grad_norm": 17.004668294054074, + "learning_rate": 1e-05, + "loss": 2.5481, + "step": 498 + }, + { + "epoch": 0.5019173948576099, + "grad_norm": 19.169938319334523, + "learning_rate": 9.997764363961547e-06, + "loss": 2.5168, + "step": 499 + }, + { + "epoch": 0.5029232413402904, + "grad_norm": 15.781569650968905, + "learning_rate": 9.995528727923096e-06, + "loss": 2.5549, + "step": 500 + }, + { + "epoch": 0.503929087822971, + "grad_norm": 20.179625887745996, + "learning_rate": 9.993293091884642e-06, + "loss": 2.4907, + "step": 501 + }, + { + "epoch": 0.5049349343056516, + "grad_norm": 19.05116376715222, + "learning_rate": 9.991057455846189e-06, + "loss": 2.5621, + "step": 502 + }, + { + "epoch": 0.5059407807883322, + "grad_norm": 21.91655117108733, + "learning_rate": 9.988821819807735e-06, + "loss": 2.5457, + "step": 503 + }, + { + "epoch": 0.5069466272710128, + "grad_norm": 20.393552472807908, + "learning_rate": 9.986586183769284e-06, + "loss": 2.5379, + "step": 504 + }, + { + "epoch": 0.5079524737536933, + "grad_norm": 22.422384427517496, + "learning_rate": 9.98435054773083e-06, + "loss": 2.5373, + "step": 505 + }, + { + "epoch": 0.5089583202363739, + "grad_norm": 15.409773180659693, + "learning_rate": 9.982114911692378e-06, + "loss": 2.4445, + "step": 506 + }, + { + "epoch": 0.5099641667190545, + "grad_norm": 21.378407108104337, + "learning_rate": 9.979879275653925e-06, + "loss": 2.4991, + "step": 507 + }, + { + "epoch": 0.5109700132017351, + "grad_norm": 19.128162487040523, + "learning_rate": 9.977643639615471e-06, + "loss": 2.4958, + "step": 508 + }, + { + "epoch": 0.5119758596844156, + "grad_norm": 19.690175531171707, + "learning_rate": 9.975408003577018e-06, + "loss": 2.5555, + "step": 509 + }, + { + "epoch": 0.5129817061670963, + "grad_norm": 20.712793838884973, + "learning_rate": 9.973172367538565e-06, + "loss": 2.5536, + "step": 510 + }, + { + "epoch": 0.5139875526497768, + "grad_norm": 22.092179923348358, + "learning_rate": 9.970936731500113e-06, + "loss": 2.5978, + "step": 511 + }, + { + "epoch": 0.5149933991324575, + "grad_norm": 18.84515345278468, + "learning_rate": 9.96870109546166e-06, + "loss": 2.5589, + "step": 512 + }, + { + "epoch": 0.515999245615138, + "grad_norm": 19.568303464632645, + "learning_rate": 9.966465459423208e-06, + "loss": 2.5411, + "step": 513 + }, + { + "epoch": 0.5170050920978185, + "grad_norm": 21.2416135192592, + "learning_rate": 9.964229823384754e-06, + "loss": 2.5108, + "step": 514 + }, + { + "epoch": 0.5180109385804992, + "grad_norm": 18.574355716211667, + "learning_rate": 9.9619941873463e-06, + "loss": 2.5255, + "step": 515 + }, + { + "epoch": 0.5190167850631797, + "grad_norm": 25.877119413501973, + "learning_rate": 9.959758551307847e-06, + "loss": 2.5063, + "step": 516 + }, + { + "epoch": 0.5200226315458603, + "grad_norm": 21.502468407593128, + "learning_rate": 9.957522915269396e-06, + "loss": 2.5912, + "step": 517 + }, + { + "epoch": 0.5210284780285409, + "grad_norm": 24.69392944829647, + "learning_rate": 9.955287279230942e-06, + "loss": 2.567, + "step": 518 + }, + { + "epoch": 0.5220343245112214, + "grad_norm": 23.76268825844637, + "learning_rate": 9.953051643192489e-06, + "loss": 2.4931, + "step": 519 + }, + { + "epoch": 0.5230401709939021, + "grad_norm": 20.684631513248082, + "learning_rate": 9.950816007154035e-06, + "loss": 2.5733, + "step": 520 + }, + { + "epoch": 0.5240460174765826, + "grad_norm": 19.227634049258327, + "learning_rate": 9.948580371115584e-06, + "loss": 2.552, + "step": 521 + }, + { + "epoch": 0.5250518639592632, + "grad_norm": 24.53885448185181, + "learning_rate": 9.94634473507713e-06, + "loss": 2.5831, + "step": 522 + }, + { + "epoch": 0.5260577104419438, + "grad_norm": 23.124577154789915, + "learning_rate": 9.944109099038677e-06, + "loss": 2.5542, + "step": 523 + }, + { + "epoch": 0.5270635569246244, + "grad_norm": 21.424296302635216, + "learning_rate": 9.941873463000225e-06, + "loss": 2.5702, + "step": 524 + }, + { + "epoch": 0.5280694034073049, + "grad_norm": 21.23884958735581, + "learning_rate": 9.939637826961771e-06, + "loss": 2.4847, + "step": 525 + }, + { + "epoch": 0.5290752498899856, + "grad_norm": 24.491387720457944, + "learning_rate": 9.937402190923318e-06, + "loss": 2.5822, + "step": 526 + }, + { + "epoch": 0.5300810963726661, + "grad_norm": 19.874916575754323, + "learning_rate": 9.935166554884865e-06, + "loss": 2.5269, + "step": 527 + }, + { + "epoch": 0.5310869428553467, + "grad_norm": 60.71635494912012, + "learning_rate": 9.932930918846413e-06, + "loss": 2.5605, + "step": 528 + }, + { + "epoch": 0.5320927893380273, + "grad_norm": 20.456678607924925, + "learning_rate": 9.93069528280796e-06, + "loss": 2.5238, + "step": 529 + }, + { + "epoch": 0.5330986358207078, + "grad_norm": 18.136503927465057, + "learning_rate": 9.928459646769508e-06, + "loss": 2.5667, + "step": 530 + }, + { + "epoch": 0.5341044823033885, + "grad_norm": 16.289075770430706, + "learning_rate": 9.926224010731054e-06, + "loss": 2.5032, + "step": 531 + }, + { + "epoch": 0.535110328786069, + "grad_norm": 18.52577477934986, + "learning_rate": 9.9239883746926e-06, + "loss": 2.5708, + "step": 532 + }, + { + "epoch": 0.5361161752687497, + "grad_norm": 18.712980125548064, + "learning_rate": 9.921752738654147e-06, + "loss": 2.5339, + "step": 533 + }, + { + "epoch": 0.5371220217514302, + "grad_norm": 19.748115495810836, + "learning_rate": 9.919517102615694e-06, + "loss": 2.5236, + "step": 534 + }, + { + "epoch": 0.5381278682341107, + "grad_norm": 16.510969022398314, + "learning_rate": 9.917281466577242e-06, + "loss": 2.5847, + "step": 535 + }, + { + "epoch": 0.5391337147167914, + "grad_norm": 18.080202479501924, + "learning_rate": 9.915045830538789e-06, + "loss": 2.5728, + "step": 536 + }, + { + "epoch": 0.5401395611994719, + "grad_norm": 18.47492461965452, + "learning_rate": 9.912810194500337e-06, + "loss": 2.5226, + "step": 537 + }, + { + "epoch": 0.5411454076821525, + "grad_norm": 18.114656650302823, + "learning_rate": 9.910574558461884e-06, + "loss": 2.5199, + "step": 538 + }, + { + "epoch": 0.5421512541648331, + "grad_norm": 16.58075410612739, + "learning_rate": 9.90833892242343e-06, + "loss": 2.5481, + "step": 539 + }, + { + "epoch": 0.5431571006475137, + "grad_norm": 29.273276472080056, + "learning_rate": 9.906103286384977e-06, + "loss": 2.4944, + "step": 540 + }, + { + "epoch": 0.5441629471301943, + "grad_norm": 18.462379052260196, + "learning_rate": 9.903867650346525e-06, + "loss": 2.5807, + "step": 541 + }, + { + "epoch": 0.5451687936128748, + "grad_norm": 17.13630248386338, + "learning_rate": 9.901632014308071e-06, + "loss": 2.5622, + "step": 542 + }, + { + "epoch": 0.5461746400955554, + "grad_norm": 19.468397745227442, + "learning_rate": 9.89939637826962e-06, + "loss": 2.4879, + "step": 543 + }, + { + "epoch": 0.547180486578236, + "grad_norm": 17.0399957247732, + "learning_rate": 9.897160742231166e-06, + "loss": 2.5107, + "step": 544 + }, + { + "epoch": 0.5481863330609166, + "grad_norm": 18.31755981883234, + "learning_rate": 9.894925106192713e-06, + "loss": 2.5205, + "step": 545 + }, + { + "epoch": 0.5491921795435971, + "grad_norm": 17.749662710789632, + "learning_rate": 9.89268947015426e-06, + "loss": 2.5464, + "step": 546 + }, + { + "epoch": 0.5501980260262778, + "grad_norm": 19.001762577300585, + "learning_rate": 9.890453834115806e-06, + "loss": 2.549, + "step": 547 + }, + { + "epoch": 0.5512038725089583, + "grad_norm": 16.48812300958239, + "learning_rate": 9.888218198077354e-06, + "loss": 2.5059, + "step": 548 + }, + { + "epoch": 0.552209718991639, + "grad_norm": 16.712686374839436, + "learning_rate": 9.8859825620389e-06, + "loss": 2.5445, + "step": 549 + }, + { + "epoch": 0.5532155654743195, + "grad_norm": 17.463415610893698, + "learning_rate": 9.883746926000447e-06, + "loss": 2.5056, + "step": 550 + }, + { + "epoch": 0.554221411957, + "grad_norm": 17.741113008733752, + "learning_rate": 9.881511289961994e-06, + "loss": 2.5551, + "step": 551 + }, + { + "epoch": 0.5552272584396807, + "grad_norm": 16.464918946132286, + "learning_rate": 9.879275653923542e-06, + "loss": 2.5507, + "step": 552 + }, + { + "epoch": 0.5562331049223612, + "grad_norm": 17.12442892090824, + "learning_rate": 9.877040017885089e-06, + "loss": 2.5451, + "step": 553 + }, + { + "epoch": 0.5572389514050418, + "grad_norm": 16.345309381319424, + "learning_rate": 9.874804381846637e-06, + "loss": 2.4611, + "step": 554 + }, + { + "epoch": 0.5582447978877224, + "grad_norm": 18.656826976907546, + "learning_rate": 9.872568745808184e-06, + "loss": 2.5097, + "step": 555 + }, + { + "epoch": 0.559250644370403, + "grad_norm": 15.956055864098987, + "learning_rate": 9.87033310976973e-06, + "loss": 2.5246, + "step": 556 + }, + { + "epoch": 0.5602564908530836, + "grad_norm": 18.909338459825516, + "learning_rate": 9.868097473731277e-06, + "loss": 2.5912, + "step": 557 + }, + { + "epoch": 0.5612623373357641, + "grad_norm": 18.11236716491806, + "learning_rate": 9.865861837692823e-06, + "loss": 2.5275, + "step": 558 + }, + { + "epoch": 0.5622681838184447, + "grad_norm": 19.6612038201225, + "learning_rate": 9.863626201654371e-06, + "loss": 2.5034, + "step": 559 + }, + { + "epoch": 0.5632740303011253, + "grad_norm": 22.041905995039123, + "learning_rate": 9.861390565615918e-06, + "loss": 2.5134, + "step": 560 + }, + { + "epoch": 0.5642798767838059, + "grad_norm": 18.943489012480768, + "learning_rate": 9.859154929577466e-06, + "loss": 2.5176, + "step": 561 + }, + { + "epoch": 0.5652857232664864, + "grad_norm": 16.995644349198606, + "learning_rate": 9.856919293539013e-06, + "loss": 2.5364, + "step": 562 + }, + { + "epoch": 0.5662915697491671, + "grad_norm": 18.390254920919176, + "learning_rate": 9.85468365750056e-06, + "loss": 2.5314, + "step": 563 + }, + { + "epoch": 0.5672974162318476, + "grad_norm": 16.152905937236415, + "learning_rate": 9.852448021462106e-06, + "loss": 2.5169, + "step": 564 + }, + { + "epoch": 0.5683032627145282, + "grad_norm": 18.48631801217191, + "learning_rate": 9.850212385423654e-06, + "loss": 2.5293, + "step": 565 + }, + { + "epoch": 0.5693091091972088, + "grad_norm": 17.715384087930207, + "learning_rate": 9.8479767493852e-06, + "loss": 2.5817, + "step": 566 + }, + { + "epoch": 0.5703149556798893, + "grad_norm": 20.06744852384933, + "learning_rate": 9.845741113346749e-06, + "loss": 2.5676, + "step": 567 + }, + { + "epoch": 0.57132080216257, + "grad_norm": 19.29605365339498, + "learning_rate": 9.843505477308296e-06, + "loss": 2.5231, + "step": 568 + }, + { + "epoch": 0.5723266486452505, + "grad_norm": 17.150675872187414, + "learning_rate": 9.841269841269842e-06, + "loss": 2.5448, + "step": 569 + }, + { + "epoch": 0.573332495127931, + "grad_norm": 16.770426880970362, + "learning_rate": 9.839034205231389e-06, + "loss": 2.5418, + "step": 570 + }, + { + "epoch": 0.5743383416106117, + "grad_norm": 18.415349918817537, + "learning_rate": 9.836798569192935e-06, + "loss": 2.5525, + "step": 571 + }, + { + "epoch": 0.5753441880932922, + "grad_norm": 17.722420126001705, + "learning_rate": 9.834562933154484e-06, + "loss": 2.5643, + "step": 572 + }, + { + "epoch": 0.5763500345759729, + "grad_norm": 17.293290429304342, + "learning_rate": 9.83232729711603e-06, + "loss": 2.5361, + "step": 573 + }, + { + "epoch": 0.5773558810586534, + "grad_norm": 61.680906816389644, + "learning_rate": 9.830091661077578e-06, + "loss": 2.5717, + "step": 574 + }, + { + "epoch": 0.578361727541334, + "grad_norm": 22.24029178280409, + "learning_rate": 9.827856025039125e-06, + "loss": 2.5664, + "step": 575 + }, + { + "epoch": 0.5793675740240146, + "grad_norm": 19.513909304611108, + "learning_rate": 9.825620389000671e-06, + "loss": 2.48, + "step": 576 + }, + { + "epoch": 0.5803734205066952, + "grad_norm": 19.514919116226682, + "learning_rate": 9.823384752962218e-06, + "loss": 2.5554, + "step": 577 + }, + { + "epoch": 0.5813792669893757, + "grad_norm": 19.359797198083314, + "learning_rate": 9.821149116923766e-06, + "loss": 2.5304, + "step": 578 + }, + { + "epoch": 0.5823851134720563, + "grad_norm": 17.480705109387433, + "learning_rate": 9.818913480885313e-06, + "loss": 2.5691, + "step": 579 + }, + { + "epoch": 0.5833909599547369, + "grad_norm": 17.938688964369206, + "learning_rate": 9.816677844846861e-06, + "loss": 2.5095, + "step": 580 + }, + { + "epoch": 0.5843968064374175, + "grad_norm": 20.7902045613152, + "learning_rate": 9.814442208808408e-06, + "loss": 2.5317, + "step": 581 + }, + { + "epoch": 0.5854026529200981, + "grad_norm": 16.80739441378905, + "learning_rate": 9.812206572769954e-06, + "loss": 2.5412, + "step": 582 + }, + { + "epoch": 0.5864084994027786, + "grad_norm": 19.170545572301233, + "learning_rate": 9.8099709367315e-06, + "loss": 2.5462, + "step": 583 + }, + { + "epoch": 0.5874143458854593, + "grad_norm": 17.819113412411937, + "learning_rate": 9.807735300693047e-06, + "loss": 2.5127, + "step": 584 + }, + { + "epoch": 0.5884201923681398, + "grad_norm": 22.38069113233498, + "learning_rate": 9.805499664654596e-06, + "loss": 2.5908, + "step": 585 + }, + { + "epoch": 0.5894260388508203, + "grad_norm": 18.672856432624258, + "learning_rate": 9.803264028616142e-06, + "loss": 2.5348, + "step": 586 + }, + { + "epoch": 0.590431885333501, + "grad_norm": 18.638852489711283, + "learning_rate": 9.801028392577689e-06, + "loss": 2.5037, + "step": 587 + }, + { + "epoch": 0.5914377318161815, + "grad_norm": 19.540961306703963, + "learning_rate": 9.798792756539235e-06, + "loss": 2.5536, + "step": 588 + }, + { + "epoch": 0.5924435782988622, + "grad_norm": 19.04026220736335, + "learning_rate": 9.796557120500783e-06, + "loss": 2.5337, + "step": 589 + }, + { + "epoch": 0.5934494247815427, + "grad_norm": 17.93807746202833, + "learning_rate": 9.79432148446233e-06, + "loss": 2.4845, + "step": 590 + }, + { + "epoch": 0.5944552712642233, + "grad_norm": 19.252745829909692, + "learning_rate": 9.792085848423878e-06, + "loss": 2.5767, + "step": 591 + }, + { + "epoch": 0.5954611177469039, + "grad_norm": 20.02025664672543, + "learning_rate": 9.789850212385425e-06, + "loss": 2.5426, + "step": 592 + }, + { + "epoch": 0.5964669642295845, + "grad_norm": 18.93799725756257, + "learning_rate": 9.787614576346971e-06, + "loss": 2.4909, + "step": 593 + }, + { + "epoch": 0.597472810712265, + "grad_norm": 17.119830462848988, + "learning_rate": 9.785378940308518e-06, + "loss": 2.5408, + "step": 594 + }, + { + "epoch": 0.5984786571949456, + "grad_norm": 19.594325441428992, + "learning_rate": 9.783143304270065e-06, + "loss": 2.5078, + "step": 595 + }, + { + "epoch": 0.5994845036776262, + "grad_norm": 17.676742045845327, + "learning_rate": 9.780907668231613e-06, + "loss": 2.554, + "step": 596 + }, + { + "epoch": 0.6004903501603068, + "grad_norm": 18.41536510551728, + "learning_rate": 9.77867203219316e-06, + "loss": 2.5327, + "step": 597 + }, + { + "epoch": 0.6014961966429874, + "grad_norm": 18.51209433223085, + "learning_rate": 9.776436396154708e-06, + "loss": 2.54, + "step": 598 + }, + { + "epoch": 0.6025020431256679, + "grad_norm": 16.521318221902543, + "learning_rate": 9.774200760116254e-06, + "loss": 2.569, + "step": 599 + }, + { + "epoch": 0.6035078896083486, + "grad_norm": 16.079590282276442, + "learning_rate": 9.7719651240778e-06, + "loss": 2.5368, + "step": 600 + }, + { + "epoch": 0.6045137360910291, + "grad_norm": 20.23619630960961, + "learning_rate": 9.769729488039347e-06, + "loss": 2.5224, + "step": 601 + }, + { + "epoch": 0.6055195825737096, + "grad_norm": 17.54702826113766, + "learning_rate": 9.767493852000896e-06, + "loss": 2.5813, + "step": 602 + }, + { + "epoch": 0.6065254290563903, + "grad_norm": 19.13355798517692, + "learning_rate": 9.765258215962442e-06, + "loss": 2.5462, + "step": 603 + }, + { + "epoch": 0.6075312755390708, + "grad_norm": 16.431191732101087, + "learning_rate": 9.76302257992399e-06, + "loss": 2.5439, + "step": 604 + }, + { + "epoch": 0.6085371220217515, + "grad_norm": 18.408948713655214, + "learning_rate": 9.760786943885537e-06, + "loss": 2.5589, + "step": 605 + }, + { + "epoch": 0.609542968504432, + "grad_norm": 17.712239889116066, + "learning_rate": 9.758551307847083e-06, + "loss": 2.476, + "step": 606 + }, + { + "epoch": 0.6105488149871126, + "grad_norm": 20.013392068620888, + "learning_rate": 9.75631567180863e-06, + "loss": 2.5947, + "step": 607 + }, + { + "epoch": 0.6115546614697932, + "grad_norm": 20.423121099411897, + "learning_rate": 9.754080035770177e-06, + "loss": 2.611, + "step": 608 + }, + { + "epoch": 0.6125605079524737, + "grad_norm": 20.404380298452843, + "learning_rate": 9.751844399731725e-06, + "loss": 2.5225, + "step": 609 + }, + { + "epoch": 0.6135663544351543, + "grad_norm": 23.64833980844967, + "learning_rate": 9.749608763693271e-06, + "loss": 2.4801, + "step": 610 + }, + { + "epoch": 0.6145722009178349, + "grad_norm": 22.721372237492865, + "learning_rate": 9.74737312765482e-06, + "loss": 2.5072, + "step": 611 + }, + { + "epoch": 0.6155780474005155, + "grad_norm": 21.485471706831824, + "learning_rate": 9.745137491616366e-06, + "loss": 2.4746, + "step": 612 + }, + { + "epoch": 0.6165838938831961, + "grad_norm": 18.77121558736787, + "learning_rate": 9.742901855577913e-06, + "loss": 2.5336, + "step": 613 + }, + { + "epoch": 0.6175897403658767, + "grad_norm": 19.085989559745865, + "learning_rate": 9.74066621953946e-06, + "loss": 2.5171, + "step": 614 + }, + { + "epoch": 0.6185955868485572, + "grad_norm": 20.870416879493348, + "learning_rate": 9.738430583501008e-06, + "loss": 2.5151, + "step": 615 + }, + { + "epoch": 0.6196014333312378, + "grad_norm": 21.14090139438579, + "learning_rate": 9.736194947462554e-06, + "loss": 2.5166, + "step": 616 + }, + { + "epoch": 0.6206072798139184, + "grad_norm": 19.01841862253812, + "learning_rate": 9.7339593114241e-06, + "loss": 2.5681, + "step": 617 + }, + { + "epoch": 0.6216131262965989, + "grad_norm": 21.22073856960854, + "learning_rate": 9.731723675385647e-06, + "loss": 2.5043, + "step": 618 + }, + { + "epoch": 0.6226189727792796, + "grad_norm": 17.725436424684155, + "learning_rate": 9.729488039347194e-06, + "loss": 2.5145, + "step": 619 + }, + { + "epoch": 0.6236248192619601, + "grad_norm": 20.751400818923063, + "learning_rate": 9.727252403308742e-06, + "loss": 2.4691, + "step": 620 + }, + { + "epoch": 0.6246306657446408, + "grad_norm": 22.573815319781055, + "learning_rate": 9.725016767270289e-06, + "loss": 2.562, + "step": 621 + }, + { + "epoch": 0.6256365122273213, + "grad_norm": 17.987265540277342, + "learning_rate": 9.722781131231837e-06, + "loss": 2.5302, + "step": 622 + }, + { + "epoch": 0.6266423587100018, + "grad_norm": 17.03654847709092, + "learning_rate": 9.720545495193383e-06, + "loss": 2.4943, + "step": 623 + }, + { + "epoch": 0.6276482051926825, + "grad_norm": 21.47031641830582, + "learning_rate": 9.71830985915493e-06, + "loss": 2.5616, + "step": 624 + }, + { + "epoch": 0.628654051675363, + "grad_norm": 20.301768991814786, + "learning_rate": 9.716074223116477e-06, + "loss": 2.5403, + "step": 625 + }, + { + "epoch": 0.6296598981580436, + "grad_norm": 20.183945617233114, + "learning_rate": 9.713838587078025e-06, + "loss": 2.4984, + "step": 626 + }, + { + "epoch": 0.6306657446407242, + "grad_norm": 18.087166072771478, + "learning_rate": 9.711602951039571e-06, + "loss": 2.5457, + "step": 627 + }, + { + "epoch": 0.6316715911234048, + "grad_norm": 18.497985162549405, + "learning_rate": 9.709367315001118e-06, + "loss": 2.5379, + "step": 628 + }, + { + "epoch": 0.6326774376060854, + "grad_norm": 25.906471020026725, + "learning_rate": 9.707131678962666e-06, + "loss": 2.5112, + "step": 629 + }, + { + "epoch": 0.633683284088766, + "grad_norm": 18.851737821574062, + "learning_rate": 9.704896042924213e-06, + "loss": 2.5183, + "step": 630 + }, + { + "epoch": 0.6346891305714465, + "grad_norm": 19.34370501708874, + "learning_rate": 9.70266040688576e-06, + "loss": 2.528, + "step": 631 + }, + { + "epoch": 0.6356949770541271, + "grad_norm": 17.46671844948673, + "learning_rate": 9.700424770847306e-06, + "loss": 2.4713, + "step": 632 + }, + { + "epoch": 0.6367008235368077, + "grad_norm": 15.857669621477335, + "learning_rate": 9.698189134808854e-06, + "loss": 2.5421, + "step": 633 + }, + { + "epoch": 0.6377066700194883, + "grad_norm": 20.690235507938166, + "learning_rate": 9.6959534987704e-06, + "loss": 2.5477, + "step": 634 + }, + { + "epoch": 0.6387125165021689, + "grad_norm": 17.75939120748436, + "learning_rate": 9.693717862731949e-06, + "loss": 2.4967, + "step": 635 + }, + { + "epoch": 0.6397183629848494, + "grad_norm": 20.86052880385186, + "learning_rate": 9.691482226693496e-06, + "loss": 2.5144, + "step": 636 + }, + { + "epoch": 0.6407242094675301, + "grad_norm": 21.504493399150405, + "learning_rate": 9.689246590655042e-06, + "loss": 2.4755, + "step": 637 + }, + { + "epoch": 0.6417300559502106, + "grad_norm": 19.2512677338903, + "learning_rate": 9.687010954616589e-06, + "loss": 2.5303, + "step": 638 + }, + { + "epoch": 0.6427359024328911, + "grad_norm": 20.932342218853474, + "learning_rate": 9.684775318578137e-06, + "loss": 2.4933, + "step": 639 + }, + { + "epoch": 0.6437417489155718, + "grad_norm": 21.530735790573438, + "learning_rate": 9.682539682539683e-06, + "loss": 2.5472, + "step": 640 + }, + { + "epoch": 0.6447475953982523, + "grad_norm": 19.3718729621729, + "learning_rate": 9.68030404650123e-06, + "loss": 2.5117, + "step": 641 + }, + { + "epoch": 0.645753441880933, + "grad_norm": 25.819881974982856, + "learning_rate": 9.678068410462778e-06, + "loss": 2.4887, + "step": 642 + }, + { + "epoch": 0.6467592883636135, + "grad_norm": 19.999886193848916, + "learning_rate": 9.675832774424325e-06, + "loss": 2.5082, + "step": 643 + }, + { + "epoch": 0.647765134846294, + "grad_norm": 21.357372449932193, + "learning_rate": 9.673597138385871e-06, + "loss": 2.4743, + "step": 644 + }, + { + "epoch": 0.6487709813289747, + "grad_norm": 24.86568125849384, + "learning_rate": 9.671361502347418e-06, + "loss": 2.5181, + "step": 645 + }, + { + "epoch": 0.6497768278116552, + "grad_norm": 19.99327736605139, + "learning_rate": 9.669125866308966e-06, + "loss": 2.5238, + "step": 646 + }, + { + "epoch": 0.6507826742943358, + "grad_norm": 22.801398555696743, + "learning_rate": 9.666890230270513e-06, + "loss": 2.5067, + "step": 647 + }, + { + "epoch": 0.6517885207770164, + "grad_norm": 19.838552809919648, + "learning_rate": 9.664654594232061e-06, + "loss": 2.4869, + "step": 648 + }, + { + "epoch": 0.652794367259697, + "grad_norm": 17.032352556256118, + "learning_rate": 9.662418958193608e-06, + "loss": 2.5488, + "step": 649 + }, + { + "epoch": 0.6538002137423776, + "grad_norm": 22.057043086256787, + "learning_rate": 9.660183322155154e-06, + "loss": 2.5637, + "step": 650 + }, + { + "epoch": 0.6548060602250582, + "grad_norm": 19.929726327790743, + "learning_rate": 9.6579476861167e-06, + "loss": 2.5389, + "step": 651 + }, + { + "epoch": 0.6558119067077387, + "grad_norm": 18.58065737479155, + "learning_rate": 9.655712050078247e-06, + "loss": 2.4967, + "step": 652 + }, + { + "epoch": 0.6568177531904194, + "grad_norm": 17.549376553583564, + "learning_rate": 9.653476414039795e-06, + "loss": 2.476, + "step": 653 + }, + { + "epoch": 0.6578235996730999, + "grad_norm": 20.240240936791928, + "learning_rate": 9.651240778001342e-06, + "loss": 2.5597, + "step": 654 + }, + { + "epoch": 0.6588294461557804, + "grad_norm": 21.625962244270248, + "learning_rate": 9.649005141962889e-06, + "loss": 2.4762, + "step": 655 + }, + { + "epoch": 0.6598352926384611, + "grad_norm": 21.173956358162684, + "learning_rate": 9.646769505924435e-06, + "loss": 2.5553, + "step": 656 + }, + { + "epoch": 0.6608411391211416, + "grad_norm": 18.05355685245098, + "learning_rate": 9.644533869885983e-06, + "loss": 2.5135, + "step": 657 + }, + { + "epoch": 0.6618469856038223, + "grad_norm": 21.37207774541577, + "learning_rate": 9.64229823384753e-06, + "loss": 2.5101, + "step": 658 + }, + { + "epoch": 0.6628528320865028, + "grad_norm": 16.35779599381775, + "learning_rate": 9.640062597809078e-06, + "loss": 2.4552, + "step": 659 + }, + { + "epoch": 0.6638586785691833, + "grad_norm": 17.570702404376508, + "learning_rate": 9.637826961770625e-06, + "loss": 2.5121, + "step": 660 + }, + { + "epoch": 0.664864525051864, + "grad_norm": 18.06740688331599, + "learning_rate": 9.635591325732171e-06, + "loss": 2.4734, + "step": 661 + }, + { + "epoch": 0.6658703715345445, + "grad_norm": 17.64673278928389, + "learning_rate": 9.633355689693718e-06, + "loss": 2.5011, + "step": 662 + }, + { + "epoch": 0.6668762180172251, + "grad_norm": 16.843625112610805, + "learning_rate": 9.631120053655266e-06, + "loss": 2.5188, + "step": 663 + }, + { + "epoch": 0.6678820644999057, + "grad_norm": 16.859967248687216, + "learning_rate": 9.628884417616813e-06, + "loss": 2.5267, + "step": 664 + }, + { + "epoch": 0.6688879109825863, + "grad_norm": 17.160716469928374, + "learning_rate": 9.62664878157836e-06, + "loss": 2.5312, + "step": 665 + }, + { + "epoch": 0.6698937574652669, + "grad_norm": 18.81047677543676, + "learning_rate": 9.624413145539908e-06, + "loss": 2.514, + "step": 666 + }, + { + "epoch": 0.6708996039479475, + "grad_norm": 17.03092392882562, + "learning_rate": 9.622177509501454e-06, + "loss": 2.5182, + "step": 667 + }, + { + "epoch": 0.671905450430628, + "grad_norm": 22.031292359972962, + "learning_rate": 9.619941873463e-06, + "loss": 2.4933, + "step": 668 + }, + { + "epoch": 0.6729112969133086, + "grad_norm": 15.894396840819647, + "learning_rate": 9.617706237424547e-06, + "loss": 2.4675, + "step": 669 + }, + { + "epoch": 0.6739171433959892, + "grad_norm": 16.78761067447768, + "learning_rate": 9.615470601386095e-06, + "loss": 2.5581, + "step": 670 + }, + { + "epoch": 0.6749229898786697, + "grad_norm": 17.818343229735852, + "learning_rate": 9.613234965347642e-06, + "loss": 2.5065, + "step": 671 + }, + { + "epoch": 0.6759288363613504, + "grad_norm": 17.871155303653666, + "learning_rate": 9.61099932930919e-06, + "loss": 2.511, + "step": 672 + }, + { + "epoch": 0.6769346828440309, + "grad_norm": 19.612335334446225, + "learning_rate": 9.608763693270737e-06, + "loss": 2.5465, + "step": 673 + }, + { + "epoch": 0.6779405293267116, + "grad_norm": 20.97735200183468, + "learning_rate": 9.606528057232283e-06, + "loss": 2.5658, + "step": 674 + }, + { + "epoch": 0.6789463758093921, + "grad_norm": 23.948339016531275, + "learning_rate": 9.60429242119383e-06, + "loss": 2.5125, + "step": 675 + }, + { + "epoch": 0.6799522222920726, + "grad_norm": 20.0467714662731, + "learning_rate": 9.602056785155377e-06, + "loss": 2.5191, + "step": 676 + }, + { + "epoch": 0.6809580687747533, + "grad_norm": 19.760663718030475, + "learning_rate": 9.599821149116925e-06, + "loss": 2.5257, + "step": 677 + }, + { + "epoch": 0.6819639152574338, + "grad_norm": 17.66847669368681, + "learning_rate": 9.597585513078471e-06, + "loss": 2.4735, + "step": 678 + }, + { + "epoch": 0.6829697617401144, + "grad_norm": 20.304213959727797, + "learning_rate": 9.59534987704002e-06, + "loss": 2.5135, + "step": 679 + }, + { + "epoch": 0.683975608222795, + "grad_norm": 16.874889043585817, + "learning_rate": 9.593114241001566e-06, + "loss": 2.5537, + "step": 680 + }, + { + "epoch": 0.6849814547054756, + "grad_norm": 17.391916117706774, + "learning_rate": 9.590878604963113e-06, + "loss": 2.5516, + "step": 681 + }, + { + "epoch": 0.6859873011881562, + "grad_norm": 16.97825687844495, + "learning_rate": 9.58864296892466e-06, + "loss": 2.5205, + "step": 682 + }, + { + "epoch": 0.6869931476708367, + "grad_norm": 17.746056983891144, + "learning_rate": 9.586407332886208e-06, + "loss": 2.4779, + "step": 683 + }, + { + "epoch": 0.6879989941535173, + "grad_norm": 21.91146086074379, + "learning_rate": 9.584171696847754e-06, + "loss": 2.5713, + "step": 684 + }, + { + "epoch": 0.6890048406361979, + "grad_norm": 22.158541573895086, + "learning_rate": 9.581936060809302e-06, + "loss": 2.4892, + "step": 685 + }, + { + "epoch": 0.6900106871188785, + "grad_norm": 17.263290599415026, + "learning_rate": 9.579700424770847e-06, + "loss": 2.4782, + "step": 686 + }, + { + "epoch": 0.691016533601559, + "grad_norm": 18.949281510467618, + "learning_rate": 9.577464788732394e-06, + "loss": 2.4675, + "step": 687 + }, + { + "epoch": 0.6920223800842397, + "grad_norm": 21.84072077538254, + "learning_rate": 9.575229152693942e-06, + "loss": 2.5219, + "step": 688 + }, + { + "epoch": 0.6930282265669202, + "grad_norm": 22.45437820067143, + "learning_rate": 9.572993516655489e-06, + "loss": 2.5034, + "step": 689 + }, + { + "epoch": 0.6940340730496009, + "grad_norm": 18.766091949144645, + "learning_rate": 9.570757880617037e-06, + "loss": 2.5564, + "step": 690 + }, + { + "epoch": 0.6950399195322814, + "grad_norm": 21.409093756509865, + "learning_rate": 9.568522244578583e-06, + "loss": 2.537, + "step": 691 + }, + { + "epoch": 0.6960457660149619, + "grad_norm": 24.96254769951716, + "learning_rate": 9.56628660854013e-06, + "loss": 2.5363, + "step": 692 + }, + { + "epoch": 0.6970516124976426, + "grad_norm": 18.60658988302085, + "learning_rate": 9.564050972501677e-06, + "loss": 2.5549, + "step": 693 + }, + { + "epoch": 0.6980574589803231, + "grad_norm": 18.613218112954755, + "learning_rate": 9.561815336463225e-06, + "loss": 2.4987, + "step": 694 + }, + { + "epoch": 0.6990633054630037, + "grad_norm": 22.66026247971019, + "learning_rate": 9.559579700424771e-06, + "loss": 2.5338, + "step": 695 + }, + { + "epoch": 0.7000691519456843, + "grad_norm": 17.804777772482662, + "learning_rate": 9.55734406438632e-06, + "loss": 2.5652, + "step": 696 + }, + { + "epoch": 0.7010749984283648, + "grad_norm": 19.80514523411522, + "learning_rate": 9.555108428347866e-06, + "loss": 2.501, + "step": 697 + }, + { + "epoch": 0.7020808449110455, + "grad_norm": 18.920232209491584, + "learning_rate": 9.552872792309413e-06, + "loss": 2.4585, + "step": 698 + }, + { + "epoch": 0.703086691393726, + "grad_norm": 20.69586994307614, + "learning_rate": 9.55063715627096e-06, + "loss": 2.5144, + "step": 699 + }, + { + "epoch": 0.7040925378764066, + "grad_norm": 20.291013143429755, + "learning_rate": 9.548401520232506e-06, + "loss": 2.4781, + "step": 700 + }, + { + "epoch": 0.7050983843590872, + "grad_norm": 18.869940922356157, + "learning_rate": 9.546165884194054e-06, + "loss": 2.5295, + "step": 701 + }, + { + "epoch": 0.7061042308417678, + "grad_norm": 20.896359569542287, + "learning_rate": 9.5439302481556e-06, + "loss": 2.5302, + "step": 702 + }, + { + "epoch": 0.7071100773244483, + "grad_norm": 20.082650895679876, + "learning_rate": 9.541694612117149e-06, + "loss": 2.5929, + "step": 703 + }, + { + "epoch": 0.708115923807129, + "grad_norm": 20.467778705563482, + "learning_rate": 9.539458976078695e-06, + "loss": 2.5059, + "step": 704 + }, + { + "epoch": 0.7091217702898095, + "grad_norm": 15.681829621018034, + "learning_rate": 9.537223340040242e-06, + "loss": 2.5129, + "step": 705 + }, + { + "epoch": 0.7101276167724901, + "grad_norm": 20.133542563502154, + "learning_rate": 9.534987704001789e-06, + "loss": 2.5555, + "step": 706 + }, + { + "epoch": 0.7111334632551707, + "grad_norm": 21.145027241531068, + "learning_rate": 9.532752067963337e-06, + "loss": 2.5357, + "step": 707 + }, + { + "epoch": 0.7121393097378512, + "grad_norm": 17.53550654089157, + "learning_rate": 9.530516431924883e-06, + "loss": 2.4717, + "step": 708 + }, + { + "epoch": 0.7131451562205319, + "grad_norm": 20.65677074422556, + "learning_rate": 9.528280795886432e-06, + "loss": 2.513, + "step": 709 + }, + { + "epoch": 0.7141510027032124, + "grad_norm": 21.89801877836519, + "learning_rate": 9.526045159847978e-06, + "loss": 2.5101, + "step": 710 + }, + { + "epoch": 0.715156849185893, + "grad_norm": 21.26634341268915, + "learning_rate": 9.523809523809525e-06, + "loss": 2.5282, + "step": 711 + }, + { + "epoch": 0.7161626956685736, + "grad_norm": 19.627607255985847, + "learning_rate": 9.521573887771071e-06, + "loss": 2.3997, + "step": 712 + }, + { + "epoch": 0.7171685421512541, + "grad_norm": 17.518544399386524, + "learning_rate": 9.519338251732618e-06, + "loss": 2.502, + "step": 713 + }, + { + "epoch": 0.7181743886339348, + "grad_norm": 20.25482589759302, + "learning_rate": 9.517102615694166e-06, + "loss": 2.5121, + "step": 714 + }, + { + "epoch": 0.7191802351166153, + "grad_norm": 20.615452010565395, + "learning_rate": 9.514866979655713e-06, + "loss": 2.5037, + "step": 715 + }, + { + "epoch": 0.7201860815992959, + "grad_norm": 16.66160935051711, + "learning_rate": 9.512631343617261e-06, + "loss": 2.5062, + "step": 716 + }, + { + "epoch": 0.7211919280819765, + "grad_norm": 19.14572503732978, + "learning_rate": 9.510395707578807e-06, + "loss": 2.4963, + "step": 717 + }, + { + "epoch": 0.7221977745646571, + "grad_norm": 19.59250241873381, + "learning_rate": 9.508160071540354e-06, + "loss": 2.4692, + "step": 718 + }, + { + "epoch": 0.7232036210473376, + "grad_norm": 17.65339946945576, + "learning_rate": 9.5059244355019e-06, + "loss": 2.4886, + "step": 719 + }, + { + "epoch": 0.7242094675300182, + "grad_norm": 17.059073253679152, + "learning_rate": 9.503688799463449e-06, + "loss": 2.5175, + "step": 720 + }, + { + "epoch": 0.7252153140126988, + "grad_norm": 16.074504376623388, + "learning_rate": 9.501453163424995e-06, + "loss": 2.5276, + "step": 721 + }, + { + "epoch": 0.7262211604953794, + "grad_norm": 19.279211211820908, + "learning_rate": 9.499217527386542e-06, + "loss": 2.5111, + "step": 722 + }, + { + "epoch": 0.72722700697806, + "grad_norm": 18.124284123662274, + "learning_rate": 9.496981891348089e-06, + "loss": 2.5347, + "step": 723 + }, + { + "epoch": 0.7282328534607405, + "grad_norm": 17.712027375920236, + "learning_rate": 9.494746255309635e-06, + "loss": 2.5076, + "step": 724 + }, + { + "epoch": 0.7292386999434212, + "grad_norm": 19.28157010260716, + "learning_rate": 9.492510619271183e-06, + "loss": 2.5033, + "step": 725 + }, + { + "epoch": 0.7302445464261017, + "grad_norm": 19.61185471148692, + "learning_rate": 9.49027498323273e-06, + "loss": 2.4904, + "step": 726 + }, + { + "epoch": 0.7312503929087822, + "grad_norm": 17.408335083465722, + "learning_rate": 9.488039347194278e-06, + "loss": 2.5597, + "step": 727 + }, + { + "epoch": 0.7322562393914629, + "grad_norm": 17.951454457586962, + "learning_rate": 9.485803711155825e-06, + "loss": 2.5006, + "step": 728 + }, + { + "epoch": 0.7332620858741434, + "grad_norm": 20.671307219350894, + "learning_rate": 9.483568075117371e-06, + "loss": 2.5391, + "step": 729 + }, + { + "epoch": 0.7342679323568241, + "grad_norm": 25.277098409010296, + "learning_rate": 9.481332439078918e-06, + "loss": 2.5179, + "step": 730 + }, + { + "epoch": 0.7352737788395046, + "grad_norm": 24.093563555760248, + "learning_rate": 9.479096803040466e-06, + "loss": 2.5197, + "step": 731 + }, + { + "epoch": 0.7362796253221852, + "grad_norm": 19.069594753082438, + "learning_rate": 9.476861167002013e-06, + "loss": 2.5314, + "step": 732 + }, + { + "epoch": 0.7372854718048658, + "grad_norm": 25.159064679345477, + "learning_rate": 9.474625530963561e-06, + "loss": 2.5633, + "step": 733 + }, + { + "epoch": 0.7382913182875463, + "grad_norm": 19.255862388290833, + "learning_rate": 9.472389894925107e-06, + "loss": 2.5535, + "step": 734 + }, + { + "epoch": 0.7392971647702269, + "grad_norm": 20.15846805260267, + "learning_rate": 9.470154258886654e-06, + "loss": 2.5167, + "step": 735 + }, + { + "epoch": 0.7403030112529075, + "grad_norm": 17.05788614121942, + "learning_rate": 9.4679186228482e-06, + "loss": 2.5146, + "step": 736 + }, + { + "epoch": 0.7413088577355881, + "grad_norm": 20.346251298426235, + "learning_rate": 9.465682986809747e-06, + "loss": 2.4956, + "step": 737 + }, + { + "epoch": 0.7423147042182687, + "grad_norm": 17.004794004812634, + "learning_rate": 9.463447350771295e-06, + "loss": 2.5642, + "step": 738 + }, + { + "epoch": 0.7433205507009493, + "grad_norm": 20.394427656456934, + "learning_rate": 9.461211714732842e-06, + "loss": 2.495, + "step": 739 + }, + { + "epoch": 0.7443263971836298, + "grad_norm": 19.281712523889723, + "learning_rate": 9.45897607869439e-06, + "loss": 2.4582, + "step": 740 + }, + { + "epoch": 0.7453322436663105, + "grad_norm": 20.066929363942144, + "learning_rate": 9.456740442655937e-06, + "loss": 2.4908, + "step": 741 + }, + { + "epoch": 0.746338090148991, + "grad_norm": 18.593791504643672, + "learning_rate": 9.454504806617483e-06, + "loss": 2.5306, + "step": 742 + }, + { + "epoch": 0.7473439366316716, + "grad_norm": 19.254184430311664, + "learning_rate": 9.45226917057903e-06, + "loss": 2.5237, + "step": 743 + }, + { + "epoch": 0.7483497831143522, + "grad_norm": 20.26496937620349, + "learning_rate": 9.450033534540578e-06, + "loss": 2.5351, + "step": 744 + }, + { + "epoch": 0.7493556295970327, + "grad_norm": 19.52286145145994, + "learning_rate": 9.447797898502125e-06, + "loss": 2.507, + "step": 745 + }, + { + "epoch": 0.7503614760797134, + "grad_norm": 17.098379884990067, + "learning_rate": 9.445562262463671e-06, + "loss": 2.4855, + "step": 746 + }, + { + "epoch": 0.7513673225623939, + "grad_norm": 19.88240078373313, + "learning_rate": 9.44332662642522e-06, + "loss": 2.4897, + "step": 747 + }, + { + "epoch": 0.7523731690450745, + "grad_norm": 18.15902628291973, + "learning_rate": 9.441090990386766e-06, + "loss": 2.4605, + "step": 748 + }, + { + "epoch": 0.7533790155277551, + "grad_norm": 18.379420609505242, + "learning_rate": 9.438855354348313e-06, + "loss": 2.5071, + "step": 749 + }, + { + "epoch": 0.7543848620104356, + "grad_norm": 19.759355915890186, + "learning_rate": 9.43661971830986e-06, + "loss": 2.5374, + "step": 750 + }, + { + "epoch": 0.7553907084931163, + "grad_norm": 20.846426367756592, + "learning_rate": 9.434384082271407e-06, + "loss": 2.4438, + "step": 751 + }, + { + "epoch": 0.7563965549757968, + "grad_norm": 18.569949093964635, + "learning_rate": 9.432148446232954e-06, + "loss": 2.4316, + "step": 752 + }, + { + "epoch": 0.7574024014584774, + "grad_norm": 18.825641211217608, + "learning_rate": 9.429912810194502e-06, + "loss": 2.4787, + "step": 753 + }, + { + "epoch": 0.758408247941158, + "grad_norm": 18.251254587229408, + "learning_rate": 9.427677174156049e-06, + "loss": 2.5326, + "step": 754 + }, + { + "epoch": 0.7594140944238386, + "grad_norm": 19.148609800011037, + "learning_rate": 9.425441538117595e-06, + "loss": 2.4675, + "step": 755 + }, + { + "epoch": 0.7604199409065191, + "grad_norm": 19.62748223343742, + "learning_rate": 9.423205902079142e-06, + "loss": 2.4738, + "step": 756 + }, + { + "epoch": 0.7614257873891997, + "grad_norm": 17.275490176829774, + "learning_rate": 9.42097026604069e-06, + "loss": 2.4384, + "step": 757 + }, + { + "epoch": 0.7624316338718803, + "grad_norm": 20.315078913702827, + "learning_rate": 9.418734630002237e-06, + "loss": 2.4824, + "step": 758 + }, + { + "epoch": 0.7634374803545609, + "grad_norm": 20.45538876387583, + "learning_rate": 9.416498993963783e-06, + "loss": 2.5446, + "step": 759 + }, + { + "epoch": 0.7644433268372415, + "grad_norm": 18.494779020290373, + "learning_rate": 9.41426335792533e-06, + "loss": 2.4971, + "step": 760 + }, + { + "epoch": 0.765449173319922, + "grad_norm": 17.79268503808769, + "learning_rate": 9.412027721886876e-06, + "loss": 2.497, + "step": 761 + }, + { + "epoch": 0.7664550198026027, + "grad_norm": 17.514485728139903, + "learning_rate": 9.409792085848425e-06, + "loss": 2.4943, + "step": 762 + }, + { + "epoch": 0.7674608662852832, + "grad_norm": 17.08742932206588, + "learning_rate": 9.407556449809971e-06, + "loss": 2.5622, + "step": 763 + }, + { + "epoch": 0.7684667127679637, + "grad_norm": 19.17619661894787, + "learning_rate": 9.40532081377152e-06, + "loss": 2.5019, + "step": 764 + }, + { + "epoch": 0.7694725592506444, + "grad_norm": 20.149404401707567, + "learning_rate": 9.403085177733066e-06, + "loss": 2.5323, + "step": 765 + }, + { + "epoch": 0.7704784057333249, + "grad_norm": 19.522637904808622, + "learning_rate": 9.400849541694613e-06, + "loss": 2.5068, + "step": 766 + }, + { + "epoch": 0.7714842522160056, + "grad_norm": 25.35421241795729, + "learning_rate": 9.39861390565616e-06, + "loss": 2.5455, + "step": 767 + }, + { + "epoch": 0.7724900986986861, + "grad_norm": 21.894940996547597, + "learning_rate": 9.396378269617707e-06, + "loss": 2.4854, + "step": 768 + }, + { + "epoch": 0.7734959451813667, + "grad_norm": 17.243790817037375, + "learning_rate": 9.394142633579254e-06, + "loss": 2.4934, + "step": 769 + }, + { + "epoch": 0.7745017916640473, + "grad_norm": 20.051709239288467, + "learning_rate": 9.3919069975408e-06, + "loss": 2.4467, + "step": 770 + }, + { + "epoch": 0.7755076381467279, + "grad_norm": 19.090236927656047, + "learning_rate": 9.389671361502349e-06, + "loss": 2.5965, + "step": 771 + }, + { + "epoch": 0.7765134846294084, + "grad_norm": 21.659867988546495, + "learning_rate": 9.387435725463895e-06, + "loss": 2.4831, + "step": 772 + }, + { + "epoch": 0.777519331112089, + "grad_norm": 22.068674111225377, + "learning_rate": 9.385200089425442e-06, + "loss": 2.4838, + "step": 773 + }, + { + "epoch": 0.7785251775947696, + "grad_norm": 20.808316291400047, + "learning_rate": 9.382964453386989e-06, + "loss": 2.5204, + "step": 774 + }, + { + "epoch": 0.7795310240774502, + "grad_norm": 25.226727704140117, + "learning_rate": 9.380728817348537e-06, + "loss": 2.4974, + "step": 775 + }, + { + "epoch": 0.7805368705601308, + "grad_norm": 27.240953523535676, + "learning_rate": 9.378493181310083e-06, + "loss": 2.5006, + "step": 776 + }, + { + "epoch": 0.7815427170428113, + "grad_norm": 20.276309928518156, + "learning_rate": 9.376257545271632e-06, + "loss": 2.4803, + "step": 777 + }, + { + "epoch": 0.782548563525492, + "grad_norm": 20.57402971691398, + "learning_rate": 9.374021909233178e-06, + "loss": 2.5833, + "step": 778 + }, + { + "epoch": 0.7835544100081725, + "grad_norm": 19.581526916692166, + "learning_rate": 9.371786273194725e-06, + "loss": 2.5137, + "step": 779 + }, + { + "epoch": 0.784560256490853, + "grad_norm": 17.050469507343866, + "learning_rate": 9.369550637156271e-06, + "loss": 2.4201, + "step": 780 + }, + { + "epoch": 0.7855661029735337, + "grad_norm": 20.7042448692395, + "learning_rate": 9.36731500111782e-06, + "loss": 2.4744, + "step": 781 + }, + { + "epoch": 0.7865719494562142, + "grad_norm": 20.411291333400605, + "learning_rate": 9.365079365079366e-06, + "loss": 2.5243, + "step": 782 + }, + { + "epoch": 0.7875777959388949, + "grad_norm": 18.757539515639955, + "learning_rate": 9.362843729040913e-06, + "loss": 2.4651, + "step": 783 + }, + { + "epoch": 0.7885836424215754, + "grad_norm": 18.517065201543332, + "learning_rate": 9.360608093002461e-06, + "loss": 2.5381, + "step": 784 + }, + { + "epoch": 0.789589488904256, + "grad_norm": 15.998281316407619, + "learning_rate": 9.358372456964007e-06, + "loss": 2.4589, + "step": 785 + }, + { + "epoch": 0.7905953353869366, + "grad_norm": 19.321999707357065, + "learning_rate": 9.356136820925554e-06, + "loss": 2.5073, + "step": 786 + }, + { + "epoch": 0.7916011818696171, + "grad_norm": 17.3471900742081, + "learning_rate": 9.3539011848871e-06, + "loss": 2.5422, + "step": 787 + }, + { + "epoch": 0.7926070283522977, + "grad_norm": 18.923681325265438, + "learning_rate": 9.351665548848649e-06, + "loss": 2.5227, + "step": 788 + }, + { + "epoch": 0.7936128748349783, + "grad_norm": 15.265177415266567, + "learning_rate": 9.349429912810195e-06, + "loss": 2.4649, + "step": 789 + }, + { + "epoch": 0.7946187213176589, + "grad_norm": 16.530611307751233, + "learning_rate": 9.347194276771742e-06, + "loss": 2.4888, + "step": 790 + }, + { + "epoch": 0.7956245678003395, + "grad_norm": 17.95202578745719, + "learning_rate": 9.344958640733288e-06, + "loss": 2.5187, + "step": 791 + }, + { + "epoch": 0.7966304142830201, + "grad_norm": 16.57212408050738, + "learning_rate": 9.342723004694837e-06, + "loss": 2.4524, + "step": 792 + }, + { + "epoch": 0.7976362607657006, + "grad_norm": 17.77570862737382, + "learning_rate": 9.340487368656383e-06, + "loss": 2.5125, + "step": 793 + }, + { + "epoch": 0.7986421072483812, + "grad_norm": 17.09498814803331, + "learning_rate": 9.33825173261793e-06, + "loss": 2.5238, + "step": 794 + }, + { + "epoch": 0.7996479537310618, + "grad_norm": 16.637506590364993, + "learning_rate": 9.336016096579478e-06, + "loss": 2.5148, + "step": 795 + }, + { + "epoch": 0.8006538002137423, + "grad_norm": 17.260042591014948, + "learning_rate": 9.333780460541025e-06, + "loss": 2.5784, + "step": 796 + }, + { + "epoch": 0.801659646696423, + "grad_norm": 16.262798660851523, + "learning_rate": 9.331544824502571e-06, + "loss": 2.4777, + "step": 797 + }, + { + "epoch": 0.8026654931791035, + "grad_norm": 16.23096951371673, + "learning_rate": 9.329309188464118e-06, + "loss": 2.457, + "step": 798 + }, + { + "epoch": 0.8036713396617842, + "grad_norm": 19.520160951426668, + "learning_rate": 9.327073552425666e-06, + "loss": 2.4871, + "step": 799 + }, + { + "epoch": 0.8046771861444647, + "grad_norm": 19.03726995483599, + "learning_rate": 9.324837916387213e-06, + "loss": 2.4776, + "step": 800 + }, + { + "epoch": 0.8056830326271452, + "grad_norm": 15.94571703867295, + "learning_rate": 9.322602280348761e-06, + "loss": 2.5302, + "step": 801 + }, + { + "epoch": 0.8066888791098259, + "grad_norm": 19.562207263305492, + "learning_rate": 9.320366644310307e-06, + "loss": 2.4405, + "step": 802 + }, + { + "epoch": 0.8076947255925064, + "grad_norm": 18.15166430685149, + "learning_rate": 9.318131008271854e-06, + "loss": 2.499, + "step": 803 + }, + { + "epoch": 0.808700572075187, + "grad_norm": 17.669181056591288, + "learning_rate": 9.3158953722334e-06, + "loss": 2.4982, + "step": 804 + }, + { + "epoch": 0.8097064185578676, + "grad_norm": 16.841416364196867, + "learning_rate": 9.313659736194947e-06, + "loss": 2.4725, + "step": 805 + }, + { + "epoch": 0.8107122650405482, + "grad_norm": 18.08362901347937, + "learning_rate": 9.311424100156495e-06, + "loss": 2.5331, + "step": 806 + }, + { + "epoch": 0.8117181115232288, + "grad_norm": 17.783480259954455, + "learning_rate": 9.309188464118042e-06, + "loss": 2.4996, + "step": 807 + }, + { + "epoch": 0.8127239580059094, + "grad_norm": 16.589835867359405, + "learning_rate": 9.30695282807959e-06, + "loss": 2.4701, + "step": 808 + }, + { + "epoch": 0.8137298044885899, + "grad_norm": 17.676330608733828, + "learning_rate": 9.304717192041137e-06, + "loss": 2.552, + "step": 809 + }, + { + "epoch": 0.8147356509712705, + "grad_norm": 15.886159845740128, + "learning_rate": 9.302481556002683e-06, + "loss": 2.4703, + "step": 810 + }, + { + "epoch": 0.8157414974539511, + "grad_norm": 16.05722427471681, + "learning_rate": 9.30024591996423e-06, + "loss": 2.4908, + "step": 811 + }, + { + "epoch": 0.8167473439366316, + "grad_norm": 17.36535231267961, + "learning_rate": 9.298010283925778e-06, + "loss": 2.503, + "step": 812 + }, + { + "epoch": 0.8177531904193123, + "grad_norm": 16.14465414079727, + "learning_rate": 9.295774647887325e-06, + "loss": 2.5195, + "step": 813 + }, + { + "epoch": 0.8187590369019928, + "grad_norm": 17.089258509215345, + "learning_rate": 9.293539011848873e-06, + "loss": 2.4592, + "step": 814 + }, + { + "epoch": 0.8197648833846735, + "grad_norm": 18.955704714718312, + "learning_rate": 9.29130337581042e-06, + "loss": 2.4564, + "step": 815 + }, + { + "epoch": 0.820770729867354, + "grad_norm": 17.309458597444493, + "learning_rate": 9.289067739771966e-06, + "loss": 2.503, + "step": 816 + }, + { + "epoch": 0.8217765763500345, + "grad_norm": 15.089049062269016, + "learning_rate": 9.286832103733513e-06, + "loss": 2.5347, + "step": 817 + }, + { + "epoch": 0.8227824228327152, + "grad_norm": 19.012802639782954, + "learning_rate": 9.284596467695059e-06, + "loss": 2.4771, + "step": 818 + }, + { + "epoch": 0.8237882693153957, + "grad_norm": 18.156252128847964, + "learning_rate": 9.282360831656607e-06, + "loss": 2.4973, + "step": 819 + }, + { + "epoch": 0.8247941157980763, + "grad_norm": 18.40893210604742, + "learning_rate": 9.280125195618154e-06, + "loss": 2.5712, + "step": 820 + }, + { + "epoch": 0.8257999622807569, + "grad_norm": 16.399612728431613, + "learning_rate": 9.277889559579702e-06, + "loss": 2.4728, + "step": 821 + }, + { + "epoch": 0.8268058087634375, + "grad_norm": 18.67911638017806, + "learning_rate": 9.275653923541249e-06, + "loss": 2.5125, + "step": 822 + }, + { + "epoch": 0.8278116552461181, + "grad_norm": 17.322713351462784, + "learning_rate": 9.273418287502795e-06, + "loss": 2.4476, + "step": 823 + }, + { + "epoch": 0.8288175017287986, + "grad_norm": 18.716866313005497, + "learning_rate": 9.271182651464342e-06, + "loss": 2.4796, + "step": 824 + }, + { + "epoch": 0.8298233482114792, + "grad_norm": 18.646291489781458, + "learning_rate": 9.26894701542589e-06, + "loss": 2.4908, + "step": 825 + }, + { + "epoch": 0.8308291946941598, + "grad_norm": 17.84229204379419, + "learning_rate": 9.266711379387437e-06, + "loss": 2.4703, + "step": 826 + }, + { + "epoch": 0.8318350411768404, + "grad_norm": 18.64370946639297, + "learning_rate": 9.264475743348983e-06, + "loss": 2.4739, + "step": 827 + }, + { + "epoch": 0.8328408876595209, + "grad_norm": 18.348032755338412, + "learning_rate": 9.26224010731053e-06, + "loss": 2.497, + "step": 828 + }, + { + "epoch": 0.8338467341422016, + "grad_norm": 16.841822742120723, + "learning_rate": 9.260004471272076e-06, + "loss": 2.4935, + "step": 829 + }, + { + "epoch": 0.8348525806248821, + "grad_norm": 19.438358318778544, + "learning_rate": 9.257768835233625e-06, + "loss": 2.5429, + "step": 830 + }, + { + "epoch": 0.8358584271075628, + "grad_norm": 18.878343814431652, + "learning_rate": 9.255533199195171e-06, + "loss": 2.4938, + "step": 831 + }, + { + "epoch": 0.8368642735902433, + "grad_norm": 15.766988224446346, + "learning_rate": 9.25329756315672e-06, + "loss": 2.4681, + "step": 832 + }, + { + "epoch": 0.8378701200729238, + "grad_norm": 17.799674012352813, + "learning_rate": 9.251061927118266e-06, + "loss": 2.5213, + "step": 833 + }, + { + "epoch": 0.8388759665556045, + "grad_norm": 19.73080921535841, + "learning_rate": 9.248826291079813e-06, + "loss": 2.4887, + "step": 834 + }, + { + "epoch": 0.839881813038285, + "grad_norm": 21.784930874046488, + "learning_rate": 9.246590655041359e-06, + "loss": 2.5385, + "step": 835 + }, + { + "epoch": 0.8408876595209656, + "grad_norm": 16.909577276298187, + "learning_rate": 9.244355019002907e-06, + "loss": 2.5363, + "step": 836 + }, + { + "epoch": 0.8418935060036462, + "grad_norm": 17.585758273866936, + "learning_rate": 9.242119382964454e-06, + "loss": 2.5038, + "step": 837 + }, + { + "epoch": 0.8428993524863267, + "grad_norm": 21.26661156378665, + "learning_rate": 9.239883746926002e-06, + "loss": 2.4911, + "step": 838 + }, + { + "epoch": 0.8439051989690074, + "grad_norm": 15.501969039845058, + "learning_rate": 9.237648110887549e-06, + "loss": 2.5092, + "step": 839 + }, + { + "epoch": 0.8449110454516879, + "grad_norm": 19.35818051179853, + "learning_rate": 9.235412474849095e-06, + "loss": 2.4803, + "step": 840 + }, + { + "epoch": 0.8459168919343685, + "grad_norm": 16.535060755036255, + "learning_rate": 9.233176838810642e-06, + "loss": 2.4715, + "step": 841 + }, + { + "epoch": 0.8469227384170491, + "grad_norm": 19.610807091356513, + "learning_rate": 9.230941202772188e-06, + "loss": 2.5063, + "step": 842 + }, + { + "epoch": 0.8479285848997297, + "grad_norm": 16.2227194450381, + "learning_rate": 9.228705566733737e-06, + "loss": 2.4309, + "step": 843 + }, + { + "epoch": 0.8489344313824103, + "grad_norm": 18.871725206678025, + "learning_rate": 9.226469930695283e-06, + "loss": 2.5185, + "step": 844 + }, + { + "epoch": 0.8499402778650909, + "grad_norm": 19.80150197641624, + "learning_rate": 9.224234294656832e-06, + "loss": 2.4366, + "step": 845 + }, + { + "epoch": 0.8509461243477714, + "grad_norm": 19.82523926255123, + "learning_rate": 9.221998658618378e-06, + "loss": 2.4862, + "step": 846 + }, + { + "epoch": 0.851951970830452, + "grad_norm": 17.005619743686427, + "learning_rate": 9.219763022579925e-06, + "loss": 2.4198, + "step": 847 + }, + { + "epoch": 0.8529578173131326, + "grad_norm": 20.636823632802646, + "learning_rate": 9.217527386541471e-06, + "loss": 2.4665, + "step": 848 + }, + { + "epoch": 0.8539636637958131, + "grad_norm": 19.016174344021117, + "learning_rate": 9.21529175050302e-06, + "loss": 2.4749, + "step": 849 + }, + { + "epoch": 0.8549695102784938, + "grad_norm": 18.41397658496216, + "learning_rate": 9.213056114464566e-06, + "loss": 2.4358, + "step": 850 + }, + { + "epoch": 0.8559753567611743, + "grad_norm": 17.46790390111685, + "learning_rate": 9.210820478426114e-06, + "loss": 2.4979, + "step": 851 + }, + { + "epoch": 0.856981203243855, + "grad_norm": 18.488587583895956, + "learning_rate": 9.20858484238766e-06, + "loss": 2.4829, + "step": 852 + }, + { + "epoch": 0.8579870497265355, + "grad_norm": 24.74417215175763, + "learning_rate": 9.206349206349207e-06, + "loss": 2.5312, + "step": 853 + }, + { + "epoch": 0.858992896209216, + "grad_norm": 19.62898918524025, + "learning_rate": 9.204113570310754e-06, + "loss": 2.4736, + "step": 854 + }, + { + "epoch": 0.8599987426918967, + "grad_norm": 20.709975695069204, + "learning_rate": 9.2018779342723e-06, + "loss": 2.4419, + "step": 855 + }, + { + "epoch": 0.8610045891745772, + "grad_norm": 23.505683041265875, + "learning_rate": 9.199642298233849e-06, + "loss": 2.52, + "step": 856 + }, + { + "epoch": 0.8620104356572578, + "grad_norm": 23.061385212409167, + "learning_rate": 9.197406662195395e-06, + "loss": 2.5224, + "step": 857 + }, + { + "epoch": 0.8630162821399384, + "grad_norm": 19.435142582973928, + "learning_rate": 9.195171026156942e-06, + "loss": 2.5116, + "step": 858 + }, + { + "epoch": 0.864022128622619, + "grad_norm": 20.935679867064998, + "learning_rate": 9.192935390118488e-06, + "loss": 2.5244, + "step": 859 + }, + { + "epoch": 0.8650279751052996, + "grad_norm": 19.541576135091336, + "learning_rate": 9.190699754080037e-06, + "loss": 2.5302, + "step": 860 + }, + { + "epoch": 0.8660338215879801, + "grad_norm": 18.688832058440994, + "learning_rate": 9.188464118041583e-06, + "loss": 2.4724, + "step": 861 + }, + { + "epoch": 0.8670396680706607, + "grad_norm": 19.95676951865818, + "learning_rate": 9.186228482003131e-06, + "loss": 2.4672, + "step": 862 + }, + { + "epoch": 0.8680455145533413, + "grad_norm": 19.390108841449873, + "learning_rate": 9.183992845964678e-06, + "loss": 2.4413, + "step": 863 + }, + { + "epoch": 0.8690513610360219, + "grad_norm": 16.19229969510003, + "learning_rate": 9.181757209926225e-06, + "loss": 2.4524, + "step": 864 + }, + { + "epoch": 0.8700572075187024, + "grad_norm": 18.60982850694698, + "learning_rate": 9.179521573887771e-06, + "loss": 2.5208, + "step": 865 + }, + { + "epoch": 0.8710630540013831, + "grad_norm": 17.614897460064757, + "learning_rate": 9.177285937849318e-06, + "loss": 2.462, + "step": 866 + }, + { + "epoch": 0.8720689004840636, + "grad_norm": 15.123819842436388, + "learning_rate": 9.175050301810866e-06, + "loss": 2.4727, + "step": 867 + }, + { + "epoch": 0.8730747469667443, + "grad_norm": 17.31041379575615, + "learning_rate": 9.172814665772413e-06, + "loss": 2.5213, + "step": 868 + }, + { + "epoch": 0.8740805934494248, + "grad_norm": 19.055356941197203, + "learning_rate": 9.17057902973396e-06, + "loss": 2.4818, + "step": 869 + }, + { + "epoch": 0.8750864399321053, + "grad_norm": 18.077774764214077, + "learning_rate": 9.168343393695507e-06, + "loss": 2.4637, + "step": 870 + }, + { + "epoch": 0.876092286414786, + "grad_norm": 21.143295726131285, + "learning_rate": 9.166107757657054e-06, + "loss": 2.4704, + "step": 871 + }, + { + "epoch": 0.8770981328974665, + "grad_norm": 17.942475112504635, + "learning_rate": 9.1638721216186e-06, + "loss": 2.4841, + "step": 872 + }, + { + "epoch": 0.8781039793801471, + "grad_norm": 19.18496687136911, + "learning_rate": 9.161636485580149e-06, + "loss": 2.4917, + "step": 873 + }, + { + "epoch": 0.8791098258628277, + "grad_norm": 17.588233448036576, + "learning_rate": 9.159400849541695e-06, + "loss": 2.5656, + "step": 874 + }, + { + "epoch": 0.8801156723455082, + "grad_norm": 20.044382673740458, + "learning_rate": 9.157165213503244e-06, + "loss": 2.5114, + "step": 875 + }, + { + "epoch": 0.8811215188281889, + "grad_norm": 17.914129095672784, + "learning_rate": 9.15492957746479e-06, + "loss": 2.5063, + "step": 876 + }, + { + "epoch": 0.8821273653108694, + "grad_norm": 20.358962481262786, + "learning_rate": 9.152693941426337e-06, + "loss": 2.4819, + "step": 877 + }, + { + "epoch": 0.88313321179355, + "grad_norm": 20.562144083555342, + "learning_rate": 9.150458305387883e-06, + "loss": 2.5522, + "step": 878 + }, + { + "epoch": 0.8841390582762306, + "grad_norm": 17.559901617392246, + "learning_rate": 9.14822266934943e-06, + "loss": 2.4994, + "step": 879 + }, + { + "epoch": 0.8851449047589112, + "grad_norm": 19.353815367284025, + "learning_rate": 9.145987033310978e-06, + "loss": 2.4995, + "step": 880 + }, + { + "epoch": 0.8861507512415917, + "grad_norm": 20.337197517860805, + "learning_rate": 9.143751397272525e-06, + "loss": 2.5063, + "step": 881 + }, + { + "epoch": 0.8871565977242724, + "grad_norm": 20.21527535771532, + "learning_rate": 9.141515761234073e-06, + "loss": 2.5583, + "step": 882 + }, + { + "epoch": 0.8881624442069529, + "grad_norm": 21.947674142946916, + "learning_rate": 9.13928012519562e-06, + "loss": 2.4932, + "step": 883 + }, + { + "epoch": 0.8891682906896335, + "grad_norm": 20.241067388653406, + "learning_rate": 9.137044489157166e-06, + "loss": 2.4566, + "step": 884 + }, + { + "epoch": 0.8901741371723141, + "grad_norm": 18.94219764805125, + "learning_rate": 9.134808853118713e-06, + "loss": 2.5193, + "step": 885 + }, + { + "epoch": 0.8911799836549946, + "grad_norm": 18.21141857163298, + "learning_rate": 9.13257321708026e-06, + "loss": 2.5036, + "step": 886 + }, + { + "epoch": 0.8921858301376753, + "grad_norm": 18.879260967660596, + "learning_rate": 9.130337581041807e-06, + "loss": 2.4581, + "step": 887 + }, + { + "epoch": 0.8931916766203558, + "grad_norm": 15.904893478954735, + "learning_rate": 9.128101945003354e-06, + "loss": 2.5704, + "step": 888 + }, + { + "epoch": 0.8941975231030364, + "grad_norm": 17.719412197108404, + "learning_rate": 9.125866308964902e-06, + "loss": 2.4746, + "step": 889 + }, + { + "epoch": 0.895203369585717, + "grad_norm": 17.337500587360978, + "learning_rate": 9.123630672926449e-06, + "loss": 2.4891, + "step": 890 + }, + { + "epoch": 0.8962092160683975, + "grad_norm": 17.40269334016562, + "learning_rate": 9.121395036887995e-06, + "loss": 2.5213, + "step": 891 + }, + { + "epoch": 0.8972150625510782, + "grad_norm": 16.687508641627172, + "learning_rate": 9.119159400849542e-06, + "loss": 2.5013, + "step": 892 + }, + { + "epoch": 0.8982209090337587, + "grad_norm": 18.68430097414652, + "learning_rate": 9.11692376481109e-06, + "loss": 2.5189, + "step": 893 + }, + { + "epoch": 0.8992267555164393, + "grad_norm": 19.021895555033808, + "learning_rate": 9.114688128772637e-06, + "loss": 2.5146, + "step": 894 + }, + { + "epoch": 0.9002326019991199, + "grad_norm": 19.67201740058453, + "learning_rate": 9.112452492734183e-06, + "loss": 2.4956, + "step": 895 + }, + { + "epoch": 0.9012384484818005, + "grad_norm": 21.161941542262337, + "learning_rate": 9.11021685669573e-06, + "loss": 2.4697, + "step": 896 + }, + { + "epoch": 0.902244294964481, + "grad_norm": 21.536270522176043, + "learning_rate": 9.107981220657278e-06, + "loss": 2.511, + "step": 897 + }, + { + "epoch": 0.9032501414471616, + "grad_norm": 18.722958300928553, + "learning_rate": 9.105745584618825e-06, + "loss": 2.5587, + "step": 898 + }, + { + "epoch": 0.9042559879298422, + "grad_norm": 21.90000929319424, + "learning_rate": 9.103509948580373e-06, + "loss": 2.4839, + "step": 899 + }, + { + "epoch": 0.9052618344125228, + "grad_norm": 20.7546999739206, + "learning_rate": 9.10127431254192e-06, + "loss": 2.4831, + "step": 900 + }, + { + "epoch": 0.9062676808952034, + "grad_norm": 16.69773588405377, + "learning_rate": 9.099038676503466e-06, + "loss": 2.432, + "step": 901 + }, + { + "epoch": 0.9072735273778839, + "grad_norm": 17.697663909486884, + "learning_rate": 9.096803040465013e-06, + "loss": 2.4746, + "step": 902 + }, + { + "epoch": 0.9082793738605646, + "grad_norm": 21.136014663688808, + "learning_rate": 9.094567404426559e-06, + "loss": 2.5125, + "step": 903 + }, + { + "epoch": 0.9092852203432451, + "grad_norm": 15.80770427361095, + "learning_rate": 9.092331768388107e-06, + "loss": 2.4444, + "step": 904 + }, + { + "epoch": 0.9102910668259256, + "grad_norm": 17.630039598712568, + "learning_rate": 9.090096132349654e-06, + "loss": 2.5108, + "step": 905 + }, + { + "epoch": 0.9112969133086063, + "grad_norm": 18.196870410777603, + "learning_rate": 9.087860496311202e-06, + "loss": 2.4715, + "step": 906 + }, + { + "epoch": 0.9123027597912868, + "grad_norm": 15.471661891634048, + "learning_rate": 9.085624860272749e-06, + "loss": 2.4712, + "step": 907 + }, + { + "epoch": 0.9133086062739675, + "grad_norm": 20.275945900358217, + "learning_rate": 9.083389224234295e-06, + "loss": 2.4799, + "step": 908 + }, + { + "epoch": 0.914314452756648, + "grad_norm": 22.54244758097815, + "learning_rate": 9.081153588195842e-06, + "loss": 2.5023, + "step": 909 + }, + { + "epoch": 0.9153202992393286, + "grad_norm": 17.139623998998083, + "learning_rate": 9.07891795215739e-06, + "loss": 2.4485, + "step": 910 + }, + { + "epoch": 0.9163261457220092, + "grad_norm": 21.09355687447774, + "learning_rate": 9.076682316118937e-06, + "loss": 2.4921, + "step": 911 + }, + { + "epoch": 0.9173319922046898, + "grad_norm": 20.30710748947842, + "learning_rate": 9.074446680080483e-06, + "loss": 2.513, + "step": 912 + }, + { + "epoch": 0.9183378386873703, + "grad_norm": 17.780778251179097, + "learning_rate": 9.072211044042031e-06, + "loss": 2.478, + "step": 913 + }, + { + "epoch": 0.9193436851700509, + "grad_norm": 16.811738212001085, + "learning_rate": 9.069975408003578e-06, + "loss": 2.5019, + "step": 914 + }, + { + "epoch": 0.9203495316527315, + "grad_norm": 18.395521000320436, + "learning_rate": 9.067739771965125e-06, + "loss": 2.4809, + "step": 915 + }, + { + "epoch": 0.9213553781354121, + "grad_norm": 20.01473796386837, + "learning_rate": 9.065504135926671e-06, + "loss": 2.5079, + "step": 916 + }, + { + "epoch": 0.9223612246180927, + "grad_norm": 19.503234127759303, + "learning_rate": 9.06326849988822e-06, + "loss": 2.5152, + "step": 917 + }, + { + "epoch": 0.9233670711007732, + "grad_norm": 17.988337162833112, + "learning_rate": 9.061032863849766e-06, + "loss": 2.5215, + "step": 918 + }, + { + "epoch": 0.9243729175834539, + "grad_norm": 21.476525188151253, + "learning_rate": 9.058797227811314e-06, + "loss": 2.5268, + "step": 919 + }, + { + "epoch": 0.9253787640661344, + "grad_norm": 17.452253827591644, + "learning_rate": 9.05656159177286e-06, + "loss": 2.5168, + "step": 920 + }, + { + "epoch": 0.9263846105488149, + "grad_norm": 20.067667819078714, + "learning_rate": 9.054325955734407e-06, + "loss": 2.4566, + "step": 921 + }, + { + "epoch": 0.9273904570314956, + "grad_norm": 19.69466816278088, + "learning_rate": 9.052090319695954e-06, + "loss": 2.4962, + "step": 922 + }, + { + "epoch": 0.9283963035141761, + "grad_norm": 18.525210255914946, + "learning_rate": 9.0498546836575e-06, + "loss": 2.4585, + "step": 923 + }, + { + "epoch": 0.9294021499968568, + "grad_norm": 19.444686243782936, + "learning_rate": 9.047619047619049e-06, + "loss": 2.4733, + "step": 924 + }, + { + "epoch": 0.9304079964795373, + "grad_norm": 17.956521398490462, + "learning_rate": 9.045383411580595e-06, + "loss": 2.4444, + "step": 925 + }, + { + "epoch": 0.9314138429622179, + "grad_norm": 18.761525002564817, + "learning_rate": 9.043147775542142e-06, + "loss": 2.5182, + "step": 926 + }, + { + "epoch": 0.9324196894448985, + "grad_norm": 18.517956347635913, + "learning_rate": 9.040912139503688e-06, + "loss": 2.478, + "step": 927 + }, + { + "epoch": 0.933425535927579, + "grad_norm": 16.722299555521936, + "learning_rate": 9.038676503465237e-06, + "loss": 2.4512, + "step": 928 + }, + { + "epoch": 0.9344313824102596, + "grad_norm": 16.348937736219405, + "learning_rate": 9.036440867426783e-06, + "loss": 2.489, + "step": 929 + }, + { + "epoch": 0.9354372288929402, + "grad_norm": 17.128346726995666, + "learning_rate": 9.034205231388331e-06, + "loss": 2.4643, + "step": 930 + }, + { + "epoch": 0.9364430753756208, + "grad_norm": 15.251872588593749, + "learning_rate": 9.031969595349878e-06, + "loss": 2.5159, + "step": 931 + }, + { + "epoch": 0.9374489218583014, + "grad_norm": 15.953362658854287, + "learning_rate": 9.029733959311425e-06, + "loss": 2.461, + "step": 932 + }, + { + "epoch": 0.938454768340982, + "grad_norm": 17.979068884310784, + "learning_rate": 9.027498323272971e-06, + "loss": 2.5227, + "step": 933 + }, + { + "epoch": 0.9394606148236625, + "grad_norm": 18.551703087669928, + "learning_rate": 9.02526268723452e-06, + "loss": 2.5409, + "step": 934 + }, + { + "epoch": 0.9404664613063431, + "grad_norm": 19.45169292949665, + "learning_rate": 9.023027051196066e-06, + "loss": 2.5253, + "step": 935 + }, + { + "epoch": 0.9414723077890237, + "grad_norm": 15.139349601071542, + "learning_rate": 9.020791415157612e-06, + "loss": 2.4184, + "step": 936 + }, + { + "epoch": 0.9424781542717042, + "grad_norm": 17.231348952189535, + "learning_rate": 9.01855577911916e-06, + "loss": 2.4608, + "step": 937 + }, + { + "epoch": 0.9434840007543849, + "grad_norm": 17.017076413253353, + "learning_rate": 9.016320143080707e-06, + "loss": 2.4757, + "step": 938 + }, + { + "epoch": 0.9444898472370654, + "grad_norm": 17.14787644558583, + "learning_rate": 9.014084507042254e-06, + "loss": 2.5353, + "step": 939 + }, + { + "epoch": 0.9454956937197461, + "grad_norm": 17.737202649119144, + "learning_rate": 9.0118488710038e-06, + "loss": 2.5025, + "step": 940 + }, + { + "epoch": 0.9465015402024266, + "grad_norm": 17.792474997732924, + "learning_rate": 9.009613234965349e-06, + "loss": 2.5015, + "step": 941 + }, + { + "epoch": 0.9475073866851071, + "grad_norm": 15.940518080280272, + "learning_rate": 9.007377598926895e-06, + "loss": 2.5094, + "step": 942 + }, + { + "epoch": 0.9485132331677878, + "grad_norm": 18.487826228534622, + "learning_rate": 9.005141962888443e-06, + "loss": 2.4741, + "step": 943 + }, + { + "epoch": 0.9495190796504683, + "grad_norm": 17.216862294112154, + "learning_rate": 9.00290632684999e-06, + "loss": 2.4211, + "step": 944 + }, + { + "epoch": 0.9505249261331489, + "grad_norm": 15.551142433908357, + "learning_rate": 9.000670690811537e-06, + "loss": 2.4976, + "step": 945 + }, + { + "epoch": 0.9515307726158295, + "grad_norm": 16.559861003507038, + "learning_rate": 8.998435054773083e-06, + "loss": 2.4588, + "step": 946 + }, + { + "epoch": 0.9525366190985101, + "grad_norm": 17.63172975324414, + "learning_rate": 8.99619941873463e-06, + "loss": 2.4329, + "step": 947 + }, + { + "epoch": 0.9535424655811907, + "grad_norm": 18.622917952836307, + "learning_rate": 8.993963782696178e-06, + "loss": 2.4807, + "step": 948 + }, + { + "epoch": 0.9545483120638713, + "grad_norm": 16.338398101232766, + "learning_rate": 8.991728146657725e-06, + "loss": 2.4793, + "step": 949 + }, + { + "epoch": 0.9555541585465518, + "grad_norm": 21.981541610522587, + "learning_rate": 8.989492510619273e-06, + "loss": 2.504, + "step": 950 + }, + { + "epoch": 0.9565600050292324, + "grad_norm": 17.581038921848826, + "learning_rate": 8.98725687458082e-06, + "loss": 2.4469, + "step": 951 + }, + { + "epoch": 0.957565851511913, + "grad_norm": 20.139070495761704, + "learning_rate": 8.985021238542366e-06, + "loss": 2.4517, + "step": 952 + }, + { + "epoch": 0.9585716979945936, + "grad_norm": 23.595785077951778, + "learning_rate": 8.982785602503912e-06, + "loss": 2.443, + "step": 953 + }, + { + "epoch": 0.9595775444772742, + "grad_norm": 17.029891375528152, + "learning_rate": 8.98054996646546e-06, + "loss": 2.5467, + "step": 954 + }, + { + "epoch": 0.9605833909599547, + "grad_norm": 25.44050321518066, + "learning_rate": 8.978314330427007e-06, + "loss": 2.4655, + "step": 955 + }, + { + "epoch": 0.9615892374426354, + "grad_norm": 27.67074335823461, + "learning_rate": 8.976078694388556e-06, + "loss": 2.5033, + "step": 956 + }, + { + "epoch": 0.9625950839253159, + "grad_norm": 20.2426136586843, + "learning_rate": 8.973843058350102e-06, + "loss": 2.4978, + "step": 957 + }, + { + "epoch": 0.9636009304079964, + "grad_norm": 18.658850559045447, + "learning_rate": 8.971607422311649e-06, + "loss": 2.436, + "step": 958 + }, + { + "epoch": 0.9646067768906771, + "grad_norm": 22.04603622451962, + "learning_rate": 8.969371786273195e-06, + "loss": 2.4269, + "step": 959 + }, + { + "epoch": 0.9656126233733576, + "grad_norm": 21.75932909465551, + "learning_rate": 8.967136150234742e-06, + "loss": 2.4801, + "step": 960 + }, + { + "epoch": 0.9666184698560383, + "grad_norm": 19.527187135283782, + "learning_rate": 8.96490051419629e-06, + "loss": 2.5017, + "step": 961 + }, + { + "epoch": 0.9676243163387188, + "grad_norm": 23.29560371501392, + "learning_rate": 8.962664878157837e-06, + "loss": 2.4738, + "step": 962 + }, + { + "epoch": 0.9686301628213994, + "grad_norm": 22.706509828966826, + "learning_rate": 8.960429242119383e-06, + "loss": 2.4983, + "step": 963 + }, + { + "epoch": 0.96963600930408, + "grad_norm": 19.302250360711678, + "learning_rate": 8.95819360608093e-06, + "loss": 2.5015, + "step": 964 + }, + { + "epoch": 0.9706418557867605, + "grad_norm": 20.274377821577197, + "learning_rate": 8.955957970042478e-06, + "loss": 2.527, + "step": 965 + }, + { + "epoch": 0.9716477022694411, + "grad_norm": 22.146231245864477, + "learning_rate": 8.953722334004025e-06, + "loss": 2.4754, + "step": 966 + }, + { + "epoch": 0.9726535487521217, + "grad_norm": 19.283732586891457, + "learning_rate": 8.951486697965573e-06, + "loss": 2.5287, + "step": 967 + }, + { + "epoch": 0.9736593952348023, + "grad_norm": 16.697804743440038, + "learning_rate": 8.94925106192712e-06, + "loss": 2.446, + "step": 968 + }, + { + "epoch": 0.9746652417174829, + "grad_norm": 18.750802946568303, + "learning_rate": 8.947015425888666e-06, + "loss": 2.5017, + "step": 969 + }, + { + "epoch": 0.9756710882001635, + "grad_norm": 19.391455727776027, + "learning_rate": 8.944779789850212e-06, + "loss": 2.4437, + "step": 970 + }, + { + "epoch": 0.976676934682844, + "grad_norm": 20.49457113262894, + "learning_rate": 8.942544153811759e-06, + "loss": 2.4844, + "step": 971 + }, + { + "epoch": 0.9776827811655247, + "grad_norm": 16.8204200261022, + "learning_rate": 8.940308517773307e-06, + "loss": 2.5061, + "step": 972 + }, + { + "epoch": 0.9786886276482052, + "grad_norm": 20.60414960158546, + "learning_rate": 8.938072881734854e-06, + "loss": 2.4707, + "step": 973 + }, + { + "epoch": 0.9796944741308857, + "grad_norm": 20.536955242053217, + "learning_rate": 8.935837245696402e-06, + "loss": 2.4961, + "step": 974 + }, + { + "epoch": 0.9807003206135664, + "grad_norm": 19.711651025802677, + "learning_rate": 8.933601609657949e-06, + "loss": 2.5085, + "step": 975 + }, + { + "epoch": 0.9817061670962469, + "grad_norm": 22.207382958127546, + "learning_rate": 8.931365973619495e-06, + "loss": 2.4681, + "step": 976 + }, + { + "epoch": 0.9827120135789276, + "grad_norm": 20.071766709389117, + "learning_rate": 8.929130337581042e-06, + "loss": 2.4954, + "step": 977 + }, + { + "epoch": 0.9837178600616081, + "grad_norm": 16.47582841812017, + "learning_rate": 8.92689470154259e-06, + "loss": 2.5156, + "step": 978 + }, + { + "epoch": 0.9847237065442886, + "grad_norm": 16.271580188410653, + "learning_rate": 8.924659065504137e-06, + "loss": 2.5552, + "step": 979 + }, + { + "epoch": 0.9857295530269693, + "grad_norm": 17.119654997142348, + "learning_rate": 8.922423429465685e-06, + "loss": 2.5, + "step": 980 + }, + { + "epoch": 0.9867353995096498, + "grad_norm": 17.172594870254994, + "learning_rate": 8.920187793427231e-06, + "loss": 2.5154, + "step": 981 + }, + { + "epoch": 0.9877412459923304, + "grad_norm": 15.830398642073979, + "learning_rate": 8.917952157388778e-06, + "loss": 2.4756, + "step": 982 + }, + { + "epoch": 0.988747092475011, + "grad_norm": 18.032949559237505, + "learning_rate": 8.915716521350325e-06, + "loss": 2.4683, + "step": 983 + }, + { + "epoch": 0.9897529389576916, + "grad_norm": 21.19714163524062, + "learning_rate": 8.913480885311871e-06, + "loss": 2.5453, + "step": 984 + }, + { + "epoch": 0.9907587854403722, + "grad_norm": 17.49779963037731, + "learning_rate": 8.91124524927342e-06, + "loss": 2.5045, + "step": 985 + }, + { + "epoch": 0.9917646319230528, + "grad_norm": 16.740921716038336, + "learning_rate": 8.909009613234966e-06, + "loss": 2.4452, + "step": 986 + }, + { + "epoch": 0.9927704784057333, + "grad_norm": 17.19468353475136, + "learning_rate": 8.906773977196514e-06, + "loss": 2.51, + "step": 987 + }, + { + "epoch": 0.9937763248884139, + "grad_norm": 19.231917401802384, + "learning_rate": 8.90453834115806e-06, + "loss": 2.4869, + "step": 988 + }, + { + "epoch": 0.9947821713710945, + "grad_norm": 16.664642597713588, + "learning_rate": 8.902302705119607e-06, + "loss": 2.5429, + "step": 989 + }, + { + "epoch": 0.995788017853775, + "grad_norm": 17.163669929741783, + "learning_rate": 8.900067069081154e-06, + "loss": 2.4918, + "step": 990 + }, + { + "epoch": 0.9967938643364557, + "grad_norm": 18.533245352747766, + "learning_rate": 8.897831433042702e-06, + "loss": 2.4572, + "step": 991 + }, + { + "epoch": 0.9977997108191362, + "grad_norm": 19.22814464766119, + "learning_rate": 8.895595797004249e-06, + "loss": 2.4701, + "step": 992 + }, + { + "epoch": 0.9988055573018169, + "grad_norm": 17.838257655579174, + "learning_rate": 8.893360160965795e-06, + "loss": 2.5084, + "step": 993 + }, + { + "epoch": 0.9998114037844974, + "grad_norm": 19.097020194522255, + "learning_rate": 8.891124524927342e-06, + "loss": 2.5596, + "step": 994 + }, + { + "epoch": 1.000817250267178, + "grad_norm": 18.87435671359403, + "learning_rate": 8.888888888888888e-06, + "loss": 2.3563, + "step": 995 + }, + { + "epoch": 1.0018230967498585, + "grad_norm": 17.90502272682222, + "learning_rate": 8.886653252850437e-06, + "loss": 2.2951, + "step": 996 + }, + { + "epoch": 1.0028289432325392, + "grad_norm": 16.91198265807751, + "learning_rate": 8.884417616811983e-06, + "loss": 2.3775, + "step": 997 + }, + { + "epoch": 1.0038347897152198, + "grad_norm": 15.620591946147508, + "learning_rate": 8.882181980773531e-06, + "loss": 2.2319, + "step": 998 + }, + { + "epoch": 1.0048406361979003, + "grad_norm": 16.42892213916386, + "learning_rate": 8.879946344735078e-06, + "loss": 2.3308, + "step": 999 + }, + { + "epoch": 1.0058464826805809, + "grad_norm": 18.59757875443186, + "learning_rate": 8.877710708696624e-06, + "loss": 2.2971, + "step": 1000 + }, + { + "epoch": 1.0068523291632614, + "grad_norm": 16.42491496826214, + "learning_rate": 8.875475072658171e-06, + "loss": 2.2728, + "step": 1001 + }, + { + "epoch": 1.007858175645942, + "grad_norm": 20.155015522173446, + "learning_rate": 8.87323943661972e-06, + "loss": 2.3029, + "step": 1002 + }, + { + "epoch": 1.0088640221286227, + "grad_norm": 18.16790024429524, + "learning_rate": 8.871003800581266e-06, + "loss": 2.3385, + "step": 1003 + }, + { + "epoch": 1.0098698686113032, + "grad_norm": 19.559520755472036, + "learning_rate": 8.868768164542814e-06, + "loss": 2.3133, + "step": 1004 + }, + { + "epoch": 1.0108757150939838, + "grad_norm": 19.597847391813175, + "learning_rate": 8.86653252850436e-06, + "loss": 2.3219, + "step": 1005 + }, + { + "epoch": 1.0118815615766643, + "grad_norm": 18.564284872046343, + "learning_rate": 8.864296892465907e-06, + "loss": 2.2791, + "step": 1006 + }, + { + "epoch": 1.0128874080593449, + "grad_norm": 19.507875257609435, + "learning_rate": 8.862061256427454e-06, + "loss": 2.3247, + "step": 1007 + }, + { + "epoch": 1.0138932545420256, + "grad_norm": 17.356664585211536, + "learning_rate": 8.859825620389e-06, + "loss": 2.2976, + "step": 1008 + }, + { + "epoch": 1.0148991010247062, + "grad_norm": 17.685890921074805, + "learning_rate": 8.857589984350549e-06, + "loss": 2.334, + "step": 1009 + }, + { + "epoch": 1.0159049475073867, + "grad_norm": 18.007613178096328, + "learning_rate": 8.855354348312095e-06, + "loss": 2.3276, + "step": 1010 + }, + { + "epoch": 1.0169107939900672, + "grad_norm": 21.714385969059393, + "learning_rate": 8.853118712273643e-06, + "loss": 2.3058, + "step": 1011 + }, + { + "epoch": 1.0179166404727478, + "grad_norm": 17.571687311154825, + "learning_rate": 8.85088307623519e-06, + "loss": 2.3071, + "step": 1012 + }, + { + "epoch": 1.0189224869554285, + "grad_norm": 18.709912154375786, + "learning_rate": 8.848647440196737e-06, + "loss": 2.3123, + "step": 1013 + }, + { + "epoch": 1.019928333438109, + "grad_norm": 16.602238512927716, + "learning_rate": 8.846411804158283e-06, + "loss": 2.2744, + "step": 1014 + }, + { + "epoch": 1.0209341799207896, + "grad_norm": 19.883495863934453, + "learning_rate": 8.844176168119831e-06, + "loss": 2.2654, + "step": 1015 + }, + { + "epoch": 1.0219400264034701, + "grad_norm": 16.96892661890455, + "learning_rate": 8.841940532081378e-06, + "loss": 2.3024, + "step": 1016 + }, + { + "epoch": 1.0229458728861507, + "grad_norm": 17.414681329439077, + "learning_rate": 8.839704896042926e-06, + "loss": 2.3003, + "step": 1017 + }, + { + "epoch": 1.0239517193688312, + "grad_norm": 18.858230864626055, + "learning_rate": 8.837469260004473e-06, + "loss": 2.2919, + "step": 1018 + }, + { + "epoch": 1.024957565851512, + "grad_norm": 16.699976095125997, + "learning_rate": 8.83523362396602e-06, + "loss": 2.213, + "step": 1019 + }, + { + "epoch": 1.0259634123341925, + "grad_norm": 15.54260552775944, + "learning_rate": 8.832997987927566e-06, + "loss": 2.3101, + "step": 1020 + }, + { + "epoch": 1.026969258816873, + "grad_norm": 20.651042832781755, + "learning_rate": 8.830762351889112e-06, + "loss": 2.312, + "step": 1021 + }, + { + "epoch": 1.0279751052995536, + "grad_norm": 16.94510148062415, + "learning_rate": 8.82852671585066e-06, + "loss": 2.3125, + "step": 1022 + }, + { + "epoch": 1.0289809517822341, + "grad_norm": 15.84034483396051, + "learning_rate": 8.826291079812207e-06, + "loss": 2.2957, + "step": 1023 + }, + { + "epoch": 1.029986798264915, + "grad_norm": 16.94753969310773, + "learning_rate": 8.824055443773755e-06, + "loss": 2.3357, + "step": 1024 + }, + { + "epoch": 1.0309926447475954, + "grad_norm": 16.41312755901395, + "learning_rate": 8.821819807735302e-06, + "loss": 2.2873, + "step": 1025 + }, + { + "epoch": 1.031998491230276, + "grad_norm": 16.14597452506802, + "learning_rate": 8.819584171696849e-06, + "loss": 2.2386, + "step": 1026 + }, + { + "epoch": 1.0330043377129565, + "grad_norm": 16.81133489039435, + "learning_rate": 8.817348535658395e-06, + "loss": 2.3233, + "step": 1027 + }, + { + "epoch": 1.034010184195637, + "grad_norm": 17.11584396129041, + "learning_rate": 8.815112899619943e-06, + "loss": 2.3202, + "step": 1028 + }, + { + "epoch": 1.0350160306783178, + "grad_norm": 18.735494496930798, + "learning_rate": 8.81287726358149e-06, + "loss": 2.2851, + "step": 1029 + }, + { + "epoch": 1.0360218771609984, + "grad_norm": 17.886553532549136, + "learning_rate": 8.810641627543037e-06, + "loss": 2.3147, + "step": 1030 + }, + { + "epoch": 1.037027723643679, + "grad_norm": 23.272350800324688, + "learning_rate": 8.808405991504583e-06, + "loss": 2.2582, + "step": 1031 + }, + { + "epoch": 1.0380335701263594, + "grad_norm": 20.591526948640638, + "learning_rate": 8.80617035546613e-06, + "loss": 2.2953, + "step": 1032 + }, + { + "epoch": 1.03903941660904, + "grad_norm": 19.400124919183924, + "learning_rate": 8.803934719427678e-06, + "loss": 2.196, + "step": 1033 + }, + { + "epoch": 1.0400452630917205, + "grad_norm": 21.820764887623767, + "learning_rate": 8.801699083389224e-06, + "loss": 2.3064, + "step": 1034 + }, + { + "epoch": 1.0410511095744013, + "grad_norm": 18.083478880895772, + "learning_rate": 8.799463447350773e-06, + "loss": 2.2953, + "step": 1035 + }, + { + "epoch": 1.0420569560570818, + "grad_norm": 18.954547842365145, + "learning_rate": 8.79722781131232e-06, + "loss": 2.3336, + "step": 1036 + }, + { + "epoch": 1.0430628025397624, + "grad_norm": 16.39042328254077, + "learning_rate": 8.794992175273866e-06, + "loss": 2.2782, + "step": 1037 + }, + { + "epoch": 1.044068649022443, + "grad_norm": 19.630757954483688, + "learning_rate": 8.792756539235412e-06, + "loss": 2.2513, + "step": 1038 + }, + { + "epoch": 1.0450744955051234, + "grad_norm": 18.040721411714422, + "learning_rate": 8.79052090319696e-06, + "loss": 2.3292, + "step": 1039 + }, + { + "epoch": 1.0460803419878042, + "grad_norm": 18.652583945177508, + "learning_rate": 8.788285267158507e-06, + "loss": 2.2952, + "step": 1040 + }, + { + "epoch": 1.0470861884704847, + "grad_norm": 20.924815906038443, + "learning_rate": 8.786049631120054e-06, + "loss": 2.3122, + "step": 1041 + }, + { + "epoch": 1.0480920349531653, + "grad_norm": 17.843040001830474, + "learning_rate": 8.783813995081602e-06, + "loss": 2.3082, + "step": 1042 + }, + { + "epoch": 1.0490978814358458, + "grad_norm": 16.320339567013974, + "learning_rate": 8.781578359043149e-06, + "loss": 2.2638, + "step": 1043 + }, + { + "epoch": 1.0501037279185264, + "grad_norm": 17.50310854731714, + "learning_rate": 8.779342723004695e-06, + "loss": 2.2987, + "step": 1044 + }, + { + "epoch": 1.0511095744012071, + "grad_norm": 21.22193429459719, + "learning_rate": 8.777107086966242e-06, + "loss": 2.3096, + "step": 1045 + }, + { + "epoch": 1.0521154208838877, + "grad_norm": 17.836388406412574, + "learning_rate": 8.77487145092779e-06, + "loss": 2.2738, + "step": 1046 + }, + { + "epoch": 1.0531212673665682, + "grad_norm": 19.783838723215364, + "learning_rate": 8.772635814889337e-06, + "loss": 2.3063, + "step": 1047 + }, + { + "epoch": 1.0541271138492487, + "grad_norm": 18.523579033127074, + "learning_rate": 8.770400178850885e-06, + "loss": 2.3036, + "step": 1048 + }, + { + "epoch": 1.0551329603319293, + "grad_norm": 15.720657777619268, + "learning_rate": 8.768164542812431e-06, + "loss": 2.3232, + "step": 1049 + }, + { + "epoch": 1.05613880681461, + "grad_norm": 18.90213796026752, + "learning_rate": 8.765928906773978e-06, + "loss": 2.2807, + "step": 1050 + }, + { + "epoch": 1.0571446532972906, + "grad_norm": 17.276748437659204, + "learning_rate": 8.763693270735524e-06, + "loss": 2.3144, + "step": 1051 + }, + { + "epoch": 1.058150499779971, + "grad_norm": 17.070528576683216, + "learning_rate": 8.761457634697073e-06, + "loss": 2.3109, + "step": 1052 + }, + { + "epoch": 1.0591563462626516, + "grad_norm": 14.745669379798551, + "learning_rate": 8.75922199865862e-06, + "loss": 2.324, + "step": 1053 + }, + { + "epoch": 1.0601621927453322, + "grad_norm": 17.757207171938163, + "learning_rate": 8.756986362620166e-06, + "loss": 2.2951, + "step": 1054 + }, + { + "epoch": 1.0611680392280127, + "grad_norm": 16.553773697289195, + "learning_rate": 8.754750726581714e-06, + "loss": 2.3451, + "step": 1055 + }, + { + "epoch": 1.0621738857106935, + "grad_norm": 18.039386395828323, + "learning_rate": 8.75251509054326e-06, + "loss": 2.3159, + "step": 1056 + }, + { + "epoch": 1.063179732193374, + "grad_norm": 17.969055109206348, + "learning_rate": 8.750279454504807e-06, + "loss": 2.2989, + "step": 1057 + }, + { + "epoch": 1.0641855786760546, + "grad_norm": 16.96062690053515, + "learning_rate": 8.748043818466354e-06, + "loss": 2.3199, + "step": 1058 + }, + { + "epoch": 1.065191425158735, + "grad_norm": 17.158342934397627, + "learning_rate": 8.745808182427902e-06, + "loss": 2.2504, + "step": 1059 + }, + { + "epoch": 1.0661972716414156, + "grad_norm": 16.784646638835156, + "learning_rate": 8.743572546389449e-06, + "loss": 2.3059, + "step": 1060 + }, + { + "epoch": 1.0672031181240964, + "grad_norm": 19.418038825614527, + "learning_rate": 8.741336910350995e-06, + "loss": 2.2612, + "step": 1061 + }, + { + "epoch": 1.068208964606777, + "grad_norm": 17.538394248187252, + "learning_rate": 8.739101274312542e-06, + "loss": 2.2833, + "step": 1062 + }, + { + "epoch": 1.0692148110894575, + "grad_norm": 17.040190678679068, + "learning_rate": 8.73686563827409e-06, + "loss": 2.2854, + "step": 1063 + }, + { + "epoch": 1.070220657572138, + "grad_norm": 17.16856813140221, + "learning_rate": 8.734630002235636e-06, + "loss": 2.2879, + "step": 1064 + }, + { + "epoch": 1.0712265040548186, + "grad_norm": 19.04157743002483, + "learning_rate": 8.732394366197183e-06, + "loss": 2.3107, + "step": 1065 + }, + { + "epoch": 1.072232350537499, + "grad_norm": 16.92014717159457, + "learning_rate": 8.730158730158731e-06, + "loss": 2.2602, + "step": 1066 + }, + { + "epoch": 1.0732381970201799, + "grad_norm": 17.33657041438704, + "learning_rate": 8.727923094120278e-06, + "loss": 2.2437, + "step": 1067 + }, + { + "epoch": 1.0742440435028604, + "grad_norm": 16.117892266244965, + "learning_rate": 8.725687458081824e-06, + "loss": 2.2922, + "step": 1068 + }, + { + "epoch": 1.075249889985541, + "grad_norm": 18.39743693476391, + "learning_rate": 8.723451822043371e-06, + "loss": 2.243, + "step": 1069 + }, + { + "epoch": 1.0762557364682215, + "grad_norm": 17.51753883567021, + "learning_rate": 8.72121618600492e-06, + "loss": 2.257, + "step": 1070 + }, + { + "epoch": 1.077261582950902, + "grad_norm": 19.177939170772685, + "learning_rate": 8.718980549966466e-06, + "loss": 2.3098, + "step": 1071 + }, + { + "epoch": 1.0782674294335828, + "grad_norm": 17.40174902530237, + "learning_rate": 8.716744913928014e-06, + "loss": 2.2857, + "step": 1072 + }, + { + "epoch": 1.0792732759162633, + "grad_norm": 20.080758725973578, + "learning_rate": 8.71450927788956e-06, + "loss": 2.2396, + "step": 1073 + }, + { + "epoch": 1.0802791223989439, + "grad_norm": 19.41196881225004, + "learning_rate": 8.712273641851107e-06, + "loss": 2.2848, + "step": 1074 + }, + { + "epoch": 1.0812849688816244, + "grad_norm": 18.2994680792408, + "learning_rate": 8.710038005812654e-06, + "loss": 2.2964, + "step": 1075 + }, + { + "epoch": 1.082290815364305, + "grad_norm": 17.693788846108696, + "learning_rate": 8.707802369774202e-06, + "loss": 2.2845, + "step": 1076 + }, + { + "epoch": 1.0832966618469857, + "grad_norm": 17.568965785873733, + "learning_rate": 8.705566733735749e-06, + "loss": 2.3162, + "step": 1077 + }, + { + "epoch": 1.0843025083296662, + "grad_norm": 17.32480439026697, + "learning_rate": 8.703331097697295e-06, + "loss": 2.2887, + "step": 1078 + }, + { + "epoch": 1.0853083548123468, + "grad_norm": 17.40755024517668, + "learning_rate": 8.701095461658843e-06, + "loss": 2.2616, + "step": 1079 + }, + { + "epoch": 1.0863142012950273, + "grad_norm": 17.507757383473695, + "learning_rate": 8.69885982562039e-06, + "loss": 2.2929, + "step": 1080 + }, + { + "epoch": 1.0873200477777079, + "grad_norm": 16.714998883542382, + "learning_rate": 8.696624189581936e-06, + "loss": 2.3106, + "step": 1081 + }, + { + "epoch": 1.0883258942603886, + "grad_norm": 17.82449763559636, + "learning_rate": 8.694388553543483e-06, + "loss": 2.3194, + "step": 1082 + }, + { + "epoch": 1.0893317407430692, + "grad_norm": 17.767082137079235, + "learning_rate": 8.692152917505031e-06, + "loss": 2.2838, + "step": 1083 + }, + { + "epoch": 1.0903375872257497, + "grad_norm": 18.537351379451447, + "learning_rate": 8.689917281466578e-06, + "loss": 2.2932, + "step": 1084 + }, + { + "epoch": 1.0913434337084302, + "grad_norm": 22.891169562818433, + "learning_rate": 8.687681645428126e-06, + "loss": 2.2773, + "step": 1085 + }, + { + "epoch": 1.0923492801911108, + "grad_norm": 17.011404027669325, + "learning_rate": 8.685446009389673e-06, + "loss": 2.2595, + "step": 1086 + }, + { + "epoch": 1.0933551266737913, + "grad_norm": 18.034931522205362, + "learning_rate": 8.68321037335122e-06, + "loss": 2.2591, + "step": 1087 + }, + { + "epoch": 1.094360973156472, + "grad_norm": 20.060986740107193, + "learning_rate": 8.680974737312766e-06, + "loss": 2.3045, + "step": 1088 + }, + { + "epoch": 1.0953668196391526, + "grad_norm": 18.99277712623433, + "learning_rate": 8.678739101274312e-06, + "loss": 2.2483, + "step": 1089 + }, + { + "epoch": 1.0963726661218332, + "grad_norm": 19.532669255280176, + "learning_rate": 8.67650346523586e-06, + "loss": 2.3214, + "step": 1090 + }, + { + "epoch": 1.0973785126045137, + "grad_norm": 17.28354302513203, + "learning_rate": 8.674267829197407e-06, + "loss": 2.2861, + "step": 1091 + }, + { + "epoch": 1.0983843590871942, + "grad_norm": 18.82097945985951, + "learning_rate": 8.672032193158955e-06, + "loss": 2.2928, + "step": 1092 + }, + { + "epoch": 1.099390205569875, + "grad_norm": 19.849603926806605, + "learning_rate": 8.669796557120502e-06, + "loss": 2.2887, + "step": 1093 + }, + { + "epoch": 1.1003960520525555, + "grad_norm": 17.784606582580825, + "learning_rate": 8.667560921082049e-06, + "loss": 2.2639, + "step": 1094 + }, + { + "epoch": 1.101401898535236, + "grad_norm": 23.615341983574755, + "learning_rate": 8.665325285043595e-06, + "loss": 2.2542, + "step": 1095 + }, + { + "epoch": 1.1024077450179166, + "grad_norm": 24.102960822027146, + "learning_rate": 8.663089649005143e-06, + "loss": 2.282, + "step": 1096 + }, + { + "epoch": 1.1034135915005971, + "grad_norm": 19.590423010500995, + "learning_rate": 8.66085401296669e-06, + "loss": 2.2569, + "step": 1097 + }, + { + "epoch": 1.1044194379832777, + "grad_norm": 19.718244556833877, + "learning_rate": 8.658618376928236e-06, + "loss": 2.3158, + "step": 1098 + }, + { + "epoch": 1.1054252844659584, + "grad_norm": 20.811154993006742, + "learning_rate": 8.656382740889783e-06, + "loss": 2.2856, + "step": 1099 + }, + { + "epoch": 1.106431130948639, + "grad_norm": 20.793341346867432, + "learning_rate": 8.65414710485133e-06, + "loss": 2.3054, + "step": 1100 + }, + { + "epoch": 1.1074369774313195, + "grad_norm": 15.838546783941315, + "learning_rate": 8.651911468812878e-06, + "loss": 2.2624, + "step": 1101 + }, + { + "epoch": 1.108442823914, + "grad_norm": 17.47078271491607, + "learning_rate": 8.649675832774424e-06, + "loss": 2.3208, + "step": 1102 + }, + { + "epoch": 1.1094486703966806, + "grad_norm": 18.48848617655423, + "learning_rate": 8.647440196735973e-06, + "loss": 2.3271, + "step": 1103 + }, + { + "epoch": 1.1104545168793614, + "grad_norm": 17.91893806068612, + "learning_rate": 8.64520456069752e-06, + "loss": 2.3012, + "step": 1104 + }, + { + "epoch": 1.111460363362042, + "grad_norm": 17.987110558744828, + "learning_rate": 8.642968924659066e-06, + "loss": 2.2325, + "step": 1105 + }, + { + "epoch": 1.1124662098447224, + "grad_norm": 20.347612749596323, + "learning_rate": 8.640733288620612e-06, + "loss": 2.2983, + "step": 1106 + }, + { + "epoch": 1.113472056327403, + "grad_norm": 16.31907596745269, + "learning_rate": 8.63849765258216e-06, + "loss": 2.2954, + "step": 1107 + }, + { + "epoch": 1.1144779028100835, + "grad_norm": 22.37021243709992, + "learning_rate": 8.636262016543707e-06, + "loss": 2.3291, + "step": 1108 + }, + { + "epoch": 1.1154837492927643, + "grad_norm": 19.44131371349581, + "learning_rate": 8.634026380505255e-06, + "loss": 2.286, + "step": 1109 + }, + { + "epoch": 1.1164895957754448, + "grad_norm": 19.948588951628864, + "learning_rate": 8.631790744466802e-06, + "loss": 2.3175, + "step": 1110 + }, + { + "epoch": 1.1174954422581254, + "grad_norm": 21.1765882245939, + "learning_rate": 8.629555108428349e-06, + "loss": 2.2921, + "step": 1111 + }, + { + "epoch": 1.118501288740806, + "grad_norm": 21.7341138660393, + "learning_rate": 8.627319472389895e-06, + "loss": 2.2662, + "step": 1112 + }, + { + "epoch": 1.1195071352234864, + "grad_norm": 22.332431460942793, + "learning_rate": 8.625083836351442e-06, + "loss": 2.3016, + "step": 1113 + }, + { + "epoch": 1.1205129817061672, + "grad_norm": 18.438975206999558, + "learning_rate": 8.62284820031299e-06, + "loss": 2.2919, + "step": 1114 + }, + { + "epoch": 1.1215188281888477, + "grad_norm": 23.38811510430387, + "learning_rate": 8.620612564274536e-06, + "loss": 2.2608, + "step": 1115 + }, + { + "epoch": 1.1225246746715283, + "grad_norm": 16.69490854261764, + "learning_rate": 8.618376928236085e-06, + "loss": 2.3022, + "step": 1116 + }, + { + "epoch": 1.1235305211542088, + "grad_norm": 21.455740370412343, + "learning_rate": 8.616141292197631e-06, + "loss": 2.2514, + "step": 1117 + }, + { + "epoch": 1.1245363676368894, + "grad_norm": 19.19448951730175, + "learning_rate": 8.613905656159178e-06, + "loss": 2.2973, + "step": 1118 + }, + { + "epoch": 1.12554221411957, + "grad_norm": 17.753641330646218, + "learning_rate": 8.611670020120724e-06, + "loss": 2.2801, + "step": 1119 + }, + { + "epoch": 1.1265480606022507, + "grad_norm": 17.94463995944704, + "learning_rate": 8.609434384082273e-06, + "loss": 2.3101, + "step": 1120 + }, + { + "epoch": 1.1275539070849312, + "grad_norm": 19.05898650124038, + "learning_rate": 8.60719874804382e-06, + "loss": 2.2527, + "step": 1121 + }, + { + "epoch": 1.1285597535676117, + "grad_norm": 22.476374958104035, + "learning_rate": 8.604963112005367e-06, + "loss": 2.2742, + "step": 1122 + }, + { + "epoch": 1.1295656000502923, + "grad_norm": 21.12202310587608, + "learning_rate": 8.602727475966914e-06, + "loss": 2.2671, + "step": 1123 + }, + { + "epoch": 1.1305714465329728, + "grad_norm": 20.158353323317353, + "learning_rate": 8.60049183992846e-06, + "loss": 2.289, + "step": 1124 + }, + { + "epoch": 1.1315772930156536, + "grad_norm": 20.297204120696264, + "learning_rate": 8.598256203890007e-06, + "loss": 2.3296, + "step": 1125 + }, + { + "epoch": 1.1325831394983341, + "grad_norm": 20.087346285671085, + "learning_rate": 8.596020567851554e-06, + "loss": 2.3248, + "step": 1126 + }, + { + "epoch": 1.1335889859810147, + "grad_norm": 19.115449262812746, + "learning_rate": 8.593784931813102e-06, + "loss": 2.3214, + "step": 1127 + }, + { + "epoch": 1.1345948324636952, + "grad_norm": 23.077884519254027, + "learning_rate": 8.591549295774648e-06, + "loss": 2.2687, + "step": 1128 + }, + { + "epoch": 1.1356006789463757, + "grad_norm": 17.696502624622784, + "learning_rate": 8.589313659736197e-06, + "loss": 2.2715, + "step": 1129 + }, + { + "epoch": 1.1366065254290563, + "grad_norm": 21.595594846158107, + "learning_rate": 8.587078023697742e-06, + "loss": 2.309, + "step": 1130 + }, + { + "epoch": 1.137612371911737, + "grad_norm": 17.836107574605045, + "learning_rate": 8.58484238765929e-06, + "loss": 2.2824, + "step": 1131 + }, + { + "epoch": 1.1386182183944176, + "grad_norm": 17.17116187740332, + "learning_rate": 8.582606751620836e-06, + "loss": 2.2881, + "step": 1132 + }, + { + "epoch": 1.139624064877098, + "grad_norm": 20.83470927331054, + "learning_rate": 8.580371115582385e-06, + "loss": 2.2674, + "step": 1133 + }, + { + "epoch": 1.1406299113597786, + "grad_norm": 17.37662446021717, + "learning_rate": 8.578135479543931e-06, + "loss": 2.407, + "step": 1134 + }, + { + "epoch": 1.1416357578424594, + "grad_norm": 18.80680126914326, + "learning_rate": 8.575899843505478e-06, + "loss": 2.3063, + "step": 1135 + }, + { + "epoch": 1.14264160432514, + "grad_norm": 24.64418636365614, + "learning_rate": 8.573664207467024e-06, + "loss": 2.3152, + "step": 1136 + }, + { + "epoch": 1.1436474508078205, + "grad_norm": 22.46054687463966, + "learning_rate": 8.571428571428571e-06, + "loss": 2.2916, + "step": 1137 + }, + { + "epoch": 1.144653297290501, + "grad_norm": 18.10782918295246, + "learning_rate": 8.56919293539012e-06, + "loss": 2.3343, + "step": 1138 + }, + { + "epoch": 1.1456591437731816, + "grad_norm": 21.905910472556673, + "learning_rate": 8.566957299351666e-06, + "loss": 2.335, + "step": 1139 + }, + { + "epoch": 1.146664990255862, + "grad_norm": 18.97800330506959, + "learning_rate": 8.564721663313214e-06, + "loss": 2.2896, + "step": 1140 + }, + { + "epoch": 1.1476708367385429, + "grad_norm": 17.11716507880201, + "learning_rate": 8.56248602727476e-06, + "loss": 2.3289, + "step": 1141 + }, + { + "epoch": 1.1486766832212234, + "grad_norm": 19.833371608515453, + "learning_rate": 8.560250391236307e-06, + "loss": 2.2488, + "step": 1142 + }, + { + "epoch": 1.149682529703904, + "grad_norm": 18.23043329147317, + "learning_rate": 8.558014755197854e-06, + "loss": 2.3525, + "step": 1143 + }, + { + "epoch": 1.1506883761865845, + "grad_norm": 18.5255330515471, + "learning_rate": 8.555779119159402e-06, + "loss": 2.2927, + "step": 1144 + }, + { + "epoch": 1.151694222669265, + "grad_norm": 17.61637600476979, + "learning_rate": 8.553543483120948e-06, + "loss": 2.2995, + "step": 1145 + }, + { + "epoch": 1.1527000691519458, + "grad_norm": 20.086175588883503, + "learning_rate": 8.551307847082497e-06, + "loss": 2.284, + "step": 1146 + }, + { + "epoch": 1.1537059156346263, + "grad_norm": 18.77432452018519, + "learning_rate": 8.549072211044043e-06, + "loss": 2.3082, + "step": 1147 + }, + { + "epoch": 1.1547117621173069, + "grad_norm": 17.008905022341153, + "learning_rate": 8.54683657500559e-06, + "loss": 2.2956, + "step": 1148 + }, + { + "epoch": 1.1557176085999874, + "grad_norm": 19.103735106985567, + "learning_rate": 8.544600938967136e-06, + "loss": 2.3223, + "step": 1149 + }, + { + "epoch": 1.156723455082668, + "grad_norm": 16.68086748430005, + "learning_rate": 8.542365302928683e-06, + "loss": 2.2468, + "step": 1150 + }, + { + "epoch": 1.1577293015653485, + "grad_norm": 16.688314349646213, + "learning_rate": 8.540129666890231e-06, + "loss": 2.2702, + "step": 1151 + }, + { + "epoch": 1.1587351480480292, + "grad_norm": 19.553630980497505, + "learning_rate": 8.537894030851778e-06, + "loss": 2.3577, + "step": 1152 + }, + { + "epoch": 1.1597409945307098, + "grad_norm": 17.578996133346863, + "learning_rate": 8.535658394813326e-06, + "loss": 2.3081, + "step": 1153 + }, + { + "epoch": 1.1607468410133903, + "grad_norm": 15.714436666924248, + "learning_rate": 8.533422758774873e-06, + "loss": 2.3318, + "step": 1154 + }, + { + "epoch": 1.1617526874960709, + "grad_norm": 19.104515106493235, + "learning_rate": 8.53118712273642e-06, + "loss": 2.2628, + "step": 1155 + }, + { + "epoch": 1.1627585339787514, + "grad_norm": 17.380121847851086, + "learning_rate": 8.528951486697966e-06, + "loss": 2.3115, + "step": 1156 + }, + { + "epoch": 1.1637643804614322, + "grad_norm": 19.36741749113208, + "learning_rate": 8.526715850659514e-06, + "loss": 2.3205, + "step": 1157 + }, + { + "epoch": 1.1647702269441127, + "grad_norm": 17.21266356569488, + "learning_rate": 8.52448021462106e-06, + "loss": 2.2991, + "step": 1158 + }, + { + "epoch": 1.1657760734267932, + "grad_norm": 17.08889702038637, + "learning_rate": 8.522244578582607e-06, + "loss": 2.2846, + "step": 1159 + }, + { + "epoch": 1.1667819199094738, + "grad_norm": 18.747684738568807, + "learning_rate": 8.520008942544155e-06, + "loss": 2.2952, + "step": 1160 + }, + { + "epoch": 1.1677877663921543, + "grad_norm": 16.235071815437617, + "learning_rate": 8.517773306505702e-06, + "loss": 2.2974, + "step": 1161 + }, + { + "epoch": 1.168793612874835, + "grad_norm": 16.03374356738149, + "learning_rate": 8.515537670467248e-06, + "loss": 2.3409, + "step": 1162 + }, + { + "epoch": 1.1697994593575156, + "grad_norm": 17.165622436072415, + "learning_rate": 8.513302034428795e-06, + "loss": 2.3109, + "step": 1163 + }, + { + "epoch": 1.1708053058401962, + "grad_norm": 16.091528197994307, + "learning_rate": 8.511066398390343e-06, + "loss": 2.2301, + "step": 1164 + }, + { + "epoch": 1.1718111523228767, + "grad_norm": 18.69712187447156, + "learning_rate": 8.50883076235189e-06, + "loss": 2.2939, + "step": 1165 + }, + { + "epoch": 1.1728169988055572, + "grad_norm": 18.730608245623188, + "learning_rate": 8.506595126313436e-06, + "loss": 2.3378, + "step": 1166 + }, + { + "epoch": 1.173822845288238, + "grad_norm": 17.226123777170987, + "learning_rate": 8.504359490274983e-06, + "loss": 2.27, + "step": 1167 + }, + { + "epoch": 1.1748286917709185, + "grad_norm": 17.729146331886696, + "learning_rate": 8.502123854236531e-06, + "loss": 2.3039, + "step": 1168 + }, + { + "epoch": 1.175834538253599, + "grad_norm": 20.300657659255517, + "learning_rate": 8.499888218198078e-06, + "loss": 2.2737, + "step": 1169 + }, + { + "epoch": 1.1768403847362796, + "grad_norm": 17.202357183552945, + "learning_rate": 8.497652582159626e-06, + "loss": 2.2941, + "step": 1170 + }, + { + "epoch": 1.1778462312189601, + "grad_norm": 18.644941015288847, + "learning_rate": 8.495416946121173e-06, + "loss": 2.2561, + "step": 1171 + }, + { + "epoch": 1.1788520777016407, + "grad_norm": 21.554516823425732, + "learning_rate": 8.493181310082719e-06, + "loss": 2.3076, + "step": 1172 + }, + { + "epoch": 1.1798579241843214, + "grad_norm": 21.910009042782725, + "learning_rate": 8.490945674044266e-06, + "loss": 2.2471, + "step": 1173 + }, + { + "epoch": 1.180863770667002, + "grad_norm": 19.125671110079075, + "learning_rate": 8.488710038005812e-06, + "loss": 2.2931, + "step": 1174 + }, + { + "epoch": 1.1818696171496825, + "grad_norm": 21.123339715589903, + "learning_rate": 8.48647440196736e-06, + "loss": 2.2847, + "step": 1175 + }, + { + "epoch": 1.182875463632363, + "grad_norm": 20.86724409603151, + "learning_rate": 8.484238765928907e-06, + "loss": 2.2892, + "step": 1176 + }, + { + "epoch": 1.1838813101150436, + "grad_norm": 17.136272592006435, + "learning_rate": 8.482003129890455e-06, + "loss": 2.2977, + "step": 1177 + }, + { + "epoch": 1.1848871565977244, + "grad_norm": 22.212885659542902, + "learning_rate": 8.479767493852002e-06, + "loss": 2.2907, + "step": 1178 + }, + { + "epoch": 1.185893003080405, + "grad_norm": 20.58646373096641, + "learning_rate": 8.477531857813548e-06, + "loss": 2.3163, + "step": 1179 + }, + { + "epoch": 1.1868988495630854, + "grad_norm": 17.647532371256542, + "learning_rate": 8.475296221775095e-06, + "loss": 2.3357, + "step": 1180 + }, + { + "epoch": 1.187904696045766, + "grad_norm": 20.60417039306096, + "learning_rate": 8.473060585736643e-06, + "loss": 2.294, + "step": 1181 + }, + { + "epoch": 1.1889105425284465, + "grad_norm": 19.29806031230101, + "learning_rate": 8.47082494969819e-06, + "loss": 2.2862, + "step": 1182 + }, + { + "epoch": 1.189916389011127, + "grad_norm": 16.151911685974117, + "learning_rate": 8.468589313659736e-06, + "loss": 2.2804, + "step": 1183 + }, + { + "epoch": 1.1909222354938078, + "grad_norm": 21.45097022316625, + "learning_rate": 8.466353677621285e-06, + "loss": 2.2598, + "step": 1184 + }, + { + "epoch": 1.1919280819764884, + "grad_norm": 19.395288120502105, + "learning_rate": 8.464118041582831e-06, + "loss": 2.2408, + "step": 1185 + }, + { + "epoch": 1.192933928459169, + "grad_norm": 15.936274586244203, + "learning_rate": 8.461882405544378e-06, + "loss": 2.2585, + "step": 1186 + }, + { + "epoch": 1.1939397749418494, + "grad_norm": 19.374911345804087, + "learning_rate": 8.459646769505924e-06, + "loss": 2.3039, + "step": 1187 + }, + { + "epoch": 1.1949456214245302, + "grad_norm": 19.076779151255614, + "learning_rate": 8.457411133467473e-06, + "loss": 2.2606, + "step": 1188 + }, + { + "epoch": 1.1959514679072107, + "grad_norm": 17.225837525556503, + "learning_rate": 8.455175497429019e-06, + "loss": 2.2772, + "step": 1189 + }, + { + "epoch": 1.1969573143898913, + "grad_norm": 19.275586433570965, + "learning_rate": 8.452939861390567e-06, + "loss": 2.2688, + "step": 1190 + }, + { + "epoch": 1.1979631608725718, + "grad_norm": 17.44383544746351, + "learning_rate": 8.450704225352114e-06, + "loss": 2.3338, + "step": 1191 + }, + { + "epoch": 1.1989690073552524, + "grad_norm": 17.17032187743969, + "learning_rate": 8.44846858931366e-06, + "loss": 2.3047, + "step": 1192 + }, + { + "epoch": 1.199974853837933, + "grad_norm": 19.481154732160245, + "learning_rate": 8.446232953275207e-06, + "loss": 2.2649, + "step": 1193 + }, + { + "epoch": 1.2009807003206137, + "grad_norm": 16.256685811166456, + "learning_rate": 8.443997317236755e-06, + "loss": 2.271, + "step": 1194 + }, + { + "epoch": 1.2019865468032942, + "grad_norm": 20.98511432429012, + "learning_rate": 8.441761681198302e-06, + "loss": 2.3088, + "step": 1195 + }, + { + "epoch": 1.2029923932859747, + "grad_norm": 18.56499615168342, + "learning_rate": 8.439526045159848e-06, + "loss": 2.2783, + "step": 1196 + }, + { + "epoch": 1.2039982397686553, + "grad_norm": 18.22411495162368, + "learning_rate": 8.437290409121397e-06, + "loss": 2.3375, + "step": 1197 + }, + { + "epoch": 1.2050040862513358, + "grad_norm": 17.292427441101047, + "learning_rate": 8.435054773082943e-06, + "loss": 2.2597, + "step": 1198 + }, + { + "epoch": 1.2060099327340166, + "grad_norm": 16.76514795076355, + "learning_rate": 8.43281913704449e-06, + "loss": 2.3185, + "step": 1199 + }, + { + "epoch": 1.2070157792166971, + "grad_norm": 17.189964690173653, + "learning_rate": 8.430583501006036e-06, + "loss": 2.2584, + "step": 1200 + }, + { + "epoch": 1.2080216256993777, + "grad_norm": 17.145210093973088, + "learning_rate": 8.428347864967585e-06, + "loss": 2.277, + "step": 1201 + }, + { + "epoch": 1.2090274721820582, + "grad_norm": 17.108566452416703, + "learning_rate": 8.426112228929131e-06, + "loss": 2.3085, + "step": 1202 + }, + { + "epoch": 1.2100333186647387, + "grad_norm": 17.953870673763248, + "learning_rate": 8.423876592890678e-06, + "loss": 2.3264, + "step": 1203 + }, + { + "epoch": 1.2110391651474193, + "grad_norm": 16.519478559734274, + "learning_rate": 8.421640956852224e-06, + "loss": 2.2744, + "step": 1204 + }, + { + "epoch": 1.2120450116301, + "grad_norm": 16.011151811895708, + "learning_rate": 8.419405320813773e-06, + "loss": 2.2456, + "step": 1205 + }, + { + "epoch": 1.2130508581127806, + "grad_norm": 18.155498400723467, + "learning_rate": 8.417169684775319e-06, + "loss": 2.3052, + "step": 1206 + }, + { + "epoch": 1.2140567045954611, + "grad_norm": 17.308785337003986, + "learning_rate": 8.414934048736866e-06, + "loss": 2.252, + "step": 1207 + }, + { + "epoch": 1.2150625510781417, + "grad_norm": 18.73502765302484, + "learning_rate": 8.412698412698414e-06, + "loss": 2.3404, + "step": 1208 + }, + { + "epoch": 1.2160683975608222, + "grad_norm": 18.73042774767471, + "learning_rate": 8.41046277665996e-06, + "loss": 2.298, + "step": 1209 + }, + { + "epoch": 1.217074244043503, + "grad_norm": 21.319913888122183, + "learning_rate": 8.408227140621507e-06, + "loss": 2.3036, + "step": 1210 + }, + { + "epoch": 1.2180800905261835, + "grad_norm": 19.28450061796885, + "learning_rate": 8.405991504583054e-06, + "loss": 2.2989, + "step": 1211 + }, + { + "epoch": 1.219085937008864, + "grad_norm": 19.759425495511334, + "learning_rate": 8.403755868544602e-06, + "loss": 2.3084, + "step": 1212 + }, + { + "epoch": 1.2200917834915446, + "grad_norm": 18.4648546273213, + "learning_rate": 8.401520232506148e-06, + "loss": 2.252, + "step": 1213 + }, + { + "epoch": 1.221097629974225, + "grad_norm": 22.537770266805197, + "learning_rate": 8.399284596467697e-06, + "loss": 2.3585, + "step": 1214 + }, + { + "epoch": 1.2221034764569056, + "grad_norm": 19.213301457694612, + "learning_rate": 8.397048960429243e-06, + "loss": 2.3562, + "step": 1215 + }, + { + "epoch": 1.2231093229395864, + "grad_norm": 16.149206566975227, + "learning_rate": 8.39481332439079e-06, + "loss": 2.2679, + "step": 1216 + }, + { + "epoch": 1.224115169422267, + "grad_norm": 18.60654610752786, + "learning_rate": 8.392577688352336e-06, + "loss": 2.3134, + "step": 1217 + }, + { + "epoch": 1.2251210159049475, + "grad_norm": 17.255750821902858, + "learning_rate": 8.390342052313883e-06, + "loss": 2.2861, + "step": 1218 + }, + { + "epoch": 1.226126862387628, + "grad_norm": 19.432852436736177, + "learning_rate": 8.388106416275431e-06, + "loss": 2.3494, + "step": 1219 + }, + { + "epoch": 1.2271327088703088, + "grad_norm": 16.724437009029767, + "learning_rate": 8.385870780236978e-06, + "loss": 2.3124, + "step": 1220 + }, + { + "epoch": 1.2281385553529893, + "grad_norm": 18.018889584155048, + "learning_rate": 8.383635144198526e-06, + "loss": 2.2987, + "step": 1221 + }, + { + "epoch": 1.2291444018356699, + "grad_norm": 19.397694587440313, + "learning_rate": 8.381399508160073e-06, + "loss": 2.3054, + "step": 1222 + }, + { + "epoch": 1.2301502483183504, + "grad_norm": 20.03040184726134, + "learning_rate": 8.379163872121619e-06, + "loss": 2.3023, + "step": 1223 + }, + { + "epoch": 1.231156094801031, + "grad_norm": 16.056037010914075, + "learning_rate": 8.376928236083166e-06, + "loss": 2.305, + "step": 1224 + }, + { + "epoch": 1.2321619412837115, + "grad_norm": 16.191308119121445, + "learning_rate": 8.374692600044714e-06, + "loss": 2.2859, + "step": 1225 + }, + { + "epoch": 1.2331677877663922, + "grad_norm": 17.60184761068015, + "learning_rate": 8.37245696400626e-06, + "loss": 2.2909, + "step": 1226 + }, + { + "epoch": 1.2341736342490728, + "grad_norm": 17.947870646096533, + "learning_rate": 8.370221327967809e-06, + "loss": 2.3172, + "step": 1227 + }, + { + "epoch": 1.2351794807317533, + "grad_norm": 16.50472984871475, + "learning_rate": 8.367985691929355e-06, + "loss": 2.3022, + "step": 1228 + }, + { + "epoch": 1.2361853272144339, + "grad_norm": 19.25983526255059, + "learning_rate": 8.365750055890902e-06, + "loss": 2.3106, + "step": 1229 + }, + { + "epoch": 1.2371911736971144, + "grad_norm": 19.60196250915634, + "learning_rate": 8.363514419852448e-06, + "loss": 2.2812, + "step": 1230 + }, + { + "epoch": 1.2381970201797952, + "grad_norm": 16.96407219723983, + "learning_rate": 8.361278783813995e-06, + "loss": 2.282, + "step": 1231 + }, + { + "epoch": 1.2392028666624757, + "grad_norm": 18.456480602482316, + "learning_rate": 8.359043147775543e-06, + "loss": 2.2883, + "step": 1232 + }, + { + "epoch": 1.2402087131451562, + "grad_norm": 20.712291216636736, + "learning_rate": 8.35680751173709e-06, + "loss": 2.3425, + "step": 1233 + }, + { + "epoch": 1.2412145596278368, + "grad_norm": 17.536161583264953, + "learning_rate": 8.354571875698636e-06, + "loss": 2.3433, + "step": 1234 + }, + { + "epoch": 1.2422204061105173, + "grad_norm": 17.866349249921104, + "learning_rate": 8.352336239660183e-06, + "loss": 2.2708, + "step": 1235 + }, + { + "epoch": 1.2432262525931979, + "grad_norm": 16.304401450446928, + "learning_rate": 8.350100603621731e-06, + "loss": 2.32, + "step": 1236 + }, + { + "epoch": 1.2442320990758786, + "grad_norm": 22.642132927378558, + "learning_rate": 8.347864967583278e-06, + "loss": 2.295, + "step": 1237 + }, + { + "epoch": 1.2452379455585592, + "grad_norm": 18.725267470668236, + "learning_rate": 8.345629331544826e-06, + "loss": 2.2752, + "step": 1238 + }, + { + "epoch": 1.2462437920412397, + "grad_norm": 16.894777761669875, + "learning_rate": 8.343393695506373e-06, + "loss": 2.2637, + "step": 1239 + }, + { + "epoch": 1.2472496385239202, + "grad_norm": 17.51083240524946, + "learning_rate": 8.341158059467919e-06, + "loss": 2.3437, + "step": 1240 + }, + { + "epoch": 1.2482554850066008, + "grad_norm": 16.689242298902055, + "learning_rate": 8.338922423429466e-06, + "loss": 2.275, + "step": 1241 + }, + { + "epoch": 1.2492613314892815, + "grad_norm": 17.72606774782188, + "learning_rate": 8.336686787391012e-06, + "loss": 2.3099, + "step": 1242 + }, + { + "epoch": 1.250267177971962, + "grad_norm": 18.255072739724646, + "learning_rate": 8.33445115135256e-06, + "loss": 2.3232, + "step": 1243 + }, + { + "epoch": 1.2512730244546426, + "grad_norm": 22.898243346681394, + "learning_rate": 8.332215515314107e-06, + "loss": 2.2872, + "step": 1244 + }, + { + "epoch": 1.2522788709373232, + "grad_norm": 17.303411031287375, + "learning_rate": 8.329979879275655e-06, + "loss": 2.3112, + "step": 1245 + }, + { + "epoch": 1.2532847174200037, + "grad_norm": 23.83708370176368, + "learning_rate": 8.327744243237202e-06, + "loss": 2.3457, + "step": 1246 + }, + { + "epoch": 1.2542905639026842, + "grad_norm": 22.44312354316094, + "learning_rate": 8.325508607198748e-06, + "loss": 2.2677, + "step": 1247 + }, + { + "epoch": 1.255296410385365, + "grad_norm": 20.673549066823906, + "learning_rate": 8.323272971160295e-06, + "loss": 2.3068, + "step": 1248 + }, + { + "epoch": 1.2563022568680455, + "grad_norm": 23.828249133774097, + "learning_rate": 8.321037335121843e-06, + "loss": 2.3217, + "step": 1249 + }, + { + "epoch": 1.257308103350726, + "grad_norm": 17.021661742629366, + "learning_rate": 8.31880169908339e-06, + "loss": 2.3293, + "step": 1250 + }, + { + "epoch": 1.2583139498334066, + "grad_norm": 24.068454583708398, + "learning_rate": 8.316566063044938e-06, + "loss": 2.2811, + "step": 1251 + }, + { + "epoch": 1.2593197963160874, + "grad_norm": 19.943471018011905, + "learning_rate": 8.314330427006485e-06, + "loss": 2.2987, + "step": 1252 + }, + { + "epoch": 1.260325642798768, + "grad_norm": 18.75794377417748, + "learning_rate": 8.312094790968031e-06, + "loss": 2.2931, + "step": 1253 + }, + { + "epoch": 1.2613314892814484, + "grad_norm": 24.490075886483915, + "learning_rate": 8.309859154929578e-06, + "loss": 2.2655, + "step": 1254 + }, + { + "epoch": 1.262337335764129, + "grad_norm": 18.731052471036953, + "learning_rate": 8.307623518891124e-06, + "loss": 2.3088, + "step": 1255 + }, + { + "epoch": 1.2633431822468095, + "grad_norm": 18.791512493395082, + "learning_rate": 8.305387882852673e-06, + "loss": 2.2572, + "step": 1256 + }, + { + "epoch": 1.26434902872949, + "grad_norm": 19.889671056381214, + "learning_rate": 8.303152246814219e-06, + "loss": 2.2602, + "step": 1257 + }, + { + "epoch": 1.2653548752121706, + "grad_norm": 21.002808896258866, + "learning_rate": 8.300916610775767e-06, + "loss": 2.335, + "step": 1258 + }, + { + "epoch": 1.2663607216948514, + "grad_norm": 20.225422380521106, + "learning_rate": 8.298680974737314e-06, + "loss": 2.2879, + "step": 1259 + }, + { + "epoch": 1.267366568177532, + "grad_norm": 23.050568206596104, + "learning_rate": 8.29644533869886e-06, + "loss": 2.2804, + "step": 1260 + }, + { + "epoch": 1.2683724146602124, + "grad_norm": 18.593076086047805, + "learning_rate": 8.294209702660407e-06, + "loss": 2.2453, + "step": 1261 + }, + { + "epoch": 1.269378261142893, + "grad_norm": 22.977584836766557, + "learning_rate": 8.291974066621955e-06, + "loss": 2.2711, + "step": 1262 + }, + { + "epoch": 1.2703841076255737, + "grad_norm": 20.608645625472292, + "learning_rate": 8.289738430583502e-06, + "loss": 2.2741, + "step": 1263 + }, + { + "epoch": 1.2713899541082543, + "grad_norm": 18.303856024466825, + "learning_rate": 8.28750279454505e-06, + "loss": 2.3507, + "step": 1264 + }, + { + "epoch": 1.2723958005909348, + "grad_norm": 19.347928838050038, + "learning_rate": 8.285267158506597e-06, + "loss": 2.2831, + "step": 1265 + }, + { + "epoch": 1.2734016470736154, + "grad_norm": 18.219999067252687, + "learning_rate": 8.283031522468143e-06, + "loss": 2.3249, + "step": 1266 + }, + { + "epoch": 1.274407493556296, + "grad_norm": 20.180292897382635, + "learning_rate": 8.28079588642969e-06, + "loss": 2.2624, + "step": 1267 + }, + { + "epoch": 1.2754133400389764, + "grad_norm": 15.374220912213122, + "learning_rate": 8.278560250391236e-06, + "loss": 2.2866, + "step": 1268 + }, + { + "epoch": 1.2764191865216572, + "grad_norm": 23.30607569252281, + "learning_rate": 8.276324614352785e-06, + "loss": 2.2837, + "step": 1269 + }, + { + "epoch": 1.2774250330043377, + "grad_norm": 17.137243897208975, + "learning_rate": 8.274088978314331e-06, + "loss": 2.3242, + "step": 1270 + }, + { + "epoch": 1.2784308794870183, + "grad_norm": 20.389855351910253, + "learning_rate": 8.271853342275878e-06, + "loss": 2.2921, + "step": 1271 + }, + { + "epoch": 1.2794367259696988, + "grad_norm": 20.884450516450137, + "learning_rate": 8.269617706237424e-06, + "loss": 2.3104, + "step": 1272 + }, + { + "epoch": 1.2804425724523796, + "grad_norm": 18.511079724079504, + "learning_rate": 8.267382070198972e-06, + "loss": 2.3011, + "step": 1273 + }, + { + "epoch": 1.2814484189350601, + "grad_norm": 18.1304178064368, + "learning_rate": 8.265146434160519e-06, + "loss": 2.2874, + "step": 1274 + }, + { + "epoch": 1.2824542654177407, + "grad_norm": 19.385152320988567, + "learning_rate": 8.262910798122067e-06, + "loss": 2.291, + "step": 1275 + }, + { + "epoch": 1.2834601119004212, + "grad_norm": 17.1541489649596, + "learning_rate": 8.260675162083614e-06, + "loss": 2.3131, + "step": 1276 + }, + { + "epoch": 1.2844659583831017, + "grad_norm": 18.36182480840744, + "learning_rate": 8.25843952604516e-06, + "loss": 2.2889, + "step": 1277 + }, + { + "epoch": 1.2854718048657823, + "grad_norm": 20.076674814801525, + "learning_rate": 8.256203890006707e-06, + "loss": 2.2674, + "step": 1278 + }, + { + "epoch": 1.2864776513484628, + "grad_norm": 16.417572435734545, + "learning_rate": 8.253968253968254e-06, + "loss": 2.3015, + "step": 1279 + }, + { + "epoch": 1.2874834978311436, + "grad_norm": 17.885444314453963, + "learning_rate": 8.251732617929802e-06, + "loss": 2.2964, + "step": 1280 + }, + { + "epoch": 1.2884893443138241, + "grad_norm": 16.89635113816944, + "learning_rate": 8.249496981891348e-06, + "loss": 2.2685, + "step": 1281 + }, + { + "epoch": 1.2894951907965047, + "grad_norm": 19.13993347028726, + "learning_rate": 8.247261345852897e-06, + "loss": 2.2531, + "step": 1282 + }, + { + "epoch": 1.2905010372791852, + "grad_norm": 19.69722538251037, + "learning_rate": 8.245025709814443e-06, + "loss": 2.3154, + "step": 1283 + }, + { + "epoch": 1.291506883761866, + "grad_norm": 18.408000700271728, + "learning_rate": 8.24279007377599e-06, + "loss": 2.2434, + "step": 1284 + }, + { + "epoch": 1.2925127302445465, + "grad_norm": 16.97299347011639, + "learning_rate": 8.240554437737536e-06, + "loss": 2.2538, + "step": 1285 + }, + { + "epoch": 1.293518576727227, + "grad_norm": 18.751891645294307, + "learning_rate": 8.238318801699085e-06, + "loss": 2.2749, + "step": 1286 + }, + { + "epoch": 1.2945244232099076, + "grad_norm": 18.046756082355557, + "learning_rate": 8.236083165660631e-06, + "loss": 2.2806, + "step": 1287 + }, + { + "epoch": 1.295530269692588, + "grad_norm": 17.456125883815147, + "learning_rate": 8.23384752962218e-06, + "loss": 2.3182, + "step": 1288 + }, + { + "epoch": 1.2965361161752686, + "grad_norm": 16.18488358180136, + "learning_rate": 8.231611893583726e-06, + "loss": 2.2946, + "step": 1289 + }, + { + "epoch": 1.2975419626579494, + "grad_norm": 15.440372250390611, + "learning_rate": 8.229376257545272e-06, + "loss": 2.3041, + "step": 1290 + }, + { + "epoch": 1.29854780914063, + "grad_norm": 20.329005492708532, + "learning_rate": 8.227140621506819e-06, + "loss": 2.3064, + "step": 1291 + }, + { + "epoch": 1.2995536556233105, + "grad_norm": 17.776054130149582, + "learning_rate": 8.224904985468366e-06, + "loss": 2.3201, + "step": 1292 + }, + { + "epoch": 1.300559502105991, + "grad_norm": 16.816280204724453, + "learning_rate": 8.222669349429914e-06, + "loss": 2.3327, + "step": 1293 + }, + { + "epoch": 1.3015653485886718, + "grad_norm": 17.568973501750424, + "learning_rate": 8.22043371339146e-06, + "loss": 2.2911, + "step": 1294 + }, + { + "epoch": 1.3025711950713523, + "grad_norm": 18.696170316078923, + "learning_rate": 8.218198077353009e-06, + "loss": 2.3178, + "step": 1295 + }, + { + "epoch": 1.3035770415540329, + "grad_norm": 17.548421228757203, + "learning_rate": 8.215962441314555e-06, + "loss": 2.2683, + "step": 1296 + }, + { + "epoch": 1.3045828880367134, + "grad_norm": 16.252199195405446, + "learning_rate": 8.213726805276102e-06, + "loss": 2.3371, + "step": 1297 + }, + { + "epoch": 1.305588734519394, + "grad_norm": 20.124397384897307, + "learning_rate": 8.211491169237648e-06, + "loss": 2.3059, + "step": 1298 + }, + { + "epoch": 1.3065945810020745, + "grad_norm": 21.440072268620945, + "learning_rate": 8.209255533199197e-06, + "loss": 2.2513, + "step": 1299 + }, + { + "epoch": 1.307600427484755, + "grad_norm": 17.556298612821745, + "learning_rate": 8.207019897160743e-06, + "loss": 2.3107, + "step": 1300 + }, + { + "epoch": 1.3086062739674358, + "grad_norm": 18.785195273395882, + "learning_rate": 8.20478426112229e-06, + "loss": 2.3165, + "step": 1301 + }, + { + "epoch": 1.3096121204501163, + "grad_norm": 18.79014461793555, + "learning_rate": 8.202548625083836e-06, + "loss": 2.2382, + "step": 1302 + }, + { + "epoch": 1.3106179669327969, + "grad_norm": 15.792332282171454, + "learning_rate": 8.200312989045383e-06, + "loss": 2.3201, + "step": 1303 + }, + { + "epoch": 1.3116238134154774, + "grad_norm": 21.41599796197624, + "learning_rate": 8.198077353006931e-06, + "loss": 2.2499, + "step": 1304 + }, + { + "epoch": 1.3126296598981582, + "grad_norm": 18.563952098598246, + "learning_rate": 8.195841716968478e-06, + "loss": 2.2913, + "step": 1305 + }, + { + "epoch": 1.3136355063808387, + "grad_norm": 18.28162901925981, + "learning_rate": 8.193606080930026e-06, + "loss": 2.3062, + "step": 1306 + }, + { + "epoch": 1.3146413528635192, + "grad_norm": 20.42790763943371, + "learning_rate": 8.191370444891572e-06, + "loss": 2.3028, + "step": 1307 + }, + { + "epoch": 1.3156471993461998, + "grad_norm": 18.648482980230007, + "learning_rate": 8.189134808853119e-06, + "loss": 2.3107, + "step": 1308 + }, + { + "epoch": 1.3166530458288803, + "grad_norm": 17.67553108046889, + "learning_rate": 8.186899172814666e-06, + "loss": 2.3137, + "step": 1309 + }, + { + "epoch": 1.3176588923115609, + "grad_norm": 24.623618881587323, + "learning_rate": 8.184663536776214e-06, + "loss": 2.2853, + "step": 1310 + }, + { + "epoch": 1.3186647387942414, + "grad_norm": 22.602779570132835, + "learning_rate": 8.18242790073776e-06, + "loss": 2.3202, + "step": 1311 + }, + { + "epoch": 1.3196705852769222, + "grad_norm": 19.920055666801737, + "learning_rate": 8.180192264699309e-06, + "loss": 2.2867, + "step": 1312 + }, + { + "epoch": 1.3206764317596027, + "grad_norm": 20.698995378939966, + "learning_rate": 8.177956628660855e-06, + "loss": 2.3015, + "step": 1313 + }, + { + "epoch": 1.3216822782422832, + "grad_norm": 19.49105745285178, + "learning_rate": 8.175720992622402e-06, + "loss": 2.2992, + "step": 1314 + }, + { + "epoch": 1.3226881247249638, + "grad_norm": 19.980718568169483, + "learning_rate": 8.173485356583948e-06, + "loss": 2.2762, + "step": 1315 + }, + { + "epoch": 1.3236939712076445, + "grad_norm": 20.994595344065768, + "learning_rate": 8.171249720545495e-06, + "loss": 2.3046, + "step": 1316 + }, + { + "epoch": 1.324699817690325, + "grad_norm": 18.561189947529662, + "learning_rate": 8.169014084507043e-06, + "loss": 2.2812, + "step": 1317 + }, + { + "epoch": 1.3257056641730056, + "grad_norm": 20.345641276000574, + "learning_rate": 8.16677844846859e-06, + "loss": 2.2895, + "step": 1318 + }, + { + "epoch": 1.3267115106556862, + "grad_norm": 20.8726976500452, + "learning_rate": 8.164542812430138e-06, + "loss": 2.3045, + "step": 1319 + }, + { + "epoch": 1.3277173571383667, + "grad_norm": 17.641123989553606, + "learning_rate": 8.162307176391685e-06, + "loss": 2.2865, + "step": 1320 + }, + { + "epoch": 1.3287232036210472, + "grad_norm": 18.27076606433099, + "learning_rate": 8.160071540353231e-06, + "loss": 2.3093, + "step": 1321 + }, + { + "epoch": 1.329729050103728, + "grad_norm": 19.810653599016497, + "learning_rate": 8.157835904314778e-06, + "loss": 2.2917, + "step": 1322 + }, + { + "epoch": 1.3307348965864085, + "grad_norm": 19.629184686847974, + "learning_rate": 8.155600268276326e-06, + "loss": 2.3546, + "step": 1323 + }, + { + "epoch": 1.331740743069089, + "grad_norm": 21.380275384171682, + "learning_rate": 8.153364632237872e-06, + "loss": 2.2434, + "step": 1324 + }, + { + "epoch": 1.3327465895517696, + "grad_norm": 21.66339792232214, + "learning_rate": 8.151128996199419e-06, + "loss": 2.2751, + "step": 1325 + }, + { + "epoch": 1.3337524360344504, + "grad_norm": 19.021364106448544, + "learning_rate": 8.148893360160967e-06, + "loss": 2.3185, + "step": 1326 + }, + { + "epoch": 1.334758282517131, + "grad_norm": 19.811645988757654, + "learning_rate": 8.146657724122514e-06, + "loss": 2.3236, + "step": 1327 + }, + { + "epoch": 1.3357641289998115, + "grad_norm": 18.294063874329982, + "learning_rate": 8.14442208808406e-06, + "loss": 2.3302, + "step": 1328 + }, + { + "epoch": 1.336769975482492, + "grad_norm": 17.294912142975505, + "learning_rate": 8.142186452045607e-06, + "loss": 2.2985, + "step": 1329 + }, + { + "epoch": 1.3377758219651725, + "grad_norm": 16.126934879929436, + "learning_rate": 8.139950816007155e-06, + "loss": 2.3015, + "step": 1330 + }, + { + "epoch": 1.338781668447853, + "grad_norm": 16.16225564905107, + "learning_rate": 8.137715179968702e-06, + "loss": 2.2449, + "step": 1331 + }, + { + "epoch": 1.3397875149305336, + "grad_norm": 19.658951752756792, + "learning_rate": 8.13547954393025e-06, + "loss": 2.3109, + "step": 1332 + }, + { + "epoch": 1.3407933614132144, + "grad_norm": 19.389529788632824, + "learning_rate": 8.133243907891797e-06, + "loss": 2.3006, + "step": 1333 + }, + { + "epoch": 1.341799207895895, + "grad_norm": 19.107880682491288, + "learning_rate": 8.131008271853343e-06, + "loss": 2.2806, + "step": 1334 + }, + { + "epoch": 1.3428050543785754, + "grad_norm": 18.50200106508036, + "learning_rate": 8.12877263581489e-06, + "loss": 2.2711, + "step": 1335 + }, + { + "epoch": 1.343810900861256, + "grad_norm": 17.17908472204596, + "learning_rate": 8.126536999776436e-06, + "loss": 2.3178, + "step": 1336 + }, + { + "epoch": 1.3448167473439367, + "grad_norm": 19.957877071610238, + "learning_rate": 8.124301363737984e-06, + "loss": 2.2612, + "step": 1337 + }, + { + "epoch": 1.3458225938266173, + "grad_norm": 21.013195814959914, + "learning_rate": 8.122065727699531e-06, + "loss": 2.3239, + "step": 1338 + }, + { + "epoch": 1.3468284403092978, + "grad_norm": 18.71020075051148, + "learning_rate": 8.119830091661078e-06, + "loss": 2.2599, + "step": 1339 + }, + { + "epoch": 1.3478342867919784, + "grad_norm": 19.25262006006824, + "learning_rate": 8.117594455622624e-06, + "loss": 2.2847, + "step": 1340 + }, + { + "epoch": 1.348840133274659, + "grad_norm": 21.55320098193352, + "learning_rate": 8.115358819584172e-06, + "loss": 2.3184, + "step": 1341 + }, + { + "epoch": 1.3498459797573394, + "grad_norm": 18.3535039370996, + "learning_rate": 8.113123183545719e-06, + "loss": 2.3066, + "step": 1342 + }, + { + "epoch": 1.35085182624002, + "grad_norm": 18.416153435479284, + "learning_rate": 8.110887547507267e-06, + "loss": 2.2912, + "step": 1343 + }, + { + "epoch": 1.3518576727227007, + "grad_norm": 15.017770702208837, + "learning_rate": 8.108651911468814e-06, + "loss": 2.332, + "step": 1344 + }, + { + "epoch": 1.3528635192053813, + "grad_norm": 16.926121576064208, + "learning_rate": 8.10641627543036e-06, + "loss": 2.3573, + "step": 1345 + }, + { + "epoch": 1.3538693656880618, + "grad_norm": 17.416574528957202, + "learning_rate": 8.104180639391907e-06, + "loss": 2.2611, + "step": 1346 + }, + { + "epoch": 1.3548752121707424, + "grad_norm": 16.300702078394863, + "learning_rate": 8.101945003353455e-06, + "loss": 2.2578, + "step": 1347 + }, + { + "epoch": 1.3558810586534231, + "grad_norm": 19.977823623903497, + "learning_rate": 8.099709367315002e-06, + "loss": 2.3109, + "step": 1348 + }, + { + "epoch": 1.3568869051361037, + "grad_norm": 18.642195493227934, + "learning_rate": 8.097473731276548e-06, + "loss": 2.2904, + "step": 1349 + }, + { + "epoch": 1.3578927516187842, + "grad_norm": 18.72968581315017, + "learning_rate": 8.095238095238097e-06, + "loss": 2.3308, + "step": 1350 + }, + { + "epoch": 1.3588985981014647, + "grad_norm": 17.103859568421928, + "learning_rate": 8.093002459199643e-06, + "loss": 2.3178, + "step": 1351 + }, + { + "epoch": 1.3599044445841453, + "grad_norm": 17.298230445169914, + "learning_rate": 8.09076682316119e-06, + "loss": 2.2466, + "step": 1352 + }, + { + "epoch": 1.3609102910668258, + "grad_norm": 17.79454611645911, + "learning_rate": 8.088531187122736e-06, + "loss": 2.2856, + "step": 1353 + }, + { + "epoch": 1.3619161375495066, + "grad_norm": 18.984923023468426, + "learning_rate": 8.086295551084284e-06, + "loss": 2.3291, + "step": 1354 + }, + { + "epoch": 1.3629219840321871, + "grad_norm": 17.768320917083503, + "learning_rate": 8.084059915045831e-06, + "loss": 2.3978, + "step": 1355 + }, + { + "epoch": 1.3639278305148677, + "grad_norm": 17.375971136653813, + "learning_rate": 8.08182427900738e-06, + "loss": 2.3041, + "step": 1356 + }, + { + "epoch": 1.3649336769975482, + "grad_norm": 16.302797771846613, + "learning_rate": 8.079588642968926e-06, + "loss": 2.2716, + "step": 1357 + }, + { + "epoch": 1.365939523480229, + "grad_norm": 19.4372043283243, + "learning_rate": 8.077353006930472e-06, + "loss": 2.2259, + "step": 1358 + }, + { + "epoch": 1.3669453699629095, + "grad_norm": 17.519273113608005, + "learning_rate": 8.075117370892019e-06, + "loss": 2.3083, + "step": 1359 + }, + { + "epoch": 1.36795121644559, + "grad_norm": 18.847836982555016, + "learning_rate": 8.072881734853566e-06, + "loss": 2.2929, + "step": 1360 + }, + { + "epoch": 1.3689570629282706, + "grad_norm": 18.80960331014159, + "learning_rate": 8.070646098815114e-06, + "loss": 2.344, + "step": 1361 + }, + { + "epoch": 1.3699629094109511, + "grad_norm": 18.54062907853193, + "learning_rate": 8.06841046277666e-06, + "loss": 2.291, + "step": 1362 + }, + { + "epoch": 1.3709687558936317, + "grad_norm": 21.020876962064065, + "learning_rate": 8.066174826738209e-06, + "loss": 2.2943, + "step": 1363 + }, + { + "epoch": 1.3719746023763122, + "grad_norm": 19.634452718735282, + "learning_rate": 8.063939190699755e-06, + "loss": 2.3096, + "step": 1364 + }, + { + "epoch": 1.372980448858993, + "grad_norm": 23.75241532805294, + "learning_rate": 8.061703554661302e-06, + "loss": 2.2756, + "step": 1365 + }, + { + "epoch": 1.3739862953416735, + "grad_norm": 23.184088534634178, + "learning_rate": 8.059467918622848e-06, + "loss": 2.3122, + "step": 1366 + }, + { + "epoch": 1.374992141824354, + "grad_norm": 21.683222338914778, + "learning_rate": 8.057232282584397e-06, + "loss": 2.2555, + "step": 1367 + }, + { + "epoch": 1.3759979883070346, + "grad_norm": 17.538940009868362, + "learning_rate": 8.054996646545943e-06, + "loss": 2.2821, + "step": 1368 + }, + { + "epoch": 1.3770038347897153, + "grad_norm": 20.589385488446872, + "learning_rate": 8.05276101050749e-06, + "loss": 2.3005, + "step": 1369 + }, + { + "epoch": 1.3780096812723959, + "grad_norm": 18.228248922770465, + "learning_rate": 8.050525374469036e-06, + "loss": 2.2467, + "step": 1370 + }, + { + "epoch": 1.3790155277550764, + "grad_norm": 18.150099944925707, + "learning_rate": 8.048289738430584e-06, + "loss": 2.3107, + "step": 1371 + }, + { + "epoch": 1.380021374237757, + "grad_norm": 17.600424138374375, + "learning_rate": 8.046054102392131e-06, + "loss": 2.2301, + "step": 1372 + }, + { + "epoch": 1.3810272207204375, + "grad_norm": 19.973595794510803, + "learning_rate": 8.043818466353678e-06, + "loss": 2.326, + "step": 1373 + }, + { + "epoch": 1.382033067203118, + "grad_norm": 18.030406121961228, + "learning_rate": 8.041582830315226e-06, + "loss": 2.299, + "step": 1374 + }, + { + "epoch": 1.3830389136857988, + "grad_norm": 17.130081737007906, + "learning_rate": 8.039347194276772e-06, + "loss": 2.2898, + "step": 1375 + }, + { + "epoch": 1.3840447601684793, + "grad_norm": 16.71810323163366, + "learning_rate": 8.037111558238319e-06, + "loss": 2.2864, + "step": 1376 + }, + { + "epoch": 1.3850506066511599, + "grad_norm": 16.76624558405387, + "learning_rate": 8.034875922199866e-06, + "loss": 2.2581, + "step": 1377 + }, + { + "epoch": 1.3860564531338404, + "grad_norm": 16.552359135070123, + "learning_rate": 8.032640286161414e-06, + "loss": 2.2802, + "step": 1378 + }, + { + "epoch": 1.3870622996165212, + "grad_norm": 19.929742012690642, + "learning_rate": 8.03040465012296e-06, + "loss": 2.2913, + "step": 1379 + }, + { + "epoch": 1.3880681460992017, + "grad_norm": 17.513117588876412, + "learning_rate": 8.028169014084509e-06, + "loss": 2.2613, + "step": 1380 + }, + { + "epoch": 1.3890739925818822, + "grad_norm": 15.22832720058821, + "learning_rate": 8.025933378046055e-06, + "loss": 2.2937, + "step": 1381 + }, + { + "epoch": 1.3900798390645628, + "grad_norm": 17.218042427984003, + "learning_rate": 8.023697742007602e-06, + "loss": 2.2741, + "step": 1382 + }, + { + "epoch": 1.3910856855472433, + "grad_norm": 16.58871727809983, + "learning_rate": 8.021462105969148e-06, + "loss": 2.3226, + "step": 1383 + }, + { + "epoch": 1.3920915320299239, + "grad_norm": 16.06444662928072, + "learning_rate": 8.019226469930695e-06, + "loss": 2.3746, + "step": 1384 + }, + { + "epoch": 1.3930973785126044, + "grad_norm": 17.70240118364787, + "learning_rate": 8.016990833892243e-06, + "loss": 2.28, + "step": 1385 + }, + { + "epoch": 1.3941032249952852, + "grad_norm": 19.892962329697724, + "learning_rate": 8.01475519785379e-06, + "loss": 2.2277, + "step": 1386 + }, + { + "epoch": 1.3951090714779657, + "grad_norm": 17.631779390893282, + "learning_rate": 8.012519561815338e-06, + "loss": 2.2761, + "step": 1387 + }, + { + "epoch": 1.3961149179606462, + "grad_norm": 16.853865251692838, + "learning_rate": 8.010283925776884e-06, + "loss": 2.3478, + "step": 1388 + }, + { + "epoch": 1.3971207644433268, + "grad_norm": 17.66032083738247, + "learning_rate": 8.008048289738431e-06, + "loss": 2.2631, + "step": 1389 + }, + { + "epoch": 1.3981266109260075, + "grad_norm": 17.17318060438319, + "learning_rate": 8.005812653699978e-06, + "loss": 2.3152, + "step": 1390 + }, + { + "epoch": 1.399132457408688, + "grad_norm": 18.074483339002846, + "learning_rate": 8.003577017661526e-06, + "loss": 2.3181, + "step": 1391 + }, + { + "epoch": 1.4001383038913686, + "grad_norm": 20.343979364622978, + "learning_rate": 8.001341381623072e-06, + "loss": 2.3134, + "step": 1392 + }, + { + "epoch": 1.4011441503740492, + "grad_norm": 18.554688640383343, + "learning_rate": 7.99910574558462e-06, + "loss": 2.2422, + "step": 1393 + }, + { + "epoch": 1.4021499968567297, + "grad_norm": 16.381818752704735, + "learning_rate": 7.996870109546167e-06, + "loss": 2.2659, + "step": 1394 + }, + { + "epoch": 1.4031558433394102, + "grad_norm": 17.031748772581285, + "learning_rate": 7.994634473507714e-06, + "loss": 2.2616, + "step": 1395 + }, + { + "epoch": 1.4041616898220908, + "grad_norm": 16.023918356128732, + "learning_rate": 7.99239883746926e-06, + "loss": 2.2754, + "step": 1396 + }, + { + "epoch": 1.4051675363047715, + "grad_norm": 16.43655076009244, + "learning_rate": 7.990163201430807e-06, + "loss": 2.2738, + "step": 1397 + }, + { + "epoch": 1.406173382787452, + "grad_norm": 18.001464549575672, + "learning_rate": 7.987927565392355e-06, + "loss": 2.2448, + "step": 1398 + }, + { + "epoch": 1.4071792292701326, + "grad_norm": 15.548429987816085, + "learning_rate": 7.985691929353902e-06, + "loss": 2.2493, + "step": 1399 + }, + { + "epoch": 1.4081850757528132, + "grad_norm": 16.789719108176232, + "learning_rate": 7.98345629331545e-06, + "loss": 2.2504, + "step": 1400 + }, + { + "epoch": 1.409190922235494, + "grad_norm": 17.761135848676386, + "learning_rate": 7.981220657276996e-06, + "loss": 2.3021, + "step": 1401 + }, + { + "epoch": 1.4101967687181745, + "grad_norm": 16.397080182416595, + "learning_rate": 7.978985021238543e-06, + "loss": 2.2884, + "step": 1402 + }, + { + "epoch": 1.411202615200855, + "grad_norm": 21.89243360012097, + "learning_rate": 7.97674938520009e-06, + "loss": 2.2874, + "step": 1403 + }, + { + "epoch": 1.4122084616835355, + "grad_norm": 18.531696638767684, + "learning_rate": 7.974513749161638e-06, + "loss": 2.2476, + "step": 1404 + }, + { + "epoch": 1.413214308166216, + "grad_norm": 18.655323089753995, + "learning_rate": 7.972278113123184e-06, + "loss": 2.2807, + "step": 1405 + }, + { + "epoch": 1.4142201546488966, + "grad_norm": 19.63140503292154, + "learning_rate": 7.970042477084731e-06, + "loss": 2.3598, + "step": 1406 + }, + { + "epoch": 1.4152260011315774, + "grad_norm": 17.511016093681576, + "learning_rate": 7.967806841046278e-06, + "loss": 2.2915, + "step": 1407 + }, + { + "epoch": 1.416231847614258, + "grad_norm": 17.705733739636496, + "learning_rate": 7.965571205007824e-06, + "loss": 2.2991, + "step": 1408 + }, + { + "epoch": 1.4172376940969384, + "grad_norm": 17.07262553982362, + "learning_rate": 7.963335568969372e-06, + "loss": 2.2565, + "step": 1409 + }, + { + "epoch": 1.418243540579619, + "grad_norm": 21.707460747915643, + "learning_rate": 7.961099932930919e-06, + "loss": 2.2577, + "step": 1410 + }, + { + "epoch": 1.4192493870622997, + "grad_norm": 20.173807801461937, + "learning_rate": 7.958864296892467e-06, + "loss": 2.272, + "step": 1411 + }, + { + "epoch": 1.4202552335449803, + "grad_norm": 16.95295828244757, + "learning_rate": 7.956628660854014e-06, + "loss": 2.2635, + "step": 1412 + }, + { + "epoch": 1.4212610800276608, + "grad_norm": 23.519098966208954, + "learning_rate": 7.95439302481556e-06, + "loss": 2.3345, + "step": 1413 + }, + { + "epoch": 1.4222669265103414, + "grad_norm": 24.33680951481455, + "learning_rate": 7.952157388777107e-06, + "loss": 2.2901, + "step": 1414 + }, + { + "epoch": 1.423272772993022, + "grad_norm": 19.232446660438725, + "learning_rate": 7.949921752738655e-06, + "loss": 2.2884, + "step": 1415 + }, + { + "epoch": 1.4242786194757024, + "grad_norm": 22.943036836976418, + "learning_rate": 7.947686116700202e-06, + "loss": 2.3096, + "step": 1416 + }, + { + "epoch": 1.425284465958383, + "grad_norm": 26.73875559733533, + "learning_rate": 7.94545048066175e-06, + "loss": 2.3043, + "step": 1417 + }, + { + "epoch": 1.4262903124410637, + "grad_norm": 19.798386270865898, + "learning_rate": 7.943214844623296e-06, + "loss": 2.2578, + "step": 1418 + }, + { + "epoch": 1.4272961589237443, + "grad_norm": 19.153287956087972, + "learning_rate": 7.940979208584843e-06, + "loss": 2.2363, + "step": 1419 + }, + { + "epoch": 1.4283020054064248, + "grad_norm": 20.88307415888858, + "learning_rate": 7.93874357254639e-06, + "loss": 2.3176, + "step": 1420 + }, + { + "epoch": 1.4293078518891054, + "grad_norm": 19.113742813769317, + "learning_rate": 7.936507936507936e-06, + "loss": 2.2732, + "step": 1421 + }, + { + "epoch": 1.4303136983717861, + "grad_norm": 19.37026732661173, + "learning_rate": 7.934272300469484e-06, + "loss": 2.3011, + "step": 1422 + }, + { + "epoch": 1.4313195448544667, + "grad_norm": 18.49949387415047, + "learning_rate": 7.932036664431031e-06, + "loss": 2.2737, + "step": 1423 + }, + { + "epoch": 1.4323253913371472, + "grad_norm": 19.959543414268822, + "learning_rate": 7.92980102839258e-06, + "loss": 2.3153, + "step": 1424 + }, + { + "epoch": 1.4333312378198277, + "grad_norm": 28.318795043762517, + "learning_rate": 7.927565392354126e-06, + "loss": 2.3026, + "step": 1425 + }, + { + "epoch": 1.4343370843025083, + "grad_norm": 28.12273607964509, + "learning_rate": 7.925329756315672e-06, + "loss": 2.2637, + "step": 1426 + }, + { + "epoch": 1.4353429307851888, + "grad_norm": 17.778700492210334, + "learning_rate": 7.923094120277219e-06, + "loss": 2.3389, + "step": 1427 + }, + { + "epoch": 1.4363487772678694, + "grad_norm": 20.872414384196436, + "learning_rate": 7.920858484238767e-06, + "loss": 2.3155, + "step": 1428 + }, + { + "epoch": 1.4373546237505501, + "grad_norm": 25.94655838699598, + "learning_rate": 7.918622848200314e-06, + "loss": 2.2919, + "step": 1429 + }, + { + "epoch": 1.4383604702332307, + "grad_norm": 18.553279573237166, + "learning_rate": 7.916387212161862e-06, + "loss": 2.3057, + "step": 1430 + }, + { + "epoch": 1.4393663167159112, + "grad_norm": 20.94861910235284, + "learning_rate": 7.914151576123409e-06, + "loss": 2.3307, + "step": 1431 + }, + { + "epoch": 1.4403721631985917, + "grad_norm": 23.493662347236377, + "learning_rate": 7.911915940084955e-06, + "loss": 2.2584, + "step": 1432 + }, + { + "epoch": 1.4413780096812725, + "grad_norm": 17.882426706437784, + "learning_rate": 7.909680304046502e-06, + "loss": 2.2838, + "step": 1433 + }, + { + "epoch": 1.442383856163953, + "grad_norm": 19.72887454840581, + "learning_rate": 7.907444668008048e-06, + "loss": 2.2449, + "step": 1434 + }, + { + "epoch": 1.4433897026466336, + "grad_norm": 18.291268174889137, + "learning_rate": 7.905209031969596e-06, + "loss": 2.3025, + "step": 1435 + }, + { + "epoch": 1.4443955491293141, + "grad_norm": 20.22369948274378, + "learning_rate": 7.902973395931143e-06, + "loss": 2.306, + "step": 1436 + }, + { + "epoch": 1.4454013956119947, + "grad_norm": 18.7791697107874, + "learning_rate": 7.90073775989269e-06, + "loss": 2.2995, + "step": 1437 + }, + { + "epoch": 1.4464072420946752, + "grad_norm": 19.201215197139422, + "learning_rate": 7.898502123854236e-06, + "loss": 2.2909, + "step": 1438 + }, + { + "epoch": 1.447413088577356, + "grad_norm": 16.628374989794644, + "learning_rate": 7.896266487815784e-06, + "loss": 2.262, + "step": 1439 + }, + { + "epoch": 1.4484189350600365, + "grad_norm": 16.560349526712507, + "learning_rate": 7.894030851777331e-06, + "loss": 2.2959, + "step": 1440 + }, + { + "epoch": 1.449424781542717, + "grad_norm": 18.58682928866191, + "learning_rate": 7.89179521573888e-06, + "loss": 2.3068, + "step": 1441 + }, + { + "epoch": 1.4504306280253976, + "grad_norm": 18.162928908222984, + "learning_rate": 7.889559579700426e-06, + "loss": 2.3476, + "step": 1442 + }, + { + "epoch": 1.4514364745080783, + "grad_norm": 17.10078352883004, + "learning_rate": 7.887323943661972e-06, + "loss": 2.3223, + "step": 1443 + }, + { + "epoch": 1.4524423209907589, + "grad_norm": 14.894578538621019, + "learning_rate": 7.885088307623519e-06, + "loss": 2.3014, + "step": 1444 + }, + { + "epoch": 1.4534481674734394, + "grad_norm": 19.416479931616557, + "learning_rate": 7.882852671585065e-06, + "loss": 2.3405, + "step": 1445 + }, + { + "epoch": 1.45445401395612, + "grad_norm": 19.271151748168545, + "learning_rate": 7.880617035546614e-06, + "loss": 2.2826, + "step": 1446 + }, + { + "epoch": 1.4554598604388005, + "grad_norm": 20.01304026489289, + "learning_rate": 7.87838139950816e-06, + "loss": 2.2408, + "step": 1447 + }, + { + "epoch": 1.456465706921481, + "grad_norm": 18.516319943704005, + "learning_rate": 7.876145763469709e-06, + "loss": 2.3402, + "step": 1448 + }, + { + "epoch": 1.4574715534041616, + "grad_norm": 19.933211770601265, + "learning_rate": 7.873910127431255e-06, + "loss": 2.3071, + "step": 1449 + }, + { + "epoch": 1.4584773998868423, + "grad_norm": 19.876724976053293, + "learning_rate": 7.871674491392802e-06, + "loss": 2.2551, + "step": 1450 + }, + { + "epoch": 1.4594832463695229, + "grad_norm": 20.107325835555148, + "learning_rate": 7.869438855354348e-06, + "loss": 2.2941, + "step": 1451 + }, + { + "epoch": 1.4604890928522034, + "grad_norm": 21.42891005758536, + "learning_rate": 7.867203219315896e-06, + "loss": 2.3477, + "step": 1452 + }, + { + "epoch": 1.461494939334884, + "grad_norm": 15.536307850085874, + "learning_rate": 7.864967583277443e-06, + "loss": 2.3091, + "step": 1453 + }, + { + "epoch": 1.4625007858175647, + "grad_norm": 21.50933449834089, + "learning_rate": 7.86273194723899e-06, + "loss": 2.2951, + "step": 1454 + }, + { + "epoch": 1.4635066323002452, + "grad_norm": 21.12008364466888, + "learning_rate": 7.860496311200538e-06, + "loss": 2.318, + "step": 1455 + }, + { + "epoch": 1.4645124787829258, + "grad_norm": 18.665952854109474, + "learning_rate": 7.858260675162084e-06, + "loss": 2.2878, + "step": 1456 + }, + { + "epoch": 1.4655183252656063, + "grad_norm": 21.961368925244503, + "learning_rate": 7.856025039123631e-06, + "loss": 2.291, + "step": 1457 + }, + { + "epoch": 1.4665241717482869, + "grad_norm": 17.713270839538062, + "learning_rate": 7.853789403085178e-06, + "loss": 2.2357, + "step": 1458 + }, + { + "epoch": 1.4675300182309674, + "grad_norm": 20.24530939385799, + "learning_rate": 7.851553767046726e-06, + "loss": 2.2657, + "step": 1459 + }, + { + "epoch": 1.468535864713648, + "grad_norm": 19.141291616606033, + "learning_rate": 7.849318131008272e-06, + "loss": 2.3313, + "step": 1460 + }, + { + "epoch": 1.4695417111963287, + "grad_norm": 19.037031944583642, + "learning_rate": 7.84708249496982e-06, + "loss": 2.3352, + "step": 1461 + }, + { + "epoch": 1.4705475576790092, + "grad_norm": 18.07282339133904, + "learning_rate": 7.844846858931367e-06, + "loss": 2.3196, + "step": 1462 + }, + { + "epoch": 1.4715534041616898, + "grad_norm": 19.871435309000702, + "learning_rate": 7.842611222892914e-06, + "loss": 2.2901, + "step": 1463 + }, + { + "epoch": 1.4725592506443703, + "grad_norm": 17.65462170569615, + "learning_rate": 7.84037558685446e-06, + "loss": 2.3058, + "step": 1464 + }, + { + "epoch": 1.473565097127051, + "grad_norm": 17.577295712228253, + "learning_rate": 7.838139950816009e-06, + "loss": 2.2996, + "step": 1465 + }, + { + "epoch": 1.4745709436097316, + "grad_norm": 17.72683927987532, + "learning_rate": 7.835904314777555e-06, + "loss": 2.3471, + "step": 1466 + }, + { + "epoch": 1.4755767900924122, + "grad_norm": 16.453599601296077, + "learning_rate": 7.833668678739102e-06, + "loss": 2.271, + "step": 1467 + }, + { + "epoch": 1.4765826365750927, + "grad_norm": 15.182795987382685, + "learning_rate": 7.83143304270065e-06, + "loss": 2.3045, + "step": 1468 + }, + { + "epoch": 1.4775884830577732, + "grad_norm": 15.254890201984024, + "learning_rate": 7.829197406662196e-06, + "loss": 2.2917, + "step": 1469 + }, + { + "epoch": 1.4785943295404538, + "grad_norm": 16.617690368093108, + "learning_rate": 7.826961770623743e-06, + "loss": 2.2954, + "step": 1470 + }, + { + "epoch": 1.4796001760231345, + "grad_norm": 17.407836062399724, + "learning_rate": 7.82472613458529e-06, + "loss": 2.2826, + "step": 1471 + }, + { + "epoch": 1.480606022505815, + "grad_norm": 16.60301913849178, + "learning_rate": 7.822490498546838e-06, + "loss": 2.2914, + "step": 1472 + }, + { + "epoch": 1.4816118689884956, + "grad_norm": 17.802071741821546, + "learning_rate": 7.820254862508384e-06, + "loss": 2.2856, + "step": 1473 + }, + { + "epoch": 1.4826177154711762, + "grad_norm": 23.12527459981009, + "learning_rate": 7.818019226469931e-06, + "loss": 2.3058, + "step": 1474 + }, + { + "epoch": 1.483623561953857, + "grad_norm": 16.6587149986682, + "learning_rate": 7.815783590431477e-06, + "loss": 2.3052, + "step": 1475 + }, + { + "epoch": 1.4846294084365375, + "grad_norm": 16.928897564820495, + "learning_rate": 7.813547954393026e-06, + "loss": 2.289, + "step": 1476 + }, + { + "epoch": 1.485635254919218, + "grad_norm": 16.34399216078306, + "learning_rate": 7.811312318354572e-06, + "loss": 2.291, + "step": 1477 + }, + { + "epoch": 1.4866411014018985, + "grad_norm": 17.973201861561957, + "learning_rate": 7.809076682316119e-06, + "loss": 2.3251, + "step": 1478 + }, + { + "epoch": 1.487646947884579, + "grad_norm": 17.39086510043571, + "learning_rate": 7.806841046277667e-06, + "loss": 2.2327, + "step": 1479 + }, + { + "epoch": 1.4886527943672596, + "grad_norm": 17.920711958288884, + "learning_rate": 7.804605410239214e-06, + "loss": 2.3026, + "step": 1480 + }, + { + "epoch": 1.4896586408499402, + "grad_norm": 20.359023819061278, + "learning_rate": 7.80236977420076e-06, + "loss": 2.2502, + "step": 1481 + }, + { + "epoch": 1.490664487332621, + "grad_norm": 16.90484054607315, + "learning_rate": 7.800134138162307e-06, + "loss": 2.3013, + "step": 1482 + }, + { + "epoch": 1.4916703338153015, + "grad_norm": 21.90767780268189, + "learning_rate": 7.797898502123855e-06, + "loss": 2.2774, + "step": 1483 + }, + { + "epoch": 1.492676180297982, + "grad_norm": 18.721112310966856, + "learning_rate": 7.795662866085402e-06, + "loss": 2.2839, + "step": 1484 + }, + { + "epoch": 1.4936820267806625, + "grad_norm": 17.911089882318574, + "learning_rate": 7.79342723004695e-06, + "loss": 2.2604, + "step": 1485 + }, + { + "epoch": 1.4946878732633433, + "grad_norm": 22.772704071372782, + "learning_rate": 7.791191594008496e-06, + "loss": 2.2407, + "step": 1486 + }, + { + "epoch": 1.4956937197460238, + "grad_norm": 17.543471809793463, + "learning_rate": 7.788955957970043e-06, + "loss": 2.2881, + "step": 1487 + }, + { + "epoch": 1.4966995662287044, + "grad_norm": 20.867966397052506, + "learning_rate": 7.78672032193159e-06, + "loss": 2.2543, + "step": 1488 + }, + { + "epoch": 1.497705412711385, + "grad_norm": 22.221623052043494, + "learning_rate": 7.784484685893138e-06, + "loss": 2.2677, + "step": 1489 + }, + { + "epoch": 1.4987112591940654, + "grad_norm": 17.112705665868003, + "learning_rate": 7.782249049854684e-06, + "loss": 2.2572, + "step": 1490 + }, + { + "epoch": 1.499717105676746, + "grad_norm": 19.690025494932556, + "learning_rate": 7.780013413816231e-06, + "loss": 2.285, + "step": 1491 + }, + { + "epoch": 1.5007229521594265, + "grad_norm": 17.9695282699989, + "learning_rate": 7.77777777777778e-06, + "loss": 2.3181, + "step": 1492 + }, + { + "epoch": 1.5017287986421073, + "grad_norm": 19.36781062267981, + "learning_rate": 7.775542141739326e-06, + "loss": 2.3131, + "step": 1493 + }, + { + "epoch": 1.5027346451247878, + "grad_norm": 20.125903914963455, + "learning_rate": 7.773306505700872e-06, + "loss": 2.2431, + "step": 1494 + }, + { + "epoch": 1.5037404916074684, + "grad_norm": 16.253820796223152, + "learning_rate": 7.771070869662419e-06, + "loss": 2.2616, + "step": 1495 + }, + { + "epoch": 1.5047463380901491, + "grad_norm": 23.8382579830196, + "learning_rate": 7.768835233623967e-06, + "loss": 2.304, + "step": 1496 + }, + { + "epoch": 1.5057521845728297, + "grad_norm": 18.49099241609707, + "learning_rate": 7.766599597585514e-06, + "loss": 2.232, + "step": 1497 + }, + { + "epoch": 1.5067580310555102, + "grad_norm": 20.838284123192146, + "learning_rate": 7.764363961547062e-06, + "loss": 2.318, + "step": 1498 + }, + { + "epoch": 1.5077638775381907, + "grad_norm": 22.883153520829786, + "learning_rate": 7.762128325508608e-06, + "loss": 2.282, + "step": 1499 + }, + { + "epoch": 1.5087697240208713, + "grad_norm": 16.55878728484685, + "learning_rate": 7.759892689470155e-06, + "loss": 2.2729, + "step": 1500 + }, + { + "epoch": 1.5097755705035518, + "grad_norm": 20.53854426478302, + "learning_rate": 7.757657053431702e-06, + "loss": 2.3073, + "step": 1501 + }, + { + "epoch": 1.5107814169862324, + "grad_norm": 22.087387133590592, + "learning_rate": 7.755421417393248e-06, + "loss": 2.2862, + "step": 1502 + }, + { + "epoch": 1.511787263468913, + "grad_norm": 19.819179809534393, + "learning_rate": 7.753185781354796e-06, + "loss": 2.2867, + "step": 1503 + }, + { + "epoch": 1.5127931099515937, + "grad_norm": 18.2453326516021, + "learning_rate": 7.750950145316343e-06, + "loss": 2.2911, + "step": 1504 + }, + { + "epoch": 1.5137989564342742, + "grad_norm": 16.93405386428347, + "learning_rate": 7.748714509277891e-06, + "loss": 2.3279, + "step": 1505 + }, + { + "epoch": 1.514804802916955, + "grad_norm": 22.014071115433765, + "learning_rate": 7.746478873239436e-06, + "loss": 2.303, + "step": 1506 + }, + { + "epoch": 1.5158106493996355, + "grad_norm": 17.916438730856672, + "learning_rate": 7.744243237200984e-06, + "loss": 2.2677, + "step": 1507 + }, + { + "epoch": 1.516816495882316, + "grad_norm": 20.27092818009331, + "learning_rate": 7.742007601162531e-06, + "loss": 2.3191, + "step": 1508 + }, + { + "epoch": 1.5178223423649966, + "grad_norm": 17.669526243256236, + "learning_rate": 7.739771965124079e-06, + "loss": 2.2871, + "step": 1509 + }, + { + "epoch": 1.5188281888476771, + "grad_norm": 15.088838347448466, + "learning_rate": 7.737536329085626e-06, + "loss": 2.2643, + "step": 1510 + }, + { + "epoch": 1.5198340353303577, + "grad_norm": 20.41515148594282, + "learning_rate": 7.735300693047172e-06, + "loss": 2.3322, + "step": 1511 + }, + { + "epoch": 1.5208398818130382, + "grad_norm": 16.05840910527969, + "learning_rate": 7.733065057008719e-06, + "loss": 2.2845, + "step": 1512 + }, + { + "epoch": 1.5218457282957187, + "grad_norm": 18.893387044686634, + "learning_rate": 7.730829420970265e-06, + "loss": 2.3403, + "step": 1513 + }, + { + "epoch": 1.5228515747783995, + "grad_norm": 17.915120537065008, + "learning_rate": 7.728593784931814e-06, + "loss": 2.2217, + "step": 1514 + }, + { + "epoch": 1.52385742126108, + "grad_norm": 17.371245044359178, + "learning_rate": 7.72635814889336e-06, + "loss": 2.2844, + "step": 1515 + }, + { + "epoch": 1.5248632677437606, + "grad_norm": 20.607204737742613, + "learning_rate": 7.724122512854908e-06, + "loss": 2.2285, + "step": 1516 + }, + { + "epoch": 1.5258691142264413, + "grad_norm": 15.979393180675775, + "learning_rate": 7.721886876816455e-06, + "loss": 2.2931, + "step": 1517 + }, + { + "epoch": 1.5268749607091219, + "grad_norm": 19.852453533351184, + "learning_rate": 7.719651240778002e-06, + "loss": 2.3007, + "step": 1518 + }, + { + "epoch": 1.5278808071918024, + "grad_norm": 19.50268486882533, + "learning_rate": 7.717415604739548e-06, + "loss": 2.2704, + "step": 1519 + }, + { + "epoch": 1.528886653674483, + "grad_norm": 17.849737130119692, + "learning_rate": 7.715179968701096e-06, + "loss": 2.3057, + "step": 1520 + }, + { + "epoch": 1.5298925001571635, + "grad_norm": 19.714501629790885, + "learning_rate": 7.712944332662643e-06, + "loss": 2.2569, + "step": 1521 + }, + { + "epoch": 1.530898346639844, + "grad_norm": 20.09140238122812, + "learning_rate": 7.710708696624191e-06, + "loss": 2.2741, + "step": 1522 + }, + { + "epoch": 1.5319041931225246, + "grad_norm": 19.05465132384539, + "learning_rate": 7.708473060585738e-06, + "loss": 2.2615, + "step": 1523 + }, + { + "epoch": 1.532910039605205, + "grad_norm": 16.699820322420052, + "learning_rate": 7.706237424547284e-06, + "loss": 2.2729, + "step": 1524 + }, + { + "epoch": 1.5339158860878859, + "grad_norm": 18.145399056940995, + "learning_rate": 7.704001788508831e-06, + "loss": 2.3133, + "step": 1525 + }, + { + "epoch": 1.5349217325705664, + "grad_norm": 17.27467185746975, + "learning_rate": 7.701766152470377e-06, + "loss": 2.3199, + "step": 1526 + }, + { + "epoch": 1.535927579053247, + "grad_norm": 17.261437605040133, + "learning_rate": 7.699530516431926e-06, + "loss": 2.314, + "step": 1527 + }, + { + "epoch": 1.5369334255359277, + "grad_norm": 18.016550773848497, + "learning_rate": 7.697294880393472e-06, + "loss": 2.2802, + "step": 1528 + }, + { + "epoch": 1.5379392720186082, + "grad_norm": 15.645657281375113, + "learning_rate": 7.69505924435502e-06, + "loss": 2.2203, + "step": 1529 + }, + { + "epoch": 1.5389451185012888, + "grad_norm": 17.11336773058832, + "learning_rate": 7.692823608316567e-06, + "loss": 2.3103, + "step": 1530 + }, + { + "epoch": 1.5399509649839693, + "grad_norm": 19.886367565353304, + "learning_rate": 7.690587972278114e-06, + "loss": 2.262, + "step": 1531 + }, + { + "epoch": 1.5409568114666499, + "grad_norm": 15.994878268078955, + "learning_rate": 7.68835233623966e-06, + "loss": 2.3215, + "step": 1532 + }, + { + "epoch": 1.5419626579493304, + "grad_norm": 17.74213070501031, + "learning_rate": 7.686116700201208e-06, + "loss": 2.3512, + "step": 1533 + }, + { + "epoch": 1.542968504432011, + "grad_norm": 16.83461984011088, + "learning_rate": 7.683881064162755e-06, + "loss": 2.3289, + "step": 1534 + }, + { + "epoch": 1.5439743509146915, + "grad_norm": 18.24770376850407, + "learning_rate": 7.681645428124303e-06, + "loss": 2.3014, + "step": 1535 + }, + { + "epoch": 1.5449801973973722, + "grad_norm": 17.227560803417234, + "learning_rate": 7.67940979208585e-06, + "loss": 2.3434, + "step": 1536 + }, + { + "epoch": 1.5459860438800528, + "grad_norm": 14.279290910690355, + "learning_rate": 7.677174156047396e-06, + "loss": 2.2851, + "step": 1537 + }, + { + "epoch": 1.5469918903627335, + "grad_norm": 17.359903771378765, + "learning_rate": 7.674938520008943e-06, + "loss": 2.286, + "step": 1538 + }, + { + "epoch": 1.547997736845414, + "grad_norm": 19.484495307853788, + "learning_rate": 7.67270288397049e-06, + "loss": 2.351, + "step": 1539 + }, + { + "epoch": 1.5490035833280946, + "grad_norm": 16.96042326031945, + "learning_rate": 7.670467247932038e-06, + "loss": 2.2583, + "step": 1540 + }, + { + "epoch": 1.5500094298107752, + "grad_norm": 15.621909656589937, + "learning_rate": 7.668231611893584e-06, + "loss": 2.2972, + "step": 1541 + }, + { + "epoch": 1.5510152762934557, + "grad_norm": 18.6003281206334, + "learning_rate": 7.665995975855131e-06, + "loss": 2.3043, + "step": 1542 + }, + { + "epoch": 1.5520211227761362, + "grad_norm": 18.128664424443144, + "learning_rate": 7.663760339816677e-06, + "loss": 2.3357, + "step": 1543 + }, + { + "epoch": 1.5530269692588168, + "grad_norm": 15.940286885757834, + "learning_rate": 7.661524703778226e-06, + "loss": 2.3372, + "step": 1544 + }, + { + "epoch": 1.5540328157414973, + "grad_norm": 17.025874797911026, + "learning_rate": 7.659289067739772e-06, + "loss": 2.2931, + "step": 1545 + }, + { + "epoch": 1.555038662224178, + "grad_norm": 19.790708605098256, + "learning_rate": 7.65705343170132e-06, + "loss": 2.2587, + "step": 1546 + }, + { + "epoch": 1.5560445087068586, + "grad_norm": 18.918091015148942, + "learning_rate": 7.654817795662867e-06, + "loss": 2.3327, + "step": 1547 + }, + { + "epoch": 1.5570503551895392, + "grad_norm": 17.33264530456271, + "learning_rate": 7.652582159624414e-06, + "loss": 2.2949, + "step": 1548 + }, + { + "epoch": 1.55805620167222, + "grad_norm": 23.167025477343103, + "learning_rate": 7.65034652358596e-06, + "loss": 2.2936, + "step": 1549 + }, + { + "epoch": 1.5590620481549005, + "grad_norm": 17.69559287598297, + "learning_rate": 7.648110887547507e-06, + "loss": 2.2675, + "step": 1550 + }, + { + "epoch": 1.560067894637581, + "grad_norm": 18.572405920081234, + "learning_rate": 7.645875251509055e-06, + "loss": 2.3261, + "step": 1551 + }, + { + "epoch": 1.5610737411202615, + "grad_norm": 21.371575498912534, + "learning_rate": 7.643639615470602e-06, + "loss": 2.2848, + "step": 1552 + }, + { + "epoch": 1.562079587602942, + "grad_norm": 16.137328309550046, + "learning_rate": 7.64140397943215e-06, + "loss": 2.3007, + "step": 1553 + }, + { + "epoch": 1.5630854340856226, + "grad_norm": 15.887326170285341, + "learning_rate": 7.639168343393696e-06, + "loss": 2.2878, + "step": 1554 + }, + { + "epoch": 1.5640912805683032, + "grad_norm": 19.70258894896291, + "learning_rate": 7.636932707355243e-06, + "loss": 2.2965, + "step": 1555 + }, + { + "epoch": 1.5650971270509837, + "grad_norm": 17.077639987037216, + "learning_rate": 7.63469707131679e-06, + "loss": 2.3249, + "step": 1556 + }, + { + "epoch": 1.5661029735336645, + "grad_norm": 18.204144244300757, + "learning_rate": 7.632461435278338e-06, + "loss": 2.2854, + "step": 1557 + }, + { + "epoch": 1.567108820016345, + "grad_norm": 17.39808010530622, + "learning_rate": 7.630225799239884e-06, + "loss": 2.3217, + "step": 1558 + }, + { + "epoch": 1.5681146664990255, + "grad_norm": 18.207352425342457, + "learning_rate": 7.627990163201432e-06, + "loss": 2.3129, + "step": 1559 + }, + { + "epoch": 1.5691205129817063, + "grad_norm": 16.717726119150125, + "learning_rate": 7.625754527162978e-06, + "loss": 2.3432, + "step": 1560 + }, + { + "epoch": 1.5701263594643868, + "grad_norm": 18.753395192538807, + "learning_rate": 7.623518891124525e-06, + "loss": 2.2698, + "step": 1561 + }, + { + "epoch": 1.5711322059470674, + "grad_norm": 16.586849349807938, + "learning_rate": 7.621283255086073e-06, + "loss": 2.3282, + "step": 1562 + }, + { + "epoch": 1.572138052429748, + "grad_norm": 16.322106428815346, + "learning_rate": 7.61904761904762e-06, + "loss": 2.243, + "step": 1563 + }, + { + "epoch": 1.5731438989124285, + "grad_norm": 16.818011065400956, + "learning_rate": 7.616811983009167e-06, + "loss": 2.347, + "step": 1564 + }, + { + "epoch": 1.574149745395109, + "grad_norm": 17.639483723555664, + "learning_rate": 7.614576346970714e-06, + "loss": 2.2921, + "step": 1565 + }, + { + "epoch": 1.5751555918777895, + "grad_norm": 19.123730656020907, + "learning_rate": 7.612340710932261e-06, + "loss": 2.2522, + "step": 1566 + }, + { + "epoch": 1.57616143836047, + "grad_norm": 16.981957712953395, + "learning_rate": 7.6101050748938076e-06, + "loss": 2.3308, + "step": 1567 + }, + { + "epoch": 1.5771672848431508, + "grad_norm": 18.721902223665907, + "learning_rate": 7.607869438855355e-06, + "loss": 2.3076, + "step": 1568 + }, + { + "epoch": 1.5781731313258314, + "grad_norm": 15.591415803272543, + "learning_rate": 7.6056338028169015e-06, + "loss": 2.3147, + "step": 1569 + }, + { + "epoch": 1.5791789778085121, + "grad_norm": 17.251802483159004, + "learning_rate": 7.60339816677845e-06, + "loss": 2.3193, + "step": 1570 + }, + { + "epoch": 1.5801848242911927, + "grad_norm": 15.26863125866677, + "learning_rate": 7.601162530739996e-06, + "loss": 2.2885, + "step": 1571 + }, + { + "epoch": 1.5811906707738732, + "grad_norm": 15.85702751098647, + "learning_rate": 7.598926894701543e-06, + "loss": 2.2848, + "step": 1572 + }, + { + "epoch": 1.5821965172565537, + "grad_norm": 18.336826845804772, + "learning_rate": 7.59669125866309e-06, + "loss": 2.3111, + "step": 1573 + }, + { + "epoch": 1.5832023637392343, + "grad_norm": 17.863224481239193, + "learning_rate": 7.594455622624637e-06, + "loss": 2.3371, + "step": 1574 + }, + { + "epoch": 1.5842082102219148, + "grad_norm": 17.341866010483724, + "learning_rate": 7.592219986586184e-06, + "loss": 2.3378, + "step": 1575 + }, + { + "epoch": 1.5852140567045954, + "grad_norm": 17.635446248556168, + "learning_rate": 7.589984350547731e-06, + "loss": 2.2879, + "step": 1576 + }, + { + "epoch": 1.586219903187276, + "grad_norm": 15.391015775079701, + "learning_rate": 7.587748714509279e-06, + "loss": 2.2639, + "step": 1577 + }, + { + "epoch": 1.5872257496699567, + "grad_norm": 16.813917700341563, + "learning_rate": 7.585513078470826e-06, + "loss": 2.2648, + "step": 1578 + }, + { + "epoch": 1.5882315961526372, + "grad_norm": 18.37044213143786, + "learning_rate": 7.583277442432373e-06, + "loss": 2.2342, + "step": 1579 + }, + { + "epoch": 1.5892374426353177, + "grad_norm": 16.773237086377012, + "learning_rate": 7.58104180639392e-06, + "loss": 2.2814, + "step": 1580 + }, + { + "epoch": 1.5902432891179985, + "grad_norm": 16.966975249230583, + "learning_rate": 7.578806170355467e-06, + "loss": 2.306, + "step": 1581 + }, + { + "epoch": 1.591249135600679, + "grad_norm": 19.29591516919744, + "learning_rate": 7.576570534317014e-06, + "loss": 2.2499, + "step": 1582 + }, + { + "epoch": 1.5922549820833596, + "grad_norm": 18.323197040030927, + "learning_rate": 7.574334898278561e-06, + "loss": 2.2762, + "step": 1583 + }, + { + "epoch": 1.5932608285660401, + "grad_norm": 16.83769559840373, + "learning_rate": 7.5720992622401075e-06, + "loss": 2.3096, + "step": 1584 + }, + { + "epoch": 1.5942666750487207, + "grad_norm": 19.1920754770466, + "learning_rate": 7.569863626201654e-06, + "loss": 2.2841, + "step": 1585 + }, + { + "epoch": 1.5952725215314012, + "grad_norm": 17.194661471609137, + "learning_rate": 7.567627990163202e-06, + "loss": 2.3223, + "step": 1586 + }, + { + "epoch": 1.5962783680140817, + "grad_norm": 15.766621571000169, + "learning_rate": 7.565392354124749e-06, + "loss": 2.2976, + "step": 1587 + }, + { + "epoch": 1.5972842144967623, + "grad_norm": 18.95605767159444, + "learning_rate": 7.563156718086296e-06, + "loss": 2.3344, + "step": 1588 + }, + { + "epoch": 1.598290060979443, + "grad_norm": 17.318139886192373, + "learning_rate": 7.560921082047843e-06, + "loss": 2.2738, + "step": 1589 + }, + { + "epoch": 1.5992959074621236, + "grad_norm": 17.65570490189223, + "learning_rate": 7.55868544600939e-06, + "loss": 2.3123, + "step": 1590 + }, + { + "epoch": 1.6003017539448043, + "grad_norm": 16.523811335663304, + "learning_rate": 7.556449809970937e-06, + "loss": 2.2588, + "step": 1591 + }, + { + "epoch": 1.6013076004274849, + "grad_norm": 17.58715421753379, + "learning_rate": 7.554214173932485e-06, + "loss": 2.2696, + "step": 1592 + }, + { + "epoch": 1.6023134469101654, + "grad_norm": 15.615437328188383, + "learning_rate": 7.551978537894032e-06, + "loss": 2.2478, + "step": 1593 + }, + { + "epoch": 1.603319293392846, + "grad_norm": 16.20342586210378, + "learning_rate": 7.549742901855579e-06, + "loss": 2.2727, + "step": 1594 + }, + { + "epoch": 1.6043251398755265, + "grad_norm": 19.09460593004032, + "learning_rate": 7.547507265817126e-06, + "loss": 2.3034, + "step": 1595 + }, + { + "epoch": 1.605330986358207, + "grad_norm": 19.305798240899804, + "learning_rate": 7.545271629778672e-06, + "loss": 2.3049, + "step": 1596 + }, + { + "epoch": 1.6063368328408876, + "grad_norm": 20.287377727477924, + "learning_rate": 7.54303599374022e-06, + "loss": 2.3243, + "step": 1597 + }, + { + "epoch": 1.6073426793235681, + "grad_norm": 16.070930894706038, + "learning_rate": 7.540800357701766e-06, + "loss": 2.3084, + "step": 1598 + }, + { + "epoch": 1.6083485258062489, + "grad_norm": 20.07386770790435, + "learning_rate": 7.538564721663314e-06, + "loss": 2.3058, + "step": 1599 + }, + { + "epoch": 1.6093543722889294, + "grad_norm": 19.88535982593183, + "learning_rate": 7.53632908562486e-06, + "loss": 2.2875, + "step": 1600 + }, + { + "epoch": 1.61036021877161, + "grad_norm": 16.693701256020397, + "learning_rate": 7.534093449586408e-06, + "loss": 2.2908, + "step": 1601 + }, + { + "epoch": 1.6113660652542907, + "grad_norm": 22.38511771234477, + "learning_rate": 7.531857813547955e-06, + "loss": 2.3289, + "step": 1602 + }, + { + "epoch": 1.6123719117369713, + "grad_norm": 19.256280771639375, + "learning_rate": 7.529622177509502e-06, + "loss": 2.2923, + "step": 1603 + }, + { + "epoch": 1.6133777582196518, + "grad_norm": 15.807615153380606, + "learning_rate": 7.527386541471049e-06, + "loss": 2.3151, + "step": 1604 + }, + { + "epoch": 1.6143836047023323, + "grad_norm": 17.14795329603459, + "learning_rate": 7.525150905432596e-06, + "loss": 2.2766, + "step": 1605 + }, + { + "epoch": 1.6153894511850129, + "grad_norm": 19.0314082816804, + "learning_rate": 7.522915269394143e-06, + "loss": 2.2283, + "step": 1606 + }, + { + "epoch": 1.6163952976676934, + "grad_norm": 16.827496274558783, + "learning_rate": 7.520679633355691e-06, + "loss": 2.2872, + "step": 1607 + }, + { + "epoch": 1.617401144150374, + "grad_norm": 18.415640161477818, + "learning_rate": 7.518443997317238e-06, + "loss": 2.2646, + "step": 1608 + }, + { + "epoch": 1.6184069906330545, + "grad_norm": 18.884538591397305, + "learning_rate": 7.516208361278784e-06, + "loss": 2.2652, + "step": 1609 + }, + { + "epoch": 1.6194128371157352, + "grad_norm": 19.099860731773862, + "learning_rate": 7.513972725240332e-06, + "loss": 2.2819, + "step": 1610 + }, + { + "epoch": 1.6204186835984158, + "grad_norm": 18.518647433605977, + "learning_rate": 7.511737089201878e-06, + "loss": 2.2982, + "step": 1611 + }, + { + "epoch": 1.6214245300810963, + "grad_norm": 18.662477998814204, + "learning_rate": 7.509501453163426e-06, + "loss": 2.2893, + "step": 1612 + }, + { + "epoch": 1.622430376563777, + "grad_norm": 17.23152071581131, + "learning_rate": 7.507265817124972e-06, + "loss": 2.2361, + "step": 1613 + }, + { + "epoch": 1.6234362230464576, + "grad_norm": 20.286786912700784, + "learning_rate": 7.5050301810865204e-06, + "loss": 2.2675, + "step": 1614 + }, + { + "epoch": 1.6244420695291382, + "grad_norm": 18.447265051511966, + "learning_rate": 7.502794545048067e-06, + "loss": 2.3422, + "step": 1615 + }, + { + "epoch": 1.6254479160118187, + "grad_norm": 16.137844374119336, + "learning_rate": 7.500558909009614e-06, + "loss": 2.2929, + "step": 1616 + }, + { + "epoch": 1.6264537624944992, + "grad_norm": 16.644876712805715, + "learning_rate": 7.498323272971161e-06, + "loss": 2.2395, + "step": 1617 + }, + { + "epoch": 1.6274596089771798, + "grad_norm": 16.375471679331596, + "learning_rate": 7.496087636932708e-06, + "loss": 2.2761, + "step": 1618 + }, + { + "epoch": 1.6284654554598603, + "grad_norm": 14.94674470892123, + "learning_rate": 7.493852000894255e-06, + "loss": 2.3036, + "step": 1619 + }, + { + "epoch": 1.6294713019425409, + "grad_norm": 15.81743617520014, + "learning_rate": 7.4916163648558015e-06, + "loss": 2.2889, + "step": 1620 + }, + { + "epoch": 1.6304771484252216, + "grad_norm": 17.416368848556772, + "learning_rate": 7.489380728817349e-06, + "loss": 2.3139, + "step": 1621 + }, + { + "epoch": 1.6314829949079022, + "grad_norm": 16.010398020295742, + "learning_rate": 7.4871450927788954e-06, + "loss": 2.2733, + "step": 1622 + }, + { + "epoch": 1.632488841390583, + "grad_norm": 17.340450107758183, + "learning_rate": 7.484909456740444e-06, + "loss": 2.2623, + "step": 1623 + }, + { + "epoch": 1.6334946878732635, + "grad_norm": 17.510561385062868, + "learning_rate": 7.48267382070199e-06, + "loss": 2.3156, + "step": 1624 + }, + { + "epoch": 1.634500534355944, + "grad_norm": 16.704347034836946, + "learning_rate": 7.480438184663538e-06, + "loss": 2.2747, + "step": 1625 + }, + { + "epoch": 1.6355063808386245, + "grad_norm": 16.57972190196149, + "learning_rate": 7.478202548625084e-06, + "loss": 2.272, + "step": 1626 + }, + { + "epoch": 1.636512227321305, + "grad_norm": 15.418220828290822, + "learning_rate": 7.475966912586632e-06, + "loss": 2.2657, + "step": 1627 + }, + { + "epoch": 1.6375180738039856, + "grad_norm": 17.242284011573524, + "learning_rate": 7.473731276548178e-06, + "loss": 2.2876, + "step": 1628 + }, + { + "epoch": 1.6385239202866662, + "grad_norm": 17.120144649327322, + "learning_rate": 7.4714956405097264e-06, + "loss": 2.2536, + "step": 1629 + }, + { + "epoch": 1.6395297667693467, + "grad_norm": 15.648808334665938, + "learning_rate": 7.469260004471273e-06, + "loss": 2.3287, + "step": 1630 + }, + { + "epoch": 1.6405356132520275, + "grad_norm": 15.567489200950083, + "learning_rate": 7.46702436843282e-06, + "loss": 2.2619, + "step": 1631 + }, + { + "epoch": 1.641541459734708, + "grad_norm": 19.00407342375508, + "learning_rate": 7.464788732394367e-06, + "loss": 2.302, + "step": 1632 + }, + { + "epoch": 1.6425473062173885, + "grad_norm": 17.781602484456865, + "learning_rate": 7.4625530963559135e-06, + "loss": 2.3069, + "step": 1633 + }, + { + "epoch": 1.6435531527000693, + "grad_norm": 15.920283011983388, + "learning_rate": 7.460317460317461e-06, + "loss": 2.2918, + "step": 1634 + }, + { + "epoch": 1.6445589991827498, + "grad_norm": 17.792057329372298, + "learning_rate": 7.4580818242790075e-06, + "loss": 2.3679, + "step": 1635 + }, + { + "epoch": 1.6455648456654304, + "grad_norm": 21.676130512651092, + "learning_rate": 7.455846188240555e-06, + "loss": 2.283, + "step": 1636 + }, + { + "epoch": 1.646570692148111, + "grad_norm": 16.514698135707608, + "learning_rate": 7.4536105522021015e-06, + "loss": 2.2325, + "step": 1637 + }, + { + "epoch": 1.6475765386307915, + "grad_norm": 21.39786786668378, + "learning_rate": 7.45137491616365e-06, + "loss": 2.2731, + "step": 1638 + }, + { + "epoch": 1.648582385113472, + "grad_norm": 18.00833184983677, + "learning_rate": 7.449139280125196e-06, + "loss": 2.2486, + "step": 1639 + }, + { + "epoch": 1.6495882315961525, + "grad_norm": 15.846871390888415, + "learning_rate": 7.446903644086744e-06, + "loss": 2.2691, + "step": 1640 + }, + { + "epoch": 1.650594078078833, + "grad_norm": 22.000826235719195, + "learning_rate": 7.44466800804829e-06, + "loss": 2.2222, + "step": 1641 + }, + { + "epoch": 1.6515999245615138, + "grad_norm": 19.455691706197182, + "learning_rate": 7.442432372009838e-06, + "loss": 2.328, + "step": 1642 + }, + { + "epoch": 1.6526057710441944, + "grad_norm": 17.94871421325755, + "learning_rate": 7.440196735971384e-06, + "loss": 2.3053, + "step": 1643 + }, + { + "epoch": 1.653611617526875, + "grad_norm": 17.9376483521117, + "learning_rate": 7.437961099932931e-06, + "loss": 2.302, + "step": 1644 + }, + { + "epoch": 1.6546174640095557, + "grad_norm": 16.4389373413972, + "learning_rate": 7.435725463894479e-06, + "loss": 2.2397, + "step": 1645 + }, + { + "epoch": 1.6556233104922362, + "grad_norm": 16.99132641098515, + "learning_rate": 7.433489827856026e-06, + "loss": 2.256, + "step": 1646 + }, + { + "epoch": 1.6566291569749168, + "grad_norm": 17.542599768403043, + "learning_rate": 7.431254191817573e-06, + "loss": 2.2331, + "step": 1647 + }, + { + "epoch": 1.6576350034575973, + "grad_norm": 17.626876401215032, + "learning_rate": 7.4290185557791196e-06, + "loss": 2.3472, + "step": 1648 + }, + { + "epoch": 1.6586408499402778, + "grad_norm": 16.872425273361493, + "learning_rate": 7.426782919740667e-06, + "loss": 2.3248, + "step": 1649 + }, + { + "epoch": 1.6596466964229584, + "grad_norm": 17.162326734465445, + "learning_rate": 7.4245472837022135e-06, + "loss": 2.2991, + "step": 1650 + }, + { + "epoch": 1.660652542905639, + "grad_norm": 19.75105766718973, + "learning_rate": 7.422311647663761e-06, + "loss": 2.2993, + "step": 1651 + }, + { + "epoch": 1.6616583893883194, + "grad_norm": 15.972163852419595, + "learning_rate": 7.4200760116253075e-06, + "loss": 2.2432, + "step": 1652 + }, + { + "epoch": 1.6626642358710002, + "grad_norm": 17.600365845305994, + "learning_rate": 7.417840375586856e-06, + "loss": 2.2461, + "step": 1653 + }, + { + "epoch": 1.6636700823536807, + "grad_norm": 21.500517437821884, + "learning_rate": 7.415604739548402e-06, + "loss": 2.2542, + "step": 1654 + }, + { + "epoch": 1.6646759288363615, + "grad_norm": 17.514071400959207, + "learning_rate": 7.413369103509949e-06, + "loss": 2.2928, + "step": 1655 + }, + { + "epoch": 1.665681775319042, + "grad_norm": 16.516886183339697, + "learning_rate": 7.411133467471496e-06, + "loss": 2.3187, + "step": 1656 + }, + { + "epoch": 1.6666876218017226, + "grad_norm": 16.518222183924482, + "learning_rate": 7.408897831433043e-06, + "loss": 2.2754, + "step": 1657 + }, + { + "epoch": 1.6676934682844031, + "grad_norm": 19.814913536195757, + "learning_rate": 7.40666219539459e-06, + "loss": 2.2452, + "step": 1658 + }, + { + "epoch": 1.6686993147670837, + "grad_norm": 18.775262311804575, + "learning_rate": 7.404426559356137e-06, + "loss": 2.2773, + "step": 1659 + }, + { + "epoch": 1.6697051612497642, + "grad_norm": 17.397581620828824, + "learning_rate": 7.402190923317685e-06, + "loss": 2.3, + "step": 1660 + }, + { + "epoch": 1.6707110077324447, + "grad_norm": 16.93196801056379, + "learning_rate": 7.399955287279232e-06, + "loss": 2.2719, + "step": 1661 + }, + { + "epoch": 1.6717168542151253, + "grad_norm": 16.61964235609142, + "learning_rate": 7.397719651240779e-06, + "loss": 2.2854, + "step": 1662 + }, + { + "epoch": 1.672722700697806, + "grad_norm": 17.253963452306547, + "learning_rate": 7.3954840152023256e-06, + "loss": 2.2387, + "step": 1663 + }, + { + "epoch": 1.6737285471804866, + "grad_norm": 18.0575343659353, + "learning_rate": 7.393248379163873e-06, + "loss": 2.3135, + "step": 1664 + }, + { + "epoch": 1.6747343936631671, + "grad_norm": 18.694711885768108, + "learning_rate": 7.3910127431254195e-06, + "loss": 2.2813, + "step": 1665 + }, + { + "epoch": 1.6757402401458479, + "grad_norm": 18.46894024543316, + "learning_rate": 7.388777107086968e-06, + "loss": 2.3289, + "step": 1666 + }, + { + "epoch": 1.6767460866285284, + "grad_norm": 18.075846886633336, + "learning_rate": 7.386541471048514e-06, + "loss": 2.2717, + "step": 1667 + }, + { + "epoch": 1.677751933111209, + "grad_norm": 16.580635934759897, + "learning_rate": 7.38430583501006e-06, + "loss": 2.3578, + "step": 1668 + }, + { + "epoch": 1.6787577795938895, + "grad_norm": 19.80057502326656, + "learning_rate": 7.382070198971608e-06, + "loss": 2.2358, + "step": 1669 + }, + { + "epoch": 1.67976362607657, + "grad_norm": 22.324624123091027, + "learning_rate": 7.379834562933155e-06, + "loss": 2.2999, + "step": 1670 + }, + { + "epoch": 1.6807694725592506, + "grad_norm": 17.171457668822228, + "learning_rate": 7.377598926894702e-06, + "loss": 2.2892, + "step": 1671 + }, + { + "epoch": 1.6817753190419311, + "grad_norm": 17.459691170633143, + "learning_rate": 7.375363290856249e-06, + "loss": 2.2847, + "step": 1672 + }, + { + "epoch": 1.6827811655246117, + "grad_norm": 24.349415622986733, + "learning_rate": 7.373127654817796e-06, + "loss": 2.3483, + "step": 1673 + }, + { + "epoch": 1.6837870120072924, + "grad_norm": 17.782930821929234, + "learning_rate": 7.370892018779343e-06, + "loss": 2.2817, + "step": 1674 + }, + { + "epoch": 1.684792858489973, + "grad_norm": 17.193230183146444, + "learning_rate": 7.368656382740891e-06, + "loss": 2.307, + "step": 1675 + }, + { + "epoch": 1.6857987049726535, + "grad_norm": 15.99168958034795, + "learning_rate": 7.366420746702438e-06, + "loss": 2.2668, + "step": 1676 + }, + { + "epoch": 1.6868045514553343, + "grad_norm": 16.455265608987492, + "learning_rate": 7.364185110663985e-06, + "loss": 2.2757, + "step": 1677 + }, + { + "epoch": 1.6878103979380148, + "grad_norm": 16.482362943706256, + "learning_rate": 7.361949474625532e-06, + "loss": 2.3429, + "step": 1678 + }, + { + "epoch": 1.6888162444206953, + "grad_norm": 15.595423288163719, + "learning_rate": 7.359713838587078e-06, + "loss": 2.3647, + "step": 1679 + }, + { + "epoch": 1.6898220909033759, + "grad_norm": 16.95527168452514, + "learning_rate": 7.3574782025486256e-06, + "loss": 2.2992, + "step": 1680 + }, + { + "epoch": 1.6908279373860564, + "grad_norm": 18.179540636982864, + "learning_rate": 7.355242566510172e-06, + "loss": 2.2744, + "step": 1681 + }, + { + "epoch": 1.691833783868737, + "grad_norm": 16.78451255952181, + "learning_rate": 7.35300693047172e-06, + "loss": 2.3529, + "step": 1682 + }, + { + "epoch": 1.6928396303514175, + "grad_norm": 16.309874549368278, + "learning_rate": 7.350771294433267e-06, + "loss": 2.2944, + "step": 1683 + }, + { + "epoch": 1.693845476834098, + "grad_norm": 17.405116712862426, + "learning_rate": 7.348535658394814e-06, + "loss": 2.276, + "step": 1684 + }, + { + "epoch": 1.6948513233167788, + "grad_norm": 17.55523954163725, + "learning_rate": 7.346300022356361e-06, + "loss": 2.2905, + "step": 1685 + }, + { + "epoch": 1.6958571697994593, + "grad_norm": 15.709914778295607, + "learning_rate": 7.344064386317908e-06, + "loss": 2.2726, + "step": 1686 + }, + { + "epoch": 1.69686301628214, + "grad_norm": 17.876951517815804, + "learning_rate": 7.341828750279455e-06, + "loss": 2.2789, + "step": 1687 + }, + { + "epoch": 1.6978688627648206, + "grad_norm": 18.745107840949895, + "learning_rate": 7.339593114241002e-06, + "loss": 2.2937, + "step": 1688 + }, + { + "epoch": 1.6988747092475012, + "grad_norm": 16.75408815841215, + "learning_rate": 7.337357478202549e-06, + "loss": 2.2737, + "step": 1689 + }, + { + "epoch": 1.6998805557301817, + "grad_norm": 16.50796612604206, + "learning_rate": 7.335121842164097e-06, + "loss": 2.2846, + "step": 1690 + }, + { + "epoch": 1.7008864022128622, + "grad_norm": 18.436456290854704, + "learning_rate": 7.332886206125644e-06, + "loss": 2.3165, + "step": 1691 + }, + { + "epoch": 1.7018922486955428, + "grad_norm": 15.577438332324887, + "learning_rate": 7.33065057008719e-06, + "loss": 2.2849, + "step": 1692 + }, + { + "epoch": 1.7028980951782233, + "grad_norm": 19.36486387739182, + "learning_rate": 7.328414934048738e-06, + "loss": 2.3413, + "step": 1693 + }, + { + "epoch": 1.7039039416609039, + "grad_norm": 17.7840086570158, + "learning_rate": 7.326179298010284e-06, + "loss": 2.2739, + "step": 1694 + }, + { + "epoch": 1.7049097881435846, + "grad_norm": 19.96875615564056, + "learning_rate": 7.3239436619718316e-06, + "loss": 2.2725, + "step": 1695 + }, + { + "epoch": 1.7059156346262652, + "grad_norm": 18.895880230471608, + "learning_rate": 7.321708025933378e-06, + "loss": 2.2451, + "step": 1696 + }, + { + "epoch": 1.7069214811089457, + "grad_norm": 18.494732987067664, + "learning_rate": 7.319472389894926e-06, + "loss": 2.3039, + "step": 1697 + }, + { + "epoch": 1.7079273275916265, + "grad_norm": 19.548118249517298, + "learning_rate": 7.317236753856473e-06, + "loss": 2.2646, + "step": 1698 + }, + { + "epoch": 1.708933174074307, + "grad_norm": 17.483404102959916, + "learning_rate": 7.31500111781802e-06, + "loss": 2.2878, + "step": 1699 + }, + { + "epoch": 1.7099390205569875, + "grad_norm": 16.087083784695995, + "learning_rate": 7.312765481779567e-06, + "loss": 2.2983, + "step": 1700 + }, + { + "epoch": 1.710944867039668, + "grad_norm": 17.286227907129362, + "learning_rate": 7.310529845741114e-06, + "loss": 2.2769, + "step": 1701 + }, + { + "epoch": 1.7119507135223486, + "grad_norm": 18.121685621444016, + "learning_rate": 7.308294209702661e-06, + "loss": 2.3164, + "step": 1702 + }, + { + "epoch": 1.7129565600050292, + "grad_norm": 16.93381419733215, + "learning_rate": 7.3060585736642074e-06, + "loss": 2.2904, + "step": 1703 + }, + { + "epoch": 1.7139624064877097, + "grad_norm": 17.108344692926316, + "learning_rate": 7.303822937625755e-06, + "loss": 2.2702, + "step": 1704 + }, + { + "epoch": 1.7149682529703902, + "grad_norm": 16.904685203090065, + "learning_rate": 7.301587301587301e-06, + "loss": 2.3053, + "step": 1705 + }, + { + "epoch": 1.715974099453071, + "grad_norm": 15.891046973165288, + "learning_rate": 7.29935166554885e-06, + "loss": 2.2788, + "step": 1706 + }, + { + "epoch": 1.7169799459357515, + "grad_norm": 18.961009088854595, + "learning_rate": 7.297116029510396e-06, + "loss": 2.2647, + "step": 1707 + }, + { + "epoch": 1.7179857924184323, + "grad_norm": 16.77002633978821, + "learning_rate": 7.294880393471944e-06, + "loss": 2.2649, + "step": 1708 + }, + { + "epoch": 1.7189916389011128, + "grad_norm": 19.15137573907095, + "learning_rate": 7.29264475743349e-06, + "loss": 2.2612, + "step": 1709 + }, + { + "epoch": 1.7199974853837934, + "grad_norm": 19.365000785728903, + "learning_rate": 7.290409121395038e-06, + "loss": 2.2832, + "step": 1710 + }, + { + "epoch": 1.721003331866474, + "grad_norm": 18.550525488582906, + "learning_rate": 7.288173485356584e-06, + "loss": 2.2698, + "step": 1711 + }, + { + "epoch": 1.7220091783491545, + "grad_norm": 19.045700127374538, + "learning_rate": 7.285937849318132e-06, + "loss": 2.2863, + "step": 1712 + }, + { + "epoch": 1.723015024831835, + "grad_norm": 16.589908676820695, + "learning_rate": 7.283702213279679e-06, + "loss": 2.3521, + "step": 1713 + }, + { + "epoch": 1.7240208713145155, + "grad_norm": 18.730596467816195, + "learning_rate": 7.2814665772412255e-06, + "loss": 2.3367, + "step": 1714 + }, + { + "epoch": 1.725026717797196, + "grad_norm": 21.02491227040751, + "learning_rate": 7.279230941202773e-06, + "loss": 2.2994, + "step": 1715 + }, + { + "epoch": 1.7260325642798768, + "grad_norm": 18.90894267852598, + "learning_rate": 7.2769953051643195e-06, + "loss": 2.2979, + "step": 1716 + }, + { + "epoch": 1.7270384107625574, + "grad_norm": 16.90454659841854, + "learning_rate": 7.274759669125867e-06, + "loss": 2.2959, + "step": 1717 + }, + { + "epoch": 1.728044257245238, + "grad_norm": 17.165608822801428, + "learning_rate": 7.2725240330874135e-06, + "loss": 2.3404, + "step": 1718 + }, + { + "epoch": 1.7290501037279187, + "grad_norm": 19.372474192014376, + "learning_rate": 7.270288397048961e-06, + "loss": 2.3036, + "step": 1719 + }, + { + "epoch": 1.7300559502105992, + "grad_norm": 18.182535106181874, + "learning_rate": 7.268052761010507e-06, + "loss": 2.3857, + "step": 1720 + }, + { + "epoch": 1.7310617966932798, + "grad_norm": 17.114519608407026, + "learning_rate": 7.265817124972056e-06, + "loss": 2.2924, + "step": 1721 + }, + { + "epoch": 1.7320676431759603, + "grad_norm": 16.53904728868979, + "learning_rate": 7.263581488933602e-06, + "loss": 2.3151, + "step": 1722 + }, + { + "epoch": 1.7330734896586408, + "grad_norm": 18.726040653410717, + "learning_rate": 7.26134585289515e-06, + "loss": 2.2739, + "step": 1723 + }, + { + "epoch": 1.7340793361413214, + "grad_norm": 17.341102661687714, + "learning_rate": 7.259110216856696e-06, + "loss": 2.2504, + "step": 1724 + }, + { + "epoch": 1.735085182624002, + "grad_norm": 16.656227143427788, + "learning_rate": 7.256874580818244e-06, + "loss": 2.2073, + "step": 1725 + }, + { + "epoch": 1.7360910291066824, + "grad_norm": 15.628860790894302, + "learning_rate": 7.25463894477979e-06, + "loss": 2.2826, + "step": 1726 + }, + { + "epoch": 1.7370968755893632, + "grad_norm": 16.375635653933628, + "learning_rate": 7.252403308741337e-06, + "loss": 2.2749, + "step": 1727 + }, + { + "epoch": 1.7381027220720437, + "grad_norm": 16.301987152653233, + "learning_rate": 7.250167672702885e-06, + "loss": 2.3258, + "step": 1728 + }, + { + "epoch": 1.7391085685547243, + "grad_norm": 18.317871380977778, + "learning_rate": 7.2479320366644315e-06, + "loss": 2.2517, + "step": 1729 + }, + { + "epoch": 1.740114415037405, + "grad_norm": 20.755179037495648, + "learning_rate": 7.245696400625979e-06, + "loss": 2.2881, + "step": 1730 + }, + { + "epoch": 1.7411202615200856, + "grad_norm": 20.223197943734334, + "learning_rate": 7.2434607645875255e-06, + "loss": 2.2787, + "step": 1731 + }, + { + "epoch": 1.7421261080027661, + "grad_norm": 18.66793302042133, + "learning_rate": 7.241225128549073e-06, + "loss": 2.2898, + "step": 1732 + }, + { + "epoch": 1.7431319544854467, + "grad_norm": 18.28608426011941, + "learning_rate": 7.2389894925106195e-06, + "loss": 2.3038, + "step": 1733 + }, + { + "epoch": 1.7441378009681272, + "grad_norm": 20.186986215013036, + "learning_rate": 7.236753856472168e-06, + "loss": 2.2693, + "step": 1734 + }, + { + "epoch": 1.7451436474508077, + "grad_norm": 21.940252910209793, + "learning_rate": 7.234518220433714e-06, + "loss": 2.3508, + "step": 1735 + }, + { + "epoch": 1.7461494939334883, + "grad_norm": 16.54537436899838, + "learning_rate": 7.232282584395262e-06, + "loss": 2.2544, + "step": 1736 + }, + { + "epoch": 1.7471553404161688, + "grad_norm": 19.29470763757215, + "learning_rate": 7.230046948356808e-06, + "loss": 2.2422, + "step": 1737 + }, + { + "epoch": 1.7481611868988496, + "grad_norm": 19.38854319430838, + "learning_rate": 7.227811312318355e-06, + "loss": 2.3005, + "step": 1738 + }, + { + "epoch": 1.7491670333815301, + "grad_norm": 17.357463170532714, + "learning_rate": 7.225575676279902e-06, + "loss": 2.2972, + "step": 1739 + }, + { + "epoch": 1.7501728798642109, + "grad_norm": 22.087290432789963, + "learning_rate": 7.223340040241449e-06, + "loss": 2.2539, + "step": 1740 + }, + { + "epoch": 1.7511787263468914, + "grad_norm": 17.76858889717604, + "learning_rate": 7.221104404202996e-06, + "loss": 2.2746, + "step": 1741 + }, + { + "epoch": 1.752184572829572, + "grad_norm": 18.116519033951814, + "learning_rate": 7.218868768164543e-06, + "loss": 2.2829, + "step": 1742 + }, + { + "epoch": 1.7531904193122525, + "grad_norm": 20.309504486994047, + "learning_rate": 7.216633132126091e-06, + "loss": 2.3595, + "step": 1743 + }, + { + "epoch": 1.754196265794933, + "grad_norm": 14.93742846213375, + "learning_rate": 7.2143974960876376e-06, + "loss": 2.2892, + "step": 1744 + }, + { + "epoch": 1.7552021122776136, + "grad_norm": 19.603616852280457, + "learning_rate": 7.212161860049185e-06, + "loss": 2.3173, + "step": 1745 + }, + { + "epoch": 1.7562079587602941, + "grad_norm": 18.398664249151416, + "learning_rate": 7.2099262240107315e-06, + "loss": 2.344, + "step": 1746 + }, + { + "epoch": 1.7572138052429747, + "grad_norm": 18.49824732739366, + "learning_rate": 7.207690587972279e-06, + "loss": 2.272, + "step": 1747 + }, + { + "epoch": 1.7582196517256554, + "grad_norm": 18.83743911457428, + "learning_rate": 7.2054549519338255e-06, + "loss": 2.2819, + "step": 1748 + }, + { + "epoch": 1.759225498208336, + "grad_norm": 16.68493207885488, + "learning_rate": 7.203219315895374e-06, + "loss": 2.2825, + "step": 1749 + }, + { + "epoch": 1.7602313446910165, + "grad_norm": 17.39084821792995, + "learning_rate": 7.20098367985692e-06, + "loss": 2.2607, + "step": 1750 + }, + { + "epoch": 1.7612371911736973, + "grad_norm": 16.26128535383905, + "learning_rate": 7.198748043818467e-06, + "loss": 2.3199, + "step": 1751 + }, + { + "epoch": 1.7622430376563778, + "grad_norm": 18.49950263120119, + "learning_rate": 7.196512407780014e-06, + "loss": 2.2978, + "step": 1752 + }, + { + "epoch": 1.7632488841390583, + "grad_norm": 16.270202570964813, + "learning_rate": 7.194276771741561e-06, + "loss": 2.2844, + "step": 1753 + }, + { + "epoch": 1.7642547306217389, + "grad_norm": 17.054025826733476, + "learning_rate": 7.192041135703108e-06, + "loss": 2.3269, + "step": 1754 + }, + { + "epoch": 1.7652605771044194, + "grad_norm": 18.66246669846856, + "learning_rate": 7.189805499664655e-06, + "loss": 2.2251, + "step": 1755 + }, + { + "epoch": 1.7662664235871, + "grad_norm": 20.331441036286925, + "learning_rate": 7.187569863626202e-06, + "loss": 2.2769, + "step": 1756 + }, + { + "epoch": 1.7672722700697805, + "grad_norm": 17.734428888276117, + "learning_rate": 7.185334227587749e-06, + "loss": 2.3053, + "step": 1757 + }, + { + "epoch": 1.768278116552461, + "grad_norm": 18.036348076403122, + "learning_rate": 7.183098591549297e-06, + "loss": 2.248, + "step": 1758 + }, + { + "epoch": 1.7692839630351418, + "grad_norm": 17.82023633035522, + "learning_rate": 7.180862955510844e-06, + "loss": 2.3191, + "step": 1759 + }, + { + "epoch": 1.7702898095178223, + "grad_norm": 16.823322658814927, + "learning_rate": 7.178627319472391e-06, + "loss": 2.3104, + "step": 1760 + }, + { + "epoch": 1.7712956560005029, + "grad_norm": 18.89933972226583, + "learning_rate": 7.1763916834339375e-06, + "loss": 2.2202, + "step": 1761 + }, + { + "epoch": 1.7723015024831836, + "grad_norm": 19.642607646851978, + "learning_rate": 7.174156047395484e-06, + "loss": 2.2942, + "step": 1762 + }, + { + "epoch": 1.7733073489658642, + "grad_norm": 18.254196769125357, + "learning_rate": 7.1719204113570315e-06, + "loss": 2.2822, + "step": 1763 + }, + { + "epoch": 1.7743131954485447, + "grad_norm": 18.216364417776393, + "learning_rate": 7.169684775318578e-06, + "loss": 2.283, + "step": 1764 + }, + { + "epoch": 1.7753190419312253, + "grad_norm": 18.098869609136802, + "learning_rate": 7.167449139280126e-06, + "loss": 2.3405, + "step": 1765 + }, + { + "epoch": 1.7763248884139058, + "grad_norm": 17.066412960592956, + "learning_rate": 7.165213503241673e-06, + "loss": 2.2829, + "step": 1766 + }, + { + "epoch": 1.7773307348965863, + "grad_norm": 16.025984713835253, + "learning_rate": 7.16297786720322e-06, + "loss": 2.3012, + "step": 1767 + }, + { + "epoch": 1.7783365813792669, + "grad_norm": 19.686451528761513, + "learning_rate": 7.160742231164767e-06, + "loss": 2.2475, + "step": 1768 + }, + { + "epoch": 1.7793424278619474, + "grad_norm": 17.114132490787796, + "learning_rate": 7.158506595126314e-06, + "loss": 2.2603, + "step": 1769 + }, + { + "epoch": 1.7803482743446282, + "grad_norm": 16.788287849009663, + "learning_rate": 7.156270959087861e-06, + "loss": 2.2784, + "step": 1770 + }, + { + "epoch": 1.7813541208273087, + "grad_norm": 14.660268269237488, + "learning_rate": 7.154035323049408e-06, + "loss": 2.2773, + "step": 1771 + }, + { + "epoch": 1.7823599673099895, + "grad_norm": 17.33012795252929, + "learning_rate": 7.151799687010955e-06, + "loss": 2.3154, + "step": 1772 + }, + { + "epoch": 1.78336581379267, + "grad_norm": 16.183451941924876, + "learning_rate": 7.149564050972501e-06, + "loss": 2.2714, + "step": 1773 + }, + { + "epoch": 1.7843716602753505, + "grad_norm": 17.502249863651574, + "learning_rate": 7.14732841493405e-06, + "loss": 2.2987, + "step": 1774 + }, + { + "epoch": 1.785377506758031, + "grad_norm": 16.267689850773287, + "learning_rate": 7.145092778895596e-06, + "loss": 2.2764, + "step": 1775 + }, + { + "epoch": 1.7863833532407116, + "grad_norm": 17.185209369525268, + "learning_rate": 7.1428571428571436e-06, + "loss": 2.3588, + "step": 1776 + }, + { + "epoch": 1.7873891997233922, + "grad_norm": 20.347299173020325, + "learning_rate": 7.14062150681869e-06, + "loss": 2.264, + "step": 1777 + }, + { + "epoch": 1.7883950462060727, + "grad_norm": 15.444704186788066, + "learning_rate": 7.1383858707802375e-06, + "loss": 2.3126, + "step": 1778 + }, + { + "epoch": 1.7894008926887532, + "grad_norm": 18.125816182173008, + "learning_rate": 7.136150234741784e-06, + "loss": 2.2479, + "step": 1779 + }, + { + "epoch": 1.790406739171434, + "grad_norm": 16.508046759412878, + "learning_rate": 7.133914598703332e-06, + "loss": 2.2783, + "step": 1780 + }, + { + "epoch": 1.7914125856541145, + "grad_norm": 17.160063240198856, + "learning_rate": 7.131678962664879e-06, + "loss": 2.2799, + "step": 1781 + }, + { + "epoch": 1.792418432136795, + "grad_norm": 18.21495266317122, + "learning_rate": 7.129443326626426e-06, + "loss": 2.2865, + "step": 1782 + }, + { + "epoch": 1.7934242786194758, + "grad_norm": 17.027319498991748, + "learning_rate": 7.127207690587973e-06, + "loss": 2.2368, + "step": 1783 + }, + { + "epoch": 1.7944301251021564, + "grad_norm": 16.229008671664687, + "learning_rate": 7.12497205454952e-06, + "loss": 2.2556, + "step": 1784 + }, + { + "epoch": 1.795435971584837, + "grad_norm": 18.62974074603085, + "learning_rate": 7.122736418511067e-06, + "loss": 2.3042, + "step": 1785 + }, + { + "epoch": 1.7964418180675175, + "grad_norm": 14.934846550712225, + "learning_rate": 7.120500782472613e-06, + "loss": 2.262, + "step": 1786 + }, + { + "epoch": 1.797447664550198, + "grad_norm": 17.7843294903222, + "learning_rate": 7.118265146434161e-06, + "loss": 2.2989, + "step": 1787 + }, + { + "epoch": 1.7984535110328785, + "grad_norm": 16.043759669850164, + "learning_rate": 7.116029510395707e-06, + "loss": 2.2866, + "step": 1788 + }, + { + "epoch": 1.799459357515559, + "grad_norm": 15.572175597039415, + "learning_rate": 7.113793874357256e-06, + "loss": 2.2915, + "step": 1789 + }, + { + "epoch": 1.8004652039982396, + "grad_norm": 18.262520340255662, + "learning_rate": 7.111558238318802e-06, + "loss": 2.2982, + "step": 1790 + }, + { + "epoch": 1.8014710504809204, + "grad_norm": 14.664239377399232, + "learning_rate": 7.10932260228035e-06, + "loss": 2.2688, + "step": 1791 + }, + { + "epoch": 1.802476896963601, + "grad_norm": 15.788192217263258, + "learning_rate": 7.107086966241896e-06, + "loss": 2.2728, + "step": 1792 + }, + { + "epoch": 1.8034827434462817, + "grad_norm": 17.241784049935667, + "learning_rate": 7.1048513302034435e-06, + "loss": 2.2513, + "step": 1793 + }, + { + "epoch": 1.8044885899289622, + "grad_norm": 18.319754527885618, + "learning_rate": 7.10261569416499e-06, + "loss": 2.2543, + "step": 1794 + }, + { + "epoch": 1.8054944364116428, + "grad_norm": 16.093204298174406, + "learning_rate": 7.100380058126538e-06, + "loss": 2.2525, + "step": 1795 + }, + { + "epoch": 1.8065002828943233, + "grad_norm": 16.992116261097483, + "learning_rate": 7.098144422088085e-06, + "loss": 2.2809, + "step": 1796 + }, + { + "epoch": 1.8075061293770038, + "grad_norm": 19.250314664557777, + "learning_rate": 7.0959087860496315e-06, + "loss": 2.2567, + "step": 1797 + }, + { + "epoch": 1.8085119758596844, + "grad_norm": 15.994902899251477, + "learning_rate": 7.093673150011179e-06, + "loss": 2.2965, + "step": 1798 + }, + { + "epoch": 1.809517822342365, + "grad_norm": 18.988591404386828, + "learning_rate": 7.0914375139727254e-06, + "loss": 2.2487, + "step": 1799 + }, + { + "epoch": 1.8105236688250455, + "grad_norm": 19.11487194496845, + "learning_rate": 7.089201877934273e-06, + "loss": 2.2785, + "step": 1800 + }, + { + "epoch": 1.8115295153077262, + "grad_norm": 19.275348750335183, + "learning_rate": 7.086966241895819e-06, + "loss": 2.3006, + "step": 1801 + }, + { + "epoch": 1.8125353617904068, + "grad_norm": 18.60674521941389, + "learning_rate": 7.084730605857368e-06, + "loss": 2.3034, + "step": 1802 + }, + { + "epoch": 1.8135412082730873, + "grad_norm": 16.767075586894872, + "learning_rate": 7.082494969818914e-06, + "loss": 2.3288, + "step": 1803 + }, + { + "epoch": 1.814547054755768, + "grad_norm": 20.504218280601204, + "learning_rate": 7.080259333780462e-06, + "loss": 2.3001, + "step": 1804 + }, + { + "epoch": 1.8155529012384486, + "grad_norm": 20.98661191621691, + "learning_rate": 7.078023697742008e-06, + "loss": 2.2785, + "step": 1805 + }, + { + "epoch": 1.8165587477211291, + "grad_norm": 17.72071568357678, + "learning_rate": 7.075788061703556e-06, + "loss": 2.2838, + "step": 1806 + }, + { + "epoch": 1.8175645942038097, + "grad_norm": 17.78877227393904, + "learning_rate": 7.073552425665102e-06, + "loss": 2.2929, + "step": 1807 + }, + { + "epoch": 1.8185704406864902, + "grad_norm": 19.519678375167413, + "learning_rate": 7.0713167896266496e-06, + "loss": 2.2756, + "step": 1808 + }, + { + "epoch": 1.8195762871691707, + "grad_norm": 16.776782550396653, + "learning_rate": 7.069081153588196e-06, + "loss": 2.2822, + "step": 1809 + }, + { + "epoch": 1.8205821336518513, + "grad_norm": 17.04682649251197, + "learning_rate": 7.066845517549743e-06, + "loss": 2.2467, + "step": 1810 + }, + { + "epoch": 1.8215879801345318, + "grad_norm": 21.9081731265325, + "learning_rate": 7.064609881511291e-06, + "loss": 2.2639, + "step": 1811 + }, + { + "epoch": 1.8225938266172126, + "grad_norm": 21.67762882011947, + "learning_rate": 7.0623742454728375e-06, + "loss": 2.3243, + "step": 1812 + }, + { + "epoch": 1.8235996730998931, + "grad_norm": 14.755210379226378, + "learning_rate": 7.060138609434385e-06, + "loss": 2.2856, + "step": 1813 + }, + { + "epoch": 1.8246055195825737, + "grad_norm": 20.578781544696486, + "learning_rate": 7.0579029733959315e-06, + "loss": 2.2983, + "step": 1814 + }, + { + "epoch": 1.8256113660652544, + "grad_norm": 21.197669630731394, + "learning_rate": 7.055667337357479e-06, + "loss": 2.2639, + "step": 1815 + }, + { + "epoch": 1.826617212547935, + "grad_norm": 18.485968760798357, + "learning_rate": 7.0534317013190254e-06, + "loss": 2.3215, + "step": 1816 + }, + { + "epoch": 1.8276230590306155, + "grad_norm": 18.363711510600858, + "learning_rate": 7.051196065280574e-06, + "loss": 2.3173, + "step": 1817 + }, + { + "epoch": 1.828628905513296, + "grad_norm": 22.856385538734763, + "learning_rate": 7.04896042924212e-06, + "loss": 2.2784, + "step": 1818 + }, + { + "epoch": 1.8296347519959766, + "grad_norm": 17.64647189595528, + "learning_rate": 7.046724793203668e-06, + "loss": 2.1983, + "step": 1819 + }, + { + "epoch": 1.8306405984786571, + "grad_norm": 19.867650267325015, + "learning_rate": 7.044489157165214e-06, + "loss": 2.3085, + "step": 1820 + }, + { + "epoch": 1.8316464449613377, + "grad_norm": 18.776829497143535, + "learning_rate": 7.042253521126761e-06, + "loss": 2.2682, + "step": 1821 + }, + { + "epoch": 1.8326522914440182, + "grad_norm": 16.021203164915136, + "learning_rate": 7.040017885088308e-06, + "loss": 2.2734, + "step": 1822 + }, + { + "epoch": 1.833658137926699, + "grad_norm": 16.417856249676, + "learning_rate": 7.037782249049855e-06, + "loss": 2.2676, + "step": 1823 + }, + { + "epoch": 1.8346639844093795, + "grad_norm": 16.722581854094724, + "learning_rate": 7.035546613011402e-06, + "loss": 2.2728, + "step": 1824 + }, + { + "epoch": 1.8356698308920603, + "grad_norm": 17.47207904072863, + "learning_rate": 7.033310976972949e-06, + "loss": 2.2823, + "step": 1825 + }, + { + "epoch": 1.8366756773747408, + "grad_norm": 18.9806791066407, + "learning_rate": 7.031075340934497e-06, + "loss": 2.3478, + "step": 1826 + }, + { + "epoch": 1.8376815238574213, + "grad_norm": 16.279698640644096, + "learning_rate": 7.0288397048960435e-06, + "loss": 2.2949, + "step": 1827 + }, + { + "epoch": 1.8386873703401019, + "grad_norm": 17.345829236953485, + "learning_rate": 7.026604068857591e-06, + "loss": 2.3412, + "step": 1828 + }, + { + "epoch": 1.8396932168227824, + "grad_norm": 19.52245272171043, + "learning_rate": 7.0243684328191375e-06, + "loss": 2.3069, + "step": 1829 + }, + { + "epoch": 1.840699063305463, + "grad_norm": 20.41333556532244, + "learning_rate": 7.022132796780685e-06, + "loss": 2.2643, + "step": 1830 + }, + { + "epoch": 1.8417049097881435, + "grad_norm": 16.19207248247733, + "learning_rate": 7.0198971607422314e-06, + "loss": 2.2304, + "step": 1831 + }, + { + "epoch": 1.842710756270824, + "grad_norm": 17.371107507447974, + "learning_rate": 7.017661524703778e-06, + "loss": 2.3066, + "step": 1832 + }, + { + "epoch": 1.8437166027535048, + "grad_norm": 15.865840975698038, + "learning_rate": 7.015425888665326e-06, + "loss": 2.2728, + "step": 1833 + }, + { + "epoch": 1.8447224492361853, + "grad_norm": 18.433634282678582, + "learning_rate": 7.013190252626873e-06, + "loss": 2.2587, + "step": 1834 + }, + { + "epoch": 1.8457282957188659, + "grad_norm": 19.746009444985077, + "learning_rate": 7.01095461658842e-06, + "loss": 2.3259, + "step": 1835 + }, + { + "epoch": 1.8467341422015466, + "grad_norm": 19.45097779972063, + "learning_rate": 7.008718980549967e-06, + "loss": 2.3208, + "step": 1836 + }, + { + "epoch": 1.8477399886842272, + "grad_norm": 17.122946543898664, + "learning_rate": 7.006483344511514e-06, + "loss": 2.2804, + "step": 1837 + }, + { + "epoch": 1.8487458351669077, + "grad_norm": 17.990726363404274, + "learning_rate": 7.004247708473061e-06, + "loss": 2.2692, + "step": 1838 + }, + { + "epoch": 1.8497516816495883, + "grad_norm": 17.06390467461542, + "learning_rate": 7.002012072434608e-06, + "loss": 2.2984, + "step": 1839 + }, + { + "epoch": 1.8507575281322688, + "grad_norm": 20.004890911595176, + "learning_rate": 6.999776436396155e-06, + "loss": 2.3004, + "step": 1840 + }, + { + "epoch": 1.8517633746149493, + "grad_norm": 19.17768686986296, + "learning_rate": 6.997540800357703e-06, + "loss": 2.2812, + "step": 1841 + }, + { + "epoch": 1.8527692210976299, + "grad_norm": 15.70661341588764, + "learning_rate": 6.9953051643192495e-06, + "loss": 2.2868, + "step": 1842 + }, + { + "epoch": 1.8537750675803104, + "grad_norm": 16.556570727346433, + "learning_rate": 6.993069528280797e-06, + "loss": 2.2866, + "step": 1843 + }, + { + "epoch": 1.8547809140629912, + "grad_norm": 15.967921511409507, + "learning_rate": 6.9908338922423435e-06, + "loss": 2.249, + "step": 1844 + }, + { + "epoch": 1.8557867605456717, + "grad_norm": 16.70715263496682, + "learning_rate": 6.98859825620389e-06, + "loss": 2.3087, + "step": 1845 + }, + { + "epoch": 1.8567926070283522, + "grad_norm": 20.495584603518225, + "learning_rate": 6.9863626201654375e-06, + "loss": 2.2572, + "step": 1846 + }, + { + "epoch": 1.857798453511033, + "grad_norm": 17.47988908055069, + "learning_rate": 6.984126984126984e-06, + "loss": 2.2883, + "step": 1847 + }, + { + "epoch": 1.8588042999937135, + "grad_norm": 17.017278697671724, + "learning_rate": 6.981891348088532e-06, + "loss": 2.2793, + "step": 1848 + }, + { + "epoch": 1.859810146476394, + "grad_norm": 18.59038874515862, + "learning_rate": 6.979655712050079e-06, + "loss": 2.2574, + "step": 1849 + }, + { + "epoch": 1.8608159929590746, + "grad_norm": 19.53865063389471, + "learning_rate": 6.977420076011626e-06, + "loss": 2.3442, + "step": 1850 + }, + { + "epoch": 1.8618218394417552, + "grad_norm": 16.338863083503824, + "learning_rate": 6.975184439973173e-06, + "loss": 2.2877, + "step": 1851 + }, + { + "epoch": 1.8628276859244357, + "grad_norm": 17.241535588150537, + "learning_rate": 6.97294880393472e-06, + "loss": 2.296, + "step": 1852 + }, + { + "epoch": 1.8638335324071162, + "grad_norm": 16.442352629632072, + "learning_rate": 6.970713167896267e-06, + "loss": 2.3199, + "step": 1853 + }, + { + "epoch": 1.8648393788897968, + "grad_norm": 18.764930586467397, + "learning_rate": 6.968477531857815e-06, + "loss": 2.2887, + "step": 1854 + }, + { + "epoch": 1.8658452253724775, + "grad_norm": 16.0002800988153, + "learning_rate": 6.966241895819362e-06, + "loss": 2.267, + "step": 1855 + }, + { + "epoch": 1.866851071855158, + "grad_norm": 16.1416440519751, + "learning_rate": 6.964006259780907e-06, + "loss": 2.2854, + "step": 1856 + }, + { + "epoch": 1.8678569183378388, + "grad_norm": 18.03816884235956, + "learning_rate": 6.9617706237424556e-06, + "loss": 2.2578, + "step": 1857 + }, + { + "epoch": 1.8688627648205194, + "grad_norm": 15.317109741476976, + "learning_rate": 6.959534987704002e-06, + "loss": 2.263, + "step": 1858 + }, + { + "epoch": 1.8698686113032, + "grad_norm": 14.828646134819525, + "learning_rate": 6.9572993516655495e-06, + "loss": 2.2415, + "step": 1859 + }, + { + "epoch": 1.8708744577858805, + "grad_norm": 14.97271984899026, + "learning_rate": 6.955063715627096e-06, + "loss": 2.2852, + "step": 1860 + }, + { + "epoch": 1.871880304268561, + "grad_norm": 16.65094367595834, + "learning_rate": 6.9528280795886435e-06, + "loss": 2.3065, + "step": 1861 + }, + { + "epoch": 1.8728861507512415, + "grad_norm": 15.882338299411797, + "learning_rate": 6.95059244355019e-06, + "loss": 2.2813, + "step": 1862 + }, + { + "epoch": 1.873891997233922, + "grad_norm": 15.05351225160577, + "learning_rate": 6.948356807511738e-06, + "loss": 2.2995, + "step": 1863 + }, + { + "epoch": 1.8748978437166026, + "grad_norm": 17.677697128378234, + "learning_rate": 6.946121171473285e-06, + "loss": 2.2247, + "step": 1864 + }, + { + "epoch": 1.8759036901992834, + "grad_norm": 14.832365800788716, + "learning_rate": 6.943885535434832e-06, + "loss": 2.3025, + "step": 1865 + }, + { + "epoch": 1.876909536681964, + "grad_norm": 15.956879223213827, + "learning_rate": 6.941649899396379e-06, + "loss": 2.3567, + "step": 1866 + }, + { + "epoch": 1.8779153831646445, + "grad_norm": 17.36600813497739, + "learning_rate": 6.939414263357926e-06, + "loss": 2.32, + "step": 1867 + }, + { + "epoch": 1.8789212296473252, + "grad_norm": 16.959525128835914, + "learning_rate": 6.937178627319473e-06, + "loss": 2.3037, + "step": 1868 + }, + { + "epoch": 1.8799270761300058, + "grad_norm": 16.21373348481673, + "learning_rate": 6.934942991281019e-06, + "loss": 2.3315, + "step": 1869 + }, + { + "epoch": 1.8809329226126863, + "grad_norm": 16.510470523686852, + "learning_rate": 6.932707355242568e-06, + "loss": 2.2608, + "step": 1870 + }, + { + "epoch": 1.8819387690953668, + "grad_norm": 17.030510814810018, + "learning_rate": 6.930471719204114e-06, + "loss": 2.2677, + "step": 1871 + }, + { + "epoch": 1.8829446155780474, + "grad_norm": 18.22176597603188, + "learning_rate": 6.9282360831656616e-06, + "loss": 2.3149, + "step": 1872 + }, + { + "epoch": 1.883950462060728, + "grad_norm": 16.792187637096355, + "learning_rate": 6.926000447127208e-06, + "loss": 2.2751, + "step": 1873 + }, + { + "epoch": 1.8849563085434085, + "grad_norm": 16.183554558324985, + "learning_rate": 6.9237648110887555e-06, + "loss": 2.2863, + "step": 1874 + }, + { + "epoch": 1.885962155026089, + "grad_norm": 17.300579827350667, + "learning_rate": 6.921529175050302e-06, + "loss": 2.2912, + "step": 1875 + }, + { + "epoch": 1.8869680015087698, + "grad_norm": 18.153546483178403, + "learning_rate": 6.9192935390118495e-06, + "loss": 2.2814, + "step": 1876 + }, + { + "epoch": 1.8879738479914503, + "grad_norm": 19.594562516452527, + "learning_rate": 6.917057902973396e-06, + "loss": 2.3179, + "step": 1877 + }, + { + "epoch": 1.8889796944741308, + "grad_norm": 17.096198547393634, + "learning_rate": 6.914822266934944e-06, + "loss": 2.3012, + "step": 1878 + }, + { + "epoch": 1.8899855409568116, + "grad_norm": 18.147615561617283, + "learning_rate": 6.912586630896491e-06, + "loss": 2.2432, + "step": 1879 + }, + { + "epoch": 1.8909913874394921, + "grad_norm": 16.826685763955773, + "learning_rate": 6.9103509948580374e-06, + "loss": 2.3103, + "step": 1880 + }, + { + "epoch": 1.8919972339221727, + "grad_norm": 16.737903754646183, + "learning_rate": 6.908115358819585e-06, + "loss": 2.3051, + "step": 1881 + }, + { + "epoch": 1.8930030804048532, + "grad_norm": 16.529744230872915, + "learning_rate": 6.905879722781131e-06, + "loss": 2.2755, + "step": 1882 + }, + { + "epoch": 1.8940089268875338, + "grad_norm": 19.09942006258242, + "learning_rate": 6.903644086742679e-06, + "loss": 2.3269, + "step": 1883 + }, + { + "epoch": 1.8950147733702143, + "grad_norm": 19.531445891717162, + "learning_rate": 6.901408450704225e-06, + "loss": 2.2548, + "step": 1884 + }, + { + "epoch": 1.8960206198528948, + "grad_norm": 18.997923333530903, + "learning_rate": 6.899172814665774e-06, + "loss": 2.3261, + "step": 1885 + }, + { + "epoch": 1.8970264663355754, + "grad_norm": 18.461677933705914, + "learning_rate": 6.89693717862732e-06, + "loss": 2.2897, + "step": 1886 + }, + { + "epoch": 1.8980323128182561, + "grad_norm": 18.61814161726463, + "learning_rate": 6.894701542588868e-06, + "loss": 2.3123, + "step": 1887 + }, + { + "epoch": 1.8990381593009367, + "grad_norm": 18.092851713644524, + "learning_rate": 6.892465906550414e-06, + "loss": 2.2905, + "step": 1888 + }, + { + "epoch": 1.9000440057836174, + "grad_norm": 17.044430585074025, + "learning_rate": 6.8902302705119616e-06, + "loss": 2.2941, + "step": 1889 + }, + { + "epoch": 1.901049852266298, + "grad_norm": 17.0026458608325, + "learning_rate": 6.887994634473508e-06, + "loss": 2.2681, + "step": 1890 + }, + { + "epoch": 1.9020556987489785, + "grad_norm": 15.129874898168884, + "learning_rate": 6.885758998435055e-06, + "loss": 2.3103, + "step": 1891 + }, + { + "epoch": 1.903061545231659, + "grad_norm": 18.273974027399, + "learning_rate": 6.883523362396602e-06, + "loss": 2.3008, + "step": 1892 + }, + { + "epoch": 1.9040673917143396, + "grad_norm": 19.742323390440422, + "learning_rate": 6.881287726358149e-06, + "loss": 2.2705, + "step": 1893 + }, + { + "epoch": 1.9050732381970201, + "grad_norm": 17.714002726675265, + "learning_rate": 6.879052090319697e-06, + "loss": 2.263, + "step": 1894 + }, + { + "epoch": 1.9060790846797007, + "grad_norm": 18.144590858661886, + "learning_rate": 6.8768164542812435e-06, + "loss": 2.2798, + "step": 1895 + }, + { + "epoch": 1.9070849311623812, + "grad_norm": 19.275985168763043, + "learning_rate": 6.874580818242791e-06, + "loss": 2.2764, + "step": 1896 + }, + { + "epoch": 1.908090777645062, + "grad_norm": 16.984422505971622, + "learning_rate": 6.872345182204337e-06, + "loss": 2.2937, + "step": 1897 + }, + { + "epoch": 1.9090966241277425, + "grad_norm": 17.473288710210316, + "learning_rate": 6.870109546165885e-06, + "loss": 2.3078, + "step": 1898 + }, + { + "epoch": 1.910102470610423, + "grad_norm": 17.934114969579213, + "learning_rate": 6.867873910127431e-06, + "loss": 2.2774, + "step": 1899 + }, + { + "epoch": 1.9111083170931038, + "grad_norm": 16.511999530124953, + "learning_rate": 6.86563827408898e-06, + "loss": 2.2898, + "step": 1900 + }, + { + "epoch": 1.9121141635757843, + "grad_norm": 17.374441373909356, + "learning_rate": 6.863402638050526e-06, + "loss": 2.289, + "step": 1901 + }, + { + "epoch": 1.9131200100584649, + "grad_norm": 17.336050692279382, + "learning_rate": 6.861167002012074e-06, + "loss": 2.2862, + "step": 1902 + }, + { + "epoch": 1.9141258565411454, + "grad_norm": 17.63830681128583, + "learning_rate": 6.85893136597362e-06, + "loss": 2.3034, + "step": 1903 + }, + { + "epoch": 1.915131703023826, + "grad_norm": 15.818875289328755, + "learning_rate": 6.856695729935167e-06, + "loss": 2.2395, + "step": 1904 + }, + { + "epoch": 1.9161375495065065, + "grad_norm": 21.700760974979666, + "learning_rate": 6.854460093896714e-06, + "loss": 2.2894, + "step": 1905 + }, + { + "epoch": 1.917143395989187, + "grad_norm": 18.11623312546713, + "learning_rate": 6.852224457858261e-06, + "loss": 2.2581, + "step": 1906 + }, + { + "epoch": 1.9181492424718676, + "grad_norm": 17.35648839707561, + "learning_rate": 6.849988821819808e-06, + "loss": 2.2989, + "step": 1907 + }, + { + "epoch": 1.9191550889545483, + "grad_norm": 15.924160561726598, + "learning_rate": 6.847753185781355e-06, + "loss": 2.306, + "step": 1908 + }, + { + "epoch": 1.9201609354372289, + "grad_norm": 16.623839964461403, + "learning_rate": 6.845517549742903e-06, + "loss": 2.332, + "step": 1909 + }, + { + "epoch": 1.9211667819199096, + "grad_norm": 19.17988497622896, + "learning_rate": 6.8432819137044495e-06, + "loss": 2.3114, + "step": 1910 + }, + { + "epoch": 1.9221726284025902, + "grad_norm": 16.50083651306228, + "learning_rate": 6.841046277665997e-06, + "loss": 2.3017, + "step": 1911 + }, + { + "epoch": 1.9231784748852707, + "grad_norm": 17.153877632238867, + "learning_rate": 6.8388106416275434e-06, + "loss": 2.2815, + "step": 1912 + }, + { + "epoch": 1.9241843213679513, + "grad_norm": 17.99505447335445, + "learning_rate": 6.836575005589091e-06, + "loss": 2.256, + "step": 1913 + }, + { + "epoch": 1.9251901678506318, + "grad_norm": 16.618949942022258, + "learning_rate": 6.834339369550637e-06, + "loss": 2.2815, + "step": 1914 + }, + { + "epoch": 1.9261960143333123, + "grad_norm": 17.30100921512153, + "learning_rate": 6.832103733512184e-06, + "loss": 2.2528, + "step": 1915 + }, + { + "epoch": 1.9272018608159929, + "grad_norm": 16.833482665869667, + "learning_rate": 6.829868097473732e-06, + "loss": 2.2769, + "step": 1916 + }, + { + "epoch": 1.9282077072986734, + "grad_norm": 17.327557752569497, + "learning_rate": 6.827632461435279e-06, + "loss": 2.2319, + "step": 1917 + }, + { + "epoch": 1.9292135537813542, + "grad_norm": 16.753107835060206, + "learning_rate": 6.825396825396826e-06, + "loss": 2.3046, + "step": 1918 + }, + { + "epoch": 1.9302194002640347, + "grad_norm": 16.23041816840295, + "learning_rate": 6.823161189358373e-06, + "loss": 2.3226, + "step": 1919 + }, + { + "epoch": 1.9312252467467153, + "grad_norm": 15.740962564667383, + "learning_rate": 6.82092555331992e-06, + "loss": 2.3109, + "step": 1920 + }, + { + "epoch": 1.932231093229396, + "grad_norm": 17.19442784322785, + "learning_rate": 6.818689917281467e-06, + "loss": 2.2663, + "step": 1921 + }, + { + "epoch": 1.9332369397120766, + "grad_norm": 16.598926970052784, + "learning_rate": 6.816454281243015e-06, + "loss": 2.2847, + "step": 1922 + }, + { + "epoch": 1.934242786194757, + "grad_norm": 16.403215664959518, + "learning_rate": 6.8142186452045615e-06, + "loss": 2.2704, + "step": 1923 + }, + { + "epoch": 1.9352486326774376, + "grad_norm": 16.847958638967047, + "learning_rate": 6.811983009166109e-06, + "loss": 2.3015, + "step": 1924 + }, + { + "epoch": 1.9362544791601182, + "grad_norm": 16.187628346715247, + "learning_rate": 6.8097473731276555e-06, + "loss": 2.2388, + "step": 1925 + }, + { + "epoch": 1.9372603256427987, + "grad_norm": 17.867635463374597, + "learning_rate": 6.807511737089203e-06, + "loss": 2.3398, + "step": 1926 + }, + { + "epoch": 1.9382661721254792, + "grad_norm": 16.06214084830298, + "learning_rate": 6.8052761010507495e-06, + "loss": 2.2553, + "step": 1927 + }, + { + "epoch": 1.9392720186081598, + "grad_norm": 15.857951038791413, + "learning_rate": 6.803040465012296e-06, + "loss": 2.3089, + "step": 1928 + }, + { + "epoch": 1.9402778650908405, + "grad_norm": 16.151187192882507, + "learning_rate": 6.800804828973843e-06, + "loss": 2.2297, + "step": 1929 + }, + { + "epoch": 1.941283711573521, + "grad_norm": 20.836594910648667, + "learning_rate": 6.79856919293539e-06, + "loss": 2.285, + "step": 1930 + }, + { + "epoch": 1.9422895580562016, + "grad_norm": 14.215712608950872, + "learning_rate": 6.796333556896938e-06, + "loss": 2.3463, + "step": 1931 + }, + { + "epoch": 1.9432954045388824, + "grad_norm": 16.87665782003356, + "learning_rate": 6.794097920858485e-06, + "loss": 2.3153, + "step": 1932 + }, + { + "epoch": 1.944301251021563, + "grad_norm": 17.73419471480312, + "learning_rate": 6.791862284820032e-06, + "loss": 2.3007, + "step": 1933 + }, + { + "epoch": 1.9453070975042435, + "grad_norm": 18.054020098005882, + "learning_rate": 6.789626648781579e-06, + "loss": 2.2164, + "step": 1934 + }, + { + "epoch": 1.946312943986924, + "grad_norm": 20.446090011166692, + "learning_rate": 6.787391012743126e-06, + "loss": 2.2585, + "step": 1935 + }, + { + "epoch": 1.9473187904696045, + "grad_norm": 15.36859727143387, + "learning_rate": 6.785155376704673e-06, + "loss": 2.3062, + "step": 1936 + }, + { + "epoch": 1.948324636952285, + "grad_norm": 18.920666147270758, + "learning_rate": 6.782919740666221e-06, + "loss": 2.2437, + "step": 1937 + }, + { + "epoch": 1.9493304834349656, + "grad_norm": 20.03365854926881, + "learning_rate": 6.7806841046277675e-06, + "loss": 2.3588, + "step": 1938 + }, + { + "epoch": 1.9503363299176462, + "grad_norm": 17.9178072777866, + "learning_rate": 6.778448468589314e-06, + "loss": 2.3013, + "step": 1939 + }, + { + "epoch": 1.951342176400327, + "grad_norm": 15.531749211477312, + "learning_rate": 6.7762128325508615e-06, + "loss": 2.2827, + "step": 1940 + }, + { + "epoch": 1.9523480228830075, + "grad_norm": 17.621182445872154, + "learning_rate": 6.773977196512408e-06, + "loss": 2.3119, + "step": 1941 + }, + { + "epoch": 1.9533538693656882, + "grad_norm": 14.977545170236327, + "learning_rate": 6.7717415604739555e-06, + "loss": 2.2564, + "step": 1942 + }, + { + "epoch": 1.9543597158483688, + "grad_norm": 20.097789651883318, + "learning_rate": 6.769505924435502e-06, + "loss": 2.2334, + "step": 1943 + }, + { + "epoch": 1.9553655623310493, + "grad_norm": 16.81766797993441, + "learning_rate": 6.7672702883970494e-06, + "loss": 2.3052, + "step": 1944 + }, + { + "epoch": 1.9563714088137298, + "grad_norm": 15.694137260145133, + "learning_rate": 6.765034652358596e-06, + "loss": 2.2931, + "step": 1945 + }, + { + "epoch": 1.9573772552964104, + "grad_norm": 18.560231807833976, + "learning_rate": 6.762799016320144e-06, + "loss": 2.2836, + "step": 1946 + }, + { + "epoch": 1.958383101779091, + "grad_norm": 14.368601263956439, + "learning_rate": 6.760563380281691e-06, + "loss": 2.2922, + "step": 1947 + }, + { + "epoch": 1.9593889482617715, + "grad_norm": 15.779224845368637, + "learning_rate": 6.758327744243238e-06, + "loss": 2.3322, + "step": 1948 + }, + { + "epoch": 1.960394794744452, + "grad_norm": 17.026069508766124, + "learning_rate": 6.756092108204785e-06, + "loss": 2.3136, + "step": 1949 + }, + { + "epoch": 1.9614006412271328, + "grad_norm": 13.895810317269211, + "learning_rate": 6.753856472166331e-06, + "loss": 2.3046, + "step": 1950 + }, + { + "epoch": 1.9624064877098133, + "grad_norm": 15.082309738104678, + "learning_rate": 6.751620836127879e-06, + "loss": 2.2669, + "step": 1951 + }, + { + "epoch": 1.9634123341924938, + "grad_norm": 16.69092843841514, + "learning_rate": 6.749385200089425e-06, + "loss": 2.2927, + "step": 1952 + }, + { + "epoch": 1.9644181806751746, + "grad_norm": 17.305936411163774, + "learning_rate": 6.7471495640509736e-06, + "loss": 2.2891, + "step": 1953 + }, + { + "epoch": 1.9654240271578551, + "grad_norm": 17.003792051735022, + "learning_rate": 6.74491392801252e-06, + "loss": 2.2635, + "step": 1954 + }, + { + "epoch": 1.9664298736405357, + "grad_norm": 16.2653781486701, + "learning_rate": 6.7426782919740675e-06, + "loss": 2.2911, + "step": 1955 + }, + { + "epoch": 1.9674357201232162, + "grad_norm": 17.781504567455677, + "learning_rate": 6.740442655935614e-06, + "loss": 2.3029, + "step": 1956 + }, + { + "epoch": 1.9684415666058968, + "grad_norm": 19.581271792774587, + "learning_rate": 6.7382070198971615e-06, + "loss": 2.3463, + "step": 1957 + }, + { + "epoch": 1.9694474130885773, + "grad_norm": 16.588824708722353, + "learning_rate": 6.735971383858708e-06, + "loss": 2.3208, + "step": 1958 + }, + { + "epoch": 1.9704532595712578, + "grad_norm": 20.341400759529318, + "learning_rate": 6.7337357478202555e-06, + "loss": 2.2971, + "step": 1959 + }, + { + "epoch": 1.9714591060539384, + "grad_norm": 23.39973002026867, + "learning_rate": 6.731500111781802e-06, + "loss": 2.2627, + "step": 1960 + }, + { + "epoch": 1.9724649525366191, + "grad_norm": 17.628348506173374, + "learning_rate": 6.72926447574335e-06, + "loss": 2.2762, + "step": 1961 + }, + { + "epoch": 1.9734707990192997, + "grad_norm": 22.430240616517345, + "learning_rate": 6.727028839704897e-06, + "loss": 2.3183, + "step": 1962 + }, + { + "epoch": 1.9744766455019802, + "grad_norm": 22.262878514597467, + "learning_rate": 6.724793203666443e-06, + "loss": 2.2925, + "step": 1963 + }, + { + "epoch": 1.975482491984661, + "grad_norm": 16.358599953165104, + "learning_rate": 6.722557567627991e-06, + "loss": 2.3594, + "step": 1964 + }, + { + "epoch": 1.9764883384673415, + "grad_norm": 17.417739772341868, + "learning_rate": 6.720321931589537e-06, + "loss": 2.3343, + "step": 1965 + }, + { + "epoch": 1.977494184950022, + "grad_norm": 19.997471037391072, + "learning_rate": 6.718086295551085e-06, + "loss": 2.2964, + "step": 1966 + }, + { + "epoch": 1.9785000314327026, + "grad_norm": 18.199676913970578, + "learning_rate": 6.715850659512631e-06, + "loss": 2.3029, + "step": 1967 + }, + { + "epoch": 1.9795058779153831, + "grad_norm": 20.29139543009488, + "learning_rate": 6.71361502347418e-06, + "loss": 2.3143, + "step": 1968 + }, + { + "epoch": 1.9805117243980637, + "grad_norm": 19.02342024523025, + "learning_rate": 6.711379387435726e-06, + "loss": 2.3403, + "step": 1969 + }, + { + "epoch": 1.9815175708807442, + "grad_norm": 18.105251981323825, + "learning_rate": 6.7091437513972735e-06, + "loss": 2.237, + "step": 1970 + }, + { + "epoch": 1.9825234173634247, + "grad_norm": 18.408253755204424, + "learning_rate": 6.70690811535882e-06, + "loss": 2.307, + "step": 1971 + }, + { + "epoch": 1.9835292638461055, + "grad_norm": 18.72993515240626, + "learning_rate": 6.7046724793203675e-06, + "loss": 2.3008, + "step": 1972 + }, + { + "epoch": 1.984535110328786, + "grad_norm": 17.73016244923382, + "learning_rate": 6.702436843281914e-06, + "loss": 2.2776, + "step": 1973 + }, + { + "epoch": 1.9855409568114668, + "grad_norm": 17.602188647925164, + "learning_rate": 6.700201207243461e-06, + "loss": 2.2459, + "step": 1974 + }, + { + "epoch": 1.9865468032941473, + "grad_norm": 20.771872926086225, + "learning_rate": 6.697965571205008e-06, + "loss": 2.284, + "step": 1975 + }, + { + "epoch": 1.9875526497768279, + "grad_norm": 16.413973765717518, + "learning_rate": 6.695729935166555e-06, + "loss": 2.3418, + "step": 1976 + }, + { + "epoch": 1.9885584962595084, + "grad_norm": 21.346503688042784, + "learning_rate": 6.693494299128103e-06, + "loss": 2.3004, + "step": 1977 + }, + { + "epoch": 1.989564342742189, + "grad_norm": 20.864761239960973, + "learning_rate": 6.691258663089649e-06, + "loss": 2.3018, + "step": 1978 + }, + { + "epoch": 1.9905701892248695, + "grad_norm": 15.97287960488781, + "learning_rate": 6.689023027051197e-06, + "loss": 2.2562, + "step": 1979 + }, + { + "epoch": 1.99157603570755, + "grad_norm": 17.090597044595953, + "learning_rate": 6.686787391012743e-06, + "loss": 2.2424, + "step": 1980 + }, + { + "epoch": 1.9925818821902306, + "grad_norm": 20.64289156909885, + "learning_rate": 6.684551754974291e-06, + "loss": 2.3371, + "step": 1981 + }, + { + "epoch": 1.9935877286729113, + "grad_norm": 17.23349871929263, + "learning_rate": 6.682316118935837e-06, + "loss": 2.2772, + "step": 1982 + }, + { + "epoch": 1.9945935751555919, + "grad_norm": 18.728810327340735, + "learning_rate": 6.680080482897386e-06, + "loss": 2.3038, + "step": 1983 + }, + { + "epoch": 1.9955994216382724, + "grad_norm": 20.432953519649597, + "learning_rate": 6.677844846858932e-06, + "loss": 2.3487, + "step": 1984 + }, + { + "epoch": 1.9966052681209532, + "grad_norm": 17.389497689218956, + "learning_rate": 6.6756092108204796e-06, + "loss": 2.2715, + "step": 1985 + }, + { + "epoch": 1.9976111146036337, + "grad_norm": 15.683209711373696, + "learning_rate": 6.673373574782026e-06, + "loss": 2.2951, + "step": 1986 + }, + { + "epoch": 1.9986169610863143, + "grad_norm": 19.70295022855433, + "learning_rate": 6.671137938743573e-06, + "loss": 2.3248, + "step": 1987 + }, + { + "epoch": 1.9996228075689948, + "grad_norm": 17.3150299364818, + "learning_rate": 6.66890230270512e-06, + "loss": 2.2834, + "step": 1988 + }, + { + "epoch": 2.0006286540516753, + "grad_norm": 16.761655273211517, + "learning_rate": 6.666666666666667e-06, + "loss": 2.1124, + "step": 1989 + }, + { + "epoch": 2.001634500534356, + "grad_norm": 19.19275918856666, + "learning_rate": 6.664431030628215e-06, + "loss": 2.0861, + "step": 1990 + }, + { + "epoch": 2.0026403470170364, + "grad_norm": 18.3665234076943, + "learning_rate": 6.6621953945897615e-06, + "loss": 2.0598, + "step": 1991 + }, + { + "epoch": 2.003646193499717, + "grad_norm": 16.111865050848618, + "learning_rate": 6.659959758551309e-06, + "loss": 2.0566, + "step": 1992 + }, + { + "epoch": 2.0046520399823975, + "grad_norm": 18.032869066711967, + "learning_rate": 6.6577241225128554e-06, + "loss": 2.045, + "step": 1993 + }, + { + "epoch": 2.0056578864650785, + "grad_norm": 16.739105171425805, + "learning_rate": 6.655488486474403e-06, + "loss": 2.0277, + "step": 1994 + }, + { + "epoch": 2.006663732947759, + "grad_norm": 15.937915097580044, + "learning_rate": 6.653252850435949e-06, + "loss": 2.0072, + "step": 1995 + }, + { + "epoch": 2.0076695794304396, + "grad_norm": 18.04853977752509, + "learning_rate": 6.651017214397497e-06, + "loss": 1.984, + "step": 1996 + }, + { + "epoch": 2.00867542591312, + "grad_norm": 17.54238079716516, + "learning_rate": 6.648781578359043e-06, + "loss": 2.041, + "step": 1997 + }, + { + "epoch": 2.0096812723958006, + "grad_norm": 16.72820920478064, + "learning_rate": 6.64654594232059e-06, + "loss": 2.0229, + "step": 1998 + }, + { + "epoch": 2.010687118878481, + "grad_norm": 18.224151388488885, + "learning_rate": 6.644310306282138e-06, + "loss": 2.0096, + "step": 1999 + }, + { + "epoch": 2.0116929653611617, + "grad_norm": 17.796645791493617, + "learning_rate": 6.642074670243685e-06, + "loss": 2.0127, + "step": 2000 + } + ], + "logging_steps": 1.0, + "max_steps": 4970, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}