diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,77028 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9660032712383503, + "eval_steps": 1000, + "global_step": 22000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 4.39092396017432e-05, + "grad_norm": 350.0, + "learning_rate": 7.246376811594203e-07, + "loss": 6.5493, + "step": 1 + }, + { + "epoch": 8.78184792034864e-05, + "grad_norm": 318.0, + "learning_rate": 1.4492753623188406e-06, + "loss": 6.505, + "step": 2 + }, + { + "epoch": 0.0001756369584069728, + "grad_norm": 366.0, + "learning_rate": 2.898550724637681e-06, + "loss": 6.4657, + "step": 4 + }, + { + "epoch": 0.00026345543761045916, + "grad_norm": 390.0, + "learning_rate": 4.347826086956522e-06, + "loss": 6.1079, + "step": 6 + }, + { + "epoch": 0.0003512739168139456, + "grad_norm": 300.0, + "learning_rate": 5.797101449275362e-06, + "loss": 5.4335, + "step": 8 + }, + { + "epoch": 0.000439092396017432, + "grad_norm": 151.0, + "learning_rate": 7.246376811594203e-06, + "loss": 4.4914, + "step": 10 + }, + { + "epoch": 0.0005269108752209183, + "grad_norm": 85.0, + "learning_rate": 8.695652173913044e-06, + "loss": 3.9792, + "step": 12 + }, + { + "epoch": 0.0006147293544244047, + "grad_norm": 15.0, + "learning_rate": 1.0144927536231885e-05, + "loss": 3.7765, + "step": 14 + }, + { + "epoch": 0.0007025478336278912, + "grad_norm": 9.5625, + "learning_rate": 1.1594202898550725e-05, + "loss": 3.7259, + "step": 16 + }, + { + "epoch": 0.0007903663128313776, + "grad_norm": 7.25, + "learning_rate": 1.3043478260869566e-05, + "loss": 3.657, + "step": 18 + }, + { + "epoch": 0.000878184792034864, + "grad_norm": 16.25, + "learning_rate": 1.4492753623188407e-05, + "loss": 3.6366, + "step": 20 + }, + { + "epoch": 0.0009660032712383504, + "grad_norm": 8.0, + "learning_rate": 1.5942028985507246e-05, + "loss": 3.5582, + "step": 22 + }, + { + "epoch": 0.0010538217504418366, + "grad_norm": 10.8125, + "learning_rate": 1.739130434782609e-05, + "loss": 3.4616, + "step": 24 + }, + { + "epoch": 0.0011416402296453231, + "grad_norm": 6.65625, + "learning_rate": 1.8840579710144928e-05, + "loss": 3.3355, + "step": 26 + }, + { + "epoch": 0.0012294587088488094, + "grad_norm": 6.21875, + "learning_rate": 2.028985507246377e-05, + "loss": 3.2562, + "step": 28 + }, + { + "epoch": 0.001317277188052296, + "grad_norm": 5.75, + "learning_rate": 2.173913043478261e-05, + "loss": 3.1432, + "step": 30 + }, + { + "epoch": 0.0014050956672557824, + "grad_norm": 7.15625, + "learning_rate": 2.318840579710145e-05, + "loss": 3.0361, + "step": 32 + }, + { + "epoch": 0.0014929141464592687, + "grad_norm": 8.25, + "learning_rate": 2.4637681159420292e-05, + "loss": 2.9816, + "step": 34 + }, + { + "epoch": 0.0015807326256627552, + "grad_norm": 7.4375, + "learning_rate": 2.608695652173913e-05, + "loss": 2.8625, + "step": 36 + }, + { + "epoch": 0.0016685511048662414, + "grad_norm": 6.03125, + "learning_rate": 2.753623188405797e-05, + "loss": 2.712, + "step": 38 + }, + { + "epoch": 0.001756369584069728, + "grad_norm": 9.375, + "learning_rate": 2.8985507246376814e-05, + "loss": 2.5506, + "step": 40 + }, + { + "epoch": 0.0018441880632732142, + "grad_norm": 8.8125, + "learning_rate": 3.0434782608695656e-05, + "loss": 2.4374, + "step": 42 + }, + { + "epoch": 0.0019320065424767007, + "grad_norm": 16.625, + "learning_rate": 3.188405797101449e-05, + "loss": 2.3639, + "step": 44 + }, + { + "epoch": 0.002019825021680187, + "grad_norm": 15.1875, + "learning_rate": 3.3333333333333335e-05, + "loss": 2.2934, + "step": 46 + }, + { + "epoch": 0.0021076435008836733, + "grad_norm": 9.375, + "learning_rate": 3.478260869565218e-05, + "loss": 2.2077, + "step": 48 + }, + { + "epoch": 0.00219546198008716, + "grad_norm": 9.875, + "learning_rate": 3.6231884057971014e-05, + "loss": 2.1561, + "step": 50 + }, + { + "epoch": 0.0022832804592906463, + "grad_norm": 13.0, + "learning_rate": 3.7681159420289856e-05, + "loss": 2.1002, + "step": 52 + }, + { + "epoch": 0.0023710989384941325, + "grad_norm": 9.3125, + "learning_rate": 3.91304347826087e-05, + "loss": 2.0411, + "step": 54 + }, + { + "epoch": 0.002458917417697619, + "grad_norm": 9.0625, + "learning_rate": 4.057971014492754e-05, + "loss": 2.0039, + "step": 56 + }, + { + "epoch": 0.0025467358969011055, + "grad_norm": 8.0, + "learning_rate": 4.202898550724638e-05, + "loss": 2.0052, + "step": 58 + }, + { + "epoch": 0.002634554376104592, + "grad_norm": 8.625, + "learning_rate": 4.347826086956522e-05, + "loss": 1.9758, + "step": 60 + }, + { + "epoch": 0.002722372855308078, + "grad_norm": 13.125, + "learning_rate": 4.492753623188406e-05, + "loss": 1.9008, + "step": 62 + }, + { + "epoch": 0.002810191334511565, + "grad_norm": 6.6875, + "learning_rate": 4.63768115942029e-05, + "loss": 1.8888, + "step": 64 + }, + { + "epoch": 0.002898009813715051, + "grad_norm": 11.1875, + "learning_rate": 4.782608695652174e-05, + "loss": 1.8357, + "step": 66 + }, + { + "epoch": 0.0029858282929185373, + "grad_norm": 29.625, + "learning_rate": 4.9275362318840584e-05, + "loss": 1.9061, + "step": 68 + }, + { + "epoch": 0.0030736467721220236, + "grad_norm": 31.125, + "learning_rate": 4.9999999760686754e-05, + "loss": 1.8563, + "step": 70 + }, + { + "epoch": 0.0031614652513255103, + "grad_norm": 13.0625, + "learning_rate": 4.999999784618078e-05, + "loss": 1.8569, + "step": 72 + }, + { + "epoch": 0.0032492837305289966, + "grad_norm": 8.5625, + "learning_rate": 4.999999401716897e-05, + "loss": 1.8224, + "step": 74 + }, + { + "epoch": 0.003337102209732483, + "grad_norm": 8.6875, + "learning_rate": 4.9999988273651635e-05, + "loss": 1.8069, + "step": 76 + }, + { + "epoch": 0.003424920688935969, + "grad_norm": 10.0625, + "learning_rate": 4.9999980615629205e-05, + "loss": 1.7401, + "step": 78 + }, + { + "epoch": 0.003512739168139456, + "grad_norm": 6.75, + "learning_rate": 4.9999971043102264e-05, + "loss": 1.7421, + "step": 80 + }, + { + "epoch": 0.003600557647342942, + "grad_norm": 6.65625, + "learning_rate": 4.999995955607155e-05, + "loss": 1.7189, + "step": 82 + }, + { + "epoch": 0.0036883761265464284, + "grad_norm": 7.125, + "learning_rate": 4.999994615453794e-05, + "loss": 1.714, + "step": 84 + }, + { + "epoch": 0.0037761946057499147, + "grad_norm": 11.25, + "learning_rate": 4.9999930838502456e-05, + "loss": 1.6762, + "step": 86 + }, + { + "epoch": 0.0038640130849534014, + "grad_norm": 12.25, + "learning_rate": 4.999991360796628e-05, + "loss": 1.6994, + "step": 88 + }, + { + "epoch": 0.003951831564156888, + "grad_norm": 7.78125, + "learning_rate": 4.9999894462930725e-05, + "loss": 1.6493, + "step": 90 + }, + { + "epoch": 0.004039650043360374, + "grad_norm": 6.9375, + "learning_rate": 4.9999873403397254e-05, + "loss": 1.6475, + "step": 92 + }, + { + "epoch": 0.004127468522563861, + "grad_norm": 12.8125, + "learning_rate": 4.99998504293675e-05, + "loss": 1.6591, + "step": 94 + }, + { + "epoch": 0.0042152870017673465, + "grad_norm": 11.25, + "learning_rate": 4.999982554084319e-05, + "loss": 1.6258, + "step": 96 + }, + { + "epoch": 0.004303105480970833, + "grad_norm": 12.125, + "learning_rate": 4.999979873782625e-05, + "loss": 1.6365, + "step": 98 + }, + { + "epoch": 0.00439092396017432, + "grad_norm": 10.9375, + "learning_rate": 4.999977002031874e-05, + "loss": 1.6345, + "step": 100 + }, + { + "epoch": 0.004478742439377806, + "grad_norm": 8.9375, + "learning_rate": 4.999973938832284e-05, + "loss": 1.6242, + "step": 102 + }, + { + "epoch": 0.0045665609185812925, + "grad_norm": 9.9375, + "learning_rate": 4.999970684184091e-05, + "loss": 1.6097, + "step": 104 + }, + { + "epoch": 0.004654379397784779, + "grad_norm": 13.6875, + "learning_rate": 4.999967238087544e-05, + "loss": 1.5788, + "step": 106 + }, + { + "epoch": 0.004742197876988265, + "grad_norm": 9.125, + "learning_rate": 4.999963600542906e-05, + "loss": 1.5573, + "step": 108 + }, + { + "epoch": 0.004830016356191752, + "grad_norm": 7.46875, + "learning_rate": 4.999959771550457e-05, + "loss": 1.574, + "step": 110 + }, + { + "epoch": 0.004917834835395238, + "grad_norm": 8.9375, + "learning_rate": 4.999955751110488e-05, + "loss": 1.5836, + "step": 112 + }, + { + "epoch": 0.005005653314598724, + "grad_norm": 14.625, + "learning_rate": 4.999951539223309e-05, + "loss": 1.5808, + "step": 114 + }, + { + "epoch": 0.005093471793802211, + "grad_norm": 10.625, + "learning_rate": 4.999947135889242e-05, + "loss": 1.569, + "step": 116 + }, + { + "epoch": 0.005181290273005697, + "grad_norm": 8.1875, + "learning_rate": 4.999942541108624e-05, + "loss": 1.5499, + "step": 118 + }, + { + "epoch": 0.005269108752209184, + "grad_norm": 8.375, + "learning_rate": 4.999937754881807e-05, + "loss": 1.5262, + "step": 120 + }, + { + "epoch": 0.00535692723141267, + "grad_norm": 8.625, + "learning_rate": 4.999932777209157e-05, + "loss": 1.5405, + "step": 122 + }, + { + "epoch": 0.005444745710616156, + "grad_norm": 9.5625, + "learning_rate": 4.999927608091056e-05, + "loss": 1.5341, + "step": 124 + }, + { + "epoch": 0.005532564189819643, + "grad_norm": 9.9375, + "learning_rate": 4.9999222475279003e-05, + "loss": 1.5037, + "step": 126 + }, + { + "epoch": 0.00562038266902313, + "grad_norm": 7.75, + "learning_rate": 4.999916695520098e-05, + "loss": 1.5074, + "step": 128 + }, + { + "epoch": 0.005708201148226615, + "grad_norm": 9.25, + "learning_rate": 4.999910952068077e-05, + "loss": 1.5116, + "step": 130 + }, + { + "epoch": 0.005796019627430102, + "grad_norm": 6.71875, + "learning_rate": 4.999905017172276e-05, + "loss": 1.5003, + "step": 132 + }, + { + "epoch": 0.005883838106633588, + "grad_norm": 10.5, + "learning_rate": 4.999898890833149e-05, + "loss": 1.5161, + "step": 134 + }, + { + "epoch": 0.005971656585837075, + "grad_norm": 10.125, + "learning_rate": 4.999892573051166e-05, + "loss": 1.477, + "step": 136 + }, + { + "epoch": 0.006059475065040561, + "grad_norm": 10.5, + "learning_rate": 4.999886063826811e-05, + "loss": 1.4996, + "step": 138 + }, + { + "epoch": 0.006147293544244047, + "grad_norm": 10.625, + "learning_rate": 4.999879363160581e-05, + "loss": 1.4417, + "step": 140 + }, + { + "epoch": 0.006235112023447534, + "grad_norm": 13.375, + "learning_rate": 4.999872471052991e-05, + "loss": 1.457, + "step": 142 + }, + { + "epoch": 0.006322930502651021, + "grad_norm": 10.3125, + "learning_rate": 4.9998653875045666e-05, + "loss": 1.4707, + "step": 144 + }, + { + "epoch": 0.0064107489818545065, + "grad_norm": 10.0625, + "learning_rate": 4.999858112515853e-05, + "loss": 1.4844, + "step": 146 + }, + { + "epoch": 0.006498567461057993, + "grad_norm": 11.8125, + "learning_rate": 4.999850646087405e-05, + "loss": 1.4749, + "step": 148 + }, + { + "epoch": 0.00658638594026148, + "grad_norm": 9.125, + "learning_rate": 4.999842988219795e-05, + "loss": 1.4431, + "step": 150 + }, + { + "epoch": 0.006674204419464966, + "grad_norm": 8.1875, + "learning_rate": 4.99983513891361e-05, + "loss": 1.4359, + "step": 152 + }, + { + "epoch": 0.0067620228986684525, + "grad_norm": 6.53125, + "learning_rate": 4.999827098169452e-05, + "loss": 1.4179, + "step": 154 + }, + { + "epoch": 0.006849841377871938, + "grad_norm": 7.9375, + "learning_rate": 4.999818865987934e-05, + "loss": 1.4185, + "step": 156 + }, + { + "epoch": 0.006937659857075425, + "grad_norm": 9.1875, + "learning_rate": 4.999810442369689e-05, + "loss": 1.449, + "step": 158 + }, + { + "epoch": 0.007025478336278912, + "grad_norm": 11.5625, + "learning_rate": 4.9998018273153605e-05, + "loss": 1.416, + "step": 160 + }, + { + "epoch": 0.007113296815482398, + "grad_norm": 9.375, + "learning_rate": 4.999793020825609e-05, + "loss": 1.3884, + "step": 162 + }, + { + "epoch": 0.007201115294685884, + "grad_norm": 5.8125, + "learning_rate": 4.9997840229011085e-05, + "loss": 1.3853, + "step": 164 + }, + { + "epoch": 0.007288933773889371, + "grad_norm": 7.1875, + "learning_rate": 4.999774833542549e-05, + "loss": 1.4316, + "step": 166 + }, + { + "epoch": 0.007376752253092857, + "grad_norm": 9.5, + "learning_rate": 4.999765452750633e-05, + "loss": 1.3818, + "step": 168 + }, + { + "epoch": 0.007464570732296344, + "grad_norm": 7.28125, + "learning_rate": 4.999755880526079e-05, + "loss": 1.3969, + "step": 170 + }, + { + "epoch": 0.007552389211499829, + "grad_norm": 7.9375, + "learning_rate": 4.999746116869621e-05, + "loss": 1.3647, + "step": 172 + }, + { + "epoch": 0.007640207690703316, + "grad_norm": 6.9375, + "learning_rate": 4.999736161782006e-05, + "loss": 1.3707, + "step": 174 + }, + { + "epoch": 0.007728026169906803, + "grad_norm": 7.53125, + "learning_rate": 4.9997260152639966e-05, + "loss": 1.3951, + "step": 176 + }, + { + "epoch": 0.007815844649110289, + "grad_norm": 5.875, + "learning_rate": 4.9997156773163694e-05, + "loss": 1.3626, + "step": 178 + }, + { + "epoch": 0.007903663128313776, + "grad_norm": 7.3125, + "learning_rate": 4.999705147939917e-05, + "loss": 1.3481, + "step": 180 + }, + { + "epoch": 0.007991481607517262, + "grad_norm": 7.03125, + "learning_rate": 4.999694427135445e-05, + "loss": 1.3754, + "step": 182 + }, + { + "epoch": 0.008079300086720748, + "grad_norm": 6.46875, + "learning_rate": 4.9996835149037735e-05, + "loss": 1.3436, + "step": 184 + }, + { + "epoch": 0.008167118565924234, + "grad_norm": 8.625, + "learning_rate": 4.999672411245741e-05, + "loss": 1.3398, + "step": 186 + }, + { + "epoch": 0.008254937045127721, + "grad_norm": 8.125, + "learning_rate": 4.999661116162194e-05, + "loss": 1.3604, + "step": 188 + }, + { + "epoch": 0.008342755524331207, + "grad_norm": 11.875, + "learning_rate": 4.9996496296540005e-05, + "loss": 1.3354, + "step": 190 + }, + { + "epoch": 0.008430574003534693, + "grad_norm": 7.53125, + "learning_rate": 4.99963795172204e-05, + "loss": 1.3341, + "step": 192 + }, + { + "epoch": 0.00851839248273818, + "grad_norm": 6.84375, + "learning_rate": 4.999626082367205e-05, + "loss": 1.3466, + "step": 194 + }, + { + "epoch": 0.008606210961941667, + "grad_norm": 7.8125, + "learning_rate": 4.999614021590405e-05, + "loss": 1.3208, + "step": 196 + }, + { + "epoch": 0.008694029441145152, + "grad_norm": 6.21875, + "learning_rate": 4.999601769392565e-05, + "loss": 1.3211, + "step": 198 + }, + { + "epoch": 0.00878184792034864, + "grad_norm": 6.3125, + "learning_rate": 4.999589325774622e-05, + "loss": 1.3199, + "step": 200 + }, + { + "epoch": 0.008869666399552126, + "grad_norm": 8.875, + "learning_rate": 4.99957669073753e-05, + "loss": 1.2907, + "step": 202 + }, + { + "epoch": 0.008957484878755612, + "grad_norm": 6.03125, + "learning_rate": 4.9995638642822536e-05, + "loss": 1.3227, + "step": 204 + }, + { + "epoch": 0.0090453033579591, + "grad_norm": 10.25, + "learning_rate": 4.9995508464097796e-05, + "loss": 1.3157, + "step": 206 + }, + { + "epoch": 0.009133121837162585, + "grad_norm": 7.625, + "learning_rate": 4.999537637121101e-05, + "loss": 1.2974, + "step": 208 + }, + { + "epoch": 0.009220940316366071, + "grad_norm": 6.4375, + "learning_rate": 4.999524236417232e-05, + "loss": 1.3103, + "step": 210 + }, + { + "epoch": 0.009308758795569558, + "grad_norm": 10.1875, + "learning_rate": 4.999510644299198e-05, + "loss": 1.3, + "step": 212 + }, + { + "epoch": 0.009396577274773044, + "grad_norm": 7.53125, + "learning_rate": 4.999496860768039e-05, + "loss": 1.2689, + "step": 214 + }, + { + "epoch": 0.00948439575397653, + "grad_norm": 6.0625, + "learning_rate": 4.999482885824811e-05, + "loss": 1.2809, + "step": 216 + }, + { + "epoch": 0.009572214233180018, + "grad_norm": 8.125, + "learning_rate": 4.9994687194705846e-05, + "loss": 1.2937, + "step": 218 + }, + { + "epoch": 0.009660032712383504, + "grad_norm": 13.4375, + "learning_rate": 4.9994543617064445e-05, + "loss": 1.3081, + "step": 220 + }, + { + "epoch": 0.00974785119158699, + "grad_norm": 10.5, + "learning_rate": 4.999439812533491e-05, + "loss": 1.3092, + "step": 222 + }, + { + "epoch": 0.009835669670790475, + "grad_norm": 8.375, + "learning_rate": 4.9994250719528366e-05, + "loss": 1.304, + "step": 224 + }, + { + "epoch": 0.009923488149993963, + "grad_norm": 6.53125, + "learning_rate": 4.999410139965612e-05, + "loss": 1.2551, + "step": 226 + }, + { + "epoch": 0.010011306629197449, + "grad_norm": 8.4375, + "learning_rate": 4.9993950165729585e-05, + "loss": 1.2683, + "step": 228 + }, + { + "epoch": 0.010099125108400935, + "grad_norm": 10.5, + "learning_rate": 4.9993797017760364e-05, + "loss": 1.3054, + "step": 230 + }, + { + "epoch": 0.010186943587604422, + "grad_norm": 10.8125, + "learning_rate": 4.999364195576017e-05, + "loss": 1.2796, + "step": 232 + }, + { + "epoch": 0.010274762066807908, + "grad_norm": 13.1875, + "learning_rate": 4.999348497974089e-05, + "loss": 1.2596, + "step": 234 + }, + { + "epoch": 0.010362580546011394, + "grad_norm": 8.9375, + "learning_rate": 4.999332608971453e-05, + "loss": 1.2413, + "step": 236 + }, + { + "epoch": 0.010450399025214881, + "grad_norm": 6.78125, + "learning_rate": 4.9993165285693275e-05, + "loss": 1.2754, + "step": 238 + }, + { + "epoch": 0.010538217504418367, + "grad_norm": 6.75, + "learning_rate": 4.999300256768943e-05, + "loss": 1.2392, + "step": 240 + }, + { + "epoch": 0.010626035983621853, + "grad_norm": 7.4375, + "learning_rate": 4.9992837935715455e-05, + "loss": 1.2867, + "step": 242 + }, + { + "epoch": 0.01071385446282534, + "grad_norm": 8.5625, + "learning_rate": 4.999267138978396e-05, + "loss": 1.2473, + "step": 244 + }, + { + "epoch": 0.010801672942028826, + "grad_norm": 7.75, + "learning_rate": 4.9992502929907706e-05, + "loss": 1.2448, + "step": 246 + }, + { + "epoch": 0.010889491421232312, + "grad_norm": 7.3125, + "learning_rate": 4.999233255609957e-05, + "loss": 1.2255, + "step": 248 + }, + { + "epoch": 0.0109773099004358, + "grad_norm": 8.4375, + "learning_rate": 4.999216026837263e-05, + "loss": 1.2549, + "step": 250 + }, + { + "epoch": 0.011065128379639286, + "grad_norm": 7.71875, + "learning_rate": 4.999198606674006e-05, + "loss": 1.2665, + "step": 252 + }, + { + "epoch": 0.011152946858842772, + "grad_norm": 7.71875, + "learning_rate": 4.9991809951215204e-05, + "loss": 1.2329, + "step": 254 + }, + { + "epoch": 0.01124076533804626, + "grad_norm": 11.0625, + "learning_rate": 4.999163192181155e-05, + "loss": 1.2516, + "step": 256 + }, + { + "epoch": 0.011328583817249745, + "grad_norm": 8.375, + "learning_rate": 4.9991451978542744e-05, + "loss": 1.2441, + "step": 258 + }, + { + "epoch": 0.01141640229645323, + "grad_norm": 7.9375, + "learning_rate": 4.999127012142255e-05, + "loss": 1.2242, + "step": 260 + }, + { + "epoch": 0.011504220775656717, + "grad_norm": 6.34375, + "learning_rate": 4.999108635046489e-05, + "loss": 1.2447, + "step": 262 + }, + { + "epoch": 0.011592039254860204, + "grad_norm": 7.875, + "learning_rate": 4.999090066568385e-05, + "loss": 1.2263, + "step": 264 + }, + { + "epoch": 0.01167985773406369, + "grad_norm": 7.8125, + "learning_rate": 4.9990713067093654e-05, + "loss": 1.2216, + "step": 266 + }, + { + "epoch": 0.011767676213267176, + "grad_norm": 8.5625, + "learning_rate": 4.9990523554708655e-05, + "loss": 1.2352, + "step": 268 + }, + { + "epoch": 0.011855494692470664, + "grad_norm": 6.53125, + "learning_rate": 4.999033212854337e-05, + "loss": 1.2291, + "step": 270 + }, + { + "epoch": 0.01194331317167415, + "grad_norm": 7.8125, + "learning_rate": 4.999013878861246e-05, + "loss": 1.223, + "step": 272 + }, + { + "epoch": 0.012031131650877635, + "grad_norm": 9.5625, + "learning_rate": 4.998994353493074e-05, + "loss": 1.21, + "step": 274 + }, + { + "epoch": 0.012118950130081123, + "grad_norm": 11.5, + "learning_rate": 4.998974636751315e-05, + "loss": 1.227, + "step": 276 + }, + { + "epoch": 0.012206768609284609, + "grad_norm": 9.75, + "learning_rate": 4.998954728637478e-05, + "loss": 1.2142, + "step": 278 + }, + { + "epoch": 0.012294587088488095, + "grad_norm": 7.46875, + "learning_rate": 4.9989346291530904e-05, + "loss": 1.2299, + "step": 280 + }, + { + "epoch": 0.012382405567691582, + "grad_norm": 6.34375, + "learning_rate": 4.998914338299689e-05, + "loss": 1.2197, + "step": 282 + }, + { + "epoch": 0.012470224046895068, + "grad_norm": 7.96875, + "learning_rate": 4.998893856078829e-05, + "loss": 1.227, + "step": 284 + }, + { + "epoch": 0.012558042526098554, + "grad_norm": 11.1875, + "learning_rate": 4.998873182492078e-05, + "loss": 1.2082, + "step": 286 + }, + { + "epoch": 0.012645861005302041, + "grad_norm": 9.9375, + "learning_rate": 4.9988523175410204e-05, + "loss": 1.2347, + "step": 288 + }, + { + "epoch": 0.012733679484505527, + "grad_norm": 8.0625, + "learning_rate": 4.9988312612272524e-05, + "loss": 1.2427, + "step": 290 + }, + { + "epoch": 0.012821497963709013, + "grad_norm": 8.5, + "learning_rate": 4.998810013552388e-05, + "loss": 1.1799, + "step": 292 + }, + { + "epoch": 0.0129093164429125, + "grad_norm": 8.875, + "learning_rate": 4.998788574518054e-05, + "loss": 1.1828, + "step": 294 + }, + { + "epoch": 0.012997134922115986, + "grad_norm": 9.375, + "learning_rate": 4.998766944125891e-05, + "loss": 1.1921, + "step": 296 + }, + { + "epoch": 0.013084953401319472, + "grad_norm": 6.3125, + "learning_rate": 4.998745122377557e-05, + "loss": 1.1933, + "step": 298 + }, + { + "epoch": 0.01317277188052296, + "grad_norm": 6.90625, + "learning_rate": 4.998723109274722e-05, + "loss": 1.1597, + "step": 300 + }, + { + "epoch": 0.013260590359726446, + "grad_norm": 6.71875, + "learning_rate": 4.998700904819074e-05, + "loss": 1.1891, + "step": 302 + }, + { + "epoch": 0.013348408838929932, + "grad_norm": 5.46875, + "learning_rate": 4.99867850901231e-05, + "loss": 1.1977, + "step": 304 + }, + { + "epoch": 0.013436227318133417, + "grad_norm": 7.875, + "learning_rate": 4.998655921856147e-05, + "loss": 1.1838, + "step": 306 + }, + { + "epoch": 0.013524045797336905, + "grad_norm": 5.46875, + "learning_rate": 4.9986331433523156e-05, + "loss": 1.1977, + "step": 308 + }, + { + "epoch": 0.01361186427654039, + "grad_norm": 7.3125, + "learning_rate": 4.998610173502558e-05, + "loss": 1.1807, + "step": 310 + }, + { + "epoch": 0.013699682755743877, + "grad_norm": 5.40625, + "learning_rate": 4.9985870123086345e-05, + "loss": 1.161, + "step": 312 + }, + { + "epoch": 0.013787501234947364, + "grad_norm": 7.8125, + "learning_rate": 4.9985636597723195e-05, + "loss": 1.1476, + "step": 314 + }, + { + "epoch": 0.01387531971415085, + "grad_norm": 5.625, + "learning_rate": 4.9985401158954e-05, + "loss": 1.1881, + "step": 316 + }, + { + "epoch": 0.013963138193354336, + "grad_norm": 7.125, + "learning_rate": 4.998516380679679e-05, + "loss": 1.1831, + "step": 318 + }, + { + "epoch": 0.014050956672557824, + "grad_norm": 9.3125, + "learning_rate": 4.998492454126975e-05, + "loss": 1.1747, + "step": 320 + }, + { + "epoch": 0.01413877515176131, + "grad_norm": 5.5625, + "learning_rate": 4.9984683362391204e-05, + "loss": 1.1628, + "step": 322 + }, + { + "epoch": 0.014226593630964795, + "grad_norm": 7.21875, + "learning_rate": 4.998444027017961e-05, + "loss": 1.1477, + "step": 324 + }, + { + "epoch": 0.014314412110168283, + "grad_norm": 8.625, + "learning_rate": 4.9984195264653596e-05, + "loss": 1.1537, + "step": 326 + }, + { + "epoch": 0.014402230589371769, + "grad_norm": 6.75, + "learning_rate": 4.998394834583191e-05, + "loss": 1.1832, + "step": 328 + }, + { + "epoch": 0.014490049068575254, + "grad_norm": 6.34375, + "learning_rate": 4.998369951373348e-05, + "loss": 1.1519, + "step": 330 + }, + { + "epoch": 0.014577867547778742, + "grad_norm": 6.5, + "learning_rate": 4.998344876837735e-05, + "loss": 1.1786, + "step": 332 + }, + { + "epoch": 0.014665686026982228, + "grad_norm": 7.09375, + "learning_rate": 4.998319610978273e-05, + "loss": 1.1379, + "step": 334 + }, + { + "epoch": 0.014753504506185714, + "grad_norm": 6.5, + "learning_rate": 4.998294153796895e-05, + "loss": 1.1754, + "step": 336 + }, + { + "epoch": 0.014841322985389201, + "grad_norm": 6.5, + "learning_rate": 4.998268505295552e-05, + "loss": 1.1211, + "step": 338 + }, + { + "epoch": 0.014929141464592687, + "grad_norm": 8.5625, + "learning_rate": 4.9982426654762095e-05, + "loss": 1.1669, + "step": 340 + }, + { + "epoch": 0.015016959943796173, + "grad_norm": 7.4375, + "learning_rate": 4.998216634340844e-05, + "loss": 1.1609, + "step": 342 + }, + { + "epoch": 0.015104778422999659, + "grad_norm": 5.625, + "learning_rate": 4.998190411891449e-05, + "loss": 1.1381, + "step": 344 + }, + { + "epoch": 0.015192596902203146, + "grad_norm": 7.09375, + "learning_rate": 4.9981639981300344e-05, + "loss": 1.1383, + "step": 346 + }, + { + "epoch": 0.015280415381406632, + "grad_norm": 8.5625, + "learning_rate": 4.998137393058622e-05, + "loss": 1.1462, + "step": 348 + }, + { + "epoch": 0.015368233860610118, + "grad_norm": 7.0, + "learning_rate": 4.9981105966792485e-05, + "loss": 1.1386, + "step": 350 + }, + { + "epoch": 0.015456052339813606, + "grad_norm": 8.0, + "learning_rate": 4.9980836089939665e-05, + "loss": 1.1212, + "step": 352 + }, + { + "epoch": 0.015543870819017092, + "grad_norm": 6.3125, + "learning_rate": 4.998056430004844e-05, + "loss": 1.1352, + "step": 354 + }, + { + "epoch": 0.015631689298220577, + "grad_norm": 8.8125, + "learning_rate": 4.99802905971396e-05, + "loss": 1.1435, + "step": 356 + }, + { + "epoch": 0.015719507777424063, + "grad_norm": 7.3125, + "learning_rate": 4.998001498123413e-05, + "loss": 1.1223, + "step": 358 + }, + { + "epoch": 0.015807326256627553, + "grad_norm": 5.84375, + "learning_rate": 4.9979737452353114e-05, + "loss": 1.1079, + "step": 360 + }, + { + "epoch": 0.01589514473583104, + "grad_norm": 16.5, + "learning_rate": 4.997945801051782e-05, + "loss": 1.1403, + "step": 362 + }, + { + "epoch": 0.015982963215034524, + "grad_norm": 9.6875, + "learning_rate": 4.997917665574964e-05, + "loss": 1.1463, + "step": 364 + }, + { + "epoch": 0.01607078169423801, + "grad_norm": 7.78125, + "learning_rate": 4.9978893388070135e-05, + "loss": 1.1436, + "step": 366 + }, + { + "epoch": 0.016158600173441496, + "grad_norm": 6.125, + "learning_rate": 4.997860820750098e-05, + "loss": 1.1402, + "step": 368 + }, + { + "epoch": 0.016246418652644982, + "grad_norm": 7.8125, + "learning_rate": 4.997832111406402e-05, + "loss": 1.098, + "step": 370 + }, + { + "epoch": 0.016334237131848468, + "grad_norm": 6.28125, + "learning_rate": 4.997803210778124e-05, + "loss": 1.1279, + "step": 372 + }, + { + "epoch": 0.016422055611051957, + "grad_norm": 5.5625, + "learning_rate": 4.997774118867477e-05, + "loss": 1.1178, + "step": 374 + }, + { + "epoch": 0.016509874090255443, + "grad_norm": 8.8125, + "learning_rate": 4.99774483567669e-05, + "loss": 1.1477, + "step": 376 + }, + { + "epoch": 0.01659769256945893, + "grad_norm": 6.9375, + "learning_rate": 4.997715361208004e-05, + "loss": 1.1008, + "step": 378 + }, + { + "epoch": 0.016685511048662414, + "grad_norm": 5.15625, + "learning_rate": 4.997685695463677e-05, + "loss": 1.1023, + "step": 380 + }, + { + "epoch": 0.0167733295278659, + "grad_norm": 9.375, + "learning_rate": 4.9976558384459815e-05, + "loss": 1.1327, + "step": 382 + }, + { + "epoch": 0.016861148007069386, + "grad_norm": 11.0625, + "learning_rate": 4.997625790157203e-05, + "loss": 1.1455, + "step": 384 + }, + { + "epoch": 0.016948966486272875, + "grad_norm": 8.0625, + "learning_rate": 4.997595550599642e-05, + "loss": 1.1056, + "step": 386 + }, + { + "epoch": 0.01703678496547636, + "grad_norm": 8.6875, + "learning_rate": 4.9975651197756155e-05, + "loss": 1.1077, + "step": 388 + }, + { + "epoch": 0.017124603444679847, + "grad_norm": 7.03125, + "learning_rate": 4.9975344976874536e-05, + "loss": 1.1418, + "step": 390 + }, + { + "epoch": 0.017212421923883333, + "grad_norm": 9.875, + "learning_rate": 4.997503684337501e-05, + "loss": 1.0905, + "step": 392 + }, + { + "epoch": 0.01730024040308682, + "grad_norm": 8.375, + "learning_rate": 4.997472679728118e-05, + "loss": 1.1365, + "step": 394 + }, + { + "epoch": 0.017388058882290305, + "grad_norm": 6.875, + "learning_rate": 4.997441483861678e-05, + "loss": 1.1076, + "step": 396 + }, + { + "epoch": 0.017475877361493794, + "grad_norm": 6.21875, + "learning_rate": 4.99741009674057e-05, + "loss": 1.0944, + "step": 398 + }, + { + "epoch": 0.01756369584069728, + "grad_norm": 9.3125, + "learning_rate": 4.997378518367199e-05, + "loss": 1.1178, + "step": 400 + }, + { + "epoch": 0.017651514319900766, + "grad_norm": 11.0625, + "learning_rate": 4.9973467487439816e-05, + "loss": 1.1157, + "step": 402 + }, + { + "epoch": 0.01773933279910425, + "grad_norm": 10.6875, + "learning_rate": 4.997314787873352e-05, + "loss": 1.1005, + "step": 404 + }, + { + "epoch": 0.017827151278307737, + "grad_norm": 6.9375, + "learning_rate": 4.9972826357577576e-05, + "loss": 1.0849, + "step": 406 + }, + { + "epoch": 0.017914969757511223, + "grad_norm": 5.46875, + "learning_rate": 4.99725029239966e-05, + "loss": 1.1028, + "step": 408 + }, + { + "epoch": 0.01800278823671471, + "grad_norm": 7.34375, + "learning_rate": 4.9972177578015364e-05, + "loss": 1.1027, + "step": 410 + }, + { + "epoch": 0.0180906067159182, + "grad_norm": 6.90625, + "learning_rate": 4.997185031965878e-05, + "loss": 1.0742, + "step": 412 + }, + { + "epoch": 0.018178425195121684, + "grad_norm": 6.9375, + "learning_rate": 4.9971521148951914e-05, + "loss": 1.0998, + "step": 414 + }, + { + "epoch": 0.01826624367432517, + "grad_norm": 6.3125, + "learning_rate": 4.997119006591997e-05, + "loss": 1.0511, + "step": 416 + }, + { + "epoch": 0.018354062153528656, + "grad_norm": 8.875, + "learning_rate": 4.997085707058832e-05, + "loss": 1.0712, + "step": 418 + }, + { + "epoch": 0.018441880632732142, + "grad_norm": 7.625, + "learning_rate": 4.997052216298243e-05, + "loss": 1.1015, + "step": 420 + }, + { + "epoch": 0.018529699111935628, + "grad_norm": 7.5625, + "learning_rate": 4.9970185343127975e-05, + "loss": 1.071, + "step": 422 + }, + { + "epoch": 0.018617517591139117, + "grad_norm": 5.78125, + "learning_rate": 4.9969846611050744e-05, + "loss": 1.0926, + "step": 424 + }, + { + "epoch": 0.018705336070342603, + "grad_norm": 6.34375, + "learning_rate": 4.9969505966776664e-05, + "loss": 1.0808, + "step": 426 + }, + { + "epoch": 0.01879315454954609, + "grad_norm": 6.53125, + "learning_rate": 4.9969163410331845e-05, + "loss": 1.0853, + "step": 428 + }, + { + "epoch": 0.018880973028749574, + "grad_norm": 5.65625, + "learning_rate": 4.996881894174249e-05, + "loss": 1.0674, + "step": 430 + }, + { + "epoch": 0.01896879150795306, + "grad_norm": 9.8125, + "learning_rate": 4.996847256103501e-05, + "loss": 1.0528, + "step": 432 + }, + { + "epoch": 0.019056609987156546, + "grad_norm": 9.1875, + "learning_rate": 4.9968124268235906e-05, + "loss": 1.1181, + "step": 434 + }, + { + "epoch": 0.019144428466360035, + "grad_norm": 7.0625, + "learning_rate": 4.996777406337186e-05, + "loss": 1.0888, + "step": 436 + }, + { + "epoch": 0.01923224694556352, + "grad_norm": 7.0, + "learning_rate": 4.99674219464697e-05, + "loss": 1.043, + "step": 438 + }, + { + "epoch": 0.019320065424767007, + "grad_norm": 5.96875, + "learning_rate": 4.9967067917556376e-05, + "loss": 1.0879, + "step": 440 + }, + { + "epoch": 0.019407883903970493, + "grad_norm": 5.65625, + "learning_rate": 4.996671197665901e-05, + "loss": 1.072, + "step": 442 + }, + { + "epoch": 0.01949570238317398, + "grad_norm": 5.9375, + "learning_rate": 4.996635412380485e-05, + "loss": 1.0603, + "step": 444 + }, + { + "epoch": 0.019583520862377465, + "grad_norm": 10.125, + "learning_rate": 4.996599435902131e-05, + "loss": 1.0686, + "step": 446 + }, + { + "epoch": 0.01967133934158095, + "grad_norm": 13.3125, + "learning_rate": 4.996563268233594e-05, + "loss": 1.075, + "step": 448 + }, + { + "epoch": 0.01975915782078444, + "grad_norm": 12.25, + "learning_rate": 4.996526909377643e-05, + "loss": 1.0792, + "step": 450 + }, + { + "epoch": 0.019846976299987926, + "grad_norm": 7.8125, + "learning_rate": 4.996490359337062e-05, + "loss": 1.0456, + "step": 452 + }, + { + "epoch": 0.01993479477919141, + "grad_norm": 6.59375, + "learning_rate": 4.9964536181146525e-05, + "loss": 1.0603, + "step": 454 + }, + { + "epoch": 0.020022613258394897, + "grad_norm": 9.125, + "learning_rate": 4.9964166857132255e-05, + "loss": 1.0536, + "step": 456 + }, + { + "epoch": 0.020110431737598383, + "grad_norm": 11.0, + "learning_rate": 4.996379562135611e-05, + "loss": 1.0881, + "step": 458 + }, + { + "epoch": 0.02019825021680187, + "grad_norm": 5.4375, + "learning_rate": 4.9963422473846504e-05, + "loss": 1.0952, + "step": 460 + }, + { + "epoch": 0.02028606869600536, + "grad_norm": 5.9375, + "learning_rate": 4.9963047414632024e-05, + "loss": 1.0648, + "step": 462 + }, + { + "epoch": 0.020373887175208844, + "grad_norm": 6.53125, + "learning_rate": 4.996267044374139e-05, + "loss": 1.0648, + "step": 464 + }, + { + "epoch": 0.02046170565441233, + "grad_norm": 8.0625, + "learning_rate": 4.9962291561203464e-05, + "loss": 1.0814, + "step": 466 + }, + { + "epoch": 0.020549524133615816, + "grad_norm": 9.8125, + "learning_rate": 4.9961910767047275e-05, + "loss": 1.0556, + "step": 468 + }, + { + "epoch": 0.020637342612819302, + "grad_norm": 8.375, + "learning_rate": 4.996152806130198e-05, + "loss": 1.0354, + "step": 470 + }, + { + "epoch": 0.020725161092022788, + "grad_norm": 8.375, + "learning_rate": 4.996114344399687e-05, + "loss": 1.0932, + "step": 472 + }, + { + "epoch": 0.020812979571226277, + "grad_norm": 7.90625, + "learning_rate": 4.996075691516141e-05, + "loss": 1.0409, + "step": 474 + }, + { + "epoch": 0.020900798050429763, + "grad_norm": 7.4375, + "learning_rate": 4.996036847482521e-05, + "loss": 1.0539, + "step": 476 + }, + { + "epoch": 0.02098861652963325, + "grad_norm": 7.6875, + "learning_rate": 4.9959978123018006e-05, + "loss": 1.0252, + "step": 478 + }, + { + "epoch": 0.021076435008836734, + "grad_norm": 6.875, + "learning_rate": 4.9959585859769694e-05, + "loss": 1.0492, + "step": 480 + }, + { + "epoch": 0.02116425348804022, + "grad_norm": 6.625, + "learning_rate": 4.995919168511032e-05, + "loss": 1.0662, + "step": 482 + }, + { + "epoch": 0.021252071967243706, + "grad_norm": 6.5, + "learning_rate": 4.995879559907005e-05, + "loss": 1.035, + "step": 484 + }, + { + "epoch": 0.021339890446447192, + "grad_norm": 6.59375, + "learning_rate": 4.995839760167924e-05, + "loss": 1.0408, + "step": 486 + }, + { + "epoch": 0.02142770892565068, + "grad_norm": 6.875, + "learning_rate": 4.995799769296836e-05, + "loss": 1.0301, + "step": 488 + }, + { + "epoch": 0.021515527404854167, + "grad_norm": 5.90625, + "learning_rate": 4.995759587296803e-05, + "loss": 1.0198, + "step": 490 + }, + { + "epoch": 0.021603345884057653, + "grad_norm": 5.53125, + "learning_rate": 4.995719214170902e-05, + "loss": 1.0354, + "step": 492 + }, + { + "epoch": 0.02169116436326114, + "grad_norm": 7.5625, + "learning_rate": 4.9956786499222263e-05, + "loss": 1.044, + "step": 494 + }, + { + "epoch": 0.021778982842464625, + "grad_norm": 6.375, + "learning_rate": 4.995637894553881e-05, + "loss": 1.0413, + "step": 496 + }, + { + "epoch": 0.02186680132166811, + "grad_norm": 6.40625, + "learning_rate": 4.9955969480689865e-05, + "loss": 1.0098, + "step": 498 + }, + { + "epoch": 0.0219546198008716, + "grad_norm": 6.1875, + "learning_rate": 4.995555810470681e-05, + "loss": 1.0262, + "step": 500 + }, + { + "epoch": 0.022042438280075086, + "grad_norm": 4.96875, + "learning_rate": 4.995514481762112e-05, + "loss": 1.0375, + "step": 502 + }, + { + "epoch": 0.02213025675927857, + "grad_norm": 5.8125, + "learning_rate": 4.995472961946447e-05, + "loss": 1.035, + "step": 504 + }, + { + "epoch": 0.022218075238482057, + "grad_norm": 5.5, + "learning_rate": 4.9954312510268636e-05, + "loss": 1.0307, + "step": 506 + }, + { + "epoch": 0.022305893717685543, + "grad_norm": 5.0625, + "learning_rate": 4.995389349006557e-05, + "loss": 0.9841, + "step": 508 + }, + { + "epoch": 0.02239371219688903, + "grad_norm": 6.53125, + "learning_rate": 4.995347255888736e-05, + "loss": 1.0169, + "step": 510 + }, + { + "epoch": 0.02248153067609252, + "grad_norm": 6.125, + "learning_rate": 4.9953049716766234e-05, + "loss": 1.0283, + "step": 512 + }, + { + "epoch": 0.022569349155296004, + "grad_norm": 6.4375, + "learning_rate": 4.9952624963734584e-05, + "loss": 1.0303, + "step": 514 + }, + { + "epoch": 0.02265716763449949, + "grad_norm": 5.84375, + "learning_rate": 4.995219829982494e-05, + "loss": 1.0268, + "step": 516 + }, + { + "epoch": 0.022744986113702976, + "grad_norm": 6.59375, + "learning_rate": 4.9951769725069954e-05, + "loss": 1.0065, + "step": 518 + }, + { + "epoch": 0.02283280459290646, + "grad_norm": 5.5625, + "learning_rate": 4.995133923950247e-05, + "loss": 1.0092, + "step": 520 + }, + { + "epoch": 0.022920623072109948, + "grad_norm": 11.5625, + "learning_rate": 4.995090684315544e-05, + "loss": 1.0142, + "step": 522 + }, + { + "epoch": 0.023008441551313433, + "grad_norm": 9.5625, + "learning_rate": 4.9950472536061984e-05, + "loss": 0.9864, + "step": 524 + }, + { + "epoch": 0.023096260030516923, + "grad_norm": 7.09375, + "learning_rate": 4.9950036318255364e-05, + "loss": 1.0048, + "step": 526 + }, + { + "epoch": 0.02318407850972041, + "grad_norm": 6.8125, + "learning_rate": 4.9949598189768985e-05, + "loss": 1.0157, + "step": 528 + }, + { + "epoch": 0.023271896988923894, + "grad_norm": 5.5, + "learning_rate": 4.994915815063639e-05, + "loss": 1.0379, + "step": 530 + }, + { + "epoch": 0.02335971546812738, + "grad_norm": 8.25, + "learning_rate": 4.994871620089129e-05, + "loss": 1.0017, + "step": 532 + }, + { + "epoch": 0.023447533947330866, + "grad_norm": 7.8125, + "learning_rate": 4.9948272340567514e-05, + "loss": 1.0021, + "step": 534 + }, + { + "epoch": 0.023535352426534352, + "grad_norm": 6.0625, + "learning_rate": 4.994782656969906e-05, + "loss": 0.9881, + "step": 536 + }, + { + "epoch": 0.02362317090573784, + "grad_norm": 6.84375, + "learning_rate": 4.994737888832007e-05, + "loss": 0.9894, + "step": 538 + }, + { + "epoch": 0.023710989384941327, + "grad_norm": 7.46875, + "learning_rate": 4.9946929296464825e-05, + "loss": 1.0131, + "step": 540 + }, + { + "epoch": 0.023798807864144813, + "grad_norm": 8.6875, + "learning_rate": 4.994647779416776e-05, + "loss": 1.017, + "step": 542 + }, + { + "epoch": 0.0238866263433483, + "grad_norm": 10.0625, + "learning_rate": 4.994602438146344e-05, + "loss": 0.982, + "step": 544 + }, + { + "epoch": 0.023974444822551785, + "grad_norm": 8.625, + "learning_rate": 4.99455690583866e-05, + "loss": 1.0208, + "step": 546 + }, + { + "epoch": 0.02406226330175527, + "grad_norm": 11.0625, + "learning_rate": 4.994511182497209e-05, + "loss": 1.0111, + "step": 548 + }, + { + "epoch": 0.02415008178095876, + "grad_norm": 11.875, + "learning_rate": 4.9944652681254944e-05, + "loss": 0.9969, + "step": 550 + }, + { + "epoch": 0.024237900260162246, + "grad_norm": 8.375, + "learning_rate": 4.9944191627270314e-05, + "loss": 1.0336, + "step": 552 + }, + { + "epoch": 0.02432571873936573, + "grad_norm": 7.1875, + "learning_rate": 4.994372866305351e-05, + "loss": 1.0003, + "step": 554 + }, + { + "epoch": 0.024413537218569217, + "grad_norm": 8.6875, + "learning_rate": 4.9943263788639985e-05, + "loss": 1.0197, + "step": 556 + }, + { + "epoch": 0.024501355697772703, + "grad_norm": 6.3125, + "learning_rate": 4.994279700406534e-05, + "loss": 0.9861, + "step": 558 + }, + { + "epoch": 0.02458917417697619, + "grad_norm": 7.125, + "learning_rate": 4.994232830936532e-05, + "loss": 1.0242, + "step": 560 + }, + { + "epoch": 0.02467699265617968, + "grad_norm": 6.09375, + "learning_rate": 4.994185770457582e-05, + "loss": 0.999, + "step": 562 + }, + { + "epoch": 0.024764811135383164, + "grad_norm": 7.5, + "learning_rate": 4.994138518973288e-05, + "loss": 0.9646, + "step": 564 + }, + { + "epoch": 0.02485262961458665, + "grad_norm": 5.59375, + "learning_rate": 4.9940910764872685e-05, + "loss": 0.9966, + "step": 566 + }, + { + "epoch": 0.024940448093790136, + "grad_norm": 5.28125, + "learning_rate": 4.994043443003156e-05, + "loss": 0.9982, + "step": 568 + }, + { + "epoch": 0.02502826657299362, + "grad_norm": 6.15625, + "learning_rate": 4.993995618524598e-05, + "loss": 1.006, + "step": 570 + }, + { + "epoch": 0.025116085052197108, + "grad_norm": 6.40625, + "learning_rate": 4.993947603055259e-05, + "loss": 0.9813, + "step": 572 + }, + { + "epoch": 0.025203903531400593, + "grad_norm": 7.625, + "learning_rate": 4.9938993965988145e-05, + "loss": 0.9576, + "step": 574 + }, + { + "epoch": 0.025291722010604083, + "grad_norm": 5.65625, + "learning_rate": 4.993850999158956e-05, + "loss": 1.0088, + "step": 576 + }, + { + "epoch": 0.02537954048980757, + "grad_norm": 5.6875, + "learning_rate": 4.993802410739391e-05, + "loss": 0.9918, + "step": 578 + }, + { + "epoch": 0.025467358969011054, + "grad_norm": 6.125, + "learning_rate": 4.993753631343838e-05, + "loss": 0.9659, + "step": 580 + }, + { + "epoch": 0.02555517744821454, + "grad_norm": 8.5, + "learning_rate": 4.9937046609760356e-05, + "loss": 0.9646, + "step": 582 + }, + { + "epoch": 0.025642995927418026, + "grad_norm": 7.0625, + "learning_rate": 4.993655499639732e-05, + "loss": 0.9554, + "step": 584 + }, + { + "epoch": 0.025730814406621512, + "grad_norm": 5.0, + "learning_rate": 4.9936061473386925e-05, + "loss": 0.994, + "step": 586 + }, + { + "epoch": 0.025818632885825, + "grad_norm": 5.28125, + "learning_rate": 4.993556604076696e-05, + "loss": 0.9731, + "step": 588 + }, + { + "epoch": 0.025906451365028487, + "grad_norm": 7.0625, + "learning_rate": 4.9935068698575375e-05, + "loss": 0.9583, + "step": 590 + }, + { + "epoch": 0.025994269844231973, + "grad_norm": 5.5, + "learning_rate": 4.9934569446850256e-05, + "loss": 0.9676, + "step": 592 + }, + { + "epoch": 0.02608208832343546, + "grad_norm": 5.3125, + "learning_rate": 4.993406828562982e-05, + "loss": 0.9957, + "step": 594 + }, + { + "epoch": 0.026169906802638945, + "grad_norm": 6.5, + "learning_rate": 4.993356521495246e-05, + "loss": 0.9717, + "step": 596 + }, + { + "epoch": 0.02625772528184243, + "grad_norm": 10.5, + "learning_rate": 4.993306023485671e-05, + "loss": 0.9933, + "step": 598 + }, + { + "epoch": 0.02634554376104592, + "grad_norm": 10.75, + "learning_rate": 4.993255334538122e-05, + "loss": 0.9773, + "step": 600 + }, + { + "epoch": 0.026433362240249406, + "grad_norm": 11.1875, + "learning_rate": 4.9932044546564824e-05, + "loss": 0.9854, + "step": 602 + }, + { + "epoch": 0.02652118071945289, + "grad_norm": 7.625, + "learning_rate": 4.993153383844648e-05, + "loss": 0.995, + "step": 604 + }, + { + "epoch": 0.026608999198656377, + "grad_norm": 5.25, + "learning_rate": 4.993102122106529e-05, + "loss": 0.9621, + "step": 606 + }, + { + "epoch": 0.026696817677859863, + "grad_norm": 6.71875, + "learning_rate": 4.993050669446053e-05, + "loss": 0.9785, + "step": 608 + }, + { + "epoch": 0.02678463615706335, + "grad_norm": 6.1875, + "learning_rate": 4.9929990258671574e-05, + "loss": 0.9875, + "step": 610 + }, + { + "epoch": 0.026872454636266835, + "grad_norm": 6.46875, + "learning_rate": 4.9929471913738e-05, + "loss": 0.9624, + "step": 612 + }, + { + "epoch": 0.026960273115470324, + "grad_norm": 5.46875, + "learning_rate": 4.99289516596995e-05, + "loss": 0.9641, + "step": 614 + }, + { + "epoch": 0.02704809159467381, + "grad_norm": 5.5625, + "learning_rate": 4.992842949659589e-05, + "loss": 0.9695, + "step": 616 + }, + { + "epoch": 0.027135910073877296, + "grad_norm": 5.1875, + "learning_rate": 4.9927905424467184e-05, + "loss": 0.9731, + "step": 618 + }, + { + "epoch": 0.02722372855308078, + "grad_norm": 6.78125, + "learning_rate": 4.992737944335349e-05, + "loss": 0.9354, + "step": 620 + }, + { + "epoch": 0.027311547032284268, + "grad_norm": 8.25, + "learning_rate": 4.992685155329512e-05, + "loss": 0.944, + "step": 622 + }, + { + "epoch": 0.027399365511487753, + "grad_norm": 8.4375, + "learning_rate": 4.992632175433247e-05, + "loss": 0.961, + "step": 624 + }, + { + "epoch": 0.027487183990691243, + "grad_norm": 8.4375, + "learning_rate": 4.9925790046506136e-05, + "loss": 0.9396, + "step": 626 + }, + { + "epoch": 0.02757500246989473, + "grad_norm": 7.65625, + "learning_rate": 4.9925256429856814e-05, + "loss": 0.9557, + "step": 628 + }, + { + "epoch": 0.027662820949098214, + "grad_norm": 7.1875, + "learning_rate": 4.992472090442539e-05, + "loss": 0.9546, + "step": 630 + }, + { + "epoch": 0.0277506394283017, + "grad_norm": 8.0625, + "learning_rate": 4.992418347025286e-05, + "loss": 0.9652, + "step": 632 + }, + { + "epoch": 0.027838457907505186, + "grad_norm": 8.0625, + "learning_rate": 4.9923644127380384e-05, + "loss": 0.9419, + "step": 634 + }, + { + "epoch": 0.027926276386708672, + "grad_norm": 5.75, + "learning_rate": 4.992310287584926e-05, + "loss": 0.9576, + "step": 636 + }, + { + "epoch": 0.02801409486591216, + "grad_norm": 5.5, + "learning_rate": 4.992255971570095e-05, + "loss": 0.9492, + "step": 638 + }, + { + "epoch": 0.028101913345115647, + "grad_norm": 6.0625, + "learning_rate": 4.9922014646977046e-05, + "loss": 0.9552, + "step": 640 + }, + { + "epoch": 0.028189731824319133, + "grad_norm": 6.84375, + "learning_rate": 4.992146766971928e-05, + "loss": 0.9551, + "step": 642 + }, + { + "epoch": 0.02827755030352262, + "grad_norm": 6.03125, + "learning_rate": 4.992091878396955e-05, + "loss": 0.9738, + "step": 644 + }, + { + "epoch": 0.028365368782726105, + "grad_norm": 7.46875, + "learning_rate": 4.9920367989769885e-05, + "loss": 0.9445, + "step": 646 + }, + { + "epoch": 0.02845318726192959, + "grad_norm": 8.4375, + "learning_rate": 4.991981528716246e-05, + "loss": 0.9412, + "step": 648 + }, + { + "epoch": 0.028541005741133076, + "grad_norm": 6.875, + "learning_rate": 4.991926067618962e-05, + "loss": 0.9425, + "step": 650 + }, + { + "epoch": 0.028628824220336566, + "grad_norm": 5.59375, + "learning_rate": 4.991870415689381e-05, + "loss": 0.9306, + "step": 652 + }, + { + "epoch": 0.02871664269954005, + "grad_norm": 5.53125, + "learning_rate": 4.9918145729317664e-05, + "loss": 0.9298, + "step": 654 + }, + { + "epoch": 0.028804461178743537, + "grad_norm": 4.84375, + "learning_rate": 4.991758539350395e-05, + "loss": 0.9475, + "step": 656 + }, + { + "epoch": 0.028892279657947023, + "grad_norm": 6.65625, + "learning_rate": 4.991702314949557e-05, + "loss": 0.9702, + "step": 658 + }, + { + "epoch": 0.02898009813715051, + "grad_norm": 7.5625, + "learning_rate": 4.9916458997335583e-05, + "loss": 0.9722, + "step": 660 + }, + { + "epoch": 0.029067916616353995, + "grad_norm": 7.8125, + "learning_rate": 4.99158929370672e-05, + "loss": 0.9374, + "step": 662 + }, + { + "epoch": 0.029155735095557484, + "grad_norm": 7.96875, + "learning_rate": 4.991532496873376e-05, + "loss": 0.9239, + "step": 664 + }, + { + "epoch": 0.02924355357476097, + "grad_norm": 6.75, + "learning_rate": 4.991475509237876e-05, + "loss": 0.9186, + "step": 666 + }, + { + "epoch": 0.029331372053964456, + "grad_norm": 6.03125, + "learning_rate": 4.9914183308045836e-05, + "loss": 0.9669, + "step": 668 + }, + { + "epoch": 0.02941919053316794, + "grad_norm": 6.28125, + "learning_rate": 4.991360961577879e-05, + "loss": 0.9288, + "step": 670 + }, + { + "epoch": 0.029507009012371428, + "grad_norm": 6.625, + "learning_rate": 4.9913034015621545e-05, + "loss": 0.9278, + "step": 672 + }, + { + "epoch": 0.029594827491574913, + "grad_norm": 6.6875, + "learning_rate": 4.9912456507618185e-05, + "loss": 0.907, + "step": 674 + }, + { + "epoch": 0.029682645970778403, + "grad_norm": 5.78125, + "learning_rate": 4.991187709181293e-05, + "loss": 0.908, + "step": 676 + }, + { + "epoch": 0.02977046444998189, + "grad_norm": 6.5625, + "learning_rate": 4.991129576825016e-05, + "loss": 0.9143, + "step": 678 + }, + { + "epoch": 0.029858282929185374, + "grad_norm": 5.875, + "learning_rate": 4.991071253697439e-05, + "loss": 0.9022, + "step": 680 + }, + { + "epoch": 0.02994610140838886, + "grad_norm": 5.46875, + "learning_rate": 4.991012739803028e-05, + "loss": 0.9249, + "step": 682 + }, + { + "epoch": 0.030033919887592346, + "grad_norm": 4.8125, + "learning_rate": 4.990954035146264e-05, + "loss": 0.9483, + "step": 684 + }, + { + "epoch": 0.030121738366795832, + "grad_norm": 4.75, + "learning_rate": 4.990895139731643e-05, + "loss": 0.9301, + "step": 686 + }, + { + "epoch": 0.030209556845999318, + "grad_norm": 5.0, + "learning_rate": 4.990836053563674e-05, + "loss": 0.9256, + "step": 688 + }, + { + "epoch": 0.030297375325202807, + "grad_norm": 4.84375, + "learning_rate": 4.990776776646885e-05, + "loss": 0.9601, + "step": 690 + }, + { + "epoch": 0.030385193804406293, + "grad_norm": 4.65625, + "learning_rate": 4.990717308985812e-05, + "loss": 0.9244, + "step": 692 + }, + { + "epoch": 0.03047301228360978, + "grad_norm": 5.0, + "learning_rate": 4.990657650585011e-05, + "loss": 0.9357, + "step": 694 + }, + { + "epoch": 0.030560830762813265, + "grad_norm": 6.28125, + "learning_rate": 4.99059780144905e-05, + "loss": 0.9379, + "step": 696 + }, + { + "epoch": 0.03064864924201675, + "grad_norm": 5.90625, + "learning_rate": 4.990537761582512e-05, + "loss": 0.9476, + "step": 698 + }, + { + "epoch": 0.030736467721220236, + "grad_norm": 5.34375, + "learning_rate": 4.9904775309899955e-05, + "loss": 0.9505, + "step": 700 + }, + { + "epoch": 0.030824286200423726, + "grad_norm": 5.65625, + "learning_rate": 4.9904171096761124e-05, + "loss": 0.9479, + "step": 702 + }, + { + "epoch": 0.03091210467962721, + "grad_norm": 4.78125, + "learning_rate": 4.99035649764549e-05, + "loss": 0.9047, + "step": 704 + }, + { + "epoch": 0.030999923158830697, + "grad_norm": 5.46875, + "learning_rate": 4.99029569490277e-05, + "loss": 0.9234, + "step": 706 + }, + { + "epoch": 0.031087741638034183, + "grad_norm": 7.96875, + "learning_rate": 4.9902347014526093e-05, + "loss": 0.9298, + "step": 708 + }, + { + "epoch": 0.03117556011723767, + "grad_norm": 8.8125, + "learning_rate": 4.9901735172996775e-05, + "loss": 0.9372, + "step": 710 + }, + { + "epoch": 0.031263378596441155, + "grad_norm": 7.3125, + "learning_rate": 4.990112142448662e-05, + "loss": 0.9224, + "step": 712 + }, + { + "epoch": 0.03135119707564464, + "grad_norm": 9.75, + "learning_rate": 4.990050576904261e-05, + "loss": 0.9216, + "step": 714 + }, + { + "epoch": 0.031439015554848126, + "grad_norm": 9.625, + "learning_rate": 4.989988820671189e-05, + "loss": 0.9117, + "step": 716 + }, + { + "epoch": 0.03152683403405161, + "grad_norm": 8.25, + "learning_rate": 4.989926873754177e-05, + "loss": 0.9419, + "step": 718 + }, + { + "epoch": 0.031614652513255105, + "grad_norm": 6.375, + "learning_rate": 4.989864736157969e-05, + "loss": 0.8973, + "step": 720 + }, + { + "epoch": 0.03170247099245859, + "grad_norm": 8.1875, + "learning_rate": 4.989802407887321e-05, + "loss": 0.9172, + "step": 722 + }, + { + "epoch": 0.03179028947166208, + "grad_norm": 9.9375, + "learning_rate": 4.9897398889470086e-05, + "loss": 0.9366, + "step": 724 + }, + { + "epoch": 0.03187810795086556, + "grad_norm": 5.46875, + "learning_rate": 4.989677179341818e-05, + "loss": 0.9221, + "step": 726 + }, + { + "epoch": 0.03196592643006905, + "grad_norm": 5.46875, + "learning_rate": 4.989614279076553e-05, + "loss": 0.8898, + "step": 728 + }, + { + "epoch": 0.032053744909272534, + "grad_norm": 7.15625, + "learning_rate": 4.9895511881560294e-05, + "loss": 0.9261, + "step": 730 + }, + { + "epoch": 0.03214156338847602, + "grad_norm": 7.78125, + "learning_rate": 4.98948790658508e-05, + "loss": 0.8904, + "step": 732 + }, + { + "epoch": 0.032229381867679506, + "grad_norm": 7.71875, + "learning_rate": 4.989424434368549e-05, + "loss": 0.9143, + "step": 734 + }, + { + "epoch": 0.03231720034688299, + "grad_norm": 7.875, + "learning_rate": 4.989360771511298e-05, + "loss": 0.9082, + "step": 736 + }, + { + "epoch": 0.03240501882608648, + "grad_norm": 6.28125, + "learning_rate": 4.989296918018203e-05, + "loss": 0.8653, + "step": 738 + }, + { + "epoch": 0.032492837305289964, + "grad_norm": 5.46875, + "learning_rate": 4.989232873894152e-05, + "loss": 0.8899, + "step": 740 + }, + { + "epoch": 0.03258065578449345, + "grad_norm": 5.625, + "learning_rate": 4.989168639144052e-05, + "loss": 0.8836, + "step": 742 + }, + { + "epoch": 0.032668474263696935, + "grad_norm": 5.4375, + "learning_rate": 4.98910421377282e-05, + "loss": 0.8981, + "step": 744 + }, + { + "epoch": 0.03275629274290043, + "grad_norm": 6.03125, + "learning_rate": 4.989039597785392e-05, + "loss": 0.903, + "step": 746 + }, + { + "epoch": 0.032844111222103914, + "grad_norm": 7.59375, + "learning_rate": 4.988974791186713e-05, + "loss": 0.885, + "step": 748 + }, + { + "epoch": 0.0329319297013074, + "grad_norm": 8.3125, + "learning_rate": 4.98890979398175e-05, + "loss": 0.9071, + "step": 750 + }, + { + "epoch": 0.033019748180510886, + "grad_norm": 6.75, + "learning_rate": 4.988844606175477e-05, + "loss": 0.9012, + "step": 752 + }, + { + "epoch": 0.03310756665971437, + "grad_norm": 6.0625, + "learning_rate": 4.988779227772888e-05, + "loss": 0.8822, + "step": 754 + }, + { + "epoch": 0.03319538513891786, + "grad_norm": 7.09375, + "learning_rate": 4.9887136587789885e-05, + "loss": 0.8973, + "step": 756 + }, + { + "epoch": 0.03328320361812134, + "grad_norm": 9.125, + "learning_rate": 4.9886478991988004e-05, + "loss": 0.911, + "step": 758 + }, + { + "epoch": 0.03337102209732483, + "grad_norm": 8.25, + "learning_rate": 4.9885819490373605e-05, + "loss": 0.8973, + "step": 760 + }, + { + "epoch": 0.033458840576528315, + "grad_norm": 6.5, + "learning_rate": 4.988515808299718e-05, + "loss": 0.9305, + "step": 762 + }, + { + "epoch": 0.0335466590557318, + "grad_norm": 6.46875, + "learning_rate": 4.988449476990938e-05, + "loss": 0.8778, + "step": 764 + }, + { + "epoch": 0.033634477534935286, + "grad_norm": 6.21875, + "learning_rate": 4.988382955116101e-05, + "loss": 0.8987, + "step": 766 + }, + { + "epoch": 0.03372229601413877, + "grad_norm": 5.84375, + "learning_rate": 4.988316242680301e-05, + "loss": 0.8735, + "step": 768 + }, + { + "epoch": 0.03381011449334226, + "grad_norm": 5.28125, + "learning_rate": 4.988249339688646e-05, + "loss": 0.901, + "step": 770 + }, + { + "epoch": 0.03389793297254575, + "grad_norm": 5.21875, + "learning_rate": 4.98818224614626e-05, + "loss": 0.8986, + "step": 772 + }, + { + "epoch": 0.03398575145174924, + "grad_norm": 4.90625, + "learning_rate": 4.9881149620582815e-05, + "loss": 0.895, + "step": 774 + }, + { + "epoch": 0.03407356993095272, + "grad_norm": 5.09375, + "learning_rate": 4.9880474874298626e-05, + "loss": 0.8893, + "step": 776 + }, + { + "epoch": 0.03416138841015621, + "grad_norm": 5.1875, + "learning_rate": 4.987979822266171e-05, + "loss": 0.8624, + "step": 778 + }, + { + "epoch": 0.034249206889359694, + "grad_norm": 6.0625, + "learning_rate": 4.9879119665723885e-05, + "loss": 0.8925, + "step": 780 + }, + { + "epoch": 0.03433702536856318, + "grad_norm": 5.53125, + "learning_rate": 4.9878439203537104e-05, + "loss": 0.8488, + "step": 782 + }, + { + "epoch": 0.034424843847766666, + "grad_norm": 6.46875, + "learning_rate": 4.987775683615349e-05, + "loss": 0.8915, + "step": 784 + }, + { + "epoch": 0.03451266232697015, + "grad_norm": 5.09375, + "learning_rate": 4.9877072563625285e-05, + "loss": 0.8532, + "step": 786 + }, + { + "epoch": 0.03460048080617364, + "grad_norm": 4.4375, + "learning_rate": 4.9876386386004905e-05, + "loss": 0.879, + "step": 788 + }, + { + "epoch": 0.034688299285377124, + "grad_norm": 4.75, + "learning_rate": 4.9875698303344896e-05, + "loss": 0.8712, + "step": 790 + }, + { + "epoch": 0.03477611776458061, + "grad_norm": 4.78125, + "learning_rate": 4.987500831569795e-05, + "loss": 0.8756, + "step": 792 + }, + { + "epoch": 0.034863936243784095, + "grad_norm": 5.78125, + "learning_rate": 4.987431642311689e-05, + "loss": 0.8567, + "step": 794 + }, + { + "epoch": 0.03495175472298759, + "grad_norm": 5.9375, + "learning_rate": 4.987362262565474e-05, + "loss": 0.887, + "step": 796 + }, + { + "epoch": 0.035039573202191074, + "grad_norm": 4.9375, + "learning_rate": 4.987292692336458e-05, + "loss": 0.859, + "step": 798 + }, + { + "epoch": 0.03512739168139456, + "grad_norm": 6.09375, + "learning_rate": 4.9872229316299734e-05, + "loss": 0.8759, + "step": 800 + }, + { + "epoch": 0.035215210160598046, + "grad_norm": 4.90625, + "learning_rate": 4.987152980451359e-05, + "loss": 0.8624, + "step": 802 + }, + { + "epoch": 0.03530302863980153, + "grad_norm": 5.90625, + "learning_rate": 4.987082838805974e-05, + "loss": 0.8837, + "step": 804 + }, + { + "epoch": 0.03539084711900502, + "grad_norm": 5.0625, + "learning_rate": 4.9870125066991894e-05, + "loss": 0.8992, + "step": 806 + }, + { + "epoch": 0.0354786655982085, + "grad_norm": 5.1875, + "learning_rate": 4.98694198413639e-05, + "loss": 0.849, + "step": 808 + }, + { + "epoch": 0.03556648407741199, + "grad_norm": 4.875, + "learning_rate": 4.986871271122977e-05, + "loss": 0.8616, + "step": 810 + }, + { + "epoch": 0.035654302556615475, + "grad_norm": 6.1875, + "learning_rate": 4.986800367664367e-05, + "loss": 0.9031, + "step": 812 + }, + { + "epoch": 0.03574212103581896, + "grad_norm": 4.8125, + "learning_rate": 4.986729273765988e-05, + "loss": 0.8518, + "step": 814 + }, + { + "epoch": 0.035829939515022446, + "grad_norm": 4.59375, + "learning_rate": 4.9866579894332857e-05, + "loss": 0.8672, + "step": 816 + }, + { + "epoch": 0.03591775799422593, + "grad_norm": 5.25, + "learning_rate": 4.9865865146717176e-05, + "loss": 0.8661, + "step": 818 + }, + { + "epoch": 0.03600557647342942, + "grad_norm": 6.84375, + "learning_rate": 4.9865148494867584e-05, + "loss": 0.8709, + "step": 820 + }, + { + "epoch": 0.03609339495263291, + "grad_norm": 4.875, + "learning_rate": 4.986442993883896e-05, + "loss": 0.8915, + "step": 822 + }, + { + "epoch": 0.0361812134318364, + "grad_norm": 4.53125, + "learning_rate": 4.986370947868634e-05, + "loss": 0.8497, + "step": 824 + }, + { + "epoch": 0.03626903191103988, + "grad_norm": 5.46875, + "learning_rate": 4.986298711446488e-05, + "loss": 0.8722, + "step": 826 + }, + { + "epoch": 0.03635685039024337, + "grad_norm": 8.0625, + "learning_rate": 4.986226284622991e-05, + "loss": 0.8664, + "step": 828 + }, + { + "epoch": 0.036444668869446854, + "grad_norm": 9.0, + "learning_rate": 4.9861536674036885e-05, + "loss": 0.8541, + "step": 830 + }, + { + "epoch": 0.03653248734865034, + "grad_norm": 8.3125, + "learning_rate": 4.986080859794142e-05, + "loss": 0.8752, + "step": 832 + }, + { + "epoch": 0.036620305827853826, + "grad_norm": 8.8125, + "learning_rate": 4.9860078617999284e-05, + "loss": 0.8758, + "step": 834 + }, + { + "epoch": 0.03670812430705731, + "grad_norm": 8.25, + "learning_rate": 4.9859346734266365e-05, + "loss": 0.8715, + "step": 836 + }, + { + "epoch": 0.0367959427862608, + "grad_norm": 5.84375, + "learning_rate": 4.9858612946798714e-05, + "loss": 0.8564, + "step": 838 + }, + { + "epoch": 0.036883761265464284, + "grad_norm": 5.71875, + "learning_rate": 4.985787725565252e-05, + "loss": 0.8502, + "step": 840 + }, + { + "epoch": 0.03697157974466777, + "grad_norm": 5.71875, + "learning_rate": 4.985713966088412e-05, + "loss": 0.8582, + "step": 842 + }, + { + "epoch": 0.037059398223871255, + "grad_norm": 6.59375, + "learning_rate": 4.985640016255002e-05, + "loss": 0.8623, + "step": 844 + }, + { + "epoch": 0.03714721670307474, + "grad_norm": 5.375, + "learning_rate": 4.985565876070683e-05, + "loss": 0.8719, + "step": 846 + }, + { + "epoch": 0.037235035182278234, + "grad_norm": 5.0, + "learning_rate": 4.9854915455411334e-05, + "loss": 0.9025, + "step": 848 + }, + { + "epoch": 0.03732285366148172, + "grad_norm": 5.75, + "learning_rate": 4.9854170246720456e-05, + "loss": 0.8393, + "step": 850 + }, + { + "epoch": 0.037410672140685206, + "grad_norm": 4.6875, + "learning_rate": 4.9853423134691265e-05, + "loss": 0.8611, + "step": 852 + }, + { + "epoch": 0.03749849061988869, + "grad_norm": 6.375, + "learning_rate": 4.985267411938097e-05, + "loss": 0.8633, + "step": 854 + }, + { + "epoch": 0.03758630909909218, + "grad_norm": 5.78125, + "learning_rate": 4.9851923200846934e-05, + "loss": 0.8516, + "step": 856 + }, + { + "epoch": 0.03767412757829566, + "grad_norm": 6.15625, + "learning_rate": 4.985117037914666e-05, + "loss": 0.872, + "step": 858 + }, + { + "epoch": 0.03776194605749915, + "grad_norm": 5.5625, + "learning_rate": 4.9850415654337804e-05, + "loss": 0.8479, + "step": 860 + }, + { + "epoch": 0.037849764536702635, + "grad_norm": 6.125, + "learning_rate": 4.9849659026478154e-05, + "loss": 0.8318, + "step": 862 + }, + { + "epoch": 0.03793758301590612, + "grad_norm": 5.125, + "learning_rate": 4.9848900495625665e-05, + "loss": 0.8792, + "step": 864 + }, + { + "epoch": 0.038025401495109606, + "grad_norm": 5.09375, + "learning_rate": 4.9848140061838424e-05, + "loss": 0.855, + "step": 866 + }, + { + "epoch": 0.03811321997431309, + "grad_norm": 4.53125, + "learning_rate": 4.984737772517465e-05, + "loss": 0.8471, + "step": 868 + }, + { + "epoch": 0.03820103845351658, + "grad_norm": 6.375, + "learning_rate": 4.984661348569274e-05, + "loss": 0.8514, + "step": 870 + }, + { + "epoch": 0.03828885693272007, + "grad_norm": 5.96875, + "learning_rate": 4.984584734345121e-05, + "loss": 0.842, + "step": 872 + }, + { + "epoch": 0.03837667541192356, + "grad_norm": 8.4375, + "learning_rate": 4.984507929850873e-05, + "loss": 0.872, + "step": 874 + }, + { + "epoch": 0.03846449389112704, + "grad_norm": 8.375, + "learning_rate": 4.9844309350924135e-05, + "loss": 0.8892, + "step": 876 + }, + { + "epoch": 0.03855231237033053, + "grad_norm": 6.6875, + "learning_rate": 4.9843537500756364e-05, + "loss": 0.8866, + "step": 878 + }, + { + "epoch": 0.038640130849534014, + "grad_norm": 5.03125, + "learning_rate": 4.9842763748064536e-05, + "loss": 0.8492, + "step": 880 + }, + { + "epoch": 0.0387279493287375, + "grad_norm": 5.21875, + "learning_rate": 4.984198809290791e-05, + "loss": 0.8236, + "step": 882 + }, + { + "epoch": 0.038815767807940986, + "grad_norm": 5.875, + "learning_rate": 4.984121053534588e-05, + "loss": 0.8429, + "step": 884 + }, + { + "epoch": 0.03890358628714447, + "grad_norm": 6.03125, + "learning_rate": 4.984043107543799e-05, + "loss": 0.864, + "step": 886 + }, + { + "epoch": 0.03899140476634796, + "grad_norm": 5.65625, + "learning_rate": 4.983964971324393e-05, + "loss": 0.8531, + "step": 888 + }, + { + "epoch": 0.039079223245551443, + "grad_norm": 6.34375, + "learning_rate": 4.983886644882354e-05, + "loss": 0.8143, + "step": 890 + }, + { + "epoch": 0.03916704172475493, + "grad_norm": 6.53125, + "learning_rate": 4.9838081282236814e-05, + "loss": 0.8478, + "step": 892 + }, + { + "epoch": 0.039254860203958415, + "grad_norm": 6.96875, + "learning_rate": 4.983729421354386e-05, + "loss": 0.8864, + "step": 894 + }, + { + "epoch": 0.0393426786831619, + "grad_norm": 8.5, + "learning_rate": 4.9836505242804966e-05, + "loss": 0.8228, + "step": 896 + }, + { + "epoch": 0.039430497162365394, + "grad_norm": 7.21875, + "learning_rate": 4.9835714370080546e-05, + "loss": 0.8414, + "step": 898 + }, + { + "epoch": 0.03951831564156888, + "grad_norm": 4.96875, + "learning_rate": 4.983492159543116e-05, + "loss": 0.8727, + "step": 900 + }, + { + "epoch": 0.039606134120772366, + "grad_norm": 5.65625, + "learning_rate": 4.983412691891753e-05, + "loss": 0.8376, + "step": 902 + }, + { + "epoch": 0.03969395259997585, + "grad_norm": 5.25, + "learning_rate": 4.983333034060051e-05, + "loss": 0.8527, + "step": 904 + }, + { + "epoch": 0.03978177107917934, + "grad_norm": 6.1875, + "learning_rate": 4.9832531860541096e-05, + "loss": 0.8227, + "step": 906 + }, + { + "epoch": 0.03986958955838282, + "grad_norm": 4.96875, + "learning_rate": 4.9831731478800434e-05, + "loss": 0.862, + "step": 908 + }, + { + "epoch": 0.03995740803758631, + "grad_norm": 5.125, + "learning_rate": 4.983092919543983e-05, + "loss": 0.8069, + "step": 910 + }, + { + "epoch": 0.040045226516789795, + "grad_norm": 4.03125, + "learning_rate": 4.983012501052072e-05, + "loss": 0.8215, + "step": 912 + }, + { + "epoch": 0.04013304499599328, + "grad_norm": 4.0625, + "learning_rate": 4.982931892410468e-05, + "loss": 0.8391, + "step": 914 + }, + { + "epoch": 0.040220863475196766, + "grad_norm": 5.5625, + "learning_rate": 4.982851093625344e-05, + "loss": 0.8268, + "step": 916 + }, + { + "epoch": 0.04030868195440025, + "grad_norm": 5.25, + "learning_rate": 4.982770104702888e-05, + "loss": 0.8418, + "step": 918 + }, + { + "epoch": 0.04039650043360374, + "grad_norm": 4.71875, + "learning_rate": 4.9826889256493034e-05, + "loss": 0.839, + "step": 920 + }, + { + "epoch": 0.04048431891280723, + "grad_norm": 4.5625, + "learning_rate": 4.9826075564708056e-05, + "loss": 0.8414, + "step": 922 + }, + { + "epoch": 0.04057213739201072, + "grad_norm": 5.4375, + "learning_rate": 4.982525997173625e-05, + "loss": 0.8294, + "step": 924 + }, + { + "epoch": 0.0406599558712142, + "grad_norm": 5.75, + "learning_rate": 4.982444247764009e-05, + "loss": 0.8287, + "step": 926 + }, + { + "epoch": 0.04074777435041769, + "grad_norm": 5.21875, + "learning_rate": 4.982362308248217e-05, + "loss": 0.8332, + "step": 928 + }, + { + "epoch": 0.040835592829621174, + "grad_norm": 3.59375, + "learning_rate": 4.9822801786325245e-05, + "loss": 0.8231, + "step": 930 + }, + { + "epoch": 0.04092341130882466, + "grad_norm": 4.34375, + "learning_rate": 4.982197858923221e-05, + "loss": 0.8413, + "step": 932 + }, + { + "epoch": 0.041011229788028146, + "grad_norm": 5.4375, + "learning_rate": 4.98211534912661e-05, + "loss": 0.8083, + "step": 934 + }, + { + "epoch": 0.04109904826723163, + "grad_norm": 6.21875, + "learning_rate": 4.982032649249011e-05, + "loss": 0.8205, + "step": 936 + }, + { + "epoch": 0.04118686674643512, + "grad_norm": 6.15625, + "learning_rate": 4.981949759296757e-05, + "loss": 0.8354, + "step": 938 + }, + { + "epoch": 0.041274685225638603, + "grad_norm": 5.90625, + "learning_rate": 4.981866679276195e-05, + "loss": 0.8274, + "step": 940 + }, + { + "epoch": 0.04136250370484209, + "grad_norm": 4.9375, + "learning_rate": 4.981783409193689e-05, + "loss": 0.8363, + "step": 942 + }, + { + "epoch": 0.041450322184045575, + "grad_norm": 4.65625, + "learning_rate": 4.981699949055613e-05, + "loss": 0.8658, + "step": 944 + }, + { + "epoch": 0.04153814066324906, + "grad_norm": 4.4375, + "learning_rate": 4.9816162988683604e-05, + "loss": 0.8392, + "step": 946 + }, + { + "epoch": 0.041625959142452554, + "grad_norm": 4.59375, + "learning_rate": 4.981532458638337e-05, + "loss": 0.8418, + "step": 948 + }, + { + "epoch": 0.04171377762165604, + "grad_norm": 4.75, + "learning_rate": 4.9814484283719634e-05, + "loss": 0.8229, + "step": 950 + }, + { + "epoch": 0.041801596100859525, + "grad_norm": 5.5, + "learning_rate": 4.981364208075673e-05, + "loss": 0.8107, + "step": 952 + }, + { + "epoch": 0.04188941458006301, + "grad_norm": 4.90625, + "learning_rate": 4.9812797977559176e-05, + "loss": 0.8279, + "step": 954 + }, + { + "epoch": 0.0419772330592665, + "grad_norm": 5.0, + "learning_rate": 4.981195197419161e-05, + "loss": 0.8215, + "step": 956 + }, + { + "epoch": 0.04206505153846998, + "grad_norm": 4.46875, + "learning_rate": 4.981110407071881e-05, + "loss": 0.8302, + "step": 958 + }, + { + "epoch": 0.04215287001767347, + "grad_norm": 3.78125, + "learning_rate": 4.981025426720571e-05, + "loss": 0.8162, + "step": 960 + }, + { + "epoch": 0.042240688496876955, + "grad_norm": 4.15625, + "learning_rate": 4.980940256371739e-05, + "loss": 0.8318, + "step": 962 + }, + { + "epoch": 0.04232850697608044, + "grad_norm": 3.78125, + "learning_rate": 4.980854896031908e-05, + "loss": 0.8177, + "step": 964 + }, + { + "epoch": 0.042416325455283926, + "grad_norm": 4.125, + "learning_rate": 4.9807693457076144e-05, + "loss": 0.844, + "step": 966 + }, + { + "epoch": 0.04250414393448741, + "grad_norm": 4.65625, + "learning_rate": 4.980683605405408e-05, + "loss": 0.8244, + "step": 968 + }, + { + "epoch": 0.0425919624136909, + "grad_norm": 4.46875, + "learning_rate": 4.980597675131858e-05, + "loss": 0.7891, + "step": 970 + }, + { + "epoch": 0.042679780892894384, + "grad_norm": 3.90625, + "learning_rate": 4.980511554893543e-05, + "loss": 0.8173, + "step": 972 + }, + { + "epoch": 0.04276759937209788, + "grad_norm": 4.75, + "learning_rate": 4.980425244697059e-05, + "loss": 0.8342, + "step": 974 + }, + { + "epoch": 0.04285541785130136, + "grad_norm": 4.375, + "learning_rate": 4.9803387445490144e-05, + "loss": 0.8508, + "step": 976 + }, + { + "epoch": 0.04294323633050485, + "grad_norm": 4.8125, + "learning_rate": 4.980252054456035e-05, + "loss": 0.8379, + "step": 978 + }, + { + "epoch": 0.043031054809708334, + "grad_norm": 5.65625, + "learning_rate": 4.980165174424759e-05, + "loss": 0.8501, + "step": 980 + }, + { + "epoch": 0.04311887328891182, + "grad_norm": 5.3125, + "learning_rate": 4.980078104461838e-05, + "loss": 0.8109, + "step": 982 + }, + { + "epoch": 0.043206691768115306, + "grad_norm": 4.375, + "learning_rate": 4.979990844573942e-05, + "loss": 0.8135, + "step": 984 + }, + { + "epoch": 0.04329451024731879, + "grad_norm": 4.4375, + "learning_rate": 4.979903394767752e-05, + "loss": 0.7713, + "step": 986 + }, + { + "epoch": 0.04338232872652228, + "grad_norm": 6.03125, + "learning_rate": 4.979815755049967e-05, + "loss": 0.8369, + "step": 988 + }, + { + "epoch": 0.04347014720572576, + "grad_norm": 5.09375, + "learning_rate": 4.9797279254272956e-05, + "loss": 0.8063, + "step": 990 + }, + { + "epoch": 0.04355796568492925, + "grad_norm": 4.875, + "learning_rate": 4.979639905906466e-05, + "loss": 0.8432, + "step": 992 + }, + { + "epoch": 0.043645784164132735, + "grad_norm": 4.0625, + "learning_rate": 4.9795516964942175e-05, + "loss": 0.8128, + "step": 994 + }, + { + "epoch": 0.04373360264333622, + "grad_norm": 5.6875, + "learning_rate": 4.979463297197306e-05, + "loss": 0.7988, + "step": 996 + }, + { + "epoch": 0.043821421122539714, + "grad_norm": 5.21875, + "learning_rate": 4.9793747080225004e-05, + "loss": 0.7873, + "step": 998 + }, + { + "epoch": 0.0439092396017432, + "grad_norm": 4.0, + "learning_rate": 4.979285928976586e-05, + "loss": 0.8221, + "step": 1000 + }, + { + "epoch": 0.043997058080946685, + "grad_norm": 4.5, + "learning_rate": 4.9791969600663605e-05, + "loss": 0.8012, + "step": 1002 + }, + { + "epoch": 0.04408487656015017, + "grad_norm": 4.5625, + "learning_rate": 4.9791078012986375e-05, + "loss": 0.8453, + "step": 1004 + }, + { + "epoch": 0.04417269503935366, + "grad_norm": 4.84375, + "learning_rate": 4.9790184526802444e-05, + "loss": 0.8158, + "step": 1006 + }, + { + "epoch": 0.04426051351855714, + "grad_norm": 3.984375, + "learning_rate": 4.978928914218025e-05, + "loss": 0.8095, + "step": 1008 + }, + { + "epoch": 0.04434833199776063, + "grad_norm": 4.40625, + "learning_rate": 4.9788391859188346e-05, + "loss": 0.8196, + "step": 1010 + }, + { + "epoch": 0.044436150476964115, + "grad_norm": 3.90625, + "learning_rate": 4.9787492677895445e-05, + "loss": 0.8302, + "step": 1012 + }, + { + "epoch": 0.0445239689561676, + "grad_norm": 4.40625, + "learning_rate": 4.978659159837041e-05, + "loss": 0.7852, + "step": 1014 + }, + { + "epoch": 0.044611787435371086, + "grad_norm": 4.0625, + "learning_rate": 4.9785688620682265e-05, + "loss": 0.8035, + "step": 1016 + }, + { + "epoch": 0.04469960591457457, + "grad_norm": 5.0, + "learning_rate": 4.978478374490013e-05, + "loss": 0.7945, + "step": 1018 + }, + { + "epoch": 0.04478742439377806, + "grad_norm": 4.21875, + "learning_rate": 4.978387697109333e-05, + "loss": 0.802, + "step": 1020 + }, + { + "epoch": 0.044875242872981544, + "grad_norm": 4.875, + "learning_rate": 4.978296829933127e-05, + "loss": 0.8112, + "step": 1022 + }, + { + "epoch": 0.04496306135218504, + "grad_norm": 4.84375, + "learning_rate": 4.978205772968357e-05, + "loss": 0.8123, + "step": 1024 + }, + { + "epoch": 0.04505087983138852, + "grad_norm": 4.84375, + "learning_rate": 4.978114526221994e-05, + "loss": 0.8189, + "step": 1026 + }, + { + "epoch": 0.04513869831059201, + "grad_norm": 4.59375, + "learning_rate": 4.978023089701027e-05, + "loss": 0.7864, + "step": 1028 + }, + { + "epoch": 0.045226516789795494, + "grad_norm": 5.0, + "learning_rate": 4.977931463412459e-05, + "loss": 0.784, + "step": 1030 + }, + { + "epoch": 0.04531433526899898, + "grad_norm": 4.875, + "learning_rate": 4.9778396473633035e-05, + "loss": 0.8088, + "step": 1032 + }, + { + "epoch": 0.045402153748202466, + "grad_norm": 4.40625, + "learning_rate": 4.977747641560595e-05, + "loss": 0.7992, + "step": 1034 + }, + { + "epoch": 0.04548997222740595, + "grad_norm": 4.21875, + "learning_rate": 4.977655446011378e-05, + "loss": 0.7899, + "step": 1036 + }, + { + "epoch": 0.04557779070660944, + "grad_norm": 4.0625, + "learning_rate": 4.9775630607227126e-05, + "loss": 0.7699, + "step": 1038 + }, + { + "epoch": 0.04566560918581292, + "grad_norm": 4.59375, + "learning_rate": 4.977470485701674e-05, + "loss": 0.8143, + "step": 1040 + }, + { + "epoch": 0.04575342766501641, + "grad_norm": 4.0625, + "learning_rate": 4.9773777209553517e-05, + "loss": 0.8162, + "step": 1042 + }, + { + "epoch": 0.045841246144219895, + "grad_norm": 3.90625, + "learning_rate": 4.9772847664908505e-05, + "loss": 0.7954, + "step": 1044 + }, + { + "epoch": 0.04592906462342338, + "grad_norm": 4.125, + "learning_rate": 4.977191622315288e-05, + "loss": 0.7932, + "step": 1046 + }, + { + "epoch": 0.04601688310262687, + "grad_norm": 5.375, + "learning_rate": 4.977098288435796e-05, + "loss": 0.791, + "step": 1048 + }, + { + "epoch": 0.04610470158183036, + "grad_norm": 4.28125, + "learning_rate": 4.977004764859524e-05, + "loss": 0.7752, + "step": 1050 + }, + { + "epoch": 0.046192520061033845, + "grad_norm": 5.34375, + "learning_rate": 4.976911051593633e-05, + "loss": 0.7977, + "step": 1052 + }, + { + "epoch": 0.04628033854023733, + "grad_norm": 4.09375, + "learning_rate": 4.976817148645301e-05, + "loss": 0.7915, + "step": 1054 + }, + { + "epoch": 0.04636815701944082, + "grad_norm": 5.0625, + "learning_rate": 4.976723056021717e-05, + "loss": 0.8112, + "step": 1056 + }, + { + "epoch": 0.0464559754986443, + "grad_norm": 6.65625, + "learning_rate": 4.976628773730088e-05, + "loss": 0.818, + "step": 1058 + }, + { + "epoch": 0.04654379397784779, + "grad_norm": 5.25, + "learning_rate": 4.976534301777634e-05, + "loss": 0.8005, + "step": 1060 + }, + { + "epoch": 0.046631612457051275, + "grad_norm": 4.28125, + "learning_rate": 4.9764396401715895e-05, + "loss": 0.8215, + "step": 1062 + }, + { + "epoch": 0.04671943093625476, + "grad_norm": 4.25, + "learning_rate": 4.9763447889192034e-05, + "loss": 0.785, + "step": 1064 + }, + { + "epoch": 0.046807249415458246, + "grad_norm": 4.4375, + "learning_rate": 4.97624974802774e-05, + "loss": 0.8075, + "step": 1066 + }, + { + "epoch": 0.04689506789466173, + "grad_norm": 4.84375, + "learning_rate": 4.9761545175044764e-05, + "loss": 0.8031, + "step": 1068 + }, + { + "epoch": 0.04698288637386522, + "grad_norm": 3.765625, + "learning_rate": 4.976059097356708e-05, + "loss": 0.7689, + "step": 1070 + }, + { + "epoch": 0.047070704853068704, + "grad_norm": 5.125, + "learning_rate": 4.975963487591739e-05, + "loss": 0.7636, + "step": 1072 + }, + { + "epoch": 0.0471585233322722, + "grad_norm": 3.921875, + "learning_rate": 4.9758676882168934e-05, + "loss": 0.7856, + "step": 1074 + }, + { + "epoch": 0.04724634181147568, + "grad_norm": 5.25, + "learning_rate": 4.975771699239505e-05, + "loss": 0.7742, + "step": 1076 + }, + { + "epoch": 0.04733416029067917, + "grad_norm": 5.84375, + "learning_rate": 4.975675520666928e-05, + "loss": 0.8068, + "step": 1078 + }, + { + "epoch": 0.047421978769882654, + "grad_norm": 6.96875, + "learning_rate": 4.9755791525065266e-05, + "loss": 0.8261, + "step": 1080 + }, + { + "epoch": 0.04750979724908614, + "grad_norm": 7.625, + "learning_rate": 4.975482594765679e-05, + "loss": 0.8396, + "step": 1082 + }, + { + "epoch": 0.047597615728289626, + "grad_norm": 5.46875, + "learning_rate": 4.9753858474517815e-05, + "loss": 0.7966, + "step": 1084 + }, + { + "epoch": 0.04768543420749311, + "grad_norm": 4.28125, + "learning_rate": 4.975288910572242e-05, + "loss": 0.7745, + "step": 1086 + }, + { + "epoch": 0.0477732526866966, + "grad_norm": 4.59375, + "learning_rate": 4.975191784134485e-05, + "loss": 0.7827, + "step": 1088 + }, + { + "epoch": 0.04786107116590008, + "grad_norm": 4.4375, + "learning_rate": 4.975094468145948e-05, + "loss": 0.7728, + "step": 1090 + }, + { + "epoch": 0.04794888964510357, + "grad_norm": 5.0, + "learning_rate": 4.974996962614083e-05, + "loss": 0.7792, + "step": 1092 + }, + { + "epoch": 0.048036708124307055, + "grad_norm": 4.125, + "learning_rate": 4.974899267546357e-05, + "loss": 0.7991, + "step": 1094 + }, + { + "epoch": 0.04812452660351054, + "grad_norm": 7.0625, + "learning_rate": 4.974801382950252e-05, + "loss": 0.7764, + "step": 1096 + }, + { + "epoch": 0.04821234508271403, + "grad_norm": 8.0, + "learning_rate": 4.9747033088332635e-05, + "loss": 0.8007, + "step": 1098 + }, + { + "epoch": 0.04830016356191752, + "grad_norm": 5.84375, + "learning_rate": 4.9746050452029023e-05, + "loss": 0.8096, + "step": 1100 + }, + { + "epoch": 0.048387982041121005, + "grad_norm": 6.5625, + "learning_rate": 4.974506592066695e-05, + "loss": 0.8101, + "step": 1102 + }, + { + "epoch": 0.04847580052032449, + "grad_norm": 8.8125, + "learning_rate": 4.974407949432178e-05, + "loss": 0.7943, + "step": 1104 + }, + { + "epoch": 0.04856361899952798, + "grad_norm": 5.25, + "learning_rate": 4.9743091173069075e-05, + "loss": 0.8159, + "step": 1106 + }, + { + "epoch": 0.04865143747873146, + "grad_norm": 4.03125, + "learning_rate": 4.974210095698452e-05, + "loss": 0.8159, + "step": 1108 + }, + { + "epoch": 0.04873925595793495, + "grad_norm": 4.9375, + "learning_rate": 4.9741108846143934e-05, + "loss": 0.7488, + "step": 1110 + }, + { + "epoch": 0.048827074437138435, + "grad_norm": 6.25, + "learning_rate": 4.974011484062331e-05, + "loss": 0.7729, + "step": 1112 + }, + { + "epoch": 0.04891489291634192, + "grad_norm": 5.0, + "learning_rate": 4.9739118940498766e-05, + "loss": 0.7887, + "step": 1114 + }, + { + "epoch": 0.049002711395545406, + "grad_norm": 4.40625, + "learning_rate": 4.973812114584655e-05, + "loss": 0.8019, + "step": 1116 + }, + { + "epoch": 0.04909052987474889, + "grad_norm": 4.4375, + "learning_rate": 4.9737121456743095e-05, + "loss": 0.7422, + "step": 1118 + }, + { + "epoch": 0.04917834835395238, + "grad_norm": 4.59375, + "learning_rate": 4.9736119873264946e-05, + "loss": 0.7503, + "step": 1120 + }, + { + "epoch": 0.049266166833155864, + "grad_norm": 4.625, + "learning_rate": 4.973511639548881e-05, + "loss": 0.7732, + "step": 1122 + }, + { + "epoch": 0.04935398531235936, + "grad_norm": 46.25, + "learning_rate": 4.973411102349153e-05, + "loss": 0.8023, + "step": 1124 + }, + { + "epoch": 0.04944180379156284, + "grad_norm": 4.375, + "learning_rate": 4.9733103757350096e-05, + "loss": 0.7675, + "step": 1126 + }, + { + "epoch": 0.04952962227076633, + "grad_norm": 4.65625, + "learning_rate": 4.9732094597141654e-05, + "loss": 0.8107, + "step": 1128 + }, + { + "epoch": 0.049617440749969814, + "grad_norm": 4.46875, + "learning_rate": 4.973108354294347e-05, + "loss": 0.8025, + "step": 1130 + }, + { + "epoch": 0.0497052592291733, + "grad_norm": 5.625, + "learning_rate": 4.973007059483299e-05, + "loss": 0.7767, + "step": 1132 + }, + { + "epoch": 0.049793077708376786, + "grad_norm": 6.5, + "learning_rate": 4.9729055752887764e-05, + "loss": 0.8024, + "step": 1134 + }, + { + "epoch": 0.04988089618758027, + "grad_norm": 6.28125, + "learning_rate": 4.9728039017185535e-05, + "loss": 0.7989, + "step": 1136 + }, + { + "epoch": 0.04996871466678376, + "grad_norm": 5.71875, + "learning_rate": 4.9727020387804136e-05, + "loss": 0.7659, + "step": 1138 + }, + { + "epoch": 0.05005653314598724, + "grad_norm": 5.46875, + "learning_rate": 4.972599986482159e-05, + "loss": 0.7758, + "step": 1140 + }, + { + "epoch": 0.05014435162519073, + "grad_norm": 4.65625, + "learning_rate": 4.972497744831606e-05, + "loss": 0.7691, + "step": 1142 + }, + { + "epoch": 0.050232170104394215, + "grad_norm": 4.84375, + "learning_rate": 4.972395313836582e-05, + "loss": 0.7584, + "step": 1144 + }, + { + "epoch": 0.0503199885835977, + "grad_norm": 3.90625, + "learning_rate": 4.9722926935049316e-05, + "loss": 0.7792, + "step": 1146 + }, + { + "epoch": 0.05040780706280119, + "grad_norm": 4.9375, + "learning_rate": 4.9721898838445155e-05, + "loss": 0.7649, + "step": 1148 + }, + { + "epoch": 0.05049562554200468, + "grad_norm": 5.1875, + "learning_rate": 4.972086884863204e-05, + "loss": 0.7919, + "step": 1150 + }, + { + "epoch": 0.050583444021208165, + "grad_norm": 5.34375, + "learning_rate": 4.971983696568888e-05, + "loss": 0.7349, + "step": 1152 + }, + { + "epoch": 0.05067126250041165, + "grad_norm": 4.375, + "learning_rate": 4.9718803189694666e-05, + "loss": 0.7765, + "step": 1154 + }, + { + "epoch": 0.05075908097961514, + "grad_norm": 3.703125, + "learning_rate": 4.9717767520728585e-05, + "loss": 0.7909, + "step": 1156 + }, + { + "epoch": 0.05084689945881862, + "grad_norm": 5.1875, + "learning_rate": 4.971672995886994e-05, + "loss": 0.7727, + "step": 1158 + }, + { + "epoch": 0.05093471793802211, + "grad_norm": 4.53125, + "learning_rate": 4.9715690504198186e-05, + "loss": 0.7504, + "step": 1160 + }, + { + "epoch": 0.051022536417225595, + "grad_norm": 5.125, + "learning_rate": 4.971464915679293e-05, + "loss": 0.7518, + "step": 1162 + }, + { + "epoch": 0.05111035489642908, + "grad_norm": 4.65625, + "learning_rate": 4.971360591673392e-05, + "loss": 0.764, + "step": 1164 + }, + { + "epoch": 0.051198173375632566, + "grad_norm": 5.4375, + "learning_rate": 4.971256078410104e-05, + "loss": 0.7693, + "step": 1166 + }, + { + "epoch": 0.05128599185483605, + "grad_norm": 4.53125, + "learning_rate": 4.971151375897434e-05, + "loss": 0.7753, + "step": 1168 + }, + { + "epoch": 0.05137381033403954, + "grad_norm": 3.640625, + "learning_rate": 4.9710464841433984e-05, + "loss": 0.7606, + "step": 1170 + }, + { + "epoch": 0.051461628813243024, + "grad_norm": 4.0, + "learning_rate": 4.9709414031560306e-05, + "loss": 0.7781, + "step": 1172 + }, + { + "epoch": 0.05154944729244651, + "grad_norm": 4.25, + "learning_rate": 4.9708361329433787e-05, + "loss": 0.7369, + "step": 1174 + }, + { + "epoch": 0.05163726577165, + "grad_norm": 3.90625, + "learning_rate": 4.970730673513503e-05, + "loss": 0.7699, + "step": 1176 + }, + { + "epoch": 0.05172508425085349, + "grad_norm": 5.25, + "learning_rate": 4.97062502487448e-05, + "loss": 0.778, + "step": 1178 + }, + { + "epoch": 0.051812902730056974, + "grad_norm": 6.21875, + "learning_rate": 4.9705191870344e-05, + "loss": 0.7993, + "step": 1180 + }, + { + "epoch": 0.05190072120926046, + "grad_norm": 6.625, + "learning_rate": 4.9704131600013686e-05, + "loss": 0.7635, + "step": 1182 + }, + { + "epoch": 0.051988539688463946, + "grad_norm": 6.40625, + "learning_rate": 4.970306943783506e-05, + "loss": 0.7772, + "step": 1184 + }, + { + "epoch": 0.05207635816766743, + "grad_norm": 6.4375, + "learning_rate": 4.9702005383889446e-05, + "loss": 0.765, + "step": 1186 + }, + { + "epoch": 0.05216417664687092, + "grad_norm": 7.3125, + "learning_rate": 4.9700939438258334e-05, + "loss": 0.7375, + "step": 1188 + }, + { + "epoch": 0.0522519951260744, + "grad_norm": 6.71875, + "learning_rate": 4.969987160102337e-05, + "loss": 0.7828, + "step": 1190 + }, + { + "epoch": 0.05233981360527789, + "grad_norm": 6.28125, + "learning_rate": 4.969880187226631e-05, + "loss": 0.7674, + "step": 1192 + }, + { + "epoch": 0.052427632084481375, + "grad_norm": 7.34375, + "learning_rate": 4.969773025206908e-05, + "loss": 0.7823, + "step": 1194 + }, + { + "epoch": 0.05251545056368486, + "grad_norm": 6.375, + "learning_rate": 4.969665674051376e-05, + "loss": 0.7578, + "step": 1196 + }, + { + "epoch": 0.05260326904288835, + "grad_norm": 4.625, + "learning_rate": 4.969558133768254e-05, + "loss": 0.738, + "step": 1198 + }, + { + "epoch": 0.05269108752209184, + "grad_norm": 5.09375, + "learning_rate": 4.969450404365777e-05, + "loss": 0.7709, + "step": 1200 + }, + { + "epoch": 0.052778906001295325, + "grad_norm": 6.9375, + "learning_rate": 4.969342485852197e-05, + "loss": 0.7349, + "step": 1202 + }, + { + "epoch": 0.05286672448049881, + "grad_norm": 3.828125, + "learning_rate": 4.969234378235778e-05, + "loss": 0.7782, + "step": 1204 + }, + { + "epoch": 0.0529545429597023, + "grad_norm": 4.0, + "learning_rate": 4.969126081524798e-05, + "loss": 0.7804, + "step": 1206 + }, + { + "epoch": 0.05304236143890578, + "grad_norm": 4.375, + "learning_rate": 4.969017595727551e-05, + "loss": 0.7332, + "step": 1208 + }, + { + "epoch": 0.05313017991810927, + "grad_norm": 4.65625, + "learning_rate": 4.968908920852344e-05, + "loss": 0.7742, + "step": 1210 + }, + { + "epoch": 0.053217998397312755, + "grad_norm": 4.40625, + "learning_rate": 4.9688000569075e-05, + "loss": 0.7555, + "step": 1212 + }, + { + "epoch": 0.05330581687651624, + "grad_norm": 3.609375, + "learning_rate": 4.9686910039013566e-05, + "loss": 0.7543, + "step": 1214 + }, + { + "epoch": 0.053393635355719726, + "grad_norm": 4.15625, + "learning_rate": 4.9685817618422635e-05, + "loss": 0.7569, + "step": 1216 + }, + { + "epoch": 0.05348145383492321, + "grad_norm": 4.25, + "learning_rate": 4.968472330738588e-05, + "loss": 0.7303, + "step": 1218 + }, + { + "epoch": 0.0535692723141267, + "grad_norm": 4.0625, + "learning_rate": 4.96836271059871e-05, + "loss": 0.7597, + "step": 1220 + }, + { + "epoch": 0.053657090793330184, + "grad_norm": 4.6875, + "learning_rate": 4.968252901431023e-05, + "loss": 0.7583, + "step": 1222 + }, + { + "epoch": 0.05374490927253367, + "grad_norm": 4.8125, + "learning_rate": 4.968142903243938e-05, + "loss": 0.754, + "step": 1224 + }, + { + "epoch": 0.05383272775173716, + "grad_norm": 4.21875, + "learning_rate": 4.968032716045877e-05, + "loss": 0.7428, + "step": 1226 + }, + { + "epoch": 0.05392054623094065, + "grad_norm": 3.46875, + "learning_rate": 4.967922339845279e-05, + "loss": 0.7675, + "step": 1228 + }, + { + "epoch": 0.054008364710144134, + "grad_norm": 4.125, + "learning_rate": 4.967811774650597e-05, + "loss": 0.7386, + "step": 1230 + }, + { + "epoch": 0.05409618318934762, + "grad_norm": 3.84375, + "learning_rate": 4.967701020470298e-05, + "loss": 0.7233, + "step": 1232 + }, + { + "epoch": 0.054184001668551106, + "grad_norm": 3.90625, + "learning_rate": 4.967590077312863e-05, + "loss": 0.7297, + "step": 1234 + }, + { + "epoch": 0.05427182014775459, + "grad_norm": 3.84375, + "learning_rate": 4.967478945186788e-05, + "loss": 0.7155, + "step": 1236 + }, + { + "epoch": 0.05435963862695808, + "grad_norm": 4.09375, + "learning_rate": 4.967367624100584e-05, + "loss": 0.7515, + "step": 1238 + }, + { + "epoch": 0.05444745710616156, + "grad_norm": 3.578125, + "learning_rate": 4.967256114062776e-05, + "loss": 0.7572, + "step": 1240 + }, + { + "epoch": 0.05453527558536505, + "grad_norm": 4.53125, + "learning_rate": 4.967144415081903e-05, + "loss": 0.7531, + "step": 1242 + }, + { + "epoch": 0.054623094064568535, + "grad_norm": 4.3125, + "learning_rate": 4.96703252716652e-05, + "loss": 0.7622, + "step": 1244 + }, + { + "epoch": 0.05471091254377202, + "grad_norm": 3.53125, + "learning_rate": 4.966920450325194e-05, + "loss": 0.7755, + "step": 1246 + }, + { + "epoch": 0.05479873102297551, + "grad_norm": 3.9375, + "learning_rate": 4.9668081845665085e-05, + "loss": 0.726, + "step": 1248 + }, + { + "epoch": 0.05488654950217899, + "grad_norm": 4.09375, + "learning_rate": 4.9666957298990616e-05, + "loss": 0.7504, + "step": 1250 + }, + { + "epoch": 0.054974367981382485, + "grad_norm": 4.375, + "learning_rate": 4.9665830863314645e-05, + "loss": 0.7783, + "step": 1252 + }, + { + "epoch": 0.05506218646058597, + "grad_norm": 4.15625, + "learning_rate": 4.966470253872343e-05, + "loss": 0.7782, + "step": 1254 + }, + { + "epoch": 0.05515000493978946, + "grad_norm": 3.65625, + "learning_rate": 4.9663572325303376e-05, + "loss": 0.7509, + "step": 1256 + }, + { + "epoch": 0.05523782341899294, + "grad_norm": 3.984375, + "learning_rate": 4.966244022314105e-05, + "loss": 0.7475, + "step": 1258 + }, + { + "epoch": 0.05532564189819643, + "grad_norm": 4.375, + "learning_rate": 4.9661306232323134e-05, + "loss": 0.7448, + "step": 1260 + }, + { + "epoch": 0.055413460377399915, + "grad_norm": 4.375, + "learning_rate": 4.966017035293648e-05, + "loss": 0.7464, + "step": 1262 + }, + { + "epoch": 0.0555012788566034, + "grad_norm": 4.125, + "learning_rate": 4.965903258506806e-05, + "loss": 0.771, + "step": 1264 + }, + { + "epoch": 0.055589097335806886, + "grad_norm": 4.25, + "learning_rate": 4.965789292880502e-05, + "loss": 0.7344, + "step": 1266 + }, + { + "epoch": 0.05567691581501037, + "grad_norm": 3.828125, + "learning_rate": 4.965675138423463e-05, + "loss": 0.7208, + "step": 1268 + }, + { + "epoch": 0.05576473429421386, + "grad_norm": 3.625, + "learning_rate": 4.9655607951444305e-05, + "loss": 0.7481, + "step": 1270 + }, + { + "epoch": 0.055852552773417344, + "grad_norm": 3.59375, + "learning_rate": 4.9654462630521615e-05, + "loss": 0.7729, + "step": 1272 + }, + { + "epoch": 0.05594037125262083, + "grad_norm": 3.609375, + "learning_rate": 4.9653315421554266e-05, + "loss": 0.7597, + "step": 1274 + }, + { + "epoch": 0.05602818973182432, + "grad_norm": 4.625, + "learning_rate": 4.965216632463011e-05, + "loss": 0.7347, + "step": 1276 + }, + { + "epoch": 0.05611600821102781, + "grad_norm": 4.25, + "learning_rate": 4.965101533983715e-05, + "loss": 0.7614, + "step": 1278 + }, + { + "epoch": 0.056203826690231294, + "grad_norm": 4.53125, + "learning_rate": 4.9649862467263526e-05, + "loss": 0.7808, + "step": 1280 + }, + { + "epoch": 0.05629164516943478, + "grad_norm": 3.65625, + "learning_rate": 4.964870770699752e-05, + "loss": 0.7464, + "step": 1282 + }, + { + "epoch": 0.056379463648638266, + "grad_norm": 4.5625, + "learning_rate": 4.964755105912758e-05, + "loss": 0.7517, + "step": 1284 + }, + { + "epoch": 0.05646728212784175, + "grad_norm": 3.96875, + "learning_rate": 4.964639252374226e-05, + "loss": 0.7742, + "step": 1286 + }, + { + "epoch": 0.05655510060704524, + "grad_norm": 4.75, + "learning_rate": 4.96452321009303e-05, + "loss": 0.7392, + "step": 1288 + }, + { + "epoch": 0.05664291908624872, + "grad_norm": 3.390625, + "learning_rate": 4.964406979078056e-05, + "loss": 0.7295, + "step": 1290 + }, + { + "epoch": 0.05673073756545221, + "grad_norm": 3.71875, + "learning_rate": 4.964290559338204e-05, + "loss": 0.722, + "step": 1292 + }, + { + "epoch": 0.056818556044655695, + "grad_norm": 4.03125, + "learning_rate": 4.96417395088239e-05, + "loss": 0.7417, + "step": 1294 + }, + { + "epoch": 0.05690637452385918, + "grad_norm": 3.28125, + "learning_rate": 4.964057153719545e-05, + "loss": 0.7169, + "step": 1296 + }, + { + "epoch": 0.05699419300306267, + "grad_norm": 3.46875, + "learning_rate": 4.963940167858613e-05, + "loss": 0.7288, + "step": 1298 + }, + { + "epoch": 0.05708201148226615, + "grad_norm": 4.03125, + "learning_rate": 4.963822993308551e-05, + "loss": 0.7436, + "step": 1300 + }, + { + "epoch": 0.057169829961469645, + "grad_norm": 4.5, + "learning_rate": 4.9637056300783343e-05, + "loss": 0.7619, + "step": 1302 + }, + { + "epoch": 0.05725764844067313, + "grad_norm": 4.1875, + "learning_rate": 4.9635880781769495e-05, + "loss": 0.7431, + "step": 1304 + }, + { + "epoch": 0.05734546691987662, + "grad_norm": 3.875, + "learning_rate": 4.963470337613399e-05, + "loss": 0.7413, + "step": 1306 + }, + { + "epoch": 0.0574332853990801, + "grad_norm": 4.3125, + "learning_rate": 4.9633524083967e-05, + "loss": 0.7325, + "step": 1308 + }, + { + "epoch": 0.05752110387828359, + "grad_norm": 4.0625, + "learning_rate": 4.963234290535883e-05, + "loss": 0.7389, + "step": 1310 + }, + { + "epoch": 0.057608922357487075, + "grad_norm": 4.21875, + "learning_rate": 4.9631159840399935e-05, + "loss": 0.7306, + "step": 1312 + }, + { + "epoch": 0.05769674083669056, + "grad_norm": 3.953125, + "learning_rate": 4.962997488918091e-05, + "loss": 0.7374, + "step": 1314 + }, + { + "epoch": 0.057784559315894046, + "grad_norm": 4.375, + "learning_rate": 4.962878805179251e-05, + "loss": 0.7699, + "step": 1316 + }, + { + "epoch": 0.05787237779509753, + "grad_norm": 5.5625, + "learning_rate": 4.9627599328325606e-05, + "loss": 0.7359, + "step": 1318 + }, + { + "epoch": 0.05796019627430102, + "grad_norm": 4.375, + "learning_rate": 4.962640871887126e-05, + "loss": 0.7165, + "step": 1320 + }, + { + "epoch": 0.058048014753504504, + "grad_norm": 3.90625, + "learning_rate": 4.962521622352061e-05, + "loss": 0.7183, + "step": 1322 + }, + { + "epoch": 0.05813583323270799, + "grad_norm": 4.09375, + "learning_rate": 4.962402184236501e-05, + "loss": 0.7297, + "step": 1324 + }, + { + "epoch": 0.05822365171191148, + "grad_norm": 3.328125, + "learning_rate": 4.962282557549591e-05, + "loss": 0.7111, + "step": 1326 + }, + { + "epoch": 0.05831147019111497, + "grad_norm": 3.796875, + "learning_rate": 4.9621627423004933e-05, + "loss": 0.7451, + "step": 1328 + }, + { + "epoch": 0.058399288670318454, + "grad_norm": 4.03125, + "learning_rate": 4.9620427384983824e-05, + "loss": 0.7387, + "step": 1330 + }, + { + "epoch": 0.05848710714952194, + "grad_norm": 4.40625, + "learning_rate": 4.9619225461524484e-05, + "loss": 0.7362, + "step": 1332 + }, + { + "epoch": 0.058574925628725426, + "grad_norm": 3.8125, + "learning_rate": 4.961802165271895e-05, + "loss": 0.7149, + "step": 1334 + }, + { + "epoch": 0.05866274410792891, + "grad_norm": 4.09375, + "learning_rate": 4.9616815958659425e-05, + "loss": 0.715, + "step": 1336 + }, + { + "epoch": 0.0587505625871324, + "grad_norm": 3.75, + "learning_rate": 4.961560837943823e-05, + "loss": 0.7248, + "step": 1338 + }, + { + "epoch": 0.05883838106633588, + "grad_norm": 4.09375, + "learning_rate": 4.961439891514784e-05, + "loss": 0.7568, + "step": 1340 + }, + { + "epoch": 0.05892619954553937, + "grad_norm": 5.875, + "learning_rate": 4.961318756588088e-05, + "loss": 0.7527, + "step": 1342 + }, + { + "epoch": 0.059014018024742855, + "grad_norm": 8.0625, + "learning_rate": 4.961197433173012e-05, + "loss": 0.7473, + "step": 1344 + }, + { + "epoch": 0.05910183650394634, + "grad_norm": 9.0, + "learning_rate": 4.961075921278846e-05, + "loss": 0.7226, + "step": 1346 + }, + { + "epoch": 0.05918965498314983, + "grad_norm": 5.8125, + "learning_rate": 4.960954220914897e-05, + "loss": 0.7375, + "step": 1348 + }, + { + "epoch": 0.05927747346235331, + "grad_norm": 5.1875, + "learning_rate": 4.9608323320904836e-05, + "loss": 0.7501, + "step": 1350 + }, + { + "epoch": 0.059365291941556805, + "grad_norm": 5.8125, + "learning_rate": 4.9607102548149396e-05, + "loss": 0.7003, + "step": 1352 + }, + { + "epoch": 0.05945311042076029, + "grad_norm": 6.96875, + "learning_rate": 4.960587989097615e-05, + "loss": 0.7404, + "step": 1354 + }, + { + "epoch": 0.05954092889996378, + "grad_norm": 5.1875, + "learning_rate": 4.9604655349478726e-05, + "loss": 0.7491, + "step": 1356 + }, + { + "epoch": 0.05962874737916726, + "grad_norm": 5.25, + "learning_rate": 4.960342892375089e-05, + "loss": 0.7291, + "step": 1358 + }, + { + "epoch": 0.05971656585837075, + "grad_norm": 5.53125, + "learning_rate": 4.960220061388657e-05, + "loss": 0.7123, + "step": 1360 + }, + { + "epoch": 0.059804384337574235, + "grad_norm": 5.9375, + "learning_rate": 4.960097041997984e-05, + "loss": 0.7116, + "step": 1362 + }, + { + "epoch": 0.05989220281677772, + "grad_norm": 7.65625, + "learning_rate": 4.9599738342124884e-05, + "loss": 0.7631, + "step": 1364 + }, + { + "epoch": 0.059980021295981206, + "grad_norm": 3.625, + "learning_rate": 4.959850438041608e-05, + "loss": 0.7352, + "step": 1366 + }, + { + "epoch": 0.06006783977518469, + "grad_norm": 3.84375, + "learning_rate": 4.9597268534947906e-05, + "loss": 0.7008, + "step": 1368 + }, + { + "epoch": 0.06015565825438818, + "grad_norm": 3.546875, + "learning_rate": 4.9596030805815016e-05, + "loss": 0.718, + "step": 1370 + }, + { + "epoch": 0.060243476733591664, + "grad_norm": 4.09375, + "learning_rate": 4.9594791193112186e-05, + "loss": 0.7109, + "step": 1372 + }, + { + "epoch": 0.06033129521279515, + "grad_norm": 4.125, + "learning_rate": 4.959354969693436e-05, + "loss": 0.6931, + "step": 1374 + }, + { + "epoch": 0.060419113691998635, + "grad_norm": 4.375, + "learning_rate": 4.959230631737659e-05, + "loss": 0.7214, + "step": 1376 + }, + { + "epoch": 0.06050693217120213, + "grad_norm": 4.90625, + "learning_rate": 4.9591061054534116e-05, + "loss": 0.7364, + "step": 1378 + }, + { + "epoch": 0.060594750650405614, + "grad_norm": 5.96875, + "learning_rate": 4.9589813908502284e-05, + "loss": 0.7526, + "step": 1380 + }, + { + "epoch": 0.0606825691296091, + "grad_norm": 6.78125, + "learning_rate": 4.958856487937661e-05, + "loss": 0.7243, + "step": 1382 + }, + { + "epoch": 0.060770387608812586, + "grad_norm": 6.59375, + "learning_rate": 4.9587313967252755e-05, + "loss": 0.7135, + "step": 1384 + }, + { + "epoch": 0.06085820608801607, + "grad_norm": 8.5, + "learning_rate": 4.958606117222649e-05, + "loss": 0.7682, + "step": 1386 + }, + { + "epoch": 0.06094602456721956, + "grad_norm": 4.59375, + "learning_rate": 4.958480649439377e-05, + "loss": 0.6967, + "step": 1388 + }, + { + "epoch": 0.06103384304642304, + "grad_norm": 6.46875, + "learning_rate": 4.958354993385068e-05, + "loss": 0.737, + "step": 1390 + }, + { + "epoch": 0.06112166152562653, + "grad_norm": 6.875, + "learning_rate": 4.9582291490693434e-05, + "loss": 0.751, + "step": 1392 + }, + { + "epoch": 0.061209480004830015, + "grad_norm": 8.0, + "learning_rate": 4.958103116501842e-05, + "loss": 0.715, + "step": 1394 + }, + { + "epoch": 0.0612972984840335, + "grad_norm": 4.8125, + "learning_rate": 4.9579768956922145e-05, + "loss": 0.7409, + "step": 1396 + }, + { + "epoch": 0.06138511696323699, + "grad_norm": 4.3125, + "learning_rate": 4.957850486650127e-05, + "loss": 0.7097, + "step": 1398 + }, + { + "epoch": 0.06147293544244047, + "grad_norm": 3.609375, + "learning_rate": 4.957723889385259e-05, + "loss": 0.733, + "step": 1400 + }, + { + "epoch": 0.061560753921643965, + "grad_norm": 4.5, + "learning_rate": 4.957597103907309e-05, + "loss": 0.7219, + "step": 1402 + }, + { + "epoch": 0.06164857240084745, + "grad_norm": 4.96875, + "learning_rate": 4.957470130225982e-05, + "loss": 0.7023, + "step": 1404 + }, + { + "epoch": 0.06173639088005094, + "grad_norm": 6.0, + "learning_rate": 4.957342968351003e-05, + "loss": 0.6959, + "step": 1406 + }, + { + "epoch": 0.06182420935925442, + "grad_norm": 5.875, + "learning_rate": 4.957215618292111e-05, + "loss": 0.7014, + "step": 1408 + }, + { + "epoch": 0.06191202783845791, + "grad_norm": 7.25, + "learning_rate": 4.957088080059058e-05, + "loss": 0.7056, + "step": 1410 + }, + { + "epoch": 0.061999846317661395, + "grad_norm": 5.8125, + "learning_rate": 4.95696035366161e-05, + "loss": 0.7327, + "step": 1412 + }, + { + "epoch": 0.06208766479686488, + "grad_norm": 3.734375, + "learning_rate": 4.95683243910955e-05, + "loss": 0.6793, + "step": 1414 + }, + { + "epoch": 0.062175483276068366, + "grad_norm": 3.3125, + "learning_rate": 4.956704336412673e-05, + "loss": 0.6629, + "step": 1416 + }, + { + "epoch": 0.06226330175527185, + "grad_norm": 4.15625, + "learning_rate": 4.9565760455807887e-05, + "loss": 0.7449, + "step": 1418 + }, + { + "epoch": 0.06235112023447534, + "grad_norm": 4.28125, + "learning_rate": 4.956447566623722e-05, + "loss": 0.723, + "step": 1420 + }, + { + "epoch": 0.062438938713678824, + "grad_norm": 4.03125, + "learning_rate": 4.956318899551311e-05, + "loss": 0.695, + "step": 1422 + }, + { + "epoch": 0.06252675719288231, + "grad_norm": 4.96875, + "learning_rate": 4.956190044373411e-05, + "loss": 0.7091, + "step": 1424 + }, + { + "epoch": 0.0626145756720858, + "grad_norm": 5.46875, + "learning_rate": 4.956061001099888e-05, + "loss": 0.716, + "step": 1426 + }, + { + "epoch": 0.06270239415128928, + "grad_norm": 4.6875, + "learning_rate": 4.955931769740625e-05, + "loss": 0.7345, + "step": 1428 + }, + { + "epoch": 0.06279021263049277, + "grad_norm": 4.4375, + "learning_rate": 4.955802350305518e-05, + "loss": 0.7461, + "step": 1430 + }, + { + "epoch": 0.06287803110969625, + "grad_norm": 5.15625, + "learning_rate": 4.955672742804479e-05, + "loss": 0.7486, + "step": 1432 + }, + { + "epoch": 0.06296584958889974, + "grad_norm": 4.53125, + "learning_rate": 4.955542947247432e-05, + "loss": 0.7167, + "step": 1434 + }, + { + "epoch": 0.06305366806810322, + "grad_norm": 4.4375, + "learning_rate": 4.955412963644318e-05, + "loss": 0.6938, + "step": 1436 + }, + { + "epoch": 0.06314148654730671, + "grad_norm": 4.5, + "learning_rate": 4.9552827920050906e-05, + "loss": 0.7398, + "step": 1438 + }, + { + "epoch": 0.06322930502651021, + "grad_norm": 4.3125, + "learning_rate": 4.955152432339718e-05, + "loss": 0.6898, + "step": 1440 + }, + { + "epoch": 0.0633171235057137, + "grad_norm": 4.375, + "learning_rate": 4.955021884658184e-05, + "loss": 0.7295, + "step": 1442 + }, + { + "epoch": 0.06340494198491718, + "grad_norm": 3.765625, + "learning_rate": 4.9548911489704854e-05, + "loss": 0.7075, + "step": 1444 + }, + { + "epoch": 0.06349276046412067, + "grad_norm": 4.75, + "learning_rate": 4.9547602252866343e-05, + "loss": 0.7071, + "step": 1446 + }, + { + "epoch": 0.06358057894332415, + "grad_norm": 4.53125, + "learning_rate": 4.954629113616656e-05, + "loss": 0.6837, + "step": 1448 + }, + { + "epoch": 0.06366839742252764, + "grad_norm": 4.5, + "learning_rate": 4.954497813970592e-05, + "loss": 0.7418, + "step": 1450 + }, + { + "epoch": 0.06375621590173113, + "grad_norm": 3.671875, + "learning_rate": 4.9543663263584974e-05, + "loss": 0.7308, + "step": 1452 + }, + { + "epoch": 0.06384403438093461, + "grad_norm": 4.375, + "learning_rate": 4.9542346507904415e-05, + "loss": 0.7102, + "step": 1454 + }, + { + "epoch": 0.0639318528601381, + "grad_norm": 3.640625, + "learning_rate": 4.954102787276507e-05, + "loss": 0.7463, + "step": 1456 + }, + { + "epoch": 0.06401967133934158, + "grad_norm": 4.6875, + "learning_rate": 4.9539707358267935e-05, + "loss": 0.7124, + "step": 1458 + }, + { + "epoch": 0.06410748981854507, + "grad_norm": 4.3125, + "learning_rate": 4.9538384964514116e-05, + "loss": 0.7378, + "step": 1460 + }, + { + "epoch": 0.06419530829774855, + "grad_norm": 4.625, + "learning_rate": 4.953706069160491e-05, + "loss": 0.7251, + "step": 1462 + }, + { + "epoch": 0.06428312677695204, + "grad_norm": 4.6875, + "learning_rate": 4.95357345396417e-05, + "loss": 0.7246, + "step": 1464 + }, + { + "epoch": 0.06437094525615553, + "grad_norm": 4.71875, + "learning_rate": 4.9534406508726065e-05, + "loss": 0.722, + "step": 1466 + }, + { + "epoch": 0.06445876373535901, + "grad_norm": 4.65625, + "learning_rate": 4.95330765989597e-05, + "loss": 0.7198, + "step": 1468 + }, + { + "epoch": 0.0645465822145625, + "grad_norm": 3.6875, + "learning_rate": 4.9531744810444443e-05, + "loss": 0.691, + "step": 1470 + }, + { + "epoch": 0.06463440069376598, + "grad_norm": 4.125, + "learning_rate": 4.9530411143282283e-05, + "loss": 0.7077, + "step": 1472 + }, + { + "epoch": 0.06472221917296947, + "grad_norm": 3.796875, + "learning_rate": 4.952907559757537e-05, + "loss": 0.7558, + "step": 1474 + }, + { + "epoch": 0.06481003765217296, + "grad_norm": 5.65625, + "learning_rate": 4.9527738173425965e-05, + "loss": 0.7045, + "step": 1476 + }, + { + "epoch": 0.06489785613137644, + "grad_norm": 5.96875, + "learning_rate": 4.952639887093648e-05, + "loss": 0.7126, + "step": 1478 + }, + { + "epoch": 0.06498567461057993, + "grad_norm": 5.03125, + "learning_rate": 4.95250576902095e-05, + "loss": 0.6971, + "step": 1480 + }, + { + "epoch": 0.06507349308978341, + "grad_norm": 4.5, + "learning_rate": 4.9523714631347716e-05, + "loss": 0.7151, + "step": 1482 + }, + { + "epoch": 0.0651613115689869, + "grad_norm": 4.0625, + "learning_rate": 4.9522369694453996e-05, + "loss": 0.7222, + "step": 1484 + }, + { + "epoch": 0.06524913004819038, + "grad_norm": 4.96875, + "learning_rate": 4.9521022879631325e-05, + "loss": 0.6965, + "step": 1486 + }, + { + "epoch": 0.06533694852739387, + "grad_norm": 3.71875, + "learning_rate": 4.951967418698284e-05, + "loss": 0.6897, + "step": 1488 + }, + { + "epoch": 0.06542476700659736, + "grad_norm": 3.765625, + "learning_rate": 4.951832361661183e-05, + "loss": 0.7026, + "step": 1490 + }, + { + "epoch": 0.06551258548580086, + "grad_norm": 4.90625, + "learning_rate": 4.9516971168621716e-05, + "loss": 0.702, + "step": 1492 + }, + { + "epoch": 0.06560040396500434, + "grad_norm": 4.96875, + "learning_rate": 4.951561684311608e-05, + "loss": 0.698, + "step": 1494 + }, + { + "epoch": 0.06568822244420783, + "grad_norm": 6.3125, + "learning_rate": 4.951426064019862e-05, + "loss": 0.7013, + "step": 1496 + }, + { + "epoch": 0.06577604092341131, + "grad_norm": 4.53125, + "learning_rate": 4.951290255997321e-05, + "loss": 0.6693, + "step": 1498 + }, + { + "epoch": 0.0658638594026148, + "grad_norm": 4.375, + "learning_rate": 4.9511542602543836e-05, + "loss": 0.6918, + "step": 1500 + }, + { + "epoch": 0.06595167788181829, + "grad_norm": 4.34375, + "learning_rate": 4.951018076801467e-05, + "loss": 0.6929, + "step": 1502 + }, + { + "epoch": 0.06603949636102177, + "grad_norm": 3.890625, + "learning_rate": 4.950881705648998e-05, + "loss": 0.7204, + "step": 1504 + }, + { + "epoch": 0.06612731484022526, + "grad_norm": 4.15625, + "learning_rate": 4.9507451468074194e-05, + "loss": 0.7144, + "step": 1506 + }, + { + "epoch": 0.06621513331942874, + "grad_norm": 3.578125, + "learning_rate": 4.950608400287191e-05, + "loss": 0.7081, + "step": 1508 + }, + { + "epoch": 0.06630295179863223, + "grad_norm": 4.03125, + "learning_rate": 4.950471466098784e-05, + "loss": 0.7034, + "step": 1510 + }, + { + "epoch": 0.06639077027783571, + "grad_norm": 3.421875, + "learning_rate": 4.950334344252684e-05, + "loss": 0.6886, + "step": 1512 + }, + { + "epoch": 0.0664785887570392, + "grad_norm": 3.265625, + "learning_rate": 4.950197034759393e-05, + "loss": 0.7148, + "step": 1514 + }, + { + "epoch": 0.06656640723624269, + "grad_norm": 3.421875, + "learning_rate": 4.950059537629425e-05, + "loss": 0.7137, + "step": 1516 + }, + { + "epoch": 0.06665422571544617, + "grad_norm": 4.625, + "learning_rate": 4.949921852873311e-05, + "loss": 0.7077, + "step": 1518 + }, + { + "epoch": 0.06674204419464966, + "grad_norm": 3.375, + "learning_rate": 4.9497839805015945e-05, + "loss": 0.7082, + "step": 1520 + }, + { + "epoch": 0.06682986267385314, + "grad_norm": 3.671875, + "learning_rate": 4.9496459205248325e-05, + "loss": 0.6821, + "step": 1522 + }, + { + "epoch": 0.06691768115305663, + "grad_norm": 3.53125, + "learning_rate": 4.9495076729535994e-05, + "loss": 0.697, + "step": 1524 + }, + { + "epoch": 0.06700549963226012, + "grad_norm": 4.1875, + "learning_rate": 4.9493692377984815e-05, + "loss": 0.6875, + "step": 1526 + }, + { + "epoch": 0.0670933181114636, + "grad_norm": 4.21875, + "learning_rate": 4.94923061507008e-05, + "loss": 0.6914, + "step": 1528 + }, + { + "epoch": 0.06718113659066709, + "grad_norm": 4.21875, + "learning_rate": 4.9490918047790114e-05, + "loss": 0.7216, + "step": 1530 + }, + { + "epoch": 0.06726895506987057, + "grad_norm": 5.0625, + "learning_rate": 4.9489528069359047e-05, + "loss": 0.7153, + "step": 1532 + }, + { + "epoch": 0.06735677354907406, + "grad_norm": 4.875, + "learning_rate": 4.9488136215514045e-05, + "loss": 0.7029, + "step": 1534 + }, + { + "epoch": 0.06744459202827754, + "grad_norm": 3.984375, + "learning_rate": 4.9486742486361714e-05, + "loss": 0.6867, + "step": 1536 + }, + { + "epoch": 0.06753241050748103, + "grad_norm": 3.5, + "learning_rate": 4.9485346882008765e-05, + "loss": 0.6959, + "step": 1538 + }, + { + "epoch": 0.06762022898668452, + "grad_norm": 6.3125, + "learning_rate": 4.948394940256209e-05, + "loss": 0.7248, + "step": 1540 + }, + { + "epoch": 0.06770804746588802, + "grad_norm": 5.96875, + "learning_rate": 4.948255004812869e-05, + "loss": 0.7015, + "step": 1542 + }, + { + "epoch": 0.0677958659450915, + "grad_norm": 5.5625, + "learning_rate": 4.9481148818815746e-05, + "loss": 0.7161, + "step": 1544 + }, + { + "epoch": 0.06788368442429499, + "grad_norm": 5.0, + "learning_rate": 4.947974571473055e-05, + "loss": 0.7092, + "step": 1546 + }, + { + "epoch": 0.06797150290349847, + "grad_norm": 3.65625, + "learning_rate": 4.9478340735980565e-05, + "loss": 0.7008, + "step": 1548 + }, + { + "epoch": 0.06805932138270196, + "grad_norm": 4.5, + "learning_rate": 4.947693388267338e-05, + "loss": 0.7142, + "step": 1550 + }, + { + "epoch": 0.06814713986190545, + "grad_norm": 4.5, + "learning_rate": 4.947552515491673e-05, + "loss": 0.6736, + "step": 1552 + }, + { + "epoch": 0.06823495834110893, + "grad_norm": 5.3125, + "learning_rate": 4.94741145528185e-05, + "loss": 0.664, + "step": 1554 + }, + { + "epoch": 0.06832277682031242, + "grad_norm": 4.71875, + "learning_rate": 4.94727020764867e-05, + "loss": 0.7062, + "step": 1556 + }, + { + "epoch": 0.0684105952995159, + "grad_norm": 3.78125, + "learning_rate": 4.947128772602951e-05, + "loss": 0.6652, + "step": 1558 + }, + { + "epoch": 0.06849841377871939, + "grad_norm": 4.15625, + "learning_rate": 4.946987150155525e-05, + "loss": 0.7125, + "step": 1560 + }, + { + "epoch": 0.06858623225792287, + "grad_norm": 5.3125, + "learning_rate": 4.9468453403172356e-05, + "loss": 0.6811, + "step": 1562 + }, + { + "epoch": 0.06867405073712636, + "grad_norm": 4.28125, + "learning_rate": 4.946703343098944e-05, + "loss": 0.7141, + "step": 1564 + }, + { + "epoch": 0.06876186921632985, + "grad_norm": 3.578125, + "learning_rate": 4.9465611585115235e-05, + "loss": 0.7154, + "step": 1566 + }, + { + "epoch": 0.06884968769553333, + "grad_norm": 4.0, + "learning_rate": 4.946418786565863e-05, + "loss": 0.7087, + "step": 1568 + }, + { + "epoch": 0.06893750617473682, + "grad_norm": 3.640625, + "learning_rate": 4.946276227272865e-05, + "loss": 0.6681, + "step": 1570 + }, + { + "epoch": 0.0690253246539403, + "grad_norm": 4.71875, + "learning_rate": 4.9461334806434475e-05, + "loss": 0.6754, + "step": 1572 + }, + { + "epoch": 0.06911314313314379, + "grad_norm": 5.8125, + "learning_rate": 4.945990546688542e-05, + "loss": 0.6888, + "step": 1574 + }, + { + "epoch": 0.06920096161234728, + "grad_norm": 5.21875, + "learning_rate": 4.945847425419094e-05, + "loss": 0.6945, + "step": 1576 + }, + { + "epoch": 0.06928878009155076, + "grad_norm": 4.5625, + "learning_rate": 4.945704116846064e-05, + "loss": 0.6836, + "step": 1578 + }, + { + "epoch": 0.06937659857075425, + "grad_norm": 3.828125, + "learning_rate": 4.945560620980426e-05, + "loss": 0.6874, + "step": 1580 + }, + { + "epoch": 0.06946441704995773, + "grad_norm": 3.65625, + "learning_rate": 4.945416937833169e-05, + "loss": 0.6591, + "step": 1582 + }, + { + "epoch": 0.06955223552916122, + "grad_norm": 3.125, + "learning_rate": 4.945273067415298e-05, + "loss": 0.68, + "step": 1584 + }, + { + "epoch": 0.0696400540083647, + "grad_norm": 3.078125, + "learning_rate": 4.945129009737828e-05, + "loss": 0.6588, + "step": 1586 + }, + { + "epoch": 0.06972787248756819, + "grad_norm": 3.515625, + "learning_rate": 4.944984764811793e-05, + "loss": 0.667, + "step": 1588 + }, + { + "epoch": 0.06981569096677168, + "grad_norm": 4.625, + "learning_rate": 4.9448403326482386e-05, + "loss": 0.6798, + "step": 1590 + }, + { + "epoch": 0.06990350944597518, + "grad_norm": 5.28125, + "learning_rate": 4.944695713258225e-05, + "loss": 0.6867, + "step": 1592 + }, + { + "epoch": 0.06999132792517866, + "grad_norm": 4.84375, + "learning_rate": 4.944550906652828e-05, + "loss": 0.6764, + "step": 1594 + }, + { + "epoch": 0.07007914640438215, + "grad_norm": 6.84375, + "learning_rate": 4.944405912843136e-05, + "loss": 0.6841, + "step": 1596 + }, + { + "epoch": 0.07016696488358563, + "grad_norm": 8.1875, + "learning_rate": 4.9442607318402543e-05, + "loss": 0.7033, + "step": 1598 + }, + { + "epoch": 0.07025478336278912, + "grad_norm": 4.46875, + "learning_rate": 4.944115363655299e-05, + "loss": 0.6732, + "step": 1600 + }, + { + "epoch": 0.0703426018419926, + "grad_norm": 3.78125, + "learning_rate": 4.943969808299404e-05, + "loss": 0.6968, + "step": 1602 + }, + { + "epoch": 0.07043042032119609, + "grad_norm": 3.71875, + "learning_rate": 4.943824065783714e-05, + "loss": 0.6775, + "step": 1604 + }, + { + "epoch": 0.07051823880039958, + "grad_norm": 4.125, + "learning_rate": 4.9436781361193926e-05, + "loss": 0.646, + "step": 1606 + }, + { + "epoch": 0.07060605727960306, + "grad_norm": 6.15625, + "learning_rate": 4.943532019317613e-05, + "loss": 0.7055, + "step": 1608 + }, + { + "epoch": 0.07069387575880655, + "grad_norm": 4.8125, + "learning_rate": 4.943385715389566e-05, + "loss": 0.6727, + "step": 1610 + }, + { + "epoch": 0.07078169423801003, + "grad_norm": 4.65625, + "learning_rate": 4.9432392243464546e-05, + "loss": 0.6792, + "step": 1612 + }, + { + "epoch": 0.07086951271721352, + "grad_norm": 3.921875, + "learning_rate": 4.943092546199498e-05, + "loss": 0.651, + "step": 1614 + }, + { + "epoch": 0.070957331196417, + "grad_norm": 4.0625, + "learning_rate": 4.9429456809599286e-05, + "loss": 0.6823, + "step": 1616 + }, + { + "epoch": 0.07104514967562049, + "grad_norm": 3.828125, + "learning_rate": 4.942798628638994e-05, + "loss": 0.6846, + "step": 1618 + }, + { + "epoch": 0.07113296815482398, + "grad_norm": 3.90625, + "learning_rate": 4.942651389247954e-05, + "loss": 0.6784, + "step": 1620 + }, + { + "epoch": 0.07122078663402746, + "grad_norm": 3.28125, + "learning_rate": 4.942503962798085e-05, + "loss": 0.6826, + "step": 1622 + }, + { + "epoch": 0.07130860511323095, + "grad_norm": 3.453125, + "learning_rate": 4.9423563493006776e-05, + "loss": 0.7006, + "step": 1624 + }, + { + "epoch": 0.07139642359243444, + "grad_norm": 4.1875, + "learning_rate": 4.9422085487670344e-05, + "loss": 0.6968, + "step": 1626 + }, + { + "epoch": 0.07148424207163792, + "grad_norm": 4.34375, + "learning_rate": 4.942060561208476e-05, + "loss": 0.7066, + "step": 1628 + }, + { + "epoch": 0.07157206055084141, + "grad_norm": 4.8125, + "learning_rate": 4.941912386636335e-05, + "loss": 0.6673, + "step": 1630 + }, + { + "epoch": 0.07165987903004489, + "grad_norm": 4.40625, + "learning_rate": 4.941764025061957e-05, + "loss": 0.6775, + "step": 1632 + }, + { + "epoch": 0.07174769750924838, + "grad_norm": 4.65625, + "learning_rate": 4.9416154764967046e-05, + "loss": 0.6848, + "step": 1634 + }, + { + "epoch": 0.07183551598845186, + "grad_norm": 3.90625, + "learning_rate": 4.941466740951954e-05, + "loss": 0.6707, + "step": 1636 + }, + { + "epoch": 0.07192333446765535, + "grad_norm": 3.5, + "learning_rate": 4.941317818439095e-05, + "loss": 0.6565, + "step": 1638 + }, + { + "epoch": 0.07201115294685884, + "grad_norm": 3.734375, + "learning_rate": 4.941168708969533e-05, + "loss": 0.6783, + "step": 1640 + }, + { + "epoch": 0.07209897142606234, + "grad_norm": 3.71875, + "learning_rate": 4.941019412554686e-05, + "loss": 0.6587, + "step": 1642 + }, + { + "epoch": 0.07218678990526582, + "grad_norm": 3.59375, + "learning_rate": 4.9408699292059865e-05, + "loss": 0.6873, + "step": 1644 + }, + { + "epoch": 0.07227460838446931, + "grad_norm": 4.25, + "learning_rate": 4.940720258934883e-05, + "loss": 0.6964, + "step": 1646 + }, + { + "epoch": 0.0723624268636728, + "grad_norm": 4.4375, + "learning_rate": 4.940570401752836e-05, + "loss": 0.6669, + "step": 1648 + }, + { + "epoch": 0.07245024534287628, + "grad_norm": 4.25, + "learning_rate": 4.940420357671324e-05, + "loss": 0.7105, + "step": 1650 + }, + { + "epoch": 0.07253806382207977, + "grad_norm": 3.4375, + "learning_rate": 4.940270126701836e-05, + "loss": 0.6843, + "step": 1652 + }, + { + "epoch": 0.07262588230128325, + "grad_norm": 3.6875, + "learning_rate": 4.940119708855876e-05, + "loss": 0.6931, + "step": 1654 + }, + { + "epoch": 0.07271370078048674, + "grad_norm": 3.96875, + "learning_rate": 4.939969104144964e-05, + "loss": 0.6722, + "step": 1656 + }, + { + "epoch": 0.07280151925969022, + "grad_norm": 5.46875, + "learning_rate": 4.939818312580633e-05, + "loss": 0.708, + "step": 1658 + }, + { + "epoch": 0.07288933773889371, + "grad_norm": 5.3125, + "learning_rate": 4.939667334174431e-05, + "loss": 0.7094, + "step": 1660 + }, + { + "epoch": 0.0729771562180972, + "grad_norm": 4.28125, + "learning_rate": 4.93951616893792e-05, + "loss": 0.7017, + "step": 1662 + }, + { + "epoch": 0.07306497469730068, + "grad_norm": 4.5, + "learning_rate": 4.939364816882676e-05, + "loss": 0.6851, + "step": 1664 + }, + { + "epoch": 0.07315279317650417, + "grad_norm": 3.640625, + "learning_rate": 4.939213278020288e-05, + "loss": 0.6595, + "step": 1666 + }, + { + "epoch": 0.07324061165570765, + "grad_norm": 3.359375, + "learning_rate": 4.939061552362364e-05, + "loss": 0.6878, + "step": 1668 + }, + { + "epoch": 0.07332843013491114, + "grad_norm": 3.515625, + "learning_rate": 4.938909639920521e-05, + "loss": 0.6921, + "step": 1670 + }, + { + "epoch": 0.07341624861411462, + "grad_norm": 3.59375, + "learning_rate": 4.9387575407063936e-05, + "loss": 0.6692, + "step": 1672 + }, + { + "epoch": 0.07350406709331811, + "grad_norm": 3.796875, + "learning_rate": 4.938605254731629e-05, + "loss": 0.6822, + "step": 1674 + }, + { + "epoch": 0.0735918855725216, + "grad_norm": 3.984375, + "learning_rate": 4.93845278200789e-05, + "loss": 0.6804, + "step": 1676 + }, + { + "epoch": 0.07367970405172508, + "grad_norm": 4.25, + "learning_rate": 4.938300122546851e-05, + "loss": 0.6923, + "step": 1678 + }, + { + "epoch": 0.07376752253092857, + "grad_norm": 4.8125, + "learning_rate": 4.938147276360205e-05, + "loss": 0.6968, + "step": 1680 + }, + { + "epoch": 0.07385534101013205, + "grad_norm": 4.71875, + "learning_rate": 4.937994243459656e-05, + "loss": 0.6804, + "step": 1682 + }, + { + "epoch": 0.07394315948933554, + "grad_norm": 4.21875, + "learning_rate": 4.937841023856923e-05, + "loss": 0.6759, + "step": 1684 + }, + { + "epoch": 0.07403097796853902, + "grad_norm": 4.90625, + "learning_rate": 4.937687617563741e-05, + "loss": 0.6791, + "step": 1686 + }, + { + "epoch": 0.07411879644774251, + "grad_norm": 4.53125, + "learning_rate": 4.937534024591856e-05, + "loss": 0.6773, + "step": 1688 + }, + { + "epoch": 0.074206614926946, + "grad_norm": 5.78125, + "learning_rate": 4.9373802449530316e-05, + "loss": 0.7157, + "step": 1690 + }, + { + "epoch": 0.07429443340614948, + "grad_norm": 5.34375, + "learning_rate": 4.9372262786590436e-05, + "loss": 0.6767, + "step": 1692 + }, + { + "epoch": 0.07438225188535298, + "grad_norm": 4.15625, + "learning_rate": 4.9370721257216824e-05, + "loss": 0.6637, + "step": 1694 + }, + { + "epoch": 0.07447007036455647, + "grad_norm": 4.21875, + "learning_rate": 4.936917786152754e-05, + "loss": 0.6797, + "step": 1696 + }, + { + "epoch": 0.07455788884375995, + "grad_norm": 4.21875, + "learning_rate": 4.936763259964078e-05, + "loss": 0.7028, + "step": 1698 + }, + { + "epoch": 0.07464570732296344, + "grad_norm": 4.15625, + "learning_rate": 4.9366085471674864e-05, + "loss": 0.6994, + "step": 1700 + }, + { + "epoch": 0.07473352580216693, + "grad_norm": 4.1875, + "learning_rate": 4.936453647774829e-05, + "loss": 0.693, + "step": 1702 + }, + { + "epoch": 0.07482134428137041, + "grad_norm": 3.9375, + "learning_rate": 4.936298561797966e-05, + "loss": 0.6734, + "step": 1704 + }, + { + "epoch": 0.0749091627605739, + "grad_norm": 3.609375, + "learning_rate": 4.936143289248776e-05, + "loss": 0.681, + "step": 1706 + }, + { + "epoch": 0.07499698123977738, + "grad_norm": 4.15625, + "learning_rate": 4.9359878301391495e-05, + "loss": 0.6764, + "step": 1708 + }, + { + "epoch": 0.07508479971898087, + "grad_norm": 4.625, + "learning_rate": 4.93583218448099e-05, + "loss": 0.6641, + "step": 1710 + }, + { + "epoch": 0.07517261819818435, + "grad_norm": 4.0, + "learning_rate": 4.935676352286218e-05, + "loss": 0.6791, + "step": 1712 + }, + { + "epoch": 0.07526043667738784, + "grad_norm": 5.03125, + "learning_rate": 4.935520333566768e-05, + "loss": 0.6714, + "step": 1714 + }, + { + "epoch": 0.07534825515659133, + "grad_norm": 4.90625, + "learning_rate": 4.9353641283345863e-05, + "loss": 0.7237, + "step": 1716 + }, + { + "epoch": 0.07543607363579481, + "grad_norm": 4.46875, + "learning_rate": 4.9352077366016355e-05, + "loss": 0.6888, + "step": 1718 + }, + { + "epoch": 0.0755238921149983, + "grad_norm": 4.0, + "learning_rate": 4.935051158379893e-05, + "loss": 0.6826, + "step": 1720 + }, + { + "epoch": 0.07561171059420178, + "grad_norm": 3.25, + "learning_rate": 4.934894393681349e-05, + "loss": 0.666, + "step": 1722 + }, + { + "epoch": 0.07569952907340527, + "grad_norm": 4.0625, + "learning_rate": 4.934737442518009e-05, + "loss": 0.6854, + "step": 1724 + }, + { + "epoch": 0.07578734755260876, + "grad_norm": 3.21875, + "learning_rate": 4.9345803049018914e-05, + "loss": 0.692, + "step": 1726 + }, + { + "epoch": 0.07587516603181224, + "grad_norm": 3.59375, + "learning_rate": 4.9344229808450305e-05, + "loss": 0.6561, + "step": 1728 + }, + { + "epoch": 0.07596298451101573, + "grad_norm": 4.125, + "learning_rate": 4.934265470359474e-05, + "loss": 0.6502, + "step": 1730 + }, + { + "epoch": 0.07605080299021921, + "grad_norm": 3.859375, + "learning_rate": 4.934107773457285e-05, + "loss": 0.6824, + "step": 1732 + }, + { + "epoch": 0.0761386214694227, + "grad_norm": 3.625, + "learning_rate": 4.933949890150539e-05, + "loss": 0.6821, + "step": 1734 + }, + { + "epoch": 0.07622643994862618, + "grad_norm": 3.90625, + "learning_rate": 4.933791820451327e-05, + "loss": 0.6656, + "step": 1736 + }, + { + "epoch": 0.07631425842782967, + "grad_norm": 3.65625, + "learning_rate": 4.933633564371753e-05, + "loss": 0.6663, + "step": 1738 + }, + { + "epoch": 0.07640207690703316, + "grad_norm": 3.625, + "learning_rate": 4.933475121923938e-05, + "loss": 0.6612, + "step": 1740 + }, + { + "epoch": 0.07648989538623664, + "grad_norm": 3.46875, + "learning_rate": 4.933316493120015e-05, + "loss": 0.662, + "step": 1742 + }, + { + "epoch": 0.07657771386544014, + "grad_norm": 4.21875, + "learning_rate": 4.9331576779721314e-05, + "loss": 0.6569, + "step": 1744 + }, + { + "epoch": 0.07666553234464363, + "grad_norm": 3.921875, + "learning_rate": 4.93299867649245e-05, + "loss": 0.6772, + "step": 1746 + }, + { + "epoch": 0.07675335082384711, + "grad_norm": 3.078125, + "learning_rate": 4.9328394886931456e-05, + "loss": 0.6619, + "step": 1748 + }, + { + "epoch": 0.0768411693030506, + "grad_norm": 3.609375, + "learning_rate": 4.932680114586411e-05, + "loss": 0.6713, + "step": 1750 + }, + { + "epoch": 0.07692898778225409, + "grad_norm": 3.25, + "learning_rate": 4.9325205541844497e-05, + "loss": 0.6795, + "step": 1752 + }, + { + "epoch": 0.07701680626145757, + "grad_norm": 3.359375, + "learning_rate": 4.932360807499481e-05, + "loss": 0.6586, + "step": 1754 + }, + { + "epoch": 0.07710462474066106, + "grad_norm": 3.515625, + "learning_rate": 4.9322008745437385e-05, + "loss": 0.675, + "step": 1756 + }, + { + "epoch": 0.07719244321986454, + "grad_norm": 3.40625, + "learning_rate": 4.932040755329471e-05, + "loss": 0.6854, + "step": 1758 + }, + { + "epoch": 0.07728026169906803, + "grad_norm": 3.28125, + "learning_rate": 4.9318804498689384e-05, + "loss": 0.694, + "step": 1760 + }, + { + "epoch": 0.07736808017827151, + "grad_norm": 3.296875, + "learning_rate": 4.9317199581744187e-05, + "loss": 0.6677, + "step": 1762 + }, + { + "epoch": 0.077455898657475, + "grad_norm": 3.4375, + "learning_rate": 4.931559280258201e-05, + "loss": 0.6727, + "step": 1764 + }, + { + "epoch": 0.07754371713667849, + "grad_norm": 3.953125, + "learning_rate": 4.931398416132591e-05, + "loss": 0.6514, + "step": 1766 + }, + { + "epoch": 0.07763153561588197, + "grad_norm": 3.640625, + "learning_rate": 4.9312373658099076e-05, + "loss": 0.6559, + "step": 1768 + }, + { + "epoch": 0.07771935409508546, + "grad_norm": 4.21875, + "learning_rate": 4.931076129302484e-05, + "loss": 0.633, + "step": 1770 + }, + { + "epoch": 0.07780717257428894, + "grad_norm": 3.625, + "learning_rate": 4.930914706622668e-05, + "loss": 0.6615, + "step": 1772 + }, + { + "epoch": 0.07789499105349243, + "grad_norm": 3.75, + "learning_rate": 4.93075309778282e-05, + "loss": 0.6607, + "step": 1774 + }, + { + "epoch": 0.07798280953269592, + "grad_norm": 3.40625, + "learning_rate": 4.930591302795318e-05, + "loss": 0.652, + "step": 1776 + }, + { + "epoch": 0.0780706280118994, + "grad_norm": 3.40625, + "learning_rate": 4.9304293216725505e-05, + "loss": 0.7057, + "step": 1778 + }, + { + "epoch": 0.07815844649110289, + "grad_norm": 3.453125, + "learning_rate": 4.930267154426924e-05, + "loss": 0.6711, + "step": 1780 + }, + { + "epoch": 0.07824626497030637, + "grad_norm": 3.796875, + "learning_rate": 4.9301048010708556e-05, + "loss": 0.6659, + "step": 1782 + }, + { + "epoch": 0.07833408344950986, + "grad_norm": 3.859375, + "learning_rate": 4.929942261616779e-05, + "loss": 0.6763, + "step": 1784 + }, + { + "epoch": 0.07842190192871334, + "grad_norm": 3.328125, + "learning_rate": 4.929779536077142e-05, + "loss": 0.6799, + "step": 1786 + }, + { + "epoch": 0.07850972040791683, + "grad_norm": 3.328125, + "learning_rate": 4.929616624464405e-05, + "loss": 0.6641, + "step": 1788 + }, + { + "epoch": 0.07859753888712032, + "grad_norm": 3.65625, + "learning_rate": 4.9294535267910446e-05, + "loss": 0.6557, + "step": 1790 + }, + { + "epoch": 0.0786853573663238, + "grad_norm": 5.125, + "learning_rate": 4.929290243069551e-05, + "loss": 0.6742, + "step": 1792 + }, + { + "epoch": 0.0787731758455273, + "grad_norm": 3.484375, + "learning_rate": 4.929126773312428e-05, + "loss": 0.6501, + "step": 1794 + }, + { + "epoch": 0.07886099432473079, + "grad_norm": 3.90625, + "learning_rate": 4.928963117532195e-05, + "loss": 0.671, + "step": 1796 + }, + { + "epoch": 0.07894881280393427, + "grad_norm": 3.25, + "learning_rate": 4.928799275741384e-05, + "loss": 0.6796, + "step": 1798 + }, + { + "epoch": 0.07903663128313776, + "grad_norm": 3.40625, + "learning_rate": 4.928635247952541e-05, + "loss": 0.6689, + "step": 1800 + }, + { + "epoch": 0.07912444976234125, + "grad_norm": 4.71875, + "learning_rate": 4.92847103417823e-05, + "loss": 0.6507, + "step": 1802 + }, + { + "epoch": 0.07921226824154473, + "grad_norm": 4.71875, + "learning_rate": 4.928306634431025e-05, + "loss": 0.6544, + "step": 1804 + }, + { + "epoch": 0.07930008672074822, + "grad_norm": 4.34375, + "learning_rate": 4.9281420487235144e-05, + "loss": 0.6738, + "step": 1806 + }, + { + "epoch": 0.0793879051999517, + "grad_norm": 3.984375, + "learning_rate": 4.927977277068305e-05, + "loss": 0.6526, + "step": 1808 + }, + { + "epoch": 0.07947572367915519, + "grad_norm": 3.890625, + "learning_rate": 4.9278123194780134e-05, + "loss": 0.7137, + "step": 1810 + }, + { + "epoch": 0.07956354215835867, + "grad_norm": 3.234375, + "learning_rate": 4.927647175965272e-05, + "loss": 0.6777, + "step": 1812 + }, + { + "epoch": 0.07965136063756216, + "grad_norm": 3.765625, + "learning_rate": 4.9274818465427285e-05, + "loss": 0.6403, + "step": 1814 + }, + { + "epoch": 0.07973917911676565, + "grad_norm": 3.828125, + "learning_rate": 4.927316331223043e-05, + "loss": 0.6577, + "step": 1816 + }, + { + "epoch": 0.07982699759596913, + "grad_norm": 4.25, + "learning_rate": 4.927150630018891e-05, + "loss": 0.646, + "step": 1818 + }, + { + "epoch": 0.07991481607517262, + "grad_norm": 3.375, + "learning_rate": 4.926984742942961e-05, + "loss": 0.6679, + "step": 1820 + }, + { + "epoch": 0.0800026345543761, + "grad_norm": 3.5625, + "learning_rate": 4.9268186700079594e-05, + "loss": 0.6644, + "step": 1822 + }, + { + "epoch": 0.08009045303357959, + "grad_norm": 3.53125, + "learning_rate": 4.926652411226601e-05, + "loss": 0.6556, + "step": 1824 + }, + { + "epoch": 0.08017827151278308, + "grad_norm": 3.578125, + "learning_rate": 4.92648596661162e-05, + "loss": 0.6853, + "step": 1826 + }, + { + "epoch": 0.08026608999198656, + "grad_norm": 4.40625, + "learning_rate": 4.926319336175762e-05, + "loss": 0.6583, + "step": 1828 + }, + { + "epoch": 0.08035390847119005, + "grad_norm": 4.21875, + "learning_rate": 4.926152519931787e-05, + "loss": 0.6808, + "step": 1830 + }, + { + "epoch": 0.08044172695039353, + "grad_norm": 4.28125, + "learning_rate": 4.925985517892471e-05, + "loss": 0.7052, + "step": 1832 + }, + { + "epoch": 0.08052954542959702, + "grad_norm": 3.328125, + "learning_rate": 4.9258183300706016e-05, + "loss": 0.6652, + "step": 1834 + }, + { + "epoch": 0.0806173639088005, + "grad_norm": 3.3125, + "learning_rate": 4.9256509564789836e-05, + "loss": 0.6473, + "step": 1836 + }, + { + "epoch": 0.08070518238800399, + "grad_norm": 3.265625, + "learning_rate": 4.9254833971304334e-05, + "loss": 0.6697, + "step": 1838 + }, + { + "epoch": 0.08079300086720748, + "grad_norm": 4.375, + "learning_rate": 4.925315652037784e-05, + "loss": 0.6887, + "step": 1840 + }, + { + "epoch": 0.08088081934641096, + "grad_norm": 3.890625, + "learning_rate": 4.925147721213881e-05, + "loss": 0.652, + "step": 1842 + }, + { + "epoch": 0.08096863782561446, + "grad_norm": 4.125, + "learning_rate": 4.924979604671583e-05, + "loss": 0.6581, + "step": 1844 + }, + { + "epoch": 0.08105645630481795, + "grad_norm": 3.5, + "learning_rate": 4.924811302423766e-05, + "loss": 0.6593, + "step": 1846 + }, + { + "epoch": 0.08114427478402143, + "grad_norm": 5.65625, + "learning_rate": 4.924642814483318e-05, + "loss": 0.6676, + "step": 1848 + }, + { + "epoch": 0.08123209326322492, + "grad_norm": 4.8125, + "learning_rate": 4.924474140863142e-05, + "loss": 0.6476, + "step": 1850 + }, + { + "epoch": 0.0813199117424284, + "grad_norm": 3.953125, + "learning_rate": 4.924305281576156e-05, + "loss": 0.6434, + "step": 1852 + }, + { + "epoch": 0.08140773022163189, + "grad_norm": 4.09375, + "learning_rate": 4.924136236635289e-05, + "loss": 0.647, + "step": 1854 + }, + { + "epoch": 0.08149554870083538, + "grad_norm": 4.75, + "learning_rate": 4.923967006053489e-05, + "loss": 0.6646, + "step": 1856 + }, + { + "epoch": 0.08158336718003886, + "grad_norm": 4.34375, + "learning_rate": 4.9237975898437144e-05, + "loss": 0.6349, + "step": 1858 + }, + { + "epoch": 0.08167118565924235, + "grad_norm": 4.09375, + "learning_rate": 4.923627988018939e-05, + "loss": 0.6667, + "step": 1860 + }, + { + "epoch": 0.08175900413844583, + "grad_norm": 4.0, + "learning_rate": 4.9234582005921514e-05, + "loss": 0.6273, + "step": 1862 + }, + { + "epoch": 0.08184682261764932, + "grad_norm": 3.359375, + "learning_rate": 4.923288227576354e-05, + "loss": 0.6516, + "step": 1864 + }, + { + "epoch": 0.0819346410968528, + "grad_norm": 3.265625, + "learning_rate": 4.923118068984564e-05, + "loss": 0.6475, + "step": 1866 + }, + { + "epoch": 0.08202245957605629, + "grad_norm": 3.171875, + "learning_rate": 4.92294772482981e-05, + "loss": 0.6575, + "step": 1868 + }, + { + "epoch": 0.08211027805525978, + "grad_norm": 3.375, + "learning_rate": 4.922777195125139e-05, + "loss": 0.6461, + "step": 1870 + }, + { + "epoch": 0.08219809653446326, + "grad_norm": 3.46875, + "learning_rate": 4.922606479883609e-05, + "loss": 0.664, + "step": 1872 + }, + { + "epoch": 0.08228591501366675, + "grad_norm": 4.125, + "learning_rate": 4.9224355791182955e-05, + "loss": 0.6572, + "step": 1874 + }, + { + "epoch": 0.08237373349287024, + "grad_norm": 4.28125, + "learning_rate": 4.922264492842283e-05, + "loss": 0.6558, + "step": 1876 + }, + { + "epoch": 0.08246155197207372, + "grad_norm": 3.65625, + "learning_rate": 4.922093221068676e-05, + "loss": 0.6248, + "step": 1878 + }, + { + "epoch": 0.08254937045127721, + "grad_norm": 3.34375, + "learning_rate": 4.92192176381059e-05, + "loss": 0.6539, + "step": 1880 + }, + { + "epoch": 0.08263718893048069, + "grad_norm": 3.03125, + "learning_rate": 4.9217501210811536e-05, + "loss": 0.662, + "step": 1882 + }, + { + "epoch": 0.08272500740968418, + "grad_norm": 3.734375, + "learning_rate": 4.9215782928935126e-05, + "loss": 0.6361, + "step": 1884 + }, + { + "epoch": 0.08281282588888766, + "grad_norm": 3.875, + "learning_rate": 4.921406279260826e-05, + "loss": 0.6498, + "step": 1886 + }, + { + "epoch": 0.08290064436809115, + "grad_norm": 3.078125, + "learning_rate": 4.9212340801962655e-05, + "loss": 0.6539, + "step": 1888 + }, + { + "epoch": 0.08298846284729464, + "grad_norm": 3.75, + "learning_rate": 4.9210616957130185e-05, + "loss": 0.6717, + "step": 1890 + }, + { + "epoch": 0.08307628132649812, + "grad_norm": 3.3125, + "learning_rate": 4.9208891258242874e-05, + "loss": 0.6808, + "step": 1892 + }, + { + "epoch": 0.08316409980570161, + "grad_norm": 3.59375, + "learning_rate": 4.9207163705432855e-05, + "loss": 0.6205, + "step": 1894 + }, + { + "epoch": 0.08325191828490511, + "grad_norm": 3.28125, + "learning_rate": 4.920543429883245e-05, + "loss": 0.6618, + "step": 1896 + }, + { + "epoch": 0.0833397367641086, + "grad_norm": 3.34375, + "learning_rate": 4.9203703038574076e-05, + "loss": 0.6543, + "step": 1898 + }, + { + "epoch": 0.08342755524331208, + "grad_norm": 3.40625, + "learning_rate": 4.9201969924790324e-05, + "loss": 0.6634, + "step": 1900 + }, + { + "epoch": 0.08351537372251557, + "grad_norm": 4.28125, + "learning_rate": 4.9200234957613915e-05, + "loss": 0.6681, + "step": 1902 + }, + { + "epoch": 0.08360319220171905, + "grad_norm": 5.09375, + "learning_rate": 4.9198498137177705e-05, + "loss": 0.6415, + "step": 1904 + }, + { + "epoch": 0.08369101068092254, + "grad_norm": 5.5625, + "learning_rate": 4.919675946361472e-05, + "loss": 0.6455, + "step": 1906 + }, + { + "epoch": 0.08377882916012602, + "grad_norm": 7.09375, + "learning_rate": 4.919501893705808e-05, + "loss": 0.6539, + "step": 1908 + }, + { + "epoch": 0.08386664763932951, + "grad_norm": 6.65625, + "learning_rate": 4.91932765576411e-05, + "loss": 0.669, + "step": 1910 + }, + { + "epoch": 0.083954466118533, + "grad_norm": 5.625, + "learning_rate": 4.91915323254972e-05, + "loss": 0.6528, + "step": 1912 + }, + { + "epoch": 0.08404228459773648, + "grad_norm": 3.5625, + "learning_rate": 4.918978624075995e-05, + "loss": 0.6532, + "step": 1914 + }, + { + "epoch": 0.08413010307693997, + "grad_norm": 3.0625, + "learning_rate": 4.918803830356308e-05, + "loss": 0.6647, + "step": 1916 + }, + { + "epoch": 0.08421792155614345, + "grad_norm": 3.8125, + "learning_rate": 4.918628851404043e-05, + "loss": 0.7015, + "step": 1918 + }, + { + "epoch": 0.08430574003534694, + "grad_norm": 3.9375, + "learning_rate": 4.918453687232601e-05, + "loss": 0.6318, + "step": 1920 + }, + { + "epoch": 0.08439355851455042, + "grad_norm": 3.9375, + "learning_rate": 4.918278337855396e-05, + "loss": 0.6575, + "step": 1922 + }, + { + "epoch": 0.08448137699375391, + "grad_norm": 5.21875, + "learning_rate": 4.918102803285856e-05, + "loss": 0.6639, + "step": 1924 + }, + { + "epoch": 0.0845691954729574, + "grad_norm": 3.875, + "learning_rate": 4.917927083537423e-05, + "loss": 0.6426, + "step": 1926 + }, + { + "epoch": 0.08465701395216088, + "grad_norm": 3.265625, + "learning_rate": 4.9177511786235556e-05, + "loss": 0.6318, + "step": 1928 + }, + { + "epoch": 0.08474483243136437, + "grad_norm": 3.09375, + "learning_rate": 4.917575088557723e-05, + "loss": 0.6358, + "step": 1930 + }, + { + "epoch": 0.08483265091056785, + "grad_norm": 3.5, + "learning_rate": 4.917398813353411e-05, + "loss": 0.6354, + "step": 1932 + }, + { + "epoch": 0.08492046938977134, + "grad_norm": 4.5625, + "learning_rate": 4.917222353024118e-05, + "loss": 0.6469, + "step": 1934 + }, + { + "epoch": 0.08500828786897482, + "grad_norm": 4.28125, + "learning_rate": 4.9170457075833574e-05, + "loss": 0.6359, + "step": 1936 + }, + { + "epoch": 0.08509610634817831, + "grad_norm": 4.15625, + "learning_rate": 4.916868877044657e-05, + "loss": 0.6663, + "step": 1938 + }, + { + "epoch": 0.0851839248273818, + "grad_norm": 5.78125, + "learning_rate": 4.916691861421559e-05, + "loss": 0.6841, + "step": 1940 + }, + { + "epoch": 0.08527174330658528, + "grad_norm": 4.625, + "learning_rate": 4.916514660727619e-05, + "loss": 0.6488, + "step": 1942 + }, + { + "epoch": 0.08535956178578877, + "grad_norm": 4.3125, + "learning_rate": 4.916337274976407e-05, + "loss": 0.6324, + "step": 1944 + }, + { + "epoch": 0.08544738026499227, + "grad_norm": 3.84375, + "learning_rate": 4.9161597041815075e-05, + "loss": 0.6443, + "step": 1946 + }, + { + "epoch": 0.08553519874419575, + "grad_norm": 4.03125, + "learning_rate": 4.9159819483565175e-05, + "loss": 0.6394, + "step": 1948 + }, + { + "epoch": 0.08562301722339924, + "grad_norm": 3.96875, + "learning_rate": 4.915804007515052e-05, + "loss": 0.6319, + "step": 1950 + }, + { + "epoch": 0.08571083570260273, + "grad_norm": 3.984375, + "learning_rate": 4.915625881670736e-05, + "loss": 0.6318, + "step": 1952 + }, + { + "epoch": 0.08579865418180621, + "grad_norm": 3.4375, + "learning_rate": 4.915447570837211e-05, + "loss": 0.6436, + "step": 1954 + }, + { + "epoch": 0.0858864726610097, + "grad_norm": 3.53125, + "learning_rate": 4.9152690750281314e-05, + "loss": 0.6554, + "step": 1956 + }, + { + "epoch": 0.08597429114021318, + "grad_norm": 3.4375, + "learning_rate": 4.915090394257168e-05, + "loss": 0.6466, + "step": 1958 + }, + { + "epoch": 0.08606210961941667, + "grad_norm": 3.046875, + "learning_rate": 4.914911528538003e-05, + "loss": 0.6518, + "step": 1960 + }, + { + "epoch": 0.08614992809862015, + "grad_norm": 3.46875, + "learning_rate": 4.914732477884334e-05, + "loss": 0.6549, + "step": 1962 + }, + { + "epoch": 0.08623774657782364, + "grad_norm": 4.65625, + "learning_rate": 4.914553242309873e-05, + "loss": 0.6223, + "step": 1964 + }, + { + "epoch": 0.08632556505702713, + "grad_norm": 3.953125, + "learning_rate": 4.9143738218283466e-05, + "loss": 0.6593, + "step": 1966 + }, + { + "epoch": 0.08641338353623061, + "grad_norm": 3.25, + "learning_rate": 4.9141942164534936e-05, + "loss": 0.6879, + "step": 1968 + }, + { + "epoch": 0.0865012020154341, + "grad_norm": 4.0625, + "learning_rate": 4.9140144261990687e-05, + "loss": 0.6215, + "step": 1970 + }, + { + "epoch": 0.08658902049463758, + "grad_norm": 3.5625, + "learning_rate": 4.91383445107884e-05, + "loss": 0.6731, + "step": 1972 + }, + { + "epoch": 0.08667683897384107, + "grad_norm": 3.125, + "learning_rate": 4.913654291106591e-05, + "loss": 0.6251, + "step": 1974 + }, + { + "epoch": 0.08676465745304456, + "grad_norm": 3.25, + "learning_rate": 4.9134739462961174e-05, + "loss": 0.6461, + "step": 1976 + }, + { + "epoch": 0.08685247593224804, + "grad_norm": 3.5, + "learning_rate": 4.913293416661231e-05, + "loss": 0.6691, + "step": 1978 + }, + { + "epoch": 0.08694029441145153, + "grad_norm": 3.125, + "learning_rate": 4.913112702215756e-05, + "loss": 0.6742, + "step": 1980 + }, + { + "epoch": 0.08702811289065501, + "grad_norm": 4.4375, + "learning_rate": 4.9129318029735315e-05, + "loss": 0.6441, + "step": 1982 + }, + { + "epoch": 0.0871159313698585, + "grad_norm": 4.0625, + "learning_rate": 4.912750718948411e-05, + "loss": 0.6627, + "step": 1984 + }, + { + "epoch": 0.08720374984906198, + "grad_norm": 4.96875, + "learning_rate": 4.912569450154263e-05, + "loss": 0.667, + "step": 1986 + }, + { + "epoch": 0.08729156832826547, + "grad_norm": 4.75, + "learning_rate": 4.912387996604968e-05, + "loss": 0.6616, + "step": 1988 + }, + { + "epoch": 0.08737938680746896, + "grad_norm": 4.90625, + "learning_rate": 4.9122063583144204e-05, + "loss": 0.6596, + "step": 1990 + }, + { + "epoch": 0.08746720528667244, + "grad_norm": 4.9375, + "learning_rate": 4.912024535296533e-05, + "loss": 0.6549, + "step": 1992 + }, + { + "epoch": 0.08755502376587593, + "grad_norm": 3.3125, + "learning_rate": 4.9118425275652286e-05, + "loss": 0.6287, + "step": 1994 + }, + { + "epoch": 0.08764284224507943, + "grad_norm": 3.78125, + "learning_rate": 4.911660335134445e-05, + "loss": 0.6587, + "step": 1996 + }, + { + "epoch": 0.08773066072428291, + "grad_norm": 3.09375, + "learning_rate": 4.9114779580181345e-05, + "loss": 0.6227, + "step": 1998 + }, + { + "epoch": 0.0878184792034864, + "grad_norm": 3.921875, + "learning_rate": 4.9112953962302646e-05, + "loss": 0.6558, + "step": 2000 + }, + { + "epoch": 0.08790629768268989, + "grad_norm": 4.65625, + "learning_rate": 4.9111126497848144e-05, + "loss": 0.6681, + "step": 2002 + }, + { + "epoch": 0.08799411616189337, + "grad_norm": 5.34375, + "learning_rate": 4.9109297186957796e-05, + "loss": 0.6741, + "step": 2004 + }, + { + "epoch": 0.08808193464109686, + "grad_norm": 3.90625, + "learning_rate": 4.91074660297717e-05, + "loss": 0.6307, + "step": 2006 + }, + { + "epoch": 0.08816975312030034, + "grad_norm": 4.3125, + "learning_rate": 4.910563302643007e-05, + "loss": 0.6463, + "step": 2008 + }, + { + "epoch": 0.08825757159950383, + "grad_norm": 3.984375, + "learning_rate": 4.910379817707328e-05, + "loss": 0.6463, + "step": 2010 + }, + { + "epoch": 0.08834539007870731, + "grad_norm": 3.59375, + "learning_rate": 4.910196148184185e-05, + "loss": 0.6553, + "step": 2012 + }, + { + "epoch": 0.0884332085579108, + "grad_norm": 3.0625, + "learning_rate": 4.9100122940876433e-05, + "loss": 0.6324, + "step": 2014 + }, + { + "epoch": 0.08852102703711429, + "grad_norm": 3.265625, + "learning_rate": 4.9098282554317823e-05, + "loss": 0.6297, + "step": 2016 + }, + { + "epoch": 0.08860884551631777, + "grad_norm": 3.765625, + "learning_rate": 4.9096440322306956e-05, + "loss": 0.6239, + "step": 2018 + }, + { + "epoch": 0.08869666399552126, + "grad_norm": 3.140625, + "learning_rate": 4.909459624498491e-05, + "loss": 0.6589, + "step": 2020 + }, + { + "epoch": 0.08878448247472474, + "grad_norm": 3.734375, + "learning_rate": 4.909275032249292e-05, + "loss": 0.6233, + "step": 2022 + }, + { + "epoch": 0.08887230095392823, + "grad_norm": 3.484375, + "learning_rate": 4.909090255497233e-05, + "loss": 0.6599, + "step": 2024 + }, + { + "epoch": 0.08896011943313172, + "grad_norm": 3.28125, + "learning_rate": 4.908905294256464e-05, + "loss": 0.652, + "step": 2026 + }, + { + "epoch": 0.0890479379123352, + "grad_norm": 3.515625, + "learning_rate": 4.908720148541152e-05, + "loss": 0.632, + "step": 2028 + }, + { + "epoch": 0.08913575639153869, + "grad_norm": 3.484375, + "learning_rate": 4.9085348183654714e-05, + "loss": 0.6419, + "step": 2030 + }, + { + "epoch": 0.08922357487074217, + "grad_norm": 3.15625, + "learning_rate": 4.908349303743618e-05, + "loss": 0.6383, + "step": 2032 + }, + { + "epoch": 0.08931139334994566, + "grad_norm": 3.203125, + "learning_rate": 4.908163604689798e-05, + "loss": 0.6406, + "step": 2034 + }, + { + "epoch": 0.08939921182914914, + "grad_norm": 3.09375, + "learning_rate": 4.907977721218231e-05, + "loss": 0.6367, + "step": 2036 + }, + { + "epoch": 0.08948703030835263, + "grad_norm": 3.765625, + "learning_rate": 4.907791653343153e-05, + "loss": 0.6365, + "step": 2038 + }, + { + "epoch": 0.08957484878755612, + "grad_norm": 3.0, + "learning_rate": 4.907605401078814e-05, + "loss": 0.6033, + "step": 2040 + }, + { + "epoch": 0.0896626672667596, + "grad_norm": 3.28125, + "learning_rate": 4.907418964439475e-05, + "loss": 0.6359, + "step": 2042 + }, + { + "epoch": 0.08975048574596309, + "grad_norm": 3.296875, + "learning_rate": 4.907232343439415e-05, + "loss": 0.6408, + "step": 2044 + }, + { + "epoch": 0.08983830422516659, + "grad_norm": 2.9375, + "learning_rate": 4.907045538092926e-05, + "loss": 0.6232, + "step": 2046 + }, + { + "epoch": 0.08992612270437007, + "grad_norm": 3.171875, + "learning_rate": 4.906858548414311e-05, + "loss": 0.6624, + "step": 2048 + }, + { + "epoch": 0.09001394118357356, + "grad_norm": 3.5625, + "learning_rate": 4.906671374417893e-05, + "loss": 0.6527, + "step": 2050 + }, + { + "epoch": 0.09010175966277705, + "grad_norm": 4.875, + "learning_rate": 4.906484016118004e-05, + "loss": 0.6559, + "step": 2052 + }, + { + "epoch": 0.09018957814198053, + "grad_norm": 3.453125, + "learning_rate": 4.906296473528991e-05, + "loss": 0.6431, + "step": 2054 + }, + { + "epoch": 0.09027739662118402, + "grad_norm": 3.609375, + "learning_rate": 4.9061087466652183e-05, + "loss": 0.6458, + "step": 2056 + }, + { + "epoch": 0.0903652151003875, + "grad_norm": 3.015625, + "learning_rate": 4.905920835541061e-05, + "loss": 0.6394, + "step": 2058 + }, + { + "epoch": 0.09045303357959099, + "grad_norm": 3.90625, + "learning_rate": 4.9057327401709084e-05, + "loss": 0.6526, + "step": 2060 + }, + { + "epoch": 0.09054085205879447, + "grad_norm": 4.71875, + "learning_rate": 4.905544460569167e-05, + "loss": 0.668, + "step": 2062 + }, + { + "epoch": 0.09062867053799796, + "grad_norm": 3.546875, + "learning_rate": 4.9053559967502535e-05, + "loss": 0.6496, + "step": 2064 + }, + { + "epoch": 0.09071648901720145, + "grad_norm": 2.921875, + "learning_rate": 4.905167348728601e-05, + "loss": 0.6582, + "step": 2066 + }, + { + "epoch": 0.09080430749640493, + "grad_norm": 3.3125, + "learning_rate": 4.904978516518657e-05, + "loss": 0.6153, + "step": 2068 + }, + { + "epoch": 0.09089212597560842, + "grad_norm": 4.65625, + "learning_rate": 4.904789500134881e-05, + "loss": 0.6161, + "step": 2070 + }, + { + "epoch": 0.0909799444548119, + "grad_norm": 6.375, + "learning_rate": 4.90460029959175e-05, + "loss": 0.6207, + "step": 2072 + }, + { + "epoch": 0.09106776293401539, + "grad_norm": 5.5, + "learning_rate": 4.90441091490375e-05, + "loss": 0.6453, + "step": 2074 + }, + { + "epoch": 0.09115558141321888, + "grad_norm": 5.8125, + "learning_rate": 4.904221346085387e-05, + "loss": 0.6337, + "step": 2076 + }, + { + "epoch": 0.09124339989242236, + "grad_norm": 4.8125, + "learning_rate": 4.904031593151176e-05, + "loss": 0.6836, + "step": 2078 + }, + { + "epoch": 0.09133121837162585, + "grad_norm": 3.25, + "learning_rate": 4.90384165611565e-05, + "loss": 0.6126, + "step": 2080 + }, + { + "epoch": 0.09141903685082933, + "grad_norm": 3.546875, + "learning_rate": 4.9036515349933534e-05, + "loss": 0.6287, + "step": 2082 + }, + { + "epoch": 0.09150685533003282, + "grad_norm": 3.015625, + "learning_rate": 4.903461229798846e-05, + "loss": 0.6347, + "step": 2084 + }, + { + "epoch": 0.0915946738092363, + "grad_norm": 3.640625, + "learning_rate": 4.903270740546701e-05, + "loss": 0.6548, + "step": 2086 + }, + { + "epoch": 0.09168249228843979, + "grad_norm": 3.796875, + "learning_rate": 4.9030800672515075e-05, + "loss": 0.639, + "step": 2088 + }, + { + "epoch": 0.09177031076764328, + "grad_norm": 2.890625, + "learning_rate": 4.902889209927866e-05, + "loss": 0.6299, + "step": 2090 + }, + { + "epoch": 0.09185812924684676, + "grad_norm": 3.34375, + "learning_rate": 4.902698168590393e-05, + "loss": 0.6132, + "step": 2092 + }, + { + "epoch": 0.09194594772605025, + "grad_norm": 3.234375, + "learning_rate": 4.902506943253717e-05, + "loss": 0.6099, + "step": 2094 + }, + { + "epoch": 0.09203376620525373, + "grad_norm": 3.4375, + "learning_rate": 4.902315533932485e-05, + "loss": 0.6253, + "step": 2096 + }, + { + "epoch": 0.09212158468445723, + "grad_norm": 3.0625, + "learning_rate": 4.9021239406413534e-05, + "loss": 0.6292, + "step": 2098 + }, + { + "epoch": 0.09220940316366072, + "grad_norm": 4.09375, + "learning_rate": 4.901932163394994e-05, + "loss": 0.6073, + "step": 2100 + }, + { + "epoch": 0.0922972216428642, + "grad_norm": 3.84375, + "learning_rate": 4.901740202208094e-05, + "loss": 0.6577, + "step": 2102 + }, + { + "epoch": 0.09238504012206769, + "grad_norm": 3.3125, + "learning_rate": 4.901548057095353e-05, + "loss": 0.6495, + "step": 2104 + }, + { + "epoch": 0.09247285860127118, + "grad_norm": 3.40625, + "learning_rate": 4.9013557280714874e-05, + "loss": 0.6435, + "step": 2106 + }, + { + "epoch": 0.09256067708047466, + "grad_norm": 3.484375, + "learning_rate": 4.901163215151223e-05, + "loss": 0.6131, + "step": 2108 + }, + { + "epoch": 0.09264849555967815, + "grad_norm": 3.34375, + "learning_rate": 4.900970518349305e-05, + "loss": 0.6703, + "step": 2110 + }, + { + "epoch": 0.09273631403888163, + "grad_norm": 2.828125, + "learning_rate": 4.900777637680489e-05, + "loss": 0.6473, + "step": 2112 + }, + { + "epoch": 0.09282413251808512, + "grad_norm": 3.015625, + "learning_rate": 4.9005845731595456e-05, + "loss": 0.6411, + "step": 2114 + }, + { + "epoch": 0.0929119509972886, + "grad_norm": 3.34375, + "learning_rate": 4.9003913248012605e-05, + "loss": 0.6409, + "step": 2116 + }, + { + "epoch": 0.09299976947649209, + "grad_norm": 3.03125, + "learning_rate": 4.900197892620432e-05, + "loss": 0.6326, + "step": 2118 + }, + { + "epoch": 0.09308758795569558, + "grad_norm": 3.0625, + "learning_rate": 4.9000042766318744e-05, + "loss": 0.617, + "step": 2120 + }, + { + "epoch": 0.09317540643489906, + "grad_norm": 4.28125, + "learning_rate": 4.899810476850413e-05, + "loss": 0.627, + "step": 2122 + }, + { + "epoch": 0.09326322491410255, + "grad_norm": 3.171875, + "learning_rate": 4.899616493290891e-05, + "loss": 0.6311, + "step": 2124 + }, + { + "epoch": 0.09335104339330604, + "grad_norm": 3.0625, + "learning_rate": 4.8994223259681615e-05, + "loss": 0.6436, + "step": 2126 + }, + { + "epoch": 0.09343886187250952, + "grad_norm": 3.4375, + "learning_rate": 4.899227974897095e-05, + "loss": 0.6159, + "step": 2128 + }, + { + "epoch": 0.093526680351713, + "grad_norm": 3.515625, + "learning_rate": 4.899033440092576e-05, + "loss": 0.6209, + "step": 2130 + }, + { + "epoch": 0.09361449883091649, + "grad_norm": 3.84375, + "learning_rate": 4.8988387215695007e-05, + "loss": 0.643, + "step": 2132 + }, + { + "epoch": 0.09370231731011998, + "grad_norm": 3.859375, + "learning_rate": 4.898643819342781e-05, + "loss": 0.6131, + "step": 2134 + }, + { + "epoch": 0.09379013578932346, + "grad_norm": 3.53125, + "learning_rate": 4.898448733427343e-05, + "loss": 0.6001, + "step": 2136 + }, + { + "epoch": 0.09387795426852695, + "grad_norm": 3.328125, + "learning_rate": 4.898253463838126e-05, + "loss": 0.6278, + "step": 2138 + }, + { + "epoch": 0.09396577274773044, + "grad_norm": 3.75, + "learning_rate": 4.898058010590083e-05, + "loss": 0.6501, + "step": 2140 + }, + { + "epoch": 0.09405359122693392, + "grad_norm": 3.234375, + "learning_rate": 4.897862373698184e-05, + "loss": 0.6412, + "step": 2142 + }, + { + "epoch": 0.09414140970613741, + "grad_norm": 3.796875, + "learning_rate": 4.8976665531774094e-05, + "loss": 0.6422, + "step": 2144 + }, + { + "epoch": 0.0942292281853409, + "grad_norm": 6.4375, + "learning_rate": 4.897470549042754e-05, + "loss": 0.6464, + "step": 2146 + }, + { + "epoch": 0.0943170466645444, + "grad_norm": 4.4375, + "learning_rate": 4.8972743613092304e-05, + "loss": 0.615, + "step": 2148 + }, + { + "epoch": 0.09440486514374788, + "grad_norm": 4.375, + "learning_rate": 4.897077989991862e-05, + "loss": 0.6224, + "step": 2150 + }, + { + "epoch": 0.09449268362295137, + "grad_norm": 5.0, + "learning_rate": 4.896881435105685e-05, + "loss": 0.6222, + "step": 2152 + }, + { + "epoch": 0.09458050210215485, + "grad_norm": 4.625, + "learning_rate": 4.896684696665754e-05, + "loss": 0.632, + "step": 2154 + }, + { + "epoch": 0.09466832058135834, + "grad_norm": 4.21875, + "learning_rate": 4.896487774687135e-05, + "loss": 0.6198, + "step": 2156 + }, + { + "epoch": 0.09475613906056182, + "grad_norm": 4.28125, + "learning_rate": 4.8962906691849066e-05, + "loss": 0.6298, + "step": 2158 + }, + { + "epoch": 0.09484395753976531, + "grad_norm": 3.4375, + "learning_rate": 4.8960933801741646e-05, + "loss": 0.6407, + "step": 2160 + }, + { + "epoch": 0.0949317760189688, + "grad_norm": 3.328125, + "learning_rate": 4.895895907670017e-05, + "loss": 0.6368, + "step": 2162 + }, + { + "epoch": 0.09501959449817228, + "grad_norm": 3.640625, + "learning_rate": 4.895698251687587e-05, + "loss": 0.6023, + "step": 2164 + }, + { + "epoch": 0.09510741297737577, + "grad_norm": 3.359375, + "learning_rate": 4.895500412242011e-05, + "loss": 0.611, + "step": 2166 + }, + { + "epoch": 0.09519523145657925, + "grad_norm": 3.859375, + "learning_rate": 4.895302389348438e-05, + "loss": 0.6478, + "step": 2168 + }, + { + "epoch": 0.09528304993578274, + "grad_norm": 4.34375, + "learning_rate": 4.8951041830220344e-05, + "loss": 0.6434, + "step": 2170 + }, + { + "epoch": 0.09537086841498622, + "grad_norm": 3.765625, + "learning_rate": 4.8949057932779784e-05, + "loss": 0.6129, + "step": 2172 + }, + { + "epoch": 0.09545868689418971, + "grad_norm": 5.375, + "learning_rate": 4.894707220131463e-05, + "loss": 0.6264, + "step": 2174 + }, + { + "epoch": 0.0955465053733932, + "grad_norm": 4.78125, + "learning_rate": 4.8945084635976944e-05, + "loss": 0.64, + "step": 2176 + }, + { + "epoch": 0.09563432385259668, + "grad_norm": 3.09375, + "learning_rate": 4.894309523691893e-05, + "loss": 0.6259, + "step": 2178 + }, + { + "epoch": 0.09572214233180017, + "grad_norm": 4.03125, + "learning_rate": 4.8941104004292955e-05, + "loss": 0.6351, + "step": 2180 + }, + { + "epoch": 0.09580996081100365, + "grad_norm": 4.09375, + "learning_rate": 4.8939110938251485e-05, + "loss": 0.6288, + "step": 2182 + }, + { + "epoch": 0.09589777929020714, + "grad_norm": 4.34375, + "learning_rate": 4.8937116038947164e-05, + "loss": 0.612, + "step": 2184 + }, + { + "epoch": 0.09598559776941062, + "grad_norm": 3.46875, + "learning_rate": 4.8935119306532764e-05, + "loss": 0.6268, + "step": 2186 + }, + { + "epoch": 0.09607341624861411, + "grad_norm": 3.703125, + "learning_rate": 4.893312074116119e-05, + "loss": 0.609, + "step": 2188 + }, + { + "epoch": 0.0961612347278176, + "grad_norm": 3.625, + "learning_rate": 4.893112034298548e-05, + "loss": 0.6281, + "step": 2190 + }, + { + "epoch": 0.09624905320702108, + "grad_norm": 3.265625, + "learning_rate": 4.892911811215885e-05, + "loss": 0.6052, + "step": 2192 + }, + { + "epoch": 0.09633687168622457, + "grad_norm": 3.484375, + "learning_rate": 4.8927114048834613e-05, + "loss": 0.6247, + "step": 2194 + }, + { + "epoch": 0.09642469016542805, + "grad_norm": 2.796875, + "learning_rate": 4.892510815316625e-05, + "loss": 0.6106, + "step": 2196 + }, + { + "epoch": 0.09651250864463155, + "grad_norm": 3.328125, + "learning_rate": 4.8923100425307365e-05, + "loss": 0.6224, + "step": 2198 + }, + { + "epoch": 0.09660032712383504, + "grad_norm": 3.078125, + "learning_rate": 4.892109086541172e-05, + "loss": 0.6136, + "step": 2200 + }, + { + "epoch": 0.09668814560303853, + "grad_norm": 3.0625, + "learning_rate": 4.89190794736332e-05, + "loss": 0.6062, + "step": 2202 + }, + { + "epoch": 0.09677596408224201, + "grad_norm": 3.265625, + "learning_rate": 4.8917066250125834e-05, + "loss": 0.65, + "step": 2204 + }, + { + "epoch": 0.0968637825614455, + "grad_norm": 3.40625, + "learning_rate": 4.891505119504381e-05, + "loss": 0.5928, + "step": 2206 + }, + { + "epoch": 0.09695160104064898, + "grad_norm": 3.84375, + "learning_rate": 4.8913034308541425e-05, + "loss": 0.6516, + "step": 2208 + }, + { + "epoch": 0.09703941951985247, + "grad_norm": 2.828125, + "learning_rate": 4.8911015590773145e-05, + "loss": 0.6375, + "step": 2210 + }, + { + "epoch": 0.09712723799905595, + "grad_norm": 3.28125, + "learning_rate": 4.890899504189356e-05, + "loss": 0.6432, + "step": 2212 + }, + { + "epoch": 0.09721505647825944, + "grad_norm": 4.1875, + "learning_rate": 4.8906972662057406e-05, + "loss": 0.6211, + "step": 2214 + }, + { + "epoch": 0.09730287495746293, + "grad_norm": 3.46875, + "learning_rate": 4.890494845141955e-05, + "loss": 0.6126, + "step": 2216 + }, + { + "epoch": 0.09739069343666641, + "grad_norm": 3.265625, + "learning_rate": 4.890292241013501e-05, + "loss": 0.6094, + "step": 2218 + }, + { + "epoch": 0.0974785119158699, + "grad_norm": 3.328125, + "learning_rate": 4.8900894538358944e-05, + "loss": 0.6204, + "step": 2220 + }, + { + "epoch": 0.09756633039507338, + "grad_norm": 2.953125, + "learning_rate": 4.889886483624664e-05, + "loss": 0.6227, + "step": 2222 + }, + { + "epoch": 0.09765414887427687, + "grad_norm": 4.09375, + "learning_rate": 4.889683330395355e-05, + "loss": 0.6102, + "step": 2224 + }, + { + "epoch": 0.09774196735348036, + "grad_norm": 3.625, + "learning_rate": 4.889479994163523e-05, + "loss": 0.6213, + "step": 2226 + }, + { + "epoch": 0.09782978583268384, + "grad_norm": 3.125, + "learning_rate": 4.8892764749447395e-05, + "loss": 0.61, + "step": 2228 + }, + { + "epoch": 0.09791760431188733, + "grad_norm": 3.0, + "learning_rate": 4.8890727727545916e-05, + "loss": 0.6275, + "step": 2230 + }, + { + "epoch": 0.09800542279109081, + "grad_norm": 3.0, + "learning_rate": 4.8888688876086786e-05, + "loss": 0.6525, + "step": 2232 + }, + { + "epoch": 0.0980932412702943, + "grad_norm": 3.671875, + "learning_rate": 4.8886648195226124e-05, + "loss": 0.6094, + "step": 2234 + }, + { + "epoch": 0.09818105974949778, + "grad_norm": 4.5625, + "learning_rate": 4.8884605685120224e-05, + "loss": 0.6006, + "step": 2236 + }, + { + "epoch": 0.09826887822870127, + "grad_norm": 5.5, + "learning_rate": 4.888256134592549e-05, + "loss": 0.6413, + "step": 2238 + }, + { + "epoch": 0.09835669670790476, + "grad_norm": 6.125, + "learning_rate": 4.888051517779849e-05, + "loss": 0.6073, + "step": 2240 + }, + { + "epoch": 0.09844451518710824, + "grad_norm": 4.875, + "learning_rate": 4.8878467180895906e-05, + "loss": 0.5941, + "step": 2242 + }, + { + "epoch": 0.09853233366631173, + "grad_norm": 4.53125, + "learning_rate": 4.887641735537459e-05, + "loss": 0.6117, + "step": 2244 + }, + { + "epoch": 0.09862015214551521, + "grad_norm": 5.46875, + "learning_rate": 4.88743657013915e-05, + "loss": 0.6266, + "step": 2246 + }, + { + "epoch": 0.09870797062471871, + "grad_norm": 4.84375, + "learning_rate": 4.887231221910376e-05, + "loss": 0.6203, + "step": 2248 + }, + { + "epoch": 0.0987957891039222, + "grad_norm": 3.109375, + "learning_rate": 4.8870256908668646e-05, + "loss": 0.618, + "step": 2250 + }, + { + "epoch": 0.09888360758312568, + "grad_norm": 2.890625, + "learning_rate": 4.886819977024352e-05, + "loss": 0.6034, + "step": 2252 + }, + { + "epoch": 0.09897142606232917, + "grad_norm": 3.25, + "learning_rate": 4.886614080398594e-05, + "loss": 0.6134, + "step": 2254 + }, + { + "epoch": 0.09905924454153266, + "grad_norm": 3.09375, + "learning_rate": 4.886408001005357e-05, + "loss": 0.6086, + "step": 2256 + }, + { + "epoch": 0.09914706302073614, + "grad_norm": 3.953125, + "learning_rate": 4.886201738860423e-05, + "loss": 0.6463, + "step": 2258 + }, + { + "epoch": 0.09923488149993963, + "grad_norm": 4.3125, + "learning_rate": 4.885995293979589e-05, + "loss": 0.6026, + "step": 2260 + }, + { + "epoch": 0.09932269997914311, + "grad_norm": 4.78125, + "learning_rate": 4.8857886663786626e-05, + "loss": 0.6457, + "step": 2262 + }, + { + "epoch": 0.0994105184583466, + "grad_norm": 4.375, + "learning_rate": 4.885581856073468e-05, + "loss": 0.5953, + "step": 2264 + }, + { + "epoch": 0.09949833693755009, + "grad_norm": 5.03125, + "learning_rate": 4.8853748630798434e-05, + "loss": 0.606, + "step": 2266 + }, + { + "epoch": 0.09958615541675357, + "grad_norm": 6.4375, + "learning_rate": 4.88516768741364e-05, + "loss": 0.6011, + "step": 2268 + }, + { + "epoch": 0.09967397389595706, + "grad_norm": 5.1875, + "learning_rate": 4.8849603290907234e-05, + "loss": 0.6245, + "step": 2270 + }, + { + "epoch": 0.09976179237516054, + "grad_norm": 3.234375, + "learning_rate": 4.884752788126973e-05, + "loss": 0.6454, + "step": 2272 + }, + { + "epoch": 0.09984961085436403, + "grad_norm": 3.15625, + "learning_rate": 4.884545064538283e-05, + "loss": 0.6246, + "step": 2274 + }, + { + "epoch": 0.09993742933356752, + "grad_norm": 2.96875, + "learning_rate": 4.884337158340559e-05, + "loss": 0.6095, + "step": 2276 + }, + { + "epoch": 0.100025247812771, + "grad_norm": 3.6875, + "learning_rate": 4.884129069549726e-05, + "loss": 0.6126, + "step": 2278 + }, + { + "epoch": 0.10011306629197449, + "grad_norm": 3.796875, + "learning_rate": 4.883920798181715e-05, + "loss": 0.621, + "step": 2280 + }, + { + "epoch": 0.10020088477117797, + "grad_norm": 3.9375, + "learning_rate": 4.883712344252479e-05, + "loss": 0.604, + "step": 2282 + }, + { + "epoch": 0.10028870325038146, + "grad_norm": 3.6875, + "learning_rate": 4.88350370777798e-05, + "loss": 0.607, + "step": 2284 + }, + { + "epoch": 0.10037652172958494, + "grad_norm": 2.984375, + "learning_rate": 4.8832948887741956e-05, + "loss": 0.6061, + "step": 2286 + }, + { + "epoch": 0.10046434020878843, + "grad_norm": 3.296875, + "learning_rate": 4.883085887257117e-05, + "loss": 0.6153, + "step": 2288 + }, + { + "epoch": 0.10055215868799192, + "grad_norm": 3.890625, + "learning_rate": 4.882876703242751e-05, + "loss": 0.6025, + "step": 2290 + }, + { + "epoch": 0.1006399771671954, + "grad_norm": 4.03125, + "learning_rate": 4.882667336747115e-05, + "loss": 0.5971, + "step": 2292 + }, + { + "epoch": 0.10072779564639889, + "grad_norm": 3.015625, + "learning_rate": 4.882457787786243e-05, + "loss": 0.5883, + "step": 2294 + }, + { + "epoch": 0.10081561412560237, + "grad_norm": 3.09375, + "learning_rate": 4.882248056376183e-05, + "loss": 0.6166, + "step": 2296 + }, + { + "epoch": 0.10090343260480586, + "grad_norm": 3.5625, + "learning_rate": 4.882038142532995e-05, + "loss": 0.6313, + "step": 2298 + }, + { + "epoch": 0.10099125108400936, + "grad_norm": 2.859375, + "learning_rate": 4.881828046272756e-05, + "loss": 0.5937, + "step": 2300 + }, + { + "epoch": 0.10107906956321284, + "grad_norm": 2.84375, + "learning_rate": 4.881617767611554e-05, + "loss": 0.6125, + "step": 2302 + }, + { + "epoch": 0.10116688804241633, + "grad_norm": 3.390625, + "learning_rate": 4.881407306565492e-05, + "loss": 0.596, + "step": 2304 + }, + { + "epoch": 0.10125470652161982, + "grad_norm": 3.296875, + "learning_rate": 4.881196663150689e-05, + "loss": 0.6083, + "step": 2306 + }, + { + "epoch": 0.1013425250008233, + "grad_norm": 3.484375, + "learning_rate": 4.8809858373832726e-05, + "loss": 0.597, + "step": 2308 + }, + { + "epoch": 0.10143034348002679, + "grad_norm": 3.25, + "learning_rate": 4.880774829279392e-05, + "loss": 0.6188, + "step": 2310 + }, + { + "epoch": 0.10151816195923027, + "grad_norm": 2.9375, + "learning_rate": 4.8805636388552035e-05, + "loss": 0.6203, + "step": 2312 + }, + { + "epoch": 0.10160598043843376, + "grad_norm": 3.078125, + "learning_rate": 4.8803522661268805e-05, + "loss": 0.6083, + "step": 2314 + }, + { + "epoch": 0.10169379891763725, + "grad_norm": 3.140625, + "learning_rate": 4.88014071111061e-05, + "loss": 0.6002, + "step": 2316 + }, + { + "epoch": 0.10178161739684073, + "grad_norm": 3.65625, + "learning_rate": 4.8799289738225936e-05, + "loss": 0.6068, + "step": 2318 + }, + { + "epoch": 0.10186943587604422, + "grad_norm": 3.28125, + "learning_rate": 4.879717054279047e-05, + "loss": 0.6208, + "step": 2320 + }, + { + "epoch": 0.1019572543552477, + "grad_norm": 2.953125, + "learning_rate": 4.879504952496197e-05, + "loss": 0.6341, + "step": 2322 + }, + { + "epoch": 0.10204507283445119, + "grad_norm": 3.4375, + "learning_rate": 4.8792926684902875e-05, + "loss": 0.6172, + "step": 2324 + }, + { + "epoch": 0.10213289131365468, + "grad_norm": 3.34375, + "learning_rate": 4.879080202277575e-05, + "loss": 0.5982, + "step": 2326 + }, + { + "epoch": 0.10222070979285816, + "grad_norm": 4.09375, + "learning_rate": 4.87886755387433e-05, + "loss": 0.6081, + "step": 2328 + }, + { + "epoch": 0.10230852827206165, + "grad_norm": 3.265625, + "learning_rate": 4.878654723296838e-05, + "loss": 0.6505, + "step": 2330 + }, + { + "epoch": 0.10239634675126513, + "grad_norm": 3.453125, + "learning_rate": 4.878441710561397e-05, + "loss": 0.6074, + "step": 2332 + }, + { + "epoch": 0.10248416523046862, + "grad_norm": 3.125, + "learning_rate": 4.878228515684319e-05, + "loss": 0.6145, + "step": 2334 + }, + { + "epoch": 0.1025719837096721, + "grad_norm": 3.734375, + "learning_rate": 4.878015138681932e-05, + "loss": 0.6188, + "step": 2336 + }, + { + "epoch": 0.10265980218887559, + "grad_norm": 4.1875, + "learning_rate": 4.877801579570575e-05, + "loss": 0.6209, + "step": 2338 + }, + { + "epoch": 0.10274762066807908, + "grad_norm": 2.765625, + "learning_rate": 4.8775878383666035e-05, + "loss": 0.6161, + "step": 2340 + }, + { + "epoch": 0.10283543914728256, + "grad_norm": 2.890625, + "learning_rate": 4.877373915086385e-05, + "loss": 0.6258, + "step": 2342 + }, + { + "epoch": 0.10292325762648605, + "grad_norm": 3.59375, + "learning_rate": 4.8771598097463026e-05, + "loss": 0.6049, + "step": 2344 + }, + { + "epoch": 0.10301107610568953, + "grad_norm": 3.546875, + "learning_rate": 4.876945522362752e-05, + "loss": 0.6479, + "step": 2346 + }, + { + "epoch": 0.10309889458489302, + "grad_norm": 3.6875, + "learning_rate": 4.876731052952144e-05, + "loss": 0.6041, + "step": 2348 + }, + { + "epoch": 0.10318671306409652, + "grad_norm": 2.9375, + "learning_rate": 4.876516401530901e-05, + "loss": 0.6092, + "step": 2350 + }, + { + "epoch": 0.1032745315433, + "grad_norm": 3.515625, + "learning_rate": 4.876301568115463e-05, + "loss": 0.5955, + "step": 2352 + }, + { + "epoch": 0.10336235002250349, + "grad_norm": 3.546875, + "learning_rate": 4.876086552722281e-05, + "loss": 0.6191, + "step": 2354 + }, + { + "epoch": 0.10345016850170698, + "grad_norm": 2.890625, + "learning_rate": 4.875871355367822e-05, + "loss": 0.6051, + "step": 2356 + }, + { + "epoch": 0.10353798698091046, + "grad_norm": 2.90625, + "learning_rate": 4.8756559760685644e-05, + "loss": 0.601, + "step": 2358 + }, + { + "epoch": 0.10362580546011395, + "grad_norm": 3.125, + "learning_rate": 4.8754404148410025e-05, + "loss": 0.6016, + "step": 2360 + }, + { + "epoch": 0.10371362393931743, + "grad_norm": 5.03125, + "learning_rate": 4.875224671701645e-05, + "loss": 0.633, + "step": 2362 + }, + { + "epoch": 0.10380144241852092, + "grad_norm": 4.875, + "learning_rate": 4.8750087466670116e-05, + "loss": 0.6384, + "step": 2364 + }, + { + "epoch": 0.1038892608977244, + "grad_norm": 4.3125, + "learning_rate": 4.87479263975364e-05, + "loss": 0.6113, + "step": 2366 + }, + { + "epoch": 0.10397707937692789, + "grad_norm": 3.78125, + "learning_rate": 4.8745763509780785e-05, + "loss": 0.5765, + "step": 2368 + }, + { + "epoch": 0.10406489785613138, + "grad_norm": 3.65625, + "learning_rate": 4.874359880356891e-05, + "loss": 0.6533, + "step": 2370 + }, + { + "epoch": 0.10415271633533486, + "grad_norm": 3.671875, + "learning_rate": 4.874143227906654e-05, + "loss": 0.6002, + "step": 2372 + }, + { + "epoch": 0.10424053481453835, + "grad_norm": 4.4375, + "learning_rate": 4.87392639364396e-05, + "loss": 0.5987, + "step": 2374 + }, + { + "epoch": 0.10432835329374184, + "grad_norm": 4.0625, + "learning_rate": 4.873709377585414e-05, + "loss": 0.6137, + "step": 2376 + }, + { + "epoch": 0.10441617177294532, + "grad_norm": 4.5625, + "learning_rate": 4.873492179747634e-05, + "loss": 0.6091, + "step": 2378 + }, + { + "epoch": 0.1045039902521488, + "grad_norm": 4.75, + "learning_rate": 4.873274800147255e-05, + "loss": 0.6199, + "step": 2380 + }, + { + "epoch": 0.10459180873135229, + "grad_norm": 4.5625, + "learning_rate": 4.873057238800922e-05, + "loss": 0.6132, + "step": 2382 + }, + { + "epoch": 0.10467962721055578, + "grad_norm": 6.375, + "learning_rate": 4.872839495725297e-05, + "loss": 0.6083, + "step": 2384 + }, + { + "epoch": 0.10476744568975926, + "grad_norm": 4.46875, + "learning_rate": 4.8726215709370546e-05, + "loss": 0.6209, + "step": 2386 + }, + { + "epoch": 0.10485526416896275, + "grad_norm": 2.6875, + "learning_rate": 4.872403464452884e-05, + "loss": 0.6004, + "step": 2388 + }, + { + "epoch": 0.10494308264816624, + "grad_norm": 2.96875, + "learning_rate": 4.8721851762894865e-05, + "loss": 0.5997, + "step": 2390 + }, + { + "epoch": 0.10503090112736972, + "grad_norm": 4.0625, + "learning_rate": 4.871966706463581e-05, + "loss": 0.6023, + "step": 2392 + }, + { + "epoch": 0.10511871960657321, + "grad_norm": 3.25, + "learning_rate": 4.871748054991895e-05, + "loss": 0.5691, + "step": 2394 + }, + { + "epoch": 0.1052065380857767, + "grad_norm": 3.59375, + "learning_rate": 4.871529221891175e-05, + "loss": 0.6085, + "step": 2396 + }, + { + "epoch": 0.10529435656498018, + "grad_norm": 4.46875, + "learning_rate": 4.871310207178179e-05, + "loss": 0.6212, + "step": 2398 + }, + { + "epoch": 0.10538217504418368, + "grad_norm": 4.03125, + "learning_rate": 4.8710910108696786e-05, + "loss": 0.6219, + "step": 2400 + }, + { + "epoch": 0.10546999352338716, + "grad_norm": 3.953125, + "learning_rate": 4.87087163298246e-05, + "loss": 0.6105, + "step": 2402 + }, + { + "epoch": 0.10555781200259065, + "grad_norm": 3.96875, + "learning_rate": 4.870652073533324e-05, + "loss": 0.621, + "step": 2404 + }, + { + "epoch": 0.10564563048179414, + "grad_norm": 3.828125, + "learning_rate": 4.8704323325390834e-05, + "loss": 0.6141, + "step": 2406 + }, + { + "epoch": 0.10573344896099762, + "grad_norm": 3.8125, + "learning_rate": 4.8702124100165666e-05, + "loss": 0.6142, + "step": 2408 + }, + { + "epoch": 0.10582126744020111, + "grad_norm": 3.46875, + "learning_rate": 4.869992305982615e-05, + "loss": 0.5919, + "step": 2410 + }, + { + "epoch": 0.1059090859194046, + "grad_norm": 2.859375, + "learning_rate": 4.8697720204540846e-05, + "loss": 0.5876, + "step": 2412 + }, + { + "epoch": 0.10599690439860808, + "grad_norm": 2.90625, + "learning_rate": 4.8695515534478456e-05, + "loss": 0.6464, + "step": 2414 + }, + { + "epoch": 0.10608472287781157, + "grad_norm": 3.0625, + "learning_rate": 4.8693309049807795e-05, + "loss": 0.5842, + "step": 2416 + }, + { + "epoch": 0.10617254135701505, + "grad_norm": 3.40625, + "learning_rate": 4.8691100750697856e-05, + "loss": 0.5864, + "step": 2418 + }, + { + "epoch": 0.10626035983621854, + "grad_norm": 3.296875, + "learning_rate": 4.8688890637317734e-05, + "loss": 0.6038, + "step": 2420 + }, + { + "epoch": 0.10634817831542202, + "grad_norm": 3.375, + "learning_rate": 4.86866787098367e-05, + "loss": 0.6135, + "step": 2422 + }, + { + "epoch": 0.10643599679462551, + "grad_norm": 3.5, + "learning_rate": 4.868446496842412e-05, + "loss": 0.6135, + "step": 2424 + }, + { + "epoch": 0.106523815273829, + "grad_norm": 2.921875, + "learning_rate": 4.868224941324954e-05, + "loss": 0.6238, + "step": 2426 + }, + { + "epoch": 0.10661163375303248, + "grad_norm": 2.859375, + "learning_rate": 4.868003204448263e-05, + "loss": 0.5728, + "step": 2428 + }, + { + "epoch": 0.10669945223223597, + "grad_norm": 3.0, + "learning_rate": 4.8677812862293184e-05, + "loss": 0.5991, + "step": 2430 + }, + { + "epoch": 0.10678727071143945, + "grad_norm": 2.578125, + "learning_rate": 4.867559186685115e-05, + "loss": 0.6117, + "step": 2432 + }, + { + "epoch": 0.10687508919064294, + "grad_norm": 3.3125, + "learning_rate": 4.867336905832661e-05, + "loss": 0.6006, + "step": 2434 + }, + { + "epoch": 0.10696290766984642, + "grad_norm": 3.375, + "learning_rate": 4.8671144436889805e-05, + "loss": 0.5793, + "step": 2436 + }, + { + "epoch": 0.10705072614904991, + "grad_norm": 3.140625, + "learning_rate": 4.866891800271108e-05, + "loss": 0.5901, + "step": 2438 + }, + { + "epoch": 0.1071385446282534, + "grad_norm": 3.0, + "learning_rate": 4.8666689755960936e-05, + "loss": 0.6089, + "step": 2440 + }, + { + "epoch": 0.10722636310745688, + "grad_norm": 3.046875, + "learning_rate": 4.866445969681003e-05, + "loss": 0.6008, + "step": 2442 + }, + { + "epoch": 0.10731418158666037, + "grad_norm": 3.328125, + "learning_rate": 4.866222782542912e-05, + "loss": 0.6111, + "step": 2444 + }, + { + "epoch": 0.10740200006586385, + "grad_norm": 3.078125, + "learning_rate": 4.865999414198913e-05, + "loss": 0.6166, + "step": 2446 + }, + { + "epoch": 0.10748981854506734, + "grad_norm": 3.03125, + "learning_rate": 4.865775864666111e-05, + "loss": 0.5883, + "step": 2448 + }, + { + "epoch": 0.10757763702427084, + "grad_norm": 3.046875, + "learning_rate": 4.8655521339616274e-05, + "loss": 0.5968, + "step": 2450 + }, + { + "epoch": 0.10766545550347432, + "grad_norm": 3.734375, + "learning_rate": 4.865328222102594e-05, + "loss": 0.61, + "step": 2452 + }, + { + "epoch": 0.10775327398267781, + "grad_norm": 3.734375, + "learning_rate": 4.865104129106158e-05, + "loss": 0.5919, + "step": 2454 + }, + { + "epoch": 0.1078410924618813, + "grad_norm": 2.921875, + "learning_rate": 4.86487985498948e-05, + "loss": 0.6421, + "step": 2456 + }, + { + "epoch": 0.10792891094108478, + "grad_norm": 3.3125, + "learning_rate": 4.8646553997697375e-05, + "loss": 0.6171, + "step": 2458 + }, + { + "epoch": 0.10801672942028827, + "grad_norm": 2.875, + "learning_rate": 4.864430763464117e-05, + "loss": 0.6143, + "step": 2460 + }, + { + "epoch": 0.10810454789949175, + "grad_norm": 3.390625, + "learning_rate": 4.8642059460898214e-05, + "loss": 0.616, + "step": 2462 + }, + { + "epoch": 0.10819236637869524, + "grad_norm": 2.875, + "learning_rate": 4.8639809476640685e-05, + "loss": 0.586, + "step": 2464 + }, + { + "epoch": 0.10828018485789873, + "grad_norm": 3.8125, + "learning_rate": 4.8637557682040876e-05, + "loss": 0.581, + "step": 2466 + }, + { + "epoch": 0.10836800333710221, + "grad_norm": 3.53125, + "learning_rate": 4.863530407727123e-05, + "loss": 0.6136, + "step": 2468 + }, + { + "epoch": 0.1084558218163057, + "grad_norm": 3.75, + "learning_rate": 4.863304866250433e-05, + "loss": 0.6392, + "step": 2470 + }, + { + "epoch": 0.10854364029550918, + "grad_norm": 2.9375, + "learning_rate": 4.86307914379129e-05, + "loss": 0.5975, + "step": 2472 + }, + { + "epoch": 0.10863145877471267, + "grad_norm": 3.5, + "learning_rate": 4.8628532403669805e-05, + "loss": 0.6307, + "step": 2474 + }, + { + "epoch": 0.10871927725391616, + "grad_norm": 3.0625, + "learning_rate": 4.8626271559948036e-05, + "loss": 0.6062, + "step": 2476 + }, + { + "epoch": 0.10880709573311964, + "grad_norm": 3.34375, + "learning_rate": 4.8624008906920714e-05, + "loss": 0.6003, + "step": 2478 + }, + { + "epoch": 0.10889491421232313, + "grad_norm": 3.0625, + "learning_rate": 4.862174444476113e-05, + "loss": 0.6069, + "step": 2480 + }, + { + "epoch": 0.10898273269152661, + "grad_norm": 2.84375, + "learning_rate": 4.86194781736427e-05, + "loss": 0.5926, + "step": 2482 + }, + { + "epoch": 0.1090705511707301, + "grad_norm": 3.5625, + "learning_rate": 4.861721009373897e-05, + "loss": 0.6071, + "step": 2484 + }, + { + "epoch": 0.10915836964993358, + "grad_norm": 3.046875, + "learning_rate": 4.8614940205223625e-05, + "loss": 0.585, + "step": 2486 + }, + { + "epoch": 0.10924618812913707, + "grad_norm": 3.6875, + "learning_rate": 4.86126685082705e-05, + "loss": 0.5705, + "step": 2488 + }, + { + "epoch": 0.10933400660834056, + "grad_norm": 2.953125, + "learning_rate": 4.861039500305356e-05, + "loss": 0.6318, + "step": 2490 + }, + { + "epoch": 0.10942182508754404, + "grad_norm": 3.203125, + "learning_rate": 4.860811968974691e-05, + "loss": 0.5763, + "step": 2492 + }, + { + "epoch": 0.10950964356674753, + "grad_norm": 3.46875, + "learning_rate": 4.86058425685248e-05, + "loss": 0.619, + "step": 2494 + }, + { + "epoch": 0.10959746204595101, + "grad_norm": 3.640625, + "learning_rate": 4.86035636395616e-05, + "loss": 0.6244, + "step": 2496 + }, + { + "epoch": 0.1096852805251545, + "grad_norm": 3.265625, + "learning_rate": 4.860128290303184e-05, + "loss": 0.5996, + "step": 2498 + }, + { + "epoch": 0.10977309900435799, + "grad_norm": 3.203125, + "learning_rate": 4.859900035911018e-05, + "loss": 0.5985, + "step": 2500 + }, + { + "epoch": 0.10986091748356148, + "grad_norm": 3.671875, + "learning_rate": 4.859671600797141e-05, + "loss": 0.6311, + "step": 2502 + }, + { + "epoch": 0.10994873596276497, + "grad_norm": 4.46875, + "learning_rate": 4.8594429849790476e-05, + "loss": 0.6137, + "step": 2504 + }, + { + "epoch": 0.11003655444196846, + "grad_norm": 4.53125, + "learning_rate": 4.8592141884742445e-05, + "loss": 0.5877, + "step": 2506 + }, + { + "epoch": 0.11012437292117194, + "grad_norm": 3.265625, + "learning_rate": 4.8589852113002546e-05, + "loss": 0.5774, + "step": 2508 + }, + { + "epoch": 0.11021219140037543, + "grad_norm": 2.984375, + "learning_rate": 4.85875605347461e-05, + "loss": 0.6018, + "step": 2510 + }, + { + "epoch": 0.11030000987957891, + "grad_norm": 3.25, + "learning_rate": 4.8585267150148625e-05, + "loss": 0.5877, + "step": 2512 + }, + { + "epoch": 0.1103878283587824, + "grad_norm": 4.59375, + "learning_rate": 4.8582971959385735e-05, + "loss": 0.5886, + "step": 2514 + }, + { + "epoch": 0.11047564683798589, + "grad_norm": 3.6875, + "learning_rate": 4.85806749626332e-05, + "loss": 0.6101, + "step": 2516 + }, + { + "epoch": 0.11056346531718937, + "grad_norm": 3.421875, + "learning_rate": 4.8578376160066916e-05, + "loss": 0.5614, + "step": 2518 + }, + { + "epoch": 0.11065128379639286, + "grad_norm": 4.53125, + "learning_rate": 4.857607555186294e-05, + "loss": 0.596, + "step": 2520 + }, + { + "epoch": 0.11073910227559634, + "grad_norm": 3.65625, + "learning_rate": 4.857377313819745e-05, + "loss": 0.6134, + "step": 2522 + }, + { + "epoch": 0.11082692075479983, + "grad_norm": 2.96875, + "learning_rate": 4.8571468919246755e-05, + "loss": 0.5675, + "step": 2524 + }, + { + "epoch": 0.11091473923400332, + "grad_norm": 3.484375, + "learning_rate": 4.8569162895187324e-05, + "loss": 0.5876, + "step": 2526 + }, + { + "epoch": 0.1110025577132068, + "grad_norm": 3.953125, + "learning_rate": 4.856685506619575e-05, + "loss": 0.6046, + "step": 2528 + }, + { + "epoch": 0.11109037619241029, + "grad_norm": 4.125, + "learning_rate": 4.8564545432448763e-05, + "loss": 0.5904, + "step": 2530 + }, + { + "epoch": 0.11117819467161377, + "grad_norm": 3.515625, + "learning_rate": 4.856223399412324e-05, + "loss": 0.5889, + "step": 2532 + }, + { + "epoch": 0.11126601315081726, + "grad_norm": 3.203125, + "learning_rate": 4.855992075139618e-05, + "loss": 0.5859, + "step": 2534 + }, + { + "epoch": 0.11135383163002074, + "grad_norm": 3.578125, + "learning_rate": 4.8557605704444754e-05, + "loss": 0.6318, + "step": 2536 + }, + { + "epoch": 0.11144165010922423, + "grad_norm": 3.234375, + "learning_rate": 4.8555288853446226e-05, + "loss": 0.6019, + "step": 2538 + }, + { + "epoch": 0.11152946858842772, + "grad_norm": 3.46875, + "learning_rate": 4.8552970198578044e-05, + "loss": 0.5788, + "step": 2540 + }, + { + "epoch": 0.1116172870676312, + "grad_norm": 3.015625, + "learning_rate": 4.8550649740017744e-05, + "loss": 0.6114, + "step": 2542 + }, + { + "epoch": 0.11170510554683469, + "grad_norm": 2.9375, + "learning_rate": 4.854832747794305e-05, + "loss": 0.5551, + "step": 2544 + }, + { + "epoch": 0.11179292402603817, + "grad_norm": 2.84375, + "learning_rate": 4.8546003412531785e-05, + "loss": 0.5562, + "step": 2546 + }, + { + "epoch": 0.11188074250524166, + "grad_norm": 3.078125, + "learning_rate": 4.854367754396194e-05, + "loss": 0.6005, + "step": 2548 + }, + { + "epoch": 0.11196856098444515, + "grad_norm": 3.65625, + "learning_rate": 4.854134987241162e-05, + "loss": 0.5951, + "step": 2550 + }, + { + "epoch": 0.11205637946364864, + "grad_norm": 4.4375, + "learning_rate": 4.85390203980591e-05, + "loss": 0.5995, + "step": 2552 + }, + { + "epoch": 0.11214419794285213, + "grad_norm": 4.9375, + "learning_rate": 4.853668912108273e-05, + "loss": 0.628, + "step": 2554 + }, + { + "epoch": 0.11223201642205562, + "grad_norm": 3.75, + "learning_rate": 4.8534356041661085e-05, + "loss": 0.6037, + "step": 2556 + }, + { + "epoch": 0.1123198349012591, + "grad_norm": 3.46875, + "learning_rate": 4.8532021159972804e-05, + "loss": 0.5707, + "step": 2558 + }, + { + "epoch": 0.11240765338046259, + "grad_norm": 3.4375, + "learning_rate": 4.8529684476196705e-05, + "loss": 0.6165, + "step": 2560 + }, + { + "epoch": 0.11249547185966607, + "grad_norm": 3.234375, + "learning_rate": 4.852734599051173e-05, + "loss": 0.6, + "step": 2562 + }, + { + "epoch": 0.11258329033886956, + "grad_norm": 2.90625, + "learning_rate": 4.852500570309695e-05, + "loss": 0.5954, + "step": 2564 + }, + { + "epoch": 0.11267110881807305, + "grad_norm": 2.765625, + "learning_rate": 4.8522663614131603e-05, + "loss": 0.6065, + "step": 2566 + }, + { + "epoch": 0.11275892729727653, + "grad_norm": 2.9375, + "learning_rate": 4.8520319723795036e-05, + "loss": 0.6159, + "step": 2568 + }, + { + "epoch": 0.11284674577648002, + "grad_norm": 3.40625, + "learning_rate": 4.8517974032266745e-05, + "loss": 0.5932, + "step": 2570 + }, + { + "epoch": 0.1129345642556835, + "grad_norm": 2.921875, + "learning_rate": 4.851562653972637e-05, + "loss": 0.5993, + "step": 2572 + }, + { + "epoch": 0.11302238273488699, + "grad_norm": 3.375, + "learning_rate": 4.851327724635366e-05, + "loss": 0.5716, + "step": 2574 + }, + { + "epoch": 0.11311020121409048, + "grad_norm": 3.328125, + "learning_rate": 4.851092615232856e-05, + "loss": 0.5877, + "step": 2576 + }, + { + "epoch": 0.11319801969329396, + "grad_norm": 3.40625, + "learning_rate": 4.8508573257831094e-05, + "loss": 0.5821, + "step": 2578 + }, + { + "epoch": 0.11328583817249745, + "grad_norm": 3.03125, + "learning_rate": 4.850621856304145e-05, + "loss": 0.5923, + "step": 2580 + }, + { + "epoch": 0.11337365665170093, + "grad_norm": 3.359375, + "learning_rate": 4.850386206813996e-05, + "loss": 0.6087, + "step": 2582 + }, + { + "epoch": 0.11346147513090442, + "grad_norm": 3.40625, + "learning_rate": 4.8501503773307075e-05, + "loss": 0.6313, + "step": 2584 + }, + { + "epoch": 0.1135492936101079, + "grad_norm": 3.40625, + "learning_rate": 4.849914367872339e-05, + "loss": 0.5768, + "step": 2586 + }, + { + "epoch": 0.11363711208931139, + "grad_norm": 3.40625, + "learning_rate": 4.849678178456966e-05, + "loss": 0.5633, + "step": 2588 + }, + { + "epoch": 0.11372493056851488, + "grad_norm": 3.015625, + "learning_rate": 4.8494418091026745e-05, + "loss": 0.5882, + "step": 2590 + }, + { + "epoch": 0.11381274904771836, + "grad_norm": 3.015625, + "learning_rate": 4.849205259827566e-05, + "loss": 0.5957, + "step": 2592 + }, + { + "epoch": 0.11390056752692185, + "grad_norm": 2.90625, + "learning_rate": 4.8489685306497554e-05, + "loss": 0.5899, + "step": 2594 + }, + { + "epoch": 0.11398838600612533, + "grad_norm": 2.765625, + "learning_rate": 4.8487316215873715e-05, + "loss": 0.5853, + "step": 2596 + }, + { + "epoch": 0.11407620448532882, + "grad_norm": 3.078125, + "learning_rate": 4.848494532658557e-05, + "loss": 0.5899, + "step": 2598 + }, + { + "epoch": 0.1141640229645323, + "grad_norm": 3.21875, + "learning_rate": 4.848257263881469e-05, + "loss": 0.5872, + "step": 2600 + }, + { + "epoch": 0.1142518414437358, + "grad_norm": 3.046875, + "learning_rate": 4.848019815274276e-05, + "loss": 0.5806, + "step": 2602 + }, + { + "epoch": 0.11433965992293929, + "grad_norm": 3.046875, + "learning_rate": 4.847782186855163e-05, + "loss": 0.5754, + "step": 2604 + }, + { + "epoch": 0.11442747840214278, + "grad_norm": 3.0625, + "learning_rate": 4.847544378642327e-05, + "loss": 0.6083, + "step": 2606 + }, + { + "epoch": 0.11451529688134626, + "grad_norm": 3.046875, + "learning_rate": 4.8473063906539804e-05, + "loss": 0.5894, + "step": 2608 + }, + { + "epoch": 0.11460311536054975, + "grad_norm": 3.015625, + "learning_rate": 4.8470682229083477e-05, + "loss": 0.6113, + "step": 2610 + }, + { + "epoch": 0.11469093383975323, + "grad_norm": 2.734375, + "learning_rate": 4.846829875423667e-05, + "loss": 0.5863, + "step": 2612 + }, + { + "epoch": 0.11477875231895672, + "grad_norm": 3.390625, + "learning_rate": 4.846591348218192e-05, + "loss": 0.5958, + "step": 2614 + }, + { + "epoch": 0.1148665707981602, + "grad_norm": 3.421875, + "learning_rate": 4.84635264131019e-05, + "loss": 0.5882, + "step": 2616 + }, + { + "epoch": 0.11495438927736369, + "grad_norm": 2.8125, + "learning_rate": 4.84611375471794e-05, + "loss": 0.6003, + "step": 2618 + }, + { + "epoch": 0.11504220775656718, + "grad_norm": 3.5625, + "learning_rate": 4.845874688459736e-05, + "loss": 0.6034, + "step": 2620 + }, + { + "epoch": 0.11513002623577066, + "grad_norm": 2.84375, + "learning_rate": 4.845635442553885e-05, + "loss": 0.5967, + "step": 2622 + }, + { + "epoch": 0.11521784471497415, + "grad_norm": 3.046875, + "learning_rate": 4.8453960170187104e-05, + "loss": 0.593, + "step": 2624 + }, + { + "epoch": 0.11530566319417763, + "grad_norm": 3.234375, + "learning_rate": 4.8451564118725474e-05, + "loss": 0.5784, + "step": 2626 + }, + { + "epoch": 0.11539348167338112, + "grad_norm": 3.0625, + "learning_rate": 4.8449166271337434e-05, + "loss": 0.5907, + "step": 2628 + }, + { + "epoch": 0.1154813001525846, + "grad_norm": 3.265625, + "learning_rate": 4.844676662820662e-05, + "loss": 0.5818, + "step": 2630 + }, + { + "epoch": 0.11556911863178809, + "grad_norm": 2.875, + "learning_rate": 4.8444365189516796e-05, + "loss": 0.6006, + "step": 2632 + }, + { + "epoch": 0.11565693711099158, + "grad_norm": 3.296875, + "learning_rate": 4.8441961955451865e-05, + "loss": 0.5846, + "step": 2634 + }, + { + "epoch": 0.11574475559019506, + "grad_norm": 2.953125, + "learning_rate": 4.843955692619587e-05, + "loss": 0.5827, + "step": 2636 + }, + { + "epoch": 0.11583257406939855, + "grad_norm": 2.375, + "learning_rate": 4.8437150101932996e-05, + "loss": 0.6059, + "step": 2638 + }, + { + "epoch": 0.11592039254860204, + "grad_norm": 3.078125, + "learning_rate": 4.843474148284753e-05, + "loss": 0.5945, + "step": 2640 + }, + { + "epoch": 0.11600821102780552, + "grad_norm": 3.421875, + "learning_rate": 4.843233106912396e-05, + "loss": 0.5843, + "step": 2642 + }, + { + "epoch": 0.11609602950700901, + "grad_norm": 2.671875, + "learning_rate": 4.842991886094686e-05, + "loss": 0.5994, + "step": 2644 + }, + { + "epoch": 0.1161838479862125, + "grad_norm": 3.625, + "learning_rate": 4.842750485850094e-05, + "loss": 0.5767, + "step": 2646 + }, + { + "epoch": 0.11627166646541598, + "grad_norm": 3.625, + "learning_rate": 4.8425089061971094e-05, + "loss": 0.5543, + "step": 2648 + }, + { + "epoch": 0.11635948494461947, + "grad_norm": 3.140625, + "learning_rate": 4.8422671471542314e-05, + "loss": 0.6315, + "step": 2650 + }, + { + "epoch": 0.11644730342382296, + "grad_norm": 2.921875, + "learning_rate": 4.842025208739973e-05, + "loss": 0.5896, + "step": 2652 + }, + { + "epoch": 0.11653512190302645, + "grad_norm": 3.0, + "learning_rate": 4.8417830909728637e-05, + "loss": 0.5782, + "step": 2654 + }, + { + "epoch": 0.11662294038222994, + "grad_norm": 3.046875, + "learning_rate": 4.841540793871443e-05, + "loss": 0.58, + "step": 2656 + }, + { + "epoch": 0.11671075886143342, + "grad_norm": 3.125, + "learning_rate": 4.841298317454267e-05, + "loss": 0.5825, + "step": 2658 + }, + { + "epoch": 0.11679857734063691, + "grad_norm": 3.28125, + "learning_rate": 4.841055661739905e-05, + "loss": 0.6037, + "step": 2660 + }, + { + "epoch": 0.1168863958198404, + "grad_norm": 3.28125, + "learning_rate": 4.8408128267469394e-05, + "loss": 0.6085, + "step": 2662 + }, + { + "epoch": 0.11697421429904388, + "grad_norm": 2.75, + "learning_rate": 4.840569812493966e-05, + "loss": 0.6025, + "step": 2664 + }, + { + "epoch": 0.11706203277824737, + "grad_norm": 3.0625, + "learning_rate": 4.840326618999595e-05, + "loss": 0.5656, + "step": 2666 + }, + { + "epoch": 0.11714985125745085, + "grad_norm": 3.125, + "learning_rate": 4.840083246282452e-05, + "loss": 0.5908, + "step": 2668 + }, + { + "epoch": 0.11723766973665434, + "grad_norm": 2.6875, + "learning_rate": 4.8398396943611715e-05, + "loss": 0.5884, + "step": 2670 + }, + { + "epoch": 0.11732548821585782, + "grad_norm": 3.328125, + "learning_rate": 4.839595963254407e-05, + "loss": 0.6311, + "step": 2672 + }, + { + "epoch": 0.11741330669506131, + "grad_norm": 3.21875, + "learning_rate": 4.8393520529808224e-05, + "loss": 0.603, + "step": 2674 + }, + { + "epoch": 0.1175011251742648, + "grad_norm": 2.71875, + "learning_rate": 4.839107963559097e-05, + "loss": 0.5678, + "step": 2676 + }, + { + "epoch": 0.11758894365346828, + "grad_norm": 3.25, + "learning_rate": 4.838863695007923e-05, + "loss": 0.6127, + "step": 2678 + }, + { + "epoch": 0.11767676213267177, + "grad_norm": 3.234375, + "learning_rate": 4.838619247346007e-05, + "loss": 0.6211, + "step": 2680 + }, + { + "epoch": 0.11776458061187525, + "grad_norm": 3.046875, + "learning_rate": 4.838374620592068e-05, + "loss": 0.6083, + "step": 2682 + }, + { + "epoch": 0.11785239909107874, + "grad_norm": 3.234375, + "learning_rate": 4.83812981476484e-05, + "loss": 0.5768, + "step": 2684 + }, + { + "epoch": 0.11794021757028222, + "grad_norm": 3.265625, + "learning_rate": 4.8378848298830706e-05, + "loss": 0.5962, + "step": 2686 + }, + { + "epoch": 0.11802803604948571, + "grad_norm": 5.21875, + "learning_rate": 4.83763966596552e-05, + "loss": 0.5941, + "step": 2688 + }, + { + "epoch": 0.1181158545286892, + "grad_norm": 5.71875, + "learning_rate": 4.837394323030964e-05, + "loss": 0.6159, + "step": 2690 + }, + { + "epoch": 0.11820367300789268, + "grad_norm": 4.65625, + "learning_rate": 4.8371488010981894e-05, + "loss": 0.5818, + "step": 2692 + }, + { + "epoch": 0.11829149148709617, + "grad_norm": 4.09375, + "learning_rate": 4.8369031001860005e-05, + "loss": 0.5715, + "step": 2694 + }, + { + "epoch": 0.11837930996629965, + "grad_norm": 4.375, + "learning_rate": 4.836657220313211e-05, + "loss": 0.5782, + "step": 2696 + }, + { + "epoch": 0.11846712844550314, + "grad_norm": 2.890625, + "learning_rate": 4.8364111614986527e-05, + "loss": 0.5572, + "step": 2698 + }, + { + "epoch": 0.11855494692470663, + "grad_norm": 2.8125, + "learning_rate": 4.836164923761166e-05, + "loss": 0.603, + "step": 2700 + }, + { + "epoch": 0.11864276540391011, + "grad_norm": 2.609375, + "learning_rate": 4.835918507119611e-05, + "loss": 0.5851, + "step": 2702 + }, + { + "epoch": 0.11873058388311361, + "grad_norm": 3.203125, + "learning_rate": 4.8356719115928564e-05, + "loss": 0.602, + "step": 2704 + }, + { + "epoch": 0.1188184023623171, + "grad_norm": 2.984375, + "learning_rate": 4.835425137199786e-05, + "loss": 0.5614, + "step": 2706 + }, + { + "epoch": 0.11890622084152058, + "grad_norm": 2.859375, + "learning_rate": 4.835178183959299e-05, + "loss": 0.6055, + "step": 2708 + }, + { + "epoch": 0.11899403932072407, + "grad_norm": 2.6875, + "learning_rate": 4.834931051890308e-05, + "loss": 0.5567, + "step": 2710 + }, + { + "epoch": 0.11908185779992755, + "grad_norm": 3.375, + "learning_rate": 4.8346837410117365e-05, + "loss": 0.5798, + "step": 2712 + }, + { + "epoch": 0.11916967627913104, + "grad_norm": 4.0625, + "learning_rate": 4.834436251342524e-05, + "loss": 0.5719, + "step": 2714 + }, + { + "epoch": 0.11925749475833453, + "grad_norm": 4.125, + "learning_rate": 4.834188582901624e-05, + "loss": 0.6048, + "step": 2716 + }, + { + "epoch": 0.11934531323753801, + "grad_norm": 3.296875, + "learning_rate": 4.833940735708003e-05, + "loss": 0.5629, + "step": 2718 + }, + { + "epoch": 0.1194331317167415, + "grad_norm": 2.703125, + "learning_rate": 4.8336927097806415e-05, + "loss": 0.5777, + "step": 2720 + }, + { + "epoch": 0.11952095019594498, + "grad_norm": 3.171875, + "learning_rate": 4.833444505138533e-05, + "loss": 0.5959, + "step": 2722 + }, + { + "epoch": 0.11960876867514847, + "grad_norm": 2.796875, + "learning_rate": 4.833196121800684e-05, + "loss": 0.5682, + "step": 2724 + }, + { + "epoch": 0.11969658715435195, + "grad_norm": 3.03125, + "learning_rate": 4.832947559786116e-05, + "loss": 0.5562, + "step": 2726 + }, + { + "epoch": 0.11978440563355544, + "grad_norm": 4.28125, + "learning_rate": 4.8326988191138664e-05, + "loss": 0.5755, + "step": 2728 + }, + { + "epoch": 0.11987222411275893, + "grad_norm": 4.375, + "learning_rate": 4.832449899802981e-05, + "loss": 0.5914, + "step": 2730 + }, + { + "epoch": 0.11996004259196241, + "grad_norm": 5.53125, + "learning_rate": 4.832200801872523e-05, + "loss": 0.5999, + "step": 2732 + }, + { + "epoch": 0.1200478610711659, + "grad_norm": 6.25, + "learning_rate": 4.8319515253415695e-05, + "loss": 0.5996, + "step": 2734 + }, + { + "epoch": 0.12013567955036938, + "grad_norm": 2.796875, + "learning_rate": 4.831702070229208e-05, + "loss": 0.6215, + "step": 2736 + }, + { + "epoch": 0.12022349802957287, + "grad_norm": 2.75, + "learning_rate": 4.831452436554544e-05, + "loss": 0.6128, + "step": 2738 + }, + { + "epoch": 0.12031131650877636, + "grad_norm": 2.96875, + "learning_rate": 4.831202624336693e-05, + "loss": 0.5602, + "step": 2740 + }, + { + "epoch": 0.12039913498797984, + "grad_norm": 3.109375, + "learning_rate": 4.830952633594786e-05, + "loss": 0.5663, + "step": 2742 + }, + { + "epoch": 0.12048695346718333, + "grad_norm": 3.359375, + "learning_rate": 4.8307024643479684e-05, + "loss": 0.5638, + "step": 2744 + }, + { + "epoch": 0.12057477194638681, + "grad_norm": 4.5625, + "learning_rate": 4.830452116615397e-05, + "loss": 0.5799, + "step": 2746 + }, + { + "epoch": 0.1206625904255903, + "grad_norm": 3.953125, + "learning_rate": 4.830201590416243e-05, + "loss": 0.5996, + "step": 2748 + }, + { + "epoch": 0.12075040890479379, + "grad_norm": 3.25, + "learning_rate": 4.829950885769694e-05, + "loss": 0.5995, + "step": 2750 + }, + { + "epoch": 0.12083822738399727, + "grad_norm": 2.71875, + "learning_rate": 4.8297000026949466e-05, + "loss": 0.5927, + "step": 2752 + }, + { + "epoch": 0.12092604586320077, + "grad_norm": 2.734375, + "learning_rate": 4.829448941211215e-05, + "loss": 0.5874, + "step": 2754 + }, + { + "epoch": 0.12101386434240426, + "grad_norm": 3.21875, + "learning_rate": 4.8291977013377254e-05, + "loss": 0.5724, + "step": 2756 + }, + { + "epoch": 0.12110168282160774, + "grad_norm": 2.890625, + "learning_rate": 4.828946283093717e-05, + "loss": 0.6041, + "step": 2758 + }, + { + "epoch": 0.12118950130081123, + "grad_norm": 3.09375, + "learning_rate": 4.828694686498444e-05, + "loss": 0.5648, + "step": 2760 + }, + { + "epoch": 0.12127731978001471, + "grad_norm": 3.265625, + "learning_rate": 4.8284429115711735e-05, + "loss": 0.5902, + "step": 2762 + }, + { + "epoch": 0.1213651382592182, + "grad_norm": 2.75, + "learning_rate": 4.828190958331187e-05, + "loss": 0.5663, + "step": 2764 + }, + { + "epoch": 0.12145295673842169, + "grad_norm": 2.953125, + "learning_rate": 4.827938826797779e-05, + "loss": 0.5614, + "step": 2766 + }, + { + "epoch": 0.12154077521762517, + "grad_norm": 3.78125, + "learning_rate": 4.827686516990257e-05, + "loss": 0.581, + "step": 2768 + }, + { + "epoch": 0.12162859369682866, + "grad_norm": 4.375, + "learning_rate": 4.827434028927944e-05, + "loss": 0.5845, + "step": 2770 + }, + { + "epoch": 0.12171641217603214, + "grad_norm": 3.65625, + "learning_rate": 4.827181362630175e-05, + "loss": 0.5912, + "step": 2772 + }, + { + "epoch": 0.12180423065523563, + "grad_norm": 3.015625, + "learning_rate": 4.8269285181162995e-05, + "loss": 0.5885, + "step": 2774 + }, + { + "epoch": 0.12189204913443911, + "grad_norm": 2.828125, + "learning_rate": 4.82667549540568e-05, + "loss": 0.5797, + "step": 2776 + }, + { + "epoch": 0.1219798676136426, + "grad_norm": 3.25, + "learning_rate": 4.826422294517693e-05, + "loss": 0.5669, + "step": 2778 + }, + { + "epoch": 0.12206768609284609, + "grad_norm": 3.125, + "learning_rate": 4.826168915471729e-05, + "loss": 0.6038, + "step": 2780 + }, + { + "epoch": 0.12215550457204957, + "grad_norm": 3.609375, + "learning_rate": 4.825915358287193e-05, + "loss": 0.578, + "step": 2782 + }, + { + "epoch": 0.12224332305125306, + "grad_norm": 3.140625, + "learning_rate": 4.8256616229835e-05, + "loss": 0.5762, + "step": 2784 + }, + { + "epoch": 0.12233114153045654, + "grad_norm": 3.96875, + "learning_rate": 4.825407709580083e-05, + "loss": 0.612, + "step": 2786 + }, + { + "epoch": 0.12241896000966003, + "grad_norm": 3.28125, + "learning_rate": 4.825153618096386e-05, + "loss": 0.6211, + "step": 2788 + }, + { + "epoch": 0.12250677848886352, + "grad_norm": 3.6875, + "learning_rate": 4.824899348551868e-05, + "loss": 0.5634, + "step": 2790 + }, + { + "epoch": 0.122594596968067, + "grad_norm": 2.71875, + "learning_rate": 4.8246449009660004e-05, + "loss": 0.5689, + "step": 2792 + }, + { + "epoch": 0.12268241544727049, + "grad_norm": 3.25, + "learning_rate": 4.8243902753582695e-05, + "loss": 0.6071, + "step": 2794 + }, + { + "epoch": 0.12277023392647397, + "grad_norm": 3.15625, + "learning_rate": 4.8241354717481734e-05, + "loss": 0.5847, + "step": 2796 + }, + { + "epoch": 0.12285805240567746, + "grad_norm": 2.671875, + "learning_rate": 4.823880490155226e-05, + "loss": 0.5787, + "step": 2798 + }, + { + "epoch": 0.12294587088488095, + "grad_norm": 2.90625, + "learning_rate": 4.823625330598953e-05, + "loss": 0.5927, + "step": 2800 + }, + { + "epoch": 0.12303368936408443, + "grad_norm": 3.4375, + "learning_rate": 4.823369993098896e-05, + "loss": 0.593, + "step": 2802 + }, + { + "epoch": 0.12312150784328793, + "grad_norm": 3.203125, + "learning_rate": 4.823114477674607e-05, + "loss": 0.5603, + "step": 2804 + }, + { + "epoch": 0.12320932632249142, + "grad_norm": 2.96875, + "learning_rate": 4.822858784345655e-05, + "loss": 0.5736, + "step": 2806 + }, + { + "epoch": 0.1232971448016949, + "grad_norm": 3.015625, + "learning_rate": 4.822602913131621e-05, + "loss": 0.594, + "step": 2808 + }, + { + "epoch": 0.12338496328089839, + "grad_norm": 2.84375, + "learning_rate": 4.8223468640520984e-05, + "loss": 0.5968, + "step": 2810 + }, + { + "epoch": 0.12347278176010187, + "grad_norm": 3.0625, + "learning_rate": 4.822090637126697e-05, + "loss": 0.5796, + "step": 2812 + }, + { + "epoch": 0.12356060023930536, + "grad_norm": 2.890625, + "learning_rate": 4.821834232375037e-05, + "loss": 0.5728, + "step": 2814 + }, + { + "epoch": 0.12364841871850885, + "grad_norm": 2.9375, + "learning_rate": 4.8215776498167555e-05, + "loss": 0.5684, + "step": 2816 + }, + { + "epoch": 0.12373623719771233, + "grad_norm": 2.859375, + "learning_rate": 4.8213208894715e-05, + "loss": 0.5754, + "step": 2818 + }, + { + "epoch": 0.12382405567691582, + "grad_norm": 2.78125, + "learning_rate": 4.821063951358936e-05, + "loss": 0.579, + "step": 2820 + }, + { + "epoch": 0.1239118741561193, + "grad_norm": 2.90625, + "learning_rate": 4.820806835498737e-05, + "loss": 0.6119, + "step": 2822 + }, + { + "epoch": 0.12399969263532279, + "grad_norm": 2.890625, + "learning_rate": 4.820549541910595e-05, + "loss": 0.5824, + "step": 2824 + }, + { + "epoch": 0.12408751111452627, + "grad_norm": 3.28125, + "learning_rate": 4.820292070614212e-05, + "loss": 0.5643, + "step": 2826 + }, + { + "epoch": 0.12417532959372976, + "grad_norm": 2.671875, + "learning_rate": 4.820034421629307e-05, + "loss": 0.5791, + "step": 2828 + }, + { + "epoch": 0.12426314807293325, + "grad_norm": 3.71875, + "learning_rate": 4.819776594975609e-05, + "loss": 0.5905, + "step": 2830 + }, + { + "epoch": 0.12435096655213673, + "grad_norm": 3.21875, + "learning_rate": 4.819518590672863e-05, + "loss": 0.6018, + "step": 2832 + }, + { + "epoch": 0.12443878503134022, + "grad_norm": 3.515625, + "learning_rate": 4.8192604087408285e-05, + "loss": 0.5928, + "step": 2834 + }, + { + "epoch": 0.1245266035105437, + "grad_norm": 4.15625, + "learning_rate": 4.819002049199276e-05, + "loss": 0.5817, + "step": 2836 + }, + { + "epoch": 0.12461442198974719, + "grad_norm": 2.765625, + "learning_rate": 4.818743512067989e-05, + "loss": 0.6143, + "step": 2838 + }, + { + "epoch": 0.12470224046895068, + "grad_norm": 3.21875, + "learning_rate": 4.8184847973667695e-05, + "loss": 0.5789, + "step": 2840 + }, + { + "epoch": 0.12479005894815416, + "grad_norm": 2.9375, + "learning_rate": 4.818225905115428e-05, + "loss": 0.5805, + "step": 2842 + }, + { + "epoch": 0.12487787742735765, + "grad_norm": 3.265625, + "learning_rate": 4.817966835333791e-05, + "loss": 0.5753, + "step": 2844 + }, + { + "epoch": 0.12496569590656113, + "grad_norm": 2.90625, + "learning_rate": 4.817707588041698e-05, + "loss": 0.5686, + "step": 2846 + }, + { + "epoch": 0.12505351438576462, + "grad_norm": 2.671875, + "learning_rate": 4.817448163259002e-05, + "loss": 0.5567, + "step": 2848 + }, + { + "epoch": 0.12514133286496812, + "grad_norm": 2.703125, + "learning_rate": 4.8171885610055705e-05, + "loss": 0.5747, + "step": 2850 + }, + { + "epoch": 0.1252291513441716, + "grad_norm": 3.0, + "learning_rate": 4.816928781301283e-05, + "loss": 0.5622, + "step": 2852 + }, + { + "epoch": 0.1253169698233751, + "grad_norm": 3.03125, + "learning_rate": 4.816668824166035e-05, + "loss": 0.5603, + "step": 2854 + }, + { + "epoch": 0.12540478830257856, + "grad_norm": 2.59375, + "learning_rate": 4.8164086896197325e-05, + "loss": 0.5682, + "step": 2856 + }, + { + "epoch": 0.12549260678178206, + "grad_norm": 3.09375, + "learning_rate": 4.816148377682297e-05, + "loss": 0.6089, + "step": 2858 + }, + { + "epoch": 0.12558042526098553, + "grad_norm": 2.859375, + "learning_rate": 4.8158878883736644e-05, + "loss": 0.5962, + "step": 2860 + }, + { + "epoch": 0.12566824374018903, + "grad_norm": 3.375, + "learning_rate": 4.815627221713781e-05, + "loss": 0.5576, + "step": 2862 + }, + { + "epoch": 0.1257560622193925, + "grad_norm": 2.84375, + "learning_rate": 4.815366377722611e-05, + "loss": 0.6009, + "step": 2864 + }, + { + "epoch": 0.125843880698596, + "grad_norm": 3.109375, + "learning_rate": 4.815105356420128e-05, + "loss": 0.5888, + "step": 2866 + }, + { + "epoch": 0.12593169917779948, + "grad_norm": 3.03125, + "learning_rate": 4.814844157826323e-05, + "loss": 0.5765, + "step": 2868 + }, + { + "epoch": 0.12601951765700298, + "grad_norm": 3.25, + "learning_rate": 4.814582781961195e-05, + "loss": 0.5808, + "step": 2870 + }, + { + "epoch": 0.12610733613620645, + "grad_norm": 3.515625, + "learning_rate": 4.814321228844765e-05, + "loss": 0.5683, + "step": 2872 + }, + { + "epoch": 0.12619515461540995, + "grad_norm": 3.25, + "learning_rate": 4.81405949849706e-05, + "loss": 0.5604, + "step": 2874 + }, + { + "epoch": 0.12628297309461342, + "grad_norm": 3.15625, + "learning_rate": 4.813797590938124e-05, + "loss": 0.5842, + "step": 2876 + }, + { + "epoch": 0.12637079157381692, + "grad_norm": 2.875, + "learning_rate": 4.8135355061880124e-05, + "loss": 0.577, + "step": 2878 + }, + { + "epoch": 0.12645861005302042, + "grad_norm": 4.0625, + "learning_rate": 4.813273244266799e-05, + "loss": 0.5972, + "step": 2880 + }, + { + "epoch": 0.1265464285322239, + "grad_norm": 4.6875, + "learning_rate": 4.8130108051945655e-05, + "loss": 0.5887, + "step": 2882 + }, + { + "epoch": 0.1266342470114274, + "grad_norm": 4.5625, + "learning_rate": 4.8127481889914096e-05, + "loss": 0.6029, + "step": 2884 + }, + { + "epoch": 0.12672206549063086, + "grad_norm": 3.609375, + "learning_rate": 4.812485395677443e-05, + "loss": 0.589, + "step": 2886 + }, + { + "epoch": 0.12680988396983436, + "grad_norm": 3.34375, + "learning_rate": 4.812222425272791e-05, + "loss": 0.5527, + "step": 2888 + }, + { + "epoch": 0.12689770244903784, + "grad_norm": 3.015625, + "learning_rate": 4.811959277797591e-05, + "loss": 0.6044, + "step": 2890 + }, + { + "epoch": 0.12698552092824134, + "grad_norm": 3.3125, + "learning_rate": 4.8116959532719954e-05, + "loss": 0.5792, + "step": 2892 + }, + { + "epoch": 0.1270733394074448, + "grad_norm": 3.734375, + "learning_rate": 4.81143245171617e-05, + "loss": 0.5781, + "step": 2894 + }, + { + "epoch": 0.1271611578866483, + "grad_norm": 2.8125, + "learning_rate": 4.8111687731502925e-05, + "loss": 0.5827, + "step": 2896 + }, + { + "epoch": 0.12724897636585178, + "grad_norm": 2.984375, + "learning_rate": 4.8109049175945566e-05, + "loss": 0.5314, + "step": 2898 + }, + { + "epoch": 0.12733679484505528, + "grad_norm": 3.140625, + "learning_rate": 4.810640885069169e-05, + "loss": 0.5687, + "step": 2900 + }, + { + "epoch": 0.12742461332425875, + "grad_norm": 3.0, + "learning_rate": 4.810376675594347e-05, + "loss": 0.5552, + "step": 2902 + }, + { + "epoch": 0.12751243180346225, + "grad_norm": 3.375, + "learning_rate": 4.8101122891903263e-05, + "loss": 0.5637, + "step": 2904 + }, + { + "epoch": 0.12760025028266572, + "grad_norm": 3.265625, + "learning_rate": 4.809847725877352e-05, + "loss": 0.5685, + "step": 2906 + }, + { + "epoch": 0.12768806876186922, + "grad_norm": 3.109375, + "learning_rate": 4.8095829856756866e-05, + "loss": 0.581, + "step": 2908 + }, + { + "epoch": 0.1277758872410727, + "grad_norm": 3.421875, + "learning_rate": 4.809318068605602e-05, + "loss": 0.5857, + "step": 2910 + }, + { + "epoch": 0.1278637057202762, + "grad_norm": 2.890625, + "learning_rate": 4.8090529746873845e-05, + "loss": 0.5687, + "step": 2912 + }, + { + "epoch": 0.12795152419947967, + "grad_norm": 2.796875, + "learning_rate": 4.8087877039413386e-05, + "loss": 0.5868, + "step": 2914 + }, + { + "epoch": 0.12803934267868317, + "grad_norm": 3.03125, + "learning_rate": 4.8085222563877766e-05, + "loss": 0.5768, + "step": 2916 + }, + { + "epoch": 0.12812716115788664, + "grad_norm": 2.9375, + "learning_rate": 4.808256632047026e-05, + "loss": 0.5833, + "step": 2918 + }, + { + "epoch": 0.12821497963709014, + "grad_norm": 2.640625, + "learning_rate": 4.80799083093943e-05, + "loss": 0.5633, + "step": 2920 + }, + { + "epoch": 0.1283027981162936, + "grad_norm": 3.296875, + "learning_rate": 4.8077248530853416e-05, + "loss": 0.5658, + "step": 2922 + }, + { + "epoch": 0.1283906165954971, + "grad_norm": 2.8125, + "learning_rate": 4.807458698505132e-05, + "loss": 0.6031, + "step": 2924 + }, + { + "epoch": 0.12847843507470058, + "grad_norm": 2.9375, + "learning_rate": 4.807192367219182e-05, + "loss": 0.5631, + "step": 2926 + }, + { + "epoch": 0.12856625355390408, + "grad_norm": 2.65625, + "learning_rate": 4.8069258592478875e-05, + "loss": 0.5809, + "step": 2928 + }, + { + "epoch": 0.12865407203310758, + "grad_norm": 2.59375, + "learning_rate": 4.8066591746116575e-05, + "loss": 0.5332, + "step": 2930 + }, + { + "epoch": 0.12874189051231105, + "grad_norm": 2.796875, + "learning_rate": 4.8063923133309144e-05, + "loss": 0.5767, + "step": 2932 + }, + { + "epoch": 0.12882970899151455, + "grad_norm": 3.5625, + "learning_rate": 4.8061252754260954e-05, + "loss": 0.5972, + "step": 2934 + }, + { + "epoch": 0.12891752747071802, + "grad_norm": 3.734375, + "learning_rate": 4.805858060917651e-05, + "loss": 0.5573, + "step": 2936 + }, + { + "epoch": 0.12900534594992152, + "grad_norm": 4.4375, + "learning_rate": 4.8055906698260424e-05, + "loss": 0.582, + "step": 2938 + }, + { + "epoch": 0.129093164429125, + "grad_norm": 4.09375, + "learning_rate": 4.805323102171748e-05, + "loss": 0.5729, + "step": 2940 + }, + { + "epoch": 0.1291809829083285, + "grad_norm": 4.40625, + "learning_rate": 4.805055357975257e-05, + "loss": 0.5866, + "step": 2942 + }, + { + "epoch": 0.12926880138753197, + "grad_norm": 4.59375, + "learning_rate": 4.804787437257075e-05, + "loss": 0.6013, + "step": 2944 + }, + { + "epoch": 0.12935661986673547, + "grad_norm": 4.59375, + "learning_rate": 4.8045193400377186e-05, + "loss": 0.5556, + "step": 2946 + }, + { + "epoch": 0.12944443834593894, + "grad_norm": 5.9375, + "learning_rate": 4.8042510663377184e-05, + "loss": 0.5914, + "step": 2948 + }, + { + "epoch": 0.12953225682514244, + "grad_norm": 4.375, + "learning_rate": 4.803982616177619e-05, + "loss": 0.5841, + "step": 2950 + }, + { + "epoch": 0.1296200753043459, + "grad_norm": 2.34375, + "learning_rate": 4.803713989577979e-05, + "loss": 0.5732, + "step": 2952 + }, + { + "epoch": 0.1297078937835494, + "grad_norm": 3.015625, + "learning_rate": 4.803445186559369e-05, + "loss": 0.5683, + "step": 2954 + }, + { + "epoch": 0.12979571226275288, + "grad_norm": 3.59375, + "learning_rate": 4.803176207142375e-05, + "loss": 0.5492, + "step": 2956 + }, + { + "epoch": 0.12988353074195638, + "grad_norm": 4.0625, + "learning_rate": 4.802907051347595e-05, + "loss": 0.5642, + "step": 2958 + }, + { + "epoch": 0.12997134922115985, + "grad_norm": 3.421875, + "learning_rate": 4.8026377191956404e-05, + "loss": 0.5762, + "step": 2960 + }, + { + "epoch": 0.13005916770036335, + "grad_norm": 3.625, + "learning_rate": 4.802368210707138e-05, + "loss": 0.5616, + "step": 2962 + }, + { + "epoch": 0.13014698617956683, + "grad_norm": 3.203125, + "learning_rate": 4.802098525902725e-05, + "loss": 0.557, + "step": 2964 + }, + { + "epoch": 0.13023480465877033, + "grad_norm": 2.875, + "learning_rate": 4.801828664803056e-05, + "loss": 0.5589, + "step": 2966 + }, + { + "epoch": 0.1303226231379738, + "grad_norm": 2.5, + "learning_rate": 4.8015586274287954e-05, + "loss": 0.5587, + "step": 2968 + }, + { + "epoch": 0.1304104416171773, + "grad_norm": 2.578125, + "learning_rate": 4.8012884138006236e-05, + "loss": 0.5536, + "step": 2970 + }, + { + "epoch": 0.13049826009638077, + "grad_norm": 2.53125, + "learning_rate": 4.8010180239392336e-05, + "loss": 0.5832, + "step": 2972 + }, + { + "epoch": 0.13058607857558427, + "grad_norm": 2.75, + "learning_rate": 4.8007474578653315e-05, + "loss": 0.5525, + "step": 2974 + }, + { + "epoch": 0.13067389705478774, + "grad_norm": 2.78125, + "learning_rate": 4.800476715599638e-05, + "loss": 0.554, + "step": 2976 + }, + { + "epoch": 0.13076171553399124, + "grad_norm": 2.640625, + "learning_rate": 4.800205797162885e-05, + "loss": 0.5811, + "step": 2978 + }, + { + "epoch": 0.1308495340131947, + "grad_norm": 2.890625, + "learning_rate": 4.7999347025758226e-05, + "loss": 0.5601, + "step": 2980 + }, + { + "epoch": 0.1309373524923982, + "grad_norm": 2.78125, + "learning_rate": 4.7996634318592084e-05, + "loss": 0.5816, + "step": 2982 + }, + { + "epoch": 0.1310251709716017, + "grad_norm": 2.9375, + "learning_rate": 4.7993919850338165e-05, + "loss": 0.5796, + "step": 2984 + }, + { + "epoch": 0.13111298945080518, + "grad_norm": 3.15625, + "learning_rate": 4.799120362120436e-05, + "loss": 0.5404, + "step": 2986 + }, + { + "epoch": 0.13120080793000868, + "grad_norm": 3.359375, + "learning_rate": 4.798848563139867e-05, + "loss": 0.5436, + "step": 2988 + }, + { + "epoch": 0.13128862640921216, + "grad_norm": 3.0, + "learning_rate": 4.7985765881129244e-05, + "loss": 0.5516, + "step": 2990 + }, + { + "epoch": 0.13137644488841566, + "grad_norm": 2.546875, + "learning_rate": 4.798304437060435e-05, + "loss": 0.5522, + "step": 2992 + }, + { + "epoch": 0.13146426336761913, + "grad_norm": 2.796875, + "learning_rate": 4.798032110003241e-05, + "loss": 0.5484, + "step": 2994 + }, + { + "epoch": 0.13155208184682263, + "grad_norm": 3.078125, + "learning_rate": 4.797759606962196e-05, + "loss": 0.5625, + "step": 2996 + }, + { + "epoch": 0.1316399003260261, + "grad_norm": 3.984375, + "learning_rate": 4.797486927958171e-05, + "loss": 0.5799, + "step": 2998 + }, + { + "epoch": 0.1317277188052296, + "grad_norm": 2.828125, + "learning_rate": 4.797214073012046e-05, + "loss": 0.5912, + "step": 3000 + }, + { + "epoch": 0.13181553728443307, + "grad_norm": 2.484375, + "learning_rate": 4.796941042144717e-05, + "loss": 0.5771, + "step": 3002 + }, + { + "epoch": 0.13190335576363657, + "grad_norm": 2.65625, + "learning_rate": 4.796667835377092e-05, + "loss": 0.5842, + "step": 3004 + }, + { + "epoch": 0.13199117424284004, + "grad_norm": 3.328125, + "learning_rate": 4.796394452730094e-05, + "loss": 0.5651, + "step": 3006 + }, + { + "epoch": 0.13207899272204354, + "grad_norm": 2.734375, + "learning_rate": 4.796120894224657e-05, + "loss": 0.5555, + "step": 3008 + }, + { + "epoch": 0.13216681120124701, + "grad_norm": 2.8125, + "learning_rate": 4.795847159881733e-05, + "loss": 0.5811, + "step": 3010 + }, + { + "epoch": 0.13225462968045051, + "grad_norm": 2.78125, + "learning_rate": 4.795573249722282e-05, + "loss": 0.5715, + "step": 3012 + }, + { + "epoch": 0.13234244815965399, + "grad_norm": 2.5, + "learning_rate": 4.795299163767282e-05, + "loss": 0.5762, + "step": 3014 + }, + { + "epoch": 0.13243026663885749, + "grad_norm": 2.984375, + "learning_rate": 4.7950249020377215e-05, + "loss": 0.5867, + "step": 3016 + }, + { + "epoch": 0.13251808511806096, + "grad_norm": 3.34375, + "learning_rate": 4.7947504645546034e-05, + "loss": 0.5801, + "step": 3018 + }, + { + "epoch": 0.13260590359726446, + "grad_norm": 3.203125, + "learning_rate": 4.794475851338946e-05, + "loss": 0.5964, + "step": 3020 + }, + { + "epoch": 0.13269372207646793, + "grad_norm": 2.921875, + "learning_rate": 4.794201062411777e-05, + "loss": 0.54, + "step": 3022 + }, + { + "epoch": 0.13278154055567143, + "grad_norm": 2.65625, + "learning_rate": 4.79392609779414e-05, + "loss": 0.5766, + "step": 3024 + }, + { + "epoch": 0.1328693590348749, + "grad_norm": 3.0, + "learning_rate": 4.7936509575070945e-05, + "loss": 0.6018, + "step": 3026 + }, + { + "epoch": 0.1329571775140784, + "grad_norm": 3.0, + "learning_rate": 4.793375641571707e-05, + "loss": 0.5358, + "step": 3028 + }, + { + "epoch": 0.13304499599328187, + "grad_norm": 3.109375, + "learning_rate": 4.793100150009064e-05, + "loss": 0.5788, + "step": 3030 + }, + { + "epoch": 0.13313281447248537, + "grad_norm": 3.015625, + "learning_rate": 4.7928244828402613e-05, + "loss": 0.5617, + "step": 3032 + }, + { + "epoch": 0.13322063295168887, + "grad_norm": 2.734375, + "learning_rate": 4.7925486400864104e-05, + "loss": 0.5712, + "step": 3034 + }, + { + "epoch": 0.13330845143089234, + "grad_norm": 2.765625, + "learning_rate": 4.7922726217686355e-05, + "loss": 0.5818, + "step": 3036 + }, + { + "epoch": 0.13339626991009584, + "grad_norm": 2.71875, + "learning_rate": 4.7919964279080724e-05, + "loss": 0.58, + "step": 3038 + }, + { + "epoch": 0.13348408838929932, + "grad_norm": 3.015625, + "learning_rate": 4.7917200585258746e-05, + "loss": 0.563, + "step": 3040 + }, + { + "epoch": 0.13357190686850282, + "grad_norm": 2.921875, + "learning_rate": 4.791443513643205e-05, + "loss": 0.5491, + "step": 3042 + }, + { + "epoch": 0.1336597253477063, + "grad_norm": 2.71875, + "learning_rate": 4.791166793281242e-05, + "loss": 0.5647, + "step": 3044 + }, + { + "epoch": 0.1337475438269098, + "grad_norm": 2.640625, + "learning_rate": 4.790889897461176e-05, + "loss": 0.5556, + "step": 3046 + }, + { + "epoch": 0.13383536230611326, + "grad_norm": 3.1875, + "learning_rate": 4.790612826204214e-05, + "loss": 0.5618, + "step": 3048 + }, + { + "epoch": 0.13392318078531676, + "grad_norm": 3.25, + "learning_rate": 4.7903355795315714e-05, + "loss": 0.5705, + "step": 3050 + }, + { + "epoch": 0.13401099926452023, + "grad_norm": 3.125, + "learning_rate": 4.790058157464481e-05, + "loss": 0.5471, + "step": 3052 + }, + { + "epoch": 0.13409881774372373, + "grad_norm": 2.921875, + "learning_rate": 4.789780560024188e-05, + "loss": 0.6094, + "step": 3054 + }, + { + "epoch": 0.1341866362229272, + "grad_norm": 2.6875, + "learning_rate": 4.789502787231952e-05, + "loss": 0.5525, + "step": 3056 + }, + { + "epoch": 0.1342744547021307, + "grad_norm": 2.609375, + "learning_rate": 4.7892248391090426e-05, + "loss": 0.5455, + "step": 3058 + }, + { + "epoch": 0.13436227318133417, + "grad_norm": 2.765625, + "learning_rate": 4.788946715676747e-05, + "loss": 0.5991, + "step": 3060 + }, + { + "epoch": 0.13445009166053767, + "grad_norm": 2.671875, + "learning_rate": 4.788668416956362e-05, + "loss": 0.5645, + "step": 3062 + }, + { + "epoch": 0.13453791013974115, + "grad_norm": 2.890625, + "learning_rate": 4.788389942969202e-05, + "loss": 0.5732, + "step": 3064 + }, + { + "epoch": 0.13462572861894465, + "grad_norm": 3.3125, + "learning_rate": 4.788111293736591e-05, + "loss": 0.5562, + "step": 3066 + }, + { + "epoch": 0.13471354709814812, + "grad_norm": 2.703125, + "learning_rate": 4.7878324692798694e-05, + "loss": 0.5768, + "step": 3068 + }, + { + "epoch": 0.13480136557735162, + "grad_norm": 2.5, + "learning_rate": 4.787553469620388e-05, + "loss": 0.5782, + "step": 3070 + }, + { + "epoch": 0.1348891840565551, + "grad_norm": 2.703125, + "learning_rate": 4.787274294779515e-05, + "loss": 0.5552, + "step": 3072 + }, + { + "epoch": 0.1349770025357586, + "grad_norm": 2.828125, + "learning_rate": 4.7869949447786266e-05, + "loss": 0.5638, + "step": 3074 + }, + { + "epoch": 0.13506482101496206, + "grad_norm": 2.828125, + "learning_rate": 4.7867154196391184e-05, + "loss": 0.5941, + "step": 3076 + }, + { + "epoch": 0.13515263949416556, + "grad_norm": 2.65625, + "learning_rate": 4.786435719382394e-05, + "loss": 0.5407, + "step": 3078 + }, + { + "epoch": 0.13524045797336903, + "grad_norm": 3.109375, + "learning_rate": 4.7861558440298745e-05, + "loss": 0.5807, + "step": 3080 + }, + { + "epoch": 0.13532827645257253, + "grad_norm": 3.296875, + "learning_rate": 4.785875793602993e-05, + "loss": 0.5439, + "step": 3082 + }, + { + "epoch": 0.13541609493177603, + "grad_norm": 2.765625, + "learning_rate": 4.785595568123195e-05, + "loss": 0.5342, + "step": 3084 + }, + { + "epoch": 0.1355039134109795, + "grad_norm": 2.90625, + "learning_rate": 4.78531516761194e-05, + "loss": 0.5854, + "step": 3086 + }, + { + "epoch": 0.135591731890183, + "grad_norm": 2.96875, + "learning_rate": 4.785034592090702e-05, + "loss": 0.5676, + "step": 3088 + }, + { + "epoch": 0.13567955036938648, + "grad_norm": 3.203125, + "learning_rate": 4.784753841580967e-05, + "loss": 0.5573, + "step": 3090 + }, + { + "epoch": 0.13576736884858998, + "grad_norm": 3.375, + "learning_rate": 4.7844729161042355e-05, + "loss": 0.567, + "step": 3092 + }, + { + "epoch": 0.13585518732779345, + "grad_norm": 3.046875, + "learning_rate": 4.78419181568202e-05, + "loss": 0.5619, + "step": 3094 + }, + { + "epoch": 0.13594300580699695, + "grad_norm": 2.84375, + "learning_rate": 4.783910540335848e-05, + "loss": 0.5454, + "step": 3096 + }, + { + "epoch": 0.13603082428620042, + "grad_norm": 3.4375, + "learning_rate": 4.783629090087259e-05, + "loss": 0.5401, + "step": 3098 + }, + { + "epoch": 0.13611864276540392, + "grad_norm": 4.375, + "learning_rate": 4.783347464957807e-05, + "loss": 0.5515, + "step": 3100 + }, + { + "epoch": 0.1362064612446074, + "grad_norm": 3.9375, + "learning_rate": 4.783065664969059e-05, + "loss": 0.5766, + "step": 3102 + }, + { + "epoch": 0.1362942797238109, + "grad_norm": 4.3125, + "learning_rate": 4.782783690142595e-05, + "loss": 0.5804, + "step": 3104 + }, + { + "epoch": 0.13638209820301436, + "grad_norm": 3.96875, + "learning_rate": 4.782501540500009e-05, + "loss": 0.5908, + "step": 3106 + }, + { + "epoch": 0.13646991668221786, + "grad_norm": 3.15625, + "learning_rate": 4.7822192160629074e-05, + "loss": 0.5553, + "step": 3108 + }, + { + "epoch": 0.13655773516142133, + "grad_norm": 3.796875, + "learning_rate": 4.781936716852912e-05, + "loss": 0.5416, + "step": 3110 + }, + { + "epoch": 0.13664555364062483, + "grad_norm": 2.8125, + "learning_rate": 4.781654042891655e-05, + "loss": 0.5252, + "step": 3112 + }, + { + "epoch": 0.1367333721198283, + "grad_norm": 2.578125, + "learning_rate": 4.781371194200784e-05, + "loss": 0.5582, + "step": 3114 + }, + { + "epoch": 0.1368211905990318, + "grad_norm": 2.59375, + "learning_rate": 4.781088170801961e-05, + "loss": 0.5422, + "step": 3116 + }, + { + "epoch": 0.13690900907823528, + "grad_norm": 3.0625, + "learning_rate": 4.780804972716859e-05, + "loss": 0.5477, + "step": 3118 + }, + { + "epoch": 0.13699682755743878, + "grad_norm": 2.484375, + "learning_rate": 4.780521599967165e-05, + "loss": 0.567, + "step": 3120 + }, + { + "epoch": 0.13708464603664225, + "grad_norm": 3.0625, + "learning_rate": 4.78023805257458e-05, + "loss": 0.5809, + "step": 3122 + }, + { + "epoch": 0.13717246451584575, + "grad_norm": 2.75, + "learning_rate": 4.77995433056082e-05, + "loss": 0.5676, + "step": 3124 + }, + { + "epoch": 0.13726028299504922, + "grad_norm": 2.71875, + "learning_rate": 4.779670433947608e-05, + "loss": 0.5617, + "step": 3126 + }, + { + "epoch": 0.13734810147425272, + "grad_norm": 2.828125, + "learning_rate": 4.77938636275669e-05, + "loss": 0.5592, + "step": 3128 + }, + { + "epoch": 0.1374359199534562, + "grad_norm": 2.984375, + "learning_rate": 4.779102117009817e-05, + "loss": 0.5645, + "step": 3130 + }, + { + "epoch": 0.1375237384326597, + "grad_norm": 2.609375, + "learning_rate": 4.778817696728758e-05, + "loss": 0.5756, + "step": 3132 + }, + { + "epoch": 0.1376115569118632, + "grad_norm": 3.09375, + "learning_rate": 4.778533101935293e-05, + "loss": 0.5531, + "step": 3134 + }, + { + "epoch": 0.13769937539106666, + "grad_norm": 2.953125, + "learning_rate": 4.778248332651217e-05, + "loss": 0.588, + "step": 3136 + }, + { + "epoch": 0.13778719387027016, + "grad_norm": 2.96875, + "learning_rate": 4.7779633888983375e-05, + "loss": 0.5559, + "step": 3138 + }, + { + "epoch": 0.13787501234947364, + "grad_norm": 3.015625, + "learning_rate": 4.7776782706984754e-05, + "loss": 0.5385, + "step": 3140 + }, + { + "epoch": 0.13796283082867714, + "grad_norm": 2.65625, + "learning_rate": 4.777392978073466e-05, + "loss": 0.5593, + "step": 3142 + }, + { + "epoch": 0.1380506493078806, + "grad_norm": 2.90625, + "learning_rate": 4.777107511045157e-05, + "loss": 0.5674, + "step": 3144 + }, + { + "epoch": 0.1381384677870841, + "grad_norm": 3.34375, + "learning_rate": 4.776821869635407e-05, + "loss": 0.5298, + "step": 3146 + }, + { + "epoch": 0.13822628626628758, + "grad_norm": 3.34375, + "learning_rate": 4.776536053866094e-05, + "loss": 0.5588, + "step": 3148 + }, + { + "epoch": 0.13831410474549108, + "grad_norm": 2.9375, + "learning_rate": 4.7762500637591036e-05, + "loss": 0.5656, + "step": 3150 + }, + { + "epoch": 0.13840192322469455, + "grad_norm": 2.65625, + "learning_rate": 4.775963899336338e-05, + "loss": 0.5465, + "step": 3152 + }, + { + "epoch": 0.13848974170389805, + "grad_norm": 2.890625, + "learning_rate": 4.7756775606197114e-05, + "loss": 0.556, + "step": 3154 + }, + { + "epoch": 0.13857756018310152, + "grad_norm": 2.859375, + "learning_rate": 4.775391047631151e-05, + "loss": 0.5616, + "step": 3156 + }, + { + "epoch": 0.13866537866230502, + "grad_norm": 3.09375, + "learning_rate": 4.7751043603925996e-05, + "loss": 0.5721, + "step": 3158 + }, + { + "epoch": 0.1387531971415085, + "grad_norm": 3.1875, + "learning_rate": 4.77481749892601e-05, + "loss": 0.5668, + "step": 3160 + }, + { + "epoch": 0.138841015620712, + "grad_norm": 3.0625, + "learning_rate": 4.774530463253352e-05, + "loss": 0.5333, + "step": 3162 + }, + { + "epoch": 0.13892883409991547, + "grad_norm": 2.984375, + "learning_rate": 4.774243253396605e-05, + "loss": 0.5342, + "step": 3164 + }, + { + "epoch": 0.13901665257911897, + "grad_norm": 3.171875, + "learning_rate": 4.7739558693777654e-05, + "loss": 0.5521, + "step": 3166 + }, + { + "epoch": 0.13910447105832244, + "grad_norm": 3.796875, + "learning_rate": 4.7736683112188396e-05, + "loss": 0.5404, + "step": 3168 + }, + { + "epoch": 0.13919228953752594, + "grad_norm": 3.921875, + "learning_rate": 4.77338057894185e-05, + "loss": 0.5795, + "step": 3170 + }, + { + "epoch": 0.1392801080167294, + "grad_norm": 4.75, + "learning_rate": 4.773092672568829e-05, + "loss": 0.5364, + "step": 3172 + }, + { + "epoch": 0.1393679264959329, + "grad_norm": 5.125, + "learning_rate": 4.7728045921218286e-05, + "loss": 0.5539, + "step": 3174 + }, + { + "epoch": 0.13945574497513638, + "grad_norm": 4.59375, + "learning_rate": 4.7725163376229064e-05, + "loss": 0.5781, + "step": 3176 + }, + { + "epoch": 0.13954356345433988, + "grad_norm": 3.5, + "learning_rate": 4.772227909094139e-05, + "loss": 0.572, + "step": 3178 + }, + { + "epoch": 0.13963138193354335, + "grad_norm": 3.59375, + "learning_rate": 4.771939306557613e-05, + "loss": 0.5433, + "step": 3180 + }, + { + "epoch": 0.13971920041274685, + "grad_norm": 2.625, + "learning_rate": 4.77165053003543e-05, + "loss": 0.5494, + "step": 3182 + }, + { + "epoch": 0.13980701889195035, + "grad_norm": 2.765625, + "learning_rate": 4.7713615795497055e-05, + "loss": 0.5664, + "step": 3184 + }, + { + "epoch": 0.13989483737115382, + "grad_norm": 2.984375, + "learning_rate": 4.771072455122567e-05, + "loss": 0.5408, + "step": 3186 + }, + { + "epoch": 0.13998265585035732, + "grad_norm": 2.765625, + "learning_rate": 4.770783156776155e-05, + "loss": 0.5764, + "step": 3188 + }, + { + "epoch": 0.1400704743295608, + "grad_norm": 3.109375, + "learning_rate": 4.770493684532624e-05, + "loss": 0.5377, + "step": 3190 + }, + { + "epoch": 0.1401582928087643, + "grad_norm": 3.078125, + "learning_rate": 4.770204038414143e-05, + "loss": 0.5654, + "step": 3192 + }, + { + "epoch": 0.14024611128796777, + "grad_norm": 2.65625, + "learning_rate": 4.769914218442892e-05, + "loss": 0.5729, + "step": 3194 + }, + { + "epoch": 0.14033392976717127, + "grad_norm": 2.53125, + "learning_rate": 4.7696242246410674e-05, + "loss": 0.5623, + "step": 3196 + }, + { + "epoch": 0.14042174824637474, + "grad_norm": 3.03125, + "learning_rate": 4.769334057030874e-05, + "loss": 0.5485, + "step": 3198 + }, + { + "epoch": 0.14050956672557824, + "grad_norm": 3.140625, + "learning_rate": 4.7690437156345356e-05, + "loss": 0.5654, + "step": 3200 + }, + { + "epoch": 0.1405973852047817, + "grad_norm": 2.859375, + "learning_rate": 4.768753200474285e-05, + "loss": 0.5322, + "step": 3202 + }, + { + "epoch": 0.1406852036839852, + "grad_norm": 2.46875, + "learning_rate": 4.768462511572371e-05, + "loss": 0.5448, + "step": 3204 + }, + { + "epoch": 0.14077302216318868, + "grad_norm": 2.8125, + "learning_rate": 4.768171648951054e-05, + "loss": 0.5783, + "step": 3206 + }, + { + "epoch": 0.14086084064239218, + "grad_norm": 3.25, + "learning_rate": 4.767880612632608e-05, + "loss": 0.5502, + "step": 3208 + }, + { + "epoch": 0.14094865912159565, + "grad_norm": 3.28125, + "learning_rate": 4.767589402639321e-05, + "loss": 0.5776, + "step": 3210 + }, + { + "epoch": 0.14103647760079915, + "grad_norm": 3.09375, + "learning_rate": 4.7672980189934935e-05, + "loss": 0.5423, + "step": 3212 + }, + { + "epoch": 0.14112429608000263, + "grad_norm": 2.828125, + "learning_rate": 4.7670064617174414e-05, + "loss": 0.5671, + "step": 3214 + }, + { + "epoch": 0.14121211455920613, + "grad_norm": 2.859375, + "learning_rate": 4.7667147308334906e-05, + "loss": 0.5757, + "step": 3216 + }, + { + "epoch": 0.1412999330384096, + "grad_norm": 2.859375, + "learning_rate": 4.766422826363982e-05, + "loss": 0.5452, + "step": 3218 + }, + { + "epoch": 0.1413877515176131, + "grad_norm": 2.515625, + "learning_rate": 4.76613074833127e-05, + "loss": 0.5573, + "step": 3220 + }, + { + "epoch": 0.14147556999681657, + "grad_norm": 2.765625, + "learning_rate": 4.765838496757722e-05, + "loss": 0.5512, + "step": 3222 + }, + { + "epoch": 0.14156338847602007, + "grad_norm": 3.390625, + "learning_rate": 4.765546071665719e-05, + "loss": 0.5689, + "step": 3224 + }, + { + "epoch": 0.14165120695522354, + "grad_norm": 3.34375, + "learning_rate": 4.765253473077655e-05, + "loss": 0.5606, + "step": 3226 + }, + { + "epoch": 0.14173902543442704, + "grad_norm": 2.765625, + "learning_rate": 4.764960701015937e-05, + "loss": 0.5573, + "step": 3228 + }, + { + "epoch": 0.1418268439136305, + "grad_norm": 2.921875, + "learning_rate": 4.764667755502985e-05, + "loss": 0.5643, + "step": 3230 + }, + { + "epoch": 0.141914662392834, + "grad_norm": 3.34375, + "learning_rate": 4.764374636561234e-05, + "loss": 0.5225, + "step": 3232 + }, + { + "epoch": 0.1420024808720375, + "grad_norm": 2.953125, + "learning_rate": 4.76408134421313e-05, + "loss": 0.5291, + "step": 3234 + }, + { + "epoch": 0.14209029935124098, + "grad_norm": 2.984375, + "learning_rate": 4.7637878784811343e-05, + "loss": 0.5567, + "step": 3236 + }, + { + "epoch": 0.14217811783044448, + "grad_norm": 2.453125, + "learning_rate": 4.76349423938772e-05, + "loss": 0.5848, + "step": 3238 + }, + { + "epoch": 0.14226593630964796, + "grad_norm": 2.8125, + "learning_rate": 4.7632004269553746e-05, + "loss": 0.5553, + "step": 3240 + }, + { + "epoch": 0.14235375478885146, + "grad_norm": 3.65625, + "learning_rate": 4.762906441206597e-05, + "loss": 0.5725, + "step": 3242 + }, + { + "epoch": 0.14244157326805493, + "grad_norm": 2.765625, + "learning_rate": 4.762612282163903e-05, + "loss": 0.5571, + "step": 3244 + }, + { + "epoch": 0.14252939174725843, + "grad_norm": 2.9375, + "learning_rate": 4.762317949849817e-05, + "loss": 0.5418, + "step": 3246 + }, + { + "epoch": 0.1426172102264619, + "grad_norm": 2.734375, + "learning_rate": 4.7620234442868806e-05, + "loss": 0.5296, + "step": 3248 + }, + { + "epoch": 0.1427050287056654, + "grad_norm": 2.90625, + "learning_rate": 4.7617287654976466e-05, + "loss": 0.5702, + "step": 3250 + }, + { + "epoch": 0.14279284718486887, + "grad_norm": 2.96875, + "learning_rate": 4.7614339135046816e-05, + "loss": 0.5605, + "step": 3252 + }, + { + "epoch": 0.14288066566407237, + "grad_norm": 2.96875, + "learning_rate": 4.761138888330565e-05, + "loss": 0.546, + "step": 3254 + }, + { + "epoch": 0.14296848414327584, + "grad_norm": 2.953125, + "learning_rate": 4.760843689997891e-05, + "loss": 0.5669, + "step": 3256 + }, + { + "epoch": 0.14305630262247934, + "grad_norm": 2.875, + "learning_rate": 4.760548318529265e-05, + "loss": 0.5651, + "step": 3258 + }, + { + "epoch": 0.14314412110168281, + "grad_norm": 2.984375, + "learning_rate": 4.760252773947307e-05, + "loss": 0.5565, + "step": 3260 + }, + { + "epoch": 0.1432319395808863, + "grad_norm": 2.546875, + "learning_rate": 4.7599570562746486e-05, + "loss": 0.5608, + "step": 3262 + }, + { + "epoch": 0.14331975806008979, + "grad_norm": 2.921875, + "learning_rate": 4.7596611655339384e-05, + "loss": 0.5184, + "step": 3264 + }, + { + "epoch": 0.14340757653929329, + "grad_norm": 3.328125, + "learning_rate": 4.759365101747833e-05, + "loss": 0.5226, + "step": 3266 + }, + { + "epoch": 0.14349539501849676, + "grad_norm": 3.046875, + "learning_rate": 4.759068864939008e-05, + "loss": 0.5637, + "step": 3268 + }, + { + "epoch": 0.14358321349770026, + "grad_norm": 3.734375, + "learning_rate": 4.7587724551301474e-05, + "loss": 0.5445, + "step": 3270 + }, + { + "epoch": 0.14367103197690373, + "grad_norm": 3.203125, + "learning_rate": 4.758475872343951e-05, + "loss": 0.5533, + "step": 3272 + }, + { + "epoch": 0.14375885045610723, + "grad_norm": 2.796875, + "learning_rate": 4.7581791166031307e-05, + "loss": 0.5308, + "step": 3274 + }, + { + "epoch": 0.1438466689353107, + "grad_norm": 2.484375, + "learning_rate": 4.757882187930412e-05, + "loss": 0.5278, + "step": 3276 + }, + { + "epoch": 0.1439344874145142, + "grad_norm": 3.15625, + "learning_rate": 4.7575850863485345e-05, + "loss": 0.5135, + "step": 3278 + }, + { + "epoch": 0.14402230589371767, + "grad_norm": 2.953125, + "learning_rate": 4.7572878118802496e-05, + "loss": 0.5093, + "step": 3280 + }, + { + "epoch": 0.14411012437292117, + "grad_norm": 3.484375, + "learning_rate": 4.756990364548323e-05, + "loss": 0.5549, + "step": 3282 + }, + { + "epoch": 0.14419794285212467, + "grad_norm": 3.390625, + "learning_rate": 4.7566927443755324e-05, + "loss": 0.5789, + "step": 3284 + }, + { + "epoch": 0.14428576133132814, + "grad_norm": 2.328125, + "learning_rate": 4.756394951384672e-05, + "loss": 0.533, + "step": 3286 + }, + { + "epoch": 0.14437357981053164, + "grad_norm": 2.9375, + "learning_rate": 4.756096985598545e-05, + "loss": 0.5552, + "step": 3288 + }, + { + "epoch": 0.14446139828973512, + "grad_norm": 2.59375, + "learning_rate": 4.7557988470399695e-05, + "loss": 0.5436, + "step": 3290 + }, + { + "epoch": 0.14454921676893862, + "grad_norm": 2.71875, + "learning_rate": 4.7555005357317774e-05, + "loss": 0.5551, + "step": 3292 + }, + { + "epoch": 0.1446370352481421, + "grad_norm": 3.0, + "learning_rate": 4.7552020516968144e-05, + "loss": 0.5224, + "step": 3294 + }, + { + "epoch": 0.1447248537273456, + "grad_norm": 2.75, + "learning_rate": 4.754903394957937e-05, + "loss": 0.5368, + "step": 3296 + }, + { + "epoch": 0.14481267220654906, + "grad_norm": 2.78125, + "learning_rate": 4.7546045655380174e-05, + "loss": 0.5495, + "step": 3298 + }, + { + "epoch": 0.14490049068575256, + "grad_norm": 2.375, + "learning_rate": 4.7543055634599394e-05, + "loss": 0.5518, + "step": 3300 + }, + { + "epoch": 0.14498830916495603, + "grad_norm": 2.484375, + "learning_rate": 4.754006388746601e-05, + "loss": 0.5558, + "step": 3302 + }, + { + "epoch": 0.14507612764415953, + "grad_norm": 2.40625, + "learning_rate": 4.7537070414209134e-05, + "loss": 0.5431, + "step": 3304 + }, + { + "epoch": 0.145163946123363, + "grad_norm": 2.90625, + "learning_rate": 4.7534075215058e-05, + "loss": 0.5392, + "step": 3306 + }, + { + "epoch": 0.1452517646025665, + "grad_norm": 2.578125, + "learning_rate": 4.753107829024198e-05, + "loss": 0.5357, + "step": 3308 + }, + { + "epoch": 0.14533958308176997, + "grad_norm": 2.46875, + "learning_rate": 4.7528079639990596e-05, + "loss": 0.5398, + "step": 3310 + }, + { + "epoch": 0.14542740156097347, + "grad_norm": 2.640625, + "learning_rate": 4.7525079264533464e-05, + "loss": 0.5326, + "step": 3312 + }, + { + "epoch": 0.14551522004017695, + "grad_norm": 3.125, + "learning_rate": 4.752207716410036e-05, + "loss": 0.5444, + "step": 3314 + }, + { + "epoch": 0.14560303851938045, + "grad_norm": 2.640625, + "learning_rate": 4.7519073338921196e-05, + "loss": 0.5697, + "step": 3316 + }, + { + "epoch": 0.14569085699858392, + "grad_norm": 2.59375, + "learning_rate": 4.751606778922599e-05, + "loss": 0.551, + "step": 3318 + }, + { + "epoch": 0.14577867547778742, + "grad_norm": 2.703125, + "learning_rate": 4.751306051524492e-05, + "loss": 0.5416, + "step": 3320 + }, + { + "epoch": 0.1458664939569909, + "grad_norm": 3.359375, + "learning_rate": 4.7510051517208276e-05, + "loss": 0.5223, + "step": 3322 + }, + { + "epoch": 0.1459543124361944, + "grad_norm": 4.34375, + "learning_rate": 4.750704079534649e-05, + "loss": 0.5606, + "step": 3324 + }, + { + "epoch": 0.14604213091539786, + "grad_norm": 3.921875, + "learning_rate": 4.750402834989013e-05, + "loss": 0.5694, + "step": 3326 + }, + { + "epoch": 0.14612994939460136, + "grad_norm": 3.578125, + "learning_rate": 4.7501014181069884e-05, + "loss": 0.5552, + "step": 3328 + }, + { + "epoch": 0.14621776787380483, + "grad_norm": 3.640625, + "learning_rate": 4.749799828911657e-05, + "loss": 0.5606, + "step": 3330 + }, + { + "epoch": 0.14630558635300833, + "grad_norm": 3.0625, + "learning_rate": 4.749498067426116e-05, + "loss": 0.5545, + "step": 3332 + }, + { + "epoch": 0.14639340483221183, + "grad_norm": 2.6875, + "learning_rate": 4.7491961336734735e-05, + "loss": 0.5326, + "step": 3334 + }, + { + "epoch": 0.1464812233114153, + "grad_norm": 2.546875, + "learning_rate": 4.7488940276768525e-05, + "loss": 0.5556, + "step": 3336 + }, + { + "epoch": 0.1465690417906188, + "grad_norm": 3.328125, + "learning_rate": 4.7485917494593866e-05, + "loss": 0.5552, + "step": 3338 + }, + { + "epoch": 0.14665686026982228, + "grad_norm": 2.59375, + "learning_rate": 4.748289299044226e-05, + "loss": 0.5554, + "step": 3340 + }, + { + "epoch": 0.14674467874902578, + "grad_norm": 3.09375, + "learning_rate": 4.747986676454533e-05, + "loss": 0.5636, + "step": 3342 + }, + { + "epoch": 0.14683249722822925, + "grad_norm": 3.109375, + "learning_rate": 4.747683881713481e-05, + "loss": 0.5333, + "step": 3344 + }, + { + "epoch": 0.14692031570743275, + "grad_norm": 2.859375, + "learning_rate": 4.747380914844257e-05, + "loss": 0.5331, + "step": 3346 + }, + { + "epoch": 0.14700813418663622, + "grad_norm": 2.5625, + "learning_rate": 4.7470777758700655e-05, + "loss": 0.5181, + "step": 3348 + }, + { + "epoch": 0.14709595266583972, + "grad_norm": 3.078125, + "learning_rate": 4.746774464814119e-05, + "loss": 0.5348, + "step": 3350 + }, + { + "epoch": 0.1471837711450432, + "grad_norm": 3.03125, + "learning_rate": 4.7464709816996445e-05, + "loss": 0.5384, + "step": 3352 + }, + { + "epoch": 0.1472715896242467, + "grad_norm": 2.78125, + "learning_rate": 4.746167326549884e-05, + "loss": 0.5614, + "step": 3354 + }, + { + "epoch": 0.14735940810345016, + "grad_norm": 3.078125, + "learning_rate": 4.745863499388092e-05, + "loss": 0.5327, + "step": 3356 + }, + { + "epoch": 0.14744722658265366, + "grad_norm": 2.890625, + "learning_rate": 4.7455595002375344e-05, + "loss": 0.5381, + "step": 3358 + }, + { + "epoch": 0.14753504506185713, + "grad_norm": 3.6875, + "learning_rate": 4.745255329121492e-05, + "loss": 0.5582, + "step": 3360 + }, + { + "epoch": 0.14762286354106063, + "grad_norm": 4.28125, + "learning_rate": 4.744950986063258e-05, + "loss": 0.5844, + "step": 3362 + }, + { + "epoch": 0.1477106820202641, + "grad_norm": 3.390625, + "learning_rate": 4.744646471086139e-05, + "loss": 0.525, + "step": 3364 + }, + { + "epoch": 0.1477985004994676, + "grad_norm": 3.1875, + "learning_rate": 4.744341784213456e-05, + "loss": 0.5444, + "step": 3366 + }, + { + "epoch": 0.14788631897867108, + "grad_norm": 2.6875, + "learning_rate": 4.744036925468541e-05, + "loss": 0.5375, + "step": 3368 + }, + { + "epoch": 0.14797413745787458, + "grad_norm": 2.796875, + "learning_rate": 4.74373189487474e-05, + "loss": 0.5399, + "step": 3370 + }, + { + "epoch": 0.14806195593707805, + "grad_norm": 3.4375, + "learning_rate": 4.743426692455413e-05, + "loss": 0.5581, + "step": 3372 + }, + { + "epoch": 0.14814977441628155, + "grad_norm": 3.71875, + "learning_rate": 4.7431213182339315e-05, + "loss": 0.5768, + "step": 3374 + }, + { + "epoch": 0.14823759289548502, + "grad_norm": 3.375, + "learning_rate": 4.742815772233682e-05, + "loss": 0.5311, + "step": 3376 + }, + { + "epoch": 0.14832541137468852, + "grad_norm": 3.25, + "learning_rate": 4.742510054478063e-05, + "loss": 0.539, + "step": 3378 + }, + { + "epoch": 0.148413229853892, + "grad_norm": 3.375, + "learning_rate": 4.7422041649904867e-05, + "loss": 0.5256, + "step": 3380 + }, + { + "epoch": 0.1485010483330955, + "grad_norm": 2.859375, + "learning_rate": 4.7418981037943785e-05, + "loss": 0.5492, + "step": 3382 + }, + { + "epoch": 0.14858886681229896, + "grad_norm": 3.34375, + "learning_rate": 4.741591870913175e-05, + "loss": 0.5769, + "step": 3384 + }, + { + "epoch": 0.14867668529150246, + "grad_norm": 2.921875, + "learning_rate": 4.741285466370329e-05, + "loss": 0.5478, + "step": 3386 + }, + { + "epoch": 0.14876450377070596, + "grad_norm": 3.046875, + "learning_rate": 4.740978890189305e-05, + "loss": 0.5265, + "step": 3388 + }, + { + "epoch": 0.14885232224990944, + "grad_norm": 2.8125, + "learning_rate": 4.740672142393581e-05, + "loss": 0.5097, + "step": 3390 + }, + { + "epoch": 0.14894014072911294, + "grad_norm": 2.46875, + "learning_rate": 4.740365223006646e-05, + "loss": 0.5272, + "step": 3392 + }, + { + "epoch": 0.1490279592083164, + "grad_norm": 3.0, + "learning_rate": 4.7400581320520055e-05, + "loss": 0.5462, + "step": 3394 + }, + { + "epoch": 0.1491157776875199, + "grad_norm": 2.421875, + "learning_rate": 4.7397508695531764e-05, + "loss": 0.5324, + "step": 3396 + }, + { + "epoch": 0.14920359616672338, + "grad_norm": 2.859375, + "learning_rate": 4.739443435533689e-05, + "loss": 0.5603, + "step": 3398 + }, + { + "epoch": 0.14929141464592688, + "grad_norm": 2.515625, + "learning_rate": 4.7391358300170865e-05, + "loss": 0.5247, + "step": 3400 + }, + { + "epoch": 0.14937923312513035, + "grad_norm": 3.046875, + "learning_rate": 4.738828053026925e-05, + "loss": 0.5094, + "step": 3402 + }, + { + "epoch": 0.14946705160433385, + "grad_norm": 2.828125, + "learning_rate": 4.7385201045867747e-05, + "loss": 0.5529, + "step": 3404 + }, + { + "epoch": 0.14955487008353732, + "grad_norm": 2.828125, + "learning_rate": 4.738211984720218e-05, + "loss": 0.5386, + "step": 3406 + }, + { + "epoch": 0.14964268856274082, + "grad_norm": 2.640625, + "learning_rate": 4.7379036934508506e-05, + "loss": 0.5707, + "step": 3408 + }, + { + "epoch": 0.1497305070419443, + "grad_norm": 2.75, + "learning_rate": 4.7375952308022824e-05, + "loss": 0.5598, + "step": 3410 + }, + { + "epoch": 0.1498183255211478, + "grad_norm": 3.65625, + "learning_rate": 4.737286596798135e-05, + "loss": 0.5739, + "step": 3412 + }, + { + "epoch": 0.14990614400035127, + "grad_norm": 3.734375, + "learning_rate": 4.7369777914620436e-05, + "loss": 0.5426, + "step": 3414 + }, + { + "epoch": 0.14999396247955477, + "grad_norm": 2.625, + "learning_rate": 4.736668814817657e-05, + "loss": 0.5247, + "step": 3416 + }, + { + "epoch": 0.15008178095875824, + "grad_norm": 2.828125, + "learning_rate": 4.7363596668886364e-05, + "loss": 0.5513, + "step": 3418 + }, + { + "epoch": 0.15016959943796174, + "grad_norm": 2.96875, + "learning_rate": 4.736050347698656e-05, + "loss": 0.5728, + "step": 3420 + }, + { + "epoch": 0.1502574179171652, + "grad_norm": 3.0625, + "learning_rate": 4.7357408572714046e-05, + "loss": 0.554, + "step": 3422 + }, + { + "epoch": 0.1503452363963687, + "grad_norm": 2.96875, + "learning_rate": 4.735431195630582e-05, + "loss": 0.5098, + "step": 3424 + }, + { + "epoch": 0.15043305487557218, + "grad_norm": 2.390625, + "learning_rate": 4.7351213627999027e-05, + "loss": 0.5609, + "step": 3426 + }, + { + "epoch": 0.15052087335477568, + "grad_norm": 3.0, + "learning_rate": 4.734811358803093e-05, + "loss": 0.5503, + "step": 3428 + }, + { + "epoch": 0.15060869183397915, + "grad_norm": 3.046875, + "learning_rate": 4.734501183663894e-05, + "loss": 0.5682, + "step": 3430 + }, + { + "epoch": 0.15069651031318265, + "grad_norm": 2.828125, + "learning_rate": 4.7341908374060595e-05, + "loss": 0.5374, + "step": 3432 + }, + { + "epoch": 0.15078432879238612, + "grad_norm": 2.90625, + "learning_rate": 4.733880320053354e-05, + "loss": 0.5598, + "step": 3434 + }, + { + "epoch": 0.15087214727158962, + "grad_norm": 2.828125, + "learning_rate": 4.733569631629559e-05, + "loss": 0.5408, + "step": 3436 + }, + { + "epoch": 0.15095996575079312, + "grad_norm": 2.875, + "learning_rate": 4.7332587721584656e-05, + "loss": 0.5214, + "step": 3438 + }, + { + "epoch": 0.1510477842299966, + "grad_norm": 2.625, + "learning_rate": 4.732947741663881e-05, + "loss": 0.5384, + "step": 3440 + }, + { + "epoch": 0.1511356027092001, + "grad_norm": 2.515625, + "learning_rate": 4.732636540169621e-05, + "loss": 0.5455, + "step": 3442 + }, + { + "epoch": 0.15122342118840357, + "grad_norm": 2.578125, + "learning_rate": 4.732325167699522e-05, + "loss": 0.5517, + "step": 3444 + }, + { + "epoch": 0.15131123966760707, + "grad_norm": 2.59375, + "learning_rate": 4.732013624277425e-05, + "loss": 0.5336, + "step": 3446 + }, + { + "epoch": 0.15139905814681054, + "grad_norm": 2.640625, + "learning_rate": 4.73170190992719e-05, + "loss": 0.5347, + "step": 3448 + }, + { + "epoch": 0.15148687662601404, + "grad_norm": 2.5, + "learning_rate": 4.731390024672688e-05, + "loss": 0.5212, + "step": 3450 + }, + { + "epoch": 0.1515746951052175, + "grad_norm": 2.609375, + "learning_rate": 4.731077968537803e-05, + "loss": 0.5469, + "step": 3452 + }, + { + "epoch": 0.151662513584421, + "grad_norm": 3.234375, + "learning_rate": 4.7307657415464324e-05, + "loss": 0.559, + "step": 3454 + }, + { + "epoch": 0.15175033206362448, + "grad_norm": 2.859375, + "learning_rate": 4.7304533437224866e-05, + "loss": 0.5247, + "step": 3456 + }, + { + "epoch": 0.15183815054282798, + "grad_norm": 3.765625, + "learning_rate": 4.730140775089888e-05, + "loss": 0.5356, + "step": 3458 + }, + { + "epoch": 0.15192596902203145, + "grad_norm": 3.59375, + "learning_rate": 4.729828035672576e-05, + "loss": 0.5345, + "step": 3460 + }, + { + "epoch": 0.15201378750123495, + "grad_norm": 3.578125, + "learning_rate": 4.729515125494497e-05, + "loss": 0.5386, + "step": 3462 + }, + { + "epoch": 0.15210160598043843, + "grad_norm": 5.0625, + "learning_rate": 4.729202044579616e-05, + "loss": 0.531, + "step": 3464 + }, + { + "epoch": 0.15218942445964193, + "grad_norm": 7.71875, + "learning_rate": 4.7288887929519074e-05, + "loss": 0.5581, + "step": 3466 + }, + { + "epoch": 0.1522772429388454, + "grad_norm": 3.484375, + "learning_rate": 4.7285753706353614e-05, + "loss": 0.5721, + "step": 3468 + }, + { + "epoch": 0.1523650614180489, + "grad_norm": 2.703125, + "learning_rate": 4.728261777653979e-05, + "loss": 0.5701, + "step": 3470 + }, + { + "epoch": 0.15245287989725237, + "grad_norm": 4.1875, + "learning_rate": 4.7279480140317756e-05, + "loss": 0.5465, + "step": 3472 + }, + { + "epoch": 0.15254069837645587, + "grad_norm": 3.90625, + "learning_rate": 4.727634079792779e-05, + "loss": 0.5398, + "step": 3474 + }, + { + "epoch": 0.15262851685565934, + "grad_norm": 3.3125, + "learning_rate": 4.727319974961031e-05, + "loss": 0.5536, + "step": 3476 + }, + { + "epoch": 0.15271633533486284, + "grad_norm": 3.71875, + "learning_rate": 4.7270056995605846e-05, + "loss": 0.5496, + "step": 3478 + }, + { + "epoch": 0.1528041538140663, + "grad_norm": 3.65625, + "learning_rate": 4.726691253615509e-05, + "loss": 0.5222, + "step": 3480 + }, + { + "epoch": 0.1528919722932698, + "grad_norm": 3.25, + "learning_rate": 4.726376637149883e-05, + "loss": 0.5456, + "step": 3482 + }, + { + "epoch": 0.15297979077247328, + "grad_norm": 3.078125, + "learning_rate": 4.7260618501877994e-05, + "loss": 0.5076, + "step": 3484 + }, + { + "epoch": 0.15306760925167678, + "grad_norm": 3.3125, + "learning_rate": 4.725746892753367e-05, + "loss": 0.5468, + "step": 3486 + }, + { + "epoch": 0.15315542773088028, + "grad_norm": 3.171875, + "learning_rate": 4.725431764870704e-05, + "loss": 0.5179, + "step": 3488 + }, + { + "epoch": 0.15324324621008376, + "grad_norm": 3.1875, + "learning_rate": 4.7251164665639426e-05, + "loss": 0.5522, + "step": 3490 + }, + { + "epoch": 0.15333106468928726, + "grad_norm": 3.953125, + "learning_rate": 4.724800997857228e-05, + "loss": 0.5371, + "step": 3492 + }, + { + "epoch": 0.15341888316849073, + "grad_norm": 3.796875, + "learning_rate": 4.724485358774721e-05, + "loss": 0.5303, + "step": 3494 + }, + { + "epoch": 0.15350670164769423, + "grad_norm": 3.59375, + "learning_rate": 4.724169549340591e-05, + "loss": 0.5051, + "step": 3496 + }, + { + "epoch": 0.1535945201268977, + "grad_norm": 3.59375, + "learning_rate": 4.723853569579024e-05, + "loss": 0.5499, + "step": 3498 + }, + { + "epoch": 0.1536823386061012, + "grad_norm": 3.265625, + "learning_rate": 4.723537419514218e-05, + "loss": 0.5448, + "step": 3500 + }, + { + "epoch": 0.15377015708530467, + "grad_norm": 3.953125, + "learning_rate": 4.723221099170383e-05, + "loss": 0.5605, + "step": 3502 + }, + { + "epoch": 0.15385797556450817, + "grad_norm": 2.765625, + "learning_rate": 4.7229046085717434e-05, + "loss": 0.5353, + "step": 3504 + }, + { + "epoch": 0.15394579404371164, + "grad_norm": 2.5625, + "learning_rate": 4.7225879477425364e-05, + "loss": 0.5276, + "step": 3506 + }, + { + "epoch": 0.15403361252291514, + "grad_norm": 2.546875, + "learning_rate": 4.722271116707011e-05, + "loss": 0.5488, + "step": 3508 + }, + { + "epoch": 0.15412143100211861, + "grad_norm": 2.640625, + "learning_rate": 4.72195411548943e-05, + "loss": 0.5281, + "step": 3510 + }, + { + "epoch": 0.1542092494813221, + "grad_norm": 3.203125, + "learning_rate": 4.7216369441140715e-05, + "loss": 0.5644, + "step": 3512 + }, + { + "epoch": 0.15429706796052559, + "grad_norm": 2.84375, + "learning_rate": 4.721319602605223e-05, + "loss": 0.503, + "step": 3514 + }, + { + "epoch": 0.15438488643972909, + "grad_norm": 2.859375, + "learning_rate": 4.721002090987187e-05, + "loss": 0.5472, + "step": 3516 + }, + { + "epoch": 0.15447270491893256, + "grad_norm": 2.9375, + "learning_rate": 4.720684409284277e-05, + "loss": 0.5533, + "step": 3518 + }, + { + "epoch": 0.15456052339813606, + "grad_norm": 3.125, + "learning_rate": 4.7203665575208244e-05, + "loss": 0.5354, + "step": 3520 + }, + { + "epoch": 0.15464834187733953, + "grad_norm": 2.5625, + "learning_rate": 4.720048535721168e-05, + "loss": 0.5225, + "step": 3522 + }, + { + "epoch": 0.15473616035654303, + "grad_norm": 2.78125, + "learning_rate": 4.7197303439096626e-05, + "loss": 0.5436, + "step": 3524 + }, + { + "epoch": 0.1548239788357465, + "grad_norm": 3.21875, + "learning_rate": 4.7194119821106754e-05, + "loss": 0.5509, + "step": 3526 + }, + { + "epoch": 0.15491179731495, + "grad_norm": 3.28125, + "learning_rate": 4.719093450348586e-05, + "loss": 0.5602, + "step": 3528 + }, + { + "epoch": 0.15499961579415347, + "grad_norm": 3.09375, + "learning_rate": 4.718774748647789e-05, + "loss": 0.5344, + "step": 3530 + }, + { + "epoch": 0.15508743427335697, + "grad_norm": 3.1875, + "learning_rate": 4.718455877032689e-05, + "loss": 0.5435, + "step": 3532 + }, + { + "epoch": 0.15517525275256044, + "grad_norm": 3.0625, + "learning_rate": 4.718136835527707e-05, + "loss": 0.5291, + "step": 3534 + }, + { + "epoch": 0.15526307123176394, + "grad_norm": 2.953125, + "learning_rate": 4.7178176241572735e-05, + "loss": 0.5308, + "step": 3536 + }, + { + "epoch": 0.15535088971096744, + "grad_norm": 3.0, + "learning_rate": 4.717498242945836e-05, + "loss": 0.5524, + "step": 3538 + }, + { + "epoch": 0.15543870819017092, + "grad_norm": 2.578125, + "learning_rate": 4.717178691917851e-05, + "loss": 0.5144, + "step": 3540 + }, + { + "epoch": 0.15552652666937442, + "grad_norm": 2.84375, + "learning_rate": 4.71685897109779e-05, + "loss": 0.566, + "step": 3542 + }, + { + "epoch": 0.1556143451485779, + "grad_norm": 3.578125, + "learning_rate": 4.716539080510137e-05, + "loss": 0.5421, + "step": 3544 + }, + { + "epoch": 0.1557021636277814, + "grad_norm": 2.90625, + "learning_rate": 4.7162190201793904e-05, + "loss": 0.5453, + "step": 3546 + }, + { + "epoch": 0.15578998210698486, + "grad_norm": 3.09375, + "learning_rate": 4.71589879013006e-05, + "loss": 0.5369, + "step": 3548 + }, + { + "epoch": 0.15587780058618836, + "grad_norm": 3.0, + "learning_rate": 4.715578390386669e-05, + "loss": 0.5164, + "step": 3550 + }, + { + "epoch": 0.15596561906539183, + "grad_norm": 2.46875, + "learning_rate": 4.715257820973754e-05, + "loss": 0.5128, + "step": 3552 + }, + { + "epoch": 0.15605343754459533, + "grad_norm": 2.65625, + "learning_rate": 4.7149370819158635e-05, + "loss": 0.5106, + "step": 3554 + }, + { + "epoch": 0.1561412560237988, + "grad_norm": 2.578125, + "learning_rate": 4.714616173237561e-05, + "loss": 0.5405, + "step": 3556 + }, + { + "epoch": 0.1562290745030023, + "grad_norm": 2.515625, + "learning_rate": 4.7142950949634206e-05, + "loss": 0.5533, + "step": 3558 + }, + { + "epoch": 0.15631689298220577, + "grad_norm": 2.765625, + "learning_rate": 4.7139738471180314e-05, + "loss": 0.5214, + "step": 3560 + }, + { + "epoch": 0.15640471146140927, + "grad_norm": 2.875, + "learning_rate": 4.713652429725994e-05, + "loss": 0.5371, + "step": 3562 + }, + { + "epoch": 0.15649252994061275, + "grad_norm": 2.578125, + "learning_rate": 4.713330842811923e-05, + "loss": 0.5322, + "step": 3564 + }, + { + "epoch": 0.15658034841981625, + "grad_norm": 2.59375, + "learning_rate": 4.713009086400445e-05, + "loss": 0.5006, + "step": 3566 + }, + { + "epoch": 0.15666816689901972, + "grad_norm": 2.5625, + "learning_rate": 4.712687160516202e-05, + "loss": 0.5391, + "step": 3568 + }, + { + "epoch": 0.15675598537822322, + "grad_norm": 2.515625, + "learning_rate": 4.712365065183844e-05, + "loss": 0.5302, + "step": 3570 + }, + { + "epoch": 0.1568438038574267, + "grad_norm": 2.703125, + "learning_rate": 4.71204280042804e-05, + "loss": 0.5188, + "step": 3572 + }, + { + "epoch": 0.1569316223366302, + "grad_norm": 2.671875, + "learning_rate": 4.711720366273468e-05, + "loss": 0.5314, + "step": 3574 + }, + { + "epoch": 0.15701944081583366, + "grad_norm": 2.46875, + "learning_rate": 4.71139776274482e-05, + "loss": 0.5141, + "step": 3576 + }, + { + "epoch": 0.15710725929503716, + "grad_norm": 2.5625, + "learning_rate": 4.711074989866802e-05, + "loss": 0.5583, + "step": 3578 + }, + { + "epoch": 0.15719507777424063, + "grad_norm": 2.578125, + "learning_rate": 4.71075204766413e-05, + "loss": 0.5613, + "step": 3580 + }, + { + "epoch": 0.15728289625344413, + "grad_norm": 2.53125, + "learning_rate": 4.710428936161537e-05, + "loss": 0.5293, + "step": 3582 + }, + { + "epoch": 0.1573707147326476, + "grad_norm": 2.765625, + "learning_rate": 4.7101056553837665e-05, + "loss": 0.5258, + "step": 3584 + }, + { + "epoch": 0.1574585332118511, + "grad_norm": 2.78125, + "learning_rate": 4.709782205355574e-05, + "loss": 0.5555, + "step": 3586 + }, + { + "epoch": 0.1575463516910546, + "grad_norm": 4.46875, + "learning_rate": 4.709458586101731e-05, + "loss": 0.5505, + "step": 3588 + }, + { + "epoch": 0.15763417017025808, + "grad_norm": 4.09375, + "learning_rate": 4.70913479764702e-05, + "loss": 0.5502, + "step": 3590 + }, + { + "epoch": 0.15772198864946158, + "grad_norm": 3.28125, + "learning_rate": 4.708810840016237e-05, + "loss": 0.5308, + "step": 3592 + }, + { + "epoch": 0.15780980712866505, + "grad_norm": 3.25, + "learning_rate": 4.7084867132341895e-05, + "loss": 0.5257, + "step": 3594 + }, + { + "epoch": 0.15789762560786855, + "grad_norm": 2.90625, + "learning_rate": 4.708162417325701e-05, + "loss": 0.5202, + "step": 3596 + }, + { + "epoch": 0.15798544408707202, + "grad_norm": 2.578125, + "learning_rate": 4.7078379523156045e-05, + "loss": 0.5356, + "step": 3598 + }, + { + "epoch": 0.15807326256627552, + "grad_norm": 2.90625, + "learning_rate": 4.707513318228749e-05, + "loss": 0.5395, + "step": 3600 + }, + { + "epoch": 0.158161081045479, + "grad_norm": 2.828125, + "learning_rate": 4.707188515089994e-05, + "loss": 0.5471, + "step": 3602 + }, + { + "epoch": 0.1582488995246825, + "grad_norm": 2.640625, + "learning_rate": 4.706863542924213e-05, + "loss": 0.544, + "step": 3604 + }, + { + "epoch": 0.15833671800388596, + "grad_norm": 2.796875, + "learning_rate": 4.706538401756294e-05, + "loss": 0.5097, + "step": 3606 + }, + { + "epoch": 0.15842453648308946, + "grad_norm": 2.8125, + "learning_rate": 4.7062130916111344e-05, + "loss": 0.5489, + "step": 3608 + }, + { + "epoch": 0.15851235496229293, + "grad_norm": 2.65625, + "learning_rate": 4.705887612513647e-05, + "loss": 0.5344, + "step": 3610 + }, + { + "epoch": 0.15860017344149643, + "grad_norm": 3.359375, + "learning_rate": 4.705561964488758e-05, + "loss": 0.5395, + "step": 3612 + }, + { + "epoch": 0.1586879919206999, + "grad_norm": 2.75, + "learning_rate": 4.705236147561405e-05, + "loss": 0.5617, + "step": 3614 + }, + { + "epoch": 0.1587758103999034, + "grad_norm": 2.40625, + "learning_rate": 4.70491016175654e-05, + "loss": 0.5352, + "step": 3616 + }, + { + "epoch": 0.15886362887910688, + "grad_norm": 2.421875, + "learning_rate": 4.704584007099125e-05, + "loss": 0.5261, + "step": 3618 + }, + { + "epoch": 0.15895144735831038, + "grad_norm": 2.6875, + "learning_rate": 4.7042576836141395e-05, + "loss": 0.5221, + "step": 3620 + }, + { + "epoch": 0.15903926583751385, + "grad_norm": 2.4375, + "learning_rate": 4.703931191326572e-05, + "loss": 0.5078, + "step": 3622 + }, + { + "epoch": 0.15912708431671735, + "grad_norm": 2.671875, + "learning_rate": 4.703604530261424e-05, + "loss": 0.5281, + "step": 3624 + }, + { + "epoch": 0.15921490279592082, + "grad_norm": 2.890625, + "learning_rate": 4.7032777004437136e-05, + "loss": 0.532, + "step": 3626 + }, + { + "epoch": 0.15930272127512432, + "grad_norm": 2.84375, + "learning_rate": 4.70295070189847e-05, + "loss": 0.5323, + "step": 3628 + }, + { + "epoch": 0.1593905397543278, + "grad_norm": 2.890625, + "learning_rate": 4.702623534650732e-05, + "loss": 0.5521, + "step": 3630 + }, + { + "epoch": 0.1594783582335313, + "grad_norm": 2.609375, + "learning_rate": 4.7022961987255566e-05, + "loss": 0.5377, + "step": 3632 + }, + { + "epoch": 0.15956617671273476, + "grad_norm": 3.515625, + "learning_rate": 4.70196869414801e-05, + "loss": 0.5471, + "step": 3634 + }, + { + "epoch": 0.15965399519193826, + "grad_norm": 3.609375, + "learning_rate": 4.701641020943173e-05, + "loss": 0.5355, + "step": 3636 + }, + { + "epoch": 0.15974181367114176, + "grad_norm": 3.546875, + "learning_rate": 4.7013131791361385e-05, + "loss": 0.5412, + "step": 3638 + }, + { + "epoch": 0.15982963215034524, + "grad_norm": 2.640625, + "learning_rate": 4.7009851687520134e-05, + "loss": 0.5286, + "step": 3640 + }, + { + "epoch": 0.15991745062954874, + "grad_norm": 3.234375, + "learning_rate": 4.7006569898159165e-05, + "loss": 0.5662, + "step": 3642 + }, + { + "epoch": 0.1600052691087522, + "grad_norm": 3.0, + "learning_rate": 4.7003286423529795e-05, + "loss": 0.5374, + "step": 3644 + }, + { + "epoch": 0.1600930875879557, + "grad_norm": 2.703125, + "learning_rate": 4.700000126388348e-05, + "loss": 0.535, + "step": 3646 + }, + { + "epoch": 0.16018090606715918, + "grad_norm": 2.515625, + "learning_rate": 4.69967144194718e-05, + "loss": 0.5094, + "step": 3648 + }, + { + "epoch": 0.16026872454636268, + "grad_norm": 2.453125, + "learning_rate": 4.6993425890546444e-05, + "loss": 0.5443, + "step": 3650 + }, + { + "epoch": 0.16035654302556615, + "grad_norm": 2.21875, + "learning_rate": 4.699013567735927e-05, + "loss": 0.5244, + "step": 3652 + }, + { + "epoch": 0.16044436150476965, + "grad_norm": 2.578125, + "learning_rate": 4.698684378016222e-05, + "loss": 0.5368, + "step": 3654 + }, + { + "epoch": 0.16053217998397312, + "grad_norm": 2.78125, + "learning_rate": 4.698355019920743e-05, + "loss": 0.5424, + "step": 3656 + }, + { + "epoch": 0.16061999846317662, + "grad_norm": 2.921875, + "learning_rate": 4.698025493474707e-05, + "loss": 0.5187, + "step": 3658 + }, + { + "epoch": 0.1607078169423801, + "grad_norm": 3.609375, + "learning_rate": 4.697695798703353e-05, + "loss": 0.5546, + "step": 3660 + }, + { + "epoch": 0.1607956354215836, + "grad_norm": 3.09375, + "learning_rate": 4.697365935631928e-05, + "loss": 0.5323, + "step": 3662 + }, + { + "epoch": 0.16088345390078707, + "grad_norm": 2.3125, + "learning_rate": 4.697035904285693e-05, + "loss": 0.5484, + "step": 3664 + }, + { + "epoch": 0.16097127237999057, + "grad_norm": 2.84375, + "learning_rate": 4.696705704689921e-05, + "loss": 0.524, + "step": 3666 + }, + { + "epoch": 0.16105909085919404, + "grad_norm": 3.34375, + "learning_rate": 4.6963753368699e-05, + "loss": 0.5624, + "step": 3668 + }, + { + "epoch": 0.16114690933839754, + "grad_norm": 2.53125, + "learning_rate": 4.6960448008509296e-05, + "loss": 0.5487, + "step": 3670 + }, + { + "epoch": 0.161234727817601, + "grad_norm": 2.640625, + "learning_rate": 4.695714096658321e-05, + "loss": 0.5272, + "step": 3672 + }, + { + "epoch": 0.1613225462968045, + "grad_norm": 2.71875, + "learning_rate": 4.695383224317401e-05, + "loss": 0.5231, + "step": 3674 + }, + { + "epoch": 0.16141036477600798, + "grad_norm": 2.859375, + "learning_rate": 4.695052183853508e-05, + "loss": 0.5264, + "step": 3676 + }, + { + "epoch": 0.16149818325521148, + "grad_norm": 2.640625, + "learning_rate": 4.694720975291991e-05, + "loss": 0.5296, + "step": 3678 + }, + { + "epoch": 0.16158600173441495, + "grad_norm": 3.140625, + "learning_rate": 4.694389598658217e-05, + "loss": 0.533, + "step": 3680 + }, + { + "epoch": 0.16167382021361845, + "grad_norm": 2.671875, + "learning_rate": 4.6940580539775616e-05, + "loss": 0.4755, + "step": 3682 + }, + { + "epoch": 0.16176163869282192, + "grad_norm": 2.9375, + "learning_rate": 4.6937263412754135e-05, + "loss": 0.5346, + "step": 3684 + }, + { + "epoch": 0.16184945717202542, + "grad_norm": 3.09375, + "learning_rate": 4.693394460577177e-05, + "loss": 0.5129, + "step": 3686 + }, + { + "epoch": 0.16193727565122892, + "grad_norm": 2.6875, + "learning_rate": 4.693062411908267e-05, + "loss": 0.5131, + "step": 3688 + }, + { + "epoch": 0.1620250941304324, + "grad_norm": 2.5, + "learning_rate": 4.6927301952941105e-05, + "loss": 0.5093, + "step": 3690 + }, + { + "epoch": 0.1621129126096359, + "grad_norm": 2.828125, + "learning_rate": 4.6923978107601516e-05, + "loss": 0.5425, + "step": 3692 + }, + { + "epoch": 0.16220073108883937, + "grad_norm": 2.5, + "learning_rate": 4.692065258331842e-05, + "loss": 0.5489, + "step": 3694 + }, + { + "epoch": 0.16228854956804287, + "grad_norm": 2.8125, + "learning_rate": 4.69173253803465e-05, + "loss": 0.5584, + "step": 3696 + }, + { + "epoch": 0.16237636804724634, + "grad_norm": 3.34375, + "learning_rate": 4.691399649894054e-05, + "loss": 0.5217, + "step": 3698 + }, + { + "epoch": 0.16246418652644984, + "grad_norm": 2.828125, + "learning_rate": 4.691066593935548e-05, + "loss": 0.5334, + "step": 3700 + }, + { + "epoch": 0.1625520050056533, + "grad_norm": 3.140625, + "learning_rate": 4.6907333701846365e-05, + "loss": 0.5613, + "step": 3702 + }, + { + "epoch": 0.1626398234848568, + "grad_norm": 4.25, + "learning_rate": 4.690399978666839e-05, + "loss": 0.5262, + "step": 3704 + }, + { + "epoch": 0.16272764196406028, + "grad_norm": 2.828125, + "learning_rate": 4.690066419407686e-05, + "loss": 0.5465, + "step": 3706 + }, + { + "epoch": 0.16281546044326378, + "grad_norm": 3.25, + "learning_rate": 4.689732692432722e-05, + "loss": 0.511, + "step": 3708 + }, + { + "epoch": 0.16290327892246725, + "grad_norm": 3.109375, + "learning_rate": 4.6893987977675026e-05, + "loss": 0.5364, + "step": 3710 + }, + { + "epoch": 0.16299109740167075, + "grad_norm": 2.859375, + "learning_rate": 4.689064735437599e-05, + "loss": 0.5153, + "step": 3712 + }, + { + "epoch": 0.16307891588087423, + "grad_norm": 2.515625, + "learning_rate": 4.688730505468593e-05, + "loss": 0.5287, + "step": 3714 + }, + { + "epoch": 0.16316673436007773, + "grad_norm": 2.625, + "learning_rate": 4.688396107886081e-05, + "loss": 0.5298, + "step": 3716 + }, + { + "epoch": 0.1632545528392812, + "grad_norm": 2.75, + "learning_rate": 4.688061542715669e-05, + "loss": 0.5106, + "step": 3718 + }, + { + "epoch": 0.1633423713184847, + "grad_norm": 2.375, + "learning_rate": 4.6877268099829804e-05, + "loss": 0.555, + "step": 3720 + }, + { + "epoch": 0.16343018979768817, + "grad_norm": 2.78125, + "learning_rate": 4.687391909713648e-05, + "loss": 0.5382, + "step": 3722 + }, + { + "epoch": 0.16351800827689167, + "grad_norm": 2.578125, + "learning_rate": 4.6870568419333185e-05, + "loss": 0.5163, + "step": 3724 + }, + { + "epoch": 0.16360582675609514, + "grad_norm": 2.4375, + "learning_rate": 4.6867216066676524e-05, + "loss": 0.5185, + "step": 3726 + }, + { + "epoch": 0.16369364523529864, + "grad_norm": 2.515625, + "learning_rate": 4.686386203942321e-05, + "loss": 0.5344, + "step": 3728 + }, + { + "epoch": 0.1637814637145021, + "grad_norm": 2.953125, + "learning_rate": 4.6860506337830105e-05, + "loss": 0.5251, + "step": 3730 + }, + { + "epoch": 0.1638692821937056, + "grad_norm": 2.640625, + "learning_rate": 4.6857148962154185e-05, + "loss": 0.4998, + "step": 3732 + }, + { + "epoch": 0.16395710067290908, + "grad_norm": 2.859375, + "learning_rate": 4.6853789912652554e-05, + "loss": 0.5155, + "step": 3734 + }, + { + "epoch": 0.16404491915211258, + "grad_norm": 2.859375, + "learning_rate": 4.6850429189582454e-05, + "loss": 0.5498, + "step": 3736 + }, + { + "epoch": 0.16413273763131608, + "grad_norm": 2.859375, + "learning_rate": 4.684706679320125e-05, + "loss": 0.4993, + "step": 3738 + }, + { + "epoch": 0.16422055611051956, + "grad_norm": 2.46875, + "learning_rate": 4.684370272376643e-05, + "loss": 0.543, + "step": 3740 + }, + { + "epoch": 0.16430837458972306, + "grad_norm": 2.40625, + "learning_rate": 4.684033698153562e-05, + "loss": 0.5161, + "step": 3742 + }, + { + "epoch": 0.16439619306892653, + "grad_norm": 3.140625, + "learning_rate": 4.683696956676657e-05, + "loss": 0.5639, + "step": 3744 + }, + { + "epoch": 0.16448401154813003, + "grad_norm": 2.53125, + "learning_rate": 4.6833600479717155e-05, + "loss": 0.5518, + "step": 3746 + }, + { + "epoch": 0.1645718300273335, + "grad_norm": 2.625, + "learning_rate": 4.683022972064538e-05, + "loss": 0.5062, + "step": 3748 + }, + { + "epoch": 0.164659648506537, + "grad_norm": 2.203125, + "learning_rate": 4.682685728980939e-05, + "loss": 0.537, + "step": 3750 + }, + { + "epoch": 0.16474746698574047, + "grad_norm": 2.78125, + "learning_rate": 4.682348318746742e-05, + "loss": 0.5347, + "step": 3752 + }, + { + "epoch": 0.16483528546494397, + "grad_norm": 2.75, + "learning_rate": 4.682010741387789e-05, + "loss": 0.514, + "step": 3754 + }, + { + "epoch": 0.16492310394414744, + "grad_norm": 3.3125, + "learning_rate": 4.6816729969299295e-05, + "loss": 0.5405, + "step": 3756 + }, + { + "epoch": 0.16501092242335094, + "grad_norm": 2.578125, + "learning_rate": 4.681335085399029e-05, + "loss": 0.5304, + "step": 3758 + }, + { + "epoch": 0.16509874090255441, + "grad_norm": 2.59375, + "learning_rate": 4.680997006820965e-05, + "loss": 0.5137, + "step": 3760 + }, + { + "epoch": 0.1651865593817579, + "grad_norm": 2.796875, + "learning_rate": 4.680658761221628e-05, + "loss": 0.53, + "step": 3762 + }, + { + "epoch": 0.16527437786096139, + "grad_norm": 2.34375, + "learning_rate": 4.6803203486269195e-05, + "loss": 0.552, + "step": 3764 + }, + { + "epoch": 0.16536219634016489, + "grad_norm": 2.421875, + "learning_rate": 4.679981769062756e-05, + "loss": 0.527, + "step": 3766 + }, + { + "epoch": 0.16545001481936836, + "grad_norm": 2.46875, + "learning_rate": 4.6796430225550664e-05, + "loss": 0.5067, + "step": 3768 + }, + { + "epoch": 0.16553783329857186, + "grad_norm": 2.625, + "learning_rate": 4.679304109129792e-05, + "loss": 0.5127, + "step": 3770 + }, + { + "epoch": 0.16562565177777533, + "grad_norm": 2.578125, + "learning_rate": 4.6789650288128855e-05, + "loss": 0.5087, + "step": 3772 + }, + { + "epoch": 0.16571347025697883, + "grad_norm": 2.8125, + "learning_rate": 4.678625781630315e-05, + "loss": 0.5192, + "step": 3774 + }, + { + "epoch": 0.1658012887361823, + "grad_norm": 3.46875, + "learning_rate": 4.6782863676080605e-05, + "loss": 0.5258, + "step": 3776 + }, + { + "epoch": 0.1658891072153858, + "grad_norm": 3.296875, + "learning_rate": 4.6779467867721135e-05, + "loss": 0.5352, + "step": 3778 + }, + { + "epoch": 0.16597692569458927, + "grad_norm": 3.28125, + "learning_rate": 4.67760703914848e-05, + "loss": 0.5331, + "step": 3780 + }, + { + "epoch": 0.16606474417379277, + "grad_norm": 2.75, + "learning_rate": 4.6772671247631764e-05, + "loss": 0.5256, + "step": 3782 + }, + { + "epoch": 0.16615256265299624, + "grad_norm": 2.671875, + "learning_rate": 4.676927043642235e-05, + "loss": 0.5047, + "step": 3784 + }, + { + "epoch": 0.16624038113219974, + "grad_norm": 2.5625, + "learning_rate": 4.676586795811699e-05, + "loss": 0.5068, + "step": 3786 + }, + { + "epoch": 0.16632819961140322, + "grad_norm": 2.578125, + "learning_rate": 4.676246381297624e-05, + "loss": 0.5406, + "step": 3788 + }, + { + "epoch": 0.16641601809060672, + "grad_norm": 2.671875, + "learning_rate": 4.6759058001260794e-05, + "loss": 0.4963, + "step": 3790 + }, + { + "epoch": 0.16650383656981022, + "grad_norm": 2.75, + "learning_rate": 4.6755650523231476e-05, + "loss": 0.5411, + "step": 3792 + }, + { + "epoch": 0.1665916550490137, + "grad_norm": 2.6875, + "learning_rate": 4.675224137914922e-05, + "loss": 0.5449, + "step": 3794 + }, + { + "epoch": 0.1666794735282172, + "grad_norm": 3.0625, + "learning_rate": 4.674883056927511e-05, + "loss": 0.5374, + "step": 3796 + }, + { + "epoch": 0.16676729200742066, + "grad_norm": 3.40625, + "learning_rate": 4.674541809387033e-05, + "loss": 0.531, + "step": 3798 + }, + { + "epoch": 0.16685511048662416, + "grad_norm": 2.828125, + "learning_rate": 4.674200395319623e-05, + "loss": 0.5496, + "step": 3800 + }, + { + "epoch": 0.16694292896582763, + "grad_norm": 2.578125, + "learning_rate": 4.673858814751425e-05, + "loss": 0.5026, + "step": 3802 + }, + { + "epoch": 0.16703074744503113, + "grad_norm": 2.65625, + "learning_rate": 4.673517067708598e-05, + "loss": 0.5376, + "step": 3804 + }, + { + "epoch": 0.1671185659242346, + "grad_norm": 2.5625, + "learning_rate": 4.6731751542173136e-05, + "loss": 0.4992, + "step": 3806 + }, + { + "epoch": 0.1672063844034381, + "grad_norm": 2.515625, + "learning_rate": 4.672833074303754e-05, + "loss": 0.5246, + "step": 3808 + }, + { + "epoch": 0.16729420288264157, + "grad_norm": 2.9375, + "learning_rate": 4.672490827994117e-05, + "loss": 0.5128, + "step": 3810 + }, + { + "epoch": 0.16738202136184507, + "grad_norm": 3.078125, + "learning_rate": 4.6721484153146124e-05, + "loss": 0.5459, + "step": 3812 + }, + { + "epoch": 0.16746983984104855, + "grad_norm": 3.265625, + "learning_rate": 4.671805836291461e-05, + "loss": 0.5388, + "step": 3814 + }, + { + "epoch": 0.16755765832025205, + "grad_norm": 2.75, + "learning_rate": 4.671463090950897e-05, + "loss": 0.5167, + "step": 3816 + }, + { + "epoch": 0.16764547679945552, + "grad_norm": 3.125, + "learning_rate": 4.67112017931917e-05, + "loss": 0.5059, + "step": 3818 + }, + { + "epoch": 0.16773329527865902, + "grad_norm": 2.8125, + "learning_rate": 4.670777101422539e-05, + "loss": 0.5303, + "step": 3820 + }, + { + "epoch": 0.1678211137578625, + "grad_norm": 2.703125, + "learning_rate": 4.6704338572872773e-05, + "loss": 0.5233, + "step": 3822 + }, + { + "epoch": 0.167908932237066, + "grad_norm": 3.21875, + "learning_rate": 4.67009044693967e-05, + "loss": 0.5091, + "step": 3824 + }, + { + "epoch": 0.16799675071626946, + "grad_norm": 2.703125, + "learning_rate": 4.6697468704060166e-05, + "loss": 0.524, + "step": 3826 + }, + { + "epoch": 0.16808456919547296, + "grad_norm": 2.90625, + "learning_rate": 4.6694031277126285e-05, + "loss": 0.5386, + "step": 3828 + }, + { + "epoch": 0.16817238767467643, + "grad_norm": 3.1875, + "learning_rate": 4.6690592188858275e-05, + "loss": 0.5196, + "step": 3830 + }, + { + "epoch": 0.16826020615387993, + "grad_norm": 3.4375, + "learning_rate": 4.6687151439519516e-05, + "loss": 0.5395, + "step": 3832 + }, + { + "epoch": 0.1683480246330834, + "grad_norm": 3.34375, + "learning_rate": 4.668370902937351e-05, + "loss": 0.5279, + "step": 3834 + }, + { + "epoch": 0.1684358431122869, + "grad_norm": 3.46875, + "learning_rate": 4.668026495868386e-05, + "loss": 0.5156, + "step": 3836 + }, + { + "epoch": 0.16852366159149038, + "grad_norm": 2.625, + "learning_rate": 4.667681922771433e-05, + "loss": 0.4916, + "step": 3838 + }, + { + "epoch": 0.16861148007069388, + "grad_norm": 2.546875, + "learning_rate": 4.6673371836728785e-05, + "loss": 0.5422, + "step": 3840 + }, + { + "epoch": 0.16869929854989738, + "grad_norm": 2.71875, + "learning_rate": 4.6669922785991225e-05, + "loss": 0.5224, + "step": 3842 + }, + { + "epoch": 0.16878711702910085, + "grad_norm": 2.65625, + "learning_rate": 4.666647207576579e-05, + "loss": 0.5403, + "step": 3844 + }, + { + "epoch": 0.16887493550830435, + "grad_norm": 2.515625, + "learning_rate": 4.666301970631672e-05, + "loss": 0.5061, + "step": 3846 + }, + { + "epoch": 0.16896275398750782, + "grad_norm": 2.859375, + "learning_rate": 4.6659565677908414e-05, + "loss": 0.504, + "step": 3848 + }, + { + "epoch": 0.16905057246671132, + "grad_norm": 3.953125, + "learning_rate": 4.665610999080537e-05, + "loss": 0.5388, + "step": 3850 + }, + { + "epoch": 0.1691383909459148, + "grad_norm": 3.0625, + "learning_rate": 4.6652652645272244e-05, + "loss": 0.5559, + "step": 3852 + }, + { + "epoch": 0.1692262094251183, + "grad_norm": 3.046875, + "learning_rate": 4.6649193641573784e-05, + "loss": 0.5189, + "step": 3854 + }, + { + "epoch": 0.16931402790432176, + "grad_norm": 3.453125, + "learning_rate": 4.6645732979974884e-05, + "loss": 0.5244, + "step": 3856 + }, + { + "epoch": 0.16940184638352526, + "grad_norm": 3.0, + "learning_rate": 4.664227066074056e-05, + "loss": 0.5008, + "step": 3858 + }, + { + "epoch": 0.16948966486272873, + "grad_norm": 2.953125, + "learning_rate": 4.663880668413596e-05, + "loss": 0.5339, + "step": 3860 + }, + { + "epoch": 0.16957748334193223, + "grad_norm": 2.828125, + "learning_rate": 4.663534105042636e-05, + "loss": 0.5334, + "step": 3862 + }, + { + "epoch": 0.1696653018211357, + "grad_norm": 3.296875, + "learning_rate": 4.6631873759877156e-05, + "loss": 0.5465, + "step": 3864 + }, + { + "epoch": 0.1697531203003392, + "grad_norm": 3.9375, + "learning_rate": 4.6628404812753876e-05, + "loss": 0.523, + "step": 3866 + }, + { + "epoch": 0.16984093877954268, + "grad_norm": 3.453125, + "learning_rate": 4.662493420932217e-05, + "loss": 0.5374, + "step": 3868 + }, + { + "epoch": 0.16992875725874618, + "grad_norm": 3.234375, + "learning_rate": 4.662146194984782e-05, + "loss": 0.5576, + "step": 3870 + }, + { + "epoch": 0.17001657573794965, + "grad_norm": 2.890625, + "learning_rate": 4.661798803459673e-05, + "loss": 0.521, + "step": 3872 + }, + { + "epoch": 0.17010439421715315, + "grad_norm": 3.328125, + "learning_rate": 4.661451246383492e-05, + "loss": 0.5187, + "step": 3874 + }, + { + "epoch": 0.17019221269635662, + "grad_norm": 3.46875, + "learning_rate": 4.661103523782858e-05, + "loss": 0.4808, + "step": 3876 + }, + { + "epoch": 0.17028003117556012, + "grad_norm": 3.703125, + "learning_rate": 4.660755635684398e-05, + "loss": 0.5104, + "step": 3878 + }, + { + "epoch": 0.1703678496547636, + "grad_norm": 3.734375, + "learning_rate": 4.660407582114753e-05, + "loss": 0.5527, + "step": 3880 + }, + { + "epoch": 0.1704556681339671, + "grad_norm": 3.328125, + "learning_rate": 4.6600593631005776e-05, + "loss": 0.5165, + "step": 3882 + }, + { + "epoch": 0.17054348661317056, + "grad_norm": 2.59375, + "learning_rate": 4.6597109786685386e-05, + "loss": 0.5142, + "step": 3884 + }, + { + "epoch": 0.17063130509237406, + "grad_norm": 2.65625, + "learning_rate": 4.6593624288453146e-05, + "loss": 0.5223, + "step": 3886 + }, + { + "epoch": 0.17071912357157754, + "grad_norm": 2.4375, + "learning_rate": 4.6590137136575986e-05, + "loss": 0.54, + "step": 3888 + }, + { + "epoch": 0.17080694205078104, + "grad_norm": 2.6875, + "learning_rate": 4.6586648331320946e-05, + "loss": 0.4989, + "step": 3890 + }, + { + "epoch": 0.17089476052998454, + "grad_norm": 2.53125, + "learning_rate": 4.6583157872955205e-05, + "loss": 0.5059, + "step": 3892 + }, + { + "epoch": 0.170982579009188, + "grad_norm": 2.84375, + "learning_rate": 4.657966576174606e-05, + "loss": 0.5045, + "step": 3894 + }, + { + "epoch": 0.1710703974883915, + "grad_norm": 2.75, + "learning_rate": 4.657617199796094e-05, + "loss": 0.5024, + "step": 3896 + }, + { + "epoch": 0.17115821596759498, + "grad_norm": 2.8125, + "learning_rate": 4.657267658186739e-05, + "loss": 0.5587, + "step": 3898 + }, + { + "epoch": 0.17124603444679848, + "grad_norm": 2.59375, + "learning_rate": 4.65691795137331e-05, + "loss": 0.4818, + "step": 3900 + }, + { + "epoch": 0.17133385292600195, + "grad_norm": 2.78125, + "learning_rate": 4.6565680793825874e-05, + "loss": 0.5296, + "step": 3902 + }, + { + "epoch": 0.17142167140520545, + "grad_norm": 2.640625, + "learning_rate": 4.6562180422413635e-05, + "loss": 0.5506, + "step": 3904 + }, + { + "epoch": 0.17150948988440892, + "grad_norm": 2.78125, + "learning_rate": 4.655867839976447e-05, + "loss": 0.5211, + "step": 3906 + }, + { + "epoch": 0.17159730836361242, + "grad_norm": 2.484375, + "learning_rate": 4.655517472614652e-05, + "loss": 0.5108, + "step": 3908 + }, + { + "epoch": 0.1716851268428159, + "grad_norm": 2.390625, + "learning_rate": 4.655166940182813e-05, + "loss": 0.5492, + "step": 3910 + }, + { + "epoch": 0.1717729453220194, + "grad_norm": 2.75, + "learning_rate": 4.654816242707774e-05, + "loss": 0.5077, + "step": 3912 + }, + { + "epoch": 0.17186076380122287, + "grad_norm": 2.53125, + "learning_rate": 4.6544653802163904e-05, + "loss": 0.5369, + "step": 3914 + }, + { + "epoch": 0.17194858228042637, + "grad_norm": 2.328125, + "learning_rate": 4.654114352735531e-05, + "loss": 0.512, + "step": 3916 + }, + { + "epoch": 0.17203640075962984, + "grad_norm": 2.28125, + "learning_rate": 4.653763160292078e-05, + "loss": 0.5409, + "step": 3918 + }, + { + "epoch": 0.17212421923883334, + "grad_norm": 2.578125, + "learning_rate": 4.653411802912926e-05, + "loss": 0.5141, + "step": 3920 + }, + { + "epoch": 0.1722120377180368, + "grad_norm": 2.65625, + "learning_rate": 4.6530602806249825e-05, + "loss": 0.5354, + "step": 3922 + }, + { + "epoch": 0.1722998561972403, + "grad_norm": 2.390625, + "learning_rate": 4.652708593455166e-05, + "loss": 0.5425, + "step": 3924 + }, + { + "epoch": 0.17238767467644378, + "grad_norm": 2.71875, + "learning_rate": 4.6523567414304095e-05, + "loss": 0.5343, + "step": 3926 + }, + { + "epoch": 0.17247549315564728, + "grad_norm": 2.953125, + "learning_rate": 4.6520047245776575e-05, + "loss": 0.5373, + "step": 3928 + }, + { + "epoch": 0.17256331163485075, + "grad_norm": 2.640625, + "learning_rate": 4.651652542923869e-05, + "loss": 0.5034, + "step": 3930 + }, + { + "epoch": 0.17265113011405425, + "grad_norm": 2.71875, + "learning_rate": 4.6513001964960116e-05, + "loss": 0.514, + "step": 3932 + }, + { + "epoch": 0.17273894859325772, + "grad_norm": 3.515625, + "learning_rate": 4.65094768532107e-05, + "loss": 0.5189, + "step": 3934 + }, + { + "epoch": 0.17282676707246122, + "grad_norm": 3.25, + "learning_rate": 4.650595009426039e-05, + "loss": 0.5327, + "step": 3936 + }, + { + "epoch": 0.1729145855516647, + "grad_norm": 2.78125, + "learning_rate": 4.650242168837927e-05, + "loss": 0.5272, + "step": 3938 + }, + { + "epoch": 0.1730024040308682, + "grad_norm": 2.90625, + "learning_rate": 4.649889163583754e-05, + "loss": 0.5732, + "step": 3940 + }, + { + "epoch": 0.1730902225100717, + "grad_norm": 2.671875, + "learning_rate": 4.649535993690554e-05, + "loss": 0.5236, + "step": 3942 + }, + { + "epoch": 0.17317804098927517, + "grad_norm": 2.5625, + "learning_rate": 4.649182659185371e-05, + "loss": 0.4678, + "step": 3944 + }, + { + "epoch": 0.17326585946847867, + "grad_norm": 2.40625, + "learning_rate": 4.648829160095265e-05, + "loss": 0.5066, + "step": 3946 + }, + { + "epoch": 0.17335367794768214, + "grad_norm": 2.421875, + "learning_rate": 4.6484754964473076e-05, + "loss": 0.4917, + "step": 3948 + }, + { + "epoch": 0.17344149642688564, + "grad_norm": 2.578125, + "learning_rate": 4.6481216682685816e-05, + "loss": 0.5121, + "step": 3950 + }, + { + "epoch": 0.1735293149060891, + "grad_norm": 2.359375, + "learning_rate": 4.647767675586183e-05, + "loss": 0.5145, + "step": 3952 + }, + { + "epoch": 0.1736171333852926, + "grad_norm": 2.875, + "learning_rate": 4.647413518427221e-05, + "loss": 0.5074, + "step": 3954 + }, + { + "epoch": 0.17370495186449608, + "grad_norm": 2.546875, + "learning_rate": 4.647059196818817e-05, + "loss": 0.5168, + "step": 3956 + }, + { + "epoch": 0.17379277034369958, + "grad_norm": 2.421875, + "learning_rate": 4.646704710788105e-05, + "loss": 0.5364, + "step": 3958 + }, + { + "epoch": 0.17388058882290305, + "grad_norm": 2.71875, + "learning_rate": 4.6463500603622316e-05, + "loss": 0.5028, + "step": 3960 + }, + { + "epoch": 0.17396840730210655, + "grad_norm": 3.140625, + "learning_rate": 4.645995245568357e-05, + "loss": 0.5176, + "step": 3962 + }, + { + "epoch": 0.17405622578131003, + "grad_norm": 3.203125, + "learning_rate": 4.645640266433651e-05, + "loss": 0.5466, + "step": 3964 + }, + { + "epoch": 0.17414404426051353, + "grad_norm": 2.9375, + "learning_rate": 4.645285122985299e-05, + "loss": 0.5072, + "step": 3966 + }, + { + "epoch": 0.174231862739717, + "grad_norm": 3.40625, + "learning_rate": 4.6449298152504986e-05, + "loss": 0.5595, + "step": 3968 + }, + { + "epoch": 0.1743196812189205, + "grad_norm": 3.546875, + "learning_rate": 4.644574343256458e-05, + "loss": 0.5338, + "step": 3970 + }, + { + "epoch": 0.17440749969812397, + "grad_norm": 3.46875, + "learning_rate": 4.6442187070304004e-05, + "loss": 0.5155, + "step": 3972 + }, + { + "epoch": 0.17449531817732747, + "grad_norm": 3.421875, + "learning_rate": 4.643862906599561e-05, + "loss": 0.513, + "step": 3974 + }, + { + "epoch": 0.17458313665653094, + "grad_norm": 3.5625, + "learning_rate": 4.6435069419911854e-05, + "loss": 0.5436, + "step": 3976 + }, + { + "epoch": 0.17467095513573444, + "grad_norm": 3.09375, + "learning_rate": 4.643150813232534e-05, + "loss": 0.5065, + "step": 3978 + }, + { + "epoch": 0.1747587736149379, + "grad_norm": 3.90625, + "learning_rate": 4.6427945203508806e-05, + "loss": 0.5314, + "step": 3980 + }, + { + "epoch": 0.1748465920941414, + "grad_norm": 4.03125, + "learning_rate": 4.642438063373509e-05, + "loss": 0.5111, + "step": 3982 + }, + { + "epoch": 0.17493441057334488, + "grad_norm": 3.125, + "learning_rate": 4.642081442327716e-05, + "loss": 0.5112, + "step": 3984 + }, + { + "epoch": 0.17502222905254838, + "grad_norm": 3.015625, + "learning_rate": 4.6417246572408134e-05, + "loss": 0.5393, + "step": 3986 + }, + { + "epoch": 0.17511004753175186, + "grad_norm": 2.546875, + "learning_rate": 4.641367708140124e-05, + "loss": 0.5214, + "step": 3988 + }, + { + "epoch": 0.17519786601095536, + "grad_norm": 2.703125, + "learning_rate": 4.641010595052981e-05, + "loss": 0.5073, + "step": 3990 + }, + { + "epoch": 0.17528568449015886, + "grad_norm": 2.578125, + "learning_rate": 4.640653318006733e-05, + "loss": 0.5078, + "step": 3992 + }, + { + "epoch": 0.17537350296936233, + "grad_norm": 2.625, + "learning_rate": 4.640295877028742e-05, + "loss": 0.5006, + "step": 3994 + }, + { + "epoch": 0.17546132144856583, + "grad_norm": 2.390625, + "learning_rate": 4.639938272146379e-05, + "loss": 0.5217, + "step": 3996 + }, + { + "epoch": 0.1755491399277693, + "grad_norm": 2.8125, + "learning_rate": 4.639580503387031e-05, + "loss": 0.5335, + "step": 3998 + }, + { + "epoch": 0.1756369584069728, + "grad_norm": 2.875, + "learning_rate": 4.6392225707780935e-05, + "loss": 0.5138, + "step": 4000 + }, + { + "epoch": 0.17572477688617627, + "grad_norm": 2.890625, + "learning_rate": 4.63886447434698e-05, + "loss": 0.5127, + "step": 4002 + }, + { + "epoch": 0.17581259536537977, + "grad_norm": 2.65625, + "learning_rate": 4.638506214121112e-05, + "loss": 0.5268, + "step": 4004 + }, + { + "epoch": 0.17590041384458324, + "grad_norm": 2.9375, + "learning_rate": 4.6381477901279255e-05, + "loss": 0.5176, + "step": 4006 + }, + { + "epoch": 0.17598823232378674, + "grad_norm": 2.53125, + "learning_rate": 4.6377892023948684e-05, + "loss": 0.5373, + "step": 4008 + }, + { + "epoch": 0.1760760508029902, + "grad_norm": 2.390625, + "learning_rate": 4.6374304509494016e-05, + "loss": 0.5187, + "step": 4010 + }, + { + "epoch": 0.1761638692821937, + "grad_norm": 3.1875, + "learning_rate": 4.637071535818999e-05, + "loss": 0.5387, + "step": 4012 + }, + { + "epoch": 0.17625168776139719, + "grad_norm": 2.53125, + "learning_rate": 4.636712457031146e-05, + "loss": 0.5008, + "step": 4014 + }, + { + "epoch": 0.17633950624060069, + "grad_norm": 2.5625, + "learning_rate": 4.63635321461334e-05, + "loss": 0.5233, + "step": 4016 + }, + { + "epoch": 0.17642732471980416, + "grad_norm": 2.5625, + "learning_rate": 4.635993808593093e-05, + "loss": 0.5024, + "step": 4018 + }, + { + "epoch": 0.17651514319900766, + "grad_norm": 2.765625, + "learning_rate": 4.635634238997929e-05, + "loss": 0.5411, + "step": 4020 + }, + { + "epoch": 0.17660296167821113, + "grad_norm": 2.28125, + "learning_rate": 4.6352745058553826e-05, + "loss": 0.5314, + "step": 4022 + }, + { + "epoch": 0.17669078015741463, + "grad_norm": 2.484375, + "learning_rate": 4.634914609193002e-05, + "loss": 0.5405, + "step": 4024 + }, + { + "epoch": 0.1767785986366181, + "grad_norm": 2.703125, + "learning_rate": 4.63455454903835e-05, + "loss": 0.5601, + "step": 4026 + }, + { + "epoch": 0.1768664171158216, + "grad_norm": 2.8125, + "learning_rate": 4.634194325418998e-05, + "loss": 0.5059, + "step": 4028 + }, + { + "epoch": 0.17695423559502507, + "grad_norm": 2.5, + "learning_rate": 4.633833938362533e-05, + "loss": 0.5236, + "step": 4030 + }, + { + "epoch": 0.17704205407422857, + "grad_norm": 2.828125, + "learning_rate": 4.633473387896554e-05, + "loss": 0.5242, + "step": 4032 + }, + { + "epoch": 0.17712987255343204, + "grad_norm": 2.625, + "learning_rate": 4.6331126740486704e-05, + "loss": 0.5302, + "step": 4034 + }, + { + "epoch": 0.17721769103263554, + "grad_norm": 2.21875, + "learning_rate": 4.632751796846508e-05, + "loss": 0.4761, + "step": 4036 + }, + { + "epoch": 0.17730550951183902, + "grad_norm": 2.5, + "learning_rate": 4.632390756317702e-05, + "loss": 0.5416, + "step": 4038 + }, + { + "epoch": 0.17739332799104252, + "grad_norm": 2.34375, + "learning_rate": 4.6320295524898996e-05, + "loss": 0.5317, + "step": 4040 + }, + { + "epoch": 0.17748114647024602, + "grad_norm": 2.765625, + "learning_rate": 4.631668185390764e-05, + "loss": 0.5271, + "step": 4042 + }, + { + "epoch": 0.1775689649494495, + "grad_norm": 2.3125, + "learning_rate": 4.631306655047968e-05, + "loss": 0.5316, + "step": 4044 + }, + { + "epoch": 0.177656783428653, + "grad_norm": 2.6875, + "learning_rate": 4.630944961489196e-05, + "loss": 0.548, + "step": 4046 + }, + { + "epoch": 0.17774460190785646, + "grad_norm": 2.484375, + "learning_rate": 4.630583104742149e-05, + "loss": 0.5027, + "step": 4048 + }, + { + "epoch": 0.17783242038705996, + "grad_norm": 2.421875, + "learning_rate": 4.6302210848345367e-05, + "loss": 0.5404, + "step": 4050 + }, + { + "epoch": 0.17792023886626343, + "grad_norm": 2.9375, + "learning_rate": 4.6298589017940836e-05, + "loss": 0.5267, + "step": 4052 + }, + { + "epoch": 0.17800805734546693, + "grad_norm": 2.28125, + "learning_rate": 4.629496555648525e-05, + "loss": 0.5045, + "step": 4054 + }, + { + "epoch": 0.1780958758246704, + "grad_norm": 2.25, + "learning_rate": 4.629134046425609e-05, + "loss": 0.4785, + "step": 4056 + }, + { + "epoch": 0.1781836943038739, + "grad_norm": 2.34375, + "learning_rate": 4.628771374153098e-05, + "loss": 0.5289, + "step": 4058 + }, + { + "epoch": 0.17827151278307737, + "grad_norm": 2.4375, + "learning_rate": 4.628408538858765e-05, + "loss": 0.5295, + "step": 4060 + }, + { + "epoch": 0.17835933126228087, + "grad_norm": 2.421875, + "learning_rate": 4.628045540570396e-05, + "loss": 0.4988, + "step": 4062 + }, + { + "epoch": 0.17844714974148435, + "grad_norm": 2.6875, + "learning_rate": 4.6276823793157884e-05, + "loss": 0.5448, + "step": 4064 + }, + { + "epoch": 0.17853496822068785, + "grad_norm": 2.5625, + "learning_rate": 4.627319055122755e-05, + "loss": 0.5266, + "step": 4066 + }, + { + "epoch": 0.17862278669989132, + "grad_norm": 2.640625, + "learning_rate": 4.626955568019119e-05, + "loss": 0.5096, + "step": 4068 + }, + { + "epoch": 0.17871060517909482, + "grad_norm": 3.21875, + "learning_rate": 4.626591918032715e-05, + "loss": 0.5105, + "step": 4070 + }, + { + "epoch": 0.1787984236582983, + "grad_norm": 3.265625, + "learning_rate": 4.626228105191392e-05, + "loss": 0.4999, + "step": 4072 + }, + { + "epoch": 0.1788862421375018, + "grad_norm": 3.0, + "learning_rate": 4.625864129523011e-05, + "loss": 0.5009, + "step": 4074 + }, + { + "epoch": 0.17897406061670526, + "grad_norm": 3.265625, + "learning_rate": 4.6254999910554456e-05, + "loss": 0.5187, + "step": 4076 + }, + { + "epoch": 0.17906187909590876, + "grad_norm": 2.90625, + "learning_rate": 4.625135689816581e-05, + "loss": 0.5063, + "step": 4078 + }, + { + "epoch": 0.17914969757511223, + "grad_norm": 3.015625, + "learning_rate": 4.624771225834316e-05, + "loss": 0.5143, + "step": 4080 + }, + { + "epoch": 0.17923751605431573, + "grad_norm": 2.734375, + "learning_rate": 4.624406599136562e-05, + "loss": 0.519, + "step": 4082 + }, + { + "epoch": 0.1793253345335192, + "grad_norm": 2.640625, + "learning_rate": 4.62404180975124e-05, + "loss": 0.5254, + "step": 4084 + }, + { + "epoch": 0.1794131530127227, + "grad_norm": 2.78125, + "learning_rate": 4.6236768577062876e-05, + "loss": 0.5358, + "step": 4086 + }, + { + "epoch": 0.17950097149192618, + "grad_norm": 2.578125, + "learning_rate": 4.623311743029652e-05, + "loss": 0.4824, + "step": 4088 + }, + { + "epoch": 0.17958878997112968, + "grad_norm": 2.9375, + "learning_rate": 4.622946465749295e-05, + "loss": 0.5151, + "step": 4090 + }, + { + "epoch": 0.17967660845033318, + "grad_norm": 3.1875, + "learning_rate": 4.622581025893189e-05, + "loss": 0.5062, + "step": 4092 + }, + { + "epoch": 0.17976442692953665, + "grad_norm": 3.234375, + "learning_rate": 4.622215423489318e-05, + "loss": 0.5197, + "step": 4094 + }, + { + "epoch": 0.17985224540874015, + "grad_norm": 3.09375, + "learning_rate": 4.6218496585656815e-05, + "loss": 0.5057, + "step": 4096 + }, + { + "epoch": 0.17994006388794362, + "grad_norm": 2.78125, + "learning_rate": 4.62148373115029e-05, + "loss": 0.516, + "step": 4098 + }, + { + "epoch": 0.18002788236714712, + "grad_norm": 2.921875, + "learning_rate": 4.621117641271166e-05, + "loss": 0.5342, + "step": 4100 + }, + { + "epoch": 0.1801157008463506, + "grad_norm": 2.796875, + "learning_rate": 4.620751388956344e-05, + "loss": 0.52, + "step": 4102 + }, + { + "epoch": 0.1802035193255541, + "grad_norm": 2.703125, + "learning_rate": 4.620384974233872e-05, + "loss": 0.5188, + "step": 4104 + }, + { + "epoch": 0.18029133780475756, + "grad_norm": 2.765625, + "learning_rate": 4.6200183971318104e-05, + "loss": 0.5444, + "step": 4106 + }, + { + "epoch": 0.18037915628396106, + "grad_norm": 3.171875, + "learning_rate": 4.619651657678232e-05, + "loss": 0.526, + "step": 4108 + }, + { + "epoch": 0.18046697476316453, + "grad_norm": 2.34375, + "learning_rate": 4.619284755901222e-05, + "loss": 0.476, + "step": 4110 + }, + { + "epoch": 0.18055479324236803, + "grad_norm": 3.078125, + "learning_rate": 4.618917691828877e-05, + "loss": 0.5217, + "step": 4112 + }, + { + "epoch": 0.1806426117215715, + "grad_norm": 2.84375, + "learning_rate": 4.618550465489307e-05, + "loss": 0.5009, + "step": 4114 + }, + { + "epoch": 0.180730430200775, + "grad_norm": 2.59375, + "learning_rate": 4.618183076910635e-05, + "loss": 0.5349, + "step": 4116 + }, + { + "epoch": 0.18081824867997848, + "grad_norm": 3.109375, + "learning_rate": 4.6178155261209946e-05, + "loss": 0.5149, + "step": 4118 + }, + { + "epoch": 0.18090606715918198, + "grad_norm": 2.90625, + "learning_rate": 4.6174478131485334e-05, + "loss": 0.513, + "step": 4120 + }, + { + "epoch": 0.18099388563838545, + "grad_norm": 3.359375, + "learning_rate": 4.617079938021412e-05, + "loss": 0.5095, + "step": 4122 + }, + { + "epoch": 0.18108170411758895, + "grad_norm": 3.5625, + "learning_rate": 4.616711900767801e-05, + "loss": 0.4875, + "step": 4124 + }, + { + "epoch": 0.18116952259679242, + "grad_norm": 2.953125, + "learning_rate": 4.616343701415885e-05, + "loss": 0.5149, + "step": 4126 + }, + { + "epoch": 0.18125734107599592, + "grad_norm": 3.125, + "learning_rate": 4.615975339993861e-05, + "loss": 0.5326, + "step": 4128 + }, + { + "epoch": 0.1813451595551994, + "grad_norm": 2.859375, + "learning_rate": 4.615606816529939e-05, + "loss": 0.4981, + "step": 4130 + }, + { + "epoch": 0.1814329780344029, + "grad_norm": 2.6875, + "learning_rate": 4.6152381310523387e-05, + "loss": 0.5039, + "step": 4132 + }, + { + "epoch": 0.18152079651360636, + "grad_norm": 2.359375, + "learning_rate": 4.614869283589296e-05, + "loss": 0.5351, + "step": 4134 + }, + { + "epoch": 0.18160861499280986, + "grad_norm": 2.5625, + "learning_rate": 4.614500274169057e-05, + "loss": 0.5039, + "step": 4136 + }, + { + "epoch": 0.18169643347201334, + "grad_norm": 2.6875, + "learning_rate": 4.6141311028198786e-05, + "loss": 0.5379, + "step": 4138 + }, + { + "epoch": 0.18178425195121684, + "grad_norm": 2.4375, + "learning_rate": 4.613761769570035e-05, + "loss": 0.5038, + "step": 4140 + }, + { + "epoch": 0.18187207043042034, + "grad_norm": 3.3125, + "learning_rate": 4.6133922744478076e-05, + "loss": 0.509, + "step": 4142 + }, + { + "epoch": 0.1819598889096238, + "grad_norm": 3.109375, + "learning_rate": 4.613022617481494e-05, + "loss": 0.4996, + "step": 4144 + }, + { + "epoch": 0.1820477073888273, + "grad_norm": 2.546875, + "learning_rate": 4.612652798699401e-05, + "loss": 0.5164, + "step": 4146 + }, + { + "epoch": 0.18213552586803078, + "grad_norm": 2.84375, + "learning_rate": 4.612282818129851e-05, + "loss": 0.4928, + "step": 4148 + }, + { + "epoch": 0.18222334434723428, + "grad_norm": 2.5, + "learning_rate": 4.611912675801176e-05, + "loss": 0.499, + "step": 4150 + }, + { + "epoch": 0.18231116282643775, + "grad_norm": 2.359375, + "learning_rate": 4.611542371741722e-05, + "loss": 0.4836, + "step": 4152 + }, + { + "epoch": 0.18239898130564125, + "grad_norm": 2.453125, + "learning_rate": 4.6111719059798466e-05, + "loss": 0.4947, + "step": 4154 + }, + { + "epoch": 0.18248679978484472, + "grad_norm": 2.5, + "learning_rate": 4.610801278543921e-05, + "loss": 0.5091, + "step": 4156 + }, + { + "epoch": 0.18257461826404822, + "grad_norm": 2.640625, + "learning_rate": 4.610430489462327e-05, + "loss": 0.5287, + "step": 4158 + }, + { + "epoch": 0.1826624367432517, + "grad_norm": 3.015625, + "learning_rate": 4.6100595387634616e-05, + "loss": 0.4934, + "step": 4160 + }, + { + "epoch": 0.1827502552224552, + "grad_norm": 2.875, + "learning_rate": 4.60968842647573e-05, + "loss": 0.5256, + "step": 4162 + }, + { + "epoch": 0.18283807370165867, + "grad_norm": 2.578125, + "learning_rate": 4.6093171526275524e-05, + "loss": 0.5091, + "step": 4164 + }, + { + "epoch": 0.18292589218086217, + "grad_norm": 2.421875, + "learning_rate": 4.608945717247363e-05, + "loss": 0.5385, + "step": 4166 + }, + { + "epoch": 0.18301371066006564, + "grad_norm": 2.546875, + "learning_rate": 4.6085741203636035e-05, + "loss": 0.5128, + "step": 4168 + }, + { + "epoch": 0.18310152913926914, + "grad_norm": 2.53125, + "learning_rate": 4.6082023620047335e-05, + "loss": 0.5135, + "step": 4170 + }, + { + "epoch": 0.1831893476184726, + "grad_norm": 2.453125, + "learning_rate": 4.60783044219922e-05, + "loss": 0.5336, + "step": 4172 + }, + { + "epoch": 0.1832771660976761, + "grad_norm": 2.46875, + "learning_rate": 4.607458360975547e-05, + "loss": 0.5043, + "step": 4174 + }, + { + "epoch": 0.18336498457687958, + "grad_norm": 2.609375, + "learning_rate": 4.607086118362208e-05, + "loss": 0.4748, + "step": 4176 + }, + { + "epoch": 0.18345280305608308, + "grad_norm": 3.125, + "learning_rate": 4.6067137143877084e-05, + "loss": 0.5004, + "step": 4178 + }, + { + "epoch": 0.18354062153528655, + "grad_norm": 3.6875, + "learning_rate": 4.606341149080567e-05, + "loss": 0.4805, + "step": 4180 + }, + { + "epoch": 0.18362844001449005, + "grad_norm": 3.71875, + "learning_rate": 4.605968422469316e-05, + "loss": 0.4993, + "step": 4182 + }, + { + "epoch": 0.18371625849369352, + "grad_norm": 4.28125, + "learning_rate": 4.605595534582498e-05, + "loss": 0.539, + "step": 4184 + }, + { + "epoch": 0.18380407697289702, + "grad_norm": 3.953125, + "learning_rate": 4.60522248544867e-05, + "loss": 0.5105, + "step": 4186 + }, + { + "epoch": 0.1838918954521005, + "grad_norm": 3.15625, + "learning_rate": 4.604849275096399e-05, + "loss": 0.5194, + "step": 4188 + }, + { + "epoch": 0.183979713931304, + "grad_norm": 2.6875, + "learning_rate": 4.604475903554266e-05, + "loss": 0.5028, + "step": 4190 + }, + { + "epoch": 0.18406753241050747, + "grad_norm": 2.53125, + "learning_rate": 4.6041023708508635e-05, + "loss": 0.49, + "step": 4192 + }, + { + "epoch": 0.18415535088971097, + "grad_norm": 2.296875, + "learning_rate": 4.603728677014797e-05, + "loss": 0.4791, + "step": 4194 + }, + { + "epoch": 0.18424316936891447, + "grad_norm": 2.9375, + "learning_rate": 4.6033548220746846e-05, + "loss": 0.5261, + "step": 4196 + }, + { + "epoch": 0.18433098784811794, + "grad_norm": 2.359375, + "learning_rate": 4.602980806059155e-05, + "loss": 0.5155, + "step": 4198 + }, + { + "epoch": 0.18441880632732144, + "grad_norm": 2.671875, + "learning_rate": 4.602606628996852e-05, + "loss": 0.4972, + "step": 4200 + }, + { + "epoch": 0.1845066248065249, + "grad_norm": 2.65625, + "learning_rate": 4.6022322909164286e-05, + "loss": 0.5362, + "step": 4202 + }, + { + "epoch": 0.1845944432857284, + "grad_norm": 3.5625, + "learning_rate": 4.601857791846553e-05, + "loss": 0.4848, + "step": 4204 + }, + { + "epoch": 0.18468226176493188, + "grad_norm": 3.59375, + "learning_rate": 4.6014831318159036e-05, + "loss": 0.5352, + "step": 4206 + }, + { + "epoch": 0.18477008024413538, + "grad_norm": 3.4375, + "learning_rate": 4.6011083108531725e-05, + "loss": 0.5178, + "step": 4208 + }, + { + "epoch": 0.18485789872333885, + "grad_norm": 2.921875, + "learning_rate": 4.6007333289870625e-05, + "loss": 0.4915, + "step": 4210 + }, + { + "epoch": 0.18494571720254235, + "grad_norm": 2.59375, + "learning_rate": 4.600358186246291e-05, + "loss": 0.4989, + "step": 4212 + }, + { + "epoch": 0.18503353568174583, + "grad_norm": 2.578125, + "learning_rate": 4.599982882659586e-05, + "loss": 0.5105, + "step": 4214 + }, + { + "epoch": 0.18512135416094933, + "grad_norm": 2.4375, + "learning_rate": 4.599607418255689e-05, + "loss": 0.5102, + "step": 4216 + }, + { + "epoch": 0.1852091726401528, + "grad_norm": 2.421875, + "learning_rate": 4.5992317930633524e-05, + "loss": 0.5115, + "step": 4218 + }, + { + "epoch": 0.1852969911193563, + "grad_norm": 2.46875, + "learning_rate": 4.5988560071113416e-05, + "loss": 0.5128, + "step": 4220 + }, + { + "epoch": 0.18538480959855977, + "grad_norm": 2.921875, + "learning_rate": 4.598480060428435e-05, + "loss": 0.5167, + "step": 4222 + }, + { + "epoch": 0.18547262807776327, + "grad_norm": 3.25, + "learning_rate": 4.598103953043422e-05, + "loss": 0.525, + "step": 4224 + }, + { + "epoch": 0.18556044655696674, + "grad_norm": 3.640625, + "learning_rate": 4.597727684985105e-05, + "loss": 0.5016, + "step": 4226 + }, + { + "epoch": 0.18564826503617024, + "grad_norm": 4.9375, + "learning_rate": 4.5973512562823e-05, + "loss": 0.5174, + "step": 4228 + }, + { + "epoch": 0.1857360835153737, + "grad_norm": 3.78125, + "learning_rate": 4.596974666963832e-05, + "loss": 0.5166, + "step": 4230 + }, + { + "epoch": 0.1858239019945772, + "grad_norm": 2.78125, + "learning_rate": 4.596597917058542e-05, + "loss": 0.5226, + "step": 4232 + }, + { + "epoch": 0.18591172047378068, + "grad_norm": 2.203125, + "learning_rate": 4.59622100659528e-05, + "loss": 0.5479, + "step": 4234 + }, + { + "epoch": 0.18599953895298418, + "grad_norm": 2.71875, + "learning_rate": 4.595843935602913e-05, + "loss": 0.5321, + "step": 4236 + }, + { + "epoch": 0.18608735743218766, + "grad_norm": 3.25, + "learning_rate": 4.5954667041103126e-05, + "loss": 0.5096, + "step": 4238 + }, + { + "epoch": 0.18617517591139116, + "grad_norm": 5.15625, + "learning_rate": 4.595089312146371e-05, + "loss": 0.4967, + "step": 4240 + }, + { + "epoch": 0.18626299439059463, + "grad_norm": 4.5625, + "learning_rate": 4.594711759739987e-05, + "loss": 0.5105, + "step": 4242 + }, + { + "epoch": 0.18635081286979813, + "grad_norm": 3.453125, + "learning_rate": 4.594334046920075e-05, + "loss": 0.5182, + "step": 4244 + }, + { + "epoch": 0.18643863134900163, + "grad_norm": 2.671875, + "learning_rate": 4.593956173715559e-05, + "loss": 0.5176, + "step": 4246 + }, + { + "epoch": 0.1865264498282051, + "grad_norm": 3.25, + "learning_rate": 4.593578140155377e-05, + "loss": 0.4972, + "step": 4248 + }, + { + "epoch": 0.1866142683074086, + "grad_norm": 3.3125, + "learning_rate": 4.593199946268479e-05, + "loss": 0.4917, + "step": 4250 + }, + { + "epoch": 0.18670208678661207, + "grad_norm": 2.984375, + "learning_rate": 4.592821592083829e-05, + "loss": 0.5161, + "step": 4252 + }, + { + "epoch": 0.18678990526581557, + "grad_norm": 3.0625, + "learning_rate": 4.592443077630398e-05, + "loss": 0.4934, + "step": 4254 + }, + { + "epoch": 0.18687772374501904, + "grad_norm": 2.8125, + "learning_rate": 4.5920644029371764e-05, + "loss": 0.4934, + "step": 4256 + }, + { + "epoch": 0.18696554222422254, + "grad_norm": 2.625, + "learning_rate": 4.5916855680331596e-05, + "loss": 0.493, + "step": 4258 + }, + { + "epoch": 0.187053360703426, + "grad_norm": 2.5, + "learning_rate": 4.5913065729473614e-05, + "loss": 0.5358, + "step": 4260 + }, + { + "epoch": 0.1871411791826295, + "grad_norm": 2.703125, + "learning_rate": 4.590927417708804e-05, + "loss": 0.4723, + "step": 4262 + }, + { + "epoch": 0.18722899766183299, + "grad_norm": 2.59375, + "learning_rate": 4.590548102346525e-05, + "loss": 0.5063, + "step": 4264 + }, + { + "epoch": 0.18731681614103649, + "grad_norm": 2.921875, + "learning_rate": 4.5901686268895694e-05, + "loss": 0.5412, + "step": 4266 + }, + { + "epoch": 0.18740463462023996, + "grad_norm": 2.953125, + "learning_rate": 4.589788991367e-05, + "loss": 0.496, + "step": 4268 + }, + { + "epoch": 0.18749245309944346, + "grad_norm": 2.828125, + "learning_rate": 4.5894091958078886e-05, + "loss": 0.4843, + "step": 4270 + }, + { + "epoch": 0.18758027157864693, + "grad_norm": 2.34375, + "learning_rate": 4.5890292402413205e-05, + "loss": 0.4969, + "step": 4272 + }, + { + "epoch": 0.18766809005785043, + "grad_norm": 2.359375, + "learning_rate": 4.5886491246963925e-05, + "loss": 0.4921, + "step": 4274 + }, + { + "epoch": 0.1877559085370539, + "grad_norm": 2.578125, + "learning_rate": 4.5882688492022134e-05, + "loss": 0.5024, + "step": 4276 + }, + { + "epoch": 0.1878437270162574, + "grad_norm": 2.609375, + "learning_rate": 4.587888413787905e-05, + "loss": 0.4909, + "step": 4278 + }, + { + "epoch": 0.18793154549546087, + "grad_norm": 2.5, + "learning_rate": 4.587507818482603e-05, + "loss": 0.5371, + "step": 4280 + }, + { + "epoch": 0.18801936397466437, + "grad_norm": 2.578125, + "learning_rate": 4.58712706331545e-05, + "loss": 0.5265, + "step": 4282 + }, + { + "epoch": 0.18810718245386784, + "grad_norm": 2.265625, + "learning_rate": 4.586746148315607e-05, + "loss": 0.5251, + "step": 4284 + }, + { + "epoch": 0.18819500093307134, + "grad_norm": 2.546875, + "learning_rate": 4.5863650735122435e-05, + "loss": 0.4889, + "step": 4286 + }, + { + "epoch": 0.18828281941227482, + "grad_norm": 2.484375, + "learning_rate": 4.5859838389345425e-05, + "loss": 0.4883, + "step": 4288 + }, + { + "epoch": 0.18837063789147832, + "grad_norm": 2.359375, + "learning_rate": 4.585602444611699e-05, + "loss": 0.4715, + "step": 4290 + }, + { + "epoch": 0.1884584563706818, + "grad_norm": 2.578125, + "learning_rate": 4.58522089057292e-05, + "loss": 0.4844, + "step": 4292 + }, + { + "epoch": 0.1885462748498853, + "grad_norm": 2.6875, + "learning_rate": 4.584839176847426e-05, + "loss": 0.516, + "step": 4294 + }, + { + "epoch": 0.1886340933290888, + "grad_norm": 2.25, + "learning_rate": 4.5844573034644485e-05, + "loss": 0.5009, + "step": 4296 + }, + { + "epoch": 0.18872191180829226, + "grad_norm": 2.5625, + "learning_rate": 4.5840752704532304e-05, + "loss": 0.5116, + "step": 4298 + }, + { + "epoch": 0.18880973028749576, + "grad_norm": 3.015625, + "learning_rate": 4.583693077843028e-05, + "loss": 0.5064, + "step": 4300 + }, + { + "epoch": 0.18889754876669923, + "grad_norm": 3.078125, + "learning_rate": 4.5833107256631114e-05, + "loss": 0.513, + "step": 4302 + }, + { + "epoch": 0.18898536724590273, + "grad_norm": 2.484375, + "learning_rate": 4.582928213942759e-05, + "loss": 0.5207, + "step": 4304 + }, + { + "epoch": 0.1890731857251062, + "grad_norm": 2.3125, + "learning_rate": 4.582545542711265e-05, + "loss": 0.5153, + "step": 4306 + }, + { + "epoch": 0.1891610042043097, + "grad_norm": 2.40625, + "learning_rate": 4.582162711997934e-05, + "loss": 0.4788, + "step": 4308 + }, + { + "epoch": 0.18924882268351317, + "grad_norm": 2.5625, + "learning_rate": 4.581779721832084e-05, + "loss": 0.4913, + "step": 4310 + }, + { + "epoch": 0.18933664116271667, + "grad_norm": 2.296875, + "learning_rate": 4.581396572243043e-05, + "loss": 0.507, + "step": 4312 + }, + { + "epoch": 0.18942445964192015, + "grad_norm": 2.375, + "learning_rate": 4.581013263260154e-05, + "loss": 0.4794, + "step": 4314 + }, + { + "epoch": 0.18951227812112365, + "grad_norm": 2.578125, + "learning_rate": 4.58062979491277e-05, + "loss": 0.4966, + "step": 4316 + }, + { + "epoch": 0.18960009660032712, + "grad_norm": 2.625, + "learning_rate": 4.580246167230258e-05, + "loss": 0.5002, + "step": 4318 + }, + { + "epoch": 0.18968791507953062, + "grad_norm": 2.640625, + "learning_rate": 4.579862380241996e-05, + "loss": 0.4809, + "step": 4320 + }, + { + "epoch": 0.1897757335587341, + "grad_norm": 3.046875, + "learning_rate": 4.579478433977374e-05, + "loss": 0.4919, + "step": 4322 + }, + { + "epoch": 0.1898635520379376, + "grad_norm": 2.796875, + "learning_rate": 4.579094328465795e-05, + "loss": 0.5036, + "step": 4324 + }, + { + "epoch": 0.18995137051714106, + "grad_norm": 2.65625, + "learning_rate": 4.578710063736674e-05, + "loss": 0.5281, + "step": 4326 + }, + { + "epoch": 0.19003918899634456, + "grad_norm": 2.71875, + "learning_rate": 4.578325639819438e-05, + "loss": 0.5063, + "step": 4328 + }, + { + "epoch": 0.19012700747554803, + "grad_norm": 3.046875, + "learning_rate": 4.5779410567435264e-05, + "loss": 0.5193, + "step": 4330 + }, + { + "epoch": 0.19021482595475153, + "grad_norm": 2.765625, + "learning_rate": 4.577556314538391e-05, + "loss": 0.5139, + "step": 4332 + }, + { + "epoch": 0.190302644433955, + "grad_norm": 2.28125, + "learning_rate": 4.5771714132334945e-05, + "loss": 0.5132, + "step": 4334 + }, + { + "epoch": 0.1903904629131585, + "grad_norm": 2.46875, + "learning_rate": 4.576786352858313e-05, + "loss": 0.5337, + "step": 4336 + }, + { + "epoch": 0.19047828139236198, + "grad_norm": 2.234375, + "learning_rate": 4.5764011334423354e-05, + "loss": 0.5202, + "step": 4338 + }, + { + "epoch": 0.19056609987156548, + "grad_norm": 2.484375, + "learning_rate": 4.576015755015061e-05, + "loss": 0.5288, + "step": 4340 + }, + { + "epoch": 0.19065391835076895, + "grad_norm": 2.625, + "learning_rate": 4.575630217606002e-05, + "loss": 0.5126, + "step": 4342 + }, + { + "epoch": 0.19074173682997245, + "grad_norm": 2.359375, + "learning_rate": 4.5752445212446836e-05, + "loss": 0.5032, + "step": 4344 + }, + { + "epoch": 0.19082955530917595, + "grad_norm": 2.390625, + "learning_rate": 4.574858665960643e-05, + "loss": 0.5159, + "step": 4346 + }, + { + "epoch": 0.19091737378837942, + "grad_norm": 2.59375, + "learning_rate": 4.574472651783428e-05, + "loss": 0.5548, + "step": 4348 + }, + { + "epoch": 0.19100519226758292, + "grad_norm": 2.96875, + "learning_rate": 4.5740864787425995e-05, + "loss": 0.4725, + "step": 4350 + }, + { + "epoch": 0.1910930107467864, + "grad_norm": 2.734375, + "learning_rate": 4.573700146867732e-05, + "loss": 0.4993, + "step": 4352 + }, + { + "epoch": 0.1911808292259899, + "grad_norm": 2.84375, + "learning_rate": 4.573313656188409e-05, + "loss": 0.5261, + "step": 4354 + }, + { + "epoch": 0.19126864770519336, + "grad_norm": 2.59375, + "learning_rate": 4.57292700673423e-05, + "loss": 0.5127, + "step": 4356 + }, + { + "epoch": 0.19135646618439686, + "grad_norm": 2.828125, + "learning_rate": 4.572540198534804e-05, + "loss": 0.5066, + "step": 4358 + }, + { + "epoch": 0.19144428466360033, + "grad_norm": 2.40625, + "learning_rate": 4.572153231619752e-05, + "loss": 0.5019, + "step": 4360 + }, + { + "epoch": 0.19153210314280383, + "grad_norm": 2.65625, + "learning_rate": 4.57176610601871e-05, + "loss": 0.4771, + "step": 4362 + }, + { + "epoch": 0.1916199216220073, + "grad_norm": 2.1875, + "learning_rate": 4.571378821761322e-05, + "loss": 0.525, + "step": 4364 + }, + { + "epoch": 0.1917077401012108, + "grad_norm": 2.296875, + "learning_rate": 4.5709913788772486e-05, + "loss": 0.5299, + "step": 4366 + }, + { + "epoch": 0.19179555858041428, + "grad_norm": 2.953125, + "learning_rate": 4.570603777396158e-05, + "loss": 0.5113, + "step": 4368 + }, + { + "epoch": 0.19188337705961778, + "grad_norm": 4.09375, + "learning_rate": 4.5702160173477336e-05, + "loss": 0.4981, + "step": 4370 + }, + { + "epoch": 0.19197119553882125, + "grad_norm": 4.59375, + "learning_rate": 4.56982809876167e-05, + "loss": 0.4895, + "step": 4372 + }, + { + "epoch": 0.19205901401802475, + "grad_norm": 4.03125, + "learning_rate": 4.5694400216676755e-05, + "loss": 0.4936, + "step": 4374 + }, + { + "epoch": 0.19214683249722822, + "grad_norm": 3.234375, + "learning_rate": 4.569051786095467e-05, + "loss": 0.5052, + "step": 4376 + }, + { + "epoch": 0.19223465097643172, + "grad_norm": 2.78125, + "learning_rate": 4.568663392074778e-05, + "loss": 0.4914, + "step": 4378 + }, + { + "epoch": 0.1923224694556352, + "grad_norm": 2.4375, + "learning_rate": 4.568274839635349e-05, + "loss": 0.4874, + "step": 4380 + }, + { + "epoch": 0.1924102879348387, + "grad_norm": 2.734375, + "learning_rate": 4.567886128806939e-05, + "loss": 0.5129, + "step": 4382 + }, + { + "epoch": 0.19249810641404216, + "grad_norm": 2.84375, + "learning_rate": 4.5674972596193114e-05, + "loss": 0.5016, + "step": 4384 + }, + { + "epoch": 0.19258592489324566, + "grad_norm": 3.0625, + "learning_rate": 4.567108232102249e-05, + "loss": 0.5084, + "step": 4386 + }, + { + "epoch": 0.19267374337244914, + "grad_norm": 2.96875, + "learning_rate": 4.566719046285542e-05, + "loss": 0.5344, + "step": 4388 + }, + { + "epoch": 0.19276156185165264, + "grad_norm": 2.625, + "learning_rate": 4.5663297021989963e-05, + "loss": 0.4685, + "step": 4390 + }, + { + "epoch": 0.1928493803308561, + "grad_norm": 2.84375, + "learning_rate": 4.565940199872426e-05, + "loss": 0.5278, + "step": 4392 + }, + { + "epoch": 0.1929371988100596, + "grad_norm": 2.796875, + "learning_rate": 4.56555053933566e-05, + "loss": 0.4925, + "step": 4394 + }, + { + "epoch": 0.1930250172892631, + "grad_norm": 3.078125, + "learning_rate": 4.5651607206185384e-05, + "loss": 0.5183, + "step": 4396 + }, + { + "epoch": 0.19311283576846658, + "grad_norm": 2.984375, + "learning_rate": 4.564770743750914e-05, + "loss": 0.4626, + "step": 4398 + }, + { + "epoch": 0.19320065424767008, + "grad_norm": 3.453125, + "learning_rate": 4.564380608762651e-05, + "loss": 0.4903, + "step": 4400 + }, + { + "epoch": 0.19328847272687355, + "grad_norm": 3.984375, + "learning_rate": 4.563990315683626e-05, + "loss": 0.4885, + "step": 4402 + }, + { + "epoch": 0.19337629120607705, + "grad_norm": 3.25, + "learning_rate": 4.5635998645437286e-05, + "loss": 0.5027, + "step": 4404 + }, + { + "epoch": 0.19346410968528052, + "grad_norm": 3.140625, + "learning_rate": 4.5632092553728576e-05, + "loss": 0.4832, + "step": 4406 + }, + { + "epoch": 0.19355192816448402, + "grad_norm": 3.34375, + "learning_rate": 4.5628184882009276e-05, + "loss": 0.514, + "step": 4408 + }, + { + "epoch": 0.1936397466436875, + "grad_norm": 2.890625, + "learning_rate": 4.562427563057864e-05, + "loss": 0.5072, + "step": 4410 + }, + { + "epoch": 0.193727565122891, + "grad_norm": 2.515625, + "learning_rate": 4.5620364799736026e-05, + "loss": 0.4741, + "step": 4412 + }, + { + "epoch": 0.19381538360209447, + "grad_norm": 2.65625, + "learning_rate": 4.561645238978094e-05, + "loss": 0.4841, + "step": 4414 + }, + { + "epoch": 0.19390320208129797, + "grad_norm": 2.703125, + "learning_rate": 4.561253840101298e-05, + "loss": 0.5125, + "step": 4416 + }, + { + "epoch": 0.19399102056050144, + "grad_norm": 2.796875, + "learning_rate": 4.560862283373188e-05, + "loss": 0.4877, + "step": 4418 + }, + { + "epoch": 0.19407883903970494, + "grad_norm": 3.21875, + "learning_rate": 4.560470568823753e-05, + "loss": 0.5164, + "step": 4420 + }, + { + "epoch": 0.1941666575189084, + "grad_norm": 3.328125, + "learning_rate": 4.5600786964829854e-05, + "loss": 0.481, + "step": 4422 + }, + { + "epoch": 0.1942544759981119, + "grad_norm": 3.15625, + "learning_rate": 4.559686666380898e-05, + "loss": 0.4857, + "step": 4424 + }, + { + "epoch": 0.19434229447731538, + "grad_norm": 3.546875, + "learning_rate": 4.559294478547512e-05, + "loss": 0.4905, + "step": 4426 + }, + { + "epoch": 0.19443011295651888, + "grad_norm": 3.96875, + "learning_rate": 4.5589021330128615e-05, + "loss": 0.512, + "step": 4428 + }, + { + "epoch": 0.19451793143572235, + "grad_norm": 3.140625, + "learning_rate": 4.558509629806992e-05, + "loss": 0.5058, + "step": 4430 + }, + { + "epoch": 0.19460574991492585, + "grad_norm": 2.90625, + "learning_rate": 4.558116968959962e-05, + "loss": 0.4877, + "step": 4432 + }, + { + "epoch": 0.19469356839412932, + "grad_norm": 2.5, + "learning_rate": 4.5577241505018404e-05, + "loss": 0.4911, + "step": 4434 + }, + { + "epoch": 0.19478138687333282, + "grad_norm": 2.546875, + "learning_rate": 4.557331174462711e-05, + "loss": 0.5044, + "step": 4436 + }, + { + "epoch": 0.1948692053525363, + "grad_norm": 2.8125, + "learning_rate": 4.556938040872666e-05, + "loss": 0.5064, + "step": 4438 + }, + { + "epoch": 0.1949570238317398, + "grad_norm": 2.4375, + "learning_rate": 4.556544749761813e-05, + "loss": 0.4767, + "step": 4440 + }, + { + "epoch": 0.19504484231094327, + "grad_norm": 2.265625, + "learning_rate": 4.556151301160271e-05, + "loss": 0.5161, + "step": 4442 + }, + { + "epoch": 0.19513266079014677, + "grad_norm": 2.28125, + "learning_rate": 4.5557576950981684e-05, + "loss": 0.5014, + "step": 4444 + }, + { + "epoch": 0.19522047926935027, + "grad_norm": 2.46875, + "learning_rate": 4.555363931605649e-05, + "loss": 0.4857, + "step": 4446 + }, + { + "epoch": 0.19530829774855374, + "grad_norm": 2.5625, + "learning_rate": 4.554970010712867e-05, + "loss": 0.5094, + "step": 4448 + }, + { + "epoch": 0.19539611622775724, + "grad_norm": 2.265625, + "learning_rate": 4.5545759324499896e-05, + "loss": 0.5082, + "step": 4450 + }, + { + "epoch": 0.1954839347069607, + "grad_norm": 2.640625, + "learning_rate": 4.554181696847194e-05, + "loss": 0.515, + "step": 4452 + }, + { + "epoch": 0.1955717531861642, + "grad_norm": 2.3125, + "learning_rate": 4.5537873039346716e-05, + "loss": 0.5094, + "step": 4454 + }, + { + "epoch": 0.19565957166536768, + "grad_norm": 2.390625, + "learning_rate": 4.553392753742626e-05, + "loss": 0.5027, + "step": 4456 + }, + { + "epoch": 0.19574739014457118, + "grad_norm": 2.5625, + "learning_rate": 4.55299804630127e-05, + "loss": 0.5025, + "step": 4458 + }, + { + "epoch": 0.19583520862377465, + "grad_norm": 2.546875, + "learning_rate": 4.5526031816408324e-05, + "loss": 0.4915, + "step": 4460 + }, + { + "epoch": 0.19592302710297815, + "grad_norm": 2.75, + "learning_rate": 4.55220815979155e-05, + "loss": 0.5053, + "step": 4462 + }, + { + "epoch": 0.19601084558218163, + "grad_norm": 2.5625, + "learning_rate": 4.5518129807836755e-05, + "loss": 0.5056, + "step": 4464 + }, + { + "epoch": 0.19609866406138513, + "grad_norm": 2.59375, + "learning_rate": 4.551417644647471e-05, + "loss": 0.476, + "step": 4466 + }, + { + "epoch": 0.1961864825405886, + "grad_norm": 2.3125, + "learning_rate": 4.551022151413212e-05, + "loss": 0.5206, + "step": 4468 + }, + { + "epoch": 0.1962743010197921, + "grad_norm": 2.640625, + "learning_rate": 4.5506265011111845e-05, + "loss": 0.4979, + "step": 4470 + }, + { + "epoch": 0.19636211949899557, + "grad_norm": 2.9375, + "learning_rate": 4.5502306937716873e-05, + "loss": 0.5007, + "step": 4472 + }, + { + "epoch": 0.19644993797819907, + "grad_norm": 2.96875, + "learning_rate": 4.5498347294250335e-05, + "loss": 0.4995, + "step": 4474 + }, + { + "epoch": 0.19653775645740254, + "grad_norm": 2.890625, + "learning_rate": 4.549438608101544e-05, + "loss": 0.5034, + "step": 4476 + }, + { + "epoch": 0.19662557493660604, + "grad_norm": 2.921875, + "learning_rate": 4.549042329831554e-05, + "loss": 0.5002, + "step": 4478 + }, + { + "epoch": 0.1967133934158095, + "grad_norm": 2.84375, + "learning_rate": 4.548645894645413e-05, + "loss": 0.5025, + "step": 4480 + }, + { + "epoch": 0.196801211895013, + "grad_norm": 2.96875, + "learning_rate": 4.548249302573477e-05, + "loss": 0.5033, + "step": 4482 + }, + { + "epoch": 0.19688903037421648, + "grad_norm": 2.75, + "learning_rate": 4.547852553646118e-05, + "loss": 0.4976, + "step": 4484 + }, + { + "epoch": 0.19697684885341998, + "grad_norm": 2.375, + "learning_rate": 4.54745564789372e-05, + "loss": 0.4862, + "step": 4486 + }, + { + "epoch": 0.19706466733262346, + "grad_norm": 2.375, + "learning_rate": 4.547058585346678e-05, + "loss": 0.4548, + "step": 4488 + }, + { + "epoch": 0.19715248581182696, + "grad_norm": 2.796875, + "learning_rate": 4.5466613660353985e-05, + "loss": 0.5254, + "step": 4490 + }, + { + "epoch": 0.19724030429103043, + "grad_norm": 2.234375, + "learning_rate": 4.546263989990302e-05, + "loss": 0.4946, + "step": 4492 + }, + { + "epoch": 0.19732812277023393, + "grad_norm": 2.609375, + "learning_rate": 4.545866457241817e-05, + "loss": 0.4971, + "step": 4494 + }, + { + "epoch": 0.19741594124943743, + "grad_norm": 2.625, + "learning_rate": 4.545468767820389e-05, + "loss": 0.4876, + "step": 4496 + }, + { + "epoch": 0.1975037597286409, + "grad_norm": 3.015625, + "learning_rate": 4.545070921756472e-05, + "loss": 0.5235, + "step": 4498 + }, + { + "epoch": 0.1975915782078444, + "grad_norm": 3.0625, + "learning_rate": 4.5446729190805345e-05, + "loss": 0.501, + "step": 4500 + }, + { + "epoch": 0.19767939668704787, + "grad_norm": 3.015625, + "learning_rate": 4.544274759823054e-05, + "loss": 0.5097, + "step": 4502 + }, + { + "epoch": 0.19776721516625137, + "grad_norm": 2.375, + "learning_rate": 4.5438764440145225e-05, + "loss": 0.4899, + "step": 4504 + }, + { + "epoch": 0.19785503364545484, + "grad_norm": 2.296875, + "learning_rate": 4.543477971685443e-05, + "loss": 0.4941, + "step": 4506 + }, + { + "epoch": 0.19794285212465834, + "grad_norm": 2.625, + "learning_rate": 4.543079342866331e-05, + "loss": 0.5089, + "step": 4508 + }, + { + "epoch": 0.1980306706038618, + "grad_norm": 2.578125, + "learning_rate": 4.542680557587712e-05, + "loss": 0.4812, + "step": 4510 + }, + { + "epoch": 0.1981184890830653, + "grad_norm": 2.390625, + "learning_rate": 4.5422816158801275e-05, + "loss": 0.4979, + "step": 4512 + }, + { + "epoch": 0.19820630756226879, + "grad_norm": 2.4375, + "learning_rate": 4.5418825177741266e-05, + "loss": 0.4836, + "step": 4514 + }, + { + "epoch": 0.19829412604147229, + "grad_norm": 2.53125, + "learning_rate": 4.541483263300273e-05, + "loss": 0.5024, + "step": 4516 + }, + { + "epoch": 0.19838194452067576, + "grad_norm": 2.28125, + "learning_rate": 4.5410838524891416e-05, + "loss": 0.4997, + "step": 4518 + }, + { + "epoch": 0.19846976299987926, + "grad_norm": 2.4375, + "learning_rate": 4.540684285371321e-05, + "loss": 0.5085, + "step": 4520 + }, + { + "epoch": 0.19855758147908273, + "grad_norm": 2.609375, + "learning_rate": 4.540284561977406e-05, + "loss": 0.4939, + "step": 4522 + }, + { + "epoch": 0.19864539995828623, + "grad_norm": 2.796875, + "learning_rate": 4.5398846823380124e-05, + "loss": 0.5064, + "step": 4524 + }, + { + "epoch": 0.1987332184374897, + "grad_norm": 2.765625, + "learning_rate": 4.53948464648376e-05, + "loss": 0.462, + "step": 4526 + }, + { + "epoch": 0.1988210369166932, + "grad_norm": 2.375, + "learning_rate": 4.539084454445284e-05, + "loss": 0.492, + "step": 4528 + }, + { + "epoch": 0.19890885539589667, + "grad_norm": 2.359375, + "learning_rate": 4.5386841062532325e-05, + "loss": 0.4829, + "step": 4530 + }, + { + "epoch": 0.19899667387510017, + "grad_norm": 2.421875, + "learning_rate": 4.538283601938263e-05, + "loss": 0.4771, + "step": 4532 + }, + { + "epoch": 0.19908449235430364, + "grad_norm": 2.609375, + "learning_rate": 4.5378829415310465e-05, + "loss": 0.5275, + "step": 4534 + }, + { + "epoch": 0.19917231083350714, + "grad_norm": 2.5, + "learning_rate": 4.5374821250622665e-05, + "loss": 0.5158, + "step": 4536 + }, + { + "epoch": 0.19926012931271062, + "grad_norm": 2.65625, + "learning_rate": 4.5370811525626166e-05, + "loss": 0.5223, + "step": 4538 + }, + { + "epoch": 0.19934794779191412, + "grad_norm": 2.734375, + "learning_rate": 4.5366800240628034e-05, + "loss": 0.5189, + "step": 4540 + }, + { + "epoch": 0.1994357662711176, + "grad_norm": 2.40625, + "learning_rate": 4.536278739593546e-05, + "loss": 0.5121, + "step": 4542 + }, + { + "epoch": 0.1995235847503211, + "grad_norm": 2.609375, + "learning_rate": 4.535877299185575e-05, + "loss": 0.5178, + "step": 4544 + }, + { + "epoch": 0.19961140322952456, + "grad_norm": 2.59375, + "learning_rate": 4.5354757028696315e-05, + "loss": 0.5079, + "step": 4546 + }, + { + "epoch": 0.19969922170872806, + "grad_norm": 2.3125, + "learning_rate": 4.5350739506764714e-05, + "loss": 0.4885, + "step": 4548 + }, + { + "epoch": 0.19978704018793156, + "grad_norm": 2.703125, + "learning_rate": 4.53467204263686e-05, + "loss": 0.5099, + "step": 4550 + }, + { + "epoch": 0.19987485866713503, + "grad_norm": 2.46875, + "learning_rate": 4.5342699787815755e-05, + "loss": 0.4823, + "step": 4552 + }, + { + "epoch": 0.19996267714633853, + "grad_norm": 2.890625, + "learning_rate": 4.5338677591414095e-05, + "loss": 0.4902, + "step": 4554 + }, + { + "epoch": 0.200050495625542, + "grad_norm": 2.3125, + "learning_rate": 4.533465383747162e-05, + "loss": 0.5072, + "step": 4556 + }, + { + "epoch": 0.2001383141047455, + "grad_norm": 2.40625, + "learning_rate": 4.533062852629648e-05, + "loss": 0.5244, + "step": 4558 + }, + { + "epoch": 0.20022613258394897, + "grad_norm": 2.265625, + "learning_rate": 4.5326601658196937e-05, + "loss": 0.5189, + "step": 4560 + }, + { + "epoch": 0.20031395106315247, + "grad_norm": 2.53125, + "learning_rate": 4.532257323348137e-05, + "loss": 0.5431, + "step": 4562 + }, + { + "epoch": 0.20040176954235595, + "grad_norm": 2.5625, + "learning_rate": 4.5318543252458266e-05, + "loss": 0.4636, + "step": 4564 + }, + { + "epoch": 0.20048958802155945, + "grad_norm": 2.484375, + "learning_rate": 4.5314511715436246e-05, + "loss": 0.4813, + "step": 4566 + }, + { + "epoch": 0.20057740650076292, + "grad_norm": 3.140625, + "learning_rate": 4.531047862272406e-05, + "loss": 0.5049, + "step": 4568 + }, + { + "epoch": 0.20066522497996642, + "grad_norm": 2.296875, + "learning_rate": 4.530644397463055e-05, + "loss": 0.5004, + "step": 4570 + }, + { + "epoch": 0.2007530434591699, + "grad_norm": 2.859375, + "learning_rate": 4.530240777146468e-05, + "loss": 0.4946, + "step": 4572 + }, + { + "epoch": 0.2008408619383734, + "grad_norm": 2.640625, + "learning_rate": 4.529837001353557e-05, + "loss": 0.4981, + "step": 4574 + }, + { + "epoch": 0.20092868041757686, + "grad_norm": 2.296875, + "learning_rate": 4.529433070115241e-05, + "loss": 0.5206, + "step": 4576 + }, + { + "epoch": 0.20101649889678036, + "grad_norm": 2.703125, + "learning_rate": 4.529028983462455e-05, + "loss": 0.5008, + "step": 4578 + }, + { + "epoch": 0.20110431737598383, + "grad_norm": 2.40625, + "learning_rate": 4.528624741426143e-05, + "loss": 0.4744, + "step": 4580 + }, + { + "epoch": 0.20119213585518733, + "grad_norm": 2.515625, + "learning_rate": 4.528220344037261e-05, + "loss": 0.5024, + "step": 4582 + }, + { + "epoch": 0.2012799543343908, + "grad_norm": 3.0625, + "learning_rate": 4.527815791326779e-05, + "loss": 0.5053, + "step": 4584 + }, + { + "epoch": 0.2013677728135943, + "grad_norm": 2.625, + "learning_rate": 4.5274110833256785e-05, + "loss": 0.5126, + "step": 4586 + }, + { + "epoch": 0.20145559129279778, + "grad_norm": 2.5, + "learning_rate": 4.52700622006495e-05, + "loss": 0.4648, + "step": 4588 + }, + { + "epoch": 0.20154340977200128, + "grad_norm": 2.53125, + "learning_rate": 4.5266012015756e-05, + "loss": 0.5069, + "step": 4590 + }, + { + "epoch": 0.20163122825120475, + "grad_norm": 2.53125, + "learning_rate": 4.5261960278886443e-05, + "loss": 0.5303, + "step": 4592 + }, + { + "epoch": 0.20171904673040825, + "grad_norm": 3.015625, + "learning_rate": 4.5257906990351114e-05, + "loss": 0.5173, + "step": 4594 + }, + { + "epoch": 0.20180686520961172, + "grad_norm": 2.890625, + "learning_rate": 4.5253852150460406e-05, + "loss": 0.4768, + "step": 4596 + }, + { + "epoch": 0.20189468368881522, + "grad_norm": 2.234375, + "learning_rate": 4.524979575952484e-05, + "loss": 0.4905, + "step": 4598 + }, + { + "epoch": 0.20198250216801872, + "grad_norm": 2.375, + "learning_rate": 4.524573781785507e-05, + "loss": 0.4856, + "step": 4600 + }, + { + "epoch": 0.2020703206472222, + "grad_norm": 2.6875, + "learning_rate": 4.5241678325761835e-05, + "loss": 0.5001, + "step": 4602 + }, + { + "epoch": 0.2021581391264257, + "grad_norm": 2.5625, + "learning_rate": 4.523761728355603e-05, + "loss": 0.5012, + "step": 4604 + }, + { + "epoch": 0.20224595760562916, + "grad_norm": 2.5, + "learning_rate": 4.523355469154864e-05, + "loss": 0.497, + "step": 4606 + }, + { + "epoch": 0.20233377608483266, + "grad_norm": 2.484375, + "learning_rate": 4.522949055005078e-05, + "loss": 0.5138, + "step": 4608 + }, + { + "epoch": 0.20242159456403613, + "grad_norm": 2.515625, + "learning_rate": 4.522542485937369e-05, + "loss": 0.506, + "step": 4610 + }, + { + "epoch": 0.20250941304323963, + "grad_norm": 2.3125, + "learning_rate": 4.522135761982871e-05, + "loss": 0.5185, + "step": 4612 + }, + { + "epoch": 0.2025972315224431, + "grad_norm": 2.515625, + "learning_rate": 4.521728883172732e-05, + "loss": 0.4962, + "step": 4614 + }, + { + "epoch": 0.2026850500016466, + "grad_norm": 2.390625, + "learning_rate": 4.52132184953811e-05, + "loss": 0.4924, + "step": 4616 + }, + { + "epoch": 0.20277286848085008, + "grad_norm": 2.453125, + "learning_rate": 4.520914661110176e-05, + "loss": 0.4647, + "step": 4618 + }, + { + "epoch": 0.20286068696005358, + "grad_norm": 2.5625, + "learning_rate": 4.5205073179201134e-05, + "loss": 0.4827, + "step": 4620 + }, + { + "epoch": 0.20294850543925705, + "grad_norm": 2.59375, + "learning_rate": 4.5200998199991165e-05, + "loss": 0.4757, + "step": 4622 + }, + { + "epoch": 0.20303632391846055, + "grad_norm": 2.5, + "learning_rate": 4.5196921673783896e-05, + "loss": 0.5093, + "step": 4624 + }, + { + "epoch": 0.20312414239766402, + "grad_norm": 2.3125, + "learning_rate": 4.519284360089153e-05, + "loss": 0.5032, + "step": 4626 + }, + { + "epoch": 0.20321196087686752, + "grad_norm": 2.484375, + "learning_rate": 4.518876398162637e-05, + "loss": 0.5029, + "step": 4628 + }, + { + "epoch": 0.203299779356071, + "grad_norm": 2.734375, + "learning_rate": 4.5184682816300815e-05, + "loss": 0.4898, + "step": 4630 + }, + { + "epoch": 0.2033875978352745, + "grad_norm": 2.5625, + "learning_rate": 4.518060010522742e-05, + "loss": 0.4841, + "step": 4632 + }, + { + "epoch": 0.20347541631447796, + "grad_norm": 2.484375, + "learning_rate": 4.5176515848718825e-05, + "loss": 0.4545, + "step": 4634 + }, + { + "epoch": 0.20356323479368146, + "grad_norm": 2.21875, + "learning_rate": 4.517243004708781e-05, + "loss": 0.4862, + "step": 4636 + }, + { + "epoch": 0.20365105327288494, + "grad_norm": 2.484375, + "learning_rate": 4.5168342700647264e-05, + "loss": 0.4902, + "step": 4638 + }, + { + "epoch": 0.20373887175208844, + "grad_norm": 2.515625, + "learning_rate": 4.51642538097102e-05, + "loss": 0.478, + "step": 4640 + }, + { + "epoch": 0.2038266902312919, + "grad_norm": 2.234375, + "learning_rate": 4.516016337458975e-05, + "loss": 0.5129, + "step": 4642 + }, + { + "epoch": 0.2039145087104954, + "grad_norm": 2.5, + "learning_rate": 4.5156071395599156e-05, + "loss": 0.4711, + "step": 4644 + }, + { + "epoch": 0.20400232718969888, + "grad_norm": 2.453125, + "learning_rate": 4.5151977873051774e-05, + "loss": 0.4948, + "step": 4646 + }, + { + "epoch": 0.20409014566890238, + "grad_norm": 2.390625, + "learning_rate": 4.5147882807261104e-05, + "loss": 0.4951, + "step": 4648 + }, + { + "epoch": 0.20417796414810588, + "grad_norm": 2.8125, + "learning_rate": 4.5143786198540736e-05, + "loss": 0.4779, + "step": 4650 + }, + { + "epoch": 0.20426578262730935, + "grad_norm": 2.703125, + "learning_rate": 4.51396880472044e-05, + "loss": 0.518, + "step": 4652 + }, + { + "epoch": 0.20435360110651285, + "grad_norm": 2.96875, + "learning_rate": 4.5135588353565914e-05, + "loss": 0.491, + "step": 4654 + }, + { + "epoch": 0.20444141958571632, + "grad_norm": 3.453125, + "learning_rate": 4.5131487117939246e-05, + "loss": 0.5192, + "step": 4656 + }, + { + "epoch": 0.20452923806491982, + "grad_norm": 2.75, + "learning_rate": 4.5127384340638474e-05, + "loss": 0.4967, + "step": 4658 + }, + { + "epoch": 0.2046170565441233, + "grad_norm": 2.359375, + "learning_rate": 4.512328002197779e-05, + "loss": 0.5179, + "step": 4660 + }, + { + "epoch": 0.2047048750233268, + "grad_norm": 2.34375, + "learning_rate": 4.5119174162271484e-05, + "loss": 0.4966, + "step": 4662 + }, + { + "epoch": 0.20479269350253027, + "grad_norm": 2.421875, + "learning_rate": 4.5115066761834e-05, + "loss": 0.5001, + "step": 4664 + }, + { + "epoch": 0.20488051198173376, + "grad_norm": 2.640625, + "learning_rate": 4.511095782097989e-05, + "loss": 0.5273, + "step": 4666 + }, + { + "epoch": 0.20496833046093724, + "grad_norm": 2.671875, + "learning_rate": 4.510684734002381e-05, + "loss": 0.4918, + "step": 4668 + }, + { + "epoch": 0.20505614894014074, + "grad_norm": 2.578125, + "learning_rate": 4.510273531928053e-05, + "loss": 0.4928, + "step": 4670 + }, + { + "epoch": 0.2051439674193442, + "grad_norm": 3.265625, + "learning_rate": 4.5098621759064964e-05, + "loss": 0.4911, + "step": 4672 + }, + { + "epoch": 0.2052317858985477, + "grad_norm": 3.109375, + "learning_rate": 4.5094506659692126e-05, + "loss": 0.5041, + "step": 4674 + }, + { + "epoch": 0.20531960437775118, + "grad_norm": 2.71875, + "learning_rate": 4.509039002147716e-05, + "loss": 0.4829, + "step": 4676 + }, + { + "epoch": 0.20540742285695468, + "grad_norm": 2.671875, + "learning_rate": 4.50862718447353e-05, + "loss": 0.4726, + "step": 4678 + }, + { + "epoch": 0.20549524133615815, + "grad_norm": 2.734375, + "learning_rate": 4.508215212978193e-05, + "loss": 0.4512, + "step": 4680 + }, + { + "epoch": 0.20558305981536165, + "grad_norm": 2.6875, + "learning_rate": 4.507803087693253e-05, + "loss": 0.497, + "step": 4682 + }, + { + "epoch": 0.20567087829456512, + "grad_norm": 2.859375, + "learning_rate": 4.507390808650272e-05, + "loss": 0.4954, + "step": 4684 + }, + { + "epoch": 0.20575869677376862, + "grad_norm": 2.65625, + "learning_rate": 4.506978375880822e-05, + "loss": 0.4796, + "step": 4686 + }, + { + "epoch": 0.2058465152529721, + "grad_norm": 2.5, + "learning_rate": 4.5065657894164856e-05, + "loss": 0.478, + "step": 4688 + }, + { + "epoch": 0.2059343337321756, + "grad_norm": 2.28125, + "learning_rate": 4.506153049288861e-05, + "loss": 0.5176, + "step": 4690 + }, + { + "epoch": 0.20602215221137907, + "grad_norm": 2.625, + "learning_rate": 4.505740155529555e-05, + "loss": 0.5044, + "step": 4692 + }, + { + "epoch": 0.20610997069058257, + "grad_norm": 2.765625, + "learning_rate": 4.505327108170187e-05, + "loss": 0.5094, + "step": 4694 + }, + { + "epoch": 0.20619778916978604, + "grad_norm": 2.359375, + "learning_rate": 4.504913907242389e-05, + "loss": 0.4767, + "step": 4696 + }, + { + "epoch": 0.20628560764898954, + "grad_norm": 2.609375, + "learning_rate": 4.504500552777802e-05, + "loss": 0.4734, + "step": 4698 + }, + { + "epoch": 0.20637342612819304, + "grad_norm": 2.234375, + "learning_rate": 4.5040870448080824e-05, + "loss": 0.4629, + "step": 4700 + }, + { + "epoch": 0.2064612446073965, + "grad_norm": 2.78125, + "learning_rate": 4.503673383364897e-05, + "loss": 0.499, + "step": 4702 + }, + { + "epoch": 0.2065490630866, + "grad_norm": 2.71875, + "learning_rate": 4.5032595684799236e-05, + "loss": 0.5027, + "step": 4704 + }, + { + "epoch": 0.20663688156580348, + "grad_norm": 2.359375, + "learning_rate": 4.502845600184852e-05, + "loss": 0.5164, + "step": 4706 + }, + { + "epoch": 0.20672470004500698, + "grad_norm": 2.84375, + "learning_rate": 4.502431478511384e-05, + "loss": 0.4971, + "step": 4708 + }, + { + "epoch": 0.20681251852421045, + "grad_norm": 2.734375, + "learning_rate": 4.5020172034912336e-05, + "loss": 0.448, + "step": 4710 + }, + { + "epoch": 0.20690033700341395, + "grad_norm": 2.734375, + "learning_rate": 4.501602775156126e-05, + "loss": 0.4835, + "step": 4712 + }, + { + "epoch": 0.20698815548261743, + "grad_norm": 2.859375, + "learning_rate": 4.501188193537798e-05, + "loss": 0.4712, + "step": 4714 + }, + { + "epoch": 0.20707597396182092, + "grad_norm": 3.03125, + "learning_rate": 4.500773458667998e-05, + "loss": 0.492, + "step": 4716 + }, + { + "epoch": 0.2071637924410244, + "grad_norm": 2.578125, + "learning_rate": 4.500358570578488e-05, + "loss": 0.4948, + "step": 4718 + }, + { + "epoch": 0.2072516109202279, + "grad_norm": 2.359375, + "learning_rate": 4.4999435293010384e-05, + "loss": 0.458, + "step": 4720 + }, + { + "epoch": 0.20733942939943137, + "grad_norm": 2.65625, + "learning_rate": 4.4995283348674336e-05, + "loss": 0.4802, + "step": 4722 + }, + { + "epoch": 0.20742724787863487, + "grad_norm": 2.5625, + "learning_rate": 4.49911298730947e-05, + "loss": 0.4884, + "step": 4724 + }, + { + "epoch": 0.20751506635783834, + "grad_norm": 2.671875, + "learning_rate": 4.4986974866589546e-05, + "loss": 0.476, + "step": 4726 + }, + { + "epoch": 0.20760288483704184, + "grad_norm": 2.46875, + "learning_rate": 4.498281832947706e-05, + "loss": 0.5139, + "step": 4728 + }, + { + "epoch": 0.2076907033162453, + "grad_norm": 2.703125, + "learning_rate": 4.4978660262075566e-05, + "loss": 0.4709, + "step": 4730 + }, + { + "epoch": 0.2077785217954488, + "grad_norm": 3.0625, + "learning_rate": 4.4974500664703475e-05, + "loss": 0.5018, + "step": 4732 + }, + { + "epoch": 0.20786634027465228, + "grad_norm": 2.78125, + "learning_rate": 4.4970339537679336e-05, + "loss": 0.4881, + "step": 4734 + }, + { + "epoch": 0.20795415875385578, + "grad_norm": 2.640625, + "learning_rate": 4.49661768813218e-05, + "loss": 0.5178, + "step": 4736 + }, + { + "epoch": 0.20804197723305926, + "grad_norm": 2.375, + "learning_rate": 4.496201269594966e-05, + "loss": 0.5246, + "step": 4738 + }, + { + "epoch": 0.20812979571226276, + "grad_norm": 2.390625, + "learning_rate": 4.49578469818818e-05, + "loss": 0.4945, + "step": 4740 + }, + { + "epoch": 0.20821761419146623, + "grad_norm": 2.453125, + "learning_rate": 4.495367973943724e-05, + "loss": 0.5048, + "step": 4742 + }, + { + "epoch": 0.20830543267066973, + "grad_norm": 2.59375, + "learning_rate": 4.494951096893509e-05, + "loss": 0.4675, + "step": 4744 + }, + { + "epoch": 0.2083932511498732, + "grad_norm": 3.234375, + "learning_rate": 4.494534067069463e-05, + "loss": 0.5013, + "step": 4746 + }, + { + "epoch": 0.2084810696290767, + "grad_norm": 3.171875, + "learning_rate": 4.494116884503518e-05, + "loss": 0.4897, + "step": 4748 + }, + { + "epoch": 0.2085688881082802, + "grad_norm": 2.296875, + "learning_rate": 4.493699549227624e-05, + "loss": 0.5059, + "step": 4750 + }, + { + "epoch": 0.20865670658748367, + "grad_norm": 2.515625, + "learning_rate": 4.493282061273741e-05, + "loss": 0.5089, + "step": 4752 + }, + { + "epoch": 0.20874452506668717, + "grad_norm": 2.125, + "learning_rate": 4.4928644206738404e-05, + "loss": 0.4795, + "step": 4754 + }, + { + "epoch": 0.20883234354589064, + "grad_norm": 2.3125, + "learning_rate": 4.492446627459904e-05, + "loss": 0.4664, + "step": 4756 + }, + { + "epoch": 0.20892016202509414, + "grad_norm": 2.515625, + "learning_rate": 4.4920286816639277e-05, + "loss": 0.5075, + "step": 4758 + }, + { + "epoch": 0.2090079805042976, + "grad_norm": 2.109375, + "learning_rate": 4.491610583317918e-05, + "loss": 0.4821, + "step": 4760 + }, + { + "epoch": 0.2090957989835011, + "grad_norm": 2.09375, + "learning_rate": 4.491192332453892e-05, + "loss": 0.5116, + "step": 4762 + }, + { + "epoch": 0.20918361746270459, + "grad_norm": 2.71875, + "learning_rate": 4.490773929103879e-05, + "loss": 0.4873, + "step": 4764 + }, + { + "epoch": 0.20927143594190808, + "grad_norm": 2.734375, + "learning_rate": 4.4903553732999225e-05, + "loss": 0.5058, + "step": 4766 + }, + { + "epoch": 0.20935925442111156, + "grad_norm": 2.484375, + "learning_rate": 4.489936665074074e-05, + "loss": 0.4896, + "step": 4768 + }, + { + "epoch": 0.20944707290031506, + "grad_norm": 2.90625, + "learning_rate": 4.489517804458398e-05, + "loss": 0.4906, + "step": 4770 + }, + { + "epoch": 0.20953489137951853, + "grad_norm": 3.3125, + "learning_rate": 4.4890987914849724e-05, + "loss": 0.4861, + "step": 4772 + }, + { + "epoch": 0.20962270985872203, + "grad_norm": 3.125, + "learning_rate": 4.488679626185884e-05, + "loss": 0.4878, + "step": 4774 + }, + { + "epoch": 0.2097105283379255, + "grad_norm": 3.15625, + "learning_rate": 4.488260308593233e-05, + "loss": 0.4785, + "step": 4776 + }, + { + "epoch": 0.209798346817129, + "grad_norm": 3.84375, + "learning_rate": 4.4878408387391314e-05, + "loss": 0.4722, + "step": 4778 + }, + { + "epoch": 0.20988616529633247, + "grad_norm": 3.046875, + "learning_rate": 4.487421216655702e-05, + "loss": 0.4687, + "step": 4780 + }, + { + "epoch": 0.20997398377553597, + "grad_norm": 2.84375, + "learning_rate": 4.487001442375079e-05, + "loss": 0.4903, + "step": 4782 + }, + { + "epoch": 0.21006180225473944, + "grad_norm": 2.53125, + "learning_rate": 4.4865815159294094e-05, + "loss": 0.4738, + "step": 4784 + }, + { + "epoch": 0.21014962073394294, + "grad_norm": 2.6875, + "learning_rate": 4.486161437350851e-05, + "loss": 0.4617, + "step": 4786 + }, + { + "epoch": 0.21023743921314642, + "grad_norm": 2.515625, + "learning_rate": 4.485741206671574e-05, + "loss": 0.4792, + "step": 4788 + }, + { + "epoch": 0.21032525769234992, + "grad_norm": 2.5, + "learning_rate": 4.4853208239237586e-05, + "loss": 0.4983, + "step": 4790 + }, + { + "epoch": 0.2104130761715534, + "grad_norm": 3.078125, + "learning_rate": 4.4849002891395995e-05, + "loss": 0.5093, + "step": 4792 + }, + { + "epoch": 0.2105008946507569, + "grad_norm": 2.546875, + "learning_rate": 4.4844796023512995e-05, + "loss": 0.4946, + "step": 4794 + }, + { + "epoch": 0.21058871312996036, + "grad_norm": 2.390625, + "learning_rate": 4.484058763591077e-05, + "loss": 0.5118, + "step": 4796 + }, + { + "epoch": 0.21067653160916386, + "grad_norm": 2.515625, + "learning_rate": 4.4836377728911574e-05, + "loss": 0.5033, + "step": 4798 + }, + { + "epoch": 0.21076435008836736, + "grad_norm": 2.65625, + "learning_rate": 4.483216630283783e-05, + "loss": 0.4579, + "step": 4800 + }, + { + "epoch": 0.21085216856757083, + "grad_norm": 2.21875, + "learning_rate": 4.4827953358012024e-05, + "loss": 0.5025, + "step": 4802 + }, + { + "epoch": 0.21093998704677433, + "grad_norm": 2.515625, + "learning_rate": 4.482373889475681e-05, + "loss": 0.474, + "step": 4804 + }, + { + "epoch": 0.2110278055259778, + "grad_norm": 2.453125, + "learning_rate": 4.481952291339491e-05, + "loss": 0.5077, + "step": 4806 + }, + { + "epoch": 0.2111156240051813, + "grad_norm": 2.4375, + "learning_rate": 4.48153054142492e-05, + "loss": 0.4601, + "step": 4808 + }, + { + "epoch": 0.21120344248438477, + "grad_norm": 2.671875, + "learning_rate": 4.481108639764264e-05, + "loss": 0.4722, + "step": 4810 + }, + { + "epoch": 0.21129126096358827, + "grad_norm": 2.421875, + "learning_rate": 4.4806865863898346e-05, + "loss": 0.4735, + "step": 4812 + }, + { + "epoch": 0.21137907944279175, + "grad_norm": 2.484375, + "learning_rate": 4.480264381333951e-05, + "loss": 0.459, + "step": 4814 + }, + { + "epoch": 0.21146689792199524, + "grad_norm": 2.625, + "learning_rate": 4.479842024628946e-05, + "loss": 0.4863, + "step": 4816 + }, + { + "epoch": 0.21155471640119872, + "grad_norm": 2.234375, + "learning_rate": 4.4794195163071656e-05, + "loss": 0.4931, + "step": 4818 + }, + { + "epoch": 0.21164253488040222, + "grad_norm": 2.15625, + "learning_rate": 4.478996856400963e-05, + "loss": 0.4681, + "step": 4820 + }, + { + "epoch": 0.2117303533596057, + "grad_norm": 2.40625, + "learning_rate": 4.478574044942707e-05, + "loss": 0.5064, + "step": 4822 + }, + { + "epoch": 0.2118181718388092, + "grad_norm": 2.40625, + "learning_rate": 4.478151081964777e-05, + "loss": 0.4992, + "step": 4824 + }, + { + "epoch": 0.21190599031801266, + "grad_norm": 2.75, + "learning_rate": 4.477727967499562e-05, + "loss": 0.4752, + "step": 4826 + }, + { + "epoch": 0.21199380879721616, + "grad_norm": 2.4375, + "learning_rate": 4.477304701579466e-05, + "loss": 0.4972, + "step": 4828 + }, + { + "epoch": 0.21208162727641963, + "grad_norm": 2.203125, + "learning_rate": 4.476881284236901e-05, + "loss": 0.4839, + "step": 4830 + }, + { + "epoch": 0.21216944575562313, + "grad_norm": 2.78125, + "learning_rate": 4.4764577155042935e-05, + "loss": 0.4915, + "step": 4832 + }, + { + "epoch": 0.2122572642348266, + "grad_norm": 2.421875, + "learning_rate": 4.4760339954140805e-05, + "loss": 0.516, + "step": 4834 + }, + { + "epoch": 0.2123450827140301, + "grad_norm": 2.265625, + "learning_rate": 4.475610123998711e-05, + "loss": 0.4683, + "step": 4836 + }, + { + "epoch": 0.21243290119323358, + "grad_norm": 2.5625, + "learning_rate": 4.4751861012906445e-05, + "loss": 0.4993, + "step": 4838 + }, + { + "epoch": 0.21252071967243708, + "grad_norm": 2.21875, + "learning_rate": 4.4747619273223525e-05, + "loss": 0.483, + "step": 4840 + }, + { + "epoch": 0.21260853815164055, + "grad_norm": 2.203125, + "learning_rate": 4.474337602126319e-05, + "loss": 0.4766, + "step": 4842 + }, + { + "epoch": 0.21269635663084405, + "grad_norm": 2.53125, + "learning_rate": 4.473913125735038e-05, + "loss": 0.4787, + "step": 4844 + }, + { + "epoch": 0.21278417511004752, + "grad_norm": 2.640625, + "learning_rate": 4.4734884981810174e-05, + "loss": 0.4699, + "step": 4846 + }, + { + "epoch": 0.21287199358925102, + "grad_norm": 2.703125, + "learning_rate": 4.473063719496774e-05, + "loss": 0.4876, + "step": 4848 + }, + { + "epoch": 0.21295981206845452, + "grad_norm": 2.453125, + "learning_rate": 4.472638789714838e-05, + "loss": 0.483, + "step": 4850 + }, + { + "epoch": 0.213047630547658, + "grad_norm": 2.34375, + "learning_rate": 4.472213708867751e-05, + "loss": 0.5009, + "step": 4852 + }, + { + "epoch": 0.2131354490268615, + "grad_norm": 2.828125, + "learning_rate": 4.471788476988066e-05, + "loss": 0.4802, + "step": 4854 + }, + { + "epoch": 0.21322326750606496, + "grad_norm": 2.453125, + "learning_rate": 4.4713630941083454e-05, + "loss": 0.4822, + "step": 4856 + }, + { + "epoch": 0.21331108598526846, + "grad_norm": 2.25, + "learning_rate": 4.470937560261167e-05, + "loss": 0.4849, + "step": 4858 + }, + { + "epoch": 0.21339890446447193, + "grad_norm": 2.46875, + "learning_rate": 4.4705118754791184e-05, + "loss": 0.4776, + "step": 4860 + }, + { + "epoch": 0.21348672294367543, + "grad_norm": 2.265625, + "learning_rate": 4.470086039794797e-05, + "loss": 0.4665, + "step": 4862 + }, + { + "epoch": 0.2135745414228789, + "grad_norm": 2.953125, + "learning_rate": 4.469660053240815e-05, + "loss": 0.4877, + "step": 4864 + }, + { + "epoch": 0.2136623599020824, + "grad_norm": 2.8125, + "learning_rate": 4.469233915849794e-05, + "loss": 0.5137, + "step": 4866 + }, + { + "epoch": 0.21375017838128588, + "grad_norm": 2.46875, + "learning_rate": 4.468807627654368e-05, + "loss": 0.498, + "step": 4868 + }, + { + "epoch": 0.21383799686048938, + "grad_norm": 2.984375, + "learning_rate": 4.4683811886871804e-05, + "loss": 0.495, + "step": 4870 + }, + { + "epoch": 0.21392581533969285, + "grad_norm": 2.453125, + "learning_rate": 4.467954598980891e-05, + "loss": 0.4879, + "step": 4872 + }, + { + "epoch": 0.21401363381889635, + "grad_norm": 2.578125, + "learning_rate": 4.4675278585681665e-05, + "loss": 0.4437, + "step": 4874 + }, + { + "epoch": 0.21410145229809982, + "grad_norm": 2.453125, + "learning_rate": 4.467100967481687e-05, + "loss": 0.4846, + "step": 4876 + }, + { + "epoch": 0.21418927077730332, + "grad_norm": 2.484375, + "learning_rate": 4.466673925754143e-05, + "loss": 0.4913, + "step": 4878 + }, + { + "epoch": 0.2142770892565068, + "grad_norm": 2.109375, + "learning_rate": 4.4662467334182387e-05, + "loss": 0.4757, + "step": 4880 + }, + { + "epoch": 0.2143649077357103, + "grad_norm": 2.40625, + "learning_rate": 4.465819390506689e-05, + "loss": 0.4906, + "step": 4882 + }, + { + "epoch": 0.21445272621491376, + "grad_norm": 2.3125, + "learning_rate": 4.465391897052218e-05, + "loss": 0.4691, + "step": 4884 + }, + { + "epoch": 0.21454054469411726, + "grad_norm": 2.328125, + "learning_rate": 4.4649642530875645e-05, + "loss": 0.497, + "step": 4886 + }, + { + "epoch": 0.21462836317332074, + "grad_norm": 2.515625, + "learning_rate": 4.464536458645479e-05, + "loss": 0.4595, + "step": 4888 + }, + { + "epoch": 0.21471618165252424, + "grad_norm": 2.1875, + "learning_rate": 4.464108513758719e-05, + "loss": 0.4769, + "step": 4890 + }, + { + "epoch": 0.2148040001317277, + "grad_norm": 2.046875, + "learning_rate": 4.4636804184600575e-05, + "loss": 0.4988, + "step": 4892 + }, + { + "epoch": 0.2148918186109312, + "grad_norm": 2.171875, + "learning_rate": 4.4632521727822805e-05, + "loss": 0.4872, + "step": 4894 + }, + { + "epoch": 0.21497963709013468, + "grad_norm": 2.71875, + "learning_rate": 4.4628237767581814e-05, + "loss": 0.4902, + "step": 4896 + }, + { + "epoch": 0.21506745556933818, + "grad_norm": 2.46875, + "learning_rate": 4.462395230420566e-05, + "loss": 0.5361, + "step": 4898 + }, + { + "epoch": 0.21515527404854168, + "grad_norm": 2.609375, + "learning_rate": 4.4619665338022545e-05, + "loss": 0.5288, + "step": 4900 + }, + { + "epoch": 0.21524309252774515, + "grad_norm": 2.921875, + "learning_rate": 4.461537686936075e-05, + "loss": 0.481, + "step": 4902 + }, + { + "epoch": 0.21533091100694865, + "grad_norm": 2.9375, + "learning_rate": 4.461108689854869e-05, + "loss": 0.4923, + "step": 4904 + }, + { + "epoch": 0.21541872948615212, + "grad_norm": 2.4375, + "learning_rate": 4.460679542591489e-05, + "loss": 0.4724, + "step": 4906 + }, + { + "epoch": 0.21550654796535562, + "grad_norm": 2.421875, + "learning_rate": 4.4602502451788005e-05, + "loss": 0.4788, + "step": 4908 + }, + { + "epoch": 0.2155943664445591, + "grad_norm": 2.4375, + "learning_rate": 4.459820797649678e-05, + "loss": 0.4707, + "step": 4910 + }, + { + "epoch": 0.2156821849237626, + "grad_norm": 2.28125, + "learning_rate": 4.4593912000370085e-05, + "loss": 0.4789, + "step": 4912 + }, + { + "epoch": 0.21577000340296607, + "grad_norm": 2.59375, + "learning_rate": 4.458961452373692e-05, + "loss": 0.4999, + "step": 4914 + }, + { + "epoch": 0.21585782188216956, + "grad_norm": 2.484375, + "learning_rate": 4.458531554692638e-05, + "loss": 0.4717, + "step": 4916 + }, + { + "epoch": 0.21594564036137304, + "grad_norm": 2.4375, + "learning_rate": 4.458101507026767e-05, + "loss": 0.4894, + "step": 4918 + }, + { + "epoch": 0.21603345884057654, + "grad_norm": 2.09375, + "learning_rate": 4.4576713094090146e-05, + "loss": 0.4699, + "step": 4920 + }, + { + "epoch": 0.21612127731978, + "grad_norm": 2.28125, + "learning_rate": 4.457240961872323e-05, + "loss": 0.4841, + "step": 4922 + }, + { + "epoch": 0.2162090957989835, + "grad_norm": 2.296875, + "learning_rate": 4.45681046444965e-05, + "loss": 0.4954, + "step": 4924 + }, + { + "epoch": 0.21629691427818698, + "grad_norm": 2.984375, + "learning_rate": 4.4563798171739626e-05, + "loss": 0.4853, + "step": 4926 + }, + { + "epoch": 0.21638473275739048, + "grad_norm": 2.3125, + "learning_rate": 4.455949020078239e-05, + "loss": 0.5074, + "step": 4928 + }, + { + "epoch": 0.21647255123659395, + "grad_norm": 2.328125, + "learning_rate": 4.455518073195471e-05, + "loss": 0.5, + "step": 4930 + }, + { + "epoch": 0.21656036971579745, + "grad_norm": 2.546875, + "learning_rate": 4.45508697655866e-05, + "loss": 0.473, + "step": 4932 + }, + { + "epoch": 0.21664818819500092, + "grad_norm": 2.421875, + "learning_rate": 4.4546557302008195e-05, + "loss": 0.4746, + "step": 4934 + }, + { + "epoch": 0.21673600667420442, + "grad_norm": 2.234375, + "learning_rate": 4.454224334154975e-05, + "loss": 0.4836, + "step": 4936 + }, + { + "epoch": 0.2168238251534079, + "grad_norm": 2.34375, + "learning_rate": 4.453792788454163e-05, + "loss": 0.4828, + "step": 4938 + }, + { + "epoch": 0.2169116436326114, + "grad_norm": 2.3125, + "learning_rate": 4.45336109313143e-05, + "loss": 0.5062, + "step": 4940 + }, + { + "epoch": 0.21699946211181487, + "grad_norm": 2.546875, + "learning_rate": 4.452929248219837e-05, + "loss": 0.4902, + "step": 4942 + }, + { + "epoch": 0.21708728059101837, + "grad_norm": 2.515625, + "learning_rate": 4.4524972537524535e-05, + "loss": 0.4679, + "step": 4944 + }, + { + "epoch": 0.21717509907022184, + "grad_norm": 2.40625, + "learning_rate": 4.4520651097623625e-05, + "loss": 0.4979, + "step": 4946 + }, + { + "epoch": 0.21726291754942534, + "grad_norm": 2.21875, + "learning_rate": 4.451632816282657e-05, + "loss": 0.4708, + "step": 4948 + }, + { + "epoch": 0.2173507360286288, + "grad_norm": 2.40625, + "learning_rate": 4.4512003733464435e-05, + "loss": 0.4937, + "step": 4950 + }, + { + "epoch": 0.2174385545078323, + "grad_norm": 2.359375, + "learning_rate": 4.450767780986837e-05, + "loss": 0.47, + "step": 4952 + }, + { + "epoch": 0.2175263729870358, + "grad_norm": 2.46875, + "learning_rate": 4.4503350392369664e-05, + "loss": 0.4776, + "step": 4954 + }, + { + "epoch": 0.21761419146623928, + "grad_norm": 2.40625, + "learning_rate": 4.4499021481299705e-05, + "loss": 0.4668, + "step": 4956 + }, + { + "epoch": 0.21770200994544278, + "grad_norm": 2.15625, + "learning_rate": 4.449469107699001e-05, + "loss": 0.4934, + "step": 4958 + }, + { + "epoch": 0.21778982842464625, + "grad_norm": 2.421875, + "learning_rate": 4.4490359179772204e-05, + "loss": 0.4434, + "step": 4960 + }, + { + "epoch": 0.21787764690384975, + "grad_norm": 2.484375, + "learning_rate": 4.4486025789978016e-05, + "loss": 0.4831, + "step": 4962 + }, + { + "epoch": 0.21796546538305323, + "grad_norm": 2.578125, + "learning_rate": 4.44816909079393e-05, + "loss": 0.4856, + "step": 4964 + }, + { + "epoch": 0.21805328386225672, + "grad_norm": 2.8125, + "learning_rate": 4.4477354533988025e-05, + "loss": 0.4619, + "step": 4966 + }, + { + "epoch": 0.2181411023414602, + "grad_norm": 2.53125, + "learning_rate": 4.447301666845628e-05, + "loss": 0.5101, + "step": 4968 + }, + { + "epoch": 0.2182289208206637, + "grad_norm": 2.953125, + "learning_rate": 4.4468677311676236e-05, + "loss": 0.4775, + "step": 4970 + }, + { + "epoch": 0.21831673929986717, + "grad_norm": 2.640625, + "learning_rate": 4.4464336463980226e-05, + "loss": 0.4981, + "step": 4972 + }, + { + "epoch": 0.21840455777907067, + "grad_norm": 2.140625, + "learning_rate": 4.445999412570065e-05, + "loss": 0.496, + "step": 4974 + }, + { + "epoch": 0.21849237625827414, + "grad_norm": 2.125, + "learning_rate": 4.445565029717008e-05, + "loss": 0.499, + "step": 4976 + }, + { + "epoch": 0.21858019473747764, + "grad_norm": 2.1875, + "learning_rate": 4.445130497872113e-05, + "loss": 0.4771, + "step": 4978 + }, + { + "epoch": 0.2186680132166811, + "grad_norm": 2.375, + "learning_rate": 4.4446958170686593e-05, + "loss": 0.5265, + "step": 4980 + }, + { + "epoch": 0.2187558316958846, + "grad_norm": 2.40625, + "learning_rate": 4.444260987339933e-05, + "loss": 0.4932, + "step": 4982 + }, + { + "epoch": 0.21884365017508808, + "grad_norm": 2.5, + "learning_rate": 4.443826008719235e-05, + "loss": 0.4904, + "step": 4984 + }, + { + "epoch": 0.21893146865429158, + "grad_norm": 2.34375, + "learning_rate": 4.4433908812398736e-05, + "loss": 0.4713, + "step": 4986 + }, + { + "epoch": 0.21901928713349506, + "grad_norm": 2.484375, + "learning_rate": 4.442955604935174e-05, + "loss": 0.5122, + "step": 4988 + }, + { + "epoch": 0.21910710561269856, + "grad_norm": 2.34375, + "learning_rate": 4.4425201798384686e-05, + "loss": 0.4549, + "step": 4990 + }, + { + "epoch": 0.21919492409190203, + "grad_norm": 2.25, + "learning_rate": 4.442084605983102e-05, + "loss": 0.4765, + "step": 4992 + }, + { + "epoch": 0.21928274257110553, + "grad_norm": 2.78125, + "learning_rate": 4.441648883402431e-05, + "loss": 0.5142, + "step": 4994 + }, + { + "epoch": 0.219370561050309, + "grad_norm": 2.765625, + "learning_rate": 4.441213012129822e-05, + "loss": 0.4851, + "step": 4996 + }, + { + "epoch": 0.2194583795295125, + "grad_norm": 2.265625, + "learning_rate": 4.4407769921986554e-05, + "loss": 0.4862, + "step": 4998 + }, + { + "epoch": 0.21954619800871597, + "grad_norm": 2.34375, + "learning_rate": 4.4403408236423224e-05, + "loss": 0.4953, + "step": 5000 + }, + { + "epoch": 0.21963401648791947, + "grad_norm": 2.453125, + "learning_rate": 4.4399045064942236e-05, + "loss": 0.478, + "step": 5002 + }, + { + "epoch": 0.21972183496712297, + "grad_norm": 2.171875, + "learning_rate": 4.439468040787772e-05, + "loss": 0.5044, + "step": 5004 + }, + { + "epoch": 0.21980965344632644, + "grad_norm": 2.375, + "learning_rate": 4.439031426556394e-05, + "loss": 0.5156, + "step": 5006 + }, + { + "epoch": 0.21989747192552994, + "grad_norm": 2.296875, + "learning_rate": 4.438594663833523e-05, + "loss": 0.4612, + "step": 5008 + }, + { + "epoch": 0.2199852904047334, + "grad_norm": 2.203125, + "learning_rate": 4.4381577526526094e-05, + "loss": 0.4944, + "step": 5010 + }, + { + "epoch": 0.2200731088839369, + "grad_norm": 2.34375, + "learning_rate": 4.43772069304711e-05, + "loss": 0.511, + "step": 5012 + }, + { + "epoch": 0.22016092736314039, + "grad_norm": 2.375, + "learning_rate": 4.437283485050495e-05, + "loss": 0.4886, + "step": 5014 + }, + { + "epoch": 0.22024874584234388, + "grad_norm": 2.5625, + "learning_rate": 4.436846128696247e-05, + "loss": 0.5139, + "step": 5016 + }, + { + "epoch": 0.22033656432154736, + "grad_norm": 2.28125, + "learning_rate": 4.4364086240178584e-05, + "loss": 0.4997, + "step": 5018 + }, + { + "epoch": 0.22042438280075086, + "grad_norm": 2.40625, + "learning_rate": 4.435970971048832e-05, + "loss": 0.4923, + "step": 5020 + }, + { + "epoch": 0.22051220127995433, + "grad_norm": 2.703125, + "learning_rate": 4.435533169822685e-05, + "loss": 0.5019, + "step": 5022 + }, + { + "epoch": 0.22060001975915783, + "grad_norm": 2.25, + "learning_rate": 4.435095220372945e-05, + "loss": 0.4697, + "step": 5024 + }, + { + "epoch": 0.2206878382383613, + "grad_norm": 2.109375, + "learning_rate": 4.434657122733148e-05, + "loss": 0.4798, + "step": 5026 + }, + { + "epoch": 0.2207756567175648, + "grad_norm": 2.4375, + "learning_rate": 4.4342188769368446e-05, + "loss": 0.4827, + "step": 5028 + }, + { + "epoch": 0.22086347519676827, + "grad_norm": 2.3125, + "learning_rate": 4.433780483017597e-05, + "loss": 0.4766, + "step": 5030 + }, + { + "epoch": 0.22095129367597177, + "grad_norm": 2.25, + "learning_rate": 4.433341941008975e-05, + "loss": 0.502, + "step": 5032 + }, + { + "epoch": 0.22103911215517524, + "grad_norm": 2.40625, + "learning_rate": 4.432903250944565e-05, + "loss": 0.4846, + "step": 5034 + }, + { + "epoch": 0.22112693063437874, + "grad_norm": 2.65625, + "learning_rate": 4.43246441285796e-05, + "loss": 0.4871, + "step": 5036 + }, + { + "epoch": 0.22121474911358222, + "grad_norm": 2.0625, + "learning_rate": 4.4320254267827675e-05, + "loss": 0.4614, + "step": 5038 + }, + { + "epoch": 0.22130256759278571, + "grad_norm": 2.296875, + "learning_rate": 4.4315862927526044e-05, + "loss": 0.525, + "step": 5040 + }, + { + "epoch": 0.2213903860719892, + "grad_norm": 2.296875, + "learning_rate": 4.4311470108011e-05, + "loss": 0.4742, + "step": 5042 + }, + { + "epoch": 0.2214782045511927, + "grad_norm": 2.078125, + "learning_rate": 4.4307075809618946e-05, + "loss": 0.4935, + "step": 5044 + }, + { + "epoch": 0.22156602303039616, + "grad_norm": 2.484375, + "learning_rate": 4.4302680032686395e-05, + "loss": 0.4644, + "step": 5046 + }, + { + "epoch": 0.22165384150959966, + "grad_norm": 2.234375, + "learning_rate": 4.429828277754998e-05, + "loss": 0.4973, + "step": 5048 + }, + { + "epoch": 0.22174165998880313, + "grad_norm": 2.140625, + "learning_rate": 4.4293884044546455e-05, + "loss": 0.4719, + "step": 5050 + }, + { + "epoch": 0.22182947846800663, + "grad_norm": 2.484375, + "learning_rate": 4.428948383401265e-05, + "loss": 0.4938, + "step": 5052 + }, + { + "epoch": 0.22191729694721013, + "grad_norm": 2.375, + "learning_rate": 4.4285082146285556e-05, + "loss": 0.4923, + "step": 5054 + }, + { + "epoch": 0.2220051154264136, + "grad_norm": 2.28125, + "learning_rate": 4.428067898170225e-05, + "loss": 0.4724, + "step": 5056 + }, + { + "epoch": 0.2220929339056171, + "grad_norm": 2.171875, + "learning_rate": 4.427627434059992e-05, + "loss": 0.5209, + "step": 5058 + }, + { + "epoch": 0.22218075238482057, + "grad_norm": 2.421875, + "learning_rate": 4.4271868223315884e-05, + "loss": 0.4846, + "step": 5060 + }, + { + "epoch": 0.22226857086402407, + "grad_norm": 2.125, + "learning_rate": 4.4267460630187566e-05, + "loss": 0.4762, + "step": 5062 + }, + { + "epoch": 0.22235638934322755, + "grad_norm": 2.328125, + "learning_rate": 4.4263051561552485e-05, + "loss": 0.4835, + "step": 5064 + }, + { + "epoch": 0.22244420782243104, + "grad_norm": 2.609375, + "learning_rate": 4.42586410177483e-05, + "loss": 0.4987, + "step": 5066 + }, + { + "epoch": 0.22253202630163452, + "grad_norm": 2.3125, + "learning_rate": 4.425422899911277e-05, + "loss": 0.4749, + "step": 5068 + }, + { + "epoch": 0.22261984478083802, + "grad_norm": 2.421875, + "learning_rate": 4.424981550598376e-05, + "loss": 0.5063, + "step": 5070 + }, + { + "epoch": 0.2227076632600415, + "grad_norm": 2.484375, + "learning_rate": 4.4245400538699275e-05, + "loss": 0.4963, + "step": 5072 + }, + { + "epoch": 0.222795481739245, + "grad_norm": 2.359375, + "learning_rate": 4.42409840975974e-05, + "loss": 0.5134, + "step": 5074 + }, + { + "epoch": 0.22288330021844846, + "grad_norm": 2.5625, + "learning_rate": 4.4236566183016345e-05, + "loss": 0.4914, + "step": 5076 + }, + { + "epoch": 0.22297111869765196, + "grad_norm": 2.328125, + "learning_rate": 4.423214679529445e-05, + "loss": 0.5302, + "step": 5078 + }, + { + "epoch": 0.22305893717685543, + "grad_norm": 2.671875, + "learning_rate": 4.422772593477014e-05, + "loss": 0.4669, + "step": 5080 + }, + { + "epoch": 0.22314675565605893, + "grad_norm": 2.625, + "learning_rate": 4.4223303601781966e-05, + "loss": 0.4804, + "step": 5082 + }, + { + "epoch": 0.2232345741352624, + "grad_norm": 2.3125, + "learning_rate": 4.421887979666859e-05, + "loss": 0.4832, + "step": 5084 + }, + { + "epoch": 0.2233223926144659, + "grad_norm": 2.53125, + "learning_rate": 4.42144545197688e-05, + "loss": 0.4871, + "step": 5086 + }, + { + "epoch": 0.22341021109366938, + "grad_norm": 2.3125, + "learning_rate": 4.421002777142148e-05, + "loss": 0.4789, + "step": 5088 + }, + { + "epoch": 0.22349802957287287, + "grad_norm": 2.3125, + "learning_rate": 4.420559955196562e-05, + "loss": 0.4875, + "step": 5090 + }, + { + "epoch": 0.22358584805207635, + "grad_norm": 2.4375, + "learning_rate": 4.420116986174034e-05, + "loss": 0.48, + "step": 5092 + }, + { + "epoch": 0.22367366653127985, + "grad_norm": 2.59375, + "learning_rate": 4.419673870108488e-05, + "loss": 0.4575, + "step": 5094 + }, + { + "epoch": 0.22376148501048332, + "grad_norm": 3.109375, + "learning_rate": 4.419230607033856e-05, + "loss": 0.4913, + "step": 5096 + }, + { + "epoch": 0.22384930348968682, + "grad_norm": 2.328125, + "learning_rate": 4.4187871969840844e-05, + "loss": 0.4545, + "step": 5098 + }, + { + "epoch": 0.2239371219688903, + "grad_norm": 2.5625, + "learning_rate": 4.418343639993129e-05, + "loss": 0.4741, + "step": 5100 + }, + { + "epoch": 0.2240249404480938, + "grad_norm": 2.296875, + "learning_rate": 4.417899936094958e-05, + "loss": 0.4738, + "step": 5102 + }, + { + "epoch": 0.2241127589272973, + "grad_norm": 2.640625, + "learning_rate": 4.4174560853235505e-05, + "loss": 0.5039, + "step": 5104 + }, + { + "epoch": 0.22420057740650076, + "grad_norm": 2.359375, + "learning_rate": 4.4170120877128964e-05, + "loss": 0.4679, + "step": 5106 + }, + { + "epoch": 0.22428839588570426, + "grad_norm": 2.25, + "learning_rate": 4.4165679432969956e-05, + "loss": 0.4549, + "step": 5108 + }, + { + "epoch": 0.22437621436490773, + "grad_norm": 2.578125, + "learning_rate": 4.416123652109864e-05, + "loss": 0.5007, + "step": 5110 + }, + { + "epoch": 0.22446403284411123, + "grad_norm": 2.59375, + "learning_rate": 4.415679214185523e-05, + "loss": 0.4696, + "step": 5112 + }, + { + "epoch": 0.2245518513233147, + "grad_norm": 2.234375, + "learning_rate": 4.415234629558008e-05, + "loss": 0.4881, + "step": 5114 + }, + { + "epoch": 0.2246396698025182, + "grad_norm": 2.359375, + "learning_rate": 4.4147898982613675e-05, + "loss": 0.4651, + "step": 5116 + }, + { + "epoch": 0.22472748828172168, + "grad_norm": 2.5, + "learning_rate": 4.4143450203296566e-05, + "loss": 0.4887, + "step": 5118 + }, + { + "epoch": 0.22481530676092518, + "grad_norm": 2.78125, + "learning_rate": 4.413899995796945e-05, + "loss": 0.4802, + "step": 5120 + }, + { + "epoch": 0.22490312524012865, + "grad_norm": 2.3125, + "learning_rate": 4.413454824697313e-05, + "loss": 0.4907, + "step": 5122 + }, + { + "epoch": 0.22499094371933215, + "grad_norm": 2.71875, + "learning_rate": 4.4130095070648524e-05, + "loss": 0.4741, + "step": 5124 + }, + { + "epoch": 0.22507876219853562, + "grad_norm": 2.6875, + "learning_rate": 4.4125640429336646e-05, + "loss": 0.4801, + "step": 5126 + }, + { + "epoch": 0.22516658067773912, + "grad_norm": 2.3125, + "learning_rate": 4.4121184323378636e-05, + "loss": 0.4828, + "step": 5128 + }, + { + "epoch": 0.2252543991569426, + "grad_norm": 2.265625, + "learning_rate": 4.411672675311576e-05, + "loss": 0.468, + "step": 5130 + }, + { + "epoch": 0.2253422176361461, + "grad_norm": 2.453125, + "learning_rate": 4.4112267718889355e-05, + "loss": 0.4707, + "step": 5132 + }, + { + "epoch": 0.22543003611534956, + "grad_norm": 3.015625, + "learning_rate": 4.410780722104091e-05, + "loss": 0.5055, + "step": 5134 + }, + { + "epoch": 0.22551785459455306, + "grad_norm": 2.546875, + "learning_rate": 4.410334525991201e-05, + "loss": 0.4714, + "step": 5136 + }, + { + "epoch": 0.22560567307375654, + "grad_norm": 2.953125, + "learning_rate": 4.409888183584435e-05, + "loss": 0.4737, + "step": 5138 + }, + { + "epoch": 0.22569349155296003, + "grad_norm": 3.421875, + "learning_rate": 4.409441694917973e-05, + "loss": 0.4804, + "step": 5140 + }, + { + "epoch": 0.2257813100321635, + "grad_norm": 2.5, + "learning_rate": 4.40899506002601e-05, + "loss": 0.4848, + "step": 5142 + }, + { + "epoch": 0.225869128511367, + "grad_norm": 2.09375, + "learning_rate": 4.408548278942747e-05, + "loss": 0.4637, + "step": 5144 + }, + { + "epoch": 0.22595694699057048, + "grad_norm": 2.03125, + "learning_rate": 4.408101351702398e-05, + "loss": 0.4872, + "step": 5146 + }, + { + "epoch": 0.22604476546977398, + "grad_norm": 2.296875, + "learning_rate": 4.4076542783391925e-05, + "loss": 0.4784, + "step": 5148 + }, + { + "epoch": 0.22613258394897745, + "grad_norm": 2.328125, + "learning_rate": 4.4072070588873635e-05, + "loss": 0.5011, + "step": 5150 + }, + { + "epoch": 0.22622040242818095, + "grad_norm": 2.21875, + "learning_rate": 4.406759693381161e-05, + "loss": 0.4579, + "step": 5152 + }, + { + "epoch": 0.22630822090738445, + "grad_norm": 2.96875, + "learning_rate": 4.4063121818548435e-05, + "loss": 0.4964, + "step": 5154 + }, + { + "epoch": 0.22639603938658792, + "grad_norm": 2.71875, + "learning_rate": 4.4058645243426835e-05, + "loss": 0.4828, + "step": 5156 + }, + { + "epoch": 0.22648385786579142, + "grad_norm": 2.5625, + "learning_rate": 4.40541672087896e-05, + "loss": 0.4821, + "step": 5158 + }, + { + "epoch": 0.2265716763449949, + "grad_norm": 2.8125, + "learning_rate": 4.404968771497968e-05, + "loss": 0.4809, + "step": 5160 + }, + { + "epoch": 0.2266594948241984, + "grad_norm": 2.515625, + "learning_rate": 4.404520676234011e-05, + "loss": 0.4946, + "step": 5162 + }, + { + "epoch": 0.22674731330340187, + "grad_norm": 1.9921875, + "learning_rate": 4.404072435121404e-05, + "loss": 0.46, + "step": 5164 + }, + { + "epoch": 0.22683513178260536, + "grad_norm": 2.25, + "learning_rate": 4.403624048194474e-05, + "loss": 0.4687, + "step": 5166 + }, + { + "epoch": 0.22692295026180884, + "grad_norm": 2.390625, + "learning_rate": 4.403175515487557e-05, + "loss": 0.494, + "step": 5168 + }, + { + "epoch": 0.22701076874101234, + "grad_norm": 2.59375, + "learning_rate": 4.402726837035002e-05, + "loss": 0.4823, + "step": 5170 + }, + { + "epoch": 0.2270985872202158, + "grad_norm": 2.65625, + "learning_rate": 4.402278012871172e-05, + "loss": 0.4808, + "step": 5172 + }, + { + "epoch": 0.2271864056994193, + "grad_norm": 2.546875, + "learning_rate": 4.401829043030434e-05, + "loss": 0.4776, + "step": 5174 + }, + { + "epoch": 0.22727422417862278, + "grad_norm": 2.765625, + "learning_rate": 4.401379927547172e-05, + "loss": 0.4646, + "step": 5176 + }, + { + "epoch": 0.22736204265782628, + "grad_norm": 2.3125, + "learning_rate": 4.40093066645578e-05, + "loss": 0.4778, + "step": 5178 + }, + { + "epoch": 0.22744986113702975, + "grad_norm": 2.265625, + "learning_rate": 4.400481259790662e-05, + "loss": 0.4713, + "step": 5180 + }, + { + "epoch": 0.22753767961623325, + "grad_norm": 2.28125, + "learning_rate": 4.400031707586234e-05, + "loss": 0.4591, + "step": 5182 + }, + { + "epoch": 0.22762549809543672, + "grad_norm": 2.75, + "learning_rate": 4.3995820098769217e-05, + "loss": 0.4826, + "step": 5184 + }, + { + "epoch": 0.22771331657464022, + "grad_norm": 2.53125, + "learning_rate": 4.3991321666971636e-05, + "loss": 0.4527, + "step": 5186 + }, + { + "epoch": 0.2278011350538437, + "grad_norm": 2.359375, + "learning_rate": 4.3986821780814095e-05, + "loss": 0.4742, + "step": 5188 + }, + { + "epoch": 0.2278889535330472, + "grad_norm": 2.390625, + "learning_rate": 4.398232044064118e-05, + "loss": 0.464, + "step": 5190 + }, + { + "epoch": 0.22797677201225067, + "grad_norm": 2.359375, + "learning_rate": 4.397781764679762e-05, + "loss": 0.4821, + "step": 5192 + }, + { + "epoch": 0.22806459049145417, + "grad_norm": 2.125, + "learning_rate": 4.397331339962824e-05, + "loss": 0.5096, + "step": 5194 + }, + { + "epoch": 0.22815240897065764, + "grad_norm": 2.34375, + "learning_rate": 4.396880769947796e-05, + "loss": 0.5087, + "step": 5196 + }, + { + "epoch": 0.22824022744986114, + "grad_norm": 2.125, + "learning_rate": 4.396430054669186e-05, + "loss": 0.4886, + "step": 5198 + }, + { + "epoch": 0.2283280459290646, + "grad_norm": 2.5, + "learning_rate": 4.395979194161506e-05, + "loss": 0.4747, + "step": 5200 + }, + { + "epoch": 0.2284158644082681, + "grad_norm": 2.71875, + "learning_rate": 4.395528188459286e-05, + "loss": 0.4934, + "step": 5202 + }, + { + "epoch": 0.2285036828874716, + "grad_norm": 2.234375, + "learning_rate": 4.395077037597062e-05, + "loss": 0.4631, + "step": 5204 + }, + { + "epoch": 0.22859150136667508, + "grad_norm": 2.453125, + "learning_rate": 4.394625741609384e-05, + "loss": 0.4963, + "step": 5206 + }, + { + "epoch": 0.22867931984587858, + "grad_norm": 2.15625, + "learning_rate": 4.3941743005308136e-05, + "loss": 0.4941, + "step": 5208 + }, + { + "epoch": 0.22876713832508205, + "grad_norm": 2.40625, + "learning_rate": 4.39372271439592e-05, + "loss": 0.4873, + "step": 5210 + }, + { + "epoch": 0.22885495680428555, + "grad_norm": 2.765625, + "learning_rate": 4.393270983239288e-05, + "loss": 0.4807, + "step": 5212 + }, + { + "epoch": 0.22894277528348903, + "grad_norm": 2.453125, + "learning_rate": 4.39281910709551e-05, + "loss": 0.5022, + "step": 5214 + }, + { + "epoch": 0.22903059376269252, + "grad_norm": 2.421875, + "learning_rate": 4.3923670859991906e-05, + "loss": 0.4848, + "step": 5216 + }, + { + "epoch": 0.229118412241896, + "grad_norm": 2.28125, + "learning_rate": 4.391914919984947e-05, + "loss": 0.4556, + "step": 5218 + }, + { + "epoch": 0.2292062307210995, + "grad_norm": 2.703125, + "learning_rate": 4.3914626090874044e-05, + "loss": 0.4881, + "step": 5220 + }, + { + "epoch": 0.22929404920030297, + "grad_norm": 2.46875, + "learning_rate": 4.391010153341203e-05, + "loss": 0.478, + "step": 5222 + }, + { + "epoch": 0.22938186767950647, + "grad_norm": 2.328125, + "learning_rate": 4.390557552780989e-05, + "loss": 0.485, + "step": 5224 + }, + { + "epoch": 0.22946968615870994, + "grad_norm": 2.3125, + "learning_rate": 4.390104807441425e-05, + "loss": 0.4779, + "step": 5226 + }, + { + "epoch": 0.22955750463791344, + "grad_norm": 2.0, + "learning_rate": 4.3896519173571824e-05, + "loss": 0.4636, + "step": 5228 + }, + { + "epoch": 0.2296453231171169, + "grad_norm": 2.265625, + "learning_rate": 4.389198882562943e-05, + "loss": 0.4515, + "step": 5230 + }, + { + "epoch": 0.2297331415963204, + "grad_norm": 2.546875, + "learning_rate": 4.3887457030934e-05, + "loss": 0.4524, + "step": 5232 + }, + { + "epoch": 0.22982096007552388, + "grad_norm": 2.578125, + "learning_rate": 4.388292378983258e-05, + "loss": 0.4781, + "step": 5234 + }, + { + "epoch": 0.22990877855472738, + "grad_norm": 2.53125, + "learning_rate": 4.387838910267233e-05, + "loss": 0.48, + "step": 5236 + }, + { + "epoch": 0.22999659703393086, + "grad_norm": 2.375, + "learning_rate": 4.387385296980052e-05, + "loss": 0.4608, + "step": 5238 + }, + { + "epoch": 0.23008441551313435, + "grad_norm": 2.515625, + "learning_rate": 4.3869315391564525e-05, + "loss": 0.4516, + "step": 5240 + }, + { + "epoch": 0.23017223399233783, + "grad_norm": 2.53125, + "learning_rate": 4.3864776368311835e-05, + "loss": 0.4758, + "step": 5242 + }, + { + "epoch": 0.23026005247154133, + "grad_norm": 2.28125, + "learning_rate": 4.3860235900390046e-05, + "loss": 0.474, + "step": 5244 + }, + { + "epoch": 0.2303478709507448, + "grad_norm": 2.8125, + "learning_rate": 4.3855693988146876e-05, + "loss": 0.4854, + "step": 5246 + }, + { + "epoch": 0.2304356894299483, + "grad_norm": 2.390625, + "learning_rate": 4.3851150631930124e-05, + "loss": 0.4763, + "step": 5248 + }, + { + "epoch": 0.23052350790915177, + "grad_norm": 2.25, + "learning_rate": 4.384660583208776e-05, + "loss": 0.4844, + "step": 5250 + }, + { + "epoch": 0.23061132638835527, + "grad_norm": 2.4375, + "learning_rate": 4.3842059588967785e-05, + "loss": 0.485, + "step": 5252 + }, + { + "epoch": 0.23069914486755877, + "grad_norm": 2.546875, + "learning_rate": 4.3837511902918384e-05, + "loss": 0.4942, + "step": 5254 + }, + { + "epoch": 0.23078696334676224, + "grad_norm": 2.625, + "learning_rate": 4.38329627742878e-05, + "loss": 0.4815, + "step": 5256 + }, + { + "epoch": 0.23087478182596574, + "grad_norm": 2.15625, + "learning_rate": 4.382841220342441e-05, + "loss": 0.4634, + "step": 5258 + }, + { + "epoch": 0.2309626003051692, + "grad_norm": 2.390625, + "learning_rate": 4.38238601906767e-05, + "loss": 0.4528, + "step": 5260 + }, + { + "epoch": 0.2310504187843727, + "grad_norm": 2.4375, + "learning_rate": 4.3819306736393265e-05, + "loss": 0.4836, + "step": 5262 + }, + { + "epoch": 0.23113823726357619, + "grad_norm": 2.390625, + "learning_rate": 4.3814751840922816e-05, + "loss": 0.4351, + "step": 5264 + }, + { + "epoch": 0.23122605574277968, + "grad_norm": 2.296875, + "learning_rate": 4.381019550461415e-05, + "loss": 0.4912, + "step": 5266 + }, + { + "epoch": 0.23131387422198316, + "grad_norm": 2.21875, + "learning_rate": 4.3805637727816205e-05, + "loss": 0.4542, + "step": 5268 + }, + { + "epoch": 0.23140169270118666, + "grad_norm": 2.5625, + "learning_rate": 4.3801078510878025e-05, + "loss": 0.4752, + "step": 5270 + }, + { + "epoch": 0.23148951118039013, + "grad_norm": 2.453125, + "learning_rate": 4.3796517854148735e-05, + "loss": 0.4868, + "step": 5272 + }, + { + "epoch": 0.23157732965959363, + "grad_norm": 2.734375, + "learning_rate": 4.3791955757977604e-05, + "loss": 0.4772, + "step": 5274 + }, + { + "epoch": 0.2316651481387971, + "grad_norm": 2.828125, + "learning_rate": 4.3787392222713996e-05, + "loss": 0.5021, + "step": 5276 + }, + { + "epoch": 0.2317529666180006, + "grad_norm": 2.96875, + "learning_rate": 4.378282724870739e-05, + "loss": 0.4864, + "step": 5278 + }, + { + "epoch": 0.23184078509720407, + "grad_norm": 3.765625, + "learning_rate": 4.3778260836307373e-05, + "loss": 0.4819, + "step": 5280 + }, + { + "epoch": 0.23192860357640757, + "grad_norm": 3.9375, + "learning_rate": 4.3773692985863635e-05, + "loss": 0.4849, + "step": 5282 + }, + { + "epoch": 0.23201642205561104, + "grad_norm": 3.921875, + "learning_rate": 4.3769123697725986e-05, + "loss": 0.4509, + "step": 5284 + }, + { + "epoch": 0.23210424053481454, + "grad_norm": 2.484375, + "learning_rate": 4.376455297224435e-05, + "loss": 0.4961, + "step": 5286 + }, + { + "epoch": 0.23219205901401802, + "grad_norm": 2.359375, + "learning_rate": 4.3759980809768756e-05, + "loss": 0.4676, + "step": 5288 + }, + { + "epoch": 0.23227987749322151, + "grad_norm": 3.875, + "learning_rate": 4.3755407210649325e-05, + "loss": 0.5097, + "step": 5290 + }, + { + "epoch": 0.232367695972425, + "grad_norm": 3.609375, + "learning_rate": 4.375083217523631e-05, + "loss": 0.4754, + "step": 5292 + }, + { + "epoch": 0.2324555144516285, + "grad_norm": 3.453125, + "learning_rate": 4.374625570388008e-05, + "loss": 0.466, + "step": 5294 + }, + { + "epoch": 0.23254333293083196, + "grad_norm": 3.28125, + "learning_rate": 4.374167779693109e-05, + "loss": 0.4691, + "step": 5296 + }, + { + "epoch": 0.23263115141003546, + "grad_norm": 2.5625, + "learning_rate": 4.3737098454739924e-05, + "loss": 0.4651, + "step": 5298 + }, + { + "epoch": 0.23271896988923893, + "grad_norm": 2.78125, + "learning_rate": 4.373251767765727e-05, + "loss": 0.4742, + "step": 5300 + }, + { + "epoch": 0.23280678836844243, + "grad_norm": 3.625, + "learning_rate": 4.372793546603392e-05, + "loss": 0.4641, + "step": 5302 + }, + { + "epoch": 0.23289460684764593, + "grad_norm": 2.234375, + "learning_rate": 4.372335182022078e-05, + "loss": 0.4854, + "step": 5304 + }, + { + "epoch": 0.2329824253268494, + "grad_norm": 2.359375, + "learning_rate": 4.371876674056886e-05, + "loss": 0.4809, + "step": 5306 + }, + { + "epoch": 0.2330702438060529, + "grad_norm": 2.078125, + "learning_rate": 4.3714180227429316e-05, + "loss": 0.4776, + "step": 5308 + }, + { + "epoch": 0.23315806228525637, + "grad_norm": 2.59375, + "learning_rate": 4.370959228115335e-05, + "loss": 0.4895, + "step": 5310 + }, + { + "epoch": 0.23324588076445987, + "grad_norm": 2.515625, + "learning_rate": 4.3705002902092326e-05, + "loss": 0.4962, + "step": 5312 + }, + { + "epoch": 0.23333369924366335, + "grad_norm": 2.328125, + "learning_rate": 4.37004120905977e-05, + "loss": 0.4616, + "step": 5314 + }, + { + "epoch": 0.23342151772286684, + "grad_norm": 2.296875, + "learning_rate": 4.369581984702102e-05, + "loss": 0.4799, + "step": 5316 + }, + { + "epoch": 0.23350933620207032, + "grad_norm": 2.328125, + "learning_rate": 4.3691226171713986e-05, + "loss": 0.4715, + "step": 5318 + }, + { + "epoch": 0.23359715468127382, + "grad_norm": 2.5, + "learning_rate": 4.3686631065028374e-05, + "loss": 0.4639, + "step": 5320 + }, + { + "epoch": 0.2336849731604773, + "grad_norm": 2.6875, + "learning_rate": 4.3682034527316064e-05, + "loss": 0.4588, + "step": 5322 + }, + { + "epoch": 0.2337727916396808, + "grad_norm": 3.328125, + "learning_rate": 4.367743655892908e-05, + "loss": 0.4905, + "step": 5324 + }, + { + "epoch": 0.23386061011888426, + "grad_norm": 2.859375, + "learning_rate": 4.367283716021953e-05, + "loss": 0.4552, + "step": 5326 + }, + { + "epoch": 0.23394842859808776, + "grad_norm": 2.765625, + "learning_rate": 4.366823633153963e-05, + "loss": 0.5003, + "step": 5328 + }, + { + "epoch": 0.23403624707729123, + "grad_norm": 3.078125, + "learning_rate": 4.366363407324171e-05, + "loss": 0.4557, + "step": 5330 + }, + { + "epoch": 0.23412406555649473, + "grad_norm": 2.84375, + "learning_rate": 4.365903038567822e-05, + "loss": 0.4511, + "step": 5332 + }, + { + "epoch": 0.2342118840356982, + "grad_norm": 2.875, + "learning_rate": 4.3654425269201716e-05, + "loss": 0.4675, + "step": 5334 + }, + { + "epoch": 0.2342997025149017, + "grad_norm": 3.203125, + "learning_rate": 4.364981872416485e-05, + "loss": 0.4511, + "step": 5336 + }, + { + "epoch": 0.23438752099410518, + "grad_norm": 3.21875, + "learning_rate": 4.36452107509204e-05, + "loss": 0.4933, + "step": 5338 + }, + { + "epoch": 0.23447533947330867, + "grad_norm": 2.90625, + "learning_rate": 4.364060134982124e-05, + "loss": 0.4911, + "step": 5340 + }, + { + "epoch": 0.23456315795251215, + "grad_norm": 2.796875, + "learning_rate": 4.3635990521220355e-05, + "loss": 0.5039, + "step": 5342 + }, + { + "epoch": 0.23465097643171565, + "grad_norm": 3.390625, + "learning_rate": 4.363137826547085e-05, + "loss": 0.4649, + "step": 5344 + }, + { + "epoch": 0.23473879491091912, + "grad_norm": 2.203125, + "learning_rate": 4.362676458292594e-05, + "loss": 0.4797, + "step": 5346 + }, + { + "epoch": 0.23482661339012262, + "grad_norm": 2.125, + "learning_rate": 4.362214947393892e-05, + "loss": 0.4704, + "step": 5348 + }, + { + "epoch": 0.2349144318693261, + "grad_norm": 2.671875, + "learning_rate": 4.361753293886324e-05, + "loss": 0.4758, + "step": 5350 + }, + { + "epoch": 0.2350022503485296, + "grad_norm": 2.921875, + "learning_rate": 4.361291497805242e-05, + "loss": 0.4899, + "step": 5352 + }, + { + "epoch": 0.23509006882773306, + "grad_norm": 3.03125, + "learning_rate": 4.3608295591860105e-05, + "loss": 0.4964, + "step": 5354 + }, + { + "epoch": 0.23517788730693656, + "grad_norm": 2.890625, + "learning_rate": 4.360367478064006e-05, + "loss": 0.4743, + "step": 5356 + }, + { + "epoch": 0.23526570578614006, + "grad_norm": 2.71875, + "learning_rate": 4.3599052544746136e-05, + "loss": 0.4564, + "step": 5358 + }, + { + "epoch": 0.23535352426534353, + "grad_norm": 2.21875, + "learning_rate": 4.359442888453231e-05, + "loss": 0.4697, + "step": 5360 + }, + { + "epoch": 0.23544134274454703, + "grad_norm": 2.84375, + "learning_rate": 4.3589803800352666e-05, + "loss": 0.4618, + "step": 5362 + }, + { + "epoch": 0.2355291612237505, + "grad_norm": 2.34375, + "learning_rate": 4.3585177292561386e-05, + "loss": 0.4454, + "step": 5364 + }, + { + "epoch": 0.235616979702954, + "grad_norm": 2.4375, + "learning_rate": 4.358054936151278e-05, + "loss": 0.4874, + "step": 5366 + }, + { + "epoch": 0.23570479818215748, + "grad_norm": 2.328125, + "learning_rate": 4.3575920007561245e-05, + "loss": 0.4826, + "step": 5368 + }, + { + "epoch": 0.23579261666136098, + "grad_norm": 2.421875, + "learning_rate": 4.35712892310613e-05, + "loss": 0.4856, + "step": 5370 + }, + { + "epoch": 0.23588043514056445, + "grad_norm": 2.140625, + "learning_rate": 4.356665703236758e-05, + "loss": 0.4558, + "step": 5372 + }, + { + "epoch": 0.23596825361976795, + "grad_norm": 2.5625, + "learning_rate": 4.356202341183481e-05, + "loss": 0.4791, + "step": 5374 + }, + { + "epoch": 0.23605607209897142, + "grad_norm": 2.9375, + "learning_rate": 4.355738836981784e-05, + "loss": 0.4504, + "step": 5376 + }, + { + "epoch": 0.23614389057817492, + "grad_norm": 3.296875, + "learning_rate": 4.3552751906671616e-05, + "loss": 0.4624, + "step": 5378 + }, + { + "epoch": 0.2362317090573784, + "grad_norm": 2.65625, + "learning_rate": 4.3548114022751206e-05, + "loss": 0.4574, + "step": 5380 + }, + { + "epoch": 0.2363195275365819, + "grad_norm": 2.203125, + "learning_rate": 4.354347471841178e-05, + "loss": 0.476, + "step": 5382 + }, + { + "epoch": 0.23640734601578536, + "grad_norm": 2.3125, + "learning_rate": 4.3538833994008614e-05, + "loss": 0.4668, + "step": 5384 + }, + { + "epoch": 0.23649516449498886, + "grad_norm": 2.265625, + "learning_rate": 4.35341918498971e-05, + "loss": 0.4589, + "step": 5386 + }, + { + "epoch": 0.23658298297419234, + "grad_norm": 2.640625, + "learning_rate": 4.3529548286432724e-05, + "loss": 0.456, + "step": 5388 + }, + { + "epoch": 0.23667080145339583, + "grad_norm": 2.140625, + "learning_rate": 4.3524903303971104e-05, + "loss": 0.4643, + "step": 5390 + }, + { + "epoch": 0.2367586199325993, + "grad_norm": 2.140625, + "learning_rate": 4.352025690286795e-05, + "loss": 0.4801, + "step": 5392 + }, + { + "epoch": 0.2368464384118028, + "grad_norm": 2.1875, + "learning_rate": 4.3515609083479066e-05, + "loss": 0.4824, + "step": 5394 + }, + { + "epoch": 0.23693425689100628, + "grad_norm": 2.359375, + "learning_rate": 4.351095984616042e-05, + "loss": 0.4977, + "step": 5396 + }, + { + "epoch": 0.23702207537020978, + "grad_norm": 2.4375, + "learning_rate": 4.350630919126803e-05, + "loss": 0.4508, + "step": 5398 + }, + { + "epoch": 0.23710989384941325, + "grad_norm": 2.140625, + "learning_rate": 4.350165711915803e-05, + "loss": 0.453, + "step": 5400 + }, + { + "epoch": 0.23719771232861675, + "grad_norm": 2.25, + "learning_rate": 4.349700363018671e-05, + "loss": 0.4878, + "step": 5402 + }, + { + "epoch": 0.23728553080782022, + "grad_norm": 2.25, + "learning_rate": 4.349234872471041e-05, + "loss": 0.454, + "step": 5404 + }, + { + "epoch": 0.23737334928702372, + "grad_norm": 2.546875, + "learning_rate": 4.348769240308561e-05, + "loss": 0.4635, + "step": 5406 + }, + { + "epoch": 0.23746116776622722, + "grad_norm": 2.921875, + "learning_rate": 4.34830346656689e-05, + "loss": 0.4546, + "step": 5408 + }, + { + "epoch": 0.2375489862454307, + "grad_norm": 3.5, + "learning_rate": 4.347837551281696e-05, + "loss": 0.4531, + "step": 5410 + }, + { + "epoch": 0.2376368047246342, + "grad_norm": 3.0, + "learning_rate": 4.347371494488659e-05, + "loss": 0.4688, + "step": 5412 + }, + { + "epoch": 0.23772462320383766, + "grad_norm": 2.375, + "learning_rate": 4.346905296223471e-05, + "loss": 0.4568, + "step": 5414 + }, + { + "epoch": 0.23781244168304116, + "grad_norm": 3.078125, + "learning_rate": 4.346438956521832e-05, + "loss": 0.4738, + "step": 5416 + }, + { + "epoch": 0.23790026016224464, + "grad_norm": 2.90625, + "learning_rate": 4.345972475419455e-05, + "loss": 0.4641, + "step": 5418 + }, + { + "epoch": 0.23798807864144814, + "grad_norm": 2.859375, + "learning_rate": 4.345505852952064e-05, + "loss": 0.4585, + "step": 5420 + }, + { + "epoch": 0.2380758971206516, + "grad_norm": 2.375, + "learning_rate": 4.345039089155392e-05, + "loss": 0.4776, + "step": 5422 + }, + { + "epoch": 0.2381637155998551, + "grad_norm": 2.21875, + "learning_rate": 4.344572184065184e-05, + "loss": 0.4306, + "step": 5424 + }, + { + "epoch": 0.23825153407905858, + "grad_norm": 2.15625, + "learning_rate": 4.344105137717197e-05, + "loss": 0.455, + "step": 5426 + }, + { + "epoch": 0.23833935255826208, + "grad_norm": 2.28125, + "learning_rate": 4.343637950147196e-05, + "loss": 0.4614, + "step": 5428 + }, + { + "epoch": 0.23842717103746555, + "grad_norm": 2.4375, + "learning_rate": 4.343170621390958e-05, + "loss": 0.453, + "step": 5430 + }, + { + "epoch": 0.23851498951666905, + "grad_norm": 2.5, + "learning_rate": 4.3427031514842733e-05, + "loss": 0.4712, + "step": 5432 + }, + { + "epoch": 0.23860280799587252, + "grad_norm": 2.203125, + "learning_rate": 4.3422355404629384e-05, + "loss": 0.452, + "step": 5434 + }, + { + "epoch": 0.23869062647507602, + "grad_norm": 2.25, + "learning_rate": 4.3417677883627644e-05, + "loss": 0.455, + "step": 5436 + }, + { + "epoch": 0.2387784449542795, + "grad_norm": 2.3125, + "learning_rate": 4.341299895219572e-05, + "loss": 0.4595, + "step": 5438 + }, + { + "epoch": 0.238866263433483, + "grad_norm": 2.0625, + "learning_rate": 4.340831861069192e-05, + "loss": 0.4738, + "step": 5440 + }, + { + "epoch": 0.23895408191268647, + "grad_norm": 2.453125, + "learning_rate": 4.340363685947467e-05, + "loss": 0.4465, + "step": 5442 + }, + { + "epoch": 0.23904190039188997, + "grad_norm": 2.421875, + "learning_rate": 4.33989536989025e-05, + "loss": 0.4713, + "step": 5444 + }, + { + "epoch": 0.23912971887109344, + "grad_norm": 2.15625, + "learning_rate": 4.3394269129334044e-05, + "loss": 0.4876, + "step": 5446 + }, + { + "epoch": 0.23921753735029694, + "grad_norm": 2.78125, + "learning_rate": 4.338958315112804e-05, + "loss": 0.484, + "step": 5448 + }, + { + "epoch": 0.2393053558295004, + "grad_norm": 2.59375, + "learning_rate": 4.338489576464336e-05, + "loss": 0.4481, + "step": 5450 + }, + { + "epoch": 0.2393931743087039, + "grad_norm": 2.46875, + "learning_rate": 4.338020697023895e-05, + "loss": 0.465, + "step": 5452 + }, + { + "epoch": 0.23948099278790738, + "grad_norm": 2.234375, + "learning_rate": 4.337551676827389e-05, + "loss": 0.4872, + "step": 5454 + }, + { + "epoch": 0.23956881126711088, + "grad_norm": 2.171875, + "learning_rate": 4.337082515910734e-05, + "loss": 0.488, + "step": 5456 + }, + { + "epoch": 0.23965662974631438, + "grad_norm": 2.546875, + "learning_rate": 4.3366132143098606e-05, + "loss": 0.4662, + "step": 5458 + }, + { + "epoch": 0.23974444822551785, + "grad_norm": 2.65625, + "learning_rate": 4.336143772060707e-05, + "loss": 0.4747, + "step": 5460 + }, + { + "epoch": 0.23983226670472135, + "grad_norm": 3.0, + "learning_rate": 4.3356741891992226e-05, + "loss": 0.4739, + "step": 5462 + }, + { + "epoch": 0.23992008518392482, + "grad_norm": 2.3125, + "learning_rate": 4.335204465761369e-05, + "loss": 0.427, + "step": 5464 + }, + { + "epoch": 0.24000790366312832, + "grad_norm": 2.1875, + "learning_rate": 4.334734601783117e-05, + "loss": 0.4881, + "step": 5466 + }, + { + "epoch": 0.2400957221423318, + "grad_norm": 2.046875, + "learning_rate": 4.3342645973004504e-05, + "loss": 0.4672, + "step": 5468 + }, + { + "epoch": 0.2401835406215353, + "grad_norm": 2.75, + "learning_rate": 4.33379445234936e-05, + "loss": 0.468, + "step": 5470 + }, + { + "epoch": 0.24027135910073877, + "grad_norm": 2.53125, + "learning_rate": 4.333324166965852e-05, + "loss": 0.491, + "step": 5472 + }, + { + "epoch": 0.24035917757994227, + "grad_norm": 2.484375, + "learning_rate": 4.3328537411859394e-05, + "loss": 0.4633, + "step": 5474 + }, + { + "epoch": 0.24044699605914574, + "grad_norm": 2.828125, + "learning_rate": 4.332383175045648e-05, + "loss": 0.4597, + "step": 5476 + }, + { + "epoch": 0.24053481453834924, + "grad_norm": 2.390625, + "learning_rate": 4.331912468581013e-05, + "loss": 0.453, + "step": 5478 + }, + { + "epoch": 0.2406226330175527, + "grad_norm": 2.171875, + "learning_rate": 4.331441621828083e-05, + "loss": 0.4825, + "step": 5480 + }, + { + "epoch": 0.2407104514967562, + "grad_norm": 2.265625, + "learning_rate": 4.330970634822914e-05, + "loss": 0.5258, + "step": 5482 + }, + { + "epoch": 0.24079826997595968, + "grad_norm": 2.46875, + "learning_rate": 4.330499507601575e-05, + "loss": 0.4577, + "step": 5484 + }, + { + "epoch": 0.24088608845516318, + "grad_norm": 2.171875, + "learning_rate": 4.330028240200146e-05, + "loss": 0.4376, + "step": 5486 + }, + { + "epoch": 0.24097390693436666, + "grad_norm": 2.453125, + "learning_rate": 4.3295568326547144e-05, + "loss": 0.4581, + "step": 5488 + }, + { + "epoch": 0.24106172541357015, + "grad_norm": 2.25, + "learning_rate": 4.329085285001382e-05, + "loss": 0.4796, + "step": 5490 + }, + { + "epoch": 0.24114954389277363, + "grad_norm": 2.1875, + "learning_rate": 4.3286135972762596e-05, + "loss": 0.4707, + "step": 5492 + }, + { + "epoch": 0.24123736237197713, + "grad_norm": 2.296875, + "learning_rate": 4.328141769515471e-05, + "loss": 0.4675, + "step": 5494 + }, + { + "epoch": 0.2413251808511806, + "grad_norm": 2.109375, + "learning_rate": 4.3276698017551464e-05, + "loss": 0.4809, + "step": 5496 + }, + { + "epoch": 0.2414129993303841, + "grad_norm": 3.109375, + "learning_rate": 4.327197694031431e-05, + "loss": 0.4557, + "step": 5498 + }, + { + "epoch": 0.24150081780958757, + "grad_norm": 2.65625, + "learning_rate": 4.3267254463804775e-05, + "loss": 0.4709, + "step": 5500 + }, + { + "epoch": 0.24158863628879107, + "grad_norm": 2.375, + "learning_rate": 4.326253058838452e-05, + "loss": 0.4782, + "step": 5502 + }, + { + "epoch": 0.24167645476799454, + "grad_norm": 2.171875, + "learning_rate": 4.325780531441529e-05, + "loss": 0.4655, + "step": 5504 + }, + { + "epoch": 0.24176427324719804, + "grad_norm": 2.28125, + "learning_rate": 4.325307864225895e-05, + "loss": 0.4288, + "step": 5506 + }, + { + "epoch": 0.24185209172640154, + "grad_norm": 2.265625, + "learning_rate": 4.324835057227748e-05, + "loss": 0.4439, + "step": 5508 + }, + { + "epoch": 0.241939910205605, + "grad_norm": 2.203125, + "learning_rate": 4.324362110483294e-05, + "loss": 0.4501, + "step": 5510 + }, + { + "epoch": 0.2420277286848085, + "grad_norm": 2.390625, + "learning_rate": 4.3238890240287536e-05, + "loss": 0.4746, + "step": 5512 + }, + { + "epoch": 0.24211554716401198, + "grad_norm": 2.390625, + "learning_rate": 4.323415797900353e-05, + "loss": 0.4878, + "step": 5514 + }, + { + "epoch": 0.24220336564321548, + "grad_norm": 2.609375, + "learning_rate": 4.322942432134335e-05, + "loss": 0.4404, + "step": 5516 + }, + { + "epoch": 0.24229118412241896, + "grad_norm": 2.609375, + "learning_rate": 4.322468926766947e-05, + "loss": 0.4871, + "step": 5518 + }, + { + "epoch": 0.24237900260162246, + "grad_norm": 2.78125, + "learning_rate": 4.321995281834452e-05, + "loss": 0.4665, + "step": 5520 + }, + { + "epoch": 0.24246682108082593, + "grad_norm": 2.5, + "learning_rate": 4.3215214973731225e-05, + "loss": 0.4825, + "step": 5522 + }, + { + "epoch": 0.24255463956002943, + "grad_norm": 2.3125, + "learning_rate": 4.32104757341924e-05, + "loss": 0.4422, + "step": 5524 + }, + { + "epoch": 0.2426424580392329, + "grad_norm": 2.25, + "learning_rate": 4.320573510009097e-05, + "loss": 0.4534, + "step": 5526 + }, + { + "epoch": 0.2427302765184364, + "grad_norm": 2.15625, + "learning_rate": 4.320099307178999e-05, + "loss": 0.4488, + "step": 5528 + }, + { + "epoch": 0.24281809499763987, + "grad_norm": 2.203125, + "learning_rate": 4.3196249649652585e-05, + "loss": 0.4662, + "step": 5530 + }, + { + "epoch": 0.24290591347684337, + "grad_norm": 2.3125, + "learning_rate": 4.319150483404203e-05, + "loss": 0.4582, + "step": 5532 + }, + { + "epoch": 0.24299373195604684, + "grad_norm": 2.1875, + "learning_rate": 4.318675862532167e-05, + "loss": 0.4754, + "step": 5534 + }, + { + "epoch": 0.24308155043525034, + "grad_norm": 2.375, + "learning_rate": 4.318201102385497e-05, + "loss": 0.4612, + "step": 5536 + }, + { + "epoch": 0.24316936891445382, + "grad_norm": 2.359375, + "learning_rate": 4.317726203000552e-05, + "loss": 0.4556, + "step": 5538 + }, + { + "epoch": 0.24325718739365731, + "grad_norm": 2.328125, + "learning_rate": 4.317251164413698e-05, + "loss": 0.5085, + "step": 5540 + }, + { + "epoch": 0.2433450058728608, + "grad_norm": 2.171875, + "learning_rate": 4.316775986661314e-05, + "loss": 0.4796, + "step": 5542 + }, + { + "epoch": 0.2434328243520643, + "grad_norm": 2.421875, + "learning_rate": 4.31630066977979e-05, + "loss": 0.4599, + "step": 5544 + }, + { + "epoch": 0.24352064283126776, + "grad_norm": 2.046875, + "learning_rate": 4.315825213805525e-05, + "loss": 0.4443, + "step": 5546 + }, + { + "epoch": 0.24360846131047126, + "grad_norm": 2.1875, + "learning_rate": 4.31534961877493e-05, + "loss": 0.4585, + "step": 5548 + }, + { + "epoch": 0.24369627978967473, + "grad_norm": 2.359375, + "learning_rate": 4.314873884724425e-05, + "loss": 0.4578, + "step": 5550 + }, + { + "epoch": 0.24378409826887823, + "grad_norm": 2.40625, + "learning_rate": 4.3143980116904436e-05, + "loss": 0.4922, + "step": 5552 + }, + { + "epoch": 0.2438719167480817, + "grad_norm": 2.25, + "learning_rate": 4.313921999709428e-05, + "loss": 0.4907, + "step": 5554 + }, + { + "epoch": 0.2439597352272852, + "grad_norm": 2.640625, + "learning_rate": 4.313445848817831e-05, + "loss": 0.4842, + "step": 5556 + }, + { + "epoch": 0.2440475537064887, + "grad_norm": 2.3125, + "learning_rate": 4.312969559052115e-05, + "loss": 0.4853, + "step": 5558 + }, + { + "epoch": 0.24413537218569217, + "grad_norm": 2.4375, + "learning_rate": 4.312493130448756e-05, + "loss": 0.4377, + "step": 5560 + }, + { + "epoch": 0.24422319066489567, + "grad_norm": 2.375, + "learning_rate": 4.312016563044239e-05, + "loss": 0.4781, + "step": 5562 + }, + { + "epoch": 0.24431100914409914, + "grad_norm": 2.5, + "learning_rate": 4.311539856875059e-05, + "loss": 0.4625, + "step": 5564 + }, + { + "epoch": 0.24439882762330264, + "grad_norm": 2.34375, + "learning_rate": 4.311063011977723e-05, + "loss": 0.4681, + "step": 5566 + }, + { + "epoch": 0.24448664610250612, + "grad_norm": 2.421875, + "learning_rate": 4.3105860283887464e-05, + "loss": 0.4618, + "step": 5568 + }, + { + "epoch": 0.24457446458170962, + "grad_norm": 2.75, + "learning_rate": 4.3101089061446585e-05, + "loss": 0.4472, + "step": 5570 + }, + { + "epoch": 0.2446622830609131, + "grad_norm": 2.296875, + "learning_rate": 4.3096316452819964e-05, + "loss": 0.4859, + "step": 5572 + }, + { + "epoch": 0.2447501015401166, + "grad_norm": 2.265625, + "learning_rate": 4.309154245837309e-05, + "loss": 0.4876, + "step": 5574 + }, + { + "epoch": 0.24483792001932006, + "grad_norm": 2.515625, + "learning_rate": 4.308676707847156e-05, + "loss": 0.4829, + "step": 5576 + }, + { + "epoch": 0.24492573849852356, + "grad_norm": 2.203125, + "learning_rate": 4.308199031348107e-05, + "loss": 0.4657, + "step": 5578 + }, + { + "epoch": 0.24501355697772703, + "grad_norm": 2.390625, + "learning_rate": 4.3077212163767425e-05, + "loss": 0.4184, + "step": 5580 + }, + { + "epoch": 0.24510137545693053, + "grad_norm": 2.515625, + "learning_rate": 4.307243262969654e-05, + "loss": 0.471, + "step": 5582 + }, + { + "epoch": 0.245189193936134, + "grad_norm": 2.5625, + "learning_rate": 4.306765171163443e-05, + "loss": 0.4686, + "step": 5584 + }, + { + "epoch": 0.2452770124153375, + "grad_norm": 1.9765625, + "learning_rate": 4.306286940994723e-05, + "loss": 0.48, + "step": 5586 + }, + { + "epoch": 0.24536483089454098, + "grad_norm": 2.421875, + "learning_rate": 4.3058085725001154e-05, + "loss": 0.4761, + "step": 5588 + }, + { + "epoch": 0.24545264937374447, + "grad_norm": 2.03125, + "learning_rate": 4.305330065716254e-05, + "loss": 0.492, + "step": 5590 + }, + { + "epoch": 0.24554046785294795, + "grad_norm": 2.28125, + "learning_rate": 4.304851420679784e-05, + "loss": 0.466, + "step": 5592 + }, + { + "epoch": 0.24562828633215145, + "grad_norm": 2.15625, + "learning_rate": 4.30437263742736e-05, + "loss": 0.4475, + "step": 5594 + }, + { + "epoch": 0.24571610481135492, + "grad_norm": 2.328125, + "learning_rate": 4.303893715995646e-05, + "loss": 0.459, + "step": 5596 + }, + { + "epoch": 0.24580392329055842, + "grad_norm": 2.296875, + "learning_rate": 4.303414656421319e-05, + "loss": 0.4527, + "step": 5598 + }, + { + "epoch": 0.2458917417697619, + "grad_norm": 2.265625, + "learning_rate": 4.302935458741066e-05, + "loss": 0.4525, + "step": 5600 + }, + { + "epoch": 0.2459795602489654, + "grad_norm": 2.53125, + "learning_rate": 4.3024561229915826e-05, + "loss": 0.4693, + "step": 5602 + }, + { + "epoch": 0.24606737872816886, + "grad_norm": 2.375, + "learning_rate": 4.301976649209577e-05, + "loss": 0.4656, + "step": 5604 + }, + { + "epoch": 0.24615519720737236, + "grad_norm": 2.1875, + "learning_rate": 4.301497037431769e-05, + "loss": 0.4751, + "step": 5606 + }, + { + "epoch": 0.24624301568657586, + "grad_norm": 2.5, + "learning_rate": 4.3010172876948844e-05, + "loss": 0.463, + "step": 5608 + }, + { + "epoch": 0.24633083416577933, + "grad_norm": 2.203125, + "learning_rate": 4.300537400035665e-05, + "loss": 0.4701, + "step": 5610 + }, + { + "epoch": 0.24641865264498283, + "grad_norm": 2.1875, + "learning_rate": 4.30005737449086e-05, + "loss": 0.4647, + "step": 5612 + }, + { + "epoch": 0.2465064711241863, + "grad_norm": 2.203125, + "learning_rate": 4.2995772110972296e-05, + "loss": 0.4869, + "step": 5614 + }, + { + "epoch": 0.2465942896033898, + "grad_norm": 2.328125, + "learning_rate": 4.299096909891545e-05, + "loss": 0.445, + "step": 5616 + }, + { + "epoch": 0.24668210808259328, + "grad_norm": 2.296875, + "learning_rate": 4.2986164709105877e-05, + "loss": 0.475, + "step": 5618 + }, + { + "epoch": 0.24676992656179678, + "grad_norm": 2.1875, + "learning_rate": 4.29813589419115e-05, + "loss": 0.4494, + "step": 5620 + }, + { + "epoch": 0.24685774504100025, + "grad_norm": 2.265625, + "learning_rate": 4.2976551797700336e-05, + "loss": 0.4499, + "step": 5622 + }, + { + "epoch": 0.24694556352020375, + "grad_norm": 2.5, + "learning_rate": 4.297174327684054e-05, + "loss": 0.4804, + "step": 5624 + }, + { + "epoch": 0.24703338199940722, + "grad_norm": 2.421875, + "learning_rate": 4.296693337970033e-05, + "loss": 0.4644, + "step": 5626 + }, + { + "epoch": 0.24712120047861072, + "grad_norm": 2.625, + "learning_rate": 4.296212210664805e-05, + "loss": 0.4614, + "step": 5628 + }, + { + "epoch": 0.2472090189578142, + "grad_norm": 2.828125, + "learning_rate": 4.2957309458052156e-05, + "loss": 0.4825, + "step": 5630 + }, + { + "epoch": 0.2472968374370177, + "grad_norm": 2.390625, + "learning_rate": 4.2952495434281204e-05, + "loss": 0.4424, + "step": 5632 + }, + { + "epoch": 0.24738465591622116, + "grad_norm": 2.671875, + "learning_rate": 4.294768003570384e-05, + "loss": 0.486, + "step": 5634 + }, + { + "epoch": 0.24747247439542466, + "grad_norm": 2.078125, + "learning_rate": 4.294286326268885e-05, + "loss": 0.4528, + "step": 5636 + }, + { + "epoch": 0.24756029287462814, + "grad_norm": 2.46875, + "learning_rate": 4.2938045115605074e-05, + "loss": 0.4554, + "step": 5638 + }, + { + "epoch": 0.24764811135383163, + "grad_norm": 2.234375, + "learning_rate": 4.2933225594821514e-05, + "loss": 0.4572, + "step": 5640 + }, + { + "epoch": 0.2477359298330351, + "grad_norm": 2.15625, + "learning_rate": 4.292840470070724e-05, + "loss": 0.4692, + "step": 5642 + }, + { + "epoch": 0.2478237483122386, + "grad_norm": 2.21875, + "learning_rate": 4.2923582433631424e-05, + "loss": 0.4767, + "step": 5644 + }, + { + "epoch": 0.24791156679144208, + "grad_norm": 2.484375, + "learning_rate": 4.291875879396338e-05, + "loss": 0.4665, + "step": 5646 + }, + { + "epoch": 0.24799938527064558, + "grad_norm": 2.53125, + "learning_rate": 4.291393378207249e-05, + "loss": 0.4811, + "step": 5648 + }, + { + "epoch": 0.24808720374984905, + "grad_norm": 2.09375, + "learning_rate": 4.290910739832825e-05, + "loss": 0.4653, + "step": 5650 + }, + { + "epoch": 0.24817502222905255, + "grad_norm": 2.234375, + "learning_rate": 4.2904279643100276e-05, + "loss": 0.4598, + "step": 5652 + }, + { + "epoch": 0.24826284070825602, + "grad_norm": 2.390625, + "learning_rate": 4.2899450516758275e-05, + "loss": 0.4546, + "step": 5654 + }, + { + "epoch": 0.24835065918745952, + "grad_norm": 2.796875, + "learning_rate": 4.289462001967207e-05, + "loss": 0.4407, + "step": 5656 + }, + { + "epoch": 0.24843847766666302, + "grad_norm": 3.125, + "learning_rate": 4.288978815221157e-05, + "loss": 0.466, + "step": 5658 + }, + { + "epoch": 0.2485262961458665, + "grad_norm": 3.328125, + "learning_rate": 4.28849549147468e-05, + "loss": 0.4913, + "step": 5660 + }, + { + "epoch": 0.24861411462507, + "grad_norm": 3.484375, + "learning_rate": 4.28801203076479e-05, + "loss": 0.4755, + "step": 5662 + }, + { + "epoch": 0.24870193310427346, + "grad_norm": 3.09375, + "learning_rate": 4.2875284331285105e-05, + "loss": 0.4563, + "step": 5664 + }, + { + "epoch": 0.24878975158347696, + "grad_norm": 2.671875, + "learning_rate": 4.287044698602874e-05, + "loss": 0.4635, + "step": 5666 + }, + { + "epoch": 0.24887757006268044, + "grad_norm": 2.203125, + "learning_rate": 4.286560827224927e-05, + "loss": 0.4683, + "step": 5668 + }, + { + "epoch": 0.24896538854188394, + "grad_norm": 2.15625, + "learning_rate": 4.286076819031723e-05, + "loss": 0.4425, + "step": 5670 + }, + { + "epoch": 0.2490532070210874, + "grad_norm": 2.390625, + "learning_rate": 4.285592674060328e-05, + "loss": 0.4542, + "step": 5672 + }, + { + "epoch": 0.2491410255002909, + "grad_norm": 2.53125, + "learning_rate": 4.2851083923478186e-05, + "loss": 0.4687, + "step": 5674 + }, + { + "epoch": 0.24922884397949438, + "grad_norm": 2.140625, + "learning_rate": 4.2846239739312796e-05, + "loss": 0.4361, + "step": 5676 + }, + { + "epoch": 0.24931666245869788, + "grad_norm": 2.21875, + "learning_rate": 4.28413941884781e-05, + "loss": 0.4518, + "step": 5678 + }, + { + "epoch": 0.24940448093790135, + "grad_norm": 2.53125, + "learning_rate": 4.283654727134515e-05, + "loss": 0.47, + "step": 5680 + }, + { + "epoch": 0.24949229941710485, + "grad_norm": 3.09375, + "learning_rate": 4.2831698988285144e-05, + "loss": 0.4626, + "step": 5682 + }, + { + "epoch": 0.24958011789630832, + "grad_norm": 2.828125, + "learning_rate": 4.282684933966935e-05, + "loss": 0.4662, + "step": 5684 + }, + { + "epoch": 0.24966793637551182, + "grad_norm": 2.296875, + "learning_rate": 4.282199832586916e-05, + "loss": 0.4362, + "step": 5686 + }, + { + "epoch": 0.2497557548547153, + "grad_norm": 2.328125, + "learning_rate": 4.2817145947256064e-05, + "loss": 0.4661, + "step": 5688 + }, + { + "epoch": 0.2498435733339188, + "grad_norm": 2.40625, + "learning_rate": 4.281229220420167e-05, + "loss": 0.4632, + "step": 5690 + }, + { + "epoch": 0.24993139181312227, + "grad_norm": 2.40625, + "learning_rate": 4.2807437097077654e-05, + "loss": 0.4642, + "step": 5692 + }, + { + "epoch": 0.25001921029232577, + "grad_norm": 2.28125, + "learning_rate": 4.280258062625585e-05, + "loss": 0.4546, + "step": 5694 + }, + { + "epoch": 0.25010702877152924, + "grad_norm": 2.265625, + "learning_rate": 4.279772279210814e-05, + "loss": 0.441, + "step": 5696 + }, + { + "epoch": 0.2501948472507327, + "grad_norm": 2.1875, + "learning_rate": 4.279286359500657e-05, + "loss": 0.4551, + "step": 5698 + }, + { + "epoch": 0.25028266572993624, + "grad_norm": 2.359375, + "learning_rate": 4.2788003035323225e-05, + "loss": 0.4417, + "step": 5700 + }, + { + "epoch": 0.2503704842091397, + "grad_norm": 2.078125, + "learning_rate": 4.278314111343035e-05, + "loss": 0.4275, + "step": 5702 + }, + { + "epoch": 0.2504583026883432, + "grad_norm": 2.15625, + "learning_rate": 4.277827782970026e-05, + "loss": 0.4649, + "step": 5704 + }, + { + "epoch": 0.25054612116754665, + "grad_norm": 2.453125, + "learning_rate": 4.277341318450541e-05, + "loss": 0.4618, + "step": 5706 + }, + { + "epoch": 0.2506339396467502, + "grad_norm": 2.4375, + "learning_rate": 4.27685471782183e-05, + "loss": 0.4681, + "step": 5708 + }, + { + "epoch": 0.25072175812595365, + "grad_norm": 2.21875, + "learning_rate": 4.276367981121159e-05, + "loss": 0.4521, + "step": 5710 + }, + { + "epoch": 0.2508095766051571, + "grad_norm": 2.4375, + "learning_rate": 4.275881108385802e-05, + "loss": 0.455, + "step": 5712 + }, + { + "epoch": 0.25089739508436065, + "grad_norm": 2.109375, + "learning_rate": 4.275394099653045e-05, + "loss": 0.4738, + "step": 5714 + }, + { + "epoch": 0.2509852135635641, + "grad_norm": 2.109375, + "learning_rate": 4.2749069549601816e-05, + "loss": 0.4609, + "step": 5716 + }, + { + "epoch": 0.2510730320427676, + "grad_norm": 2.171875, + "learning_rate": 4.274419674344519e-05, + "loss": 0.4693, + "step": 5718 + }, + { + "epoch": 0.25116085052197107, + "grad_norm": 2.40625, + "learning_rate": 4.273932257843371e-05, + "loss": 0.4611, + "step": 5720 + }, + { + "epoch": 0.2512486690011746, + "grad_norm": 2.1875, + "learning_rate": 4.273444705494066e-05, + "loss": 0.4809, + "step": 5722 + }, + { + "epoch": 0.25133648748037807, + "grad_norm": 2.390625, + "learning_rate": 4.272957017333941e-05, + "loss": 0.446, + "step": 5724 + }, + { + "epoch": 0.25142430595958154, + "grad_norm": 2.265625, + "learning_rate": 4.2724691934003414e-05, + "loss": 0.463, + "step": 5726 + }, + { + "epoch": 0.251512124438785, + "grad_norm": 2.203125, + "learning_rate": 4.271981233730626e-05, + "loss": 0.4614, + "step": 5728 + }, + { + "epoch": 0.25159994291798854, + "grad_norm": 2.15625, + "learning_rate": 4.271493138362165e-05, + "loss": 0.4615, + "step": 5730 + }, + { + "epoch": 0.251687761397192, + "grad_norm": 2.40625, + "learning_rate": 4.271004907332333e-05, + "loss": 0.4866, + "step": 5732 + }, + { + "epoch": 0.2517755798763955, + "grad_norm": 2.453125, + "learning_rate": 4.270516540678521e-05, + "loss": 0.473, + "step": 5734 + }, + { + "epoch": 0.25186339835559896, + "grad_norm": 2.5625, + "learning_rate": 4.2700280384381275e-05, + "loss": 0.4526, + "step": 5736 + }, + { + "epoch": 0.2519512168348025, + "grad_norm": 2.5, + "learning_rate": 4.269539400648563e-05, + "loss": 0.4582, + "step": 5738 + }, + { + "epoch": 0.25203903531400595, + "grad_norm": 2.328125, + "learning_rate": 4.269050627347247e-05, + "loss": 0.4638, + "step": 5740 + }, + { + "epoch": 0.2521268537932094, + "grad_norm": 1.9609375, + "learning_rate": 4.2685617185716104e-05, + "loss": 0.4635, + "step": 5742 + }, + { + "epoch": 0.2522146722724129, + "grad_norm": 2.15625, + "learning_rate": 4.268072674359093e-05, + "loss": 0.4475, + "step": 5744 + }, + { + "epoch": 0.2523024907516164, + "grad_norm": 2.375, + "learning_rate": 4.267583494747146e-05, + "loss": 0.4297, + "step": 5746 + }, + { + "epoch": 0.2523903092308199, + "grad_norm": 2.421875, + "learning_rate": 4.267094179773232e-05, + "loss": 0.4615, + "step": 5748 + }, + { + "epoch": 0.25247812771002337, + "grad_norm": 2.515625, + "learning_rate": 4.266604729474821e-05, + "loss": 0.4567, + "step": 5750 + }, + { + "epoch": 0.25256594618922684, + "grad_norm": 2.390625, + "learning_rate": 4.2661151438893974e-05, + "loss": 0.4569, + "step": 5752 + }, + { + "epoch": 0.25265376466843037, + "grad_norm": 2.390625, + "learning_rate": 4.265625423054452e-05, + "loss": 0.4475, + "step": 5754 + }, + { + "epoch": 0.25274158314763384, + "grad_norm": 2.4375, + "learning_rate": 4.265135567007489e-05, + "loss": 0.4421, + "step": 5756 + }, + { + "epoch": 0.2528294016268373, + "grad_norm": 2.609375, + "learning_rate": 4.264645575786021e-05, + "loss": 0.4542, + "step": 5758 + }, + { + "epoch": 0.25291722010604084, + "grad_norm": 2.109375, + "learning_rate": 4.264155449427572e-05, + "loss": 0.4345, + "step": 5760 + }, + { + "epoch": 0.2530050385852443, + "grad_norm": 2.21875, + "learning_rate": 4.263665187969675e-05, + "loss": 0.4468, + "step": 5762 + }, + { + "epoch": 0.2530928570644478, + "grad_norm": 2.25, + "learning_rate": 4.263174791449875e-05, + "loss": 0.4696, + "step": 5764 + }, + { + "epoch": 0.25318067554365126, + "grad_norm": 2.21875, + "learning_rate": 4.262684259905728e-05, + "loss": 0.4805, + "step": 5766 + }, + { + "epoch": 0.2532684940228548, + "grad_norm": 2.359375, + "learning_rate": 4.2621935933747974e-05, + "loss": 0.4899, + "step": 5768 + }, + { + "epoch": 0.25335631250205826, + "grad_norm": 2.203125, + "learning_rate": 4.261702791894659e-05, + "loss": 0.4855, + "step": 5770 + }, + { + "epoch": 0.25344413098126173, + "grad_norm": 2.203125, + "learning_rate": 4.261211855502898e-05, + "loss": 0.4947, + "step": 5772 + }, + { + "epoch": 0.2535319494604652, + "grad_norm": 2.140625, + "learning_rate": 4.260720784237111e-05, + "loss": 0.457, + "step": 5774 + }, + { + "epoch": 0.2536197679396687, + "grad_norm": 2.71875, + "learning_rate": 4.260229578134904e-05, + "loss": 0.4763, + "step": 5776 + }, + { + "epoch": 0.2537075864188722, + "grad_norm": 2.28125, + "learning_rate": 4.259738237233896e-05, + "loss": 0.4496, + "step": 5778 + }, + { + "epoch": 0.25379540489807567, + "grad_norm": 2.140625, + "learning_rate": 4.2592467615717105e-05, + "loss": 0.4484, + "step": 5780 + }, + { + "epoch": 0.25388322337727914, + "grad_norm": 2.109375, + "learning_rate": 4.258755151185986e-05, + "loss": 0.4883, + "step": 5782 + }, + { + "epoch": 0.25397104185648267, + "grad_norm": 2.53125, + "learning_rate": 4.25826340611437e-05, + "loss": 0.4727, + "step": 5784 + }, + { + "epoch": 0.25405886033568614, + "grad_norm": 2.46875, + "learning_rate": 4.2577715263945216e-05, + "loss": 0.4786, + "step": 5786 + }, + { + "epoch": 0.2541466788148896, + "grad_norm": 2.53125, + "learning_rate": 4.2572795120641086e-05, + "loss": 0.4564, + "step": 5788 + }, + { + "epoch": 0.2542344972940931, + "grad_norm": 2.34375, + "learning_rate": 4.256787363160809e-05, + "loss": 0.477, + "step": 5790 + }, + { + "epoch": 0.2543223157732966, + "grad_norm": 2.234375, + "learning_rate": 4.256295079722311e-05, + "loss": 0.4591, + "step": 5792 + }, + { + "epoch": 0.2544101342525001, + "grad_norm": 2.796875, + "learning_rate": 4.255802661786316e-05, + "loss": 0.4847, + "step": 5794 + }, + { + "epoch": 0.25449795273170356, + "grad_norm": 2.96875, + "learning_rate": 4.2553101093905325e-05, + "loss": 0.4582, + "step": 5796 + }, + { + "epoch": 0.25458577121090703, + "grad_norm": 2.640625, + "learning_rate": 4.2548174225726795e-05, + "loss": 0.4743, + "step": 5798 + }, + { + "epoch": 0.25467358969011056, + "grad_norm": 2.3125, + "learning_rate": 4.254324601370487e-05, + "loss": 0.4626, + "step": 5800 + }, + { + "epoch": 0.25476140816931403, + "grad_norm": 2.703125, + "learning_rate": 4.253831645821697e-05, + "loss": 0.453, + "step": 5802 + }, + { + "epoch": 0.2548492266485175, + "grad_norm": 2.3125, + "learning_rate": 4.253338555964059e-05, + "loss": 0.455, + "step": 5804 + }, + { + "epoch": 0.254937045127721, + "grad_norm": 2.265625, + "learning_rate": 4.252845331835333e-05, + "loss": 0.447, + "step": 5806 + }, + { + "epoch": 0.2550248636069245, + "grad_norm": 2.265625, + "learning_rate": 4.252351973473293e-05, + "loss": 0.4533, + "step": 5808 + }, + { + "epoch": 0.255112682086128, + "grad_norm": 2.234375, + "learning_rate": 4.251858480915718e-05, + "loss": 0.4477, + "step": 5810 + }, + { + "epoch": 0.25520050056533145, + "grad_norm": 2.359375, + "learning_rate": 4.2513648542004006e-05, + "loss": 0.4466, + "step": 5812 + }, + { + "epoch": 0.255288319044535, + "grad_norm": 2.296875, + "learning_rate": 4.250871093365143e-05, + "loss": 0.4656, + "step": 5814 + }, + { + "epoch": 0.25537613752373844, + "grad_norm": 2.390625, + "learning_rate": 4.250377198447757e-05, + "loss": 0.4657, + "step": 5816 + }, + { + "epoch": 0.2554639560029419, + "grad_norm": 2.1875, + "learning_rate": 4.249883169486066e-05, + "loss": 0.4741, + "step": 5818 + }, + { + "epoch": 0.2555517744821454, + "grad_norm": 2.5625, + "learning_rate": 4.249389006517902e-05, + "loss": 0.4608, + "step": 5820 + }, + { + "epoch": 0.2556395929613489, + "grad_norm": 2.125, + "learning_rate": 4.248894709581108e-05, + "loss": 0.4406, + "step": 5822 + }, + { + "epoch": 0.2557274114405524, + "grad_norm": 2.421875, + "learning_rate": 4.248400278713539e-05, + "loss": 0.4559, + "step": 5824 + }, + { + "epoch": 0.25581522991975586, + "grad_norm": 2.984375, + "learning_rate": 4.247905713953057e-05, + "loss": 0.441, + "step": 5826 + }, + { + "epoch": 0.25590304839895933, + "grad_norm": 2.859375, + "learning_rate": 4.247411015337537e-05, + "loss": 0.464, + "step": 5828 + }, + { + "epoch": 0.25599086687816286, + "grad_norm": 2.578125, + "learning_rate": 4.246916182904862e-05, + "loss": 0.4813, + "step": 5830 + }, + { + "epoch": 0.25607868535736633, + "grad_norm": 2.515625, + "learning_rate": 4.246421216692926e-05, + "loss": 0.4666, + "step": 5832 + }, + { + "epoch": 0.2561665038365698, + "grad_norm": 2.34375, + "learning_rate": 4.245926116739636e-05, + "loss": 0.4726, + "step": 5834 + }, + { + "epoch": 0.2562543223157733, + "grad_norm": 2.28125, + "learning_rate": 4.245430883082905e-05, + "loss": 0.4512, + "step": 5836 + }, + { + "epoch": 0.2563421407949768, + "grad_norm": 3.1875, + "learning_rate": 4.2449355157606584e-05, + "loss": 0.4321, + "step": 5838 + }, + { + "epoch": 0.2564299592741803, + "grad_norm": 2.875, + "learning_rate": 4.244440014810832e-05, + "loss": 0.4447, + "step": 5840 + }, + { + "epoch": 0.25651777775338375, + "grad_norm": 3.125, + "learning_rate": 4.243944380271372e-05, + "loss": 0.4621, + "step": 5842 + }, + { + "epoch": 0.2566055962325872, + "grad_norm": 2.625, + "learning_rate": 4.243448612180232e-05, + "loss": 0.445, + "step": 5844 + }, + { + "epoch": 0.25669341471179075, + "grad_norm": 2.875, + "learning_rate": 4.242952710575379e-05, + "loss": 0.4493, + "step": 5846 + }, + { + "epoch": 0.2567812331909942, + "grad_norm": 3.1875, + "learning_rate": 4.24245667549479e-05, + "loss": 0.472, + "step": 5848 + }, + { + "epoch": 0.2568690516701977, + "grad_norm": 3.671875, + "learning_rate": 4.241960506976452e-05, + "loss": 0.4749, + "step": 5850 + }, + { + "epoch": 0.25695687014940116, + "grad_norm": 3.140625, + "learning_rate": 4.2414642050583595e-05, + "loss": 0.4881, + "step": 5852 + }, + { + "epoch": 0.2570446886286047, + "grad_norm": 3.03125, + "learning_rate": 4.240967769778522e-05, + "loss": 0.4714, + "step": 5854 + }, + { + "epoch": 0.25713250710780816, + "grad_norm": 2.234375, + "learning_rate": 4.240471201174955e-05, + "loss": 0.424, + "step": 5856 + }, + { + "epoch": 0.25722032558701163, + "grad_norm": 2.28125, + "learning_rate": 4.239974499285686e-05, + "loss": 0.4458, + "step": 5858 + }, + { + "epoch": 0.25730814406621516, + "grad_norm": 2.90625, + "learning_rate": 4.2394776641487525e-05, + "loss": 0.4459, + "step": 5860 + }, + { + "epoch": 0.25739596254541863, + "grad_norm": 2.515625, + "learning_rate": 4.238980695802203e-05, + "loss": 0.4279, + "step": 5862 + }, + { + "epoch": 0.2574837810246221, + "grad_norm": 2.84375, + "learning_rate": 4.238483594284094e-05, + "loss": 0.471, + "step": 5864 + }, + { + "epoch": 0.2575715995038256, + "grad_norm": 3.03125, + "learning_rate": 4.2379863596324953e-05, + "loss": 0.4962, + "step": 5866 + }, + { + "epoch": 0.2576594179830291, + "grad_norm": 2.859375, + "learning_rate": 4.2374889918854846e-05, + "loss": 0.4755, + "step": 5868 + }, + { + "epoch": 0.2577472364622326, + "grad_norm": 3.265625, + "learning_rate": 4.236991491081151e-05, + "loss": 0.4539, + "step": 5870 + }, + { + "epoch": 0.25783505494143605, + "grad_norm": 2.546875, + "learning_rate": 4.236493857257591e-05, + "loss": 0.4678, + "step": 5872 + }, + { + "epoch": 0.2579228734206395, + "grad_norm": 2.65625, + "learning_rate": 4.235996090452916e-05, + "loss": 0.4569, + "step": 5874 + }, + { + "epoch": 0.25801069189984305, + "grad_norm": 2.9375, + "learning_rate": 4.2354981907052446e-05, + "loss": 0.4773, + "step": 5876 + }, + { + "epoch": 0.2580985103790465, + "grad_norm": 2.46875, + "learning_rate": 4.2350001580527057e-05, + "loss": 0.4491, + "step": 5878 + }, + { + "epoch": 0.25818632885825, + "grad_norm": 1.9765625, + "learning_rate": 4.234501992533438e-05, + "loss": 0.4411, + "step": 5880 + }, + { + "epoch": 0.25827414733745346, + "grad_norm": 2.0, + "learning_rate": 4.2340036941855924e-05, + "loss": 0.4516, + "step": 5882 + }, + { + "epoch": 0.258361965816657, + "grad_norm": 2.15625, + "learning_rate": 4.233505263047328e-05, + "loss": 0.4443, + "step": 5884 + }, + { + "epoch": 0.25844978429586046, + "grad_norm": 2.15625, + "learning_rate": 4.233006699156816e-05, + "loss": 0.4445, + "step": 5886 + }, + { + "epoch": 0.25853760277506393, + "grad_norm": 2.65625, + "learning_rate": 4.232508002552235e-05, + "loss": 0.458, + "step": 5888 + }, + { + "epoch": 0.2586254212542674, + "grad_norm": 2.234375, + "learning_rate": 4.232009173271776e-05, + "loss": 0.4336, + "step": 5890 + }, + { + "epoch": 0.25871323973347093, + "grad_norm": 2.015625, + "learning_rate": 4.231510211353639e-05, + "loss": 0.4569, + "step": 5892 + }, + { + "epoch": 0.2588010582126744, + "grad_norm": 2.046875, + "learning_rate": 4.2310111168360345e-05, + "loss": 0.4377, + "step": 5894 + }, + { + "epoch": 0.2588888766918779, + "grad_norm": 2.078125, + "learning_rate": 4.230511889757184e-05, + "loss": 0.4822, + "step": 5896 + }, + { + "epoch": 0.25897669517108135, + "grad_norm": 2.0, + "learning_rate": 4.230012530155318e-05, + "loss": 0.4582, + "step": 5898 + }, + { + "epoch": 0.2590645136502849, + "grad_norm": 2.078125, + "learning_rate": 4.229513038068678e-05, + "loss": 0.4536, + "step": 5900 + }, + { + "epoch": 0.25915233212948835, + "grad_norm": 2.203125, + "learning_rate": 4.229013413535515e-05, + "loss": 0.4522, + "step": 5902 + }, + { + "epoch": 0.2592401506086918, + "grad_norm": 2.328125, + "learning_rate": 4.228513656594091e-05, + "loss": 0.4709, + "step": 5904 + }, + { + "epoch": 0.2593279690878953, + "grad_norm": 2.171875, + "learning_rate": 4.228013767282676e-05, + "loss": 0.4666, + "step": 5906 + }, + { + "epoch": 0.2594157875670988, + "grad_norm": 2.484375, + "learning_rate": 4.227513745639553e-05, + "loss": 0.4448, + "step": 5908 + }, + { + "epoch": 0.2595036060463023, + "grad_norm": 2.359375, + "learning_rate": 4.227013591703012e-05, + "loss": 0.4491, + "step": 5910 + }, + { + "epoch": 0.25959142452550577, + "grad_norm": 2.328125, + "learning_rate": 4.226513305511357e-05, + "loss": 0.4451, + "step": 5912 + }, + { + "epoch": 0.2596792430047093, + "grad_norm": 2.09375, + "learning_rate": 4.226012887102899e-05, + "loss": 0.4603, + "step": 5914 + }, + { + "epoch": 0.25976706148391276, + "grad_norm": 2.328125, + "learning_rate": 4.225512336515961e-05, + "loss": 0.4343, + "step": 5916 + }, + { + "epoch": 0.25985487996311624, + "grad_norm": 2.109375, + "learning_rate": 4.225011653788874e-05, + "loss": 0.465, + "step": 5918 + }, + { + "epoch": 0.2599426984423197, + "grad_norm": 2.3125, + "learning_rate": 4.224510838959981e-05, + "loss": 0.4542, + "step": 5920 + }, + { + "epoch": 0.26003051692152324, + "grad_norm": 2.25, + "learning_rate": 4.2240098920676353e-05, + "loss": 0.4268, + "step": 5922 + }, + { + "epoch": 0.2601183354007267, + "grad_norm": 2.125, + "learning_rate": 4.223508813150198e-05, + "loss": 0.476, + "step": 5924 + }, + { + "epoch": 0.2602061538799302, + "grad_norm": 2.265625, + "learning_rate": 4.223007602246043e-05, + "loss": 0.4375, + "step": 5926 + }, + { + "epoch": 0.26029397235913365, + "grad_norm": 2.25, + "learning_rate": 4.2225062593935524e-05, + "loss": 0.4468, + "step": 5928 + }, + { + "epoch": 0.2603817908383372, + "grad_norm": 2.765625, + "learning_rate": 4.22200478463112e-05, + "loss": 0.4386, + "step": 5930 + }, + { + "epoch": 0.26046960931754065, + "grad_norm": 2.359375, + "learning_rate": 4.221503177997148e-05, + "loss": 0.4495, + "step": 5932 + }, + { + "epoch": 0.2605574277967441, + "grad_norm": 2.34375, + "learning_rate": 4.221001439530051e-05, + "loss": 0.4712, + "step": 5934 + }, + { + "epoch": 0.2606452462759476, + "grad_norm": 2.15625, + "learning_rate": 4.2204995692682504e-05, + "loss": 0.4401, + "step": 5936 + }, + { + "epoch": 0.2607330647551511, + "grad_norm": 2.203125, + "learning_rate": 4.21999756725018e-05, + "loss": 0.4546, + "step": 5938 + }, + { + "epoch": 0.2608208832343546, + "grad_norm": 2.359375, + "learning_rate": 4.219495433514284e-05, + "loss": 0.4559, + "step": 5940 + }, + { + "epoch": 0.26090870171355807, + "grad_norm": 3.0, + "learning_rate": 4.2189931680990155e-05, + "loss": 0.444, + "step": 5942 + }, + { + "epoch": 0.26099652019276154, + "grad_norm": 3.0, + "learning_rate": 4.2184907710428375e-05, + "loss": 0.4885, + "step": 5944 + }, + { + "epoch": 0.26108433867196507, + "grad_norm": 2.421875, + "learning_rate": 4.217988242384225e-05, + "loss": 0.468, + "step": 5946 + }, + { + "epoch": 0.26117215715116854, + "grad_norm": 2.703125, + "learning_rate": 4.217485582161661e-05, + "loss": 0.4635, + "step": 5948 + }, + { + "epoch": 0.261259975630372, + "grad_norm": 2.625, + "learning_rate": 4.2169827904136396e-05, + "loss": 0.4728, + "step": 5950 + }, + { + "epoch": 0.2613477941095755, + "grad_norm": 2.640625, + "learning_rate": 4.216479867178664e-05, + "loss": 0.4572, + "step": 5952 + }, + { + "epoch": 0.261435612588779, + "grad_norm": 2.5, + "learning_rate": 4.215976812495249e-05, + "loss": 0.4628, + "step": 5954 + }, + { + "epoch": 0.2615234310679825, + "grad_norm": 2.5625, + "learning_rate": 4.2154736264019184e-05, + "loss": 0.444, + "step": 5956 + }, + { + "epoch": 0.26161124954718595, + "grad_norm": 2.53125, + "learning_rate": 4.214970308937206e-05, + "loss": 0.4545, + "step": 5958 + }, + { + "epoch": 0.2616990680263894, + "grad_norm": 2.25, + "learning_rate": 4.2144668601396566e-05, + "loss": 0.4274, + "step": 5960 + }, + { + "epoch": 0.26178688650559295, + "grad_norm": 2.59375, + "learning_rate": 4.2139632800478234e-05, + "loss": 0.4839, + "step": 5962 + }, + { + "epoch": 0.2618747049847964, + "grad_norm": 2.6875, + "learning_rate": 4.213459568700273e-05, + "loss": 0.4527, + "step": 5964 + }, + { + "epoch": 0.2619625234639999, + "grad_norm": 2.9375, + "learning_rate": 4.212955726135577e-05, + "loss": 0.453, + "step": 5966 + }, + { + "epoch": 0.2620503419432034, + "grad_norm": 2.09375, + "learning_rate": 4.21245175239232e-05, + "loss": 0.4747, + "step": 5968 + }, + { + "epoch": 0.2621381604224069, + "grad_norm": 2.25, + "learning_rate": 4.211947647509098e-05, + "loss": 0.4459, + "step": 5970 + }, + { + "epoch": 0.26222597890161037, + "grad_norm": 2.171875, + "learning_rate": 4.211443411524515e-05, + "loss": 0.4452, + "step": 5972 + }, + { + "epoch": 0.26231379738081384, + "grad_norm": 2.296875, + "learning_rate": 4.210939044477185e-05, + "loss": 0.4412, + "step": 5974 + }, + { + "epoch": 0.26240161586001737, + "grad_norm": 2.421875, + "learning_rate": 4.210434546405733e-05, + "loss": 0.4675, + "step": 5976 + }, + { + "epoch": 0.26248943433922084, + "grad_norm": 2.171875, + "learning_rate": 4.2099299173487936e-05, + "loss": 0.4373, + "step": 5978 + }, + { + "epoch": 0.2625772528184243, + "grad_norm": 2.1875, + "learning_rate": 4.209425157345011e-05, + "loss": 0.4514, + "step": 5980 + }, + { + "epoch": 0.2626650712976278, + "grad_norm": 2.421875, + "learning_rate": 4.20892026643304e-05, + "loss": 0.4374, + "step": 5982 + }, + { + "epoch": 0.2627528897768313, + "grad_norm": 2.5, + "learning_rate": 4.208415244651546e-05, + "loss": 0.4501, + "step": 5984 + }, + { + "epoch": 0.2628407082560348, + "grad_norm": 2.28125, + "learning_rate": 4.207910092039202e-05, + "loss": 0.4409, + "step": 5986 + }, + { + "epoch": 0.26292852673523825, + "grad_norm": 2.921875, + "learning_rate": 4.207404808634694e-05, + "loss": 0.4469, + "step": 5988 + }, + { + "epoch": 0.2630163452144417, + "grad_norm": 2.109375, + "learning_rate": 4.206899394476717e-05, + "loss": 0.4775, + "step": 5990 + }, + { + "epoch": 0.26310416369364525, + "grad_norm": 2.234375, + "learning_rate": 4.2063938496039746e-05, + "loss": 0.4512, + "step": 5992 + }, + { + "epoch": 0.2631919821728487, + "grad_norm": 2.40625, + "learning_rate": 4.2058881740551825e-05, + "loss": 0.4476, + "step": 5994 + }, + { + "epoch": 0.2632798006520522, + "grad_norm": 2.421875, + "learning_rate": 4.2053823678690655e-05, + "loss": 0.4599, + "step": 5996 + }, + { + "epoch": 0.26336761913125567, + "grad_norm": 2.09375, + "learning_rate": 4.2048764310843566e-05, + "loss": 0.4945, + "step": 5998 + }, + { + "epoch": 0.2634554376104592, + "grad_norm": 2.15625, + "learning_rate": 4.204370363739803e-05, + "loss": 0.4648, + "step": 6000 + }, + { + "epoch": 0.26354325608966267, + "grad_norm": 2.46875, + "learning_rate": 4.203864165874158e-05, + "loss": 0.4385, + "step": 6002 + }, + { + "epoch": 0.26363107456886614, + "grad_norm": 2.125, + "learning_rate": 4.203357837526187e-05, + "loss": 0.4218, + "step": 6004 + }, + { + "epoch": 0.2637188930480696, + "grad_norm": 2.09375, + "learning_rate": 4.202851378734664e-05, + "loss": 0.4294, + "step": 6006 + }, + { + "epoch": 0.26380671152727314, + "grad_norm": 2.078125, + "learning_rate": 4.2023447895383746e-05, + "loss": 0.4554, + "step": 6008 + }, + { + "epoch": 0.2638945300064766, + "grad_norm": 2.375, + "learning_rate": 4.201838069976114e-05, + "loss": 0.4748, + "step": 6010 + }, + { + "epoch": 0.2639823484856801, + "grad_norm": 2.046875, + "learning_rate": 4.201331220086685e-05, + "loss": 0.4362, + "step": 6012 + }, + { + "epoch": 0.2640701669648836, + "grad_norm": 2.265625, + "learning_rate": 4.2008242399089036e-05, + "loss": 0.4517, + "step": 6014 + }, + { + "epoch": 0.2641579854440871, + "grad_norm": 2.375, + "learning_rate": 4.200317129481594e-05, + "loss": 0.4676, + "step": 6016 + }, + { + "epoch": 0.26424580392329056, + "grad_norm": 2.125, + "learning_rate": 4.199809888843591e-05, + "loss": 0.4595, + "step": 6018 + }, + { + "epoch": 0.26433362240249403, + "grad_norm": 2.421875, + "learning_rate": 4.19930251803374e-05, + "loss": 0.434, + "step": 6020 + }, + { + "epoch": 0.26442144088169756, + "grad_norm": 2.28125, + "learning_rate": 4.198795017090894e-05, + "loss": 0.4706, + "step": 6022 + }, + { + "epoch": 0.26450925936090103, + "grad_norm": 2.21875, + "learning_rate": 4.1982873860539186e-05, + "loss": 0.4541, + "step": 6024 + }, + { + "epoch": 0.2645970778401045, + "grad_norm": 2.234375, + "learning_rate": 4.197779624961688e-05, + "loss": 0.4787, + "step": 6026 + }, + { + "epoch": 0.26468489631930797, + "grad_norm": 2.203125, + "learning_rate": 4.1972717338530865e-05, + "loss": 0.4714, + "step": 6028 + }, + { + "epoch": 0.2647727147985115, + "grad_norm": 2.03125, + "learning_rate": 4.196763712767009e-05, + "loss": 0.4667, + "step": 6030 + }, + { + "epoch": 0.26486053327771497, + "grad_norm": 2.40625, + "learning_rate": 4.19625556174236e-05, + "loss": 0.4655, + "step": 6032 + }, + { + "epoch": 0.26494835175691844, + "grad_norm": 2.234375, + "learning_rate": 4.195747280818053e-05, + "loss": 0.4729, + "step": 6034 + }, + { + "epoch": 0.2650361702361219, + "grad_norm": 2.078125, + "learning_rate": 4.195238870033012e-05, + "loss": 0.4532, + "step": 6036 + }, + { + "epoch": 0.26512398871532544, + "grad_norm": 2.09375, + "learning_rate": 4.194730329426173e-05, + "loss": 0.4614, + "step": 6038 + }, + { + "epoch": 0.2652118071945289, + "grad_norm": 2.046875, + "learning_rate": 4.194221659036479e-05, + "loss": 0.4355, + "step": 6040 + }, + { + "epoch": 0.2652996256737324, + "grad_norm": 2.109375, + "learning_rate": 4.1937128589028845e-05, + "loss": 0.4523, + "step": 6042 + }, + { + "epoch": 0.26538744415293586, + "grad_norm": 2.265625, + "learning_rate": 4.193203929064353e-05, + "loss": 0.4551, + "step": 6044 + }, + { + "epoch": 0.2654752626321394, + "grad_norm": 2.140625, + "learning_rate": 4.192694869559859e-05, + "loss": 0.4766, + "step": 6046 + }, + { + "epoch": 0.26556308111134286, + "grad_norm": 2.328125, + "learning_rate": 4.1921856804283854e-05, + "loss": 0.4742, + "step": 6048 + }, + { + "epoch": 0.26565089959054633, + "grad_norm": 2.5, + "learning_rate": 4.191676361708927e-05, + "loss": 0.4476, + "step": 6050 + }, + { + "epoch": 0.2657387180697498, + "grad_norm": 2.5, + "learning_rate": 4.191166913440487e-05, + "loss": 0.4644, + "step": 6052 + }, + { + "epoch": 0.26582653654895333, + "grad_norm": 2.3125, + "learning_rate": 4.1906573356620795e-05, + "loss": 0.4646, + "step": 6054 + }, + { + "epoch": 0.2659143550281568, + "grad_norm": 2.46875, + "learning_rate": 4.190147628412729e-05, + "loss": 0.4577, + "step": 6056 + }, + { + "epoch": 0.2660021735073603, + "grad_norm": 2.28125, + "learning_rate": 4.189637791731467e-05, + "loss": 0.4707, + "step": 6058 + }, + { + "epoch": 0.26608999198656375, + "grad_norm": 2.109375, + "learning_rate": 4.1891278256573384e-05, + "loss": 0.4336, + "step": 6060 + }, + { + "epoch": 0.2661778104657673, + "grad_norm": 2.75, + "learning_rate": 4.188617730229395e-05, + "loss": 0.4459, + "step": 6062 + }, + { + "epoch": 0.26626562894497074, + "grad_norm": 2.171875, + "learning_rate": 4.188107505486702e-05, + "loss": 0.4513, + "step": 6064 + }, + { + "epoch": 0.2663534474241742, + "grad_norm": 2.15625, + "learning_rate": 4.187597151468331e-05, + "loss": 0.4408, + "step": 6066 + }, + { + "epoch": 0.26644126590337774, + "grad_norm": 2.421875, + "learning_rate": 4.187086668213366e-05, + "loss": 0.454, + "step": 6068 + }, + { + "epoch": 0.2665290843825812, + "grad_norm": 2.328125, + "learning_rate": 4.186576055760899e-05, + "loss": 0.4469, + "step": 6070 + }, + { + "epoch": 0.2666169028617847, + "grad_norm": 2.0625, + "learning_rate": 4.186065314150034e-05, + "loss": 0.4321, + "step": 6072 + }, + { + "epoch": 0.26670472134098816, + "grad_norm": 2.0625, + "learning_rate": 4.1855544434198826e-05, + "loss": 0.4323, + "step": 6074 + }, + { + "epoch": 0.2667925398201917, + "grad_norm": 2.3125, + "learning_rate": 4.185043443609569e-05, + "loss": 0.4487, + "step": 6076 + }, + { + "epoch": 0.26688035829939516, + "grad_norm": 2.171875, + "learning_rate": 4.184532314758223e-05, + "loss": 0.4653, + "step": 6078 + }, + { + "epoch": 0.26696817677859863, + "grad_norm": 2.1875, + "learning_rate": 4.184021056904989e-05, + "loss": 0.489, + "step": 6080 + }, + { + "epoch": 0.2670559952578021, + "grad_norm": 2.78125, + "learning_rate": 4.183509670089018e-05, + "loss": 0.4378, + "step": 6082 + }, + { + "epoch": 0.26714381373700563, + "grad_norm": 3.28125, + "learning_rate": 4.1829981543494746e-05, + "loss": 0.4667, + "step": 6084 + }, + { + "epoch": 0.2672316322162091, + "grad_norm": 3.875, + "learning_rate": 4.1824865097255284e-05, + "loss": 0.4541, + "step": 6086 + }, + { + "epoch": 0.2673194506954126, + "grad_norm": 2.15625, + "learning_rate": 4.181974736256362e-05, + "loss": 0.4202, + "step": 6088 + }, + { + "epoch": 0.26740726917461605, + "grad_norm": 2.515625, + "learning_rate": 4.181462833981167e-05, + "loss": 0.474, + "step": 6090 + }, + { + "epoch": 0.2674950876538196, + "grad_norm": 2.359375, + "learning_rate": 4.180950802939145e-05, + "loss": 0.4343, + "step": 6092 + }, + { + "epoch": 0.26758290613302305, + "grad_norm": 1.9921875, + "learning_rate": 4.1804386431695076e-05, + "loss": 0.4172, + "step": 6094 + }, + { + "epoch": 0.2676707246122265, + "grad_norm": 2.296875, + "learning_rate": 4.179926354711476e-05, + "loss": 0.442, + "step": 6096 + }, + { + "epoch": 0.26775854309143, + "grad_norm": 2.015625, + "learning_rate": 4.179413937604282e-05, + "loss": 0.4464, + "step": 6098 + }, + { + "epoch": 0.2678463615706335, + "grad_norm": 2.21875, + "learning_rate": 4.178901391887165e-05, + "loss": 0.4462, + "step": 6100 + }, + { + "epoch": 0.267934180049837, + "grad_norm": 2.296875, + "learning_rate": 4.178388717599378e-05, + "loss": 0.4389, + "step": 6102 + }, + { + "epoch": 0.26802199852904046, + "grad_norm": 2.859375, + "learning_rate": 4.17787591478018e-05, + "loss": 0.4595, + "step": 6104 + }, + { + "epoch": 0.26810981700824393, + "grad_norm": 2.640625, + "learning_rate": 4.177362983468843e-05, + "loss": 0.4419, + "step": 6106 + }, + { + "epoch": 0.26819763548744746, + "grad_norm": 2.671875, + "learning_rate": 4.1768499237046455e-05, + "loss": 0.4286, + "step": 6108 + }, + { + "epoch": 0.26828545396665093, + "grad_norm": 2.203125, + "learning_rate": 4.17633673552688e-05, + "loss": 0.4302, + "step": 6110 + }, + { + "epoch": 0.2683732724458544, + "grad_norm": 2.1875, + "learning_rate": 4.175823418974845e-05, + "loss": 0.4421, + "step": 6112 + }, + { + "epoch": 0.26846109092505793, + "grad_norm": 2.765625, + "learning_rate": 4.17530997408785e-05, + "loss": 0.471, + "step": 6114 + }, + { + "epoch": 0.2685489094042614, + "grad_norm": 2.890625, + "learning_rate": 4.174796400905216e-05, + "loss": 0.4329, + "step": 6116 + }, + { + "epoch": 0.2686367278834649, + "grad_norm": 2.421875, + "learning_rate": 4.1742826994662734e-05, + "loss": 0.4473, + "step": 6118 + }, + { + "epoch": 0.26872454636266835, + "grad_norm": 2.296875, + "learning_rate": 4.1737688698103595e-05, + "loss": 0.5077, + "step": 6120 + }, + { + "epoch": 0.2688123648418719, + "grad_norm": 2.296875, + "learning_rate": 4.173254911976824e-05, + "loss": 0.4531, + "step": 6122 + }, + { + "epoch": 0.26890018332107535, + "grad_norm": 2.453125, + "learning_rate": 4.172740826005027e-05, + "loss": 0.4318, + "step": 6124 + }, + { + "epoch": 0.2689880018002788, + "grad_norm": 2.40625, + "learning_rate": 4.1722266119343357e-05, + "loss": 0.4643, + "step": 6126 + }, + { + "epoch": 0.2690758202794823, + "grad_norm": 2.34375, + "learning_rate": 4.1717122698041296e-05, + "loss": 0.4479, + "step": 6128 + }, + { + "epoch": 0.2691636387586858, + "grad_norm": 2.015625, + "learning_rate": 4.1711977996537976e-05, + "loss": 0.4423, + "step": 6130 + }, + { + "epoch": 0.2692514572378893, + "grad_norm": 2.0, + "learning_rate": 4.170683201522737e-05, + "loss": 0.4564, + "step": 6132 + }, + { + "epoch": 0.26933927571709276, + "grad_norm": 2.0625, + "learning_rate": 4.170168475450357e-05, + "loss": 0.4654, + "step": 6134 + }, + { + "epoch": 0.26942709419629624, + "grad_norm": 2.90625, + "learning_rate": 4.1696536214760746e-05, + "loss": 0.4642, + "step": 6136 + }, + { + "epoch": 0.26951491267549976, + "grad_norm": 2.390625, + "learning_rate": 4.169138639639317e-05, + "loss": 0.452, + "step": 6138 + }, + { + "epoch": 0.26960273115470323, + "grad_norm": 2.28125, + "learning_rate": 4.1686235299795226e-05, + "loss": 0.4645, + "step": 6140 + }, + { + "epoch": 0.2696905496339067, + "grad_norm": 2.15625, + "learning_rate": 4.168108292536139e-05, + "loss": 0.4592, + "step": 6142 + }, + { + "epoch": 0.2697783681131102, + "grad_norm": 2.34375, + "learning_rate": 4.167592927348622e-05, + "loss": 0.4458, + "step": 6144 + }, + { + "epoch": 0.2698661865923137, + "grad_norm": 2.203125, + "learning_rate": 4.167077434456439e-05, + "loss": 0.4366, + "step": 6146 + }, + { + "epoch": 0.2699540050715172, + "grad_norm": 2.1875, + "learning_rate": 4.166561813899066e-05, + "loss": 0.4384, + "step": 6148 + }, + { + "epoch": 0.27004182355072065, + "grad_norm": 2.265625, + "learning_rate": 4.16604606571599e-05, + "loss": 0.4309, + "step": 6150 + }, + { + "epoch": 0.2701296420299241, + "grad_norm": 2.234375, + "learning_rate": 4.165530189946707e-05, + "loss": 0.4639, + "step": 6152 + }, + { + "epoch": 0.27021746050912765, + "grad_norm": 2.203125, + "learning_rate": 4.1650141866307224e-05, + "loss": 0.4496, + "step": 6154 + }, + { + "epoch": 0.2703052789883311, + "grad_norm": 2.46875, + "learning_rate": 4.164498055807553e-05, + "loss": 0.4343, + "step": 6156 + }, + { + "epoch": 0.2703930974675346, + "grad_norm": 2.734375, + "learning_rate": 4.163981797516723e-05, + "loss": 0.4438, + "step": 6158 + }, + { + "epoch": 0.27048091594673807, + "grad_norm": 3.28125, + "learning_rate": 4.163465411797768e-05, + "loss": 0.4517, + "step": 6160 + }, + { + "epoch": 0.2705687344259416, + "grad_norm": 3.484375, + "learning_rate": 4.162948898690233e-05, + "loss": 0.4539, + "step": 6162 + }, + { + "epoch": 0.27065655290514506, + "grad_norm": 2.5, + "learning_rate": 4.162432258233673e-05, + "loss": 0.4514, + "step": 6164 + }, + { + "epoch": 0.27074437138434854, + "grad_norm": 2.828125, + "learning_rate": 4.1619154904676525e-05, + "loss": 0.4465, + "step": 6166 + }, + { + "epoch": 0.27083218986355206, + "grad_norm": 2.71875, + "learning_rate": 4.1613985954317446e-05, + "loss": 0.44, + "step": 6168 + }, + { + "epoch": 0.27092000834275554, + "grad_norm": 2.0625, + "learning_rate": 4.1608815731655345e-05, + "loss": 0.4482, + "step": 6170 + }, + { + "epoch": 0.271007826821959, + "grad_norm": 2.078125, + "learning_rate": 4.160364423708615e-05, + "loss": 0.4504, + "step": 6172 + }, + { + "epoch": 0.2710956453011625, + "grad_norm": 2.421875, + "learning_rate": 4.15984714710059e-05, + "loss": 0.4325, + "step": 6174 + }, + { + "epoch": 0.271183463780366, + "grad_norm": 2.34375, + "learning_rate": 4.159329743381072e-05, + "loss": 0.4344, + "step": 6176 + }, + { + "epoch": 0.2712712822595695, + "grad_norm": 2.1875, + "learning_rate": 4.1588122125896854e-05, + "loss": 0.4401, + "step": 6178 + }, + { + "epoch": 0.27135910073877295, + "grad_norm": 2.171875, + "learning_rate": 4.1582945547660625e-05, + "loss": 0.4931, + "step": 6180 + }, + { + "epoch": 0.2714469192179764, + "grad_norm": 2.078125, + "learning_rate": 4.157776769949844e-05, + "loss": 0.4262, + "step": 6182 + }, + { + "epoch": 0.27153473769717995, + "grad_norm": 2.140625, + "learning_rate": 4.157258858180683e-05, + "loss": 0.4675, + "step": 6184 + }, + { + "epoch": 0.2716225561763834, + "grad_norm": 2.3125, + "learning_rate": 4.156740819498242e-05, + "loss": 0.4665, + "step": 6186 + }, + { + "epoch": 0.2717103746555869, + "grad_norm": 2.0625, + "learning_rate": 4.156222653942191e-05, + "loss": 0.4537, + "step": 6188 + }, + { + "epoch": 0.27179819313479037, + "grad_norm": 2.234375, + "learning_rate": 4.1557043615522125e-05, + "loss": 0.4607, + "step": 6190 + }, + { + "epoch": 0.2718860116139939, + "grad_norm": 2.21875, + "learning_rate": 4.155185942367997e-05, + "loss": 0.4709, + "step": 6192 + }, + { + "epoch": 0.27197383009319737, + "grad_norm": 2.25, + "learning_rate": 4.154667396429246e-05, + "loss": 0.4343, + "step": 6194 + }, + { + "epoch": 0.27206164857240084, + "grad_norm": 2.296875, + "learning_rate": 4.1541487237756686e-05, + "loss": 0.4345, + "step": 6196 + }, + { + "epoch": 0.2721494670516043, + "grad_norm": 2.171875, + "learning_rate": 4.153629924446986e-05, + "loss": 0.457, + "step": 6198 + }, + { + "epoch": 0.27223728553080784, + "grad_norm": 2.109375, + "learning_rate": 4.153110998482926e-05, + "loss": 0.4675, + "step": 6200 + }, + { + "epoch": 0.2723251040100113, + "grad_norm": 2.578125, + "learning_rate": 4.152591945923231e-05, + "loss": 0.4952, + "step": 6202 + }, + { + "epoch": 0.2724129224892148, + "grad_norm": 2.125, + "learning_rate": 4.152072766807648e-05, + "loss": 0.46, + "step": 6204 + }, + { + "epoch": 0.27250074096841825, + "grad_norm": 2.71875, + "learning_rate": 4.151553461175936e-05, + "loss": 0.4656, + "step": 6206 + }, + { + "epoch": 0.2725885594476218, + "grad_norm": 2.421875, + "learning_rate": 4.151034029067864e-05, + "loss": 0.4346, + "step": 6208 + }, + { + "epoch": 0.27267637792682525, + "grad_norm": 2.359375, + "learning_rate": 4.1505144705232114e-05, + "loss": 0.4237, + "step": 6210 + }, + { + "epoch": 0.2727641964060287, + "grad_norm": 1.9296875, + "learning_rate": 4.149994785581764e-05, + "loss": 0.4196, + "step": 6212 + }, + { + "epoch": 0.27285201488523225, + "grad_norm": 2.03125, + "learning_rate": 4.149474974283321e-05, + "loss": 0.4359, + "step": 6214 + }, + { + "epoch": 0.2729398333644357, + "grad_norm": 2.265625, + "learning_rate": 4.148955036667689e-05, + "loss": 0.4525, + "step": 6216 + }, + { + "epoch": 0.2730276518436392, + "grad_norm": 2.15625, + "learning_rate": 4.148434972774685e-05, + "loss": 0.4554, + "step": 6218 + }, + { + "epoch": 0.27311547032284267, + "grad_norm": 2.09375, + "learning_rate": 4.147914782644134e-05, + "loss": 0.4339, + "step": 6220 + }, + { + "epoch": 0.2732032888020462, + "grad_norm": 2.265625, + "learning_rate": 4.147394466315876e-05, + "loss": 0.4376, + "step": 6222 + }, + { + "epoch": 0.27329110728124967, + "grad_norm": 2.484375, + "learning_rate": 4.146874023829754e-05, + "loss": 0.4694, + "step": 6224 + }, + { + "epoch": 0.27337892576045314, + "grad_norm": 2.125, + "learning_rate": 4.146353455225625e-05, + "loss": 0.4433, + "step": 6226 + }, + { + "epoch": 0.2734667442396566, + "grad_norm": 2.125, + "learning_rate": 4.145832760543353e-05, + "loss": 0.4423, + "step": 6228 + }, + { + "epoch": 0.27355456271886014, + "grad_norm": 2.15625, + "learning_rate": 4.1453119398228146e-05, + "loss": 0.4558, + "step": 6230 + }, + { + "epoch": 0.2736423811980636, + "grad_norm": 2.140625, + "learning_rate": 4.144790993103893e-05, + "loss": 0.4371, + "step": 6232 + }, + { + "epoch": 0.2737301996772671, + "grad_norm": 2.359375, + "learning_rate": 4.144269920426482e-05, + "loss": 0.4518, + "step": 6234 + }, + { + "epoch": 0.27381801815647056, + "grad_norm": 2.40625, + "learning_rate": 4.1437487218304875e-05, + "loss": 0.4369, + "step": 6236 + }, + { + "epoch": 0.2739058366356741, + "grad_norm": 2.671875, + "learning_rate": 4.1432273973558215e-05, + "loss": 0.4403, + "step": 6238 + }, + { + "epoch": 0.27399365511487755, + "grad_norm": 2.28125, + "learning_rate": 4.142705947042408e-05, + "loss": 0.4483, + "step": 6240 + }, + { + "epoch": 0.274081473594081, + "grad_norm": 2.1875, + "learning_rate": 4.142184370930178e-05, + "loss": 0.4426, + "step": 6242 + }, + { + "epoch": 0.2741692920732845, + "grad_norm": 1.890625, + "learning_rate": 4.141662669059076e-05, + "loss": 0.4282, + "step": 6244 + }, + { + "epoch": 0.274257110552488, + "grad_norm": 2.09375, + "learning_rate": 4.1411408414690536e-05, + "loss": 0.4406, + "step": 6246 + }, + { + "epoch": 0.2743449290316915, + "grad_norm": 2.328125, + "learning_rate": 4.140618888200072e-05, + "loss": 0.4505, + "step": 6248 + }, + { + "epoch": 0.27443274751089497, + "grad_norm": 2.25, + "learning_rate": 4.140096809292102e-05, + "loss": 0.4422, + "step": 6250 + }, + { + "epoch": 0.27452056599009844, + "grad_norm": 2.15625, + "learning_rate": 4.1395746047851256e-05, + "loss": 0.4614, + "step": 6252 + }, + { + "epoch": 0.27460838446930197, + "grad_norm": 2.3125, + "learning_rate": 4.139052274719133e-05, + "loss": 0.4416, + "step": 6254 + }, + { + "epoch": 0.27469620294850544, + "grad_norm": 2.21875, + "learning_rate": 4.1385298191341246e-05, + "loss": 0.468, + "step": 6256 + }, + { + "epoch": 0.2747840214277089, + "grad_norm": 3.890625, + "learning_rate": 4.1380072380701097e-05, + "loss": 0.453, + "step": 6258 + }, + { + "epoch": 0.2748718399069124, + "grad_norm": 2.03125, + "learning_rate": 4.137484531567107e-05, + "loss": 0.4224, + "step": 6260 + }, + { + "epoch": 0.2749596583861159, + "grad_norm": 2.234375, + "learning_rate": 4.136961699665147e-05, + "loss": 0.4824, + "step": 6262 + }, + { + "epoch": 0.2750474768653194, + "grad_norm": 2.203125, + "learning_rate": 4.136438742404268e-05, + "loss": 0.4251, + "step": 6264 + }, + { + "epoch": 0.27513529534452286, + "grad_norm": 2.265625, + "learning_rate": 4.1359156598245176e-05, + "loss": 0.4623, + "step": 6266 + }, + { + "epoch": 0.2752231138237264, + "grad_norm": 2.3125, + "learning_rate": 4.1353924519659534e-05, + "loss": 0.4438, + "step": 6268 + }, + { + "epoch": 0.27531093230292986, + "grad_norm": 2.359375, + "learning_rate": 4.1348691188686436e-05, + "loss": 0.466, + "step": 6270 + }, + { + "epoch": 0.27539875078213333, + "grad_norm": 2.1875, + "learning_rate": 4.134345660572665e-05, + "loss": 0.4888, + "step": 6272 + }, + { + "epoch": 0.2754865692613368, + "grad_norm": 2.28125, + "learning_rate": 4.1338220771181036e-05, + "loss": 0.4234, + "step": 6274 + }, + { + "epoch": 0.2755743877405403, + "grad_norm": 2.5, + "learning_rate": 4.1332983685450556e-05, + "loss": 0.4546, + "step": 6276 + }, + { + "epoch": 0.2756622062197438, + "grad_norm": 2.171875, + "learning_rate": 4.132774534893628e-05, + "loss": 0.4614, + "step": 6278 + }, + { + "epoch": 0.27575002469894727, + "grad_norm": 2.15625, + "learning_rate": 4.132250576203934e-05, + "loss": 0.4509, + "step": 6280 + }, + { + "epoch": 0.27583784317815074, + "grad_norm": 2.21875, + "learning_rate": 4.131726492516099e-05, + "loss": 0.4766, + "step": 6282 + }, + { + "epoch": 0.27592566165735427, + "grad_norm": 2.203125, + "learning_rate": 4.1312022838702595e-05, + "loss": 0.4546, + "step": 6284 + }, + { + "epoch": 0.27601348013655774, + "grad_norm": 2.5625, + "learning_rate": 4.1306779503065585e-05, + "loss": 0.4589, + "step": 6286 + }, + { + "epoch": 0.2761012986157612, + "grad_norm": 2.296875, + "learning_rate": 4.130153491865148e-05, + "loss": 0.4231, + "step": 6288 + }, + { + "epoch": 0.2761891170949647, + "grad_norm": 2.0625, + "learning_rate": 4.1296289085861924e-05, + "loss": 0.4614, + "step": 6290 + }, + { + "epoch": 0.2762769355741682, + "grad_norm": 2.34375, + "learning_rate": 4.129104200509865e-05, + "loss": 0.4485, + "step": 6292 + }, + { + "epoch": 0.2763647540533717, + "grad_norm": 2.390625, + "learning_rate": 4.128579367676346e-05, + "loss": 0.4608, + "step": 6294 + }, + { + "epoch": 0.27645257253257516, + "grad_norm": 2.625, + "learning_rate": 4.12805441012583e-05, + "loss": 0.4491, + "step": 6296 + }, + { + "epoch": 0.27654039101177863, + "grad_norm": 2.203125, + "learning_rate": 4.1275293278985163e-05, + "loss": 0.4294, + "step": 6298 + }, + { + "epoch": 0.27662820949098216, + "grad_norm": 2.640625, + "learning_rate": 4.127004121034617e-05, + "loss": 0.4214, + "step": 6300 + }, + { + "epoch": 0.27671602797018563, + "grad_norm": 2.453125, + "learning_rate": 4.126478789574352e-05, + "loss": 0.4354, + "step": 6302 + }, + { + "epoch": 0.2768038464493891, + "grad_norm": 2.25, + "learning_rate": 4.1259533335579516e-05, + "loss": 0.4204, + "step": 6304 + }, + { + "epoch": 0.2768916649285926, + "grad_norm": 2.140625, + "learning_rate": 4.125427753025655e-05, + "loss": 0.439, + "step": 6306 + }, + { + "epoch": 0.2769794834077961, + "grad_norm": 2.421875, + "learning_rate": 4.12490204801771e-05, + "loss": 0.4297, + "step": 6308 + }, + { + "epoch": 0.2770673018869996, + "grad_norm": 2.234375, + "learning_rate": 4.1243762185743784e-05, + "loss": 0.4689, + "step": 6310 + }, + { + "epoch": 0.27715512036620304, + "grad_norm": 2.21875, + "learning_rate": 4.123850264735926e-05, + "loss": 0.4831, + "step": 6312 + }, + { + "epoch": 0.2772429388454066, + "grad_norm": 2.234375, + "learning_rate": 4.123324186542631e-05, + "loss": 0.4377, + "step": 6314 + }, + { + "epoch": 0.27733075732461004, + "grad_norm": 2.4375, + "learning_rate": 4.1227979840347806e-05, + "loss": 0.4299, + "step": 6316 + }, + { + "epoch": 0.2774185758038135, + "grad_norm": 2.203125, + "learning_rate": 4.1222716572526725e-05, + "loss": 0.4539, + "step": 6318 + }, + { + "epoch": 0.277506394283017, + "grad_norm": 2.296875, + "learning_rate": 4.121745206236611e-05, + "loss": 0.4649, + "step": 6320 + }, + { + "epoch": 0.2775942127622205, + "grad_norm": 2.0, + "learning_rate": 4.121218631026913e-05, + "loss": 0.4221, + "step": 6322 + }, + { + "epoch": 0.277682031241424, + "grad_norm": 2.390625, + "learning_rate": 4.120691931663904e-05, + "loss": 0.4379, + "step": 6324 + }, + { + "epoch": 0.27776984972062746, + "grad_norm": 2.21875, + "learning_rate": 4.120165108187918e-05, + "loss": 0.4545, + "step": 6326 + }, + { + "epoch": 0.27785766819983093, + "grad_norm": 2.203125, + "learning_rate": 4.1196381606393e-05, + "loss": 0.4437, + "step": 6328 + }, + { + "epoch": 0.27794548667903446, + "grad_norm": 2.3125, + "learning_rate": 4.119111089058403e-05, + "loss": 0.4657, + "step": 6330 + }, + { + "epoch": 0.27803330515823793, + "grad_norm": 2.265625, + "learning_rate": 4.118583893485592e-05, + "loss": 0.4183, + "step": 6332 + }, + { + "epoch": 0.2781211236374414, + "grad_norm": 2.34375, + "learning_rate": 4.1180565739612365e-05, + "loss": 0.4495, + "step": 6334 + }, + { + "epoch": 0.2782089421166449, + "grad_norm": 2.1875, + "learning_rate": 4.117529130525721e-05, + "loss": 0.4459, + "step": 6336 + }, + { + "epoch": 0.2782967605958484, + "grad_norm": 2.46875, + "learning_rate": 4.117001563219438e-05, + "loss": 0.4392, + "step": 6338 + }, + { + "epoch": 0.2783845790750519, + "grad_norm": 2.078125, + "learning_rate": 4.1164738720827864e-05, + "loss": 0.4609, + "step": 6340 + }, + { + "epoch": 0.27847239755425535, + "grad_norm": 2.59375, + "learning_rate": 4.1159460571561795e-05, + "loss": 0.451, + "step": 6342 + }, + { + "epoch": 0.2785602160334588, + "grad_norm": 2.71875, + "learning_rate": 4.1154181184800344e-05, + "loss": 0.4578, + "step": 6344 + }, + { + "epoch": 0.27864803451266235, + "grad_norm": 2.65625, + "learning_rate": 4.114890056094784e-05, + "loss": 0.4411, + "step": 6346 + }, + { + "epoch": 0.2787358529918658, + "grad_norm": 2.65625, + "learning_rate": 4.114361870040866e-05, + "loss": 0.4315, + "step": 6348 + }, + { + "epoch": 0.2788236714710693, + "grad_norm": 2.53125, + "learning_rate": 4.1138335603587284e-05, + "loss": 0.4445, + "step": 6350 + }, + { + "epoch": 0.27891148995027276, + "grad_norm": 2.6875, + "learning_rate": 4.11330512708883e-05, + "loss": 0.4583, + "step": 6352 + }, + { + "epoch": 0.2789993084294763, + "grad_norm": 2.375, + "learning_rate": 4.112776570271639e-05, + "loss": 0.4679, + "step": 6354 + }, + { + "epoch": 0.27908712690867976, + "grad_norm": 2.9375, + "learning_rate": 4.11224788994763e-05, + "loss": 0.4387, + "step": 6356 + }, + { + "epoch": 0.27917494538788323, + "grad_norm": 2.734375, + "learning_rate": 4.111719086157293e-05, + "loss": 0.4503, + "step": 6358 + }, + { + "epoch": 0.2792627638670867, + "grad_norm": 2.78125, + "learning_rate": 4.111190158941121e-05, + "loss": 0.4793, + "step": 6360 + }, + { + "epoch": 0.27935058234629023, + "grad_norm": 3.171875, + "learning_rate": 4.11066110833962e-05, + "loss": 0.4142, + "step": 6362 + }, + { + "epoch": 0.2794384008254937, + "grad_norm": 3.265625, + "learning_rate": 4.1101319343933064e-05, + "loss": 0.4495, + "step": 6364 + }, + { + "epoch": 0.2795262193046972, + "grad_norm": 3.15625, + "learning_rate": 4.109602637142703e-05, + "loss": 0.4436, + "step": 6366 + }, + { + "epoch": 0.2796140377839007, + "grad_norm": 2.703125, + "learning_rate": 4.109073216628343e-05, + "loss": 0.4644, + "step": 6368 + }, + { + "epoch": 0.2797018562631042, + "grad_norm": 2.34375, + "learning_rate": 4.108543672890771e-05, + "loss": 0.461, + "step": 6370 + }, + { + "epoch": 0.27978967474230765, + "grad_norm": 2.390625, + "learning_rate": 4.108014005970538e-05, + "loss": 0.4372, + "step": 6372 + }, + { + "epoch": 0.2798774932215111, + "grad_norm": 2.28125, + "learning_rate": 4.107484215908208e-05, + "loss": 0.4156, + "step": 6374 + }, + { + "epoch": 0.27996531170071465, + "grad_norm": 2.515625, + "learning_rate": 4.106954302744351e-05, + "loss": 0.4507, + "step": 6376 + }, + { + "epoch": 0.2800531301799181, + "grad_norm": 2.265625, + "learning_rate": 4.1064242665195486e-05, + "loss": 0.4574, + "step": 6378 + }, + { + "epoch": 0.2801409486591216, + "grad_norm": 3.015625, + "learning_rate": 4.105894107274391e-05, + "loss": 0.4322, + "step": 6380 + }, + { + "epoch": 0.28022876713832506, + "grad_norm": 2.203125, + "learning_rate": 4.105363825049476e-05, + "loss": 0.4504, + "step": 6382 + }, + { + "epoch": 0.2803165856175286, + "grad_norm": 2.109375, + "learning_rate": 4.104833419885417e-05, + "loss": 0.4319, + "step": 6384 + }, + { + "epoch": 0.28040440409673206, + "grad_norm": 2.265625, + "learning_rate": 4.104302891822828e-05, + "loss": 0.4637, + "step": 6386 + }, + { + "epoch": 0.28049222257593553, + "grad_norm": 2.125, + "learning_rate": 4.1037722409023396e-05, + "loss": 0.4497, + "step": 6388 + }, + { + "epoch": 0.280580041055139, + "grad_norm": 2.296875, + "learning_rate": 4.1032414671645894e-05, + "loss": 0.4482, + "step": 6390 + }, + { + "epoch": 0.28066785953434253, + "grad_norm": 2.75, + "learning_rate": 4.102710570650222e-05, + "loss": 0.457, + "step": 6392 + }, + { + "epoch": 0.280755678013546, + "grad_norm": 2.96875, + "learning_rate": 4.102179551399895e-05, + "loss": 0.4158, + "step": 6394 + }, + { + "epoch": 0.2808434964927495, + "grad_norm": 2.734375, + "learning_rate": 4.1016484094542754e-05, + "loss": 0.4455, + "step": 6396 + }, + { + "epoch": 0.28093131497195295, + "grad_norm": 2.6875, + "learning_rate": 4.101117144854035e-05, + "loss": 0.433, + "step": 6398 + }, + { + "epoch": 0.2810191334511565, + "grad_norm": 2.515625, + "learning_rate": 4.10058575763986e-05, + "loss": 0.4576, + "step": 6400 + }, + { + "epoch": 0.28110695193035995, + "grad_norm": 2.375, + "learning_rate": 4.100054247852445e-05, + "loss": 0.4346, + "step": 6402 + }, + { + "epoch": 0.2811947704095634, + "grad_norm": 2.390625, + "learning_rate": 4.099522615532491e-05, + "loss": 0.4536, + "step": 6404 + }, + { + "epoch": 0.2812825888887669, + "grad_norm": 2.453125, + "learning_rate": 4.098990860720712e-05, + "loss": 0.4584, + "step": 6406 + }, + { + "epoch": 0.2813704073679704, + "grad_norm": 2.25, + "learning_rate": 4.09845898345783e-05, + "loss": 0.4555, + "step": 6408 + }, + { + "epoch": 0.2814582258471739, + "grad_norm": 2.34375, + "learning_rate": 4.0979269837845754e-05, + "loss": 0.4722, + "step": 6410 + }, + { + "epoch": 0.28154604432637736, + "grad_norm": 2.328125, + "learning_rate": 4.09739486174169e-05, + "loss": 0.4457, + "step": 6412 + }, + { + "epoch": 0.28163386280558084, + "grad_norm": 2.125, + "learning_rate": 4.0968626173699234e-05, + "loss": 0.4123, + "step": 6414 + }, + { + "epoch": 0.28172168128478436, + "grad_norm": 2.0625, + "learning_rate": 4.0963302507100336e-05, + "loss": 0.4087, + "step": 6416 + }, + { + "epoch": 0.28180949976398784, + "grad_norm": 2.34375, + "learning_rate": 4.095797761802791e-05, + "loss": 0.4396, + "step": 6418 + }, + { + "epoch": 0.2818973182431913, + "grad_norm": 2.03125, + "learning_rate": 4.0952651506889735e-05, + "loss": 0.4509, + "step": 6420 + }, + { + "epoch": 0.28198513672239484, + "grad_norm": 2.140625, + "learning_rate": 4.094732417409368e-05, + "loss": 0.4242, + "step": 6422 + }, + { + "epoch": 0.2820729552015983, + "grad_norm": 2.0, + "learning_rate": 4.094199562004772e-05, + "loss": 0.428, + "step": 6424 + }, + { + "epoch": 0.2821607736808018, + "grad_norm": 2.328125, + "learning_rate": 4.0936665845159915e-05, + "loss": 0.4399, + "step": 6426 + }, + { + "epoch": 0.28224859216000525, + "grad_norm": 2.15625, + "learning_rate": 4.0931334849838414e-05, + "loss": 0.4446, + "step": 6428 + }, + { + "epoch": 0.2823364106392088, + "grad_norm": 2.296875, + "learning_rate": 4.0926002634491476e-05, + "loss": 0.4622, + "step": 6430 + }, + { + "epoch": 0.28242422911841225, + "grad_norm": 2.140625, + "learning_rate": 4.092066919952743e-05, + "loss": 0.4531, + "step": 6432 + }, + { + "epoch": 0.2825120475976157, + "grad_norm": 2.140625, + "learning_rate": 4.0915334545354734e-05, + "loss": 0.4387, + "step": 6434 + }, + { + "epoch": 0.2825998660768192, + "grad_norm": 2.09375, + "learning_rate": 4.0909998672381897e-05, + "loss": 0.4392, + "step": 6436 + }, + { + "epoch": 0.2826876845560227, + "grad_norm": 2.359375, + "learning_rate": 4.090466158101754e-05, + "loss": 0.4385, + "step": 6438 + }, + { + "epoch": 0.2827755030352262, + "grad_norm": 2.109375, + "learning_rate": 4.0899323271670395e-05, + "loss": 0.4336, + "step": 6440 + }, + { + "epoch": 0.28286332151442967, + "grad_norm": 2.390625, + "learning_rate": 4.0893983744749265e-05, + "loss": 0.4414, + "step": 6442 + }, + { + "epoch": 0.28295113999363314, + "grad_norm": 2.15625, + "learning_rate": 4.088864300066304e-05, + "loss": 0.4467, + "step": 6444 + }, + { + "epoch": 0.28303895847283667, + "grad_norm": 2.09375, + "learning_rate": 4.088330103982074e-05, + "loss": 0.4311, + "step": 6446 + }, + { + "epoch": 0.28312677695204014, + "grad_norm": 2.546875, + "learning_rate": 4.0877957862631425e-05, + "loss": 0.4265, + "step": 6448 + }, + { + "epoch": 0.2832145954312436, + "grad_norm": 2.84375, + "learning_rate": 4.087261346950429e-05, + "loss": 0.4405, + "step": 6450 + }, + { + "epoch": 0.2833024139104471, + "grad_norm": 2.1875, + "learning_rate": 4.086726786084862e-05, + "loss": 0.42, + "step": 6452 + }, + { + "epoch": 0.2833902323896506, + "grad_norm": 2.703125, + "learning_rate": 4.086192103707377e-05, + "loss": 0.4197, + "step": 6454 + }, + { + "epoch": 0.2834780508688541, + "grad_norm": 2.546875, + "learning_rate": 4.0856572998589206e-05, + "loss": 0.4347, + "step": 6456 + }, + { + "epoch": 0.28356586934805755, + "grad_norm": 2.578125, + "learning_rate": 4.0851223745804476e-05, + "loss": 0.4318, + "step": 6458 + }, + { + "epoch": 0.283653687827261, + "grad_norm": 2.25, + "learning_rate": 4.0845873279129246e-05, + "loss": 0.4353, + "step": 6460 + }, + { + "epoch": 0.28374150630646455, + "grad_norm": 2.109375, + "learning_rate": 4.0840521598973223e-05, + "loss": 0.4395, + "step": 6462 + }, + { + "epoch": 0.283829324785668, + "grad_norm": 2.203125, + "learning_rate": 4.083516870574626e-05, + "loss": 0.4345, + "step": 6464 + }, + { + "epoch": 0.2839171432648715, + "grad_norm": 2.328125, + "learning_rate": 4.0829814599858296e-05, + "loss": 0.4437, + "step": 6466 + }, + { + "epoch": 0.284004961744075, + "grad_norm": 2.140625, + "learning_rate": 4.0824459281719326e-05, + "loss": 0.4519, + "step": 6468 + }, + { + "epoch": 0.2840927802232785, + "grad_norm": 2.015625, + "learning_rate": 4.0819102751739466e-05, + "loss": 0.4521, + "step": 6470 + }, + { + "epoch": 0.28418059870248197, + "grad_norm": 2.171875, + "learning_rate": 4.081374501032894e-05, + "loss": 0.462, + "step": 6472 + }, + { + "epoch": 0.28426841718168544, + "grad_norm": 2.171875, + "learning_rate": 4.080838605789802e-05, + "loss": 0.4368, + "step": 6474 + }, + { + "epoch": 0.28435623566088897, + "grad_norm": 2.0625, + "learning_rate": 4.08030258948571e-05, + "loss": 0.4292, + "step": 6476 + }, + { + "epoch": 0.28444405414009244, + "grad_norm": 2.078125, + "learning_rate": 4.0797664521616684e-05, + "loss": 0.4388, + "step": 6478 + }, + { + "epoch": 0.2845318726192959, + "grad_norm": 2.171875, + "learning_rate": 4.079230193858732e-05, + "loss": 0.4426, + "step": 6480 + }, + { + "epoch": 0.2846196910984994, + "grad_norm": 2.28125, + "learning_rate": 4.078693814617969e-05, + "loss": 0.4409, + "step": 6482 + }, + { + "epoch": 0.2847075095777029, + "grad_norm": 2.015625, + "learning_rate": 4.078157314480456e-05, + "loss": 0.4673, + "step": 6484 + }, + { + "epoch": 0.2847953280569064, + "grad_norm": 2.15625, + "learning_rate": 4.077620693487277e-05, + "loss": 0.4272, + "step": 6486 + }, + { + "epoch": 0.28488314653610985, + "grad_norm": 2.21875, + "learning_rate": 4.0770839516795265e-05, + "loss": 0.4592, + "step": 6488 + }, + { + "epoch": 0.2849709650153133, + "grad_norm": 2.3125, + "learning_rate": 4.07654708909831e-05, + "loss": 0.4513, + "step": 6490 + }, + { + "epoch": 0.28505878349451685, + "grad_norm": 2.28125, + "learning_rate": 4.076010105784739e-05, + "loss": 0.4398, + "step": 6492 + }, + { + "epoch": 0.2851466019737203, + "grad_norm": 2.109375, + "learning_rate": 4.075473001779936e-05, + "loss": 0.4455, + "step": 6494 + }, + { + "epoch": 0.2852344204529238, + "grad_norm": 2.484375, + "learning_rate": 4.0749357771250335e-05, + "loss": 0.4208, + "step": 6496 + }, + { + "epoch": 0.28532223893212727, + "grad_norm": 2.28125, + "learning_rate": 4.074398431861171e-05, + "loss": 0.4249, + "step": 6498 + }, + { + "epoch": 0.2854100574113308, + "grad_norm": 2.453125, + "learning_rate": 4.0738609660295e-05, + "loss": 0.4631, + "step": 6500 + }, + { + "epoch": 0.28549787589053427, + "grad_norm": 2.265625, + "learning_rate": 4.073323379671179e-05, + "loss": 0.4618, + "step": 6502 + }, + { + "epoch": 0.28558569436973774, + "grad_norm": 2.203125, + "learning_rate": 4.072785672827375e-05, + "loss": 0.4415, + "step": 6504 + }, + { + "epoch": 0.2856735128489412, + "grad_norm": 2.578125, + "learning_rate": 4.072247845539268e-05, + "loss": 0.4629, + "step": 6506 + }, + { + "epoch": 0.28576133132814474, + "grad_norm": 2.28125, + "learning_rate": 4.0717098978480444e-05, + "loss": 0.45, + "step": 6508 + }, + { + "epoch": 0.2858491498073482, + "grad_norm": 2.03125, + "learning_rate": 4.0711718297949e-05, + "loss": 0.4641, + "step": 6510 + }, + { + "epoch": 0.2859369682865517, + "grad_norm": 2.203125, + "learning_rate": 4.07063364142104e-05, + "loss": 0.429, + "step": 6512 + }, + { + "epoch": 0.28602478676575516, + "grad_norm": 2.34375, + "learning_rate": 4.0700953327676797e-05, + "loss": 0.4409, + "step": 6514 + }, + { + "epoch": 0.2861126052449587, + "grad_norm": 2.40625, + "learning_rate": 4.0695569038760416e-05, + "loss": 0.4768, + "step": 6516 + }, + { + "epoch": 0.28620042372416216, + "grad_norm": 2.234375, + "learning_rate": 4.0690183547873594e-05, + "loss": 0.453, + "step": 6518 + }, + { + "epoch": 0.28628824220336563, + "grad_norm": 2.125, + "learning_rate": 4.068479685542876e-05, + "loss": 0.4708, + "step": 6520 + }, + { + "epoch": 0.28637606068256916, + "grad_norm": 2.28125, + "learning_rate": 4.067940896183843e-05, + "loss": 0.4502, + "step": 6522 + }, + { + "epoch": 0.2864638791617726, + "grad_norm": 2.140625, + "learning_rate": 4.067401986751519e-05, + "loss": 0.4225, + "step": 6524 + }, + { + "epoch": 0.2865516976409761, + "grad_norm": 2.21875, + "learning_rate": 4.0668629572871765e-05, + "loss": 0.4193, + "step": 6526 + }, + { + "epoch": 0.28663951612017957, + "grad_norm": 2.484375, + "learning_rate": 4.066323807832092e-05, + "loss": 0.4282, + "step": 6528 + }, + { + "epoch": 0.2867273345993831, + "grad_norm": 2.296875, + "learning_rate": 4.065784538427555e-05, + "loss": 0.4534, + "step": 6530 + }, + { + "epoch": 0.28681515307858657, + "grad_norm": 2.328125, + "learning_rate": 4.0652451491148636e-05, + "loss": 0.4383, + "step": 6532 + }, + { + "epoch": 0.28690297155779004, + "grad_norm": 2.21875, + "learning_rate": 4.0647056399353225e-05, + "loss": 0.4463, + "step": 6534 + }, + { + "epoch": 0.2869907900369935, + "grad_norm": 2.328125, + "learning_rate": 4.0641660109302485e-05, + "loss": 0.4432, + "step": 6536 + }, + { + "epoch": 0.28707860851619704, + "grad_norm": 2.015625, + "learning_rate": 4.063626262140967e-05, + "loss": 0.4369, + "step": 6538 + }, + { + "epoch": 0.2871664269954005, + "grad_norm": 2.28125, + "learning_rate": 4.0630863936088104e-05, + "loss": 0.462, + "step": 6540 + }, + { + "epoch": 0.287254245474604, + "grad_norm": 2.015625, + "learning_rate": 4.062546405375124e-05, + "loss": 0.451, + "step": 6542 + }, + { + "epoch": 0.28734206395380746, + "grad_norm": 2.203125, + "learning_rate": 4.062006297481259e-05, + "loss": 0.4816, + "step": 6544 + }, + { + "epoch": 0.287429882433011, + "grad_norm": 2.21875, + "learning_rate": 4.061466069968577e-05, + "loss": 0.4531, + "step": 6546 + }, + { + "epoch": 0.28751770091221446, + "grad_norm": 2.078125, + "learning_rate": 4.06092572287845e-05, + "loss": 0.4612, + "step": 6548 + }, + { + "epoch": 0.28760551939141793, + "grad_norm": 2.3125, + "learning_rate": 4.0603852562522564e-05, + "loss": 0.4173, + "step": 6550 + }, + { + "epoch": 0.2876933378706214, + "grad_norm": 2.234375, + "learning_rate": 4.0598446701313865e-05, + "loss": 0.457, + "step": 6552 + }, + { + "epoch": 0.28778115634982493, + "grad_norm": 2.421875, + "learning_rate": 4.059303964557237e-05, + "loss": 0.4227, + "step": 6554 + }, + { + "epoch": 0.2878689748290284, + "grad_norm": 2.84375, + "learning_rate": 4.058763139571216e-05, + "loss": 0.4605, + "step": 6556 + }, + { + "epoch": 0.2879567933082319, + "grad_norm": 2.84375, + "learning_rate": 4.05822219521474e-05, + "loss": 0.438, + "step": 6558 + }, + { + "epoch": 0.28804461178743535, + "grad_norm": 2.453125, + "learning_rate": 4.057681131529235e-05, + "loss": 0.4462, + "step": 6560 + }, + { + "epoch": 0.2881324302666389, + "grad_norm": 2.328125, + "learning_rate": 4.0571399485561366e-05, + "loss": 0.4389, + "step": 6562 + }, + { + "epoch": 0.28822024874584234, + "grad_norm": 2.34375, + "learning_rate": 4.0565986463368865e-05, + "loss": 0.4411, + "step": 6564 + }, + { + "epoch": 0.2883080672250458, + "grad_norm": 2.296875, + "learning_rate": 4.0560572249129394e-05, + "loss": 0.4564, + "step": 6566 + }, + { + "epoch": 0.28839588570424934, + "grad_norm": 2.0625, + "learning_rate": 4.0555156843257566e-05, + "loss": 0.4626, + "step": 6568 + }, + { + "epoch": 0.2884837041834528, + "grad_norm": 2.109375, + "learning_rate": 4.05497402461681e-05, + "loss": 0.4507, + "step": 6570 + }, + { + "epoch": 0.2885715226626563, + "grad_norm": 1.9453125, + "learning_rate": 4.0544322458275796e-05, + "loss": 0.4225, + "step": 6572 + }, + { + "epoch": 0.28865934114185976, + "grad_norm": 2.34375, + "learning_rate": 4.0538903479995554e-05, + "loss": 0.4069, + "step": 6574 + }, + { + "epoch": 0.2887471596210633, + "grad_norm": 2.421875, + "learning_rate": 4.053348331174236e-05, + "loss": 0.4189, + "step": 6576 + }, + { + "epoch": 0.28883497810026676, + "grad_norm": 2.5625, + "learning_rate": 4.052806195393129e-05, + "loss": 0.4453, + "step": 6578 + }, + { + "epoch": 0.28892279657947023, + "grad_norm": 2.984375, + "learning_rate": 4.0522639406977516e-05, + "loss": 0.4598, + "step": 6580 + }, + { + "epoch": 0.2890106150586737, + "grad_norm": 3.484375, + "learning_rate": 4.051721567129629e-05, + "loss": 0.4437, + "step": 6582 + }, + { + "epoch": 0.28909843353787723, + "grad_norm": 2.625, + "learning_rate": 4.051179074730297e-05, + "loss": 0.446, + "step": 6584 + }, + { + "epoch": 0.2891862520170807, + "grad_norm": 2.5, + "learning_rate": 4.0506364635413e-05, + "loss": 0.4417, + "step": 6586 + }, + { + "epoch": 0.2892740704962842, + "grad_norm": 2.046875, + "learning_rate": 4.05009373360419e-05, + "loss": 0.4179, + "step": 6588 + }, + { + "epoch": 0.28936188897548765, + "grad_norm": 2.375, + "learning_rate": 4.049550884960531e-05, + "loss": 0.4153, + "step": 6590 + }, + { + "epoch": 0.2894497074546912, + "grad_norm": 2.40625, + "learning_rate": 4.049007917651894e-05, + "loss": 0.457, + "step": 6592 + }, + { + "epoch": 0.28953752593389465, + "grad_norm": 2.6875, + "learning_rate": 4.0484648317198585e-05, + "loss": 0.4651, + "step": 6594 + }, + { + "epoch": 0.2896253444130981, + "grad_norm": 2.703125, + "learning_rate": 4.047921627206015e-05, + "loss": 0.4265, + "step": 6596 + }, + { + "epoch": 0.2897131628923016, + "grad_norm": 2.265625, + "learning_rate": 4.047378304151963e-05, + "loss": 0.437, + "step": 6598 + }, + { + "epoch": 0.2898009813715051, + "grad_norm": 2.296875, + "learning_rate": 4.046834862599309e-05, + "loss": 0.4362, + "step": 6600 + }, + { + "epoch": 0.2898887998507086, + "grad_norm": 2.015625, + "learning_rate": 4.046291302589671e-05, + "loss": 0.4426, + "step": 6602 + }, + { + "epoch": 0.28997661832991206, + "grad_norm": 2.328125, + "learning_rate": 4.045747624164674e-05, + "loss": 0.4191, + "step": 6604 + }, + { + "epoch": 0.29006443680911553, + "grad_norm": 2.046875, + "learning_rate": 4.045203827365953e-05, + "loss": 0.3972, + "step": 6606 + }, + { + "epoch": 0.29015225528831906, + "grad_norm": 2.09375, + "learning_rate": 4.0446599122351535e-05, + "loss": 0.4173, + "step": 6608 + }, + { + "epoch": 0.29024007376752253, + "grad_norm": 1.9765625, + "learning_rate": 4.044115878813927e-05, + "loss": 0.4359, + "step": 6610 + }, + { + "epoch": 0.290327892246726, + "grad_norm": 2.15625, + "learning_rate": 4.043571727143936e-05, + "loss": 0.4373, + "step": 6612 + }, + { + "epoch": 0.2904157107259295, + "grad_norm": 2.46875, + "learning_rate": 4.043027457266853e-05, + "loss": 0.4602, + "step": 6614 + }, + { + "epoch": 0.290503529205133, + "grad_norm": 2.359375, + "learning_rate": 4.0424830692243566e-05, + "loss": 0.4517, + "step": 6616 + }, + { + "epoch": 0.2905913476843365, + "grad_norm": 2.40625, + "learning_rate": 4.0419385630581376e-05, + "loss": 0.4215, + "step": 6618 + }, + { + "epoch": 0.29067916616353995, + "grad_norm": 2.328125, + "learning_rate": 4.041393938809893e-05, + "loss": 0.4377, + "step": 6620 + }, + { + "epoch": 0.2907669846427435, + "grad_norm": 1.9765625, + "learning_rate": 4.0408491965213315e-05, + "loss": 0.4292, + "step": 6622 + }, + { + "epoch": 0.29085480312194695, + "grad_norm": 2.109375, + "learning_rate": 4.04030433623417e-05, + "loss": 0.4647, + "step": 6624 + }, + { + "epoch": 0.2909426216011504, + "grad_norm": 2.234375, + "learning_rate": 4.039759357990133e-05, + "loss": 0.416, + "step": 6626 + }, + { + "epoch": 0.2910304400803539, + "grad_norm": 2.234375, + "learning_rate": 4.039214261830954e-05, + "loss": 0.4596, + "step": 6628 + }, + { + "epoch": 0.2911182585595574, + "grad_norm": 2.28125, + "learning_rate": 4.0386690477983786e-05, + "loss": 0.4676, + "step": 6630 + }, + { + "epoch": 0.2912060770387609, + "grad_norm": 2.1875, + "learning_rate": 4.038123715934158e-05, + "loss": 0.4404, + "step": 6632 + }, + { + "epoch": 0.29129389551796436, + "grad_norm": 2.21875, + "learning_rate": 4.0375782662800555e-05, + "loss": 0.4359, + "step": 6634 + }, + { + "epoch": 0.29138171399716783, + "grad_norm": 2.28125, + "learning_rate": 4.037032698877841e-05, + "loss": 0.4296, + "step": 6636 + }, + { + "epoch": 0.29146953247637136, + "grad_norm": 2.578125, + "learning_rate": 4.0364870137692925e-05, + "loss": 0.4485, + "step": 6638 + }, + { + "epoch": 0.29155735095557483, + "grad_norm": 1.984375, + "learning_rate": 4.035941210996202e-05, + "loss": 0.4222, + "step": 6640 + }, + { + "epoch": 0.2916451694347783, + "grad_norm": 2.203125, + "learning_rate": 4.035395290600365e-05, + "loss": 0.4314, + "step": 6642 + }, + { + "epoch": 0.2917329879139818, + "grad_norm": 2.359375, + "learning_rate": 4.034849252623587e-05, + "loss": 0.455, + "step": 6644 + }, + { + "epoch": 0.2918208063931853, + "grad_norm": 1.875, + "learning_rate": 4.034303097107687e-05, + "loss": 0.4175, + "step": 6646 + }, + { + "epoch": 0.2919086248723888, + "grad_norm": 2.09375, + "learning_rate": 4.033756824094487e-05, + "loss": 0.4194, + "step": 6648 + }, + { + "epoch": 0.29199644335159225, + "grad_norm": 2.140625, + "learning_rate": 4.033210433625822e-05, + "loss": 0.4332, + "step": 6650 + }, + { + "epoch": 0.2920842618307957, + "grad_norm": 2.25, + "learning_rate": 4.0326639257435343e-05, + "loss": 0.4202, + "step": 6652 + }, + { + "epoch": 0.29217208030999925, + "grad_norm": 2.3125, + "learning_rate": 4.032117300489476e-05, + "loss": 0.4369, + "step": 6654 + }, + { + "epoch": 0.2922598987892027, + "grad_norm": 2.359375, + "learning_rate": 4.031570557905508e-05, + "loss": 0.4293, + "step": 6656 + }, + { + "epoch": 0.2923477172684062, + "grad_norm": 2.234375, + "learning_rate": 4.031023698033499e-05, + "loss": 0.4306, + "step": 6658 + }, + { + "epoch": 0.29243553574760967, + "grad_norm": 2.921875, + "learning_rate": 4.030476720915328e-05, + "loss": 0.4439, + "step": 6660 + }, + { + "epoch": 0.2925233542268132, + "grad_norm": 2.265625, + "learning_rate": 4.029929626592884e-05, + "loss": 0.4316, + "step": 6662 + }, + { + "epoch": 0.29261117270601666, + "grad_norm": 2.28125, + "learning_rate": 4.0293824151080614e-05, + "loss": 0.4389, + "step": 6664 + }, + { + "epoch": 0.29269899118522014, + "grad_norm": 2.203125, + "learning_rate": 4.028835086502767e-05, + "loss": 0.423, + "step": 6666 + }, + { + "epoch": 0.29278680966442366, + "grad_norm": 2.0625, + "learning_rate": 4.028287640818915e-05, + "loss": 0.4596, + "step": 6668 + }, + { + "epoch": 0.29287462814362714, + "grad_norm": 2.265625, + "learning_rate": 4.02774007809843e-05, + "loss": 0.4145, + "step": 6670 + }, + { + "epoch": 0.2929624466228306, + "grad_norm": 2.21875, + "learning_rate": 4.027192398383243e-05, + "loss": 0.4104, + "step": 6672 + }, + { + "epoch": 0.2930502651020341, + "grad_norm": 2.234375, + "learning_rate": 4.0266446017152956e-05, + "loss": 0.4175, + "step": 6674 + }, + { + "epoch": 0.2931380835812376, + "grad_norm": 2.671875, + "learning_rate": 4.0260966881365395e-05, + "loss": 0.4496, + "step": 6676 + }, + { + "epoch": 0.2932259020604411, + "grad_norm": 2.25, + "learning_rate": 4.0255486576889315e-05, + "loss": 0.4373, + "step": 6678 + }, + { + "epoch": 0.29331372053964455, + "grad_norm": 2.359375, + "learning_rate": 4.0250005104144425e-05, + "loss": 0.4386, + "step": 6680 + }, + { + "epoch": 0.293401539018848, + "grad_norm": 2.140625, + "learning_rate": 4.0244522463550494e-05, + "loss": 0.4338, + "step": 6682 + }, + { + "epoch": 0.29348935749805155, + "grad_norm": 2.046875, + "learning_rate": 4.023903865552738e-05, + "loss": 0.4828, + "step": 6684 + }, + { + "epoch": 0.293577175977255, + "grad_norm": 1.9296875, + "learning_rate": 4.0233553680495027e-05, + "loss": 0.4387, + "step": 6686 + }, + { + "epoch": 0.2936649944564585, + "grad_norm": 2.296875, + "learning_rate": 4.022806753887349e-05, + "loss": 0.4357, + "step": 6688 + }, + { + "epoch": 0.29375281293566197, + "grad_norm": 2.203125, + "learning_rate": 4.022258023108288e-05, + "loss": 0.4067, + "step": 6690 + }, + { + "epoch": 0.2938406314148655, + "grad_norm": 2.65625, + "learning_rate": 4.021709175754342e-05, + "loss": 0.4142, + "step": 6692 + }, + { + "epoch": 0.29392844989406897, + "grad_norm": 2.640625, + "learning_rate": 4.021160211867544e-05, + "loss": 0.4647, + "step": 6694 + }, + { + "epoch": 0.29401626837327244, + "grad_norm": 2.84375, + "learning_rate": 4.020611131489932e-05, + "loss": 0.4448, + "step": 6696 + }, + { + "epoch": 0.2941040868524759, + "grad_norm": 2.25, + "learning_rate": 4.020061934663555e-05, + "loss": 0.4183, + "step": 6698 + }, + { + "epoch": 0.29419190533167944, + "grad_norm": 2.375, + "learning_rate": 4.0195126214304704e-05, + "loss": 0.4467, + "step": 6700 + }, + { + "epoch": 0.2942797238108829, + "grad_norm": 2.140625, + "learning_rate": 4.018963191832746e-05, + "loss": 0.4432, + "step": 6702 + }, + { + "epoch": 0.2943675422900864, + "grad_norm": 2.203125, + "learning_rate": 4.018413645912455e-05, + "loss": 0.4159, + "step": 6704 + }, + { + "epoch": 0.29445536076928985, + "grad_norm": 2.203125, + "learning_rate": 4.0178639837116836e-05, + "loss": 0.412, + "step": 6706 + }, + { + "epoch": 0.2945431792484934, + "grad_norm": 2.328125, + "learning_rate": 4.0173142052725244e-05, + "loss": 0.4499, + "step": 6708 + }, + { + "epoch": 0.29463099772769685, + "grad_norm": 2.328125, + "learning_rate": 4.0167643106370786e-05, + "loss": 0.4282, + "step": 6710 + }, + { + "epoch": 0.2947188162069003, + "grad_norm": 2.71875, + "learning_rate": 4.016214299847459e-05, + "loss": 0.4674, + "step": 6712 + }, + { + "epoch": 0.2948066346861038, + "grad_norm": 2.796875, + "learning_rate": 4.0156641729457855e-05, + "loss": 0.4387, + "step": 6714 + }, + { + "epoch": 0.2948944531653073, + "grad_norm": 2.640625, + "learning_rate": 4.015113929974187e-05, + "loss": 0.4541, + "step": 6716 + }, + { + "epoch": 0.2949822716445108, + "grad_norm": 2.546875, + "learning_rate": 4.014563570974799e-05, + "loss": 0.4202, + "step": 6718 + }, + { + "epoch": 0.29507009012371427, + "grad_norm": 3.296875, + "learning_rate": 4.01401309598977e-05, + "loss": 0.4555, + "step": 6720 + }, + { + "epoch": 0.2951579086029178, + "grad_norm": 2.484375, + "learning_rate": 4.013462505061255e-05, + "loss": 0.4327, + "step": 6722 + }, + { + "epoch": 0.29524572708212127, + "grad_norm": 2.3125, + "learning_rate": 4.0129117982314194e-05, + "loss": 0.4166, + "step": 6724 + }, + { + "epoch": 0.29533354556132474, + "grad_norm": 2.390625, + "learning_rate": 4.012360975542434e-05, + "loss": 0.3892, + "step": 6726 + }, + { + "epoch": 0.2954213640405282, + "grad_norm": 2.34375, + "learning_rate": 4.011810037036484e-05, + "loss": 0.4572, + "step": 6728 + }, + { + "epoch": 0.29550918251973174, + "grad_norm": 2.390625, + "learning_rate": 4.011258982755759e-05, + "loss": 0.4488, + "step": 6730 + }, + { + "epoch": 0.2955970009989352, + "grad_norm": 2.171875, + "learning_rate": 4.010707812742459e-05, + "loss": 0.4369, + "step": 6732 + }, + { + "epoch": 0.2956848194781387, + "grad_norm": 2.296875, + "learning_rate": 4.010156527038791e-05, + "loss": 0.433, + "step": 6734 + }, + { + "epoch": 0.29577263795734215, + "grad_norm": 2.6875, + "learning_rate": 4.009605125686975e-05, + "loss": 0.4481, + "step": 6736 + }, + { + "epoch": 0.2958604564365457, + "grad_norm": 2.421875, + "learning_rate": 4.009053608729237e-05, + "loss": 0.4412, + "step": 6738 + }, + { + "epoch": 0.29594827491574915, + "grad_norm": 2.359375, + "learning_rate": 4.0085019762078116e-05, + "loss": 0.4396, + "step": 6740 + }, + { + "epoch": 0.2960360933949526, + "grad_norm": 2.78125, + "learning_rate": 4.007950228164943e-05, + "loss": 0.4157, + "step": 6742 + }, + { + "epoch": 0.2961239118741561, + "grad_norm": 2.328125, + "learning_rate": 4.007398364642885e-05, + "loss": 0.4439, + "step": 6744 + }, + { + "epoch": 0.2962117303533596, + "grad_norm": 2.46875, + "learning_rate": 4.006846385683899e-05, + "loss": 0.4452, + "step": 6746 + }, + { + "epoch": 0.2962995488325631, + "grad_norm": 2.875, + "learning_rate": 4.006294291330255e-05, + "loss": 0.4207, + "step": 6748 + }, + { + "epoch": 0.29638736731176657, + "grad_norm": 2.578125, + "learning_rate": 4.005742081624233e-05, + "loss": 0.4459, + "step": 6750 + }, + { + "epoch": 0.29647518579097004, + "grad_norm": 2.53125, + "learning_rate": 4.005189756608122e-05, + "loss": 0.433, + "step": 6752 + }, + { + "epoch": 0.29656300427017357, + "grad_norm": 2.359375, + "learning_rate": 4.004637316324218e-05, + "loss": 0.4451, + "step": 6754 + }, + { + "epoch": 0.29665082274937704, + "grad_norm": 2.265625, + "learning_rate": 4.004084760814828e-05, + "loss": 0.4489, + "step": 6756 + }, + { + "epoch": 0.2967386412285805, + "grad_norm": 2.390625, + "learning_rate": 4.003532090122266e-05, + "loss": 0.4177, + "step": 6758 + }, + { + "epoch": 0.296826459707784, + "grad_norm": 2.25, + "learning_rate": 4.002979304288857e-05, + "loss": 0.4251, + "step": 6760 + }, + { + "epoch": 0.2969142781869875, + "grad_norm": 2.375, + "learning_rate": 4.002426403356932e-05, + "loss": 0.4323, + "step": 6762 + }, + { + "epoch": 0.297002096666191, + "grad_norm": 2.125, + "learning_rate": 4.001873387368833e-05, + "loss": 0.4477, + "step": 6764 + }, + { + "epoch": 0.29708991514539446, + "grad_norm": 2.296875, + "learning_rate": 4.0013202563669104e-05, + "loss": 0.4452, + "step": 6766 + }, + { + "epoch": 0.29717773362459793, + "grad_norm": 2.671875, + "learning_rate": 4.000767010393522e-05, + "loss": 0.4423, + "step": 6768 + }, + { + "epoch": 0.29726555210380146, + "grad_norm": 2.265625, + "learning_rate": 4.0002136494910366e-05, + "loss": 0.4301, + "step": 6770 + }, + { + "epoch": 0.29735337058300493, + "grad_norm": 2.5, + "learning_rate": 3.9996601737018294e-05, + "loss": 0.4507, + "step": 6772 + }, + { + "epoch": 0.2974411890622084, + "grad_norm": 2.234375, + "learning_rate": 3.9991065830682875e-05, + "loss": 0.4243, + "step": 6774 + }, + { + "epoch": 0.2975290075414119, + "grad_norm": 2.203125, + "learning_rate": 3.998552877632804e-05, + "loss": 0.4369, + "step": 6776 + }, + { + "epoch": 0.2976168260206154, + "grad_norm": 2.0, + "learning_rate": 3.9979990574377825e-05, + "loss": 0.4455, + "step": 6778 + }, + { + "epoch": 0.29770464449981887, + "grad_norm": 2.265625, + "learning_rate": 3.997445122525633e-05, + "loss": 0.413, + "step": 6780 + }, + { + "epoch": 0.29779246297902234, + "grad_norm": 2.046875, + "learning_rate": 3.996891072938778e-05, + "loss": 0.4478, + "step": 6782 + }, + { + "epoch": 0.29788028145822587, + "grad_norm": 2.359375, + "learning_rate": 3.9963369087196444e-05, + "loss": 0.4367, + "step": 6784 + }, + { + "epoch": 0.29796809993742934, + "grad_norm": 2.328125, + "learning_rate": 3.9957826299106724e-05, + "loss": 0.4002, + "step": 6786 + }, + { + "epoch": 0.2980559184166328, + "grad_norm": 2.28125, + "learning_rate": 3.9952282365543083e-05, + "loss": 0.4108, + "step": 6788 + }, + { + "epoch": 0.2981437368958363, + "grad_norm": 2.15625, + "learning_rate": 3.9946737286930076e-05, + "loss": 0.4468, + "step": 6790 + }, + { + "epoch": 0.2982315553750398, + "grad_norm": 2.125, + "learning_rate": 3.994119106369234e-05, + "loss": 0.4116, + "step": 6792 + }, + { + "epoch": 0.2983193738542433, + "grad_norm": 2.203125, + "learning_rate": 3.993564369625461e-05, + "loss": 0.4294, + "step": 6794 + }, + { + "epoch": 0.29840719233344676, + "grad_norm": 2.296875, + "learning_rate": 3.993009518504171e-05, + "loss": 0.4148, + "step": 6796 + }, + { + "epoch": 0.29849501081265023, + "grad_norm": 2.546875, + "learning_rate": 3.9924545530478544e-05, + "loss": 0.4224, + "step": 6798 + }, + { + "epoch": 0.29858282929185376, + "grad_norm": 2.0, + "learning_rate": 3.99189947329901e-05, + "loss": 0.3931, + "step": 6800 + }, + { + "epoch": 0.29867064777105723, + "grad_norm": 2.1875, + "learning_rate": 3.991344279300145e-05, + "loss": 0.445, + "step": 6802 + }, + { + "epoch": 0.2987584662502607, + "grad_norm": 2.390625, + "learning_rate": 3.990788971093779e-05, + "loss": 0.4219, + "step": 6804 + }, + { + "epoch": 0.2988462847294642, + "grad_norm": 2.28125, + "learning_rate": 3.9902335487224364e-05, + "loss": 0.4343, + "step": 6806 + }, + { + "epoch": 0.2989341032086677, + "grad_norm": 2.15625, + "learning_rate": 3.989678012228651e-05, + "loss": 0.4439, + "step": 6808 + }, + { + "epoch": 0.2990219216878712, + "grad_norm": 2.421875, + "learning_rate": 3.989122361654967e-05, + "loss": 0.452, + "step": 6810 + }, + { + "epoch": 0.29910974016707464, + "grad_norm": 2.03125, + "learning_rate": 3.988566597043935e-05, + "loss": 0.4407, + "step": 6812 + }, + { + "epoch": 0.2991975586462781, + "grad_norm": 2.484375, + "learning_rate": 3.988010718438115e-05, + "loss": 0.4095, + "step": 6814 + }, + { + "epoch": 0.29928537712548164, + "grad_norm": 2.6875, + "learning_rate": 3.987454725880079e-05, + "loss": 0.4345, + "step": 6816 + }, + { + "epoch": 0.2993731956046851, + "grad_norm": 2.0, + "learning_rate": 3.986898619412402e-05, + "loss": 0.4371, + "step": 6818 + }, + { + "epoch": 0.2994610140838886, + "grad_norm": 2.203125, + "learning_rate": 3.986342399077674e-05, + "loss": 0.4375, + "step": 6820 + }, + { + "epoch": 0.2995488325630921, + "grad_norm": 1.9765625, + "learning_rate": 3.985786064918489e-05, + "loss": 0.4311, + "step": 6822 + }, + { + "epoch": 0.2996366510422956, + "grad_norm": 2.359375, + "learning_rate": 3.9852296169774493e-05, + "loss": 0.4073, + "step": 6824 + }, + { + "epoch": 0.29972446952149906, + "grad_norm": 2.171875, + "learning_rate": 3.9846730552971705e-05, + "loss": 0.4334, + "step": 6826 + }, + { + "epoch": 0.29981228800070253, + "grad_norm": 2.328125, + "learning_rate": 3.984116379920273e-05, + "loss": 0.4197, + "step": 6828 + }, + { + "epoch": 0.29990010647990606, + "grad_norm": 2.375, + "learning_rate": 3.983559590889387e-05, + "loss": 0.453, + "step": 6830 + }, + { + "epoch": 0.29998792495910953, + "grad_norm": 2.6875, + "learning_rate": 3.9830026882471526e-05, + "loss": 0.4483, + "step": 6832 + }, + { + "epoch": 0.300075743438313, + "grad_norm": 2.828125, + "learning_rate": 3.982445672036216e-05, + "loss": 0.4343, + "step": 6834 + }, + { + "epoch": 0.3001635619175165, + "grad_norm": 2.5625, + "learning_rate": 3.9818885422992355e-05, + "loss": 0.4299, + "step": 6836 + }, + { + "epoch": 0.30025138039672, + "grad_norm": 2.34375, + "learning_rate": 3.9813312990788757e-05, + "loss": 0.4304, + "step": 6838 + }, + { + "epoch": 0.3003391988759235, + "grad_norm": 2.5, + "learning_rate": 3.980773942417808e-05, + "loss": 0.4219, + "step": 6840 + }, + { + "epoch": 0.30042701735512695, + "grad_norm": 2.265625, + "learning_rate": 3.980216472358718e-05, + "loss": 0.4112, + "step": 6842 + }, + { + "epoch": 0.3005148358343304, + "grad_norm": 1.921875, + "learning_rate": 3.979658888944296e-05, + "loss": 0.4221, + "step": 6844 + }, + { + "epoch": 0.30060265431353395, + "grad_norm": 2.0, + "learning_rate": 3.97910119221724e-05, + "loss": 0.4513, + "step": 6846 + }, + { + "epoch": 0.3006904727927374, + "grad_norm": 2.34375, + "learning_rate": 3.9785433822202614e-05, + "loss": 0.4373, + "step": 6848 + }, + { + "epoch": 0.3007782912719409, + "grad_norm": 2.25, + "learning_rate": 3.977985458996076e-05, + "loss": 0.43, + "step": 6850 + }, + { + "epoch": 0.30086610975114436, + "grad_norm": 1.9765625, + "learning_rate": 3.97742742258741e-05, + "loss": 0.3945, + "step": 6852 + }, + { + "epoch": 0.3009539282303479, + "grad_norm": 2.203125, + "learning_rate": 3.976869273036997e-05, + "loss": 0.4025, + "step": 6854 + }, + { + "epoch": 0.30104174670955136, + "grad_norm": 2.1875, + "learning_rate": 3.9763110103875824e-05, + "loss": 0.4207, + "step": 6856 + }, + { + "epoch": 0.30112956518875483, + "grad_norm": 2.21875, + "learning_rate": 3.975752634681915e-05, + "loss": 0.4163, + "step": 6858 + }, + { + "epoch": 0.3012173836679583, + "grad_norm": 2.1875, + "learning_rate": 3.975194145962758e-05, + "loss": 0.4299, + "step": 6860 + }, + { + "epoch": 0.30130520214716183, + "grad_norm": 1.8984375, + "learning_rate": 3.97463554427288e-05, + "loss": 0.434, + "step": 6862 + }, + { + "epoch": 0.3013930206263653, + "grad_norm": 2.40625, + "learning_rate": 3.974076829655058e-05, + "loss": 0.4472, + "step": 6864 + }, + { + "epoch": 0.3014808391055688, + "grad_norm": 2.28125, + "learning_rate": 3.97351800215208e-05, + "loss": 0.417, + "step": 6866 + }, + { + "epoch": 0.30156865758477225, + "grad_norm": 2.21875, + "learning_rate": 3.972959061806739e-05, + "loss": 0.425, + "step": 6868 + }, + { + "epoch": 0.3016564760639758, + "grad_norm": 2.125, + "learning_rate": 3.972400008661841e-05, + "loss": 0.4394, + "step": 6870 + }, + { + "epoch": 0.30174429454317925, + "grad_norm": 2.1875, + "learning_rate": 3.971840842760196e-05, + "loss": 0.4763, + "step": 6872 + }, + { + "epoch": 0.3018321130223827, + "grad_norm": 2.3125, + "learning_rate": 3.971281564144628e-05, + "loss": 0.4281, + "step": 6874 + }, + { + "epoch": 0.30191993150158625, + "grad_norm": 1.953125, + "learning_rate": 3.9707221728579634e-05, + "loss": 0.459, + "step": 6876 + }, + { + "epoch": 0.3020077499807897, + "grad_norm": 2.5625, + "learning_rate": 3.970162668943044e-05, + "loss": 0.4476, + "step": 6878 + }, + { + "epoch": 0.3020955684599932, + "grad_norm": 2.375, + "learning_rate": 3.9696030524427144e-05, + "loss": 0.4341, + "step": 6880 + }, + { + "epoch": 0.30218338693919666, + "grad_norm": 2.25, + "learning_rate": 3.96904332339983e-05, + "loss": 0.4166, + "step": 6882 + }, + { + "epoch": 0.3022712054184002, + "grad_norm": 2.171875, + "learning_rate": 3.968483481857256e-05, + "loss": 0.4051, + "step": 6884 + }, + { + "epoch": 0.30235902389760366, + "grad_norm": 2.1875, + "learning_rate": 3.9679235278578654e-05, + "loss": 0.4281, + "step": 6886 + }, + { + "epoch": 0.30244684237680713, + "grad_norm": 2.078125, + "learning_rate": 3.967363461444539e-05, + "loss": 0.431, + "step": 6888 + }, + { + "epoch": 0.3025346608560106, + "grad_norm": 2.03125, + "learning_rate": 3.966803282660167e-05, + "loss": 0.4171, + "step": 6890 + }, + { + "epoch": 0.30262247933521413, + "grad_norm": 2.171875, + "learning_rate": 3.9662429915476476e-05, + "loss": 0.4436, + "step": 6892 + }, + { + "epoch": 0.3027102978144176, + "grad_norm": 2.34375, + "learning_rate": 3.9656825881498885e-05, + "loss": 0.4615, + "step": 6894 + }, + { + "epoch": 0.3027981162936211, + "grad_norm": 1.984375, + "learning_rate": 3.965122072509806e-05, + "loss": 0.446, + "step": 6896 + }, + { + "epoch": 0.30288593477282455, + "grad_norm": 2.21875, + "learning_rate": 3.964561444670324e-05, + "loss": 0.4475, + "step": 6898 + }, + { + "epoch": 0.3029737532520281, + "grad_norm": 2.34375, + "learning_rate": 3.9640007046743756e-05, + "loss": 0.3973, + "step": 6900 + }, + { + "epoch": 0.30306157173123155, + "grad_norm": 2.203125, + "learning_rate": 3.963439852564901e-05, + "loss": 0.427, + "step": 6902 + }, + { + "epoch": 0.303149390210435, + "grad_norm": 2.171875, + "learning_rate": 3.962878888384853e-05, + "loss": 0.427, + "step": 6904 + }, + { + "epoch": 0.3032372086896385, + "grad_norm": 2.1875, + "learning_rate": 3.9623178121771886e-05, + "loss": 0.4341, + "step": 6906 + }, + { + "epoch": 0.303325027168842, + "grad_norm": 2.40625, + "learning_rate": 3.9617566239848755e-05, + "loss": 0.4268, + "step": 6908 + }, + { + "epoch": 0.3034128456480455, + "grad_norm": 2.4375, + "learning_rate": 3.961195323850889e-05, + "loss": 0.4002, + "step": 6910 + }, + { + "epoch": 0.30350066412724896, + "grad_norm": 3.109375, + "learning_rate": 3.960633911818216e-05, + "loss": 0.4214, + "step": 6912 + }, + { + "epoch": 0.30358848260645244, + "grad_norm": 2.65625, + "learning_rate": 3.960072387929847e-05, + "loss": 0.4517, + "step": 6914 + }, + { + "epoch": 0.30367630108565596, + "grad_norm": 2.34375, + "learning_rate": 3.959510752228784e-05, + "loss": 0.4415, + "step": 6916 + }, + { + "epoch": 0.30376411956485944, + "grad_norm": 2.375, + "learning_rate": 3.958949004758039e-05, + "loss": 0.4185, + "step": 6918 + }, + { + "epoch": 0.3038519380440629, + "grad_norm": 2.90625, + "learning_rate": 3.958387145560628e-05, + "loss": 0.431, + "step": 6920 + }, + { + "epoch": 0.30393975652326644, + "grad_norm": 2.296875, + "learning_rate": 3.957825174679581e-05, + "loss": 0.4448, + "step": 6922 + }, + { + "epoch": 0.3040275750024699, + "grad_norm": 2.296875, + "learning_rate": 3.9572630921579324e-05, + "loss": 0.4327, + "step": 6924 + }, + { + "epoch": 0.3041153934816734, + "grad_norm": 2.34375, + "learning_rate": 3.9567008980387264e-05, + "loss": 0.4188, + "step": 6926 + }, + { + "epoch": 0.30420321196087685, + "grad_norm": 2.875, + "learning_rate": 3.956138592365017e-05, + "loss": 0.4284, + "step": 6928 + }, + { + "epoch": 0.3042910304400804, + "grad_norm": 3.46875, + "learning_rate": 3.9555761751798646e-05, + "loss": 0.4069, + "step": 6930 + }, + { + "epoch": 0.30437884891928385, + "grad_norm": 2.453125, + "learning_rate": 3.95501364652634e-05, + "loss": 0.4025, + "step": 6932 + }, + { + "epoch": 0.3044666673984873, + "grad_norm": 2.21875, + "learning_rate": 3.9544510064475214e-05, + "loss": 0.4497, + "step": 6934 + }, + { + "epoch": 0.3045544858776908, + "grad_norm": 1.921875, + "learning_rate": 3.953888254986496e-05, + "loss": 0.4263, + "step": 6936 + }, + { + "epoch": 0.3046423043568943, + "grad_norm": 2.375, + "learning_rate": 3.95332539218636e-05, + "loss": 0.4268, + "step": 6938 + }, + { + "epoch": 0.3047301228360978, + "grad_norm": 2.65625, + "learning_rate": 3.952762418090217e-05, + "loss": 0.4395, + "step": 6940 + }, + { + "epoch": 0.30481794131530127, + "grad_norm": 2.3125, + "learning_rate": 3.9521993327411797e-05, + "loss": 0.4417, + "step": 6942 + }, + { + "epoch": 0.30490575979450474, + "grad_norm": 2.03125, + "learning_rate": 3.9516361361823696e-05, + "loss": 0.4297, + "step": 6944 + }, + { + "epoch": 0.30499357827370827, + "grad_norm": 2.28125, + "learning_rate": 3.951072828456916e-05, + "loss": 0.4091, + "step": 6946 + }, + { + "epoch": 0.30508139675291174, + "grad_norm": 2.40625, + "learning_rate": 3.9505094096079577e-05, + "loss": 0.4255, + "step": 6948 + }, + { + "epoch": 0.3051692152321152, + "grad_norm": 2.21875, + "learning_rate": 3.9499458796786406e-05, + "loss": 0.433, + "step": 6950 + }, + { + "epoch": 0.3052570337113187, + "grad_norm": 2.359375, + "learning_rate": 3.94938223871212e-05, + "loss": 0.4533, + "step": 6952 + }, + { + "epoch": 0.3053448521905222, + "grad_norm": 2.4375, + "learning_rate": 3.948818486751561e-05, + "loss": 0.4138, + "step": 6954 + }, + { + "epoch": 0.3054326706697257, + "grad_norm": 2.203125, + "learning_rate": 3.948254623840134e-05, + "loss": 0.4251, + "step": 6956 + }, + { + "epoch": 0.30552048914892915, + "grad_norm": 2.390625, + "learning_rate": 3.947690650021022e-05, + "loss": 0.4489, + "step": 6958 + }, + { + "epoch": 0.3056083076281326, + "grad_norm": 2.28125, + "learning_rate": 3.947126565337412e-05, + "loss": 0.4479, + "step": 6960 + }, + { + "epoch": 0.30569612610733615, + "grad_norm": 1.890625, + "learning_rate": 3.946562369832503e-05, + "loss": 0.4553, + "step": 6962 + }, + { + "epoch": 0.3057839445865396, + "grad_norm": 1.96875, + "learning_rate": 3.945998063549501e-05, + "loss": 0.4209, + "step": 6964 + }, + { + "epoch": 0.3058717630657431, + "grad_norm": 1.9453125, + "learning_rate": 3.945433646531621e-05, + "loss": 0.3993, + "step": 6966 + }, + { + "epoch": 0.30595958154494657, + "grad_norm": 2.125, + "learning_rate": 3.9448691188220854e-05, + "loss": 0.4497, + "step": 6968 + }, + { + "epoch": 0.3060474000241501, + "grad_norm": 2.171875, + "learning_rate": 3.944304480464126e-05, + "loss": 0.4177, + "step": 6970 + }, + { + "epoch": 0.30613521850335357, + "grad_norm": 2.140625, + "learning_rate": 3.943739731500984e-05, + "loss": 0.4476, + "step": 6972 + }, + { + "epoch": 0.30622303698255704, + "grad_norm": 1.96875, + "learning_rate": 3.943174871975907e-05, + "loss": 0.4581, + "step": 6974 + }, + { + "epoch": 0.30631085546176057, + "grad_norm": 1.96875, + "learning_rate": 3.942609901932153e-05, + "loss": 0.448, + "step": 6976 + }, + { + "epoch": 0.30639867394096404, + "grad_norm": 2.109375, + "learning_rate": 3.942044821412986e-05, + "loss": 0.4786, + "step": 6978 + }, + { + "epoch": 0.3064864924201675, + "grad_norm": 2.3125, + "learning_rate": 3.941479630461681e-05, + "loss": 0.4589, + "step": 6980 + }, + { + "epoch": 0.306574310899371, + "grad_norm": 2.4375, + "learning_rate": 3.94091432912152e-05, + "loss": 0.4237, + "step": 6982 + }, + { + "epoch": 0.3066621293785745, + "grad_norm": 2.578125, + "learning_rate": 3.940348917435796e-05, + "loss": 0.4291, + "step": 6984 + }, + { + "epoch": 0.306749947857778, + "grad_norm": 2.296875, + "learning_rate": 3.939783395447805e-05, + "loss": 0.4183, + "step": 6986 + }, + { + "epoch": 0.30683776633698145, + "grad_norm": 2.25, + "learning_rate": 3.939217763200857e-05, + "loss": 0.4078, + "step": 6988 + }, + { + "epoch": 0.3069255848161849, + "grad_norm": 2.328125, + "learning_rate": 3.938652020738267e-05, + "loss": 0.4128, + "step": 6990 + }, + { + "epoch": 0.30701340329538845, + "grad_norm": 2.0, + "learning_rate": 3.9380861681033606e-05, + "loss": 0.4092, + "step": 6992 + }, + { + "epoch": 0.3071012217745919, + "grad_norm": 2.25, + "learning_rate": 3.937520205339471e-05, + "loss": 0.4189, + "step": 6994 + }, + { + "epoch": 0.3071890402537954, + "grad_norm": 2.265625, + "learning_rate": 3.93695413248994e-05, + "loss": 0.4114, + "step": 6996 + }, + { + "epoch": 0.30727685873299887, + "grad_norm": 2.15625, + "learning_rate": 3.9363879495981166e-05, + "loss": 0.4252, + "step": 6998 + }, + { + "epoch": 0.3073646772122024, + "grad_norm": 2.296875, + "learning_rate": 3.935821656707359e-05, + "loss": 0.4006, + "step": 7000 + }, + { + "epoch": 0.30745249569140587, + "grad_norm": 2.203125, + "learning_rate": 3.935255253861036e-05, + "loss": 0.434, + "step": 7002 + }, + { + "epoch": 0.30754031417060934, + "grad_norm": 1.9375, + "learning_rate": 3.934688741102521e-05, + "loss": 0.4251, + "step": 7004 + }, + { + "epoch": 0.3076281326498128, + "grad_norm": 2.421875, + "learning_rate": 3.934122118475197e-05, + "loss": 0.4504, + "step": 7006 + }, + { + "epoch": 0.30771595112901634, + "grad_norm": 2.640625, + "learning_rate": 3.93355538602246e-05, + "loss": 0.426, + "step": 7008 + }, + { + "epoch": 0.3078037696082198, + "grad_norm": 2.75, + "learning_rate": 3.932988543787707e-05, + "loss": 0.398, + "step": 7010 + }, + { + "epoch": 0.3078915880874233, + "grad_norm": 2.21875, + "learning_rate": 3.932421591814347e-05, + "loss": 0.4376, + "step": 7012 + }, + { + "epoch": 0.30797940656662676, + "grad_norm": 2.625, + "learning_rate": 3.9318545301457985e-05, + "loss": 0.4604, + "step": 7014 + }, + { + "epoch": 0.3080672250458303, + "grad_norm": 1.9765625, + "learning_rate": 3.931287358825486e-05, + "loss": 0.4587, + "step": 7016 + }, + { + "epoch": 0.30815504352503376, + "grad_norm": 2.453125, + "learning_rate": 3.930720077896846e-05, + "loss": 0.4166, + "step": 7018 + }, + { + "epoch": 0.30824286200423723, + "grad_norm": 2.140625, + "learning_rate": 3.930152687403319e-05, + "loss": 0.4667, + "step": 7020 + }, + { + "epoch": 0.30833068048344076, + "grad_norm": 2.125, + "learning_rate": 3.929585187388356e-05, + "loss": 0.4017, + "step": 7022 + }, + { + "epoch": 0.3084184989626442, + "grad_norm": 2.09375, + "learning_rate": 3.929017577895416e-05, + "loss": 0.426, + "step": 7024 + }, + { + "epoch": 0.3085063174418477, + "grad_norm": 2.15625, + "learning_rate": 3.928449858967969e-05, + "loss": 0.4435, + "step": 7026 + }, + { + "epoch": 0.30859413592105117, + "grad_norm": 2.1875, + "learning_rate": 3.9278820306494876e-05, + "loss": 0.4275, + "step": 7028 + }, + { + "epoch": 0.3086819544002547, + "grad_norm": 2.109375, + "learning_rate": 3.92731409298346e-05, + "loss": 0.4406, + "step": 7030 + }, + { + "epoch": 0.30876977287945817, + "grad_norm": 2.1875, + "learning_rate": 3.9267460460133756e-05, + "loss": 0.4393, + "step": 7032 + }, + { + "epoch": 0.30885759135866164, + "grad_norm": 2.328125, + "learning_rate": 3.9261778897827376e-05, + "loss": 0.443, + "step": 7034 + }, + { + "epoch": 0.3089454098378651, + "grad_norm": 2.0, + "learning_rate": 3.925609624335054e-05, + "loss": 0.4156, + "step": 7036 + }, + { + "epoch": 0.30903322831706864, + "grad_norm": 2.203125, + "learning_rate": 3.925041249713844e-05, + "loss": 0.4142, + "step": 7038 + }, + { + "epoch": 0.3091210467962721, + "grad_norm": 2.046875, + "learning_rate": 3.9244727659626346e-05, + "loss": 0.421, + "step": 7040 + }, + { + "epoch": 0.3092088652754756, + "grad_norm": 2.265625, + "learning_rate": 3.923904173124958e-05, + "loss": 0.4122, + "step": 7042 + }, + { + "epoch": 0.30929668375467906, + "grad_norm": 2.390625, + "learning_rate": 3.923335471244359e-05, + "loss": 0.4319, + "step": 7044 + }, + { + "epoch": 0.3093845022338826, + "grad_norm": 1.9609375, + "learning_rate": 3.922766660364388e-05, + "loss": 0.4363, + "step": 7046 + }, + { + "epoch": 0.30947232071308606, + "grad_norm": 2.375, + "learning_rate": 3.9221977405286057e-05, + "loss": 0.4407, + "step": 7048 + }, + { + "epoch": 0.30956013919228953, + "grad_norm": 2.078125, + "learning_rate": 3.9216287117805787e-05, + "loss": 0.4583, + "step": 7050 + }, + { + "epoch": 0.309647957671493, + "grad_norm": 2.296875, + "learning_rate": 3.9210595741638853e-05, + "loss": 0.4364, + "step": 7052 + }, + { + "epoch": 0.30973577615069653, + "grad_norm": 1.9140625, + "learning_rate": 3.9204903277221086e-05, + "loss": 0.4248, + "step": 7054 + }, + { + "epoch": 0.3098235946299, + "grad_norm": 1.953125, + "learning_rate": 3.9199209724988415e-05, + "loss": 0.4312, + "step": 7056 + }, + { + "epoch": 0.3099114131091035, + "grad_norm": 2.21875, + "learning_rate": 3.9193515085376867e-05, + "loss": 0.43, + "step": 7058 + }, + { + "epoch": 0.30999923158830694, + "grad_norm": 2.21875, + "learning_rate": 3.918781935882253e-05, + "loss": 0.4222, + "step": 7060 + }, + { + "epoch": 0.3100870500675105, + "grad_norm": 2.515625, + "learning_rate": 3.918212254576158e-05, + "loss": 0.4161, + "step": 7062 + }, + { + "epoch": 0.31017486854671394, + "grad_norm": 2.03125, + "learning_rate": 3.91764246466303e-05, + "loss": 0.4363, + "step": 7064 + }, + { + "epoch": 0.3102626870259174, + "grad_norm": 2.421875, + "learning_rate": 3.9170725661865e-05, + "loss": 0.3983, + "step": 7066 + }, + { + "epoch": 0.3103505055051209, + "grad_norm": 2.171875, + "learning_rate": 3.916502559190215e-05, + "loss": 0.4453, + "step": 7068 + }, + { + "epoch": 0.3104383239843244, + "grad_norm": 2.15625, + "learning_rate": 3.9159324437178236e-05, + "loss": 0.4246, + "step": 7070 + }, + { + "epoch": 0.3105261424635279, + "grad_norm": 2.0625, + "learning_rate": 3.915362219812986e-05, + "loss": 0.4217, + "step": 7072 + }, + { + "epoch": 0.31061396094273136, + "grad_norm": 2.296875, + "learning_rate": 3.914791887519371e-05, + "loss": 0.4183, + "step": 7074 + }, + { + "epoch": 0.3107017794219349, + "grad_norm": 2.25, + "learning_rate": 3.914221446880654e-05, + "loss": 0.4117, + "step": 7076 + }, + { + "epoch": 0.31078959790113836, + "grad_norm": 2.328125, + "learning_rate": 3.91365089794052e-05, + "loss": 0.4156, + "step": 7078 + }, + { + "epoch": 0.31087741638034183, + "grad_norm": 2.375, + "learning_rate": 3.91308024074266e-05, + "loss": 0.4281, + "step": 7080 + }, + { + "epoch": 0.3109652348595453, + "grad_norm": 2.1875, + "learning_rate": 3.9125094753307775e-05, + "loss": 0.428, + "step": 7082 + }, + { + "epoch": 0.31105305333874883, + "grad_norm": 2.296875, + "learning_rate": 3.91193860174858e-05, + "loss": 0.4356, + "step": 7084 + }, + { + "epoch": 0.3111408718179523, + "grad_norm": 2.375, + "learning_rate": 3.911367620039787e-05, + "loss": 0.4351, + "step": 7086 + }, + { + "epoch": 0.3112286902971558, + "grad_norm": 2.140625, + "learning_rate": 3.910796530248123e-05, + "loss": 0.455, + "step": 7088 + }, + { + "epoch": 0.31131650877635925, + "grad_norm": 2.703125, + "learning_rate": 3.910225332417322e-05, + "loss": 0.3949, + "step": 7090 + }, + { + "epoch": 0.3114043272555628, + "grad_norm": 2.546875, + "learning_rate": 3.909654026591127e-05, + "loss": 0.4086, + "step": 7092 + }, + { + "epoch": 0.31149214573476625, + "grad_norm": 2.265625, + "learning_rate": 3.9090826128132896e-05, + "loss": 0.4271, + "step": 7094 + }, + { + "epoch": 0.3115799642139697, + "grad_norm": 2.421875, + "learning_rate": 3.908511091127567e-05, + "loss": 0.4195, + "step": 7096 + }, + { + "epoch": 0.3116677826931732, + "grad_norm": 2.328125, + "learning_rate": 3.907939461577727e-05, + "loss": 0.4229, + "step": 7098 + }, + { + "epoch": 0.3117556011723767, + "grad_norm": 2.0625, + "learning_rate": 3.9073677242075466e-05, + "loss": 0.4651, + "step": 7100 + }, + { + "epoch": 0.3118434196515802, + "grad_norm": 2.296875, + "learning_rate": 3.906795879060809e-05, + "loss": 0.4029, + "step": 7102 + }, + { + "epoch": 0.31193123813078366, + "grad_norm": 2.140625, + "learning_rate": 3.906223926181305e-05, + "loss": 0.4254, + "step": 7104 + }, + { + "epoch": 0.31201905660998713, + "grad_norm": 2.1875, + "learning_rate": 3.905651865612835e-05, + "loss": 0.4422, + "step": 7106 + }, + { + "epoch": 0.31210687508919066, + "grad_norm": 2.03125, + "learning_rate": 3.9050796973992084e-05, + "loss": 0.432, + "step": 7108 + }, + { + "epoch": 0.31219469356839413, + "grad_norm": 2.234375, + "learning_rate": 3.9045074215842425e-05, + "loss": 0.4265, + "step": 7110 + }, + { + "epoch": 0.3122825120475976, + "grad_norm": 2.328125, + "learning_rate": 3.903935038211761e-05, + "loss": 0.4098, + "step": 7112 + }, + { + "epoch": 0.3123703305268011, + "grad_norm": 1.8828125, + "learning_rate": 3.903362547325598e-05, + "loss": 0.407, + "step": 7114 + }, + { + "epoch": 0.3124581490060046, + "grad_norm": 1.8984375, + "learning_rate": 3.9027899489695954e-05, + "loss": 0.4245, + "step": 7116 + }, + { + "epoch": 0.3125459674852081, + "grad_norm": 2.328125, + "learning_rate": 3.902217243187601e-05, + "loss": 0.4124, + "step": 7118 + }, + { + "epoch": 0.31263378596441155, + "grad_norm": 2.4375, + "learning_rate": 3.9016444300234747e-05, + "loss": 0.4477, + "step": 7120 + }, + { + "epoch": 0.312721604443615, + "grad_norm": 2.015625, + "learning_rate": 3.9010715095210816e-05, + "loss": 0.403, + "step": 7122 + }, + { + "epoch": 0.31280942292281855, + "grad_norm": 2.09375, + "learning_rate": 3.900498481724296e-05, + "loss": 0.4407, + "step": 7124 + }, + { + "epoch": 0.312897241402022, + "grad_norm": 2.171875, + "learning_rate": 3.899925346677002e-05, + "loss": 0.4491, + "step": 7126 + }, + { + "epoch": 0.3129850598812255, + "grad_norm": 2.015625, + "learning_rate": 3.8993521044230884e-05, + "loss": 0.4311, + "step": 7128 + }, + { + "epoch": 0.313072878360429, + "grad_norm": 2.1875, + "learning_rate": 3.8987787550064555e-05, + "loss": 0.4356, + "step": 7130 + }, + { + "epoch": 0.3131606968396325, + "grad_norm": 2.546875, + "learning_rate": 3.8982052984710105e-05, + "loss": 0.4152, + "step": 7132 + }, + { + "epoch": 0.31324851531883596, + "grad_norm": 2.1875, + "learning_rate": 3.8976317348606684e-05, + "loss": 0.411, + "step": 7134 + }, + { + "epoch": 0.31333633379803943, + "grad_norm": 2.40625, + "learning_rate": 3.8970580642193534e-05, + "loss": 0.4311, + "step": 7136 + }, + { + "epoch": 0.31342415227724296, + "grad_norm": 2.34375, + "learning_rate": 3.896484286590997e-05, + "loss": 0.4259, + "step": 7138 + }, + { + "epoch": 0.31351197075644643, + "grad_norm": 2.078125, + "learning_rate": 3.895910402019538e-05, + "loss": 0.4198, + "step": 7140 + }, + { + "epoch": 0.3135997892356499, + "grad_norm": 2.0625, + "learning_rate": 3.895336410548927e-05, + "loss": 0.4099, + "step": 7142 + }, + { + "epoch": 0.3136876077148534, + "grad_norm": 2.140625, + "learning_rate": 3.894762312223118e-05, + "loss": 0.4274, + "step": 7144 + }, + { + "epoch": 0.3137754261940569, + "grad_norm": 1.9921875, + "learning_rate": 3.894188107086078e-05, + "loss": 0.436, + "step": 7146 + }, + { + "epoch": 0.3138632446732604, + "grad_norm": 2.046875, + "learning_rate": 3.893613795181778e-05, + "loss": 0.437, + "step": 7148 + }, + { + "epoch": 0.31395106315246385, + "grad_norm": 2.328125, + "learning_rate": 3.8930393765542e-05, + "loss": 0.4435, + "step": 7150 + }, + { + "epoch": 0.3140388816316673, + "grad_norm": 2.65625, + "learning_rate": 3.892464851247332e-05, + "loss": 0.412, + "step": 7152 + }, + { + "epoch": 0.31412670011087085, + "grad_norm": 2.453125, + "learning_rate": 3.891890219305172e-05, + "loss": 0.4144, + "step": 7154 + }, + { + "epoch": 0.3142145185900743, + "grad_norm": 2.046875, + "learning_rate": 3.8913154807717255e-05, + "loss": 0.4099, + "step": 7156 + }, + { + "epoch": 0.3143023370692778, + "grad_norm": 1.9609375, + "learning_rate": 3.8907406356910054e-05, + "loss": 0.4196, + "step": 7158 + }, + { + "epoch": 0.31439015554848126, + "grad_norm": 2.078125, + "learning_rate": 3.890165684107035e-05, + "loss": 0.4157, + "step": 7160 + }, + { + "epoch": 0.3144779740276848, + "grad_norm": 2.15625, + "learning_rate": 3.8895906260638426e-05, + "loss": 0.4065, + "step": 7162 + }, + { + "epoch": 0.31456579250688826, + "grad_norm": 2.328125, + "learning_rate": 3.8890154616054676e-05, + "loss": 0.4396, + "step": 7164 + }, + { + "epoch": 0.31465361098609174, + "grad_norm": 1.9921875, + "learning_rate": 3.888440190775955e-05, + "loss": 0.4087, + "step": 7166 + }, + { + "epoch": 0.3147414294652952, + "grad_norm": 1.859375, + "learning_rate": 3.8878648136193596e-05, + "loss": 0.4512, + "step": 7168 + }, + { + "epoch": 0.31482924794449874, + "grad_norm": 1.921875, + "learning_rate": 3.887289330179744e-05, + "loss": 0.4095, + "step": 7170 + }, + { + "epoch": 0.3149170664237022, + "grad_norm": 1.96875, + "learning_rate": 3.8867137405011786e-05, + "loss": 0.4463, + "step": 7172 + }, + { + "epoch": 0.3150048849029057, + "grad_norm": 2.15625, + "learning_rate": 3.886138044627744e-05, + "loss": 0.4024, + "step": 7174 + }, + { + "epoch": 0.3150927033821092, + "grad_norm": 2.1875, + "learning_rate": 3.885562242603525e-05, + "loss": 0.4472, + "step": 7176 + }, + { + "epoch": 0.3151805218613127, + "grad_norm": 1.9765625, + "learning_rate": 3.8849863344726175e-05, + "loss": 0.4177, + "step": 7178 + }, + { + "epoch": 0.31526834034051615, + "grad_norm": 2.0625, + "learning_rate": 3.884410320279124e-05, + "loss": 0.4592, + "step": 7180 + }, + { + "epoch": 0.3153561588197196, + "grad_norm": 2.140625, + "learning_rate": 3.883834200067157e-05, + "loss": 0.4344, + "step": 7182 + }, + { + "epoch": 0.31544397729892315, + "grad_norm": 2.0625, + "learning_rate": 3.883257973880834e-05, + "loss": 0.4343, + "step": 7184 + }, + { + "epoch": 0.3155317957781266, + "grad_norm": 1.9453125, + "learning_rate": 3.8826816417642845e-05, + "loss": 0.4252, + "step": 7186 + }, + { + "epoch": 0.3156196142573301, + "grad_norm": 2.03125, + "learning_rate": 3.882105203761644e-05, + "loss": 0.4246, + "step": 7188 + }, + { + "epoch": 0.31570743273653357, + "grad_norm": 2.09375, + "learning_rate": 3.8815286599170544e-05, + "loss": 0.4142, + "step": 7190 + }, + { + "epoch": 0.3157952512157371, + "grad_norm": 2.28125, + "learning_rate": 3.88095201027467e-05, + "loss": 0.4259, + "step": 7192 + }, + { + "epoch": 0.31588306969494057, + "grad_norm": 2.046875, + "learning_rate": 3.880375254878649e-05, + "loss": 0.4249, + "step": 7194 + }, + { + "epoch": 0.31597088817414404, + "grad_norm": 2.5, + "learning_rate": 3.87979839377316e-05, + "loss": 0.4373, + "step": 7196 + }, + { + "epoch": 0.3160587066533475, + "grad_norm": 2.03125, + "learning_rate": 3.87922142700238e-05, + "loss": 0.4233, + "step": 7198 + }, + { + "epoch": 0.31614652513255104, + "grad_norm": 2.359375, + "learning_rate": 3.878644354610492e-05, + "loss": 0.4137, + "step": 7200 + }, + { + "epoch": 0.3162343436117545, + "grad_norm": 2.140625, + "learning_rate": 3.8780671766416885e-05, + "loss": 0.4349, + "step": 7202 + }, + { + "epoch": 0.316322162090958, + "grad_norm": 2.765625, + "learning_rate": 3.8774898931401706e-05, + "loss": 0.425, + "step": 7204 + }, + { + "epoch": 0.31640998057016145, + "grad_norm": 2.328125, + "learning_rate": 3.8769125041501466e-05, + "loss": 0.466, + "step": 7206 + }, + { + "epoch": 0.316497799049365, + "grad_norm": 1.9453125, + "learning_rate": 3.876335009715833e-05, + "loss": 0.4166, + "step": 7208 + }, + { + "epoch": 0.31658561752856845, + "grad_norm": 2.140625, + "learning_rate": 3.8757574098814544e-05, + "loss": 0.4177, + "step": 7210 + }, + { + "epoch": 0.3166734360077719, + "grad_norm": 2.65625, + "learning_rate": 3.875179704691243e-05, + "loss": 0.4321, + "step": 7212 + }, + { + "epoch": 0.3167612544869754, + "grad_norm": 2.46875, + "learning_rate": 3.874601894189441e-05, + "loss": 0.4159, + "step": 7214 + }, + { + "epoch": 0.3168490729661789, + "grad_norm": 2.625, + "learning_rate": 3.8740239784202956e-05, + "loss": 0.4265, + "step": 7216 + }, + { + "epoch": 0.3169368914453824, + "grad_norm": 2.75, + "learning_rate": 3.873445957428065e-05, + "loss": 0.4415, + "step": 7218 + }, + { + "epoch": 0.31702470992458587, + "grad_norm": 2.140625, + "learning_rate": 3.872867831257014e-05, + "loss": 0.4174, + "step": 7220 + }, + { + "epoch": 0.31711252840378934, + "grad_norm": 2.375, + "learning_rate": 3.872289599951415e-05, + "loss": 0.4364, + "step": 7222 + }, + { + "epoch": 0.31720034688299287, + "grad_norm": 2.328125, + "learning_rate": 3.8717112635555494e-05, + "loss": 0.4321, + "step": 7224 + }, + { + "epoch": 0.31728816536219634, + "grad_norm": 2.109375, + "learning_rate": 3.8711328221137066e-05, + "loss": 0.4507, + "step": 7226 + }, + { + "epoch": 0.3173759838413998, + "grad_norm": 2.078125, + "learning_rate": 3.870554275670184e-05, + "loss": 0.4347, + "step": 7228 + }, + { + "epoch": 0.31746380232060334, + "grad_norm": 1.9765625, + "learning_rate": 3.8699756242692854e-05, + "loss": 0.3905, + "step": 7230 + }, + { + "epoch": 0.3175516207998068, + "grad_norm": 2.046875, + "learning_rate": 3.869396867955326e-05, + "loss": 0.4089, + "step": 7232 + }, + { + "epoch": 0.3176394392790103, + "grad_norm": 2.109375, + "learning_rate": 3.868818006772626e-05, + "loss": 0.4114, + "step": 7234 + }, + { + "epoch": 0.31772725775821375, + "grad_norm": 2.203125, + "learning_rate": 3.8682390407655145e-05, + "loss": 0.39, + "step": 7236 + }, + { + "epoch": 0.3178150762374173, + "grad_norm": 2.078125, + "learning_rate": 3.86765996997833e-05, + "loss": 0.4383, + "step": 7238 + }, + { + "epoch": 0.31790289471662075, + "grad_norm": 2.046875, + "learning_rate": 3.867080794455416e-05, + "loss": 0.415, + "step": 7240 + }, + { + "epoch": 0.3179907131958242, + "grad_norm": 2.109375, + "learning_rate": 3.866501514241129e-05, + "loss": 0.4171, + "step": 7242 + }, + { + "epoch": 0.3180785316750277, + "grad_norm": 2.15625, + "learning_rate": 3.8659221293798265e-05, + "loss": 0.4294, + "step": 7244 + }, + { + "epoch": 0.3181663501542312, + "grad_norm": 1.984375, + "learning_rate": 3.865342639915881e-05, + "loss": 0.4058, + "step": 7246 + }, + { + "epoch": 0.3182541686334347, + "grad_norm": 2.078125, + "learning_rate": 3.864763045893668e-05, + "loss": 0.4064, + "step": 7248 + }, + { + "epoch": 0.31834198711263817, + "grad_norm": 2.15625, + "learning_rate": 3.8641833473575745e-05, + "loss": 0.4323, + "step": 7250 + }, + { + "epoch": 0.31842980559184164, + "grad_norm": 2.359375, + "learning_rate": 3.863603544351993e-05, + "loss": 0.4051, + "step": 7252 + }, + { + "epoch": 0.31851762407104517, + "grad_norm": 2.484375, + "learning_rate": 3.863023636921326e-05, + "loss": 0.4387, + "step": 7254 + }, + { + "epoch": 0.31860544255024864, + "grad_norm": 2.421875, + "learning_rate": 3.862443625109981e-05, + "loss": 0.4106, + "step": 7256 + }, + { + "epoch": 0.3186932610294521, + "grad_norm": 2.578125, + "learning_rate": 3.861863508962377e-05, + "loss": 0.4498, + "step": 7258 + }, + { + "epoch": 0.3187810795086556, + "grad_norm": 2.8125, + "learning_rate": 3.861283288522939e-05, + "loss": 0.4025, + "step": 7260 + }, + { + "epoch": 0.3188688979878591, + "grad_norm": 2.390625, + "learning_rate": 3.8607029638361005e-05, + "loss": 0.4204, + "step": 7262 + }, + { + "epoch": 0.3189567164670626, + "grad_norm": 2.34375, + "learning_rate": 3.860122534946302e-05, + "loss": 0.4304, + "step": 7264 + }, + { + "epoch": 0.31904453494626606, + "grad_norm": 1.96875, + "learning_rate": 3.859542001897994e-05, + "loss": 0.4281, + "step": 7266 + }, + { + "epoch": 0.31913235342546953, + "grad_norm": 1.953125, + "learning_rate": 3.858961364735635e-05, + "loss": 0.4151, + "step": 7268 + }, + { + "epoch": 0.31922017190467306, + "grad_norm": 2.0625, + "learning_rate": 3.858380623503688e-05, + "loss": 0.421, + "step": 7270 + }, + { + "epoch": 0.3193079903838765, + "grad_norm": 1.890625, + "learning_rate": 3.857799778246627e-05, + "loss": 0.429, + "step": 7272 + }, + { + "epoch": 0.31939580886308, + "grad_norm": 2.015625, + "learning_rate": 3.857218829008934e-05, + "loss": 0.442, + "step": 7274 + }, + { + "epoch": 0.3194836273422835, + "grad_norm": 2.0625, + "learning_rate": 3.856637775835097e-05, + "loss": 0.4132, + "step": 7276 + }, + { + "epoch": 0.319571445821487, + "grad_norm": 2.203125, + "learning_rate": 3.856056618769614e-05, + "loss": 0.4476, + "step": 7278 + }, + { + "epoch": 0.31965926430069047, + "grad_norm": 1.9765625, + "learning_rate": 3.855475357856991e-05, + "loss": 0.4238, + "step": 7280 + }, + { + "epoch": 0.31974708277989394, + "grad_norm": 2.046875, + "learning_rate": 3.854893993141739e-05, + "loss": 0.3769, + "step": 7282 + }, + { + "epoch": 0.31983490125909747, + "grad_norm": 2.1875, + "learning_rate": 3.854312524668381e-05, + "loss": 0.4558, + "step": 7284 + }, + { + "epoch": 0.31992271973830094, + "grad_norm": 2.359375, + "learning_rate": 3.853730952481446e-05, + "loss": 0.4316, + "step": 7286 + }, + { + "epoch": 0.3200105382175044, + "grad_norm": 2.125, + "learning_rate": 3.853149276625468e-05, + "loss": 0.4224, + "step": 7288 + }, + { + "epoch": 0.3200983566967079, + "grad_norm": 2.296875, + "learning_rate": 3.8525674971449956e-05, + "loss": 0.4253, + "step": 7290 + }, + { + "epoch": 0.3201861751759114, + "grad_norm": 2.21875, + "learning_rate": 3.85198561408458e-05, + "loss": 0.435, + "step": 7292 + }, + { + "epoch": 0.3202739936551149, + "grad_norm": 2.296875, + "learning_rate": 3.851403627488781e-05, + "loss": 0.4294, + "step": 7294 + }, + { + "epoch": 0.32036181213431836, + "grad_norm": 2.640625, + "learning_rate": 3.8508215374021695e-05, + "loss": 0.4705, + "step": 7296 + }, + { + "epoch": 0.32044963061352183, + "grad_norm": 2.140625, + "learning_rate": 3.8502393438693203e-05, + "loss": 0.4311, + "step": 7298 + }, + { + "epoch": 0.32053744909272536, + "grad_norm": 2.109375, + "learning_rate": 3.8496570469348184e-05, + "loss": 0.436, + "step": 7300 + }, + { + "epoch": 0.32062526757192883, + "grad_norm": 2.0625, + "learning_rate": 3.8490746466432556e-05, + "loss": 0.4277, + "step": 7302 + }, + { + "epoch": 0.3207130860511323, + "grad_norm": 2.25, + "learning_rate": 3.848492143039234e-05, + "loss": 0.4082, + "step": 7304 + }, + { + "epoch": 0.3208009045303358, + "grad_norm": 2.0625, + "learning_rate": 3.8479095361673604e-05, + "loss": 0.4182, + "step": 7306 + }, + { + "epoch": 0.3208887230095393, + "grad_norm": 1.9609375, + "learning_rate": 3.847326826072252e-05, + "loss": 0.4079, + "step": 7308 + }, + { + "epoch": 0.3209765414887428, + "grad_norm": 2.046875, + "learning_rate": 3.846744012798531e-05, + "loss": 0.4158, + "step": 7310 + }, + { + "epoch": 0.32106435996794624, + "grad_norm": 2.15625, + "learning_rate": 3.8461610963908314e-05, + "loss": 0.4115, + "step": 7312 + }, + { + "epoch": 0.3211521784471497, + "grad_norm": 1.875, + "learning_rate": 3.845578076893793e-05, + "loss": 0.4487, + "step": 7314 + }, + { + "epoch": 0.32123999692635324, + "grad_norm": 2.171875, + "learning_rate": 3.8449949543520625e-05, + "loss": 0.4127, + "step": 7316 + }, + { + "epoch": 0.3213278154055567, + "grad_norm": 2.125, + "learning_rate": 3.8444117288102956e-05, + "loss": 0.434, + "step": 7318 + }, + { + "epoch": 0.3214156338847602, + "grad_norm": 1.9765625, + "learning_rate": 3.8438284003131566e-05, + "loss": 0.4443, + "step": 7320 + }, + { + "epoch": 0.32150345236396366, + "grad_norm": 2.125, + "learning_rate": 3.843244968905316e-05, + "loss": 0.4392, + "step": 7322 + }, + { + "epoch": 0.3215912708431672, + "grad_norm": 2.078125, + "learning_rate": 3.842661434631454e-05, + "loss": 0.434, + "step": 7324 + }, + { + "epoch": 0.32167908932237066, + "grad_norm": 2.109375, + "learning_rate": 3.842077797536258e-05, + "loss": 0.4356, + "step": 7326 + }, + { + "epoch": 0.32176690780157413, + "grad_norm": 2.5, + "learning_rate": 3.8414940576644215e-05, + "loss": 0.4315, + "step": 7328 + }, + { + "epoch": 0.32185472628077766, + "grad_norm": 2.3125, + "learning_rate": 3.840910215060649e-05, + "loss": 0.4577, + "step": 7330 + }, + { + "epoch": 0.32194254475998113, + "grad_norm": 3.03125, + "learning_rate": 3.8403262697696514e-05, + "loss": 0.4243, + "step": 7332 + }, + { + "epoch": 0.3220303632391846, + "grad_norm": 2.71875, + "learning_rate": 3.8397422218361454e-05, + "loss": 0.4241, + "step": 7334 + }, + { + "epoch": 0.3221181817183881, + "grad_norm": 3.078125, + "learning_rate": 3.8391580713048604e-05, + "loss": 0.3999, + "step": 7336 + }, + { + "epoch": 0.3222060001975916, + "grad_norm": 3.734375, + "learning_rate": 3.8385738182205276e-05, + "loss": 0.4357, + "step": 7338 + }, + { + "epoch": 0.3222938186767951, + "grad_norm": 2.34375, + "learning_rate": 3.837989462627893e-05, + "loss": 0.4123, + "step": 7340 + }, + { + "epoch": 0.32238163715599855, + "grad_norm": 2.09375, + "learning_rate": 3.837405004571703e-05, + "loss": 0.4154, + "step": 7342 + }, + { + "epoch": 0.322469455635202, + "grad_norm": 2.234375, + "learning_rate": 3.836820444096718e-05, + "loss": 0.3876, + "step": 7344 + }, + { + "epoch": 0.32255727411440555, + "grad_norm": 2.671875, + "learning_rate": 3.8362357812477025e-05, + "loss": 0.4418, + "step": 7346 + }, + { + "epoch": 0.322645092593609, + "grad_norm": 2.671875, + "learning_rate": 3.8356510160694305e-05, + "loss": 0.4353, + "step": 7348 + }, + { + "epoch": 0.3227329110728125, + "grad_norm": 2.65625, + "learning_rate": 3.835066148606683e-05, + "loss": 0.4217, + "step": 7350 + }, + { + "epoch": 0.32282072955201596, + "grad_norm": 2.640625, + "learning_rate": 3.834481178904251e-05, + "loss": 0.4379, + "step": 7352 + }, + { + "epoch": 0.3229085480312195, + "grad_norm": 2.1875, + "learning_rate": 3.83389610700693e-05, + "loss": 0.4549, + "step": 7354 + }, + { + "epoch": 0.32299636651042296, + "grad_norm": 2.046875, + "learning_rate": 3.833310932959525e-05, + "loss": 0.4456, + "step": 7356 + }, + { + "epoch": 0.32308418498962643, + "grad_norm": 2.46875, + "learning_rate": 3.8327256568068495e-05, + "loss": 0.4234, + "step": 7358 + }, + { + "epoch": 0.3231720034688299, + "grad_norm": 2.453125, + "learning_rate": 3.832140278593724e-05, + "loss": 0.42, + "step": 7360 + }, + { + "epoch": 0.32325982194803343, + "grad_norm": 2.796875, + "learning_rate": 3.8315547983649764e-05, + "loss": 0.4267, + "step": 7362 + }, + { + "epoch": 0.3233476404272369, + "grad_norm": 2.359375, + "learning_rate": 3.830969216165443e-05, + "loss": 0.4332, + "step": 7364 + }, + { + "epoch": 0.3234354589064404, + "grad_norm": 2.15625, + "learning_rate": 3.8303835320399685e-05, + "loss": 0.4232, + "step": 7366 + }, + { + "epoch": 0.32352327738564385, + "grad_norm": 2.28125, + "learning_rate": 3.829797746033404e-05, + "loss": 0.4553, + "step": 7368 + }, + { + "epoch": 0.3236110958648474, + "grad_norm": 2.4375, + "learning_rate": 3.829211858190608e-05, + "loss": 0.4008, + "step": 7370 + }, + { + "epoch": 0.32369891434405085, + "grad_norm": 2.21875, + "learning_rate": 3.8286258685564505e-05, + "loss": 0.4607, + "step": 7372 + }, + { + "epoch": 0.3237867328232543, + "grad_norm": 2.515625, + "learning_rate": 3.828039777175805e-05, + "loss": 0.4038, + "step": 7374 + }, + { + "epoch": 0.32387455130245785, + "grad_norm": 2.671875, + "learning_rate": 3.8274535840935553e-05, + "loss": 0.4249, + "step": 7376 + }, + { + "epoch": 0.3239623697816613, + "grad_norm": 2.140625, + "learning_rate": 3.8268672893545924e-05, + "loss": 0.4222, + "step": 7378 + }, + { + "epoch": 0.3240501882608648, + "grad_norm": 2.140625, + "learning_rate": 3.826280893003814e-05, + "loss": 0.4074, + "step": 7380 + }, + { + "epoch": 0.32413800674006826, + "grad_norm": 2.03125, + "learning_rate": 3.8256943950861264e-05, + "loss": 0.4298, + "step": 7382 + }, + { + "epoch": 0.3242258252192718, + "grad_norm": 2.203125, + "learning_rate": 3.825107795646444e-05, + "loss": 0.4323, + "step": 7384 + }, + { + "epoch": 0.32431364369847526, + "grad_norm": 2.265625, + "learning_rate": 3.82452109472969e-05, + "loss": 0.4272, + "step": 7386 + }, + { + "epoch": 0.32440146217767873, + "grad_norm": 2.125, + "learning_rate": 3.823934292380793e-05, + "loss": 0.4031, + "step": 7388 + }, + { + "epoch": 0.3244892806568822, + "grad_norm": 2.203125, + "learning_rate": 3.82334738864469e-05, + "loss": 0.4268, + "step": 7390 + }, + { + "epoch": 0.32457709913608573, + "grad_norm": 2.203125, + "learning_rate": 3.822760383566327e-05, + "loss": 0.413, + "step": 7392 + }, + { + "epoch": 0.3246649176152892, + "grad_norm": 2.0625, + "learning_rate": 3.822173277190657e-05, + "loss": 0.4249, + "step": 7394 + }, + { + "epoch": 0.3247527360944927, + "grad_norm": 2.109375, + "learning_rate": 3.8215860695626396e-05, + "loss": 0.4091, + "step": 7396 + }, + { + "epoch": 0.32484055457369615, + "grad_norm": 1.9375, + "learning_rate": 3.8209987607272444e-05, + "loss": 0.4415, + "step": 7398 + }, + { + "epoch": 0.3249283730528997, + "grad_norm": 2.34375, + "learning_rate": 3.820411350729448e-05, + "loss": 0.4045, + "step": 7400 + }, + { + "epoch": 0.32501619153210315, + "grad_norm": 2.046875, + "learning_rate": 3.819823839614234e-05, + "loss": 0.4163, + "step": 7402 + }, + { + "epoch": 0.3251040100113066, + "grad_norm": 2.375, + "learning_rate": 3.8192362274265934e-05, + "loss": 0.3869, + "step": 7404 + }, + { + "epoch": 0.3251918284905101, + "grad_norm": 2.1875, + "learning_rate": 3.8186485142115266e-05, + "loss": 0.4192, + "step": 7406 + }, + { + "epoch": 0.3252796469697136, + "grad_norm": 1.9609375, + "learning_rate": 3.81806070001404e-05, + "loss": 0.4053, + "step": 7408 + }, + { + "epoch": 0.3253674654489171, + "grad_norm": 1.96875, + "learning_rate": 3.817472784879149e-05, + "loss": 0.4142, + "step": 7410 + }, + { + "epoch": 0.32545528392812056, + "grad_norm": 1.984375, + "learning_rate": 3.816884768851877e-05, + "loss": 0.4375, + "step": 7412 + }, + { + "epoch": 0.32554310240732404, + "grad_norm": 1.921875, + "learning_rate": 3.816296651977254e-05, + "loss": 0.4372, + "step": 7414 + }, + { + "epoch": 0.32563092088652756, + "grad_norm": 2.171875, + "learning_rate": 3.815708434300317e-05, + "loss": 0.4311, + "step": 7416 + }, + { + "epoch": 0.32571873936573104, + "grad_norm": 2.375, + "learning_rate": 3.815120115866113e-05, + "loss": 0.4264, + "step": 7418 + }, + { + "epoch": 0.3258065578449345, + "grad_norm": 2.171875, + "learning_rate": 3.814531696719695e-05, + "loss": 0.4248, + "step": 7420 + }, + { + "epoch": 0.325894376324138, + "grad_norm": 2.359375, + "learning_rate": 3.813943176906125e-05, + "loss": 0.4473, + "step": 7422 + }, + { + "epoch": 0.3259821948033415, + "grad_norm": 2.3125, + "learning_rate": 3.813354556470471e-05, + "loss": 0.4678, + "step": 7424 + }, + { + "epoch": 0.326070013282545, + "grad_norm": 2.46875, + "learning_rate": 3.812765835457811e-05, + "loss": 0.432, + "step": 7426 + }, + { + "epoch": 0.32615783176174845, + "grad_norm": 2.0625, + "learning_rate": 3.812177013913228e-05, + "loss": 0.4186, + "step": 7428 + }, + { + "epoch": 0.326245650240952, + "grad_norm": 2.03125, + "learning_rate": 3.8115880918818147e-05, + "loss": 0.4128, + "step": 7430 + }, + { + "epoch": 0.32633346872015545, + "grad_norm": 2.109375, + "learning_rate": 3.810999069408671e-05, + "loss": 0.4101, + "step": 7432 + }, + { + "epoch": 0.3264212871993589, + "grad_norm": 2.03125, + "learning_rate": 3.810409946538904e-05, + "loss": 0.3973, + "step": 7434 + }, + { + "epoch": 0.3265091056785624, + "grad_norm": 1.921875, + "learning_rate": 3.80982072331763e-05, + "loss": 0.417, + "step": 7436 + }, + { + "epoch": 0.3265969241577659, + "grad_norm": 2.015625, + "learning_rate": 3.809231399789971e-05, + "loss": 0.4263, + "step": 7438 + }, + { + "epoch": 0.3266847426369694, + "grad_norm": 2.203125, + "learning_rate": 3.808641976001057e-05, + "loss": 0.4417, + "step": 7440 + }, + { + "epoch": 0.32677256111617287, + "grad_norm": 2.234375, + "learning_rate": 3.808052451996027e-05, + "loss": 0.4295, + "step": 7442 + }, + { + "epoch": 0.32686037959537634, + "grad_norm": 2.28125, + "learning_rate": 3.8074628278200266e-05, + "loss": 0.405, + "step": 7444 + }, + { + "epoch": 0.32694819807457987, + "grad_norm": 2.265625, + "learning_rate": 3.806873103518209e-05, + "loss": 0.4161, + "step": 7446 + }, + { + "epoch": 0.32703601655378334, + "grad_norm": 2.125, + "learning_rate": 3.806283279135736e-05, + "loss": 0.4151, + "step": 7448 + }, + { + "epoch": 0.3271238350329868, + "grad_norm": 2.125, + "learning_rate": 3.805693354717777e-05, + "loss": 0.4141, + "step": 7450 + }, + { + "epoch": 0.3272116535121903, + "grad_norm": 2.140625, + "learning_rate": 3.805103330309508e-05, + "loss": 0.4028, + "step": 7452 + }, + { + "epoch": 0.3272994719913938, + "grad_norm": 2.203125, + "learning_rate": 3.804513205956113e-05, + "loss": 0.4393, + "step": 7454 + }, + { + "epoch": 0.3273872904705973, + "grad_norm": 2.171875, + "learning_rate": 3.8039229817027834e-05, + "loss": 0.4222, + "step": 7456 + }, + { + "epoch": 0.32747510894980075, + "grad_norm": 2.15625, + "learning_rate": 3.803332657594719e-05, + "loss": 0.4171, + "step": 7458 + }, + { + "epoch": 0.3275629274290042, + "grad_norm": 2.03125, + "learning_rate": 3.8027422336771275e-05, + "loss": 0.3707, + "step": 7460 + }, + { + "epoch": 0.32765074590820775, + "grad_norm": 2.03125, + "learning_rate": 3.802151709995224e-05, + "loss": 0.39, + "step": 7462 + }, + { + "epoch": 0.3277385643874112, + "grad_norm": 2.5, + "learning_rate": 3.80156108659423e-05, + "loss": 0.44, + "step": 7464 + }, + { + "epoch": 0.3278263828666147, + "grad_norm": 2.15625, + "learning_rate": 3.800970363519376e-05, + "loss": 0.3907, + "step": 7466 + }, + { + "epoch": 0.32791420134581817, + "grad_norm": 2.421875, + "learning_rate": 3.8003795408159004e-05, + "loss": 0.412, + "step": 7468 + }, + { + "epoch": 0.3280020198250217, + "grad_norm": 2.109375, + "learning_rate": 3.799788618529046e-05, + "loss": 0.4174, + "step": 7470 + }, + { + "epoch": 0.32808983830422517, + "grad_norm": 2.25, + "learning_rate": 3.7991975967040694e-05, + "loss": 0.4239, + "step": 7472 + }, + { + "epoch": 0.32817765678342864, + "grad_norm": 2.453125, + "learning_rate": 3.798606475386229e-05, + "loss": 0.4289, + "step": 7474 + }, + { + "epoch": 0.32826547526263217, + "grad_norm": 2.234375, + "learning_rate": 3.798015254620794e-05, + "loss": 0.4332, + "step": 7476 + }, + { + "epoch": 0.32835329374183564, + "grad_norm": 1.8984375, + "learning_rate": 3.797423934453038e-05, + "loss": 0.4123, + "step": 7478 + }, + { + "epoch": 0.3284411122210391, + "grad_norm": 2.0625, + "learning_rate": 3.796832514928247e-05, + "loss": 0.4131, + "step": 7480 + }, + { + "epoch": 0.3285289307002426, + "grad_norm": 2.0625, + "learning_rate": 3.796240996091711e-05, + "loss": 0.4259, + "step": 7482 + }, + { + "epoch": 0.3286167491794461, + "grad_norm": 2.34375, + "learning_rate": 3.795649377988729e-05, + "loss": 0.3994, + "step": 7484 + }, + { + "epoch": 0.3287045676586496, + "grad_norm": 1.8046875, + "learning_rate": 3.795057660664607e-05, + "loss": 0.4358, + "step": 7486 + }, + { + "epoch": 0.32879238613785305, + "grad_norm": 2.03125, + "learning_rate": 3.794465844164659e-05, + "loss": 0.4326, + "step": 7488 + }, + { + "epoch": 0.3288802046170565, + "grad_norm": 2.234375, + "learning_rate": 3.793873928534206e-05, + "loss": 0.4255, + "step": 7490 + }, + { + "epoch": 0.32896802309626005, + "grad_norm": 1.9765625, + "learning_rate": 3.793281913818578e-05, + "loss": 0.4384, + "step": 7492 + }, + { + "epoch": 0.3290558415754635, + "grad_norm": 1.90625, + "learning_rate": 3.7926898000631106e-05, + "loss": 0.4206, + "step": 7494 + }, + { + "epoch": 0.329143660054667, + "grad_norm": 2.21875, + "learning_rate": 3.792097587313148e-05, + "loss": 0.4219, + "step": 7496 + }, + { + "epoch": 0.32923147853387047, + "grad_norm": 2.140625, + "learning_rate": 3.791505275614043e-05, + "loss": 0.4579, + "step": 7498 + }, + { + "epoch": 0.329319297013074, + "grad_norm": 2.0, + "learning_rate": 3.790912865011154e-05, + "loss": 0.4063, + "step": 7500 + }, + { + "epoch": 0.32940711549227747, + "grad_norm": 2.015625, + "learning_rate": 3.790320355549849e-05, + "loss": 0.4278, + "step": 7502 + }, + { + "epoch": 0.32949493397148094, + "grad_norm": 2.03125, + "learning_rate": 3.789727747275502e-05, + "loss": 0.4088, + "step": 7504 + }, + { + "epoch": 0.3295827524506844, + "grad_norm": 2.28125, + "learning_rate": 3.7891350402334935e-05, + "loss": 0.3918, + "step": 7506 + }, + { + "epoch": 0.32967057092988794, + "grad_norm": 2.0, + "learning_rate": 3.788542234469216e-05, + "loss": 0.4272, + "step": 7508 + }, + { + "epoch": 0.3297583894090914, + "grad_norm": 2.53125, + "learning_rate": 3.7879493300280643e-05, + "loss": 0.4163, + "step": 7510 + }, + { + "epoch": 0.3298462078882949, + "grad_norm": 2.3125, + "learning_rate": 3.7873563269554454e-05, + "loss": 0.436, + "step": 7512 + }, + { + "epoch": 0.32993402636749836, + "grad_norm": 2.53125, + "learning_rate": 3.78676322529677e-05, + "loss": 0.4093, + "step": 7514 + }, + { + "epoch": 0.3300218448467019, + "grad_norm": 2.625, + "learning_rate": 3.786170025097457e-05, + "loss": 0.4155, + "step": 7516 + }, + { + "epoch": 0.33010966332590536, + "grad_norm": 2.046875, + "learning_rate": 3.7855767264029366e-05, + "loss": 0.401, + "step": 7518 + }, + { + "epoch": 0.33019748180510883, + "grad_norm": 2.296875, + "learning_rate": 3.784983329258642e-05, + "loss": 0.4521, + "step": 7520 + }, + { + "epoch": 0.3302853002843123, + "grad_norm": 2.15625, + "learning_rate": 3.784389833710016e-05, + "loss": 0.3999, + "step": 7522 + }, + { + "epoch": 0.3303731187635158, + "grad_norm": 1.9375, + "learning_rate": 3.783796239802509e-05, + "loss": 0.44, + "step": 7524 + }, + { + "epoch": 0.3304609372427193, + "grad_norm": 1.9609375, + "learning_rate": 3.783202547581577e-05, + "loss": 0.4255, + "step": 7526 + }, + { + "epoch": 0.33054875572192277, + "grad_norm": 2.296875, + "learning_rate": 3.782608757092687e-05, + "loss": 0.4362, + "step": 7528 + }, + { + "epoch": 0.3306365742011263, + "grad_norm": 2.03125, + "learning_rate": 3.782014868381312e-05, + "loss": 0.4049, + "step": 7530 + }, + { + "epoch": 0.33072439268032977, + "grad_norm": 2.171875, + "learning_rate": 3.781420881492929e-05, + "loss": 0.4214, + "step": 7532 + }, + { + "epoch": 0.33081221115953324, + "grad_norm": 2.109375, + "learning_rate": 3.780826796473029e-05, + "loss": 0.4206, + "step": 7534 + }, + { + "epoch": 0.3309000296387367, + "grad_norm": 2.140625, + "learning_rate": 3.780232613367105e-05, + "loss": 0.4223, + "step": 7536 + }, + { + "epoch": 0.33098784811794024, + "grad_norm": 1.8359375, + "learning_rate": 3.779638332220662e-05, + "loss": 0.3965, + "step": 7538 + }, + { + "epoch": 0.3310756665971437, + "grad_norm": 2.0, + "learning_rate": 3.7790439530792075e-05, + "loss": 0.386, + "step": 7540 + }, + { + "epoch": 0.3311634850763472, + "grad_norm": 2.109375, + "learning_rate": 3.778449475988261e-05, + "loss": 0.4401, + "step": 7542 + }, + { + "epoch": 0.33125130355555066, + "grad_norm": 1.921875, + "learning_rate": 3.777854900993347e-05, + "loss": 0.4216, + "step": 7544 + }, + { + "epoch": 0.3313391220347542, + "grad_norm": 2.015625, + "learning_rate": 3.777260228139999e-05, + "loss": 0.3927, + "step": 7546 + }, + { + "epoch": 0.33142694051395766, + "grad_norm": 2.234375, + "learning_rate": 3.776665457473756e-05, + "loss": 0.4544, + "step": 7548 + }, + { + "epoch": 0.33151475899316113, + "grad_norm": 1.9375, + "learning_rate": 3.776070589040166e-05, + "loss": 0.401, + "step": 7550 + }, + { + "epoch": 0.3316025774723646, + "grad_norm": 2.09375, + "learning_rate": 3.775475622884785e-05, + "loss": 0.4037, + "step": 7552 + }, + { + "epoch": 0.33169039595156813, + "grad_norm": 2.078125, + "learning_rate": 3.774880559053175e-05, + "loss": 0.4327, + "step": 7554 + }, + { + "epoch": 0.3317782144307716, + "grad_norm": 2.0, + "learning_rate": 3.7742853975909056e-05, + "loss": 0.4202, + "step": 7556 + }, + { + "epoch": 0.3318660329099751, + "grad_norm": 2.03125, + "learning_rate": 3.773690138543555e-05, + "loss": 0.3931, + "step": 7558 + }, + { + "epoch": 0.33195385138917854, + "grad_norm": 2.09375, + "learning_rate": 3.773094781956709e-05, + "loss": 0.4, + "step": 7560 + }, + { + "epoch": 0.33204166986838207, + "grad_norm": 2.125, + "learning_rate": 3.772499327875959e-05, + "loss": 0.4021, + "step": 7562 + }, + { + "epoch": 0.33212948834758554, + "grad_norm": 2.4375, + "learning_rate": 3.771903776346905e-05, + "loss": 0.4401, + "step": 7564 + }, + { + "epoch": 0.332217306826789, + "grad_norm": 1.9140625, + "learning_rate": 3.771308127415155e-05, + "loss": 0.4201, + "step": 7566 + }, + { + "epoch": 0.3323051253059925, + "grad_norm": 2.15625, + "learning_rate": 3.770712381126325e-05, + "loss": 0.4519, + "step": 7568 + }, + { + "epoch": 0.332392943785196, + "grad_norm": 2.1875, + "learning_rate": 3.7701165375260344e-05, + "loss": 0.4011, + "step": 7570 + }, + { + "epoch": 0.3324807622643995, + "grad_norm": 2.171875, + "learning_rate": 3.7695205966599154e-05, + "loss": 0.3996, + "step": 7572 + }, + { + "epoch": 0.33256858074360296, + "grad_norm": 2.171875, + "learning_rate": 3.768924558573606e-05, + "loss": 0.406, + "step": 7574 + }, + { + "epoch": 0.33265639922280643, + "grad_norm": 2.09375, + "learning_rate": 3.768328423312749e-05, + "loss": 0.4221, + "step": 7576 + }, + { + "epoch": 0.33274421770200996, + "grad_norm": 2.03125, + "learning_rate": 3.767732190922997e-05, + "loss": 0.4178, + "step": 7578 + }, + { + "epoch": 0.33283203618121343, + "grad_norm": 2.09375, + "learning_rate": 3.767135861450011e-05, + "loss": 0.4026, + "step": 7580 + }, + { + "epoch": 0.3329198546604169, + "grad_norm": 1.921875, + "learning_rate": 3.7665394349394556e-05, + "loss": 0.4005, + "step": 7582 + }, + { + "epoch": 0.33300767313962043, + "grad_norm": 2.171875, + "learning_rate": 3.765942911437007e-05, + "loss": 0.4165, + "step": 7584 + }, + { + "epoch": 0.3330954916188239, + "grad_norm": 1.9140625, + "learning_rate": 3.7653462909883474e-05, + "loss": 0.3877, + "step": 7586 + }, + { + "epoch": 0.3331833100980274, + "grad_norm": 1.953125, + "learning_rate": 3.764749573639165e-05, + "loss": 0.4211, + "step": 7588 + }, + { + "epoch": 0.33327112857723085, + "grad_norm": 2.453125, + "learning_rate": 3.7641527594351577e-05, + "loss": 0.4552, + "step": 7590 + }, + { + "epoch": 0.3333589470564344, + "grad_norm": 2.203125, + "learning_rate": 3.763555848422028e-05, + "loss": 0.4462, + "step": 7592 + }, + { + "epoch": 0.33344676553563785, + "grad_norm": 2.0625, + "learning_rate": 3.762958840645489e-05, + "loss": 0.4193, + "step": 7594 + }, + { + "epoch": 0.3335345840148413, + "grad_norm": 2.0625, + "learning_rate": 3.7623617361512595e-05, + "loss": 0.4079, + "step": 7596 + }, + { + "epoch": 0.3336224024940448, + "grad_norm": 2.3125, + "learning_rate": 3.7617645349850655e-05, + "loss": 0.4191, + "step": 7598 + }, + { + "epoch": 0.3337102209732483, + "grad_norm": 2.109375, + "learning_rate": 3.761167237192641e-05, + "loss": 0.4295, + "step": 7600 + }, + { + "epoch": 0.3337980394524518, + "grad_norm": 2.203125, + "learning_rate": 3.7605698428197265e-05, + "loss": 0.4517, + "step": 7602 + }, + { + "epoch": 0.33388585793165526, + "grad_norm": 2.078125, + "learning_rate": 3.7599723519120725e-05, + "loss": 0.4201, + "step": 7604 + }, + { + "epoch": 0.33397367641085873, + "grad_norm": 1.90625, + "learning_rate": 3.759374764515433e-05, + "loss": 0.4051, + "step": 7606 + }, + { + "epoch": 0.33406149489006226, + "grad_norm": 2.21875, + "learning_rate": 3.7587770806755715e-05, + "loss": 0.4344, + "step": 7608 + }, + { + "epoch": 0.33414931336926573, + "grad_norm": 2.0625, + "learning_rate": 3.7581793004382603e-05, + "loss": 0.4551, + "step": 7610 + }, + { + "epoch": 0.3342371318484692, + "grad_norm": 2.296875, + "learning_rate": 3.757581423849277e-05, + "loss": 0.4337, + "step": 7612 + }, + { + "epoch": 0.3343249503276727, + "grad_norm": 2.140625, + "learning_rate": 3.7569834509544054e-05, + "loss": 0.4249, + "step": 7614 + }, + { + "epoch": 0.3344127688068762, + "grad_norm": 2.28125, + "learning_rate": 3.756385381799441e-05, + "loss": 0.4328, + "step": 7616 + }, + { + "epoch": 0.3345005872860797, + "grad_norm": 1.984375, + "learning_rate": 3.755787216430182e-05, + "loss": 0.3575, + "step": 7618 + }, + { + "epoch": 0.33458840576528315, + "grad_norm": 2.46875, + "learning_rate": 3.755188954892438e-05, + "loss": 0.4209, + "step": 7620 + }, + { + "epoch": 0.3346762242444866, + "grad_norm": 2.40625, + "learning_rate": 3.754590597232023e-05, + "loss": 0.4171, + "step": 7622 + }, + { + "epoch": 0.33476404272369015, + "grad_norm": 2.21875, + "learning_rate": 3.753992143494759e-05, + "loss": 0.4345, + "step": 7624 + }, + { + "epoch": 0.3348518612028936, + "grad_norm": 2.109375, + "learning_rate": 3.753393593726475e-05, + "loss": 0.4031, + "step": 7626 + }, + { + "epoch": 0.3349396796820971, + "grad_norm": 2.203125, + "learning_rate": 3.7527949479730104e-05, + "loss": 0.4153, + "step": 7628 + }, + { + "epoch": 0.3350274981613006, + "grad_norm": 2.078125, + "learning_rate": 3.752196206280207e-05, + "loss": 0.4425, + "step": 7630 + }, + { + "epoch": 0.3351153166405041, + "grad_norm": 1.9609375, + "learning_rate": 3.7515973686939184e-05, + "loss": 0.3951, + "step": 7632 + }, + { + "epoch": 0.33520313511970756, + "grad_norm": 1.984375, + "learning_rate": 3.7509984352600044e-05, + "loss": 0.4232, + "step": 7634 + }, + { + "epoch": 0.33529095359891103, + "grad_norm": 2.609375, + "learning_rate": 3.7503994060243296e-05, + "loss": 0.3971, + "step": 7636 + }, + { + "epoch": 0.33537877207811456, + "grad_norm": 2.234375, + "learning_rate": 3.7498002810327694e-05, + "loss": 0.451, + "step": 7638 + }, + { + "epoch": 0.33546659055731803, + "grad_norm": 2.234375, + "learning_rate": 3.749201060331203e-05, + "loss": 0.3823, + "step": 7640 + }, + { + "epoch": 0.3355544090365215, + "grad_norm": 2.328125, + "learning_rate": 3.74860174396552e-05, + "loss": 0.4138, + "step": 7642 + }, + { + "epoch": 0.335642227515725, + "grad_norm": 1.7890625, + "learning_rate": 3.7480023319816164e-05, + "loss": 0.4207, + "step": 7644 + }, + { + "epoch": 0.3357300459949285, + "grad_norm": 1.8828125, + "learning_rate": 3.747402824425395e-05, + "loss": 0.3933, + "step": 7646 + }, + { + "epoch": 0.335817864474132, + "grad_norm": 2.359375, + "learning_rate": 3.7468032213427666e-05, + "loss": 0.4109, + "step": 7648 + }, + { + "epoch": 0.33590568295333545, + "grad_norm": 1.8046875, + "learning_rate": 3.7462035227796484e-05, + "loss": 0.447, + "step": 7650 + }, + { + "epoch": 0.3359935014325389, + "grad_norm": 2.171875, + "learning_rate": 3.745603728781966e-05, + "loss": 0.3935, + "step": 7652 + }, + { + "epoch": 0.33608131991174245, + "grad_norm": 2.03125, + "learning_rate": 3.745003839395651e-05, + "loss": 0.4265, + "step": 7654 + }, + { + "epoch": 0.3361691383909459, + "grad_norm": 1.9453125, + "learning_rate": 3.744403854666643e-05, + "loss": 0.4304, + "step": 7656 + }, + { + "epoch": 0.3362569568701494, + "grad_norm": 1.953125, + "learning_rate": 3.743803774640891e-05, + "loss": 0.4293, + "step": 7658 + }, + { + "epoch": 0.33634477534935286, + "grad_norm": 2.125, + "learning_rate": 3.743203599364347e-05, + "loss": 0.3955, + "step": 7660 + }, + { + "epoch": 0.3364325938285564, + "grad_norm": 2.0625, + "learning_rate": 3.7426033288829725e-05, + "loss": 0.4054, + "step": 7662 + }, + { + "epoch": 0.33652041230775986, + "grad_norm": 2.21875, + "learning_rate": 3.7420029632427375e-05, + "loss": 0.4184, + "step": 7664 + }, + { + "epoch": 0.33660823078696334, + "grad_norm": 2.28125, + "learning_rate": 3.7414025024896195e-05, + "loss": 0.4242, + "step": 7666 + }, + { + "epoch": 0.3366960492661668, + "grad_norm": 2.34375, + "learning_rate": 3.740801946669599e-05, + "loss": 0.4264, + "step": 7668 + }, + { + "epoch": 0.33678386774537034, + "grad_norm": 2.34375, + "learning_rate": 3.740201295828668e-05, + "loss": 0.4062, + "step": 7670 + }, + { + "epoch": 0.3368716862245738, + "grad_norm": 2.453125, + "learning_rate": 3.739600550012824e-05, + "loss": 0.4151, + "step": 7672 + }, + { + "epoch": 0.3369595047037773, + "grad_norm": 2.046875, + "learning_rate": 3.738999709268074e-05, + "loss": 0.3959, + "step": 7674 + }, + { + "epoch": 0.33704732318298075, + "grad_norm": 2.15625, + "learning_rate": 3.738398773640428e-05, + "loss": 0.4408, + "step": 7676 + }, + { + "epoch": 0.3371351416621843, + "grad_norm": 1.921875, + "learning_rate": 3.737797743175907e-05, + "loss": 0.4428, + "step": 7678 + }, + { + "epoch": 0.33722296014138775, + "grad_norm": 2.078125, + "learning_rate": 3.7371966179205386e-05, + "loss": 0.4136, + "step": 7680 + }, + { + "epoch": 0.3373107786205912, + "grad_norm": 1.8359375, + "learning_rate": 3.7365953979203574e-05, + "loss": 0.3965, + "step": 7682 + }, + { + "epoch": 0.33739859709979475, + "grad_norm": 1.890625, + "learning_rate": 3.735994083221403e-05, + "loss": 0.4064, + "step": 7684 + }, + { + "epoch": 0.3374864155789982, + "grad_norm": 1.984375, + "learning_rate": 3.735392673869726e-05, + "loss": 0.4215, + "step": 7686 + }, + { + "epoch": 0.3375742340582017, + "grad_norm": 2.25, + "learning_rate": 3.734791169911382e-05, + "loss": 0.4152, + "step": 7688 + }, + { + "epoch": 0.33766205253740517, + "grad_norm": 2.328125, + "learning_rate": 3.734189571392434e-05, + "loss": 0.402, + "step": 7690 + }, + { + "epoch": 0.3377498710166087, + "grad_norm": 2.0, + "learning_rate": 3.7335878783589525e-05, + "loss": 0.4335, + "step": 7692 + }, + { + "epoch": 0.33783768949581217, + "grad_norm": 1.984375, + "learning_rate": 3.7329860908570156e-05, + "loss": 0.4104, + "step": 7694 + }, + { + "epoch": 0.33792550797501564, + "grad_norm": 1.921875, + "learning_rate": 3.732384208932709e-05, + "loss": 0.4066, + "step": 7696 + }, + { + "epoch": 0.3380133264542191, + "grad_norm": 1.9609375, + "learning_rate": 3.7317822326321236e-05, + "loss": 0.4157, + "step": 7698 + }, + { + "epoch": 0.33810114493342264, + "grad_norm": 2.296875, + "learning_rate": 3.7311801620013596e-05, + "loss": 0.4167, + "step": 7700 + }, + { + "epoch": 0.3381889634126261, + "grad_norm": 2.0, + "learning_rate": 3.730577997086524e-05, + "loss": 0.4178, + "step": 7702 + }, + { + "epoch": 0.3382767818918296, + "grad_norm": 2.1875, + "learning_rate": 3.72997573793373e-05, + "loss": 0.4049, + "step": 7704 + }, + { + "epoch": 0.33836460037103305, + "grad_norm": 2.21875, + "learning_rate": 3.729373384589099e-05, + "loss": 0.428, + "step": 7706 + }, + { + "epoch": 0.3384524188502366, + "grad_norm": 2.171875, + "learning_rate": 3.72877093709876e-05, + "loss": 0.4219, + "step": 7708 + }, + { + "epoch": 0.33854023732944005, + "grad_norm": 2.328125, + "learning_rate": 3.728168395508848e-05, + "loss": 0.415, + "step": 7710 + }, + { + "epoch": 0.3386280558086435, + "grad_norm": 2.109375, + "learning_rate": 3.7275657598655066e-05, + "loss": 0.4394, + "step": 7712 + }, + { + "epoch": 0.338715874287847, + "grad_norm": 2.171875, + "learning_rate": 3.726963030214884e-05, + "loss": 0.425, + "step": 7714 + }, + { + "epoch": 0.3388036927670505, + "grad_norm": 2.125, + "learning_rate": 3.726360206603138e-05, + "loss": 0.4231, + "step": 7716 + }, + { + "epoch": 0.338891511246254, + "grad_norm": 2.03125, + "learning_rate": 3.725757289076434e-05, + "loss": 0.4206, + "step": 7718 + }, + { + "epoch": 0.33897932972545747, + "grad_norm": 2.203125, + "learning_rate": 3.725154277680943e-05, + "loss": 0.4093, + "step": 7720 + }, + { + "epoch": 0.33906714820466094, + "grad_norm": 1.90625, + "learning_rate": 3.7245511724628444e-05, + "loss": 0.4074, + "step": 7722 + }, + { + "epoch": 0.33915496668386447, + "grad_norm": 2.15625, + "learning_rate": 3.7239479734683226e-05, + "loss": 0.4005, + "step": 7724 + }, + { + "epoch": 0.33924278516306794, + "grad_norm": 2.484375, + "learning_rate": 3.723344680743571e-05, + "loss": 0.436, + "step": 7726 + }, + { + "epoch": 0.3393306036422714, + "grad_norm": 1.96875, + "learning_rate": 3.722741294334792e-05, + "loss": 0.4087, + "step": 7728 + }, + { + "epoch": 0.33941842212147494, + "grad_norm": 2.265625, + "learning_rate": 3.722137814288191e-05, + "loss": 0.4382, + "step": 7730 + }, + { + "epoch": 0.3395062406006784, + "grad_norm": 2.140625, + "learning_rate": 3.721534240649983e-05, + "loss": 0.3787, + "step": 7732 + }, + { + "epoch": 0.3395940590798819, + "grad_norm": 2.28125, + "learning_rate": 3.720930573466389e-05, + "loss": 0.401, + "step": 7734 + }, + { + "epoch": 0.33968187755908535, + "grad_norm": 2.125, + "learning_rate": 3.720326812783641e-05, + "loss": 0.4417, + "step": 7736 + }, + { + "epoch": 0.3397696960382889, + "grad_norm": 2.3125, + "learning_rate": 3.719722958647972e-05, + "loss": 0.4282, + "step": 7738 + }, + { + "epoch": 0.33985751451749235, + "grad_norm": 2.125, + "learning_rate": 3.719119011105627e-05, + "loss": 0.4352, + "step": 7740 + }, + { + "epoch": 0.3399453329966958, + "grad_norm": 2.09375, + "learning_rate": 3.7185149702028555e-05, + "loss": 0.4154, + "step": 7742 + }, + { + "epoch": 0.3400331514758993, + "grad_norm": 2.234375, + "learning_rate": 3.717910835985916e-05, + "loss": 0.4041, + "step": 7744 + }, + { + "epoch": 0.3401209699551028, + "grad_norm": 2.09375, + "learning_rate": 3.717306608501072e-05, + "loss": 0.3961, + "step": 7746 + }, + { + "epoch": 0.3402087884343063, + "grad_norm": 2.140625, + "learning_rate": 3.716702287794597e-05, + "loss": 0.4197, + "step": 7748 + }, + { + "epoch": 0.34029660691350977, + "grad_norm": 2.046875, + "learning_rate": 3.7160978739127684e-05, + "loss": 0.4037, + "step": 7750 + }, + { + "epoch": 0.34038442539271324, + "grad_norm": 2.234375, + "learning_rate": 3.7154933669018724e-05, + "loss": 0.4529, + "step": 7752 + }, + { + "epoch": 0.34047224387191677, + "grad_norm": 2.03125, + "learning_rate": 3.714888766808204e-05, + "loss": 0.414, + "step": 7754 + }, + { + "epoch": 0.34056006235112024, + "grad_norm": 2.265625, + "learning_rate": 3.714284073678063e-05, + "loss": 0.4048, + "step": 7756 + }, + { + "epoch": 0.3406478808303237, + "grad_norm": 2.0625, + "learning_rate": 3.7136792875577556e-05, + "loss": 0.4016, + "step": 7758 + }, + { + "epoch": 0.3407356993095272, + "grad_norm": 2.46875, + "learning_rate": 3.713074408493598e-05, + "loss": 0.4241, + "step": 7760 + }, + { + "epoch": 0.3408235177887307, + "grad_norm": 2.1875, + "learning_rate": 3.7124694365319114e-05, + "loss": 0.4218, + "step": 7762 + }, + { + "epoch": 0.3409113362679342, + "grad_norm": 1.90625, + "learning_rate": 3.711864371719024e-05, + "loss": 0.4143, + "step": 7764 + }, + { + "epoch": 0.34099915474713766, + "grad_norm": 2.0, + "learning_rate": 3.711259214101273e-05, + "loss": 0.4018, + "step": 7766 + }, + { + "epoch": 0.34108697322634113, + "grad_norm": 2.171875, + "learning_rate": 3.710653963725001e-05, + "loss": 0.4196, + "step": 7768 + }, + { + "epoch": 0.34117479170554466, + "grad_norm": 2.109375, + "learning_rate": 3.710048620636558e-05, + "loss": 0.404, + "step": 7770 + }, + { + "epoch": 0.3412626101847481, + "grad_norm": 1.984375, + "learning_rate": 3.7094431848823026e-05, + "loss": 0.3881, + "step": 7772 + }, + { + "epoch": 0.3413504286639516, + "grad_norm": 2.34375, + "learning_rate": 3.708837656508597e-05, + "loss": 0.3954, + "step": 7774 + }, + { + "epoch": 0.34143824714315507, + "grad_norm": 2.1875, + "learning_rate": 3.708232035561815e-05, + "loss": 0.3994, + "step": 7776 + }, + { + "epoch": 0.3415260656223586, + "grad_norm": 2.21875, + "learning_rate": 3.707626322088333e-05, + "loss": 0.4353, + "step": 7778 + }, + { + "epoch": 0.34161388410156207, + "grad_norm": 2.15625, + "learning_rate": 3.707020516134539e-05, + "loss": 0.4419, + "step": 7780 + }, + { + "epoch": 0.34170170258076554, + "grad_norm": 2.203125, + "learning_rate": 3.706414617746823e-05, + "loss": 0.4037, + "step": 7782 + }, + { + "epoch": 0.34178952105996907, + "grad_norm": 1.9453125, + "learning_rate": 3.705808626971587e-05, + "loss": 0.4231, + "step": 7784 + }, + { + "epoch": 0.34187733953917254, + "grad_norm": 2.09375, + "learning_rate": 3.705202543855237e-05, + "loss": 0.4201, + "step": 7786 + }, + { + "epoch": 0.341965158018376, + "grad_norm": 2.125, + "learning_rate": 3.704596368444187e-05, + "loss": 0.4198, + "step": 7788 + }, + { + "epoch": 0.3420529764975795, + "grad_norm": 1.8671875, + "learning_rate": 3.70399010078486e-05, + "loss": 0.4011, + "step": 7790 + }, + { + "epoch": 0.342140794976783, + "grad_norm": 2.015625, + "learning_rate": 3.703383740923682e-05, + "loss": 0.4002, + "step": 7792 + }, + { + "epoch": 0.3422286134559865, + "grad_norm": 2.171875, + "learning_rate": 3.7027772889070875e-05, + "loss": 0.4169, + "step": 7794 + }, + { + "epoch": 0.34231643193518996, + "grad_norm": 1.859375, + "learning_rate": 3.702170744781521e-05, + "loss": 0.3867, + "step": 7796 + }, + { + "epoch": 0.34240425041439343, + "grad_norm": 2.03125, + "learning_rate": 3.70156410859343e-05, + "loss": 0.4391, + "step": 7798 + }, + { + "epoch": 0.34249206889359696, + "grad_norm": 2.296875, + "learning_rate": 3.700957380389272e-05, + "loss": 0.4103, + "step": 7800 + }, + { + "epoch": 0.34257988737280043, + "grad_norm": 2.28125, + "learning_rate": 3.7003505602155085e-05, + "loss": 0.4221, + "step": 7802 + }, + { + "epoch": 0.3426677058520039, + "grad_norm": 2.0, + "learning_rate": 3.6997436481186134e-05, + "loss": 0.4157, + "step": 7804 + }, + { + "epoch": 0.3427555243312074, + "grad_norm": 2.21875, + "learning_rate": 3.699136644145061e-05, + "loss": 0.4063, + "step": 7806 + }, + { + "epoch": 0.3428433428104109, + "grad_norm": 2.125, + "learning_rate": 3.698529548341337e-05, + "loss": 0.4183, + "step": 7808 + }, + { + "epoch": 0.3429311612896144, + "grad_norm": 1.875, + "learning_rate": 3.697922360753933e-05, + "loss": 0.4057, + "step": 7810 + }, + { + "epoch": 0.34301897976881784, + "grad_norm": 1.828125, + "learning_rate": 3.697315081429348e-05, + "loss": 0.3984, + "step": 7812 + }, + { + "epoch": 0.3431067982480213, + "grad_norm": 1.9609375, + "learning_rate": 3.696707710414086e-05, + "loss": 0.4262, + "step": 7814 + }, + { + "epoch": 0.34319461672722484, + "grad_norm": 2.0625, + "learning_rate": 3.696100247754661e-05, + "loss": 0.4487, + "step": 7816 + }, + { + "epoch": 0.3432824352064283, + "grad_norm": 2.25, + "learning_rate": 3.6954926934975925e-05, + "loss": 0.4345, + "step": 7818 + }, + { + "epoch": 0.3433702536856318, + "grad_norm": 2.3125, + "learning_rate": 3.694885047689407e-05, + "loss": 0.3907, + "step": 7820 + }, + { + "epoch": 0.34345807216483526, + "grad_norm": 2.1875, + "learning_rate": 3.694277310376639e-05, + "loss": 0.4209, + "step": 7822 + }, + { + "epoch": 0.3435458906440388, + "grad_norm": 2.390625, + "learning_rate": 3.693669481605827e-05, + "loss": 0.4242, + "step": 7824 + }, + { + "epoch": 0.34363370912324226, + "grad_norm": 2.640625, + "learning_rate": 3.69306156142352e-05, + "loss": 0.3965, + "step": 7826 + }, + { + "epoch": 0.34372152760244573, + "grad_norm": 2.671875, + "learning_rate": 3.692453549876273e-05, + "loss": 0.4348, + "step": 7828 + }, + { + "epoch": 0.34380934608164926, + "grad_norm": 2.375, + "learning_rate": 3.691845447010647e-05, + "loss": 0.3989, + "step": 7830 + }, + { + "epoch": 0.34389716456085273, + "grad_norm": 2.40625, + "learning_rate": 3.691237252873211e-05, + "loss": 0.3943, + "step": 7832 + }, + { + "epoch": 0.3439849830400562, + "grad_norm": 2.234375, + "learning_rate": 3.690628967510541e-05, + "loss": 0.4105, + "step": 7834 + }, + { + "epoch": 0.3440728015192597, + "grad_norm": 2.46875, + "learning_rate": 3.69002059096922e-05, + "loss": 0.4227, + "step": 7836 + }, + { + "epoch": 0.3441606199984632, + "grad_norm": 2.265625, + "learning_rate": 3.6894121232958354e-05, + "loss": 0.4361, + "step": 7838 + }, + { + "epoch": 0.3442484384776667, + "grad_norm": 2.15625, + "learning_rate": 3.688803564536986e-05, + "loss": 0.4565, + "step": 7840 + }, + { + "epoch": 0.34433625695687015, + "grad_norm": 2.09375, + "learning_rate": 3.688194914739274e-05, + "loss": 0.4395, + "step": 7842 + }, + { + "epoch": 0.3444240754360736, + "grad_norm": 2.484375, + "learning_rate": 3.687586173949311e-05, + "loss": 0.4276, + "step": 7844 + }, + { + "epoch": 0.34451189391527715, + "grad_norm": 2.3125, + "learning_rate": 3.686977342213714e-05, + "loss": 0.4263, + "step": 7846 + }, + { + "epoch": 0.3445997123944806, + "grad_norm": 2.765625, + "learning_rate": 3.686368419579108e-05, + "loss": 0.3967, + "step": 7848 + }, + { + "epoch": 0.3446875308736841, + "grad_norm": 2.328125, + "learning_rate": 3.685759406092124e-05, + "loss": 0.4122, + "step": 7850 + }, + { + "epoch": 0.34477534935288756, + "grad_norm": 1.8984375, + "learning_rate": 3.6851503017994e-05, + "loss": 0.4182, + "step": 7852 + }, + { + "epoch": 0.3448631678320911, + "grad_norm": 2.0, + "learning_rate": 3.6845411067475825e-05, + "loss": 0.4255, + "step": 7854 + }, + { + "epoch": 0.34495098631129456, + "grad_norm": 2.140625, + "learning_rate": 3.683931820983322e-05, + "loss": 0.3841, + "step": 7856 + }, + { + "epoch": 0.34503880479049803, + "grad_norm": 2.09375, + "learning_rate": 3.68332244455328e-05, + "loss": 0.4518, + "step": 7858 + }, + { + "epoch": 0.3451266232697015, + "grad_norm": 2.625, + "learning_rate": 3.6827129775041216e-05, + "loss": 0.3772, + "step": 7860 + }, + { + "epoch": 0.34521444174890503, + "grad_norm": 2.0625, + "learning_rate": 3.682103419882519e-05, + "loss": 0.4353, + "step": 7862 + }, + { + "epoch": 0.3453022602281085, + "grad_norm": 2.078125, + "learning_rate": 3.6814937717351525e-05, + "loss": 0.3897, + "step": 7864 + }, + { + "epoch": 0.345390078707312, + "grad_norm": 2.296875, + "learning_rate": 3.6808840331087115e-05, + "loss": 0.4197, + "step": 7866 + }, + { + "epoch": 0.34547789718651545, + "grad_norm": 1.953125, + "learning_rate": 3.6802742040498875e-05, + "loss": 0.4177, + "step": 7868 + }, + { + "epoch": 0.345565715665719, + "grad_norm": 2.203125, + "learning_rate": 3.679664284605381e-05, + "loss": 0.4177, + "step": 7870 + }, + { + "epoch": 0.34565353414492245, + "grad_norm": 2.390625, + "learning_rate": 3.679054274821903e-05, + "loss": 0.4031, + "step": 7872 + }, + { + "epoch": 0.3457413526241259, + "grad_norm": 2.6875, + "learning_rate": 3.678444174746164e-05, + "loss": 0.4048, + "step": 7874 + }, + { + "epoch": 0.3458291711033294, + "grad_norm": 2.265625, + "learning_rate": 3.677833984424888e-05, + "loss": 0.4164, + "step": 7876 + }, + { + "epoch": 0.3459169895825329, + "grad_norm": 2.140625, + "learning_rate": 3.677223703904803e-05, + "loss": 0.4107, + "step": 7878 + }, + { + "epoch": 0.3460048080617364, + "grad_norm": 2.65625, + "learning_rate": 3.6766133332326455e-05, + "loss": 0.4107, + "step": 7880 + }, + { + "epoch": 0.34609262654093986, + "grad_norm": 2.375, + "learning_rate": 3.676002872455157e-05, + "loss": 0.4097, + "step": 7882 + }, + { + "epoch": 0.3461804450201434, + "grad_norm": 2.421875, + "learning_rate": 3.675392321619086e-05, + "loss": 0.4145, + "step": 7884 + }, + { + "epoch": 0.34626826349934686, + "grad_norm": 1.921875, + "learning_rate": 3.674781680771189e-05, + "loss": 0.4026, + "step": 7886 + }, + { + "epoch": 0.34635608197855033, + "grad_norm": 1.921875, + "learning_rate": 3.674170949958229e-05, + "loss": 0.4055, + "step": 7888 + }, + { + "epoch": 0.3464439004577538, + "grad_norm": 1.8125, + "learning_rate": 3.673560129226976e-05, + "loss": 0.4262, + "step": 7890 + }, + { + "epoch": 0.34653171893695733, + "grad_norm": 2.359375, + "learning_rate": 3.6729492186242073e-05, + "loss": 0.4097, + "step": 7892 + }, + { + "epoch": 0.3466195374161608, + "grad_norm": 2.140625, + "learning_rate": 3.672338218196708e-05, + "loss": 0.4043, + "step": 7894 + }, + { + "epoch": 0.3467073558953643, + "grad_norm": 2.1875, + "learning_rate": 3.6717271279912645e-05, + "loss": 0.4346, + "step": 7896 + }, + { + "epoch": 0.34679517437456775, + "grad_norm": 2.296875, + "learning_rate": 3.6711159480546785e-05, + "loss": 0.4076, + "step": 7898 + }, + { + "epoch": 0.3468829928537713, + "grad_norm": 2.515625, + "learning_rate": 3.6705046784337514e-05, + "loss": 0.4282, + "step": 7900 + }, + { + "epoch": 0.34697081133297475, + "grad_norm": 2.734375, + "learning_rate": 3.669893319175296e-05, + "loss": 0.4104, + "step": 7902 + }, + { + "epoch": 0.3470586298121782, + "grad_norm": 2.5, + "learning_rate": 3.6692818703261286e-05, + "loss": 0.4188, + "step": 7904 + }, + { + "epoch": 0.3471464482913817, + "grad_norm": 1.8671875, + "learning_rate": 3.668670331933076e-05, + "loss": 0.4156, + "step": 7906 + }, + { + "epoch": 0.3472342667705852, + "grad_norm": 2.078125, + "learning_rate": 3.6680587040429696e-05, + "loss": 0.4276, + "step": 7908 + }, + { + "epoch": 0.3473220852497887, + "grad_norm": 1.796875, + "learning_rate": 3.667446986702647e-05, + "loss": 0.4195, + "step": 7910 + }, + { + "epoch": 0.34740990372899216, + "grad_norm": 2.0, + "learning_rate": 3.6668351799589557e-05, + "loss": 0.4033, + "step": 7912 + }, + { + "epoch": 0.34749772220819564, + "grad_norm": 2.1875, + "learning_rate": 3.666223283858745e-05, + "loss": 0.4044, + "step": 7914 + }, + { + "epoch": 0.34758554068739916, + "grad_norm": 1.9921875, + "learning_rate": 3.6656112984488765e-05, + "loss": 0.4108, + "step": 7916 + }, + { + "epoch": 0.34767335916660264, + "grad_norm": 2.28125, + "learning_rate": 3.664999223776215e-05, + "loss": 0.4093, + "step": 7918 + }, + { + "epoch": 0.3477611776458061, + "grad_norm": 2.109375, + "learning_rate": 3.664387059887634e-05, + "loss": 0.3886, + "step": 7920 + }, + { + "epoch": 0.3478489961250096, + "grad_norm": 2.0625, + "learning_rate": 3.6637748068300123e-05, + "loss": 0.4283, + "step": 7922 + }, + { + "epoch": 0.3479368146042131, + "grad_norm": 2.015625, + "learning_rate": 3.663162464650237e-05, + "loss": 0.398, + "step": 7924 + }, + { + "epoch": 0.3480246330834166, + "grad_norm": 1.7578125, + "learning_rate": 3.662550033395202e-05, + "loss": 0.4176, + "step": 7926 + }, + { + "epoch": 0.34811245156262005, + "grad_norm": 2.0, + "learning_rate": 3.661937513111806e-05, + "loss": 0.3965, + "step": 7928 + }, + { + "epoch": 0.3482002700418235, + "grad_norm": 2.34375, + "learning_rate": 3.661324903846957e-05, + "loss": 0.4322, + "step": 7930 + }, + { + "epoch": 0.34828808852102705, + "grad_norm": 2.1875, + "learning_rate": 3.660712205647568e-05, + "loss": 0.3999, + "step": 7932 + }, + { + "epoch": 0.3483759070002305, + "grad_norm": 2.234375, + "learning_rate": 3.6600994185605614e-05, + "loss": 0.4251, + "step": 7934 + }, + { + "epoch": 0.348463725479434, + "grad_norm": 2.4375, + "learning_rate": 3.6594865426328625e-05, + "loss": 0.4191, + "step": 7936 + }, + { + "epoch": 0.3485515439586375, + "grad_norm": 2.484375, + "learning_rate": 3.658873577911406e-05, + "loss": 0.3965, + "step": 7938 + }, + { + "epoch": 0.348639362437841, + "grad_norm": 1.9140625, + "learning_rate": 3.658260524443133e-05, + "loss": 0.4126, + "step": 7940 + }, + { + "epoch": 0.34872718091704447, + "grad_norm": 2.046875, + "learning_rate": 3.657647382274992e-05, + "loss": 0.4249, + "step": 7942 + }, + { + "epoch": 0.34881499939624794, + "grad_norm": 2.203125, + "learning_rate": 3.657034151453936e-05, + "loss": 0.39, + "step": 7944 + }, + { + "epoch": 0.34890281787545147, + "grad_norm": 2.1875, + "learning_rate": 3.656420832026928e-05, + "loss": 0.4209, + "step": 7946 + }, + { + "epoch": 0.34899063635465494, + "grad_norm": 1.890625, + "learning_rate": 3.655807424040936e-05, + "loss": 0.4177, + "step": 7948 + }, + { + "epoch": 0.3490784548338584, + "grad_norm": 1.921875, + "learning_rate": 3.655193927542933e-05, + "loss": 0.4071, + "step": 7950 + }, + { + "epoch": 0.3491662733130619, + "grad_norm": 1.921875, + "learning_rate": 3.654580342579903e-05, + "loss": 0.4208, + "step": 7952 + }, + { + "epoch": 0.3492540917922654, + "grad_norm": 2.015625, + "learning_rate": 3.6539666691988336e-05, + "loss": 0.3849, + "step": 7954 + }, + { + "epoch": 0.3493419102714689, + "grad_norm": 1.8828125, + "learning_rate": 3.65335290744672e-05, + "loss": 0.4318, + "step": 7956 + }, + { + "epoch": 0.34942972875067235, + "grad_norm": 1.890625, + "learning_rate": 3.6527390573705645e-05, + "loss": 0.3816, + "step": 7958 + }, + { + "epoch": 0.3495175472298758, + "grad_norm": 2.015625, + "learning_rate": 3.652125119017375e-05, + "loss": 0.4051, + "step": 7960 + }, + { + "epoch": 0.34960536570907935, + "grad_norm": 2.109375, + "learning_rate": 3.651511092434168e-05, + "loss": 0.386, + "step": 7962 + }, + { + "epoch": 0.3496931841882828, + "grad_norm": 1.84375, + "learning_rate": 3.650896977667965e-05, + "loss": 0.438, + "step": 7964 + }, + { + "epoch": 0.3497810026674863, + "grad_norm": 2.1875, + "learning_rate": 3.6502827747657964e-05, + "loss": 0.3965, + "step": 7966 + }, + { + "epoch": 0.34986882114668977, + "grad_norm": 1.9375, + "learning_rate": 3.649668483774696e-05, + "loss": 0.419, + "step": 7968 + }, + { + "epoch": 0.3499566396258933, + "grad_norm": 1.9140625, + "learning_rate": 3.649054104741709e-05, + "loss": 0.3912, + "step": 7970 + }, + { + "epoch": 0.35004445810509677, + "grad_norm": 2.109375, + "learning_rate": 3.648439637713883e-05, + "loss": 0.4051, + "step": 7972 + }, + { + "epoch": 0.35013227658430024, + "grad_norm": 1.9453125, + "learning_rate": 3.6478250827382734e-05, + "loss": 0.4089, + "step": 7974 + }, + { + "epoch": 0.3502200950635037, + "grad_norm": 1.96875, + "learning_rate": 3.647210439861944e-05, + "loss": 0.4123, + "step": 7976 + }, + { + "epoch": 0.35030791354270724, + "grad_norm": 2.125, + "learning_rate": 3.646595709131965e-05, + "loss": 0.4088, + "step": 7978 + }, + { + "epoch": 0.3503957320219107, + "grad_norm": 1.8671875, + "learning_rate": 3.6459808905954105e-05, + "loss": 0.396, + "step": 7980 + }, + { + "epoch": 0.3504835505011142, + "grad_norm": 1.8203125, + "learning_rate": 3.645365984299366e-05, + "loss": 0.4095, + "step": 7982 + }, + { + "epoch": 0.3505713689803177, + "grad_norm": 2.015625, + "learning_rate": 3.644750990290919e-05, + "loss": 0.4179, + "step": 7984 + }, + { + "epoch": 0.3506591874595212, + "grad_norm": 2.171875, + "learning_rate": 3.6441359086171665e-05, + "loss": 0.4492, + "step": 7986 + }, + { + "epoch": 0.35074700593872465, + "grad_norm": 2.015625, + "learning_rate": 3.643520739325213e-05, + "loss": 0.4249, + "step": 7988 + }, + { + "epoch": 0.3508348244179281, + "grad_norm": 2.3125, + "learning_rate": 3.6429054824621656e-05, + "loss": 0.4198, + "step": 7990 + }, + { + "epoch": 0.35092264289713165, + "grad_norm": 1.9296875, + "learning_rate": 3.6422901380751436e-05, + "loss": 0.4131, + "step": 7992 + }, + { + "epoch": 0.3510104613763351, + "grad_norm": 2.21875, + "learning_rate": 3.641674706211269e-05, + "loss": 0.4094, + "step": 7994 + }, + { + "epoch": 0.3510982798555386, + "grad_norm": 2.1875, + "learning_rate": 3.641059186917671e-05, + "loss": 0.418, + "step": 7996 + }, + { + "epoch": 0.35118609833474207, + "grad_norm": 2.34375, + "learning_rate": 3.6404435802414866e-05, + "loss": 0.4279, + "step": 7998 + }, + { + "epoch": 0.3512739168139456, + "grad_norm": 1.8984375, + "learning_rate": 3.63982788622986e-05, + "loss": 0.4254, + "step": 8000 + }, + { + "epoch": 0.35136173529314907, + "grad_norm": 2.125, + "learning_rate": 3.63921210492994e-05, + "loss": 0.3997, + "step": 8002 + }, + { + "epoch": 0.35144955377235254, + "grad_norm": 2.1875, + "learning_rate": 3.638596236388886e-05, + "loss": 0.4476, + "step": 8004 + }, + { + "epoch": 0.351537372251556, + "grad_norm": 2.09375, + "learning_rate": 3.637980280653858e-05, + "loss": 0.4054, + "step": 8006 + }, + { + "epoch": 0.35162519073075954, + "grad_norm": 1.984375, + "learning_rate": 3.637364237772027e-05, + "loss": 0.4005, + "step": 8008 + }, + { + "epoch": 0.351713009209963, + "grad_norm": 2.140625, + "learning_rate": 3.63674810779057e-05, + "loss": 0.4214, + "step": 8010 + }, + { + "epoch": 0.3518008276891665, + "grad_norm": 2.015625, + "learning_rate": 3.63613189075667e-05, + "loss": 0.4369, + "step": 8012 + }, + { + "epoch": 0.35188864616836996, + "grad_norm": 2.21875, + "learning_rate": 3.6355155867175185e-05, + "loss": 0.4079, + "step": 8014 + }, + { + "epoch": 0.3519764646475735, + "grad_norm": 2.328125, + "learning_rate": 3.63489919572031e-05, + "loss": 0.3932, + "step": 8016 + }, + { + "epoch": 0.35206428312677696, + "grad_norm": 2.6875, + "learning_rate": 3.6342827178122505e-05, + "loss": 0.4338, + "step": 8018 + }, + { + "epoch": 0.3521521016059804, + "grad_norm": 2.234375, + "learning_rate": 3.6336661530405486e-05, + "loss": 0.4306, + "step": 8020 + }, + { + "epoch": 0.3522399200851839, + "grad_norm": 2.125, + "learning_rate": 3.63304950145242e-05, + "loss": 0.3814, + "step": 8022 + }, + { + "epoch": 0.3523277385643874, + "grad_norm": 2.375, + "learning_rate": 3.6324327630950886e-05, + "loss": 0.4214, + "step": 8024 + }, + { + "epoch": 0.3524155570435909, + "grad_norm": 1.9921875, + "learning_rate": 3.6318159380157855e-05, + "loss": 0.4044, + "step": 8026 + }, + { + "epoch": 0.35250337552279437, + "grad_norm": 1.953125, + "learning_rate": 3.631199026261746e-05, + "loss": 0.402, + "step": 8028 + }, + { + "epoch": 0.35259119400199784, + "grad_norm": 1.78125, + "learning_rate": 3.6305820278802143e-05, + "loss": 0.4106, + "step": 8030 + }, + { + "epoch": 0.35267901248120137, + "grad_norm": 1.90625, + "learning_rate": 3.62996494291844e-05, + "loss": 0.3897, + "step": 8032 + }, + { + "epoch": 0.35276683096040484, + "grad_norm": 1.9765625, + "learning_rate": 3.62934777142368e-05, + "loss": 0.4002, + "step": 8034 + }, + { + "epoch": 0.3528546494396083, + "grad_norm": 1.9609375, + "learning_rate": 3.6287305134431956e-05, + "loss": 0.4293, + "step": 8036 + }, + { + "epoch": 0.35294246791881184, + "grad_norm": 2.078125, + "learning_rate": 3.6281131690242594e-05, + "loss": 0.4017, + "step": 8038 + }, + { + "epoch": 0.3530302863980153, + "grad_norm": 1.9609375, + "learning_rate": 3.627495738214145e-05, + "loss": 0.3893, + "step": 8040 + }, + { + "epoch": 0.3531181048772188, + "grad_norm": 1.9453125, + "learning_rate": 3.626878221060137e-05, + "loss": 0.4135, + "step": 8042 + }, + { + "epoch": 0.35320592335642226, + "grad_norm": 2.296875, + "learning_rate": 3.626260617609525e-05, + "loss": 0.4264, + "step": 8044 + }, + { + "epoch": 0.3532937418356258, + "grad_norm": 1.96875, + "learning_rate": 3.625642927909605e-05, + "loss": 0.4158, + "step": 8046 + }, + { + "epoch": 0.35338156031482926, + "grad_norm": 2.15625, + "learning_rate": 3.625025152007679e-05, + "loss": 0.4414, + "step": 8048 + }, + { + "epoch": 0.35346937879403273, + "grad_norm": 2.09375, + "learning_rate": 3.6244072899510586e-05, + "loss": 0.4108, + "step": 8050 + }, + { + "epoch": 0.3535571972732362, + "grad_norm": 1.8671875, + "learning_rate": 3.6237893417870574e-05, + "loss": 0.4202, + "step": 8052 + }, + { + "epoch": 0.35364501575243973, + "grad_norm": 2.171875, + "learning_rate": 3.623171307562999e-05, + "loss": 0.4186, + "step": 8054 + }, + { + "epoch": 0.3537328342316432, + "grad_norm": 2.046875, + "learning_rate": 3.622553187326213e-05, + "loss": 0.3793, + "step": 8056 + }, + { + "epoch": 0.3538206527108467, + "grad_norm": 2.078125, + "learning_rate": 3.6219349811240345e-05, + "loss": 0.4024, + "step": 8058 + }, + { + "epoch": 0.35390847119005014, + "grad_norm": 1.96875, + "learning_rate": 3.621316689003806e-05, + "loss": 0.4224, + "step": 8060 + }, + { + "epoch": 0.35399628966925367, + "grad_norm": 2.109375, + "learning_rate": 3.6206983110128765e-05, + "loss": 0.4361, + "step": 8062 + }, + { + "epoch": 0.35408410814845714, + "grad_norm": 2.1875, + "learning_rate": 3.620079847198602e-05, + "loss": 0.4311, + "step": 8064 + }, + { + "epoch": 0.3541719266276606, + "grad_norm": 2.09375, + "learning_rate": 3.619461297608345e-05, + "loss": 0.3927, + "step": 8066 + }, + { + "epoch": 0.3542597451068641, + "grad_norm": 2.03125, + "learning_rate": 3.6188426622894726e-05, + "loss": 0.429, + "step": 8068 + }, + { + "epoch": 0.3543475635860676, + "grad_norm": 2.140625, + "learning_rate": 3.618223941289362e-05, + "loss": 0.3921, + "step": 8070 + }, + { + "epoch": 0.3544353820652711, + "grad_norm": 2.046875, + "learning_rate": 3.617605134655393e-05, + "loss": 0.4144, + "step": 8072 + }, + { + "epoch": 0.35452320054447456, + "grad_norm": 2.5625, + "learning_rate": 3.6169862424349544e-05, + "loss": 0.3993, + "step": 8074 + }, + { + "epoch": 0.35461101902367803, + "grad_norm": 2.34375, + "learning_rate": 3.6163672646754423e-05, + "loss": 0.3845, + "step": 8076 + }, + { + "epoch": 0.35469883750288156, + "grad_norm": 2.15625, + "learning_rate": 3.615748201424257e-05, + "loss": 0.4187, + "step": 8078 + }, + { + "epoch": 0.35478665598208503, + "grad_norm": 2.421875, + "learning_rate": 3.615129052728808e-05, + "loss": 0.3998, + "step": 8080 + }, + { + "epoch": 0.3548744744612885, + "grad_norm": 2.1875, + "learning_rate": 3.6145098186365085e-05, + "loss": 0.437, + "step": 8082 + }, + { + "epoch": 0.35496229294049203, + "grad_norm": 2.53125, + "learning_rate": 3.6138904991947794e-05, + "loss": 0.4331, + "step": 8084 + }, + { + "epoch": 0.3550501114196955, + "grad_norm": 1.9765625, + "learning_rate": 3.613271094451049e-05, + "loss": 0.392, + "step": 8086 + }, + { + "epoch": 0.355137929898899, + "grad_norm": 2.046875, + "learning_rate": 3.612651604452752e-05, + "loss": 0.407, + "step": 8088 + }, + { + "epoch": 0.35522574837810245, + "grad_norm": 2.140625, + "learning_rate": 3.612032029247326e-05, + "loss": 0.4184, + "step": 8090 + }, + { + "epoch": 0.355313566857306, + "grad_norm": 2.078125, + "learning_rate": 3.611412368882223e-05, + "loss": 0.4275, + "step": 8092 + }, + { + "epoch": 0.35540138533650945, + "grad_norm": 1.9921875, + "learning_rate": 3.610792623404894e-05, + "loss": 0.3976, + "step": 8094 + }, + { + "epoch": 0.3554892038157129, + "grad_norm": 2.25, + "learning_rate": 3.6101727928628e-05, + "loss": 0.3745, + "step": 8096 + }, + { + "epoch": 0.3555770222949164, + "grad_norm": 2.15625, + "learning_rate": 3.6095528773034065e-05, + "loss": 0.4196, + "step": 8098 + }, + { + "epoch": 0.3556648407741199, + "grad_norm": 2.25, + "learning_rate": 3.608932876774188e-05, + "loss": 0.3838, + "step": 8100 + }, + { + "epoch": 0.3557526592533234, + "grad_norm": 2.359375, + "learning_rate": 3.6083127913226235e-05, + "loss": 0.4328, + "step": 8102 + }, + { + "epoch": 0.35584047773252686, + "grad_norm": 2.6875, + "learning_rate": 3.6076926209962e-05, + "loss": 0.41, + "step": 8104 + }, + { + "epoch": 0.35592829621173033, + "grad_norm": 2.640625, + "learning_rate": 3.607072365842411e-05, + "loss": 0.3981, + "step": 8106 + }, + { + "epoch": 0.35601611469093386, + "grad_norm": 2.171875, + "learning_rate": 3.606452025908754e-05, + "loss": 0.4053, + "step": 8108 + }, + { + "epoch": 0.35610393317013733, + "grad_norm": 2.109375, + "learning_rate": 3.605831601242735e-05, + "loss": 0.4144, + "step": 8110 + }, + { + "epoch": 0.3561917516493408, + "grad_norm": 1.8515625, + "learning_rate": 3.605211091891868e-05, + "loss": 0.3831, + "step": 8112 + }, + { + "epoch": 0.3562795701285443, + "grad_norm": 2.109375, + "learning_rate": 3.604590497903671e-05, + "loss": 0.404, + "step": 8114 + }, + { + "epoch": 0.3563673886077478, + "grad_norm": 2.6875, + "learning_rate": 3.603969819325668e-05, + "loss": 0.4024, + "step": 8116 + }, + { + "epoch": 0.3564552070869513, + "grad_norm": 2.375, + "learning_rate": 3.6033490562053915e-05, + "loss": 0.4221, + "step": 8118 + }, + { + "epoch": 0.35654302556615475, + "grad_norm": 2.09375, + "learning_rate": 3.6027282085903795e-05, + "loss": 0.3815, + "step": 8120 + }, + { + "epoch": 0.3566308440453582, + "grad_norm": 2.265625, + "learning_rate": 3.6021072765281776e-05, + "loss": 0.3893, + "step": 8122 + }, + { + "epoch": 0.35671866252456175, + "grad_norm": 2.421875, + "learning_rate": 3.6014862600663354e-05, + "loss": 0.4161, + "step": 8124 + }, + { + "epoch": 0.3568064810037652, + "grad_norm": 2.3125, + "learning_rate": 3.600865159252413e-05, + "loss": 0.4167, + "step": 8126 + }, + { + "epoch": 0.3568942994829687, + "grad_norm": 2.09375, + "learning_rate": 3.6002439741339715e-05, + "loss": 0.4056, + "step": 8128 + }, + { + "epoch": 0.35698211796217216, + "grad_norm": 1.90625, + "learning_rate": 3.5996227047585837e-05, + "loss": 0.416, + "step": 8130 + }, + { + "epoch": 0.3570699364413757, + "grad_norm": 2.515625, + "learning_rate": 3.599001351173825e-05, + "loss": 0.386, + "step": 8132 + }, + { + "epoch": 0.35715775492057916, + "grad_norm": 2.25, + "learning_rate": 3.598379913427279e-05, + "loss": 0.4333, + "step": 8134 + }, + { + "epoch": 0.35724557339978263, + "grad_norm": 1.9921875, + "learning_rate": 3.597758391566536e-05, + "loss": 0.396, + "step": 8136 + }, + { + "epoch": 0.35733339187898616, + "grad_norm": 2.140625, + "learning_rate": 3.5971367856391925e-05, + "loss": 0.3873, + "step": 8138 + }, + { + "epoch": 0.35742121035818963, + "grad_norm": 2.25, + "learning_rate": 3.596515095692851e-05, + "loss": 0.4016, + "step": 8140 + }, + { + "epoch": 0.3575090288373931, + "grad_norm": 2.28125, + "learning_rate": 3.5958933217751214e-05, + "loss": 0.3966, + "step": 8142 + }, + { + "epoch": 0.3575968473165966, + "grad_norm": 2.46875, + "learning_rate": 3.595271463933617e-05, + "loss": 0.3895, + "step": 8144 + }, + { + "epoch": 0.3576846657958001, + "grad_norm": 2.28125, + "learning_rate": 3.5946495222159624e-05, + "loss": 0.4021, + "step": 8146 + }, + { + "epoch": 0.3577724842750036, + "grad_norm": 2.09375, + "learning_rate": 3.5940274966697846e-05, + "loss": 0.4019, + "step": 8148 + }, + { + "epoch": 0.35786030275420705, + "grad_norm": 2.21875, + "learning_rate": 3.593405387342719e-05, + "loss": 0.4004, + "step": 8150 + }, + { + "epoch": 0.3579481212334105, + "grad_norm": 1.84375, + "learning_rate": 3.5927831942824056e-05, + "loss": 0.3691, + "step": 8152 + }, + { + "epoch": 0.35803593971261405, + "grad_norm": 2.0, + "learning_rate": 3.592160917536495e-05, + "loss": 0.386, + "step": 8154 + }, + { + "epoch": 0.3581237581918175, + "grad_norm": 1.8828125, + "learning_rate": 3.5915385571526385e-05, + "loss": 0.394, + "step": 8156 + }, + { + "epoch": 0.358211576671021, + "grad_norm": 2.03125, + "learning_rate": 3.590916113178498e-05, + "loss": 0.398, + "step": 8158 + }, + { + "epoch": 0.35829939515022446, + "grad_norm": 1.9609375, + "learning_rate": 3.5902935856617403e-05, + "loss": 0.411, + "step": 8160 + }, + { + "epoch": 0.358387213629428, + "grad_norm": 2.203125, + "learning_rate": 3.589670974650038e-05, + "loss": 0.4071, + "step": 8162 + }, + { + "epoch": 0.35847503210863146, + "grad_norm": 2.390625, + "learning_rate": 3.5890482801910705e-05, + "loss": 0.4023, + "step": 8164 + }, + { + "epoch": 0.35856285058783494, + "grad_norm": 2.15625, + "learning_rate": 3.5884255023325256e-05, + "loss": 0.3921, + "step": 8166 + }, + { + "epoch": 0.3586506690670384, + "grad_norm": 2.203125, + "learning_rate": 3.587802641122095e-05, + "loss": 0.449, + "step": 8168 + }, + { + "epoch": 0.35873848754624194, + "grad_norm": 2.390625, + "learning_rate": 3.5871796966074775e-05, + "loss": 0.398, + "step": 8170 + }, + { + "epoch": 0.3588263060254454, + "grad_norm": 2.25, + "learning_rate": 3.586556668836378e-05, + "loss": 0.4156, + "step": 8172 + }, + { + "epoch": 0.3589141245046489, + "grad_norm": 2.0625, + "learning_rate": 3.585933557856508e-05, + "loss": 0.4224, + "step": 8174 + }, + { + "epoch": 0.35900194298385235, + "grad_norm": 2.296875, + "learning_rate": 3.5853103637155854e-05, + "loss": 0.4249, + "step": 8176 + }, + { + "epoch": 0.3590897614630559, + "grad_norm": 1.8046875, + "learning_rate": 3.5846870864613355e-05, + "loss": 0.4294, + "step": 8178 + }, + { + "epoch": 0.35917757994225935, + "grad_norm": 2.203125, + "learning_rate": 3.584063726141489e-05, + "loss": 0.4183, + "step": 8180 + }, + { + "epoch": 0.3592653984214628, + "grad_norm": 1.9140625, + "learning_rate": 3.5834402828037816e-05, + "loss": 0.4175, + "step": 8182 + }, + { + "epoch": 0.35935321690066635, + "grad_norm": 1.8984375, + "learning_rate": 3.582816756495958e-05, + "loss": 0.4311, + "step": 8184 + }, + { + "epoch": 0.3594410353798698, + "grad_norm": 2.0625, + "learning_rate": 3.5821931472657674e-05, + "loss": 0.38, + "step": 8186 + }, + { + "epoch": 0.3595288538590733, + "grad_norm": 2.375, + "learning_rate": 3.581569455160967e-05, + "loss": 0.4245, + "step": 8188 + }, + { + "epoch": 0.35961667233827677, + "grad_norm": 1.984375, + "learning_rate": 3.580945680229317e-05, + "loss": 0.3865, + "step": 8190 + }, + { + "epoch": 0.3597044908174803, + "grad_norm": 1.8125, + "learning_rate": 3.580321822518588e-05, + "loss": 0.4175, + "step": 8192 + }, + { + "epoch": 0.35979230929668377, + "grad_norm": 1.8671875, + "learning_rate": 3.579697882076557e-05, + "loss": 0.3859, + "step": 8194 + }, + { + "epoch": 0.35988012777588724, + "grad_norm": 2.25, + "learning_rate": 3.5790738589510015e-05, + "loss": 0.3937, + "step": 8196 + }, + { + "epoch": 0.3599679462550907, + "grad_norm": 2.21875, + "learning_rate": 3.578449753189711e-05, + "loss": 0.397, + "step": 8198 + }, + { + "epoch": 0.36005576473429424, + "grad_norm": 2.125, + "learning_rate": 3.5778255648404805e-05, + "loss": 0.409, + "step": 8200 + }, + { + "epoch": 0.3601435832134977, + "grad_norm": 2.140625, + "learning_rate": 3.57720129395111e-05, + "loss": 0.4007, + "step": 8202 + }, + { + "epoch": 0.3602314016927012, + "grad_norm": 2.03125, + "learning_rate": 3.576576940569406e-05, + "loss": 0.4047, + "step": 8204 + }, + { + "epoch": 0.36031922017190465, + "grad_norm": 1.8515625, + "learning_rate": 3.5759525047431816e-05, + "loss": 0.4141, + "step": 8206 + }, + { + "epoch": 0.3604070386511082, + "grad_norm": 1.9375, + "learning_rate": 3.575327986520257e-05, + "loss": 0.3848, + "step": 8208 + }, + { + "epoch": 0.36049485713031165, + "grad_norm": 2.0625, + "learning_rate": 3.574703385948457e-05, + "loss": 0.4296, + "step": 8210 + }, + { + "epoch": 0.3605826756095151, + "grad_norm": 2.03125, + "learning_rate": 3.574078703075613e-05, + "loss": 0.4021, + "step": 8212 + }, + { + "epoch": 0.3606704940887186, + "grad_norm": 2.046875, + "learning_rate": 3.573453937949566e-05, + "loss": 0.4013, + "step": 8214 + }, + { + "epoch": 0.3607583125679221, + "grad_norm": 2.015625, + "learning_rate": 3.572829090618159e-05, + "loss": 0.376, + "step": 8216 + }, + { + "epoch": 0.3608461310471256, + "grad_norm": 2.15625, + "learning_rate": 3.572204161129243e-05, + "loss": 0.4091, + "step": 8218 + }, + { + "epoch": 0.36093394952632907, + "grad_norm": 1.953125, + "learning_rate": 3.571579149530675e-05, + "loss": 0.4112, + "step": 8220 + }, + { + "epoch": 0.36102176800553254, + "grad_norm": 2.046875, + "learning_rate": 3.5709540558703186e-05, + "loss": 0.4014, + "step": 8222 + }, + { + "epoch": 0.36110958648473607, + "grad_norm": 1.921875, + "learning_rate": 3.570328880196044e-05, + "loss": 0.3932, + "step": 8224 + }, + { + "epoch": 0.36119740496393954, + "grad_norm": 2.125, + "learning_rate": 3.569703622555727e-05, + "loss": 0.4222, + "step": 8226 + }, + { + "epoch": 0.361285223443143, + "grad_norm": 2.359375, + "learning_rate": 3.569078282997251e-05, + "loss": 0.4291, + "step": 8228 + }, + { + "epoch": 0.3613730419223465, + "grad_norm": 1.9609375, + "learning_rate": 3.568452861568503e-05, + "loss": 0.3827, + "step": 8230 + }, + { + "epoch": 0.36146086040155, + "grad_norm": 1.9140625, + "learning_rate": 3.5678273583173795e-05, + "loss": 0.4059, + "step": 8232 + }, + { + "epoch": 0.3615486788807535, + "grad_norm": 2.125, + "learning_rate": 3.567201773291781e-05, + "loss": 0.4157, + "step": 8234 + }, + { + "epoch": 0.36163649735995695, + "grad_norm": 2.375, + "learning_rate": 3.5665761065396134e-05, + "loss": 0.3946, + "step": 8236 + }, + { + "epoch": 0.3617243158391605, + "grad_norm": 2.171875, + "learning_rate": 3.565950358108793e-05, + "loss": 0.3954, + "step": 8238 + }, + { + "epoch": 0.36181213431836395, + "grad_norm": 2.40625, + "learning_rate": 3.565324528047238e-05, + "loss": 0.4223, + "step": 8240 + }, + { + "epoch": 0.3618999527975674, + "grad_norm": 2.09375, + "learning_rate": 3.5646986164028765e-05, + "loss": 0.4139, + "step": 8242 + }, + { + "epoch": 0.3619877712767709, + "grad_norm": 2.515625, + "learning_rate": 3.564072623223639e-05, + "loss": 0.4005, + "step": 8244 + }, + { + "epoch": 0.3620755897559744, + "grad_norm": 2.328125, + "learning_rate": 3.5634465485574644e-05, + "loss": 0.3918, + "step": 8246 + }, + { + "epoch": 0.3621634082351779, + "grad_norm": 2.84375, + "learning_rate": 3.5628203924522984e-05, + "loss": 0.3848, + "step": 8248 + }, + { + "epoch": 0.36225122671438137, + "grad_norm": 2.78125, + "learning_rate": 3.562194154956093e-05, + "loss": 0.391, + "step": 8250 + }, + { + "epoch": 0.36233904519358484, + "grad_norm": 2.171875, + "learning_rate": 3.561567836116804e-05, + "loss": 0.4067, + "step": 8252 + }, + { + "epoch": 0.36242686367278837, + "grad_norm": 1.9921875, + "learning_rate": 3.5609414359823956e-05, + "loss": 0.4135, + "step": 8254 + }, + { + "epoch": 0.36251468215199184, + "grad_norm": 2.0, + "learning_rate": 3.5603149546008373e-05, + "loss": 0.4034, + "step": 8256 + }, + { + "epoch": 0.3626025006311953, + "grad_norm": 1.875, + "learning_rate": 3.5596883920201054e-05, + "loss": 0.4041, + "step": 8258 + }, + { + "epoch": 0.3626903191103988, + "grad_norm": 1.984375, + "learning_rate": 3.559061748288183e-05, + "loss": 0.4162, + "step": 8260 + }, + { + "epoch": 0.3627781375896023, + "grad_norm": 2.078125, + "learning_rate": 3.558435023453058e-05, + "loss": 0.3935, + "step": 8262 + }, + { + "epoch": 0.3628659560688058, + "grad_norm": 2.21875, + "learning_rate": 3.557808217562726e-05, + "loss": 0.4009, + "step": 8264 + }, + { + "epoch": 0.36295377454800926, + "grad_norm": 1.8828125, + "learning_rate": 3.557181330665186e-05, + "loss": 0.369, + "step": 8266 + }, + { + "epoch": 0.36304159302721273, + "grad_norm": 2.046875, + "learning_rate": 3.5565543628084466e-05, + "loss": 0.3915, + "step": 8268 + }, + { + "epoch": 0.36312941150641626, + "grad_norm": 1.9453125, + "learning_rate": 3.55592731404052e-05, + "loss": 0.3905, + "step": 8270 + }, + { + "epoch": 0.3632172299856197, + "grad_norm": 2.078125, + "learning_rate": 3.555300184409428e-05, + "loss": 0.4033, + "step": 8272 + }, + { + "epoch": 0.3633050484648232, + "grad_norm": 2.0625, + "learning_rate": 3.554672973963194e-05, + "loss": 0.3804, + "step": 8274 + }, + { + "epoch": 0.36339286694402667, + "grad_norm": 2.03125, + "learning_rate": 3.554045682749851e-05, + "loss": 0.388, + "step": 8276 + }, + { + "epoch": 0.3634806854232302, + "grad_norm": 2.40625, + "learning_rate": 3.553418310817437e-05, + "loss": 0.4108, + "step": 8278 + }, + { + "epoch": 0.36356850390243367, + "grad_norm": 1.90625, + "learning_rate": 3.5527908582139965e-05, + "loss": 0.3755, + "step": 8280 + }, + { + "epoch": 0.36365632238163714, + "grad_norm": 1.8984375, + "learning_rate": 3.5521633249875796e-05, + "loss": 0.3783, + "step": 8282 + }, + { + "epoch": 0.36374414086084067, + "grad_norm": 2.09375, + "learning_rate": 3.551535711186243e-05, + "loss": 0.405, + "step": 8284 + }, + { + "epoch": 0.36383195934004414, + "grad_norm": 2.171875, + "learning_rate": 3.550908016858049e-05, + "loss": 0.3899, + "step": 8286 + }, + { + "epoch": 0.3639197778192476, + "grad_norm": 2.234375, + "learning_rate": 3.550280242051067e-05, + "loss": 0.4381, + "step": 8288 + }, + { + "epoch": 0.3640075962984511, + "grad_norm": 2.140625, + "learning_rate": 3.5496523868133735e-05, + "loss": 0.3999, + "step": 8290 + }, + { + "epoch": 0.3640954147776546, + "grad_norm": 2.015625, + "learning_rate": 3.549024451193048e-05, + "loss": 0.4372, + "step": 8292 + }, + { + "epoch": 0.3641832332568581, + "grad_norm": 2.0625, + "learning_rate": 3.548396435238179e-05, + "loss": 0.3942, + "step": 8294 + }, + { + "epoch": 0.36427105173606156, + "grad_norm": 2.203125, + "learning_rate": 3.547768338996859e-05, + "loss": 0.3907, + "step": 8296 + }, + { + "epoch": 0.36435887021526503, + "grad_norm": 2.71875, + "learning_rate": 3.547140162517189e-05, + "loss": 0.3835, + "step": 8298 + }, + { + "epoch": 0.36444668869446856, + "grad_norm": 1.84375, + "learning_rate": 3.5465119058472736e-05, + "loss": 0.4294, + "step": 8300 + }, + { + "epoch": 0.36453450717367203, + "grad_norm": 2.015625, + "learning_rate": 3.545883569035226e-05, + "loss": 0.4001, + "step": 8302 + }, + { + "epoch": 0.3646223256528755, + "grad_norm": 1.890625, + "learning_rate": 3.545255152129164e-05, + "loss": 0.3965, + "step": 8304 + }, + { + "epoch": 0.364710144132079, + "grad_norm": 1.859375, + "learning_rate": 3.544626655177212e-05, + "loss": 0.411, + "step": 8306 + }, + { + "epoch": 0.3647979626112825, + "grad_norm": 1.9140625, + "learning_rate": 3.5439980782275e-05, + "loss": 0.4039, + "step": 8308 + }, + { + "epoch": 0.36488578109048597, + "grad_norm": 2.140625, + "learning_rate": 3.543369421328165e-05, + "loss": 0.4358, + "step": 8310 + }, + { + "epoch": 0.36497359956968944, + "grad_norm": 2.171875, + "learning_rate": 3.5427406845273506e-05, + "loss": 0.4205, + "step": 8312 + }, + { + "epoch": 0.3650614180488929, + "grad_norm": 1.9609375, + "learning_rate": 3.542111867873203e-05, + "loss": 0.391, + "step": 8314 + }, + { + "epoch": 0.36514923652809644, + "grad_norm": 1.8984375, + "learning_rate": 3.5414829714138795e-05, + "loss": 0.4033, + "step": 8316 + }, + { + "epoch": 0.3652370550072999, + "grad_norm": 2.046875, + "learning_rate": 3.540853995197541e-05, + "loss": 0.3773, + "step": 8318 + }, + { + "epoch": 0.3653248734865034, + "grad_norm": 2.0625, + "learning_rate": 3.540224939272353e-05, + "loss": 0.396, + "step": 8320 + }, + { + "epoch": 0.36541269196570686, + "grad_norm": 2.0625, + "learning_rate": 3.5395958036864896e-05, + "loss": 0.3724, + "step": 8322 + }, + { + "epoch": 0.3655005104449104, + "grad_norm": 2.046875, + "learning_rate": 3.538966588488131e-05, + "loss": 0.398, + "step": 8324 + }, + { + "epoch": 0.36558832892411386, + "grad_norm": 1.84375, + "learning_rate": 3.538337293725462e-05, + "loss": 0.3854, + "step": 8326 + }, + { + "epoch": 0.36567614740331733, + "grad_norm": 1.9609375, + "learning_rate": 3.5377079194466737e-05, + "loss": 0.4364, + "step": 8328 + }, + { + "epoch": 0.3657639658825208, + "grad_norm": 2.15625, + "learning_rate": 3.5370784656999655e-05, + "loss": 0.414, + "step": 8330 + }, + { + "epoch": 0.36585178436172433, + "grad_norm": 2.34375, + "learning_rate": 3.536448932533538e-05, + "loss": 0.3896, + "step": 8332 + }, + { + "epoch": 0.3659396028409278, + "grad_norm": 1.921875, + "learning_rate": 3.5358193199956036e-05, + "loss": 0.4054, + "step": 8334 + }, + { + "epoch": 0.3660274213201313, + "grad_norm": 2.34375, + "learning_rate": 3.5351896281343774e-05, + "loss": 0.3983, + "step": 8336 + }, + { + "epoch": 0.3661152397993348, + "grad_norm": 1.9453125, + "learning_rate": 3.5345598569980814e-05, + "loss": 0.4626, + "step": 8338 + }, + { + "epoch": 0.3662030582785383, + "grad_norm": 2.375, + "learning_rate": 3.5339300066349435e-05, + "loss": 0.4138, + "step": 8340 + }, + { + "epoch": 0.36629087675774175, + "grad_norm": 1.7890625, + "learning_rate": 3.5333000770931986e-05, + "loss": 0.4116, + "step": 8342 + }, + { + "epoch": 0.3663786952369452, + "grad_norm": 1.9765625, + "learning_rate": 3.532670068421085e-05, + "loss": 0.4138, + "step": 8344 + }, + { + "epoch": 0.36646651371614875, + "grad_norm": 2.15625, + "learning_rate": 3.5320399806668506e-05, + "loss": 0.4108, + "step": 8346 + }, + { + "epoch": 0.3665543321953522, + "grad_norm": 2.328125, + "learning_rate": 3.531409813878746e-05, + "loss": 0.3859, + "step": 8348 + }, + { + "epoch": 0.3666421506745557, + "grad_norm": 2.34375, + "learning_rate": 3.5307795681050316e-05, + "loss": 0.4094, + "step": 8350 + }, + { + "epoch": 0.36672996915375916, + "grad_norm": 2.09375, + "learning_rate": 3.530149243393971e-05, + "loss": 0.3775, + "step": 8352 + }, + { + "epoch": 0.3668177876329627, + "grad_norm": 2.0, + "learning_rate": 3.5295188397938336e-05, + "loss": 0.4058, + "step": 8354 + }, + { + "epoch": 0.36690560611216616, + "grad_norm": 2.078125, + "learning_rate": 3.528888357352898e-05, + "loss": 0.374, + "step": 8356 + }, + { + "epoch": 0.36699342459136963, + "grad_norm": 2.046875, + "learning_rate": 3.528257796119443e-05, + "loss": 0.3964, + "step": 8358 + }, + { + "epoch": 0.3670812430705731, + "grad_norm": 2.140625, + "learning_rate": 3.527627156141761e-05, + "loss": 0.4265, + "step": 8360 + }, + { + "epoch": 0.36716906154977663, + "grad_norm": 1.8671875, + "learning_rate": 3.5269964374681454e-05, + "loss": 0.3935, + "step": 8362 + }, + { + "epoch": 0.3672568800289801, + "grad_norm": 2.0, + "learning_rate": 3.526365640146896e-05, + "loss": 0.4064, + "step": 8364 + }, + { + "epoch": 0.3673446985081836, + "grad_norm": 2.03125, + "learning_rate": 3.5257347642263194e-05, + "loss": 0.4018, + "step": 8366 + }, + { + "epoch": 0.36743251698738705, + "grad_norm": 2.046875, + "learning_rate": 3.52510380975473e-05, + "loss": 0.405, + "step": 8368 + }, + { + "epoch": 0.3675203354665906, + "grad_norm": 2.171875, + "learning_rate": 3.524472776780443e-05, + "loss": 0.4101, + "step": 8370 + }, + { + "epoch": 0.36760815394579405, + "grad_norm": 2.0625, + "learning_rate": 3.523841665351787e-05, + "loss": 0.4013, + "step": 8372 + }, + { + "epoch": 0.3676959724249975, + "grad_norm": 1.90625, + "learning_rate": 3.5232104755170905e-05, + "loss": 0.4238, + "step": 8374 + }, + { + "epoch": 0.367783790904201, + "grad_norm": 2.765625, + "learning_rate": 3.522579207324689e-05, + "loss": 0.3948, + "step": 8376 + }, + { + "epoch": 0.3678716093834045, + "grad_norm": 1.9921875, + "learning_rate": 3.5219478608229284e-05, + "loss": 0.4009, + "step": 8378 + }, + { + "epoch": 0.367959427862608, + "grad_norm": 2.0625, + "learning_rate": 3.5213164360601555e-05, + "loss": 0.4123, + "step": 8380 + }, + { + "epoch": 0.36804724634181146, + "grad_norm": 2.015625, + "learning_rate": 3.5206849330847244e-05, + "loss": 0.3838, + "step": 8382 + }, + { + "epoch": 0.36813506482101493, + "grad_norm": 2.21875, + "learning_rate": 3.520053351944996e-05, + "loss": 0.4118, + "step": 8384 + }, + { + "epoch": 0.36822288330021846, + "grad_norm": 2.34375, + "learning_rate": 3.5194216926893395e-05, + "loss": 0.4044, + "step": 8386 + }, + { + "epoch": 0.36831070177942193, + "grad_norm": 2.03125, + "learning_rate": 3.5187899553661236e-05, + "loss": 0.4166, + "step": 8388 + }, + { + "epoch": 0.3683985202586254, + "grad_norm": 2.03125, + "learning_rate": 3.518158140023729e-05, + "loss": 0.4072, + "step": 8390 + }, + { + "epoch": 0.36848633873782893, + "grad_norm": 2.015625, + "learning_rate": 3.5175262467105404e-05, + "loss": 0.4168, + "step": 8392 + }, + { + "epoch": 0.3685741572170324, + "grad_norm": 2.28125, + "learning_rate": 3.5168942754749476e-05, + "loss": 0.3971, + "step": 8394 + }, + { + "epoch": 0.3686619756962359, + "grad_norm": 2.203125, + "learning_rate": 3.516262226365347e-05, + "loss": 0.3804, + "step": 8396 + }, + { + "epoch": 0.36874979417543935, + "grad_norm": 2.03125, + "learning_rate": 3.515630099430142e-05, + "loss": 0.4292, + "step": 8398 + }, + { + "epoch": 0.3688376126546429, + "grad_norm": 2.1875, + "learning_rate": 3.5149978947177396e-05, + "loss": 0.4162, + "step": 8400 + }, + { + "epoch": 0.36892543113384635, + "grad_norm": 2.15625, + "learning_rate": 3.514365612276557e-05, + "loss": 0.4324, + "step": 8402 + }, + { + "epoch": 0.3690132496130498, + "grad_norm": 2.03125, + "learning_rate": 3.5137332521550116e-05, + "loss": 0.4021, + "step": 8404 + }, + { + "epoch": 0.3691010680922533, + "grad_norm": 1.8671875, + "learning_rate": 3.513100814401531e-05, + "loss": 0.4144, + "step": 8406 + }, + { + "epoch": 0.3691888865714568, + "grad_norm": 1.9765625, + "learning_rate": 3.512468299064546e-05, + "loss": 0.3933, + "step": 8408 + }, + { + "epoch": 0.3692767050506603, + "grad_norm": 2.0, + "learning_rate": 3.5118357061924974e-05, + "loss": 0.3806, + "step": 8410 + }, + { + "epoch": 0.36936452352986376, + "grad_norm": 2.4375, + "learning_rate": 3.511203035833827e-05, + "loss": 0.3926, + "step": 8412 + }, + { + "epoch": 0.36945234200906724, + "grad_norm": 2.3125, + "learning_rate": 3.5105702880369864e-05, + "loss": 0.4062, + "step": 8414 + }, + { + "epoch": 0.36954016048827076, + "grad_norm": 2.578125, + "learning_rate": 3.509937462850431e-05, + "loss": 0.4053, + "step": 8416 + }, + { + "epoch": 0.36962797896747424, + "grad_norm": 2.390625, + "learning_rate": 3.509304560322622e-05, + "loss": 0.4269, + "step": 8418 + }, + { + "epoch": 0.3697157974466777, + "grad_norm": 2.25, + "learning_rate": 3.5086715805020274e-05, + "loss": 0.38, + "step": 8420 + }, + { + "epoch": 0.3698036159258812, + "grad_norm": 2.09375, + "learning_rate": 3.508038523437122e-05, + "loss": 0.3872, + "step": 8422 + }, + { + "epoch": 0.3698914344050847, + "grad_norm": 2.171875, + "learning_rate": 3.5074053891763844e-05, + "loss": 0.3803, + "step": 8424 + }, + { + "epoch": 0.3699792528842882, + "grad_norm": 2.203125, + "learning_rate": 3.506772177768301e-05, + "loss": 0.4023, + "step": 8426 + }, + { + "epoch": 0.37006707136349165, + "grad_norm": 2.046875, + "learning_rate": 3.506138889261364e-05, + "loss": 0.4042, + "step": 8428 + }, + { + "epoch": 0.3701548898426951, + "grad_norm": 1.8984375, + "learning_rate": 3.505505523704068e-05, + "loss": 0.3977, + "step": 8430 + }, + { + "epoch": 0.37024270832189865, + "grad_norm": 2.046875, + "learning_rate": 3.5048720811449185e-05, + "loss": 0.3787, + "step": 8432 + }, + { + "epoch": 0.3703305268011021, + "grad_norm": 2.03125, + "learning_rate": 3.504238561632424e-05, + "loss": 0.3843, + "step": 8434 + }, + { + "epoch": 0.3704183452803056, + "grad_norm": 2.0, + "learning_rate": 3.5036049652151e-05, + "loss": 0.3983, + "step": 8436 + }, + { + "epoch": 0.3705061637595091, + "grad_norm": 1.984375, + "learning_rate": 3.5029712919414664e-05, + "loss": 0.4091, + "step": 8438 + }, + { + "epoch": 0.3705939822387126, + "grad_norm": 1.9296875, + "learning_rate": 3.5023375418600524e-05, + "loss": 0.3897, + "step": 8440 + }, + { + "epoch": 0.37068180071791607, + "grad_norm": 1.9921875, + "learning_rate": 3.501703715019388e-05, + "loss": 0.4017, + "step": 8442 + }, + { + "epoch": 0.37076961919711954, + "grad_norm": 1.953125, + "learning_rate": 3.501069811468013e-05, + "loss": 0.4036, + "step": 8444 + }, + { + "epoch": 0.37085743767632307, + "grad_norm": 2.0625, + "learning_rate": 3.5004358312544714e-05, + "loss": 0.3832, + "step": 8446 + }, + { + "epoch": 0.37094525615552654, + "grad_norm": 1.8203125, + "learning_rate": 3.499801774427315e-05, + "loss": 0.4105, + "step": 8448 + }, + { + "epoch": 0.37103307463473, + "grad_norm": 2.3125, + "learning_rate": 3.499167641035099e-05, + "loss": 0.4005, + "step": 8450 + }, + { + "epoch": 0.3711208931139335, + "grad_norm": 2.09375, + "learning_rate": 3.498533431126386e-05, + "loss": 0.3815, + "step": 8452 + }, + { + "epoch": 0.371208711593137, + "grad_norm": 2.203125, + "learning_rate": 3.497899144749742e-05, + "loss": 0.4332, + "step": 8454 + }, + { + "epoch": 0.3712965300723405, + "grad_norm": 2.390625, + "learning_rate": 3.497264781953743e-05, + "loss": 0.4117, + "step": 8456 + }, + { + "epoch": 0.37138434855154395, + "grad_norm": 2.3125, + "learning_rate": 3.496630342786968e-05, + "loss": 0.3979, + "step": 8458 + }, + { + "epoch": 0.3714721670307474, + "grad_norm": 2.3125, + "learning_rate": 3.495995827298002e-05, + "loss": 0.4247, + "step": 8460 + }, + { + "epoch": 0.37155998550995095, + "grad_norm": 1.9453125, + "learning_rate": 3.4953612355354373e-05, + "loss": 0.4288, + "step": 8462 + }, + { + "epoch": 0.3716478039891544, + "grad_norm": 1.890625, + "learning_rate": 3.494726567547871e-05, + "loss": 0.4028, + "step": 8464 + }, + { + "epoch": 0.3717356224683579, + "grad_norm": 2.265625, + "learning_rate": 3.4940918233839056e-05, + "loss": 0.379, + "step": 8466 + }, + { + "epoch": 0.37182344094756137, + "grad_norm": 2.21875, + "learning_rate": 3.4934570030921494e-05, + "loss": 0.4114, + "step": 8468 + }, + { + "epoch": 0.3719112594267649, + "grad_norm": 2.078125, + "learning_rate": 3.492822106721217e-05, + "loss": 0.3756, + "step": 8470 + }, + { + "epoch": 0.37199907790596837, + "grad_norm": 2.296875, + "learning_rate": 3.492187134319731e-05, + "loss": 0.4375, + "step": 8472 + }, + { + "epoch": 0.37208689638517184, + "grad_norm": 2.0625, + "learning_rate": 3.491552085936316e-05, + "loss": 0.4181, + "step": 8474 + }, + { + "epoch": 0.3721747148643753, + "grad_norm": 2.53125, + "learning_rate": 3.4909169616196055e-05, + "loss": 0.3895, + "step": 8476 + }, + { + "epoch": 0.37226253334357884, + "grad_norm": 2.109375, + "learning_rate": 3.490281761418236e-05, + "loss": 0.3932, + "step": 8478 + }, + { + "epoch": 0.3723503518227823, + "grad_norm": 2.046875, + "learning_rate": 3.489646485380851e-05, + "loss": 0.4005, + "step": 8480 + }, + { + "epoch": 0.3724381703019858, + "grad_norm": 1.78125, + "learning_rate": 3.4890111335561016e-05, + "loss": 0.3688, + "step": 8482 + }, + { + "epoch": 0.37252598878118925, + "grad_norm": 2.0625, + "learning_rate": 3.488375705992642e-05, + "loss": 0.3811, + "step": 8484 + }, + { + "epoch": 0.3726138072603928, + "grad_norm": 2.109375, + "learning_rate": 3.487740202739134e-05, + "loss": 0.4228, + "step": 8486 + }, + { + "epoch": 0.37270162573959625, + "grad_norm": 2.015625, + "learning_rate": 3.487104623844245e-05, + "loss": 0.3903, + "step": 8488 + }, + { + "epoch": 0.3727894442187997, + "grad_norm": 2.0625, + "learning_rate": 3.486468969356647e-05, + "loss": 0.3835, + "step": 8490 + }, + { + "epoch": 0.37287726269800325, + "grad_norm": 1.984375, + "learning_rate": 3.485833239325019e-05, + "loss": 0.4037, + "step": 8492 + }, + { + "epoch": 0.3729650811772067, + "grad_norm": 2.375, + "learning_rate": 3.485197433798045e-05, + "loss": 0.372, + "step": 8494 + }, + { + "epoch": 0.3730528996564102, + "grad_norm": 2.25, + "learning_rate": 3.484561552824416e-05, + "loss": 0.3698, + "step": 8496 + }, + { + "epoch": 0.37314071813561367, + "grad_norm": 1.984375, + "learning_rate": 3.483925596452826e-05, + "loss": 0.4107, + "step": 8498 + }, + { + "epoch": 0.3732285366148172, + "grad_norm": 2.03125, + "learning_rate": 3.4832895647319786e-05, + "loss": 0.4162, + "step": 8500 + }, + { + "epoch": 0.37331635509402067, + "grad_norm": 2.171875, + "learning_rate": 3.482653457710581e-05, + "loss": 0.3997, + "step": 8502 + }, + { + "epoch": 0.37340417357322414, + "grad_norm": 2.140625, + "learning_rate": 3.482017275437346e-05, + "loss": 0.4082, + "step": 8504 + }, + { + "epoch": 0.3734919920524276, + "grad_norm": 2.203125, + "learning_rate": 3.481381017960992e-05, + "loss": 0.3791, + "step": 8506 + }, + { + "epoch": 0.37357981053163114, + "grad_norm": 2.09375, + "learning_rate": 3.480744685330244e-05, + "loss": 0.3819, + "step": 8508 + }, + { + "epoch": 0.3736676290108346, + "grad_norm": 2.296875, + "learning_rate": 3.480108277593834e-05, + "loss": 0.4309, + "step": 8510 + }, + { + "epoch": 0.3737554474900381, + "grad_norm": 1.953125, + "learning_rate": 3.4794717948004974e-05, + "loss": 0.3988, + "step": 8512 + }, + { + "epoch": 0.37384326596924156, + "grad_norm": 1.828125, + "learning_rate": 3.478835236998976e-05, + "loss": 0.4027, + "step": 8514 + }, + { + "epoch": 0.3739310844484451, + "grad_norm": 1.8046875, + "learning_rate": 3.4781986042380164e-05, + "loss": 0.3995, + "step": 8516 + }, + { + "epoch": 0.37401890292764856, + "grad_norm": 1.9609375, + "learning_rate": 3.4775618965663736e-05, + "loss": 0.4285, + "step": 8518 + }, + { + "epoch": 0.374106721406852, + "grad_norm": 2.015625, + "learning_rate": 3.476925114032806e-05, + "loss": 0.3945, + "step": 8520 + }, + { + "epoch": 0.3741945398860555, + "grad_norm": 1.921875, + "learning_rate": 3.47628825668608e-05, + "loss": 0.3842, + "step": 8522 + }, + { + "epoch": 0.374282358365259, + "grad_norm": 1.9765625, + "learning_rate": 3.475651324574965e-05, + "loss": 0.4019, + "step": 8524 + }, + { + "epoch": 0.3743701768444625, + "grad_norm": 2.0, + "learning_rate": 3.4750143177482366e-05, + "loss": 0.4039, + "step": 8526 + }, + { + "epoch": 0.37445799532366597, + "grad_norm": 2.25, + "learning_rate": 3.47437723625468e-05, + "loss": 0.3881, + "step": 8528 + }, + { + "epoch": 0.37454581380286944, + "grad_norm": 2.296875, + "learning_rate": 3.47374008014308e-05, + "loss": 0.4122, + "step": 8530 + }, + { + "epoch": 0.37463363228207297, + "grad_norm": 1.8671875, + "learning_rate": 3.473102849462231e-05, + "loss": 0.4023, + "step": 8532 + }, + { + "epoch": 0.37472145076127644, + "grad_norm": 2.390625, + "learning_rate": 3.472465544260932e-05, + "loss": 0.406, + "step": 8534 + }, + { + "epoch": 0.3748092692404799, + "grad_norm": 2.328125, + "learning_rate": 3.471828164587989e-05, + "loss": 0.3963, + "step": 8536 + }, + { + "epoch": 0.37489708771968344, + "grad_norm": 1.9296875, + "learning_rate": 3.471190710492213e-05, + "loss": 0.3803, + "step": 8538 + }, + { + "epoch": 0.3749849061988869, + "grad_norm": 1.953125, + "learning_rate": 3.470553182022419e-05, + "loss": 0.4057, + "step": 8540 + }, + { + "epoch": 0.3750727246780904, + "grad_norm": 2.125, + "learning_rate": 3.46991557922743e-05, + "loss": 0.384, + "step": 8542 + }, + { + "epoch": 0.37516054315729386, + "grad_norm": 2.15625, + "learning_rate": 3.4692779021560726e-05, + "loss": 0.3787, + "step": 8544 + }, + { + "epoch": 0.3752483616364974, + "grad_norm": 2.1875, + "learning_rate": 3.468640150857181e-05, + "loss": 0.3673, + "step": 8546 + }, + { + "epoch": 0.37533618011570086, + "grad_norm": 1.8359375, + "learning_rate": 3.4680023253795956e-05, + "loss": 0.3796, + "step": 8548 + }, + { + "epoch": 0.37542399859490433, + "grad_norm": 1.8125, + "learning_rate": 3.46736442577216e-05, + "loss": 0.3703, + "step": 8550 + }, + { + "epoch": 0.3755118170741078, + "grad_norm": 1.8125, + "learning_rate": 3.466726452083724e-05, + "loss": 0.4013, + "step": 8552 + }, + { + "epoch": 0.37559963555331133, + "grad_norm": 2.28125, + "learning_rate": 3.466088404363145e-05, + "loss": 0.3854, + "step": 8554 + }, + { + "epoch": 0.3756874540325148, + "grad_norm": 2.09375, + "learning_rate": 3.465450282659285e-05, + "loss": 0.4204, + "step": 8556 + }, + { + "epoch": 0.3757752725117183, + "grad_norm": 2.078125, + "learning_rate": 3.464812087021009e-05, + "loss": 0.412, + "step": 8558 + }, + { + "epoch": 0.37586309099092174, + "grad_norm": 1.984375, + "learning_rate": 3.4641738174971936e-05, + "loss": 0.4043, + "step": 8560 + }, + { + "epoch": 0.37595090947012527, + "grad_norm": 2.046875, + "learning_rate": 3.463535474136716e-05, + "loss": 0.3919, + "step": 8562 + }, + { + "epoch": 0.37603872794932874, + "grad_norm": 2.25, + "learning_rate": 3.462897056988461e-05, + "loss": 0.389, + "step": 8564 + }, + { + "epoch": 0.3761265464285322, + "grad_norm": 1.859375, + "learning_rate": 3.462258566101318e-05, + "loss": 0.3905, + "step": 8566 + }, + { + "epoch": 0.3762143649077357, + "grad_norm": 2.6875, + "learning_rate": 3.461620001524183e-05, + "loss": 0.4025, + "step": 8568 + }, + { + "epoch": 0.3763021833869392, + "grad_norm": 2.046875, + "learning_rate": 3.460981363305959e-05, + "loss": 0.4231, + "step": 8570 + }, + { + "epoch": 0.3763900018661427, + "grad_norm": 2.28125, + "learning_rate": 3.460342651495551e-05, + "loss": 0.423, + "step": 8572 + }, + { + "epoch": 0.37647782034534616, + "grad_norm": 1.984375, + "learning_rate": 3.459703866141872e-05, + "loss": 0.396, + "step": 8574 + }, + { + "epoch": 0.37656563882454963, + "grad_norm": 2.046875, + "learning_rate": 3.459065007293842e-05, + "loss": 0.4062, + "step": 8576 + }, + { + "epoch": 0.37665345730375316, + "grad_norm": 1.9296875, + "learning_rate": 3.458426075000383e-05, + "loss": 0.409, + "step": 8578 + }, + { + "epoch": 0.37674127578295663, + "grad_norm": 1.875, + "learning_rate": 3.4577870693104256e-05, + "loss": 0.4096, + "step": 8580 + }, + { + "epoch": 0.3768290942621601, + "grad_norm": 1.96875, + "learning_rate": 3.4571479902729045e-05, + "loss": 0.4013, + "step": 8582 + }, + { + "epoch": 0.3769169127413636, + "grad_norm": 2.078125, + "learning_rate": 3.456508837936762e-05, + "loss": 0.4027, + "step": 8584 + }, + { + "epoch": 0.3770047312205671, + "grad_norm": 1.9140625, + "learning_rate": 3.4558696123509426e-05, + "loss": 0.4047, + "step": 8586 + }, + { + "epoch": 0.3770925496997706, + "grad_norm": 1.8984375, + "learning_rate": 3.455230313564399e-05, + "loss": 0.3918, + "step": 8588 + }, + { + "epoch": 0.37718036817897405, + "grad_norm": 2.078125, + "learning_rate": 3.45459094162609e-05, + "loss": 0.3935, + "step": 8590 + }, + { + "epoch": 0.3772681866581776, + "grad_norm": 1.9765625, + "learning_rate": 3.453951496584977e-05, + "loss": 0.4381, + "step": 8592 + }, + { + "epoch": 0.37735600513738105, + "grad_norm": 1.9296875, + "learning_rate": 3.45331197849003e-05, + "loss": 0.3799, + "step": 8594 + }, + { + "epoch": 0.3774438236165845, + "grad_norm": 2.125, + "learning_rate": 3.452672387390223e-05, + "loss": 0.3954, + "step": 8596 + }, + { + "epoch": 0.377531642095788, + "grad_norm": 2.203125, + "learning_rate": 3.452032723334536e-05, + "loss": 0.4193, + "step": 8598 + }, + { + "epoch": 0.3776194605749915, + "grad_norm": 2.234375, + "learning_rate": 3.451392986371955e-05, + "loss": 0.3764, + "step": 8600 + }, + { + "epoch": 0.377707279054195, + "grad_norm": 1.90625, + "learning_rate": 3.450753176551472e-05, + "loss": 0.4466, + "step": 8602 + }, + { + "epoch": 0.37779509753339846, + "grad_norm": 2.0625, + "learning_rate": 3.4501132939220816e-05, + "loss": 0.3867, + "step": 8604 + }, + { + "epoch": 0.37788291601260193, + "grad_norm": 1.7890625, + "learning_rate": 3.4494733385327875e-05, + "loss": 0.4123, + "step": 8606 + }, + { + "epoch": 0.37797073449180546, + "grad_norm": 1.9765625, + "learning_rate": 3.4488333104325975e-05, + "loss": 0.3987, + "step": 8608 + }, + { + "epoch": 0.37805855297100893, + "grad_norm": 2.03125, + "learning_rate": 3.448193209670526e-05, + "loss": 0.3798, + "step": 8610 + }, + { + "epoch": 0.3781463714502124, + "grad_norm": 2.03125, + "learning_rate": 3.44755303629559e-05, + "loss": 0.3834, + "step": 8612 + }, + { + "epoch": 0.3782341899294159, + "grad_norm": 1.9453125, + "learning_rate": 3.446912790356817e-05, + "loss": 0.3721, + "step": 8614 + }, + { + "epoch": 0.3783220084086194, + "grad_norm": 2.109375, + "learning_rate": 3.446272471903235e-05, + "loss": 0.3949, + "step": 8616 + }, + { + "epoch": 0.3784098268878229, + "grad_norm": 2.296875, + "learning_rate": 3.445632080983879e-05, + "loss": 0.389, + "step": 8618 + }, + { + "epoch": 0.37849764536702635, + "grad_norm": 2.5625, + "learning_rate": 3.444991617647792e-05, + "loss": 0.3948, + "step": 8620 + }, + { + "epoch": 0.3785854638462298, + "grad_norm": 2.21875, + "learning_rate": 3.44435108194402e-05, + "loss": 0.4126, + "step": 8622 + }, + { + "epoch": 0.37867328232543335, + "grad_norm": 2.15625, + "learning_rate": 3.443710473921617e-05, + "loss": 0.4307, + "step": 8624 + }, + { + "epoch": 0.3787611008046368, + "grad_norm": 2.265625, + "learning_rate": 3.44306979362964e-05, + "loss": 0.3778, + "step": 8626 + }, + { + "epoch": 0.3788489192838403, + "grad_norm": 2.34375, + "learning_rate": 3.4424290411171505e-05, + "loss": 0.4118, + "step": 8628 + }, + { + "epoch": 0.37893673776304376, + "grad_norm": 2.359375, + "learning_rate": 3.44178821643322e-05, + "loss": 0.3992, + "step": 8630 + }, + { + "epoch": 0.3790245562422473, + "grad_norm": 2.53125, + "learning_rate": 3.441147319626922e-05, + "loss": 0.4063, + "step": 8632 + }, + { + "epoch": 0.37911237472145076, + "grad_norm": 1.890625, + "learning_rate": 3.440506350747337e-05, + "loss": 0.3811, + "step": 8634 + }, + { + "epoch": 0.37920019320065423, + "grad_norm": 2.078125, + "learning_rate": 3.43986530984355e-05, + "loss": 0.3884, + "step": 8636 + }, + { + "epoch": 0.37928801167985776, + "grad_norm": 2.078125, + "learning_rate": 3.439224196964652e-05, + "loss": 0.3733, + "step": 8638 + }, + { + "epoch": 0.37937583015906123, + "grad_norm": 2.21875, + "learning_rate": 3.43858301215974e-05, + "loss": 0.3773, + "step": 8640 + }, + { + "epoch": 0.3794636486382647, + "grad_norm": 2.046875, + "learning_rate": 3.437941755477916e-05, + "loss": 0.4134, + "step": 8642 + }, + { + "epoch": 0.3795514671174682, + "grad_norm": 1.890625, + "learning_rate": 3.437300426968287e-05, + "loss": 0.3893, + "step": 8644 + }, + { + "epoch": 0.3796392855966717, + "grad_norm": 1.9921875, + "learning_rate": 3.436659026679967e-05, + "loss": 0.4045, + "step": 8646 + }, + { + "epoch": 0.3797271040758752, + "grad_norm": 1.9921875, + "learning_rate": 3.436017554662074e-05, + "loss": 0.4129, + "step": 8648 + }, + { + "epoch": 0.37981492255507865, + "grad_norm": 2.140625, + "learning_rate": 3.4353760109637336e-05, + "loss": 0.4034, + "step": 8650 + }, + { + "epoch": 0.3799027410342821, + "grad_norm": 2.125, + "learning_rate": 3.4347343956340726e-05, + "loss": 0.3959, + "step": 8652 + }, + { + "epoch": 0.37999055951348565, + "grad_norm": 2.15625, + "learning_rate": 3.434092708722228e-05, + "loss": 0.4124, + "step": 8654 + }, + { + "epoch": 0.3800783779926891, + "grad_norm": 1.9375, + "learning_rate": 3.43345095027734e-05, + "loss": 0.4073, + "step": 8656 + }, + { + "epoch": 0.3801661964718926, + "grad_norm": 2.140625, + "learning_rate": 3.432809120348553e-05, + "loss": 0.3992, + "step": 8658 + }, + { + "epoch": 0.38025401495109606, + "grad_norm": 1.9296875, + "learning_rate": 3.432167218985022e-05, + "loss": 0.4143, + "step": 8660 + }, + { + "epoch": 0.3803418334302996, + "grad_norm": 1.890625, + "learning_rate": 3.4315252462359015e-05, + "loss": 0.4135, + "step": 8662 + }, + { + "epoch": 0.38042965190950306, + "grad_norm": 2.109375, + "learning_rate": 3.4308832021503544e-05, + "loss": 0.3911, + "step": 8664 + }, + { + "epoch": 0.38051747038870654, + "grad_norm": 2.078125, + "learning_rate": 3.430241086777548e-05, + "loss": 0.4136, + "step": 8666 + }, + { + "epoch": 0.38060528886791, + "grad_norm": 2.25, + "learning_rate": 3.429598900166656e-05, + "loss": 0.3959, + "step": 8668 + }, + { + "epoch": 0.38069310734711354, + "grad_norm": 1.7890625, + "learning_rate": 3.428956642366857e-05, + "loss": 0.3828, + "step": 8670 + }, + { + "epoch": 0.380780925826317, + "grad_norm": 1.9921875, + "learning_rate": 3.4283143134273365e-05, + "loss": 0.414, + "step": 8672 + }, + { + "epoch": 0.3808687443055205, + "grad_norm": 1.921875, + "learning_rate": 3.427671913397283e-05, + "loss": 0.4011, + "step": 8674 + }, + { + "epoch": 0.38095656278472395, + "grad_norm": 2.0625, + "learning_rate": 3.427029442325893e-05, + "loss": 0.37, + "step": 8676 + }, + { + "epoch": 0.3810443812639275, + "grad_norm": 2.078125, + "learning_rate": 3.426386900262365e-05, + "loss": 0.4073, + "step": 8678 + }, + { + "epoch": 0.38113219974313095, + "grad_norm": 2.5625, + "learning_rate": 3.425744287255907e-05, + "loss": 0.4084, + "step": 8680 + }, + { + "epoch": 0.3812200182223344, + "grad_norm": 1.9609375, + "learning_rate": 3.425101603355728e-05, + "loss": 0.4015, + "step": 8682 + }, + { + "epoch": 0.3813078367015379, + "grad_norm": 1.8515625, + "learning_rate": 3.4244588486110475e-05, + "loss": 0.3831, + "step": 8684 + }, + { + "epoch": 0.3813956551807414, + "grad_norm": 2.03125, + "learning_rate": 3.423816023071087e-05, + "loss": 0.4229, + "step": 8686 + }, + { + "epoch": 0.3814834736599449, + "grad_norm": 2.21875, + "learning_rate": 3.423173126785073e-05, + "loss": 0.4307, + "step": 8688 + }, + { + "epoch": 0.38157129213914837, + "grad_norm": 2.15625, + "learning_rate": 3.42253015980224e-05, + "loss": 0.4027, + "step": 8690 + }, + { + "epoch": 0.3816591106183519, + "grad_norm": 1.9765625, + "learning_rate": 3.4218871221718266e-05, + "loss": 0.4117, + "step": 8692 + }, + { + "epoch": 0.38174692909755537, + "grad_norm": 2.03125, + "learning_rate": 3.4212440139430765e-05, + "loss": 0.362, + "step": 8694 + }, + { + "epoch": 0.38183474757675884, + "grad_norm": 1.9296875, + "learning_rate": 3.420600835165239e-05, + "loss": 0.396, + "step": 8696 + }, + { + "epoch": 0.3819225660559623, + "grad_norm": 2.0625, + "learning_rate": 3.419957585887568e-05, + "loss": 0.3867, + "step": 8698 + }, + { + "epoch": 0.38201038453516584, + "grad_norm": 1.9140625, + "learning_rate": 3.4193142661593255e-05, + "loss": 0.4201, + "step": 8700 + }, + { + "epoch": 0.3820982030143693, + "grad_norm": 2.34375, + "learning_rate": 3.418670876029776e-05, + "loss": 0.3989, + "step": 8702 + }, + { + "epoch": 0.3821860214935728, + "grad_norm": 1.765625, + "learning_rate": 3.41802741554819e-05, + "loss": 0.3829, + "step": 8704 + }, + { + "epoch": 0.38227383997277625, + "grad_norm": 1.8671875, + "learning_rate": 3.417383884763845e-05, + "loss": 0.4079, + "step": 8706 + }, + { + "epoch": 0.3823616584519798, + "grad_norm": 2.453125, + "learning_rate": 3.416740283726022e-05, + "loss": 0.396, + "step": 8708 + }, + { + "epoch": 0.38244947693118325, + "grad_norm": 1.9609375, + "learning_rate": 3.416096612484008e-05, + "loss": 0.3813, + "step": 8710 + }, + { + "epoch": 0.3825372954103867, + "grad_norm": 2.140625, + "learning_rate": 3.415452871087097e-05, + "loss": 0.4056, + "step": 8712 + }, + { + "epoch": 0.3826251138895902, + "grad_norm": 2.03125, + "learning_rate": 3.414809059584585e-05, + "loss": 0.3915, + "step": 8714 + }, + { + "epoch": 0.3827129323687937, + "grad_norm": 1.96875, + "learning_rate": 3.414165178025775e-05, + "loss": 0.3685, + "step": 8716 + }, + { + "epoch": 0.3828007508479972, + "grad_norm": 2.65625, + "learning_rate": 3.413521226459977e-05, + "loss": 0.4022, + "step": 8718 + }, + { + "epoch": 0.38288856932720067, + "grad_norm": 2.640625, + "learning_rate": 3.412877204936505e-05, + "loss": 0.4032, + "step": 8720 + }, + { + "epoch": 0.38297638780640414, + "grad_norm": 2.328125, + "learning_rate": 3.412233113504677e-05, + "loss": 0.3842, + "step": 8722 + }, + { + "epoch": 0.38306420628560767, + "grad_norm": 2.125, + "learning_rate": 3.41158895221382e-05, + "loss": 0.3474, + "step": 8724 + }, + { + "epoch": 0.38315202476481114, + "grad_norm": 2.015625, + "learning_rate": 3.4109447211132616e-05, + "loss": 0.38, + "step": 8726 + }, + { + "epoch": 0.3832398432440146, + "grad_norm": 1.9765625, + "learning_rate": 3.410300420252338e-05, + "loss": 0.3728, + "step": 8728 + }, + { + "epoch": 0.3833276617232181, + "grad_norm": 2.015625, + "learning_rate": 3.40965604968039e-05, + "loss": 0.3944, + "step": 8730 + }, + { + "epoch": 0.3834154802024216, + "grad_norm": 2.515625, + "learning_rate": 3.409011609446763e-05, + "loss": 0.4302, + "step": 8732 + }, + { + "epoch": 0.3835032986816251, + "grad_norm": 2.140625, + "learning_rate": 3.40836709960081e-05, + "loss": 0.4424, + "step": 8734 + }, + { + "epoch": 0.38359111716082855, + "grad_norm": 2.140625, + "learning_rate": 3.407722520191887e-05, + "loss": 0.4114, + "step": 8736 + }, + { + "epoch": 0.383678935640032, + "grad_norm": 2.0625, + "learning_rate": 3.4070778712693555e-05, + "loss": 0.388, + "step": 8738 + }, + { + "epoch": 0.38376675411923555, + "grad_norm": 2.109375, + "learning_rate": 3.4064331528825834e-05, + "loss": 0.3817, + "step": 8740 + }, + { + "epoch": 0.383854572598439, + "grad_norm": 2.046875, + "learning_rate": 3.405788365080942e-05, + "loss": 0.4326, + "step": 8742 + }, + { + "epoch": 0.3839423910776425, + "grad_norm": 1.7890625, + "learning_rate": 3.405143507913812e-05, + "loss": 0.4267, + "step": 8744 + }, + { + "epoch": 0.384030209556846, + "grad_norm": 1.984375, + "learning_rate": 3.404498581430574e-05, + "loss": 0.4047, + "step": 8746 + }, + { + "epoch": 0.3841180280360495, + "grad_norm": 1.9140625, + "learning_rate": 3.403853585680619e-05, + "loss": 0.3583, + "step": 8748 + }, + { + "epoch": 0.38420584651525297, + "grad_norm": 1.9921875, + "learning_rate": 3.403208520713338e-05, + "loss": 0.3668, + "step": 8750 + }, + { + "epoch": 0.38429366499445644, + "grad_norm": 2.125, + "learning_rate": 3.402563386578133e-05, + "loss": 0.4008, + "step": 8752 + }, + { + "epoch": 0.38438148347365997, + "grad_norm": 1.9140625, + "learning_rate": 3.401918183324408e-05, + "loss": 0.3952, + "step": 8754 + }, + { + "epoch": 0.38446930195286344, + "grad_norm": 2.09375, + "learning_rate": 3.4012729110015715e-05, + "loss": 0.3952, + "step": 8756 + }, + { + "epoch": 0.3845571204320669, + "grad_norm": 1.9296875, + "learning_rate": 3.4006275696590394e-05, + "loss": 0.4012, + "step": 8758 + }, + { + "epoch": 0.3846449389112704, + "grad_norm": 1.8203125, + "learning_rate": 3.399982159346232e-05, + "loss": 0.3867, + "step": 8760 + }, + { + "epoch": 0.3847327573904739, + "grad_norm": 2.046875, + "learning_rate": 3.3993366801125766e-05, + "loss": 0.3622, + "step": 8762 + }, + { + "epoch": 0.3848205758696774, + "grad_norm": 2.015625, + "learning_rate": 3.398691132007501e-05, + "loss": 0.4084, + "step": 8764 + }, + { + "epoch": 0.38490839434888086, + "grad_norm": 1.921875, + "learning_rate": 3.398045515080443e-05, + "loss": 0.4114, + "step": 8766 + }, + { + "epoch": 0.3849962128280843, + "grad_norm": 2.171875, + "learning_rate": 3.397399829380845e-05, + "loss": 0.3966, + "step": 8768 + }, + { + "epoch": 0.38508403130728786, + "grad_norm": 2.203125, + "learning_rate": 3.3967540749581535e-05, + "loss": 0.4238, + "step": 8770 + }, + { + "epoch": 0.3851718497864913, + "grad_norm": 2.5625, + "learning_rate": 3.3961082518618195e-05, + "loss": 0.3761, + "step": 8772 + }, + { + "epoch": 0.3852596682656948, + "grad_norm": 2.03125, + "learning_rate": 3.395462360141301e-05, + "loss": 0.368, + "step": 8774 + }, + { + "epoch": 0.38534748674489827, + "grad_norm": 1.90625, + "learning_rate": 3.394816399846059e-05, + "loss": 0.4066, + "step": 8776 + }, + { + "epoch": 0.3854353052241018, + "grad_norm": 1.8828125, + "learning_rate": 3.3941703710255634e-05, + "loss": 0.4185, + "step": 8778 + }, + { + "epoch": 0.38552312370330527, + "grad_norm": 1.9140625, + "learning_rate": 3.393524273729286e-05, + "loss": 0.3871, + "step": 8780 + }, + { + "epoch": 0.38561094218250874, + "grad_norm": 2.5, + "learning_rate": 3.3928781080067064e-05, + "loss": 0.4065, + "step": 8782 + }, + { + "epoch": 0.3856987606617122, + "grad_norm": 2.21875, + "learning_rate": 3.392231873907307e-05, + "loss": 0.4016, + "step": 8784 + }, + { + "epoch": 0.38578657914091574, + "grad_norm": 2.109375, + "learning_rate": 3.3915855714805766e-05, + "loss": 0.3787, + "step": 8786 + }, + { + "epoch": 0.3858743976201192, + "grad_norm": 2.109375, + "learning_rate": 3.39093920077601e-05, + "loss": 0.3677, + "step": 8788 + }, + { + "epoch": 0.3859622160993227, + "grad_norm": 1.9609375, + "learning_rate": 3.3902927618431044e-05, + "loss": 0.3667, + "step": 8790 + }, + { + "epoch": 0.3860500345785262, + "grad_norm": 1.875, + "learning_rate": 3.3896462547313665e-05, + "loss": 0.3921, + "step": 8792 + }, + { + "epoch": 0.3861378530577297, + "grad_norm": 2.421875, + "learning_rate": 3.3889996794903055e-05, + "loss": 0.4078, + "step": 8794 + }, + { + "epoch": 0.38622567153693316, + "grad_norm": 2.328125, + "learning_rate": 3.3883530361694355e-05, + "loss": 0.388, + "step": 8796 + }, + { + "epoch": 0.38631349001613663, + "grad_norm": 2.4375, + "learning_rate": 3.3877063248182775e-05, + "loss": 0.386, + "step": 8798 + }, + { + "epoch": 0.38640130849534016, + "grad_norm": 1.9453125, + "learning_rate": 3.3870595454863564e-05, + "loss": 0.4035, + "step": 8800 + }, + { + "epoch": 0.38648912697454363, + "grad_norm": 2.4375, + "learning_rate": 3.386412698223202e-05, + "loss": 0.3703, + "step": 8802 + }, + { + "epoch": 0.3865769454537471, + "grad_norm": 2.1875, + "learning_rate": 3.385765783078351e-05, + "loss": 0.4085, + "step": 8804 + }, + { + "epoch": 0.3866647639329506, + "grad_norm": 2.25, + "learning_rate": 3.385118800101344e-05, + "loss": 0.3968, + "step": 8806 + }, + { + "epoch": 0.3867525824121541, + "grad_norm": 2.15625, + "learning_rate": 3.384471749341727e-05, + "loss": 0.3996, + "step": 8808 + }, + { + "epoch": 0.38684040089135757, + "grad_norm": 1.890625, + "learning_rate": 3.383824630849052e-05, + "loss": 0.4039, + "step": 8810 + }, + { + "epoch": 0.38692821937056104, + "grad_norm": 1.9453125, + "learning_rate": 3.383177444672874e-05, + "loss": 0.3871, + "step": 8812 + }, + { + "epoch": 0.3870160378497645, + "grad_norm": 2.078125, + "learning_rate": 3.3825301908627556e-05, + "loss": 0.4155, + "step": 8814 + }, + { + "epoch": 0.38710385632896804, + "grad_norm": 1.8515625, + "learning_rate": 3.381882869468264e-05, + "loss": 0.374, + "step": 8816 + }, + { + "epoch": 0.3871916748081715, + "grad_norm": 1.8046875, + "learning_rate": 3.3812354805389713e-05, + "loss": 0.4077, + "step": 8818 + }, + { + "epoch": 0.387279493287375, + "grad_norm": 1.8359375, + "learning_rate": 3.380588024124454e-05, + "loss": 0.3662, + "step": 8820 + }, + { + "epoch": 0.38736731176657846, + "grad_norm": 2.078125, + "learning_rate": 3.379940500274294e-05, + "loss": 0.4216, + "step": 8822 + }, + { + "epoch": 0.387455130245782, + "grad_norm": 2.015625, + "learning_rate": 3.3792929090380806e-05, + "loss": 0.4041, + "step": 8824 + }, + { + "epoch": 0.38754294872498546, + "grad_norm": 1.9609375, + "learning_rate": 3.3786452504654045e-05, + "loss": 0.3596, + "step": 8826 + }, + { + "epoch": 0.38763076720418893, + "grad_norm": 1.75, + "learning_rate": 3.377997524605865e-05, + "loss": 0.3894, + "step": 8828 + }, + { + "epoch": 0.3877185856833924, + "grad_norm": 1.9453125, + "learning_rate": 3.377349731509064e-05, + "loss": 0.3847, + "step": 8830 + }, + { + "epoch": 0.38780640416259593, + "grad_norm": 2.265625, + "learning_rate": 3.376701871224611e-05, + "loss": 0.39, + "step": 8832 + }, + { + "epoch": 0.3878942226417994, + "grad_norm": 2.1875, + "learning_rate": 3.3760539438021184e-05, + "loss": 0.4149, + "step": 8834 + }, + { + "epoch": 0.3879820411210029, + "grad_norm": 2.09375, + "learning_rate": 3.375405949291205e-05, + "loss": 0.3674, + "step": 8836 + }, + { + "epoch": 0.38806985960020635, + "grad_norm": 1.90625, + "learning_rate": 3.374757887741494e-05, + "loss": 0.3604, + "step": 8838 + }, + { + "epoch": 0.3881576780794099, + "grad_norm": 1.921875, + "learning_rate": 3.3741097592026136e-05, + "loss": 0.392, + "step": 8840 + }, + { + "epoch": 0.38824549655861335, + "grad_norm": 2.03125, + "learning_rate": 3.373461563724198e-05, + "loss": 0.411, + "step": 8842 + }, + { + "epoch": 0.3883333150378168, + "grad_norm": 2.109375, + "learning_rate": 3.372813301355888e-05, + "loss": 0.4219, + "step": 8844 + }, + { + "epoch": 0.38842113351702034, + "grad_norm": 2.3125, + "learning_rate": 3.3721649721473255e-05, + "loss": 0.4035, + "step": 8846 + }, + { + "epoch": 0.3885089519962238, + "grad_norm": 1.9140625, + "learning_rate": 3.3715165761481606e-05, + "loss": 0.3774, + "step": 8848 + }, + { + "epoch": 0.3885967704754273, + "grad_norm": 1.9453125, + "learning_rate": 3.370868113408047e-05, + "loss": 0.3749, + "step": 8850 + }, + { + "epoch": 0.38868458895463076, + "grad_norm": 1.8984375, + "learning_rate": 3.3702195839766445e-05, + "loss": 0.4078, + "step": 8852 + }, + { + "epoch": 0.3887724074338343, + "grad_norm": 1.9609375, + "learning_rate": 3.369570987903618e-05, + "loss": 0.3886, + "step": 8854 + }, + { + "epoch": 0.38886022591303776, + "grad_norm": 2.421875, + "learning_rate": 3.368922325238636e-05, + "loss": 0.3892, + "step": 8856 + }, + { + "epoch": 0.38894804439224123, + "grad_norm": 1.9140625, + "learning_rate": 3.368273596031374e-05, + "loss": 0.388, + "step": 8858 + }, + { + "epoch": 0.3890358628714447, + "grad_norm": 2.140625, + "learning_rate": 3.367624800331513e-05, + "loss": 0.3847, + "step": 8860 + }, + { + "epoch": 0.38912368135064823, + "grad_norm": 2.40625, + "learning_rate": 3.366975938188737e-05, + "loss": 0.3817, + "step": 8862 + }, + { + "epoch": 0.3892114998298517, + "grad_norm": 2.109375, + "learning_rate": 3.3663270096527344e-05, + "loss": 0.3949, + "step": 8864 + }, + { + "epoch": 0.3892993183090552, + "grad_norm": 2.078125, + "learning_rate": 3.3656780147732024e-05, + "loss": 0.3954, + "step": 8866 + }, + { + "epoch": 0.38938713678825865, + "grad_norm": 2.078125, + "learning_rate": 3.3650289535998406e-05, + "loss": 0.3788, + "step": 8868 + }, + { + "epoch": 0.3894749552674622, + "grad_norm": 1.90625, + "learning_rate": 3.364379826182354e-05, + "loss": 0.3814, + "step": 8870 + }, + { + "epoch": 0.38956277374666565, + "grad_norm": 2.125, + "learning_rate": 3.363730632570453e-05, + "loss": 0.4086, + "step": 8872 + }, + { + "epoch": 0.3896505922258691, + "grad_norm": 1.7578125, + "learning_rate": 3.363081372813853e-05, + "loss": 0.3798, + "step": 8874 + }, + { + "epoch": 0.3897384107050726, + "grad_norm": 1.9296875, + "learning_rate": 3.362432046962275e-05, + "loss": 0.4, + "step": 8876 + }, + { + "epoch": 0.3898262291842761, + "grad_norm": 1.8515625, + "learning_rate": 3.3617826550654445e-05, + "loss": 0.4174, + "step": 8878 + }, + { + "epoch": 0.3899140476634796, + "grad_norm": 2.0, + "learning_rate": 3.361133197173091e-05, + "loss": 0.388, + "step": 8880 + }, + { + "epoch": 0.39000186614268306, + "grad_norm": 2.03125, + "learning_rate": 3.360483673334951e-05, + "loss": 0.3948, + "step": 8882 + }, + { + "epoch": 0.39008968462188653, + "grad_norm": 2.078125, + "learning_rate": 3.359834083600765e-05, + "loss": 0.3934, + "step": 8884 + }, + { + "epoch": 0.39017750310109006, + "grad_norm": 1.7421875, + "learning_rate": 3.359184428020279e-05, + "loss": 0.3894, + "step": 8886 + }, + { + "epoch": 0.39026532158029353, + "grad_norm": 1.8515625, + "learning_rate": 3.3585347066432435e-05, + "loss": 0.4051, + "step": 8888 + }, + { + "epoch": 0.390353140059497, + "grad_norm": 1.828125, + "learning_rate": 3.357884919519414e-05, + "loss": 0.3849, + "step": 8890 + }, + { + "epoch": 0.39044095853870053, + "grad_norm": 1.78125, + "learning_rate": 3.357235066698552e-05, + "loss": 0.4023, + "step": 8892 + }, + { + "epoch": 0.390528777017904, + "grad_norm": 1.90625, + "learning_rate": 3.356585148230423e-05, + "loss": 0.3851, + "step": 8894 + }, + { + "epoch": 0.3906165954971075, + "grad_norm": 1.890625, + "learning_rate": 3.355935164164798e-05, + "loss": 0.4059, + "step": 8896 + }, + { + "epoch": 0.39070441397631095, + "grad_norm": 1.8046875, + "learning_rate": 3.355285114551453e-05, + "loss": 0.3912, + "step": 8898 + }, + { + "epoch": 0.3907922324555145, + "grad_norm": 2.015625, + "learning_rate": 3.3546349994401686e-05, + "loss": 0.4149, + "step": 8900 + }, + { + "epoch": 0.39088005093471795, + "grad_norm": 2.046875, + "learning_rate": 3.3539848188807315e-05, + "loss": 0.3998, + "step": 8902 + }, + { + "epoch": 0.3909678694139214, + "grad_norm": 1.890625, + "learning_rate": 3.3533345729229315e-05, + "loss": 0.3907, + "step": 8904 + }, + { + "epoch": 0.3910556878931249, + "grad_norm": 2.109375, + "learning_rate": 3.352684261616566e-05, + "loss": 0.4005, + "step": 8906 + }, + { + "epoch": 0.3911435063723284, + "grad_norm": 2.0, + "learning_rate": 3.352033885011436e-05, + "loss": 0.3899, + "step": 8908 + }, + { + "epoch": 0.3912313248515319, + "grad_norm": 2.078125, + "learning_rate": 3.351383443157347e-05, + "loss": 0.3959, + "step": 8910 + }, + { + "epoch": 0.39131914333073536, + "grad_norm": 2.40625, + "learning_rate": 3.350732936104108e-05, + "loss": 0.3978, + "step": 8912 + }, + { + "epoch": 0.39140696180993884, + "grad_norm": 2.078125, + "learning_rate": 3.3500823639015376e-05, + "loss": 0.3955, + "step": 8914 + }, + { + "epoch": 0.39149478028914236, + "grad_norm": 2.484375, + "learning_rate": 3.3494317265994565e-05, + "loss": 0.4163, + "step": 8916 + }, + { + "epoch": 0.39158259876834584, + "grad_norm": 2.34375, + "learning_rate": 3.348781024247689e-05, + "loss": 0.3811, + "step": 8918 + }, + { + "epoch": 0.3916704172475493, + "grad_norm": 2.09375, + "learning_rate": 3.3481302568960684e-05, + "loss": 0.3828, + "step": 8920 + }, + { + "epoch": 0.3917582357267528, + "grad_norm": 1.8125, + "learning_rate": 3.3474794245944294e-05, + "loss": 0.3912, + "step": 8922 + }, + { + "epoch": 0.3918460542059563, + "grad_norm": 1.9765625, + "learning_rate": 3.3468285273926123e-05, + "loss": 0.3943, + "step": 8924 + }, + { + "epoch": 0.3919338726851598, + "grad_norm": 1.7578125, + "learning_rate": 3.346177565340464e-05, + "loss": 0.3705, + "step": 8926 + }, + { + "epoch": 0.39202169116436325, + "grad_norm": 2.078125, + "learning_rate": 3.345526538487834e-05, + "loss": 0.4072, + "step": 8928 + }, + { + "epoch": 0.3921095096435667, + "grad_norm": 2.0625, + "learning_rate": 3.34487544688458e-05, + "loss": 0.4101, + "step": 8930 + }, + { + "epoch": 0.39219732812277025, + "grad_norm": 1.875, + "learning_rate": 3.3442242905805614e-05, + "loss": 0.3849, + "step": 8932 + }, + { + "epoch": 0.3922851466019737, + "grad_norm": 2.046875, + "learning_rate": 3.343573069625645e-05, + "loss": 0.3763, + "step": 8934 + }, + { + "epoch": 0.3923729650811772, + "grad_norm": 2.0625, + "learning_rate": 3.3429217840697e-05, + "loss": 0.3715, + "step": 8936 + }, + { + "epoch": 0.39246078356038067, + "grad_norm": 1.953125, + "learning_rate": 3.3422704339626024e-05, + "loss": 0.4216, + "step": 8938 + }, + { + "epoch": 0.3925486020395842, + "grad_norm": 1.875, + "learning_rate": 3.341619019354233e-05, + "loss": 0.3712, + "step": 8940 + }, + { + "epoch": 0.39263642051878767, + "grad_norm": 1.8359375, + "learning_rate": 3.3409675402944786e-05, + "loss": 0.3723, + "step": 8942 + }, + { + "epoch": 0.39272423899799114, + "grad_norm": 1.9921875, + "learning_rate": 3.3403159968332273e-05, + "loss": 0.3759, + "step": 8944 + }, + { + "epoch": 0.39281205747719466, + "grad_norm": 2.3125, + "learning_rate": 3.339664389020376e-05, + "loss": 0.3894, + "step": 8946 + }, + { + "epoch": 0.39289987595639814, + "grad_norm": 1.9375, + "learning_rate": 3.339012716905824e-05, + "loss": 0.3792, + "step": 8948 + }, + { + "epoch": 0.3929876944356016, + "grad_norm": 2.109375, + "learning_rate": 3.338360980539477e-05, + "loss": 0.4002, + "step": 8950 + }, + { + "epoch": 0.3930755129148051, + "grad_norm": 2.140625, + "learning_rate": 3.3377091799712454e-05, + "loss": 0.3979, + "step": 8952 + }, + { + "epoch": 0.3931633313940086, + "grad_norm": 1.8125, + "learning_rate": 3.3370573152510445e-05, + "loss": 0.4109, + "step": 8954 + }, + { + "epoch": 0.3932511498732121, + "grad_norm": 1.96875, + "learning_rate": 3.336405386428792e-05, + "loss": 0.4168, + "step": 8956 + }, + { + "epoch": 0.39333896835241555, + "grad_norm": 1.78125, + "learning_rate": 3.335753393554416e-05, + "loss": 0.3975, + "step": 8958 + }, + { + "epoch": 0.393426786831619, + "grad_norm": 1.984375, + "learning_rate": 3.335101336677845e-05, + "loss": 0.388, + "step": 8960 + }, + { + "epoch": 0.39351460531082255, + "grad_norm": 1.9296875, + "learning_rate": 3.3344492158490134e-05, + "loss": 0.4136, + "step": 8962 + }, + { + "epoch": 0.393602423790026, + "grad_norm": 2.171875, + "learning_rate": 3.33379703111786e-05, + "loss": 0.4321, + "step": 8964 + }, + { + "epoch": 0.3936902422692295, + "grad_norm": 2.015625, + "learning_rate": 3.3331447825343306e-05, + "loss": 0.3847, + "step": 8966 + }, + { + "epoch": 0.39377806074843297, + "grad_norm": 1.984375, + "learning_rate": 3.3324924701483734e-05, + "loss": 0.3922, + "step": 8968 + }, + { + "epoch": 0.3938658792276365, + "grad_norm": 1.890625, + "learning_rate": 3.331840094009944e-05, + "loss": 0.3798, + "step": 8970 + }, + { + "epoch": 0.39395369770683997, + "grad_norm": 1.921875, + "learning_rate": 3.3311876541690014e-05, + "loss": 0.3968, + "step": 8972 + }, + { + "epoch": 0.39404151618604344, + "grad_norm": 2.046875, + "learning_rate": 3.330535150675508e-05, + "loss": 0.4071, + "step": 8974 + }, + { + "epoch": 0.3941293346652469, + "grad_norm": 2.140625, + "learning_rate": 3.329882583579433e-05, + "loss": 0.381, + "step": 8976 + }, + { + "epoch": 0.39421715314445044, + "grad_norm": 2.421875, + "learning_rate": 3.329229952930752e-05, + "loss": 0.3968, + "step": 8978 + }, + { + "epoch": 0.3943049716236539, + "grad_norm": 2.296875, + "learning_rate": 3.3285772587794417e-05, + "loss": 0.3864, + "step": 8980 + }, + { + "epoch": 0.3943927901028574, + "grad_norm": 2.03125, + "learning_rate": 3.3279245011754874e-05, + "loss": 0.4029, + "step": 8982 + }, + { + "epoch": 0.39448060858206085, + "grad_norm": 2.046875, + "learning_rate": 3.3272716801688754e-05, + "loss": 0.3987, + "step": 8984 + }, + { + "epoch": 0.3945684270612644, + "grad_norm": 1.859375, + "learning_rate": 3.3266187958096e-05, + "loss": 0.4145, + "step": 8986 + }, + { + "epoch": 0.39465624554046785, + "grad_norm": 1.96875, + "learning_rate": 3.325965848147659e-05, + "loss": 0.3853, + "step": 8988 + }, + { + "epoch": 0.3947440640196713, + "grad_norm": 2.015625, + "learning_rate": 3.3253128372330556e-05, + "loss": 0.3784, + "step": 8990 + }, + { + "epoch": 0.39483188249887485, + "grad_norm": 1.7734375, + "learning_rate": 3.324659763115797e-05, + "loss": 0.3869, + "step": 8992 + }, + { + "epoch": 0.3949197009780783, + "grad_norm": 1.78125, + "learning_rate": 3.324006625845896e-05, + "loss": 0.4024, + "step": 8994 + }, + { + "epoch": 0.3950075194572818, + "grad_norm": 2.046875, + "learning_rate": 3.3233534254733706e-05, + "loss": 0.4002, + "step": 8996 + }, + { + "epoch": 0.39509533793648527, + "grad_norm": 1.875, + "learning_rate": 3.322700162048242e-05, + "loss": 0.3972, + "step": 8998 + }, + { + "epoch": 0.3951831564156888, + "grad_norm": 1.84375, + "learning_rate": 3.322046835620538e-05, + "loss": 0.3898, + "step": 9000 + }, + { + "epoch": 0.39527097489489227, + "grad_norm": 2.046875, + "learning_rate": 3.32139344624029e-05, + "loss": 0.3801, + "step": 9002 + }, + { + "epoch": 0.39535879337409574, + "grad_norm": 1.9453125, + "learning_rate": 3.320739993957535e-05, + "loss": 0.4005, + "step": 9004 + }, + { + "epoch": 0.3954466118532992, + "grad_norm": 2.078125, + "learning_rate": 3.320086478822315e-05, + "loss": 0.3623, + "step": 9006 + }, + { + "epoch": 0.39553443033250274, + "grad_norm": 1.953125, + "learning_rate": 3.319432900884676e-05, + "loss": 0.379, + "step": 9008 + }, + { + "epoch": 0.3956222488117062, + "grad_norm": 1.8515625, + "learning_rate": 3.318779260194668e-05, + "loss": 0.3844, + "step": 9010 + }, + { + "epoch": 0.3957100672909097, + "grad_norm": 1.8359375, + "learning_rate": 3.318125556802348e-05, + "loss": 0.3694, + "step": 9012 + }, + { + "epoch": 0.39579788577011316, + "grad_norm": 2.078125, + "learning_rate": 3.317471790757778e-05, + "loss": 0.3808, + "step": 9014 + }, + { + "epoch": 0.3958857042493167, + "grad_norm": 1.8984375, + "learning_rate": 3.316817962111022e-05, + "loss": 0.4207, + "step": 9016 + }, + { + "epoch": 0.39597352272852016, + "grad_norm": 1.9296875, + "learning_rate": 3.31616407091215e-05, + "loss": 0.3916, + "step": 9018 + }, + { + "epoch": 0.3960613412077236, + "grad_norm": 1.96875, + "learning_rate": 3.315510117211238e-05, + "loss": 0.379, + "step": 9020 + }, + { + "epoch": 0.3961491596869271, + "grad_norm": 1.7734375, + "learning_rate": 3.314856101058366e-05, + "loss": 0.3991, + "step": 9022 + }, + { + "epoch": 0.3962369781661306, + "grad_norm": 2.03125, + "learning_rate": 3.314202022503618e-05, + "loss": 0.3913, + "step": 9024 + }, + { + "epoch": 0.3963247966453341, + "grad_norm": 1.90625, + "learning_rate": 3.313547881597084e-05, + "loss": 0.3706, + "step": 9026 + }, + { + "epoch": 0.39641261512453757, + "grad_norm": 1.9375, + "learning_rate": 3.312893678388858e-05, + "loss": 0.3919, + "step": 9028 + }, + { + "epoch": 0.39650043360374104, + "grad_norm": 1.8671875, + "learning_rate": 3.31223941292904e-05, + "loss": 0.3523, + "step": 9030 + }, + { + "epoch": 0.39658825208294457, + "grad_norm": 2.125, + "learning_rate": 3.3115850852677327e-05, + "loss": 0.3827, + "step": 9032 + }, + { + "epoch": 0.39667607056214804, + "grad_norm": 2.0, + "learning_rate": 3.310930695455046e-05, + "loss": 0.3588, + "step": 9034 + }, + { + "epoch": 0.3967638890413515, + "grad_norm": 2.03125, + "learning_rate": 3.3102762435410904e-05, + "loss": 0.4117, + "step": 9036 + }, + { + "epoch": 0.396851707520555, + "grad_norm": 1.9921875, + "learning_rate": 3.3096217295759866e-05, + "loss": 0.3824, + "step": 9038 + }, + { + "epoch": 0.3969395259997585, + "grad_norm": 2.09375, + "learning_rate": 3.308967153609857e-05, + "loss": 0.3642, + "step": 9040 + }, + { + "epoch": 0.397027344478962, + "grad_norm": 2.15625, + "learning_rate": 3.308312515692828e-05, + "loss": 0.3674, + "step": 9042 + }, + { + "epoch": 0.39711516295816546, + "grad_norm": 1.8671875, + "learning_rate": 3.307657815875034e-05, + "loss": 0.4016, + "step": 9044 + }, + { + "epoch": 0.397202981437369, + "grad_norm": 2.15625, + "learning_rate": 3.30700305420661e-05, + "loss": 0.372, + "step": 9046 + }, + { + "epoch": 0.39729079991657246, + "grad_norm": 2.203125, + "learning_rate": 3.306348230737699e-05, + "loss": 0.426, + "step": 9048 + }, + { + "epoch": 0.39737861839577593, + "grad_norm": 1.859375, + "learning_rate": 3.305693345518447e-05, + "loss": 0.3717, + "step": 9050 + }, + { + "epoch": 0.3974664368749794, + "grad_norm": 2.171875, + "learning_rate": 3.3050383985990056e-05, + "loss": 0.4166, + "step": 9052 + }, + { + "epoch": 0.39755425535418293, + "grad_norm": 2.328125, + "learning_rate": 3.304383390029531e-05, + "loss": 0.4124, + "step": 9054 + }, + { + "epoch": 0.3976420738333864, + "grad_norm": 2.21875, + "learning_rate": 3.3037283198601826e-05, + "loss": 0.4108, + "step": 9056 + }, + { + "epoch": 0.39772989231258987, + "grad_norm": 1.8515625, + "learning_rate": 3.303073188141128e-05, + "loss": 0.3818, + "step": 9058 + }, + { + "epoch": 0.39781771079179334, + "grad_norm": 2.046875, + "learning_rate": 3.3024179949225355e-05, + "loss": 0.3845, + "step": 9060 + }, + { + "epoch": 0.39790552927099687, + "grad_norm": 2.078125, + "learning_rate": 3.30176274025458e-05, + "loss": 0.4132, + "step": 9062 + }, + { + "epoch": 0.39799334775020034, + "grad_norm": 2.3125, + "learning_rate": 3.301107424187443e-05, + "loss": 0.3977, + "step": 9064 + }, + { + "epoch": 0.3980811662294038, + "grad_norm": 2.265625, + "learning_rate": 3.300452046771306e-05, + "loss": 0.3447, + "step": 9066 + }, + { + "epoch": 0.3981689847086073, + "grad_norm": 1.953125, + "learning_rate": 3.29979660805636e-05, + "loss": 0.3903, + "step": 9068 + }, + { + "epoch": 0.3982568031878108, + "grad_norm": 1.984375, + "learning_rate": 3.299141108092799e-05, + "loss": 0.3841, + "step": 9070 + }, + { + "epoch": 0.3983446216670143, + "grad_norm": 2.125, + "learning_rate": 3.298485546930819e-05, + "loss": 0.3854, + "step": 9072 + }, + { + "epoch": 0.39843244014621776, + "grad_norm": 1.8125, + "learning_rate": 3.2978299246206246e-05, + "loss": 0.3778, + "step": 9074 + }, + { + "epoch": 0.39852025862542123, + "grad_norm": 2.125, + "learning_rate": 3.297174241212424e-05, + "loss": 0.4199, + "step": 9076 + }, + { + "epoch": 0.39860807710462476, + "grad_norm": 2.09375, + "learning_rate": 3.296518496756428e-05, + "loss": 0.3954, + "step": 9078 + }, + { + "epoch": 0.39869589558382823, + "grad_norm": 1.984375, + "learning_rate": 3.295862691302855e-05, + "loss": 0.3856, + "step": 9080 + }, + { + "epoch": 0.3987837140630317, + "grad_norm": 1.84375, + "learning_rate": 3.295206824901926e-05, + "loss": 0.3974, + "step": 9082 + }, + { + "epoch": 0.3988715325422352, + "grad_norm": 2.046875, + "learning_rate": 3.2945508976038694e-05, + "loss": 0.4178, + "step": 9084 + }, + { + "epoch": 0.3989593510214387, + "grad_norm": 1.9609375, + "learning_rate": 3.293894909458913e-05, + "loss": 0.3664, + "step": 9086 + }, + { + "epoch": 0.3990471695006422, + "grad_norm": 1.984375, + "learning_rate": 3.2932388605172946e-05, + "loss": 0.3838, + "step": 9088 + }, + { + "epoch": 0.39913498797984565, + "grad_norm": 2.625, + "learning_rate": 3.2925827508292535e-05, + "loss": 0.3785, + "step": 9090 + }, + { + "epoch": 0.3992228064590491, + "grad_norm": 1.96875, + "learning_rate": 3.2919265804450364e-05, + "loss": 0.3911, + "step": 9092 + }, + { + "epoch": 0.39931062493825265, + "grad_norm": 2.109375, + "learning_rate": 3.291270349414891e-05, + "loss": 0.3595, + "step": 9094 + }, + { + "epoch": 0.3993984434174561, + "grad_norm": 1.921875, + "learning_rate": 3.290614057789073e-05, + "loss": 0.3703, + "step": 9096 + }, + { + "epoch": 0.3994862618966596, + "grad_norm": 1.9453125, + "learning_rate": 3.289957705617841e-05, + "loss": 0.3909, + "step": 9098 + }, + { + "epoch": 0.3995740803758631, + "grad_norm": 2.015625, + "learning_rate": 3.2893012929514574e-05, + "loss": 0.391, + "step": 9100 + }, + { + "epoch": 0.3996618988550666, + "grad_norm": 2.53125, + "learning_rate": 3.288644819840193e-05, + "loss": 0.3775, + "step": 9102 + }, + { + "epoch": 0.39974971733427006, + "grad_norm": 2.0, + "learning_rate": 3.2879882863343184e-05, + "loss": 0.4158, + "step": 9104 + }, + { + "epoch": 0.39983753581347353, + "grad_norm": 2.15625, + "learning_rate": 3.287331692484113e-05, + "loss": 0.37, + "step": 9106 + }, + { + "epoch": 0.39992535429267706, + "grad_norm": 2.109375, + "learning_rate": 3.286675038339857e-05, + "loss": 0.3767, + "step": 9108 + }, + { + "epoch": 0.40001317277188053, + "grad_norm": 2.1875, + "learning_rate": 3.286018323951838e-05, + "loss": 0.3919, + "step": 9110 + }, + { + "epoch": 0.400100991251084, + "grad_norm": 1.9375, + "learning_rate": 3.2853615493703475e-05, + "loss": 0.4056, + "step": 9112 + }, + { + "epoch": 0.4001888097302875, + "grad_norm": 1.8046875, + "learning_rate": 3.284704714645681e-05, + "loss": 0.4012, + "step": 9114 + }, + { + "epoch": 0.400276628209491, + "grad_norm": 1.9453125, + "learning_rate": 3.28404781982814e-05, + "loss": 0.4101, + "step": 9116 + }, + { + "epoch": 0.4003644466886945, + "grad_norm": 1.8359375, + "learning_rate": 3.283390864968029e-05, + "loss": 0.3904, + "step": 9118 + }, + { + "epoch": 0.40045226516789795, + "grad_norm": 1.8359375, + "learning_rate": 3.282733850115657e-05, + "loss": 0.4022, + "step": 9120 + }, + { + "epoch": 0.4005400836471014, + "grad_norm": 1.796875, + "learning_rate": 3.28207677532134e-05, + "loss": 0.3885, + "step": 9122 + }, + { + "epoch": 0.40062790212630495, + "grad_norm": 1.8046875, + "learning_rate": 3.281419640635395e-05, + "loss": 0.4061, + "step": 9124 + }, + { + "epoch": 0.4007157206055084, + "grad_norm": 1.875, + "learning_rate": 3.2807624461081477e-05, + "loss": 0.3681, + "step": 9126 + }, + { + "epoch": 0.4008035390847119, + "grad_norm": 1.8515625, + "learning_rate": 3.280105191789925e-05, + "loss": 0.3851, + "step": 9128 + }, + { + "epoch": 0.40089135756391536, + "grad_norm": 1.8125, + "learning_rate": 3.279447877731058e-05, + "loss": 0.3873, + "step": 9130 + }, + { + "epoch": 0.4009791760431189, + "grad_norm": 1.8828125, + "learning_rate": 3.2787905039818875e-05, + "loss": 0.3851, + "step": 9132 + }, + { + "epoch": 0.40106699452232236, + "grad_norm": 2.125, + "learning_rate": 3.278133070592753e-05, + "loss": 0.3864, + "step": 9134 + }, + { + "epoch": 0.40115481300152583, + "grad_norm": 2.0625, + "learning_rate": 3.277475577614002e-05, + "loss": 0.3968, + "step": 9136 + }, + { + "epoch": 0.4012426314807293, + "grad_norm": 1.8671875, + "learning_rate": 3.276818025095984e-05, + "loss": 0.3765, + "step": 9138 + }, + { + "epoch": 0.40133044995993283, + "grad_norm": 1.859375, + "learning_rate": 3.276160413089056e-05, + "loss": 0.3751, + "step": 9140 + }, + { + "epoch": 0.4014182684391363, + "grad_norm": 1.7890625, + "learning_rate": 3.275502741643577e-05, + "loss": 0.4117, + "step": 9142 + }, + { + "epoch": 0.4015060869183398, + "grad_norm": 1.8046875, + "learning_rate": 3.274845010809913e-05, + "loss": 0.3599, + "step": 9144 + }, + { + "epoch": 0.4015939053975433, + "grad_norm": 1.96875, + "learning_rate": 3.274187220638431e-05, + "loss": 0.4034, + "step": 9146 + }, + { + "epoch": 0.4016817238767468, + "grad_norm": 2.15625, + "learning_rate": 3.273529371179507e-05, + "loss": 0.3909, + "step": 9148 + }, + { + "epoch": 0.40176954235595025, + "grad_norm": 2.171875, + "learning_rate": 3.2728714624835174e-05, + "loss": 0.3765, + "step": 9150 + }, + { + "epoch": 0.4018573608351537, + "grad_norm": 2.359375, + "learning_rate": 3.272213494600847e-05, + "loss": 0.3848, + "step": 9152 + }, + { + "epoch": 0.40194517931435725, + "grad_norm": 1.984375, + "learning_rate": 3.271555467581882e-05, + "loss": 0.3683, + "step": 9154 + }, + { + "epoch": 0.4020329977935607, + "grad_norm": 2.078125, + "learning_rate": 3.270897381477014e-05, + "loss": 0.376, + "step": 9156 + }, + { + "epoch": 0.4021208162727642, + "grad_norm": 2.265625, + "learning_rate": 3.27023923633664e-05, + "loss": 0.4018, + "step": 9158 + }, + { + "epoch": 0.40220863475196766, + "grad_norm": 1.9609375, + "learning_rate": 3.26958103221116e-05, + "loss": 0.4136, + "step": 9160 + }, + { + "epoch": 0.4022964532311712, + "grad_norm": 1.875, + "learning_rate": 3.26892276915098e-05, + "loss": 0.3917, + "step": 9162 + }, + { + "epoch": 0.40238427171037466, + "grad_norm": 1.8359375, + "learning_rate": 3.268264447206511e-05, + "loss": 0.3996, + "step": 9164 + }, + { + "epoch": 0.40247209018957814, + "grad_norm": 2.0, + "learning_rate": 3.267606066428166e-05, + "loss": 0.3714, + "step": 9166 + }, + { + "epoch": 0.4025599086687816, + "grad_norm": 2.25, + "learning_rate": 3.266947626866365e-05, + "loss": 0.3664, + "step": 9168 + }, + { + "epoch": 0.40264772714798513, + "grad_norm": 1.984375, + "learning_rate": 3.26628912857153e-05, + "loss": 0.3979, + "step": 9170 + }, + { + "epoch": 0.4027355456271886, + "grad_norm": 1.796875, + "learning_rate": 3.2656305715940905e-05, + "loss": 0.3641, + "step": 9172 + }, + { + "epoch": 0.4028233641063921, + "grad_norm": 2.1875, + "learning_rate": 3.264971955984478e-05, + "loss": 0.3947, + "step": 9174 + }, + { + "epoch": 0.40291118258559555, + "grad_norm": 1.9140625, + "learning_rate": 3.2643132817931294e-05, + "loss": 0.3627, + "step": 9176 + }, + { + "epoch": 0.4029990010647991, + "grad_norm": 1.96875, + "learning_rate": 3.263654549070486e-05, + "loss": 0.3983, + "step": 9178 + }, + { + "epoch": 0.40308681954400255, + "grad_norm": 1.8671875, + "learning_rate": 3.262995757866996e-05, + "loss": 0.4208, + "step": 9180 + }, + { + "epoch": 0.403174638023206, + "grad_norm": 2.09375, + "learning_rate": 3.262336908233106e-05, + "loss": 0.4071, + "step": 9182 + }, + { + "epoch": 0.4032624565024095, + "grad_norm": 2.1875, + "learning_rate": 3.2616780002192746e-05, + "loss": 0.4131, + "step": 9184 + }, + { + "epoch": 0.403350274981613, + "grad_norm": 1.9375, + "learning_rate": 3.2610190338759586e-05, + "loss": 0.4001, + "step": 9186 + }, + { + "epoch": 0.4034380934608165, + "grad_norm": 1.875, + "learning_rate": 3.2603600092536216e-05, + "loss": 0.4055, + "step": 9188 + }, + { + "epoch": 0.40352591194001997, + "grad_norm": 1.8046875, + "learning_rate": 3.259700926402734e-05, + "loss": 0.3863, + "step": 9190 + }, + { + "epoch": 0.40361373041922344, + "grad_norm": 1.8828125, + "learning_rate": 3.2590417853737666e-05, + "loss": 0.4105, + "step": 9192 + }, + { + "epoch": 0.40370154889842697, + "grad_norm": 1.8828125, + "learning_rate": 3.258382586217198e-05, + "loss": 0.4068, + "step": 9194 + }, + { + "epoch": 0.40378936737763044, + "grad_norm": 1.9453125, + "learning_rate": 3.2577233289835085e-05, + "loss": 0.3828, + "step": 9196 + }, + { + "epoch": 0.4038771858568339, + "grad_norm": 1.9609375, + "learning_rate": 3.257064013723185e-05, + "loss": 0.3971, + "step": 9198 + }, + { + "epoch": 0.40396500433603744, + "grad_norm": 1.984375, + "learning_rate": 3.256404640486719e-05, + "loss": 0.4, + "step": 9200 + }, + { + "epoch": 0.4040528228152409, + "grad_norm": 1.875, + "learning_rate": 3.2557452093246025e-05, + "loss": 0.3995, + "step": 9202 + }, + { + "epoch": 0.4041406412944444, + "grad_norm": 1.828125, + "learning_rate": 3.255085720287337e-05, + "loss": 0.3784, + "step": 9204 + }, + { + "epoch": 0.40422845977364785, + "grad_norm": 2.140625, + "learning_rate": 3.254426173425428e-05, + "loss": 0.4098, + "step": 9206 + }, + { + "epoch": 0.4043162782528514, + "grad_norm": 2.234375, + "learning_rate": 3.2537665687893796e-05, + "loss": 0.4037, + "step": 9208 + }, + { + "epoch": 0.40440409673205485, + "grad_norm": 2.15625, + "learning_rate": 3.253106906429707e-05, + "loss": 0.3656, + "step": 9210 + }, + { + "epoch": 0.4044919152112583, + "grad_norm": 2.09375, + "learning_rate": 3.2524471863969274e-05, + "loss": 0.382, + "step": 9212 + }, + { + "epoch": 0.4045797336904618, + "grad_norm": 1.828125, + "learning_rate": 3.251787408741562e-05, + "loss": 0.3949, + "step": 9214 + }, + { + "epoch": 0.4046675521696653, + "grad_norm": 2.40625, + "learning_rate": 3.2511275735141365e-05, + "loss": 0.4174, + "step": 9216 + }, + { + "epoch": 0.4047553706488688, + "grad_norm": 1.9609375, + "learning_rate": 3.250467680765181e-05, + "loss": 0.4042, + "step": 9218 + }, + { + "epoch": 0.40484318912807227, + "grad_norm": 1.9140625, + "learning_rate": 3.2498077305452316e-05, + "loss": 0.3811, + "step": 9220 + }, + { + "epoch": 0.40493100760727574, + "grad_norm": 1.796875, + "learning_rate": 3.249147722904826e-05, + "loss": 0.3887, + "step": 9222 + }, + { + "epoch": 0.40501882608647927, + "grad_norm": 1.8828125, + "learning_rate": 3.248487657894508e-05, + "loss": 0.3687, + "step": 9224 + }, + { + "epoch": 0.40510664456568274, + "grad_norm": 1.90625, + "learning_rate": 3.247827535564826e-05, + "loss": 0.377, + "step": 9226 + }, + { + "epoch": 0.4051944630448862, + "grad_norm": 2.046875, + "learning_rate": 3.2471673559663314e-05, + "loss": 0.382, + "step": 9228 + }, + { + "epoch": 0.4052822815240897, + "grad_norm": 1.828125, + "learning_rate": 3.246507119149582e-05, + "loss": 0.3965, + "step": 9230 + }, + { + "epoch": 0.4053701000032932, + "grad_norm": 1.859375, + "learning_rate": 3.245846825165139e-05, + "loss": 0.3912, + "step": 9232 + }, + { + "epoch": 0.4054579184824967, + "grad_norm": 2.203125, + "learning_rate": 3.245186474063566e-05, + "loss": 0.3826, + "step": 9234 + }, + { + "epoch": 0.40554573696170015, + "grad_norm": 2.3125, + "learning_rate": 3.244526065895436e-05, + "loss": 0.3925, + "step": 9236 + }, + { + "epoch": 0.4056335554409036, + "grad_norm": 2.0625, + "learning_rate": 3.24386560071132e-05, + "loss": 0.3818, + "step": 9238 + }, + { + "epoch": 0.40572137392010715, + "grad_norm": 1.7890625, + "learning_rate": 3.243205078561798e-05, + "loss": 0.3815, + "step": 9240 + }, + { + "epoch": 0.4058091923993106, + "grad_norm": 1.875, + "learning_rate": 3.242544499497453e-05, + "loss": 0.3767, + "step": 9242 + }, + { + "epoch": 0.4058970108785141, + "grad_norm": 2.125, + "learning_rate": 3.241883863568873e-05, + "loss": 0.3857, + "step": 9244 + }, + { + "epoch": 0.4059848293577176, + "grad_norm": 1.9453125, + "learning_rate": 3.241223170826648e-05, + "loss": 0.393, + "step": 9246 + }, + { + "epoch": 0.4060726478369211, + "grad_norm": 2.234375, + "learning_rate": 3.240562421321376e-05, + "loss": 0.3945, + "step": 9248 + }, + { + "epoch": 0.40616046631612457, + "grad_norm": 1.8671875, + "learning_rate": 3.2399016151036555e-05, + "loss": 0.3851, + "step": 9250 + }, + { + "epoch": 0.40624828479532804, + "grad_norm": 2.4375, + "learning_rate": 3.239240752224091e-05, + "loss": 0.377, + "step": 9252 + }, + { + "epoch": 0.40633610327453157, + "grad_norm": 2.03125, + "learning_rate": 3.238579832733294e-05, + "loss": 0.3738, + "step": 9254 + }, + { + "epoch": 0.40642392175373504, + "grad_norm": 2.125, + "learning_rate": 3.2379188566818765e-05, + "loss": 0.3562, + "step": 9256 + }, + { + "epoch": 0.4065117402329385, + "grad_norm": 1.9765625, + "learning_rate": 3.237257824120455e-05, + "loss": 0.3817, + "step": 9258 + }, + { + "epoch": 0.406599558712142, + "grad_norm": 2.078125, + "learning_rate": 3.2365967350996526e-05, + "loss": 0.3536, + "step": 9260 + }, + { + "epoch": 0.4066873771913455, + "grad_norm": 2.4375, + "learning_rate": 3.2359355896700964e-05, + "loss": 0.3519, + "step": 9262 + }, + { + "epoch": 0.406775195670549, + "grad_norm": 1.859375, + "learning_rate": 3.235274387882416e-05, + "loss": 0.3692, + "step": 9264 + }, + { + "epoch": 0.40686301414975246, + "grad_norm": 1.9375, + "learning_rate": 3.234613129787246e-05, + "loss": 0.3724, + "step": 9266 + }, + { + "epoch": 0.4069508326289559, + "grad_norm": 1.75, + "learning_rate": 3.233951815435228e-05, + "loss": 0.3574, + "step": 9268 + }, + { + "epoch": 0.40703865110815945, + "grad_norm": 2.25, + "learning_rate": 3.233290444877003e-05, + "loss": 0.3948, + "step": 9270 + }, + { + "epoch": 0.4071264695873629, + "grad_norm": 2.0625, + "learning_rate": 3.2326290181632204e-05, + "loss": 0.4047, + "step": 9272 + }, + { + "epoch": 0.4072142880665664, + "grad_norm": 1.953125, + "learning_rate": 3.2319675353445314e-05, + "loss": 0.3698, + "step": 9274 + }, + { + "epoch": 0.40730210654576987, + "grad_norm": 1.875, + "learning_rate": 3.231305996471593e-05, + "loss": 0.3717, + "step": 9276 + }, + { + "epoch": 0.4073899250249734, + "grad_norm": 2.015625, + "learning_rate": 3.230644401595067e-05, + "loss": 0.4248, + "step": 9278 + }, + { + "epoch": 0.40747774350417687, + "grad_norm": 2.140625, + "learning_rate": 3.2299827507656165e-05, + "loss": 0.3944, + "step": 9280 + }, + { + "epoch": 0.40756556198338034, + "grad_norm": 2.46875, + "learning_rate": 3.229321044033913e-05, + "loss": 0.3982, + "step": 9282 + }, + { + "epoch": 0.4076533804625838, + "grad_norm": 3.28125, + "learning_rate": 3.228659281450628e-05, + "loss": 0.3924, + "step": 9284 + }, + { + "epoch": 0.40774119894178734, + "grad_norm": 2.125, + "learning_rate": 3.227997463066441e-05, + "loss": 0.3693, + "step": 9286 + }, + { + "epoch": 0.4078290174209908, + "grad_norm": 2.3125, + "learning_rate": 3.227335588932034e-05, + "loss": 0.3468, + "step": 9288 + }, + { + "epoch": 0.4079168359001943, + "grad_norm": 1.6875, + "learning_rate": 3.226673659098093e-05, + "loss": 0.4128, + "step": 9290 + }, + { + "epoch": 0.40800465437939776, + "grad_norm": 1.828125, + "learning_rate": 3.226011673615309e-05, + "loss": 0.3843, + "step": 9292 + }, + { + "epoch": 0.4080924728586013, + "grad_norm": 2.15625, + "learning_rate": 3.225349632534378e-05, + "loss": 0.4072, + "step": 9294 + }, + { + "epoch": 0.40818029133780476, + "grad_norm": 1.765625, + "learning_rate": 3.2246875359059966e-05, + "loss": 0.3747, + "step": 9296 + }, + { + "epoch": 0.40826810981700823, + "grad_norm": 2.15625, + "learning_rate": 3.2240253837808706e-05, + "loss": 0.3809, + "step": 9298 + }, + { + "epoch": 0.40835592829621176, + "grad_norm": 2.234375, + "learning_rate": 3.223363176209708e-05, + "loss": 0.3934, + "step": 9300 + }, + { + "epoch": 0.40844374677541523, + "grad_norm": 1.8828125, + "learning_rate": 3.222700913243219e-05, + "loss": 0.4021, + "step": 9302 + }, + { + "epoch": 0.4085315652546187, + "grad_norm": 1.796875, + "learning_rate": 3.2220385949321215e-05, + "loss": 0.379, + "step": 9304 + }, + { + "epoch": 0.4086193837338222, + "grad_norm": 1.859375, + "learning_rate": 3.221376221327135e-05, + "loss": 0.4074, + "step": 9306 + }, + { + "epoch": 0.4087072022130257, + "grad_norm": 1.9765625, + "learning_rate": 3.220713792478984e-05, + "loss": 0.4007, + "step": 9308 + }, + { + "epoch": 0.40879502069222917, + "grad_norm": 1.8515625, + "learning_rate": 3.220051308438399e-05, + "loss": 0.3983, + "step": 9310 + }, + { + "epoch": 0.40888283917143264, + "grad_norm": 1.8828125, + "learning_rate": 3.2193887692561115e-05, + "loss": 0.3699, + "step": 9312 + }, + { + "epoch": 0.4089706576506361, + "grad_norm": 2.0, + "learning_rate": 3.2187261749828594e-05, + "loss": 0.3806, + "step": 9314 + }, + { + "epoch": 0.40905847612983964, + "grad_norm": 2.296875, + "learning_rate": 3.218063525669385e-05, + "loss": 0.3909, + "step": 9316 + }, + { + "epoch": 0.4091462946090431, + "grad_norm": 1.8984375, + "learning_rate": 3.2174008213664335e-05, + "loss": 0.3955, + "step": 9318 + }, + { + "epoch": 0.4092341130882466, + "grad_norm": 2.15625, + "learning_rate": 3.216738062124756e-05, + "loss": 0.3532, + "step": 9320 + }, + { + "epoch": 0.40932193156745006, + "grad_norm": 1.9375, + "learning_rate": 3.216075247995105e-05, + "loss": 0.375, + "step": 9322 + }, + { + "epoch": 0.4094097500466536, + "grad_norm": 1.9140625, + "learning_rate": 3.21541237902824e-05, + "loss": 0.378, + "step": 9324 + }, + { + "epoch": 0.40949756852585706, + "grad_norm": 1.9296875, + "learning_rate": 3.214749455274923e-05, + "loss": 0.3989, + "step": 9326 + }, + { + "epoch": 0.40958538700506053, + "grad_norm": 2.28125, + "learning_rate": 3.2140864767859216e-05, + "loss": 0.3949, + "step": 9328 + }, + { + "epoch": 0.409673205484264, + "grad_norm": 1.9453125, + "learning_rate": 3.213423443612007e-05, + "loss": 0.4091, + "step": 9330 + }, + { + "epoch": 0.40976102396346753, + "grad_norm": 1.9296875, + "learning_rate": 3.2127603558039545e-05, + "loss": 0.4058, + "step": 9332 + }, + { + "epoch": 0.409848842442671, + "grad_norm": 1.890625, + "learning_rate": 3.212097213412542e-05, + "loss": 0.3779, + "step": 9334 + }, + { + "epoch": 0.4099366609218745, + "grad_norm": 1.9921875, + "learning_rate": 3.211434016488555e-05, + "loss": 0.3734, + "step": 9336 + }, + { + "epoch": 0.41002447940107795, + "grad_norm": 2.0625, + "learning_rate": 3.2107707650827804e-05, + "loss": 0.3795, + "step": 9338 + }, + { + "epoch": 0.4101122978802815, + "grad_norm": 1.921875, + "learning_rate": 3.2101074592460094e-05, + "loss": 0.3807, + "step": 9340 + }, + { + "epoch": 0.41020011635948495, + "grad_norm": 1.8359375, + "learning_rate": 3.2094440990290395e-05, + "loss": 0.3797, + "step": 9342 + }, + { + "epoch": 0.4102879348386884, + "grad_norm": 1.8515625, + "learning_rate": 3.208780684482671e-05, + "loss": 0.3895, + "step": 9344 + }, + { + "epoch": 0.41037575331789194, + "grad_norm": 2.015625, + "learning_rate": 3.208117215657707e-05, + "loss": 0.3704, + "step": 9346 + }, + { + "epoch": 0.4104635717970954, + "grad_norm": 1.9453125, + "learning_rate": 3.207453692604957e-05, + "loss": 0.3739, + "step": 9348 + }, + { + "epoch": 0.4105513902762989, + "grad_norm": 2.03125, + "learning_rate": 3.206790115375234e-05, + "loss": 0.3953, + "step": 9350 + }, + { + "epoch": 0.41063920875550236, + "grad_norm": 1.9453125, + "learning_rate": 3.206126484019354e-05, + "loss": 0.3564, + "step": 9352 + }, + { + "epoch": 0.4107270272347059, + "grad_norm": 2.015625, + "learning_rate": 3.205462798588139e-05, + "loss": 0.3534, + "step": 9354 + }, + { + "epoch": 0.41081484571390936, + "grad_norm": 1.8828125, + "learning_rate": 3.204799059132414e-05, + "loss": 0.4022, + "step": 9356 + }, + { + "epoch": 0.41090266419311283, + "grad_norm": 2.28125, + "learning_rate": 3.204135265703008e-05, + "loss": 0.385, + "step": 9358 + }, + { + "epoch": 0.4109904826723163, + "grad_norm": 2.203125, + "learning_rate": 3.203471418350754e-05, + "loss": 0.3867, + "step": 9360 + }, + { + "epoch": 0.41107830115151983, + "grad_norm": 2.0625, + "learning_rate": 3.20280751712649e-05, + "loss": 0.4418, + "step": 9362 + }, + { + "epoch": 0.4111661196307233, + "grad_norm": 2.21875, + "learning_rate": 3.2021435620810587e-05, + "loss": 0.3827, + "step": 9364 + }, + { + "epoch": 0.4112539381099268, + "grad_norm": 1.796875, + "learning_rate": 3.2014795532653054e-05, + "loss": 0.4018, + "step": 9366 + }, + { + "epoch": 0.41134175658913025, + "grad_norm": 1.9921875, + "learning_rate": 3.200815490730079e-05, + "loss": 0.4129, + "step": 9368 + }, + { + "epoch": 0.4114295750683338, + "grad_norm": 2.046875, + "learning_rate": 3.200151374526234e-05, + "loss": 0.3971, + "step": 9370 + }, + { + "epoch": 0.41151739354753725, + "grad_norm": 1.984375, + "learning_rate": 3.19948720470463e-05, + "loss": 0.3694, + "step": 9372 + }, + { + "epoch": 0.4116052120267407, + "grad_norm": 1.9921875, + "learning_rate": 3.198822981316127e-05, + "loss": 0.3975, + "step": 9374 + }, + { + "epoch": 0.4116930305059442, + "grad_norm": 1.984375, + "learning_rate": 3.198158704411593e-05, + "loss": 0.3705, + "step": 9376 + }, + { + "epoch": 0.4117808489851477, + "grad_norm": 2.0, + "learning_rate": 3.1974943740418986e-05, + "loss": 0.4057, + "step": 9378 + }, + { + "epoch": 0.4118686674643512, + "grad_norm": 1.953125, + "learning_rate": 3.1968299902579166e-05, + "loss": 0.3794, + "step": 9380 + }, + { + "epoch": 0.41195648594355466, + "grad_norm": 2.265625, + "learning_rate": 3.196165553110528e-05, + "loss": 0.3638, + "step": 9382 + }, + { + "epoch": 0.41204430442275813, + "grad_norm": 2.1875, + "learning_rate": 3.1955010626506146e-05, + "loss": 0.3815, + "step": 9384 + }, + { + "epoch": 0.41213212290196166, + "grad_norm": 2.109375, + "learning_rate": 3.1948365189290625e-05, + "loss": 0.379, + "step": 9386 + }, + { + "epoch": 0.41221994138116513, + "grad_norm": 1.8828125, + "learning_rate": 3.194171921996763e-05, + "loss": 0.3453, + "step": 9388 + }, + { + "epoch": 0.4123077598603686, + "grad_norm": 1.65625, + "learning_rate": 3.1935072719046115e-05, + "loss": 0.3958, + "step": 9390 + }, + { + "epoch": 0.4123955783395721, + "grad_norm": 1.9375, + "learning_rate": 3.192842568703508e-05, + "loss": 0.4017, + "step": 9392 + }, + { + "epoch": 0.4124833968187756, + "grad_norm": 2.140625, + "learning_rate": 3.192177812444353e-05, + "loss": 0.4008, + "step": 9394 + }, + { + "epoch": 0.4125712152979791, + "grad_norm": 1.8359375, + "learning_rate": 3.191513003178055e-05, + "loss": 0.3786, + "step": 9396 + }, + { + "epoch": 0.41265903377718255, + "grad_norm": 2.015625, + "learning_rate": 3.1908481409555266e-05, + "loss": 0.3702, + "step": 9398 + }, + { + "epoch": 0.4127468522563861, + "grad_norm": 1.8671875, + "learning_rate": 3.190183225827682e-05, + "loss": 0.3878, + "step": 9400 + }, + { + "epoch": 0.41283467073558955, + "grad_norm": 2.171875, + "learning_rate": 3.1895182578454395e-05, + "loss": 0.3722, + "step": 9402 + }, + { + "epoch": 0.412922489214793, + "grad_norm": 1.8671875, + "learning_rate": 3.188853237059725e-05, + "loss": 0.3754, + "step": 9404 + }, + { + "epoch": 0.4130103076939965, + "grad_norm": 2.25, + "learning_rate": 3.188188163521463e-05, + "loss": 0.4122, + "step": 9406 + }, + { + "epoch": 0.4130981261732, + "grad_norm": 2.03125, + "learning_rate": 3.1875230372815864e-05, + "loss": 0.3988, + "step": 9408 + }, + { + "epoch": 0.4131859446524035, + "grad_norm": 1.9296875, + "learning_rate": 3.1868578583910316e-05, + "loss": 0.364, + "step": 9410 + }, + { + "epoch": 0.41327376313160696, + "grad_norm": 2.0625, + "learning_rate": 3.186192626900737e-05, + "loss": 0.379, + "step": 9412 + }, + { + "epoch": 0.41336158161081044, + "grad_norm": 2.0, + "learning_rate": 3.185527342861647e-05, + "loss": 0.4191, + "step": 9414 + }, + { + "epoch": 0.41344940009001396, + "grad_norm": 2.25, + "learning_rate": 3.184862006324709e-05, + "loss": 0.414, + "step": 9416 + }, + { + "epoch": 0.41353721856921744, + "grad_norm": 1.984375, + "learning_rate": 3.184196617340874e-05, + "loss": 0.4011, + "step": 9418 + }, + { + "epoch": 0.4136250370484209, + "grad_norm": 1.7421875, + "learning_rate": 3.1835311759610975e-05, + "loss": 0.3796, + "step": 9420 + }, + { + "epoch": 0.4137128555276244, + "grad_norm": 1.9765625, + "learning_rate": 3.18286568223634e-05, + "loss": 0.3948, + "step": 9422 + }, + { + "epoch": 0.4138006740068279, + "grad_norm": 1.9140625, + "learning_rate": 3.1822001362175646e-05, + "loss": 0.3655, + "step": 9424 + }, + { + "epoch": 0.4138884924860314, + "grad_norm": 2.109375, + "learning_rate": 3.18153453795574e-05, + "loss": 0.3793, + "step": 9426 + }, + { + "epoch": 0.41397631096523485, + "grad_norm": 1.8125, + "learning_rate": 3.180868887501837e-05, + "loss": 0.3982, + "step": 9428 + }, + { + "epoch": 0.4140641294444383, + "grad_norm": 1.96875, + "learning_rate": 3.1802031849068316e-05, + "loss": 0.3917, + "step": 9430 + }, + { + "epoch": 0.41415194792364185, + "grad_norm": 1.9453125, + "learning_rate": 3.1795374302217025e-05, + "loss": 0.3937, + "step": 9432 + }, + { + "epoch": 0.4142397664028453, + "grad_norm": 2.0, + "learning_rate": 3.178871623497434e-05, + "loss": 0.3886, + "step": 9434 + }, + { + "epoch": 0.4143275848820488, + "grad_norm": 2.046875, + "learning_rate": 3.178205764785014e-05, + "loss": 0.3583, + "step": 9436 + }, + { + "epoch": 0.41441540336125227, + "grad_norm": 1.8359375, + "learning_rate": 3.177539854135434e-05, + "loss": 0.3792, + "step": 9438 + }, + { + "epoch": 0.4145032218404558, + "grad_norm": 1.8828125, + "learning_rate": 3.1768738915996896e-05, + "loss": 0.3909, + "step": 9440 + }, + { + "epoch": 0.41459104031965927, + "grad_norm": 2.015625, + "learning_rate": 3.1762078772287804e-05, + "loss": 0.3741, + "step": 9442 + }, + { + "epoch": 0.41467885879886274, + "grad_norm": 2.3125, + "learning_rate": 3.17554181107371e-05, + "loss": 0.3985, + "step": 9444 + }, + { + "epoch": 0.41476667727806626, + "grad_norm": 1.921875, + "learning_rate": 3.174875693185486e-05, + "loss": 0.3948, + "step": 9446 + }, + { + "epoch": 0.41485449575726974, + "grad_norm": 2.078125, + "learning_rate": 3.174209523615119e-05, + "loss": 0.3797, + "step": 9448 + }, + { + "epoch": 0.4149423142364732, + "grad_norm": 2.234375, + "learning_rate": 3.173543302413625e-05, + "loss": 0.3699, + "step": 9450 + }, + { + "epoch": 0.4150301327156767, + "grad_norm": 1.953125, + "learning_rate": 3.172877029632023e-05, + "loss": 0.3681, + "step": 9452 + }, + { + "epoch": 0.4151179511948802, + "grad_norm": 1.90625, + "learning_rate": 3.1722107053213386e-05, + "loss": 0.361, + "step": 9454 + }, + { + "epoch": 0.4152057696740837, + "grad_norm": 1.9921875, + "learning_rate": 3.171544329532596e-05, + "loss": 0.3652, + "step": 9456 + }, + { + "epoch": 0.41529358815328715, + "grad_norm": 2.265625, + "learning_rate": 3.1708779023168275e-05, + "loss": 0.3872, + "step": 9458 + }, + { + "epoch": 0.4153814066324906, + "grad_norm": 1.984375, + "learning_rate": 3.170211423725069e-05, + "loss": 0.3984, + "step": 9460 + }, + { + "epoch": 0.41546922511169415, + "grad_norm": 2.09375, + "learning_rate": 3.169544893808359e-05, + "loss": 0.4018, + "step": 9462 + }, + { + "epoch": 0.4155570435908976, + "grad_norm": 1.9296875, + "learning_rate": 3.16887831261774e-05, + "loss": 0.4096, + "step": 9464 + }, + { + "epoch": 0.4156448620701011, + "grad_norm": 1.859375, + "learning_rate": 3.16821168020426e-05, + "loss": 0.3844, + "step": 9466 + }, + { + "epoch": 0.41573268054930457, + "grad_norm": 1.9921875, + "learning_rate": 3.167544996618969e-05, + "loss": 0.4051, + "step": 9468 + }, + { + "epoch": 0.4158204990285081, + "grad_norm": 2.09375, + "learning_rate": 3.166878261912922e-05, + "loss": 0.3821, + "step": 9470 + }, + { + "epoch": 0.41590831750771157, + "grad_norm": 1.9140625, + "learning_rate": 3.166211476137178e-05, + "loss": 0.4056, + "step": 9472 + }, + { + "epoch": 0.41599613598691504, + "grad_norm": 1.9765625, + "learning_rate": 3.1655446393427994e-05, + "loss": 0.3796, + "step": 9474 + }, + { + "epoch": 0.4160839544661185, + "grad_norm": 2.484375, + "learning_rate": 3.164877751580853e-05, + "loss": 0.3845, + "step": 9476 + }, + { + "epoch": 0.41617177294532204, + "grad_norm": 2.078125, + "learning_rate": 3.164210812902409e-05, + "loss": 0.3913, + "step": 9478 + }, + { + "epoch": 0.4162595914245255, + "grad_norm": 2.09375, + "learning_rate": 3.1635438233585425e-05, + "loss": 0.401, + "step": 9480 + }, + { + "epoch": 0.416347409903729, + "grad_norm": 1.9140625, + "learning_rate": 3.162876783000329e-05, + "loss": 0.3596, + "step": 9482 + }, + { + "epoch": 0.41643522838293245, + "grad_norm": 2.15625, + "learning_rate": 3.162209691878854e-05, + "loss": 0.3905, + "step": 9484 + }, + { + "epoch": 0.416523046862136, + "grad_norm": 1.828125, + "learning_rate": 3.161542550045202e-05, + "loss": 0.3869, + "step": 9486 + }, + { + "epoch": 0.41661086534133945, + "grad_norm": 2.015625, + "learning_rate": 3.160875357550462e-05, + "loss": 0.3911, + "step": 9488 + }, + { + "epoch": 0.4166986838205429, + "grad_norm": 1.7890625, + "learning_rate": 3.1602081144457297e-05, + "loss": 0.3836, + "step": 9490 + }, + { + "epoch": 0.4167865022997464, + "grad_norm": 1.9296875, + "learning_rate": 3.159540820782102e-05, + "loss": 0.3814, + "step": 9492 + }, + { + "epoch": 0.4168743207789499, + "grad_norm": 1.8828125, + "learning_rate": 3.1588734766106794e-05, + "loss": 0.3724, + "step": 9494 + }, + { + "epoch": 0.4169621392581534, + "grad_norm": 1.7421875, + "learning_rate": 3.1582060819825674e-05, + "loss": 0.387, + "step": 9496 + }, + { + "epoch": 0.41704995773735687, + "grad_norm": 1.7890625, + "learning_rate": 3.1575386369488766e-05, + "loss": 0.3574, + "step": 9498 + }, + { + "epoch": 0.4171377762165604, + "grad_norm": 1.9609375, + "learning_rate": 3.15687114156072e-05, + "loss": 0.3769, + "step": 9500 + }, + { + "epoch": 0.41722559469576387, + "grad_norm": 1.7734375, + "learning_rate": 3.156203595869213e-05, + "loss": 0.4157, + "step": 9502 + }, + { + "epoch": 0.41731341317496734, + "grad_norm": 2.015625, + "learning_rate": 3.155535999925478e-05, + "loss": 0.401, + "step": 9504 + }, + { + "epoch": 0.4174012316541708, + "grad_norm": 1.8203125, + "learning_rate": 3.1548683537806384e-05, + "loss": 0.3857, + "step": 9506 + }, + { + "epoch": 0.41748905013337434, + "grad_norm": 1.90625, + "learning_rate": 3.1542006574858236e-05, + "loss": 0.365, + "step": 9508 + }, + { + "epoch": 0.4175768686125778, + "grad_norm": 2.140625, + "learning_rate": 3.153532911092165e-05, + "loss": 0.3915, + "step": 9510 + }, + { + "epoch": 0.4176646870917813, + "grad_norm": 2.125, + "learning_rate": 3.1528651146508e-05, + "loss": 0.3779, + "step": 9512 + }, + { + "epoch": 0.41775250557098476, + "grad_norm": 2.1875, + "learning_rate": 3.1521972682128674e-05, + "loss": 0.3807, + "step": 9514 + }, + { + "epoch": 0.4178403240501883, + "grad_norm": 2.1875, + "learning_rate": 3.151529371829513e-05, + "loss": 0.3603, + "step": 9516 + }, + { + "epoch": 0.41792814252939176, + "grad_norm": 1.8828125, + "learning_rate": 3.150861425551882e-05, + "loss": 0.3942, + "step": 9518 + }, + { + "epoch": 0.4180159610085952, + "grad_norm": 1.890625, + "learning_rate": 3.150193429431127e-05, + "loss": 0.3824, + "step": 9520 + }, + { + "epoch": 0.4181037794877987, + "grad_norm": 1.984375, + "learning_rate": 3.149525383518404e-05, + "loss": 0.4087, + "step": 9522 + }, + { + "epoch": 0.4181915979670022, + "grad_norm": 1.953125, + "learning_rate": 3.148857287864871e-05, + "loss": 0.3705, + "step": 9524 + }, + { + "epoch": 0.4182794164462057, + "grad_norm": 1.8984375, + "learning_rate": 3.148189142521691e-05, + "loss": 0.3858, + "step": 9526 + }, + { + "epoch": 0.41836723492540917, + "grad_norm": 1.984375, + "learning_rate": 3.1475209475400316e-05, + "loss": 0.3966, + "step": 9528 + }, + { + "epoch": 0.41845505340461264, + "grad_norm": 1.8828125, + "learning_rate": 3.146852702971063e-05, + "loss": 0.3847, + "step": 9530 + }, + { + "epoch": 0.41854287188381617, + "grad_norm": 1.9296875, + "learning_rate": 3.146184408865959e-05, + "loss": 0.4065, + "step": 9532 + }, + { + "epoch": 0.41863069036301964, + "grad_norm": 1.8515625, + "learning_rate": 3.1455160652758975e-05, + "loss": 0.3839, + "step": 9534 + }, + { + "epoch": 0.4187185088422231, + "grad_norm": 1.8359375, + "learning_rate": 3.1448476722520625e-05, + "loss": 0.3895, + "step": 9536 + }, + { + "epoch": 0.4188063273214266, + "grad_norm": 2.125, + "learning_rate": 3.144179229845637e-05, + "loss": 0.4202, + "step": 9538 + }, + { + "epoch": 0.4188941458006301, + "grad_norm": 1.953125, + "learning_rate": 3.143510738107812e-05, + "loss": 0.4095, + "step": 9540 + }, + { + "epoch": 0.4189819642798336, + "grad_norm": 1.6484375, + "learning_rate": 3.1428421970897804e-05, + "loss": 0.4062, + "step": 9542 + }, + { + "epoch": 0.41906978275903706, + "grad_norm": 1.9609375, + "learning_rate": 3.142173606842739e-05, + "loss": 0.3639, + "step": 9544 + }, + { + "epoch": 0.41915760123824053, + "grad_norm": 1.9140625, + "learning_rate": 3.1415049674178884e-05, + "loss": 0.3899, + "step": 9546 + }, + { + "epoch": 0.41924541971744406, + "grad_norm": 2.03125, + "learning_rate": 3.1408362788664346e-05, + "loss": 0.3779, + "step": 9548 + }, + { + "epoch": 0.41933323819664753, + "grad_norm": 2.0, + "learning_rate": 3.1401675412395845e-05, + "loss": 0.4029, + "step": 9550 + }, + { + "epoch": 0.419421056675851, + "grad_norm": 1.9140625, + "learning_rate": 3.139498754588551e-05, + "loss": 0.3936, + "step": 9552 + }, + { + "epoch": 0.41950887515505453, + "grad_norm": 1.7578125, + "learning_rate": 3.1388299189645496e-05, + "loss": 0.407, + "step": 9554 + }, + { + "epoch": 0.419596693634258, + "grad_norm": 1.9140625, + "learning_rate": 3.1381610344187995e-05, + "loss": 0.3474, + "step": 9556 + }, + { + "epoch": 0.41968451211346147, + "grad_norm": 2.296875, + "learning_rate": 3.1374921010025246e-05, + "loss": 0.4169, + "step": 9558 + }, + { + "epoch": 0.41977233059266494, + "grad_norm": 1.9921875, + "learning_rate": 3.136823118766951e-05, + "loss": 0.3908, + "step": 9560 + }, + { + "epoch": 0.41986014907186847, + "grad_norm": 1.859375, + "learning_rate": 3.1361540877633114e-05, + "loss": 0.3901, + "step": 9562 + }, + { + "epoch": 0.41994796755107194, + "grad_norm": 2.09375, + "learning_rate": 3.135485008042839e-05, + "loss": 0.3783, + "step": 9564 + }, + { + "epoch": 0.4200357860302754, + "grad_norm": 1.8359375, + "learning_rate": 3.134815879656772e-05, + "loss": 0.3723, + "step": 9566 + }, + { + "epoch": 0.4201236045094789, + "grad_norm": 1.7734375, + "learning_rate": 3.134146702656353e-05, + "loss": 0.391, + "step": 9568 + }, + { + "epoch": 0.4202114229886824, + "grad_norm": 1.828125, + "learning_rate": 3.1334774770928276e-05, + "loss": 0.3982, + "step": 9570 + }, + { + "epoch": 0.4202992414678859, + "grad_norm": 1.9375, + "learning_rate": 3.132808203017445e-05, + "loss": 0.3727, + "step": 9572 + }, + { + "epoch": 0.42038705994708936, + "grad_norm": 1.8984375, + "learning_rate": 3.1321388804814584e-05, + "loss": 0.365, + "step": 9574 + }, + { + "epoch": 0.42047487842629283, + "grad_norm": 1.703125, + "learning_rate": 3.131469509536125e-05, + "loss": 0.4185, + "step": 9576 + }, + { + "epoch": 0.42056269690549636, + "grad_norm": 1.953125, + "learning_rate": 3.130800090232705e-05, + "loss": 0.3825, + "step": 9578 + }, + { + "epoch": 0.42065051538469983, + "grad_norm": 1.8515625, + "learning_rate": 3.1301306226224625e-05, + "loss": 0.3819, + "step": 9580 + }, + { + "epoch": 0.4207383338639033, + "grad_norm": 1.828125, + "learning_rate": 3.129461106756667e-05, + "loss": 0.3885, + "step": 9582 + }, + { + "epoch": 0.4208261523431068, + "grad_norm": 1.9140625, + "learning_rate": 3.128791542686588e-05, + "loss": 0.3542, + "step": 9584 + }, + { + "epoch": 0.4209139708223103, + "grad_norm": 2.140625, + "learning_rate": 3.128121930463502e-05, + "loss": 0.3729, + "step": 9586 + }, + { + "epoch": 0.4210017893015138, + "grad_norm": 1.96875, + "learning_rate": 3.1274522701386886e-05, + "loss": 0.4038, + "step": 9588 + }, + { + "epoch": 0.42108960778071725, + "grad_norm": 2.40625, + "learning_rate": 3.12678256176343e-05, + "loss": 0.3794, + "step": 9590 + }, + { + "epoch": 0.4211774262599207, + "grad_norm": 1.9140625, + "learning_rate": 3.126112805389012e-05, + "loss": 0.3976, + "step": 9592 + }, + { + "epoch": 0.42126524473912424, + "grad_norm": 1.7890625, + "learning_rate": 3.1254430010667254e-05, + "loss": 0.3973, + "step": 9594 + }, + { + "epoch": 0.4213530632183277, + "grad_norm": 1.96875, + "learning_rate": 3.1247731488478636e-05, + "loss": 0.3854, + "step": 9596 + }, + { + "epoch": 0.4214408816975312, + "grad_norm": 2.140625, + "learning_rate": 3.124103248783725e-05, + "loss": 0.4122, + "step": 9598 + }, + { + "epoch": 0.4215287001767347, + "grad_norm": 1.9453125, + "learning_rate": 3.1234333009256095e-05, + "loss": 0.3758, + "step": 9600 + }, + { + "epoch": 0.4216165186559382, + "grad_norm": 2.09375, + "learning_rate": 3.122763305324823e-05, + "loss": 0.374, + "step": 9602 + }, + { + "epoch": 0.42170433713514166, + "grad_norm": 1.9453125, + "learning_rate": 3.1220932620326726e-05, + "loss": 0.3778, + "step": 9604 + }, + { + "epoch": 0.42179215561434513, + "grad_norm": 2.09375, + "learning_rate": 3.1214231711004716e-05, + "loss": 0.3878, + "step": 9606 + }, + { + "epoch": 0.42187997409354866, + "grad_norm": 1.90625, + "learning_rate": 3.1207530325795344e-05, + "loss": 0.3833, + "step": 9608 + }, + { + "epoch": 0.42196779257275213, + "grad_norm": 1.7890625, + "learning_rate": 3.120082846521181e-05, + "loss": 0.3614, + "step": 9610 + }, + { + "epoch": 0.4220556110519556, + "grad_norm": 1.90625, + "learning_rate": 3.1194126129767356e-05, + "loss": 0.3733, + "step": 9612 + }, + { + "epoch": 0.4221434295311591, + "grad_norm": 1.875, + "learning_rate": 3.1187423319975236e-05, + "loss": 0.3895, + "step": 9614 + }, + { + "epoch": 0.4222312480103626, + "grad_norm": 1.8515625, + "learning_rate": 3.118072003634876e-05, + "loss": 0.3825, + "step": 9616 + }, + { + "epoch": 0.4223190664895661, + "grad_norm": 1.9453125, + "learning_rate": 3.117401627940125e-05, + "loss": 0.3708, + "step": 9618 + }, + { + "epoch": 0.42240688496876955, + "grad_norm": 1.890625, + "learning_rate": 3.1167312049646094e-05, + "loss": 0.3634, + "step": 9620 + }, + { + "epoch": 0.422494703447973, + "grad_norm": 1.7734375, + "learning_rate": 3.116060734759671e-05, + "loss": 0.3742, + "step": 9622 + }, + { + "epoch": 0.42258252192717655, + "grad_norm": 1.96875, + "learning_rate": 3.115390217376654e-05, + "loss": 0.3826, + "step": 9624 + }, + { + "epoch": 0.42267034040638, + "grad_norm": 1.8984375, + "learning_rate": 3.1147196528669056e-05, + "loss": 0.3936, + "step": 9626 + }, + { + "epoch": 0.4227581588855835, + "grad_norm": 1.9453125, + "learning_rate": 3.11404904128178e-05, + "loss": 0.3715, + "step": 9628 + }, + { + "epoch": 0.42284597736478696, + "grad_norm": 1.7578125, + "learning_rate": 3.113378382672631e-05, + "loss": 0.3611, + "step": 9630 + }, + { + "epoch": 0.4229337958439905, + "grad_norm": 1.71875, + "learning_rate": 3.112707677090818e-05, + "loss": 0.3716, + "step": 9632 + }, + { + "epoch": 0.42302161432319396, + "grad_norm": 1.7890625, + "learning_rate": 3.1120369245877045e-05, + "loss": 0.3768, + "step": 9634 + }, + { + "epoch": 0.42310943280239743, + "grad_norm": 1.890625, + "learning_rate": 3.111366125214657e-05, + "loss": 0.356, + "step": 9636 + }, + { + "epoch": 0.4231972512816009, + "grad_norm": 1.8671875, + "learning_rate": 3.110695279023045e-05, + "loss": 0.3886, + "step": 9638 + }, + { + "epoch": 0.42328506976080443, + "grad_norm": 2.46875, + "learning_rate": 3.110024386064242e-05, + "loss": 0.3937, + "step": 9640 + }, + { + "epoch": 0.4233728882400079, + "grad_norm": 1.9609375, + "learning_rate": 3.109353446389625e-05, + "loss": 0.3737, + "step": 9642 + }, + { + "epoch": 0.4234607067192114, + "grad_norm": 1.8984375, + "learning_rate": 3.108682460050576e-05, + "loss": 0.3738, + "step": 9644 + }, + { + "epoch": 0.42354852519841485, + "grad_norm": 2.125, + "learning_rate": 3.108011427098477e-05, + "loss": 0.38, + "step": 9646 + }, + { + "epoch": 0.4236363436776184, + "grad_norm": 2.171875, + "learning_rate": 3.107340347584718e-05, + "loss": 0.3693, + "step": 9648 + }, + { + "epoch": 0.42372416215682185, + "grad_norm": 2.0625, + "learning_rate": 3.1066692215606894e-05, + "loss": 0.3689, + "step": 9650 + }, + { + "epoch": 0.4238119806360253, + "grad_norm": 2.15625, + "learning_rate": 3.1059980490777865e-05, + "loss": 0.3757, + "step": 9652 + }, + { + "epoch": 0.42389979911522885, + "grad_norm": 1.8046875, + "learning_rate": 3.1053268301874074e-05, + "loss": 0.3936, + "step": 9654 + }, + { + "epoch": 0.4239876175944323, + "grad_norm": 2.25, + "learning_rate": 3.104655564940954e-05, + "loss": 0.3597, + "step": 9656 + }, + { + "epoch": 0.4240754360736358, + "grad_norm": 2.046875, + "learning_rate": 3.103984253389833e-05, + "loss": 0.3453, + "step": 9658 + }, + { + "epoch": 0.42416325455283926, + "grad_norm": 2.390625, + "learning_rate": 3.103312895585454e-05, + "loss": 0.3657, + "step": 9660 + }, + { + "epoch": 0.4242510730320428, + "grad_norm": 2.3125, + "learning_rate": 3.102641491579228e-05, + "loss": 0.3713, + "step": 9662 + }, + { + "epoch": 0.42433889151124626, + "grad_norm": 1.859375, + "learning_rate": 3.101970041422572e-05, + "loss": 0.3867, + "step": 9664 + }, + { + "epoch": 0.42442670999044974, + "grad_norm": 1.875, + "learning_rate": 3.1012985451669065e-05, + "loss": 0.4136, + "step": 9666 + }, + { + "epoch": 0.4245145284696532, + "grad_norm": 1.890625, + "learning_rate": 3.100627002863654e-05, + "loss": 0.4042, + "step": 9668 + }, + { + "epoch": 0.42460234694885673, + "grad_norm": 1.953125, + "learning_rate": 3.099955414564241e-05, + "loss": 0.3814, + "step": 9670 + }, + { + "epoch": 0.4246901654280602, + "grad_norm": 1.84375, + "learning_rate": 3.099283780320099e-05, + "loss": 0.3804, + "step": 9672 + }, + { + "epoch": 0.4247779839072637, + "grad_norm": 2.171875, + "learning_rate": 3.098612100182662e-05, + "loss": 0.3602, + "step": 9674 + }, + { + "epoch": 0.42486580238646715, + "grad_norm": 2.0625, + "learning_rate": 3.0979403742033664e-05, + "loss": 0.3581, + "step": 9676 + }, + { + "epoch": 0.4249536208656707, + "grad_norm": 1.8125, + "learning_rate": 3.097268602433654e-05, + "loss": 0.3673, + "step": 9678 + }, + { + "epoch": 0.42504143934487415, + "grad_norm": 1.8515625, + "learning_rate": 3.096596784924968e-05, + "loss": 0.3822, + "step": 9680 + }, + { + "epoch": 0.4251292578240776, + "grad_norm": 1.8984375, + "learning_rate": 3.0959249217287574e-05, + "loss": 0.3619, + "step": 9682 + }, + { + "epoch": 0.4252170763032811, + "grad_norm": 1.8359375, + "learning_rate": 3.095253012896474e-05, + "loss": 0.4173, + "step": 9684 + }, + { + "epoch": 0.4253048947824846, + "grad_norm": 1.984375, + "learning_rate": 3.094581058479571e-05, + "loss": 0.3643, + "step": 9686 + }, + { + "epoch": 0.4253927132616881, + "grad_norm": 2.046875, + "learning_rate": 3.0939090585295094e-05, + "loss": 0.3801, + "step": 9688 + }, + { + "epoch": 0.42548053174089157, + "grad_norm": 1.78125, + "learning_rate": 3.09323701309775e-05, + "loss": 0.4019, + "step": 9690 + }, + { + "epoch": 0.42556835022009504, + "grad_norm": 1.78125, + "learning_rate": 3.092564922235757e-05, + "loss": 0.4079, + "step": 9692 + }, + { + "epoch": 0.42565616869929856, + "grad_norm": 1.703125, + "learning_rate": 3.091892785995e-05, + "loss": 0.3587, + "step": 9694 + }, + { + "epoch": 0.42574398717850204, + "grad_norm": 1.8125, + "learning_rate": 3.0912206044269515e-05, + "loss": 0.4073, + "step": 9696 + }, + { + "epoch": 0.4258318056577055, + "grad_norm": 2.203125, + "learning_rate": 3.090548377583088e-05, + "loss": 0.3923, + "step": 9698 + }, + { + "epoch": 0.42591962413690904, + "grad_norm": 1.984375, + "learning_rate": 3.089876105514888e-05, + "loss": 0.4088, + "step": 9700 + }, + { + "epoch": 0.4260074426161125, + "grad_norm": 1.921875, + "learning_rate": 3.089203788273834e-05, + "loss": 0.3799, + "step": 9702 + }, + { + "epoch": 0.426095261095316, + "grad_norm": 2.28125, + "learning_rate": 3.088531425911413e-05, + "loss": 0.3668, + "step": 9704 + }, + { + "epoch": 0.42618307957451945, + "grad_norm": 2.1875, + "learning_rate": 3.087859018479115e-05, + "loss": 0.3907, + "step": 9706 + }, + { + "epoch": 0.426270898053723, + "grad_norm": 1.8046875, + "learning_rate": 3.0871865660284316e-05, + "loss": 0.4238, + "step": 9708 + }, + { + "epoch": 0.42635871653292645, + "grad_norm": 2.71875, + "learning_rate": 3.0865140686108606e-05, + "loss": 0.3756, + "step": 9710 + }, + { + "epoch": 0.4264465350121299, + "grad_norm": 2.078125, + "learning_rate": 3.085841526277901e-05, + "loss": 0.3871, + "step": 9712 + }, + { + "epoch": 0.4265343534913334, + "grad_norm": 1.875, + "learning_rate": 3.085168939081058e-05, + "loss": 0.3876, + "step": 9714 + }, + { + "epoch": 0.4266221719705369, + "grad_norm": 2.03125, + "learning_rate": 3.0844963070718366e-05, + "loss": 0.3558, + "step": 9716 + }, + { + "epoch": 0.4267099904497404, + "grad_norm": 2.0625, + "learning_rate": 3.0838236303017476e-05, + "loss": 0.3686, + "step": 9718 + }, + { + "epoch": 0.42679780892894387, + "grad_norm": 2.1875, + "learning_rate": 3.083150908822306e-05, + "loss": 0.3684, + "step": 9720 + }, + { + "epoch": 0.42688562740814734, + "grad_norm": 1.9765625, + "learning_rate": 3.082478142685027e-05, + "loss": 0.3866, + "step": 9722 + }, + { + "epoch": 0.42697344588735087, + "grad_norm": 1.9453125, + "learning_rate": 3.081805331941433e-05, + "loss": 0.3644, + "step": 9724 + }, + { + "epoch": 0.42706126436655434, + "grad_norm": 1.8515625, + "learning_rate": 3.081132476643047e-05, + "loss": 0.4389, + "step": 9726 + }, + { + "epoch": 0.4271490828457578, + "grad_norm": 1.8125, + "learning_rate": 3.0804595768413964e-05, + "loss": 0.3857, + "step": 9728 + }, + { + "epoch": 0.4272369013249613, + "grad_norm": 1.890625, + "learning_rate": 3.079786632588012e-05, + "loss": 0.3536, + "step": 9730 + }, + { + "epoch": 0.4273247198041648, + "grad_norm": 1.9609375, + "learning_rate": 3.079113643934429e-05, + "loss": 0.4163, + "step": 9732 + }, + { + "epoch": 0.4274125382833683, + "grad_norm": 1.9765625, + "learning_rate": 3.078440610932184e-05, + "loss": 0.3987, + "step": 9734 + }, + { + "epoch": 0.42750035676257175, + "grad_norm": 2.015625, + "learning_rate": 3.077767533632818e-05, + "loss": 0.3931, + "step": 9736 + }, + { + "epoch": 0.4275881752417752, + "grad_norm": 1.8046875, + "learning_rate": 3.077094412087877e-05, + "loss": 0.3957, + "step": 9738 + }, + { + "epoch": 0.42767599372097875, + "grad_norm": 1.921875, + "learning_rate": 3.076421246348906e-05, + "loss": 0.342, + "step": 9740 + }, + { + "epoch": 0.4277638122001822, + "grad_norm": 1.9140625, + "learning_rate": 3.075748036467458e-05, + "loss": 0.3825, + "step": 9742 + }, + { + "epoch": 0.4278516306793857, + "grad_norm": 1.8828125, + "learning_rate": 3.0750747824950885e-05, + "loss": 0.3787, + "step": 9744 + }, + { + "epoch": 0.42793944915858917, + "grad_norm": 1.8671875, + "learning_rate": 3.0744014844833535e-05, + "loss": 0.3855, + "step": 9746 + }, + { + "epoch": 0.4280272676377927, + "grad_norm": 1.703125, + "learning_rate": 3.0737281424838146e-05, + "loss": 0.3554, + "step": 9748 + }, + { + "epoch": 0.42811508611699617, + "grad_norm": 1.9609375, + "learning_rate": 3.073054756548038e-05, + "loss": 0.4223, + "step": 9750 + }, + { + "epoch": 0.42820290459619964, + "grad_norm": 1.9375, + "learning_rate": 3.0723813267275915e-05, + "loss": 0.3837, + "step": 9752 + }, + { + "epoch": 0.42829072307540317, + "grad_norm": 1.9609375, + "learning_rate": 3.071707853074045e-05, + "loss": 0.4027, + "step": 9754 + }, + { + "epoch": 0.42837854155460664, + "grad_norm": 1.875, + "learning_rate": 3.071034335638973e-05, + "loss": 0.3697, + "step": 9756 + }, + { + "epoch": 0.4284663600338101, + "grad_norm": 2.234375, + "learning_rate": 3.070360774473956e-05, + "loss": 0.3743, + "step": 9758 + }, + { + "epoch": 0.4285541785130136, + "grad_norm": 2.03125, + "learning_rate": 3.0696871696305726e-05, + "loss": 0.3974, + "step": 9760 + }, + { + "epoch": 0.4286419969922171, + "grad_norm": 1.8984375, + "learning_rate": 3.069013521160411e-05, + "loss": 0.3907, + "step": 9762 + }, + { + "epoch": 0.4287298154714206, + "grad_norm": 1.859375, + "learning_rate": 3.068339829115057e-05, + "loss": 0.427, + "step": 9764 + }, + { + "epoch": 0.42881763395062406, + "grad_norm": 1.875, + "learning_rate": 3.067666093546102e-05, + "loss": 0.3888, + "step": 9766 + }, + { + "epoch": 0.4289054524298275, + "grad_norm": 2.0, + "learning_rate": 3.066992314505142e-05, + "loss": 0.3796, + "step": 9768 + }, + { + "epoch": 0.42899327090903105, + "grad_norm": 2.03125, + "learning_rate": 3.066318492043774e-05, + "loss": 0.357, + "step": 9770 + }, + { + "epoch": 0.4290810893882345, + "grad_norm": 1.78125, + "learning_rate": 3.065644626213601e-05, + "loss": 0.3919, + "step": 9772 + }, + { + "epoch": 0.429168907867438, + "grad_norm": 1.890625, + "learning_rate": 3.064970717066227e-05, + "loss": 0.3633, + "step": 9774 + }, + { + "epoch": 0.42925672634664147, + "grad_norm": 1.8359375, + "learning_rate": 3.064296764653259e-05, + "loss": 0.3975, + "step": 9776 + }, + { + "epoch": 0.429344544825845, + "grad_norm": 2.015625, + "learning_rate": 3.0636227690263104e-05, + "loss": 0.3941, + "step": 9778 + }, + { + "epoch": 0.42943236330504847, + "grad_norm": 1.984375, + "learning_rate": 3.0629487302369945e-05, + "loss": 0.3861, + "step": 9780 + }, + { + "epoch": 0.42952018178425194, + "grad_norm": 1.9609375, + "learning_rate": 3.0622746483369306e-05, + "loss": 0.3714, + "step": 9782 + }, + { + "epoch": 0.4296080002634554, + "grad_norm": 1.9609375, + "learning_rate": 3.061600523377739e-05, + "loss": 0.3706, + "step": 9784 + }, + { + "epoch": 0.42969581874265894, + "grad_norm": 2.046875, + "learning_rate": 3.0609263554110445e-05, + "loss": 0.3677, + "step": 9786 + }, + { + "epoch": 0.4297836372218624, + "grad_norm": 1.875, + "learning_rate": 3.060252144488476e-05, + "loss": 0.3784, + "step": 9788 + }, + { + "epoch": 0.4298714557010659, + "grad_norm": 2.203125, + "learning_rate": 3.059577890661663e-05, + "loss": 0.3611, + "step": 9790 + }, + { + "epoch": 0.42995927418026936, + "grad_norm": 2.328125, + "learning_rate": 3.058903593982241e-05, + "loss": 0.41, + "step": 9792 + }, + { + "epoch": 0.4300470926594729, + "grad_norm": 2.0625, + "learning_rate": 3.058229254501848e-05, + "loss": 0.371, + "step": 9794 + }, + { + "epoch": 0.43013491113867636, + "grad_norm": 1.6953125, + "learning_rate": 3.057554872272125e-05, + "loss": 0.3496, + "step": 9796 + }, + { + "epoch": 0.43022272961787983, + "grad_norm": 2.0, + "learning_rate": 3.0568804473447164e-05, + "loss": 0.3618, + "step": 9798 + }, + { + "epoch": 0.43031054809708336, + "grad_norm": 1.9140625, + "learning_rate": 3.05620597977127e-05, + "loss": 0.3626, + "step": 9800 + }, + { + "epoch": 0.43039836657628683, + "grad_norm": 2.078125, + "learning_rate": 3.0555314696034356e-05, + "loss": 0.3567, + "step": 9802 + }, + { + "epoch": 0.4304861850554903, + "grad_norm": 1.875, + "learning_rate": 3.054856916892868e-05, + "loss": 0.3825, + "step": 9804 + }, + { + "epoch": 0.43057400353469377, + "grad_norm": 1.7265625, + "learning_rate": 3.0541823216912245e-05, + "loss": 0.3721, + "step": 9806 + }, + { + "epoch": 0.4306618220138973, + "grad_norm": 1.796875, + "learning_rate": 3.0535076840501665e-05, + "loss": 0.3852, + "step": 9808 + }, + { + "epoch": 0.43074964049310077, + "grad_norm": 2.1875, + "learning_rate": 3.052833004021357e-05, + "loss": 0.3748, + "step": 9810 + }, + { + "epoch": 0.43083745897230424, + "grad_norm": 1.90625, + "learning_rate": 3.052158281656465e-05, + "loss": 0.4204, + "step": 9812 + }, + { + "epoch": 0.4309252774515077, + "grad_norm": 1.7578125, + "learning_rate": 3.0514835170071582e-05, + "loss": 0.3771, + "step": 9814 + }, + { + "epoch": 0.43101309593071124, + "grad_norm": 1.9375, + "learning_rate": 3.0508087101251115e-05, + "loss": 0.3823, + "step": 9816 + }, + { + "epoch": 0.4311009144099147, + "grad_norm": 2.234375, + "learning_rate": 3.0501338610620017e-05, + "loss": 0.377, + "step": 9818 + }, + { + "epoch": 0.4311887328891182, + "grad_norm": 1.8984375, + "learning_rate": 3.0494589698695087e-05, + "loss": 0.405, + "step": 9820 + }, + { + "epoch": 0.43127655136832166, + "grad_norm": 1.96875, + "learning_rate": 3.0487840365993164e-05, + "loss": 0.3876, + "step": 9822 + }, + { + "epoch": 0.4313643698475252, + "grad_norm": 1.9453125, + "learning_rate": 3.0481090613031115e-05, + "loss": 0.3827, + "step": 9824 + }, + { + "epoch": 0.43145218832672866, + "grad_norm": 2.015625, + "learning_rate": 3.0474340440325822e-05, + "loss": 0.4024, + "step": 9826 + }, + { + "epoch": 0.43154000680593213, + "grad_norm": 1.859375, + "learning_rate": 3.046758984839424e-05, + "loss": 0.3622, + "step": 9828 + }, + { + "epoch": 0.4316278252851356, + "grad_norm": 1.765625, + "learning_rate": 3.0460838837753304e-05, + "loss": 0.4005, + "step": 9830 + }, + { + "epoch": 0.43171564376433913, + "grad_norm": 1.7890625, + "learning_rate": 3.0454087408920024e-05, + "loss": 0.3643, + "step": 9832 + }, + { + "epoch": 0.4318034622435426, + "grad_norm": 2.03125, + "learning_rate": 3.0447335562411423e-05, + "loss": 0.3626, + "step": 9834 + }, + { + "epoch": 0.4318912807227461, + "grad_norm": 1.953125, + "learning_rate": 3.044058329874456e-05, + "loss": 0.3631, + "step": 9836 + }, + { + "epoch": 0.43197909920194955, + "grad_norm": 2.03125, + "learning_rate": 3.0433830618436528e-05, + "loss": 0.413, + "step": 9838 + }, + { + "epoch": 0.4320669176811531, + "grad_norm": 2.15625, + "learning_rate": 3.042707752200444e-05, + "loss": 0.3812, + "step": 9840 + }, + { + "epoch": 0.43215473616035655, + "grad_norm": 1.65625, + "learning_rate": 3.042032400996545e-05, + "loss": 0.3471, + "step": 9842 + }, + { + "epoch": 0.43224255463956, + "grad_norm": 2.15625, + "learning_rate": 3.0413570082836757e-05, + "loss": 0.3952, + "step": 9844 + }, + { + "epoch": 0.4323303731187635, + "grad_norm": 1.8046875, + "learning_rate": 3.0406815741135563e-05, + "loss": 0.3993, + "step": 9846 + }, + { + "epoch": 0.432418191597967, + "grad_norm": 2.171875, + "learning_rate": 3.0400060985379124e-05, + "loss": 0.3543, + "step": 9848 + }, + { + "epoch": 0.4325060100771705, + "grad_norm": 1.8359375, + "learning_rate": 3.0393305816084728e-05, + "loss": 0.4002, + "step": 9850 + }, + { + "epoch": 0.43259382855637396, + "grad_norm": 2.171875, + "learning_rate": 3.0386550233769673e-05, + "loss": 0.3702, + "step": 9852 + }, + { + "epoch": 0.4326816470355775, + "grad_norm": 1.8515625, + "learning_rate": 3.037979423895131e-05, + "loss": 0.402, + "step": 9854 + }, + { + "epoch": 0.43276946551478096, + "grad_norm": 1.875, + "learning_rate": 3.037303783214701e-05, + "loss": 0.3626, + "step": 9856 + }, + { + "epoch": 0.43285728399398443, + "grad_norm": 2.03125, + "learning_rate": 3.03662810138742e-05, + "loss": 0.3723, + "step": 9858 + }, + { + "epoch": 0.4329451024731879, + "grad_norm": 2.15625, + "learning_rate": 3.035952378465029e-05, + "loss": 0.3783, + "step": 9860 + }, + { + "epoch": 0.43303292095239143, + "grad_norm": 2.03125, + "learning_rate": 3.0352766144992768e-05, + "loss": 0.3801, + "step": 9862 + }, + { + "epoch": 0.4331207394315949, + "grad_norm": 1.9296875, + "learning_rate": 3.034600809541913e-05, + "loss": 0.3734, + "step": 9864 + }, + { + "epoch": 0.4332085579107984, + "grad_norm": 1.859375, + "learning_rate": 3.033924963644691e-05, + "loss": 0.3632, + "step": 9866 + }, + { + "epoch": 0.43329637639000185, + "grad_norm": 1.8359375, + "learning_rate": 3.0332490768593675e-05, + "loss": 0.3899, + "step": 9868 + }, + { + "epoch": 0.4333841948692054, + "grad_norm": 1.921875, + "learning_rate": 3.0325731492377015e-05, + "loss": 0.3708, + "step": 9870 + }, + { + "epoch": 0.43347201334840885, + "grad_norm": 2.109375, + "learning_rate": 3.0318971808314566e-05, + "loss": 0.3751, + "step": 9872 + }, + { + "epoch": 0.4335598318276123, + "grad_norm": 1.8515625, + "learning_rate": 3.0312211716923978e-05, + "loss": 0.3686, + "step": 9874 + }, + { + "epoch": 0.4336476503068158, + "grad_norm": 2.265625, + "learning_rate": 3.0305451218722947e-05, + "loss": 0.3846, + "step": 9876 + }, + { + "epoch": 0.4337354687860193, + "grad_norm": 1.9453125, + "learning_rate": 3.0298690314229184e-05, + "loss": 0.3559, + "step": 9878 + }, + { + "epoch": 0.4338232872652228, + "grad_norm": 2.28125, + "learning_rate": 3.029192900396045e-05, + "loss": 0.4058, + "step": 9880 + }, + { + "epoch": 0.43391110574442626, + "grad_norm": 1.9609375, + "learning_rate": 3.0285167288434518e-05, + "loss": 0.3496, + "step": 9882 + }, + { + "epoch": 0.43399892422362973, + "grad_norm": 1.8515625, + "learning_rate": 3.0278405168169215e-05, + "loss": 0.3949, + "step": 9884 + }, + { + "epoch": 0.43408674270283326, + "grad_norm": 1.828125, + "learning_rate": 3.0271642643682378e-05, + "loss": 0.3738, + "step": 9886 + }, + { + "epoch": 0.43417456118203673, + "grad_norm": 1.75, + "learning_rate": 3.0264879715491883e-05, + "loss": 0.3839, + "step": 9888 + }, + { + "epoch": 0.4342623796612402, + "grad_norm": 2.015625, + "learning_rate": 3.0258116384115643e-05, + "loss": 0.3591, + "step": 9890 + }, + { + "epoch": 0.4343501981404437, + "grad_norm": 1.8359375, + "learning_rate": 3.0251352650071578e-05, + "loss": 0.3743, + "step": 9892 + }, + { + "epoch": 0.4344380166196472, + "grad_norm": 2.21875, + "learning_rate": 3.0244588513877676e-05, + "loss": 0.3767, + "step": 9894 + }, + { + "epoch": 0.4345258350988507, + "grad_norm": 1.890625, + "learning_rate": 3.0237823976051925e-05, + "loss": 0.3498, + "step": 9896 + }, + { + "epoch": 0.43461365357805415, + "grad_norm": 1.78125, + "learning_rate": 3.0231059037112363e-05, + "loss": 0.3935, + "step": 9898 + }, + { + "epoch": 0.4347014720572576, + "grad_norm": 1.734375, + "learning_rate": 3.0224293697577045e-05, + "loss": 0.3661, + "step": 9900 + }, + { + "epoch": 0.43478929053646115, + "grad_norm": 2.046875, + "learning_rate": 3.021752795796406e-05, + "loss": 0.4047, + "step": 9902 + }, + { + "epoch": 0.4348771090156646, + "grad_norm": 1.984375, + "learning_rate": 3.021076181879154e-05, + "loss": 0.3855, + "step": 9904 + }, + { + "epoch": 0.4349649274948681, + "grad_norm": 1.984375, + "learning_rate": 3.0203995280577618e-05, + "loss": 0.3811, + "step": 9906 + }, + { + "epoch": 0.4350527459740716, + "grad_norm": 1.7890625, + "learning_rate": 3.0197228343840502e-05, + "loss": 0.3624, + "step": 9908 + }, + { + "epoch": 0.4351405644532751, + "grad_norm": 1.953125, + "learning_rate": 3.0190461009098382e-05, + "loss": 0.3668, + "step": 9910 + }, + { + "epoch": 0.43522838293247856, + "grad_norm": 1.859375, + "learning_rate": 3.018369327686953e-05, + "loss": 0.3465, + "step": 9912 + }, + { + "epoch": 0.43531620141168204, + "grad_norm": 1.984375, + "learning_rate": 3.0176925147672192e-05, + "loss": 0.4023, + "step": 9914 + }, + { + "epoch": 0.43540401989088556, + "grad_norm": 1.921875, + "learning_rate": 3.017015662202468e-05, + "loss": 0.3568, + "step": 9916 + }, + { + "epoch": 0.43549183837008903, + "grad_norm": 1.7265625, + "learning_rate": 3.0163387700445345e-05, + "loss": 0.3598, + "step": 9918 + }, + { + "epoch": 0.4355796568492925, + "grad_norm": 1.796875, + "learning_rate": 3.0156618383452545e-05, + "loss": 0.3737, + "step": 9920 + }, + { + "epoch": 0.435667475328496, + "grad_norm": 1.8203125, + "learning_rate": 3.0149848671564663e-05, + "loss": 0.4093, + "step": 9922 + }, + { + "epoch": 0.4357552938076995, + "grad_norm": 1.734375, + "learning_rate": 3.014307856530015e-05, + "loss": 0.3829, + "step": 9924 + }, + { + "epoch": 0.435843112286903, + "grad_norm": 1.8828125, + "learning_rate": 3.0136308065177434e-05, + "loss": 0.3661, + "step": 9926 + }, + { + "epoch": 0.43593093076610645, + "grad_norm": 2.078125, + "learning_rate": 3.0129537171715016e-05, + "loss": 0.389, + "step": 9928 + }, + { + "epoch": 0.4360187492453099, + "grad_norm": 2.21875, + "learning_rate": 3.0122765885431414e-05, + "loss": 0.4172, + "step": 9930 + }, + { + "epoch": 0.43610656772451345, + "grad_norm": 1.84375, + "learning_rate": 3.0115994206845173e-05, + "loss": 0.3846, + "step": 9932 + }, + { + "epoch": 0.4361943862037169, + "grad_norm": 1.8828125, + "learning_rate": 3.010922213647487e-05, + "loss": 0.3701, + "step": 9934 + }, + { + "epoch": 0.4362822046829204, + "grad_norm": 2.09375, + "learning_rate": 3.0102449674839117e-05, + "loss": 0.3837, + "step": 9936 + }, + { + "epoch": 0.43637002316212387, + "grad_norm": 2.0625, + "learning_rate": 3.0095676822456532e-05, + "loss": 0.394, + "step": 9938 + }, + { + "epoch": 0.4364578416413274, + "grad_norm": 1.890625, + "learning_rate": 3.00889035798458e-05, + "loss": 0.3849, + "step": 9940 + }, + { + "epoch": 0.43654566012053087, + "grad_norm": 1.796875, + "learning_rate": 3.008212994752561e-05, + "loss": 0.3773, + "step": 9942 + }, + { + "epoch": 0.43663347859973434, + "grad_norm": 1.9375, + "learning_rate": 3.007535592601469e-05, + "loss": 0.3691, + "step": 9944 + }, + { + "epoch": 0.4367212970789378, + "grad_norm": 1.8125, + "learning_rate": 3.00685815158318e-05, + "loss": 0.3364, + "step": 9946 + }, + { + "epoch": 0.43680911555814134, + "grad_norm": 2.09375, + "learning_rate": 3.0061806717495728e-05, + "loss": 0.3628, + "step": 9948 + }, + { + "epoch": 0.4368969340373448, + "grad_norm": 1.828125, + "learning_rate": 3.005503153152528e-05, + "loss": 0.3841, + "step": 9950 + }, + { + "epoch": 0.4369847525165483, + "grad_norm": 2.296875, + "learning_rate": 3.0048255958439303e-05, + "loss": 0.347, + "step": 9952 + }, + { + "epoch": 0.4370725709957518, + "grad_norm": 1.8828125, + "learning_rate": 3.0041479998756673e-05, + "loss": 0.386, + "step": 9954 + }, + { + "epoch": 0.4371603894749553, + "grad_norm": 1.84375, + "learning_rate": 3.00347036529963e-05, + "loss": 0.3891, + "step": 9956 + }, + { + "epoch": 0.43724820795415875, + "grad_norm": 1.8671875, + "learning_rate": 3.0027926921677108e-05, + "loss": 0.4011, + "step": 9958 + }, + { + "epoch": 0.4373360264333622, + "grad_norm": 1.9921875, + "learning_rate": 3.0021149805318072e-05, + "loss": 0.4107, + "step": 9960 + }, + { + "epoch": 0.43742384491256575, + "grad_norm": 2.078125, + "learning_rate": 3.001437230443818e-05, + "loss": 0.3708, + "step": 9962 + }, + { + "epoch": 0.4375116633917692, + "grad_norm": 1.9140625, + "learning_rate": 3.0007594419556456e-05, + "loss": 0.3526, + "step": 9964 + }, + { + "epoch": 0.4375994818709727, + "grad_norm": 2.3125, + "learning_rate": 3.0000816151191952e-05, + "loss": 0.3973, + "step": 9966 + }, + { + "epoch": 0.43768730035017617, + "grad_norm": 1.953125, + "learning_rate": 2.9994037499863747e-05, + "loss": 0.3634, + "step": 9968 + }, + { + "epoch": 0.4377751188293797, + "grad_norm": 1.875, + "learning_rate": 2.998725846609095e-05, + "loss": 0.3703, + "step": 9970 + }, + { + "epoch": 0.43786293730858317, + "grad_norm": 1.9296875, + "learning_rate": 2.9980479050392702e-05, + "loss": 0.382, + "step": 9972 + }, + { + "epoch": 0.43795075578778664, + "grad_norm": 2.234375, + "learning_rate": 2.9973699253288186e-05, + "loss": 0.3804, + "step": 9974 + }, + { + "epoch": 0.4380385742669901, + "grad_norm": 2.109375, + "learning_rate": 2.996691907529658e-05, + "loss": 0.3785, + "step": 9976 + }, + { + "epoch": 0.43812639274619364, + "grad_norm": 1.7578125, + "learning_rate": 2.996013851693712e-05, + "loss": 0.3478, + "step": 9978 + }, + { + "epoch": 0.4382142112253971, + "grad_norm": 1.7890625, + "learning_rate": 2.9953357578729064e-05, + "loss": 0.3744, + "step": 9980 + }, + { + "epoch": 0.4383020297046006, + "grad_norm": 1.9140625, + "learning_rate": 2.99465762611917e-05, + "loss": 0.3685, + "step": 9982 + }, + { + "epoch": 0.43838984818380405, + "grad_norm": 2.125, + "learning_rate": 2.9939794564844335e-05, + "loss": 0.3512, + "step": 9984 + }, + { + "epoch": 0.4384776666630076, + "grad_norm": 2.546875, + "learning_rate": 2.993301249020633e-05, + "loss": 0.3636, + "step": 9986 + }, + { + "epoch": 0.43856548514221105, + "grad_norm": 2.125, + "learning_rate": 2.9926230037797036e-05, + "loss": 0.3955, + "step": 9988 + }, + { + "epoch": 0.4386533036214145, + "grad_norm": 1.875, + "learning_rate": 2.9919447208135865e-05, + "loss": 0.383, + "step": 9990 + }, + { + "epoch": 0.438741122100618, + "grad_norm": 2.84375, + "learning_rate": 2.9912664001742246e-05, + "loss": 0.3825, + "step": 9992 + }, + { + "epoch": 0.4388289405798215, + "grad_norm": 2.390625, + "learning_rate": 2.9905880419135646e-05, + "loss": 0.3879, + "step": 9994 + }, + { + "epoch": 0.438916759059025, + "grad_norm": 2.109375, + "learning_rate": 2.9899096460835545e-05, + "loss": 0.3831, + "step": 9996 + }, + { + "epoch": 0.43900457753822847, + "grad_norm": 1.6796875, + "learning_rate": 2.9892312127361464e-05, + "loss": 0.3886, + "step": 9998 + }, + { + "epoch": 0.43909239601743194, + "grad_norm": 2.953125, + "learning_rate": 2.988552741923295e-05, + "loss": 0.3905, + "step": 10000 + }, + { + "epoch": 0.43918021449663547, + "grad_norm": 2.46875, + "learning_rate": 2.9878742336969568e-05, + "loss": 0.367, + "step": 10002 + }, + { + "epoch": 0.43926803297583894, + "grad_norm": 2.15625, + "learning_rate": 2.987195688109093e-05, + "loss": 0.4148, + "step": 10004 + }, + { + "epoch": 0.4393558514550424, + "grad_norm": 1.9453125, + "learning_rate": 2.9865171052116664e-05, + "loss": 0.3781, + "step": 10006 + }, + { + "epoch": 0.43944366993424594, + "grad_norm": 2.359375, + "learning_rate": 2.9858384850566435e-05, + "loss": 0.3622, + "step": 10008 + }, + { + "epoch": 0.4395314884134494, + "grad_norm": 2.25, + "learning_rate": 2.9851598276959935e-05, + "loss": 0.3743, + "step": 10010 + }, + { + "epoch": 0.4396193068926529, + "grad_norm": 1.765625, + "learning_rate": 2.984481133181688e-05, + "loss": 0.4079, + "step": 10012 + }, + { + "epoch": 0.43970712537185636, + "grad_norm": 1.9765625, + "learning_rate": 2.9838024015657e-05, + "loss": 0.3881, + "step": 10014 + }, + { + "epoch": 0.4397949438510599, + "grad_norm": 2.0625, + "learning_rate": 2.9831236329000084e-05, + "loss": 0.3975, + "step": 10016 + }, + { + "epoch": 0.43988276233026335, + "grad_norm": 2.140625, + "learning_rate": 2.9824448272365928e-05, + "loss": 0.3801, + "step": 10018 + }, + { + "epoch": 0.4399705808094668, + "grad_norm": 2.046875, + "learning_rate": 2.981765984627437e-05, + "loss": 0.3606, + "step": 10020 + }, + { + "epoch": 0.4400583992886703, + "grad_norm": 2.046875, + "learning_rate": 2.981087105124527e-05, + "loss": 0.3692, + "step": 10022 + }, + { + "epoch": 0.4401462177678738, + "grad_norm": 1.8828125, + "learning_rate": 2.9804081887798508e-05, + "loss": 0.3693, + "step": 10024 + }, + { + "epoch": 0.4402340362470773, + "grad_norm": 1.6875, + "learning_rate": 2.9797292356454004e-05, + "loss": 0.3681, + "step": 10026 + }, + { + "epoch": 0.44032185472628077, + "grad_norm": 1.671875, + "learning_rate": 2.9790502457731706e-05, + "loss": 0.3738, + "step": 10028 + }, + { + "epoch": 0.44040967320548424, + "grad_norm": 1.8671875, + "learning_rate": 2.9783712192151576e-05, + "loss": 0.3856, + "step": 10030 + }, + { + "epoch": 0.44049749168468777, + "grad_norm": 2.03125, + "learning_rate": 2.9776921560233616e-05, + "loss": 0.3701, + "step": 10032 + }, + { + "epoch": 0.44058531016389124, + "grad_norm": 2.0, + "learning_rate": 2.9770130562497867e-05, + "loss": 0.3902, + "step": 10034 + }, + { + "epoch": 0.4406731286430947, + "grad_norm": 1.7734375, + "learning_rate": 2.9763339199464374e-05, + "loss": 0.3585, + "step": 10036 + }, + { + "epoch": 0.4407609471222982, + "grad_norm": 1.8828125, + "learning_rate": 2.9756547471653218e-05, + "loss": 0.3656, + "step": 10038 + }, + { + "epoch": 0.4408487656015017, + "grad_norm": 1.9140625, + "learning_rate": 2.9749755379584515e-05, + "loss": 0.3603, + "step": 10040 + }, + { + "epoch": 0.4409365840807052, + "grad_norm": 1.828125, + "learning_rate": 2.9742962923778417e-05, + "loss": 0.4048, + "step": 10042 + }, + { + "epoch": 0.44102440255990866, + "grad_norm": 2.09375, + "learning_rate": 2.9736170104755075e-05, + "loss": 0.4039, + "step": 10044 + }, + { + "epoch": 0.44111222103911213, + "grad_norm": 1.921875, + "learning_rate": 2.9729376923034684e-05, + "loss": 0.3617, + "step": 10046 + }, + { + "epoch": 0.44120003951831566, + "grad_norm": 2.1875, + "learning_rate": 2.9722583379137493e-05, + "loss": 0.4059, + "step": 10048 + }, + { + "epoch": 0.44128785799751913, + "grad_norm": 1.9609375, + "learning_rate": 2.9715789473583715e-05, + "loss": 0.376, + "step": 10050 + }, + { + "epoch": 0.4413756764767226, + "grad_norm": 1.859375, + "learning_rate": 2.9708995206893658e-05, + "loss": 0.351, + "step": 10052 + }, + { + "epoch": 0.44146349495592613, + "grad_norm": 1.875, + "learning_rate": 2.970220057958762e-05, + "loss": 0.3672, + "step": 10054 + }, + { + "epoch": 0.4415513134351296, + "grad_norm": 1.9375, + "learning_rate": 2.9695405592185925e-05, + "loss": 0.36, + "step": 10056 + }, + { + "epoch": 0.44163913191433307, + "grad_norm": 2.1875, + "learning_rate": 2.968861024520896e-05, + "loss": 0.3663, + "step": 10058 + }, + { + "epoch": 0.44172695039353654, + "grad_norm": 1.9609375, + "learning_rate": 2.9681814539177094e-05, + "loss": 0.3859, + "step": 10060 + }, + { + "epoch": 0.44181476887274007, + "grad_norm": 2.25, + "learning_rate": 2.9675018474610743e-05, + "loss": 0.3617, + "step": 10062 + }, + { + "epoch": 0.44190258735194354, + "grad_norm": 1.75, + "learning_rate": 2.9668222052030353e-05, + "loss": 0.364, + "step": 10064 + }, + { + "epoch": 0.441990405831147, + "grad_norm": 1.9453125, + "learning_rate": 2.9661425271956406e-05, + "loss": 0.3872, + "step": 10066 + }, + { + "epoch": 0.4420782243103505, + "grad_norm": 1.859375, + "learning_rate": 2.965462813490939e-05, + "loss": 0.3916, + "step": 10068 + }, + { + "epoch": 0.442166042789554, + "grad_norm": 1.8046875, + "learning_rate": 2.964783064140984e-05, + "loss": 0.3791, + "step": 10070 + }, + { + "epoch": 0.4422538612687575, + "grad_norm": 2.171875, + "learning_rate": 2.9641032791978307e-05, + "loss": 0.3698, + "step": 10072 + }, + { + "epoch": 0.44234167974796096, + "grad_norm": 1.75, + "learning_rate": 2.9634234587135366e-05, + "loss": 0.3751, + "step": 10074 + }, + { + "epoch": 0.44242949822716443, + "grad_norm": 1.921875, + "learning_rate": 2.9627436027401633e-05, + "loss": 0.3814, + "step": 10076 + }, + { + "epoch": 0.44251731670636796, + "grad_norm": 1.828125, + "learning_rate": 2.9620637113297735e-05, + "loss": 0.3538, + "step": 10078 + }, + { + "epoch": 0.44260513518557143, + "grad_norm": 2.40625, + "learning_rate": 2.961383784534434e-05, + "loss": 0.3811, + "step": 10080 + }, + { + "epoch": 0.4426929536647749, + "grad_norm": 1.7890625, + "learning_rate": 2.9607038224062133e-05, + "loss": 0.3739, + "step": 10082 + }, + { + "epoch": 0.4427807721439784, + "grad_norm": 1.984375, + "learning_rate": 2.9600238249971846e-05, + "loss": 0.3835, + "step": 10084 + }, + { + "epoch": 0.4428685906231819, + "grad_norm": 1.984375, + "learning_rate": 2.9593437923594204e-05, + "loss": 0.3694, + "step": 10086 + }, + { + "epoch": 0.4429564091023854, + "grad_norm": 1.890625, + "learning_rate": 2.958663724544999e-05, + "loss": 0.3621, + "step": 10088 + }, + { + "epoch": 0.44304422758158885, + "grad_norm": 1.734375, + "learning_rate": 2.9579836216059988e-05, + "loss": 0.3912, + "step": 10090 + }, + { + "epoch": 0.4431320460607923, + "grad_norm": 2.28125, + "learning_rate": 2.9573034835945028e-05, + "loss": 0.3912, + "step": 10092 + }, + { + "epoch": 0.44321986453999584, + "grad_norm": 1.8984375, + "learning_rate": 2.9566233105625973e-05, + "loss": 0.3883, + "step": 10094 + }, + { + "epoch": 0.4433076830191993, + "grad_norm": 1.9140625, + "learning_rate": 2.955943102562369e-05, + "loss": 0.361, + "step": 10096 + }, + { + "epoch": 0.4433955014984028, + "grad_norm": 1.9140625, + "learning_rate": 2.9552628596459086e-05, + "loss": 0.3964, + "step": 10098 + }, + { + "epoch": 0.44348331997760626, + "grad_norm": 1.9453125, + "learning_rate": 2.9545825818653087e-05, + "loss": 0.3786, + "step": 10100 + }, + { + "epoch": 0.4435711384568098, + "grad_norm": 1.859375, + "learning_rate": 2.9539022692726665e-05, + "loss": 0.3958, + "step": 10102 + }, + { + "epoch": 0.44365895693601326, + "grad_norm": 1.90625, + "learning_rate": 2.9532219219200797e-05, + "loss": 0.3772, + "step": 10104 + }, + { + "epoch": 0.44374677541521673, + "grad_norm": 1.921875, + "learning_rate": 2.952541539859649e-05, + "loss": 0.3753, + "step": 10106 + }, + { + "epoch": 0.44383459389442026, + "grad_norm": 1.7890625, + "learning_rate": 2.951861123143479e-05, + "loss": 0.373, + "step": 10108 + }, + { + "epoch": 0.44392241237362373, + "grad_norm": 2.03125, + "learning_rate": 2.9511806718236764e-05, + "loss": 0.3747, + "step": 10110 + }, + { + "epoch": 0.4440102308528272, + "grad_norm": 1.8203125, + "learning_rate": 2.9505001859523484e-05, + "loss": 0.357, + "step": 10112 + }, + { + "epoch": 0.4440980493320307, + "grad_norm": 1.84375, + "learning_rate": 2.949819665581609e-05, + "loss": 0.3729, + "step": 10114 + }, + { + "epoch": 0.4441858678112342, + "grad_norm": 2.03125, + "learning_rate": 2.9491391107635715e-05, + "loss": 0.3629, + "step": 10116 + }, + { + "epoch": 0.4442736862904377, + "grad_norm": 1.953125, + "learning_rate": 2.9484585215503537e-05, + "loss": 0.3556, + "step": 10118 + }, + { + "epoch": 0.44436150476964115, + "grad_norm": 2.046875, + "learning_rate": 2.9477778979940745e-05, + "loss": 0.3578, + "step": 10120 + }, + { + "epoch": 0.4444493232488446, + "grad_norm": 1.734375, + "learning_rate": 2.947097240146857e-05, + "loss": 0.3788, + "step": 10122 + }, + { + "epoch": 0.44453714172804815, + "grad_norm": 1.9296875, + "learning_rate": 2.9464165480608252e-05, + "loss": 0.3716, + "step": 10124 + }, + { + "epoch": 0.4446249602072516, + "grad_norm": 2.078125, + "learning_rate": 2.945735821788107e-05, + "loss": 0.3521, + "step": 10126 + }, + { + "epoch": 0.4447127786864551, + "grad_norm": 1.9453125, + "learning_rate": 2.945055061380833e-05, + "loss": 0.3624, + "step": 10128 + }, + { + "epoch": 0.44480059716565856, + "grad_norm": 1.9296875, + "learning_rate": 2.9443742668911357e-05, + "loss": 0.3788, + "step": 10130 + }, + { + "epoch": 0.4448884156448621, + "grad_norm": 2.03125, + "learning_rate": 2.9436934383711508e-05, + "loss": 0.3681, + "step": 10132 + }, + { + "epoch": 0.44497623412406556, + "grad_norm": 1.9140625, + "learning_rate": 2.943012575873016e-05, + "loss": 0.3704, + "step": 10134 + }, + { + "epoch": 0.44506405260326903, + "grad_norm": 1.8828125, + "learning_rate": 2.9423316794488716e-05, + "loss": 0.4046, + "step": 10136 + }, + { + "epoch": 0.4451518710824725, + "grad_norm": 1.6640625, + "learning_rate": 2.9416507491508606e-05, + "loss": 0.3611, + "step": 10138 + }, + { + "epoch": 0.44523968956167603, + "grad_norm": 1.9296875, + "learning_rate": 2.9409697850311296e-05, + "loss": 0.3797, + "step": 10140 + }, + { + "epoch": 0.4453275080408795, + "grad_norm": 2.171875, + "learning_rate": 2.940288787141827e-05, + "loss": 0.4003, + "step": 10142 + }, + { + "epoch": 0.445415326520083, + "grad_norm": 2.1875, + "learning_rate": 2.9396077555351038e-05, + "loss": 0.3505, + "step": 10144 + }, + { + "epoch": 0.44550314499928645, + "grad_norm": 2.265625, + "learning_rate": 2.9389266902631137e-05, + "loss": 0.3681, + "step": 10146 + }, + { + "epoch": 0.44559096347849, + "grad_norm": 2.140625, + "learning_rate": 2.9382455913780115e-05, + "loss": 0.3881, + "step": 10148 + }, + { + "epoch": 0.44567878195769345, + "grad_norm": 1.765625, + "learning_rate": 2.9375644589319572e-05, + "loss": 0.3798, + "step": 10150 + }, + { + "epoch": 0.4457666004368969, + "grad_norm": 2.15625, + "learning_rate": 2.936883292977112e-05, + "loss": 0.39, + "step": 10152 + }, + { + "epoch": 0.44585441891610045, + "grad_norm": 2.09375, + "learning_rate": 2.936202093565639e-05, + "loss": 0.3701, + "step": 10154 + }, + { + "epoch": 0.4459422373953039, + "grad_norm": 2.0625, + "learning_rate": 2.9355208607497053e-05, + "loss": 0.3647, + "step": 10156 + }, + { + "epoch": 0.4460300558745074, + "grad_norm": 1.9609375, + "learning_rate": 2.93483959458148e-05, + "loss": 0.3879, + "step": 10158 + }, + { + "epoch": 0.44611787435371086, + "grad_norm": 1.875, + "learning_rate": 2.9341582951131343e-05, + "loss": 0.3899, + "step": 10160 + }, + { + "epoch": 0.4462056928329144, + "grad_norm": 2.015625, + "learning_rate": 2.9334769623968417e-05, + "loss": 0.3776, + "step": 10162 + }, + { + "epoch": 0.44629351131211786, + "grad_norm": 2.0625, + "learning_rate": 2.9327955964847798e-05, + "loss": 0.3546, + "step": 10164 + }, + { + "epoch": 0.44638132979132134, + "grad_norm": 2.15625, + "learning_rate": 2.9321141974291277e-05, + "loss": 0.3971, + "step": 10166 + }, + { + "epoch": 0.4464691482705248, + "grad_norm": 2.1875, + "learning_rate": 2.931432765282066e-05, + "loss": 0.3852, + "step": 10168 + }, + { + "epoch": 0.44655696674972833, + "grad_norm": 1.9921875, + "learning_rate": 2.9307513000957797e-05, + "loss": 0.3891, + "step": 10170 + }, + { + "epoch": 0.4466447852289318, + "grad_norm": 1.9296875, + "learning_rate": 2.930069801922457e-05, + "loss": 0.3529, + "step": 10172 + }, + { + "epoch": 0.4467326037081353, + "grad_norm": 1.9453125, + "learning_rate": 2.9293882708142846e-05, + "loss": 0.3754, + "step": 10174 + }, + { + "epoch": 0.44682042218733875, + "grad_norm": 1.71875, + "learning_rate": 2.9287067068234554e-05, + "loss": 0.3734, + "step": 10176 + }, + { + "epoch": 0.4469082406665423, + "grad_norm": 1.859375, + "learning_rate": 2.928025110002164e-05, + "loss": 0.378, + "step": 10178 + }, + { + "epoch": 0.44699605914574575, + "grad_norm": 1.7578125, + "learning_rate": 2.9273434804026072e-05, + "loss": 0.3614, + "step": 10180 + }, + { + "epoch": 0.4470838776249492, + "grad_norm": 1.75, + "learning_rate": 2.9266618180769846e-05, + "loss": 0.3388, + "step": 10182 + }, + { + "epoch": 0.4471716961041527, + "grad_norm": 1.875, + "learning_rate": 2.9259801230774974e-05, + "loss": 0.3852, + "step": 10184 + }, + { + "epoch": 0.4472595145833562, + "grad_norm": 1.7890625, + "learning_rate": 2.92529839545635e-05, + "loss": 0.3585, + "step": 10186 + }, + { + "epoch": 0.4473473330625597, + "grad_norm": 1.8984375, + "learning_rate": 2.9246166352657494e-05, + "loss": 0.3929, + "step": 10188 + }, + { + "epoch": 0.44743515154176317, + "grad_norm": 2.1875, + "learning_rate": 2.923934842557905e-05, + "loss": 0.3593, + "step": 10190 + }, + { + "epoch": 0.44752297002096664, + "grad_norm": 1.703125, + "learning_rate": 2.923253017385029e-05, + "loss": 0.4071, + "step": 10192 + }, + { + "epoch": 0.44761078850017016, + "grad_norm": 1.984375, + "learning_rate": 2.9225711597993362e-05, + "loss": 0.34, + "step": 10194 + }, + { + "epoch": 0.44769860697937364, + "grad_norm": 2.140625, + "learning_rate": 2.9218892698530427e-05, + "loss": 0.3652, + "step": 10196 + }, + { + "epoch": 0.4477864254585771, + "grad_norm": 1.9140625, + "learning_rate": 2.9212073475983663e-05, + "loss": 0.398, + "step": 10198 + }, + { + "epoch": 0.4478742439377806, + "grad_norm": 2.140625, + "learning_rate": 2.9205253930875315e-05, + "loss": 0.3866, + "step": 10200 + }, + { + "epoch": 0.4479620624169841, + "grad_norm": 1.8828125, + "learning_rate": 2.9198434063727602e-05, + "loss": 0.4063, + "step": 10202 + }, + { + "epoch": 0.4480498808961876, + "grad_norm": 2.25, + "learning_rate": 2.919161387506281e-05, + "loss": 0.4008, + "step": 10204 + }, + { + "epoch": 0.44813769937539105, + "grad_norm": 1.671875, + "learning_rate": 2.9184793365403217e-05, + "loss": 0.3405, + "step": 10206 + }, + { + "epoch": 0.4482255178545946, + "grad_norm": 2.125, + "learning_rate": 2.917797253527116e-05, + "loss": 0.4057, + "step": 10208 + }, + { + "epoch": 0.44831333633379805, + "grad_norm": 1.671875, + "learning_rate": 2.917115138518895e-05, + "loss": 0.3616, + "step": 10210 + }, + { + "epoch": 0.4484011548130015, + "grad_norm": 1.6640625, + "learning_rate": 2.916432991567897e-05, + "loss": 0.3644, + "step": 10212 + }, + { + "epoch": 0.448488973292205, + "grad_norm": 1.90625, + "learning_rate": 2.9157508127263612e-05, + "loss": 0.3851, + "step": 10214 + }, + { + "epoch": 0.4485767917714085, + "grad_norm": 1.859375, + "learning_rate": 2.915068602046528e-05, + "loss": 0.3828, + "step": 10216 + }, + { + "epoch": 0.448664610250612, + "grad_norm": 1.8515625, + "learning_rate": 2.9143863595806413e-05, + "loss": 0.382, + "step": 10218 + }, + { + "epoch": 0.44875242872981547, + "grad_norm": 2.03125, + "learning_rate": 2.9137040853809487e-05, + "loss": 0.3827, + "step": 10220 + }, + { + "epoch": 0.44884024720901894, + "grad_norm": 1.90625, + "learning_rate": 2.9130217794996977e-05, + "loss": 0.3805, + "step": 10222 + }, + { + "epoch": 0.44892806568822247, + "grad_norm": 1.734375, + "learning_rate": 2.9123394419891396e-05, + "loss": 0.3908, + "step": 10224 + }, + { + "epoch": 0.44901588416742594, + "grad_norm": 2.03125, + "learning_rate": 2.911657072901529e-05, + "loss": 0.3937, + "step": 10226 + }, + { + "epoch": 0.4491037026466294, + "grad_norm": 1.8984375, + "learning_rate": 2.91097467228912e-05, + "loss": 0.3825, + "step": 10228 + }, + { + "epoch": 0.4491915211258329, + "grad_norm": 2.078125, + "learning_rate": 2.9102922402041728e-05, + "loss": 0.388, + "step": 10230 + }, + { + "epoch": 0.4492793396050364, + "grad_norm": 1.859375, + "learning_rate": 2.9096097766989478e-05, + "loss": 0.3475, + "step": 10232 + }, + { + "epoch": 0.4493671580842399, + "grad_norm": 1.7734375, + "learning_rate": 2.9089272818257073e-05, + "loss": 0.4138, + "step": 10234 + }, + { + "epoch": 0.44945497656344335, + "grad_norm": 2.0, + "learning_rate": 2.908244755636717e-05, + "loss": 0.3681, + "step": 10236 + }, + { + "epoch": 0.4495427950426468, + "grad_norm": 1.75, + "learning_rate": 2.907562198184246e-05, + "loss": 0.3713, + "step": 10238 + }, + { + "epoch": 0.44963061352185035, + "grad_norm": 1.765625, + "learning_rate": 2.906879609520564e-05, + "loss": 0.3717, + "step": 10240 + }, + { + "epoch": 0.4497184320010538, + "grad_norm": 1.859375, + "learning_rate": 2.9061969896979447e-05, + "loss": 0.3774, + "step": 10242 + }, + { + "epoch": 0.4498062504802573, + "grad_norm": 1.84375, + "learning_rate": 2.9055143387686624e-05, + "loss": 0.3603, + "step": 10244 + }, + { + "epoch": 0.44989406895946077, + "grad_norm": 1.796875, + "learning_rate": 2.9048316567849947e-05, + "loss": 0.3547, + "step": 10246 + }, + { + "epoch": 0.4499818874386643, + "grad_norm": 1.765625, + "learning_rate": 2.9041489437992215e-05, + "loss": 0.3901, + "step": 10248 + }, + { + "epoch": 0.45006970591786777, + "grad_norm": 2.09375, + "learning_rate": 2.9034661998636248e-05, + "loss": 0.3638, + "step": 10250 + }, + { + "epoch": 0.45015752439707124, + "grad_norm": 1.8671875, + "learning_rate": 2.9027834250304904e-05, + "loss": 0.3831, + "step": 10252 + }, + { + "epoch": 0.45024534287627477, + "grad_norm": 1.8203125, + "learning_rate": 2.9021006193521043e-05, + "loss": 0.3749, + "step": 10254 + }, + { + "epoch": 0.45033316135547824, + "grad_norm": 1.890625, + "learning_rate": 2.901417782880757e-05, + "loss": 0.3908, + "step": 10256 + }, + { + "epoch": 0.4504209798346817, + "grad_norm": 1.8125, + "learning_rate": 2.9007349156687404e-05, + "loss": 0.3754, + "step": 10258 + }, + { + "epoch": 0.4505087983138852, + "grad_norm": 1.8203125, + "learning_rate": 2.900052017768346e-05, + "loss": 0.375, + "step": 10260 + }, + { + "epoch": 0.4505966167930887, + "grad_norm": 2.015625, + "learning_rate": 2.899369089231873e-05, + "loss": 0.3856, + "step": 10262 + }, + { + "epoch": 0.4506844352722922, + "grad_norm": 1.8515625, + "learning_rate": 2.8986861301116196e-05, + "loss": 0.3618, + "step": 10264 + }, + { + "epoch": 0.45077225375149566, + "grad_norm": 2.171875, + "learning_rate": 2.8980031404598862e-05, + "loss": 0.414, + "step": 10266 + }, + { + "epoch": 0.4508600722306991, + "grad_norm": 1.8203125, + "learning_rate": 2.897320120328978e-05, + "loss": 0.4064, + "step": 10268 + }, + { + "epoch": 0.45094789070990265, + "grad_norm": 1.828125, + "learning_rate": 2.8966370697711988e-05, + "loss": 0.3708, + "step": 10270 + }, + { + "epoch": 0.4510357091891061, + "grad_norm": 2.09375, + "learning_rate": 2.895953988838859e-05, + "loss": 0.3726, + "step": 10272 + }, + { + "epoch": 0.4511235276683096, + "grad_norm": 1.703125, + "learning_rate": 2.8952708775842664e-05, + "loss": 0.3504, + "step": 10274 + }, + { + "epoch": 0.45121134614751307, + "grad_norm": 1.8203125, + "learning_rate": 2.8945877360597352e-05, + "loss": 0.3642, + "step": 10276 + }, + { + "epoch": 0.4512991646267166, + "grad_norm": 2.15625, + "learning_rate": 2.8939045643175812e-05, + "loss": 0.3775, + "step": 10278 + }, + { + "epoch": 0.45138698310592007, + "grad_norm": 1.828125, + "learning_rate": 2.8932213624101207e-05, + "loss": 0.3676, + "step": 10280 + }, + { + "epoch": 0.45147480158512354, + "grad_norm": 2.921875, + "learning_rate": 2.8925381303896747e-05, + "loss": 0.3688, + "step": 10282 + }, + { + "epoch": 0.451562620064327, + "grad_norm": 2.0625, + "learning_rate": 2.8918548683085643e-05, + "loss": 0.3705, + "step": 10284 + }, + { + "epoch": 0.45165043854353054, + "grad_norm": 2.015625, + "learning_rate": 2.891171576219114e-05, + "loss": 0.3961, + "step": 10286 + }, + { + "epoch": 0.451738257022734, + "grad_norm": 1.7890625, + "learning_rate": 2.8904882541736512e-05, + "loss": 0.3567, + "step": 10288 + }, + { + "epoch": 0.4518260755019375, + "grad_norm": 2.484375, + "learning_rate": 2.8898049022245034e-05, + "loss": 0.3565, + "step": 10290 + }, + { + "epoch": 0.45191389398114096, + "grad_norm": 1.8671875, + "learning_rate": 2.889121520424003e-05, + "loss": 0.3552, + "step": 10292 + }, + { + "epoch": 0.4520017124603445, + "grad_norm": 1.8984375, + "learning_rate": 2.888438108824484e-05, + "loss": 0.3441, + "step": 10294 + }, + { + "epoch": 0.45208953093954796, + "grad_norm": 1.75, + "learning_rate": 2.8877546674782806e-05, + "loss": 0.3561, + "step": 10296 + }, + { + "epoch": 0.45217734941875143, + "grad_norm": 1.734375, + "learning_rate": 2.8870711964377322e-05, + "loss": 0.365, + "step": 10298 + }, + { + "epoch": 0.4522651678979549, + "grad_norm": 2.078125, + "learning_rate": 2.8863876957551784e-05, + "loss": 0.381, + "step": 10300 + }, + { + "epoch": 0.45235298637715843, + "grad_norm": 2.203125, + "learning_rate": 2.8857041654829625e-05, + "loss": 0.3885, + "step": 10302 + }, + { + "epoch": 0.4524408048563619, + "grad_norm": 2.390625, + "learning_rate": 2.8850206056734297e-05, + "loss": 0.3848, + "step": 10304 + }, + { + "epoch": 0.45252862333556537, + "grad_norm": 1.9140625, + "learning_rate": 2.8843370163789264e-05, + "loss": 0.3426, + "step": 10306 + }, + { + "epoch": 0.4526164418147689, + "grad_norm": 1.6328125, + "learning_rate": 2.883653397651802e-05, + "loss": 0.3648, + "step": 10308 + }, + { + "epoch": 0.45270426029397237, + "grad_norm": 1.7265625, + "learning_rate": 2.8829697495444087e-05, + "loss": 0.365, + "step": 10310 + }, + { + "epoch": 0.45279207877317584, + "grad_norm": 1.8984375, + "learning_rate": 2.8822860721090995e-05, + "loss": 0.3502, + "step": 10312 + }, + { + "epoch": 0.4528798972523793, + "grad_norm": 1.828125, + "learning_rate": 2.8816023653982317e-05, + "loss": 0.3714, + "step": 10314 + }, + { + "epoch": 0.45296771573158284, + "grad_norm": 1.8046875, + "learning_rate": 2.8809186294641634e-05, + "loss": 0.3823, + "step": 10316 + }, + { + "epoch": 0.4530555342107863, + "grad_norm": 1.9765625, + "learning_rate": 2.8802348643592552e-05, + "loss": 0.3837, + "step": 10318 + }, + { + "epoch": 0.4531433526899898, + "grad_norm": 1.921875, + "learning_rate": 2.8795510701358703e-05, + "loss": 0.3675, + "step": 10320 + }, + { + "epoch": 0.45323117116919326, + "grad_norm": 1.921875, + "learning_rate": 2.878867246846373e-05, + "loss": 0.373, + "step": 10322 + }, + { + "epoch": 0.4533189896483968, + "grad_norm": 1.8125, + "learning_rate": 2.87818339454313e-05, + "loss": 0.3807, + "step": 10324 + }, + { + "epoch": 0.45340680812760026, + "grad_norm": 1.9375, + "learning_rate": 2.877499513278513e-05, + "loss": 0.3547, + "step": 10326 + }, + { + "epoch": 0.45349462660680373, + "grad_norm": 1.9609375, + "learning_rate": 2.876815603104893e-05, + "loss": 0.3703, + "step": 10328 + }, + { + "epoch": 0.4535824450860072, + "grad_norm": 1.796875, + "learning_rate": 2.8761316640746437e-05, + "loss": 0.3889, + "step": 10330 + }, + { + "epoch": 0.45367026356521073, + "grad_norm": 1.796875, + "learning_rate": 2.8754476962401418e-05, + "loss": 0.384, + "step": 10332 + }, + { + "epoch": 0.4537580820444142, + "grad_norm": 1.8046875, + "learning_rate": 2.874763699653765e-05, + "loss": 0.3686, + "step": 10334 + }, + { + "epoch": 0.4538459005236177, + "grad_norm": 1.796875, + "learning_rate": 2.874079674367894e-05, + "loss": 0.3623, + "step": 10336 + }, + { + "epoch": 0.45393371900282115, + "grad_norm": 1.7734375, + "learning_rate": 2.8733956204349117e-05, + "loss": 0.3689, + "step": 10338 + }, + { + "epoch": 0.4540215374820247, + "grad_norm": 1.828125, + "learning_rate": 2.8727115379072034e-05, + "loss": 0.3677, + "step": 10340 + }, + { + "epoch": 0.45410935596122814, + "grad_norm": 1.8359375, + "learning_rate": 2.872027426837156e-05, + "loss": 0.3815, + "step": 10342 + }, + { + "epoch": 0.4541971744404316, + "grad_norm": 1.984375, + "learning_rate": 2.87134328727716e-05, + "loss": 0.3619, + "step": 10344 + }, + { + "epoch": 0.4542849929196351, + "grad_norm": 1.8203125, + "learning_rate": 2.870659119279605e-05, + "loss": 0.3892, + "step": 10346 + }, + { + "epoch": 0.4543728113988386, + "grad_norm": 1.8203125, + "learning_rate": 2.8699749228968865e-05, + "loss": 0.4045, + "step": 10348 + }, + { + "epoch": 0.4544606298780421, + "grad_norm": 2.0625, + "learning_rate": 2.8692906981813993e-05, + "loss": 0.4142, + "step": 10350 + }, + { + "epoch": 0.45454844835724556, + "grad_norm": 1.65625, + "learning_rate": 2.8686064451855422e-05, + "loss": 0.3741, + "step": 10352 + }, + { + "epoch": 0.45463626683644903, + "grad_norm": 1.9921875, + "learning_rate": 2.867922163961715e-05, + "loss": 0.3552, + "step": 10354 + }, + { + "epoch": 0.45472408531565256, + "grad_norm": 2.109375, + "learning_rate": 2.86723785456232e-05, + "loss": 0.398, + "step": 10356 + }, + { + "epoch": 0.45481190379485603, + "grad_norm": 1.8203125, + "learning_rate": 2.866553517039763e-05, + "loss": 0.3416, + "step": 10358 + }, + { + "epoch": 0.4548997222740595, + "grad_norm": 1.9375, + "learning_rate": 2.8658691514464488e-05, + "loss": 0.3898, + "step": 10360 + }, + { + "epoch": 0.45498754075326303, + "grad_norm": 1.7890625, + "learning_rate": 2.8651847578347873e-05, + "loss": 0.3765, + "step": 10362 + }, + { + "epoch": 0.4550753592324665, + "grad_norm": 2.0, + "learning_rate": 2.8645003362571897e-05, + "loss": 0.3381, + "step": 10364 + }, + { + "epoch": 0.45516317771167, + "grad_norm": 1.8671875, + "learning_rate": 2.863815886766069e-05, + "loss": 0.3649, + "step": 10366 + }, + { + "epoch": 0.45525099619087345, + "grad_norm": 1.9296875, + "learning_rate": 2.8631314094138405e-05, + "loss": 0.3858, + "step": 10368 + }, + { + "epoch": 0.455338814670077, + "grad_norm": 1.9921875, + "learning_rate": 2.862446904252922e-05, + "loss": 0.3454, + "step": 10370 + }, + { + "epoch": 0.45542663314928045, + "grad_norm": 1.953125, + "learning_rate": 2.8617623713357326e-05, + "loss": 0.3886, + "step": 10372 + }, + { + "epoch": 0.4555144516284839, + "grad_norm": 1.7109375, + "learning_rate": 2.8610778107146934e-05, + "loss": 0.3711, + "step": 10374 + }, + { + "epoch": 0.4556022701076874, + "grad_norm": 1.890625, + "learning_rate": 2.8603932224422297e-05, + "loss": 0.3676, + "step": 10376 + }, + { + "epoch": 0.4556900885868909, + "grad_norm": 1.8671875, + "learning_rate": 2.8597086065707655e-05, + "loss": 0.3693, + "step": 10378 + }, + { + "epoch": 0.4557779070660944, + "grad_norm": 2.03125, + "learning_rate": 2.8590239631527314e-05, + "loss": 0.34, + "step": 10380 + }, + { + "epoch": 0.45586572554529786, + "grad_norm": 1.7578125, + "learning_rate": 2.858339292240556e-05, + "loss": 0.3826, + "step": 10382 + }, + { + "epoch": 0.45595354402450133, + "grad_norm": 1.875, + "learning_rate": 2.857654593886671e-05, + "loss": 0.3651, + "step": 10384 + }, + { + "epoch": 0.45604136250370486, + "grad_norm": 1.9765625, + "learning_rate": 2.856969868143512e-05, + "loss": 0.3667, + "step": 10386 + }, + { + "epoch": 0.45612918098290833, + "grad_norm": 2.03125, + "learning_rate": 2.856285115063514e-05, + "loss": 0.3768, + "step": 10388 + }, + { + "epoch": 0.4562169994621118, + "grad_norm": 1.9765625, + "learning_rate": 2.8556003346991174e-05, + "loss": 0.3444, + "step": 10390 + }, + { + "epoch": 0.4563048179413153, + "grad_norm": 1.84375, + "learning_rate": 2.8549155271027617e-05, + "loss": 0.3787, + "step": 10392 + }, + { + "epoch": 0.4563926364205188, + "grad_norm": 2.0, + "learning_rate": 2.8542306923268897e-05, + "loss": 0.379, + "step": 10394 + }, + { + "epoch": 0.4564804548997223, + "grad_norm": 1.90625, + "learning_rate": 2.853545830423947e-05, + "loss": 0.373, + "step": 10396 + }, + { + "epoch": 0.45656827337892575, + "grad_norm": 1.9375, + "learning_rate": 2.8528609414463793e-05, + "loss": 0.3874, + "step": 10398 + }, + { + "epoch": 0.4566560918581292, + "grad_norm": 1.765625, + "learning_rate": 2.8521760254466355e-05, + "loss": 0.3398, + "step": 10400 + }, + { + "epoch": 0.45674391033733275, + "grad_norm": 1.8359375, + "learning_rate": 2.851491082477168e-05, + "loss": 0.3801, + "step": 10402 + }, + { + "epoch": 0.4568317288165362, + "grad_norm": 1.8515625, + "learning_rate": 2.8508061125904284e-05, + "loss": 0.3614, + "step": 10404 + }, + { + "epoch": 0.4569195472957397, + "grad_norm": 1.9765625, + "learning_rate": 2.850121115838874e-05, + "loss": 0.3851, + "step": 10406 + }, + { + "epoch": 0.4570073657749432, + "grad_norm": 2.046875, + "learning_rate": 2.8494360922749595e-05, + "loss": 0.3495, + "step": 10408 + }, + { + "epoch": 0.4570951842541467, + "grad_norm": 1.7734375, + "learning_rate": 2.848751041951146e-05, + "loss": 0.3773, + "step": 10410 + }, + { + "epoch": 0.45718300273335016, + "grad_norm": 1.8828125, + "learning_rate": 2.8480659649198937e-05, + "loss": 0.3699, + "step": 10412 + }, + { + "epoch": 0.45727082121255364, + "grad_norm": 2.0, + "learning_rate": 2.8473808612336662e-05, + "loss": 0.3567, + "step": 10414 + }, + { + "epoch": 0.45735863969175716, + "grad_norm": 1.84375, + "learning_rate": 2.8466957309449287e-05, + "loss": 0.3647, + "step": 10416 + }, + { + "epoch": 0.45744645817096063, + "grad_norm": 1.9453125, + "learning_rate": 2.8460105741061505e-05, + "loss": 0.3707, + "step": 10418 + }, + { + "epoch": 0.4575342766501641, + "grad_norm": 2.25, + "learning_rate": 2.8453253907697985e-05, + "loss": 0.3728, + "step": 10420 + }, + { + "epoch": 0.4576220951293676, + "grad_norm": 2.0, + "learning_rate": 2.844640180988345e-05, + "loss": 0.3649, + "step": 10422 + }, + { + "epoch": 0.4577099136085711, + "grad_norm": 1.9296875, + "learning_rate": 2.8439549448142644e-05, + "loss": 0.3704, + "step": 10424 + }, + { + "epoch": 0.4577977320877746, + "grad_norm": 2.09375, + "learning_rate": 2.8432696823000314e-05, + "loss": 0.3709, + "step": 10426 + }, + { + "epoch": 0.45788555056697805, + "grad_norm": 2.0625, + "learning_rate": 2.8425843934981245e-05, + "loss": 0.3849, + "step": 10428 + }, + { + "epoch": 0.4579733690461815, + "grad_norm": 2.046875, + "learning_rate": 2.8418990784610223e-05, + "loss": 0.388, + "step": 10430 + }, + { + "epoch": 0.45806118752538505, + "grad_norm": 2.125, + "learning_rate": 2.8412137372412062e-05, + "loss": 0.3925, + "step": 10432 + }, + { + "epoch": 0.4581490060045885, + "grad_norm": 1.890625, + "learning_rate": 2.8405283698911605e-05, + "loss": 0.3642, + "step": 10434 + }, + { + "epoch": 0.458236824483792, + "grad_norm": 1.921875, + "learning_rate": 2.8398429764633706e-05, + "loss": 0.3798, + "step": 10436 + }, + { + "epoch": 0.45832464296299547, + "grad_norm": 1.8046875, + "learning_rate": 2.839157557010324e-05, + "loss": 0.3691, + "step": 10438 + }, + { + "epoch": 0.458412461442199, + "grad_norm": 1.71875, + "learning_rate": 2.838472111584511e-05, + "loss": 0.3631, + "step": 10440 + }, + { + "epoch": 0.45850027992140246, + "grad_norm": 2.0625, + "learning_rate": 2.8377866402384223e-05, + "loss": 0.4049, + "step": 10442 + }, + { + "epoch": 0.45858809840060594, + "grad_norm": 1.8984375, + "learning_rate": 2.837101143024552e-05, + "loss": 0.374, + "step": 10444 + }, + { + "epoch": 0.4586759168798094, + "grad_norm": 1.8828125, + "learning_rate": 2.836415619995395e-05, + "loss": 0.3938, + "step": 10446 + }, + { + "epoch": 0.45876373535901294, + "grad_norm": 1.7421875, + "learning_rate": 2.835730071203449e-05, + "loss": 0.3711, + "step": 10448 + }, + { + "epoch": 0.4588515538382164, + "grad_norm": 2.1875, + "learning_rate": 2.8350444967012134e-05, + "loss": 0.3739, + "step": 10450 + }, + { + "epoch": 0.4589393723174199, + "grad_norm": 1.859375, + "learning_rate": 2.8343588965411905e-05, + "loss": 0.3799, + "step": 10452 + }, + { + "epoch": 0.45902719079662335, + "grad_norm": 2.015625, + "learning_rate": 2.833673270775883e-05, + "loss": 0.3414, + "step": 10454 + }, + { + "epoch": 0.4591150092758269, + "grad_norm": 1.7578125, + "learning_rate": 2.832987619457797e-05, + "loss": 0.3798, + "step": 10456 + }, + { + "epoch": 0.45920282775503035, + "grad_norm": 2.390625, + "learning_rate": 2.832301942639439e-05, + "loss": 0.3828, + "step": 10458 + }, + { + "epoch": 0.4592906462342338, + "grad_norm": 1.984375, + "learning_rate": 2.8316162403733177e-05, + "loss": 0.3562, + "step": 10460 + }, + { + "epoch": 0.45937846471343735, + "grad_norm": 2.140625, + "learning_rate": 2.8309305127119456e-05, + "loss": 0.3902, + "step": 10462 + }, + { + "epoch": 0.4594662831926408, + "grad_norm": 2.203125, + "learning_rate": 2.8302447597078353e-05, + "loss": 0.3925, + "step": 10464 + }, + { + "epoch": 0.4595541016718443, + "grad_norm": 2.09375, + "learning_rate": 2.8295589814135032e-05, + "loss": 0.3432, + "step": 10466 + }, + { + "epoch": 0.45964192015104777, + "grad_norm": 2.078125, + "learning_rate": 2.8288731778814642e-05, + "loss": 0.3987, + "step": 10468 + }, + { + "epoch": 0.4597297386302513, + "grad_norm": 1.9140625, + "learning_rate": 2.8281873491642392e-05, + "loss": 0.3686, + "step": 10470 + }, + { + "epoch": 0.45981755710945477, + "grad_norm": 1.9765625, + "learning_rate": 2.827501495314348e-05, + "loss": 0.3607, + "step": 10472 + }, + { + "epoch": 0.45990537558865824, + "grad_norm": 2.109375, + "learning_rate": 2.8268156163843136e-05, + "loss": 0.3818, + "step": 10474 + }, + { + "epoch": 0.4599931940678617, + "grad_norm": 2.1875, + "learning_rate": 2.8261297124266613e-05, + "loss": 0.3941, + "step": 10476 + }, + { + "epoch": 0.46008101254706524, + "grad_norm": 1.84375, + "learning_rate": 2.825443783493917e-05, + "loss": 0.3418, + "step": 10478 + }, + { + "epoch": 0.4601688310262687, + "grad_norm": 1.859375, + "learning_rate": 2.8247578296386102e-05, + "loss": 0.3479, + "step": 10480 + }, + { + "epoch": 0.4602566495054722, + "grad_norm": 1.7265625, + "learning_rate": 2.824071850913271e-05, + "loss": 0.3828, + "step": 10482 + }, + { + "epoch": 0.46034446798467565, + "grad_norm": 1.78125, + "learning_rate": 2.823385847370431e-05, + "loss": 0.3475, + "step": 10484 + }, + { + "epoch": 0.4604322864638792, + "grad_norm": 2.015625, + "learning_rate": 2.822699819062626e-05, + "loss": 0.3772, + "step": 10486 + }, + { + "epoch": 0.46052010494308265, + "grad_norm": 1.7578125, + "learning_rate": 2.822013766042391e-05, + "loss": 0.3601, + "step": 10488 + }, + { + "epoch": 0.4606079234222861, + "grad_norm": 1.8828125, + "learning_rate": 2.8213276883622654e-05, + "loss": 0.383, + "step": 10490 + }, + { + "epoch": 0.4606957419014896, + "grad_norm": 1.8984375, + "learning_rate": 2.820641586074788e-05, + "loss": 0.3687, + "step": 10492 + }, + { + "epoch": 0.4607835603806931, + "grad_norm": 1.8125, + "learning_rate": 2.8199554592325005e-05, + "loss": 0.3863, + "step": 10494 + }, + { + "epoch": 0.4608713788598966, + "grad_norm": 2.015625, + "learning_rate": 2.819269307887948e-05, + "loss": 0.3222, + "step": 10496 + }, + { + "epoch": 0.46095919733910007, + "grad_norm": 1.7421875, + "learning_rate": 2.818583132093675e-05, + "loss": 0.3738, + "step": 10498 + }, + { + "epoch": 0.46104701581830354, + "grad_norm": 2.0625, + "learning_rate": 2.8178969319022292e-05, + "loss": 0.3666, + "step": 10500 + }, + { + "epoch": 0.46113483429750707, + "grad_norm": 1.859375, + "learning_rate": 2.8172107073661607e-05, + "loss": 0.3504, + "step": 10502 + }, + { + "epoch": 0.46122265277671054, + "grad_norm": 1.7578125, + "learning_rate": 2.8165244585380197e-05, + "loss": 0.3886, + "step": 10504 + }, + { + "epoch": 0.461310471255914, + "grad_norm": 1.8515625, + "learning_rate": 2.815838185470361e-05, + "loss": 0.3935, + "step": 10506 + }, + { + "epoch": 0.46139828973511754, + "grad_norm": 1.8125, + "learning_rate": 2.815151888215737e-05, + "loss": 0.3804, + "step": 10508 + }, + { + "epoch": 0.461486108214321, + "grad_norm": 1.6640625, + "learning_rate": 2.8144655668267056e-05, + "loss": 0.3321, + "step": 10510 + }, + { + "epoch": 0.4615739266935245, + "grad_norm": 2.015625, + "learning_rate": 2.813779221355826e-05, + "loss": 0.3835, + "step": 10512 + }, + { + "epoch": 0.46166174517272796, + "grad_norm": 1.828125, + "learning_rate": 2.8130928518556588e-05, + "loss": 0.3922, + "step": 10514 + }, + { + "epoch": 0.4617495636519315, + "grad_norm": 1.734375, + "learning_rate": 2.8124064583787662e-05, + "loss": 0.3696, + "step": 10516 + }, + { + "epoch": 0.46183738213113495, + "grad_norm": 2.15625, + "learning_rate": 2.8117200409777124e-05, + "loss": 0.3481, + "step": 10518 + }, + { + "epoch": 0.4619252006103384, + "grad_norm": 1.7890625, + "learning_rate": 2.8110335997050624e-05, + "loss": 0.3764, + "step": 10520 + }, + { + "epoch": 0.4620130190895419, + "grad_norm": 1.859375, + "learning_rate": 2.8103471346133848e-05, + "loss": 0.3645, + "step": 10522 + }, + { + "epoch": 0.4621008375687454, + "grad_norm": 1.9375, + "learning_rate": 2.8096606457552488e-05, + "loss": 0.3537, + "step": 10524 + }, + { + "epoch": 0.4621886560479489, + "grad_norm": 1.703125, + "learning_rate": 2.808974133183227e-05, + "loss": 0.3692, + "step": 10526 + }, + { + "epoch": 0.46227647452715237, + "grad_norm": 1.90625, + "learning_rate": 2.8082875969498922e-05, + "loss": 0.3552, + "step": 10528 + }, + { + "epoch": 0.46236429300635584, + "grad_norm": 1.8671875, + "learning_rate": 2.8076010371078186e-05, + "loss": 0.3821, + "step": 10530 + }, + { + "epoch": 0.46245211148555937, + "grad_norm": 2.109375, + "learning_rate": 2.8069144537095842e-05, + "loss": 0.3728, + "step": 10532 + }, + { + "epoch": 0.46253992996476284, + "grad_norm": 1.859375, + "learning_rate": 2.8062278468077678e-05, + "loss": 0.3553, + "step": 10534 + }, + { + "epoch": 0.4626277484439663, + "grad_norm": 1.875, + "learning_rate": 2.8055412164549488e-05, + "loss": 0.36, + "step": 10536 + }, + { + "epoch": 0.4627155669231698, + "grad_norm": 1.8984375, + "learning_rate": 2.8048545627037102e-05, + "loss": 0.3602, + "step": 10538 + }, + { + "epoch": 0.4628033854023733, + "grad_norm": 2.140625, + "learning_rate": 2.8041678856066367e-05, + "loss": 0.3827, + "step": 10540 + }, + { + "epoch": 0.4628912038815768, + "grad_norm": 2.015625, + "learning_rate": 2.8034811852163136e-05, + "loss": 0.3562, + "step": 10542 + }, + { + "epoch": 0.46297902236078026, + "grad_norm": 1.828125, + "learning_rate": 2.802794461585328e-05, + "loss": 0.3726, + "step": 10544 + }, + { + "epoch": 0.46306684083998373, + "grad_norm": 2.140625, + "learning_rate": 2.80210771476627e-05, + "loss": 0.3806, + "step": 10546 + }, + { + "epoch": 0.46315465931918726, + "grad_norm": 1.9140625, + "learning_rate": 2.8014209448117317e-05, + "loss": 0.3816, + "step": 10548 + }, + { + "epoch": 0.46324247779839073, + "grad_norm": 1.921875, + "learning_rate": 2.8007341517743042e-05, + "loss": 0.3968, + "step": 10550 + }, + { + "epoch": 0.4633302962775942, + "grad_norm": 1.8515625, + "learning_rate": 2.8000473357065838e-05, + "loss": 0.3709, + "step": 10552 + }, + { + "epoch": 0.46341811475679767, + "grad_norm": 2.203125, + "learning_rate": 2.7993604966611665e-05, + "loss": 0.3425, + "step": 10554 + }, + { + "epoch": 0.4635059332360012, + "grad_norm": 1.75, + "learning_rate": 2.79867363469065e-05, + "loss": 0.3793, + "step": 10556 + }, + { + "epoch": 0.46359375171520467, + "grad_norm": 2.3125, + "learning_rate": 2.7979867498476354e-05, + "loss": 0.3728, + "step": 10558 + }, + { + "epoch": 0.46368157019440814, + "grad_norm": 1.828125, + "learning_rate": 2.7972998421847235e-05, + "loss": 0.382, + "step": 10560 + }, + { + "epoch": 0.46376938867361167, + "grad_norm": 2.484375, + "learning_rate": 2.7966129117545194e-05, + "loss": 0.3628, + "step": 10562 + }, + { + "epoch": 0.46385720715281514, + "grad_norm": 1.828125, + "learning_rate": 2.7959259586096272e-05, + "loss": 0.372, + "step": 10564 + }, + { + "epoch": 0.4639450256320186, + "grad_norm": 2.28125, + "learning_rate": 2.7952389828026538e-05, + "loss": 0.3911, + "step": 10566 + }, + { + "epoch": 0.4640328441112221, + "grad_norm": 1.734375, + "learning_rate": 2.7945519843862083e-05, + "loss": 0.3848, + "step": 10568 + }, + { + "epoch": 0.4641206625904256, + "grad_norm": 1.890625, + "learning_rate": 2.7938649634129015e-05, + "loss": 0.3828, + "step": 10570 + }, + { + "epoch": 0.4642084810696291, + "grad_norm": 1.8359375, + "learning_rate": 2.7931779199353448e-05, + "loss": 0.3414, + "step": 10572 + }, + { + "epoch": 0.46429629954883256, + "grad_norm": 1.953125, + "learning_rate": 2.792490854006153e-05, + "loss": 0.3428, + "step": 10574 + }, + { + "epoch": 0.46438411802803603, + "grad_norm": 1.84375, + "learning_rate": 2.7918037656779417e-05, + "loss": 0.3689, + "step": 10576 + }, + { + "epoch": 0.46447193650723956, + "grad_norm": 2.09375, + "learning_rate": 2.791116655003328e-05, + "loss": 0.3652, + "step": 10578 + }, + { + "epoch": 0.46455975498644303, + "grad_norm": 1.984375, + "learning_rate": 2.790429522034932e-05, + "loss": 0.3702, + "step": 10580 + }, + { + "epoch": 0.4646475734656465, + "grad_norm": 2.40625, + "learning_rate": 2.789742366825372e-05, + "loss": 0.3559, + "step": 10582 + }, + { + "epoch": 0.46473539194485, + "grad_norm": 2.25, + "learning_rate": 2.7890551894272726e-05, + "loss": 0.3607, + "step": 10584 + }, + { + "epoch": 0.4648232104240535, + "grad_norm": 1.9453125, + "learning_rate": 2.7883679898932575e-05, + "loss": 0.3528, + "step": 10586 + }, + { + "epoch": 0.464911028903257, + "grad_norm": 2.1875, + "learning_rate": 2.7876807682759526e-05, + "loss": 0.3599, + "step": 10588 + }, + { + "epoch": 0.46499884738246045, + "grad_norm": 2.203125, + "learning_rate": 2.786993524627986e-05, + "loss": 0.3848, + "step": 10590 + }, + { + "epoch": 0.4650866658616639, + "grad_norm": 2.046875, + "learning_rate": 2.786306259001986e-05, + "loss": 0.3684, + "step": 10592 + }, + { + "epoch": 0.46517448434086744, + "grad_norm": 2.09375, + "learning_rate": 2.7856189714505843e-05, + "loss": 0.38, + "step": 10594 + }, + { + "epoch": 0.4652623028200709, + "grad_norm": 1.9140625, + "learning_rate": 2.7849316620264133e-05, + "loss": 0.3607, + "step": 10596 + }, + { + "epoch": 0.4653501212992744, + "grad_norm": 1.8359375, + "learning_rate": 2.784244330782107e-05, + "loss": 0.3588, + "step": 10598 + }, + { + "epoch": 0.46543793977847786, + "grad_norm": 2.015625, + "learning_rate": 2.783556977770302e-05, + "loss": 0.3734, + "step": 10600 + }, + { + "epoch": 0.4655257582576814, + "grad_norm": 1.984375, + "learning_rate": 2.7828696030436353e-05, + "loss": 0.3406, + "step": 10602 + }, + { + "epoch": 0.46561357673688486, + "grad_norm": 2.171875, + "learning_rate": 2.782182206654747e-05, + "loss": 0.3711, + "step": 10604 + }, + { + "epoch": 0.46570139521608833, + "grad_norm": 1.8515625, + "learning_rate": 2.7814947886562774e-05, + "loss": 0.3544, + "step": 10606 + }, + { + "epoch": 0.46578921369529186, + "grad_norm": 1.6484375, + "learning_rate": 2.7808073491008698e-05, + "loss": 0.3632, + "step": 10608 + }, + { + "epoch": 0.46587703217449533, + "grad_norm": 1.8828125, + "learning_rate": 2.780119888041168e-05, + "loss": 0.3707, + "step": 10610 + }, + { + "epoch": 0.4659648506536988, + "grad_norm": 2.296875, + "learning_rate": 2.7794324055298178e-05, + "loss": 0.3833, + "step": 10612 + }, + { + "epoch": 0.4660526691329023, + "grad_norm": 2.203125, + "learning_rate": 2.7787449016194665e-05, + "loss": 0.3892, + "step": 10614 + }, + { + "epoch": 0.4661404876121058, + "grad_norm": 1.78125, + "learning_rate": 2.7780573763627648e-05, + "loss": 0.3694, + "step": 10616 + }, + { + "epoch": 0.4662283060913093, + "grad_norm": 1.8671875, + "learning_rate": 2.777369829812362e-05, + "loss": 0.3798, + "step": 10618 + }, + { + "epoch": 0.46631612457051275, + "grad_norm": 1.9140625, + "learning_rate": 2.776682262020911e-05, + "loss": 0.388, + "step": 10620 + }, + { + "epoch": 0.4664039430497162, + "grad_norm": 2.015625, + "learning_rate": 2.7759946730410663e-05, + "loss": 0.3559, + "step": 10622 + }, + { + "epoch": 0.46649176152891975, + "grad_norm": 2.09375, + "learning_rate": 2.7753070629254835e-05, + "loss": 0.3877, + "step": 10624 + }, + { + "epoch": 0.4665795800081232, + "grad_norm": 1.7734375, + "learning_rate": 2.7746194317268197e-05, + "loss": 0.3885, + "step": 10626 + }, + { + "epoch": 0.4666673984873267, + "grad_norm": 1.7578125, + "learning_rate": 2.773931779497735e-05, + "loss": 0.3809, + "step": 10628 + }, + { + "epoch": 0.46675521696653016, + "grad_norm": 1.734375, + "learning_rate": 2.7732441062908877e-05, + "loss": 0.3759, + "step": 10630 + }, + { + "epoch": 0.4668430354457337, + "grad_norm": 2.3125, + "learning_rate": 2.772556412158941e-05, + "loss": 0.3576, + "step": 10632 + }, + { + "epoch": 0.46693085392493716, + "grad_norm": 1.6953125, + "learning_rate": 2.7718686971545592e-05, + "loss": 0.3965, + "step": 10634 + }, + { + "epoch": 0.46701867240414063, + "grad_norm": 1.78125, + "learning_rate": 2.771180961330408e-05, + "loss": 0.3518, + "step": 10636 + }, + { + "epoch": 0.4671064908833441, + "grad_norm": 1.7421875, + "learning_rate": 2.770493204739153e-05, + "loss": 0.3663, + "step": 10638 + }, + { + "epoch": 0.46719430936254763, + "grad_norm": 1.8203125, + "learning_rate": 2.7698054274334645e-05, + "loss": 0.3468, + "step": 10640 + }, + { + "epoch": 0.4672821278417511, + "grad_norm": 1.859375, + "learning_rate": 2.7691176294660114e-05, + "loss": 0.3605, + "step": 10642 + }, + { + "epoch": 0.4673699463209546, + "grad_norm": 1.7734375, + "learning_rate": 2.768429810889465e-05, + "loss": 0.3526, + "step": 10644 + }, + { + "epoch": 0.46745776480015805, + "grad_norm": 2.078125, + "learning_rate": 2.7677419717565e-05, + "loss": 0.3834, + "step": 10646 + }, + { + "epoch": 0.4675455832793616, + "grad_norm": 2.140625, + "learning_rate": 2.76705411211979e-05, + "loss": 0.3985, + "step": 10648 + }, + { + "epoch": 0.46763340175856505, + "grad_norm": 1.78125, + "learning_rate": 2.766366232032013e-05, + "loss": 0.3632, + "step": 10650 + }, + { + "epoch": 0.4677212202377685, + "grad_norm": 1.7109375, + "learning_rate": 2.7656783315458457e-05, + "loss": 0.341, + "step": 10652 + }, + { + "epoch": 0.467809038716972, + "grad_norm": 2.546875, + "learning_rate": 2.7649904107139675e-05, + "loss": 0.3662, + "step": 10654 + }, + { + "epoch": 0.4678968571961755, + "grad_norm": 2.3125, + "learning_rate": 2.7643024695890613e-05, + "loss": 0.3479, + "step": 10656 + }, + { + "epoch": 0.467984675675379, + "grad_norm": 1.7265625, + "learning_rate": 2.7636145082238074e-05, + "loss": 0.3481, + "step": 10658 + }, + { + "epoch": 0.46807249415458246, + "grad_norm": 1.9140625, + "learning_rate": 2.762926526670892e-05, + "loss": 0.3795, + "step": 10660 + }, + { + "epoch": 0.468160312633786, + "grad_norm": 1.9296875, + "learning_rate": 2.7622385249829997e-05, + "loss": 0.3054, + "step": 10662 + }, + { + "epoch": 0.46824813111298946, + "grad_norm": 2.453125, + "learning_rate": 2.761550503212818e-05, + "loss": 0.3707, + "step": 10664 + }, + { + "epoch": 0.46833594959219293, + "grad_norm": 1.8984375, + "learning_rate": 2.7608624614130374e-05, + "loss": 0.3639, + "step": 10666 + }, + { + "epoch": 0.4684237680713964, + "grad_norm": 2.265625, + "learning_rate": 2.760174399636346e-05, + "loss": 0.3831, + "step": 10668 + }, + { + "epoch": 0.46851158655059993, + "grad_norm": 2.03125, + "learning_rate": 2.759486317935437e-05, + "loss": 0.3721, + "step": 10670 + }, + { + "epoch": 0.4685994050298034, + "grad_norm": 1.8984375, + "learning_rate": 2.7587982163630033e-05, + "loss": 0.365, + "step": 10672 + }, + { + "epoch": 0.4686872235090069, + "grad_norm": 1.9765625, + "learning_rate": 2.7581100949717402e-05, + "loss": 0.3823, + "step": 10674 + }, + { + "epoch": 0.46877504198821035, + "grad_norm": 1.796875, + "learning_rate": 2.7574219538143443e-05, + "loss": 0.3744, + "step": 10676 + }, + { + "epoch": 0.4688628604674139, + "grad_norm": 1.8984375, + "learning_rate": 2.756733792943514e-05, + "loss": 0.3758, + "step": 10678 + }, + { + "epoch": 0.46895067894661735, + "grad_norm": 1.890625, + "learning_rate": 2.7560456124119473e-05, + "loss": 0.3831, + "step": 10680 + }, + { + "epoch": 0.4690384974258208, + "grad_norm": 1.7109375, + "learning_rate": 2.7553574122723462e-05, + "loss": 0.3569, + "step": 10682 + }, + { + "epoch": 0.4691263159050243, + "grad_norm": 1.9609375, + "learning_rate": 2.7546691925774137e-05, + "loss": 0.3486, + "step": 10684 + }, + { + "epoch": 0.4692141343842278, + "grad_norm": 1.78125, + "learning_rate": 2.753980953379854e-05, + "loss": 0.3897, + "step": 10686 + }, + { + "epoch": 0.4693019528634313, + "grad_norm": 1.875, + "learning_rate": 2.753292694732371e-05, + "loss": 0.3905, + "step": 10688 + }, + { + "epoch": 0.46938977134263477, + "grad_norm": 1.84375, + "learning_rate": 2.7526044166876737e-05, + "loss": 0.3352, + "step": 10690 + }, + { + "epoch": 0.46947758982183824, + "grad_norm": 1.875, + "learning_rate": 2.751916119298469e-05, + "loss": 0.3845, + "step": 10692 + }, + { + "epoch": 0.46956540830104176, + "grad_norm": 1.6640625, + "learning_rate": 2.751227802617467e-05, + "loss": 0.3655, + "step": 10694 + }, + { + "epoch": 0.46965322678024524, + "grad_norm": 1.7265625, + "learning_rate": 2.7505394666973803e-05, + "loss": 0.3813, + "step": 10696 + }, + { + "epoch": 0.4697410452594487, + "grad_norm": 1.9609375, + "learning_rate": 2.749851111590921e-05, + "loss": 0.4094, + "step": 10698 + }, + { + "epoch": 0.4698288637386522, + "grad_norm": 1.7890625, + "learning_rate": 2.7491627373508035e-05, + "loss": 0.3623, + "step": 10700 + }, + { + "epoch": 0.4699166822178557, + "grad_norm": 1.921875, + "learning_rate": 2.7484743440297445e-05, + "loss": 0.3805, + "step": 10702 + }, + { + "epoch": 0.4700045006970592, + "grad_norm": 1.875, + "learning_rate": 2.7477859316804604e-05, + "loss": 0.3795, + "step": 10704 + }, + { + "epoch": 0.47009231917626265, + "grad_norm": 1.953125, + "learning_rate": 2.7470975003556705e-05, + "loss": 0.3744, + "step": 10706 + }, + { + "epoch": 0.4701801376554661, + "grad_norm": 1.671875, + "learning_rate": 2.746409050108094e-05, + "loss": 0.3915, + "step": 10708 + }, + { + "epoch": 0.47026795613466965, + "grad_norm": 1.8828125, + "learning_rate": 2.7457205809904534e-05, + "loss": 0.3823, + "step": 10710 + }, + { + "epoch": 0.4703557746138731, + "grad_norm": 1.890625, + "learning_rate": 2.7450320930554724e-05, + "loss": 0.3686, + "step": 10712 + }, + { + "epoch": 0.4704435930930766, + "grad_norm": 1.9921875, + "learning_rate": 2.7443435863558748e-05, + "loss": 0.3657, + "step": 10714 + }, + { + "epoch": 0.4705314115722801, + "grad_norm": 1.8125, + "learning_rate": 2.743655060944387e-05, + "loss": 0.3807, + "step": 10716 + }, + { + "epoch": 0.4706192300514836, + "grad_norm": 2.015625, + "learning_rate": 2.7429665168737363e-05, + "loss": 0.3675, + "step": 10718 + }, + { + "epoch": 0.47070704853068707, + "grad_norm": 1.6875, + "learning_rate": 2.742277954196651e-05, + "loss": 0.3704, + "step": 10720 + }, + { + "epoch": 0.47079486700989054, + "grad_norm": 2.0, + "learning_rate": 2.7415893729658616e-05, + "loss": 0.3794, + "step": 10722 + }, + { + "epoch": 0.47088268548909407, + "grad_norm": 1.90625, + "learning_rate": 2.7409007732341004e-05, + "loss": 0.3534, + "step": 10724 + }, + { + "epoch": 0.47097050396829754, + "grad_norm": 2.046875, + "learning_rate": 2.740212155054101e-05, + "loss": 0.3631, + "step": 10726 + }, + { + "epoch": 0.471058322447501, + "grad_norm": 1.8515625, + "learning_rate": 2.739523518478596e-05, + "loss": 0.3783, + "step": 10728 + }, + { + "epoch": 0.4711461409267045, + "grad_norm": 2.03125, + "learning_rate": 2.7388348635603235e-05, + "loss": 0.3625, + "step": 10730 + }, + { + "epoch": 0.471233959405908, + "grad_norm": 1.8671875, + "learning_rate": 2.7381461903520194e-05, + "loss": 0.3539, + "step": 10732 + }, + { + "epoch": 0.4713217778851115, + "grad_norm": 1.9296875, + "learning_rate": 2.7374574989064234e-05, + "loss": 0.3576, + "step": 10734 + }, + { + "epoch": 0.47140959636431495, + "grad_norm": 1.8125, + "learning_rate": 2.7367687892762745e-05, + "loss": 0.3955, + "step": 10736 + }, + { + "epoch": 0.4714974148435184, + "grad_norm": 1.8125, + "learning_rate": 2.736080061514315e-05, + "loss": 0.3655, + "step": 10738 + }, + { + "epoch": 0.47158523332272195, + "grad_norm": 1.859375, + "learning_rate": 2.7353913156732884e-05, + "loss": 0.3624, + "step": 10740 + }, + { + "epoch": 0.4716730518019254, + "grad_norm": 1.984375, + "learning_rate": 2.7347025518059383e-05, + "loss": 0.3643, + "step": 10742 + }, + { + "epoch": 0.4717608702811289, + "grad_norm": 1.7890625, + "learning_rate": 2.7340137699650103e-05, + "loss": 0.353, + "step": 10744 + }, + { + "epoch": 0.47184868876033237, + "grad_norm": 2.03125, + "learning_rate": 2.7333249702032516e-05, + "loss": 0.3639, + "step": 10746 + }, + { + "epoch": 0.4719365072395359, + "grad_norm": 1.875, + "learning_rate": 2.7326361525734112e-05, + "loss": 0.3855, + "step": 10748 + }, + { + "epoch": 0.47202432571873937, + "grad_norm": 1.8125, + "learning_rate": 2.731947317128238e-05, + "loss": 0.3408, + "step": 10750 + }, + { + "epoch": 0.47211214419794284, + "grad_norm": 1.7578125, + "learning_rate": 2.731258463920484e-05, + "loss": 0.3931, + "step": 10752 + }, + { + "epoch": 0.4721999626771463, + "grad_norm": 1.890625, + "learning_rate": 2.730569593002901e-05, + "loss": 0.3563, + "step": 10754 + }, + { + "epoch": 0.47228778115634984, + "grad_norm": 2.0625, + "learning_rate": 2.7298807044282438e-05, + "loss": 0.3742, + "step": 10756 + }, + { + "epoch": 0.4723755996355533, + "grad_norm": 2.140625, + "learning_rate": 2.7291917982492664e-05, + "loss": 0.4072, + "step": 10758 + }, + { + "epoch": 0.4724634181147568, + "grad_norm": 2.0625, + "learning_rate": 2.7285028745187268e-05, + "loss": 0.3598, + "step": 10760 + }, + { + "epoch": 0.4725512365939603, + "grad_norm": 1.8828125, + "learning_rate": 2.7278139332893827e-05, + "loss": 0.3398, + "step": 10762 + }, + { + "epoch": 0.4726390550731638, + "grad_norm": 2.0, + "learning_rate": 2.727124974613992e-05, + "loss": 0.3882, + "step": 10764 + }, + { + "epoch": 0.47272687355236725, + "grad_norm": 1.75, + "learning_rate": 2.7264359985453168e-05, + "loss": 0.391, + "step": 10766 + }, + { + "epoch": 0.4728146920315707, + "grad_norm": 1.7578125, + "learning_rate": 2.7257470051361184e-05, + "loss": 0.3629, + "step": 10768 + }, + { + "epoch": 0.47290251051077425, + "grad_norm": 2.0, + "learning_rate": 2.72505799443916e-05, + "loss": 0.3444, + "step": 10770 + }, + { + "epoch": 0.4729903289899777, + "grad_norm": 2.390625, + "learning_rate": 2.724368966507207e-05, + "loss": 0.3814, + "step": 10772 + }, + { + "epoch": 0.4730781474691812, + "grad_norm": 1.8984375, + "learning_rate": 2.7236799213930243e-05, + "loss": 0.3891, + "step": 10774 + }, + { + "epoch": 0.47316596594838467, + "grad_norm": 1.8984375, + "learning_rate": 2.7229908591493804e-05, + "loss": 0.3767, + "step": 10776 + }, + { + "epoch": 0.4732537844275882, + "grad_norm": 1.890625, + "learning_rate": 2.7223017798290423e-05, + "loss": 0.3763, + "step": 10778 + }, + { + "epoch": 0.47334160290679167, + "grad_norm": 1.8125, + "learning_rate": 2.7216126834847805e-05, + "loss": 0.3715, + "step": 10780 + }, + { + "epoch": 0.47342942138599514, + "grad_norm": 1.8203125, + "learning_rate": 2.7209235701693663e-05, + "loss": 0.3626, + "step": 10782 + }, + { + "epoch": 0.4735172398651986, + "grad_norm": 1.796875, + "learning_rate": 2.7202344399355722e-05, + "loss": 0.3342, + "step": 10784 + }, + { + "epoch": 0.47360505834440214, + "grad_norm": 1.84375, + "learning_rate": 2.719545292836172e-05, + "loss": 0.3692, + "step": 10786 + }, + { + "epoch": 0.4736928768236056, + "grad_norm": 1.7421875, + "learning_rate": 2.7188561289239413e-05, + "loss": 0.3625, + "step": 10788 + }, + { + "epoch": 0.4737806953028091, + "grad_norm": 1.859375, + "learning_rate": 2.7181669482516546e-05, + "loss": 0.3626, + "step": 10790 + }, + { + "epoch": 0.47386851378201256, + "grad_norm": 1.890625, + "learning_rate": 2.7174777508720917e-05, + "loss": 0.3878, + "step": 10792 + }, + { + "epoch": 0.4739563322612161, + "grad_norm": 1.8125, + "learning_rate": 2.7167885368380304e-05, + "loss": 0.347, + "step": 10794 + }, + { + "epoch": 0.47404415074041956, + "grad_norm": 1.921875, + "learning_rate": 2.7160993062022504e-05, + "loss": 0.3518, + "step": 10796 + }, + { + "epoch": 0.47413196921962303, + "grad_norm": 1.953125, + "learning_rate": 2.715410059017534e-05, + "loss": 0.3369, + "step": 10798 + }, + { + "epoch": 0.4742197876988265, + "grad_norm": 1.890625, + "learning_rate": 2.7147207953366633e-05, + "loss": 0.3438, + "step": 10800 + }, + { + "epoch": 0.47430760617803003, + "grad_norm": 1.859375, + "learning_rate": 2.7140315152124233e-05, + "loss": 0.3813, + "step": 10802 + }, + { + "epoch": 0.4743954246572335, + "grad_norm": 1.8828125, + "learning_rate": 2.713342218697598e-05, + "loss": 0.3613, + "step": 10804 + }, + { + "epoch": 0.47448324313643697, + "grad_norm": 1.625, + "learning_rate": 2.7126529058449747e-05, + "loss": 0.3814, + "step": 10806 + }, + { + "epoch": 0.47457106161564044, + "grad_norm": 2.078125, + "learning_rate": 2.711963576707341e-05, + "loss": 0.3403, + "step": 10808 + }, + { + "epoch": 0.47465888009484397, + "grad_norm": 2.0, + "learning_rate": 2.7112742313374855e-05, + "loss": 0.3498, + "step": 10810 + }, + { + "epoch": 0.47474669857404744, + "grad_norm": 1.84375, + "learning_rate": 2.710584869788199e-05, + "loss": 0.3991, + "step": 10812 + }, + { + "epoch": 0.4748345170532509, + "grad_norm": 1.734375, + "learning_rate": 2.7098954921122726e-05, + "loss": 0.3418, + "step": 10814 + }, + { + "epoch": 0.47492233553245444, + "grad_norm": 1.75, + "learning_rate": 2.709206098362499e-05, + "loss": 0.3639, + "step": 10816 + }, + { + "epoch": 0.4750101540116579, + "grad_norm": 1.9609375, + "learning_rate": 2.7085166885916723e-05, + "loss": 0.3521, + "step": 10818 + }, + { + "epoch": 0.4750979724908614, + "grad_norm": 1.90625, + "learning_rate": 2.7078272628525875e-05, + "loss": 0.3859, + "step": 10820 + }, + { + "epoch": 0.47518579097006486, + "grad_norm": 1.78125, + "learning_rate": 2.707137821198041e-05, + "loss": 0.3664, + "step": 10822 + }, + { + "epoch": 0.4752736094492684, + "grad_norm": 1.890625, + "learning_rate": 2.7064483636808313e-05, + "loss": 0.3766, + "step": 10824 + }, + { + "epoch": 0.47536142792847186, + "grad_norm": 1.7265625, + "learning_rate": 2.705758890353756e-05, + "loss": 0.3534, + "step": 10826 + }, + { + "epoch": 0.47544924640767533, + "grad_norm": 1.859375, + "learning_rate": 2.7050694012696155e-05, + "loss": 0.3678, + "step": 10828 + }, + { + "epoch": 0.4755370648868788, + "grad_norm": 1.8359375, + "learning_rate": 2.7043798964812107e-05, + "loss": 0.3803, + "step": 10830 + }, + { + "epoch": 0.47562488336608233, + "grad_norm": 1.890625, + "learning_rate": 2.7036903760413447e-05, + "loss": 0.3502, + "step": 10832 + }, + { + "epoch": 0.4757127018452858, + "grad_norm": 1.9453125, + "learning_rate": 2.7030008400028205e-05, + "loss": 0.381, + "step": 10834 + }, + { + "epoch": 0.4758005203244893, + "grad_norm": 1.859375, + "learning_rate": 2.702311288418443e-05, + "loss": 0.3547, + "step": 10836 + }, + { + "epoch": 0.47588833880369275, + "grad_norm": 1.734375, + "learning_rate": 2.70162172134102e-05, + "loss": 0.3324, + "step": 10838 + }, + { + "epoch": 0.4759761572828963, + "grad_norm": 2.125, + "learning_rate": 2.7009321388233567e-05, + "loss": 0.3896, + "step": 10840 + }, + { + "epoch": 0.47606397576209974, + "grad_norm": 2.265625, + "learning_rate": 2.700242540918262e-05, + "loss": 0.3554, + "step": 10842 + }, + { + "epoch": 0.4761517942413032, + "grad_norm": 1.9609375, + "learning_rate": 2.6995529276785446e-05, + "loss": 0.3728, + "step": 10844 + }, + { + "epoch": 0.4762396127205067, + "grad_norm": 1.7421875, + "learning_rate": 2.6988632991570174e-05, + "loss": 0.364, + "step": 10846 + }, + { + "epoch": 0.4763274311997102, + "grad_norm": 1.875, + "learning_rate": 2.6981736554064903e-05, + "loss": 0.4019, + "step": 10848 + }, + { + "epoch": 0.4764152496789137, + "grad_norm": 1.9921875, + "learning_rate": 2.6974839964797776e-05, + "loss": 0.3606, + "step": 10850 + }, + { + "epoch": 0.47650306815811716, + "grad_norm": 1.828125, + "learning_rate": 2.6967943224296927e-05, + "loss": 0.3407, + "step": 10852 + }, + { + "epoch": 0.47659088663732063, + "grad_norm": 1.828125, + "learning_rate": 2.6961046333090517e-05, + "loss": 0.3656, + "step": 10854 + }, + { + "epoch": 0.47667870511652416, + "grad_norm": 1.9375, + "learning_rate": 2.695414929170671e-05, + "loss": 0.3354, + "step": 10856 + }, + { + "epoch": 0.47676652359572763, + "grad_norm": 1.8984375, + "learning_rate": 2.694725210067368e-05, + "loss": 0.3991, + "step": 10858 + }, + { + "epoch": 0.4768543420749311, + "grad_norm": 2.15625, + "learning_rate": 2.6940354760519616e-05, + "loss": 0.3437, + "step": 10860 + }, + { + "epoch": 0.47694216055413463, + "grad_norm": 1.9140625, + "learning_rate": 2.6933457271772718e-05, + "loss": 0.3562, + "step": 10862 + }, + { + "epoch": 0.4770299790333381, + "grad_norm": 1.921875, + "learning_rate": 2.692655963496121e-05, + "loss": 0.4094, + "step": 10864 + }, + { + "epoch": 0.4771177975125416, + "grad_norm": 1.859375, + "learning_rate": 2.6919661850613287e-05, + "loss": 0.3828, + "step": 10866 + }, + { + "epoch": 0.47720561599174505, + "grad_norm": 1.8984375, + "learning_rate": 2.6912763919257207e-05, + "loss": 0.3572, + "step": 10868 + }, + { + "epoch": 0.4772934344709486, + "grad_norm": 2.28125, + "learning_rate": 2.690586584142121e-05, + "loss": 0.3654, + "step": 10870 + }, + { + "epoch": 0.47738125295015205, + "grad_norm": 2.375, + "learning_rate": 2.6898967617633546e-05, + "loss": 0.3713, + "step": 10872 + }, + { + "epoch": 0.4774690714293555, + "grad_norm": 1.953125, + "learning_rate": 2.6892069248422486e-05, + "loss": 0.3445, + "step": 10874 + }, + { + "epoch": 0.477556889908559, + "grad_norm": 1.8203125, + "learning_rate": 2.6885170734316317e-05, + "loss": 0.36, + "step": 10876 + }, + { + "epoch": 0.4776447083877625, + "grad_norm": 1.9765625, + "learning_rate": 2.6878272075843313e-05, + "loss": 0.334, + "step": 10878 + }, + { + "epoch": 0.477732526866966, + "grad_norm": 1.875, + "learning_rate": 2.6871373273531785e-05, + "loss": 0.3442, + "step": 10880 + }, + { + "epoch": 0.47782034534616946, + "grad_norm": 1.890625, + "learning_rate": 2.6864474327910038e-05, + "loss": 0.3635, + "step": 10882 + }, + { + "epoch": 0.47790816382537293, + "grad_norm": 1.6796875, + "learning_rate": 2.6857575239506406e-05, + "loss": 0.3776, + "step": 10884 + }, + { + "epoch": 0.47799598230457646, + "grad_norm": 1.6953125, + "learning_rate": 2.6850676008849222e-05, + "loss": 0.3681, + "step": 10886 + }, + { + "epoch": 0.47808380078377993, + "grad_norm": 1.78125, + "learning_rate": 2.6843776636466828e-05, + "loss": 0.3486, + "step": 10888 + }, + { + "epoch": 0.4781716192629834, + "grad_norm": 1.75, + "learning_rate": 2.6836877122887565e-05, + "loss": 0.3536, + "step": 10890 + }, + { + "epoch": 0.4782594377421869, + "grad_norm": 1.7421875, + "learning_rate": 2.6829977468639815e-05, + "loss": 0.3377, + "step": 10892 + }, + { + "epoch": 0.4783472562213904, + "grad_norm": 2.4375, + "learning_rate": 2.682307767425195e-05, + "loss": 0.357, + "step": 10894 + }, + { + "epoch": 0.4784350747005939, + "grad_norm": 1.96875, + "learning_rate": 2.6816177740252362e-05, + "loss": 0.3781, + "step": 10896 + }, + { + "epoch": 0.47852289317979735, + "grad_norm": 1.71875, + "learning_rate": 2.6809277667169446e-05, + "loss": 0.3761, + "step": 10898 + }, + { + "epoch": 0.4786107116590008, + "grad_norm": 1.78125, + "learning_rate": 2.680237745553162e-05, + "loss": 0.3505, + "step": 10900 + }, + { + "epoch": 0.47869853013820435, + "grad_norm": 1.9765625, + "learning_rate": 2.6795477105867295e-05, + "loss": 0.3985, + "step": 10902 + }, + { + "epoch": 0.4787863486174078, + "grad_norm": 2.03125, + "learning_rate": 2.6788576618704897e-05, + "loss": 0.382, + "step": 10904 + }, + { + "epoch": 0.4788741670966113, + "grad_norm": 1.6875, + "learning_rate": 2.6781675994572876e-05, + "loss": 0.3692, + "step": 10906 + }, + { + "epoch": 0.47896198557581476, + "grad_norm": 2.046875, + "learning_rate": 2.677477523399967e-05, + "loss": 0.3521, + "step": 10908 + }, + { + "epoch": 0.4790498040550183, + "grad_norm": 2.328125, + "learning_rate": 2.676787433751376e-05, + "loss": 0.3662, + "step": 10910 + }, + { + "epoch": 0.47913762253422176, + "grad_norm": 2.234375, + "learning_rate": 2.6760973305643616e-05, + "loss": 0.3562, + "step": 10912 + }, + { + "epoch": 0.47922544101342524, + "grad_norm": 1.75, + "learning_rate": 2.6754072138917708e-05, + "loss": 0.362, + "step": 10914 + }, + { + "epoch": 0.47931325949262876, + "grad_norm": 2.265625, + "learning_rate": 2.674717083786454e-05, + "loss": 0.3507, + "step": 10916 + }, + { + "epoch": 0.47940107797183223, + "grad_norm": 3.125, + "learning_rate": 2.6740269403012595e-05, + "loss": 0.3584, + "step": 10918 + }, + { + "epoch": 0.4794888964510357, + "grad_norm": 1.796875, + "learning_rate": 2.6733367834890415e-05, + "loss": 0.376, + "step": 10920 + }, + { + "epoch": 0.4795767149302392, + "grad_norm": 1.75, + "learning_rate": 2.6726466134026507e-05, + "loss": 0.3462, + "step": 10922 + }, + { + "epoch": 0.4796645334094427, + "grad_norm": 1.7265625, + "learning_rate": 2.6719564300949413e-05, + "loss": 0.3701, + "step": 10924 + }, + { + "epoch": 0.4797523518886462, + "grad_norm": 1.9921875, + "learning_rate": 2.6712662336187667e-05, + "loss": 0.3719, + "step": 10926 + }, + { + "epoch": 0.47984017036784965, + "grad_norm": 1.9921875, + "learning_rate": 2.6705760240269828e-05, + "loss": 0.3555, + "step": 10928 + }, + { + "epoch": 0.4799279888470531, + "grad_norm": 1.78125, + "learning_rate": 2.6698858013724466e-05, + "loss": 0.365, + "step": 10930 + }, + { + "epoch": 0.48001580732625665, + "grad_norm": 1.7734375, + "learning_rate": 2.669195565708015e-05, + "loss": 0.3934, + "step": 10932 + }, + { + "epoch": 0.4801036258054601, + "grad_norm": 1.96875, + "learning_rate": 2.668505317086546e-05, + "loss": 0.3618, + "step": 10934 + }, + { + "epoch": 0.4801914442846636, + "grad_norm": 2.171875, + "learning_rate": 2.6678150555608993e-05, + "loss": 0.3615, + "step": 10936 + }, + { + "epoch": 0.48027926276386707, + "grad_norm": 2.25, + "learning_rate": 2.667124781183936e-05, + "loss": 0.3968, + "step": 10938 + }, + { + "epoch": 0.4803670812430706, + "grad_norm": 1.671875, + "learning_rate": 2.6664344940085166e-05, + "loss": 0.3858, + "step": 10940 + }, + { + "epoch": 0.48045489972227406, + "grad_norm": 2.046875, + "learning_rate": 2.665744194087503e-05, + "loss": 0.3488, + "step": 10942 + }, + { + "epoch": 0.48054271820147754, + "grad_norm": 2.40625, + "learning_rate": 2.6650538814737603e-05, + "loss": 0.3385, + "step": 10944 + }, + { + "epoch": 0.480630536680681, + "grad_norm": 1.890625, + "learning_rate": 2.6643635562201513e-05, + "loss": 0.3834, + "step": 10946 + }, + { + "epoch": 0.48071835515988454, + "grad_norm": 1.9375, + "learning_rate": 2.6636732183795426e-05, + "loss": 0.3588, + "step": 10948 + }, + { + "epoch": 0.480806173639088, + "grad_norm": 1.9140625, + "learning_rate": 2.6629828680047998e-05, + "loss": 0.3857, + "step": 10950 + }, + { + "epoch": 0.4808939921182915, + "grad_norm": 2.234375, + "learning_rate": 2.6622925051487884e-05, + "loss": 0.3765, + "step": 10952 + }, + { + "epoch": 0.48098181059749495, + "grad_norm": 2.421875, + "learning_rate": 2.6616021298643788e-05, + "loss": 0.3642, + "step": 10954 + }, + { + "epoch": 0.4810696290766985, + "grad_norm": 1.8046875, + "learning_rate": 2.6609117422044394e-05, + "loss": 0.3503, + "step": 10956 + }, + { + "epoch": 0.48115744755590195, + "grad_norm": 2.125, + "learning_rate": 2.66022134222184e-05, + "loss": 0.3387, + "step": 10958 + }, + { + "epoch": 0.4812452660351054, + "grad_norm": 2.109375, + "learning_rate": 2.659530929969452e-05, + "loss": 0.3796, + "step": 10960 + }, + { + "epoch": 0.48133308451430895, + "grad_norm": 1.9296875, + "learning_rate": 2.658840505500147e-05, + "loss": 0.3883, + "step": 10962 + }, + { + "epoch": 0.4814209029935124, + "grad_norm": 2.4375, + "learning_rate": 2.658150068866798e-05, + "loss": 0.3505, + "step": 10964 + }, + { + "epoch": 0.4815087214727159, + "grad_norm": 1.5859375, + "learning_rate": 2.657459620122279e-05, + "loss": 0.3476, + "step": 10966 + }, + { + "epoch": 0.48159653995191937, + "grad_norm": 2.046875, + "learning_rate": 2.6567691593194644e-05, + "loss": 0.3604, + "step": 10968 + }, + { + "epoch": 0.4816843584311229, + "grad_norm": 1.8828125, + "learning_rate": 2.6560786865112296e-05, + "loss": 0.3394, + "step": 10970 + }, + { + "epoch": 0.48177217691032637, + "grad_norm": 1.890625, + "learning_rate": 2.6553882017504523e-05, + "loss": 0.3632, + "step": 10972 + }, + { + "epoch": 0.48185999538952984, + "grad_norm": 1.765625, + "learning_rate": 2.654697705090009e-05, + "loss": 0.3815, + "step": 10974 + }, + { + "epoch": 0.4819478138687333, + "grad_norm": 1.9296875, + "learning_rate": 2.6540071965827784e-05, + "loss": 0.3448, + "step": 10976 + }, + { + "epoch": 0.48203563234793684, + "grad_norm": 1.8671875, + "learning_rate": 2.6533166762816404e-05, + "loss": 0.3669, + "step": 10978 + }, + { + "epoch": 0.4821234508271403, + "grad_norm": 2.109375, + "learning_rate": 2.6526261442394734e-05, + "loss": 0.3838, + "step": 10980 + }, + { + "epoch": 0.4822112693063438, + "grad_norm": 2.03125, + "learning_rate": 2.65193560050916e-05, + "loss": 0.3641, + "step": 10982 + }, + { + "epoch": 0.48229908778554725, + "grad_norm": 1.7421875, + "learning_rate": 2.6512450451435822e-05, + "loss": 0.3629, + "step": 10984 + }, + { + "epoch": 0.4823869062647508, + "grad_norm": 2.03125, + "learning_rate": 2.6505544781956225e-05, + "loss": 0.3732, + "step": 10986 + }, + { + "epoch": 0.48247472474395425, + "grad_norm": 1.7734375, + "learning_rate": 2.6498638997181648e-05, + "loss": 0.3918, + "step": 10988 + }, + { + "epoch": 0.4825625432231577, + "grad_norm": 1.8515625, + "learning_rate": 2.649173309764093e-05, + "loss": 0.3696, + "step": 10990 + }, + { + "epoch": 0.4826503617023612, + "grad_norm": 1.8515625, + "learning_rate": 2.648482708386294e-05, + "loss": 0.3544, + "step": 10992 + }, + { + "epoch": 0.4827381801815647, + "grad_norm": 1.734375, + "learning_rate": 2.647792095637654e-05, + "loss": 0.3607, + "step": 10994 + }, + { + "epoch": 0.4828259986607682, + "grad_norm": 2.140625, + "learning_rate": 2.6471014715710595e-05, + "loss": 0.3559, + "step": 10996 + }, + { + "epoch": 0.48291381713997167, + "grad_norm": 1.8515625, + "learning_rate": 2.646410836239399e-05, + "loss": 0.3294, + "step": 10998 + }, + { + "epoch": 0.48300163561917514, + "grad_norm": 1.734375, + "learning_rate": 2.645720189695562e-05, + "loss": 0.3633, + "step": 11000 + }, + { + "epoch": 0.48308945409837867, + "grad_norm": 1.9140625, + "learning_rate": 2.6450295319924374e-05, + "loss": 0.359, + "step": 11002 + }, + { + "epoch": 0.48317727257758214, + "grad_norm": 1.953125, + "learning_rate": 2.6443388631829162e-05, + "loss": 0.3626, + "step": 11004 + }, + { + "epoch": 0.4832650910567856, + "grad_norm": 1.84375, + "learning_rate": 2.6436481833198905e-05, + "loss": 0.3442, + "step": 11006 + }, + { + "epoch": 0.4833529095359891, + "grad_norm": 1.8203125, + "learning_rate": 2.6429574924562534e-05, + "loss": 0.3785, + "step": 11008 + }, + { + "epoch": 0.4834407280151926, + "grad_norm": 1.71875, + "learning_rate": 2.642266790644896e-05, + "loss": 0.3794, + "step": 11010 + }, + { + "epoch": 0.4835285464943961, + "grad_norm": 2.015625, + "learning_rate": 2.641576077938715e-05, + "loss": 0.3482, + "step": 11012 + }, + { + "epoch": 0.48361636497359956, + "grad_norm": 1.875, + "learning_rate": 2.640885354390603e-05, + "loss": 0.3612, + "step": 11014 + }, + { + "epoch": 0.4837041834528031, + "grad_norm": 1.875, + "learning_rate": 2.6401946200534567e-05, + "loss": 0.3577, + "step": 11016 + }, + { + "epoch": 0.48379200193200655, + "grad_norm": 1.9296875, + "learning_rate": 2.639503874980173e-05, + "loss": 0.3491, + "step": 11018 + }, + { + "epoch": 0.48387982041121, + "grad_norm": 1.78125, + "learning_rate": 2.6388131192236493e-05, + "loss": 0.372, + "step": 11020 + }, + { + "epoch": 0.4839676388904135, + "grad_norm": 1.7421875, + "learning_rate": 2.6381223528367836e-05, + "loss": 0.3848, + "step": 11022 + }, + { + "epoch": 0.484055457369617, + "grad_norm": 1.8359375, + "learning_rate": 2.6374315758724755e-05, + "loss": 0.3491, + "step": 11024 + }, + { + "epoch": 0.4841432758488205, + "grad_norm": 1.859375, + "learning_rate": 2.6367407883836238e-05, + "loss": 0.3776, + "step": 11026 + }, + { + "epoch": 0.48423109432802397, + "grad_norm": 1.8359375, + "learning_rate": 2.6360499904231297e-05, + "loss": 0.3584, + "step": 11028 + }, + { + "epoch": 0.48431891280722744, + "grad_norm": 1.703125, + "learning_rate": 2.6353591820438945e-05, + "loss": 0.356, + "step": 11030 + }, + { + "epoch": 0.48440673128643097, + "grad_norm": 1.8515625, + "learning_rate": 2.6346683632988213e-05, + "loss": 0.3673, + "step": 11032 + }, + { + "epoch": 0.48449454976563444, + "grad_norm": 1.8671875, + "learning_rate": 2.633977534240812e-05, + "loss": 0.3659, + "step": 11034 + }, + { + "epoch": 0.4845823682448379, + "grad_norm": 1.84375, + "learning_rate": 2.6332866949227713e-05, + "loss": 0.3441, + "step": 11036 + }, + { + "epoch": 0.4846701867240414, + "grad_norm": 1.828125, + "learning_rate": 2.6325958453976036e-05, + "loss": 0.374, + "step": 11038 + }, + { + "epoch": 0.4847580052032449, + "grad_norm": 1.7265625, + "learning_rate": 2.6319049857182144e-05, + "loss": 0.3385, + "step": 11040 + }, + { + "epoch": 0.4848458236824484, + "grad_norm": 2.03125, + "learning_rate": 2.631214115937509e-05, + "loss": 0.3632, + "step": 11042 + }, + { + "epoch": 0.48493364216165186, + "grad_norm": 1.859375, + "learning_rate": 2.630523236108396e-05, + "loss": 0.3257, + "step": 11044 + }, + { + "epoch": 0.48502146064085533, + "grad_norm": 1.7578125, + "learning_rate": 2.629832346283782e-05, + "loss": 0.3693, + "step": 11046 + }, + { + "epoch": 0.48510927912005886, + "grad_norm": 2.15625, + "learning_rate": 2.629141446516576e-05, + "loss": 0.3815, + "step": 11048 + }, + { + "epoch": 0.48519709759926233, + "grad_norm": 1.984375, + "learning_rate": 2.628450536859687e-05, + "loss": 0.3517, + "step": 11050 + }, + { + "epoch": 0.4852849160784658, + "grad_norm": 1.75, + "learning_rate": 2.6277596173660246e-05, + "loss": 0.3754, + "step": 11052 + }, + { + "epoch": 0.48537273455766927, + "grad_norm": 1.75, + "learning_rate": 2.627068688088501e-05, + "loss": 0.401, + "step": 11054 + }, + { + "epoch": 0.4854605530368728, + "grad_norm": 1.7578125, + "learning_rate": 2.6263777490800263e-05, + "loss": 0.3613, + "step": 11056 + }, + { + "epoch": 0.48554837151607627, + "grad_norm": 1.6875, + "learning_rate": 2.6256868003935127e-05, + "loss": 0.3688, + "step": 11058 + }, + { + "epoch": 0.48563618999527974, + "grad_norm": 1.75, + "learning_rate": 2.6249958420818744e-05, + "loss": 0.3926, + "step": 11060 + }, + { + "epoch": 0.48572400847448327, + "grad_norm": 1.6796875, + "learning_rate": 2.6243048741980246e-05, + "loss": 0.3964, + "step": 11062 + }, + { + "epoch": 0.48581182695368674, + "grad_norm": 1.7578125, + "learning_rate": 2.6236138967948776e-05, + "loss": 0.3771, + "step": 11064 + }, + { + "epoch": 0.4858996454328902, + "grad_norm": 1.859375, + "learning_rate": 2.622922909925349e-05, + "loss": 0.3446, + "step": 11066 + }, + { + "epoch": 0.4859874639120937, + "grad_norm": 1.9140625, + "learning_rate": 2.6222319136423533e-05, + "loss": 0.337, + "step": 11068 + }, + { + "epoch": 0.4860752823912972, + "grad_norm": 1.8125, + "learning_rate": 2.62154090799881e-05, + "loss": 0.3735, + "step": 11070 + }, + { + "epoch": 0.4861631008705007, + "grad_norm": 1.7421875, + "learning_rate": 2.6208498930476337e-05, + "loss": 0.3807, + "step": 11072 + }, + { + "epoch": 0.48625091934970416, + "grad_norm": 1.671875, + "learning_rate": 2.620158868841745e-05, + "loss": 0.3486, + "step": 11074 + }, + { + "epoch": 0.48633873782890763, + "grad_norm": 1.6640625, + "learning_rate": 2.6194678354340602e-05, + "loss": 0.3378, + "step": 11076 + }, + { + "epoch": 0.48642655630811116, + "grad_norm": 1.8359375, + "learning_rate": 2.6187767928774998e-05, + "loss": 0.3508, + "step": 11078 + }, + { + "epoch": 0.48651437478731463, + "grad_norm": 1.8359375, + "learning_rate": 2.6180857412249842e-05, + "loss": 0.3512, + "step": 11080 + }, + { + "epoch": 0.4866021932665181, + "grad_norm": 1.75, + "learning_rate": 2.6173946805294348e-05, + "loss": 0.338, + "step": 11082 + }, + { + "epoch": 0.4866900117457216, + "grad_norm": 1.859375, + "learning_rate": 2.6167036108437724e-05, + "loss": 0.3863, + "step": 11084 + }, + { + "epoch": 0.4867778302249251, + "grad_norm": 1.7734375, + "learning_rate": 2.6160125322209194e-05, + "loss": 0.3513, + "step": 11086 + }, + { + "epoch": 0.4868656487041286, + "grad_norm": 1.8515625, + "learning_rate": 2.615321444713799e-05, + "loss": 0.3806, + "step": 11088 + }, + { + "epoch": 0.48695346718333204, + "grad_norm": 2.0, + "learning_rate": 2.6146303483753343e-05, + "loss": 0.3856, + "step": 11090 + }, + { + "epoch": 0.4870412856625355, + "grad_norm": 1.71875, + "learning_rate": 2.6139392432584504e-05, + "loss": 0.3408, + "step": 11092 + }, + { + "epoch": 0.48712910414173904, + "grad_norm": 1.8203125, + "learning_rate": 2.6132481294160715e-05, + "loss": 0.3723, + "step": 11094 + }, + { + "epoch": 0.4872169226209425, + "grad_norm": 2.125, + "learning_rate": 2.612557006901124e-05, + "loss": 0.3541, + "step": 11096 + }, + { + "epoch": 0.487304741100146, + "grad_norm": 1.7109375, + "learning_rate": 2.6118658757665343e-05, + "loss": 0.37, + "step": 11098 + }, + { + "epoch": 0.48739255957934946, + "grad_norm": 1.9453125, + "learning_rate": 2.6111747360652295e-05, + "loss": 0.3398, + "step": 11100 + }, + { + "epoch": 0.487480378058553, + "grad_norm": 1.671875, + "learning_rate": 2.6104835878501353e-05, + "loss": 0.3591, + "step": 11102 + }, + { + "epoch": 0.48756819653775646, + "grad_norm": 1.9453125, + "learning_rate": 2.609792431174182e-05, + "loss": 0.3655, + "step": 11104 + }, + { + "epoch": 0.48765601501695993, + "grad_norm": 1.7109375, + "learning_rate": 2.609101266090298e-05, + "loss": 0.3481, + "step": 11106 + }, + { + "epoch": 0.4877438334961634, + "grad_norm": 1.9765625, + "learning_rate": 2.608410092651413e-05, + "loss": 0.372, + "step": 11108 + }, + { + "epoch": 0.48783165197536693, + "grad_norm": 1.9140625, + "learning_rate": 2.6077189109104577e-05, + "loss": 0.3566, + "step": 11110 + }, + { + "epoch": 0.4879194704545704, + "grad_norm": 2.09375, + "learning_rate": 2.6070277209203613e-05, + "loss": 0.352, + "step": 11112 + }, + { + "epoch": 0.4880072889337739, + "grad_norm": 1.828125, + "learning_rate": 2.606336522734057e-05, + "loss": 0.3427, + "step": 11114 + }, + { + "epoch": 0.4880951074129774, + "grad_norm": 1.8203125, + "learning_rate": 2.605645316404477e-05, + "loss": 0.3579, + "step": 11116 + }, + { + "epoch": 0.4881829258921809, + "grad_norm": 2.015625, + "learning_rate": 2.604954101984552e-05, + "loss": 0.3444, + "step": 11118 + }, + { + "epoch": 0.48827074437138435, + "grad_norm": 1.8671875, + "learning_rate": 2.6042628795272174e-05, + "loss": 0.345, + "step": 11120 + }, + { + "epoch": 0.4883585628505878, + "grad_norm": 1.8046875, + "learning_rate": 2.6035716490854067e-05, + "loss": 0.3319, + "step": 11122 + }, + { + "epoch": 0.48844638132979135, + "grad_norm": 1.75, + "learning_rate": 2.6028804107120535e-05, + "loss": 0.3964, + "step": 11124 + }, + { + "epoch": 0.4885341998089948, + "grad_norm": 1.6875, + "learning_rate": 2.6021891644600944e-05, + "loss": 0.3471, + "step": 11126 + }, + { + "epoch": 0.4886220182881983, + "grad_norm": 2.015625, + "learning_rate": 2.6014979103824637e-05, + "loss": 0.3365, + "step": 11128 + }, + { + "epoch": 0.48870983676740176, + "grad_norm": 1.640625, + "learning_rate": 2.6008066485320998e-05, + "loss": 0.3383, + "step": 11130 + }, + { + "epoch": 0.4887976552466053, + "grad_norm": 2.125, + "learning_rate": 2.6001153789619383e-05, + "loss": 0.3568, + "step": 11132 + }, + { + "epoch": 0.48888547372580876, + "grad_norm": 1.8359375, + "learning_rate": 2.5994241017249167e-05, + "loss": 0.3439, + "step": 11134 + }, + { + "epoch": 0.48897329220501223, + "grad_norm": 2.1875, + "learning_rate": 2.5987328168739745e-05, + "loss": 0.3543, + "step": 11136 + }, + { + "epoch": 0.4890611106842157, + "grad_norm": 1.8984375, + "learning_rate": 2.5980415244620486e-05, + "loss": 0.3801, + "step": 11138 + }, + { + "epoch": 0.48914892916341923, + "grad_norm": 1.8359375, + "learning_rate": 2.5973502245420793e-05, + "loss": 0.3759, + "step": 11140 + }, + { + "epoch": 0.4892367476426227, + "grad_norm": 1.6953125, + "learning_rate": 2.5966589171670064e-05, + "loss": 0.3677, + "step": 11142 + }, + { + "epoch": 0.4893245661218262, + "grad_norm": 1.9453125, + "learning_rate": 2.5959676023897706e-05, + "loss": 0.3884, + "step": 11144 + }, + { + "epoch": 0.48941238460102965, + "grad_norm": 1.75, + "learning_rate": 2.5952762802633136e-05, + "loss": 0.3424, + "step": 11146 + }, + { + "epoch": 0.4895002030802332, + "grad_norm": 1.734375, + "learning_rate": 2.5945849508405755e-05, + "loss": 0.3576, + "step": 11148 + }, + { + "epoch": 0.48958802155943665, + "grad_norm": 1.7421875, + "learning_rate": 2.593893614174499e-05, + "loss": 0.3672, + "step": 11150 + }, + { + "epoch": 0.4896758400386401, + "grad_norm": 1.796875, + "learning_rate": 2.5932022703180277e-05, + "loss": 0.3371, + "step": 11152 + }, + { + "epoch": 0.4897636585178436, + "grad_norm": 1.6953125, + "learning_rate": 2.592510919324103e-05, + "loss": 0.3762, + "step": 11154 + }, + { + "epoch": 0.4898514769970471, + "grad_norm": 1.65625, + "learning_rate": 2.591819561245671e-05, + "loss": 0.3559, + "step": 11156 + }, + { + "epoch": 0.4899392954762506, + "grad_norm": 1.8515625, + "learning_rate": 2.591128196135675e-05, + "loss": 0.3713, + "step": 11158 + }, + { + "epoch": 0.49002711395545406, + "grad_norm": 2.046875, + "learning_rate": 2.5904368240470595e-05, + "loss": 0.3572, + "step": 11160 + }, + { + "epoch": 0.49011493243465754, + "grad_norm": 1.8984375, + "learning_rate": 2.5897454450327707e-05, + "loss": 0.3401, + "step": 11162 + }, + { + "epoch": 0.49020275091386106, + "grad_norm": 2.15625, + "learning_rate": 2.5890540591457535e-05, + "loss": 0.3697, + "step": 11164 + }, + { + "epoch": 0.49029056939306453, + "grad_norm": 1.7265625, + "learning_rate": 2.5883626664389556e-05, + "loss": 0.3574, + "step": 11166 + }, + { + "epoch": 0.490378387872268, + "grad_norm": 1.7578125, + "learning_rate": 2.5876712669653232e-05, + "loss": 0.3785, + "step": 11168 + }, + { + "epoch": 0.49046620635147153, + "grad_norm": 1.7578125, + "learning_rate": 2.586979860777804e-05, + "loss": 0.3473, + "step": 11170 + }, + { + "epoch": 0.490554024830675, + "grad_norm": 1.609375, + "learning_rate": 2.586288447929347e-05, + "loss": 0.349, + "step": 11172 + }, + { + "epoch": 0.4906418433098785, + "grad_norm": 1.8671875, + "learning_rate": 2.5855970284728993e-05, + "loss": 0.3541, + "step": 11174 + }, + { + "epoch": 0.49072966178908195, + "grad_norm": 1.8515625, + "learning_rate": 2.584905602461411e-05, + "loss": 0.3555, + "step": 11176 + }, + { + "epoch": 0.4908174802682855, + "grad_norm": 1.8125, + "learning_rate": 2.5842141699478317e-05, + "loss": 0.3449, + "step": 11178 + }, + { + "epoch": 0.49090529874748895, + "grad_norm": 1.6953125, + "learning_rate": 2.58352273098511e-05, + "loss": 0.3378, + "step": 11180 + }, + { + "epoch": 0.4909931172266924, + "grad_norm": 1.7890625, + "learning_rate": 2.5828312856261982e-05, + "loss": 0.3507, + "step": 11182 + }, + { + "epoch": 0.4910809357058959, + "grad_norm": 1.90625, + "learning_rate": 2.582139833924047e-05, + "loss": 0.3718, + "step": 11184 + }, + { + "epoch": 0.4911687541850994, + "grad_norm": 2.03125, + "learning_rate": 2.581448375931607e-05, + "loss": 0.3828, + "step": 11186 + }, + { + "epoch": 0.4912565726643029, + "grad_norm": 1.8515625, + "learning_rate": 2.580756911701831e-05, + "loss": 0.3782, + "step": 11188 + }, + { + "epoch": 0.49134439114350636, + "grad_norm": 2.0, + "learning_rate": 2.5800654412876713e-05, + "loss": 0.3224, + "step": 11190 + }, + { + "epoch": 0.49143220962270984, + "grad_norm": 1.8671875, + "learning_rate": 2.5793739647420817e-05, + "loss": 0.3679, + "step": 11192 + }, + { + "epoch": 0.49152002810191336, + "grad_norm": 1.765625, + "learning_rate": 2.578682482118014e-05, + "loss": 0.3788, + "step": 11194 + }, + { + "epoch": 0.49160784658111684, + "grad_norm": 1.9609375, + "learning_rate": 2.5779909934684227e-05, + "loss": 0.3789, + "step": 11196 + }, + { + "epoch": 0.4916956650603203, + "grad_norm": 1.9609375, + "learning_rate": 2.5772994988462634e-05, + "loss": 0.3832, + "step": 11198 + }, + { + "epoch": 0.4917834835395238, + "grad_norm": 1.8671875, + "learning_rate": 2.5766079983044893e-05, + "loss": 0.3572, + "step": 11200 + }, + { + "epoch": 0.4918713020187273, + "grad_norm": 1.703125, + "learning_rate": 2.5759164918960567e-05, + "loss": 0.3766, + "step": 11202 + }, + { + "epoch": 0.4919591204979308, + "grad_norm": 1.75, + "learning_rate": 2.5752249796739202e-05, + "loss": 0.3484, + "step": 11204 + }, + { + "epoch": 0.49204693897713425, + "grad_norm": 1.9921875, + "learning_rate": 2.574533461691037e-05, + "loss": 0.3933, + "step": 11206 + }, + { + "epoch": 0.4921347574563377, + "grad_norm": 2.078125, + "learning_rate": 2.5738419380003642e-05, + "loss": 0.346, + "step": 11208 + }, + { + "epoch": 0.49222257593554125, + "grad_norm": 2.28125, + "learning_rate": 2.573150408654858e-05, + "loss": 0.3779, + "step": 11210 + }, + { + "epoch": 0.4923103944147447, + "grad_norm": 1.7734375, + "learning_rate": 2.572458873707475e-05, + "loss": 0.3431, + "step": 11212 + }, + { + "epoch": 0.4923982128939482, + "grad_norm": 1.765625, + "learning_rate": 2.5717673332111748e-05, + "loss": 0.3638, + "step": 11214 + }, + { + "epoch": 0.4924860313731517, + "grad_norm": 1.8515625, + "learning_rate": 2.5710757872189145e-05, + "loss": 0.3546, + "step": 11216 + }, + { + "epoch": 0.4925738498523552, + "grad_norm": 1.5859375, + "learning_rate": 2.5703842357836537e-05, + "loss": 0.3372, + "step": 11218 + }, + { + "epoch": 0.49266166833155867, + "grad_norm": 1.7421875, + "learning_rate": 2.5696926789583513e-05, + "loss": 0.3499, + "step": 11220 + }, + { + "epoch": 0.49274948681076214, + "grad_norm": 2.0, + "learning_rate": 2.569001116795967e-05, + "loss": 0.3435, + "step": 11222 + }, + { + "epoch": 0.49283730528996567, + "grad_norm": 1.875, + "learning_rate": 2.5683095493494607e-05, + "loss": 0.3628, + "step": 11224 + }, + { + "epoch": 0.49292512376916914, + "grad_norm": 1.8203125, + "learning_rate": 2.5676179766717923e-05, + "loss": 0.3492, + "step": 11226 + }, + { + "epoch": 0.4930129422483726, + "grad_norm": 1.6640625, + "learning_rate": 2.566926398815923e-05, + "loss": 0.3667, + "step": 11228 + }, + { + "epoch": 0.4931007607275761, + "grad_norm": 1.6484375, + "learning_rate": 2.5662348158348142e-05, + "loss": 0.3326, + "step": 11230 + }, + { + "epoch": 0.4931885792067796, + "grad_norm": 1.78125, + "learning_rate": 2.5655432277814273e-05, + "loss": 0.3653, + "step": 11232 + }, + { + "epoch": 0.4932763976859831, + "grad_norm": 1.7109375, + "learning_rate": 2.5648516347087248e-05, + "loss": 0.3682, + "step": 11234 + }, + { + "epoch": 0.49336421616518655, + "grad_norm": 1.96875, + "learning_rate": 2.564160036669668e-05, + "loss": 0.3406, + "step": 11236 + }, + { + "epoch": 0.49345203464439, + "grad_norm": 1.90625, + "learning_rate": 2.5634684337172205e-05, + "loss": 0.3647, + "step": 11238 + }, + { + "epoch": 0.49353985312359355, + "grad_norm": 1.8046875, + "learning_rate": 2.562776825904345e-05, + "loss": 0.3419, + "step": 11240 + }, + { + "epoch": 0.493627671602797, + "grad_norm": 1.828125, + "learning_rate": 2.562085213284005e-05, + "loss": 0.3472, + "step": 11242 + }, + { + "epoch": 0.4937154900820005, + "grad_norm": 1.8125, + "learning_rate": 2.5613935959091647e-05, + "loss": 0.3834, + "step": 11244 + }, + { + "epoch": 0.49380330856120397, + "grad_norm": 1.9609375, + "learning_rate": 2.5607019738327882e-05, + "loss": 0.3647, + "step": 11246 + }, + { + "epoch": 0.4938911270404075, + "grad_norm": 1.78125, + "learning_rate": 2.5600103471078397e-05, + "loss": 0.3764, + "step": 11248 + }, + { + "epoch": 0.49397894551961097, + "grad_norm": 1.8203125, + "learning_rate": 2.5593187157872845e-05, + "loss": 0.364, + "step": 11250 + }, + { + "epoch": 0.49406676399881444, + "grad_norm": 1.6953125, + "learning_rate": 2.5586270799240876e-05, + "loss": 0.3456, + "step": 11252 + }, + { + "epoch": 0.4941545824780179, + "grad_norm": 1.6640625, + "learning_rate": 2.557935439571215e-05, + "loss": 0.3279, + "step": 11254 + }, + { + "epoch": 0.49424240095722144, + "grad_norm": 1.6875, + "learning_rate": 2.557243794781633e-05, + "loss": 0.3656, + "step": 11256 + }, + { + "epoch": 0.4943302194364249, + "grad_norm": 1.6953125, + "learning_rate": 2.556552145608307e-05, + "loss": 0.3393, + "step": 11258 + }, + { + "epoch": 0.4944180379156284, + "grad_norm": 1.6796875, + "learning_rate": 2.5558604921042045e-05, + "loss": 0.3982, + "step": 11260 + }, + { + "epoch": 0.49450585639483186, + "grad_norm": 1.703125, + "learning_rate": 2.555168834322292e-05, + "loss": 0.3508, + "step": 11262 + }, + { + "epoch": 0.4945936748740354, + "grad_norm": 1.7890625, + "learning_rate": 2.5544771723155365e-05, + "loss": 0.3574, + "step": 11264 + }, + { + "epoch": 0.49468149335323885, + "grad_norm": 1.703125, + "learning_rate": 2.553785506136906e-05, + "loss": 0.3464, + "step": 11266 + }, + { + "epoch": 0.4947693118324423, + "grad_norm": 1.8125, + "learning_rate": 2.553093835839369e-05, + "loss": 0.3589, + "step": 11268 + }, + { + "epoch": 0.49485713031164585, + "grad_norm": 1.78125, + "learning_rate": 2.5524021614758934e-05, + "loss": 0.3419, + "step": 11270 + }, + { + "epoch": 0.4949449487908493, + "grad_norm": 1.734375, + "learning_rate": 2.551710483099448e-05, + "loss": 0.3627, + "step": 11272 + }, + { + "epoch": 0.4950327672700528, + "grad_norm": 1.6796875, + "learning_rate": 2.551018800763001e-05, + "loss": 0.3796, + "step": 11274 + }, + { + "epoch": 0.49512058574925627, + "grad_norm": 1.625, + "learning_rate": 2.5503271145195217e-05, + "loss": 0.3389, + "step": 11276 + }, + { + "epoch": 0.4952084042284598, + "grad_norm": 1.609375, + "learning_rate": 2.5496354244219805e-05, + "loss": 0.3406, + "step": 11278 + }, + { + "epoch": 0.49529622270766327, + "grad_norm": 1.734375, + "learning_rate": 2.548943730523346e-05, + "loss": 0.3669, + "step": 11280 + }, + { + "epoch": 0.49538404118686674, + "grad_norm": 1.7890625, + "learning_rate": 2.5482520328765898e-05, + "loss": 0.3812, + "step": 11282 + }, + { + "epoch": 0.4954718596660702, + "grad_norm": 1.8125, + "learning_rate": 2.5475603315346807e-05, + "loss": 0.3608, + "step": 11284 + }, + { + "epoch": 0.49555967814527374, + "grad_norm": 1.765625, + "learning_rate": 2.5468686265505903e-05, + "loss": 0.3618, + "step": 11286 + }, + { + "epoch": 0.4956474966244772, + "grad_norm": 1.7265625, + "learning_rate": 2.5461769179772886e-05, + "loss": 0.3699, + "step": 11288 + }, + { + "epoch": 0.4957353151036807, + "grad_norm": 1.84375, + "learning_rate": 2.545485205867748e-05, + "loss": 0.3482, + "step": 11290 + }, + { + "epoch": 0.49582313358288416, + "grad_norm": 1.765625, + "learning_rate": 2.5447934902749393e-05, + "loss": 0.3649, + "step": 11292 + }, + { + "epoch": 0.4959109520620877, + "grad_norm": 2.078125, + "learning_rate": 2.5441017712518337e-05, + "loss": 0.3537, + "step": 11294 + }, + { + "epoch": 0.49599877054129116, + "grad_norm": 2.0, + "learning_rate": 2.5434100488514053e-05, + "loss": 0.3684, + "step": 11296 + }, + { + "epoch": 0.49608658902049463, + "grad_norm": 1.8046875, + "learning_rate": 2.542718323126624e-05, + "loss": 0.3429, + "step": 11298 + }, + { + "epoch": 0.4961744074996981, + "grad_norm": 1.6796875, + "learning_rate": 2.542026594130464e-05, + "loss": 0.3532, + "step": 11300 + }, + { + "epoch": 0.4962622259789016, + "grad_norm": 1.796875, + "learning_rate": 2.5413348619158967e-05, + "loss": 0.3644, + "step": 11302 + }, + { + "epoch": 0.4963500444581051, + "grad_norm": 1.7734375, + "learning_rate": 2.5406431265358955e-05, + "loss": 0.3644, + "step": 11304 + }, + { + "epoch": 0.49643786293730857, + "grad_norm": 1.78125, + "learning_rate": 2.5399513880434345e-05, + "loss": 0.3695, + "step": 11306 + }, + { + "epoch": 0.49652568141651204, + "grad_norm": 1.8203125, + "learning_rate": 2.5392596464914864e-05, + "loss": 0.3724, + "step": 11308 + }, + { + "epoch": 0.49661349989571557, + "grad_norm": 1.8515625, + "learning_rate": 2.5385679019330257e-05, + "loss": 0.3812, + "step": 11310 + }, + { + "epoch": 0.49670131837491904, + "grad_norm": 2.015625, + "learning_rate": 2.537876154421025e-05, + "loss": 0.3289, + "step": 11312 + }, + { + "epoch": 0.4967891368541225, + "grad_norm": 1.828125, + "learning_rate": 2.537184404008459e-05, + "loss": 0.3673, + "step": 11314 + }, + { + "epoch": 0.49687695533332604, + "grad_norm": 1.9609375, + "learning_rate": 2.536492650748304e-05, + "loss": 0.3545, + "step": 11316 + }, + { + "epoch": 0.4969647738125295, + "grad_norm": 2.0, + "learning_rate": 2.535800894693532e-05, + "loss": 0.3674, + "step": 11318 + }, + { + "epoch": 0.497052592291733, + "grad_norm": 1.9765625, + "learning_rate": 2.535109135897119e-05, + "loss": 0.3536, + "step": 11320 + }, + { + "epoch": 0.49714041077093646, + "grad_norm": 1.7265625, + "learning_rate": 2.5344173744120402e-05, + "loss": 0.3547, + "step": 11322 + }, + { + "epoch": 0.49722822925014, + "grad_norm": 1.8359375, + "learning_rate": 2.53372561029127e-05, + "loss": 0.3864, + "step": 11324 + }, + { + "epoch": 0.49731604772934346, + "grad_norm": 1.6796875, + "learning_rate": 2.533033843587785e-05, + "loss": 0.3391, + "step": 11326 + }, + { + "epoch": 0.49740386620854693, + "grad_norm": 1.890625, + "learning_rate": 2.53234207435456e-05, + "loss": 0.374, + "step": 11328 + }, + { + "epoch": 0.4974916846877504, + "grad_norm": 1.796875, + "learning_rate": 2.531650302644572e-05, + "loss": 0.3793, + "step": 11330 + }, + { + "epoch": 0.49757950316695393, + "grad_norm": 1.7890625, + "learning_rate": 2.530958528510795e-05, + "loss": 0.3879, + "step": 11332 + }, + { + "epoch": 0.4976673216461574, + "grad_norm": 1.90625, + "learning_rate": 2.530266752006208e-05, + "loss": 0.3623, + "step": 11334 + }, + { + "epoch": 0.4977551401253609, + "grad_norm": 1.9296875, + "learning_rate": 2.529574973183785e-05, + "loss": 0.3668, + "step": 11336 + }, + { + "epoch": 0.49784295860456435, + "grad_norm": 2.21875, + "learning_rate": 2.5288831920965028e-05, + "loss": 0.3766, + "step": 11338 + }, + { + "epoch": 0.4979307770837679, + "grad_norm": 2.171875, + "learning_rate": 2.5281914087973395e-05, + "loss": 0.375, + "step": 11340 + }, + { + "epoch": 0.49801859556297134, + "grad_norm": 1.6484375, + "learning_rate": 2.5274996233392712e-05, + "loss": 0.3424, + "step": 11342 + }, + { + "epoch": 0.4981064140421748, + "grad_norm": 1.8359375, + "learning_rate": 2.5268078357752757e-05, + "loss": 0.3336, + "step": 11344 + }, + { + "epoch": 0.4981942325213783, + "grad_norm": 1.984375, + "learning_rate": 2.52611604615833e-05, + "loss": 0.3356, + "step": 11346 + }, + { + "epoch": 0.4982820510005818, + "grad_norm": 1.8046875, + "learning_rate": 2.5254242545414104e-05, + "loss": 0.3336, + "step": 11348 + }, + { + "epoch": 0.4983698694797853, + "grad_norm": 2.171875, + "learning_rate": 2.5247324609774953e-05, + "loss": 0.3769, + "step": 11350 + }, + { + "epoch": 0.49845768795898876, + "grad_norm": 1.9375, + "learning_rate": 2.524040665519562e-05, + "loss": 0.3574, + "step": 11352 + }, + { + "epoch": 0.49854550643819223, + "grad_norm": 1.734375, + "learning_rate": 2.523348868220589e-05, + "loss": 0.3693, + "step": 11354 + }, + { + "epoch": 0.49863332491739576, + "grad_norm": 1.8671875, + "learning_rate": 2.522657069133555e-05, + "loss": 0.3505, + "step": 11356 + }, + { + "epoch": 0.49872114339659923, + "grad_norm": 1.9453125, + "learning_rate": 2.5219652683114363e-05, + "loss": 0.3886, + "step": 11358 + }, + { + "epoch": 0.4988089618758027, + "grad_norm": 2.234375, + "learning_rate": 2.5212734658072124e-05, + "loss": 0.384, + "step": 11360 + }, + { + "epoch": 0.4988967803550062, + "grad_norm": 1.7734375, + "learning_rate": 2.520581661673862e-05, + "loss": 0.3613, + "step": 11362 + }, + { + "epoch": 0.4989845988342097, + "grad_norm": 1.8515625, + "learning_rate": 2.5198898559643618e-05, + "loss": 0.3673, + "step": 11364 + }, + { + "epoch": 0.4990724173134132, + "grad_norm": 1.953125, + "learning_rate": 2.5191980487316924e-05, + "loss": 0.353, + "step": 11366 + }, + { + "epoch": 0.49916023579261665, + "grad_norm": 1.9609375, + "learning_rate": 2.5185062400288316e-05, + "loss": 0.3536, + "step": 11368 + }, + { + "epoch": 0.4992480542718202, + "grad_norm": 1.8359375, + "learning_rate": 2.517814429908759e-05, + "loss": 0.35, + "step": 11370 + }, + { + "epoch": 0.49933587275102365, + "grad_norm": 1.8125, + "learning_rate": 2.5171226184244525e-05, + "loss": 0.3362, + "step": 11372 + }, + { + "epoch": 0.4994236912302271, + "grad_norm": 1.7734375, + "learning_rate": 2.5164308056288915e-05, + "loss": 0.3457, + "step": 11374 + }, + { + "epoch": 0.4995115097094306, + "grad_norm": 1.828125, + "learning_rate": 2.515738991575057e-05, + "loss": 0.3596, + "step": 11376 + }, + { + "epoch": 0.4995993281886341, + "grad_norm": 2.078125, + "learning_rate": 2.515047176315925e-05, + "loss": 0.3743, + "step": 11378 + }, + { + "epoch": 0.4996871466678376, + "grad_norm": 2.140625, + "learning_rate": 2.5143553599044773e-05, + "loss": 0.3283, + "step": 11380 + }, + { + "epoch": 0.49977496514704106, + "grad_norm": 1.8203125, + "learning_rate": 2.513663542393693e-05, + "loss": 0.3711, + "step": 11382 + }, + { + "epoch": 0.49986278362624453, + "grad_norm": 2.296875, + "learning_rate": 2.5129717238365508e-05, + "loss": 0.3386, + "step": 11384 + }, + { + "epoch": 0.49995060210544806, + "grad_norm": 1.8125, + "learning_rate": 2.512279904286031e-05, + "loss": 0.3563, + "step": 11386 + }, + { + "epoch": 0.5000384205846515, + "grad_norm": 1.890625, + "learning_rate": 2.5115880837951134e-05, + "loss": 0.3429, + "step": 11388 + }, + { + "epoch": 0.500126239063855, + "grad_norm": 1.84375, + "learning_rate": 2.510896262416777e-05, + "loss": 0.3711, + "step": 11390 + }, + { + "epoch": 0.5002140575430585, + "grad_norm": 1.671875, + "learning_rate": 2.5102044402040033e-05, + "loss": 0.3793, + "step": 11392 + }, + { + "epoch": 0.500301876022262, + "grad_norm": 1.9296875, + "learning_rate": 2.5095126172097705e-05, + "loss": 0.3527, + "step": 11394 + }, + { + "epoch": 0.5003896945014654, + "grad_norm": 1.6953125, + "learning_rate": 2.5088207934870593e-05, + "loss": 0.3562, + "step": 11396 + }, + { + "epoch": 0.500477512980669, + "grad_norm": 1.9609375, + "learning_rate": 2.5081289690888495e-05, + "loss": 0.3647, + "step": 11398 + }, + { + "epoch": 0.5005653314598725, + "grad_norm": 1.734375, + "learning_rate": 2.507437144068121e-05, + "loss": 0.344, + "step": 11400 + }, + { + "epoch": 0.500653149939076, + "grad_norm": 1.8515625, + "learning_rate": 2.5067453184778545e-05, + "loss": 0.3791, + "step": 11402 + }, + { + "epoch": 0.5007409684182794, + "grad_norm": 1.953125, + "learning_rate": 2.5060534923710298e-05, + "loss": 0.3648, + "step": 11404 + }, + { + "epoch": 0.5008287868974829, + "grad_norm": 1.703125, + "learning_rate": 2.505361665800628e-05, + "loss": 0.3255, + "step": 11406 + }, + { + "epoch": 0.5009166053766864, + "grad_norm": 1.75, + "learning_rate": 2.5046698388196284e-05, + "loss": 0.3838, + "step": 11408 + }, + { + "epoch": 0.5010044238558898, + "grad_norm": 1.6796875, + "learning_rate": 2.50397801148101e-05, + "loss": 0.3467, + "step": 11410 + }, + { + "epoch": 0.5010922423350933, + "grad_norm": 1.8984375, + "learning_rate": 2.5032861838377557e-05, + "loss": 0.3475, + "step": 11412 + }, + { + "epoch": 0.5011800608142969, + "grad_norm": 1.734375, + "learning_rate": 2.5025943559428444e-05, + "loss": 0.3663, + "step": 11414 + }, + { + "epoch": 0.5012678792935004, + "grad_norm": 1.7265625, + "learning_rate": 2.5019025278492565e-05, + "loss": 0.3312, + "step": 11416 + }, + { + "epoch": 0.5013556977727038, + "grad_norm": 1.7265625, + "learning_rate": 2.501210699609973e-05, + "loss": 0.3312, + "step": 11418 + }, + { + "epoch": 0.5014435162519073, + "grad_norm": 1.90625, + "learning_rate": 2.5005188712779736e-05, + "loss": 0.3418, + "step": 11420 + }, + { + "epoch": 0.5015313347311108, + "grad_norm": 1.796875, + "learning_rate": 2.4998270429062394e-05, + "loss": 0.3551, + "step": 11422 + }, + { + "epoch": 0.5016191532103143, + "grad_norm": 1.7265625, + "learning_rate": 2.4991352145477497e-05, + "loss": 0.3624, + "step": 11424 + }, + { + "epoch": 0.5017069716895177, + "grad_norm": 1.8671875, + "learning_rate": 2.498443386255485e-05, + "loss": 0.3742, + "step": 11426 + }, + { + "epoch": 0.5017947901687213, + "grad_norm": 2.015625, + "learning_rate": 2.497751558082427e-05, + "loss": 0.3658, + "step": 11428 + }, + { + "epoch": 0.5018826086479248, + "grad_norm": 1.7734375, + "learning_rate": 2.4970597300815545e-05, + "loss": 0.3503, + "step": 11430 + }, + { + "epoch": 0.5019704271271282, + "grad_norm": 1.6875, + "learning_rate": 2.4963679023058495e-05, + "loss": 0.3415, + "step": 11432 + }, + { + "epoch": 0.5020582456063317, + "grad_norm": 1.9140625, + "learning_rate": 2.495676074808291e-05, + "loss": 0.3646, + "step": 11434 + }, + { + "epoch": 0.5021460640855352, + "grad_norm": 2.0625, + "learning_rate": 2.4949842476418592e-05, + "loss": 0.3604, + "step": 11436 + }, + { + "epoch": 0.5022338825647387, + "grad_norm": 1.8359375, + "learning_rate": 2.4942924208595352e-05, + "loss": 0.371, + "step": 11438 + }, + { + "epoch": 0.5023217010439421, + "grad_norm": 1.859375, + "learning_rate": 2.493600594514299e-05, + "loss": 0.3412, + "step": 11440 + }, + { + "epoch": 0.5024095195231456, + "grad_norm": 1.7734375, + "learning_rate": 2.4929087686591317e-05, + "loss": 0.344, + "step": 11442 + }, + { + "epoch": 0.5024973380023492, + "grad_norm": 1.6875, + "learning_rate": 2.4922169433470112e-05, + "loss": 0.3685, + "step": 11444 + }, + { + "epoch": 0.5025851564815527, + "grad_norm": 1.9765625, + "learning_rate": 2.491525118630921e-05, + "loss": 0.3741, + "step": 11446 + }, + { + "epoch": 0.5026729749607561, + "grad_norm": 2.125, + "learning_rate": 2.4908332945638378e-05, + "loss": 0.3448, + "step": 11448 + }, + { + "epoch": 0.5027607934399596, + "grad_norm": 1.8046875, + "learning_rate": 2.4901414711987432e-05, + "loss": 0.3645, + "step": 11450 + }, + { + "epoch": 0.5028486119191631, + "grad_norm": 1.78125, + "learning_rate": 2.489449648588617e-05, + "loss": 0.3583, + "step": 11452 + }, + { + "epoch": 0.5029364303983666, + "grad_norm": 1.625, + "learning_rate": 2.4887578267864393e-05, + "loss": 0.341, + "step": 11454 + }, + { + "epoch": 0.50302424887757, + "grad_norm": 1.703125, + "learning_rate": 2.4880660058451895e-05, + "loss": 0.3411, + "step": 11456 + }, + { + "epoch": 0.5031120673567735, + "grad_norm": 1.828125, + "learning_rate": 2.487374185817849e-05, + "loss": 0.3633, + "step": 11458 + }, + { + "epoch": 0.5031998858359771, + "grad_norm": 1.8203125, + "learning_rate": 2.4866823667573954e-05, + "loss": 0.3506, + "step": 11460 + }, + { + "epoch": 0.5032877043151806, + "grad_norm": 2.0, + "learning_rate": 2.4859905487168095e-05, + "loss": 0.3882, + "step": 11462 + }, + { + "epoch": 0.503375522794384, + "grad_norm": 1.78125, + "learning_rate": 2.485298731749071e-05, + "loss": 0.3581, + "step": 11464 + }, + { + "epoch": 0.5034633412735875, + "grad_norm": 1.671875, + "learning_rate": 2.4846069159071586e-05, + "loss": 0.3581, + "step": 11466 + }, + { + "epoch": 0.503551159752791, + "grad_norm": 2.328125, + "learning_rate": 2.4839151012440532e-05, + "loss": 0.356, + "step": 11468 + }, + { + "epoch": 0.5036389782319944, + "grad_norm": 1.734375, + "learning_rate": 2.4832232878127327e-05, + "loss": 0.3814, + "step": 11470 + }, + { + "epoch": 0.5037267967111979, + "grad_norm": 1.9765625, + "learning_rate": 2.4825314756661765e-05, + "loss": 0.3626, + "step": 11472 + }, + { + "epoch": 0.5038146151904015, + "grad_norm": 1.6796875, + "learning_rate": 2.4818396648573645e-05, + "loss": 0.3471, + "step": 11474 + }, + { + "epoch": 0.503902433669605, + "grad_norm": 1.765625, + "learning_rate": 2.4811478554392762e-05, + "loss": 0.3433, + "step": 11476 + }, + { + "epoch": 0.5039902521488084, + "grad_norm": 1.6171875, + "learning_rate": 2.4804560474648885e-05, + "loss": 0.3593, + "step": 11478 + }, + { + "epoch": 0.5040780706280119, + "grad_norm": 1.6875, + "learning_rate": 2.4797642409871816e-05, + "loss": 0.3472, + "step": 11480 + }, + { + "epoch": 0.5041658891072154, + "grad_norm": 1.921875, + "learning_rate": 2.479072436059134e-05, + "loss": 0.3718, + "step": 11482 + }, + { + "epoch": 0.5042537075864189, + "grad_norm": 1.6953125, + "learning_rate": 2.4783806327337242e-05, + "loss": 0.3797, + "step": 11484 + }, + { + "epoch": 0.5043415260656223, + "grad_norm": 1.78125, + "learning_rate": 2.477688831063931e-05, + "loss": 0.3475, + "step": 11486 + }, + { + "epoch": 0.5044293445448258, + "grad_norm": 1.7265625, + "learning_rate": 2.476997031102732e-05, + "loss": 0.4019, + "step": 11488 + }, + { + "epoch": 0.5045171630240294, + "grad_norm": 1.7265625, + "learning_rate": 2.4763052329031055e-05, + "loss": 0.3631, + "step": 11490 + }, + { + "epoch": 0.5046049815032329, + "grad_norm": 1.796875, + "learning_rate": 2.4756134365180306e-05, + "loss": 0.3593, + "step": 11492 + }, + { + "epoch": 0.5046927999824363, + "grad_norm": 1.7734375, + "learning_rate": 2.4749216420004852e-05, + "loss": 0.357, + "step": 11494 + }, + { + "epoch": 0.5047806184616398, + "grad_norm": 1.7265625, + "learning_rate": 2.4742298494034453e-05, + "loss": 0.3796, + "step": 11496 + }, + { + "epoch": 0.5048684369408433, + "grad_norm": 1.7421875, + "learning_rate": 2.4735380587798902e-05, + "loss": 0.354, + "step": 11498 + }, + { + "epoch": 0.5049562554200467, + "grad_norm": 1.734375, + "learning_rate": 2.4728462701827968e-05, + "loss": 0.3939, + "step": 11500 + }, + { + "epoch": 0.5050440738992502, + "grad_norm": 1.8515625, + "learning_rate": 2.472154483665142e-05, + "loss": 0.376, + "step": 11502 + }, + { + "epoch": 0.5051318923784537, + "grad_norm": 1.78125, + "learning_rate": 2.4714626992799047e-05, + "loss": 0.3363, + "step": 11504 + }, + { + "epoch": 0.5052197108576573, + "grad_norm": 2.28125, + "learning_rate": 2.4707709170800608e-05, + "loss": 0.3781, + "step": 11506 + }, + { + "epoch": 0.5053075293368607, + "grad_norm": 1.75, + "learning_rate": 2.4700791371185868e-05, + "loss": 0.3709, + "step": 11508 + }, + { + "epoch": 0.5053953478160642, + "grad_norm": 1.8125, + "learning_rate": 2.469387359448459e-05, + "loss": 0.345, + "step": 11510 + }, + { + "epoch": 0.5054831662952677, + "grad_norm": 1.9296875, + "learning_rate": 2.4686955841226546e-05, + "loss": 0.3357, + "step": 11512 + }, + { + "epoch": 0.5055709847744712, + "grad_norm": 1.921875, + "learning_rate": 2.4680038111941504e-05, + "loss": 0.3312, + "step": 11514 + }, + { + "epoch": 0.5056588032536746, + "grad_norm": 2.109375, + "learning_rate": 2.4673120407159214e-05, + "loss": 0.3445, + "step": 11516 + }, + { + "epoch": 0.5057466217328781, + "grad_norm": 1.828125, + "learning_rate": 2.466620272740945e-05, + "loss": 0.3678, + "step": 11518 + }, + { + "epoch": 0.5058344402120817, + "grad_norm": 1.6796875, + "learning_rate": 2.4659285073221955e-05, + "loss": 0.3549, + "step": 11520 + }, + { + "epoch": 0.5059222586912852, + "grad_norm": 2.21875, + "learning_rate": 2.465236744512649e-05, + "loss": 0.3821, + "step": 11522 + }, + { + "epoch": 0.5060100771704886, + "grad_norm": 2.15625, + "learning_rate": 2.464544984365281e-05, + "loss": 0.358, + "step": 11524 + }, + { + "epoch": 0.5060978956496921, + "grad_norm": 1.953125, + "learning_rate": 2.4638532269330665e-05, + "loss": 0.3459, + "step": 11526 + }, + { + "epoch": 0.5061857141288956, + "grad_norm": 1.8671875, + "learning_rate": 2.4631614722689806e-05, + "loss": 0.3657, + "step": 11528 + }, + { + "epoch": 0.506273532608099, + "grad_norm": 1.7265625, + "learning_rate": 2.4624697204259986e-05, + "loss": 0.3525, + "step": 11530 + }, + { + "epoch": 0.5063613510873025, + "grad_norm": 1.90625, + "learning_rate": 2.461777971457094e-05, + "loss": 0.3466, + "step": 11532 + }, + { + "epoch": 0.506449169566506, + "grad_norm": 2.0, + "learning_rate": 2.461086225415241e-05, + "loss": 0.3451, + "step": 11534 + }, + { + "epoch": 0.5065369880457096, + "grad_norm": 1.6875, + "learning_rate": 2.4603944823534146e-05, + "loss": 0.3576, + "step": 11536 + }, + { + "epoch": 0.506624806524913, + "grad_norm": 1.6640625, + "learning_rate": 2.4597027423245893e-05, + "loss": 0.3392, + "step": 11538 + }, + { + "epoch": 0.5067126250041165, + "grad_norm": 1.7265625, + "learning_rate": 2.4590110053817364e-05, + "loss": 0.3546, + "step": 11540 + }, + { + "epoch": 0.50680044348332, + "grad_norm": 1.921875, + "learning_rate": 2.4583192715778304e-05, + "loss": 0.3363, + "step": 11542 + }, + { + "epoch": 0.5068882619625235, + "grad_norm": 1.7890625, + "learning_rate": 2.4576275409658454e-05, + "loss": 0.359, + "step": 11544 + }, + { + "epoch": 0.5069760804417269, + "grad_norm": 1.75, + "learning_rate": 2.456935813598753e-05, + "loss": 0.3529, + "step": 11546 + }, + { + "epoch": 0.5070638989209304, + "grad_norm": 1.890625, + "learning_rate": 2.4562440895295265e-05, + "loss": 0.3435, + "step": 11548 + }, + { + "epoch": 0.5071517174001339, + "grad_norm": 1.8515625, + "learning_rate": 2.4555523688111377e-05, + "loss": 0.3664, + "step": 11550 + }, + { + "epoch": 0.5072395358793375, + "grad_norm": 1.9453125, + "learning_rate": 2.454860651496559e-05, + "loss": 0.3318, + "step": 11552 + }, + { + "epoch": 0.5073273543585409, + "grad_norm": 1.71875, + "learning_rate": 2.4541689376387627e-05, + "loss": 0.3551, + "step": 11554 + }, + { + "epoch": 0.5074151728377444, + "grad_norm": 1.796875, + "learning_rate": 2.453477227290721e-05, + "loss": 0.3396, + "step": 11556 + }, + { + "epoch": 0.5075029913169479, + "grad_norm": 1.8125, + "learning_rate": 2.452785520505403e-05, + "loss": 0.3486, + "step": 11558 + }, + { + "epoch": 0.5075908097961513, + "grad_norm": 1.8828125, + "learning_rate": 2.452093817335782e-05, + "loss": 0.357, + "step": 11560 + }, + { + "epoch": 0.5076786282753548, + "grad_norm": 1.6640625, + "learning_rate": 2.4514021178348276e-05, + "loss": 0.356, + "step": 11562 + }, + { + "epoch": 0.5077664467545583, + "grad_norm": 1.7265625, + "learning_rate": 2.4507104220555106e-05, + "loss": 0.3339, + "step": 11564 + }, + { + "epoch": 0.5078542652337618, + "grad_norm": 2.015625, + "learning_rate": 2.450018730050802e-05, + "loss": 0.3823, + "step": 11566 + }, + { + "epoch": 0.5079420837129653, + "grad_norm": 1.59375, + "learning_rate": 2.4493270418736707e-05, + "loss": 0.3737, + "step": 11568 + }, + { + "epoch": 0.5080299021921688, + "grad_norm": 1.953125, + "learning_rate": 2.448635357577087e-05, + "loss": 0.3471, + "step": 11570 + }, + { + "epoch": 0.5081177206713723, + "grad_norm": 2.0, + "learning_rate": 2.4479436772140195e-05, + "loss": 0.3523, + "step": 11572 + }, + { + "epoch": 0.5082055391505758, + "grad_norm": 2.015625, + "learning_rate": 2.4472520008374375e-05, + "loss": 0.3427, + "step": 11574 + }, + { + "epoch": 0.5082933576297792, + "grad_norm": 1.875, + "learning_rate": 2.44656032850031e-05, + "loss": 0.34, + "step": 11576 + }, + { + "epoch": 0.5083811761089827, + "grad_norm": 1.9453125, + "learning_rate": 2.4458686602556052e-05, + "loss": 0.3527, + "step": 11578 + }, + { + "epoch": 0.5084689945881862, + "grad_norm": 1.734375, + "learning_rate": 2.445176996156292e-05, + "loss": 0.3461, + "step": 11580 + }, + { + "epoch": 0.5085568130673898, + "grad_norm": 1.9296875, + "learning_rate": 2.444485336255337e-05, + "loss": 0.3473, + "step": 11582 + }, + { + "epoch": 0.5086446315465932, + "grad_norm": 1.703125, + "learning_rate": 2.4437936806057082e-05, + "loss": 0.3454, + "step": 11584 + }, + { + "epoch": 0.5087324500257967, + "grad_norm": 1.8828125, + "learning_rate": 2.443102029260373e-05, + "loss": 0.371, + "step": 11586 + }, + { + "epoch": 0.5088202685050002, + "grad_norm": 1.7890625, + "learning_rate": 2.442410382272298e-05, + "loss": 0.3645, + "step": 11588 + }, + { + "epoch": 0.5089080869842036, + "grad_norm": 1.859375, + "learning_rate": 2.4417187396944496e-05, + "loss": 0.3277, + "step": 11590 + }, + { + "epoch": 0.5089959054634071, + "grad_norm": 1.8359375, + "learning_rate": 2.4410271015797947e-05, + "loss": 0.3338, + "step": 11592 + }, + { + "epoch": 0.5090837239426106, + "grad_norm": 1.828125, + "learning_rate": 2.440335467981298e-05, + "loss": 0.3514, + "step": 11594 + }, + { + "epoch": 0.5091715424218141, + "grad_norm": 1.8125, + "learning_rate": 2.439643838951925e-05, + "loss": 0.365, + "step": 11596 + }, + { + "epoch": 0.5092593609010176, + "grad_norm": 1.828125, + "learning_rate": 2.438952214544643e-05, + "loss": 0.3909, + "step": 11598 + }, + { + "epoch": 0.5093471793802211, + "grad_norm": 1.9609375, + "learning_rate": 2.4382605948124137e-05, + "loss": 0.3652, + "step": 11600 + }, + { + "epoch": 0.5094349978594246, + "grad_norm": 1.75, + "learning_rate": 2.437568979808203e-05, + "loss": 0.3538, + "step": 11602 + }, + { + "epoch": 0.5095228163386281, + "grad_norm": 1.7578125, + "learning_rate": 2.436877369584975e-05, + "loss": 0.3569, + "step": 11604 + }, + { + "epoch": 0.5096106348178315, + "grad_norm": 1.890625, + "learning_rate": 2.436185764195693e-05, + "loss": 0.3627, + "step": 11606 + }, + { + "epoch": 0.509698453297035, + "grad_norm": 1.7890625, + "learning_rate": 2.4354941636933203e-05, + "loss": 0.3579, + "step": 11608 + }, + { + "epoch": 0.5097862717762385, + "grad_norm": 1.9765625, + "learning_rate": 2.43480256813082e-05, + "loss": 0.3583, + "step": 11610 + }, + { + "epoch": 0.509874090255442, + "grad_norm": 1.7734375, + "learning_rate": 2.4341109775611547e-05, + "loss": 0.3649, + "step": 11612 + }, + { + "epoch": 0.5099619087346455, + "grad_norm": 1.84375, + "learning_rate": 2.4334193920372867e-05, + "loss": 0.3639, + "step": 11614 + }, + { + "epoch": 0.510049727213849, + "grad_norm": 1.6875, + "learning_rate": 2.432727811612178e-05, + "loss": 0.3859, + "step": 11616 + }, + { + "epoch": 0.5101375456930525, + "grad_norm": 1.7578125, + "learning_rate": 2.4320362363387893e-05, + "loss": 0.3314, + "step": 11618 + }, + { + "epoch": 0.510225364172256, + "grad_norm": 1.734375, + "learning_rate": 2.4313446662700813e-05, + "loss": 0.3727, + "step": 11620 + }, + { + "epoch": 0.5103131826514594, + "grad_norm": 1.8125, + "learning_rate": 2.4306531014590156e-05, + "loss": 0.3434, + "step": 11622 + }, + { + "epoch": 0.5104010011306629, + "grad_norm": 1.75, + "learning_rate": 2.429961541958552e-05, + "loss": 0.3612, + "step": 11624 + }, + { + "epoch": 0.5104888196098664, + "grad_norm": 1.65625, + "learning_rate": 2.4292699878216505e-05, + "loss": 0.3464, + "step": 11626 + }, + { + "epoch": 0.51057663808907, + "grad_norm": 1.75, + "learning_rate": 2.4285784391012706e-05, + "loss": 0.337, + "step": 11628 + }, + { + "epoch": 0.5106644565682734, + "grad_norm": 1.6875, + "learning_rate": 2.4278868958503708e-05, + "loss": 0.3909, + "step": 11630 + }, + { + "epoch": 0.5107522750474769, + "grad_norm": 1.828125, + "learning_rate": 2.42719535812191e-05, + "loss": 0.3323, + "step": 11632 + }, + { + "epoch": 0.5108400935266804, + "grad_norm": 1.6875, + "learning_rate": 2.4265038259688456e-05, + "loss": 0.3545, + "step": 11634 + }, + { + "epoch": 0.5109279120058838, + "grad_norm": 1.859375, + "learning_rate": 2.425812299444136e-05, + "loss": 0.3492, + "step": 11636 + }, + { + "epoch": 0.5110157304850873, + "grad_norm": 1.6875, + "learning_rate": 2.425120778600738e-05, + "loss": 0.3437, + "step": 11638 + }, + { + "epoch": 0.5111035489642908, + "grad_norm": 1.8515625, + "learning_rate": 2.424429263491609e-05, + "loss": 0.3651, + "step": 11640 + }, + { + "epoch": 0.5111913674434942, + "grad_norm": 1.6015625, + "learning_rate": 2.4237377541697055e-05, + "loss": 0.3285, + "step": 11642 + }, + { + "epoch": 0.5112791859226978, + "grad_norm": 1.78125, + "learning_rate": 2.4230462506879824e-05, + "loss": 0.338, + "step": 11644 + }, + { + "epoch": 0.5113670044019013, + "grad_norm": 1.9453125, + "learning_rate": 2.4223547530993958e-05, + "loss": 0.3836, + "step": 11646 + }, + { + "epoch": 0.5114548228811048, + "grad_norm": 1.875, + "learning_rate": 2.4216632614569012e-05, + "loss": 0.382, + "step": 11648 + }, + { + "epoch": 0.5115426413603082, + "grad_norm": 1.734375, + "learning_rate": 2.4209717758134525e-05, + "loss": 0.3605, + "step": 11650 + }, + { + "epoch": 0.5116304598395117, + "grad_norm": 1.7265625, + "learning_rate": 2.4202802962220047e-05, + "loss": 0.3905, + "step": 11652 + }, + { + "epoch": 0.5117182783187152, + "grad_norm": 1.9140625, + "learning_rate": 2.4195888227355105e-05, + "loss": 0.329, + "step": 11654 + }, + { + "epoch": 0.5118060967979187, + "grad_norm": 1.75, + "learning_rate": 2.4188973554069236e-05, + "loss": 0.3697, + "step": 11656 + }, + { + "epoch": 0.5118939152771221, + "grad_norm": 1.7421875, + "learning_rate": 2.4182058942891966e-05, + "loss": 0.3456, + "step": 11658 + }, + { + "epoch": 0.5119817337563257, + "grad_norm": 1.828125, + "learning_rate": 2.417514439435283e-05, + "loss": 0.3294, + "step": 11660 + }, + { + "epoch": 0.5120695522355292, + "grad_norm": 1.7109375, + "learning_rate": 2.416822990898132e-05, + "loss": 0.3599, + "step": 11662 + }, + { + "epoch": 0.5121573707147327, + "grad_norm": 1.8671875, + "learning_rate": 2.4161315487306965e-05, + "loss": 0.3617, + "step": 11664 + }, + { + "epoch": 0.5122451891939361, + "grad_norm": 1.765625, + "learning_rate": 2.415440112985927e-05, + "loss": 0.3578, + "step": 11666 + }, + { + "epoch": 0.5123330076731396, + "grad_norm": 1.7734375, + "learning_rate": 2.4147486837167748e-05, + "loss": 0.3638, + "step": 11668 + }, + { + "epoch": 0.5124208261523431, + "grad_norm": 1.9140625, + "learning_rate": 2.414057260976188e-05, + "loss": 0.3481, + "step": 11670 + }, + { + "epoch": 0.5125086446315466, + "grad_norm": 1.734375, + "learning_rate": 2.413365844817117e-05, + "loss": 0.3752, + "step": 11672 + }, + { + "epoch": 0.5125964631107501, + "grad_norm": 1.8203125, + "learning_rate": 2.41267443529251e-05, + "loss": 0.353, + "step": 11674 + }, + { + "epoch": 0.5126842815899536, + "grad_norm": 1.65625, + "learning_rate": 2.411983032455316e-05, + "loss": 0.3513, + "step": 11676 + }, + { + "epoch": 0.5127721000691571, + "grad_norm": 1.8515625, + "learning_rate": 2.4112916363584828e-05, + "loss": 0.3692, + "step": 11678 + }, + { + "epoch": 0.5128599185483605, + "grad_norm": 1.9609375, + "learning_rate": 2.410600247054957e-05, + "loss": 0.3459, + "step": 11680 + }, + { + "epoch": 0.512947737027564, + "grad_norm": 1.7421875, + "learning_rate": 2.4099088645976855e-05, + "loss": 0.3597, + "step": 11682 + }, + { + "epoch": 0.5130355555067675, + "grad_norm": 1.921875, + "learning_rate": 2.409217489039615e-05, + "loss": 0.3649, + "step": 11684 + }, + { + "epoch": 0.513123373985971, + "grad_norm": 1.7890625, + "learning_rate": 2.4085261204336905e-05, + "loss": 0.3304, + "step": 11686 + }, + { + "epoch": 0.5132111924651744, + "grad_norm": 2.0, + "learning_rate": 2.407834758832858e-05, + "loss": 0.3645, + "step": 11688 + }, + { + "epoch": 0.513299010944378, + "grad_norm": 2.109375, + "learning_rate": 2.4071434042900627e-05, + "loss": 0.4048, + "step": 11690 + }, + { + "epoch": 0.5133868294235815, + "grad_norm": 1.7265625, + "learning_rate": 2.4064520568582468e-05, + "loss": 0.3575, + "step": 11692 + }, + { + "epoch": 0.513474647902785, + "grad_norm": 1.84375, + "learning_rate": 2.4057607165903557e-05, + "loss": 0.3888, + "step": 11694 + }, + { + "epoch": 0.5135624663819884, + "grad_norm": 2.03125, + "learning_rate": 2.4050693835393303e-05, + "loss": 0.3731, + "step": 11696 + }, + { + "epoch": 0.5136502848611919, + "grad_norm": 1.8125, + "learning_rate": 2.4043780577581145e-05, + "loss": 0.3508, + "step": 11698 + }, + { + "epoch": 0.5137381033403954, + "grad_norm": 1.734375, + "learning_rate": 2.40368673929965e-05, + "loss": 0.359, + "step": 11700 + }, + { + "epoch": 0.5138259218195989, + "grad_norm": 1.8359375, + "learning_rate": 2.4029954282168782e-05, + "loss": 0.3378, + "step": 11702 + }, + { + "epoch": 0.5139137402988023, + "grad_norm": 1.6953125, + "learning_rate": 2.40230412456274e-05, + "loss": 0.3509, + "step": 11704 + }, + { + "epoch": 0.5140015587780059, + "grad_norm": 1.8984375, + "learning_rate": 2.401612828390175e-05, + "loss": 0.3493, + "step": 11706 + }, + { + "epoch": 0.5140893772572094, + "grad_norm": 1.7734375, + "learning_rate": 2.4009215397521234e-05, + "loss": 0.3398, + "step": 11708 + }, + { + "epoch": 0.5141771957364129, + "grad_norm": 1.8203125, + "learning_rate": 2.4002302587015236e-05, + "loss": 0.3232, + "step": 11710 + }, + { + "epoch": 0.5142650142156163, + "grad_norm": 1.8828125, + "learning_rate": 2.399538985291315e-05, + "loss": 0.3542, + "step": 11712 + }, + { + "epoch": 0.5143528326948198, + "grad_norm": 1.75, + "learning_rate": 2.3988477195744353e-05, + "loss": 0.353, + "step": 11714 + }, + { + "epoch": 0.5144406511740233, + "grad_norm": 1.8828125, + "learning_rate": 2.398156461603821e-05, + "loss": 0.3577, + "step": 11716 + }, + { + "epoch": 0.5145284696532267, + "grad_norm": 1.9296875, + "learning_rate": 2.397465211432409e-05, + "loss": 0.3709, + "step": 11718 + }, + { + "epoch": 0.5146162881324303, + "grad_norm": 2.015625, + "learning_rate": 2.396773969113136e-05, + "loss": 0.3302, + "step": 11720 + }, + { + "epoch": 0.5147041066116338, + "grad_norm": 1.9140625, + "learning_rate": 2.396082734698938e-05, + "loss": 0.3461, + "step": 11722 + }, + { + "epoch": 0.5147919250908373, + "grad_norm": 1.7734375, + "learning_rate": 2.3953915082427482e-05, + "loss": 0.3456, + "step": 11724 + }, + { + "epoch": 0.5148797435700407, + "grad_norm": 1.9375, + "learning_rate": 2.3947002897975018e-05, + "loss": 0.3699, + "step": 11726 + }, + { + "epoch": 0.5149675620492442, + "grad_norm": 1.7421875, + "learning_rate": 2.3940090794161324e-05, + "loss": 0.3854, + "step": 11728 + }, + { + "epoch": 0.5150553805284477, + "grad_norm": 1.796875, + "learning_rate": 2.3933178771515735e-05, + "loss": 0.3623, + "step": 11730 + }, + { + "epoch": 0.5151431990076512, + "grad_norm": 1.8671875, + "learning_rate": 2.3926266830567567e-05, + "loss": 0.3466, + "step": 11732 + }, + { + "epoch": 0.5152310174868546, + "grad_norm": 1.734375, + "learning_rate": 2.3919354971846143e-05, + "loss": 0.3677, + "step": 11734 + }, + { + "epoch": 0.5153188359660582, + "grad_norm": 1.8125, + "learning_rate": 2.3912443195880776e-05, + "loss": 0.3551, + "step": 11736 + }, + { + "epoch": 0.5154066544452617, + "grad_norm": 1.8515625, + "learning_rate": 2.3905531503200768e-05, + "loss": 0.3481, + "step": 11738 + }, + { + "epoch": 0.5154944729244652, + "grad_norm": 2.109375, + "learning_rate": 2.3898619894335425e-05, + "loss": 0.3936, + "step": 11740 + }, + { + "epoch": 0.5155822914036686, + "grad_norm": 1.75, + "learning_rate": 2.3891708369814028e-05, + "loss": 0.3743, + "step": 11742 + }, + { + "epoch": 0.5156701098828721, + "grad_norm": 1.7421875, + "learning_rate": 2.3884796930165875e-05, + "loss": 0.3537, + "step": 11744 + }, + { + "epoch": 0.5157579283620756, + "grad_norm": 1.6796875, + "learning_rate": 2.3877885575920235e-05, + "loss": 0.3446, + "step": 11746 + }, + { + "epoch": 0.515845746841279, + "grad_norm": 1.8359375, + "learning_rate": 2.3870974307606386e-05, + "loss": 0.3612, + "step": 11748 + }, + { + "epoch": 0.5159335653204825, + "grad_norm": 1.90625, + "learning_rate": 2.38640631257536e-05, + "loss": 0.3155, + "step": 11750 + }, + { + "epoch": 0.5160213837996861, + "grad_norm": 2.03125, + "learning_rate": 2.385715203089114e-05, + "loss": 0.3664, + "step": 11752 + }, + { + "epoch": 0.5161092022788896, + "grad_norm": 1.9140625, + "learning_rate": 2.385024102354824e-05, + "loss": 0.3454, + "step": 11754 + }, + { + "epoch": 0.516197020758093, + "grad_norm": 1.6796875, + "learning_rate": 2.3843330104254165e-05, + "loss": 0.3397, + "step": 11756 + }, + { + "epoch": 0.5162848392372965, + "grad_norm": 1.796875, + "learning_rate": 2.383641927353814e-05, + "loss": 0.3574, + "step": 11758 + }, + { + "epoch": 0.5163726577165, + "grad_norm": 1.7734375, + "learning_rate": 2.3829508531929408e-05, + "loss": 0.3543, + "step": 11760 + }, + { + "epoch": 0.5164604761957035, + "grad_norm": 2.0, + "learning_rate": 2.3822597879957192e-05, + "loss": 0.3621, + "step": 11762 + }, + { + "epoch": 0.5165482946749069, + "grad_norm": 1.765625, + "learning_rate": 2.3815687318150716e-05, + "loss": 0.3541, + "step": 11764 + }, + { + "epoch": 0.5166361131541104, + "grad_norm": 1.765625, + "learning_rate": 2.3808776847039187e-05, + "loss": 0.3748, + "step": 11766 + }, + { + "epoch": 0.516723931633314, + "grad_norm": 2.4375, + "learning_rate": 2.3801866467151813e-05, + "loss": 0.3511, + "step": 11768 + }, + { + "epoch": 0.5168117501125175, + "grad_norm": 1.75, + "learning_rate": 2.3794956179017792e-05, + "loss": 0.3926, + "step": 11770 + }, + { + "epoch": 0.5168995685917209, + "grad_norm": 1.6640625, + "learning_rate": 2.3788045983166317e-05, + "loss": 0.3538, + "step": 11772 + }, + { + "epoch": 0.5169873870709244, + "grad_norm": 1.765625, + "learning_rate": 2.378113588012657e-05, + "loss": 0.334, + "step": 11774 + }, + { + "epoch": 0.5170752055501279, + "grad_norm": 1.5859375, + "learning_rate": 2.377422587042773e-05, + "loss": 0.3454, + "step": 11776 + }, + { + "epoch": 0.5171630240293313, + "grad_norm": 1.578125, + "learning_rate": 2.376731595459897e-05, + "loss": 0.3581, + "step": 11778 + }, + { + "epoch": 0.5172508425085348, + "grad_norm": 1.75, + "learning_rate": 2.3760406133169443e-05, + "loss": 0.3241, + "step": 11780 + }, + { + "epoch": 0.5173386609877384, + "grad_norm": 1.8203125, + "learning_rate": 2.3753496406668325e-05, + "loss": 0.3687, + "step": 11782 + }, + { + "epoch": 0.5174264794669419, + "grad_norm": 1.6015625, + "learning_rate": 2.3746586775624738e-05, + "loss": 0.3487, + "step": 11784 + }, + { + "epoch": 0.5175142979461453, + "grad_norm": 1.7265625, + "learning_rate": 2.3739677240567836e-05, + "loss": 0.349, + "step": 11786 + }, + { + "epoch": 0.5176021164253488, + "grad_norm": 1.6328125, + "learning_rate": 2.3732767802026757e-05, + "loss": 0.3642, + "step": 11788 + }, + { + "epoch": 0.5176899349045523, + "grad_norm": 1.734375, + "learning_rate": 2.372585846053062e-05, + "loss": 0.3601, + "step": 11790 + }, + { + "epoch": 0.5177777533837558, + "grad_norm": 2.015625, + "learning_rate": 2.3718949216608556e-05, + "loss": 0.3388, + "step": 11792 + }, + { + "epoch": 0.5178655718629592, + "grad_norm": 1.7265625, + "learning_rate": 2.371204007078966e-05, + "loss": 0.3462, + "step": 11794 + }, + { + "epoch": 0.5179533903421627, + "grad_norm": 1.7421875, + "learning_rate": 2.3705131023603043e-05, + "loss": 0.3661, + "step": 11796 + }, + { + "epoch": 0.5180412088213663, + "grad_norm": 1.78125, + "learning_rate": 2.3698222075577805e-05, + "loss": 0.3453, + "step": 11798 + }, + { + "epoch": 0.5181290273005698, + "grad_norm": 1.90625, + "learning_rate": 2.3691313227243033e-05, + "loss": 0.3558, + "step": 11800 + }, + { + "epoch": 0.5182168457797732, + "grad_norm": 1.9609375, + "learning_rate": 2.3684404479127813e-05, + "loss": 0.3436, + "step": 11802 + }, + { + "epoch": 0.5183046642589767, + "grad_norm": 1.6875, + "learning_rate": 2.3677495831761205e-05, + "loss": 0.3589, + "step": 11804 + }, + { + "epoch": 0.5183924827381802, + "grad_norm": 1.703125, + "learning_rate": 2.3670587285672284e-05, + "loss": 0.3264, + "step": 11806 + }, + { + "epoch": 0.5184803012173836, + "grad_norm": 1.703125, + "learning_rate": 2.366367884139011e-05, + "loss": 0.3578, + "step": 11808 + }, + { + "epoch": 0.5185681196965871, + "grad_norm": 1.6796875, + "learning_rate": 2.365677049944373e-05, + "loss": 0.3635, + "step": 11810 + }, + { + "epoch": 0.5186559381757906, + "grad_norm": 1.6953125, + "learning_rate": 2.364986226036219e-05, + "loss": 0.3389, + "step": 11812 + }, + { + "epoch": 0.5187437566549942, + "grad_norm": 1.7578125, + "learning_rate": 2.3642954124674523e-05, + "loss": 0.3554, + "step": 11814 + }, + { + "epoch": 0.5188315751341976, + "grad_norm": 1.875, + "learning_rate": 2.3636046092909754e-05, + "loss": 0.3582, + "step": 11816 + }, + { + "epoch": 0.5189193936134011, + "grad_norm": 1.9140625, + "learning_rate": 2.36291381655969e-05, + "loss": 0.3479, + "step": 11818 + }, + { + "epoch": 0.5190072120926046, + "grad_norm": 1.8515625, + "learning_rate": 2.362223034326497e-05, + "loss": 0.3522, + "step": 11820 + }, + { + "epoch": 0.5190950305718081, + "grad_norm": 1.9609375, + "learning_rate": 2.3615322626442977e-05, + "loss": 0.3455, + "step": 11822 + }, + { + "epoch": 0.5191828490510115, + "grad_norm": 2.109375, + "learning_rate": 2.36084150156599e-05, + "loss": 0.3522, + "step": 11824 + }, + { + "epoch": 0.519270667530215, + "grad_norm": 1.8984375, + "learning_rate": 2.360150751144474e-05, + "loss": 0.3604, + "step": 11826 + }, + { + "epoch": 0.5193584860094186, + "grad_norm": 1.625, + "learning_rate": 2.3594600114326476e-05, + "loss": 0.3782, + "step": 11828 + }, + { + "epoch": 0.5194463044886221, + "grad_norm": 1.6015625, + "learning_rate": 2.3587692824834066e-05, + "loss": 0.3688, + "step": 11830 + }, + { + "epoch": 0.5195341229678255, + "grad_norm": 1.8984375, + "learning_rate": 2.358078564349648e-05, + "loss": 0.3603, + "step": 11832 + }, + { + "epoch": 0.519621941447029, + "grad_norm": 1.75, + "learning_rate": 2.3573878570842664e-05, + "loss": 0.3842, + "step": 11834 + }, + { + "epoch": 0.5197097599262325, + "grad_norm": 1.7890625, + "learning_rate": 2.3566971607401574e-05, + "loss": 0.3662, + "step": 11836 + }, + { + "epoch": 0.519797578405436, + "grad_norm": 1.8125, + "learning_rate": 2.3560064753702144e-05, + "loss": 0.3501, + "step": 11838 + }, + { + "epoch": 0.5198853968846394, + "grad_norm": 1.625, + "learning_rate": 2.3553158010273295e-05, + "loss": 0.3431, + "step": 11840 + }, + { + "epoch": 0.5199732153638429, + "grad_norm": 1.75, + "learning_rate": 2.3546251377643955e-05, + "loss": 0.3624, + "step": 11842 + }, + { + "epoch": 0.5200610338430465, + "grad_norm": 1.828125, + "learning_rate": 2.353934485634304e-05, + "loss": 0.3681, + "step": 11844 + }, + { + "epoch": 0.5201488523222499, + "grad_norm": 1.8828125, + "learning_rate": 2.3532438446899437e-05, + "loss": 0.3293, + "step": 11846 + }, + { + "epoch": 0.5202366708014534, + "grad_norm": 1.75, + "learning_rate": 2.3525532149842054e-05, + "loss": 0.351, + "step": 11848 + }, + { + "epoch": 0.5203244892806569, + "grad_norm": 1.6328125, + "learning_rate": 2.3518625965699767e-05, + "loss": 0.3449, + "step": 11850 + }, + { + "epoch": 0.5204123077598604, + "grad_norm": 1.8984375, + "learning_rate": 2.351171989500146e-05, + "loss": 0.3673, + "step": 11852 + }, + { + "epoch": 0.5205001262390638, + "grad_norm": 1.6875, + "learning_rate": 2.3504813938276005e-05, + "loss": 0.3409, + "step": 11854 + }, + { + "epoch": 0.5205879447182673, + "grad_norm": 1.8671875, + "learning_rate": 2.349790809605225e-05, + "loss": 0.3701, + "step": 11856 + }, + { + "epoch": 0.5206757631974708, + "grad_norm": 1.859375, + "learning_rate": 2.349100236885906e-05, + "loss": 0.3485, + "step": 11858 + }, + { + "epoch": 0.5207635816766744, + "grad_norm": 1.7265625, + "learning_rate": 2.3484096757225263e-05, + "loss": 0.3583, + "step": 11860 + }, + { + "epoch": 0.5208514001558778, + "grad_norm": 2.0625, + "learning_rate": 2.3477191261679704e-05, + "loss": 0.35, + "step": 11862 + }, + { + "epoch": 0.5209392186350813, + "grad_norm": 1.8125, + "learning_rate": 2.3470285882751205e-05, + "loss": 0.3593, + "step": 11864 + }, + { + "epoch": 0.5210270371142848, + "grad_norm": 1.8359375, + "learning_rate": 2.3463380620968576e-05, + "loss": 0.3636, + "step": 11866 + }, + { + "epoch": 0.5211148555934882, + "grad_norm": 1.75, + "learning_rate": 2.345647547686063e-05, + "loss": 0.3752, + "step": 11868 + }, + { + "epoch": 0.5212026740726917, + "grad_norm": 1.84375, + "learning_rate": 2.344957045095616e-05, + "loss": 0.3505, + "step": 11870 + }, + { + "epoch": 0.5212904925518952, + "grad_norm": 2.140625, + "learning_rate": 2.3442665543783957e-05, + "loss": 0.3737, + "step": 11872 + }, + { + "epoch": 0.5213783110310988, + "grad_norm": 2.09375, + "learning_rate": 2.343576075587281e-05, + "loss": 0.3533, + "step": 11874 + }, + { + "epoch": 0.5214661295103022, + "grad_norm": 1.8359375, + "learning_rate": 2.342885608775147e-05, + "loss": 0.3613, + "step": 11876 + }, + { + "epoch": 0.5215539479895057, + "grad_norm": 1.9140625, + "learning_rate": 2.342195153994871e-05, + "loss": 0.342, + "step": 11878 + }, + { + "epoch": 0.5216417664687092, + "grad_norm": 1.7421875, + "learning_rate": 2.3415047112993274e-05, + "loss": 0.3746, + "step": 11880 + }, + { + "epoch": 0.5217295849479127, + "grad_norm": 1.7734375, + "learning_rate": 2.340814280741391e-05, + "loss": 0.3362, + "step": 11882 + }, + { + "epoch": 0.5218174034271161, + "grad_norm": 1.8671875, + "learning_rate": 2.3401238623739354e-05, + "loss": 0.3482, + "step": 11884 + }, + { + "epoch": 0.5219052219063196, + "grad_norm": 1.6015625, + "learning_rate": 2.3394334562498328e-05, + "loss": 0.3655, + "step": 11886 + }, + { + "epoch": 0.5219930403855231, + "grad_norm": 1.609375, + "learning_rate": 2.3387430624219546e-05, + "loss": 0.3439, + "step": 11888 + }, + { + "epoch": 0.5220808588647267, + "grad_norm": 2.109375, + "learning_rate": 2.3380526809431717e-05, + "loss": 0.3549, + "step": 11890 + }, + { + "epoch": 0.5221686773439301, + "grad_norm": 1.8515625, + "learning_rate": 2.3373623118663528e-05, + "loss": 0.3444, + "step": 11892 + }, + { + "epoch": 0.5222564958231336, + "grad_norm": 1.796875, + "learning_rate": 2.336671955244367e-05, + "loss": 0.3373, + "step": 11894 + }, + { + "epoch": 0.5223443143023371, + "grad_norm": 1.734375, + "learning_rate": 2.335981611130082e-05, + "loss": 0.3576, + "step": 11896 + }, + { + "epoch": 0.5224321327815405, + "grad_norm": 2.046875, + "learning_rate": 2.3352912795763646e-05, + "loss": 0.3655, + "step": 11898 + }, + { + "epoch": 0.522519951260744, + "grad_norm": 2.0, + "learning_rate": 2.334600960636081e-05, + "loss": 0.376, + "step": 11900 + }, + { + "epoch": 0.5226077697399475, + "grad_norm": 1.84375, + "learning_rate": 2.333910654362095e-05, + "loss": 0.3281, + "step": 11902 + }, + { + "epoch": 0.522695588219151, + "grad_norm": 1.8203125, + "learning_rate": 2.333220360807271e-05, + "loss": 0.3497, + "step": 11904 + }, + { + "epoch": 0.5227834066983545, + "grad_norm": 1.828125, + "learning_rate": 2.3325300800244726e-05, + "loss": 0.3409, + "step": 11906 + }, + { + "epoch": 0.522871225177558, + "grad_norm": 1.890625, + "learning_rate": 2.3318398120665598e-05, + "loss": 0.3553, + "step": 11908 + }, + { + "epoch": 0.5229590436567615, + "grad_norm": 1.6875, + "learning_rate": 2.3311495569863945e-05, + "loss": 0.3339, + "step": 11910 + }, + { + "epoch": 0.523046862135965, + "grad_norm": 1.859375, + "learning_rate": 2.3304593148368366e-05, + "loss": 0.3571, + "step": 11912 + }, + { + "epoch": 0.5231346806151684, + "grad_norm": 1.8515625, + "learning_rate": 2.3297690856707458e-05, + "loss": 0.3476, + "step": 11914 + }, + { + "epoch": 0.5232224990943719, + "grad_norm": 1.8203125, + "learning_rate": 2.3290788695409785e-05, + "loss": 0.3489, + "step": 11916 + }, + { + "epoch": 0.5233103175735754, + "grad_norm": 1.6875, + "learning_rate": 2.3283886665003924e-05, + "loss": 0.3602, + "step": 11918 + }, + { + "epoch": 0.5233981360527789, + "grad_norm": 1.8671875, + "learning_rate": 2.327698476601843e-05, + "loss": 0.3553, + "step": 11920 + }, + { + "epoch": 0.5234859545319824, + "grad_norm": 1.7421875, + "learning_rate": 2.327008299898186e-05, + "loss": 0.3447, + "step": 11922 + }, + { + "epoch": 0.5235737730111859, + "grad_norm": 1.7578125, + "learning_rate": 2.3263181364422748e-05, + "loss": 0.3234, + "step": 11924 + }, + { + "epoch": 0.5236615914903894, + "grad_norm": 1.828125, + "learning_rate": 2.3256279862869626e-05, + "loss": 0.3697, + "step": 11926 + }, + { + "epoch": 0.5237494099695928, + "grad_norm": 1.7265625, + "learning_rate": 2.3249378494851008e-05, + "loss": 0.3523, + "step": 11928 + }, + { + "epoch": 0.5238372284487963, + "grad_norm": 1.8203125, + "learning_rate": 2.3242477260895404e-05, + "loss": 0.356, + "step": 11930 + }, + { + "epoch": 0.5239250469279998, + "grad_norm": 1.7109375, + "learning_rate": 2.323557616153131e-05, + "loss": 0.3551, + "step": 11932 + }, + { + "epoch": 0.5240128654072033, + "grad_norm": 1.8359375, + "learning_rate": 2.322867519728722e-05, + "loss": 0.3631, + "step": 11934 + }, + { + "epoch": 0.5241006838864068, + "grad_norm": 1.8046875, + "learning_rate": 2.3221774368691616e-05, + "loss": 0.34, + "step": 11936 + }, + { + "epoch": 0.5241885023656103, + "grad_norm": 1.578125, + "learning_rate": 2.3214873676272948e-05, + "loss": 0.3139, + "step": 11938 + }, + { + "epoch": 0.5242763208448138, + "grad_norm": 1.765625, + "learning_rate": 2.320797312055969e-05, + "loss": 0.3523, + "step": 11940 + }, + { + "epoch": 0.5243641393240173, + "grad_norm": 1.7734375, + "learning_rate": 2.3201072702080267e-05, + "loss": 0.3513, + "step": 11942 + }, + { + "epoch": 0.5244519578032207, + "grad_norm": 1.6015625, + "learning_rate": 2.3194172421363132e-05, + "loss": 0.3215, + "step": 11944 + }, + { + "epoch": 0.5245397762824242, + "grad_norm": 1.75, + "learning_rate": 2.3187272278936705e-05, + "loss": 0.3471, + "step": 11946 + }, + { + "epoch": 0.5246275947616277, + "grad_norm": 1.9921875, + "learning_rate": 2.3180372275329404e-05, + "loss": 0.3485, + "step": 11948 + }, + { + "epoch": 0.5247154132408312, + "grad_norm": 1.9140625, + "learning_rate": 2.317347241106963e-05, + "loss": 0.3585, + "step": 11950 + }, + { + "epoch": 0.5248032317200347, + "grad_norm": 1.65625, + "learning_rate": 2.316657268668578e-05, + "loss": 0.3414, + "step": 11952 + }, + { + "epoch": 0.5248910501992382, + "grad_norm": 1.96875, + "learning_rate": 2.315967310270623e-05, + "loss": 0.3344, + "step": 11954 + }, + { + "epoch": 0.5249788686784417, + "grad_norm": 2.1875, + "learning_rate": 2.315277365965935e-05, + "loss": 0.3659, + "step": 11956 + }, + { + "epoch": 0.5250666871576452, + "grad_norm": 2.046875, + "learning_rate": 2.314587435807351e-05, + "loss": 0.3661, + "step": 11958 + }, + { + "epoch": 0.5251545056368486, + "grad_norm": 2.1875, + "learning_rate": 2.3138975198477056e-05, + "loss": 0.3642, + "step": 11960 + }, + { + "epoch": 0.5252423241160521, + "grad_norm": 1.9140625, + "learning_rate": 2.3132076181398332e-05, + "loss": 0.3845, + "step": 11962 + }, + { + "epoch": 0.5253301425952556, + "grad_norm": 1.703125, + "learning_rate": 2.3125177307365658e-05, + "loss": 0.3446, + "step": 11964 + }, + { + "epoch": 0.525417961074459, + "grad_norm": 1.9296875, + "learning_rate": 2.3118278576907366e-05, + "loss": 0.3537, + "step": 11966 + }, + { + "epoch": 0.5255057795536626, + "grad_norm": 2.0625, + "learning_rate": 2.3111379990551736e-05, + "loss": 0.3397, + "step": 11968 + }, + { + "epoch": 0.5255935980328661, + "grad_norm": 1.6015625, + "learning_rate": 2.3104481548827084e-05, + "loss": 0.3742, + "step": 11970 + }, + { + "epoch": 0.5256814165120696, + "grad_norm": 1.828125, + "learning_rate": 2.309758325226169e-05, + "loss": 0.3405, + "step": 11972 + }, + { + "epoch": 0.525769234991273, + "grad_norm": 1.8828125, + "learning_rate": 2.3090685101383824e-05, + "loss": 0.333, + "step": 11974 + }, + { + "epoch": 0.5258570534704765, + "grad_norm": 1.7421875, + "learning_rate": 2.3083787096721756e-05, + "loss": 0.3498, + "step": 11976 + }, + { + "epoch": 0.52594487194968, + "grad_norm": 1.9765625, + "learning_rate": 2.3076889238803727e-05, + "loss": 0.3476, + "step": 11978 + }, + { + "epoch": 0.5260326904288835, + "grad_norm": 1.671875, + "learning_rate": 2.3069991528157982e-05, + "loss": 0.3544, + "step": 11980 + }, + { + "epoch": 0.526120508908087, + "grad_norm": 1.90625, + "learning_rate": 2.3063093965312747e-05, + "loss": 0.3416, + "step": 11982 + }, + { + "epoch": 0.5262083273872905, + "grad_norm": 1.75, + "learning_rate": 2.305619655079624e-05, + "loss": 0.3546, + "step": 11984 + }, + { + "epoch": 0.526296145866494, + "grad_norm": 1.984375, + "learning_rate": 2.3049299285136667e-05, + "loss": 0.3546, + "step": 11986 + }, + { + "epoch": 0.5263839643456975, + "grad_norm": 1.796875, + "learning_rate": 2.304240216886223e-05, + "loss": 0.3563, + "step": 11988 + }, + { + "epoch": 0.5264717828249009, + "grad_norm": 1.5546875, + "learning_rate": 2.3035505202501095e-05, + "loss": 0.3371, + "step": 11990 + }, + { + "epoch": 0.5265596013041044, + "grad_norm": 1.8046875, + "learning_rate": 2.3028608386581446e-05, + "loss": 0.3759, + "step": 11992 + }, + { + "epoch": 0.5266474197833079, + "grad_norm": 1.609375, + "learning_rate": 2.302171172163144e-05, + "loss": 0.3462, + "step": 11994 + }, + { + "epoch": 0.5267352382625113, + "grad_norm": 1.7890625, + "learning_rate": 2.301481520817922e-05, + "loss": 0.3657, + "step": 11996 + }, + { + "epoch": 0.5268230567417149, + "grad_norm": 1.65625, + "learning_rate": 2.300791884675294e-05, + "loss": 0.3392, + "step": 11998 + }, + { + "epoch": 0.5269108752209184, + "grad_norm": 1.640625, + "learning_rate": 2.3001022637880705e-05, + "loss": 0.3366, + "step": 12000 + }, + { + "epoch": 0.5269986937001219, + "grad_norm": 1.65625, + "learning_rate": 2.2994126582090635e-05, + "loss": 0.3486, + "step": 12002 + }, + { + "epoch": 0.5270865121793253, + "grad_norm": 1.71875, + "learning_rate": 2.298723067991083e-05, + "loss": 0.3687, + "step": 12004 + }, + { + "epoch": 0.5271743306585288, + "grad_norm": 1.7421875, + "learning_rate": 2.2980334931869386e-05, + "loss": 0.3494, + "step": 12006 + }, + { + "epoch": 0.5272621491377323, + "grad_norm": 1.8671875, + "learning_rate": 2.2973439338494372e-05, + "loss": 0.3455, + "step": 12008 + }, + { + "epoch": 0.5273499676169358, + "grad_norm": 1.6484375, + "learning_rate": 2.296654390031386e-05, + "loss": 0.3771, + "step": 12010 + }, + { + "epoch": 0.5274377860961392, + "grad_norm": 1.671875, + "learning_rate": 2.295964861785591e-05, + "loss": 0.3492, + "step": 12012 + }, + { + "epoch": 0.5275256045753428, + "grad_norm": 1.7890625, + "learning_rate": 2.295275349164855e-05, + "loss": 0.3332, + "step": 12014 + }, + { + "epoch": 0.5276134230545463, + "grad_norm": 1.6953125, + "learning_rate": 2.2945858522219822e-05, + "loss": 0.3673, + "step": 12016 + }, + { + "epoch": 0.5277012415337498, + "grad_norm": 1.734375, + "learning_rate": 2.293896371009774e-05, + "loss": 0.3439, + "step": 12018 + }, + { + "epoch": 0.5277890600129532, + "grad_norm": 1.671875, + "learning_rate": 2.293206905581031e-05, + "loss": 0.3268, + "step": 12020 + }, + { + "epoch": 0.5278768784921567, + "grad_norm": 1.7265625, + "learning_rate": 2.2925174559885526e-05, + "loss": 0.3308, + "step": 12022 + }, + { + "epoch": 0.5279646969713602, + "grad_norm": 2.0, + "learning_rate": 2.291828022285138e-05, + "loss": 0.3547, + "step": 12024 + }, + { + "epoch": 0.5280525154505636, + "grad_norm": 1.703125, + "learning_rate": 2.2911386045235826e-05, + "loss": 0.3416, + "step": 12026 + }, + { + "epoch": 0.5281403339297672, + "grad_norm": 2.171875, + "learning_rate": 2.290449202756684e-05, + "loss": 0.3571, + "step": 12028 + }, + { + "epoch": 0.5282281524089707, + "grad_norm": 2.0625, + "learning_rate": 2.2897598170372346e-05, + "loss": 0.3452, + "step": 12030 + }, + { + "epoch": 0.5283159708881742, + "grad_norm": 1.7578125, + "learning_rate": 2.289070447418029e-05, + "loss": 0.3385, + "step": 12032 + }, + { + "epoch": 0.5284037893673776, + "grad_norm": 1.6796875, + "learning_rate": 2.288381093951859e-05, + "loss": 0.3539, + "step": 12034 + }, + { + "epoch": 0.5284916078465811, + "grad_norm": 1.953125, + "learning_rate": 2.2876917566915158e-05, + "loss": 0.3663, + "step": 12036 + }, + { + "epoch": 0.5285794263257846, + "grad_norm": 1.640625, + "learning_rate": 2.2870024356897887e-05, + "loss": 0.3517, + "step": 12038 + }, + { + "epoch": 0.5286672448049881, + "grad_norm": 1.765625, + "learning_rate": 2.286313130999466e-05, + "loss": 0.321, + "step": 12040 + }, + { + "epoch": 0.5287550632841915, + "grad_norm": 2.09375, + "learning_rate": 2.285623842673335e-05, + "loss": 0.3667, + "step": 12042 + }, + { + "epoch": 0.5288428817633951, + "grad_norm": 1.6640625, + "learning_rate": 2.2849345707641815e-05, + "loss": 0.3561, + "step": 12044 + }, + { + "epoch": 0.5289307002425986, + "grad_norm": 1.71875, + "learning_rate": 2.28424531532479e-05, + "loss": 0.3227, + "step": 12046 + }, + { + "epoch": 0.5290185187218021, + "grad_norm": 1.7578125, + "learning_rate": 2.2835560764079446e-05, + "loss": 0.3416, + "step": 12048 + }, + { + "epoch": 0.5291063372010055, + "grad_norm": 1.8828125, + "learning_rate": 2.282866854066426e-05, + "loss": 0.3529, + "step": 12050 + }, + { + "epoch": 0.529194155680209, + "grad_norm": 2.125, + "learning_rate": 2.282177648353016e-05, + "loss": 0.3454, + "step": 12052 + }, + { + "epoch": 0.5292819741594125, + "grad_norm": 1.7578125, + "learning_rate": 2.281488459320494e-05, + "loss": 0.3768, + "step": 12054 + }, + { + "epoch": 0.5293697926386159, + "grad_norm": 1.8828125, + "learning_rate": 2.280799287021638e-05, + "loss": 0.3357, + "step": 12056 + }, + { + "epoch": 0.5294576111178194, + "grad_norm": 1.796875, + "learning_rate": 2.280110131509226e-05, + "loss": 0.3533, + "step": 12058 + }, + { + "epoch": 0.529545429597023, + "grad_norm": 1.671875, + "learning_rate": 2.2794209928360322e-05, + "loss": 0.3469, + "step": 12060 + }, + { + "epoch": 0.5296332480762265, + "grad_norm": 1.875, + "learning_rate": 2.2787318710548313e-05, + "loss": 0.3259, + "step": 12062 + }, + { + "epoch": 0.5297210665554299, + "grad_norm": 1.890625, + "learning_rate": 2.2780427662183975e-05, + "loss": 0.3613, + "step": 12064 + }, + { + "epoch": 0.5298088850346334, + "grad_norm": 1.859375, + "learning_rate": 2.2773536783795012e-05, + "loss": 0.3602, + "step": 12066 + }, + { + "epoch": 0.5298967035138369, + "grad_norm": 1.6015625, + "learning_rate": 2.2766646075909137e-05, + "loss": 0.3444, + "step": 12068 + }, + { + "epoch": 0.5299845219930404, + "grad_norm": 1.8046875, + "learning_rate": 2.275975553905404e-05, + "loss": 0.3097, + "step": 12070 + }, + { + "epoch": 0.5300723404722438, + "grad_norm": 1.734375, + "learning_rate": 2.2752865173757403e-05, + "loss": 0.3388, + "step": 12072 + }, + { + "epoch": 0.5301601589514474, + "grad_norm": 1.78125, + "learning_rate": 2.2745974980546893e-05, + "loss": 0.3593, + "step": 12074 + }, + { + "epoch": 0.5302479774306509, + "grad_norm": 1.6484375, + "learning_rate": 2.273908495995015e-05, + "loss": 0.3607, + "step": 12076 + }, + { + "epoch": 0.5303357959098544, + "grad_norm": 1.75, + "learning_rate": 2.273219511249483e-05, + "loss": 0.3437, + "step": 12078 + }, + { + "epoch": 0.5304236143890578, + "grad_norm": 1.703125, + "learning_rate": 2.272530543870855e-05, + "loss": 0.3302, + "step": 12080 + }, + { + "epoch": 0.5305114328682613, + "grad_norm": 1.84375, + "learning_rate": 2.2718415939118924e-05, + "loss": 0.3521, + "step": 12082 + }, + { + "epoch": 0.5305992513474648, + "grad_norm": 1.859375, + "learning_rate": 2.271152661425355e-05, + "loss": 0.3512, + "step": 12084 + }, + { + "epoch": 0.5306870698266682, + "grad_norm": 1.6640625, + "learning_rate": 2.2704637464640026e-05, + "loss": 0.3374, + "step": 12086 + }, + { + "epoch": 0.5307748883058717, + "grad_norm": 1.859375, + "learning_rate": 2.2697748490805905e-05, + "loss": 0.3573, + "step": 12088 + }, + { + "epoch": 0.5308627067850753, + "grad_norm": 1.5625, + "learning_rate": 2.2690859693278772e-05, + "loss": 0.3552, + "step": 12090 + }, + { + "epoch": 0.5309505252642788, + "grad_norm": 1.8671875, + "learning_rate": 2.2683971072586145e-05, + "loss": 0.3585, + "step": 12092 + }, + { + "epoch": 0.5310383437434822, + "grad_norm": 1.828125, + "learning_rate": 2.2677082629255566e-05, + "loss": 0.3361, + "step": 12094 + }, + { + "epoch": 0.5311261622226857, + "grad_norm": 1.734375, + "learning_rate": 2.2670194363814558e-05, + "loss": 0.3917, + "step": 12096 + }, + { + "epoch": 0.5312139807018892, + "grad_norm": 1.6484375, + "learning_rate": 2.2663306276790625e-05, + "loss": 0.3448, + "step": 12098 + }, + { + "epoch": 0.5313017991810927, + "grad_norm": 1.7109375, + "learning_rate": 2.265641836871126e-05, + "loss": 0.3332, + "step": 12100 + }, + { + "epoch": 0.5313896176602961, + "grad_norm": 1.765625, + "learning_rate": 2.2649530640103936e-05, + "loss": 0.3468, + "step": 12102 + }, + { + "epoch": 0.5314774361394996, + "grad_norm": 1.8671875, + "learning_rate": 2.264264309149612e-05, + "loss": 0.362, + "step": 12104 + }, + { + "epoch": 0.5315652546187032, + "grad_norm": 1.7734375, + "learning_rate": 2.2635755723415256e-05, + "loss": 0.3369, + "step": 12106 + }, + { + "epoch": 0.5316530730979067, + "grad_norm": 1.9375, + "learning_rate": 2.262886853638879e-05, + "loss": 0.3461, + "step": 12108 + }, + { + "epoch": 0.5317408915771101, + "grad_norm": 1.625, + "learning_rate": 2.262198153094414e-05, + "loss": 0.3503, + "step": 12110 + }, + { + "epoch": 0.5318287100563136, + "grad_norm": 1.8359375, + "learning_rate": 2.2615094707608715e-05, + "loss": 0.3132, + "step": 12112 + }, + { + "epoch": 0.5319165285355171, + "grad_norm": 1.8359375, + "learning_rate": 2.2608208066909904e-05, + "loss": 0.3519, + "step": 12114 + }, + { + "epoch": 0.5320043470147205, + "grad_norm": 2.046875, + "learning_rate": 2.2601321609375097e-05, + "loss": 0.3466, + "step": 12116 + }, + { + "epoch": 0.532092165493924, + "grad_norm": 2.078125, + "learning_rate": 2.2594435335531655e-05, + "loss": 0.3607, + "step": 12118 + }, + { + "epoch": 0.5321799839731275, + "grad_norm": 1.5859375, + "learning_rate": 2.258754924590694e-05, + "loss": 0.3258, + "step": 12120 + }, + { + "epoch": 0.5322678024523311, + "grad_norm": 1.859375, + "learning_rate": 2.2580663341028273e-05, + "loss": 0.3427, + "step": 12122 + }, + { + "epoch": 0.5323556209315345, + "grad_norm": 1.7734375, + "learning_rate": 2.2573777621422985e-05, + "loss": 0.3639, + "step": 12124 + }, + { + "epoch": 0.532443439410738, + "grad_norm": 1.9765625, + "learning_rate": 2.25668920876184e-05, + "loss": 0.3382, + "step": 12126 + }, + { + "epoch": 0.5325312578899415, + "grad_norm": 1.734375, + "learning_rate": 2.256000674014179e-05, + "loss": 0.3362, + "step": 12128 + }, + { + "epoch": 0.532619076369145, + "grad_norm": 1.78125, + "learning_rate": 2.2553121579520454e-05, + "loss": 0.3644, + "step": 12130 + }, + { + "epoch": 0.5327068948483484, + "grad_norm": 1.7578125, + "learning_rate": 2.254623660628165e-05, + "loss": 0.3392, + "step": 12132 + }, + { + "epoch": 0.5327947133275519, + "grad_norm": 1.6875, + "learning_rate": 2.2539351820952636e-05, + "loss": 0.3422, + "step": 12134 + }, + { + "epoch": 0.5328825318067555, + "grad_norm": 1.890625, + "learning_rate": 2.2532467224060656e-05, + "loss": 0.3382, + "step": 12136 + }, + { + "epoch": 0.532970350285959, + "grad_norm": 1.7265625, + "learning_rate": 2.2525582816132922e-05, + "loss": 0.3704, + "step": 12138 + }, + { + "epoch": 0.5330581687651624, + "grad_norm": 1.890625, + "learning_rate": 2.2518698597696646e-05, + "loss": 0.3222, + "step": 12140 + }, + { + "epoch": 0.5331459872443659, + "grad_norm": 1.640625, + "learning_rate": 2.251181456927903e-05, + "loss": 0.3759, + "step": 12142 + }, + { + "epoch": 0.5332338057235694, + "grad_norm": 1.8671875, + "learning_rate": 2.2504930731407252e-05, + "loss": 0.3332, + "step": 12144 + }, + { + "epoch": 0.5333216242027728, + "grad_norm": 1.75, + "learning_rate": 2.2498047084608477e-05, + "loss": 0.343, + "step": 12146 + }, + { + "epoch": 0.5334094426819763, + "grad_norm": 1.6953125, + "learning_rate": 2.2491163629409852e-05, + "loss": 0.3348, + "step": 12148 + }, + { + "epoch": 0.5334972611611798, + "grad_norm": 1.6640625, + "learning_rate": 2.2484280366338527e-05, + "loss": 0.3569, + "step": 12150 + }, + { + "epoch": 0.5335850796403834, + "grad_norm": 1.71875, + "learning_rate": 2.247739729592161e-05, + "loss": 0.357, + "step": 12152 + }, + { + "epoch": 0.5336728981195868, + "grad_norm": 1.6875, + "learning_rate": 2.247051441868621e-05, + "loss": 0.3752, + "step": 12154 + }, + { + "epoch": 0.5337607165987903, + "grad_norm": 1.6328125, + "learning_rate": 2.2463631735159422e-05, + "loss": 0.3608, + "step": 12156 + }, + { + "epoch": 0.5338485350779938, + "grad_norm": 1.5546875, + "learning_rate": 2.2456749245868323e-05, + "loss": 0.3554, + "step": 12158 + }, + { + "epoch": 0.5339363535571973, + "grad_norm": 1.6328125, + "learning_rate": 2.2449866951339976e-05, + "loss": 0.3527, + "step": 12160 + }, + { + "epoch": 0.5340241720364007, + "grad_norm": 1.8671875, + "learning_rate": 2.2442984852101435e-05, + "loss": 0.3513, + "step": 12162 + }, + { + "epoch": 0.5341119905156042, + "grad_norm": 1.546875, + "learning_rate": 2.2436102948679722e-05, + "loss": 0.3651, + "step": 12164 + }, + { + "epoch": 0.5341998089948077, + "grad_norm": 1.671875, + "learning_rate": 2.242922124160186e-05, + "loss": 0.3336, + "step": 12166 + }, + { + "epoch": 0.5342876274740113, + "grad_norm": 1.859375, + "learning_rate": 2.2422339731394848e-05, + "loss": 0.3534, + "step": 12168 + }, + { + "epoch": 0.5343754459532147, + "grad_norm": 1.8125, + "learning_rate": 2.2415458418585682e-05, + "loss": 0.3558, + "step": 12170 + }, + { + "epoch": 0.5344632644324182, + "grad_norm": 1.8671875, + "learning_rate": 2.2408577303701334e-05, + "loss": 0.3285, + "step": 12172 + }, + { + "epoch": 0.5345510829116217, + "grad_norm": 1.9453125, + "learning_rate": 2.240169638726875e-05, + "loss": 0.3693, + "step": 12174 + }, + { + "epoch": 0.5346389013908251, + "grad_norm": 1.8515625, + "learning_rate": 2.239481566981488e-05, + "loss": 0.3854, + "step": 12176 + }, + { + "epoch": 0.5347267198700286, + "grad_norm": 1.7578125, + "learning_rate": 2.2387935151866653e-05, + "loss": 0.3369, + "step": 12178 + }, + { + "epoch": 0.5348145383492321, + "grad_norm": 1.9609375, + "learning_rate": 2.2381054833950976e-05, + "loss": 0.3574, + "step": 12180 + }, + { + "epoch": 0.5349023568284357, + "grad_norm": 1.8671875, + "learning_rate": 2.2374174716594758e-05, + "loss": 0.3278, + "step": 12182 + }, + { + "epoch": 0.5349901753076391, + "grad_norm": 1.6484375, + "learning_rate": 2.2367294800324862e-05, + "loss": 0.3384, + "step": 12184 + }, + { + "epoch": 0.5350779937868426, + "grad_norm": 1.8671875, + "learning_rate": 2.236041508566816e-05, + "loss": 0.3491, + "step": 12186 + }, + { + "epoch": 0.5351658122660461, + "grad_norm": 1.734375, + "learning_rate": 2.2353535573151506e-05, + "loss": 0.3535, + "step": 12188 + }, + { + "epoch": 0.5352536307452496, + "grad_norm": 1.84375, + "learning_rate": 2.234665626330173e-05, + "loss": 0.3765, + "step": 12190 + }, + { + "epoch": 0.535341449224453, + "grad_norm": 1.765625, + "learning_rate": 2.233977715664565e-05, + "loss": 0.3506, + "step": 12192 + }, + { + "epoch": 0.5354292677036565, + "grad_norm": 1.578125, + "learning_rate": 2.2332898253710077e-05, + "loss": 0.3636, + "step": 12194 + }, + { + "epoch": 0.53551708618286, + "grad_norm": 1.6796875, + "learning_rate": 2.2326019555021793e-05, + "loss": 0.331, + "step": 12196 + }, + { + "epoch": 0.5356049046620636, + "grad_norm": 1.6484375, + "learning_rate": 2.231914106110758e-05, + "loss": 0.3456, + "step": 12198 + }, + { + "epoch": 0.535692723141267, + "grad_norm": 1.8203125, + "learning_rate": 2.231226277249418e-05, + "loss": 0.3505, + "step": 12200 + }, + { + "epoch": 0.5357805416204705, + "grad_norm": 1.90625, + "learning_rate": 2.2305384689708342e-05, + "loss": 0.3472, + "step": 12202 + }, + { + "epoch": 0.535868360099674, + "grad_norm": 1.84375, + "learning_rate": 2.229850681327679e-05, + "loss": 0.3306, + "step": 12204 + }, + { + "epoch": 0.5359561785788775, + "grad_norm": 1.9375, + "learning_rate": 2.2291629143726232e-05, + "loss": 0.3345, + "step": 12206 + }, + { + "epoch": 0.5360439970580809, + "grad_norm": 1.6328125, + "learning_rate": 2.228475168158337e-05, + "loss": 0.3393, + "step": 12208 + }, + { + "epoch": 0.5361318155372844, + "grad_norm": 1.671875, + "learning_rate": 2.227787442737487e-05, + "loss": 0.327, + "step": 12210 + }, + { + "epoch": 0.5362196340164879, + "grad_norm": 1.8828125, + "learning_rate": 2.2270997381627407e-05, + "loss": 0.3662, + "step": 12212 + }, + { + "epoch": 0.5363074524956915, + "grad_norm": 1.6640625, + "learning_rate": 2.2264120544867615e-05, + "loss": 0.365, + "step": 12214 + }, + { + "epoch": 0.5363952709748949, + "grad_norm": 1.7265625, + "learning_rate": 2.2257243917622124e-05, + "loss": 0.3556, + "step": 12216 + }, + { + "epoch": 0.5364830894540984, + "grad_norm": 1.6875, + "learning_rate": 2.2250367500417547e-05, + "loss": 0.3615, + "step": 12218 + }, + { + "epoch": 0.5365709079333019, + "grad_norm": 2.53125, + "learning_rate": 2.2243491293780493e-05, + "loss": 0.3571, + "step": 12220 + }, + { + "epoch": 0.5366587264125053, + "grad_norm": 1.6875, + "learning_rate": 2.223661529823753e-05, + "loss": 0.3442, + "step": 12222 + }, + { + "epoch": 0.5367465448917088, + "grad_norm": 1.984375, + "learning_rate": 2.222973951431524e-05, + "loss": 0.3453, + "step": 12224 + }, + { + "epoch": 0.5368343633709123, + "grad_norm": 1.796875, + "learning_rate": 2.2222863942540153e-05, + "loss": 0.3537, + "step": 12226 + }, + { + "epoch": 0.5369221818501159, + "grad_norm": 1.8046875, + "learning_rate": 2.2215988583438814e-05, + "loss": 0.3566, + "step": 12228 + }, + { + "epoch": 0.5370100003293193, + "grad_norm": 1.796875, + "learning_rate": 2.2209113437537738e-05, + "loss": 0.3607, + "step": 12230 + }, + { + "epoch": 0.5370978188085228, + "grad_norm": 1.75, + "learning_rate": 2.220223850536342e-05, + "loss": 0.3418, + "step": 12232 + }, + { + "epoch": 0.5371856372877263, + "grad_norm": 1.625, + "learning_rate": 2.2195363787442356e-05, + "loss": 0.3311, + "step": 12234 + }, + { + "epoch": 0.5372734557669298, + "grad_norm": 1.8984375, + "learning_rate": 2.2188489284300997e-05, + "loss": 0.3293, + "step": 12236 + }, + { + "epoch": 0.5373612742461332, + "grad_norm": 1.671875, + "learning_rate": 2.2181614996465805e-05, + "loss": 0.3569, + "step": 12238 + }, + { + "epoch": 0.5374490927253367, + "grad_norm": 2.015625, + "learning_rate": 2.217474092446321e-05, + "loss": 0.3676, + "step": 12240 + }, + { + "epoch": 0.5375369112045402, + "grad_norm": 1.78125, + "learning_rate": 2.2167867068819646e-05, + "loss": 0.3687, + "step": 12242 + }, + { + "epoch": 0.5376247296837438, + "grad_norm": 1.7421875, + "learning_rate": 2.2160993430061488e-05, + "loss": 0.3539, + "step": 12244 + }, + { + "epoch": 0.5377125481629472, + "grad_norm": 1.703125, + "learning_rate": 2.2154120008715135e-05, + "loss": 0.3229, + "step": 12246 + }, + { + "epoch": 0.5378003666421507, + "grad_norm": 1.8203125, + "learning_rate": 2.2147246805306955e-05, + "loss": 0.3524, + "step": 12248 + }, + { + "epoch": 0.5378881851213542, + "grad_norm": 1.921875, + "learning_rate": 2.21403738203633e-05, + "loss": 0.3487, + "step": 12250 + }, + { + "epoch": 0.5379760036005576, + "grad_norm": 1.8125, + "learning_rate": 2.2133501054410503e-05, + "loss": 0.3325, + "step": 12252 + }, + { + "epoch": 0.5380638220797611, + "grad_norm": 1.6875, + "learning_rate": 2.2126628507974877e-05, + "loss": 0.315, + "step": 12254 + }, + { + "epoch": 0.5381516405589646, + "grad_norm": 1.7421875, + "learning_rate": 2.2119756181582733e-05, + "loss": 0.3404, + "step": 12256 + }, + { + "epoch": 0.538239459038168, + "grad_norm": 1.6484375, + "learning_rate": 2.2112884075760347e-05, + "loss": 0.3195, + "step": 12258 + }, + { + "epoch": 0.5383272775173716, + "grad_norm": 1.75, + "learning_rate": 2.2106012191033998e-05, + "loss": 0.3416, + "step": 12260 + }, + { + "epoch": 0.5384150959965751, + "grad_norm": 1.6875, + "learning_rate": 2.2099140527929926e-05, + "loss": 0.3675, + "step": 12262 + }, + { + "epoch": 0.5385029144757786, + "grad_norm": 1.6015625, + "learning_rate": 2.2092269086974367e-05, + "loss": 0.3592, + "step": 12264 + }, + { + "epoch": 0.538590732954982, + "grad_norm": 1.5859375, + "learning_rate": 2.2085397868693537e-05, + "loss": 0.3565, + "step": 12266 + }, + { + "epoch": 0.5386785514341855, + "grad_norm": 1.875, + "learning_rate": 2.2078526873613637e-05, + "loss": 0.3504, + "step": 12268 + }, + { + "epoch": 0.538766369913389, + "grad_norm": 1.859375, + "learning_rate": 2.207165610226085e-05, + "loss": 0.3347, + "step": 12270 + }, + { + "epoch": 0.5388541883925925, + "grad_norm": 1.71875, + "learning_rate": 2.2064785555161343e-05, + "loss": 0.3376, + "step": 12272 + }, + { + "epoch": 0.5389420068717959, + "grad_norm": 2.015625, + "learning_rate": 2.205791523284127e-05, + "loss": 0.366, + "step": 12274 + }, + { + "epoch": 0.5390298253509995, + "grad_norm": 2.3125, + "learning_rate": 2.2051045135826743e-05, + "loss": 0.3163, + "step": 12276 + }, + { + "epoch": 0.539117643830203, + "grad_norm": 1.6953125, + "learning_rate": 2.2044175264643884e-05, + "loss": 0.3666, + "step": 12278 + }, + { + "epoch": 0.5392054623094065, + "grad_norm": 1.8671875, + "learning_rate": 2.2037305619818792e-05, + "loss": 0.3586, + "step": 12280 + }, + { + "epoch": 0.5392932807886099, + "grad_norm": 2.25, + "learning_rate": 2.203043620187755e-05, + "loss": 0.3081, + "step": 12282 + }, + { + "epoch": 0.5393810992678134, + "grad_norm": 1.984375, + "learning_rate": 2.2023567011346213e-05, + "loss": 0.3113, + "step": 12284 + }, + { + "epoch": 0.5394689177470169, + "grad_norm": 2.140625, + "learning_rate": 2.2016698048750832e-05, + "loss": 0.339, + "step": 12286 + }, + { + "epoch": 0.5395567362262204, + "grad_norm": 1.6953125, + "learning_rate": 2.200982931461743e-05, + "loss": 0.3853, + "step": 12288 + }, + { + "epoch": 0.5396445547054239, + "grad_norm": 2.09375, + "learning_rate": 2.200296080947201e-05, + "loss": 0.3577, + "step": 12290 + }, + { + "epoch": 0.5397323731846274, + "grad_norm": 1.6484375, + "learning_rate": 2.199609253384057e-05, + "loss": 0.3771, + "step": 12292 + }, + { + "epoch": 0.5398201916638309, + "grad_norm": 1.71875, + "learning_rate": 2.198922448824908e-05, + "loss": 0.342, + "step": 12294 + }, + { + "epoch": 0.5399080101430344, + "grad_norm": 1.9765625, + "learning_rate": 2.1982356673223516e-05, + "loss": 0.3599, + "step": 12296 + }, + { + "epoch": 0.5399958286222378, + "grad_norm": 1.7890625, + "learning_rate": 2.1975489089289788e-05, + "loss": 0.3807, + "step": 12298 + }, + { + "epoch": 0.5400836471014413, + "grad_norm": 1.75, + "learning_rate": 2.1968621736973836e-05, + "loss": 0.3404, + "step": 12300 + }, + { + "epoch": 0.5401714655806448, + "grad_norm": 1.765625, + "learning_rate": 2.1961754616801553e-05, + "loss": 0.3492, + "step": 12302 + }, + { + "epoch": 0.5402592840598482, + "grad_norm": 1.5703125, + "learning_rate": 2.1954887729298844e-05, + "loss": 0.3583, + "step": 12304 + }, + { + "epoch": 0.5403471025390518, + "grad_norm": 1.65625, + "learning_rate": 2.1948021074991552e-05, + "loss": 0.3626, + "step": 12306 + }, + { + "epoch": 0.5404349210182553, + "grad_norm": 2.015625, + "learning_rate": 2.1941154654405536e-05, + "loss": 0.3354, + "step": 12308 + }, + { + "epoch": 0.5405227394974588, + "grad_norm": 1.6796875, + "learning_rate": 2.1934288468066627e-05, + "loss": 0.3288, + "step": 12310 + }, + { + "epoch": 0.5406105579766622, + "grad_norm": 1.6484375, + "learning_rate": 2.192742251650065e-05, + "loss": 0.3443, + "step": 12312 + }, + { + "epoch": 0.5406983764558657, + "grad_norm": 1.8671875, + "learning_rate": 2.1920556800233392e-05, + "loss": 0.3248, + "step": 12314 + }, + { + "epoch": 0.5407861949350692, + "grad_norm": 1.75, + "learning_rate": 2.1913691319790627e-05, + "loss": 0.3393, + "step": 12316 + }, + { + "epoch": 0.5408740134142727, + "grad_norm": 1.640625, + "learning_rate": 2.190682607569812e-05, + "loss": 0.3671, + "step": 12318 + }, + { + "epoch": 0.5409618318934761, + "grad_norm": 1.6796875, + "learning_rate": 2.189996106848162e-05, + "loss": 0.3408, + "step": 12320 + }, + { + "epoch": 0.5410496503726797, + "grad_norm": 1.7265625, + "learning_rate": 2.1893096298666844e-05, + "loss": 0.3222, + "step": 12322 + }, + { + "epoch": 0.5411374688518832, + "grad_norm": 1.7109375, + "learning_rate": 2.1886231766779495e-05, + "loss": 0.3502, + "step": 12324 + }, + { + "epoch": 0.5412252873310867, + "grad_norm": 1.6875, + "learning_rate": 2.1879367473345263e-05, + "loss": 0.3585, + "step": 12326 + }, + { + "epoch": 0.5413131058102901, + "grad_norm": 1.640625, + "learning_rate": 2.187250341888982e-05, + "loss": 0.3369, + "step": 12328 + }, + { + "epoch": 0.5414009242894936, + "grad_norm": 1.828125, + "learning_rate": 2.186563960393881e-05, + "loss": 0.3656, + "step": 12330 + }, + { + "epoch": 0.5414887427686971, + "grad_norm": 1.78125, + "learning_rate": 2.185877602901788e-05, + "loss": 0.3492, + "step": 12332 + }, + { + "epoch": 0.5415765612479005, + "grad_norm": 1.75, + "learning_rate": 2.1851912694652634e-05, + "loss": 0.3264, + "step": 12334 + }, + { + "epoch": 0.5416643797271041, + "grad_norm": 1.71875, + "learning_rate": 2.184504960136867e-05, + "loss": 0.3433, + "step": 12336 + }, + { + "epoch": 0.5417521982063076, + "grad_norm": 1.859375, + "learning_rate": 2.1838186749691557e-05, + "loss": 0.3419, + "step": 12338 + }, + { + "epoch": 0.5418400166855111, + "grad_norm": 2.03125, + "learning_rate": 2.183132414014686e-05, + "loss": 0.3602, + "step": 12340 + }, + { + "epoch": 0.5419278351647145, + "grad_norm": 1.7890625, + "learning_rate": 2.1824461773260122e-05, + "loss": 0.3398, + "step": 12342 + }, + { + "epoch": 0.542015653643918, + "grad_norm": 1.671875, + "learning_rate": 2.1817599649556858e-05, + "loss": 0.3386, + "step": 12344 + }, + { + "epoch": 0.5421034721231215, + "grad_norm": 1.671875, + "learning_rate": 2.181073776956258e-05, + "loss": 0.3365, + "step": 12346 + }, + { + "epoch": 0.542191290602325, + "grad_norm": 1.9453125, + "learning_rate": 2.1803876133802773e-05, + "loss": 0.316, + "step": 12348 + }, + { + "epoch": 0.5422791090815284, + "grad_norm": 1.9140625, + "learning_rate": 2.179701474280289e-05, + "loss": 0.357, + "step": 12350 + }, + { + "epoch": 0.542366927560732, + "grad_norm": 1.9375, + "learning_rate": 2.179015359708839e-05, + "loss": 0.3212, + "step": 12352 + }, + { + "epoch": 0.5424547460399355, + "grad_norm": 1.6953125, + "learning_rate": 2.1783292697184694e-05, + "loss": 0.3282, + "step": 12354 + }, + { + "epoch": 0.542542564519139, + "grad_norm": 1.890625, + "learning_rate": 2.1776432043617214e-05, + "loss": 0.3623, + "step": 12356 + }, + { + "epoch": 0.5426303829983424, + "grad_norm": 1.7578125, + "learning_rate": 2.1769571636911347e-05, + "loss": 0.3273, + "step": 12358 + }, + { + "epoch": 0.5427182014775459, + "grad_norm": 1.6171875, + "learning_rate": 2.1762711477592454e-05, + "loss": 0.3147, + "step": 12360 + }, + { + "epoch": 0.5428060199567494, + "grad_norm": 1.9140625, + "learning_rate": 2.1755851566185888e-05, + "loss": 0.3477, + "step": 12362 + }, + { + "epoch": 0.5428938384359528, + "grad_norm": 2.125, + "learning_rate": 2.174899190321699e-05, + "loss": 0.3381, + "step": 12364 + }, + { + "epoch": 0.5429816569151563, + "grad_norm": 1.7578125, + "learning_rate": 2.1742132489211082e-05, + "loss": 0.35, + "step": 12366 + }, + { + "epoch": 0.5430694753943599, + "grad_norm": 1.6328125, + "learning_rate": 2.173527332469344e-05, + "loss": 0.3457, + "step": 12368 + }, + { + "epoch": 0.5431572938735634, + "grad_norm": 1.8515625, + "learning_rate": 2.1728414410189347e-05, + "loss": 0.3632, + "step": 12370 + }, + { + "epoch": 0.5432451123527668, + "grad_norm": 1.859375, + "learning_rate": 2.172155574622407e-05, + "loss": 0.3292, + "step": 12372 + }, + { + "epoch": 0.5433329308319703, + "grad_norm": 1.734375, + "learning_rate": 2.1714697333322832e-05, + "loss": 0.3531, + "step": 12374 + }, + { + "epoch": 0.5434207493111738, + "grad_norm": 1.828125, + "learning_rate": 2.1707839172010862e-05, + "loss": 0.3582, + "step": 12376 + }, + { + "epoch": 0.5435085677903773, + "grad_norm": 1.703125, + "learning_rate": 2.1700981262813357e-05, + "loss": 0.3639, + "step": 12378 + }, + { + "epoch": 0.5435963862695807, + "grad_norm": 1.734375, + "learning_rate": 2.1694123606255496e-05, + "loss": 0.3343, + "step": 12380 + }, + { + "epoch": 0.5436842047487843, + "grad_norm": 1.8046875, + "learning_rate": 2.1687266202862445e-05, + "loss": 0.3665, + "step": 12382 + }, + { + "epoch": 0.5437720232279878, + "grad_norm": 1.75, + "learning_rate": 2.1680409053159345e-05, + "loss": 0.3628, + "step": 12384 + }, + { + "epoch": 0.5438598417071913, + "grad_norm": 1.703125, + "learning_rate": 2.1673552157671308e-05, + "loss": 0.3443, + "step": 12386 + }, + { + "epoch": 0.5439476601863947, + "grad_norm": 1.640625, + "learning_rate": 2.1666695516923445e-05, + "loss": 0.3487, + "step": 12388 + }, + { + "epoch": 0.5440354786655982, + "grad_norm": 1.7421875, + "learning_rate": 2.1659839131440844e-05, + "loss": 0.3419, + "step": 12390 + }, + { + "epoch": 0.5441232971448017, + "grad_norm": 1.65625, + "learning_rate": 2.1652983001748558e-05, + "loss": 0.3457, + "step": 12392 + }, + { + "epoch": 0.5442111156240051, + "grad_norm": 1.796875, + "learning_rate": 2.1646127128371644e-05, + "loss": 0.3612, + "step": 12394 + }, + { + "epoch": 0.5442989341032086, + "grad_norm": 1.6484375, + "learning_rate": 2.1639271511835117e-05, + "loss": 0.3232, + "step": 12396 + }, + { + "epoch": 0.5443867525824122, + "grad_norm": 1.8359375, + "learning_rate": 2.163241615266398e-05, + "loss": 0.3363, + "step": 12398 + }, + { + "epoch": 0.5444745710616157, + "grad_norm": 1.765625, + "learning_rate": 2.162556105138322e-05, + "loss": 0.3479, + "step": 12400 + }, + { + "epoch": 0.5445623895408191, + "grad_norm": 1.8203125, + "learning_rate": 2.1618706208517804e-05, + "loss": 0.3632, + "step": 12402 + }, + { + "epoch": 0.5446502080200226, + "grad_norm": 1.78125, + "learning_rate": 2.1611851624592678e-05, + "loss": 0.3729, + "step": 12404 + }, + { + "epoch": 0.5447380264992261, + "grad_norm": 1.6953125, + "learning_rate": 2.160499730013276e-05, + "loss": 0.3483, + "step": 12406 + }, + { + "epoch": 0.5448258449784296, + "grad_norm": 1.6875, + "learning_rate": 2.1598143235662977e-05, + "loss": 0.3499, + "step": 12408 + }, + { + "epoch": 0.544913663457633, + "grad_norm": 1.7734375, + "learning_rate": 2.1591289431708188e-05, + "loss": 0.3555, + "step": 12410 + }, + { + "epoch": 0.5450014819368365, + "grad_norm": 1.765625, + "learning_rate": 2.1584435888793276e-05, + "loss": 0.3504, + "step": 12412 + }, + { + "epoch": 0.5450893004160401, + "grad_norm": 1.9765625, + "learning_rate": 2.157758260744308e-05, + "loss": 0.34, + "step": 12414 + }, + { + "epoch": 0.5451771188952436, + "grad_norm": 1.84375, + "learning_rate": 2.157072958818243e-05, + "loss": 0.3406, + "step": 12416 + }, + { + "epoch": 0.545264937374447, + "grad_norm": 1.84375, + "learning_rate": 2.1563876831536127e-05, + "loss": 0.3288, + "step": 12418 + }, + { + "epoch": 0.5453527558536505, + "grad_norm": 1.8984375, + "learning_rate": 2.1557024338028968e-05, + "loss": 0.3326, + "step": 12420 + }, + { + "epoch": 0.545440574332854, + "grad_norm": 1.78125, + "learning_rate": 2.1550172108185703e-05, + "loss": 0.3643, + "step": 12422 + }, + { + "epoch": 0.5455283928120574, + "grad_norm": 1.7734375, + "learning_rate": 2.1543320142531087e-05, + "loss": 0.368, + "step": 12424 + }, + { + "epoch": 0.5456162112912609, + "grad_norm": 1.6640625, + "learning_rate": 2.1536468441589855e-05, + "loss": 0.3537, + "step": 12426 + }, + { + "epoch": 0.5457040297704645, + "grad_norm": 1.8359375, + "learning_rate": 2.1529617005886687e-05, + "loss": 0.3417, + "step": 12428 + }, + { + "epoch": 0.545791848249668, + "grad_norm": 1.921875, + "learning_rate": 2.1522765835946283e-05, + "loss": 0.3658, + "step": 12430 + }, + { + "epoch": 0.5458796667288714, + "grad_norm": 1.765625, + "learning_rate": 2.1515914932293302e-05, + "loss": 0.3662, + "step": 12432 + }, + { + "epoch": 0.5459674852080749, + "grad_norm": 1.8125, + "learning_rate": 2.1509064295452395e-05, + "loss": 0.3521, + "step": 12434 + }, + { + "epoch": 0.5460553036872784, + "grad_norm": 1.8125, + "learning_rate": 2.150221392594818e-05, + "loss": 0.3552, + "step": 12436 + }, + { + "epoch": 0.5461431221664819, + "grad_norm": 1.8984375, + "learning_rate": 2.149536382430526e-05, + "loss": 0.333, + "step": 12438 + }, + { + "epoch": 0.5462309406456853, + "grad_norm": 1.875, + "learning_rate": 2.1488513991048215e-05, + "loss": 0.3332, + "step": 12440 + }, + { + "epoch": 0.5463187591248888, + "grad_norm": 1.7421875, + "learning_rate": 2.1481664426701615e-05, + "loss": 0.3674, + "step": 12442 + }, + { + "epoch": 0.5464065776040924, + "grad_norm": 1.6875, + "learning_rate": 2.1474815131789993e-05, + "loss": 0.361, + "step": 12444 + }, + { + "epoch": 0.5464943960832959, + "grad_norm": 1.6953125, + "learning_rate": 2.1467966106837877e-05, + "loss": 0.3478, + "step": 12446 + }, + { + "epoch": 0.5465822145624993, + "grad_norm": 1.96875, + "learning_rate": 2.1461117352369765e-05, + "loss": 0.3433, + "step": 12448 + }, + { + "epoch": 0.5466700330417028, + "grad_norm": 1.75, + "learning_rate": 2.1454268868910126e-05, + "loss": 0.3391, + "step": 12450 + }, + { + "epoch": 0.5467578515209063, + "grad_norm": 1.9765625, + "learning_rate": 2.1447420656983432e-05, + "loss": 0.3514, + "step": 12452 + }, + { + "epoch": 0.5468456700001098, + "grad_norm": 1.671875, + "learning_rate": 2.1440572717114115e-05, + "loss": 0.35, + "step": 12454 + }, + { + "epoch": 0.5469334884793132, + "grad_norm": 1.75, + "learning_rate": 2.1433725049826597e-05, + "loss": 0.3498, + "step": 12456 + }, + { + "epoch": 0.5470213069585167, + "grad_norm": 1.625, + "learning_rate": 2.1426877655645274e-05, + "loss": 0.3254, + "step": 12458 + }, + { + "epoch": 0.5471091254377203, + "grad_norm": 1.796875, + "learning_rate": 2.142003053509451e-05, + "loss": 0.3818, + "step": 12460 + }, + { + "epoch": 0.5471969439169238, + "grad_norm": 1.8125, + "learning_rate": 2.1413183688698664e-05, + "loss": 0.3622, + "step": 12462 + }, + { + "epoch": 0.5472847623961272, + "grad_norm": 1.953125, + "learning_rate": 2.1406337116982074e-05, + "loss": 0.3316, + "step": 12464 + }, + { + "epoch": 0.5473725808753307, + "grad_norm": 1.671875, + "learning_rate": 2.1399490820469042e-05, + "loss": 0.3456, + "step": 12466 + }, + { + "epoch": 0.5474603993545342, + "grad_norm": 1.765625, + "learning_rate": 2.139264479968387e-05, + "loss": 0.3177, + "step": 12468 + }, + { + "epoch": 0.5475482178337376, + "grad_norm": 1.6875, + "learning_rate": 2.138579905515083e-05, + "loss": 0.3627, + "step": 12470 + }, + { + "epoch": 0.5476360363129411, + "grad_norm": 1.640625, + "learning_rate": 2.137895358739416e-05, + "loss": 0.3348, + "step": 12472 + }, + { + "epoch": 0.5477238547921446, + "grad_norm": 1.6875, + "learning_rate": 2.137210839693809e-05, + "loss": 0.3557, + "step": 12474 + }, + { + "epoch": 0.5478116732713482, + "grad_norm": 1.8984375, + "learning_rate": 2.1365263484306828e-05, + "loss": 0.3701, + "step": 12476 + }, + { + "epoch": 0.5478994917505516, + "grad_norm": 1.859375, + "learning_rate": 2.135841885002456e-05, + "loss": 0.3403, + "step": 12478 + }, + { + "epoch": 0.5479873102297551, + "grad_norm": 1.7265625, + "learning_rate": 2.135157449461545e-05, + "loss": 0.3498, + "step": 12480 + }, + { + "epoch": 0.5480751287089586, + "grad_norm": 1.6015625, + "learning_rate": 2.1344730418603646e-05, + "loss": 0.3349, + "step": 12482 + }, + { + "epoch": 0.548162947188162, + "grad_norm": 1.8203125, + "learning_rate": 2.1337886622513257e-05, + "loss": 0.3614, + "step": 12484 + }, + { + "epoch": 0.5482507656673655, + "grad_norm": 1.921875, + "learning_rate": 2.1331043106868386e-05, + "loss": 0.3407, + "step": 12486 + }, + { + "epoch": 0.548338584146569, + "grad_norm": 1.703125, + "learning_rate": 2.1324199872193128e-05, + "loss": 0.3853, + "step": 12488 + }, + { + "epoch": 0.5484264026257726, + "grad_norm": 1.6328125, + "learning_rate": 2.1317356919011513e-05, + "loss": 0.3742, + "step": 12490 + }, + { + "epoch": 0.548514221104976, + "grad_norm": 1.6171875, + "learning_rate": 2.1310514247847587e-05, + "loss": 0.3465, + "step": 12492 + }, + { + "epoch": 0.5486020395841795, + "grad_norm": 1.8046875, + "learning_rate": 2.1303671859225364e-05, + "loss": 0.3242, + "step": 12494 + }, + { + "epoch": 0.548689858063383, + "grad_norm": 1.90625, + "learning_rate": 2.1296829753668844e-05, + "loss": 0.3473, + "step": 12496 + }, + { + "epoch": 0.5487776765425865, + "grad_norm": 1.7265625, + "learning_rate": 2.1289987931701983e-05, + "loss": 0.3608, + "step": 12498 + }, + { + "epoch": 0.5488654950217899, + "grad_norm": 1.4921875, + "learning_rate": 2.1283146393848733e-05, + "loss": 0.3289, + "step": 12500 + }, + { + "epoch": 0.5489533135009934, + "grad_norm": 1.6796875, + "learning_rate": 2.1276305140633024e-05, + "loss": 0.349, + "step": 12502 + }, + { + "epoch": 0.5490411319801969, + "grad_norm": 1.640625, + "learning_rate": 2.1269464172578764e-05, + "loss": 0.3402, + "step": 12504 + }, + { + "epoch": 0.5491289504594005, + "grad_norm": 1.890625, + "learning_rate": 2.1262623490209834e-05, + "loss": 0.3441, + "step": 12506 + }, + { + "epoch": 0.5492167689386039, + "grad_norm": 1.734375, + "learning_rate": 2.1255783094050087e-05, + "loss": 0.3782, + "step": 12508 + }, + { + "epoch": 0.5493045874178074, + "grad_norm": 1.6796875, + "learning_rate": 2.1248942984623367e-05, + "loss": 0.3275, + "step": 12510 + }, + { + "epoch": 0.5493924058970109, + "grad_norm": 1.8671875, + "learning_rate": 2.1242103162453495e-05, + "loss": 0.3401, + "step": 12512 + }, + { + "epoch": 0.5494802243762144, + "grad_norm": 1.8359375, + "learning_rate": 2.123526362806426e-05, + "loss": 0.3422, + "step": 12514 + }, + { + "epoch": 0.5495680428554178, + "grad_norm": 1.703125, + "learning_rate": 2.1228424381979446e-05, + "loss": 0.3726, + "step": 12516 + }, + { + "epoch": 0.5496558613346213, + "grad_norm": 1.5625, + "learning_rate": 2.12215854247228e-05, + "loss": 0.3331, + "step": 12518 + }, + { + "epoch": 0.5497436798138248, + "grad_norm": 1.78125, + "learning_rate": 2.1214746756818044e-05, + "loss": 0.3523, + "step": 12520 + }, + { + "epoch": 0.5498314982930284, + "grad_norm": 1.9765625, + "learning_rate": 2.1207908378788888e-05, + "loss": 0.3616, + "step": 12522 + }, + { + "epoch": 0.5499193167722318, + "grad_norm": 1.7265625, + "learning_rate": 2.1201070291159014e-05, + "loss": 0.3123, + "step": 12524 + }, + { + "epoch": 0.5500071352514353, + "grad_norm": 1.8125, + "learning_rate": 2.119423249445209e-05, + "loss": 0.3212, + "step": 12526 + }, + { + "epoch": 0.5500949537306388, + "grad_norm": 1.84375, + "learning_rate": 2.1187394989191754e-05, + "loss": 0.3603, + "step": 12528 + }, + { + "epoch": 0.5501827722098422, + "grad_norm": 1.78125, + "learning_rate": 2.118055777590162e-05, + "loss": 0.3227, + "step": 12530 + }, + { + "epoch": 0.5502705906890457, + "grad_norm": 1.875, + "learning_rate": 2.1173720855105295e-05, + "loss": 0.3258, + "step": 12532 + }, + { + "epoch": 0.5503584091682492, + "grad_norm": 1.875, + "learning_rate": 2.1166884227326338e-05, + "loss": 0.3497, + "step": 12534 + }, + { + "epoch": 0.5504462276474528, + "grad_norm": 1.8203125, + "learning_rate": 2.116004789308831e-05, + "loss": 0.3479, + "step": 12536 + }, + { + "epoch": 0.5505340461266562, + "grad_norm": 1.84375, + "learning_rate": 2.115321185291473e-05, + "loss": 0.3353, + "step": 12538 + }, + { + "epoch": 0.5506218646058597, + "grad_norm": 1.6953125, + "learning_rate": 2.114637610732911e-05, + "loss": 0.3391, + "step": 12540 + }, + { + "epoch": 0.5507096830850632, + "grad_norm": 1.9296875, + "learning_rate": 2.1139540656854934e-05, + "loss": 0.3559, + "step": 12542 + }, + { + "epoch": 0.5507975015642667, + "grad_norm": 1.828125, + "learning_rate": 2.1132705502015657e-05, + "loss": 0.359, + "step": 12544 + }, + { + "epoch": 0.5508853200434701, + "grad_norm": 1.78125, + "learning_rate": 2.112587064333472e-05, + "loss": 0.3664, + "step": 12546 + }, + { + "epoch": 0.5509731385226736, + "grad_norm": 1.7109375, + "learning_rate": 2.1119036081335536e-05, + "loss": 0.3311, + "step": 12548 + }, + { + "epoch": 0.5510609570018771, + "grad_norm": 1.7109375, + "learning_rate": 2.111220181654151e-05, + "loss": 0.3288, + "step": 12550 + }, + { + "epoch": 0.5511487754810807, + "grad_norm": 1.703125, + "learning_rate": 2.1105367849475993e-05, + "loss": 0.331, + "step": 12552 + }, + { + "epoch": 0.5512365939602841, + "grad_norm": 1.6796875, + "learning_rate": 2.1098534180662337e-05, + "loss": 0.3219, + "step": 12554 + }, + { + "epoch": 0.5513244124394876, + "grad_norm": 1.8359375, + "learning_rate": 2.109170081062387e-05, + "loss": 0.3198, + "step": 12556 + }, + { + "epoch": 0.5514122309186911, + "grad_norm": 1.7421875, + "learning_rate": 2.1084867739883894e-05, + "loss": 0.339, + "step": 12558 + }, + { + "epoch": 0.5515000493978945, + "grad_norm": 1.6875, + "learning_rate": 2.1078034968965685e-05, + "loss": 0.3557, + "step": 12560 + }, + { + "epoch": 0.551587867877098, + "grad_norm": 1.84375, + "learning_rate": 2.1071202498392493e-05, + "loss": 0.3697, + "step": 12562 + }, + { + "epoch": 0.5516756863563015, + "grad_norm": 1.640625, + "learning_rate": 2.106437032868756e-05, + "loss": 0.3533, + "step": 12564 + }, + { + "epoch": 0.551763504835505, + "grad_norm": 1.84375, + "learning_rate": 2.105753846037409e-05, + "loss": 0.3537, + "step": 12566 + }, + { + "epoch": 0.5518513233147085, + "grad_norm": 1.7578125, + "learning_rate": 2.1050706893975274e-05, + "loss": 0.3369, + "step": 12568 + }, + { + "epoch": 0.551939141793912, + "grad_norm": 1.640625, + "learning_rate": 2.1043875630014265e-05, + "loss": 0.3325, + "step": 12570 + }, + { + "epoch": 0.5520269602731155, + "grad_norm": 1.609375, + "learning_rate": 2.103704466901421e-05, + "loss": 0.3235, + "step": 12572 + }, + { + "epoch": 0.552114778752319, + "grad_norm": 1.8515625, + "learning_rate": 2.1030214011498224e-05, + "loss": 0.3459, + "step": 12574 + }, + { + "epoch": 0.5522025972315224, + "grad_norm": 1.8671875, + "learning_rate": 2.10233836579894e-05, + "loss": 0.3483, + "step": 12576 + }, + { + "epoch": 0.5522904157107259, + "grad_norm": 1.6875, + "learning_rate": 2.1016553609010807e-05, + "loss": 0.326, + "step": 12578 + }, + { + "epoch": 0.5523782341899294, + "grad_norm": 1.6875, + "learning_rate": 2.1009723865085506e-05, + "loss": 0.3566, + "step": 12580 + }, + { + "epoch": 0.552466052669133, + "grad_norm": 1.7421875, + "learning_rate": 2.10028944267365e-05, + "loss": 0.3525, + "step": 12582 + }, + { + "epoch": 0.5525538711483364, + "grad_norm": 1.703125, + "learning_rate": 2.0996065294486798e-05, + "loss": 0.3291, + "step": 12584 + }, + { + "epoch": 0.5526416896275399, + "grad_norm": 1.53125, + "learning_rate": 2.098923646885937e-05, + "loss": 0.3655, + "step": 12586 + }, + { + "epoch": 0.5527295081067434, + "grad_norm": 1.5625, + "learning_rate": 2.0982407950377177e-05, + "loss": 0.3266, + "step": 12588 + }, + { + "epoch": 0.5528173265859468, + "grad_norm": 1.71875, + "learning_rate": 2.0975579739563144e-05, + "loss": 0.348, + "step": 12590 + }, + { + "epoch": 0.5529051450651503, + "grad_norm": 1.8046875, + "learning_rate": 2.096875183694018e-05, + "loss": 0.3656, + "step": 12592 + }, + { + "epoch": 0.5529929635443538, + "grad_norm": 1.7578125, + "learning_rate": 2.0961924243031177e-05, + "loss": 0.3601, + "step": 12594 + }, + { + "epoch": 0.5530807820235573, + "grad_norm": 1.6328125, + "learning_rate": 2.0955096958358973e-05, + "loss": 0.365, + "step": 12596 + }, + { + "epoch": 0.5531686005027608, + "grad_norm": 1.734375, + "learning_rate": 2.0948269983446413e-05, + "loss": 0.3289, + "step": 12598 + }, + { + "epoch": 0.5532564189819643, + "grad_norm": 1.703125, + "learning_rate": 2.0941443318816313e-05, + "loss": 0.3671, + "step": 12600 + }, + { + "epoch": 0.5533442374611678, + "grad_norm": 1.90625, + "learning_rate": 2.093461696499145e-05, + "loss": 0.34, + "step": 12602 + }, + { + "epoch": 0.5534320559403713, + "grad_norm": 1.9765625, + "learning_rate": 2.0927790922494607e-05, + "loss": 0.3728, + "step": 12604 + }, + { + "epoch": 0.5535198744195747, + "grad_norm": 1.84375, + "learning_rate": 2.0920965191848503e-05, + "loss": 0.3345, + "step": 12606 + }, + { + "epoch": 0.5536076928987782, + "grad_norm": 1.9765625, + "learning_rate": 2.091413977357586e-05, + "loss": 0.3539, + "step": 12608 + }, + { + "epoch": 0.5536955113779817, + "grad_norm": 1.828125, + "learning_rate": 2.0907314668199386e-05, + "loss": 0.3434, + "step": 12610 + }, + { + "epoch": 0.5537833298571851, + "grad_norm": 1.7890625, + "learning_rate": 2.0900489876241724e-05, + "loss": 0.3751, + "step": 12612 + }, + { + "epoch": 0.5538711483363887, + "grad_norm": 1.96875, + "learning_rate": 2.089366539822553e-05, + "loss": 0.3286, + "step": 12614 + }, + { + "epoch": 0.5539589668155922, + "grad_norm": 1.6484375, + "learning_rate": 2.0886841234673426e-05, + "loss": 0.3511, + "step": 12616 + }, + { + "epoch": 0.5540467852947957, + "grad_norm": 1.96875, + "learning_rate": 2.0880017386108005e-05, + "loss": 0.3467, + "step": 12618 + }, + { + "epoch": 0.5541346037739991, + "grad_norm": 1.765625, + "learning_rate": 2.0873193853051844e-05, + "loss": 0.3602, + "step": 12620 + }, + { + "epoch": 0.5542224222532026, + "grad_norm": 1.8046875, + "learning_rate": 2.0866370636027484e-05, + "loss": 0.3364, + "step": 12622 + }, + { + "epoch": 0.5543102407324061, + "grad_norm": 1.8671875, + "learning_rate": 2.0859547735557453e-05, + "loss": 0.3398, + "step": 12624 + }, + { + "epoch": 0.5543980592116096, + "grad_norm": 1.6875, + "learning_rate": 2.0852725152164247e-05, + "loss": 0.3592, + "step": 12626 + }, + { + "epoch": 0.5544858776908131, + "grad_norm": 1.84375, + "learning_rate": 2.0845902886370342e-05, + "loss": 0.364, + "step": 12628 + }, + { + "epoch": 0.5545736961700166, + "grad_norm": 1.6953125, + "learning_rate": 2.0839080938698193e-05, + "loss": 0.3523, + "step": 12630 + }, + { + "epoch": 0.5546615146492201, + "grad_norm": 1.734375, + "learning_rate": 2.0832259309670223e-05, + "loss": 0.3408, + "step": 12632 + }, + { + "epoch": 0.5547493331284236, + "grad_norm": 2.015625, + "learning_rate": 2.082543799980883e-05, + "loss": 0.3592, + "step": 12634 + }, + { + "epoch": 0.554837151607627, + "grad_norm": 1.828125, + "learning_rate": 2.08186170096364e-05, + "loss": 0.3799, + "step": 12636 + }, + { + "epoch": 0.5549249700868305, + "grad_norm": 1.765625, + "learning_rate": 2.0811796339675277e-05, + "loss": 0.3433, + "step": 12638 + }, + { + "epoch": 0.555012788566034, + "grad_norm": 1.765625, + "learning_rate": 2.0804975990447802e-05, + "loss": 0.3505, + "step": 12640 + }, + { + "epoch": 0.5551006070452374, + "grad_norm": 1.984375, + "learning_rate": 2.079815596247627e-05, + "loss": 0.3466, + "step": 12642 + }, + { + "epoch": 0.555188425524441, + "grad_norm": 1.7421875, + "learning_rate": 2.0791336256282958e-05, + "loss": 0.3317, + "step": 12644 + }, + { + "epoch": 0.5552762440036445, + "grad_norm": 1.703125, + "learning_rate": 2.078451687239013e-05, + "loss": 0.3306, + "step": 12646 + }, + { + "epoch": 0.555364062482848, + "grad_norm": 1.921875, + "learning_rate": 2.0777697811320003e-05, + "loss": 0.3522, + "step": 12648 + }, + { + "epoch": 0.5554518809620514, + "grad_norm": 1.671875, + "learning_rate": 2.0770879073594794e-05, + "loss": 0.3407, + "step": 12650 + }, + { + "epoch": 0.5555396994412549, + "grad_norm": 1.765625, + "learning_rate": 2.0764060659736674e-05, + "loss": 0.3543, + "step": 12652 + }, + { + "epoch": 0.5556275179204584, + "grad_norm": 1.6875, + "learning_rate": 2.0757242570267808e-05, + "loss": 0.3488, + "step": 12654 + }, + { + "epoch": 0.5557153363996619, + "grad_norm": 1.640625, + "learning_rate": 2.0750424805710326e-05, + "loss": 0.344, + "step": 12656 + }, + { + "epoch": 0.5558031548788653, + "grad_norm": 1.734375, + "learning_rate": 2.0743607366586327e-05, + "loss": 0.3543, + "step": 12658 + }, + { + "epoch": 0.5558909733580689, + "grad_norm": 1.640625, + "learning_rate": 2.0736790253417893e-05, + "loss": 0.3518, + "step": 12660 + }, + { + "epoch": 0.5559787918372724, + "grad_norm": 1.734375, + "learning_rate": 2.0729973466727086e-05, + "loss": 0.3597, + "step": 12662 + }, + { + "epoch": 0.5560666103164759, + "grad_norm": 1.6484375, + "learning_rate": 2.0723157007035938e-05, + "loss": 0.3324, + "step": 12664 + }, + { + "epoch": 0.5561544287956793, + "grad_norm": 1.6484375, + "learning_rate": 2.0716340874866453e-05, + "loss": 0.3592, + "step": 12666 + }, + { + "epoch": 0.5562422472748828, + "grad_norm": 1.90625, + "learning_rate": 2.0709525070740604e-05, + "loss": 0.3427, + "step": 12668 + }, + { + "epoch": 0.5563300657540863, + "grad_norm": 1.609375, + "learning_rate": 2.0702709595180357e-05, + "loss": 0.3546, + "step": 12670 + }, + { + "epoch": 0.5564178842332898, + "grad_norm": 1.6640625, + "learning_rate": 2.0695894448707647e-05, + "loss": 0.3577, + "step": 12672 + }, + { + "epoch": 0.5565057027124932, + "grad_norm": 1.6953125, + "learning_rate": 2.0689079631844363e-05, + "loss": 0.3441, + "step": 12674 + }, + { + "epoch": 0.5565935211916968, + "grad_norm": 1.8203125, + "learning_rate": 2.0682265145112395e-05, + "loss": 0.3469, + "step": 12676 + }, + { + "epoch": 0.5566813396709003, + "grad_norm": 1.7421875, + "learning_rate": 2.0675450989033595e-05, + "loss": 0.368, + "step": 12678 + }, + { + "epoch": 0.5567691581501037, + "grad_norm": 1.671875, + "learning_rate": 2.0668637164129796e-05, + "loss": 0.333, + "step": 12680 + }, + { + "epoch": 0.5568569766293072, + "grad_norm": 1.6953125, + "learning_rate": 2.0661823670922806e-05, + "loss": 0.3168, + "step": 12682 + }, + { + "epoch": 0.5569447951085107, + "grad_norm": 1.6875, + "learning_rate": 2.0655010509934395e-05, + "loss": 0.3233, + "step": 12684 + }, + { + "epoch": 0.5570326135877142, + "grad_norm": 2.046875, + "learning_rate": 2.064819768168632e-05, + "loss": 0.3385, + "step": 12686 + }, + { + "epoch": 0.5571204320669176, + "grad_norm": 1.71875, + "learning_rate": 2.064138518670031e-05, + "loss": 0.3491, + "step": 12688 + }, + { + "epoch": 0.5572082505461212, + "grad_norm": 1.6875, + "learning_rate": 2.0634573025498067e-05, + "loss": 0.329, + "step": 12690 + }, + { + "epoch": 0.5572960690253247, + "grad_norm": 1.8203125, + "learning_rate": 2.0627761198601273e-05, + "loss": 0.3355, + "step": 12692 + }, + { + "epoch": 0.5573838875045282, + "grad_norm": 1.6328125, + "learning_rate": 2.062094970653157e-05, + "loss": 0.349, + "step": 12694 + }, + { + "epoch": 0.5574717059837316, + "grad_norm": 1.75, + "learning_rate": 2.0614138549810587e-05, + "loss": 0.3676, + "step": 12696 + }, + { + "epoch": 0.5575595244629351, + "grad_norm": 1.71875, + "learning_rate": 2.0607327728959928e-05, + "loss": 0.3912, + "step": 12698 + }, + { + "epoch": 0.5576473429421386, + "grad_norm": 1.9375, + "learning_rate": 2.060051724450116e-05, + "loss": 0.3439, + "step": 12700 + }, + { + "epoch": 0.557735161421342, + "grad_norm": 1.7578125, + "learning_rate": 2.059370709695585e-05, + "loss": 0.3781, + "step": 12702 + }, + { + "epoch": 0.5578229799005455, + "grad_norm": 1.6953125, + "learning_rate": 2.0586897286845498e-05, + "loss": 0.3375, + "step": 12704 + }, + { + "epoch": 0.5579107983797491, + "grad_norm": 1.6796875, + "learning_rate": 2.0580087814691607e-05, + "loss": 0.3421, + "step": 12706 + }, + { + "epoch": 0.5579986168589526, + "grad_norm": 1.875, + "learning_rate": 2.0573278681015657e-05, + "loss": 0.3377, + "step": 12708 + }, + { + "epoch": 0.558086435338156, + "grad_norm": 1.671875, + "learning_rate": 2.056646988633908e-05, + "loss": 0.3251, + "step": 12710 + }, + { + "epoch": 0.5581742538173595, + "grad_norm": 1.640625, + "learning_rate": 2.0559661431183303e-05, + "loss": 0.3299, + "step": 12712 + }, + { + "epoch": 0.558262072296563, + "grad_norm": 1.65625, + "learning_rate": 2.0552853316069717e-05, + "loss": 0.3172, + "step": 12714 + }, + { + "epoch": 0.5583498907757665, + "grad_norm": 1.7109375, + "learning_rate": 2.054604554151969e-05, + "loss": 0.3287, + "step": 12716 + }, + { + "epoch": 0.5584377092549699, + "grad_norm": 1.7890625, + "learning_rate": 2.053923810805457e-05, + "loss": 0.3309, + "step": 12718 + }, + { + "epoch": 0.5585255277341734, + "grad_norm": 1.71875, + "learning_rate": 2.0532431016195658e-05, + "loss": 0.3412, + "step": 12720 + }, + { + "epoch": 0.558613346213377, + "grad_norm": 1.859375, + "learning_rate": 2.052562426646425e-05, + "loss": 0.3359, + "step": 12722 + }, + { + "epoch": 0.5587011646925805, + "grad_norm": 1.734375, + "learning_rate": 2.0518817859381607e-05, + "loss": 0.351, + "step": 12724 + }, + { + "epoch": 0.5587889831717839, + "grad_norm": 2.28125, + "learning_rate": 2.0512011795468972e-05, + "loss": 0.3586, + "step": 12726 + }, + { + "epoch": 0.5588768016509874, + "grad_norm": 1.6640625, + "learning_rate": 2.0505206075247552e-05, + "loss": 0.3383, + "step": 12728 + }, + { + "epoch": 0.5589646201301909, + "grad_norm": 1.625, + "learning_rate": 2.0498400699238522e-05, + "loss": 0.3388, + "step": 12730 + }, + { + "epoch": 0.5590524386093944, + "grad_norm": 1.7109375, + "learning_rate": 2.0491595667963042e-05, + "loss": 0.3217, + "step": 12732 + }, + { + "epoch": 0.5591402570885978, + "grad_norm": 1.9453125, + "learning_rate": 2.0484790981942263e-05, + "loss": 0.3533, + "step": 12734 + }, + { + "epoch": 0.5592280755678014, + "grad_norm": 1.59375, + "learning_rate": 2.047798664169726e-05, + "loss": 0.3279, + "step": 12736 + }, + { + "epoch": 0.5593158940470049, + "grad_norm": 1.8671875, + "learning_rate": 2.0471182647749123e-05, + "loss": 0.3693, + "step": 12738 + }, + { + "epoch": 0.5594037125262084, + "grad_norm": 1.6796875, + "learning_rate": 2.0464379000618906e-05, + "loss": 0.3364, + "step": 12740 + }, + { + "epoch": 0.5594915310054118, + "grad_norm": 1.6875, + "learning_rate": 2.0457575700827634e-05, + "loss": 0.3385, + "step": 12742 + }, + { + "epoch": 0.5595793494846153, + "grad_norm": 1.625, + "learning_rate": 2.0450772748896307e-05, + "loss": 0.3681, + "step": 12744 + }, + { + "epoch": 0.5596671679638188, + "grad_norm": 1.6015625, + "learning_rate": 2.0443970145345887e-05, + "loss": 0.3346, + "step": 12746 + }, + { + "epoch": 0.5597549864430222, + "grad_norm": 1.8359375, + "learning_rate": 2.0437167890697324e-05, + "loss": 0.328, + "step": 12748 + }, + { + "epoch": 0.5598428049222257, + "grad_norm": 1.8046875, + "learning_rate": 2.0430365985471542e-05, + "loss": 0.3155, + "step": 12750 + }, + { + "epoch": 0.5599306234014293, + "grad_norm": 1.6484375, + "learning_rate": 2.0423564430189427e-05, + "loss": 0.3309, + "step": 12752 + }, + { + "epoch": 0.5600184418806328, + "grad_norm": 1.8671875, + "learning_rate": 2.0416763225371848e-05, + "loss": 0.3572, + "step": 12754 + }, + { + "epoch": 0.5601062603598362, + "grad_norm": 1.8203125, + "learning_rate": 2.0409962371539637e-05, + "loss": 0.341, + "step": 12756 + }, + { + "epoch": 0.5601940788390397, + "grad_norm": 1.78125, + "learning_rate": 2.0403161869213608e-05, + "loss": 0.3409, + "step": 12758 + }, + { + "epoch": 0.5602818973182432, + "grad_norm": 1.78125, + "learning_rate": 2.0396361718914545e-05, + "loss": 0.3501, + "step": 12760 + }, + { + "epoch": 0.5603697157974467, + "grad_norm": 1.8671875, + "learning_rate": 2.0389561921163203e-05, + "loss": 0.3461, + "step": 12762 + }, + { + "epoch": 0.5604575342766501, + "grad_norm": 1.8671875, + "learning_rate": 2.0382762476480326e-05, + "loss": 0.3411, + "step": 12764 + }, + { + "epoch": 0.5605453527558536, + "grad_norm": 1.8984375, + "learning_rate": 2.03759633853866e-05, + "loss": 0.3693, + "step": 12766 + }, + { + "epoch": 0.5606331712350572, + "grad_norm": 1.90625, + "learning_rate": 2.0369164648402704e-05, + "loss": 0.3263, + "step": 12768 + }, + { + "epoch": 0.5607209897142607, + "grad_norm": 1.6640625, + "learning_rate": 2.0362366266049288e-05, + "loss": 0.3549, + "step": 12770 + }, + { + "epoch": 0.5608088081934641, + "grad_norm": 1.6953125, + "learning_rate": 2.0355568238846972e-05, + "loss": 0.3589, + "step": 12772 + }, + { + "epoch": 0.5608966266726676, + "grad_norm": 1.6015625, + "learning_rate": 2.0348770567316354e-05, + "loss": 0.336, + "step": 12774 + }, + { + "epoch": 0.5609844451518711, + "grad_norm": 1.7734375, + "learning_rate": 2.0341973251978003e-05, + "loss": 0.3383, + "step": 12776 + }, + { + "epoch": 0.5610722636310745, + "grad_norm": 1.6171875, + "learning_rate": 2.0335176293352456e-05, + "loss": 0.331, + "step": 12778 + }, + { + "epoch": 0.561160082110278, + "grad_norm": 1.875, + "learning_rate": 2.0328379691960235e-05, + "loss": 0.3607, + "step": 12780 + }, + { + "epoch": 0.5612479005894816, + "grad_norm": 1.7109375, + "learning_rate": 2.032158344832181e-05, + "loss": 0.3026, + "step": 12782 + }, + { + "epoch": 0.5613357190686851, + "grad_norm": 1.90625, + "learning_rate": 2.0314787562957645e-05, + "loss": 0.3605, + "step": 12784 + }, + { + "epoch": 0.5614235375478885, + "grad_norm": 2.015625, + "learning_rate": 2.0307992036388167e-05, + "loss": 0.3378, + "step": 12786 + }, + { + "epoch": 0.561511356027092, + "grad_norm": 1.796875, + "learning_rate": 2.030119686913379e-05, + "loss": 0.3292, + "step": 12788 + }, + { + "epoch": 0.5615991745062955, + "grad_norm": 1.734375, + "learning_rate": 2.0294402061714887e-05, + "loss": 0.3457, + "step": 12790 + }, + { + "epoch": 0.561686992985499, + "grad_norm": 1.6640625, + "learning_rate": 2.0287607614651797e-05, + "loss": 0.3479, + "step": 12792 + }, + { + "epoch": 0.5617748114647024, + "grad_norm": 1.65625, + "learning_rate": 2.0280813528464855e-05, + "loss": 0.3728, + "step": 12794 + }, + { + "epoch": 0.5618626299439059, + "grad_norm": 1.84375, + "learning_rate": 2.0274019803674335e-05, + "loss": 0.3711, + "step": 12796 + }, + { + "epoch": 0.5619504484231095, + "grad_norm": 1.8671875, + "learning_rate": 2.026722644080051e-05, + "loss": 0.3665, + "step": 12798 + }, + { + "epoch": 0.562038266902313, + "grad_norm": 1.6953125, + "learning_rate": 2.0260433440363617e-05, + "loss": 0.3475, + "step": 12800 + }, + { + "epoch": 0.5621260853815164, + "grad_norm": 1.6328125, + "learning_rate": 2.025364080288387e-05, + "loss": 0.3542, + "step": 12802 + }, + { + "epoch": 0.5622139038607199, + "grad_norm": 1.9609375, + "learning_rate": 2.0246848528881455e-05, + "loss": 0.3597, + "step": 12804 + }, + { + "epoch": 0.5623017223399234, + "grad_norm": 1.9453125, + "learning_rate": 2.0240056618876514e-05, + "loss": 0.3296, + "step": 12806 + }, + { + "epoch": 0.5623895408191268, + "grad_norm": 1.671875, + "learning_rate": 2.0233265073389173e-05, + "loss": 0.3503, + "step": 12808 + }, + { + "epoch": 0.5624773592983303, + "grad_norm": 1.609375, + "learning_rate": 2.022647389293954e-05, + "loss": 0.3532, + "step": 12810 + }, + { + "epoch": 0.5625651777775338, + "grad_norm": 1.640625, + "learning_rate": 2.0219683078047678e-05, + "loss": 0.3695, + "step": 12812 + }, + { + "epoch": 0.5626529962567374, + "grad_norm": 1.6875, + "learning_rate": 2.0212892629233633e-05, + "loss": 0.3369, + "step": 12814 + }, + { + "epoch": 0.5627408147359408, + "grad_norm": 1.734375, + "learning_rate": 2.0206102547017426e-05, + "loss": 0.3601, + "step": 12816 + }, + { + "epoch": 0.5628286332151443, + "grad_norm": 1.671875, + "learning_rate": 2.0199312831919024e-05, + "loss": 0.3639, + "step": 12818 + }, + { + "epoch": 0.5629164516943478, + "grad_norm": 1.796875, + "learning_rate": 2.0192523484458397e-05, + "loss": 0.3868, + "step": 12820 + }, + { + "epoch": 0.5630042701735513, + "grad_norm": 1.7578125, + "learning_rate": 2.018573450515548e-05, + "loss": 0.3354, + "step": 12822 + }, + { + "epoch": 0.5630920886527547, + "grad_norm": 1.734375, + "learning_rate": 2.0178945894530165e-05, + "loss": 0.3583, + "step": 12824 + }, + { + "epoch": 0.5631799071319582, + "grad_norm": 1.71875, + "learning_rate": 2.0172157653102337e-05, + "loss": 0.3593, + "step": 12826 + }, + { + "epoch": 0.5632677256111617, + "grad_norm": 1.703125, + "learning_rate": 2.0165369781391824e-05, + "loss": 0.3227, + "step": 12828 + }, + { + "epoch": 0.5633555440903653, + "grad_norm": 1.9453125, + "learning_rate": 2.0158582279918458e-05, + "loss": 0.3631, + "step": 12830 + }, + { + "epoch": 0.5634433625695687, + "grad_norm": 1.78125, + "learning_rate": 2.015179514920202e-05, + "loss": 0.3561, + "step": 12832 + }, + { + "epoch": 0.5635311810487722, + "grad_norm": 1.7109375, + "learning_rate": 2.0145008389762265e-05, + "loss": 0.3265, + "step": 12834 + }, + { + "epoch": 0.5636189995279757, + "grad_norm": 1.6953125, + "learning_rate": 2.0138222002118934e-05, + "loss": 0.3476, + "step": 12836 + }, + { + "epoch": 0.5637068180071791, + "grad_norm": 1.859375, + "learning_rate": 2.013143598679173e-05, + "loss": 0.3589, + "step": 12838 + }, + { + "epoch": 0.5637946364863826, + "grad_norm": 1.703125, + "learning_rate": 2.0124650344300323e-05, + "loss": 0.3578, + "step": 12840 + }, + { + "epoch": 0.5638824549655861, + "grad_norm": 1.71875, + "learning_rate": 2.0117865075164366e-05, + "loss": 0.3233, + "step": 12842 + }, + { + "epoch": 0.5639702734447897, + "grad_norm": 1.609375, + "learning_rate": 2.011108017990347e-05, + "loss": 0.3266, + "step": 12844 + }, + { + "epoch": 0.5640580919239931, + "grad_norm": 1.703125, + "learning_rate": 2.010429565903722e-05, + "loss": 0.3269, + "step": 12846 + }, + { + "epoch": 0.5641459104031966, + "grad_norm": 1.921875, + "learning_rate": 2.009751151308519e-05, + "loss": 0.3282, + "step": 12848 + }, + { + "epoch": 0.5642337288824001, + "grad_norm": 1.828125, + "learning_rate": 2.0090727742566894e-05, + "loss": 0.3494, + "step": 12850 + }, + { + "epoch": 0.5643215473616036, + "grad_norm": 1.875, + "learning_rate": 2.0083944348001856e-05, + "loss": 0.3248, + "step": 12852 + }, + { + "epoch": 0.564409365840807, + "grad_norm": 1.9296875, + "learning_rate": 2.0077161329909533e-05, + "loss": 0.3379, + "step": 12854 + }, + { + "epoch": 0.5644971843200105, + "grad_norm": 1.5859375, + "learning_rate": 2.007037868880938e-05, + "loss": 0.3335, + "step": 12856 + }, + { + "epoch": 0.564585002799214, + "grad_norm": 1.734375, + "learning_rate": 2.0063596425220803e-05, + "loss": 0.3479, + "step": 12858 + }, + { + "epoch": 0.5646728212784176, + "grad_norm": 1.8828125, + "learning_rate": 2.0056814539663196e-05, + "loss": 0.3678, + "step": 12860 + }, + { + "epoch": 0.564760639757621, + "grad_norm": 1.53125, + "learning_rate": 2.0050033032655915e-05, + "loss": 0.3086, + "step": 12862 + }, + { + "epoch": 0.5648484582368245, + "grad_norm": 1.65625, + "learning_rate": 2.004325190471829e-05, + "loss": 0.354, + "step": 12864 + }, + { + "epoch": 0.564936276716028, + "grad_norm": 1.6015625, + "learning_rate": 2.0036471156369632e-05, + "loss": 0.3582, + "step": 12866 + }, + { + "epoch": 0.5650240951952314, + "grad_norm": 1.6953125, + "learning_rate": 2.0029690788129194e-05, + "loss": 0.3552, + "step": 12868 + }, + { + "epoch": 0.5651119136744349, + "grad_norm": 1.6484375, + "learning_rate": 2.0022910800516227e-05, + "loss": 0.3399, + "step": 12870 + }, + { + "epoch": 0.5651997321536384, + "grad_norm": 1.6796875, + "learning_rate": 2.001613119404994e-05, + "loss": 0.3803, + "step": 12872 + }, + { + "epoch": 0.5652875506328419, + "grad_norm": 1.609375, + "learning_rate": 2.000935196924953e-05, + "loss": 0.3496, + "step": 12874 + }, + { + "epoch": 0.5653753691120454, + "grad_norm": 1.640625, + "learning_rate": 2.0002573126634136e-05, + "loss": 0.3368, + "step": 12876 + }, + { + "epoch": 0.5654631875912489, + "grad_norm": 1.7265625, + "learning_rate": 1.99957946667229e-05, + "loss": 0.3569, + "step": 12878 + }, + { + "epoch": 0.5655510060704524, + "grad_norm": 1.6328125, + "learning_rate": 1.9989016590034896e-05, + "loss": 0.3643, + "step": 12880 + }, + { + "epoch": 0.5656388245496559, + "grad_norm": 1.7109375, + "learning_rate": 1.9982238897089207e-05, + "loss": 0.3609, + "step": 12882 + }, + { + "epoch": 0.5657266430288593, + "grad_norm": 1.8671875, + "learning_rate": 1.9975461588404866e-05, + "loss": 0.3383, + "step": 12884 + }, + { + "epoch": 0.5658144615080628, + "grad_norm": 1.671875, + "learning_rate": 1.9968684664500888e-05, + "loss": 0.3505, + "step": 12886 + }, + { + "epoch": 0.5659022799872663, + "grad_norm": 1.7578125, + "learning_rate": 1.996190812589624e-05, + "loss": 0.3292, + "step": 12888 + }, + { + "epoch": 0.5659900984664699, + "grad_norm": 1.6171875, + "learning_rate": 1.995513197310987e-05, + "loss": 0.3501, + "step": 12890 + }, + { + "epoch": 0.5660779169456733, + "grad_norm": 1.6484375, + "learning_rate": 1.994835620666071e-05, + "loss": 0.3541, + "step": 12892 + }, + { + "epoch": 0.5661657354248768, + "grad_norm": 1.546875, + "learning_rate": 1.994158082706764e-05, + "loss": 0.3566, + "step": 12894 + }, + { + "epoch": 0.5662535539040803, + "grad_norm": 1.6875, + "learning_rate": 1.9934805834849518e-05, + "loss": 0.3237, + "step": 12896 + }, + { + "epoch": 0.5663413723832837, + "grad_norm": 1.65625, + "learning_rate": 1.992803123052518e-05, + "loss": 0.3658, + "step": 12898 + }, + { + "epoch": 0.5664291908624872, + "grad_norm": 1.8515625, + "learning_rate": 1.992125701461343e-05, + "loss": 0.3631, + "step": 12900 + }, + { + "epoch": 0.5665170093416907, + "grad_norm": 1.59375, + "learning_rate": 1.9914483187633042e-05, + "loss": 0.3414, + "step": 12902 + }, + { + "epoch": 0.5666048278208942, + "grad_norm": 1.6875, + "learning_rate": 1.990770975010274e-05, + "loss": 0.34, + "step": 12904 + }, + { + "epoch": 0.5666926463000977, + "grad_norm": 1.7578125, + "learning_rate": 1.990093670254125e-05, + "loss": 0.3301, + "step": 12906 + }, + { + "epoch": 0.5667804647793012, + "grad_norm": 1.6875, + "learning_rate": 1.9894164045467246e-05, + "loss": 0.3471, + "step": 12908 + }, + { + "epoch": 0.5668682832585047, + "grad_norm": 1.7421875, + "learning_rate": 1.9887391779399383e-05, + "loss": 0.3316, + "step": 12910 + }, + { + "epoch": 0.5669561017377082, + "grad_norm": 1.75, + "learning_rate": 1.9880619904856288e-05, + "loss": 0.3578, + "step": 12912 + }, + { + "epoch": 0.5670439202169116, + "grad_norm": 1.5625, + "learning_rate": 1.987384842235655e-05, + "loss": 0.339, + "step": 12914 + }, + { + "epoch": 0.5671317386961151, + "grad_norm": 1.6796875, + "learning_rate": 1.9867077332418723e-05, + "loss": 0.3202, + "step": 12916 + }, + { + "epoch": 0.5672195571753186, + "grad_norm": 1.8203125, + "learning_rate": 1.9860306635561353e-05, + "loss": 0.3416, + "step": 12918 + }, + { + "epoch": 0.567307375654522, + "grad_norm": 1.6328125, + "learning_rate": 1.985353633230293e-05, + "loss": 0.3111, + "step": 12920 + }, + { + "epoch": 0.5673951941337256, + "grad_norm": 1.71875, + "learning_rate": 1.984676642316192e-05, + "loss": 0.3586, + "step": 12922 + }, + { + "epoch": 0.5674830126129291, + "grad_norm": 1.6328125, + "learning_rate": 1.983999690865678e-05, + "loss": 0.3161, + "step": 12924 + }, + { + "epoch": 0.5675708310921326, + "grad_norm": 1.6953125, + "learning_rate": 1.9833227789305906e-05, + "loss": 0.3488, + "step": 12926 + }, + { + "epoch": 0.567658649571336, + "grad_norm": 1.8203125, + "learning_rate": 1.9826459065627694e-05, + "loss": 0.3508, + "step": 12928 + }, + { + "epoch": 0.5677464680505395, + "grad_norm": 2.03125, + "learning_rate": 1.9819690738140484e-05, + "loss": 0.332, + "step": 12930 + }, + { + "epoch": 0.567834286529743, + "grad_norm": 1.703125, + "learning_rate": 1.9812922807362598e-05, + "loss": 0.3561, + "step": 12932 + }, + { + "epoch": 0.5679221050089465, + "grad_norm": 1.6171875, + "learning_rate": 1.980615527381233e-05, + "loss": 0.3248, + "step": 12934 + }, + { + "epoch": 0.56800992348815, + "grad_norm": 1.9765625, + "learning_rate": 1.9799388138007928e-05, + "loss": 0.3775, + "step": 12936 + }, + { + "epoch": 0.5680977419673535, + "grad_norm": 1.75, + "learning_rate": 1.979262140046764e-05, + "loss": 0.3259, + "step": 12938 + }, + { + "epoch": 0.568185560446557, + "grad_norm": 1.84375, + "learning_rate": 1.978585506170965e-05, + "loss": 0.3159, + "step": 12940 + }, + { + "epoch": 0.5682733789257605, + "grad_norm": 1.8984375, + "learning_rate": 1.9779089122252125e-05, + "loss": 0.3324, + "step": 12942 + }, + { + "epoch": 0.5683611974049639, + "grad_norm": 1.6875, + "learning_rate": 1.977232358261321e-05, + "loss": 0.3292, + "step": 12944 + }, + { + "epoch": 0.5684490158841674, + "grad_norm": 1.8125, + "learning_rate": 1.9765558443311004e-05, + "loss": 0.3388, + "step": 12946 + }, + { + "epoch": 0.5685368343633709, + "grad_norm": 1.9375, + "learning_rate": 1.9758793704863606e-05, + "loss": 0.334, + "step": 12948 + }, + { + "epoch": 0.5686246528425744, + "grad_norm": 1.7109375, + "learning_rate": 1.9752029367789023e-05, + "loss": 0.3225, + "step": 12950 + }, + { + "epoch": 0.5687124713217779, + "grad_norm": 1.75, + "learning_rate": 1.9745265432605293e-05, + "loss": 0.3441, + "step": 12952 + }, + { + "epoch": 0.5688002898009814, + "grad_norm": 1.8984375, + "learning_rate": 1.97385018998304e-05, + "loss": 0.3379, + "step": 12954 + }, + { + "epoch": 0.5688881082801849, + "grad_norm": 1.75, + "learning_rate": 1.9731738769982287e-05, + "loss": 0.3578, + "step": 12956 + }, + { + "epoch": 0.5689759267593884, + "grad_norm": 2.15625, + "learning_rate": 1.9724976043578882e-05, + "loss": 0.3437, + "step": 12958 + }, + { + "epoch": 0.5690637452385918, + "grad_norm": 1.640625, + "learning_rate": 1.971821372113808e-05, + "loss": 0.3526, + "step": 12960 + }, + { + "epoch": 0.5691515637177953, + "grad_norm": 1.7109375, + "learning_rate": 1.9711451803177734e-05, + "loss": 0.3622, + "step": 12962 + }, + { + "epoch": 0.5692393821969988, + "grad_norm": 2.140625, + "learning_rate": 1.970469029021568e-05, + "loss": 0.3646, + "step": 12964 + }, + { + "epoch": 0.5693272006762022, + "grad_norm": 1.703125, + "learning_rate": 1.9697929182769712e-05, + "loss": 0.3192, + "step": 12966 + }, + { + "epoch": 0.5694150191554058, + "grad_norm": 1.671875, + "learning_rate": 1.9691168481357594e-05, + "loss": 0.3408, + "step": 12968 + }, + { + "epoch": 0.5695028376346093, + "grad_norm": 1.7421875, + "learning_rate": 1.9684408186497068e-05, + "loss": 0.3438, + "step": 12970 + }, + { + "epoch": 0.5695906561138128, + "grad_norm": 1.703125, + "learning_rate": 1.9677648298705838e-05, + "loss": 0.3413, + "step": 12972 + }, + { + "epoch": 0.5696784745930162, + "grad_norm": 1.9453125, + "learning_rate": 1.9670888818501576e-05, + "loss": 0.35, + "step": 12974 + }, + { + "epoch": 0.5697662930722197, + "grad_norm": 1.6640625, + "learning_rate": 1.9664129746401933e-05, + "loss": 0.327, + "step": 12976 + }, + { + "epoch": 0.5698541115514232, + "grad_norm": 1.8125, + "learning_rate": 1.9657371082924515e-05, + "loss": 0.3242, + "step": 12978 + }, + { + "epoch": 0.5699419300306267, + "grad_norm": 1.859375, + "learning_rate": 1.9650612828586895e-05, + "loss": 0.3106, + "step": 12980 + }, + { + "epoch": 0.5700297485098302, + "grad_norm": 1.6328125, + "learning_rate": 1.964385498390662e-05, + "loss": 0.3208, + "step": 12982 + }, + { + "epoch": 0.5701175669890337, + "grad_norm": 1.8046875, + "learning_rate": 1.9637097549401223e-05, + "loss": 0.3237, + "step": 12984 + }, + { + "epoch": 0.5702053854682372, + "grad_norm": 1.734375, + "learning_rate": 1.9630340525588176e-05, + "loss": 0.3198, + "step": 12986 + }, + { + "epoch": 0.5702932039474407, + "grad_norm": 1.6953125, + "learning_rate": 1.9623583912984938e-05, + "loss": 0.3426, + "step": 12988 + }, + { + "epoch": 0.5703810224266441, + "grad_norm": 1.7890625, + "learning_rate": 1.961682771210894e-05, + "loss": 0.3656, + "step": 12990 + }, + { + "epoch": 0.5704688409058476, + "grad_norm": 1.7421875, + "learning_rate": 1.961007192347756e-05, + "loss": 0.3708, + "step": 12992 + }, + { + "epoch": 0.5705566593850511, + "grad_norm": 1.6328125, + "learning_rate": 1.9603316547608165e-05, + "loss": 0.3506, + "step": 12994 + }, + { + "epoch": 0.5706444778642545, + "grad_norm": 1.71875, + "learning_rate": 1.9596561585018082e-05, + "loss": 0.3227, + "step": 12996 + }, + { + "epoch": 0.5707322963434581, + "grad_norm": 1.8984375, + "learning_rate": 1.9589807036224607e-05, + "loss": 0.3619, + "step": 12998 + }, + { + "epoch": 0.5708201148226616, + "grad_norm": 1.6015625, + "learning_rate": 1.9583052901745012e-05, + "loss": 0.3424, + "step": 13000 + }, + { + "epoch": 0.5709079333018651, + "grad_norm": 1.7890625, + "learning_rate": 1.957629918209652e-05, + "loss": 0.3276, + "step": 13002 + }, + { + "epoch": 0.5709957517810685, + "grad_norm": 1.53125, + "learning_rate": 1.9569545877796336e-05, + "loss": 0.3318, + "step": 13004 + }, + { + "epoch": 0.571083570260272, + "grad_norm": 1.7734375, + "learning_rate": 1.9562792989361628e-05, + "loss": 0.3371, + "step": 13006 + }, + { + "epoch": 0.5711713887394755, + "grad_norm": 1.703125, + "learning_rate": 1.955604051730954e-05, + "loss": 0.3476, + "step": 13008 + }, + { + "epoch": 0.571259207218679, + "grad_norm": 1.8671875, + "learning_rate": 1.9549288462157185e-05, + "loss": 0.3263, + "step": 13010 + }, + { + "epoch": 0.5713470256978824, + "grad_norm": 1.671875, + "learning_rate": 1.9542536824421613e-05, + "loss": 0.3142, + "step": 13012 + }, + { + "epoch": 0.571434844177086, + "grad_norm": 1.7109375, + "learning_rate": 1.953578560461988e-05, + "loss": 0.356, + "step": 13014 + }, + { + "epoch": 0.5715226626562895, + "grad_norm": 1.6328125, + "learning_rate": 1.9529034803269e-05, + "loss": 0.3482, + "step": 13016 + }, + { + "epoch": 0.571610481135493, + "grad_norm": 1.9296875, + "learning_rate": 1.9522284420885942e-05, + "loss": 0.3243, + "step": 13018 + }, + { + "epoch": 0.5716982996146964, + "grad_norm": 1.9140625, + "learning_rate": 1.9515534457987655e-05, + "loss": 0.3365, + "step": 13020 + }, + { + "epoch": 0.5717861180938999, + "grad_norm": 1.671875, + "learning_rate": 1.9508784915091057e-05, + "loss": 0.3138, + "step": 13022 + }, + { + "epoch": 0.5718739365731034, + "grad_norm": 1.609375, + "learning_rate": 1.9502035792713026e-05, + "loss": 0.3385, + "step": 13024 + }, + { + "epoch": 0.5719617550523068, + "grad_norm": 1.609375, + "learning_rate": 1.9495287091370414e-05, + "loss": 0.3446, + "step": 13026 + }, + { + "epoch": 0.5720495735315103, + "grad_norm": 1.640625, + "learning_rate": 1.9488538811580033e-05, + "loss": 0.3483, + "step": 13028 + }, + { + "epoch": 0.5721373920107139, + "grad_norm": 1.546875, + "learning_rate": 1.948179095385867e-05, + "loss": 0.3579, + "step": 13030 + }, + { + "epoch": 0.5722252104899174, + "grad_norm": 1.671875, + "learning_rate": 1.947504351872308e-05, + "loss": 0.3803, + "step": 13032 + }, + { + "epoch": 0.5723130289691208, + "grad_norm": 1.75, + "learning_rate": 1.946829650668998e-05, + "loss": 0.3694, + "step": 13034 + }, + { + "epoch": 0.5724008474483243, + "grad_norm": 1.90625, + "learning_rate": 1.9461549918276066e-05, + "loss": 0.3297, + "step": 13036 + }, + { + "epoch": 0.5724886659275278, + "grad_norm": 1.640625, + "learning_rate": 1.945480375399798e-05, + "loss": 0.3655, + "step": 13038 + }, + { + "epoch": 0.5725764844067313, + "grad_norm": 2.03125, + "learning_rate": 1.944805801437236e-05, + "loss": 0.3315, + "step": 13040 + }, + { + "epoch": 0.5726643028859347, + "grad_norm": 1.9609375, + "learning_rate": 1.944131269991579e-05, + "loss": 0.3349, + "step": 13042 + }, + { + "epoch": 0.5727521213651383, + "grad_norm": 1.71875, + "learning_rate": 1.9434567811144813e-05, + "loss": 0.3429, + "step": 13044 + }, + { + "epoch": 0.5728399398443418, + "grad_norm": 1.828125, + "learning_rate": 1.9427823348575972e-05, + "loss": 0.3571, + "step": 13046 + }, + { + "epoch": 0.5729277583235453, + "grad_norm": 1.6953125, + "learning_rate": 1.9421079312725755e-05, + "loss": 0.3535, + "step": 13048 + }, + { + "epoch": 0.5730155768027487, + "grad_norm": 1.9140625, + "learning_rate": 1.941433570411062e-05, + "loss": 0.298, + "step": 13050 + }, + { + "epoch": 0.5731033952819522, + "grad_norm": 1.75, + "learning_rate": 1.9407592523247002e-05, + "loss": 0.3675, + "step": 13052 + }, + { + "epoch": 0.5731912137611557, + "grad_norm": 1.6015625, + "learning_rate": 1.940084977065128e-05, + "loss": 0.3698, + "step": 13054 + }, + { + "epoch": 0.5732790322403591, + "grad_norm": 1.7109375, + "learning_rate": 1.9394107446839828e-05, + "loss": 0.3116, + "step": 13056 + }, + { + "epoch": 0.5733668507195626, + "grad_norm": 1.703125, + "learning_rate": 1.9387365552328973e-05, + "loss": 0.3263, + "step": 13058 + }, + { + "epoch": 0.5734546691987662, + "grad_norm": 1.6484375, + "learning_rate": 1.938062408763501e-05, + "loss": 0.3317, + "step": 13060 + }, + { + "epoch": 0.5735424876779697, + "grad_norm": 1.703125, + "learning_rate": 1.9373883053274206e-05, + "loss": 0.346, + "step": 13062 + }, + { + "epoch": 0.5736303061571731, + "grad_norm": 1.671875, + "learning_rate": 1.936714244976278e-05, + "loss": 0.3404, + "step": 13064 + }, + { + "epoch": 0.5737181246363766, + "grad_norm": 1.7578125, + "learning_rate": 1.9360402277616936e-05, + "loss": 0.3712, + "step": 13066 + }, + { + "epoch": 0.5738059431155801, + "grad_norm": 1.671875, + "learning_rate": 1.9353662537352833e-05, + "loss": 0.3626, + "step": 13068 + }, + { + "epoch": 0.5738937615947836, + "grad_norm": 1.5859375, + "learning_rate": 1.9346923229486625e-05, + "loss": 0.3185, + "step": 13070 + }, + { + "epoch": 0.573981580073987, + "grad_norm": 1.5234375, + "learning_rate": 1.9340184354534376e-05, + "loss": 0.4062, + "step": 13072 + }, + { + "epoch": 0.5740693985531905, + "grad_norm": 1.8203125, + "learning_rate": 1.9333445913012165e-05, + "loss": 0.3457, + "step": 13074 + }, + { + "epoch": 0.5741572170323941, + "grad_norm": 1.6015625, + "learning_rate": 1.9326707905436026e-05, + "loss": 0.3193, + "step": 13076 + }, + { + "epoch": 0.5742450355115976, + "grad_norm": 1.8046875, + "learning_rate": 1.931997033232196e-05, + "loss": 0.3622, + "step": 13078 + }, + { + "epoch": 0.574332853990801, + "grad_norm": 1.765625, + "learning_rate": 1.9313233194185925e-05, + "loss": 0.336, + "step": 13080 + }, + { + "epoch": 0.5744206724700045, + "grad_norm": 1.6015625, + "learning_rate": 1.9306496491543853e-05, + "loss": 0.3208, + "step": 13082 + }, + { + "epoch": 0.574508490949208, + "grad_norm": 1.7578125, + "learning_rate": 1.9299760224911644e-05, + "loss": 0.3113, + "step": 13084 + }, + { + "epoch": 0.5745963094284114, + "grad_norm": 1.8125, + "learning_rate": 1.9293024394805166e-05, + "loss": 0.3527, + "step": 13086 + }, + { + "epoch": 0.5746841279076149, + "grad_norm": 1.546875, + "learning_rate": 1.928628900174025e-05, + "loss": 0.3387, + "step": 13088 + }, + { + "epoch": 0.5747719463868185, + "grad_norm": 1.7109375, + "learning_rate": 1.927955404623269e-05, + "loss": 0.3336, + "step": 13090 + }, + { + "epoch": 0.574859764866022, + "grad_norm": 1.765625, + "learning_rate": 1.927281952879825e-05, + "loss": 0.3422, + "step": 13092 + }, + { + "epoch": 0.5749475833452254, + "grad_norm": 1.59375, + "learning_rate": 1.9266085449952664e-05, + "loss": 0.3387, + "step": 13094 + }, + { + "epoch": 0.5750354018244289, + "grad_norm": 1.65625, + "learning_rate": 1.925935181021163e-05, + "loss": 0.3232, + "step": 13096 + }, + { + "epoch": 0.5751232203036324, + "grad_norm": 1.8203125, + "learning_rate": 1.925261861009081e-05, + "loss": 0.3756, + "step": 13098 + }, + { + "epoch": 0.5752110387828359, + "grad_norm": 1.734375, + "learning_rate": 1.9245885850105834e-05, + "loss": 0.3472, + "step": 13100 + }, + { + "epoch": 0.5752988572620393, + "grad_norm": 1.5859375, + "learning_rate": 1.9239153530772307e-05, + "loss": 0.3247, + "step": 13102 + }, + { + "epoch": 0.5753866757412428, + "grad_norm": 1.7578125, + "learning_rate": 1.923242165260578e-05, + "loss": 0.3371, + "step": 13104 + }, + { + "epoch": 0.5754744942204464, + "grad_norm": 1.8515625, + "learning_rate": 1.9225690216121782e-05, + "loss": 0.3503, + "step": 13106 + }, + { + "epoch": 0.5755623126996499, + "grad_norm": 1.7265625, + "learning_rate": 1.921895922183581e-05, + "loss": 0.3884, + "step": 13108 + }, + { + "epoch": 0.5756501311788533, + "grad_norm": 1.875, + "learning_rate": 1.9212228670263326e-05, + "loss": 0.322, + "step": 13110 + }, + { + "epoch": 0.5757379496580568, + "grad_norm": 1.546875, + "learning_rate": 1.920549856191976e-05, + "loss": 0.3361, + "step": 13112 + }, + { + "epoch": 0.5758257681372603, + "grad_norm": 1.640625, + "learning_rate": 1.9198768897320503e-05, + "loss": 0.3715, + "step": 13114 + }, + { + "epoch": 0.5759135866164637, + "grad_norm": 1.71875, + "learning_rate": 1.9192039676980917e-05, + "loss": 0.3504, + "step": 13116 + }, + { + "epoch": 0.5760014050956672, + "grad_norm": 1.640625, + "learning_rate": 1.918531090141632e-05, + "loss": 0.3315, + "step": 13118 + }, + { + "epoch": 0.5760892235748707, + "grad_norm": 1.7109375, + "learning_rate": 1.9178582571142008e-05, + "loss": 0.3459, + "step": 13120 + }, + { + "epoch": 0.5761770420540743, + "grad_norm": 1.734375, + "learning_rate": 1.917185468667324e-05, + "loss": 0.3454, + "step": 13122 + }, + { + "epoch": 0.5762648605332777, + "grad_norm": 1.78125, + "learning_rate": 1.916512724852524e-05, + "loss": 0.3098, + "step": 13124 + }, + { + "epoch": 0.5763526790124812, + "grad_norm": 1.6015625, + "learning_rate": 1.9158400257213195e-05, + "loss": 0.321, + "step": 13126 + }, + { + "epoch": 0.5764404974916847, + "grad_norm": 1.84375, + "learning_rate": 1.9151673713252253e-05, + "loss": 0.3499, + "step": 13128 + }, + { + "epoch": 0.5765283159708882, + "grad_norm": 1.84375, + "learning_rate": 1.914494761715754e-05, + "loss": 0.3356, + "step": 13130 + }, + { + "epoch": 0.5766161344500916, + "grad_norm": 1.859375, + "learning_rate": 1.9138221969444153e-05, + "loss": 0.3424, + "step": 13132 + }, + { + "epoch": 0.5767039529292951, + "grad_norm": 1.6640625, + "learning_rate": 1.9131496770627123e-05, + "loss": 0.3522, + "step": 13134 + }, + { + "epoch": 0.5767917714084987, + "grad_norm": 1.65625, + "learning_rate": 1.9124772021221476e-05, + "loss": 0.3515, + "step": 13136 + }, + { + "epoch": 0.5768795898877022, + "grad_norm": 1.7421875, + "learning_rate": 1.91180477217422e-05, + "loss": 0.3192, + "step": 13138 + }, + { + "epoch": 0.5769674083669056, + "grad_norm": 1.75, + "learning_rate": 1.911132387270424e-05, + "loss": 0.353, + "step": 13140 + }, + { + "epoch": 0.5770552268461091, + "grad_norm": 1.671875, + "learning_rate": 1.9104600474622505e-05, + "loss": 0.3523, + "step": 13142 + }, + { + "epoch": 0.5771430453253126, + "grad_norm": 1.6015625, + "learning_rate": 1.9097877528011882e-05, + "loss": 0.3311, + "step": 13144 + }, + { + "epoch": 0.577230863804516, + "grad_norm": 1.7890625, + "learning_rate": 1.9091155033387208e-05, + "loss": 0.3442, + "step": 13146 + }, + { + "epoch": 0.5773186822837195, + "grad_norm": 1.8125, + "learning_rate": 1.9084432991263302e-05, + "loss": 0.3278, + "step": 13148 + }, + { + "epoch": 0.577406500762923, + "grad_norm": 1.6796875, + "learning_rate": 1.9077711402154942e-05, + "loss": 0.3689, + "step": 13150 + }, + { + "epoch": 0.5774943192421266, + "grad_norm": 1.640625, + "learning_rate": 1.907099026657685e-05, + "loss": 0.3226, + "step": 13152 + }, + { + "epoch": 0.57758213772133, + "grad_norm": 1.7734375, + "learning_rate": 1.906426958504375e-05, + "loss": 0.3529, + "step": 13154 + }, + { + "epoch": 0.5776699562005335, + "grad_norm": 1.734375, + "learning_rate": 1.905754935807031e-05, + "loss": 0.3441, + "step": 13156 + }, + { + "epoch": 0.577757774679737, + "grad_norm": 1.6171875, + "learning_rate": 1.905082958617116e-05, + "loss": 0.3536, + "step": 13158 + }, + { + "epoch": 0.5778455931589405, + "grad_norm": 1.703125, + "learning_rate": 1.9044110269860916e-05, + "loss": 0.3286, + "step": 13160 + }, + { + "epoch": 0.5779334116381439, + "grad_norm": 1.6796875, + "learning_rate": 1.9037391409654133e-05, + "loss": 0.3429, + "step": 13162 + }, + { + "epoch": 0.5780212301173474, + "grad_norm": 1.71875, + "learning_rate": 1.9030673006065346e-05, + "loss": 0.3406, + "step": 13164 + }, + { + "epoch": 0.5781090485965509, + "grad_norm": 1.6796875, + "learning_rate": 1.9023955059609042e-05, + "loss": 0.3143, + "step": 13166 + }, + { + "epoch": 0.5781968670757545, + "grad_norm": 1.6484375, + "learning_rate": 1.9017237570799697e-05, + "loss": 0.3395, + "step": 13168 + }, + { + "epoch": 0.5782846855549579, + "grad_norm": 1.8671875, + "learning_rate": 1.901052054015173e-05, + "loss": 0.3471, + "step": 13170 + }, + { + "epoch": 0.5783725040341614, + "grad_norm": 1.65625, + "learning_rate": 1.9003803968179537e-05, + "loss": 0.3708, + "step": 13172 + }, + { + "epoch": 0.5784603225133649, + "grad_norm": 1.53125, + "learning_rate": 1.899708785539747e-05, + "loss": 0.3453, + "step": 13174 + }, + { + "epoch": 0.5785481409925683, + "grad_norm": 1.671875, + "learning_rate": 1.8990372202319864e-05, + "loss": 0.3442, + "step": 13176 + }, + { + "epoch": 0.5786359594717718, + "grad_norm": 1.65625, + "learning_rate": 1.898365700946099e-05, + "loss": 0.3297, + "step": 13178 + }, + { + "epoch": 0.5787237779509753, + "grad_norm": 1.65625, + "learning_rate": 1.8976942277335103e-05, + "loss": 0.3381, + "step": 13180 + }, + { + "epoch": 0.5788115964301788, + "grad_norm": 1.703125, + "learning_rate": 1.897022800645642e-05, + "loss": 0.335, + "step": 13182 + }, + { + "epoch": 0.5788994149093823, + "grad_norm": 1.8671875, + "learning_rate": 1.896351419733912e-05, + "loss": 0.3333, + "step": 13184 + }, + { + "epoch": 0.5789872333885858, + "grad_norm": 1.6328125, + "learning_rate": 1.895680085049736e-05, + "loss": 0.3333, + "step": 13186 + }, + { + "epoch": 0.5790750518677893, + "grad_norm": 1.734375, + "learning_rate": 1.895008796644523e-05, + "loss": 0.3366, + "step": 13188 + }, + { + "epoch": 0.5791628703469928, + "grad_norm": 1.609375, + "learning_rate": 1.8943375545696816e-05, + "loss": 0.3414, + "step": 13190 + }, + { + "epoch": 0.5792506888261962, + "grad_norm": 1.7890625, + "learning_rate": 1.8936663588766154e-05, + "loss": 0.3546, + "step": 13192 + }, + { + "epoch": 0.5793385073053997, + "grad_norm": 1.7109375, + "learning_rate": 1.8929952096167258e-05, + "loss": 0.3291, + "step": 13194 + }, + { + "epoch": 0.5794263257846032, + "grad_norm": 1.65625, + "learning_rate": 1.8923241068414076e-05, + "loss": 0.3341, + "step": 13196 + }, + { + "epoch": 0.5795141442638068, + "grad_norm": 1.6875, + "learning_rate": 1.891653050602055e-05, + "loss": 0.3349, + "step": 13198 + }, + { + "epoch": 0.5796019627430102, + "grad_norm": 1.5625, + "learning_rate": 1.8909820409500573e-05, + "loss": 0.3762, + "step": 13200 + }, + { + "epoch": 0.5796897812222137, + "grad_norm": 1.734375, + "learning_rate": 1.8903110779368018e-05, + "loss": 0.3333, + "step": 13202 + }, + { + "epoch": 0.5797775997014172, + "grad_norm": 1.6796875, + "learning_rate": 1.8896401616136692e-05, + "loss": 0.3159, + "step": 13204 + }, + { + "epoch": 0.5798654181806207, + "grad_norm": 1.6640625, + "learning_rate": 1.8889692920320397e-05, + "loss": 0.3489, + "step": 13206 + }, + { + "epoch": 0.5799532366598241, + "grad_norm": 1.7265625, + "learning_rate": 1.888298469243288e-05, + "loss": 0.3647, + "step": 13208 + }, + { + "epoch": 0.5800410551390276, + "grad_norm": 1.6640625, + "learning_rate": 1.8876276932987864e-05, + "loss": 0.3274, + "step": 13210 + }, + { + "epoch": 0.5801288736182311, + "grad_norm": 1.6171875, + "learning_rate": 1.8869569642499032e-05, + "loss": 0.328, + "step": 13212 + }, + { + "epoch": 0.5802166920974346, + "grad_norm": 1.65625, + "learning_rate": 1.8862862821480025e-05, + "loss": 0.3536, + "step": 13214 + }, + { + "epoch": 0.5803045105766381, + "grad_norm": 1.71875, + "learning_rate": 1.885615647044445e-05, + "loss": 0.331, + "step": 13216 + }, + { + "epoch": 0.5803923290558416, + "grad_norm": 1.6953125, + "learning_rate": 1.8849450589905887e-05, + "loss": 0.3362, + "step": 13218 + }, + { + "epoch": 0.5804801475350451, + "grad_norm": 1.65625, + "learning_rate": 1.884274518037787e-05, + "loss": 0.3125, + "step": 13220 + }, + { + "epoch": 0.5805679660142485, + "grad_norm": 1.8828125, + "learning_rate": 1.883604024237391e-05, + "loss": 0.3284, + "step": 13222 + }, + { + "epoch": 0.580655784493452, + "grad_norm": 1.7109375, + "learning_rate": 1.8829335776407464e-05, + "loss": 0.3651, + "step": 13224 + }, + { + "epoch": 0.5807436029726555, + "grad_norm": 1.8671875, + "learning_rate": 1.8822631782991967e-05, + "loss": 0.337, + "step": 13226 + }, + { + "epoch": 0.580831421451859, + "grad_norm": 1.5703125, + "learning_rate": 1.8815928262640804e-05, + "loss": 0.3373, + "step": 13228 + }, + { + "epoch": 0.5809192399310625, + "grad_norm": 1.7109375, + "learning_rate": 1.8809225215867334e-05, + "loss": 0.3681, + "step": 13230 + }, + { + "epoch": 0.581007058410266, + "grad_norm": 1.8046875, + "learning_rate": 1.8802522643184882e-05, + "loss": 0.3509, + "step": 13232 + }, + { + "epoch": 0.5810948768894695, + "grad_norm": 1.5546875, + "learning_rate": 1.879582054510673e-05, + "loss": 0.3459, + "step": 13234 + }, + { + "epoch": 0.581182695368673, + "grad_norm": 1.8046875, + "learning_rate": 1.878911892214613e-05, + "loss": 0.3446, + "step": 13236 + }, + { + "epoch": 0.5812705138478764, + "grad_norm": 1.6796875, + "learning_rate": 1.8782417774816295e-05, + "loss": 0.3414, + "step": 13238 + }, + { + "epoch": 0.5813583323270799, + "grad_norm": 1.5859375, + "learning_rate": 1.8775717103630395e-05, + "loss": 0.3279, + "step": 13240 + }, + { + "epoch": 0.5814461508062834, + "grad_norm": 1.59375, + "learning_rate": 1.876901690910157e-05, + "loss": 0.3356, + "step": 13242 + }, + { + "epoch": 0.581533969285487, + "grad_norm": 1.6328125, + "learning_rate": 1.8762317191742924e-05, + "loss": 0.3271, + "step": 13244 + }, + { + "epoch": 0.5816217877646904, + "grad_norm": 1.671875, + "learning_rate": 1.875561795206752e-05, + "loss": 0.3237, + "step": 13246 + }, + { + "epoch": 0.5817096062438939, + "grad_norm": 1.7578125, + "learning_rate": 1.8748919190588398e-05, + "loss": 0.3382, + "step": 13248 + }, + { + "epoch": 0.5817974247230974, + "grad_norm": 1.7421875, + "learning_rate": 1.8742220907818535e-05, + "loss": 0.3319, + "step": 13250 + }, + { + "epoch": 0.5818852432023008, + "grad_norm": 1.6875, + "learning_rate": 1.87355231042709e-05, + "loss": 0.3463, + "step": 13252 + }, + { + "epoch": 0.5819730616815043, + "grad_norm": 1.6484375, + "learning_rate": 1.8728825780458415e-05, + "loss": 0.3481, + "step": 13254 + }, + { + "epoch": 0.5820608801607078, + "grad_norm": 1.640625, + "learning_rate": 1.8722128936893946e-05, + "loss": 0.3337, + "step": 13256 + }, + { + "epoch": 0.5821486986399113, + "grad_norm": 1.671875, + "learning_rate": 1.8715432574090344e-05, + "loss": 0.3141, + "step": 13258 + }, + { + "epoch": 0.5822365171191148, + "grad_norm": 1.640625, + "learning_rate": 1.8708736692560424e-05, + "loss": 0.3838, + "step": 13260 + }, + { + "epoch": 0.5823243355983183, + "grad_norm": 1.6796875, + "learning_rate": 1.8702041292816963e-05, + "loss": 0.3102, + "step": 13262 + }, + { + "epoch": 0.5824121540775218, + "grad_norm": 1.796875, + "learning_rate": 1.8695346375372686e-05, + "loss": 0.3297, + "step": 13264 + }, + { + "epoch": 0.5824999725567253, + "grad_norm": 1.7734375, + "learning_rate": 1.868865194074029e-05, + "loss": 0.3576, + "step": 13266 + }, + { + "epoch": 0.5825877910359287, + "grad_norm": 1.953125, + "learning_rate": 1.868195798943244e-05, + "loss": 0.3449, + "step": 13268 + }, + { + "epoch": 0.5826756095151322, + "grad_norm": 1.6640625, + "learning_rate": 1.8675264521961765e-05, + "loss": 0.3412, + "step": 13270 + }, + { + "epoch": 0.5827634279943357, + "grad_norm": 1.71875, + "learning_rate": 1.866857153884085e-05, + "loss": 0.3659, + "step": 13272 + }, + { + "epoch": 0.5828512464735391, + "grad_norm": 1.7734375, + "learning_rate": 1.8661879040582254e-05, + "loss": 0.3646, + "step": 13274 + }, + { + "epoch": 0.5829390649527427, + "grad_norm": 1.7265625, + "learning_rate": 1.8655187027698467e-05, + "loss": 0.3396, + "step": 13276 + }, + { + "epoch": 0.5830268834319462, + "grad_norm": 1.6484375, + "learning_rate": 1.864849550070198e-05, + "loss": 0.3503, + "step": 13278 + }, + { + "epoch": 0.5831147019111497, + "grad_norm": 1.8125, + "learning_rate": 1.8641804460105233e-05, + "loss": 0.3492, + "step": 13280 + }, + { + "epoch": 0.5832025203903531, + "grad_norm": 1.6484375, + "learning_rate": 1.8635113906420622e-05, + "loss": 0.3658, + "step": 13282 + }, + { + "epoch": 0.5832903388695566, + "grad_norm": 1.796875, + "learning_rate": 1.862842384016052e-05, + "loss": 0.3581, + "step": 13284 + }, + { + "epoch": 0.5833781573487601, + "grad_norm": 1.6640625, + "learning_rate": 1.862173426183725e-05, + "loss": 0.3505, + "step": 13286 + }, + { + "epoch": 0.5834659758279636, + "grad_norm": 1.9765625, + "learning_rate": 1.8615045171963098e-05, + "loss": 0.3254, + "step": 13288 + }, + { + "epoch": 0.5835537943071671, + "grad_norm": 1.8515625, + "learning_rate": 1.860835657105031e-05, + "loss": 0.318, + "step": 13290 + }, + { + "epoch": 0.5836416127863706, + "grad_norm": 1.671875, + "learning_rate": 1.8601668459611106e-05, + "loss": 0.3535, + "step": 13292 + }, + { + "epoch": 0.5837294312655741, + "grad_norm": 1.859375, + "learning_rate": 1.859498083815767e-05, + "loss": 0.3671, + "step": 13294 + }, + { + "epoch": 0.5838172497447776, + "grad_norm": 1.65625, + "learning_rate": 1.8588293707202133e-05, + "loss": 0.3476, + "step": 13296 + }, + { + "epoch": 0.583905068223981, + "grad_norm": 1.6953125, + "learning_rate": 1.858160706725661e-05, + "loss": 0.3515, + "step": 13298 + }, + { + "epoch": 0.5839928867031845, + "grad_norm": 1.6875, + "learning_rate": 1.857492091883315e-05, + "loss": 0.3621, + "step": 13300 + }, + { + "epoch": 0.584080705182388, + "grad_norm": 1.6171875, + "learning_rate": 1.8568235262443782e-05, + "loss": 0.3171, + "step": 13302 + }, + { + "epoch": 0.5841685236615914, + "grad_norm": 1.6171875, + "learning_rate": 1.8561550098600505e-05, + "loss": 0.351, + "step": 13304 + }, + { + "epoch": 0.584256342140795, + "grad_norm": 2.046875, + "learning_rate": 1.8554865427815262e-05, + "loss": 0.3387, + "step": 13306 + }, + { + "epoch": 0.5843441606199985, + "grad_norm": 1.765625, + "learning_rate": 1.854818125059997e-05, + "loss": 0.3562, + "step": 13308 + }, + { + "epoch": 0.584431979099202, + "grad_norm": 1.8984375, + "learning_rate": 1.8541497567466508e-05, + "loss": 0.3394, + "step": 13310 + }, + { + "epoch": 0.5845197975784054, + "grad_norm": 1.65625, + "learning_rate": 1.853481437892671e-05, + "loss": 0.3271, + "step": 13312 + }, + { + "epoch": 0.5846076160576089, + "grad_norm": 1.6171875, + "learning_rate": 1.852813168549237e-05, + "loss": 0.3523, + "step": 13314 + }, + { + "epoch": 0.5846954345368124, + "grad_norm": 1.75, + "learning_rate": 1.8521449487675275e-05, + "loss": 0.3484, + "step": 13316 + }, + { + "epoch": 0.5847832530160159, + "grad_norm": 1.7734375, + "learning_rate": 1.851476778598712e-05, + "loss": 0.3407, + "step": 13318 + }, + { + "epoch": 0.5848710714952193, + "grad_norm": 1.6328125, + "learning_rate": 1.8508086580939598e-05, + "loss": 0.3632, + "step": 13320 + }, + { + "epoch": 0.5849588899744229, + "grad_norm": 1.859375, + "learning_rate": 1.8501405873044363e-05, + "loss": 0.3391, + "step": 13322 + }, + { + "epoch": 0.5850467084536264, + "grad_norm": 1.5859375, + "learning_rate": 1.8494725662813028e-05, + "loss": 0.3268, + "step": 13324 + }, + { + "epoch": 0.5851345269328299, + "grad_norm": 1.9609375, + "learning_rate": 1.848804595075716e-05, + "loss": 0.3522, + "step": 13326 + }, + { + "epoch": 0.5852223454120333, + "grad_norm": 1.6171875, + "learning_rate": 1.848136673738829e-05, + "loss": 0.316, + "step": 13328 + }, + { + "epoch": 0.5853101638912368, + "grad_norm": 1.8515625, + "learning_rate": 1.847468802321792e-05, + "loss": 0.3361, + "step": 13330 + }, + { + "epoch": 0.5853979823704403, + "grad_norm": 1.7734375, + "learning_rate": 1.8468009808757505e-05, + "loss": 0.3726, + "step": 13332 + }, + { + "epoch": 0.5854858008496437, + "grad_norm": 1.828125, + "learning_rate": 1.8461332094518465e-05, + "loss": 0.345, + "step": 13334 + }, + { + "epoch": 0.5855736193288473, + "grad_norm": 1.6640625, + "learning_rate": 1.8454654881012184e-05, + "loss": 0.3274, + "step": 13336 + }, + { + "epoch": 0.5856614378080508, + "grad_norm": 1.6640625, + "learning_rate": 1.8447978168750003e-05, + "loss": 0.3359, + "step": 13338 + }, + { + "epoch": 0.5857492562872543, + "grad_norm": 1.90625, + "learning_rate": 1.844130195824322e-05, + "loss": 0.387, + "step": 13340 + }, + { + "epoch": 0.5858370747664577, + "grad_norm": 1.6796875, + "learning_rate": 1.84346262500031e-05, + "loss": 0.3119, + "step": 13342 + }, + { + "epoch": 0.5859248932456612, + "grad_norm": 1.90625, + "learning_rate": 1.842795104454088e-05, + "loss": 0.3082, + "step": 13344 + }, + { + "epoch": 0.5860127117248647, + "grad_norm": 1.6875, + "learning_rate": 1.8421276342367756e-05, + "loss": 0.3793, + "step": 13346 + }, + { + "epoch": 0.5861005302040682, + "grad_norm": 1.640625, + "learning_rate": 1.8414602143994856e-05, + "loss": 0.3599, + "step": 13348 + }, + { + "epoch": 0.5861883486832716, + "grad_norm": 1.6796875, + "learning_rate": 1.8407928449933304e-05, + "loss": 0.3041, + "step": 13350 + }, + { + "epoch": 0.5862761671624752, + "grad_norm": 1.6015625, + "learning_rate": 1.8401255260694173e-05, + "loss": 0.315, + "step": 13352 + }, + { + "epoch": 0.5863639856416787, + "grad_norm": 1.6953125, + "learning_rate": 1.8394582576788487e-05, + "loss": 0.3373, + "step": 13354 + }, + { + "epoch": 0.5864518041208822, + "grad_norm": 1.7578125, + "learning_rate": 1.838791039872726e-05, + "loss": 0.3263, + "step": 13356 + }, + { + "epoch": 0.5865396226000856, + "grad_norm": 1.90625, + "learning_rate": 1.8381238727021433e-05, + "loss": 0.3309, + "step": 13358 + }, + { + "epoch": 0.5866274410792891, + "grad_norm": 1.8515625, + "learning_rate": 1.837456756218194e-05, + "loss": 0.3174, + "step": 13360 + }, + { + "epoch": 0.5867152595584926, + "grad_norm": 1.5546875, + "learning_rate": 1.8367896904719645e-05, + "loss": 0.3269, + "step": 13362 + }, + { + "epoch": 0.586803078037696, + "grad_norm": 1.8515625, + "learning_rate": 1.8361226755145394e-05, + "loss": 0.3741, + "step": 13364 + }, + { + "epoch": 0.5868908965168995, + "grad_norm": 1.59375, + "learning_rate": 1.835455711396999e-05, + "loss": 0.3335, + "step": 13366 + }, + { + "epoch": 0.5869787149961031, + "grad_norm": 1.578125, + "learning_rate": 1.8347887981704195e-05, + "loss": 0.3346, + "step": 13368 + }, + { + "epoch": 0.5870665334753066, + "grad_norm": 1.7890625, + "learning_rate": 1.8341219358858734e-05, + "loss": 0.3419, + "step": 13370 + }, + { + "epoch": 0.58715435195451, + "grad_norm": 1.5390625, + "learning_rate": 1.833455124594429e-05, + "loss": 0.3505, + "step": 13372 + }, + { + "epoch": 0.5872421704337135, + "grad_norm": 1.71875, + "learning_rate": 1.8327883643471514e-05, + "loss": 0.3448, + "step": 13374 + }, + { + "epoch": 0.587329988912917, + "grad_norm": 1.765625, + "learning_rate": 1.8321216551951002e-05, + "loss": 0.3481, + "step": 13376 + }, + { + "epoch": 0.5874178073921205, + "grad_norm": 1.828125, + "learning_rate": 1.8314549971893342e-05, + "loss": 0.3309, + "step": 13378 + }, + { + "epoch": 0.5875056258713239, + "grad_norm": 1.609375, + "learning_rate": 1.8307883903809035e-05, + "loss": 0.3391, + "step": 13380 + }, + { + "epoch": 0.5875934443505274, + "grad_norm": 1.71875, + "learning_rate": 1.8301218348208583e-05, + "loss": 0.3251, + "step": 13382 + }, + { + "epoch": 0.587681262829731, + "grad_norm": 1.640625, + "learning_rate": 1.8294553305602434e-05, + "loss": 0.3132, + "step": 13384 + }, + { + "epoch": 0.5877690813089345, + "grad_norm": 1.6015625, + "learning_rate": 1.8287888776501007e-05, + "loss": 0.3206, + "step": 13386 + }, + { + "epoch": 0.5878568997881379, + "grad_norm": 1.65625, + "learning_rate": 1.828122476141466e-05, + "loss": 0.3317, + "step": 13388 + }, + { + "epoch": 0.5879447182673414, + "grad_norm": 1.6875, + "learning_rate": 1.8274561260853733e-05, + "loss": 0.3096, + "step": 13390 + }, + { + "epoch": 0.5880325367465449, + "grad_norm": 1.6796875, + "learning_rate": 1.8267898275328516e-05, + "loss": 0.3219, + "step": 13392 + }, + { + "epoch": 0.5881203552257483, + "grad_norm": 1.6796875, + "learning_rate": 1.8261235805349263e-05, + "loss": 0.322, + "step": 13394 + }, + { + "epoch": 0.5882081737049518, + "grad_norm": 1.6875, + "learning_rate": 1.8254573851426195e-05, + "loss": 0.354, + "step": 13396 + }, + { + "epoch": 0.5882959921841554, + "grad_norm": 1.6171875, + "learning_rate": 1.824791241406947e-05, + "loss": 0.3446, + "step": 13398 + }, + { + "epoch": 0.5883838106633589, + "grad_norm": 1.75, + "learning_rate": 1.824125149378923e-05, + "loss": 0.3736, + "step": 13400 + }, + { + "epoch": 0.5884716291425623, + "grad_norm": 1.65625, + "learning_rate": 1.823459109109557e-05, + "loss": 0.3377, + "step": 13402 + }, + { + "epoch": 0.5885594476217658, + "grad_norm": 1.640625, + "learning_rate": 1.822793120649855e-05, + "loss": 0.3308, + "step": 13404 + }, + { + "epoch": 0.5886472661009693, + "grad_norm": 1.828125, + "learning_rate": 1.822127184050818e-05, + "loss": 0.3304, + "step": 13406 + }, + { + "epoch": 0.5887350845801728, + "grad_norm": 1.7109375, + "learning_rate": 1.821461299363445e-05, + "loss": 0.3367, + "step": 13408 + }, + { + "epoch": 0.5888229030593762, + "grad_norm": 1.8046875, + "learning_rate": 1.820795466638727e-05, + "loss": 0.3298, + "step": 13410 + }, + { + "epoch": 0.5889107215385797, + "grad_norm": 1.7734375, + "learning_rate": 1.8201296859276555e-05, + "loss": 0.3305, + "step": 13412 + }, + { + "epoch": 0.5889985400177833, + "grad_norm": 1.6640625, + "learning_rate": 1.8194639572812157e-05, + "loss": 0.3244, + "step": 13414 + }, + { + "epoch": 0.5890863584969868, + "grad_norm": 1.5625, + "learning_rate": 1.8187982807503885e-05, + "loss": 0.3538, + "step": 13416 + }, + { + "epoch": 0.5891741769761902, + "grad_norm": 1.671875, + "learning_rate": 1.818132656386152e-05, + "loss": 0.3192, + "step": 13418 + }, + { + "epoch": 0.5892619954553937, + "grad_norm": 1.671875, + "learning_rate": 1.817467084239481e-05, + "loss": 0.3559, + "step": 13420 + }, + { + "epoch": 0.5893498139345972, + "grad_norm": 1.578125, + "learning_rate": 1.8168015643613445e-05, + "loss": 0.333, + "step": 13422 + }, + { + "epoch": 0.5894376324138006, + "grad_norm": 1.6953125, + "learning_rate": 1.8161360968027073e-05, + "loss": 0.324, + "step": 13424 + }, + { + "epoch": 0.5895254508930041, + "grad_norm": 1.5703125, + "learning_rate": 1.8154706816145315e-05, + "loss": 0.3465, + "step": 13426 + }, + { + "epoch": 0.5896132693722076, + "grad_norm": 1.671875, + "learning_rate": 1.8148053188477754e-05, + "loss": 0.3364, + "step": 13428 + }, + { + "epoch": 0.5897010878514112, + "grad_norm": 1.7265625, + "learning_rate": 1.814140008553392e-05, + "loss": 0.3523, + "step": 13430 + }, + { + "epoch": 0.5897889063306146, + "grad_norm": 1.9375, + "learning_rate": 1.8134747507823323e-05, + "loss": 0.3177, + "step": 13432 + }, + { + "epoch": 0.5898767248098181, + "grad_norm": 1.7734375, + "learning_rate": 1.8128095455855395e-05, + "loss": 0.3485, + "step": 13434 + }, + { + "epoch": 0.5899645432890216, + "grad_norm": 1.6640625, + "learning_rate": 1.8121443930139568e-05, + "loss": 0.3294, + "step": 13436 + }, + { + "epoch": 0.5900523617682251, + "grad_norm": 1.8046875, + "learning_rate": 1.8114792931185223e-05, + "loss": 0.3434, + "step": 13438 + }, + { + "epoch": 0.5901401802474285, + "grad_norm": 1.6640625, + "learning_rate": 1.8108142459501674e-05, + "loss": 0.3251, + "step": 13440 + }, + { + "epoch": 0.590227998726632, + "grad_norm": 1.7421875, + "learning_rate": 1.810149251559823e-05, + "loss": 0.3167, + "step": 13442 + }, + { + "epoch": 0.5903158172058356, + "grad_norm": 1.6953125, + "learning_rate": 1.8094843099984144e-05, + "loss": 0.3115, + "step": 13444 + }, + { + "epoch": 0.5904036356850391, + "grad_norm": 1.6484375, + "learning_rate": 1.8088194213168626e-05, + "loss": 0.34, + "step": 13446 + }, + { + "epoch": 0.5904914541642425, + "grad_norm": 1.6328125, + "learning_rate": 1.8081545855660858e-05, + "loss": 0.3425, + "step": 13448 + }, + { + "epoch": 0.590579272643446, + "grad_norm": 1.6640625, + "learning_rate": 1.807489802796996e-05, + "loss": 0.3633, + "step": 13450 + }, + { + "epoch": 0.5906670911226495, + "grad_norm": 1.703125, + "learning_rate": 1.8068250730605034e-05, + "loss": 0.332, + "step": 13452 + }, + { + "epoch": 0.590754909601853, + "grad_norm": 1.6796875, + "learning_rate": 1.8061603964075125e-05, + "loss": 0.3062, + "step": 13454 + }, + { + "epoch": 0.5908427280810564, + "grad_norm": 1.6328125, + "learning_rate": 1.805495772888925e-05, + "loss": 0.3278, + "step": 13456 + }, + { + "epoch": 0.5909305465602599, + "grad_norm": 1.5390625, + "learning_rate": 1.8048312025556384e-05, + "loss": 0.3581, + "step": 13458 + }, + { + "epoch": 0.5910183650394635, + "grad_norm": 1.703125, + "learning_rate": 1.8041666854585443e-05, + "loss": 0.3449, + "step": 13460 + }, + { + "epoch": 0.591106183518667, + "grad_norm": 1.8203125, + "learning_rate": 1.803502221648532e-05, + "loss": 0.3533, + "step": 13462 + }, + { + "epoch": 0.5911940019978704, + "grad_norm": 1.7109375, + "learning_rate": 1.8028378111764864e-05, + "loss": 0.2833, + "step": 13464 + }, + { + "epoch": 0.5912818204770739, + "grad_norm": 1.5859375, + "learning_rate": 1.802173454093289e-05, + "loss": 0.338, + "step": 13466 + }, + { + "epoch": 0.5913696389562774, + "grad_norm": 1.8828125, + "learning_rate": 1.801509150449815e-05, + "loss": 0.3585, + "step": 13468 + }, + { + "epoch": 0.5914574574354808, + "grad_norm": 1.6953125, + "learning_rate": 1.8008449002969395e-05, + "loss": 0.3445, + "step": 13470 + }, + { + "epoch": 0.5915452759146843, + "grad_norm": 1.65625, + "learning_rate": 1.8001807036855278e-05, + "loss": 0.3491, + "step": 13472 + }, + { + "epoch": 0.5916330943938878, + "grad_norm": 1.59375, + "learning_rate": 1.799516560666446e-05, + "loss": 0.3108, + "step": 13474 + }, + { + "epoch": 0.5917209128730914, + "grad_norm": 1.7421875, + "learning_rate": 1.7988524712905532e-05, + "loss": 0.3521, + "step": 13476 + }, + { + "epoch": 0.5918087313522948, + "grad_norm": 1.7890625, + "learning_rate": 1.7981884356087065e-05, + "loss": 0.3642, + "step": 13478 + }, + { + "epoch": 0.5918965498314983, + "grad_norm": 1.7421875, + "learning_rate": 1.7975244536717576e-05, + "loss": 0.3198, + "step": 13480 + }, + { + "epoch": 0.5919843683107018, + "grad_norm": 1.6484375, + "learning_rate": 1.7968605255305542e-05, + "loss": 0.344, + "step": 13482 + }, + { + "epoch": 0.5920721867899053, + "grad_norm": 1.671875, + "learning_rate": 1.7961966512359414e-05, + "loss": 0.3449, + "step": 13484 + }, + { + "epoch": 0.5921600052691087, + "grad_norm": 1.53125, + "learning_rate": 1.7955328308387563e-05, + "loss": 0.3294, + "step": 13486 + }, + { + "epoch": 0.5922478237483122, + "grad_norm": 1.6015625, + "learning_rate": 1.794869064389837e-05, + "loss": 0.3317, + "step": 13488 + }, + { + "epoch": 0.5923356422275158, + "grad_norm": 1.703125, + "learning_rate": 1.7942053519400133e-05, + "loss": 0.3468, + "step": 13490 + }, + { + "epoch": 0.5924234607067193, + "grad_norm": 1.53125, + "learning_rate": 1.7935416935401128e-05, + "loss": 0.3326, + "step": 13492 + }, + { + "epoch": 0.5925112791859227, + "grad_norm": 1.609375, + "learning_rate": 1.7928780892409592e-05, + "loss": 0.32, + "step": 13494 + }, + { + "epoch": 0.5925990976651262, + "grad_norm": 1.6015625, + "learning_rate": 1.7922145390933707e-05, + "loss": 0.3368, + "step": 13496 + }, + { + "epoch": 0.5926869161443297, + "grad_norm": 1.71875, + "learning_rate": 1.7915510431481625e-05, + "loss": 0.3298, + "step": 13498 + }, + { + "epoch": 0.5927747346235331, + "grad_norm": 1.796875, + "learning_rate": 1.790887601456146e-05, + "loss": 0.3706, + "step": 13500 + }, + { + "epoch": 0.5928625531027366, + "grad_norm": 1.6875, + "learning_rate": 1.7902242140681264e-05, + "loss": 0.3379, + "step": 13502 + }, + { + "epoch": 0.5929503715819401, + "grad_norm": 1.578125, + "learning_rate": 1.7895608810349062e-05, + "loss": 0.3249, + "step": 13504 + }, + { + "epoch": 0.5930381900611437, + "grad_norm": 1.6953125, + "learning_rate": 1.788897602407284e-05, + "loss": 0.3453, + "step": 13506 + }, + { + "epoch": 0.5931260085403471, + "grad_norm": 1.6640625, + "learning_rate": 1.7882343782360546e-05, + "loss": 0.3356, + "step": 13508 + }, + { + "epoch": 0.5932138270195506, + "grad_norm": 1.59375, + "learning_rate": 1.787571208572007e-05, + "loss": 0.3301, + "step": 13510 + }, + { + "epoch": 0.5933016454987541, + "grad_norm": 1.6640625, + "learning_rate": 1.7869080934659265e-05, + "loss": 0.3048, + "step": 13512 + }, + { + "epoch": 0.5933894639779576, + "grad_norm": 1.7890625, + "learning_rate": 1.7862450329685952e-05, + "loss": 0.3584, + "step": 13514 + }, + { + "epoch": 0.593477282457161, + "grad_norm": 1.625, + "learning_rate": 1.7855820271307906e-05, + "loss": 0.3314, + "step": 13516 + }, + { + "epoch": 0.5935651009363645, + "grad_norm": 1.5859375, + "learning_rate": 1.7849190760032853e-05, + "loss": 0.3615, + "step": 13518 + }, + { + "epoch": 0.593652919415568, + "grad_norm": 1.65625, + "learning_rate": 1.7842561796368496e-05, + "loss": 0.3443, + "step": 13520 + }, + { + "epoch": 0.5937407378947716, + "grad_norm": 1.8828125, + "learning_rate": 1.7835933380822462e-05, + "loss": 0.3515, + "step": 13522 + }, + { + "epoch": 0.593828556373975, + "grad_norm": 1.703125, + "learning_rate": 1.7829305513902366e-05, + "loss": 0.3048, + "step": 13524 + }, + { + "epoch": 0.5939163748531785, + "grad_norm": 1.6640625, + "learning_rate": 1.782267819611578e-05, + "loss": 0.3268, + "step": 13526 + }, + { + "epoch": 0.594004193332382, + "grad_norm": 1.5234375, + "learning_rate": 1.7816051427970213e-05, + "loss": 0.3424, + "step": 13528 + }, + { + "epoch": 0.5940920118115854, + "grad_norm": 1.6484375, + "learning_rate": 1.780942520997316e-05, + "loss": 0.3329, + "step": 13530 + }, + { + "epoch": 0.5941798302907889, + "grad_norm": 1.5625, + "learning_rate": 1.780279954263204e-05, + "loss": 0.3417, + "step": 13532 + }, + { + "epoch": 0.5942676487699924, + "grad_norm": 1.859375, + "learning_rate": 1.7796174426454255e-05, + "loss": 0.3385, + "step": 13534 + }, + { + "epoch": 0.5943554672491959, + "grad_norm": 1.5859375, + "learning_rate": 1.7789549861947165e-05, + "loss": 0.3405, + "step": 13536 + }, + { + "epoch": 0.5944432857283994, + "grad_norm": 1.5625, + "learning_rate": 1.778292584961807e-05, + "loss": 0.3766, + "step": 13538 + }, + { + "epoch": 0.5945311042076029, + "grad_norm": 1.6484375, + "learning_rate": 1.7776302389974243e-05, + "loss": 0.3446, + "step": 13540 + }, + { + "epoch": 0.5946189226868064, + "grad_norm": 1.71875, + "learning_rate": 1.776967948352291e-05, + "loss": 0.318, + "step": 13542 + }, + { + "epoch": 0.5947067411660099, + "grad_norm": 1.6796875, + "learning_rate": 1.7763057130771254e-05, + "loss": 0.353, + "step": 13544 + }, + { + "epoch": 0.5947945596452133, + "grad_norm": 1.6328125, + "learning_rate": 1.7756435332226424e-05, + "loss": 0.3147, + "step": 13546 + }, + { + "epoch": 0.5948823781244168, + "grad_norm": 1.7265625, + "learning_rate": 1.774981408839551e-05, + "loss": 0.3275, + "step": 13548 + }, + { + "epoch": 0.5949701966036203, + "grad_norm": 1.578125, + "learning_rate": 1.774319339978557e-05, + "loss": 0.3295, + "step": 13550 + }, + { + "epoch": 0.5950580150828239, + "grad_norm": 1.75, + "learning_rate": 1.7736573266903617e-05, + "loss": 0.3415, + "step": 13552 + }, + { + "epoch": 0.5951458335620273, + "grad_norm": 1.6875, + "learning_rate": 1.7729953690256625e-05, + "loss": 0.3136, + "step": 13554 + }, + { + "epoch": 0.5952336520412308, + "grad_norm": 1.6328125, + "learning_rate": 1.772333467035153e-05, + "loss": 0.3453, + "step": 13556 + }, + { + "epoch": 0.5953214705204343, + "grad_norm": 1.609375, + "learning_rate": 1.7716716207695202e-05, + "loss": 0.3261, + "step": 13558 + }, + { + "epoch": 0.5954092889996377, + "grad_norm": 2.015625, + "learning_rate": 1.7710098302794495e-05, + "loss": 0.3554, + "step": 13560 + }, + { + "epoch": 0.5954971074788412, + "grad_norm": 1.671875, + "learning_rate": 1.7703480956156215e-05, + "loss": 0.3133, + "step": 13562 + }, + { + "epoch": 0.5955849259580447, + "grad_norm": 1.875, + "learning_rate": 1.7696864168287105e-05, + "loss": 0.3392, + "step": 13564 + }, + { + "epoch": 0.5956727444372482, + "grad_norm": 1.7734375, + "learning_rate": 1.7690247939693887e-05, + "loss": 0.3364, + "step": 13566 + }, + { + "epoch": 0.5957605629164517, + "grad_norm": 1.7421875, + "learning_rate": 1.7683632270883233e-05, + "loss": 0.3271, + "step": 13568 + }, + { + "epoch": 0.5958483813956552, + "grad_norm": 1.7109375, + "learning_rate": 1.7677017162361776e-05, + "loss": 0.3657, + "step": 13570 + }, + { + "epoch": 0.5959361998748587, + "grad_norm": 1.609375, + "learning_rate": 1.7670402614636104e-05, + "loss": 0.327, + "step": 13572 + }, + { + "epoch": 0.5960240183540622, + "grad_norm": 1.625, + "learning_rate": 1.7663788628212752e-05, + "loss": 0.3258, + "step": 13574 + }, + { + "epoch": 0.5961118368332656, + "grad_norm": 1.6796875, + "learning_rate": 1.7657175203598222e-05, + "loss": 0.3347, + "step": 13576 + }, + { + "epoch": 0.5961996553124691, + "grad_norm": 1.7578125, + "learning_rate": 1.765056234129898e-05, + "loss": 0.3133, + "step": 13578 + }, + { + "epoch": 0.5962874737916726, + "grad_norm": 1.84375, + "learning_rate": 1.7643950041821434e-05, + "loss": 0.3587, + "step": 13580 + }, + { + "epoch": 0.596375292270876, + "grad_norm": 1.8125, + "learning_rate": 1.763733830567196e-05, + "loss": 0.3578, + "step": 13582 + }, + { + "epoch": 0.5964631107500796, + "grad_norm": 1.5546875, + "learning_rate": 1.763072713335688e-05, + "loss": 0.3336, + "step": 13584 + }, + { + "epoch": 0.5965509292292831, + "grad_norm": 1.7421875, + "learning_rate": 1.7624116525382482e-05, + "loss": 0.3035, + "step": 13586 + }, + { + "epoch": 0.5966387477084866, + "grad_norm": 1.546875, + "learning_rate": 1.761750648225501e-05, + "loss": 0.33, + "step": 13588 + }, + { + "epoch": 0.59672656618769, + "grad_norm": 1.65625, + "learning_rate": 1.7610897004480658e-05, + "loss": 0.2962, + "step": 13590 + }, + { + "epoch": 0.5968143846668935, + "grad_norm": 1.9453125, + "learning_rate": 1.76042880925656e-05, + "loss": 0.3426, + "step": 13592 + }, + { + "epoch": 0.596902203146097, + "grad_norm": 1.6875, + "learning_rate": 1.7597679747015922e-05, + "loss": 0.3492, + "step": 13594 + }, + { + "epoch": 0.5969900216253005, + "grad_norm": 1.96875, + "learning_rate": 1.75910719683377e-05, + "loss": 0.3161, + "step": 13596 + }, + { + "epoch": 0.597077840104504, + "grad_norm": 1.6171875, + "learning_rate": 1.758446475703697e-05, + "loss": 0.3393, + "step": 13598 + }, + { + "epoch": 0.5971656585837075, + "grad_norm": 1.921875, + "learning_rate": 1.75778581136197e-05, + "loss": 0.3211, + "step": 13600 + }, + { + "epoch": 0.597253477062911, + "grad_norm": 1.828125, + "learning_rate": 1.7571252038591835e-05, + "loss": 0.3378, + "step": 13602 + }, + { + "epoch": 0.5973412955421145, + "grad_norm": 1.6328125, + "learning_rate": 1.7564646532459273e-05, + "loss": 0.34, + "step": 13604 + }, + { + "epoch": 0.5974291140213179, + "grad_norm": 1.734375, + "learning_rate": 1.755804159572786e-05, + "loss": 0.3273, + "step": 13606 + }, + { + "epoch": 0.5975169325005214, + "grad_norm": 1.921875, + "learning_rate": 1.7551437228903407e-05, + "loss": 0.3273, + "step": 13608 + }, + { + "epoch": 0.5976047509797249, + "grad_norm": 1.796875, + "learning_rate": 1.7544833432491674e-05, + "loss": 0.3469, + "step": 13610 + }, + { + "epoch": 0.5976925694589283, + "grad_norm": 2.015625, + "learning_rate": 1.7538230206998386e-05, + "loss": 0.3423, + "step": 13612 + }, + { + "epoch": 0.5977803879381319, + "grad_norm": 1.609375, + "learning_rate": 1.7531627552929214e-05, + "loss": 0.3512, + "step": 13614 + }, + { + "epoch": 0.5978682064173354, + "grad_norm": 1.8984375, + "learning_rate": 1.7525025470789797e-05, + "loss": 0.3472, + "step": 13616 + }, + { + "epoch": 0.5979560248965389, + "grad_norm": 1.6171875, + "learning_rate": 1.7518423961085725e-05, + "loss": 0.3479, + "step": 13618 + }, + { + "epoch": 0.5980438433757423, + "grad_norm": 1.890625, + "learning_rate": 1.7511823024322534e-05, + "loss": 0.3252, + "step": 13620 + }, + { + "epoch": 0.5981316618549458, + "grad_norm": 1.9453125, + "learning_rate": 1.7505222661005745e-05, + "loss": 0.3326, + "step": 13622 + }, + { + "epoch": 0.5982194803341493, + "grad_norm": 1.96875, + "learning_rate": 1.7498622871640785e-05, + "loss": 0.3116, + "step": 13624 + }, + { + "epoch": 0.5983072988133528, + "grad_norm": 1.6875, + "learning_rate": 1.7492023656733085e-05, + "loss": 0.3341, + "step": 13626 + }, + { + "epoch": 0.5983951172925562, + "grad_norm": 1.6484375, + "learning_rate": 1.7485425016788016e-05, + "loss": 0.3458, + "step": 13628 + }, + { + "epoch": 0.5984829357717598, + "grad_norm": 1.609375, + "learning_rate": 1.74788269523109e-05, + "loss": 0.3681, + "step": 13630 + }, + { + "epoch": 0.5985707542509633, + "grad_norm": 1.71875, + "learning_rate": 1.747222946380702e-05, + "loss": 0.3796, + "step": 13632 + }, + { + "epoch": 0.5986585727301668, + "grad_norm": 1.609375, + "learning_rate": 1.7465632551781614e-05, + "loss": 0.3366, + "step": 13634 + }, + { + "epoch": 0.5987463912093702, + "grad_norm": 1.765625, + "learning_rate": 1.745903621673987e-05, + "loss": 0.3106, + "step": 13636 + }, + { + "epoch": 0.5988342096885737, + "grad_norm": 1.5859375, + "learning_rate": 1.745244045918694e-05, + "loss": 0.3482, + "step": 13638 + }, + { + "epoch": 0.5989220281677772, + "grad_norm": 1.6953125, + "learning_rate": 1.744584527962793e-05, + "loss": 0.3297, + "step": 13640 + }, + { + "epoch": 0.5990098466469806, + "grad_norm": 1.6328125, + "learning_rate": 1.7439250678567897e-05, + "loss": 0.3305, + "step": 13642 + }, + { + "epoch": 0.5990976651261842, + "grad_norm": 1.6484375, + "learning_rate": 1.7432656656511866e-05, + "loss": 0.3571, + "step": 13644 + }, + { + "epoch": 0.5991854836053877, + "grad_norm": 1.6171875, + "learning_rate": 1.7426063213964796e-05, + "loss": 0.3437, + "step": 13646 + }, + { + "epoch": 0.5992733020845912, + "grad_norm": 1.609375, + "learning_rate": 1.741947035143162e-05, + "loss": 0.3357, + "step": 13648 + }, + { + "epoch": 0.5993611205637946, + "grad_norm": 1.765625, + "learning_rate": 1.7412878069417227e-05, + "loss": 0.3361, + "step": 13650 + }, + { + "epoch": 0.5994489390429981, + "grad_norm": 1.6796875, + "learning_rate": 1.7406286368426445e-05, + "loss": 0.3687, + "step": 13652 + }, + { + "epoch": 0.5995367575222016, + "grad_norm": 1.625, + "learning_rate": 1.7399695248964086e-05, + "loss": 0.3249, + "step": 13654 + }, + { + "epoch": 0.5996245760014051, + "grad_norm": 1.59375, + "learning_rate": 1.7393104711534874e-05, + "loss": 0.3255, + "step": 13656 + }, + { + "epoch": 0.5997123944806085, + "grad_norm": 1.796875, + "learning_rate": 1.7386514756643536e-05, + "loss": 0.3569, + "step": 13658 + }, + { + "epoch": 0.5998002129598121, + "grad_norm": 1.5, + "learning_rate": 1.7379925384794716e-05, + "loss": 0.3281, + "step": 13660 + }, + { + "epoch": 0.5998880314390156, + "grad_norm": 1.609375, + "learning_rate": 1.7373336596493033e-05, + "loss": 0.3503, + "step": 13662 + }, + { + "epoch": 0.5999758499182191, + "grad_norm": 1.75, + "learning_rate": 1.7366748392243064e-05, + "loss": 0.3302, + "step": 13664 + }, + { + "epoch": 0.6000636683974225, + "grad_norm": 1.6875, + "learning_rate": 1.7360160772549333e-05, + "loss": 0.3629, + "step": 13666 + }, + { + "epoch": 0.600151486876626, + "grad_norm": 1.6328125, + "learning_rate": 1.7353573737916322e-05, + "loss": 0.369, + "step": 13668 + }, + { + "epoch": 0.6002393053558295, + "grad_norm": 1.65625, + "learning_rate": 1.7346987288848473e-05, + "loss": 0.3343, + "step": 13670 + }, + { + "epoch": 0.600327123835033, + "grad_norm": 1.734375, + "learning_rate": 1.7340401425850168e-05, + "loss": 0.3406, + "step": 13672 + }, + { + "epoch": 0.6004149423142364, + "grad_norm": 1.625, + "learning_rate": 1.7333816149425753e-05, + "loss": 0.3344, + "step": 13674 + }, + { + "epoch": 0.60050276079344, + "grad_norm": 1.6015625, + "learning_rate": 1.732723146007954e-05, + "loss": 0.3399, + "step": 13676 + }, + { + "epoch": 0.6005905792726435, + "grad_norm": 1.5625, + "learning_rate": 1.7320647358315777e-05, + "loss": 0.3329, + "step": 13678 + }, + { + "epoch": 0.600678397751847, + "grad_norm": 1.6796875, + "learning_rate": 1.731406384463869e-05, + "loss": 0.345, + "step": 13680 + }, + { + "epoch": 0.6007662162310504, + "grad_norm": 1.671875, + "learning_rate": 1.7307480919552427e-05, + "loss": 0.3567, + "step": 13682 + }, + { + "epoch": 0.6008540347102539, + "grad_norm": 1.671875, + "learning_rate": 1.730089858356113e-05, + "loss": 0.3426, + "step": 13684 + }, + { + "epoch": 0.6009418531894574, + "grad_norm": 1.828125, + "learning_rate": 1.7294316837168857e-05, + "loss": 0.2965, + "step": 13686 + }, + { + "epoch": 0.6010296716686608, + "grad_norm": 1.6171875, + "learning_rate": 1.7287735680879645e-05, + "loss": 0.3596, + "step": 13688 + }, + { + "epoch": 0.6011174901478644, + "grad_norm": 1.6953125, + "learning_rate": 1.7281155115197484e-05, + "loss": 0.3422, + "step": 13690 + }, + { + "epoch": 0.6012053086270679, + "grad_norm": 1.828125, + "learning_rate": 1.7274575140626318e-05, + "loss": 0.3203, + "step": 13692 + }, + { + "epoch": 0.6012931271062714, + "grad_norm": 1.6953125, + "learning_rate": 1.726799575767004e-05, + "loss": 0.3258, + "step": 13694 + }, + { + "epoch": 0.6013809455854748, + "grad_norm": 1.6484375, + "learning_rate": 1.72614169668325e-05, + "loss": 0.3454, + "step": 13696 + }, + { + "epoch": 0.6014687640646783, + "grad_norm": 1.6640625, + "learning_rate": 1.7254838768617497e-05, + "loss": 0.3279, + "step": 13698 + }, + { + "epoch": 0.6015565825438818, + "grad_norm": 1.71875, + "learning_rate": 1.7248261163528806e-05, + "loss": 0.3692, + "step": 13700 + }, + { + "epoch": 0.6016444010230853, + "grad_norm": 1.8671875, + "learning_rate": 1.724168415207013e-05, + "loss": 0.366, + "step": 13702 + }, + { + "epoch": 0.6017322195022887, + "grad_norm": 1.65625, + "learning_rate": 1.7235107734745136e-05, + "loss": 0.3487, + "step": 13704 + }, + { + "epoch": 0.6018200379814923, + "grad_norm": 1.6484375, + "learning_rate": 1.7228531912057465e-05, + "loss": 0.3285, + "step": 13706 + }, + { + "epoch": 0.6019078564606958, + "grad_norm": 1.765625, + "learning_rate": 1.7221956684510677e-05, + "loss": 0.3312, + "step": 13708 + }, + { + "epoch": 0.6019956749398993, + "grad_norm": 1.6484375, + "learning_rate": 1.721538205260831e-05, + "loss": 0.3263, + "step": 13710 + }, + { + "epoch": 0.6020834934191027, + "grad_norm": 1.7421875, + "learning_rate": 1.720880801685385e-05, + "loss": 0.3295, + "step": 13712 + }, + { + "epoch": 0.6021713118983062, + "grad_norm": 1.765625, + "learning_rate": 1.720223457775075e-05, + "loss": 0.3395, + "step": 13714 + }, + { + "epoch": 0.6022591303775097, + "grad_norm": 1.8671875, + "learning_rate": 1.719566173580239e-05, + "loss": 0.3319, + "step": 13716 + }, + { + "epoch": 0.6023469488567131, + "grad_norm": 1.7421875, + "learning_rate": 1.7189089491512116e-05, + "loss": 0.3508, + "step": 13718 + }, + { + "epoch": 0.6024347673359166, + "grad_norm": 1.796875, + "learning_rate": 1.7182517845383252e-05, + "loss": 0.3301, + "step": 13720 + }, + { + "epoch": 0.6025225858151202, + "grad_norm": 1.7578125, + "learning_rate": 1.717594679791904e-05, + "loss": 0.3258, + "step": 13722 + }, + { + "epoch": 0.6026104042943237, + "grad_norm": 1.8125, + "learning_rate": 1.7169376349622698e-05, + "loss": 0.3483, + "step": 13724 + }, + { + "epoch": 0.6026982227735271, + "grad_norm": 1.8125, + "learning_rate": 1.716280650099739e-05, + "loss": 0.3264, + "step": 13726 + }, + { + "epoch": 0.6027860412527306, + "grad_norm": 1.84375, + "learning_rate": 1.7156237252546242e-05, + "loss": 0.3246, + "step": 13728 + }, + { + "epoch": 0.6028738597319341, + "grad_norm": 1.8203125, + "learning_rate": 1.7149668604772324e-05, + "loss": 0.3438, + "step": 13730 + }, + { + "epoch": 0.6029616782111376, + "grad_norm": 1.671875, + "learning_rate": 1.714310055817867e-05, + "loss": 0.3036, + "step": 13732 + }, + { + "epoch": 0.603049496690341, + "grad_norm": 1.9296875, + "learning_rate": 1.7136533113268256e-05, + "loss": 0.35, + "step": 13734 + }, + { + "epoch": 0.6031373151695445, + "grad_norm": 1.640625, + "learning_rate": 1.712996627054402e-05, + "loss": 0.3397, + "step": 13736 + }, + { + "epoch": 0.6032251336487481, + "grad_norm": 1.6328125, + "learning_rate": 1.7123400030508852e-05, + "loss": 0.3716, + "step": 13738 + }, + { + "epoch": 0.6033129521279516, + "grad_norm": 1.6796875, + "learning_rate": 1.71168343936656e-05, + "loss": 0.3035, + "step": 13740 + }, + { + "epoch": 0.603400770607155, + "grad_norm": 1.6171875, + "learning_rate": 1.7110269360517066e-05, + "loss": 0.3052, + "step": 13742 + }, + { + "epoch": 0.6034885890863585, + "grad_norm": 1.7890625, + "learning_rate": 1.710370493156599e-05, + "loss": 0.3412, + "step": 13744 + }, + { + "epoch": 0.603576407565562, + "grad_norm": 1.6953125, + "learning_rate": 1.709714110731509e-05, + "loss": 0.3488, + "step": 13746 + }, + { + "epoch": 0.6036642260447654, + "grad_norm": 1.640625, + "learning_rate": 1.709057788826701e-05, + "loss": 0.324, + "step": 13748 + }, + { + "epoch": 0.6037520445239689, + "grad_norm": 1.6484375, + "learning_rate": 1.7084015274924373e-05, + "loss": 0.3332, + "step": 13750 + }, + { + "epoch": 0.6038398630031725, + "grad_norm": 1.7734375, + "learning_rate": 1.7077453267789746e-05, + "loss": 0.3286, + "step": 13752 + }, + { + "epoch": 0.603927681482376, + "grad_norm": 1.6796875, + "learning_rate": 1.707089186736564e-05, + "loss": 0.3647, + "step": 13754 + }, + { + "epoch": 0.6040154999615794, + "grad_norm": 1.7421875, + "learning_rate": 1.7064331074154543e-05, + "loss": 0.33, + "step": 13756 + }, + { + "epoch": 0.6041033184407829, + "grad_norm": 1.765625, + "learning_rate": 1.7057770888658873e-05, + "loss": 0.3588, + "step": 13758 + }, + { + "epoch": 0.6041911369199864, + "grad_norm": 1.484375, + "learning_rate": 1.705121131138101e-05, + "loss": 0.3133, + "step": 13760 + }, + { + "epoch": 0.6042789553991899, + "grad_norm": 1.6875, + "learning_rate": 1.7044652342823292e-05, + "loss": 0.3258, + "step": 13762 + }, + { + "epoch": 0.6043667738783933, + "grad_norm": 1.8515625, + "learning_rate": 1.7038093983488003e-05, + "loss": 0.3074, + "step": 13764 + }, + { + "epoch": 0.6044545923575968, + "grad_norm": 1.59375, + "learning_rate": 1.703153623387738e-05, + "loss": 0.3299, + "step": 13766 + }, + { + "epoch": 0.6045424108368004, + "grad_norm": 1.671875, + "learning_rate": 1.7024979094493637e-05, + "loss": 0.3334, + "step": 13768 + }, + { + "epoch": 0.6046302293160039, + "grad_norm": 1.5390625, + "learning_rate": 1.7018422565838896e-05, + "loss": 0.3128, + "step": 13770 + }, + { + "epoch": 0.6047180477952073, + "grad_norm": 1.59375, + "learning_rate": 1.701186664841527e-05, + "loss": 0.3236, + "step": 13772 + }, + { + "epoch": 0.6048058662744108, + "grad_norm": 1.5625, + "learning_rate": 1.7005311342724812e-05, + "loss": 0.347, + "step": 13774 + }, + { + "epoch": 0.6048936847536143, + "grad_norm": 1.8046875, + "learning_rate": 1.6998756649269535e-05, + "loss": 0.359, + "step": 13776 + }, + { + "epoch": 0.6049815032328177, + "grad_norm": 1.640625, + "learning_rate": 1.6992202568551383e-05, + "loss": 0.322, + "step": 13778 + }, + { + "epoch": 0.6050693217120212, + "grad_norm": 1.7421875, + "learning_rate": 1.6985649101072277e-05, + "loss": 0.3372, + "step": 13780 + }, + { + "epoch": 0.6051571401912247, + "grad_norm": 1.625, + "learning_rate": 1.6979096247334092e-05, + "loss": 0.3177, + "step": 13782 + }, + { + "epoch": 0.6052449586704283, + "grad_norm": 1.6328125, + "learning_rate": 1.697254400783863e-05, + "loss": 0.3389, + "step": 13784 + }, + { + "epoch": 0.6053327771496317, + "grad_norm": 1.5859375, + "learning_rate": 1.6965992383087677e-05, + "loss": 0.3674, + "step": 13786 + }, + { + "epoch": 0.6054205956288352, + "grad_norm": 1.84375, + "learning_rate": 1.6959441373582947e-05, + "loss": 0.3298, + "step": 13788 + }, + { + "epoch": 0.6055084141080387, + "grad_norm": 1.515625, + "learning_rate": 1.6952890979826132e-05, + "loss": 0.3318, + "step": 13790 + }, + { + "epoch": 0.6055962325872422, + "grad_norm": 1.90625, + "learning_rate": 1.6946341202318854e-05, + "loss": 0.3524, + "step": 13792 + }, + { + "epoch": 0.6056840510664456, + "grad_norm": 1.6328125, + "learning_rate": 1.6939792041562695e-05, + "loss": 0.3478, + "step": 13794 + }, + { + "epoch": 0.6057718695456491, + "grad_norm": 1.75, + "learning_rate": 1.693324349805919e-05, + "loss": 0.3476, + "step": 13796 + }, + { + "epoch": 0.6058596880248527, + "grad_norm": 1.6875, + "learning_rate": 1.6926695572309837e-05, + "loss": 0.3191, + "step": 13798 + }, + { + "epoch": 0.6059475065040562, + "grad_norm": 1.6484375, + "learning_rate": 1.6920148264816064e-05, + "loss": 0.3439, + "step": 13800 + }, + { + "epoch": 0.6060353249832596, + "grad_norm": 1.8125, + "learning_rate": 1.691360157607928e-05, + "loss": 0.3311, + "step": 13802 + }, + { + "epoch": 0.6061231434624631, + "grad_norm": 1.5703125, + "learning_rate": 1.6907055506600834e-05, + "loss": 0.347, + "step": 13804 + }, + { + "epoch": 0.6062109619416666, + "grad_norm": 1.6171875, + "learning_rate": 1.6900510056882012e-05, + "loss": 0.3227, + "step": 13806 + }, + { + "epoch": 0.60629878042087, + "grad_norm": 1.6171875, + "learning_rate": 1.6893965227424073e-05, + "loss": 0.3598, + "step": 13808 + }, + { + "epoch": 0.6063865989000735, + "grad_norm": 1.7421875, + "learning_rate": 1.6887421018728215e-05, + "loss": 0.3397, + "step": 13810 + }, + { + "epoch": 0.606474417379277, + "grad_norm": 1.6171875, + "learning_rate": 1.68808774312956e-05, + "loss": 0.3515, + "step": 13812 + }, + { + "epoch": 0.6065622358584806, + "grad_norm": 1.703125, + "learning_rate": 1.6874334465627335e-05, + "loss": 0.3346, + "step": 13814 + }, + { + "epoch": 0.606650054337684, + "grad_norm": 1.5859375, + "learning_rate": 1.686779212222449e-05, + "loss": 0.3284, + "step": 13816 + }, + { + "epoch": 0.6067378728168875, + "grad_norm": 1.6328125, + "learning_rate": 1.6861250401588075e-05, + "loss": 0.3169, + "step": 13818 + }, + { + "epoch": 0.606825691296091, + "grad_norm": 1.7421875, + "learning_rate": 1.685470930421905e-05, + "loss": 0.3321, + "step": 13820 + }, + { + "epoch": 0.6069135097752945, + "grad_norm": 1.578125, + "learning_rate": 1.6848168830618338e-05, + "loss": 0.3241, + "step": 13822 + }, + { + "epoch": 0.6070013282544979, + "grad_norm": 1.6015625, + "learning_rate": 1.6841628981286814e-05, + "loss": 0.3567, + "step": 13824 + }, + { + "epoch": 0.6070891467337014, + "grad_norm": 1.6484375, + "learning_rate": 1.6835089756725297e-05, + "loss": 0.3199, + "step": 13826 + }, + { + "epoch": 0.6071769652129049, + "grad_norm": 1.8515625, + "learning_rate": 1.6828551157434568e-05, + "loss": 0.3317, + "step": 13828 + }, + { + "epoch": 0.6072647836921085, + "grad_norm": 1.609375, + "learning_rate": 1.6822013183915347e-05, + "loss": 0.3371, + "step": 13830 + }, + { + "epoch": 0.6073526021713119, + "grad_norm": 1.6953125, + "learning_rate": 1.6815475836668317e-05, + "loss": 0.3358, + "step": 13832 + }, + { + "epoch": 0.6074404206505154, + "grad_norm": 1.7734375, + "learning_rate": 1.6808939116194107e-05, + "loss": 0.3335, + "step": 13834 + }, + { + "epoch": 0.6075282391297189, + "grad_norm": 1.6796875, + "learning_rate": 1.6802403022993304e-05, + "loss": 0.3321, + "step": 13836 + }, + { + "epoch": 0.6076160576089223, + "grad_norm": 1.65625, + "learning_rate": 1.6795867557566454e-05, + "loss": 0.3463, + "step": 13838 + }, + { + "epoch": 0.6077038760881258, + "grad_norm": 1.7265625, + "learning_rate": 1.6789332720414023e-05, + "loss": 0.3516, + "step": 13840 + }, + { + "epoch": 0.6077916945673293, + "grad_norm": 1.5390625, + "learning_rate": 1.6782798512036457e-05, + "loss": 0.3261, + "step": 13842 + }, + { + "epoch": 0.6078795130465329, + "grad_norm": 1.640625, + "learning_rate": 1.677626493293416e-05, + "loss": 0.3435, + "step": 13844 + }, + { + "epoch": 0.6079673315257363, + "grad_norm": 1.6328125, + "learning_rate": 1.676973198360746e-05, + "loss": 0.3584, + "step": 13846 + }, + { + "epoch": 0.6080551500049398, + "grad_norm": 1.6015625, + "learning_rate": 1.6763199664556656e-05, + "loss": 0.3631, + "step": 13848 + }, + { + "epoch": 0.6081429684841433, + "grad_norm": 1.6640625, + "learning_rate": 1.6756667976281997e-05, + "loss": 0.3259, + "step": 13850 + }, + { + "epoch": 0.6082307869633468, + "grad_norm": 1.953125, + "learning_rate": 1.6750136919283678e-05, + "loss": 0.3486, + "step": 13852 + }, + { + "epoch": 0.6083186054425502, + "grad_norm": 1.625, + "learning_rate": 1.674360649406186e-05, + "loss": 0.3215, + "step": 13854 + }, + { + "epoch": 0.6084064239217537, + "grad_norm": 1.9609375, + "learning_rate": 1.673707670111663e-05, + "loss": 0.3284, + "step": 13856 + }, + { + "epoch": 0.6084942424009572, + "grad_norm": 1.7265625, + "learning_rate": 1.6730547540948048e-05, + "loss": 0.336, + "step": 13858 + }, + { + "epoch": 0.6085820608801608, + "grad_norm": 1.671875, + "learning_rate": 1.6724019014056115e-05, + "loss": 0.3323, + "step": 13860 + }, + { + "epoch": 0.6086698793593642, + "grad_norm": 1.671875, + "learning_rate": 1.6717491120940793e-05, + "loss": 0.358, + "step": 13862 + }, + { + "epoch": 0.6087576978385677, + "grad_norm": 1.7421875, + "learning_rate": 1.671096386210198e-05, + "loss": 0.331, + "step": 13864 + }, + { + "epoch": 0.6088455163177712, + "grad_norm": 1.5859375, + "learning_rate": 1.670443723803955e-05, + "loss": 0.3196, + "step": 13866 + }, + { + "epoch": 0.6089333347969746, + "grad_norm": 1.6484375, + "learning_rate": 1.669791124925331e-05, + "loss": 0.3557, + "step": 13868 + }, + { + "epoch": 0.6090211532761781, + "grad_norm": 1.84375, + "learning_rate": 1.669138589624301e-05, + "loss": 0.3677, + "step": 13870 + }, + { + "epoch": 0.6091089717553816, + "grad_norm": 1.578125, + "learning_rate": 1.668486117950837e-05, + "loss": 0.2952, + "step": 13872 + }, + { + "epoch": 0.6091967902345851, + "grad_norm": 1.78125, + "learning_rate": 1.6678337099549052e-05, + "loss": 0.3623, + "step": 13874 + }, + { + "epoch": 0.6092846087137886, + "grad_norm": 1.546875, + "learning_rate": 1.667181365686467e-05, + "loss": 0.3241, + "step": 13876 + }, + { + "epoch": 0.6093724271929921, + "grad_norm": 1.609375, + "learning_rate": 1.66652908519548e-05, + "loss": 0.3287, + "step": 13878 + }, + { + "epoch": 0.6094602456721956, + "grad_norm": 1.765625, + "learning_rate": 1.6658768685318955e-05, + "loss": 0.337, + "step": 13880 + }, + { + "epoch": 0.6095480641513991, + "grad_norm": 1.6171875, + "learning_rate": 1.6652247157456603e-05, + "loss": 0.3112, + "step": 13882 + }, + { + "epoch": 0.6096358826306025, + "grad_norm": 1.59375, + "learning_rate": 1.6645726268867163e-05, + "loss": 0.3299, + "step": 13884 + }, + { + "epoch": 0.609723701109806, + "grad_norm": 1.6796875, + "learning_rate": 1.6639206020050006e-05, + "loss": 0.322, + "step": 13886 + }, + { + "epoch": 0.6098115195890095, + "grad_norm": 1.6015625, + "learning_rate": 1.6632686411504455e-05, + "loss": 0.3119, + "step": 13888 + }, + { + "epoch": 0.609899338068213, + "grad_norm": 1.8984375, + "learning_rate": 1.6626167443729797e-05, + "loss": 0.3334, + "step": 13890 + }, + { + "epoch": 0.6099871565474165, + "grad_norm": 1.5390625, + "learning_rate": 1.6619649117225233e-05, + "loss": 0.354, + "step": 13892 + }, + { + "epoch": 0.61007497502662, + "grad_norm": 1.6796875, + "learning_rate": 1.6613131432489947e-05, + "loss": 0.3137, + "step": 13894 + }, + { + "epoch": 0.6101627935058235, + "grad_norm": 1.6171875, + "learning_rate": 1.6606614390023066e-05, + "loss": 0.3491, + "step": 13896 + }, + { + "epoch": 0.610250611985027, + "grad_norm": 1.703125, + "learning_rate": 1.660009799032368e-05, + "loss": 0.3024, + "step": 13898 + }, + { + "epoch": 0.6103384304642304, + "grad_norm": 1.609375, + "learning_rate": 1.65935822338908e-05, + "loss": 0.3068, + "step": 13900 + }, + { + "epoch": 0.6104262489434339, + "grad_norm": 1.6328125, + "learning_rate": 1.6587067121223397e-05, + "loss": 0.3341, + "step": 13902 + }, + { + "epoch": 0.6105140674226374, + "grad_norm": 1.8203125, + "learning_rate": 1.6580552652820412e-05, + "loss": 0.3257, + "step": 13904 + }, + { + "epoch": 0.610601885901841, + "grad_norm": 1.6328125, + "learning_rate": 1.6574038829180733e-05, + "loss": 0.3143, + "step": 13906 + }, + { + "epoch": 0.6106897043810444, + "grad_norm": 1.640625, + "learning_rate": 1.6567525650803174e-05, + "loss": 0.3188, + "step": 13908 + }, + { + "epoch": 0.6107775228602479, + "grad_norm": 1.703125, + "learning_rate": 1.656101311818652e-05, + "loss": 0.3143, + "step": 13910 + }, + { + "epoch": 0.6108653413394514, + "grad_norm": 1.59375, + "learning_rate": 1.6554501231829504e-05, + "loss": 0.3411, + "step": 13912 + }, + { + "epoch": 0.6109531598186548, + "grad_norm": 1.6328125, + "learning_rate": 1.654798999223081e-05, + "loss": 0.3352, + "step": 13914 + }, + { + "epoch": 0.6110409782978583, + "grad_norm": 1.6171875, + "learning_rate": 1.654147939988907e-05, + "loss": 0.3489, + "step": 13916 + }, + { + "epoch": 0.6111287967770618, + "grad_norm": 1.71875, + "learning_rate": 1.6534969455302864e-05, + "loss": 0.351, + "step": 13918 + }, + { + "epoch": 0.6112166152562652, + "grad_norm": 1.609375, + "learning_rate": 1.6528460158970727e-05, + "loss": 0.3228, + "step": 13920 + }, + { + "epoch": 0.6113044337354688, + "grad_norm": 1.6171875, + "learning_rate": 1.652195151139114e-05, + "loss": 0.3586, + "step": 13922 + }, + { + "epoch": 0.6113922522146723, + "grad_norm": 1.8125, + "learning_rate": 1.651544351306254e-05, + "loss": 0.326, + "step": 13924 + }, + { + "epoch": 0.6114800706938758, + "grad_norm": 1.6328125, + "learning_rate": 1.6508936164483314e-05, + "loss": 0.3237, + "step": 13926 + }, + { + "epoch": 0.6115678891730792, + "grad_norm": 1.6015625, + "learning_rate": 1.6502429466151788e-05, + "loss": 0.3357, + "step": 13928 + }, + { + "epoch": 0.6116557076522827, + "grad_norm": 1.65625, + "learning_rate": 1.649592341856625e-05, + "loss": 0.33, + "step": 13930 + }, + { + "epoch": 0.6117435261314862, + "grad_norm": 1.6328125, + "learning_rate": 1.648941802222494e-05, + "loss": 0.3332, + "step": 13932 + }, + { + "epoch": 0.6118313446106897, + "grad_norm": 1.671875, + "learning_rate": 1.6482913277626033e-05, + "loss": 0.3155, + "step": 13934 + }, + { + "epoch": 0.6119191630898931, + "grad_norm": 1.7890625, + "learning_rate": 1.6476409185267666e-05, + "loss": 0.3584, + "step": 13936 + }, + { + "epoch": 0.6120069815690967, + "grad_norm": 1.6015625, + "learning_rate": 1.6469905745647928e-05, + "loss": 0.343, + "step": 13938 + }, + { + "epoch": 0.6120948000483002, + "grad_norm": 1.640625, + "learning_rate": 1.6463402959264858e-05, + "loss": 0.3573, + "step": 13940 + }, + { + "epoch": 0.6121826185275037, + "grad_norm": 1.53125, + "learning_rate": 1.6456900826616433e-05, + "loss": 0.3155, + "step": 13942 + }, + { + "epoch": 0.6122704370067071, + "grad_norm": 1.7109375, + "learning_rate": 1.645039934820059e-05, + "loss": 0.3426, + "step": 13944 + }, + { + "epoch": 0.6123582554859106, + "grad_norm": 1.6171875, + "learning_rate": 1.644389852451521e-05, + "loss": 0.3317, + "step": 13946 + }, + { + "epoch": 0.6124460739651141, + "grad_norm": 1.5, + "learning_rate": 1.6437398356058137e-05, + "loss": 0.3334, + "step": 13948 + }, + { + "epoch": 0.6125338924443176, + "grad_norm": 1.609375, + "learning_rate": 1.643089884332715e-05, + "loss": 0.3079, + "step": 13950 + }, + { + "epoch": 0.6126217109235211, + "grad_norm": 1.7421875, + "learning_rate": 1.642439998681999e-05, + "loss": 0.3297, + "step": 13952 + }, + { + "epoch": 0.6127095294027246, + "grad_norm": 1.7265625, + "learning_rate": 1.6417901787034324e-05, + "loss": 0.309, + "step": 13954 + }, + { + "epoch": 0.6127973478819281, + "grad_norm": 1.6328125, + "learning_rate": 1.64114042444678e-05, + "loss": 0.3131, + "step": 13956 + }, + { + "epoch": 0.6128851663611316, + "grad_norm": 1.8203125, + "learning_rate": 1.6404907359618e-05, + "loss": 0.3473, + "step": 13958 + }, + { + "epoch": 0.612972984840335, + "grad_norm": 1.84375, + "learning_rate": 1.639841113298246e-05, + "loss": 0.3242, + "step": 13960 + }, + { + "epoch": 0.6130608033195385, + "grad_norm": 1.640625, + "learning_rate": 1.6391915565058653e-05, + "loss": 0.3567, + "step": 13962 + }, + { + "epoch": 0.613148621798742, + "grad_norm": 1.625, + "learning_rate": 1.6385420656344007e-05, + "loss": 0.3253, + "step": 13964 + }, + { + "epoch": 0.6132364402779454, + "grad_norm": 1.7890625, + "learning_rate": 1.637892640733592e-05, + "loss": 0.3385, + "step": 13966 + }, + { + "epoch": 0.613324258757149, + "grad_norm": 1.703125, + "learning_rate": 1.637243281853172e-05, + "loss": 0.3117, + "step": 13968 + }, + { + "epoch": 0.6134120772363525, + "grad_norm": 1.625, + "learning_rate": 1.6365939890428673e-05, + "loss": 0.3177, + "step": 13970 + }, + { + "epoch": 0.613499895715556, + "grad_norm": 1.6171875, + "learning_rate": 1.6359447623524022e-05, + "loss": 0.3271, + "step": 13972 + }, + { + "epoch": 0.6135877141947594, + "grad_norm": 1.640625, + "learning_rate": 1.635295601831494e-05, + "loss": 0.3415, + "step": 13974 + }, + { + "epoch": 0.6136755326739629, + "grad_norm": 1.6328125, + "learning_rate": 1.6346465075298564e-05, + "loss": 0.3051, + "step": 13976 + }, + { + "epoch": 0.6137633511531664, + "grad_norm": 1.59375, + "learning_rate": 1.633997479497197e-05, + "loss": 0.3285, + "step": 13978 + }, + { + "epoch": 0.6138511696323699, + "grad_norm": 1.578125, + "learning_rate": 1.6333485177832176e-05, + "loss": 0.3322, + "step": 13980 + }, + { + "epoch": 0.6139389881115733, + "grad_norm": 1.6484375, + "learning_rate": 1.632699622437617e-05, + "loss": 0.3341, + "step": 13982 + }, + { + "epoch": 0.6140268065907769, + "grad_norm": 1.6328125, + "learning_rate": 1.6320507935100863e-05, + "loss": 0.3281, + "step": 13984 + }, + { + "epoch": 0.6141146250699804, + "grad_norm": 1.5234375, + "learning_rate": 1.6314020310503144e-05, + "loss": 0.3057, + "step": 13986 + }, + { + "epoch": 0.6142024435491839, + "grad_norm": 1.8359375, + "learning_rate": 1.630753335107984e-05, + "loss": 0.3259, + "step": 13988 + }, + { + "epoch": 0.6142902620283873, + "grad_norm": 1.6796875, + "learning_rate": 1.630104705732771e-05, + "loss": 0.3332, + "step": 13990 + }, + { + "epoch": 0.6143780805075908, + "grad_norm": 1.6875, + "learning_rate": 1.6294561429743475e-05, + "loss": 0.3268, + "step": 13992 + }, + { + "epoch": 0.6144658989867943, + "grad_norm": 1.703125, + "learning_rate": 1.6288076468823827e-05, + "loss": 0.311, + "step": 13994 + }, + { + "epoch": 0.6145537174659977, + "grad_norm": 1.875, + "learning_rate": 1.6281592175065357e-05, + "loss": 0.3465, + "step": 13996 + }, + { + "epoch": 0.6146415359452013, + "grad_norm": 1.7109375, + "learning_rate": 1.6275108548964653e-05, + "loss": 0.3331, + "step": 13998 + }, + { + "epoch": 0.6147293544244048, + "grad_norm": 1.640625, + "learning_rate": 1.626862559101823e-05, + "loss": 0.3438, + "step": 14000 + }, + { + "epoch": 0.6148171729036083, + "grad_norm": 1.5703125, + "learning_rate": 1.6262143301722547e-05, + "loss": 0.3152, + "step": 14002 + }, + { + "epoch": 0.6149049913828117, + "grad_norm": 1.8125, + "learning_rate": 1.625566168157403e-05, + "loss": 0.3089, + "step": 14004 + }, + { + "epoch": 0.6149928098620152, + "grad_norm": 1.6171875, + "learning_rate": 1.6249180731069036e-05, + "loss": 0.3415, + "step": 14006 + }, + { + "epoch": 0.6150806283412187, + "grad_norm": 1.65625, + "learning_rate": 1.6242700450703876e-05, + "loss": 0.3317, + "step": 14008 + }, + { + "epoch": 0.6151684468204222, + "grad_norm": 1.84375, + "learning_rate": 1.6236220840974815e-05, + "loss": 0.3361, + "step": 14010 + }, + { + "epoch": 0.6152562652996256, + "grad_norm": 1.6796875, + "learning_rate": 1.6229741902378063e-05, + "loss": 0.3206, + "step": 14012 + }, + { + "epoch": 0.6153440837788292, + "grad_norm": 1.9765625, + "learning_rate": 1.6223263635409785e-05, + "loss": 0.3096, + "step": 14014 + }, + { + "epoch": 0.6154319022580327, + "grad_norm": 1.84375, + "learning_rate": 1.621678604056608e-05, + "loss": 0.3521, + "step": 14016 + }, + { + "epoch": 0.6155197207372362, + "grad_norm": 1.6328125, + "learning_rate": 1.6210309118343e-05, + "loss": 0.3101, + "step": 14018 + }, + { + "epoch": 0.6156075392164396, + "grad_norm": 1.6953125, + "learning_rate": 1.6203832869236557e-05, + "loss": 0.3519, + "step": 14020 + }, + { + "epoch": 0.6156953576956431, + "grad_norm": 1.546875, + "learning_rate": 1.619735729374271e-05, + "loss": 0.3418, + "step": 14022 + }, + { + "epoch": 0.6157831761748466, + "grad_norm": 1.6328125, + "learning_rate": 1.6190882392357342e-05, + "loss": 0.3315, + "step": 14024 + }, + { + "epoch": 0.61587099465405, + "grad_norm": 1.546875, + "learning_rate": 1.6184408165576316e-05, + "loss": 0.3609, + "step": 14026 + }, + { + "epoch": 0.6159588131332535, + "grad_norm": 1.78125, + "learning_rate": 1.6177934613895422e-05, + "loss": 0.3276, + "step": 14028 + }, + { + "epoch": 0.6160466316124571, + "grad_norm": 1.7578125, + "learning_rate": 1.6171461737810413e-05, + "loss": 0.3124, + "step": 14030 + }, + { + "epoch": 0.6161344500916606, + "grad_norm": 1.671875, + "learning_rate": 1.616498953781698e-05, + "loss": 0.3149, + "step": 14032 + }, + { + "epoch": 0.616222268570864, + "grad_norm": 1.609375, + "learning_rate": 1.6158518014410762e-05, + "loss": 0.3245, + "step": 14034 + }, + { + "epoch": 0.6163100870500675, + "grad_norm": 1.5390625, + "learning_rate": 1.615204716808736e-05, + "loss": 0.341, + "step": 14036 + }, + { + "epoch": 0.616397905529271, + "grad_norm": 1.5546875, + "learning_rate": 1.61455769993423e-05, + "loss": 0.3343, + "step": 14038 + }, + { + "epoch": 0.6164857240084745, + "grad_norm": 1.59375, + "learning_rate": 1.6139107508671086e-05, + "loss": 0.3379, + "step": 14040 + }, + { + "epoch": 0.6165735424876779, + "grad_norm": 1.640625, + "learning_rate": 1.6132638696569134e-05, + "loss": 0.3453, + "step": 14042 + }, + { + "epoch": 0.6166613609668815, + "grad_norm": 1.6796875, + "learning_rate": 1.612617056353184e-05, + "loss": 0.3367, + "step": 14044 + }, + { + "epoch": 0.616749179446085, + "grad_norm": 1.59375, + "learning_rate": 1.611970311005453e-05, + "loss": 0.3481, + "step": 14046 + }, + { + "epoch": 0.6168369979252885, + "grad_norm": 1.6484375, + "learning_rate": 1.611323633663248e-05, + "loss": 0.3198, + "step": 14048 + }, + { + "epoch": 0.6169248164044919, + "grad_norm": 2.015625, + "learning_rate": 1.610677024376093e-05, + "loss": 0.3636, + "step": 14050 + }, + { + "epoch": 0.6170126348836954, + "grad_norm": 1.609375, + "learning_rate": 1.6100304831935052e-05, + "loss": 0.316, + "step": 14052 + }, + { + "epoch": 0.6171004533628989, + "grad_norm": 1.6796875, + "learning_rate": 1.609384010164996e-05, + "loss": 0.3195, + "step": 14054 + }, + { + "epoch": 0.6171882718421023, + "grad_norm": 1.6484375, + "learning_rate": 1.608737605340072e-05, + "loss": 0.3629, + "step": 14056 + }, + { + "epoch": 0.6172760903213058, + "grad_norm": 1.546875, + "learning_rate": 1.608091268768236e-05, + "loss": 0.2921, + "step": 14058 + }, + { + "epoch": 0.6173639088005094, + "grad_norm": 1.609375, + "learning_rate": 1.6074450004989844e-05, + "loss": 0.3225, + "step": 14060 + }, + { + "epoch": 0.6174517272797129, + "grad_norm": 1.5546875, + "learning_rate": 1.606798800581809e-05, + "loss": 0.3192, + "step": 14062 + }, + { + "epoch": 0.6175395457589163, + "grad_norm": 1.7265625, + "learning_rate": 1.6061526690661947e-05, + "loss": 0.3356, + "step": 14064 + }, + { + "epoch": 0.6176273642381198, + "grad_norm": 1.6953125, + "learning_rate": 1.6055066060016247e-05, + "loss": 0.3436, + "step": 14066 + }, + { + "epoch": 0.6177151827173233, + "grad_norm": 1.875, + "learning_rate": 1.6048606114375723e-05, + "loss": 0.3204, + "step": 14068 + }, + { + "epoch": 0.6178030011965268, + "grad_norm": 1.5, + "learning_rate": 1.604214685423509e-05, + "loss": 0.3223, + "step": 14070 + }, + { + "epoch": 0.6178908196757302, + "grad_norm": 1.53125, + "learning_rate": 1.6035688280088995e-05, + "loss": 0.3394, + "step": 14072 + }, + { + "epoch": 0.6179786381549337, + "grad_norm": 1.515625, + "learning_rate": 1.6029230392432043e-05, + "loss": 0.3128, + "step": 14074 + }, + { + "epoch": 0.6180664566341373, + "grad_norm": 1.6640625, + "learning_rate": 1.6022773191758784e-05, + "loss": 0.3205, + "step": 14076 + }, + { + "epoch": 0.6181542751133408, + "grad_norm": 1.6328125, + "learning_rate": 1.60163166785637e-05, + "loss": 0.343, + "step": 14078 + }, + { + "epoch": 0.6182420935925442, + "grad_norm": 1.640625, + "learning_rate": 1.6009860853341237e-05, + "loss": 0.3274, + "step": 14080 + }, + { + "epoch": 0.6183299120717477, + "grad_norm": 1.640625, + "learning_rate": 1.6003405716585798e-05, + "loss": 0.3696, + "step": 14082 + }, + { + "epoch": 0.6184177305509512, + "grad_norm": 1.5546875, + "learning_rate": 1.5996951268791695e-05, + "loss": 0.3001, + "step": 14084 + }, + { + "epoch": 0.6185055490301546, + "grad_norm": 1.578125, + "learning_rate": 1.599049751045322e-05, + "loss": 0.3314, + "step": 14086 + }, + { + "epoch": 0.6185933675093581, + "grad_norm": 1.59375, + "learning_rate": 1.5984044442064606e-05, + "loss": 0.3451, + "step": 14088 + }, + { + "epoch": 0.6186811859885616, + "grad_norm": 1.6796875, + "learning_rate": 1.5977592064120027e-05, + "loss": 0.3189, + "step": 14090 + }, + { + "epoch": 0.6187690044677652, + "grad_norm": 1.625, + "learning_rate": 1.5971140377113623e-05, + "loss": 0.3403, + "step": 14092 + }, + { + "epoch": 0.6188568229469686, + "grad_norm": 1.703125, + "learning_rate": 1.5964689381539445e-05, + "loss": 0.3287, + "step": 14094 + }, + { + "epoch": 0.6189446414261721, + "grad_norm": 1.734375, + "learning_rate": 1.595823907789152e-05, + "loss": 0.3246, + "step": 14096 + }, + { + "epoch": 0.6190324599053756, + "grad_norm": 1.5625, + "learning_rate": 1.595178946666381e-05, + "loss": 0.3468, + "step": 14098 + }, + { + "epoch": 0.6191202783845791, + "grad_norm": 1.75, + "learning_rate": 1.5945340548350235e-05, + "loss": 0.329, + "step": 14100 + }, + { + "epoch": 0.6192080968637825, + "grad_norm": 1.6640625, + "learning_rate": 1.5938892323444653e-05, + "loss": 0.3288, + "step": 14102 + }, + { + "epoch": 0.619295915342986, + "grad_norm": 1.75, + "learning_rate": 1.593244479244087e-05, + "loss": 0.3357, + "step": 14104 + }, + { + "epoch": 0.6193837338221896, + "grad_norm": 1.6484375, + "learning_rate": 1.5925997955832633e-05, + "loss": 0.3161, + "step": 14106 + }, + { + "epoch": 0.6194715523013931, + "grad_norm": 1.7578125, + "learning_rate": 1.591955181411365e-05, + "loss": 0.3194, + "step": 14108 + }, + { + "epoch": 0.6195593707805965, + "grad_norm": 1.5234375, + "learning_rate": 1.5913106367777567e-05, + "loss": 0.3085, + "step": 14110 + }, + { + "epoch": 0.6196471892598, + "grad_norm": 1.8828125, + "learning_rate": 1.590666161731798e-05, + "loss": 0.3346, + "step": 14112 + }, + { + "epoch": 0.6197350077390035, + "grad_norm": 1.7578125, + "learning_rate": 1.5900217563228426e-05, + "loss": 0.3722, + "step": 14114 + }, + { + "epoch": 0.619822826218207, + "grad_norm": 1.6953125, + "learning_rate": 1.5893774206002393e-05, + "loss": 0.3329, + "step": 14116 + }, + { + "epoch": 0.6199106446974104, + "grad_norm": 1.8203125, + "learning_rate": 1.588733154613331e-05, + "loss": 0.367, + "step": 14118 + }, + { + "epoch": 0.6199984631766139, + "grad_norm": 1.7265625, + "learning_rate": 1.588088958411456e-05, + "loss": 0.3305, + "step": 14120 + }, + { + "epoch": 0.6200862816558175, + "grad_norm": 1.7421875, + "learning_rate": 1.5874448320439475e-05, + "loss": 0.3372, + "step": 14122 + }, + { + "epoch": 0.620174100135021, + "grad_norm": 1.578125, + "learning_rate": 1.586800775560132e-05, + "loss": 0.3231, + "step": 14124 + }, + { + "epoch": 0.6202619186142244, + "grad_norm": 1.6171875, + "learning_rate": 1.5861567890093328e-05, + "loss": 0.3355, + "step": 14126 + }, + { + "epoch": 0.6203497370934279, + "grad_norm": 1.6953125, + "learning_rate": 1.5855128724408655e-05, + "loss": 0.3658, + "step": 14128 + }, + { + "epoch": 0.6204375555726314, + "grad_norm": 1.5390625, + "learning_rate": 1.5848690259040414e-05, + "loss": 0.3452, + "step": 14130 + }, + { + "epoch": 0.6205253740518348, + "grad_norm": 1.6796875, + "learning_rate": 1.5842252494481664e-05, + "loss": 0.3205, + "step": 14132 + }, + { + "epoch": 0.6206131925310383, + "grad_norm": 1.6953125, + "learning_rate": 1.5835815431225418e-05, + "loss": 0.3294, + "step": 14134 + }, + { + "epoch": 0.6207010110102418, + "grad_norm": 1.921875, + "learning_rate": 1.5829379069764622e-05, + "loss": 0.3441, + "step": 14136 + }, + { + "epoch": 0.6207888294894454, + "grad_norm": 1.765625, + "learning_rate": 1.582294341059218e-05, + "loss": 0.3576, + "step": 14138 + }, + { + "epoch": 0.6208766479686488, + "grad_norm": 1.796875, + "learning_rate": 1.5816508454200922e-05, + "loss": 0.3253, + "step": 14140 + }, + { + "epoch": 0.6209644664478523, + "grad_norm": 1.6328125, + "learning_rate": 1.581007420108365e-05, + "loss": 0.3422, + "step": 14142 + }, + { + "epoch": 0.6210522849270558, + "grad_norm": 1.9140625, + "learning_rate": 1.5803640651733115e-05, + "loss": 0.329, + "step": 14144 + }, + { + "epoch": 0.6211401034062592, + "grad_norm": 1.6171875, + "learning_rate": 1.579720780664197e-05, + "loss": 0.3193, + "step": 14146 + }, + { + "epoch": 0.6212279218854627, + "grad_norm": 1.6796875, + "learning_rate": 1.5790775666302855e-05, + "loss": 0.3378, + "step": 14148 + }, + { + "epoch": 0.6213157403646662, + "grad_norm": 1.609375, + "learning_rate": 1.5784344231208347e-05, + "loss": 0.3437, + "step": 14150 + }, + { + "epoch": 0.6214035588438698, + "grad_norm": 1.5390625, + "learning_rate": 1.577791350185097e-05, + "loss": 0.3464, + "step": 14152 + }, + { + "epoch": 0.6214913773230732, + "grad_norm": 1.6484375, + "learning_rate": 1.5771483478723188e-05, + "loss": 0.3521, + "step": 14154 + }, + { + "epoch": 0.6215791958022767, + "grad_norm": 1.6015625, + "learning_rate": 1.576505416231741e-05, + "loss": 0.344, + "step": 14156 + }, + { + "epoch": 0.6216670142814802, + "grad_norm": 1.6171875, + "learning_rate": 1.5758625553126e-05, + "loss": 0.3501, + "step": 14158 + }, + { + "epoch": 0.6217548327606837, + "grad_norm": 1.6640625, + "learning_rate": 1.575219765164126e-05, + "loss": 0.3525, + "step": 14160 + }, + { + "epoch": 0.6218426512398871, + "grad_norm": 1.640625, + "learning_rate": 1.5745770458355442e-05, + "loss": 0.3339, + "step": 14162 + }, + { + "epoch": 0.6219304697190906, + "grad_norm": 1.671875, + "learning_rate": 1.5739343973760743e-05, + "loss": 0.3257, + "step": 14164 + }, + { + "epoch": 0.6220182881982941, + "grad_norm": 1.6328125, + "learning_rate": 1.57329181983493e-05, + "loss": 0.3317, + "step": 14166 + }, + { + "epoch": 0.6221061066774977, + "grad_norm": 1.796875, + "learning_rate": 1.5726493132613203e-05, + "loss": 0.3308, + "step": 14168 + }, + { + "epoch": 0.6221939251567011, + "grad_norm": 1.5546875, + "learning_rate": 1.5720068777044476e-05, + "loss": 0.3298, + "step": 14170 + }, + { + "epoch": 0.6222817436359046, + "grad_norm": 1.5703125, + "learning_rate": 1.5713645132135118e-05, + "loss": 0.3412, + "step": 14172 + }, + { + "epoch": 0.6223695621151081, + "grad_norm": 1.6328125, + "learning_rate": 1.570722219837705e-05, + "loss": 0.3044, + "step": 14174 + }, + { + "epoch": 0.6224573805943115, + "grad_norm": 1.6875, + "learning_rate": 1.570079997626212e-05, + "loss": 0.3277, + "step": 14176 + }, + { + "epoch": 0.622545199073515, + "grad_norm": 1.5859375, + "learning_rate": 1.569437846628216e-05, + "loss": 0.3342, + "step": 14178 + }, + { + "epoch": 0.6226330175527185, + "grad_norm": 1.6875, + "learning_rate": 1.5687957668928927e-05, + "loss": 0.3118, + "step": 14180 + }, + { + "epoch": 0.622720836031922, + "grad_norm": 1.640625, + "learning_rate": 1.5681537584694128e-05, + "loss": 0.3523, + "step": 14182 + }, + { + "epoch": 0.6228086545111255, + "grad_norm": 1.5546875, + "learning_rate": 1.567511821406941e-05, + "loss": 0.3283, + "step": 14184 + }, + { + "epoch": 0.622896472990329, + "grad_norm": 1.484375, + "learning_rate": 1.566869955754638e-05, + "loss": 0.324, + "step": 14186 + }, + { + "epoch": 0.6229842914695325, + "grad_norm": 1.6328125, + "learning_rate": 1.5662281615616582e-05, + "loss": 0.3403, + "step": 14188 + }, + { + "epoch": 0.623072109948736, + "grad_norm": 1.609375, + "learning_rate": 1.5655864388771486e-05, + "loss": 0.3209, + "step": 14190 + }, + { + "epoch": 0.6231599284279394, + "grad_norm": 1.5546875, + "learning_rate": 1.5649447877502537e-05, + "loss": 0.3108, + "step": 14192 + }, + { + "epoch": 0.6232477469071429, + "grad_norm": 1.671875, + "learning_rate": 1.5643032082301106e-05, + "loss": 0.3284, + "step": 14194 + }, + { + "epoch": 0.6233355653863464, + "grad_norm": 1.671875, + "learning_rate": 1.5636617003658527e-05, + "loss": 0.3531, + "step": 14196 + }, + { + "epoch": 0.62342338386555, + "grad_norm": 1.5625, + "learning_rate": 1.5630202642066062e-05, + "loss": 0.3265, + "step": 14198 + }, + { + "epoch": 0.6235112023447534, + "grad_norm": 1.6875, + "learning_rate": 1.5623788998014925e-05, + "loss": 0.3469, + "step": 14200 + }, + { + "epoch": 0.6235990208239569, + "grad_norm": 1.703125, + "learning_rate": 1.5617376071996277e-05, + "loss": 0.3165, + "step": 14202 + }, + { + "epoch": 0.6236868393031604, + "grad_norm": 1.53125, + "learning_rate": 1.5610963864501212e-05, + "loss": 0.3144, + "step": 14204 + }, + { + "epoch": 0.6237746577823639, + "grad_norm": 1.625, + "learning_rate": 1.5604552376020797e-05, + "loss": 0.3202, + "step": 14206 + }, + { + "epoch": 0.6238624762615673, + "grad_norm": 1.6875, + "learning_rate": 1.5598141607046004e-05, + "loss": 0.3172, + "step": 14208 + }, + { + "epoch": 0.6239502947407708, + "grad_norm": 1.5625, + "learning_rate": 1.559173155806778e-05, + "loss": 0.3439, + "step": 14210 + }, + { + "epoch": 0.6240381132199743, + "grad_norm": 1.671875, + "learning_rate": 1.558532222957701e-05, + "loss": 0.292, + "step": 14212 + }, + { + "epoch": 0.6241259316991778, + "grad_norm": 1.71875, + "learning_rate": 1.5578913622064523e-05, + "loss": 0.3282, + "step": 14214 + }, + { + "epoch": 0.6242137501783813, + "grad_norm": 1.671875, + "learning_rate": 1.5572505736021088e-05, + "loss": 0.3597, + "step": 14216 + }, + { + "epoch": 0.6243015686575848, + "grad_norm": 1.5234375, + "learning_rate": 1.5566098571937416e-05, + "loss": 0.3474, + "step": 14218 + }, + { + "epoch": 0.6243893871367883, + "grad_norm": 2.1875, + "learning_rate": 1.5559692130304185e-05, + "loss": 0.3342, + "step": 14220 + }, + { + "epoch": 0.6244772056159917, + "grad_norm": 1.671875, + "learning_rate": 1.555328641161199e-05, + "loss": 0.3256, + "step": 14222 + }, + { + "epoch": 0.6245650240951952, + "grad_norm": 1.59375, + "learning_rate": 1.5546881416351385e-05, + "loss": 0.3209, + "step": 14224 + }, + { + "epoch": 0.6246528425743987, + "grad_norm": 1.53125, + "learning_rate": 1.5540477145012876e-05, + "loss": 0.3217, + "step": 14226 + }, + { + "epoch": 0.6247406610536022, + "grad_norm": 1.6328125, + "learning_rate": 1.5534073598086888e-05, + "loss": 0.3278, + "step": 14228 + }, + { + "epoch": 0.6248284795328057, + "grad_norm": 1.8046875, + "learning_rate": 1.5527670776063812e-05, + "loss": 0.3114, + "step": 14230 + }, + { + "epoch": 0.6249162980120092, + "grad_norm": 1.765625, + "learning_rate": 1.552126867943398e-05, + "loss": 0.3408, + "step": 14232 + }, + { + "epoch": 0.6250041164912127, + "grad_norm": 1.65625, + "learning_rate": 1.5514867308687665e-05, + "loss": 0.3077, + "step": 14234 + }, + { + "epoch": 0.6250919349704162, + "grad_norm": 1.703125, + "learning_rate": 1.5508466664315092e-05, + "loss": 0.3432, + "step": 14236 + }, + { + "epoch": 0.6251797534496196, + "grad_norm": 1.6484375, + "learning_rate": 1.550206674680641e-05, + "loss": 0.3146, + "step": 14238 + }, + { + "epoch": 0.6252675719288231, + "grad_norm": 1.8203125, + "learning_rate": 1.5495667556651738e-05, + "loss": 0.3549, + "step": 14240 + }, + { + "epoch": 0.6253553904080266, + "grad_norm": 1.4765625, + "learning_rate": 1.548926909434112e-05, + "loss": 0.3519, + "step": 14242 + }, + { + "epoch": 0.62544320888723, + "grad_norm": 1.59375, + "learning_rate": 1.5482871360364548e-05, + "loss": 0.3228, + "step": 14244 + }, + { + "epoch": 0.6255310273664336, + "grad_norm": 1.8046875, + "learning_rate": 1.5476474355211973e-05, + "loss": 0.3409, + "step": 14246 + }, + { + "epoch": 0.6256188458456371, + "grad_norm": 1.7421875, + "learning_rate": 1.5470078079373275e-05, + "loss": 0.3254, + "step": 14248 + }, + { + "epoch": 0.6257066643248406, + "grad_norm": 1.6484375, + "learning_rate": 1.5463682533338286e-05, + "loss": 0.3055, + "step": 14250 + }, + { + "epoch": 0.625794482804044, + "grad_norm": 1.578125, + "learning_rate": 1.545728771759677e-05, + "loss": 0.3273, + "step": 14252 + }, + { + "epoch": 0.6258823012832475, + "grad_norm": 1.53125, + "learning_rate": 1.545089363263845e-05, + "loss": 0.3481, + "step": 14254 + }, + { + "epoch": 0.625970119762451, + "grad_norm": 1.7734375, + "learning_rate": 1.5444500278952982e-05, + "loss": 0.3302, + "step": 14256 + }, + { + "epoch": 0.6260579382416545, + "grad_norm": 1.5546875, + "learning_rate": 1.5438107657029975e-05, + "loss": 0.3343, + "step": 14258 + }, + { + "epoch": 0.626145756720858, + "grad_norm": 1.8984375, + "learning_rate": 1.543171576735898e-05, + "loss": 0.3062, + "step": 14260 + }, + { + "epoch": 0.6262335752000615, + "grad_norm": 1.6171875, + "learning_rate": 1.542532461042948e-05, + "loss": 0.3297, + "step": 14262 + }, + { + "epoch": 0.626321393679265, + "grad_norm": 1.7265625, + "learning_rate": 1.5418934186730923e-05, + "loss": 0.3328, + "step": 14264 + }, + { + "epoch": 0.6264092121584685, + "grad_norm": 1.6171875, + "learning_rate": 1.5412544496752686e-05, + "loss": 0.3082, + "step": 14266 + }, + { + "epoch": 0.6264970306376719, + "grad_norm": 1.5625, + "learning_rate": 1.540615554098408e-05, + "loss": 0.3179, + "step": 14268 + }, + { + "epoch": 0.6265848491168754, + "grad_norm": 1.59375, + "learning_rate": 1.539976731991438e-05, + "loss": 0.3258, + "step": 14270 + }, + { + "epoch": 0.6266726675960789, + "grad_norm": 1.6484375, + "learning_rate": 1.5393379834032804e-05, + "loss": 0.3596, + "step": 14272 + }, + { + "epoch": 0.6267604860752823, + "grad_norm": 1.671875, + "learning_rate": 1.53869930838285e-05, + "loss": 0.3398, + "step": 14274 + }, + { + "epoch": 0.6268483045544859, + "grad_norm": 1.890625, + "learning_rate": 1.5380607069790577e-05, + "loss": 0.3567, + "step": 14276 + }, + { + "epoch": 0.6269361230336894, + "grad_norm": 1.5703125, + "learning_rate": 1.5374221792408067e-05, + "loss": 0.3124, + "step": 14278 + }, + { + "epoch": 0.6270239415128929, + "grad_norm": 1.625, + "learning_rate": 1.536783725216996e-05, + "loss": 0.3209, + "step": 14280 + }, + { + "epoch": 0.6271117599920963, + "grad_norm": 1.7109375, + "learning_rate": 1.5361453449565183e-05, + "loss": 0.3161, + "step": 14282 + }, + { + "epoch": 0.6271995784712998, + "grad_norm": 1.53125, + "learning_rate": 1.535507038508261e-05, + "loss": 0.35, + "step": 14284 + }, + { + "epoch": 0.6272873969505033, + "grad_norm": 1.7578125, + "learning_rate": 1.5348688059211067e-05, + "loss": 0.3225, + "step": 14286 + }, + { + "epoch": 0.6273752154297068, + "grad_norm": 1.515625, + "learning_rate": 1.53423064724393e-05, + "loss": 0.3094, + "step": 14288 + }, + { + "epoch": 0.6274630339089102, + "grad_norm": 1.546875, + "learning_rate": 1.5335925625256017e-05, + "loss": 0.3151, + "step": 14290 + }, + { + "epoch": 0.6275508523881138, + "grad_norm": 1.7734375, + "learning_rate": 1.5329545518149867e-05, + "loss": 0.3424, + "step": 14292 + }, + { + "epoch": 0.6276386708673173, + "grad_norm": 1.65625, + "learning_rate": 1.532316615160944e-05, + "loss": 0.3404, + "step": 14294 + }, + { + "epoch": 0.6277264893465208, + "grad_norm": 2.0, + "learning_rate": 1.5316787526123273e-05, + "loss": 0.3184, + "step": 14296 + }, + { + "epoch": 0.6278143078257242, + "grad_norm": 1.6953125, + "learning_rate": 1.531040964217984e-05, + "loss": 0.3307, + "step": 14298 + }, + { + "epoch": 0.6279021263049277, + "grad_norm": 1.7890625, + "learning_rate": 1.5304032500267557e-05, + "loss": 0.3201, + "step": 14300 + }, + { + "epoch": 0.6279899447841312, + "grad_norm": 1.6875, + "learning_rate": 1.529765610087479e-05, + "loss": 0.3219, + "step": 14302 + }, + { + "epoch": 0.6280777632633346, + "grad_norm": 1.8515625, + "learning_rate": 1.529128044448984e-05, + "loss": 0.3523, + "step": 14304 + }, + { + "epoch": 0.6281655817425382, + "grad_norm": 1.6796875, + "learning_rate": 1.528490553160096e-05, + "loss": 0.3338, + "step": 14306 + }, + { + "epoch": 0.6282534002217417, + "grad_norm": 1.859375, + "learning_rate": 1.5278531362696348e-05, + "loss": 0.3669, + "step": 14308 + }, + { + "epoch": 0.6283412187009452, + "grad_norm": 1.671875, + "learning_rate": 1.5272157938264127e-05, + "loss": 0.3476, + "step": 14310 + }, + { + "epoch": 0.6284290371801486, + "grad_norm": 1.5625, + "learning_rate": 1.5265785258792395e-05, + "loss": 0.3176, + "step": 14312 + }, + { + "epoch": 0.6285168556593521, + "grad_norm": 1.59375, + "learning_rate": 1.5259413324769153e-05, + "loss": 0.311, + "step": 14314 + }, + { + "epoch": 0.6286046741385556, + "grad_norm": 1.921875, + "learning_rate": 1.5253042136682374e-05, + "loss": 0.338, + "step": 14316 + }, + { + "epoch": 0.6286924926177591, + "grad_norm": 1.71875, + "learning_rate": 1.5246671695019966e-05, + "loss": 0.3778, + "step": 14318 + }, + { + "epoch": 0.6287803110969625, + "grad_norm": 1.7734375, + "learning_rate": 1.5240302000269774e-05, + "loss": 0.3273, + "step": 14320 + }, + { + "epoch": 0.6288681295761661, + "grad_norm": 1.6875, + "learning_rate": 1.5233933052919602e-05, + "loss": 0.3136, + "step": 14322 + }, + { + "epoch": 0.6289559480553696, + "grad_norm": 1.6796875, + "learning_rate": 1.5227564853457173e-05, + "loss": 0.335, + "step": 14324 + }, + { + "epoch": 0.6290437665345731, + "grad_norm": 1.6796875, + "learning_rate": 1.5221197402370172e-05, + "loss": 0.3299, + "step": 14326 + }, + { + "epoch": 0.6291315850137765, + "grad_norm": 1.640625, + "learning_rate": 1.5214830700146227e-05, + "loss": 0.3594, + "step": 14328 + }, + { + "epoch": 0.62921940349298, + "grad_norm": 1.5390625, + "learning_rate": 1.520846474727288e-05, + "loss": 0.3397, + "step": 14330 + }, + { + "epoch": 0.6293072219721835, + "grad_norm": 1.6171875, + "learning_rate": 1.5202099544237653e-05, + "loss": 0.3367, + "step": 14332 + }, + { + "epoch": 0.629395040451387, + "grad_norm": 1.6484375, + "learning_rate": 1.519573509152799e-05, + "loss": 0.3105, + "step": 14334 + }, + { + "epoch": 0.6294828589305904, + "grad_norm": 1.5234375, + "learning_rate": 1.5189371389631284e-05, + "loss": 0.3246, + "step": 14336 + }, + { + "epoch": 0.629570677409794, + "grad_norm": 1.609375, + "learning_rate": 1.5183008439034873e-05, + "loss": 0.3085, + "step": 14338 + }, + { + "epoch": 0.6296584958889975, + "grad_norm": 1.5546875, + "learning_rate": 1.5176646240226025e-05, + "loss": 0.3442, + "step": 14340 + }, + { + "epoch": 0.6297463143682009, + "grad_norm": 1.59375, + "learning_rate": 1.5170284793691963e-05, + "loss": 0.3371, + "step": 14342 + }, + { + "epoch": 0.6298341328474044, + "grad_norm": 1.6640625, + "learning_rate": 1.5163924099919846e-05, + "loss": 0.3339, + "step": 14344 + }, + { + "epoch": 0.6299219513266079, + "grad_norm": 1.6484375, + "learning_rate": 1.5157564159396781e-05, + "loss": 0.3161, + "step": 14346 + }, + { + "epoch": 0.6300097698058114, + "grad_norm": 1.546875, + "learning_rate": 1.5151204972609818e-05, + "loss": 0.3199, + "step": 14348 + }, + { + "epoch": 0.6300975882850148, + "grad_norm": 1.59375, + "learning_rate": 1.5144846540045932e-05, + "loss": 0.3202, + "step": 14350 + }, + { + "epoch": 0.6301854067642184, + "grad_norm": 1.5390625, + "learning_rate": 1.5138488862192063e-05, + "loss": 0.3289, + "step": 14352 + }, + { + "epoch": 0.6302732252434219, + "grad_norm": 1.796875, + "learning_rate": 1.5132131939535076e-05, + "loss": 0.3225, + "step": 14354 + }, + { + "epoch": 0.6303610437226254, + "grad_norm": 1.609375, + "learning_rate": 1.5125775772561795e-05, + "loss": 0.3161, + "step": 14356 + }, + { + "epoch": 0.6304488622018288, + "grad_norm": 1.6484375, + "learning_rate": 1.5119420361758982e-05, + "loss": 0.3352, + "step": 14358 + }, + { + "epoch": 0.6305366806810323, + "grad_norm": 1.65625, + "learning_rate": 1.5113065707613317e-05, + "loss": 0.3359, + "step": 14360 + }, + { + "epoch": 0.6306244991602358, + "grad_norm": 1.671875, + "learning_rate": 1.5106711810611446e-05, + "loss": 0.3457, + "step": 14362 + }, + { + "epoch": 0.6307123176394392, + "grad_norm": 1.71875, + "learning_rate": 1.5100358671239964e-05, + "loss": 0.3188, + "step": 14364 + }, + { + "epoch": 0.6308001361186427, + "grad_norm": 1.71875, + "learning_rate": 1.5094006289985385e-05, + "loss": 0.3278, + "step": 14366 + }, + { + "epoch": 0.6308879545978463, + "grad_norm": 1.6953125, + "learning_rate": 1.5087654667334174e-05, + "loss": 0.3287, + "step": 14368 + }, + { + "epoch": 0.6309757730770498, + "grad_norm": 1.890625, + "learning_rate": 1.5081303803772751e-05, + "loss": 0.3183, + "step": 14370 + }, + { + "epoch": 0.6310635915562532, + "grad_norm": 1.640625, + "learning_rate": 1.5074953699787452e-05, + "loss": 0.3278, + "step": 14372 + }, + { + "epoch": 0.6311514100354567, + "grad_norm": 1.7109375, + "learning_rate": 1.506860435586459e-05, + "loss": 0.2985, + "step": 14374 + }, + { + "epoch": 0.6312392285146602, + "grad_norm": 1.609375, + "learning_rate": 1.506225577249038e-05, + "loss": 0.3348, + "step": 14376 + }, + { + "epoch": 0.6313270469938637, + "grad_norm": 1.609375, + "learning_rate": 1.5055907950151004e-05, + "loss": 0.3477, + "step": 14378 + }, + { + "epoch": 0.6314148654730671, + "grad_norm": 1.59375, + "learning_rate": 1.5049560889332581e-05, + "loss": 0.3124, + "step": 14380 + }, + { + "epoch": 0.6315026839522706, + "grad_norm": 1.59375, + "learning_rate": 1.5043214590521174e-05, + "loss": 0.3292, + "step": 14382 + }, + { + "epoch": 0.6315905024314742, + "grad_norm": 1.6875, + "learning_rate": 1.5036869054202782e-05, + "loss": 0.3574, + "step": 14384 + }, + { + "epoch": 0.6316783209106777, + "grad_norm": 1.6328125, + "learning_rate": 1.5030524280863342e-05, + "loss": 0.2946, + "step": 14386 + }, + { + "epoch": 0.6317661393898811, + "grad_norm": 1.6328125, + "learning_rate": 1.5024180270988741e-05, + "loss": 0.352, + "step": 14388 + }, + { + "epoch": 0.6318539578690846, + "grad_norm": 1.6328125, + "learning_rate": 1.5017837025064818e-05, + "loss": 0.3146, + "step": 14390 + }, + { + "epoch": 0.6319417763482881, + "grad_norm": 1.671875, + "learning_rate": 1.501149454357732e-05, + "loss": 0.3527, + "step": 14392 + }, + { + "epoch": 0.6320295948274915, + "grad_norm": 1.859375, + "learning_rate": 1.5005152827011962e-05, + "loss": 0.3064, + "step": 14394 + }, + { + "epoch": 0.632117413306695, + "grad_norm": 1.6875, + "learning_rate": 1.4998811875854396e-05, + "loss": 0.3194, + "step": 14396 + }, + { + "epoch": 0.6322052317858986, + "grad_norm": 1.6328125, + "learning_rate": 1.4992471690590216e-05, + "loss": 0.3234, + "step": 14398 + }, + { + "epoch": 0.6322930502651021, + "grad_norm": 1.6875, + "learning_rate": 1.4986132271704955e-05, + "loss": 0.3459, + "step": 14400 + }, + { + "epoch": 0.6323808687443055, + "grad_norm": 1.703125, + "learning_rate": 1.4979793619684082e-05, + "loss": 0.3138, + "step": 14402 + }, + { + "epoch": 0.632468687223509, + "grad_norm": 1.6484375, + "learning_rate": 1.4973455735013015e-05, + "loss": 0.327, + "step": 14404 + }, + { + "epoch": 0.6325565057027125, + "grad_norm": 1.640625, + "learning_rate": 1.4967118618177112e-05, + "loss": 0.3255, + "step": 14406 + }, + { + "epoch": 0.632644324181916, + "grad_norm": 1.546875, + "learning_rate": 1.4960782269661672e-05, + "loss": 0.3421, + "step": 14408 + }, + { + "epoch": 0.6327321426611194, + "grad_norm": 1.6640625, + "learning_rate": 1.4954446689951934e-05, + "loss": 0.3058, + "step": 14410 + }, + { + "epoch": 0.6328199611403229, + "grad_norm": 1.5859375, + "learning_rate": 1.4948111879533071e-05, + "loss": 0.3665, + "step": 14412 + }, + { + "epoch": 0.6329077796195265, + "grad_norm": 1.6953125, + "learning_rate": 1.4941777838890215e-05, + "loss": 0.3343, + "step": 14414 + }, + { + "epoch": 0.63299559809873, + "grad_norm": 1.6484375, + "learning_rate": 1.4935444568508419e-05, + "loss": 0.3379, + "step": 14416 + }, + { + "epoch": 0.6330834165779334, + "grad_norm": 1.6875, + "learning_rate": 1.4929112068872691e-05, + "loss": 0.3326, + "step": 14418 + }, + { + "epoch": 0.6331712350571369, + "grad_norm": 1.65625, + "learning_rate": 1.4922780340467984e-05, + "loss": 0.3167, + "step": 14420 + }, + { + "epoch": 0.6332590535363404, + "grad_norm": 1.53125, + "learning_rate": 1.4916449383779169e-05, + "loss": 0.3186, + "step": 14422 + }, + { + "epoch": 0.6333468720155438, + "grad_norm": 1.8125, + "learning_rate": 1.4910119199291072e-05, + "loss": 0.3316, + "step": 14424 + }, + { + "epoch": 0.6334346904947473, + "grad_norm": 1.8359375, + "learning_rate": 1.4903789787488474e-05, + "loss": 0.3321, + "step": 14426 + }, + { + "epoch": 0.6335225089739508, + "grad_norm": 1.6328125, + "learning_rate": 1.4897461148856068e-05, + "loss": 0.3481, + "step": 14428 + }, + { + "epoch": 0.6336103274531544, + "grad_norm": 1.6953125, + "learning_rate": 1.4891133283878509e-05, + "loss": 0.3035, + "step": 14430 + }, + { + "epoch": 0.6336981459323578, + "grad_norm": 1.6875, + "learning_rate": 1.4884806193040384e-05, + "loss": 0.3554, + "step": 14432 + }, + { + "epoch": 0.6337859644115613, + "grad_norm": 1.6640625, + "learning_rate": 1.487847987682623e-05, + "loss": 0.3422, + "step": 14434 + }, + { + "epoch": 0.6338737828907648, + "grad_norm": 1.6015625, + "learning_rate": 1.4872154335720518e-05, + "loss": 0.3272, + "step": 14436 + }, + { + "epoch": 0.6339616013699683, + "grad_norm": 1.65625, + "learning_rate": 1.4865829570207645e-05, + "loss": 0.3523, + "step": 14438 + }, + { + "epoch": 0.6340494198491717, + "grad_norm": 1.59375, + "learning_rate": 1.4859505580771977e-05, + "loss": 0.3274, + "step": 14440 + }, + { + "epoch": 0.6341372383283752, + "grad_norm": 1.625, + "learning_rate": 1.48531823678978e-05, + "loss": 0.3391, + "step": 14442 + }, + { + "epoch": 0.6342250568075787, + "grad_norm": 1.640625, + "learning_rate": 1.484685993206935e-05, + "loss": 0.3422, + "step": 14444 + }, + { + "epoch": 0.6343128752867823, + "grad_norm": 1.703125, + "learning_rate": 1.4840538273770807e-05, + "loss": 0.3105, + "step": 14446 + }, + { + "epoch": 0.6344006937659857, + "grad_norm": 1.6953125, + "learning_rate": 1.4834217393486272e-05, + "loss": 0.3629, + "step": 14448 + }, + { + "epoch": 0.6344885122451892, + "grad_norm": 1.6171875, + "learning_rate": 1.4827897291699816e-05, + "loss": 0.3606, + "step": 14450 + }, + { + "epoch": 0.6345763307243927, + "grad_norm": 1.640625, + "learning_rate": 1.4821577968895414e-05, + "loss": 0.3118, + "step": 14452 + }, + { + "epoch": 0.6346641492035962, + "grad_norm": 1.6640625, + "learning_rate": 1.4815259425557013e-05, + "loss": 0.3374, + "step": 14454 + }, + { + "epoch": 0.6347519676827996, + "grad_norm": 1.6640625, + "learning_rate": 1.4808941662168485e-05, + "loss": 0.3343, + "step": 14456 + }, + { + "epoch": 0.6348397861620031, + "grad_norm": 1.625, + "learning_rate": 1.4802624679213645e-05, + "loss": 0.3287, + "step": 14458 + }, + { + "epoch": 0.6349276046412067, + "grad_norm": 1.78125, + "learning_rate": 1.4796308477176258e-05, + "loss": 0.3231, + "step": 14460 + }, + { + "epoch": 0.6350154231204101, + "grad_norm": 1.6171875, + "learning_rate": 1.4789993056540013e-05, + "loss": 0.3116, + "step": 14462 + }, + { + "epoch": 0.6351032415996136, + "grad_norm": 1.5859375, + "learning_rate": 1.4783678417788544e-05, + "loss": 0.3213, + "step": 14464 + }, + { + "epoch": 0.6351910600788171, + "grad_norm": 1.65625, + "learning_rate": 1.477736456140543e-05, + "loss": 0.313, + "step": 14466 + }, + { + "epoch": 0.6352788785580206, + "grad_norm": 1.6484375, + "learning_rate": 1.4771051487874189e-05, + "loss": 0.321, + "step": 14468 + }, + { + "epoch": 0.635366697037224, + "grad_norm": 1.609375, + "learning_rate": 1.4764739197678279e-05, + "loss": 0.332, + "step": 14470 + }, + { + "epoch": 0.6354545155164275, + "grad_norm": 1.6015625, + "learning_rate": 1.4758427691301096e-05, + "loss": 0.3222, + "step": 14472 + }, + { + "epoch": 0.635542333995631, + "grad_norm": 1.65625, + "learning_rate": 1.475211696922597e-05, + "loss": 0.3467, + "step": 14474 + }, + { + "epoch": 0.6356301524748346, + "grad_norm": 1.7421875, + "learning_rate": 1.4745807031936188e-05, + "loss": 0.3384, + "step": 14476 + }, + { + "epoch": 0.635717970954038, + "grad_norm": 1.6875, + "learning_rate": 1.473949787991496e-05, + "loss": 0.3454, + "step": 14478 + }, + { + "epoch": 0.6358057894332415, + "grad_norm": 1.6953125, + "learning_rate": 1.4733189513645443e-05, + "loss": 0.3301, + "step": 14480 + }, + { + "epoch": 0.635893607912445, + "grad_norm": 1.65625, + "learning_rate": 1.4726881933610742e-05, + "loss": 0.3258, + "step": 14482 + }, + { + "epoch": 0.6359814263916485, + "grad_norm": 1.5859375, + "learning_rate": 1.472057514029388e-05, + "loss": 0.3416, + "step": 14484 + }, + { + "epoch": 0.6360692448708519, + "grad_norm": 1.5, + "learning_rate": 1.4714269134177836e-05, + "loss": 0.3131, + "step": 14486 + }, + { + "epoch": 0.6361570633500554, + "grad_norm": 1.734375, + "learning_rate": 1.4707963915745531e-05, + "loss": 0.3115, + "step": 14488 + }, + { + "epoch": 0.6362448818292589, + "grad_norm": 1.515625, + "learning_rate": 1.4701659485479813e-05, + "loss": 0.3337, + "step": 14490 + }, + { + "epoch": 0.6363327003084625, + "grad_norm": 1.7421875, + "learning_rate": 1.4695355843863479e-05, + "loss": 0.348, + "step": 14492 + }, + { + "epoch": 0.6364205187876659, + "grad_norm": 1.7109375, + "learning_rate": 1.4689052991379266e-05, + "loss": 0.3505, + "step": 14494 + }, + { + "epoch": 0.6365083372668694, + "grad_norm": 1.9296875, + "learning_rate": 1.4682750928509845e-05, + "loss": 0.3427, + "step": 14496 + }, + { + "epoch": 0.6365961557460729, + "grad_norm": 1.6171875, + "learning_rate": 1.4676449655737837e-05, + "loss": 0.3425, + "step": 14498 + }, + { + "epoch": 0.6366839742252763, + "grad_norm": 1.7890625, + "learning_rate": 1.4670149173545783e-05, + "loss": 0.3313, + "step": 14500 + }, + { + "epoch": 0.6367717927044798, + "grad_norm": 1.625, + "learning_rate": 1.4663849482416184e-05, + "loss": 0.3294, + "step": 14502 + }, + { + "epoch": 0.6368596111836833, + "grad_norm": 1.59375, + "learning_rate": 1.4657550582831467e-05, + "loss": 0.3425, + "step": 14504 + }, + { + "epoch": 0.6369474296628869, + "grad_norm": 1.828125, + "learning_rate": 1.4651252475274007e-05, + "loss": 0.3109, + "step": 14506 + }, + { + "epoch": 0.6370352481420903, + "grad_norm": 1.6328125, + "learning_rate": 1.4644955160226118e-05, + "loss": 0.3427, + "step": 14508 + }, + { + "epoch": 0.6371230666212938, + "grad_norm": 1.8125, + "learning_rate": 1.4638658638170038e-05, + "loss": 0.3422, + "step": 14510 + }, + { + "epoch": 0.6372108851004973, + "grad_norm": 1.9453125, + "learning_rate": 1.4632362909587977e-05, + "loss": 0.3501, + "step": 14512 + }, + { + "epoch": 0.6372987035797008, + "grad_norm": 1.8828125, + "learning_rate": 1.4626067974962038e-05, + "loss": 0.3177, + "step": 14514 + }, + { + "epoch": 0.6373865220589042, + "grad_norm": 1.5, + "learning_rate": 1.46197738347743e-05, + "loss": 0.3215, + "step": 14516 + }, + { + "epoch": 0.6374743405381077, + "grad_norm": 1.59375, + "learning_rate": 1.461348048950677e-05, + "loss": 0.3276, + "step": 14518 + }, + { + "epoch": 0.6375621590173112, + "grad_norm": 1.9140625, + "learning_rate": 1.4607187939641393e-05, + "loss": 0.3418, + "step": 14520 + }, + { + "epoch": 0.6376499774965148, + "grad_norm": 1.7578125, + "learning_rate": 1.4600896185660057e-05, + "loss": 0.3179, + "step": 14522 + }, + { + "epoch": 0.6377377959757182, + "grad_norm": 1.609375, + "learning_rate": 1.459460522804459e-05, + "loss": 0.328, + "step": 14524 + }, + { + "epoch": 0.6378256144549217, + "grad_norm": 1.9453125, + "learning_rate": 1.4588315067276737e-05, + "loss": 0.3368, + "step": 14526 + }, + { + "epoch": 0.6379134329341252, + "grad_norm": 1.734375, + "learning_rate": 1.458202570383822e-05, + "loss": 0.3467, + "step": 14528 + }, + { + "epoch": 0.6380012514133286, + "grad_norm": 1.71875, + "learning_rate": 1.4575737138210674e-05, + "loss": 0.2989, + "step": 14530 + }, + { + "epoch": 0.6380890698925321, + "grad_norm": 1.7734375, + "learning_rate": 1.4569449370875665e-05, + "loss": 0.3633, + "step": 14532 + }, + { + "epoch": 0.6381768883717356, + "grad_norm": 1.578125, + "learning_rate": 1.4563162402314737e-05, + "loss": 0.3455, + "step": 14534 + }, + { + "epoch": 0.6382647068509391, + "grad_norm": 1.78125, + "learning_rate": 1.4556876233009323e-05, + "loss": 0.334, + "step": 14536 + }, + { + "epoch": 0.6383525253301426, + "grad_norm": 1.5546875, + "learning_rate": 1.4550590863440838e-05, + "loss": 0.3335, + "step": 14538 + }, + { + "epoch": 0.6384403438093461, + "grad_norm": 1.6875, + "learning_rate": 1.4544306294090612e-05, + "loss": 0.3423, + "step": 14540 + }, + { + "epoch": 0.6385281622885496, + "grad_norm": 1.5234375, + "learning_rate": 1.4538022525439915e-05, + "loss": 0.3175, + "step": 14542 + }, + { + "epoch": 0.638615980767753, + "grad_norm": 1.4921875, + "learning_rate": 1.4531739557969964e-05, + "loss": 0.3087, + "step": 14544 + }, + { + "epoch": 0.6387037992469565, + "grad_norm": 1.59375, + "learning_rate": 1.4525457392161895e-05, + "loss": 0.3289, + "step": 14546 + }, + { + "epoch": 0.63879161772616, + "grad_norm": 1.6640625, + "learning_rate": 1.4519176028496817e-05, + "loss": 0.3231, + "step": 14548 + }, + { + "epoch": 0.6388794362053635, + "grad_norm": 1.796875, + "learning_rate": 1.4512895467455745e-05, + "loss": 0.3068, + "step": 14550 + }, + { + "epoch": 0.638967254684567, + "grad_norm": 1.71875, + "learning_rate": 1.4506615709519661e-05, + "loss": 0.3506, + "step": 14552 + }, + { + "epoch": 0.6390550731637705, + "grad_norm": 1.671875, + "learning_rate": 1.4500336755169464e-05, + "loss": 0.3405, + "step": 14554 + }, + { + "epoch": 0.639142891642974, + "grad_norm": 1.671875, + "learning_rate": 1.449405860488598e-05, + "loss": 0.3079, + "step": 14556 + }, + { + "epoch": 0.6392307101221775, + "grad_norm": 1.6015625, + "learning_rate": 1.4487781259150018e-05, + "loss": 0.3093, + "step": 14558 + }, + { + "epoch": 0.6393185286013809, + "grad_norm": 1.9609375, + "learning_rate": 1.448150471844228e-05, + "loss": 0.3463, + "step": 14560 + }, + { + "epoch": 0.6394063470805844, + "grad_norm": 1.609375, + "learning_rate": 1.4475228983243438e-05, + "loss": 0.3015, + "step": 14562 + }, + { + "epoch": 0.6394941655597879, + "grad_norm": 1.6796875, + "learning_rate": 1.4468954054034087e-05, + "loss": 0.3246, + "step": 14564 + }, + { + "epoch": 0.6395819840389914, + "grad_norm": 1.703125, + "learning_rate": 1.4462679931294749e-05, + "loss": 0.3138, + "step": 14566 + }, + { + "epoch": 0.6396698025181949, + "grad_norm": 1.546875, + "learning_rate": 1.4456406615505916e-05, + "loss": 0.3133, + "step": 14568 + }, + { + "epoch": 0.6397576209973984, + "grad_norm": 1.9609375, + "learning_rate": 1.4450134107147983e-05, + "loss": 0.3602, + "step": 14570 + }, + { + "epoch": 0.6398454394766019, + "grad_norm": 1.6796875, + "learning_rate": 1.4443862406701325e-05, + "loss": 0.331, + "step": 14572 + }, + { + "epoch": 0.6399332579558054, + "grad_norm": 1.7109375, + "learning_rate": 1.4437591514646198e-05, + "loss": 0.3548, + "step": 14574 + }, + { + "epoch": 0.6400210764350088, + "grad_norm": 1.8671875, + "learning_rate": 1.4431321431462858e-05, + "loss": 0.3461, + "step": 14576 + }, + { + "epoch": 0.6401088949142123, + "grad_norm": 1.671875, + "learning_rate": 1.4425052157631441e-05, + "loss": 0.3374, + "step": 14578 + }, + { + "epoch": 0.6401967133934158, + "grad_norm": 1.6796875, + "learning_rate": 1.4418783693632077e-05, + "loss": 0.3284, + "step": 14580 + }, + { + "epoch": 0.6402845318726192, + "grad_norm": 1.5625, + "learning_rate": 1.4412516039944793e-05, + "loss": 0.3184, + "step": 14582 + }, + { + "epoch": 0.6403723503518228, + "grad_norm": 1.8828125, + "learning_rate": 1.4406249197049559e-05, + "loss": 0.32, + "step": 14584 + }, + { + "epoch": 0.6404601688310263, + "grad_norm": 1.71875, + "learning_rate": 1.4399983165426312e-05, + "loss": 0.3213, + "step": 14586 + }, + { + "epoch": 0.6405479873102298, + "grad_norm": 1.609375, + "learning_rate": 1.4393717945554885e-05, + "loss": 0.3198, + "step": 14588 + }, + { + "epoch": 0.6406358057894332, + "grad_norm": 1.5546875, + "learning_rate": 1.438745353791509e-05, + "loss": 0.3314, + "step": 14590 + }, + { + "epoch": 0.6407236242686367, + "grad_norm": 1.484375, + "learning_rate": 1.4381189942986644e-05, + "loss": 0.3409, + "step": 14592 + }, + { + "epoch": 0.6408114427478402, + "grad_norm": 1.5078125, + "learning_rate": 1.4374927161249212e-05, + "loss": 0.3322, + "step": 14594 + }, + { + "epoch": 0.6408992612270437, + "grad_norm": 1.640625, + "learning_rate": 1.4368665193182416e-05, + "loss": 0.3318, + "step": 14596 + }, + { + "epoch": 0.6409870797062472, + "grad_norm": 1.6875, + "learning_rate": 1.4362404039265775e-05, + "loss": 0.3138, + "step": 14598 + }, + { + "epoch": 0.6410748981854507, + "grad_norm": 1.6875, + "learning_rate": 1.435614369997879e-05, + "loss": 0.3314, + "step": 14600 + }, + { + "epoch": 0.6411627166646542, + "grad_norm": 1.5, + "learning_rate": 1.4349884175800876e-05, + "loss": 0.2977, + "step": 14602 + }, + { + "epoch": 0.6412505351438577, + "grad_norm": 1.703125, + "learning_rate": 1.4343625467211386e-05, + "loss": 0.3434, + "step": 14604 + }, + { + "epoch": 0.6413383536230611, + "grad_norm": 1.8125, + "learning_rate": 1.4337367574689609e-05, + "loss": 0.3048, + "step": 14606 + }, + { + "epoch": 0.6414261721022646, + "grad_norm": 1.8359375, + "learning_rate": 1.4331110498714773e-05, + "loss": 0.3218, + "step": 14608 + }, + { + "epoch": 0.6415139905814681, + "grad_norm": 1.59375, + "learning_rate": 1.4324854239766059e-05, + "loss": 0.3395, + "step": 14610 + }, + { + "epoch": 0.6416018090606715, + "grad_norm": 1.5390625, + "learning_rate": 1.4318598798322557e-05, + "loss": 0.3108, + "step": 14612 + }, + { + "epoch": 0.6416896275398751, + "grad_norm": 1.578125, + "learning_rate": 1.4312344174863329e-05, + "loss": 0.3237, + "step": 14614 + }, + { + "epoch": 0.6417774460190786, + "grad_norm": 1.625, + "learning_rate": 1.4306090369867348e-05, + "loss": 0.343, + "step": 14616 + }, + { + "epoch": 0.6418652644982821, + "grad_norm": 1.5, + "learning_rate": 1.429983738381352e-05, + "loss": 0.3108, + "step": 14618 + }, + { + "epoch": 0.6419530829774855, + "grad_norm": 1.6484375, + "learning_rate": 1.4293585217180717e-05, + "loss": 0.3313, + "step": 14620 + }, + { + "epoch": 0.642040901456689, + "grad_norm": 1.6015625, + "learning_rate": 1.4287333870447716e-05, + "loss": 0.3604, + "step": 14622 + }, + { + "epoch": 0.6421287199358925, + "grad_norm": 1.6640625, + "learning_rate": 1.4281083344093265e-05, + "loss": 0.3235, + "step": 14624 + }, + { + "epoch": 0.642216538415096, + "grad_norm": 1.6015625, + "learning_rate": 1.4274833638596024e-05, + "loss": 0.3281, + "step": 14626 + }, + { + "epoch": 0.6423043568942994, + "grad_norm": 1.59375, + "learning_rate": 1.4268584754434583e-05, + "loss": 0.309, + "step": 14628 + }, + { + "epoch": 0.642392175373503, + "grad_norm": 1.6484375, + "learning_rate": 1.4262336692087503e-05, + "loss": 0.3422, + "step": 14630 + }, + { + "epoch": 0.6424799938527065, + "grad_norm": 1.6796875, + "learning_rate": 1.4256089452033241e-05, + "loss": 0.3389, + "step": 14632 + }, + { + "epoch": 0.64256781233191, + "grad_norm": 1.65625, + "learning_rate": 1.4249843034750246e-05, + "loss": 0.3091, + "step": 14634 + }, + { + "epoch": 0.6426556308111134, + "grad_norm": 1.625, + "learning_rate": 1.4243597440716827e-05, + "loss": 0.3495, + "step": 14636 + }, + { + "epoch": 0.6427434492903169, + "grad_norm": 1.6875, + "learning_rate": 1.4237352670411308e-05, + "loss": 0.3063, + "step": 14638 + }, + { + "epoch": 0.6428312677695204, + "grad_norm": 1.6640625, + "learning_rate": 1.423110872431189e-05, + "loss": 0.3091, + "step": 14640 + }, + { + "epoch": 0.6429190862487238, + "grad_norm": 1.6796875, + "learning_rate": 1.4224865602896757e-05, + "loss": 0.3372, + "step": 14642 + }, + { + "epoch": 0.6430069047279273, + "grad_norm": 1.5078125, + "learning_rate": 1.4218623306643997e-05, + "loss": 0.3061, + "step": 14644 + }, + { + "epoch": 0.6430947232071309, + "grad_norm": 1.703125, + "learning_rate": 1.421238183603164e-05, + "loss": 0.3507, + "step": 14646 + }, + { + "epoch": 0.6431825416863344, + "grad_norm": 1.640625, + "learning_rate": 1.4206141191537682e-05, + "loss": 0.3205, + "step": 14648 + }, + { + "epoch": 0.6432703601655378, + "grad_norm": 1.65625, + "learning_rate": 1.4199901373640005e-05, + "loss": 0.3764, + "step": 14650 + }, + { + "epoch": 0.6433581786447413, + "grad_norm": 1.5625, + "learning_rate": 1.419366238281648e-05, + "loss": 0.3491, + "step": 14652 + }, + { + "epoch": 0.6434459971239448, + "grad_norm": 1.578125, + "learning_rate": 1.4187424219544882e-05, + "loss": 0.3463, + "step": 14654 + }, + { + "epoch": 0.6435338156031483, + "grad_norm": 1.6796875, + "learning_rate": 1.4181186884302916e-05, + "loss": 0.343, + "step": 14656 + }, + { + "epoch": 0.6436216340823517, + "grad_norm": 1.8984375, + "learning_rate": 1.4174950377568264e-05, + "loss": 0.373, + "step": 14658 + }, + { + "epoch": 0.6437094525615553, + "grad_norm": 1.7109375, + "learning_rate": 1.4168714699818498e-05, + "loss": 0.3412, + "step": 14660 + }, + { + "epoch": 0.6437972710407588, + "grad_norm": 1.90625, + "learning_rate": 1.4162479851531163e-05, + "loss": 0.3298, + "step": 14662 + }, + { + "epoch": 0.6438850895199623, + "grad_norm": 1.625, + "learning_rate": 1.4156245833183723e-05, + "loss": 0.3244, + "step": 14664 + }, + { + "epoch": 0.6439729079991657, + "grad_norm": 1.640625, + "learning_rate": 1.4150012645253575e-05, + "loss": 0.3533, + "step": 14666 + }, + { + "epoch": 0.6440607264783692, + "grad_norm": 1.8125, + "learning_rate": 1.4143780288218058e-05, + "loss": 0.3331, + "step": 14668 + }, + { + "epoch": 0.6441485449575727, + "grad_norm": 1.609375, + "learning_rate": 1.4137548762554443e-05, + "loss": 0.3425, + "step": 14670 + }, + { + "epoch": 0.6442363634367761, + "grad_norm": 1.640625, + "learning_rate": 1.4131318068739951e-05, + "loss": 0.3373, + "step": 14672 + }, + { + "epoch": 0.6443241819159796, + "grad_norm": 1.546875, + "learning_rate": 1.4125088207251722e-05, + "loss": 0.3299, + "step": 14674 + }, + { + "epoch": 0.6444120003951832, + "grad_norm": 1.609375, + "learning_rate": 1.4118859178566853e-05, + "loss": 0.3592, + "step": 14676 + }, + { + "epoch": 0.6444998188743867, + "grad_norm": 1.53125, + "learning_rate": 1.4112630983162356e-05, + "loss": 0.3314, + "step": 14678 + }, + { + "epoch": 0.6445876373535901, + "grad_norm": 1.6015625, + "learning_rate": 1.410640362151518e-05, + "loss": 0.3222, + "step": 14680 + }, + { + "epoch": 0.6446754558327936, + "grad_norm": 1.578125, + "learning_rate": 1.4100177094102235e-05, + "loss": 0.3264, + "step": 14682 + }, + { + "epoch": 0.6447632743119971, + "grad_norm": 1.640625, + "learning_rate": 1.4093951401400335e-05, + "loss": 0.3309, + "step": 14684 + }, + { + "epoch": 0.6448510927912006, + "grad_norm": 1.6953125, + "learning_rate": 1.4087726543886254e-05, + "loss": 0.3072, + "step": 14686 + }, + { + "epoch": 0.644938911270404, + "grad_norm": 1.59375, + "learning_rate": 1.4081502522036693e-05, + "loss": 0.3241, + "step": 14688 + }, + { + "epoch": 0.6450267297496075, + "grad_norm": 1.6640625, + "learning_rate": 1.4075279336328279e-05, + "loss": 0.3302, + "step": 14690 + }, + { + "epoch": 0.6451145482288111, + "grad_norm": 1.578125, + "learning_rate": 1.40690569872376e-05, + "loss": 0.3425, + "step": 14692 + }, + { + "epoch": 0.6452023667080146, + "grad_norm": 1.5390625, + "learning_rate": 1.4062835475241148e-05, + "loss": 0.3169, + "step": 14694 + }, + { + "epoch": 0.645290185187218, + "grad_norm": 1.6171875, + "learning_rate": 1.4056614800815396e-05, + "loss": 0.3517, + "step": 14696 + }, + { + "epoch": 0.6453780036664215, + "grad_norm": 1.59375, + "learning_rate": 1.4050394964436686e-05, + "loss": 0.3589, + "step": 14698 + }, + { + "epoch": 0.645465822145625, + "grad_norm": 1.6328125, + "learning_rate": 1.4044175966581363e-05, + "loss": 0.3343, + "step": 14700 + }, + { + "epoch": 0.6455536406248285, + "grad_norm": 1.6171875, + "learning_rate": 1.4037957807725666e-05, + "loss": 0.3384, + "step": 14702 + }, + { + "epoch": 0.6456414591040319, + "grad_norm": 1.6875, + "learning_rate": 1.403174048834579e-05, + "loss": 0.3348, + "step": 14704 + }, + { + "epoch": 0.6457292775832355, + "grad_norm": 1.6640625, + "learning_rate": 1.4025524008917861e-05, + "loss": 0.3253, + "step": 14706 + }, + { + "epoch": 0.645817096062439, + "grad_norm": 1.703125, + "learning_rate": 1.4019308369917928e-05, + "loss": 0.3263, + "step": 14708 + }, + { + "epoch": 0.6459049145416424, + "grad_norm": 1.6796875, + "learning_rate": 1.4013093571821994e-05, + "loss": 0.3211, + "step": 14710 + }, + { + "epoch": 0.6459927330208459, + "grad_norm": 1.6484375, + "learning_rate": 1.4006879615105984e-05, + "loss": 0.3175, + "step": 14712 + }, + { + "epoch": 0.6460805515000494, + "grad_norm": 1.7421875, + "learning_rate": 1.400066650024578e-05, + "loss": 0.3165, + "step": 14714 + }, + { + "epoch": 0.6461683699792529, + "grad_norm": 1.453125, + "learning_rate": 1.3994454227717168e-05, + "loss": 0.3272, + "step": 14716 + }, + { + "epoch": 0.6462561884584563, + "grad_norm": 1.609375, + "learning_rate": 1.398824279799588e-05, + "loss": 0.3444, + "step": 14718 + }, + { + "epoch": 0.6463440069376598, + "grad_norm": 1.6328125, + "learning_rate": 1.3982032211557609e-05, + "loss": 0.3389, + "step": 14720 + }, + { + "epoch": 0.6464318254168634, + "grad_norm": 1.6171875, + "learning_rate": 1.3975822468877942e-05, + "loss": 0.3294, + "step": 14722 + }, + { + "epoch": 0.6465196438960669, + "grad_norm": 1.5390625, + "learning_rate": 1.396961357043244e-05, + "loss": 0.3239, + "step": 14724 + }, + { + "epoch": 0.6466074623752703, + "grad_norm": 1.5625, + "learning_rate": 1.3963405516696579e-05, + "loss": 0.3458, + "step": 14726 + }, + { + "epoch": 0.6466952808544738, + "grad_norm": 1.6328125, + "learning_rate": 1.3957198308145769e-05, + "loss": 0.3198, + "step": 14728 + }, + { + "epoch": 0.6467830993336773, + "grad_norm": 1.5625, + "learning_rate": 1.395099194525536e-05, + "loss": 0.3392, + "step": 14730 + }, + { + "epoch": 0.6468709178128808, + "grad_norm": 1.671875, + "learning_rate": 1.3944786428500623e-05, + "loss": 0.3456, + "step": 14732 + }, + { + "epoch": 0.6469587362920842, + "grad_norm": 1.578125, + "learning_rate": 1.3938581758356806e-05, + "loss": 0.3485, + "step": 14734 + }, + { + "epoch": 0.6470465547712877, + "grad_norm": 1.5703125, + "learning_rate": 1.3932377935299035e-05, + "loss": 0.3474, + "step": 14736 + }, + { + "epoch": 0.6471343732504913, + "grad_norm": 1.75, + "learning_rate": 1.3926174959802429e-05, + "loss": 0.3339, + "step": 14738 + }, + { + "epoch": 0.6472221917296948, + "grad_norm": 1.7109375, + "learning_rate": 1.3919972832341997e-05, + "loss": 0.3329, + "step": 14740 + }, + { + "epoch": 0.6473100102088982, + "grad_norm": 1.7109375, + "learning_rate": 1.391377155339269e-05, + "loss": 0.3519, + "step": 14742 + }, + { + "epoch": 0.6473978286881017, + "grad_norm": 1.609375, + "learning_rate": 1.3907571123429427e-05, + "loss": 0.3232, + "step": 14744 + }, + { + "epoch": 0.6474856471673052, + "grad_norm": 1.734375, + "learning_rate": 1.3901371542927016e-05, + "loss": 0.3239, + "step": 14746 + }, + { + "epoch": 0.6475734656465086, + "grad_norm": 1.5234375, + "learning_rate": 1.3895172812360244e-05, + "loss": 0.3496, + "step": 14748 + }, + { + "epoch": 0.6476612841257121, + "grad_norm": 1.7421875, + "learning_rate": 1.3888974932203797e-05, + "loss": 0.3183, + "step": 14750 + }, + { + "epoch": 0.6477491026049157, + "grad_norm": 1.53125, + "learning_rate": 1.3882777902932306e-05, + "loss": 0.3365, + "step": 14752 + }, + { + "epoch": 0.6478369210841192, + "grad_norm": 1.6953125, + "learning_rate": 1.387658172502036e-05, + "loss": 0.314, + "step": 14754 + }, + { + "epoch": 0.6479247395633226, + "grad_norm": 1.6953125, + "learning_rate": 1.3870386398942447e-05, + "loss": 0.3087, + "step": 14756 + }, + { + "epoch": 0.6480125580425261, + "grad_norm": 1.5546875, + "learning_rate": 1.3864191925173015e-05, + "loss": 0.3027, + "step": 14758 + }, + { + "epoch": 0.6481003765217296, + "grad_norm": 1.7890625, + "learning_rate": 1.3857998304186423e-05, + "loss": 0.3307, + "step": 14760 + }, + { + "epoch": 0.648188195000933, + "grad_norm": 1.703125, + "learning_rate": 1.3851805536457003e-05, + "loss": 0.3298, + "step": 14762 + }, + { + "epoch": 0.6482760134801365, + "grad_norm": 1.8046875, + "learning_rate": 1.3845613622458986e-05, + "loss": 0.3379, + "step": 14764 + }, + { + "epoch": 0.64836383195934, + "grad_norm": 1.625, + "learning_rate": 1.3839422562666543e-05, + "loss": 0.3376, + "step": 14766 + }, + { + "epoch": 0.6484516504385436, + "grad_norm": 1.6953125, + "learning_rate": 1.3833232357553804e-05, + "loss": 0.3247, + "step": 14768 + }, + { + "epoch": 0.648539468917747, + "grad_norm": 1.6328125, + "learning_rate": 1.3827043007594798e-05, + "loss": 0.2991, + "step": 14770 + }, + { + "epoch": 0.6486272873969505, + "grad_norm": 1.7578125, + "learning_rate": 1.3820854513263532e-05, + "loss": 0.3261, + "step": 14772 + }, + { + "epoch": 0.648715105876154, + "grad_norm": 1.6640625, + "learning_rate": 1.381466687503389e-05, + "loss": 0.3293, + "step": 14774 + }, + { + "epoch": 0.6488029243553575, + "grad_norm": 1.515625, + "learning_rate": 1.3808480093379755e-05, + "loss": 0.3416, + "step": 14776 + }, + { + "epoch": 0.6488907428345609, + "grad_norm": 1.6171875, + "learning_rate": 1.3802294168774893e-05, + "loss": 0.3469, + "step": 14778 + }, + { + "epoch": 0.6489785613137644, + "grad_norm": 1.609375, + "learning_rate": 1.3796109101693022e-05, + "loss": 0.3225, + "step": 14780 + }, + { + "epoch": 0.6490663797929679, + "grad_norm": 1.90625, + "learning_rate": 1.3789924892607808e-05, + "loss": 0.3461, + "step": 14782 + }, + { + "epoch": 0.6491541982721715, + "grad_norm": 1.625, + "learning_rate": 1.3783741541992826e-05, + "loss": 0.332, + "step": 14784 + }, + { + "epoch": 0.6492420167513749, + "grad_norm": 1.890625, + "learning_rate": 1.3777559050321615e-05, + "loss": 0.3151, + "step": 14786 + }, + { + "epoch": 0.6493298352305784, + "grad_norm": 1.7734375, + "learning_rate": 1.3771377418067621e-05, + "loss": 0.2975, + "step": 14788 + }, + { + "epoch": 0.6494176537097819, + "grad_norm": 1.5546875, + "learning_rate": 1.3765196645704236e-05, + "loss": 0.3272, + "step": 14790 + }, + { + "epoch": 0.6495054721889854, + "grad_norm": 1.6015625, + "learning_rate": 1.3759016733704783e-05, + "loss": 0.3647, + "step": 14792 + }, + { + "epoch": 0.6495932906681888, + "grad_norm": 1.703125, + "learning_rate": 1.375283768254252e-05, + "loss": 0.3253, + "step": 14794 + }, + { + "epoch": 0.6496811091473923, + "grad_norm": 1.59375, + "learning_rate": 1.3746659492690645e-05, + "loss": 0.3017, + "step": 14796 + }, + { + "epoch": 0.6497689276265958, + "grad_norm": 1.6171875, + "learning_rate": 1.3740482164622279e-05, + "loss": 0.3483, + "step": 14798 + }, + { + "epoch": 0.6498567461057994, + "grad_norm": 1.6796875, + "learning_rate": 1.3734305698810496e-05, + "loss": 0.3406, + "step": 14800 + }, + { + "epoch": 0.6499445645850028, + "grad_norm": 1.59375, + "learning_rate": 1.3728130095728284e-05, + "loss": 0.3054, + "step": 14802 + }, + { + "epoch": 0.6500323830642063, + "grad_norm": 1.6484375, + "learning_rate": 1.3721955355848562e-05, + "loss": 0.335, + "step": 14804 + }, + { + "epoch": 0.6501202015434098, + "grad_norm": 1.5234375, + "learning_rate": 1.371578147964421e-05, + "loss": 0.3368, + "step": 14806 + }, + { + "epoch": 0.6502080200226132, + "grad_norm": 1.6484375, + "learning_rate": 1.3709608467588008e-05, + "loss": 0.3173, + "step": 14808 + }, + { + "epoch": 0.6502958385018167, + "grad_norm": 1.6015625, + "learning_rate": 1.3703436320152708e-05, + "loss": 0.3083, + "step": 14810 + }, + { + "epoch": 0.6503836569810202, + "grad_norm": 1.65625, + "learning_rate": 1.3697265037810964e-05, + "loss": 0.3077, + "step": 14812 + }, + { + "epoch": 0.6504714754602238, + "grad_norm": 1.515625, + "learning_rate": 1.3691094621035358e-05, + "loss": 0.3309, + "step": 14814 + }, + { + "epoch": 0.6505592939394272, + "grad_norm": 1.640625, + "learning_rate": 1.368492507029845e-05, + "loss": 0.3468, + "step": 14816 + }, + { + "epoch": 0.6506471124186307, + "grad_norm": 1.6875, + "learning_rate": 1.367875638607269e-05, + "loss": 0.3506, + "step": 14818 + }, + { + "epoch": 0.6507349308978342, + "grad_norm": 1.703125, + "learning_rate": 1.367258856883048e-05, + "loss": 0.3203, + "step": 14820 + }, + { + "epoch": 0.6508227493770377, + "grad_norm": 1.765625, + "learning_rate": 1.3666421619044146e-05, + "loss": 0.3387, + "step": 14822 + }, + { + "epoch": 0.6509105678562411, + "grad_norm": 1.625, + "learning_rate": 1.366025553718597e-05, + "loss": 0.335, + "step": 14824 + }, + { + "epoch": 0.6509983863354446, + "grad_norm": 1.703125, + "learning_rate": 1.3654090323728142e-05, + "loss": 0.3325, + "step": 14826 + }, + { + "epoch": 0.6510862048146481, + "grad_norm": 1.6328125, + "learning_rate": 1.3647925979142789e-05, + "loss": 0.326, + "step": 14828 + }, + { + "epoch": 0.6511740232938517, + "grad_norm": 1.625, + "learning_rate": 1.3641762503901994e-05, + "loss": 0.3521, + "step": 14830 + }, + { + "epoch": 0.6512618417730551, + "grad_norm": 1.8671875, + "learning_rate": 1.3635599898477738e-05, + "loss": 0.348, + "step": 14832 + }, + { + "epoch": 0.6513496602522586, + "grad_norm": 1.8046875, + "learning_rate": 1.3629438163341978e-05, + "loss": 0.3248, + "step": 14834 + }, + { + "epoch": 0.6514374787314621, + "grad_norm": 1.6484375, + "learning_rate": 1.3623277298966558e-05, + "loss": 0.3101, + "step": 14836 + }, + { + "epoch": 0.6515252972106655, + "grad_norm": 1.6953125, + "learning_rate": 1.36171173058233e-05, + "loss": 0.3505, + "step": 14838 + }, + { + "epoch": 0.651613115689869, + "grad_norm": 1.671875, + "learning_rate": 1.3610958184383928e-05, + "loss": 0.2986, + "step": 14840 + }, + { + "epoch": 0.6517009341690725, + "grad_norm": 1.53125, + "learning_rate": 1.3604799935120099e-05, + "loss": 0.307, + "step": 14842 + }, + { + "epoch": 0.651788752648276, + "grad_norm": 1.625, + "learning_rate": 1.3598642558503432e-05, + "loss": 0.3515, + "step": 14844 + }, + { + "epoch": 0.6518765711274795, + "grad_norm": 1.5546875, + "learning_rate": 1.3592486055005441e-05, + "loss": 0.3184, + "step": 14846 + }, + { + "epoch": 0.651964389606683, + "grad_norm": 1.7890625, + "learning_rate": 1.3586330425097621e-05, + "loss": 0.3355, + "step": 14848 + }, + { + "epoch": 0.6520522080858865, + "grad_norm": 1.65625, + "learning_rate": 1.3580175669251336e-05, + "loss": 0.3188, + "step": 14850 + }, + { + "epoch": 0.65214002656509, + "grad_norm": 1.671875, + "learning_rate": 1.3574021787937944e-05, + "loss": 0.3224, + "step": 14852 + }, + { + "epoch": 0.6522278450442934, + "grad_norm": 1.6328125, + "learning_rate": 1.35678687816287e-05, + "loss": 0.3439, + "step": 14854 + }, + { + "epoch": 0.6523156635234969, + "grad_norm": 1.5234375, + "learning_rate": 1.3561716650794798e-05, + "loss": 0.356, + "step": 14856 + }, + { + "epoch": 0.6524034820027004, + "grad_norm": 1.578125, + "learning_rate": 1.3555565395907388e-05, + "loss": 0.3344, + "step": 14858 + }, + { + "epoch": 0.652491300481904, + "grad_norm": 1.625, + "learning_rate": 1.3549415017437512e-05, + "loss": 0.3208, + "step": 14860 + }, + { + "epoch": 0.6525791189611074, + "grad_norm": 1.625, + "learning_rate": 1.3543265515856191e-05, + "loss": 0.3234, + "step": 14862 + }, + { + "epoch": 0.6526669374403109, + "grad_norm": 1.671875, + "learning_rate": 1.3537116891634338e-05, + "loss": 0.3535, + "step": 14864 + }, + { + "epoch": 0.6527547559195144, + "grad_norm": 1.6484375, + "learning_rate": 1.3530969145242816e-05, + "loss": 0.3412, + "step": 14866 + }, + { + "epoch": 0.6528425743987178, + "grad_norm": 1.6875, + "learning_rate": 1.3524822277152433e-05, + "loss": 0.328, + "step": 14868 + }, + { + "epoch": 0.6529303928779213, + "grad_norm": 1.578125, + "learning_rate": 1.3518676287833904e-05, + "loss": 0.3258, + "step": 14870 + }, + { + "epoch": 0.6530182113571248, + "grad_norm": 1.546875, + "learning_rate": 1.3512531177757904e-05, + "loss": 0.3029, + "step": 14872 + }, + { + "epoch": 0.6531060298363283, + "grad_norm": 1.59375, + "learning_rate": 1.3506386947395022e-05, + "loss": 0.3254, + "step": 14874 + }, + { + "epoch": 0.6531938483155318, + "grad_norm": 1.6171875, + "learning_rate": 1.3500243597215773e-05, + "loss": 0.3469, + "step": 14876 + }, + { + "epoch": 0.6532816667947353, + "grad_norm": 1.6640625, + "learning_rate": 1.3494101127690633e-05, + "loss": 0.3451, + "step": 14878 + }, + { + "epoch": 0.6533694852739388, + "grad_norm": 1.6640625, + "learning_rate": 1.348795953928999e-05, + "loss": 0.3097, + "step": 14880 + }, + { + "epoch": 0.6534573037531423, + "grad_norm": 1.6328125, + "learning_rate": 1.3481818832484163e-05, + "loss": 0.3298, + "step": 14882 + }, + { + "epoch": 0.6535451222323457, + "grad_norm": 1.5859375, + "learning_rate": 1.3475679007743402e-05, + "loss": 0.3281, + "step": 14884 + }, + { + "epoch": 0.6536329407115492, + "grad_norm": 1.546875, + "learning_rate": 1.3469540065537917e-05, + "loss": 0.3299, + "step": 14886 + }, + { + "epoch": 0.6537207591907527, + "grad_norm": 1.6484375, + "learning_rate": 1.3463402006337817e-05, + "loss": 0.3511, + "step": 14888 + }, + { + "epoch": 0.6538085776699561, + "grad_norm": 1.5859375, + "learning_rate": 1.3457264830613141e-05, + "loss": 0.3132, + "step": 14890 + }, + { + "epoch": 0.6538963961491597, + "grad_norm": 1.6640625, + "learning_rate": 1.3451128538833906e-05, + "loss": 0.3346, + "step": 14892 + }, + { + "epoch": 0.6539842146283632, + "grad_norm": 1.609375, + "learning_rate": 1.3444993131470006e-05, + "loss": 0.3232, + "step": 14894 + }, + { + "epoch": 0.6540720331075667, + "grad_norm": 1.609375, + "learning_rate": 1.3438858608991315e-05, + "loss": 0.3156, + "step": 14896 + }, + { + "epoch": 0.6541598515867701, + "grad_norm": 1.7265625, + "learning_rate": 1.3432724971867599e-05, + "loss": 0.3412, + "step": 14898 + }, + { + "epoch": 0.6542476700659736, + "grad_norm": 1.7578125, + "learning_rate": 1.3426592220568568e-05, + "loss": 0.3014, + "step": 14900 + }, + { + "epoch": 0.6543354885451771, + "grad_norm": 1.65625, + "learning_rate": 1.3420460355563891e-05, + "loss": 0.3055, + "step": 14902 + }, + { + "epoch": 0.6544233070243806, + "grad_norm": 1.515625, + "learning_rate": 1.3414329377323126e-05, + "loss": 0.3393, + "step": 14904 + }, + { + "epoch": 0.6545111255035841, + "grad_norm": 1.78125, + "learning_rate": 1.3408199286315803e-05, + "loss": 0.344, + "step": 14906 + }, + { + "epoch": 0.6545989439827876, + "grad_norm": 1.7421875, + "learning_rate": 1.3402070083011348e-05, + "loss": 0.3015, + "step": 14908 + }, + { + "epoch": 0.6546867624619911, + "grad_norm": 1.6328125, + "learning_rate": 1.3395941767879164e-05, + "loss": 0.3608, + "step": 14910 + }, + { + "epoch": 0.6547745809411946, + "grad_norm": 1.5234375, + "learning_rate": 1.3389814341388523e-05, + "loss": 0.3229, + "step": 14912 + }, + { + "epoch": 0.654862399420398, + "grad_norm": 1.6796875, + "learning_rate": 1.3383687804008693e-05, + "loss": 0.3233, + "step": 14914 + }, + { + "epoch": 0.6549502178996015, + "grad_norm": 1.7265625, + "learning_rate": 1.3377562156208833e-05, + "loss": 0.3179, + "step": 14916 + }, + { + "epoch": 0.655038036378805, + "grad_norm": 1.5625, + "learning_rate": 1.3371437398458042e-05, + "loss": 0.3171, + "step": 14918 + }, + { + "epoch": 0.6551258548580084, + "grad_norm": 1.5859375, + "learning_rate": 1.3365313531225374e-05, + "loss": 0.3321, + "step": 14920 + }, + { + "epoch": 0.655213673337212, + "grad_norm": 1.59375, + "learning_rate": 1.3359190554979772e-05, + "loss": 0.3269, + "step": 14922 + }, + { + "epoch": 0.6553014918164155, + "grad_norm": 1.5234375, + "learning_rate": 1.3353068470190161e-05, + "loss": 0.3241, + "step": 14924 + }, + { + "epoch": 0.655389310295619, + "grad_norm": 1.59375, + "learning_rate": 1.3346947277325356e-05, + "loss": 0.2946, + "step": 14926 + }, + { + "epoch": 0.6554771287748224, + "grad_norm": 1.703125, + "learning_rate": 1.334082697685411e-05, + "loss": 0.3364, + "step": 14928 + }, + { + "epoch": 0.6555649472540259, + "grad_norm": 1.6640625, + "learning_rate": 1.3334707569245142e-05, + "loss": 0.35, + "step": 14930 + }, + { + "epoch": 0.6556527657332294, + "grad_norm": 1.640625, + "learning_rate": 1.3328589054967056e-05, + "loss": 0.3134, + "step": 14932 + }, + { + "epoch": 0.6557405842124329, + "grad_norm": 1.6171875, + "learning_rate": 1.3322471434488424e-05, + "loss": 0.3198, + "step": 14934 + }, + { + "epoch": 0.6558284026916363, + "grad_norm": 1.6171875, + "learning_rate": 1.3316354708277728e-05, + "loss": 0.3519, + "step": 14936 + }, + { + "epoch": 0.6559162211708399, + "grad_norm": 1.765625, + "learning_rate": 1.3310238876803383e-05, + "loss": 0.3331, + "step": 14938 + }, + { + "epoch": 0.6560040396500434, + "grad_norm": 1.6484375, + "learning_rate": 1.3304123940533767e-05, + "loss": 0.339, + "step": 14940 + }, + { + "epoch": 0.6560918581292469, + "grad_norm": 1.625, + "learning_rate": 1.3298009899937123e-05, + "loss": 0.3369, + "step": 14942 + }, + { + "epoch": 0.6561796766084503, + "grad_norm": 1.8046875, + "learning_rate": 1.3291896755481694e-05, + "loss": 0.3124, + "step": 14944 + }, + { + "epoch": 0.6562674950876538, + "grad_norm": 1.6875, + "learning_rate": 1.3285784507635609e-05, + "loss": 0.3247, + "step": 14946 + }, + { + "epoch": 0.6563553135668573, + "grad_norm": 1.6953125, + "learning_rate": 1.3279673156866967e-05, + "loss": 0.3425, + "step": 14948 + }, + { + "epoch": 0.6564431320460608, + "grad_norm": 1.6015625, + "learning_rate": 1.327356270364376e-05, + "loss": 0.3239, + "step": 14950 + }, + { + "epoch": 0.6565309505252643, + "grad_norm": 1.7734375, + "learning_rate": 1.3267453148433926e-05, + "loss": 0.322, + "step": 14952 + }, + { + "epoch": 0.6566187690044678, + "grad_norm": 1.5703125, + "learning_rate": 1.326134449170535e-05, + "loss": 0.3282, + "step": 14954 + }, + { + "epoch": 0.6567065874836713, + "grad_norm": 1.734375, + "learning_rate": 1.3255236733925819e-05, + "loss": 0.3349, + "step": 14956 + }, + { + "epoch": 0.6567944059628747, + "grad_norm": 1.75, + "learning_rate": 1.3249129875563083e-05, + "loss": 0.3605, + "step": 14958 + }, + { + "epoch": 0.6568822244420782, + "grad_norm": 1.484375, + "learning_rate": 1.3243023917084796e-05, + "loss": 0.3174, + "step": 14960 + }, + { + "epoch": 0.6569700429212817, + "grad_norm": 1.6640625, + "learning_rate": 1.3236918858958547e-05, + "loss": 0.3258, + "step": 14962 + }, + { + "epoch": 0.6570578614004852, + "grad_norm": 1.578125, + "learning_rate": 1.3230814701651884e-05, + "loss": 0.2999, + "step": 14964 + }, + { + "epoch": 0.6571456798796886, + "grad_norm": 1.734375, + "learning_rate": 1.3224711445632237e-05, + "loss": 0.2968, + "step": 14966 + }, + { + "epoch": 0.6572334983588922, + "grad_norm": 1.65625, + "learning_rate": 1.3218609091367024e-05, + "loss": 0.3169, + "step": 14968 + }, + { + "epoch": 0.6573213168380957, + "grad_norm": 1.625, + "learning_rate": 1.321250763932354e-05, + "loss": 0.3351, + "step": 14970 + }, + { + "epoch": 0.6574091353172992, + "grad_norm": 1.5625, + "learning_rate": 1.320640708996907e-05, + "loss": 0.3572, + "step": 14972 + }, + { + "epoch": 0.6574969537965026, + "grad_norm": 1.609375, + "learning_rate": 1.3200307443770748e-05, + "loss": 0.3086, + "step": 14974 + }, + { + "epoch": 0.6575847722757061, + "grad_norm": 1.5390625, + "learning_rate": 1.319420870119572e-05, + "loss": 0.3494, + "step": 14976 + }, + { + "epoch": 0.6576725907549096, + "grad_norm": 1.609375, + "learning_rate": 1.3188110862711023e-05, + "loss": 0.3088, + "step": 14978 + }, + { + "epoch": 0.657760409234113, + "grad_norm": 1.5234375, + "learning_rate": 1.3182013928783618e-05, + "loss": 0.3322, + "step": 14980 + }, + { + "epoch": 0.6578482277133165, + "grad_norm": 1.6796875, + "learning_rate": 1.3175917899880427e-05, + "loss": 0.3124, + "step": 14982 + }, + { + "epoch": 0.6579360461925201, + "grad_norm": 1.609375, + "learning_rate": 1.3169822776468268e-05, + "loss": 0.3098, + "step": 14984 + }, + { + "epoch": 0.6580238646717236, + "grad_norm": 1.5, + "learning_rate": 1.3163728559013928e-05, + "loss": 0.2701, + "step": 14986 + }, + { + "epoch": 0.658111683150927, + "grad_norm": 1.625, + "learning_rate": 1.3157635247984091e-05, + "loss": 0.3256, + "step": 14988 + }, + { + "epoch": 0.6581995016301305, + "grad_norm": 1.53125, + "learning_rate": 1.3151542843845377e-05, + "loss": 0.3451, + "step": 14990 + }, + { + "epoch": 0.658287320109334, + "grad_norm": 1.59375, + "learning_rate": 1.3145451347064358e-05, + "loss": 0.3386, + "step": 14992 + }, + { + "epoch": 0.6583751385885375, + "grad_norm": 1.7109375, + "learning_rate": 1.313936075810751e-05, + "loss": 0.3218, + "step": 14994 + }, + { + "epoch": 0.6584629570677409, + "grad_norm": 1.5078125, + "learning_rate": 1.313327107744127e-05, + "loss": 0.3355, + "step": 14996 + }, + { + "epoch": 0.6585507755469444, + "grad_norm": 1.703125, + "learning_rate": 1.3127182305531971e-05, + "loss": 0.3392, + "step": 14998 + }, + { + "epoch": 0.658638594026148, + "grad_norm": 1.609375, + "learning_rate": 1.3121094442845893e-05, + "loss": 0.3311, + "step": 15000 + }, + { + "epoch": 0.6587264125053515, + "grad_norm": 1.640625, + "learning_rate": 1.3115007489849265e-05, + "loss": 0.3168, + "step": 15002 + }, + { + "epoch": 0.6588142309845549, + "grad_norm": 1.640625, + "learning_rate": 1.3108921447008194e-05, + "loss": 0.3516, + "step": 15004 + }, + { + "epoch": 0.6589020494637584, + "grad_norm": 1.7109375, + "learning_rate": 1.310283631478878e-05, + "loss": 0.3232, + "step": 15006 + }, + { + "epoch": 0.6589898679429619, + "grad_norm": 1.671875, + "learning_rate": 1.3096752093657002e-05, + "loss": 0.3072, + "step": 15008 + }, + { + "epoch": 0.6590776864221654, + "grad_norm": 1.6015625, + "learning_rate": 1.3090668784078813e-05, + "loss": 0.3238, + "step": 15010 + }, + { + "epoch": 0.6591655049013688, + "grad_norm": 1.640625, + "learning_rate": 1.3084586386520062e-05, + "loss": 0.3492, + "step": 15012 + }, + { + "epoch": 0.6592533233805724, + "grad_norm": 1.671875, + "learning_rate": 1.3078504901446533e-05, + "loss": 0.3287, + "step": 15014 + }, + { + "epoch": 0.6593411418597759, + "grad_norm": 1.6796875, + "learning_rate": 1.3072424329323968e-05, + "loss": 0.3733, + "step": 15016 + }, + { + "epoch": 0.6594289603389794, + "grad_norm": 1.53125, + "learning_rate": 1.3066344670617991e-05, + "loss": 0.3267, + "step": 15018 + }, + { + "epoch": 0.6595167788181828, + "grad_norm": 1.6875, + "learning_rate": 1.3060265925794218e-05, + "loss": 0.326, + "step": 15020 + }, + { + "epoch": 0.6596045972973863, + "grad_norm": 1.5859375, + "learning_rate": 1.3054188095318137e-05, + "loss": 0.2889, + "step": 15022 + }, + { + "epoch": 0.6596924157765898, + "grad_norm": 1.5, + "learning_rate": 1.3048111179655186e-05, + "loss": 0.2946, + "step": 15024 + }, + { + "epoch": 0.6597802342557932, + "grad_norm": 1.6171875, + "learning_rate": 1.3042035179270756e-05, + "loss": 0.3259, + "step": 15026 + }, + { + "epoch": 0.6598680527349967, + "grad_norm": 1.5703125, + "learning_rate": 1.3035960094630132e-05, + "loss": 0.3238, + "step": 15028 + }, + { + "epoch": 0.6599558712142003, + "grad_norm": 1.7265625, + "learning_rate": 1.302988592619856e-05, + "loss": 0.3381, + "step": 15030 + }, + { + "epoch": 0.6600436896934038, + "grad_norm": 1.7109375, + "learning_rate": 1.3023812674441189e-05, + "loss": 0.3074, + "step": 15032 + }, + { + "epoch": 0.6601315081726072, + "grad_norm": 1.5, + "learning_rate": 1.301774033982312e-05, + "loss": 0.3234, + "step": 15034 + }, + { + "epoch": 0.6602193266518107, + "grad_norm": 1.5703125, + "learning_rate": 1.3011668922809355e-05, + "loss": 0.3027, + "step": 15036 + }, + { + "epoch": 0.6603071451310142, + "grad_norm": 1.6875, + "learning_rate": 1.3005598423864868e-05, + "loss": 0.3438, + "step": 15038 + }, + { + "epoch": 0.6603949636102177, + "grad_norm": 1.6328125, + "learning_rate": 1.2999528843454528e-05, + "loss": 0.3359, + "step": 15040 + }, + { + "epoch": 0.6604827820894211, + "grad_norm": 1.609375, + "learning_rate": 1.2993460182043138e-05, + "loss": 0.3283, + "step": 15042 + }, + { + "epoch": 0.6605706005686246, + "grad_norm": 1.65625, + "learning_rate": 1.2987392440095455e-05, + "loss": 0.3231, + "step": 15044 + }, + { + "epoch": 0.6606584190478282, + "grad_norm": 1.59375, + "learning_rate": 1.2981325618076129e-05, + "loss": 0.302, + "step": 15046 + }, + { + "epoch": 0.6607462375270317, + "grad_norm": 1.625, + "learning_rate": 1.2975259716449778e-05, + "loss": 0.3357, + "step": 15048 + }, + { + "epoch": 0.6608340560062351, + "grad_norm": 1.6015625, + "learning_rate": 1.2969194735680917e-05, + "loss": 0.3215, + "step": 15050 + }, + { + "epoch": 0.6609218744854386, + "grad_norm": 1.5546875, + "learning_rate": 1.2963130676234003e-05, + "loss": 0.3265, + "step": 15052 + }, + { + "epoch": 0.6610096929646421, + "grad_norm": 1.75, + "learning_rate": 1.2957067538573434e-05, + "loss": 0.3357, + "step": 15054 + }, + { + "epoch": 0.6610975114438455, + "grad_norm": 1.5546875, + "learning_rate": 1.2951005323163509e-05, + "loss": 0.3197, + "step": 15056 + }, + { + "epoch": 0.661185329923049, + "grad_norm": 1.4765625, + "learning_rate": 1.2944944030468498e-05, + "loss": 0.3311, + "step": 15058 + }, + { + "epoch": 0.6612731484022526, + "grad_norm": 1.609375, + "learning_rate": 1.2938883660952558e-05, + "loss": 0.3386, + "step": 15060 + }, + { + "epoch": 0.6613609668814561, + "grad_norm": 1.6484375, + "learning_rate": 1.2932824215079792e-05, + "loss": 0.2935, + "step": 15062 + }, + { + "epoch": 0.6614487853606595, + "grad_norm": 1.6015625, + "learning_rate": 1.2926765693314257e-05, + "loss": 0.324, + "step": 15064 + }, + { + "epoch": 0.661536603839863, + "grad_norm": 1.6796875, + "learning_rate": 1.2920708096119883e-05, + "loss": 0.3579, + "step": 15066 + }, + { + "epoch": 0.6616244223190665, + "grad_norm": 1.6015625, + "learning_rate": 1.291465142396059e-05, + "loss": 0.3081, + "step": 15068 + }, + { + "epoch": 0.66171224079827, + "grad_norm": 1.546875, + "learning_rate": 1.2908595677300172e-05, + "loss": 0.3196, + "step": 15070 + }, + { + "epoch": 0.6618000592774734, + "grad_norm": 1.625, + "learning_rate": 1.2902540856602414e-05, + "loss": 0.319, + "step": 15072 + }, + { + "epoch": 0.6618878777566769, + "grad_norm": 1.6953125, + "learning_rate": 1.289648696233097e-05, + "loss": 0.3505, + "step": 15074 + }, + { + "epoch": 0.6619756962358805, + "grad_norm": 1.6015625, + "learning_rate": 1.2890433994949447e-05, + "loss": 0.3762, + "step": 15076 + }, + { + "epoch": 0.662063514715084, + "grad_norm": 1.6171875, + "learning_rate": 1.2884381954921404e-05, + "loss": 0.3002, + "step": 15078 + }, + { + "epoch": 0.6621513331942874, + "grad_norm": 1.546875, + "learning_rate": 1.2878330842710284e-05, + "loss": 0.317, + "step": 15080 + }, + { + "epoch": 0.6622391516734909, + "grad_norm": 1.53125, + "learning_rate": 1.2872280658779501e-05, + "loss": 0.329, + "step": 15082 + }, + { + "epoch": 0.6623269701526944, + "grad_norm": 1.546875, + "learning_rate": 1.2866231403592377e-05, + "loss": 0.2896, + "step": 15084 + }, + { + "epoch": 0.6624147886318978, + "grad_norm": 1.5703125, + "learning_rate": 1.2860183077612148e-05, + "loss": 0.3293, + "step": 15086 + }, + { + "epoch": 0.6625026071111013, + "grad_norm": 1.8046875, + "learning_rate": 1.285413568130202e-05, + "loss": 0.3361, + "step": 15088 + }, + { + "epoch": 0.6625904255903048, + "grad_norm": 1.6875, + "learning_rate": 1.2848089215125084e-05, + "loss": 0.3096, + "step": 15090 + }, + { + "epoch": 0.6626782440695084, + "grad_norm": 1.5546875, + "learning_rate": 1.2842043679544397e-05, + "loss": 0.3276, + "step": 15092 + }, + { + "epoch": 0.6627660625487118, + "grad_norm": 1.5625, + "learning_rate": 1.283599907502292e-05, + "loss": 0.3178, + "step": 15094 + }, + { + "epoch": 0.6628538810279153, + "grad_norm": 1.7421875, + "learning_rate": 1.2829955402023549e-05, + "loss": 0.3348, + "step": 15096 + }, + { + "epoch": 0.6629416995071188, + "grad_norm": 1.515625, + "learning_rate": 1.2823912661009102e-05, + "loss": 0.2979, + "step": 15098 + }, + { + "epoch": 0.6630295179863223, + "grad_norm": 1.59375, + "learning_rate": 1.2817870852442355e-05, + "loss": 0.3386, + "step": 15100 + }, + { + "epoch": 0.6631173364655257, + "grad_norm": 1.625, + "learning_rate": 1.2811829976785971e-05, + "loss": 0.314, + "step": 15102 + }, + { + "epoch": 0.6632051549447292, + "grad_norm": 1.59375, + "learning_rate": 1.2805790034502565e-05, + "loss": 0.3095, + "step": 15104 + }, + { + "epoch": 0.6632929734239328, + "grad_norm": 1.578125, + "learning_rate": 1.2799751026054691e-05, + "loss": 0.3297, + "step": 15106 + }, + { + "epoch": 0.6633807919031363, + "grad_norm": 1.4921875, + "learning_rate": 1.2793712951904796e-05, + "loss": 0.3125, + "step": 15108 + }, + { + "epoch": 0.6634686103823397, + "grad_norm": 1.546875, + "learning_rate": 1.2787675812515299e-05, + "loss": 0.3212, + "step": 15110 + }, + { + "epoch": 0.6635564288615432, + "grad_norm": 1.546875, + "learning_rate": 1.2781639608348517e-05, + "loss": 0.3171, + "step": 15112 + }, + { + "epoch": 0.6636442473407467, + "grad_norm": 1.6953125, + "learning_rate": 1.2775604339866692e-05, + "loss": 0.3389, + "step": 15114 + }, + { + "epoch": 0.6637320658199501, + "grad_norm": 1.6484375, + "learning_rate": 1.2769570007532027e-05, + "loss": 0.3024, + "step": 15116 + }, + { + "epoch": 0.6638198842991536, + "grad_norm": 1.6796875, + "learning_rate": 1.2763536611806615e-05, + "loss": 0.3248, + "step": 15118 + }, + { + "epoch": 0.6639077027783571, + "grad_norm": 1.8203125, + "learning_rate": 1.275750415315251e-05, + "loss": 0.3536, + "step": 15120 + }, + { + "epoch": 0.6639955212575607, + "grad_norm": 1.6171875, + "learning_rate": 1.2751472632031672e-05, + "loss": 0.3393, + "step": 15122 + }, + { + "epoch": 0.6640833397367641, + "grad_norm": 1.4765625, + "learning_rate": 1.2745442048905998e-05, + "loss": 0.3293, + "step": 15124 + }, + { + "epoch": 0.6641711582159676, + "grad_norm": 1.796875, + "learning_rate": 1.2739412404237306e-05, + "loss": 0.3239, + "step": 15126 + }, + { + "epoch": 0.6642589766951711, + "grad_norm": 1.59375, + "learning_rate": 1.2733383698487344e-05, + "loss": 0.3344, + "step": 15128 + }, + { + "epoch": 0.6643467951743746, + "grad_norm": 1.6484375, + "learning_rate": 1.2727355932117806e-05, + "loss": 0.3109, + "step": 15130 + }, + { + "epoch": 0.664434613653578, + "grad_norm": 1.6484375, + "learning_rate": 1.2721329105590284e-05, + "loss": 0.3262, + "step": 15132 + }, + { + "epoch": 0.6645224321327815, + "grad_norm": 1.75, + "learning_rate": 1.2715303219366337e-05, + "loss": 0.3469, + "step": 15134 + }, + { + "epoch": 0.664610250611985, + "grad_norm": 1.6484375, + "learning_rate": 1.2709278273907408e-05, + "loss": 0.3071, + "step": 15136 + }, + { + "epoch": 0.6646980690911886, + "grad_norm": 1.6875, + "learning_rate": 1.2703254269674885e-05, + "loss": 0.3076, + "step": 15138 + }, + { + "epoch": 0.664785887570392, + "grad_norm": 1.6953125, + "learning_rate": 1.269723120713011e-05, + "loss": 0.3035, + "step": 15140 + }, + { + "epoch": 0.6648737060495955, + "grad_norm": 1.7890625, + "learning_rate": 1.2691209086734313e-05, + "loss": 0.3316, + "step": 15142 + }, + { + "epoch": 0.664961524528799, + "grad_norm": 1.6875, + "learning_rate": 1.2685187908948678e-05, + "loss": 0.3274, + "step": 15144 + }, + { + "epoch": 0.6650493430080024, + "grad_norm": 1.6875, + "learning_rate": 1.2679167674234308e-05, + "loss": 0.305, + "step": 15146 + }, + { + "epoch": 0.6651371614872059, + "grad_norm": 1.640625, + "learning_rate": 1.267314838305222e-05, + "loss": 0.3264, + "step": 15148 + }, + { + "epoch": 0.6652249799664094, + "grad_norm": 1.5625, + "learning_rate": 1.2667130035863395e-05, + "loss": 0.3286, + "step": 15150 + }, + { + "epoch": 0.6653127984456129, + "grad_norm": 1.59375, + "learning_rate": 1.2661112633128696e-05, + "loss": 0.3141, + "step": 15152 + }, + { + "epoch": 0.6654006169248164, + "grad_norm": 1.5625, + "learning_rate": 1.2655096175308962e-05, + "loss": 0.3165, + "step": 15154 + }, + { + "epoch": 0.6654884354040199, + "grad_norm": 1.5859375, + "learning_rate": 1.264908066286492e-05, + "loss": 0.3351, + "step": 15156 + }, + { + "epoch": 0.6655762538832234, + "grad_norm": 1.59375, + "learning_rate": 1.2643066096257244e-05, + "loss": 0.3032, + "step": 15158 + }, + { + "epoch": 0.6656640723624269, + "grad_norm": 1.6015625, + "learning_rate": 1.2637052475946526e-05, + "loss": 0.303, + "step": 15160 + }, + { + "epoch": 0.6657518908416303, + "grad_norm": 1.546875, + "learning_rate": 1.2631039802393286e-05, + "loss": 0.3241, + "step": 15162 + }, + { + "epoch": 0.6658397093208338, + "grad_norm": 1.6171875, + "learning_rate": 1.2625028076057987e-05, + "loss": 0.3363, + "step": 15164 + }, + { + "epoch": 0.6659275278000373, + "grad_norm": 1.53125, + "learning_rate": 1.2619017297400998e-05, + "loss": 0.3022, + "step": 15166 + }, + { + "epoch": 0.6660153462792409, + "grad_norm": 1.53125, + "learning_rate": 1.2613007466882643e-05, + "loss": 0.3297, + "step": 15168 + }, + { + "epoch": 0.6661031647584443, + "grad_norm": 1.5234375, + "learning_rate": 1.2606998584963136e-05, + "loss": 0.3167, + "step": 15170 + }, + { + "epoch": 0.6661909832376478, + "grad_norm": 1.75, + "learning_rate": 1.2600990652102656e-05, + "loss": 0.3309, + "step": 15172 + }, + { + "epoch": 0.6662788017168513, + "grad_norm": 1.7421875, + "learning_rate": 1.2594983668761286e-05, + "loss": 0.3045, + "step": 15174 + }, + { + "epoch": 0.6663666201960547, + "grad_norm": 1.6484375, + "learning_rate": 1.2588977635399029e-05, + "loss": 0.3155, + "step": 15176 + }, + { + "epoch": 0.6664544386752582, + "grad_norm": 1.65625, + "learning_rate": 1.2582972552475852e-05, + "loss": 0.3326, + "step": 15178 + }, + { + "epoch": 0.6665422571544617, + "grad_norm": 1.65625, + "learning_rate": 1.2576968420451601e-05, + "loss": 0.3442, + "step": 15180 + }, + { + "epoch": 0.6666300756336652, + "grad_norm": 1.7578125, + "learning_rate": 1.2570965239786098e-05, + "loss": 0.313, + "step": 15182 + }, + { + "epoch": 0.6667178941128687, + "grad_norm": 1.75, + "learning_rate": 1.2564963010939057e-05, + "loss": 0.3379, + "step": 15184 + }, + { + "epoch": 0.6668057125920722, + "grad_norm": 1.6015625, + "learning_rate": 1.2558961734370128e-05, + "loss": 0.3332, + "step": 15186 + }, + { + "epoch": 0.6668935310712757, + "grad_norm": 1.7890625, + "learning_rate": 1.2552961410538894e-05, + "loss": 0.3414, + "step": 15188 + }, + { + "epoch": 0.6669813495504792, + "grad_norm": 1.5625, + "learning_rate": 1.2546962039904847e-05, + "loss": 0.334, + "step": 15190 + }, + { + "epoch": 0.6670691680296826, + "grad_norm": 1.5859375, + "learning_rate": 1.254096362292744e-05, + "loss": 0.3496, + "step": 15192 + }, + { + "epoch": 0.6671569865088861, + "grad_norm": 1.4921875, + "learning_rate": 1.253496616006602e-05, + "loss": 0.3553, + "step": 15194 + }, + { + "epoch": 0.6672448049880896, + "grad_norm": 1.6796875, + "learning_rate": 1.2528969651779888e-05, + "loss": 0.3289, + "step": 15196 + }, + { + "epoch": 0.667332623467293, + "grad_norm": 1.5546875, + "learning_rate": 1.2522974098528245e-05, + "loss": 0.2986, + "step": 15198 + }, + { + "epoch": 0.6674204419464966, + "grad_norm": 1.5625, + "learning_rate": 1.2516979500770232e-05, + "loss": 0.3219, + "step": 15200 + }, + { + "epoch": 0.6675082604257001, + "grad_norm": 1.6015625, + "learning_rate": 1.251098585896493e-05, + "loss": 0.3009, + "step": 15202 + }, + { + "epoch": 0.6675960789049036, + "grad_norm": 1.5234375, + "learning_rate": 1.250499317357131e-05, + "loss": 0.3442, + "step": 15204 + }, + { + "epoch": 0.667683897384107, + "grad_norm": 1.640625, + "learning_rate": 1.2499001445048325e-05, + "loss": 0.3178, + "step": 15206 + }, + { + "epoch": 0.6677717158633105, + "grad_norm": 1.7265625, + "learning_rate": 1.2493010673854803e-05, + "loss": 0.3308, + "step": 15208 + }, + { + "epoch": 0.667859534342514, + "grad_norm": 1.421875, + "learning_rate": 1.2487020860449511e-05, + "loss": 0.3033, + "step": 15210 + }, + { + "epoch": 0.6679473528217175, + "grad_norm": 1.5546875, + "learning_rate": 1.2481032005291173e-05, + "loss": 0.3283, + "step": 15212 + }, + { + "epoch": 0.668035171300921, + "grad_norm": 1.5234375, + "learning_rate": 1.2475044108838393e-05, + "loss": 0.2983, + "step": 15214 + }, + { + "epoch": 0.6681229897801245, + "grad_norm": 1.6640625, + "learning_rate": 1.2469057171549759e-05, + "loss": 0.3213, + "step": 15216 + }, + { + "epoch": 0.668210808259328, + "grad_norm": 1.671875, + "learning_rate": 1.246307119388371e-05, + "loss": 0.3042, + "step": 15218 + }, + { + "epoch": 0.6682986267385315, + "grad_norm": 1.5625, + "learning_rate": 1.2457086176298685e-05, + "loss": 0.3172, + "step": 15220 + }, + { + "epoch": 0.6683864452177349, + "grad_norm": 1.6171875, + "learning_rate": 1.2451102119253009e-05, + "loss": 0.3245, + "step": 15222 + }, + { + "epoch": 0.6684742636969384, + "grad_norm": 1.6171875, + "learning_rate": 1.2445119023204926e-05, + "loss": 0.32, + "step": 15224 + }, + { + "epoch": 0.6685620821761419, + "grad_norm": 1.7734375, + "learning_rate": 1.2439136888612652e-05, + "loss": 0.3245, + "step": 15226 + }, + { + "epoch": 0.6686499006553454, + "grad_norm": 1.625, + "learning_rate": 1.2433155715934275e-05, + "loss": 0.3037, + "step": 15228 + }, + { + "epoch": 0.6687377191345489, + "grad_norm": 1.5078125, + "learning_rate": 1.2427175505627856e-05, + "loss": 0.3555, + "step": 15230 + }, + { + "epoch": 0.6688255376137524, + "grad_norm": 1.53125, + "learning_rate": 1.2421196258151337e-05, + "loss": 0.3333, + "step": 15232 + }, + { + "epoch": 0.6689133560929559, + "grad_norm": 1.6640625, + "learning_rate": 1.241521797396264e-05, + "loss": 0.3125, + "step": 15234 + }, + { + "epoch": 0.6690011745721594, + "grad_norm": 1.6953125, + "learning_rate": 1.2409240653519564e-05, + "loss": 0.3263, + "step": 15236 + }, + { + "epoch": 0.6690889930513628, + "grad_norm": 1.5625, + "learning_rate": 1.2403264297279849e-05, + "loss": 0.3191, + "step": 15238 + }, + { + "epoch": 0.6691768115305663, + "grad_norm": 1.640625, + "learning_rate": 1.239728890570118e-05, + "loss": 0.3162, + "step": 15240 + }, + { + "epoch": 0.6692646300097698, + "grad_norm": 1.640625, + "learning_rate": 1.239131447924114e-05, + "loss": 0.3191, + "step": 15242 + }, + { + "epoch": 0.6693524484889732, + "grad_norm": 1.5703125, + "learning_rate": 1.238534101835727e-05, + "loss": 0.3004, + "step": 15244 + }, + { + "epoch": 0.6694402669681768, + "grad_norm": 1.859375, + "learning_rate": 1.2379368523507007e-05, + "loss": 0.3416, + "step": 15246 + }, + { + "epoch": 0.6695280854473803, + "grad_norm": 1.6640625, + "learning_rate": 1.2373396995147729e-05, + "loss": 0.3377, + "step": 15248 + }, + { + "epoch": 0.6696159039265838, + "grad_norm": 1.5703125, + "learning_rate": 1.2367426433736737e-05, + "loss": 0.3255, + "step": 15250 + }, + { + "epoch": 0.6697037224057872, + "grad_norm": 1.7890625, + "learning_rate": 1.2361456839731245e-05, + "loss": 0.3023, + "step": 15252 + }, + { + "epoch": 0.6697915408849907, + "grad_norm": 1.75, + "learning_rate": 1.2355488213588429e-05, + "loss": 0.3236, + "step": 15254 + }, + { + "epoch": 0.6698793593641942, + "grad_norm": 1.65625, + "learning_rate": 1.2349520555765348e-05, + "loss": 0.3063, + "step": 15256 + }, + { + "epoch": 0.6699671778433977, + "grad_norm": 1.6953125, + "learning_rate": 1.2343553866719024e-05, + "loss": 0.3225, + "step": 15258 + }, + { + "epoch": 0.6700549963226012, + "grad_norm": 1.6328125, + "learning_rate": 1.2337588146906378e-05, + "loss": 0.2899, + "step": 15260 + }, + { + "epoch": 0.6701428148018047, + "grad_norm": 1.5703125, + "learning_rate": 1.2331623396784258e-05, + "loss": 0.3267, + "step": 15262 + }, + { + "epoch": 0.6702306332810082, + "grad_norm": 1.5546875, + "learning_rate": 1.2325659616809466e-05, + "loss": 0.299, + "step": 15264 + }, + { + "epoch": 0.6703184517602117, + "grad_norm": 1.609375, + "learning_rate": 1.2319696807438686e-05, + "loss": 0.3019, + "step": 15266 + }, + { + "epoch": 0.6704062702394151, + "grad_norm": 1.5703125, + "learning_rate": 1.2313734969128576e-05, + "loss": 0.2836, + "step": 15268 + }, + { + "epoch": 0.6704940887186186, + "grad_norm": 1.5703125, + "learning_rate": 1.2307774102335685e-05, + "loss": 0.3468, + "step": 15270 + }, + { + "epoch": 0.6705819071978221, + "grad_norm": 1.5390625, + "learning_rate": 1.2301814207516482e-05, + "loss": 0.3267, + "step": 15272 + }, + { + "epoch": 0.6706697256770255, + "grad_norm": 1.6171875, + "learning_rate": 1.2295855285127403e-05, + "loss": 0.3108, + "step": 15274 + }, + { + "epoch": 0.6707575441562291, + "grad_norm": 1.7109375, + "learning_rate": 1.2289897335624761e-05, + "loss": 0.3076, + "step": 15276 + }, + { + "epoch": 0.6708453626354326, + "grad_norm": 1.578125, + "learning_rate": 1.2283940359464849e-05, + "loss": 0.3156, + "step": 15278 + }, + { + "epoch": 0.6709331811146361, + "grad_norm": 1.625, + "learning_rate": 1.2277984357103811e-05, + "loss": 0.3288, + "step": 15280 + }, + { + "epoch": 0.6710209995938395, + "grad_norm": 1.640625, + "learning_rate": 1.2272029328997791e-05, + "loss": 0.3448, + "step": 15282 + }, + { + "epoch": 0.671108818073043, + "grad_norm": 1.6640625, + "learning_rate": 1.2266075275602818e-05, + "loss": 0.3192, + "step": 15284 + }, + { + "epoch": 0.6711966365522465, + "grad_norm": 1.5625, + "learning_rate": 1.226012219737484e-05, + "loss": 0.3315, + "step": 15286 + }, + { + "epoch": 0.67128445503145, + "grad_norm": 1.5703125, + "learning_rate": 1.2254170094769771e-05, + "loss": 0.3169, + "step": 15288 + }, + { + "epoch": 0.6713722735106534, + "grad_norm": 1.640625, + "learning_rate": 1.22482189682434e-05, + "loss": 0.3263, + "step": 15290 + }, + { + "epoch": 0.671460091989857, + "grad_norm": 1.53125, + "learning_rate": 1.2242268818251486e-05, + "loss": 0.3187, + "step": 15292 + }, + { + "epoch": 0.6715479104690605, + "grad_norm": 1.84375, + "learning_rate": 1.2236319645249677e-05, + "loss": 0.3476, + "step": 15294 + }, + { + "epoch": 0.671635728948264, + "grad_norm": 1.6484375, + "learning_rate": 1.2230371449693578e-05, + "loss": 0.3365, + "step": 15296 + }, + { + "epoch": 0.6717235474274674, + "grad_norm": 1.5859375, + "learning_rate": 1.2224424232038692e-05, + "loss": 0.299, + "step": 15298 + }, + { + "epoch": 0.6718113659066709, + "grad_norm": 1.625, + "learning_rate": 1.2218477992740455e-05, + "loss": 0.3148, + "step": 15300 + }, + { + "epoch": 0.6718991843858744, + "grad_norm": 1.6015625, + "learning_rate": 1.2212532732254245e-05, + "loss": 0.3289, + "step": 15302 + }, + { + "epoch": 0.6719870028650778, + "grad_norm": 1.7265625, + "learning_rate": 1.2206588451035334e-05, + "loss": 0.3381, + "step": 15304 + }, + { + "epoch": 0.6720748213442814, + "grad_norm": 1.546875, + "learning_rate": 1.2200645149538955e-05, + "loss": 0.3261, + "step": 15306 + }, + { + "epoch": 0.6721626398234849, + "grad_norm": 1.8125, + "learning_rate": 1.2194702828220241e-05, + "loss": 0.3341, + "step": 15308 + }, + { + "epoch": 0.6722504583026884, + "grad_norm": 1.5546875, + "learning_rate": 1.2188761487534254e-05, + "loss": 0.3285, + "step": 15310 + }, + { + "epoch": 0.6723382767818918, + "grad_norm": 1.71875, + "learning_rate": 1.218282112793598e-05, + "loss": 0.3094, + "step": 15312 + }, + { + "epoch": 0.6724260952610953, + "grad_norm": 1.5625, + "learning_rate": 1.2176881749880328e-05, + "loss": 0.3299, + "step": 15314 + }, + { + "epoch": 0.6725139137402988, + "grad_norm": 1.5390625, + "learning_rate": 1.2170943353822155e-05, + "loss": 0.3249, + "step": 15316 + }, + { + "epoch": 0.6726017322195023, + "grad_norm": 1.625, + "learning_rate": 1.2165005940216209e-05, + "loss": 0.3287, + "step": 15318 + }, + { + "epoch": 0.6726895506987057, + "grad_norm": 1.6875, + "learning_rate": 1.2159069509517193e-05, + "loss": 0.3378, + "step": 15320 + }, + { + "epoch": 0.6727773691779093, + "grad_norm": 1.515625, + "learning_rate": 1.2153134062179711e-05, + "loss": 0.3203, + "step": 15322 + }, + { + "epoch": 0.6728651876571128, + "grad_norm": 1.4921875, + "learning_rate": 1.2147199598658293e-05, + "loss": 0.3337, + "step": 15324 + }, + { + "epoch": 0.6729530061363163, + "grad_norm": 1.609375, + "learning_rate": 1.2141266119407421e-05, + "loss": 0.3331, + "step": 15326 + }, + { + "epoch": 0.6730408246155197, + "grad_norm": 1.65625, + "learning_rate": 1.2135333624881463e-05, + "loss": 0.3097, + "step": 15328 + }, + { + "epoch": 0.6731286430947232, + "grad_norm": 1.6484375, + "learning_rate": 1.2129402115534747e-05, + "loss": 0.3504, + "step": 15330 + }, + { + "epoch": 0.6732164615739267, + "grad_norm": 1.75, + "learning_rate": 1.2123471591821503e-05, + "loss": 0.337, + "step": 15332 + }, + { + "epoch": 0.6733042800531301, + "grad_norm": 1.6796875, + "learning_rate": 1.2117542054195882e-05, + "loss": 0.2912, + "step": 15334 + }, + { + "epoch": 0.6733920985323336, + "grad_norm": 1.578125, + "learning_rate": 1.2111613503111987e-05, + "loss": 0.3178, + "step": 15336 + }, + { + "epoch": 0.6734799170115372, + "grad_norm": 1.6796875, + "learning_rate": 1.2105685939023811e-05, + "loss": 0.3123, + "step": 15338 + }, + { + "epoch": 0.6735677354907407, + "grad_norm": 1.5546875, + "learning_rate": 1.2099759362385318e-05, + "loss": 0.3066, + "step": 15340 + }, + { + "epoch": 0.6736555539699441, + "grad_norm": 1.6328125, + "learning_rate": 1.2093833773650324e-05, + "loss": 0.3333, + "step": 15342 + }, + { + "epoch": 0.6737433724491476, + "grad_norm": 1.5625, + "learning_rate": 1.2087909173272644e-05, + "loss": 0.3225, + "step": 15344 + }, + { + "epoch": 0.6738311909283511, + "grad_norm": 1.59375, + "learning_rate": 1.2081985561705975e-05, + "loss": 0.3382, + "step": 15346 + }, + { + "epoch": 0.6739190094075546, + "grad_norm": 1.6796875, + "learning_rate": 1.2076062939403937e-05, + "loss": 0.2952, + "step": 15348 + }, + { + "epoch": 0.674006827886758, + "grad_norm": 1.6015625, + "learning_rate": 1.2070141306820106e-05, + "loss": 0.3388, + "step": 15350 + }, + { + "epoch": 0.6740946463659615, + "grad_norm": 1.5703125, + "learning_rate": 1.2064220664407946e-05, + "loss": 0.3128, + "step": 15352 + }, + { + "epoch": 0.6741824648451651, + "grad_norm": 1.5625, + "learning_rate": 1.205830101262088e-05, + "loss": 0.3194, + "step": 15354 + }, + { + "epoch": 0.6742702833243686, + "grad_norm": 1.546875, + "learning_rate": 1.2052382351912219e-05, + "loss": 0.3389, + "step": 15356 + }, + { + "epoch": 0.674358101803572, + "grad_norm": 1.5546875, + "learning_rate": 1.2046464682735217e-05, + "loss": 0.3324, + "step": 15358 + }, + { + "epoch": 0.6744459202827755, + "grad_norm": 1.5234375, + "learning_rate": 1.2040548005543062e-05, + "loss": 0.327, + "step": 15360 + }, + { + "epoch": 0.674533738761979, + "grad_norm": 1.5859375, + "learning_rate": 1.2034632320788839e-05, + "loss": 0.3616, + "step": 15362 + }, + { + "epoch": 0.6746215572411824, + "grad_norm": 1.6171875, + "learning_rate": 1.2028717628925587e-05, + "loss": 0.3157, + "step": 15364 + }, + { + "epoch": 0.6747093757203859, + "grad_norm": 1.640625, + "learning_rate": 1.2022803930406242e-05, + "loss": 0.3295, + "step": 15366 + }, + { + "epoch": 0.6747971941995895, + "grad_norm": 1.8046875, + "learning_rate": 1.2016891225683691e-05, + "loss": 0.3492, + "step": 15368 + }, + { + "epoch": 0.674885012678793, + "grad_norm": 1.7109375, + "learning_rate": 1.2010979515210724e-05, + "loss": 0.3435, + "step": 15370 + }, + { + "epoch": 0.6749728311579964, + "grad_norm": 1.625, + "learning_rate": 1.2005068799440059e-05, + "loss": 0.3334, + "step": 15372 + }, + { + "epoch": 0.6750606496371999, + "grad_norm": 1.734375, + "learning_rate": 1.1999159078824337e-05, + "loss": 0.3308, + "step": 15374 + }, + { + "epoch": 0.6751484681164034, + "grad_norm": 1.765625, + "learning_rate": 1.1993250353816124e-05, + "loss": 0.3166, + "step": 15376 + }, + { + "epoch": 0.6752362865956069, + "grad_norm": 1.796875, + "learning_rate": 1.1987342624867926e-05, + "loss": 0.3151, + "step": 15378 + }, + { + "epoch": 0.6753241050748103, + "grad_norm": 2.015625, + "learning_rate": 1.1981435892432139e-05, + "loss": 0.3483, + "step": 15380 + }, + { + "epoch": 0.6754119235540138, + "grad_norm": 1.5234375, + "learning_rate": 1.1975530156961119e-05, + "loss": 0.3139, + "step": 15382 + }, + { + "epoch": 0.6754997420332174, + "grad_norm": 1.5, + "learning_rate": 1.1969625418907123e-05, + "loss": 0.3511, + "step": 15384 + }, + { + "epoch": 0.6755875605124209, + "grad_norm": 1.71875, + "learning_rate": 1.1963721678722328e-05, + "loss": 0.332, + "step": 15386 + }, + { + "epoch": 0.6756753789916243, + "grad_norm": 1.875, + "learning_rate": 1.1957818936858862e-05, + "loss": 0.3403, + "step": 15388 + }, + { + "epoch": 0.6757631974708278, + "grad_norm": 1.5078125, + "learning_rate": 1.1951917193768736e-05, + "loss": 0.3234, + "step": 15390 + }, + { + "epoch": 0.6758510159500313, + "grad_norm": 1.765625, + "learning_rate": 1.194601644990393e-05, + "loss": 0.3602, + "step": 15392 + }, + { + "epoch": 0.6759388344292347, + "grad_norm": 1.609375, + "learning_rate": 1.1940116705716315e-05, + "loss": 0.3491, + "step": 15394 + }, + { + "epoch": 0.6760266529084382, + "grad_norm": 1.640625, + "learning_rate": 1.1934217961657682e-05, + "loss": 0.335, + "step": 15396 + }, + { + "epoch": 0.6761144713876417, + "grad_norm": 1.640625, + "learning_rate": 1.1928320218179779e-05, + "loss": 0.3248, + "step": 15398 + }, + { + "epoch": 0.6762022898668453, + "grad_norm": 1.6953125, + "learning_rate": 1.1922423475734248e-05, + "loss": 0.3105, + "step": 15400 + }, + { + "epoch": 0.6762901083460487, + "grad_norm": 1.6953125, + "learning_rate": 1.1916527734772661e-05, + "loss": 0.3222, + "step": 15402 + }, + { + "epoch": 0.6763779268252522, + "grad_norm": 1.53125, + "learning_rate": 1.191063299574651e-05, + "loss": 0.3442, + "step": 15404 + }, + { + "epoch": 0.6764657453044557, + "grad_norm": 1.703125, + "learning_rate": 1.1904739259107228e-05, + "loss": 0.3104, + "step": 15406 + }, + { + "epoch": 0.6765535637836592, + "grad_norm": 1.5234375, + "learning_rate": 1.1898846525306154e-05, + "loss": 0.3163, + "step": 15408 + }, + { + "epoch": 0.6766413822628626, + "grad_norm": 1.5546875, + "learning_rate": 1.1892954794794545e-05, + "loss": 0.3274, + "step": 15410 + }, + { + "epoch": 0.6767292007420661, + "grad_norm": 1.5390625, + "learning_rate": 1.1887064068023607e-05, + "loss": 0.336, + "step": 15412 + }, + { + "epoch": 0.6768170192212697, + "grad_norm": 1.640625, + "learning_rate": 1.1881174345444437e-05, + "loss": 0.3256, + "step": 15414 + }, + { + "epoch": 0.6769048377004732, + "grad_norm": 1.609375, + "learning_rate": 1.187528562750809e-05, + "loss": 0.3031, + "step": 15416 + }, + { + "epoch": 0.6769926561796766, + "grad_norm": 1.6875, + "learning_rate": 1.1869397914665516e-05, + "loss": 0.355, + "step": 15418 + }, + { + "epoch": 0.6770804746588801, + "grad_norm": 1.5625, + "learning_rate": 1.186351120736759e-05, + "loss": 0.3176, + "step": 15420 + }, + { + "epoch": 0.6771682931380836, + "grad_norm": 1.625, + "learning_rate": 1.1857625506065135e-05, + "loss": 0.3048, + "step": 15422 + }, + { + "epoch": 0.677256111617287, + "grad_norm": 1.625, + "learning_rate": 1.1851740811208856e-05, + "loss": 0.3217, + "step": 15424 + }, + { + "epoch": 0.6773439300964905, + "grad_norm": 1.75, + "learning_rate": 1.1845857123249427e-05, + "loss": 0.3281, + "step": 15426 + }, + { + "epoch": 0.677431748575694, + "grad_norm": 1.5390625, + "learning_rate": 1.1839974442637406e-05, + "loss": 0.3514, + "step": 15428 + }, + { + "epoch": 0.6775195670548976, + "grad_norm": 1.4921875, + "learning_rate": 1.1834092769823304e-05, + "loss": 0.3142, + "step": 15430 + }, + { + "epoch": 0.677607385534101, + "grad_norm": 1.6484375, + "learning_rate": 1.1828212105257536e-05, + "loss": 0.3072, + "step": 15432 + }, + { + "epoch": 0.6776952040133045, + "grad_norm": 1.6484375, + "learning_rate": 1.1822332449390441e-05, + "loss": 0.3315, + "step": 15434 + }, + { + "epoch": 0.677783022492508, + "grad_norm": 1.515625, + "learning_rate": 1.1816453802672286e-05, + "loss": 0.2974, + "step": 15436 + }, + { + "epoch": 0.6778708409717115, + "grad_norm": 1.609375, + "learning_rate": 1.181057616555325e-05, + "loss": 0.3113, + "step": 15438 + }, + { + "epoch": 0.6779586594509149, + "grad_norm": 1.5625, + "learning_rate": 1.1804699538483462e-05, + "loss": 0.3069, + "step": 15440 + }, + { + "epoch": 0.6780464779301184, + "grad_norm": 1.6171875, + "learning_rate": 1.1798823921912937e-05, + "loss": 0.3202, + "step": 15442 + }, + { + "epoch": 0.6781342964093219, + "grad_norm": 1.671875, + "learning_rate": 1.1792949316291651e-05, + "loss": 0.3149, + "step": 15444 + }, + { + "epoch": 0.6782221148885255, + "grad_norm": 1.578125, + "learning_rate": 1.1787075722069471e-05, + "loss": 0.3165, + "step": 15446 + }, + { + "epoch": 0.6783099333677289, + "grad_norm": 1.6640625, + "learning_rate": 1.1781203139696192e-05, + "loss": 0.348, + "step": 15448 + }, + { + "epoch": 0.6783977518469324, + "grad_norm": 1.5546875, + "learning_rate": 1.1775331569621553e-05, + "loss": 0.3292, + "step": 15450 + }, + { + "epoch": 0.6784855703261359, + "grad_norm": 1.5625, + "learning_rate": 1.1769461012295183e-05, + "loss": 0.3128, + "step": 15452 + }, + { + "epoch": 0.6785733888053394, + "grad_norm": 1.65625, + "learning_rate": 1.1763591468166671e-05, + "loss": 0.3219, + "step": 15454 + }, + { + "epoch": 0.6786612072845428, + "grad_norm": 1.78125, + "learning_rate": 1.1757722937685498e-05, + "loss": 0.337, + "step": 15456 + }, + { + "epoch": 0.6787490257637463, + "grad_norm": 1.6015625, + "learning_rate": 1.1751855421301064e-05, + "loss": 0.3249, + "step": 15458 + }, + { + "epoch": 0.6788368442429499, + "grad_norm": 1.640625, + "learning_rate": 1.174598891946273e-05, + "loss": 0.31, + "step": 15460 + }, + { + "epoch": 0.6789246627221533, + "grad_norm": 1.6640625, + "learning_rate": 1.1740123432619741e-05, + "loss": 0.3139, + "step": 15462 + }, + { + "epoch": 0.6790124812013568, + "grad_norm": 1.5859375, + "learning_rate": 1.1734258961221278e-05, + "loss": 0.3365, + "step": 15464 + }, + { + "epoch": 0.6791002996805603, + "grad_norm": 1.6171875, + "learning_rate": 1.1728395505716433e-05, + "loss": 0.3293, + "step": 15466 + }, + { + "epoch": 0.6791881181597638, + "grad_norm": 1.5546875, + "learning_rate": 1.1722533066554254e-05, + "loss": 0.336, + "step": 15468 + }, + { + "epoch": 0.6792759366389672, + "grad_norm": 1.4453125, + "learning_rate": 1.1716671644183674e-05, + "loss": 0.3193, + "step": 15470 + }, + { + "epoch": 0.6793637551181707, + "grad_norm": 1.6640625, + "learning_rate": 1.1710811239053553e-05, + "loss": 0.3072, + "step": 15472 + }, + { + "epoch": 0.6794515735973742, + "grad_norm": 1.59375, + "learning_rate": 1.1704951851612705e-05, + "loss": 0.3524, + "step": 15474 + }, + { + "epoch": 0.6795393920765778, + "grad_norm": 1.5546875, + "learning_rate": 1.169909348230982e-05, + "loss": 0.323, + "step": 15476 + }, + { + "epoch": 0.6796272105557812, + "grad_norm": 1.4921875, + "learning_rate": 1.1693236131593555e-05, + "loss": 0.3359, + "step": 15478 + }, + { + "epoch": 0.6797150290349847, + "grad_norm": 1.5703125, + "learning_rate": 1.1687379799912457e-05, + "loss": 0.3165, + "step": 15480 + }, + { + "epoch": 0.6798028475141882, + "grad_norm": 1.6171875, + "learning_rate": 1.1681524487714995e-05, + "loss": 0.3152, + "step": 15482 + }, + { + "epoch": 0.6798906659933917, + "grad_norm": 1.6640625, + "learning_rate": 1.167567019544959e-05, + "loss": 0.3278, + "step": 15484 + }, + { + "epoch": 0.6799784844725951, + "grad_norm": 1.6015625, + "learning_rate": 1.1669816923564544e-05, + "loss": 0.3325, + "step": 15486 + }, + { + "epoch": 0.6800663029517986, + "grad_norm": 1.546875, + "learning_rate": 1.1663964672508126e-05, + "loss": 0.327, + "step": 15488 + }, + { + "epoch": 0.6801541214310021, + "grad_norm": 1.5546875, + "learning_rate": 1.1658113442728489e-05, + "loss": 0.3238, + "step": 15490 + }, + { + "epoch": 0.6802419399102057, + "grad_norm": 1.453125, + "learning_rate": 1.1652263234673725e-05, + "loss": 0.3403, + "step": 15492 + }, + { + "epoch": 0.6803297583894091, + "grad_norm": 1.5625, + "learning_rate": 1.164641404879183e-05, + "loss": 0.321, + "step": 15494 + }, + { + "epoch": 0.6804175768686126, + "grad_norm": 1.5234375, + "learning_rate": 1.1640565885530758e-05, + "loss": 0.3183, + "step": 15496 + }, + { + "epoch": 0.6805053953478161, + "grad_norm": 1.734375, + "learning_rate": 1.1634718745338353e-05, + "loss": 0.3223, + "step": 15498 + }, + { + "epoch": 0.6805932138270195, + "grad_norm": 1.625, + "learning_rate": 1.1628872628662381e-05, + "loss": 0.3443, + "step": 15500 + }, + { + "epoch": 0.680681032306223, + "grad_norm": 1.546875, + "learning_rate": 1.1623027535950559e-05, + "loss": 0.3374, + "step": 15502 + }, + { + "epoch": 0.6807688507854265, + "grad_norm": 1.5625, + "learning_rate": 1.1617183467650483e-05, + "loss": 0.33, + "step": 15504 + }, + { + "epoch": 0.68085666926463, + "grad_norm": 1.53125, + "learning_rate": 1.1611340424209715e-05, + "loss": 0.3205, + "step": 15506 + }, + { + "epoch": 0.6809444877438335, + "grad_norm": 1.75, + "learning_rate": 1.160549840607571e-05, + "loss": 0.3222, + "step": 15508 + }, + { + "epoch": 0.681032306223037, + "grad_norm": 1.6875, + "learning_rate": 1.1599657413695836e-05, + "loss": 0.362, + "step": 15510 + }, + { + "epoch": 0.6811201247022405, + "grad_norm": 1.6875, + "learning_rate": 1.1593817447517419e-05, + "loss": 0.3281, + "step": 15512 + }, + { + "epoch": 0.681207943181444, + "grad_norm": 1.4296875, + "learning_rate": 1.1587978507987667e-05, + "loss": 0.2968, + "step": 15514 + }, + { + "epoch": 0.6812957616606474, + "grad_norm": 1.5703125, + "learning_rate": 1.1582140595553746e-05, + "loss": 0.3268, + "step": 15516 + }, + { + "epoch": 0.6813835801398509, + "grad_norm": 1.6015625, + "learning_rate": 1.157630371066271e-05, + "loss": 0.312, + "step": 15518 + }, + { + "epoch": 0.6814713986190544, + "grad_norm": 1.65625, + "learning_rate": 1.1570467853761552e-05, + "loss": 0.3154, + "step": 15520 + }, + { + "epoch": 0.681559217098258, + "grad_norm": 1.609375, + "learning_rate": 1.156463302529719e-05, + "loss": 0.2936, + "step": 15522 + }, + { + "epoch": 0.6816470355774614, + "grad_norm": 1.4609375, + "learning_rate": 1.1558799225716451e-05, + "loss": 0.3225, + "step": 15524 + }, + { + "epoch": 0.6817348540566649, + "grad_norm": 1.65625, + "learning_rate": 1.155296645546609e-05, + "loss": 0.3127, + "step": 15526 + }, + { + "epoch": 0.6818226725358684, + "grad_norm": 1.5625, + "learning_rate": 1.1547134714992772e-05, + "loss": 0.3079, + "step": 15528 + }, + { + "epoch": 0.6819104910150718, + "grad_norm": 1.6796875, + "learning_rate": 1.1541304004743112e-05, + "loss": 0.3163, + "step": 15530 + }, + { + "epoch": 0.6819983094942753, + "grad_norm": 1.6171875, + "learning_rate": 1.1535474325163618e-05, + "loss": 0.2959, + "step": 15532 + }, + { + "epoch": 0.6820861279734788, + "grad_norm": 1.703125, + "learning_rate": 1.1529645676700717e-05, + "loss": 0.3165, + "step": 15534 + }, + { + "epoch": 0.6821739464526823, + "grad_norm": 1.53125, + "learning_rate": 1.1523818059800793e-05, + "loss": 0.3352, + "step": 15536 + }, + { + "epoch": 0.6822617649318858, + "grad_norm": 1.6328125, + "learning_rate": 1.1517991474910097e-05, + "loss": 0.321, + "step": 15538 + }, + { + "epoch": 0.6823495834110893, + "grad_norm": 1.5703125, + "learning_rate": 1.1512165922474857e-05, + "loss": 0.3393, + "step": 15540 + }, + { + "epoch": 0.6824374018902928, + "grad_norm": 1.6171875, + "learning_rate": 1.1506341402941187e-05, + "loss": 0.3403, + "step": 15542 + }, + { + "epoch": 0.6825252203694963, + "grad_norm": 1.59375, + "learning_rate": 1.1500517916755115e-05, + "loss": 0.316, + "step": 15544 + }, + { + "epoch": 0.6826130388486997, + "grad_norm": 1.578125, + "learning_rate": 1.1494695464362627e-05, + "loss": 0.3239, + "step": 15546 + }, + { + "epoch": 0.6827008573279032, + "grad_norm": 1.625, + "learning_rate": 1.1488874046209588e-05, + "loss": 0.3235, + "step": 15548 + }, + { + "epoch": 0.6827886758071067, + "grad_norm": 1.5078125, + "learning_rate": 1.1483053662741822e-05, + "loss": 0.3239, + "step": 15550 + }, + { + "epoch": 0.6828764942863101, + "grad_norm": 1.6171875, + "learning_rate": 1.1477234314405048e-05, + "loss": 0.304, + "step": 15552 + }, + { + "epoch": 0.6829643127655137, + "grad_norm": 1.6640625, + "learning_rate": 1.1471416001644911e-05, + "loss": 0.3451, + "step": 15554 + }, + { + "epoch": 0.6830521312447172, + "grad_norm": 1.5625, + "learning_rate": 1.146559872490697e-05, + "loss": 0.2889, + "step": 15556 + }, + { + "epoch": 0.6831399497239207, + "grad_norm": 1.765625, + "learning_rate": 1.1459782484636734e-05, + "loss": 0.3077, + "step": 15558 + }, + { + "epoch": 0.6832277682031241, + "grad_norm": 1.625, + "learning_rate": 1.1453967281279601e-05, + "loss": 0.346, + "step": 15560 + }, + { + "epoch": 0.6833155866823276, + "grad_norm": 1.578125, + "learning_rate": 1.144815311528089e-05, + "loss": 0.3292, + "step": 15562 + }, + { + "epoch": 0.6834034051615311, + "grad_norm": 1.578125, + "learning_rate": 1.1442339987085873e-05, + "loss": 0.3335, + "step": 15564 + }, + { + "epoch": 0.6834912236407346, + "grad_norm": 1.5546875, + "learning_rate": 1.1436527897139698e-05, + "loss": 0.2855, + "step": 15566 + }, + { + "epoch": 0.6835790421199381, + "grad_norm": 1.6171875, + "learning_rate": 1.1430716845887478e-05, + "loss": 0.322, + "step": 15568 + }, + { + "epoch": 0.6836668605991416, + "grad_norm": 1.5859375, + "learning_rate": 1.1424906833774218e-05, + "loss": 0.297, + "step": 15570 + }, + { + "epoch": 0.6837546790783451, + "grad_norm": 1.578125, + "learning_rate": 1.1419097861244834e-05, + "loss": 0.299, + "step": 15572 + }, + { + "epoch": 0.6838424975575486, + "grad_norm": 1.625, + "learning_rate": 1.1413289928744203e-05, + "loss": 0.3015, + "step": 15574 + }, + { + "epoch": 0.683930316036752, + "grad_norm": 1.640625, + "learning_rate": 1.1407483036717076e-05, + "loss": 0.3119, + "step": 15576 + }, + { + "epoch": 0.6840181345159555, + "grad_norm": 1.4453125, + "learning_rate": 1.1401677185608165e-05, + "loss": 0.3085, + "step": 15578 + }, + { + "epoch": 0.684105952995159, + "grad_norm": 1.625, + "learning_rate": 1.1395872375862074e-05, + "loss": 0.3267, + "step": 15580 + }, + { + "epoch": 0.6841937714743624, + "grad_norm": 1.8671875, + "learning_rate": 1.139006860792333e-05, + "loss": 0.3639, + "step": 15582 + }, + { + "epoch": 0.684281589953566, + "grad_norm": 1.6875, + "learning_rate": 1.1384265882236414e-05, + "loss": 0.3307, + "step": 15584 + }, + { + "epoch": 0.6843694084327695, + "grad_norm": 1.6796875, + "learning_rate": 1.1378464199245659e-05, + "loss": 0.3185, + "step": 15586 + }, + { + "epoch": 0.684457226911973, + "grad_norm": 1.6875, + "learning_rate": 1.137266355939539e-05, + "loss": 0.2997, + "step": 15588 + }, + { + "epoch": 0.6845450453911764, + "grad_norm": 1.5234375, + "learning_rate": 1.1366863963129805e-05, + "loss": 0.3336, + "step": 15590 + }, + { + "epoch": 0.6846328638703799, + "grad_norm": 1.5390625, + "learning_rate": 1.1361065410893057e-05, + "loss": 0.2917, + "step": 15592 + }, + { + "epoch": 0.6847206823495834, + "grad_norm": 1.59375, + "learning_rate": 1.1355267903129187e-05, + "loss": 0.3479, + "step": 15594 + }, + { + "epoch": 0.6848085008287869, + "grad_norm": 1.53125, + "learning_rate": 1.1349471440282164e-05, + "loss": 0.2962, + "step": 15596 + }, + { + "epoch": 0.6848963193079903, + "grad_norm": 1.59375, + "learning_rate": 1.1343676022795898e-05, + "loss": 0.3089, + "step": 15598 + }, + { + "epoch": 0.6849841377871939, + "grad_norm": 1.625, + "learning_rate": 1.1337881651114188e-05, + "loss": 0.3061, + "step": 15600 + }, + { + "epoch": 0.6850719562663974, + "grad_norm": 1.6484375, + "learning_rate": 1.1332088325680783e-05, + "loss": 0.3318, + "step": 15602 + }, + { + "epoch": 0.6851597747456009, + "grad_norm": 1.546875, + "learning_rate": 1.1326296046939333e-05, + "loss": 0.2978, + "step": 15604 + }, + { + "epoch": 0.6852475932248043, + "grad_norm": 1.4921875, + "learning_rate": 1.1320504815333399e-05, + "loss": 0.3091, + "step": 15606 + }, + { + "epoch": 0.6853354117040078, + "grad_norm": 1.671875, + "learning_rate": 1.1314714631306495e-05, + "loss": 0.3161, + "step": 15608 + }, + { + "epoch": 0.6854232301832113, + "grad_norm": 1.6171875, + "learning_rate": 1.1308925495302017e-05, + "loss": 0.3394, + "step": 15610 + }, + { + "epoch": 0.6855110486624147, + "grad_norm": 1.5703125, + "learning_rate": 1.1303137407763314e-05, + "loss": 0.3152, + "step": 15612 + }, + { + "epoch": 0.6855988671416183, + "grad_norm": 1.671875, + "learning_rate": 1.1297350369133632e-05, + "loss": 0.3281, + "step": 15614 + }, + { + "epoch": 0.6856866856208218, + "grad_norm": 1.6015625, + "learning_rate": 1.1291564379856145e-05, + "loss": 0.3239, + "step": 15616 + }, + { + "epoch": 0.6857745041000253, + "grad_norm": 1.6875, + "learning_rate": 1.1285779440373943e-05, + "loss": 0.3456, + "step": 15618 + }, + { + "epoch": 0.6858623225792287, + "grad_norm": 1.5625, + "learning_rate": 1.1279995551130029e-05, + "loss": 0.3397, + "step": 15620 + }, + { + "epoch": 0.6859501410584322, + "grad_norm": 1.5, + "learning_rate": 1.1274212712567354e-05, + "loss": 0.3237, + "step": 15622 + }, + { + "epoch": 0.6860379595376357, + "grad_norm": 1.5, + "learning_rate": 1.126843092512875e-05, + "loss": 0.3207, + "step": 15624 + }, + { + "epoch": 0.6861257780168392, + "grad_norm": 1.5546875, + "learning_rate": 1.126265018925701e-05, + "loss": 0.3607, + "step": 15626 + }, + { + "epoch": 0.6862135964960426, + "grad_norm": 1.5234375, + "learning_rate": 1.1256870505394798e-05, + "loss": 0.2892, + "step": 15628 + }, + { + "epoch": 0.6863014149752462, + "grad_norm": 1.515625, + "learning_rate": 1.1251091873984748e-05, + "loss": 0.326, + "step": 15630 + }, + { + "epoch": 0.6863892334544497, + "grad_norm": 1.59375, + "learning_rate": 1.1245314295469379e-05, + "loss": 0.3374, + "step": 15632 + }, + { + "epoch": 0.6864770519336532, + "grad_norm": 1.5078125, + "learning_rate": 1.1239537770291128e-05, + "loss": 0.3499, + "step": 15634 + }, + { + "epoch": 0.6865648704128566, + "grad_norm": 1.53125, + "learning_rate": 1.1233762298892384e-05, + "loss": 0.3229, + "step": 15636 + }, + { + "epoch": 0.6866526888920601, + "grad_norm": 1.5546875, + "learning_rate": 1.1227987881715412e-05, + "loss": 0.3101, + "step": 15638 + }, + { + "epoch": 0.6867405073712636, + "grad_norm": 1.609375, + "learning_rate": 1.1222214519202439e-05, + "loss": 0.326, + "step": 15640 + }, + { + "epoch": 0.686828325850467, + "grad_norm": 1.6328125, + "learning_rate": 1.1216442211795582e-05, + "loss": 0.3178, + "step": 15642 + }, + { + "epoch": 0.6869161443296705, + "grad_norm": 1.5234375, + "learning_rate": 1.1210670959936875e-05, + "loss": 0.2925, + "step": 15644 + }, + { + "epoch": 0.6870039628088741, + "grad_norm": 1.46875, + "learning_rate": 1.120490076406831e-05, + "loss": 0.2749, + "step": 15646 + }, + { + "epoch": 0.6870917812880776, + "grad_norm": 1.59375, + "learning_rate": 1.1199131624631734e-05, + "loss": 0.313, + "step": 15648 + }, + { + "epoch": 0.687179599767281, + "grad_norm": 1.609375, + "learning_rate": 1.1193363542068974e-05, + "loss": 0.3055, + "step": 15650 + }, + { + "epoch": 0.6872674182464845, + "grad_norm": 1.484375, + "learning_rate": 1.1187596516821734e-05, + "loss": 0.3377, + "step": 15652 + }, + { + "epoch": 0.687355236725688, + "grad_norm": 1.484375, + "learning_rate": 1.1181830549331674e-05, + "loss": 0.3177, + "step": 15654 + }, + { + "epoch": 0.6874430552048915, + "grad_norm": 1.578125, + "learning_rate": 1.1176065640040342e-05, + "loss": 0.3335, + "step": 15656 + }, + { + "epoch": 0.6875308736840949, + "grad_norm": 1.5859375, + "learning_rate": 1.1170301789389209e-05, + "loss": 0.2978, + "step": 15658 + }, + { + "epoch": 0.6876186921632985, + "grad_norm": 1.6328125, + "learning_rate": 1.116453899781969e-05, + "loss": 0.3564, + "step": 15660 + }, + { + "epoch": 0.687706510642502, + "grad_norm": 1.734375, + "learning_rate": 1.115877726577308e-05, + "loss": 0.3511, + "step": 15662 + }, + { + "epoch": 0.6877943291217055, + "grad_norm": 1.53125, + "learning_rate": 1.1153016593690634e-05, + "loss": 0.3411, + "step": 15664 + }, + { + "epoch": 0.6878821476009089, + "grad_norm": 1.6953125, + "learning_rate": 1.11472569820135e-05, + "loss": 0.333, + "step": 15666 + }, + { + "epoch": 0.6879699660801124, + "grad_norm": 2.015625, + "learning_rate": 1.1141498431182735e-05, + "loss": 0.3437, + "step": 15668 + }, + { + "epoch": 0.6880577845593159, + "grad_norm": 1.578125, + "learning_rate": 1.1135740941639353e-05, + "loss": 0.3365, + "step": 15670 + }, + { + "epoch": 0.6881456030385193, + "grad_norm": 1.6640625, + "learning_rate": 1.1129984513824241e-05, + "loss": 0.3471, + "step": 15672 + }, + { + "epoch": 0.6882334215177228, + "grad_norm": 1.6328125, + "learning_rate": 1.1124229148178253e-05, + "loss": 0.3298, + "step": 15674 + }, + { + "epoch": 0.6883212399969264, + "grad_norm": 1.6875, + "learning_rate": 1.1118474845142122e-05, + "loss": 0.3135, + "step": 15676 + }, + { + "epoch": 0.6884090584761299, + "grad_norm": 1.78125, + "learning_rate": 1.1112721605156511e-05, + "loss": 0.3343, + "step": 15678 + }, + { + "epoch": 0.6884968769553333, + "grad_norm": 1.5625, + "learning_rate": 1.1106969428662011e-05, + "loss": 0.3215, + "step": 15680 + }, + { + "epoch": 0.6885846954345368, + "grad_norm": 1.5546875, + "learning_rate": 1.1101218316099115e-05, + "loss": 0.3083, + "step": 15682 + }, + { + "epoch": 0.6886725139137403, + "grad_norm": 1.546875, + "learning_rate": 1.109546826790826e-05, + "loss": 0.3814, + "step": 15684 + }, + { + "epoch": 0.6887603323929438, + "grad_norm": 1.765625, + "learning_rate": 1.108971928452977e-05, + "loss": 0.3308, + "step": 15686 + }, + { + "epoch": 0.6888481508721472, + "grad_norm": 1.5703125, + "learning_rate": 1.108397136640392e-05, + "loss": 0.3188, + "step": 15688 + }, + { + "epoch": 0.6889359693513507, + "grad_norm": 1.578125, + "learning_rate": 1.107822451397087e-05, + "loss": 0.3336, + "step": 15690 + }, + { + "epoch": 0.6890237878305543, + "grad_norm": 1.6640625, + "learning_rate": 1.1072478727670732e-05, + "loss": 0.3128, + "step": 15692 + }, + { + "epoch": 0.6891116063097578, + "grad_norm": 1.71875, + "learning_rate": 1.1066734007943514e-05, + "loss": 0.3277, + "step": 15694 + }, + { + "epoch": 0.6891994247889612, + "grad_norm": 1.546875, + "learning_rate": 1.1060990355229134e-05, + "loss": 0.3215, + "step": 15696 + }, + { + "epoch": 0.6892872432681647, + "grad_norm": 1.6328125, + "learning_rate": 1.1055247769967465e-05, + "loss": 0.3183, + "step": 15698 + }, + { + "epoch": 0.6893750617473682, + "grad_norm": 1.5078125, + "learning_rate": 1.1049506252598255e-05, + "loss": 0.3155, + "step": 15700 + }, + { + "epoch": 0.6894628802265717, + "grad_norm": 1.59375, + "learning_rate": 1.1043765803561207e-05, + "loss": 0.3152, + "step": 15702 + }, + { + "epoch": 0.6895506987057751, + "grad_norm": 1.59375, + "learning_rate": 1.1038026423295923e-05, + "loss": 0.3361, + "step": 15704 + }, + { + "epoch": 0.6896385171849786, + "grad_norm": 1.5234375, + "learning_rate": 1.103228811224191e-05, + "loss": 0.3342, + "step": 15706 + }, + { + "epoch": 0.6897263356641822, + "grad_norm": 1.5390625, + "learning_rate": 1.1026550870838643e-05, + "loss": 0.3308, + "step": 15708 + }, + { + "epoch": 0.6898141541433856, + "grad_norm": 1.59375, + "learning_rate": 1.1020814699525439e-05, + "loss": 0.3376, + "step": 15710 + }, + { + "epoch": 0.6899019726225891, + "grad_norm": 1.546875, + "learning_rate": 1.1015079598741607e-05, + "loss": 0.3261, + "step": 15712 + }, + { + "epoch": 0.6899897911017926, + "grad_norm": 1.625, + "learning_rate": 1.1009345568926321e-05, + "loss": 0.3297, + "step": 15714 + }, + { + "epoch": 0.6900776095809961, + "grad_norm": 1.53125, + "learning_rate": 1.1003612610518718e-05, + "loss": 0.3, + "step": 15716 + }, + { + "epoch": 0.6901654280601995, + "grad_norm": 1.65625, + "learning_rate": 1.0997880723957812e-05, + "loss": 0.2955, + "step": 15718 + }, + { + "epoch": 0.690253246539403, + "grad_norm": 1.5859375, + "learning_rate": 1.099214990968255e-05, + "loss": 0.3062, + "step": 15720 + }, + { + "epoch": 0.6903410650186066, + "grad_norm": 1.640625, + "learning_rate": 1.0986420168131817e-05, + "loss": 0.3384, + "step": 15722 + }, + { + "epoch": 0.6904288834978101, + "grad_norm": 1.5078125, + "learning_rate": 1.0980691499744375e-05, + "loss": 0.3259, + "step": 15724 + }, + { + "epoch": 0.6905167019770135, + "grad_norm": 1.65625, + "learning_rate": 1.0974963904958947e-05, + "loss": 0.3113, + "step": 15726 + }, + { + "epoch": 0.690604520456217, + "grad_norm": 1.5234375, + "learning_rate": 1.0969237384214146e-05, + "loss": 0.3524, + "step": 15728 + }, + { + "epoch": 0.6906923389354205, + "grad_norm": 1.546875, + "learning_rate": 1.0963511937948501e-05, + "loss": 0.2989, + "step": 15730 + }, + { + "epoch": 0.690780157414624, + "grad_norm": 1.546875, + "learning_rate": 1.0957787566600486e-05, + "loss": 0.3107, + "step": 15732 + }, + { + "epoch": 0.6908679758938274, + "grad_norm": 1.5859375, + "learning_rate": 1.0952064270608452e-05, + "loss": 0.3216, + "step": 15734 + }, + { + "epoch": 0.6909557943730309, + "grad_norm": 1.515625, + "learning_rate": 1.0946342050410719e-05, + "loss": 0.3078, + "step": 15736 + }, + { + "epoch": 0.6910436128522345, + "grad_norm": 1.515625, + "learning_rate": 1.0940620906445478e-05, + "loss": 0.323, + "step": 15738 + }, + { + "epoch": 0.691131431331438, + "grad_norm": 1.5390625, + "learning_rate": 1.0934900839150858e-05, + "loss": 0.2987, + "step": 15740 + }, + { + "epoch": 0.6912192498106414, + "grad_norm": 1.5859375, + "learning_rate": 1.0929181848964904e-05, + "loss": 0.3072, + "step": 15742 + }, + { + "epoch": 0.6913070682898449, + "grad_norm": 1.515625, + "learning_rate": 1.0923463936325568e-05, + "loss": 0.2987, + "step": 15744 + }, + { + "epoch": 0.6913948867690484, + "grad_norm": 1.6171875, + "learning_rate": 1.0917747101670744e-05, + "loss": 0.2951, + "step": 15746 + }, + { + "epoch": 0.6914827052482518, + "grad_norm": 1.484375, + "learning_rate": 1.0912031345438218e-05, + "loss": 0.3291, + "step": 15748 + }, + { + "epoch": 0.6915705237274553, + "grad_norm": 1.5390625, + "learning_rate": 1.0906316668065717e-05, + "loss": 0.3089, + "step": 15750 + }, + { + "epoch": 0.6916583422066588, + "grad_norm": 1.5625, + "learning_rate": 1.0900603069990861e-05, + "loss": 0.3456, + "step": 15752 + }, + { + "epoch": 0.6917461606858624, + "grad_norm": 1.6015625, + "learning_rate": 1.0894890551651197e-05, + "loss": 0.337, + "step": 15754 + }, + { + "epoch": 0.6918339791650658, + "grad_norm": 1.5234375, + "learning_rate": 1.0889179113484202e-05, + "loss": 0.3076, + "step": 15756 + }, + { + "epoch": 0.6919217976442693, + "grad_norm": 1.484375, + "learning_rate": 1.0883468755927245e-05, + "loss": 0.322, + "step": 15758 + }, + { + "epoch": 0.6920096161234728, + "grad_norm": 1.7109375, + "learning_rate": 1.0877759479417643e-05, + "loss": 0.3365, + "step": 15760 + }, + { + "epoch": 0.6920974346026763, + "grad_norm": 1.53125, + "learning_rate": 1.0872051284392596e-05, + "loss": 0.3051, + "step": 15762 + }, + { + "epoch": 0.6921852530818797, + "grad_norm": 1.53125, + "learning_rate": 1.0866344171289259e-05, + "loss": 0.2926, + "step": 15764 + }, + { + "epoch": 0.6922730715610832, + "grad_norm": 1.5703125, + "learning_rate": 1.0860638140544672e-05, + "loss": 0.302, + "step": 15766 + }, + { + "epoch": 0.6923608900402868, + "grad_norm": 1.6171875, + "learning_rate": 1.0854933192595806e-05, + "loss": 0.3183, + "step": 15768 + }, + { + "epoch": 0.6924487085194903, + "grad_norm": 1.6484375, + "learning_rate": 1.0849229327879548e-05, + "loss": 0.3038, + "step": 15770 + }, + { + "epoch": 0.6925365269986937, + "grad_norm": 1.6015625, + "learning_rate": 1.0843526546832688e-05, + "loss": 0.3107, + "step": 15772 + }, + { + "epoch": 0.6926243454778972, + "grad_norm": 1.4921875, + "learning_rate": 1.0837824849891972e-05, + "loss": 0.32, + "step": 15774 + }, + { + "epoch": 0.6927121639571007, + "grad_norm": 1.5234375, + "learning_rate": 1.0832124237494013e-05, + "loss": 0.3063, + "step": 15776 + }, + { + "epoch": 0.6927999824363041, + "grad_norm": 1.5546875, + "learning_rate": 1.0826424710075383e-05, + "loss": 0.3032, + "step": 15778 + }, + { + "epoch": 0.6928878009155076, + "grad_norm": 1.6171875, + "learning_rate": 1.082072626807255e-05, + "loss": 0.3558, + "step": 15780 + }, + { + "epoch": 0.6929756193947111, + "grad_norm": 1.5625, + "learning_rate": 1.0815028911921887e-05, + "loss": 0.2991, + "step": 15782 + }, + { + "epoch": 0.6930634378739147, + "grad_norm": 1.578125, + "learning_rate": 1.0809332642059721e-05, + "loss": 0.3309, + "step": 15784 + }, + { + "epoch": 0.6931512563531181, + "grad_norm": 1.7421875, + "learning_rate": 1.0803637458922253e-05, + "loss": 0.313, + "step": 15786 + }, + { + "epoch": 0.6932390748323216, + "grad_norm": 1.5546875, + "learning_rate": 1.079794336294564e-05, + "loss": 0.2886, + "step": 15788 + }, + { + "epoch": 0.6933268933115251, + "grad_norm": 1.6875, + "learning_rate": 1.079225035456593e-05, + "loss": 0.3351, + "step": 15790 + }, + { + "epoch": 0.6934147117907286, + "grad_norm": 1.6015625, + "learning_rate": 1.0786558434219082e-05, + "loss": 0.3275, + "step": 15792 + }, + { + "epoch": 0.693502530269932, + "grad_norm": 1.5859375, + "learning_rate": 1.0780867602341007e-05, + "loss": 0.2797, + "step": 15794 + }, + { + "epoch": 0.6935903487491355, + "grad_norm": 1.6796875, + "learning_rate": 1.0775177859367492e-05, + "loss": 0.319, + "step": 15796 + }, + { + "epoch": 0.693678167228339, + "grad_norm": 1.609375, + "learning_rate": 1.0769489205734276e-05, + "loss": 0.3419, + "step": 15798 + }, + { + "epoch": 0.6937659857075426, + "grad_norm": 1.453125, + "learning_rate": 1.0763801641876986e-05, + "loss": 0.2933, + "step": 15800 + }, + { + "epoch": 0.693853804186746, + "grad_norm": 1.5859375, + "learning_rate": 1.0758115168231178e-05, + "loss": 0.2888, + "step": 15802 + }, + { + "epoch": 0.6939416226659495, + "grad_norm": 1.8046875, + "learning_rate": 1.0752429785232326e-05, + "loss": 0.3036, + "step": 15804 + }, + { + "epoch": 0.694029441145153, + "grad_norm": 1.6484375, + "learning_rate": 1.0746745493315807e-05, + "loss": 0.316, + "step": 15806 + }, + { + "epoch": 0.6941172596243564, + "grad_norm": 1.6640625, + "learning_rate": 1.0741062292916943e-05, + "loss": 0.3299, + "step": 15808 + }, + { + "epoch": 0.6942050781035599, + "grad_norm": 1.703125, + "learning_rate": 1.073538018447094e-05, + "loss": 0.3164, + "step": 15810 + }, + { + "epoch": 0.6942928965827634, + "grad_norm": 1.734375, + "learning_rate": 1.072969916841295e-05, + "loss": 0.3163, + "step": 15812 + }, + { + "epoch": 0.694380715061967, + "grad_norm": 1.5390625, + "learning_rate": 1.0724019245178016e-05, + "loss": 0.3286, + "step": 15814 + }, + { + "epoch": 0.6944685335411704, + "grad_norm": 1.6328125, + "learning_rate": 1.0718340415201104e-05, + "loss": 0.3321, + "step": 15816 + }, + { + "epoch": 0.6945563520203739, + "grad_norm": 1.5078125, + "learning_rate": 1.0712662678917115e-05, + "loss": 0.3012, + "step": 15818 + }, + { + "epoch": 0.6946441704995774, + "grad_norm": 1.5, + "learning_rate": 1.0706986036760833e-05, + "loss": 0.3009, + "step": 15820 + }, + { + "epoch": 0.6947319889787809, + "grad_norm": 1.5546875, + "learning_rate": 1.0701310489166997e-05, + "loss": 0.3225, + "step": 15822 + }, + { + "epoch": 0.6948198074579843, + "grad_norm": 1.6484375, + "learning_rate": 1.0695636036570222e-05, + "loss": 0.3273, + "step": 15824 + }, + { + "epoch": 0.6949076259371878, + "grad_norm": 1.546875, + "learning_rate": 1.0689962679405077e-05, + "loss": 0.3408, + "step": 15826 + }, + { + "epoch": 0.6949954444163913, + "grad_norm": 1.5625, + "learning_rate": 1.0684290418106022e-05, + "loss": 0.2963, + "step": 15828 + }, + { + "epoch": 0.6950832628955949, + "grad_norm": 1.5859375, + "learning_rate": 1.0678619253107436e-05, + "loss": 0.3406, + "step": 15830 + }, + { + "epoch": 0.6951710813747983, + "grad_norm": 1.515625, + "learning_rate": 1.0672949184843622e-05, + "loss": 0.3546, + "step": 15832 + }, + { + "epoch": 0.6952588998540018, + "grad_norm": 1.5625, + "learning_rate": 1.0667280213748784e-05, + "loss": 0.3345, + "step": 15834 + }, + { + "epoch": 0.6953467183332053, + "grad_norm": 1.625, + "learning_rate": 1.0661612340257071e-05, + "loss": 0.3073, + "step": 15836 + }, + { + "epoch": 0.6954345368124087, + "grad_norm": 1.6171875, + "learning_rate": 1.0655945564802517e-05, + "loss": 0.3455, + "step": 15838 + }, + { + "epoch": 0.6955223552916122, + "grad_norm": 1.5234375, + "learning_rate": 1.0650279887819094e-05, + "loss": 0.3232, + "step": 15840 + }, + { + "epoch": 0.6956101737708157, + "grad_norm": 1.578125, + "learning_rate": 1.0644615309740683e-05, + "loss": 0.3087, + "step": 15842 + }, + { + "epoch": 0.6956979922500192, + "grad_norm": 1.53125, + "learning_rate": 1.063895183100106e-05, + "loss": 0.3733, + "step": 15844 + }, + { + "epoch": 0.6957858107292227, + "grad_norm": 1.6015625, + "learning_rate": 1.0633289452033957e-05, + "loss": 0.3111, + "step": 15846 + }, + { + "epoch": 0.6958736292084262, + "grad_norm": 1.5703125, + "learning_rate": 1.0627628173272986e-05, + "loss": 0.3288, + "step": 15848 + }, + { + "epoch": 0.6959614476876297, + "grad_norm": 1.5703125, + "learning_rate": 1.0621967995151699e-05, + "loss": 0.3649, + "step": 15850 + }, + { + "epoch": 0.6960492661668332, + "grad_norm": 1.6015625, + "learning_rate": 1.0616308918103554e-05, + "loss": 0.3319, + "step": 15852 + }, + { + "epoch": 0.6961370846460366, + "grad_norm": 1.578125, + "learning_rate": 1.0610650942561909e-05, + "loss": 0.3189, + "step": 15854 + }, + { + "epoch": 0.6962249031252401, + "grad_norm": 1.609375, + "learning_rate": 1.0604994068960072e-05, + "loss": 0.3257, + "step": 15856 + }, + { + "epoch": 0.6963127216044436, + "grad_norm": 1.5703125, + "learning_rate": 1.0599338297731231e-05, + "loss": 0.3204, + "step": 15858 + }, + { + "epoch": 0.696400540083647, + "grad_norm": 1.640625, + "learning_rate": 1.0593683629308537e-05, + "loss": 0.3182, + "step": 15860 + }, + { + "epoch": 0.6964883585628506, + "grad_norm": 1.546875, + "learning_rate": 1.0588030064124981e-05, + "loss": 0.3205, + "step": 15862 + }, + { + "epoch": 0.6965761770420541, + "grad_norm": 1.5625, + "learning_rate": 1.058237760261355e-05, + "loss": 0.3209, + "step": 15864 + }, + { + "epoch": 0.6966639955212576, + "grad_norm": 1.4921875, + "learning_rate": 1.05767262452071e-05, + "loss": 0.2977, + "step": 15866 + }, + { + "epoch": 0.696751814000461, + "grad_norm": 1.5625, + "learning_rate": 1.0571075992338398e-05, + "loss": 0.3416, + "step": 15868 + }, + { + "epoch": 0.6968396324796645, + "grad_norm": 1.515625, + "learning_rate": 1.0565426844440166e-05, + "loss": 0.3263, + "step": 15870 + }, + { + "epoch": 0.696927450958868, + "grad_norm": 1.5625, + "learning_rate": 1.0559778801945e-05, + "loss": 0.3241, + "step": 15872 + }, + { + "epoch": 0.6970152694380715, + "grad_norm": 1.4296875, + "learning_rate": 1.0554131865285441e-05, + "loss": 0.332, + "step": 15874 + }, + { + "epoch": 0.697103087917275, + "grad_norm": 1.6328125, + "learning_rate": 1.0548486034893926e-05, + "loss": 0.3466, + "step": 15876 + }, + { + "epoch": 0.6971909063964785, + "grad_norm": 1.4609375, + "learning_rate": 1.0542841311202809e-05, + "loss": 0.3141, + "step": 15878 + }, + { + "epoch": 0.697278724875682, + "grad_norm": 1.5625, + "learning_rate": 1.0537197694644376e-05, + "loss": 0.3049, + "step": 15880 + }, + { + "epoch": 0.6973665433548855, + "grad_norm": 1.4375, + "learning_rate": 1.0531555185650802e-05, + "loss": 0.3313, + "step": 15882 + }, + { + "epoch": 0.6974543618340889, + "grad_norm": 1.578125, + "learning_rate": 1.052591378465421e-05, + "loss": 0.3541, + "step": 15884 + }, + { + "epoch": 0.6975421803132924, + "grad_norm": 1.59375, + "learning_rate": 1.052027349208661e-05, + "loss": 0.3362, + "step": 15886 + }, + { + "epoch": 0.6976299987924959, + "grad_norm": 1.578125, + "learning_rate": 1.0514634308379928e-05, + "loss": 0.3457, + "step": 15888 + }, + { + "epoch": 0.6977178172716993, + "grad_norm": 1.609375, + "learning_rate": 1.050899623396603e-05, + "loss": 0.3524, + "step": 15890 + }, + { + "epoch": 0.6978056357509029, + "grad_norm": 1.5703125, + "learning_rate": 1.0503359269276678e-05, + "loss": 0.3214, + "step": 15892 + }, + { + "epoch": 0.6978934542301064, + "grad_norm": 1.6484375, + "learning_rate": 1.0497723414743546e-05, + "loss": 0.2927, + "step": 15894 + }, + { + "epoch": 0.6979812727093099, + "grad_norm": 1.453125, + "learning_rate": 1.0492088670798223e-05, + "loss": 0.317, + "step": 15896 + }, + { + "epoch": 0.6980690911885133, + "grad_norm": 1.6015625, + "learning_rate": 1.0486455037872236e-05, + "loss": 0.3112, + "step": 15898 + }, + { + "epoch": 0.6981569096677168, + "grad_norm": 1.5703125, + "learning_rate": 1.0480822516396994e-05, + "loss": 0.3209, + "step": 15900 + }, + { + "epoch": 0.6982447281469203, + "grad_norm": 1.4765625, + "learning_rate": 1.047519110680385e-05, + "loss": 0.3126, + "step": 15902 + }, + { + "epoch": 0.6983325466261238, + "grad_norm": 1.59375, + "learning_rate": 1.0469560809524056e-05, + "loss": 0.2958, + "step": 15904 + }, + { + "epoch": 0.6984203651053272, + "grad_norm": 1.5625, + "learning_rate": 1.0463931624988769e-05, + "loss": 0.308, + "step": 15906 + }, + { + "epoch": 0.6985081835845308, + "grad_norm": 1.65625, + "learning_rate": 1.045830355362909e-05, + "loss": 0.3281, + "step": 15908 + }, + { + "epoch": 0.6985960020637343, + "grad_norm": 1.5546875, + "learning_rate": 1.0452676595876001e-05, + "loss": 0.3242, + "step": 15910 + }, + { + "epoch": 0.6986838205429378, + "grad_norm": 1.625, + "learning_rate": 1.0447050752160436e-05, + "loss": 0.3326, + "step": 15912 + }, + { + "epoch": 0.6987716390221412, + "grad_norm": 1.6015625, + "learning_rate": 1.0441426022913211e-05, + "loss": 0.3141, + "step": 15914 + }, + { + "epoch": 0.6988594575013447, + "grad_norm": 1.640625, + "learning_rate": 1.0435802408565065e-05, + "loss": 0.3224, + "step": 15916 + }, + { + "epoch": 0.6989472759805482, + "grad_norm": 1.546875, + "learning_rate": 1.043017990954667e-05, + "loss": 0.3246, + "step": 15918 + }, + { + "epoch": 0.6990350944597516, + "grad_norm": 1.7421875, + "learning_rate": 1.042455852628858e-05, + "loss": 0.318, + "step": 15920 + }, + { + "epoch": 0.6991229129389552, + "grad_norm": 1.7890625, + "learning_rate": 1.0418938259221311e-05, + "loss": 0.3173, + "step": 15922 + }, + { + "epoch": 0.6992107314181587, + "grad_norm": 1.640625, + "learning_rate": 1.041331910877523e-05, + "loss": 0.2939, + "step": 15924 + }, + { + "epoch": 0.6992985498973622, + "grad_norm": 1.5078125, + "learning_rate": 1.0407701075380674e-05, + "loss": 0.313, + "step": 15926 + }, + { + "epoch": 0.6993863683765656, + "grad_norm": 1.578125, + "learning_rate": 1.0402084159467867e-05, + "loss": 0.3124, + "step": 15928 + }, + { + "epoch": 0.6994741868557691, + "grad_norm": 1.625, + "learning_rate": 1.0396468361466947e-05, + "loss": 0.3338, + "step": 15930 + }, + { + "epoch": 0.6995620053349726, + "grad_norm": 1.59375, + "learning_rate": 1.0390853681807989e-05, + "loss": 0.2974, + "step": 15932 + }, + { + "epoch": 0.6996498238141761, + "grad_norm": 1.5546875, + "learning_rate": 1.038524012092095e-05, + "loss": 0.3551, + "step": 15934 + }, + { + "epoch": 0.6997376422933795, + "grad_norm": 1.609375, + "learning_rate": 1.0379627679235734e-05, + "loss": 0.3138, + "step": 15936 + }, + { + "epoch": 0.6998254607725831, + "grad_norm": 1.5234375, + "learning_rate": 1.0374016357182137e-05, + "loss": 0.3576, + "step": 15938 + }, + { + "epoch": 0.6999132792517866, + "grad_norm": 1.6171875, + "learning_rate": 1.0368406155189862e-05, + "loss": 0.3219, + "step": 15940 + }, + { + "epoch": 0.7000010977309901, + "grad_norm": 1.6796875, + "learning_rate": 1.036279707368856e-05, + "loss": 0.3306, + "step": 15942 + }, + { + "epoch": 0.7000889162101935, + "grad_norm": 1.6015625, + "learning_rate": 1.035718911310776e-05, + "loss": 0.3197, + "step": 15944 + }, + { + "epoch": 0.700176734689397, + "grad_norm": 1.625, + "learning_rate": 1.0351582273876936e-05, + "loss": 0.2982, + "step": 15946 + }, + { + "epoch": 0.7002645531686005, + "grad_norm": 1.5, + "learning_rate": 1.0345976556425452e-05, + "loss": 0.3038, + "step": 15948 + }, + { + "epoch": 0.700352371647804, + "grad_norm": 1.71875, + "learning_rate": 1.0340371961182588e-05, + "loss": 0.3031, + "step": 15950 + }, + { + "epoch": 0.7004401901270074, + "grad_norm": 1.4765625, + "learning_rate": 1.0334768488577563e-05, + "loss": 0.3268, + "step": 15952 + }, + { + "epoch": 0.700528008606211, + "grad_norm": 1.8671875, + "learning_rate": 1.032916613903948e-05, + "loss": 0.317, + "step": 15954 + }, + { + "epoch": 0.7006158270854145, + "grad_norm": 1.640625, + "learning_rate": 1.0323564912997371e-05, + "loss": 0.3188, + "step": 15956 + }, + { + "epoch": 0.700703645564618, + "grad_norm": 1.6328125, + "learning_rate": 1.0317964810880173e-05, + "loss": 0.33, + "step": 15958 + }, + { + "epoch": 0.7007914640438214, + "grad_norm": 1.53125, + "learning_rate": 1.0312365833116757e-05, + "loss": 0.3089, + "step": 15960 + }, + { + "epoch": 0.7008792825230249, + "grad_norm": 1.5625, + "learning_rate": 1.0306767980135878e-05, + "loss": 0.3248, + "step": 15962 + }, + { + "epoch": 0.7009671010022284, + "grad_norm": 1.5, + "learning_rate": 1.0301171252366238e-05, + "loss": 0.3073, + "step": 15964 + }, + { + "epoch": 0.7010549194814318, + "grad_norm": 1.4609375, + "learning_rate": 1.0295575650236428e-05, + "loss": 0.339, + "step": 15966 + }, + { + "epoch": 0.7011427379606354, + "grad_norm": 1.5546875, + "learning_rate": 1.0289981174174947e-05, + "loss": 0.3299, + "step": 15968 + }, + { + "epoch": 0.7012305564398389, + "grad_norm": 1.609375, + "learning_rate": 1.0284387824610247e-05, + "loss": 0.3266, + "step": 15970 + }, + { + "epoch": 0.7013183749190424, + "grad_norm": 1.640625, + "learning_rate": 1.0278795601970646e-05, + "loss": 0.3139, + "step": 15972 + }, + { + "epoch": 0.7014061933982458, + "grad_norm": 1.6796875, + "learning_rate": 1.027320450668441e-05, + "loss": 0.3182, + "step": 15974 + }, + { + "epoch": 0.7014940118774493, + "grad_norm": 1.640625, + "learning_rate": 1.026761453917971e-05, + "loss": 0.3215, + "step": 15976 + }, + { + "epoch": 0.7015818303566528, + "grad_norm": 1.4609375, + "learning_rate": 1.026202569988461e-05, + "loss": 0.3387, + "step": 15978 + }, + { + "epoch": 0.7016696488358563, + "grad_norm": 1.5546875, + "learning_rate": 1.025643798922712e-05, + "loss": 0.3404, + "step": 15980 + }, + { + "epoch": 0.7017574673150597, + "grad_norm": 1.5546875, + "learning_rate": 1.0250851407635137e-05, + "loss": 0.3075, + "step": 15982 + }, + { + "epoch": 0.7018452857942633, + "grad_norm": 1.53125, + "learning_rate": 1.0245265955536503e-05, + "loss": 0.3429, + "step": 15984 + }, + { + "epoch": 0.7019331042734668, + "grad_norm": 1.6015625, + "learning_rate": 1.0239681633358924e-05, + "loss": 0.3293, + "step": 15986 + }, + { + "epoch": 0.7020209227526703, + "grad_norm": 1.5859375, + "learning_rate": 1.0234098441530075e-05, + "loss": 0.3247, + "step": 15988 + }, + { + "epoch": 0.7021087412318737, + "grad_norm": 1.6953125, + "learning_rate": 1.0228516380477504e-05, + "loss": 0.3329, + "step": 15990 + }, + { + "epoch": 0.7021965597110772, + "grad_norm": 1.4609375, + "learning_rate": 1.0222935450628681e-05, + "loss": 0.2793, + "step": 15992 + }, + { + "epoch": 0.7022843781902807, + "grad_norm": 1.609375, + "learning_rate": 1.0217355652411015e-05, + "loss": 0.3479, + "step": 15994 + }, + { + "epoch": 0.7023721966694841, + "grad_norm": 1.796875, + "learning_rate": 1.0211776986251784e-05, + "loss": 0.3216, + "step": 15996 + }, + { + "epoch": 0.7024600151486876, + "grad_norm": 1.625, + "learning_rate": 1.0206199452578228e-05, + "loss": 0.3071, + "step": 15998 + }, + { + "epoch": 0.7025478336278912, + "grad_norm": 1.625, + "learning_rate": 1.0200623051817462e-05, + "loss": 0.303, + "step": 16000 + }, + { + "epoch": 0.7026356521070947, + "grad_norm": 1.7421875, + "learning_rate": 1.0195047784396524e-05, + "loss": 0.3267, + "step": 16002 + }, + { + "epoch": 0.7027234705862981, + "grad_norm": 1.6171875, + "learning_rate": 1.0189473650742385e-05, + "loss": 0.3194, + "step": 16004 + }, + { + "epoch": 0.7028112890655016, + "grad_norm": 1.796875, + "learning_rate": 1.018390065128189e-05, + "loss": 0.3244, + "step": 16006 + }, + { + "epoch": 0.7028991075447051, + "grad_norm": 1.7265625, + "learning_rate": 1.0178328786441848e-05, + "loss": 0.3149, + "step": 16008 + }, + { + "epoch": 0.7029869260239086, + "grad_norm": 1.4921875, + "learning_rate": 1.017275805664894e-05, + "loss": 0.3069, + "step": 16010 + }, + { + "epoch": 0.703074744503112, + "grad_norm": 1.8046875, + "learning_rate": 1.0167188462329767e-05, + "loss": 0.3143, + "step": 16012 + }, + { + "epoch": 0.7031625629823156, + "grad_norm": 1.765625, + "learning_rate": 1.016162000391087e-05, + "loss": 0.3319, + "step": 16014 + }, + { + "epoch": 0.7032503814615191, + "grad_norm": 1.5703125, + "learning_rate": 1.0156052681818659e-05, + "loss": 0.3057, + "step": 16016 + }, + { + "epoch": 0.7033381999407226, + "grad_norm": 1.5703125, + "learning_rate": 1.0150486496479498e-05, + "loss": 0.2966, + "step": 16018 + }, + { + "epoch": 0.703426018419926, + "grad_norm": 1.546875, + "learning_rate": 1.014492144831963e-05, + "loss": 0.3022, + "step": 16020 + }, + { + "epoch": 0.7035138368991295, + "grad_norm": 1.5625, + "learning_rate": 1.0139357537765249e-05, + "loss": 0.2942, + "step": 16022 + }, + { + "epoch": 0.703601655378333, + "grad_norm": 1.6484375, + "learning_rate": 1.013379476524242e-05, + "loss": 0.3157, + "step": 16024 + }, + { + "epoch": 0.7036894738575364, + "grad_norm": 1.421875, + "learning_rate": 1.0128233131177161e-05, + "loss": 0.3504, + "step": 16026 + }, + { + "epoch": 0.7037772923367399, + "grad_norm": 1.515625, + "learning_rate": 1.0122672635995375e-05, + "loss": 0.2993, + "step": 16028 + }, + { + "epoch": 0.7038651108159435, + "grad_norm": 1.921875, + "learning_rate": 1.0117113280122875e-05, + "loss": 0.3401, + "step": 16030 + }, + { + "epoch": 0.703952929295147, + "grad_norm": 1.5546875, + "learning_rate": 1.0111555063985418e-05, + "loss": 0.3181, + "step": 16032 + }, + { + "epoch": 0.7040407477743504, + "grad_norm": 1.5625, + "learning_rate": 1.0105997988008631e-05, + "loss": 0.3173, + "step": 16034 + }, + { + "epoch": 0.7041285662535539, + "grad_norm": 1.4921875, + "learning_rate": 1.01004420526181e-05, + "loss": 0.3183, + "step": 16036 + }, + { + "epoch": 0.7042163847327574, + "grad_norm": 1.4765625, + "learning_rate": 1.0094887258239288e-05, + "loss": 0.3163, + "step": 16038 + }, + { + "epoch": 0.7043042032119609, + "grad_norm": 1.5390625, + "learning_rate": 1.0089333605297574e-05, + "loss": 0.3289, + "step": 16040 + }, + { + "epoch": 0.7043920216911643, + "grad_norm": 1.4609375, + "learning_rate": 1.0083781094218275e-05, + "loss": 0.2794, + "step": 16042 + }, + { + "epoch": 0.7044798401703678, + "grad_norm": 1.6484375, + "learning_rate": 1.0078229725426594e-05, + "loss": 0.3269, + "step": 16044 + }, + { + "epoch": 0.7045676586495714, + "grad_norm": 1.609375, + "learning_rate": 1.0072679499347663e-05, + "loss": 0.3037, + "step": 16046 + }, + { + "epoch": 0.7046554771287749, + "grad_norm": 1.6015625, + "learning_rate": 1.00671304164065e-05, + "loss": 0.3021, + "step": 16048 + }, + { + "epoch": 0.7047432956079783, + "grad_norm": 1.5390625, + "learning_rate": 1.0061582477028078e-05, + "loss": 0.3212, + "step": 16050 + }, + { + "epoch": 0.7048311140871818, + "grad_norm": 1.59375, + "learning_rate": 1.0056035681637254e-05, + "loss": 0.313, + "step": 16052 + }, + { + "epoch": 0.7049189325663853, + "grad_norm": 1.5390625, + "learning_rate": 1.005049003065879e-05, + "loss": 0.3348, + "step": 16054 + }, + { + "epoch": 0.7050067510455887, + "grad_norm": 1.578125, + "learning_rate": 1.0044945524517391e-05, + "loss": 0.324, + "step": 16056 + }, + { + "epoch": 0.7050945695247922, + "grad_norm": 1.8046875, + "learning_rate": 1.003940216363764e-05, + "loss": 0.3451, + "step": 16058 + }, + { + "epoch": 0.7051823880039957, + "grad_norm": 1.5625, + "learning_rate": 1.0033859948444069e-05, + "loss": 0.32, + "step": 16060 + }, + { + "epoch": 0.7052702064831993, + "grad_norm": 1.546875, + "learning_rate": 1.0028318879361087e-05, + "loss": 0.3068, + "step": 16062 + }, + { + "epoch": 0.7053580249624027, + "grad_norm": 1.515625, + "learning_rate": 1.0022778956813028e-05, + "loss": 0.3484, + "step": 16064 + }, + { + "epoch": 0.7054458434416062, + "grad_norm": 1.5390625, + "learning_rate": 1.0017240181224155e-05, + "loss": 0.325, + "step": 16066 + }, + { + "epoch": 0.7055336619208097, + "grad_norm": 1.671875, + "learning_rate": 1.0011702553018612e-05, + "loss": 0.3345, + "step": 16068 + }, + { + "epoch": 0.7056214804000132, + "grad_norm": 1.5, + "learning_rate": 1.0006166072620488e-05, + "loss": 0.3413, + "step": 16070 + }, + { + "epoch": 0.7057092988792166, + "grad_norm": 1.609375, + "learning_rate": 1.0000630740453762e-05, + "loss": 0.2878, + "step": 16072 + }, + { + "epoch": 0.7057971173584201, + "grad_norm": 1.59375, + "learning_rate": 9.995096556942319e-06, + "loss": 0.3334, + "step": 16074 + }, + { + "epoch": 0.7058849358376237, + "grad_norm": 1.5, + "learning_rate": 9.989563522509997e-06, + "loss": 0.3158, + "step": 16076 + }, + { + "epoch": 0.7059727543168272, + "grad_norm": 1.515625, + "learning_rate": 9.984031637580483e-06, + "loss": 0.2987, + "step": 16078 + }, + { + "epoch": 0.7060605727960306, + "grad_norm": 1.5078125, + "learning_rate": 9.978500902577432e-06, + "loss": 0.3314, + "step": 16080 + }, + { + "epoch": 0.7061483912752341, + "grad_norm": 1.6328125, + "learning_rate": 9.972971317924374e-06, + "loss": 0.3223, + "step": 16082 + }, + { + "epoch": 0.7062362097544376, + "grad_norm": 1.6328125, + "learning_rate": 9.967442884044784e-06, + "loss": 0.3217, + "step": 16084 + }, + { + "epoch": 0.706324028233641, + "grad_norm": 1.453125, + "learning_rate": 9.961915601362013e-06, + "loss": 0.3206, + "step": 16086 + }, + { + "epoch": 0.7064118467128445, + "grad_norm": 1.4453125, + "learning_rate": 9.95638947029936e-06, + "loss": 0.3122, + "step": 16088 + }, + { + "epoch": 0.706499665192048, + "grad_norm": 1.4765625, + "learning_rate": 9.950864491280004e-06, + "loss": 0.2964, + "step": 16090 + }, + { + "epoch": 0.7065874836712516, + "grad_norm": 1.6015625, + "learning_rate": 9.945340664727048e-06, + "loss": 0.3622, + "step": 16092 + }, + { + "epoch": 0.706675302150455, + "grad_norm": 1.421875, + "learning_rate": 9.939817991063518e-06, + "loss": 0.3222, + "step": 16094 + }, + { + "epoch": 0.7067631206296585, + "grad_norm": 1.671875, + "learning_rate": 9.934296470712331e-06, + "loss": 0.307, + "step": 16096 + }, + { + "epoch": 0.706850939108862, + "grad_norm": 1.609375, + "learning_rate": 9.928776104096338e-06, + "loss": 0.298, + "step": 16098 + }, + { + "epoch": 0.7069387575880655, + "grad_norm": 1.5, + "learning_rate": 9.923256891638285e-06, + "loss": 0.3036, + "step": 16100 + }, + { + "epoch": 0.7070265760672689, + "grad_norm": 1.5234375, + "learning_rate": 9.917738833760826e-06, + "loss": 0.3059, + "step": 16102 + }, + { + "epoch": 0.7071143945464724, + "grad_norm": 1.4921875, + "learning_rate": 9.91222193088655e-06, + "loss": 0.325, + "step": 16104 + }, + { + "epoch": 0.7072022130256759, + "grad_norm": 1.796875, + "learning_rate": 9.906706183437933e-06, + "loss": 0.3055, + "step": 16106 + }, + { + "epoch": 0.7072900315048795, + "grad_norm": 1.5546875, + "learning_rate": 9.901191591837378e-06, + "loss": 0.3273, + "step": 16108 + }, + { + "epoch": 0.7073778499840829, + "grad_norm": 1.53125, + "learning_rate": 9.89567815650718e-06, + "loss": 0.2972, + "step": 16110 + }, + { + "epoch": 0.7074656684632864, + "grad_norm": 1.5234375, + "learning_rate": 9.890165877869578e-06, + "loss": 0.3642, + "step": 16112 + }, + { + "epoch": 0.7075534869424899, + "grad_norm": 1.4375, + "learning_rate": 9.884654756346698e-06, + "loss": 0.3163, + "step": 16114 + }, + { + "epoch": 0.7076413054216933, + "grad_norm": 1.5859375, + "learning_rate": 9.879144792360567e-06, + "loss": 0.3013, + "step": 16116 + }, + { + "epoch": 0.7077291239008968, + "grad_norm": 1.5234375, + "learning_rate": 9.873635986333162e-06, + "loss": 0.3158, + "step": 16118 + }, + { + "epoch": 0.7078169423801003, + "grad_norm": 1.53125, + "learning_rate": 9.868128338686334e-06, + "loss": 0.3253, + "step": 16120 + }, + { + "epoch": 0.7079047608593039, + "grad_norm": 1.609375, + "learning_rate": 9.862621849841871e-06, + "loss": 0.3257, + "step": 16122 + }, + { + "epoch": 0.7079925793385073, + "grad_norm": 1.4140625, + "learning_rate": 9.857116520221457e-06, + "loss": 0.3153, + "step": 16124 + }, + { + "epoch": 0.7080803978177108, + "grad_norm": 1.5703125, + "learning_rate": 9.85161235024668e-06, + "loss": 0.3007, + "step": 16126 + }, + { + "epoch": 0.7081682162969143, + "grad_norm": 1.6875, + "learning_rate": 9.846109340339068e-06, + "loss": 0.334, + "step": 16128 + }, + { + "epoch": 0.7082560347761178, + "grad_norm": 1.59375, + "learning_rate": 9.840607490920031e-06, + "loss": 0.3236, + "step": 16130 + }, + { + "epoch": 0.7083438532553212, + "grad_norm": 1.578125, + "learning_rate": 9.835106802410913e-06, + "loss": 0.2867, + "step": 16132 + }, + { + "epoch": 0.7084316717345247, + "grad_norm": 1.5390625, + "learning_rate": 9.829607275232949e-06, + "loss": 0.294, + "step": 16134 + }, + { + "epoch": 0.7085194902137282, + "grad_norm": 1.4453125, + "learning_rate": 9.824108909807297e-06, + "loss": 0.3177, + "step": 16136 + }, + { + "epoch": 0.7086073086929318, + "grad_norm": 1.5078125, + "learning_rate": 9.818611706555026e-06, + "loss": 0.3327, + "step": 16138 + }, + { + "epoch": 0.7086951271721352, + "grad_norm": 1.5078125, + "learning_rate": 9.813115665897096e-06, + "loss": 0.3253, + "step": 16140 + }, + { + "epoch": 0.7087829456513387, + "grad_norm": 1.5234375, + "learning_rate": 9.807620788254421e-06, + "loss": 0.3269, + "step": 16142 + }, + { + "epoch": 0.7088707641305422, + "grad_norm": 1.5078125, + "learning_rate": 9.802127074047779e-06, + "loss": 0.3377, + "step": 16144 + }, + { + "epoch": 0.7089585826097456, + "grad_norm": 1.65625, + "learning_rate": 9.796634523697898e-06, + "loss": 0.3253, + "step": 16146 + }, + { + "epoch": 0.7090464010889491, + "grad_norm": 1.5859375, + "learning_rate": 9.79114313762539e-06, + "loss": 0.3284, + "step": 16148 + }, + { + "epoch": 0.7091342195681526, + "grad_norm": 1.6015625, + "learning_rate": 9.785652916250773e-06, + "loss": 0.3334, + "step": 16150 + }, + { + "epoch": 0.7092220380473561, + "grad_norm": 1.5859375, + "learning_rate": 9.780163859994515e-06, + "loss": 0.2905, + "step": 16152 + }, + { + "epoch": 0.7093098565265596, + "grad_norm": 1.5546875, + "learning_rate": 9.77467596927695e-06, + "loss": 0.3026, + "step": 16154 + }, + { + "epoch": 0.7093976750057631, + "grad_norm": 1.578125, + "learning_rate": 9.769189244518354e-06, + "loss": 0.3614, + "step": 16156 + }, + { + "epoch": 0.7094854934849666, + "grad_norm": 1.6484375, + "learning_rate": 9.763703686138892e-06, + "loss": 0.3132, + "step": 16158 + }, + { + "epoch": 0.7095733119641701, + "grad_norm": 1.5625, + "learning_rate": 9.75821929455866e-06, + "loss": 0.2955, + "step": 16160 + }, + { + "epoch": 0.7096611304433735, + "grad_norm": 1.6484375, + "learning_rate": 9.75273607019765e-06, + "loss": 0.3125, + "step": 16162 + }, + { + "epoch": 0.709748948922577, + "grad_norm": 1.5, + "learning_rate": 9.747254013475754e-06, + "loss": 0.3066, + "step": 16164 + }, + { + "epoch": 0.7098367674017805, + "grad_norm": 1.78125, + "learning_rate": 9.741773124812814e-06, + "loss": 0.3302, + "step": 16166 + }, + { + "epoch": 0.7099245858809841, + "grad_norm": 1.5703125, + "learning_rate": 9.736293404628546e-06, + "loss": 0.3334, + "step": 16168 + }, + { + "epoch": 0.7100124043601875, + "grad_norm": 1.4765625, + "learning_rate": 9.730814853342587e-06, + "loss": 0.3064, + "step": 16170 + }, + { + "epoch": 0.710100222839391, + "grad_norm": 1.5546875, + "learning_rate": 9.72533747137448e-06, + "loss": 0.3152, + "step": 16172 + }, + { + "epoch": 0.7101880413185945, + "grad_norm": 1.578125, + "learning_rate": 9.719861259143698e-06, + "loss": 0.3324, + "step": 16174 + }, + { + "epoch": 0.710275859797798, + "grad_norm": 1.7578125, + "learning_rate": 9.714386217069604e-06, + "loss": 0.347, + "step": 16176 + }, + { + "epoch": 0.7103636782770014, + "grad_norm": 1.625, + "learning_rate": 9.708912345571469e-06, + "loss": 0.3197, + "step": 16178 + }, + { + "epoch": 0.7104514967562049, + "grad_norm": 1.515625, + "learning_rate": 9.7034396450685e-06, + "loss": 0.3114, + "step": 16180 + }, + { + "epoch": 0.7105393152354084, + "grad_norm": 1.703125, + "learning_rate": 9.69796811597978e-06, + "loss": 0.3416, + "step": 16182 + }, + { + "epoch": 0.710627133714612, + "grad_norm": 1.8125, + "learning_rate": 9.692497758724342e-06, + "loss": 0.3315, + "step": 16184 + }, + { + "epoch": 0.7107149521938154, + "grad_norm": 1.5, + "learning_rate": 9.687028573721094e-06, + "loss": 0.3204, + "step": 16186 + }, + { + "epoch": 0.7108027706730189, + "grad_norm": 1.5859375, + "learning_rate": 9.681560561388858e-06, + "loss": 0.3229, + "step": 16188 + }, + { + "epoch": 0.7108905891522224, + "grad_norm": 1.5390625, + "learning_rate": 9.676093722146399e-06, + "loss": 0.3268, + "step": 16190 + }, + { + "epoch": 0.7109784076314258, + "grad_norm": 1.5703125, + "learning_rate": 9.670628056412342e-06, + "loss": 0.2892, + "step": 16192 + }, + { + "epoch": 0.7110662261106293, + "grad_norm": 1.5234375, + "learning_rate": 9.665163564605275e-06, + "loss": 0.3335, + "step": 16194 + }, + { + "epoch": 0.7111540445898328, + "grad_norm": 1.71875, + "learning_rate": 9.659700247143658e-06, + "loss": 0.3094, + "step": 16196 + }, + { + "epoch": 0.7112418630690363, + "grad_norm": 1.515625, + "learning_rate": 9.654238104445873e-06, + "loss": 0.3172, + "step": 16198 + }, + { + "epoch": 0.7113296815482398, + "grad_norm": 1.546875, + "learning_rate": 9.648777136930215e-06, + "loss": 0.3063, + "step": 16200 + }, + { + "epoch": 0.7114175000274433, + "grad_norm": 1.53125, + "learning_rate": 9.64331734501487e-06, + "loss": 0.3165, + "step": 16202 + }, + { + "epoch": 0.7115053185066468, + "grad_norm": 1.5, + "learning_rate": 9.637858729117977e-06, + "loss": 0.3141, + "step": 16204 + }, + { + "epoch": 0.7115931369858502, + "grad_norm": 1.5078125, + "learning_rate": 9.632401289657537e-06, + "loss": 0.3346, + "step": 16206 + }, + { + "epoch": 0.7116809554650537, + "grad_norm": 1.59375, + "learning_rate": 9.626945027051495e-06, + "loss": 0.3514, + "step": 16208 + }, + { + "epoch": 0.7117687739442572, + "grad_norm": 1.5625, + "learning_rate": 9.621489941717691e-06, + "loss": 0.3265, + "step": 16210 + }, + { + "epoch": 0.7118565924234607, + "grad_norm": 1.546875, + "learning_rate": 9.616036034073863e-06, + "loss": 0.3285, + "step": 16212 + }, + { + "epoch": 0.7119444109026641, + "grad_norm": 1.5703125, + "learning_rate": 9.610583304537693e-06, + "loss": 0.3249, + "step": 16214 + }, + { + "epoch": 0.7120322293818677, + "grad_norm": 1.4765625, + "learning_rate": 9.605131753526733e-06, + "loss": 0.3098, + "step": 16216 + }, + { + "epoch": 0.7121200478610712, + "grad_norm": 1.546875, + "learning_rate": 9.599681381458483e-06, + "loss": 0.3362, + "step": 16218 + }, + { + "epoch": 0.7122078663402747, + "grad_norm": 1.6328125, + "learning_rate": 9.594232188750316e-06, + "loss": 0.3121, + "step": 16220 + }, + { + "epoch": 0.7122956848194781, + "grad_norm": 1.546875, + "learning_rate": 9.58878417581955e-06, + "loss": 0.3073, + "step": 16222 + }, + { + "epoch": 0.7123835032986816, + "grad_norm": 1.515625, + "learning_rate": 9.583337343083387e-06, + "loss": 0.3131, + "step": 16224 + }, + { + "epoch": 0.7124713217778851, + "grad_norm": 1.484375, + "learning_rate": 9.577891690958935e-06, + "loss": 0.2908, + "step": 16226 + }, + { + "epoch": 0.7125591402570886, + "grad_norm": 1.578125, + "learning_rate": 9.572447219863253e-06, + "loss": 0.3163, + "step": 16228 + }, + { + "epoch": 0.7126469587362921, + "grad_norm": 1.640625, + "learning_rate": 9.567003930213241e-06, + "loss": 0.3299, + "step": 16230 + }, + { + "epoch": 0.7127347772154956, + "grad_norm": 1.484375, + "learning_rate": 9.56156182242578e-06, + "loss": 0.3156, + "step": 16232 + }, + { + "epoch": 0.7128225956946991, + "grad_norm": 1.5859375, + "learning_rate": 9.556120896917605e-06, + "loss": 0.3299, + "step": 16234 + }, + { + "epoch": 0.7129104141739026, + "grad_norm": 1.4921875, + "learning_rate": 9.550681154105403e-06, + "loss": 0.3117, + "step": 16236 + }, + { + "epoch": 0.712998232653106, + "grad_norm": 1.5625, + "learning_rate": 9.545242594405743e-06, + "loss": 0.295, + "step": 16238 + }, + { + "epoch": 0.7130860511323095, + "grad_norm": 1.5625, + "learning_rate": 9.539805218235101e-06, + "loss": 0.3388, + "step": 16240 + }, + { + "epoch": 0.713173869611513, + "grad_norm": 1.546875, + "learning_rate": 9.534369026009888e-06, + "loss": 0.3074, + "step": 16242 + }, + { + "epoch": 0.7132616880907164, + "grad_norm": 1.625, + "learning_rate": 9.528934018146396e-06, + "loss": 0.3924, + "step": 16244 + }, + { + "epoch": 0.71334950656992, + "grad_norm": 1.515625, + "learning_rate": 9.523500195060852e-06, + "loss": 0.308, + "step": 16246 + }, + { + "epoch": 0.7134373250491235, + "grad_norm": 1.5546875, + "learning_rate": 9.518067557169375e-06, + "loss": 0.3017, + "step": 16248 + }, + { + "epoch": 0.713525143528327, + "grad_norm": 1.5078125, + "learning_rate": 9.512636104887984e-06, + "loss": 0.3188, + "step": 16250 + }, + { + "epoch": 0.7136129620075304, + "grad_norm": 1.578125, + "learning_rate": 9.507205838632643e-06, + "loss": 0.3262, + "step": 16252 + }, + { + "epoch": 0.7137007804867339, + "grad_norm": 1.5625, + "learning_rate": 9.501776758819186e-06, + "loss": 0.3345, + "step": 16254 + }, + { + "epoch": 0.7137885989659374, + "grad_norm": 1.5078125, + "learning_rate": 9.496348865863386e-06, + "loss": 0.3339, + "step": 16256 + }, + { + "epoch": 0.7138764174451409, + "grad_norm": 1.5390625, + "learning_rate": 9.49092216018091e-06, + "loss": 0.3174, + "step": 16258 + }, + { + "epoch": 0.7139642359243443, + "grad_norm": 1.6015625, + "learning_rate": 9.485496642187328e-06, + "loss": 0.3242, + "step": 16260 + }, + { + "epoch": 0.7140520544035479, + "grad_norm": 1.5625, + "learning_rate": 9.480072312298135e-06, + "loss": 0.3445, + "step": 16262 + }, + { + "epoch": 0.7141398728827514, + "grad_norm": 1.671875, + "learning_rate": 9.474649170928714e-06, + "loss": 0.3035, + "step": 16264 + }, + { + "epoch": 0.7142276913619549, + "grad_norm": 1.5, + "learning_rate": 9.469227218494391e-06, + "loss": 0.305, + "step": 16266 + }, + { + "epoch": 0.7143155098411583, + "grad_norm": 1.71875, + "learning_rate": 9.463806455410365e-06, + "loss": 0.3175, + "step": 16268 + }, + { + "epoch": 0.7144033283203618, + "grad_norm": 1.5859375, + "learning_rate": 9.458386882091769e-06, + "loss": 0.3037, + "step": 16270 + }, + { + "epoch": 0.7144911467995653, + "grad_norm": 1.53125, + "learning_rate": 9.452968498953634e-06, + "loss": 0.3173, + "step": 16272 + }, + { + "epoch": 0.7145789652787687, + "grad_norm": 1.8125, + "learning_rate": 9.447551306410887e-06, + "loss": 0.3008, + "step": 16274 + }, + { + "epoch": 0.7146667837579723, + "grad_norm": 1.515625, + "learning_rate": 9.442135304878403e-06, + "loss": 0.31, + "step": 16276 + }, + { + "epoch": 0.7147546022371758, + "grad_norm": 1.546875, + "learning_rate": 9.436720494770912e-06, + "loss": 0.3117, + "step": 16278 + }, + { + "epoch": 0.7148424207163793, + "grad_norm": 1.5390625, + "learning_rate": 9.431306876503108e-06, + "loss": 0.3323, + "step": 16280 + }, + { + "epoch": 0.7149302391955827, + "grad_norm": 1.6171875, + "learning_rate": 9.425894450489556e-06, + "loss": 0.3338, + "step": 16282 + }, + { + "epoch": 0.7150180576747862, + "grad_norm": 1.53125, + "learning_rate": 9.420483217144729e-06, + "loss": 0.3296, + "step": 16284 + }, + { + "epoch": 0.7151058761539897, + "grad_norm": 1.5078125, + "learning_rate": 9.415073176883043e-06, + "loss": 0.3117, + "step": 16286 + }, + { + "epoch": 0.7151936946331932, + "grad_norm": 1.5859375, + "learning_rate": 9.409664330118778e-06, + "loss": 0.3294, + "step": 16288 + }, + { + "epoch": 0.7152815131123966, + "grad_norm": 1.65625, + "learning_rate": 9.404256677266176e-06, + "loss": 0.2995, + "step": 16290 + }, + { + "epoch": 0.7153693315916002, + "grad_norm": 1.5859375, + "learning_rate": 9.398850218739319e-06, + "loss": 0.3314, + "step": 16292 + }, + { + "epoch": 0.7154571500708037, + "grad_norm": 1.5078125, + "learning_rate": 9.393444954952257e-06, + "loss": 0.3079, + "step": 16294 + }, + { + "epoch": 0.7155449685500072, + "grad_norm": 1.5703125, + "learning_rate": 9.388040886318916e-06, + "loss": 0.3207, + "step": 16296 + }, + { + "epoch": 0.7156327870292106, + "grad_norm": 1.578125, + "learning_rate": 9.382638013253156e-06, + "loss": 0.3263, + "step": 16298 + }, + { + "epoch": 0.7157206055084141, + "grad_norm": 1.4765625, + "learning_rate": 9.377236336168717e-06, + "loss": 0.2916, + "step": 16300 + }, + { + "epoch": 0.7158084239876176, + "grad_norm": 1.546875, + "learning_rate": 9.371835855479258e-06, + "loss": 0.3527, + "step": 16302 + }, + { + "epoch": 0.715896242466821, + "grad_norm": 1.5703125, + "learning_rate": 9.366436571598364e-06, + "loss": 0.2921, + "step": 16304 + }, + { + "epoch": 0.7159840609460245, + "grad_norm": 1.5703125, + "learning_rate": 9.361038484939496e-06, + "loss": 0.3471, + "step": 16306 + }, + { + "epoch": 0.7160718794252281, + "grad_norm": 1.609375, + "learning_rate": 9.355641595916059e-06, + "loss": 0.3398, + "step": 16308 + }, + { + "epoch": 0.7161596979044316, + "grad_norm": 1.5, + "learning_rate": 9.350245904941338e-06, + "loss": 0.3452, + "step": 16310 + }, + { + "epoch": 0.716247516383635, + "grad_norm": 1.5625, + "learning_rate": 9.34485141242853e-06, + "loss": 0.3264, + "step": 16312 + }, + { + "epoch": 0.7163353348628385, + "grad_norm": 1.5390625, + "learning_rate": 9.339458118790761e-06, + "loss": 0.2958, + "step": 16314 + }, + { + "epoch": 0.716423153342042, + "grad_norm": 1.7109375, + "learning_rate": 9.334066024441035e-06, + "loss": 0.3358, + "step": 16316 + }, + { + "epoch": 0.7165109718212455, + "grad_norm": 1.6875, + "learning_rate": 9.328675129792298e-06, + "loss": 0.318, + "step": 16318 + }, + { + "epoch": 0.7165987903004489, + "grad_norm": 1.703125, + "learning_rate": 9.323285435257373e-06, + "loss": 0.3243, + "step": 16320 + }, + { + "epoch": 0.7166866087796525, + "grad_norm": 1.640625, + "learning_rate": 9.31789694124901e-06, + "loss": 0.3271, + "step": 16322 + }, + { + "epoch": 0.716774427258856, + "grad_norm": 1.53125, + "learning_rate": 9.312509648179856e-06, + "loss": 0.3216, + "step": 16324 + }, + { + "epoch": 0.7168622457380595, + "grad_norm": 1.4765625, + "learning_rate": 9.30712355646247e-06, + "loss": 0.3215, + "step": 16326 + }, + { + "epoch": 0.7169500642172629, + "grad_norm": 1.5390625, + "learning_rate": 9.301738666509327e-06, + "loss": 0.2933, + "step": 16328 + }, + { + "epoch": 0.7170378826964664, + "grad_norm": 1.78125, + "learning_rate": 9.296354978732793e-06, + "loss": 0.3125, + "step": 16330 + }, + { + "epoch": 0.7171257011756699, + "grad_norm": 1.515625, + "learning_rate": 9.29097249354517e-06, + "loss": 0.3495, + "step": 16332 + }, + { + "epoch": 0.7172135196548733, + "grad_norm": 1.6796875, + "learning_rate": 9.285591211358637e-06, + "loss": 0.3285, + "step": 16334 + }, + { + "epoch": 0.7173013381340768, + "grad_norm": 1.4921875, + "learning_rate": 9.28021113258529e-06, + "loss": 0.3444, + "step": 16336 + }, + { + "epoch": 0.7173891566132804, + "grad_norm": 1.6171875, + "learning_rate": 9.274832257637148e-06, + "loss": 0.3218, + "step": 16338 + }, + { + "epoch": 0.7174769750924839, + "grad_norm": 1.5, + "learning_rate": 9.269454586926116e-06, + "loss": 0.3116, + "step": 16340 + }, + { + "epoch": 0.7175647935716873, + "grad_norm": 1.5625, + "learning_rate": 9.264078120864029e-06, + "loss": 0.3402, + "step": 16342 + }, + { + "epoch": 0.7176526120508908, + "grad_norm": 1.546875, + "learning_rate": 9.258702859862612e-06, + "loss": 0.3325, + "step": 16344 + }, + { + "epoch": 0.7177404305300943, + "grad_norm": 1.6484375, + "learning_rate": 9.253328804333495e-06, + "loss": 0.3402, + "step": 16346 + }, + { + "epoch": 0.7178282490092978, + "grad_norm": 1.5234375, + "learning_rate": 9.247955954688242e-06, + "loss": 0.349, + "step": 16348 + }, + { + "epoch": 0.7179160674885012, + "grad_norm": 1.59375, + "learning_rate": 9.242584311338288e-06, + "loss": 0.3494, + "step": 16350 + }, + { + "epoch": 0.7180038859677047, + "grad_norm": 1.5625, + "learning_rate": 9.237213874695021e-06, + "loss": 0.3199, + "step": 16352 + }, + { + "epoch": 0.7180917044469083, + "grad_norm": 1.6484375, + "learning_rate": 9.231844645169679e-06, + "loss": 0.3132, + "step": 16354 + }, + { + "epoch": 0.7181795229261118, + "grad_norm": 1.515625, + "learning_rate": 9.226476623173464e-06, + "loss": 0.3385, + "step": 16356 + }, + { + "epoch": 0.7182673414053152, + "grad_norm": 1.6015625, + "learning_rate": 9.22110980911744e-06, + "loss": 0.3345, + "step": 16358 + }, + { + "epoch": 0.7183551598845187, + "grad_norm": 1.46875, + "learning_rate": 9.215744203412619e-06, + "loss": 0.3218, + "step": 16360 + }, + { + "epoch": 0.7184429783637222, + "grad_norm": 1.5546875, + "learning_rate": 9.210379806469888e-06, + "loss": 0.317, + "step": 16362 + }, + { + "epoch": 0.7185307968429256, + "grad_norm": 1.5234375, + "learning_rate": 9.205016618700049e-06, + "loss": 0.3086, + "step": 16364 + }, + { + "epoch": 0.7186186153221291, + "grad_norm": 1.5, + "learning_rate": 9.199654640513833e-06, + "loss": 0.3399, + "step": 16366 + }, + { + "epoch": 0.7187064338013327, + "grad_norm": 1.484375, + "learning_rate": 9.194293872321843e-06, + "loss": 0.3437, + "step": 16368 + }, + { + "epoch": 0.7187942522805362, + "grad_norm": 1.59375, + "learning_rate": 9.188934314534625e-06, + "loss": 0.3387, + "step": 16370 + }, + { + "epoch": 0.7188820707597396, + "grad_norm": 1.59375, + "learning_rate": 9.18357596756261e-06, + "loss": 0.2981, + "step": 16372 + }, + { + "epoch": 0.7189698892389431, + "grad_norm": 1.5390625, + "learning_rate": 9.178218831816126e-06, + "loss": 0.315, + "step": 16374 + }, + { + "epoch": 0.7190577077181466, + "grad_norm": 1.5859375, + "learning_rate": 9.17286290770545e-06, + "loss": 0.2909, + "step": 16376 + }, + { + "epoch": 0.7191455261973501, + "grad_norm": 1.4921875, + "learning_rate": 9.167508195640714e-06, + "loss": 0.3028, + "step": 16378 + }, + { + "epoch": 0.7192333446765535, + "grad_norm": 1.5078125, + "learning_rate": 9.162154696032007e-06, + "loss": 0.3327, + "step": 16380 + }, + { + "epoch": 0.719321163155757, + "grad_norm": 1.5625, + "learning_rate": 9.156802409289289e-06, + "loss": 0.3227, + "step": 16382 + }, + { + "epoch": 0.7194089816349606, + "grad_norm": 1.5625, + "learning_rate": 9.151451335822442e-06, + "loss": 0.311, + "step": 16384 + }, + { + "epoch": 0.7194968001141641, + "grad_norm": 1.484375, + "learning_rate": 9.146101476041249e-06, + "loss": 0.3146, + "step": 16386 + }, + { + "epoch": 0.7195846185933675, + "grad_norm": 1.65625, + "learning_rate": 9.140752830355395e-06, + "loss": 0.3515, + "step": 16388 + }, + { + "epoch": 0.719672437072571, + "grad_norm": 1.703125, + "learning_rate": 9.135405399174504e-06, + "loss": 0.3435, + "step": 16390 + }, + { + "epoch": 0.7197602555517745, + "grad_norm": 1.4921875, + "learning_rate": 9.13005918290806e-06, + "loss": 0.3042, + "step": 16392 + }, + { + "epoch": 0.719848074030978, + "grad_norm": 1.6328125, + "learning_rate": 9.124714181965497e-06, + "loss": 0.3166, + "step": 16394 + }, + { + "epoch": 0.7199358925101814, + "grad_norm": 1.5859375, + "learning_rate": 9.119370396756125e-06, + "loss": 0.3493, + "step": 16396 + }, + { + "epoch": 0.7200237109893849, + "grad_norm": 1.6328125, + "learning_rate": 9.114027827689168e-06, + "loss": 0.3101, + "step": 16398 + }, + { + "epoch": 0.7201115294685885, + "grad_norm": 1.671875, + "learning_rate": 9.108686475173777e-06, + "loss": 0.3509, + "step": 16400 + }, + { + "epoch": 0.720199347947792, + "grad_norm": 1.5390625, + "learning_rate": 9.103346339618975e-06, + "loss": 0.3276, + "step": 16402 + }, + { + "epoch": 0.7202871664269954, + "grad_norm": 1.515625, + "learning_rate": 9.098007421433732e-06, + "loss": 0.3511, + "step": 16404 + }, + { + "epoch": 0.7203749849061989, + "grad_norm": 1.8125, + "learning_rate": 9.092669721026892e-06, + "loss": 0.3317, + "step": 16406 + }, + { + "epoch": 0.7204628033854024, + "grad_norm": 1.5390625, + "learning_rate": 9.087333238807208e-06, + "loss": 0.3172, + "step": 16408 + }, + { + "epoch": 0.7205506218646058, + "grad_norm": 1.53125, + "learning_rate": 9.081997975183368e-06, + "loss": 0.3195, + "step": 16410 + }, + { + "epoch": 0.7206384403438093, + "grad_norm": 1.6328125, + "learning_rate": 9.07666393056394e-06, + "loss": 0.3213, + "step": 16412 + }, + { + "epoch": 0.7207262588230128, + "grad_norm": 1.6328125, + "learning_rate": 9.071331105357406e-06, + "loss": 0.3312, + "step": 16414 + }, + { + "epoch": 0.7208140773022164, + "grad_norm": 1.515625, + "learning_rate": 9.065999499972144e-06, + "loss": 0.3171, + "step": 16416 + }, + { + "epoch": 0.7209018957814198, + "grad_norm": 1.5, + "learning_rate": 9.06066911481647e-06, + "loss": 0.3196, + "step": 16418 + }, + { + "epoch": 0.7209897142606233, + "grad_norm": 1.625, + "learning_rate": 9.055339950298564e-06, + "loss": 0.3272, + "step": 16420 + }, + { + "epoch": 0.7210775327398268, + "grad_norm": 1.5859375, + "learning_rate": 9.050012006826558e-06, + "loss": 0.2998, + "step": 16422 + }, + { + "epoch": 0.7211653512190302, + "grad_norm": 1.625, + "learning_rate": 9.044685284808458e-06, + "loss": 0.3191, + "step": 16424 + }, + { + "epoch": 0.7212531696982337, + "grad_norm": 1.5546875, + "learning_rate": 9.03935978465217e-06, + "loss": 0.3121, + "step": 16426 + }, + { + "epoch": 0.7213409881774372, + "grad_norm": 1.5859375, + "learning_rate": 9.034035506765548e-06, + "loss": 0.3239, + "step": 16428 + }, + { + "epoch": 0.7214288066566408, + "grad_norm": 1.5703125, + "learning_rate": 9.028712451556307e-06, + "loss": 0.2768, + "step": 16430 + }, + { + "epoch": 0.7215166251358442, + "grad_norm": 1.53125, + "learning_rate": 9.023390619432101e-06, + "loss": 0.3418, + "step": 16432 + }, + { + "epoch": 0.7216044436150477, + "grad_norm": 1.59375, + "learning_rate": 9.018070010800472e-06, + "loss": 0.3248, + "step": 16434 + }, + { + "epoch": 0.7216922620942512, + "grad_norm": 1.53125, + "learning_rate": 9.012750626068864e-06, + "loss": 0.2989, + "step": 16436 + }, + { + "epoch": 0.7217800805734547, + "grad_norm": 1.578125, + "learning_rate": 9.007432465644652e-06, + "loss": 0.3299, + "step": 16438 + }, + { + "epoch": 0.7218678990526581, + "grad_norm": 1.6328125, + "learning_rate": 9.00211552993509e-06, + "loss": 0.3024, + "step": 16440 + }, + { + "epoch": 0.7219557175318616, + "grad_norm": 1.609375, + "learning_rate": 8.996799819347363e-06, + "loss": 0.3125, + "step": 16442 + }, + { + "epoch": 0.7220435360110651, + "grad_norm": 1.6796875, + "learning_rate": 8.991485334288542e-06, + "loss": 0.3349, + "step": 16444 + }, + { + "epoch": 0.7221313544902687, + "grad_norm": 1.578125, + "learning_rate": 8.986172075165611e-06, + "loss": 0.3068, + "step": 16446 + }, + { + "epoch": 0.7222191729694721, + "grad_norm": 1.59375, + "learning_rate": 8.98086004238546e-06, + "loss": 0.3324, + "step": 16448 + }, + { + "epoch": 0.7223069914486756, + "grad_norm": 1.6328125, + "learning_rate": 8.975549236354882e-06, + "loss": 0.3361, + "step": 16450 + }, + { + "epoch": 0.7223948099278791, + "grad_norm": 1.6015625, + "learning_rate": 8.970239657480592e-06, + "loss": 0.3373, + "step": 16452 + }, + { + "epoch": 0.7224826284070825, + "grad_norm": 1.6484375, + "learning_rate": 8.964931306169182e-06, + "loss": 0.3342, + "step": 16454 + }, + { + "epoch": 0.722570446886286, + "grad_norm": 1.6875, + "learning_rate": 8.959624182827187e-06, + "loss": 0.311, + "step": 16456 + }, + { + "epoch": 0.7226582653654895, + "grad_norm": 1.515625, + "learning_rate": 8.954318287861016e-06, + "loss": 0.3384, + "step": 16458 + }, + { + "epoch": 0.722746083844693, + "grad_norm": 1.6015625, + "learning_rate": 8.949013621676988e-06, + "loss": 0.3401, + "step": 16460 + }, + { + "epoch": 0.7228339023238965, + "grad_norm": 1.5859375, + "learning_rate": 8.943710184681353e-06, + "loss": 0.3354, + "step": 16462 + }, + { + "epoch": 0.7229217208031, + "grad_norm": 1.484375, + "learning_rate": 8.938407977280233e-06, + "loss": 0.3297, + "step": 16464 + }, + { + "epoch": 0.7230095392823035, + "grad_norm": 1.53125, + "learning_rate": 8.93310699987969e-06, + "loss": 0.2986, + "step": 16466 + }, + { + "epoch": 0.723097357761507, + "grad_norm": 1.6015625, + "learning_rate": 8.927807252885664e-06, + "loss": 0.3087, + "step": 16468 + }, + { + "epoch": 0.7231851762407104, + "grad_norm": 1.59375, + "learning_rate": 8.922508736704002e-06, + "loss": 0.3098, + "step": 16470 + }, + { + "epoch": 0.7232729947199139, + "grad_norm": 1.59375, + "learning_rate": 8.917211451740484e-06, + "loss": 0.3198, + "step": 16472 + }, + { + "epoch": 0.7233608131991174, + "grad_norm": 1.5078125, + "learning_rate": 8.911915398400767e-06, + "loss": 0.3407, + "step": 16474 + }, + { + "epoch": 0.723448631678321, + "grad_norm": 1.75, + "learning_rate": 8.906620577090427e-06, + "loss": 0.3181, + "step": 16476 + }, + { + "epoch": 0.7235364501575244, + "grad_norm": 1.546875, + "learning_rate": 8.90132698821493e-06, + "loss": 0.3371, + "step": 16478 + }, + { + "epoch": 0.7236242686367279, + "grad_norm": 1.5390625, + "learning_rate": 8.896034632179683e-06, + "loss": 0.342, + "step": 16480 + }, + { + "epoch": 0.7237120871159314, + "grad_norm": 1.640625, + "learning_rate": 8.890743509389953e-06, + "loss": 0.3339, + "step": 16482 + }, + { + "epoch": 0.7237999055951349, + "grad_norm": 1.5703125, + "learning_rate": 8.885453620250958e-06, + "loss": 0.3264, + "step": 16484 + }, + { + "epoch": 0.7238877240743383, + "grad_norm": 1.515625, + "learning_rate": 8.880164965167787e-06, + "loss": 0.3196, + "step": 16486 + }, + { + "epoch": 0.7239755425535418, + "grad_norm": 1.5703125, + "learning_rate": 8.874877544545438e-06, + "loss": 0.3019, + "step": 16488 + }, + { + "epoch": 0.7240633610327453, + "grad_norm": 1.5234375, + "learning_rate": 8.86959135878884e-06, + "loss": 0.3177, + "step": 16490 + }, + { + "epoch": 0.7241511795119489, + "grad_norm": 1.59375, + "learning_rate": 8.864306408302795e-06, + "loss": 0.3494, + "step": 16492 + }, + { + "epoch": 0.7242389979911523, + "grad_norm": 1.6328125, + "learning_rate": 8.859022693492042e-06, + "loss": 0.3201, + "step": 16494 + }, + { + "epoch": 0.7243268164703558, + "grad_norm": 1.59375, + "learning_rate": 8.8537402147612e-06, + "loss": 0.2976, + "step": 16496 + }, + { + "epoch": 0.7244146349495593, + "grad_norm": 1.4765625, + "learning_rate": 8.848458972514792e-06, + "loss": 0.332, + "step": 16498 + }, + { + "epoch": 0.7245024534287627, + "grad_norm": 1.609375, + "learning_rate": 8.843178967157278e-06, + "loss": 0.312, + "step": 16500 + }, + { + "epoch": 0.7245902719079662, + "grad_norm": 1.625, + "learning_rate": 8.837900199092986e-06, + "loss": 0.3438, + "step": 16502 + }, + { + "epoch": 0.7246780903871697, + "grad_norm": 1.484375, + "learning_rate": 8.832622668726184e-06, + "loss": 0.3145, + "step": 16504 + }, + { + "epoch": 0.7247659088663732, + "grad_norm": 1.4765625, + "learning_rate": 8.827346376460998e-06, + "loss": 0.3059, + "step": 16506 + }, + { + "epoch": 0.7248537273455767, + "grad_norm": 1.59375, + "learning_rate": 8.822071322701513e-06, + "loss": 0.3079, + "step": 16508 + }, + { + "epoch": 0.7249415458247802, + "grad_norm": 1.5703125, + "learning_rate": 8.816797507851682e-06, + "loss": 0.3218, + "step": 16510 + }, + { + "epoch": 0.7250293643039837, + "grad_norm": 1.640625, + "learning_rate": 8.811524932315371e-06, + "loss": 0.31, + "step": 16512 + }, + { + "epoch": 0.7251171827831872, + "grad_norm": 1.4609375, + "learning_rate": 8.806253596496369e-06, + "loss": 0.3252, + "step": 16514 + }, + { + "epoch": 0.7252050012623906, + "grad_norm": 1.4765625, + "learning_rate": 8.800983500798341e-06, + "loss": 0.3262, + "step": 16516 + }, + { + "epoch": 0.7252928197415941, + "grad_norm": 1.5625, + "learning_rate": 8.795714645624887e-06, + "loss": 0.3043, + "step": 16518 + }, + { + "epoch": 0.7253806382207976, + "grad_norm": 1.53125, + "learning_rate": 8.79044703137949e-06, + "loss": 0.3325, + "step": 16520 + }, + { + "epoch": 0.7254684567000012, + "grad_norm": 1.46875, + "learning_rate": 8.785180658465536e-06, + "loss": 0.3342, + "step": 16522 + }, + { + "epoch": 0.7255562751792046, + "grad_norm": 1.578125, + "learning_rate": 8.779915527286343e-06, + "loss": 0.3068, + "step": 16524 + }, + { + "epoch": 0.7256440936584081, + "grad_norm": 1.5625, + "learning_rate": 8.7746516382451e-06, + "loss": 0.3271, + "step": 16526 + }, + { + "epoch": 0.7257319121376116, + "grad_norm": 1.6171875, + "learning_rate": 8.769388991744928e-06, + "loss": 0.2964, + "step": 16528 + }, + { + "epoch": 0.725819730616815, + "grad_norm": 1.6953125, + "learning_rate": 8.764127588188842e-06, + "loss": 0.3511, + "step": 16530 + }, + { + "epoch": 0.7259075490960185, + "grad_norm": 1.640625, + "learning_rate": 8.758867427979748e-06, + "loss": 0.3357, + "step": 16532 + }, + { + "epoch": 0.725995367575222, + "grad_norm": 1.5859375, + "learning_rate": 8.753608511520489e-06, + "loss": 0.348, + "step": 16534 + }, + { + "epoch": 0.7260831860544255, + "grad_norm": 1.5546875, + "learning_rate": 8.748350839213782e-06, + "loss": 0.3523, + "step": 16536 + }, + { + "epoch": 0.726171004533629, + "grad_norm": 1.5390625, + "learning_rate": 8.743094411462266e-06, + "loss": 0.3257, + "step": 16538 + }, + { + "epoch": 0.7262588230128325, + "grad_norm": 1.453125, + "learning_rate": 8.737839228668468e-06, + "loss": 0.307, + "step": 16540 + }, + { + "epoch": 0.726346641492036, + "grad_norm": 1.4765625, + "learning_rate": 8.73258529123485e-06, + "loss": 0.3303, + "step": 16542 + }, + { + "epoch": 0.7264344599712395, + "grad_norm": 1.6953125, + "learning_rate": 8.727332599563745e-06, + "loss": 0.3162, + "step": 16544 + }, + { + "epoch": 0.7265222784504429, + "grad_norm": 1.5546875, + "learning_rate": 8.722081154057408e-06, + "loss": 0.3351, + "step": 16546 + }, + { + "epoch": 0.7266100969296464, + "grad_norm": 1.6484375, + "learning_rate": 8.716830955118002e-06, + "loss": 0.329, + "step": 16548 + }, + { + "epoch": 0.7266979154088499, + "grad_norm": 1.6796875, + "learning_rate": 8.711582003147578e-06, + "loss": 0.3177, + "step": 16550 + }, + { + "epoch": 0.7267857338880533, + "grad_norm": 1.5078125, + "learning_rate": 8.706334298548119e-06, + "loss": 0.3, + "step": 16552 + }, + { + "epoch": 0.7268735523672569, + "grad_norm": 1.484375, + "learning_rate": 8.701087841721475e-06, + "loss": 0.3191, + "step": 16554 + }, + { + "epoch": 0.7269613708464604, + "grad_norm": 1.484375, + "learning_rate": 8.69584263306944e-06, + "loss": 0.3198, + "step": 16556 + }, + { + "epoch": 0.7270491893256639, + "grad_norm": 1.5703125, + "learning_rate": 8.690598672993683e-06, + "loss": 0.3061, + "step": 16558 + }, + { + "epoch": 0.7271370078048673, + "grad_norm": 1.4375, + "learning_rate": 8.685355961895784e-06, + "loss": 0.3073, + "step": 16560 + }, + { + "epoch": 0.7272248262840708, + "grad_norm": 1.6328125, + "learning_rate": 8.680114500177241e-06, + "loss": 0.3, + "step": 16562 + }, + { + "epoch": 0.7273126447632743, + "grad_norm": 1.5, + "learning_rate": 8.674874288239438e-06, + "loss": 0.3184, + "step": 16564 + }, + { + "epoch": 0.7274004632424778, + "grad_norm": 1.4453125, + "learning_rate": 8.669635326483688e-06, + "loss": 0.2938, + "step": 16566 + }, + { + "epoch": 0.7274882817216813, + "grad_norm": 1.5703125, + "learning_rate": 8.664397615311165e-06, + "loss": 0.3619, + "step": 16568 + }, + { + "epoch": 0.7275761002008848, + "grad_norm": 1.5859375, + "learning_rate": 8.659161155122997e-06, + "loss": 0.3037, + "step": 16570 + }, + { + "epoch": 0.7276639186800883, + "grad_norm": 1.5546875, + "learning_rate": 8.653925946320183e-06, + "loss": 0.3034, + "step": 16572 + }, + { + "epoch": 0.7277517371592918, + "grad_norm": 1.4375, + "learning_rate": 8.648691989303631e-06, + "loss": 0.3027, + "step": 16574 + }, + { + "epoch": 0.7278395556384952, + "grad_norm": 1.515625, + "learning_rate": 8.643459284474176e-06, + "loss": 0.3192, + "step": 16576 + }, + { + "epoch": 0.7279273741176987, + "grad_norm": 1.53125, + "learning_rate": 8.638227832232517e-06, + "loss": 0.3333, + "step": 16578 + }, + { + "epoch": 0.7280151925969022, + "grad_norm": 1.6015625, + "learning_rate": 8.632997632979306e-06, + "loss": 0.3472, + "step": 16580 + }, + { + "epoch": 0.7281030110761056, + "grad_norm": 1.5859375, + "learning_rate": 8.62776868711506e-06, + "loss": 0.3078, + "step": 16582 + }, + { + "epoch": 0.7281908295553092, + "grad_norm": 1.6484375, + "learning_rate": 8.622540995040202e-06, + "loss": 0.3002, + "step": 16584 + }, + { + "epoch": 0.7282786480345127, + "grad_norm": 1.5546875, + "learning_rate": 8.617314557155087e-06, + "loss": 0.3192, + "step": 16586 + }, + { + "epoch": 0.7283664665137162, + "grad_norm": 1.5234375, + "learning_rate": 8.612089373859945e-06, + "loss": 0.3664, + "step": 16588 + }, + { + "epoch": 0.7284542849929196, + "grad_norm": 1.703125, + "learning_rate": 8.606865445554934e-06, + "loss": 0.3197, + "step": 16590 + }, + { + "epoch": 0.7285421034721231, + "grad_norm": 1.578125, + "learning_rate": 8.601642772640097e-06, + "loss": 0.3189, + "step": 16592 + }, + { + "epoch": 0.7286299219513266, + "grad_norm": 1.578125, + "learning_rate": 8.596421355515383e-06, + "loss": 0.3084, + "step": 16594 + }, + { + "epoch": 0.7287177404305301, + "grad_norm": 1.5859375, + "learning_rate": 8.591201194580667e-06, + "loss": 0.3358, + "step": 16596 + }, + { + "epoch": 0.7288055589097335, + "grad_norm": 1.5859375, + "learning_rate": 8.585982290235681e-06, + "loss": 0.3179, + "step": 16598 + }, + { + "epoch": 0.7288933773889371, + "grad_norm": 1.515625, + "learning_rate": 8.580764642880113e-06, + "loss": 0.3107, + "step": 16600 + }, + { + "epoch": 0.7289811958681406, + "grad_norm": 1.5078125, + "learning_rate": 8.575548252913515e-06, + "loss": 0.3655, + "step": 16602 + }, + { + "epoch": 0.7290690143473441, + "grad_norm": 1.515625, + "learning_rate": 8.57033312073538e-06, + "loss": 0.3162, + "step": 16604 + }, + { + "epoch": 0.7291568328265475, + "grad_norm": 1.5625, + "learning_rate": 8.565119246745074e-06, + "loss": 0.3325, + "step": 16606 + }, + { + "epoch": 0.729244651305751, + "grad_norm": 1.5625, + "learning_rate": 8.559906631341866e-06, + "loss": 0.3174, + "step": 16608 + }, + { + "epoch": 0.7293324697849545, + "grad_norm": 1.4921875, + "learning_rate": 8.554695274924956e-06, + "loss": 0.3142, + "step": 16610 + }, + { + "epoch": 0.729420288264158, + "grad_norm": 1.5625, + "learning_rate": 8.549485177893418e-06, + "loss": 0.3224, + "step": 16612 + }, + { + "epoch": 0.7295081067433614, + "grad_norm": 1.6640625, + "learning_rate": 8.544276340646256e-06, + "loss": 0.3582, + "step": 16614 + }, + { + "epoch": 0.729595925222565, + "grad_norm": 1.53125, + "learning_rate": 8.539068763582347e-06, + "loss": 0.3463, + "step": 16616 + }, + { + "epoch": 0.7296837437017685, + "grad_norm": 1.5078125, + "learning_rate": 8.533862447100511e-06, + "loss": 0.297, + "step": 16618 + }, + { + "epoch": 0.7297715621809719, + "grad_norm": 1.5546875, + "learning_rate": 8.528657391599431e-06, + "loss": 0.3059, + "step": 16620 + }, + { + "epoch": 0.7298593806601754, + "grad_norm": 1.5625, + "learning_rate": 8.52345359747771e-06, + "loss": 0.3107, + "step": 16622 + }, + { + "epoch": 0.7299471991393789, + "grad_norm": 1.5078125, + "learning_rate": 8.51825106513387e-06, + "loss": 0.3003, + "step": 16624 + }, + { + "epoch": 0.7300350176185824, + "grad_norm": 1.515625, + "learning_rate": 8.513049794966305e-06, + "loss": 0.3071, + "step": 16626 + }, + { + "epoch": 0.7301228360977858, + "grad_norm": 1.46875, + "learning_rate": 8.507849787373356e-06, + "loss": 0.3396, + "step": 16628 + }, + { + "epoch": 0.7302106545769894, + "grad_norm": 1.484375, + "learning_rate": 8.50265104275321e-06, + "loss": 0.3191, + "step": 16630 + }, + { + "epoch": 0.7302984730561929, + "grad_norm": 1.5546875, + "learning_rate": 8.497453561504007e-06, + "loss": 0.2972, + "step": 16632 + }, + { + "epoch": 0.7303862915353964, + "grad_norm": 1.71875, + "learning_rate": 8.492257344023769e-06, + "loss": 0.3168, + "step": 16634 + }, + { + "epoch": 0.7304741100145998, + "grad_norm": 1.5078125, + "learning_rate": 8.48706239071041e-06, + "loss": 0.3114, + "step": 16636 + }, + { + "epoch": 0.7305619284938033, + "grad_norm": 1.4375, + "learning_rate": 8.481868701961782e-06, + "loss": 0.3148, + "step": 16638 + }, + { + "epoch": 0.7306497469730068, + "grad_norm": 1.515625, + "learning_rate": 8.476676278175597e-06, + "loss": 0.3238, + "step": 16640 + }, + { + "epoch": 0.7307375654522102, + "grad_norm": 1.6953125, + "learning_rate": 8.471485119749514e-06, + "loss": 0.3176, + "step": 16642 + }, + { + "epoch": 0.7308253839314137, + "grad_norm": 1.53125, + "learning_rate": 8.466295227081061e-06, + "loss": 0.3225, + "step": 16644 + }, + { + "epoch": 0.7309132024106173, + "grad_norm": 1.671875, + "learning_rate": 8.461106600567679e-06, + "loss": 0.3377, + "step": 16646 + }, + { + "epoch": 0.7310010208898208, + "grad_norm": 1.5859375, + "learning_rate": 8.455919240606722e-06, + "loss": 0.3063, + "step": 16648 + }, + { + "epoch": 0.7310888393690242, + "grad_norm": 1.5546875, + "learning_rate": 8.450733147595427e-06, + "loss": 0.3358, + "step": 16650 + }, + { + "epoch": 0.7311766578482277, + "grad_norm": 1.6015625, + "learning_rate": 8.445548321930966e-06, + "loss": 0.3395, + "step": 16652 + }, + { + "epoch": 0.7312644763274312, + "grad_norm": 1.5390625, + "learning_rate": 8.44036476401038e-06, + "loss": 0.3459, + "step": 16654 + }, + { + "epoch": 0.7313522948066347, + "grad_norm": 1.59375, + "learning_rate": 8.435182474230624e-06, + "loss": 0.3312, + "step": 16656 + }, + { + "epoch": 0.7314401132858381, + "grad_norm": 1.6328125, + "learning_rate": 8.430001452988582e-06, + "loss": 0.3066, + "step": 16658 + }, + { + "epoch": 0.7315279317650416, + "grad_norm": 1.546875, + "learning_rate": 8.424821700680982e-06, + "loss": 0.3105, + "step": 16660 + }, + { + "epoch": 0.7316157502442452, + "grad_norm": 1.53125, + "learning_rate": 8.419643217704517e-06, + "loss": 0.3185, + "step": 16662 + }, + { + "epoch": 0.7317035687234487, + "grad_norm": 1.5546875, + "learning_rate": 8.414466004455743e-06, + "loss": 0.3262, + "step": 16664 + }, + { + "epoch": 0.7317913872026521, + "grad_norm": 1.5625, + "learning_rate": 8.409290061331145e-06, + "loss": 0.3022, + "step": 16666 + }, + { + "epoch": 0.7318792056818556, + "grad_norm": 1.5078125, + "learning_rate": 8.404115388727093e-06, + "loss": 0.3006, + "step": 16668 + }, + { + "epoch": 0.7319670241610591, + "grad_norm": 1.5234375, + "learning_rate": 8.398941987039854e-06, + "loss": 0.3115, + "step": 16670 + }, + { + "epoch": 0.7320548426402625, + "grad_norm": 1.5703125, + "learning_rate": 8.393769856665626e-06, + "loss": 0.3167, + "step": 16672 + }, + { + "epoch": 0.732142661119466, + "grad_norm": 1.4453125, + "learning_rate": 8.388598998000472e-06, + "loss": 0.284, + "step": 16674 + }, + { + "epoch": 0.7322304795986696, + "grad_norm": 1.5078125, + "learning_rate": 8.383429411440399e-06, + "loss": 0.2953, + "step": 16676 + }, + { + "epoch": 0.7323182980778731, + "grad_norm": 1.546875, + "learning_rate": 8.378261097381285e-06, + "loss": 0.3249, + "step": 16678 + }, + { + "epoch": 0.7324061165570765, + "grad_norm": 1.6171875, + "learning_rate": 8.373094056218913e-06, + "loss": 0.2998, + "step": 16680 + }, + { + "epoch": 0.73249393503628, + "grad_norm": 1.53125, + "learning_rate": 8.367928288348992e-06, + "loss": 0.3415, + "step": 16682 + }, + { + "epoch": 0.7325817535154835, + "grad_norm": 1.46875, + "learning_rate": 8.3627637941671e-06, + "loss": 0.2889, + "step": 16684 + }, + { + "epoch": 0.732669571994687, + "grad_norm": 1.5859375, + "learning_rate": 8.357600574068755e-06, + "loss": 0.3368, + "step": 16686 + }, + { + "epoch": 0.7327573904738904, + "grad_norm": 1.5546875, + "learning_rate": 8.352438628449347e-06, + "loss": 0.2997, + "step": 16688 + }, + { + "epoch": 0.7328452089530939, + "grad_norm": 1.53125, + "learning_rate": 8.347277957704178e-06, + "loss": 0.3069, + "step": 16690 + }, + { + "epoch": 0.7329330274322975, + "grad_norm": 1.5078125, + "learning_rate": 8.34211856222845e-06, + "loss": 0.3195, + "step": 16692 + }, + { + "epoch": 0.733020845911501, + "grad_norm": 1.59375, + "learning_rate": 8.33696044241728e-06, + "loss": 0.3095, + "step": 16694 + }, + { + "epoch": 0.7331086643907044, + "grad_norm": 1.5078125, + "learning_rate": 8.331803598665674e-06, + "loss": 0.2801, + "step": 16696 + }, + { + "epoch": 0.7331964828699079, + "grad_norm": 1.5, + "learning_rate": 8.326648031368536e-06, + "loss": 0.3288, + "step": 16698 + }, + { + "epoch": 0.7332843013491114, + "grad_norm": 1.5703125, + "learning_rate": 8.321493740920699e-06, + "loss": 0.3092, + "step": 16700 + }, + { + "epoch": 0.7333721198283149, + "grad_norm": 1.5625, + "learning_rate": 8.316340727716862e-06, + "loss": 0.3149, + "step": 16702 + }, + { + "epoch": 0.7334599383075183, + "grad_norm": 1.6484375, + "learning_rate": 8.311188992151656e-06, + "loss": 0.3189, + "step": 16704 + }, + { + "epoch": 0.7335477567867218, + "grad_norm": 1.4296875, + "learning_rate": 8.3060385346196e-06, + "loss": 0.3034, + "step": 16706 + }, + { + "epoch": 0.7336355752659254, + "grad_norm": 1.5078125, + "learning_rate": 8.300889355515107e-06, + "loss": 0.3473, + "step": 16708 + }, + { + "epoch": 0.7337233937451288, + "grad_norm": 1.453125, + "learning_rate": 8.295741455232517e-06, + "loss": 0.3084, + "step": 16710 + }, + { + "epoch": 0.7338112122243323, + "grad_norm": 1.546875, + "learning_rate": 8.290594834166043e-06, + "loss": 0.332, + "step": 16712 + }, + { + "epoch": 0.7338990307035358, + "grad_norm": 1.515625, + "learning_rate": 8.285449492709829e-06, + "loss": 0.325, + "step": 16714 + }, + { + "epoch": 0.7339868491827393, + "grad_norm": 1.5078125, + "learning_rate": 8.2803054312579e-06, + "loss": 0.3286, + "step": 16716 + }, + { + "epoch": 0.7340746676619427, + "grad_norm": 1.5078125, + "learning_rate": 8.275162650204182e-06, + "loss": 0.3368, + "step": 16718 + }, + { + "epoch": 0.7341624861411462, + "grad_norm": 1.78125, + "learning_rate": 8.270021149942536e-06, + "loss": 0.3279, + "step": 16720 + }, + { + "epoch": 0.7342503046203498, + "grad_norm": 1.609375, + "learning_rate": 8.26488093086666e-06, + "loss": 0.3313, + "step": 16722 + }, + { + "epoch": 0.7343381230995533, + "grad_norm": 1.59375, + "learning_rate": 8.259741993370227e-06, + "loss": 0.3095, + "step": 16724 + }, + { + "epoch": 0.7344259415787567, + "grad_norm": 1.5625, + "learning_rate": 8.254604337846753e-06, + "loss": 0.3407, + "step": 16726 + }, + { + "epoch": 0.7345137600579602, + "grad_norm": 1.671875, + "learning_rate": 8.2494679646897e-06, + "loss": 0.3175, + "step": 16728 + }, + { + "epoch": 0.7346015785371637, + "grad_norm": 1.6328125, + "learning_rate": 8.24433287429241e-06, + "loss": 0.3401, + "step": 16730 + }, + { + "epoch": 0.7346893970163672, + "grad_norm": 1.4453125, + "learning_rate": 8.239199067048114e-06, + "loss": 0.316, + "step": 16732 + }, + { + "epoch": 0.7347772154955706, + "grad_norm": 1.484375, + "learning_rate": 8.23406654334998e-06, + "loss": 0.3004, + "step": 16734 + }, + { + "epoch": 0.7348650339747741, + "grad_norm": 1.515625, + "learning_rate": 8.22893530359104e-06, + "loss": 0.319, + "step": 16736 + }, + { + "epoch": 0.7349528524539777, + "grad_norm": 1.546875, + "learning_rate": 8.223805348164265e-06, + "loss": 0.319, + "step": 16738 + }, + { + "epoch": 0.7350406709331812, + "grad_norm": 1.53125, + "learning_rate": 8.218676677462497e-06, + "loss": 0.3369, + "step": 16740 + }, + { + "epoch": 0.7351284894123846, + "grad_norm": 1.5703125, + "learning_rate": 8.213549291878483e-06, + "loss": 0.3231, + "step": 16742 + }, + { + "epoch": 0.7352163078915881, + "grad_norm": 1.4921875, + "learning_rate": 8.208423191804899e-06, + "loss": 0.3173, + "step": 16744 + }, + { + "epoch": 0.7353041263707916, + "grad_norm": 1.3984375, + "learning_rate": 8.20329837763428e-06, + "loss": 0.3021, + "step": 16746 + }, + { + "epoch": 0.735391944849995, + "grad_norm": 1.7578125, + "learning_rate": 8.19817484975911e-06, + "loss": 0.32, + "step": 16748 + }, + { + "epoch": 0.7354797633291985, + "grad_norm": 1.578125, + "learning_rate": 8.193052608571736e-06, + "loss": 0.309, + "step": 16750 + }, + { + "epoch": 0.735567581808402, + "grad_norm": 1.5859375, + "learning_rate": 8.18793165446442e-06, + "loss": 0.2867, + "step": 16752 + }, + { + "epoch": 0.7356554002876056, + "grad_norm": 1.46875, + "learning_rate": 8.182811987829323e-06, + "loss": 0.3224, + "step": 16754 + }, + { + "epoch": 0.735743218766809, + "grad_norm": 1.421875, + "learning_rate": 8.177693609058521e-06, + "loss": 0.3212, + "step": 16756 + }, + { + "epoch": 0.7358310372460125, + "grad_norm": 1.578125, + "learning_rate": 8.172576518543976e-06, + "loss": 0.3462, + "step": 16758 + }, + { + "epoch": 0.735918855725216, + "grad_norm": 1.5390625, + "learning_rate": 8.167460716677546e-06, + "loss": 0.3368, + "step": 16760 + }, + { + "epoch": 0.7360066742044195, + "grad_norm": 1.5234375, + "learning_rate": 8.16234620385102e-06, + "loss": 0.3412, + "step": 16762 + }, + { + "epoch": 0.7360944926836229, + "grad_norm": 1.4921875, + "learning_rate": 8.157232980456047e-06, + "loss": 0.3209, + "step": 16764 + }, + { + "epoch": 0.7361823111628264, + "grad_norm": 1.5234375, + "learning_rate": 8.15212104688422e-06, + "loss": 0.2946, + "step": 16766 + }, + { + "epoch": 0.7362701296420299, + "grad_norm": 1.5234375, + "learning_rate": 8.147010403527003e-06, + "loss": 0.3094, + "step": 16768 + }, + { + "epoch": 0.7363579481212335, + "grad_norm": 1.515625, + "learning_rate": 8.141901050775758e-06, + "loss": 0.3224, + "step": 16770 + }, + { + "epoch": 0.7364457666004369, + "grad_norm": 1.53125, + "learning_rate": 8.136792989021783e-06, + "loss": 0.3271, + "step": 16772 + }, + { + "epoch": 0.7365335850796404, + "grad_norm": 1.7265625, + "learning_rate": 8.131686218656231e-06, + "loss": 0.3339, + "step": 16774 + }, + { + "epoch": 0.7366214035588439, + "grad_norm": 1.5078125, + "learning_rate": 8.126580740070202e-06, + "loss": 0.311, + "step": 16776 + }, + { + "epoch": 0.7367092220380473, + "grad_norm": 1.6328125, + "learning_rate": 8.121476553654666e-06, + "loss": 0.309, + "step": 16778 + }, + { + "epoch": 0.7367970405172508, + "grad_norm": 1.5703125, + "learning_rate": 8.116373659800502e-06, + "loss": 0.3169, + "step": 16780 + }, + { + "epoch": 0.7368848589964543, + "grad_norm": 1.5, + "learning_rate": 8.11127205889849e-06, + "loss": 0.331, + "step": 16782 + }, + { + "epoch": 0.7369726774756579, + "grad_norm": 1.578125, + "learning_rate": 8.106171751339303e-06, + "loss": 0.3081, + "step": 16784 + }, + { + "epoch": 0.7370604959548613, + "grad_norm": 1.5234375, + "learning_rate": 8.10107273751354e-06, + "loss": 0.3114, + "step": 16786 + }, + { + "epoch": 0.7371483144340648, + "grad_norm": 1.546875, + "learning_rate": 8.095975017811671e-06, + "loss": 0.332, + "step": 16788 + }, + { + "epoch": 0.7372361329132683, + "grad_norm": 1.546875, + "learning_rate": 8.090878592624097e-06, + "loss": 0.3164, + "step": 16790 + }, + { + "epoch": 0.7373239513924718, + "grad_norm": 1.4921875, + "learning_rate": 8.085783462341093e-06, + "loss": 0.3269, + "step": 16792 + }, + { + "epoch": 0.7374117698716752, + "grad_norm": 1.5078125, + "learning_rate": 8.080689627352837e-06, + "loss": 0.3515, + "step": 16794 + }, + { + "epoch": 0.7374995883508787, + "grad_norm": 1.5390625, + "learning_rate": 8.075597088049434e-06, + "loss": 0.2985, + "step": 16796 + }, + { + "epoch": 0.7375874068300822, + "grad_norm": 1.59375, + "learning_rate": 8.070505844820853e-06, + "loss": 0.3259, + "step": 16798 + }, + { + "epoch": 0.7376752253092858, + "grad_norm": 1.5078125, + "learning_rate": 8.065415898057003e-06, + "loss": 0.328, + "step": 16800 + }, + { + "epoch": 0.7377630437884892, + "grad_norm": 1.4765625, + "learning_rate": 8.060327248147662e-06, + "loss": 0.2985, + "step": 16802 + }, + { + "epoch": 0.7378508622676927, + "grad_norm": 1.4765625, + "learning_rate": 8.055239895482514e-06, + "loss": 0.3036, + "step": 16804 + }, + { + "epoch": 0.7379386807468962, + "grad_norm": 1.484375, + "learning_rate": 8.050153840451163e-06, + "loss": 0.3052, + "step": 16806 + }, + { + "epoch": 0.7380264992260996, + "grad_norm": 1.5234375, + "learning_rate": 8.045069083443088e-06, + "loss": 0.339, + "step": 16808 + }, + { + "epoch": 0.7381143177053031, + "grad_norm": 1.4609375, + "learning_rate": 8.039985624847692e-06, + "loss": 0.3002, + "step": 16810 + }, + { + "epoch": 0.7382021361845066, + "grad_norm": 1.546875, + "learning_rate": 8.034903465054266e-06, + "loss": 0.3592, + "step": 16812 + }, + { + "epoch": 0.7382899546637101, + "grad_norm": 1.53125, + "learning_rate": 8.029822604451997e-06, + "loss": 0.3023, + "step": 16814 + }, + { + "epoch": 0.7383777731429136, + "grad_norm": 1.5390625, + "learning_rate": 8.024743043429975e-06, + "loss": 0.3548, + "step": 16816 + }, + { + "epoch": 0.7384655916221171, + "grad_norm": 1.6015625, + "learning_rate": 8.019664782377207e-06, + "loss": 0.3256, + "step": 16818 + }, + { + "epoch": 0.7385534101013206, + "grad_norm": 1.59375, + "learning_rate": 8.014587821682578e-06, + "loss": 0.3362, + "step": 16820 + }, + { + "epoch": 0.7386412285805241, + "grad_norm": 1.53125, + "learning_rate": 8.009512161734881e-06, + "loss": 0.2868, + "step": 16822 + }, + { + "epoch": 0.7387290470597275, + "grad_norm": 1.453125, + "learning_rate": 8.004437802922823e-06, + "loss": 0.3509, + "step": 16824 + }, + { + "epoch": 0.738816865538931, + "grad_norm": 1.5234375, + "learning_rate": 7.999364745634982e-06, + "loss": 0.3372, + "step": 16826 + }, + { + "epoch": 0.7389046840181345, + "grad_norm": 1.5546875, + "learning_rate": 7.994292990259875e-06, + "loss": 0.3092, + "step": 16828 + }, + { + "epoch": 0.738992502497338, + "grad_norm": 1.4609375, + "learning_rate": 7.989222537185886e-06, + "loss": 0.3271, + "step": 16830 + }, + { + "epoch": 0.7390803209765415, + "grad_norm": 1.6640625, + "learning_rate": 7.984153386801304e-06, + "loss": 0.3161, + "step": 16832 + }, + { + "epoch": 0.739168139455745, + "grad_norm": 1.546875, + "learning_rate": 7.979085539494344e-06, + "loss": 0.3168, + "step": 16834 + }, + { + "epoch": 0.7392559579349485, + "grad_norm": 1.5078125, + "learning_rate": 7.974018995653085e-06, + "loss": 0.3135, + "step": 16836 + }, + { + "epoch": 0.7393437764141519, + "grad_norm": 1.4609375, + "learning_rate": 7.96895375566554e-06, + "loss": 0.3193, + "step": 16838 + }, + { + "epoch": 0.7394315948933554, + "grad_norm": 1.515625, + "learning_rate": 7.963889819919599e-06, + "loss": 0.3187, + "step": 16840 + }, + { + "epoch": 0.7395194133725589, + "grad_norm": 1.578125, + "learning_rate": 7.95882718880306e-06, + "loss": 0.3404, + "step": 16842 + }, + { + "epoch": 0.7396072318517624, + "grad_norm": 1.53125, + "learning_rate": 7.953765862703622e-06, + "loss": 0.3411, + "step": 16844 + }, + { + "epoch": 0.7396950503309659, + "grad_norm": 1.5078125, + "learning_rate": 7.948705842008869e-06, + "loss": 0.3225, + "step": 16846 + }, + { + "epoch": 0.7397828688101694, + "grad_norm": 1.5234375, + "learning_rate": 7.943647127106318e-06, + "loss": 0.346, + "step": 16848 + }, + { + "epoch": 0.7398706872893729, + "grad_norm": 1.515625, + "learning_rate": 7.938589718383354e-06, + "loss": 0.2811, + "step": 16850 + }, + { + "epoch": 0.7399585057685764, + "grad_norm": 1.59375, + "learning_rate": 7.933533616227284e-06, + "loss": 0.3548, + "step": 16852 + }, + { + "epoch": 0.7400463242477798, + "grad_norm": 1.4453125, + "learning_rate": 7.928478821025304e-06, + "loss": 0.2942, + "step": 16854 + }, + { + "epoch": 0.7401341427269833, + "grad_norm": 1.5078125, + "learning_rate": 7.923425333164497e-06, + "loss": 0.3173, + "step": 16856 + }, + { + "epoch": 0.7402219612061868, + "grad_norm": 1.6484375, + "learning_rate": 7.918373153031882e-06, + "loss": 0.2976, + "step": 16858 + }, + { + "epoch": 0.7403097796853902, + "grad_norm": 1.5390625, + "learning_rate": 7.913322281014337e-06, + "loss": 0.3343, + "step": 16860 + }, + { + "epoch": 0.7403975981645938, + "grad_norm": 1.5546875, + "learning_rate": 7.908272717498674e-06, + "loss": 0.3248, + "step": 16862 + }, + { + "epoch": 0.7404854166437973, + "grad_norm": 1.4921875, + "learning_rate": 7.903224462871586e-06, + "loss": 0.3092, + "step": 16864 + }, + { + "epoch": 0.7405732351230008, + "grad_norm": 1.453125, + "learning_rate": 7.898177517519659e-06, + "loss": 0.3267, + "step": 16866 + }, + { + "epoch": 0.7406610536022042, + "grad_norm": 1.5078125, + "learning_rate": 7.893131881829405e-06, + "loss": 0.3049, + "step": 16868 + }, + { + "epoch": 0.7407488720814077, + "grad_norm": 1.4296875, + "learning_rate": 7.888087556187201e-06, + "loss": 0.3228, + "step": 16870 + }, + { + "epoch": 0.7408366905606112, + "grad_norm": 1.546875, + "learning_rate": 7.883044540979373e-06, + "loss": 0.2953, + "step": 16872 + }, + { + "epoch": 0.7409245090398147, + "grad_norm": 1.625, + "learning_rate": 7.878002836592082e-06, + "loss": 0.331, + "step": 16874 + }, + { + "epoch": 0.7410123275190182, + "grad_norm": 1.640625, + "learning_rate": 7.872962443411445e-06, + "loss": 0.3062, + "step": 16876 + }, + { + "epoch": 0.7411001459982217, + "grad_norm": 1.4921875, + "learning_rate": 7.86792336182344e-06, + "loss": 0.3085, + "step": 16878 + }, + { + "epoch": 0.7411879644774252, + "grad_norm": 1.7265625, + "learning_rate": 7.86288559221398e-06, + "loss": 0.3122, + "step": 16880 + }, + { + "epoch": 0.7412757829566287, + "grad_norm": 1.5859375, + "learning_rate": 7.85784913496885e-06, + "loss": 0.3141, + "step": 16882 + }, + { + "epoch": 0.7413636014358321, + "grad_norm": 1.5703125, + "learning_rate": 7.852813990473734e-06, + "loss": 0.3276, + "step": 16884 + }, + { + "epoch": 0.7414514199150356, + "grad_norm": 1.5, + "learning_rate": 7.847780159114243e-06, + "loss": 0.3191, + "step": 16886 + }, + { + "epoch": 0.7415392383942391, + "grad_norm": 1.7734375, + "learning_rate": 7.84274764127585e-06, + "loss": 0.3259, + "step": 16888 + }, + { + "epoch": 0.7416270568734425, + "grad_norm": 1.640625, + "learning_rate": 7.837716437343961e-06, + "loss": 0.31, + "step": 16890 + }, + { + "epoch": 0.7417148753526461, + "grad_norm": 1.5078125, + "learning_rate": 7.832686547703866e-06, + "loss": 0.344, + "step": 16892 + }, + { + "epoch": 0.7418026938318496, + "grad_norm": 1.4453125, + "learning_rate": 7.827657972740738e-06, + "loss": 0.3119, + "step": 16894 + }, + { + "epoch": 0.7418905123110531, + "grad_norm": 1.6015625, + "learning_rate": 7.82263071283969e-06, + "loss": 0.3151, + "step": 16896 + }, + { + "epoch": 0.7419783307902565, + "grad_norm": 1.5078125, + "learning_rate": 7.81760476838569e-06, + "loss": 0.3009, + "step": 16898 + }, + { + "epoch": 0.74206614926946, + "grad_norm": 1.4453125, + "learning_rate": 7.812580139763646e-06, + "loss": 0.3434, + "step": 16900 + }, + { + "epoch": 0.7421539677486635, + "grad_norm": 1.6171875, + "learning_rate": 7.807556827358331e-06, + "loss": 0.3139, + "step": 16902 + }, + { + "epoch": 0.742241786227867, + "grad_norm": 1.46875, + "learning_rate": 7.802534831554437e-06, + "loss": 0.3411, + "step": 16904 + }, + { + "epoch": 0.7423296047070704, + "grad_norm": 1.4453125, + "learning_rate": 7.797514152736548e-06, + "loss": 0.3141, + "step": 16906 + }, + { + "epoch": 0.742417423186274, + "grad_norm": 1.4921875, + "learning_rate": 7.792494791289142e-06, + "loss": 0.301, + "step": 16908 + }, + { + "epoch": 0.7425052416654775, + "grad_norm": 1.4609375, + "learning_rate": 7.787476747596618e-06, + "loss": 0.3115, + "step": 16910 + }, + { + "epoch": 0.742593060144681, + "grad_norm": 1.5234375, + "learning_rate": 7.782460022043242e-06, + "loss": 0.3402, + "step": 16912 + }, + { + "epoch": 0.7426808786238844, + "grad_norm": 1.53125, + "learning_rate": 7.777444615013213e-06, + "loss": 0.3368, + "step": 16914 + }, + { + "epoch": 0.7427686971030879, + "grad_norm": 1.6015625, + "learning_rate": 7.772430526890603e-06, + "loss": 0.3026, + "step": 16916 + }, + { + "epoch": 0.7428565155822914, + "grad_norm": 1.5078125, + "learning_rate": 7.767417758059386e-06, + "loss": 0.3252, + "step": 16918 + }, + { + "epoch": 0.7429443340614948, + "grad_norm": 1.4921875, + "learning_rate": 7.762406308903458e-06, + "loss": 0.3343, + "step": 16920 + }, + { + "epoch": 0.7430321525406984, + "grad_norm": 1.5625, + "learning_rate": 7.757396179806576e-06, + "loss": 0.3512, + "step": 16922 + }, + { + "epoch": 0.7431199710199019, + "grad_norm": 1.734375, + "learning_rate": 7.752387371152436e-06, + "loss": 0.3558, + "step": 16924 + }, + { + "epoch": 0.7432077894991054, + "grad_norm": 1.4609375, + "learning_rate": 7.747379883324606e-06, + "loss": 0.3414, + "step": 16926 + }, + { + "epoch": 0.7432956079783088, + "grad_norm": 1.5, + "learning_rate": 7.742373716706556e-06, + "loss": 0.3374, + "step": 16928 + }, + { + "epoch": 0.7433834264575123, + "grad_norm": 1.5859375, + "learning_rate": 7.737368871681666e-06, + "loss": 0.3306, + "step": 16930 + }, + { + "epoch": 0.7434712449367158, + "grad_norm": 1.5390625, + "learning_rate": 7.732365348633203e-06, + "loss": 0.2976, + "step": 16932 + }, + { + "epoch": 0.7435590634159193, + "grad_norm": 1.46875, + "learning_rate": 7.727363147944352e-06, + "loss": 0.3012, + "step": 16934 + }, + { + "epoch": 0.7436468818951227, + "grad_norm": 1.46875, + "learning_rate": 7.722362269998159e-06, + "loss": 0.2872, + "step": 16936 + }, + { + "epoch": 0.7437347003743263, + "grad_norm": 1.5, + "learning_rate": 7.717362715177611e-06, + "loss": 0.3526, + "step": 16938 + }, + { + "epoch": 0.7438225188535298, + "grad_norm": 1.671875, + "learning_rate": 7.712364483865564e-06, + "loss": 0.3291, + "step": 16940 + }, + { + "epoch": 0.7439103373327333, + "grad_norm": 1.5546875, + "learning_rate": 7.707367576444796e-06, + "loss": 0.3077, + "step": 16942 + }, + { + "epoch": 0.7439981558119367, + "grad_norm": 1.4609375, + "learning_rate": 7.702371993297963e-06, + "loss": 0.3227, + "step": 16944 + }, + { + "epoch": 0.7440859742911402, + "grad_norm": 1.515625, + "learning_rate": 7.697377734807623e-06, + "loss": 0.319, + "step": 16946 + }, + { + "epoch": 0.7441737927703437, + "grad_norm": 1.53125, + "learning_rate": 7.69238480135625e-06, + "loss": 0.3222, + "step": 16948 + }, + { + "epoch": 0.7442616112495472, + "grad_norm": 1.5390625, + "learning_rate": 7.68739319332619e-06, + "loss": 0.3384, + "step": 16950 + }, + { + "epoch": 0.7443494297287506, + "grad_norm": 1.53125, + "learning_rate": 7.682402911099717e-06, + "loss": 0.2967, + "step": 16952 + }, + { + "epoch": 0.7444372482079542, + "grad_norm": 1.5390625, + "learning_rate": 7.677413955058982e-06, + "loss": 0.3151, + "step": 16954 + }, + { + "epoch": 0.7445250666871577, + "grad_norm": 1.5, + "learning_rate": 7.67242632558603e-06, + "loss": 0.309, + "step": 16956 + }, + { + "epoch": 0.7446128851663611, + "grad_norm": 1.5234375, + "learning_rate": 7.667440023062833e-06, + "loss": 0.3118, + "step": 16958 + }, + { + "epoch": 0.7447007036455646, + "grad_norm": 1.5703125, + "learning_rate": 7.662455047871226e-06, + "loss": 0.3044, + "step": 16960 + }, + { + "epoch": 0.7447885221247681, + "grad_norm": 1.5390625, + "learning_rate": 7.657471400392974e-06, + "loss": 0.3071, + "step": 16962 + }, + { + "epoch": 0.7448763406039716, + "grad_norm": 1.609375, + "learning_rate": 7.652489081009718e-06, + "loss": 0.3313, + "step": 16964 + }, + { + "epoch": 0.744964159083175, + "grad_norm": 1.453125, + "learning_rate": 7.647508090103009e-06, + "loss": 0.3014, + "step": 16966 + }, + { + "epoch": 0.7450519775623785, + "grad_norm": 1.5234375, + "learning_rate": 7.642528428054288e-06, + "loss": 0.3124, + "step": 16968 + }, + { + "epoch": 0.7451397960415821, + "grad_norm": 1.59375, + "learning_rate": 7.637550095244894e-06, + "loss": 0.2996, + "step": 16970 + }, + { + "epoch": 0.7452276145207856, + "grad_norm": 1.6328125, + "learning_rate": 7.632573092056086e-06, + "loss": 0.3438, + "step": 16972 + }, + { + "epoch": 0.745315432999989, + "grad_norm": 1.53125, + "learning_rate": 7.627597418868984e-06, + "loss": 0.3348, + "step": 16974 + }, + { + "epoch": 0.7454032514791925, + "grad_norm": 1.546875, + "learning_rate": 7.622623076064645e-06, + "loss": 0.3366, + "step": 16976 + }, + { + "epoch": 0.745491069958396, + "grad_norm": 1.6640625, + "learning_rate": 7.617650064023996e-06, + "loss": 0.3047, + "step": 16978 + }, + { + "epoch": 0.7455788884375995, + "grad_norm": 1.5546875, + "learning_rate": 7.6126783831278605e-06, + "loss": 0.3447, + "step": 16980 + }, + { + "epoch": 0.7456667069168029, + "grad_norm": 1.5625, + "learning_rate": 7.607708033756994e-06, + "loss": 0.3101, + "step": 16982 + }, + { + "epoch": 0.7457545253960065, + "grad_norm": 1.546875, + "learning_rate": 7.602739016292007e-06, + "loss": 0.31, + "step": 16984 + }, + { + "epoch": 0.74584234387521, + "grad_norm": 1.5625, + "learning_rate": 7.5977713311134454e-06, + "loss": 0.3373, + "step": 16986 + }, + { + "epoch": 0.7459301623544135, + "grad_norm": 1.640625, + "learning_rate": 7.592804978601725e-06, + "loss": 0.3055, + "step": 16988 + }, + { + "epoch": 0.7460179808336169, + "grad_norm": 1.53125, + "learning_rate": 7.587839959137166e-06, + "loss": 0.3276, + "step": 16990 + }, + { + "epoch": 0.7461057993128204, + "grad_norm": 1.6015625, + "learning_rate": 7.582876273100004e-06, + "loss": 0.3149, + "step": 16992 + }, + { + "epoch": 0.7461936177920239, + "grad_norm": 1.5703125, + "learning_rate": 7.5779139208703446e-06, + "loss": 0.3327, + "step": 16994 + }, + { + "epoch": 0.7462814362712273, + "grad_norm": 1.5078125, + "learning_rate": 7.572952902828229e-06, + "loss": 0.3176, + "step": 16996 + }, + { + "epoch": 0.7463692547504308, + "grad_norm": 1.53125, + "learning_rate": 7.567993219353542e-06, + "loss": 0.3338, + "step": 16998 + }, + { + "epoch": 0.7464570732296344, + "grad_norm": 1.5078125, + "learning_rate": 7.563034870826121e-06, + "loss": 0.3115, + "step": 17000 + }, + { + "epoch": 0.7465448917088379, + "grad_norm": 1.578125, + "learning_rate": 7.55807785762567e-06, + "loss": 0.3558, + "step": 17002 + }, + { + "epoch": 0.7466327101880413, + "grad_norm": 1.5234375, + "learning_rate": 7.553122180131788e-06, + "loss": 0.3067, + "step": 17004 + }, + { + "epoch": 0.7467205286672448, + "grad_norm": 1.5390625, + "learning_rate": 7.548167838724002e-06, + "loss": 0.3201, + "step": 17006 + }, + { + "epoch": 0.7468083471464483, + "grad_norm": 1.5390625, + "learning_rate": 7.543214833781695e-06, + "loss": 0.3435, + "step": 17008 + }, + { + "epoch": 0.7468961656256518, + "grad_norm": 1.671875, + "learning_rate": 7.538263165684192e-06, + "loss": 0.3126, + "step": 17010 + }, + { + "epoch": 0.7469839841048552, + "grad_norm": 1.5078125, + "learning_rate": 7.533312834810672e-06, + "loss": 0.3172, + "step": 17012 + }, + { + "epoch": 0.7470718025840587, + "grad_norm": 1.578125, + "learning_rate": 7.5283638415402505e-06, + "loss": 0.3332, + "step": 17014 + }, + { + "epoch": 0.7471596210632623, + "grad_norm": 1.5625, + "learning_rate": 7.523416186251917e-06, + "loss": 0.3223, + "step": 17016 + }, + { + "epoch": 0.7472474395424658, + "grad_norm": 1.484375, + "learning_rate": 7.518469869324548e-06, + "loss": 0.3223, + "step": 17018 + }, + { + "epoch": 0.7473352580216692, + "grad_norm": 1.5546875, + "learning_rate": 7.513524891136958e-06, + "loss": 0.3237, + "step": 17020 + }, + { + "epoch": 0.7474230765008727, + "grad_norm": 1.546875, + "learning_rate": 7.5085812520678174e-06, + "loss": 0.3332, + "step": 17022 + }, + { + "epoch": 0.7475108949800762, + "grad_norm": 1.5390625, + "learning_rate": 7.503638952495723e-06, + "loss": 0.3227, + "step": 17024 + }, + { + "epoch": 0.7475987134592796, + "grad_norm": 1.609375, + "learning_rate": 7.498697992799153e-06, + "loss": 0.3195, + "step": 17026 + }, + { + "epoch": 0.7476865319384831, + "grad_norm": 1.53125, + "learning_rate": 7.4937583733564855e-06, + "loss": 0.3361, + "step": 17028 + }, + { + "epoch": 0.7477743504176867, + "grad_norm": 1.6328125, + "learning_rate": 7.488820094545998e-06, + "loss": 0.2991, + "step": 17030 + }, + { + "epoch": 0.7478621688968902, + "grad_norm": 1.5390625, + "learning_rate": 7.483883156745858e-06, + "loss": 0.3425, + "step": 17032 + }, + { + "epoch": 0.7479499873760936, + "grad_norm": 1.484375, + "learning_rate": 7.478947560334151e-06, + "loss": 0.3203, + "step": 17034 + }, + { + "epoch": 0.7480378058552971, + "grad_norm": 1.5390625, + "learning_rate": 7.4740133056888345e-06, + "loss": 0.3273, + "step": 17036 + }, + { + "epoch": 0.7481256243345006, + "grad_norm": 1.515625, + "learning_rate": 7.469080393187786e-06, + "loss": 0.3265, + "step": 17038 + }, + { + "epoch": 0.748213442813704, + "grad_norm": 1.71875, + "learning_rate": 7.464148823208764e-06, + "loss": 0.3364, + "step": 17040 + }, + { + "epoch": 0.7483012612929075, + "grad_norm": 1.4921875, + "learning_rate": 7.459218596129422e-06, + "loss": 0.2967, + "step": 17042 + }, + { + "epoch": 0.748389079772111, + "grad_norm": 1.5546875, + "learning_rate": 7.454289712327333e-06, + "loss": 0.3519, + "step": 17044 + }, + { + "epoch": 0.7484768982513146, + "grad_norm": 1.5859375, + "learning_rate": 7.449362172179936e-06, + "loss": 0.3004, + "step": 17046 + }, + { + "epoch": 0.748564716730518, + "grad_norm": 1.6171875, + "learning_rate": 7.444435976064595e-06, + "loss": 0.2956, + "step": 17048 + }, + { + "epoch": 0.7486525352097215, + "grad_norm": 1.4609375, + "learning_rate": 7.439511124358558e-06, + "loss": 0.3085, + "step": 17050 + }, + { + "epoch": 0.748740353688925, + "grad_norm": 1.6171875, + "learning_rate": 7.434587617438962e-06, + "loss": 0.3035, + "step": 17052 + }, + { + "epoch": 0.7488281721681285, + "grad_norm": 1.484375, + "learning_rate": 7.4296654556828635e-06, + "loss": 0.307, + "step": 17054 + }, + { + "epoch": 0.7489159906473319, + "grad_norm": 1.53125, + "learning_rate": 7.424744639467196e-06, + "loss": 0.3258, + "step": 17056 + }, + { + "epoch": 0.7490038091265354, + "grad_norm": 1.515625, + "learning_rate": 7.419825169168798e-06, + "loss": 0.3134, + "step": 17058 + }, + { + "epoch": 0.7490916276057389, + "grad_norm": 1.46875, + "learning_rate": 7.4149070451643955e-06, + "loss": 0.3302, + "step": 17060 + }, + { + "epoch": 0.7491794460849425, + "grad_norm": 1.484375, + "learning_rate": 7.4099902678306324e-06, + "loss": 0.3126, + "step": 17062 + }, + { + "epoch": 0.7492672645641459, + "grad_norm": 1.5859375, + "learning_rate": 7.405074837544035e-06, + "loss": 0.3404, + "step": 17064 + }, + { + "epoch": 0.7493550830433494, + "grad_norm": 1.4609375, + "learning_rate": 7.400160754681012e-06, + "loss": 0.3275, + "step": 17066 + }, + { + "epoch": 0.7494429015225529, + "grad_norm": 1.578125, + "learning_rate": 7.3952480196179094e-06, + "loss": 0.3284, + "step": 17068 + }, + { + "epoch": 0.7495307200017564, + "grad_norm": 1.484375, + "learning_rate": 7.3903366327309235e-06, + "loss": 0.3025, + "step": 17070 + }, + { + "epoch": 0.7496185384809598, + "grad_norm": 1.4765625, + "learning_rate": 7.385426594396186e-06, + "loss": 0.3241, + "step": 17072 + }, + { + "epoch": 0.7497063569601633, + "grad_norm": 1.59375, + "learning_rate": 7.3805179049896975e-06, + "loss": 0.3365, + "step": 17074 + }, + { + "epoch": 0.7497941754393669, + "grad_norm": 1.65625, + "learning_rate": 7.375610564887378e-06, + "loss": 0.3483, + "step": 17076 + }, + { + "epoch": 0.7498819939185704, + "grad_norm": 1.46875, + "learning_rate": 7.3707045744650265e-06, + "loss": 0.31, + "step": 17078 + }, + { + "epoch": 0.7499698123977738, + "grad_norm": 1.6640625, + "learning_rate": 7.365799934098336e-06, + "loss": 0.3239, + "step": 17080 + }, + { + "epoch": 0.7500576308769773, + "grad_norm": 1.515625, + "learning_rate": 7.360896644162924e-06, + "loss": 0.3389, + "step": 17082 + }, + { + "epoch": 0.7501454493561808, + "grad_norm": 1.5625, + "learning_rate": 7.355994705034267e-06, + "loss": 0.3029, + "step": 17084 + }, + { + "epoch": 0.7502332678353842, + "grad_norm": 1.40625, + "learning_rate": 7.35109411708777e-06, + "loss": 0.3065, + "step": 17086 + }, + { + "epoch": 0.7503210863145877, + "grad_norm": 1.578125, + "learning_rate": 7.346194880698718e-06, + "loss": 0.3347, + "step": 17088 + }, + { + "epoch": 0.7504089047937912, + "grad_norm": 1.6484375, + "learning_rate": 7.341296996242295e-06, + "loss": 0.3161, + "step": 17090 + }, + { + "epoch": 0.7504967232729948, + "grad_norm": 1.5625, + "learning_rate": 7.336400464093579e-06, + "loss": 0.318, + "step": 17092 + }, + { + "epoch": 0.7505845417521982, + "grad_norm": 1.5078125, + "learning_rate": 7.331505284627543e-06, + "loss": 0.3086, + "step": 17094 + }, + { + "epoch": 0.7506723602314017, + "grad_norm": 1.578125, + "learning_rate": 7.326611458219077e-06, + "loss": 0.3083, + "step": 17096 + }, + { + "epoch": 0.7507601787106052, + "grad_norm": 1.546875, + "learning_rate": 7.321718985242931e-06, + "loss": 0.3355, + "step": 17098 + }, + { + "epoch": 0.7508479971898087, + "grad_norm": 1.828125, + "learning_rate": 7.316827866073794e-06, + "loss": 0.3235, + "step": 17100 + }, + { + "epoch": 0.7509358156690121, + "grad_norm": 1.59375, + "learning_rate": 7.3119381010862155e-06, + "loss": 0.2957, + "step": 17102 + }, + { + "epoch": 0.7510236341482156, + "grad_norm": 1.484375, + "learning_rate": 7.307049690654649e-06, + "loss": 0.3141, + "step": 17104 + }, + { + "epoch": 0.7511114526274191, + "grad_norm": 1.5390625, + "learning_rate": 7.30216263515347e-06, + "loss": 0.3129, + "step": 17106 + }, + { + "epoch": 0.7511992711066227, + "grad_norm": 1.53125, + "learning_rate": 7.297276934956909e-06, + "loss": 0.3318, + "step": 17108 + }, + { + "epoch": 0.7512870895858261, + "grad_norm": 1.6640625, + "learning_rate": 7.292392590439132e-06, + "loss": 0.3171, + "step": 17110 + }, + { + "epoch": 0.7513749080650296, + "grad_norm": 1.625, + "learning_rate": 7.287509601974174e-06, + "loss": 0.3326, + "step": 17112 + }, + { + "epoch": 0.7514627265442331, + "grad_norm": 1.53125, + "learning_rate": 7.28262796993597e-06, + "loss": 0.3218, + "step": 17114 + }, + { + "epoch": 0.7515505450234365, + "grad_norm": 1.5546875, + "learning_rate": 7.2777476946983696e-06, + "loss": 0.3128, + "step": 17116 + }, + { + "epoch": 0.75163836350264, + "grad_norm": 1.5703125, + "learning_rate": 7.2728687766351e-06, + "loss": 0.3404, + "step": 17118 + }, + { + "epoch": 0.7517261819818435, + "grad_norm": 1.6953125, + "learning_rate": 7.267991216119791e-06, + "loss": 0.3249, + "step": 17120 + }, + { + "epoch": 0.751814000461047, + "grad_norm": 1.65625, + "learning_rate": 7.263115013525956e-06, + "loss": 0.3063, + "step": 17122 + }, + { + "epoch": 0.7519018189402505, + "grad_norm": 1.640625, + "learning_rate": 7.258240169227032e-06, + "loss": 0.3196, + "step": 17124 + }, + { + "epoch": 0.751989637419454, + "grad_norm": 1.71875, + "learning_rate": 7.25336668359633e-06, + "loss": 0.3088, + "step": 17126 + }, + { + "epoch": 0.7520774558986575, + "grad_norm": 1.4609375, + "learning_rate": 7.248494557007051e-06, + "loss": 0.3015, + "step": 17128 + }, + { + "epoch": 0.752165274377861, + "grad_norm": 1.65625, + "learning_rate": 7.2436237898323236e-06, + "loss": 0.3299, + "step": 17130 + }, + { + "epoch": 0.7522530928570644, + "grad_norm": 1.5234375, + "learning_rate": 7.238754382445137e-06, + "loss": 0.3355, + "step": 17132 + }, + { + "epoch": 0.7523409113362679, + "grad_norm": 1.4921875, + "learning_rate": 7.233886335218404e-06, + "loss": 0.3233, + "step": 17134 + }, + { + "epoch": 0.7524287298154714, + "grad_norm": 1.5390625, + "learning_rate": 7.2290196485249155e-06, + "loss": 0.3121, + "step": 17136 + }, + { + "epoch": 0.752516548294675, + "grad_norm": 1.453125, + "learning_rate": 7.22415432273735e-06, + "loss": 0.3085, + "step": 17138 + }, + { + "epoch": 0.7526043667738784, + "grad_norm": 1.5390625, + "learning_rate": 7.21929035822832e-06, + "loss": 0.3517, + "step": 17140 + }, + { + "epoch": 0.7526921852530819, + "grad_norm": 1.484375, + "learning_rate": 7.214427755370287e-06, + "loss": 0.3077, + "step": 17142 + }, + { + "epoch": 0.7527800037322854, + "grad_norm": 1.4453125, + "learning_rate": 7.209566514535648e-06, + "loss": 0.3223, + "step": 17144 + }, + { + "epoch": 0.7528678222114888, + "grad_norm": 1.5234375, + "learning_rate": 7.204706636096664e-06, + "loss": 0.3242, + "step": 17146 + }, + { + "epoch": 0.7529556406906923, + "grad_norm": 1.9296875, + "learning_rate": 7.199848120425526e-06, + "loss": 0.3434, + "step": 17148 + }, + { + "epoch": 0.7530434591698958, + "grad_norm": 1.546875, + "learning_rate": 7.194990967894269e-06, + "loss": 0.298, + "step": 17150 + }, + { + "epoch": 0.7531312776490993, + "grad_norm": 1.640625, + "learning_rate": 7.1901351788748795e-06, + "loss": 0.288, + "step": 17152 + }, + { + "epoch": 0.7532190961283028, + "grad_norm": 1.5234375, + "learning_rate": 7.1852807537392095e-06, + "loss": 0.3107, + "step": 17154 + }, + { + "epoch": 0.7533069146075063, + "grad_norm": 1.5234375, + "learning_rate": 7.1804276928590015e-06, + "loss": 0.2997, + "step": 17156 + }, + { + "epoch": 0.7533947330867098, + "grad_norm": 1.6015625, + "learning_rate": 7.175575996605918e-06, + "loss": 0.2898, + "step": 17158 + }, + { + "epoch": 0.7534825515659133, + "grad_norm": 1.4765625, + "learning_rate": 7.170725665351493e-06, + "loss": 0.3254, + "step": 17160 + }, + { + "epoch": 0.7535703700451167, + "grad_norm": 1.5703125, + "learning_rate": 7.165876699467175e-06, + "loss": 0.3258, + "step": 17162 + }, + { + "epoch": 0.7536581885243202, + "grad_norm": 1.4765625, + "learning_rate": 7.161029099324299e-06, + "loss": 0.3426, + "step": 17164 + }, + { + "epoch": 0.7537460070035237, + "grad_norm": 1.4375, + "learning_rate": 7.156182865294078e-06, + "loss": 0.2992, + "step": 17166 + }, + { + "epoch": 0.7538338254827271, + "grad_norm": 1.5078125, + "learning_rate": 7.151337997747662e-06, + "loss": 0.3336, + "step": 17168 + }, + { + "epoch": 0.7539216439619307, + "grad_norm": 1.546875, + "learning_rate": 7.14649449705605e-06, + "loss": 0.3151, + "step": 17170 + }, + { + "epoch": 0.7540094624411342, + "grad_norm": 1.515625, + "learning_rate": 7.1416523635901785e-06, + "loss": 0.2882, + "step": 17172 + }, + { + "epoch": 0.7540972809203377, + "grad_norm": 1.5, + "learning_rate": 7.136811597720852e-06, + "loss": 0.3184, + "step": 17174 + }, + { + "epoch": 0.7541850993995411, + "grad_norm": 1.515625, + "learning_rate": 7.131972199818765e-06, + "loss": 0.3324, + "step": 17176 + }, + { + "epoch": 0.7542729178787446, + "grad_norm": 1.46875, + "learning_rate": 7.12713417025454e-06, + "loss": 0.323, + "step": 17178 + }, + { + "epoch": 0.7543607363579481, + "grad_norm": 1.6328125, + "learning_rate": 7.122297509398662e-06, + "loss": 0.3348, + "step": 17180 + }, + { + "epoch": 0.7544485548371516, + "grad_norm": 1.546875, + "learning_rate": 7.117462217621529e-06, + "loss": 0.3198, + "step": 17182 + }, + { + "epoch": 0.7545363733163551, + "grad_norm": 1.5234375, + "learning_rate": 7.112628295293417e-06, + "loss": 0.318, + "step": 17184 + }, + { + "epoch": 0.7546241917955586, + "grad_norm": 1.4609375, + "learning_rate": 7.107795742784526e-06, + "loss": 0.2928, + "step": 17186 + }, + { + "epoch": 0.7547120102747621, + "grad_norm": 1.53125, + "learning_rate": 7.102964560464925e-06, + "loss": 0.3126, + "step": 17188 + }, + { + "epoch": 0.7547998287539656, + "grad_norm": 1.7421875, + "learning_rate": 7.0981347487045825e-06, + "loss": 0.3456, + "step": 17190 + }, + { + "epoch": 0.754887647233169, + "grad_norm": 1.6484375, + "learning_rate": 7.093306307873376e-06, + "loss": 0.3137, + "step": 17192 + }, + { + "epoch": 0.7549754657123725, + "grad_norm": 1.5859375, + "learning_rate": 7.0884792383410615e-06, + "loss": 0.3314, + "step": 17194 + }, + { + "epoch": 0.755063284191576, + "grad_norm": 1.625, + "learning_rate": 7.083653540477306e-06, + "loss": 0.3567, + "step": 17196 + }, + { + "epoch": 0.7551511026707795, + "grad_norm": 1.484375, + "learning_rate": 7.078829214651658e-06, + "loss": 0.308, + "step": 17198 + }, + { + "epoch": 0.755238921149983, + "grad_norm": 1.4453125, + "learning_rate": 7.074006261233559e-06, + "loss": 0.3441, + "step": 17200 + }, + { + "epoch": 0.7553267396291865, + "grad_norm": 1.5703125, + "learning_rate": 7.0691846805923635e-06, + "loss": 0.3215, + "step": 17202 + }, + { + "epoch": 0.75541455810839, + "grad_norm": 1.5, + "learning_rate": 7.064364473097296e-06, + "loss": 0.3256, + "step": 17204 + }, + { + "epoch": 0.7555023765875934, + "grad_norm": 1.4453125, + "learning_rate": 7.059545639117504e-06, + "loss": 0.3148, + "step": 17206 + }, + { + "epoch": 0.7555901950667969, + "grad_norm": 1.46875, + "learning_rate": 7.054728179021999e-06, + "loss": 0.3193, + "step": 17208 + }, + { + "epoch": 0.7556780135460004, + "grad_norm": 1.5859375, + "learning_rate": 7.049912093179728e-06, + "loss": 0.2858, + "step": 17210 + }, + { + "epoch": 0.7557658320252039, + "grad_norm": 1.515625, + "learning_rate": 7.0450973819594785e-06, + "loss": 0.3166, + "step": 17212 + }, + { + "epoch": 0.7558536505044073, + "grad_norm": 1.609375, + "learning_rate": 7.04028404572998e-06, + "loss": 0.3183, + "step": 17214 + }, + { + "epoch": 0.7559414689836109, + "grad_norm": 1.53125, + "learning_rate": 7.035472084859837e-06, + "loss": 0.299, + "step": 17216 + }, + { + "epoch": 0.7560292874628144, + "grad_norm": 1.5546875, + "learning_rate": 7.030661499717539e-06, + "loss": 0.3222, + "step": 17218 + }, + { + "epoch": 0.7561171059420179, + "grad_norm": 1.484375, + "learning_rate": 7.0258522906715e-06, + "loss": 0.3416, + "step": 17220 + }, + { + "epoch": 0.7562049244212213, + "grad_norm": 1.4921875, + "learning_rate": 7.0210444580899925e-06, + "loss": 0.3432, + "step": 17222 + }, + { + "epoch": 0.7562927429004248, + "grad_norm": 1.5625, + "learning_rate": 7.016238002341219e-06, + "loss": 0.316, + "step": 17224 + }, + { + "epoch": 0.7563805613796283, + "grad_norm": 1.4375, + "learning_rate": 7.0114329237932485e-06, + "loss": 0.3184, + "step": 17226 + }, + { + "epoch": 0.7564683798588318, + "grad_norm": 1.53125, + "learning_rate": 7.006629222814048e-06, + "loss": 0.3126, + "step": 17228 + }, + { + "epoch": 0.7565561983380353, + "grad_norm": 1.4453125, + "learning_rate": 7.001826899771505e-06, + "loss": 0.3099, + "step": 17230 + }, + { + "epoch": 0.7566440168172388, + "grad_norm": 1.4609375, + "learning_rate": 6.997025955033365e-06, + "loss": 0.3084, + "step": 17232 + }, + { + "epoch": 0.7567318352964423, + "grad_norm": 1.6171875, + "learning_rate": 6.992226388967302e-06, + "loss": 0.3072, + "step": 17234 + }, + { + "epoch": 0.7568196537756458, + "grad_norm": 1.5078125, + "learning_rate": 6.987428201940854e-06, + "loss": 0.3085, + "step": 17236 + }, + { + "epoch": 0.7569074722548492, + "grad_norm": 1.5546875, + "learning_rate": 6.982631394321468e-06, + "loss": 0.3344, + "step": 17238 + }, + { + "epoch": 0.7569952907340527, + "grad_norm": 1.6171875, + "learning_rate": 6.977835966476503e-06, + "loss": 0.3099, + "step": 17240 + }, + { + "epoch": 0.7570831092132562, + "grad_norm": 1.5703125, + "learning_rate": 6.973041918773168e-06, + "loss": 0.3596, + "step": 17242 + }, + { + "epoch": 0.7571709276924596, + "grad_norm": 1.4765625, + "learning_rate": 6.9682492515786096e-06, + "loss": 0.3381, + "step": 17244 + }, + { + "epoch": 0.7572587461716632, + "grad_norm": 1.5546875, + "learning_rate": 6.963457965259837e-06, + "loss": 0.3254, + "step": 17246 + }, + { + "epoch": 0.7573465646508667, + "grad_norm": 1.5625, + "learning_rate": 6.958668060183785e-06, + "loss": 0.3155, + "step": 17248 + }, + { + "epoch": 0.7574343831300702, + "grad_norm": 1.515625, + "learning_rate": 6.953879536717259e-06, + "loss": 0.3348, + "step": 17250 + }, + { + "epoch": 0.7575222016092736, + "grad_norm": 1.578125, + "learning_rate": 6.949092395226955e-06, + "loss": 0.3602, + "step": 17252 + }, + { + "epoch": 0.7576100200884771, + "grad_norm": 1.4765625, + "learning_rate": 6.944306636079492e-06, + "loss": 0.2985, + "step": 17254 + }, + { + "epoch": 0.7576978385676806, + "grad_norm": 1.53125, + "learning_rate": 6.939522259641346e-06, + "loss": 0.2893, + "step": 17256 + }, + { + "epoch": 0.757785657046884, + "grad_norm": 1.5390625, + "learning_rate": 6.934739266278923e-06, + "loss": 0.3334, + "step": 17258 + }, + { + "epoch": 0.7578734755260875, + "grad_norm": 1.53125, + "learning_rate": 6.929957656358496e-06, + "loss": 0.3047, + "step": 17260 + }, + { + "epoch": 0.7579612940052911, + "grad_norm": 1.4921875, + "learning_rate": 6.925177430246238e-06, + "loss": 0.3304, + "step": 17262 + }, + { + "epoch": 0.7580491124844946, + "grad_norm": 1.5859375, + "learning_rate": 6.920398588308233e-06, + "loss": 0.3467, + "step": 17264 + }, + { + "epoch": 0.758136930963698, + "grad_norm": 1.515625, + "learning_rate": 6.915621130910427e-06, + "loss": 0.3235, + "step": 17266 + }, + { + "epoch": 0.7582247494429015, + "grad_norm": 1.515625, + "learning_rate": 6.9108450584187e-06, + "loss": 0.3322, + "step": 17268 + }, + { + "epoch": 0.758312567922105, + "grad_norm": 1.546875, + "learning_rate": 6.9060703711987944e-06, + "loss": 0.3114, + "step": 17270 + }, + { + "epoch": 0.7584003864013085, + "grad_norm": 1.5, + "learning_rate": 6.9012970696163585e-06, + "loss": 0.3164, + "step": 17272 + }, + { + "epoch": 0.7584882048805119, + "grad_norm": 1.6328125, + "learning_rate": 6.896525154036923e-06, + "loss": 0.323, + "step": 17274 + }, + { + "epoch": 0.7585760233597155, + "grad_norm": 1.578125, + "learning_rate": 6.891754624825939e-06, + "loss": 0.3231, + "step": 17276 + }, + { + "epoch": 0.758663841838919, + "grad_norm": 1.4921875, + "learning_rate": 6.886985482348726e-06, + "loss": 0.3322, + "step": 17278 + }, + { + "epoch": 0.7587516603181225, + "grad_norm": 1.609375, + "learning_rate": 6.8822177269704965e-06, + "loss": 0.3385, + "step": 17280 + }, + { + "epoch": 0.7588394787973259, + "grad_norm": 1.609375, + "learning_rate": 6.877451359056389e-06, + "loss": 0.3327, + "step": 17282 + }, + { + "epoch": 0.7589272972765294, + "grad_norm": 1.59375, + "learning_rate": 6.872686378971391e-06, + "loss": 0.3015, + "step": 17284 + }, + { + "epoch": 0.7590151157557329, + "grad_norm": 1.640625, + "learning_rate": 6.867922787080422e-06, + "loss": 0.3224, + "step": 17286 + }, + { + "epoch": 0.7591029342349364, + "grad_norm": 1.515625, + "learning_rate": 6.863160583748274e-06, + "loss": 0.3328, + "step": 17288 + }, + { + "epoch": 0.7591907527141398, + "grad_norm": 1.6875, + "learning_rate": 6.858399769339627e-06, + "loss": 0.3141, + "step": 17290 + }, + { + "epoch": 0.7592785711933434, + "grad_norm": 1.5546875, + "learning_rate": 6.853640344219084e-06, + "loss": 0.3176, + "step": 17292 + }, + { + "epoch": 0.7593663896725469, + "grad_norm": 1.4921875, + "learning_rate": 6.848882308751106e-06, + "loss": 0.2885, + "step": 17294 + }, + { + "epoch": 0.7594542081517504, + "grad_norm": 1.484375, + "learning_rate": 6.844125663300077e-06, + "loss": 0.3246, + "step": 17296 + }, + { + "epoch": 0.7595420266309538, + "grad_norm": 1.5859375, + "learning_rate": 6.839370408230259e-06, + "loss": 0.3065, + "step": 17298 + }, + { + "epoch": 0.7596298451101573, + "grad_norm": 1.5078125, + "learning_rate": 6.8346165439058e-06, + "loss": 0.3019, + "step": 17300 + }, + { + "epoch": 0.7597176635893608, + "grad_norm": 1.5859375, + "learning_rate": 6.829864070690778e-06, + "loss": 0.3003, + "step": 17302 + }, + { + "epoch": 0.7598054820685642, + "grad_norm": 1.53125, + "learning_rate": 6.825112988949103e-06, + "loss": 0.3029, + "step": 17304 + }, + { + "epoch": 0.7598933005477677, + "grad_norm": 1.5234375, + "learning_rate": 6.820363299044641e-06, + "loss": 0.3294, + "step": 17306 + }, + { + "epoch": 0.7599811190269713, + "grad_norm": 1.5625, + "learning_rate": 6.815615001341108e-06, + "loss": 0.2908, + "step": 17308 + }, + { + "epoch": 0.7600689375061748, + "grad_norm": 1.53125, + "learning_rate": 6.810868096202144e-06, + "loss": 0.3225, + "step": 17310 + }, + { + "epoch": 0.7601567559853782, + "grad_norm": 1.46875, + "learning_rate": 6.806122583991264e-06, + "loss": 0.3106, + "step": 17312 + }, + { + "epoch": 0.7602445744645817, + "grad_norm": 1.6171875, + "learning_rate": 6.801378465071867e-06, + "loss": 0.3209, + "step": 17314 + }, + { + "epoch": 0.7603323929437852, + "grad_norm": 1.5546875, + "learning_rate": 6.7966357398072804e-06, + "loss": 0.3088, + "step": 17316 + }, + { + "epoch": 0.7604202114229887, + "grad_norm": 1.5, + "learning_rate": 6.791894408560681e-06, + "loss": 0.3274, + "step": 17318 + }, + { + "epoch": 0.7605080299021921, + "grad_norm": 1.5703125, + "learning_rate": 6.787154471695184e-06, + "loss": 0.3167, + "step": 17320 + }, + { + "epoch": 0.7605958483813956, + "grad_norm": 1.4609375, + "learning_rate": 6.7824159295737625e-06, + "loss": 0.3075, + "step": 17322 + }, + { + "epoch": 0.7606836668605992, + "grad_norm": 1.515625, + "learning_rate": 6.777678782559288e-06, + "loss": 0.3066, + "step": 17324 + }, + { + "epoch": 0.7607714853398027, + "grad_norm": 1.453125, + "learning_rate": 6.772943031014548e-06, + "loss": 0.2794, + "step": 17326 + }, + { + "epoch": 0.7608593038190061, + "grad_norm": 1.625, + "learning_rate": 6.768208675302193e-06, + "loss": 0.3188, + "step": 17328 + }, + { + "epoch": 0.7609471222982096, + "grad_norm": 1.4296875, + "learning_rate": 6.763475715784795e-06, + "loss": 0.2992, + "step": 17330 + }, + { + "epoch": 0.7610349407774131, + "grad_norm": 1.625, + "learning_rate": 6.758744152824798e-06, + "loss": 0.3023, + "step": 17332 + }, + { + "epoch": 0.7611227592566165, + "grad_norm": 1.609375, + "learning_rate": 6.754013986784546e-06, + "loss": 0.3005, + "step": 17334 + }, + { + "epoch": 0.76121057773582, + "grad_norm": 1.578125, + "learning_rate": 6.749285218026272e-06, + "loss": 0.3198, + "step": 17336 + }, + { + "epoch": 0.7612983962150236, + "grad_norm": 1.5390625, + "learning_rate": 6.744557846912114e-06, + "loss": 0.3374, + "step": 17338 + }, + { + "epoch": 0.7613862146942271, + "grad_norm": 1.625, + "learning_rate": 6.739831873804095e-06, + "loss": 0.3041, + "step": 17340 + }, + { + "epoch": 0.7614740331734305, + "grad_norm": 1.609375, + "learning_rate": 6.7351072990641225e-06, + "loss": 0.3155, + "step": 17342 + }, + { + "epoch": 0.761561851652634, + "grad_norm": 1.4609375, + "learning_rate": 6.730384123054018e-06, + "loss": 0.311, + "step": 17344 + }, + { + "epoch": 0.7616496701318375, + "grad_norm": 1.59375, + "learning_rate": 6.725662346135467e-06, + "loss": 0.3149, + "step": 17346 + }, + { + "epoch": 0.761737488611041, + "grad_norm": 1.4375, + "learning_rate": 6.720941968670083e-06, + "loss": 0.309, + "step": 17348 + }, + { + "epoch": 0.7618253070902444, + "grad_norm": 1.453125, + "learning_rate": 6.716222991019347e-06, + "loss": 0.3154, + "step": 17350 + }, + { + "epoch": 0.7619131255694479, + "grad_norm": 1.6171875, + "learning_rate": 6.711505413544628e-06, + "loss": 0.3152, + "step": 17352 + }, + { + "epoch": 0.7620009440486515, + "grad_norm": 1.4375, + "learning_rate": 6.706789236607214e-06, + "loss": 0.298, + "step": 17354 + }, + { + "epoch": 0.762088762527855, + "grad_norm": 1.609375, + "learning_rate": 6.7020744605682616e-06, + "loss": 0.351, + "step": 17356 + }, + { + "epoch": 0.7621765810070584, + "grad_norm": 1.5546875, + "learning_rate": 6.697361085788839e-06, + "loss": 0.3015, + "step": 17358 + }, + { + "epoch": 0.7622643994862619, + "grad_norm": 1.5859375, + "learning_rate": 6.69264911262989e-06, + "loss": 0.3124, + "step": 17360 + }, + { + "epoch": 0.7623522179654654, + "grad_norm": 1.390625, + "learning_rate": 6.687938541452257e-06, + "loss": 0.3441, + "step": 17362 + }, + { + "epoch": 0.7624400364446688, + "grad_norm": 1.5625, + "learning_rate": 6.6832293726166926e-06, + "loss": 0.3319, + "step": 17364 + }, + { + "epoch": 0.7625278549238723, + "grad_norm": 1.4765625, + "learning_rate": 6.678521606483798e-06, + "loss": 0.3132, + "step": 17366 + }, + { + "epoch": 0.7626156734030758, + "grad_norm": 1.5078125, + "learning_rate": 6.673815243414119e-06, + "loss": 0.3261, + "step": 17368 + }, + { + "epoch": 0.7627034918822794, + "grad_norm": 1.515625, + "learning_rate": 6.669110283768057e-06, + "loss": 0.3018, + "step": 17370 + }, + { + "epoch": 0.7627913103614828, + "grad_norm": 1.71875, + "learning_rate": 6.664406727905928e-06, + "loss": 0.3038, + "step": 17372 + }, + { + "epoch": 0.7628791288406863, + "grad_norm": 1.5, + "learning_rate": 6.659704576187928e-06, + "loss": 0.3124, + "step": 17374 + }, + { + "epoch": 0.7629669473198898, + "grad_norm": 1.578125, + "learning_rate": 6.655003828974141e-06, + "loss": 0.3278, + "step": 17376 + }, + { + "epoch": 0.7630547657990933, + "grad_norm": 1.4921875, + "learning_rate": 6.650304486624565e-06, + "loss": 0.3151, + "step": 17378 + }, + { + "epoch": 0.7631425842782967, + "grad_norm": 1.5546875, + "learning_rate": 6.645606549499062e-06, + "loss": 0.3277, + "step": 17380 + }, + { + "epoch": 0.7632304027575002, + "grad_norm": 1.453125, + "learning_rate": 6.640910017957419e-06, + "loss": 0.296, + "step": 17382 + }, + { + "epoch": 0.7633182212367038, + "grad_norm": 1.5703125, + "learning_rate": 6.6362148923592854e-06, + "loss": 0.3152, + "step": 17384 + }, + { + "epoch": 0.7634060397159073, + "grad_norm": 1.4453125, + "learning_rate": 6.6315211730642114e-06, + "loss": 0.3105, + "step": 17386 + }, + { + "epoch": 0.7634938581951107, + "grad_norm": 1.625, + "learning_rate": 6.626828860431658e-06, + "loss": 0.3076, + "step": 17388 + }, + { + "epoch": 0.7635816766743142, + "grad_norm": 1.59375, + "learning_rate": 6.622137954820945e-06, + "loss": 0.3395, + "step": 17390 + }, + { + "epoch": 0.7636694951535177, + "grad_norm": 1.5078125, + "learning_rate": 6.617448456591321e-06, + "loss": 0.3213, + "step": 17392 + }, + { + "epoch": 0.7637573136327211, + "grad_norm": 1.59375, + "learning_rate": 6.612760366101902e-06, + "loss": 0.2909, + "step": 17394 + }, + { + "epoch": 0.7638451321119246, + "grad_norm": 1.515625, + "learning_rate": 6.608073683711699e-06, + "loss": 0.3157, + "step": 17396 + }, + { + "epoch": 0.7639329505911281, + "grad_norm": 1.5234375, + "learning_rate": 6.603388409779626e-06, + "loss": 0.3192, + "step": 17398 + }, + { + "epoch": 0.7640207690703317, + "grad_norm": 1.640625, + "learning_rate": 6.598704544664469e-06, + "loss": 0.3447, + "step": 17400 + }, + { + "epoch": 0.7641085875495351, + "grad_norm": 1.53125, + "learning_rate": 6.594022088724935e-06, + "loss": 0.3035, + "step": 17402 + }, + { + "epoch": 0.7641964060287386, + "grad_norm": 1.4921875, + "learning_rate": 6.589341042319597e-06, + "loss": 0.3009, + "step": 17404 + }, + { + "epoch": 0.7642842245079421, + "grad_norm": 1.515625, + "learning_rate": 6.5846614058069415e-06, + "loss": 0.296, + "step": 17406 + }, + { + "epoch": 0.7643720429871456, + "grad_norm": 1.5390625, + "learning_rate": 6.579983179545324e-06, + "loss": 0.3131, + "step": 17408 + }, + { + "epoch": 0.764459861466349, + "grad_norm": 1.578125, + "learning_rate": 6.5753063638930165e-06, + "loss": 0.3251, + "step": 17410 + }, + { + "epoch": 0.7645476799455525, + "grad_norm": 1.765625, + "learning_rate": 6.570630959208163e-06, + "loss": 0.3128, + "step": 17412 + }, + { + "epoch": 0.764635498424756, + "grad_norm": 1.5703125, + "learning_rate": 6.5659569658488015e-06, + "loss": 0.3027, + "step": 17414 + }, + { + "epoch": 0.7647233169039596, + "grad_norm": 1.5546875, + "learning_rate": 6.561284384172883e-06, + "loss": 0.3251, + "step": 17416 + }, + { + "epoch": 0.764811135383163, + "grad_norm": 1.453125, + "learning_rate": 6.556613214538218e-06, + "loss": 0.3083, + "step": 17418 + }, + { + "epoch": 0.7648989538623665, + "grad_norm": 1.484375, + "learning_rate": 6.551943457302543e-06, + "loss": 0.3032, + "step": 17420 + }, + { + "epoch": 0.76498677234157, + "grad_norm": 1.6015625, + "learning_rate": 6.547275112823459e-06, + "loss": 0.2952, + "step": 17422 + }, + { + "epoch": 0.7650745908207734, + "grad_norm": 1.578125, + "learning_rate": 6.542608181458471e-06, + "loss": 0.3373, + "step": 17424 + }, + { + "epoch": 0.7651624092999769, + "grad_norm": 1.46875, + "learning_rate": 6.537942663564975e-06, + "loss": 0.324, + "step": 17426 + }, + { + "epoch": 0.7652502277791804, + "grad_norm": 1.625, + "learning_rate": 6.5332785595002446e-06, + "loss": 0.3154, + "step": 17428 + }, + { + "epoch": 0.765338046258384, + "grad_norm": 1.578125, + "learning_rate": 6.528615869621477e-06, + "loss": 0.3149, + "step": 17430 + }, + { + "epoch": 0.7654258647375874, + "grad_norm": 1.5234375, + "learning_rate": 6.523954594285728e-06, + "loss": 0.3135, + "step": 17432 + }, + { + "epoch": 0.7655136832167909, + "grad_norm": 1.5078125, + "learning_rate": 6.51929473384997e-06, + "loss": 0.3317, + "step": 17434 + }, + { + "epoch": 0.7656015016959944, + "grad_norm": 1.5, + "learning_rate": 6.514636288671056e-06, + "loss": 0.3129, + "step": 17436 + }, + { + "epoch": 0.7656893201751979, + "grad_norm": 1.5234375, + "learning_rate": 6.509979259105714e-06, + "loss": 0.2847, + "step": 17438 + }, + { + "epoch": 0.7657771386544013, + "grad_norm": 1.5859375, + "learning_rate": 6.505323645510603e-06, + "loss": 0.3047, + "step": 17440 + }, + { + "epoch": 0.7658649571336048, + "grad_norm": 1.53125, + "learning_rate": 6.500669448242233e-06, + "loss": 0.3203, + "step": 17442 + }, + { + "epoch": 0.7659527756128083, + "grad_norm": 1.5546875, + "learning_rate": 6.496016667657037e-06, + "loss": 0.2908, + "step": 17444 + }, + { + "epoch": 0.7660405940920119, + "grad_norm": 1.5546875, + "learning_rate": 6.491365304111322e-06, + "loss": 0.3149, + "step": 17446 + }, + { + "epoch": 0.7661284125712153, + "grad_norm": 1.53125, + "learning_rate": 6.486715357961281e-06, + "loss": 0.3405, + "step": 17448 + }, + { + "epoch": 0.7662162310504188, + "grad_norm": 1.4375, + "learning_rate": 6.4820668295630245e-06, + "loss": 0.2915, + "step": 17450 + }, + { + "epoch": 0.7663040495296223, + "grad_norm": 1.4453125, + "learning_rate": 6.47741971927252e-06, + "loss": 0.3329, + "step": 17452 + }, + { + "epoch": 0.7663918680088257, + "grad_norm": 1.5546875, + "learning_rate": 6.4727740274456605e-06, + "loss": 0.3177, + "step": 17454 + }, + { + "epoch": 0.7664796864880292, + "grad_norm": 1.4609375, + "learning_rate": 6.46812975443821e-06, + "loss": 0.3042, + "step": 17456 + }, + { + "epoch": 0.7665675049672327, + "grad_norm": 1.5078125, + "learning_rate": 6.463486900605822e-06, + "loss": 0.3394, + "step": 17458 + }, + { + "epoch": 0.7666553234464362, + "grad_norm": 1.3828125, + "learning_rate": 6.458845466304053e-06, + "loss": 0.2948, + "step": 17460 + }, + { + "epoch": 0.7667431419256397, + "grad_norm": 1.5234375, + "learning_rate": 6.454205451888335e-06, + "loss": 0.3091, + "step": 17462 + }, + { + "epoch": 0.7668309604048432, + "grad_norm": 1.5859375, + "learning_rate": 6.449566857714015e-06, + "loss": 0.3186, + "step": 17464 + }, + { + "epoch": 0.7669187788840467, + "grad_norm": 1.625, + "learning_rate": 6.444929684136306e-06, + "loss": 0.3018, + "step": 17466 + }, + { + "epoch": 0.7670065973632502, + "grad_norm": 1.484375, + "learning_rate": 6.440293931510336e-06, + "loss": 0.3279, + "step": 17468 + }, + { + "epoch": 0.7670944158424536, + "grad_norm": 1.5, + "learning_rate": 6.435659600191099e-06, + "loss": 0.3197, + "step": 17470 + }, + { + "epoch": 0.7671822343216571, + "grad_norm": 1.5078125, + "learning_rate": 6.43102669053351e-06, + "loss": 0.2848, + "step": 17472 + }, + { + "epoch": 0.7672700528008606, + "grad_norm": 1.484375, + "learning_rate": 6.4263952028923435e-06, + "loss": 0.2845, + "step": 17474 + }, + { + "epoch": 0.767357871280064, + "grad_norm": 1.546875, + "learning_rate": 6.421765137622282e-06, + "loss": 0.3126, + "step": 17476 + }, + { + "epoch": 0.7674456897592676, + "grad_norm": 1.5859375, + "learning_rate": 6.417136495077905e-06, + "loss": 0.3325, + "step": 17478 + }, + { + "epoch": 0.7675335082384711, + "grad_norm": 1.4765625, + "learning_rate": 6.4125092756136625e-06, + "loss": 0.3037, + "step": 17480 + }, + { + "epoch": 0.7676213267176746, + "grad_norm": 1.546875, + "learning_rate": 6.407883479583921e-06, + "loss": 0.3043, + "step": 17482 + }, + { + "epoch": 0.767709145196878, + "grad_norm": 1.5078125, + "learning_rate": 6.403259107342921e-06, + "loss": 0.34, + "step": 17484 + }, + { + "epoch": 0.7677969636760815, + "grad_norm": 1.5703125, + "learning_rate": 6.398636159244797e-06, + "loss": 0.3392, + "step": 17486 + }, + { + "epoch": 0.767884782155285, + "grad_norm": 1.5, + "learning_rate": 6.394014635643575e-06, + "loss": 0.3265, + "step": 17488 + }, + { + "epoch": 0.7679726006344885, + "grad_norm": 1.5546875, + "learning_rate": 6.389394536893165e-06, + "loss": 0.3089, + "step": 17490 + }, + { + "epoch": 0.768060419113692, + "grad_norm": 1.4921875, + "learning_rate": 6.384775863347389e-06, + "loss": 0.2844, + "step": 17492 + }, + { + "epoch": 0.7681482375928955, + "grad_norm": 1.7265625, + "learning_rate": 6.380158615359932e-06, + "loss": 0.3237, + "step": 17494 + }, + { + "epoch": 0.768236056072099, + "grad_norm": 1.4375, + "learning_rate": 6.3755427932844005e-06, + "loss": 0.2955, + "step": 17496 + }, + { + "epoch": 0.7683238745513025, + "grad_norm": 1.40625, + "learning_rate": 6.3709283974742654e-06, + "loss": 0.3182, + "step": 17498 + }, + { + "epoch": 0.7684116930305059, + "grad_norm": 1.4921875, + "learning_rate": 6.366315428282893e-06, + "loss": 0.3128, + "step": 17500 + }, + { + "epoch": 0.7684995115097094, + "grad_norm": 1.4375, + "learning_rate": 6.361703886063561e-06, + "loss": 0.2936, + "step": 17502 + }, + { + "epoch": 0.7685873299889129, + "grad_norm": 1.4921875, + "learning_rate": 6.357093771169403e-06, + "loss": 0.3087, + "step": 17504 + }, + { + "epoch": 0.7686751484681164, + "grad_norm": 1.625, + "learning_rate": 6.35248508395348e-06, + "loss": 0.3173, + "step": 17506 + }, + { + "epoch": 0.7687629669473199, + "grad_norm": 1.484375, + "learning_rate": 6.347877824768722e-06, + "loss": 0.307, + "step": 17508 + }, + { + "epoch": 0.7688507854265234, + "grad_norm": 1.5703125, + "learning_rate": 6.343271993967942e-06, + "loss": 0.3284, + "step": 17510 + }, + { + "epoch": 0.7689386039057269, + "grad_norm": 1.546875, + "learning_rate": 6.338667591903874e-06, + "loss": 0.3296, + "step": 17512 + }, + { + "epoch": 0.7690264223849304, + "grad_norm": 1.4921875, + "learning_rate": 6.334064618929106e-06, + "loss": 0.3028, + "step": 17514 + }, + { + "epoch": 0.7691142408641338, + "grad_norm": 1.609375, + "learning_rate": 6.329463075396161e-06, + "loss": 0.3115, + "step": 17516 + }, + { + "epoch": 0.7692020593433373, + "grad_norm": 1.453125, + "learning_rate": 6.324862961657393e-06, + "loss": 0.3134, + "step": 17518 + }, + { + "epoch": 0.7692898778225408, + "grad_norm": 1.5546875, + "learning_rate": 6.320264278065103e-06, + "loss": 0.3194, + "step": 17520 + }, + { + "epoch": 0.7693776963017442, + "grad_norm": 1.5546875, + "learning_rate": 6.315667024971453e-06, + "loss": 0.2776, + "step": 17522 + }, + { + "epoch": 0.7694655147809478, + "grad_norm": 1.421875, + "learning_rate": 6.311071202728494e-06, + "loss": 0.3067, + "step": 17524 + }, + { + "epoch": 0.7695533332601513, + "grad_norm": 1.5, + "learning_rate": 6.306476811688189e-06, + "loss": 0.3116, + "step": 17526 + }, + { + "epoch": 0.7696411517393548, + "grad_norm": 1.546875, + "learning_rate": 6.301883852202365e-06, + "loss": 0.2875, + "step": 17528 + }, + { + "epoch": 0.7697289702185582, + "grad_norm": 1.5, + "learning_rate": 6.2972923246227635e-06, + "loss": 0.297, + "step": 17530 + }, + { + "epoch": 0.7698167886977617, + "grad_norm": 1.546875, + "learning_rate": 6.292702229301001e-06, + "loss": 0.2835, + "step": 17532 + }, + { + "epoch": 0.7699046071769652, + "grad_norm": 1.546875, + "learning_rate": 6.288113566588577e-06, + "loss": 0.2944, + "step": 17534 + }, + { + "epoch": 0.7699924256561687, + "grad_norm": 1.453125, + "learning_rate": 6.283526336836912e-06, + "loss": 0.3091, + "step": 17536 + }, + { + "epoch": 0.7700802441353722, + "grad_norm": 1.53125, + "learning_rate": 6.2789405403972765e-06, + "loss": 0.3228, + "step": 17538 + }, + { + "epoch": 0.7701680626145757, + "grad_norm": 1.5, + "learning_rate": 6.274356177620871e-06, + "loss": 0.3132, + "step": 17540 + }, + { + "epoch": 0.7702558810937792, + "grad_norm": 1.5546875, + "learning_rate": 6.269773248858748e-06, + "loss": 0.3091, + "step": 17542 + }, + { + "epoch": 0.7703436995729827, + "grad_norm": 1.5859375, + "learning_rate": 6.265191754461891e-06, + "loss": 0.3336, + "step": 17544 + }, + { + "epoch": 0.7704315180521861, + "grad_norm": 1.4375, + "learning_rate": 6.260611694781138e-06, + "loss": 0.2966, + "step": 17546 + }, + { + "epoch": 0.7705193365313896, + "grad_norm": 1.5390625, + "learning_rate": 6.256033070167236e-06, + "loss": 0.2936, + "step": 17548 + }, + { + "epoch": 0.7706071550105931, + "grad_norm": 1.5703125, + "learning_rate": 6.251455880970811e-06, + "loss": 0.3251, + "step": 17550 + }, + { + "epoch": 0.7706949734897965, + "grad_norm": 1.5078125, + "learning_rate": 6.246880127542385e-06, + "loss": 0.3251, + "step": 17552 + }, + { + "epoch": 0.7707827919690001, + "grad_norm": 1.5078125, + "learning_rate": 6.242305810232379e-06, + "loss": 0.3179, + "step": 17554 + }, + { + "epoch": 0.7708706104482036, + "grad_norm": 1.53125, + "learning_rate": 6.237732929391085e-06, + "loss": 0.3123, + "step": 17556 + }, + { + "epoch": 0.7709584289274071, + "grad_norm": 1.5390625, + "learning_rate": 6.233161485368707e-06, + "loss": 0.3289, + "step": 17558 + }, + { + "epoch": 0.7710462474066105, + "grad_norm": 1.46875, + "learning_rate": 6.228591478515322e-06, + "loss": 0.308, + "step": 17560 + }, + { + "epoch": 0.771134065885814, + "grad_norm": 1.4296875, + "learning_rate": 6.224022909180893e-06, + "loss": 0.3368, + "step": 17562 + }, + { + "epoch": 0.7712218843650175, + "grad_norm": 1.5, + "learning_rate": 6.219455777715299e-06, + "loss": 0.3168, + "step": 17564 + }, + { + "epoch": 0.771309702844221, + "grad_norm": 1.5390625, + "learning_rate": 6.2148900844682775e-06, + "loss": 0.3278, + "step": 17566 + }, + { + "epoch": 0.7713975213234244, + "grad_norm": 1.4765625, + "learning_rate": 6.210325829789481e-06, + "loss": 0.3131, + "step": 17568 + }, + { + "epoch": 0.771485339802628, + "grad_norm": 1.5625, + "learning_rate": 6.205763014028437e-06, + "loss": 0.2855, + "step": 17570 + }, + { + "epoch": 0.7715731582818315, + "grad_norm": 1.5625, + "learning_rate": 6.201201637534562e-06, + "loss": 0.3164, + "step": 17572 + }, + { + "epoch": 0.771660976761035, + "grad_norm": 1.6484375, + "learning_rate": 6.196641700657177e-06, + "loss": 0.3106, + "step": 17574 + }, + { + "epoch": 0.7717487952402384, + "grad_norm": 1.4609375, + "learning_rate": 6.192083203745472e-06, + "loss": 0.2999, + "step": 17576 + }, + { + "epoch": 0.7718366137194419, + "grad_norm": 1.515625, + "learning_rate": 6.187526147148557e-06, + "loss": 0.3328, + "step": 17578 + }, + { + "epoch": 0.7719244321986454, + "grad_norm": 1.5078125, + "learning_rate": 6.182970531215384e-06, + "loss": 0.3015, + "step": 17580 + }, + { + "epoch": 0.7720122506778488, + "grad_norm": 1.4609375, + "learning_rate": 6.1784163562948476e-06, + "loss": 0.3401, + "step": 17582 + }, + { + "epoch": 0.7721000691570524, + "grad_norm": 1.515625, + "learning_rate": 6.173863622735698e-06, + "loss": 0.3226, + "step": 17584 + }, + { + "epoch": 0.7721878876362559, + "grad_norm": 1.4296875, + "learning_rate": 6.169312330886578e-06, + "loss": 0.3179, + "step": 17586 + }, + { + "epoch": 0.7722757061154594, + "grad_norm": 1.53125, + "learning_rate": 6.164762481096042e-06, + "loss": 0.3081, + "step": 17588 + }, + { + "epoch": 0.7723635245946628, + "grad_norm": 1.484375, + "learning_rate": 6.1602140737125e-06, + "loss": 0.2958, + "step": 17590 + }, + { + "epoch": 0.7724513430738663, + "grad_norm": 1.5234375, + "learning_rate": 6.15566710908429e-06, + "loss": 0.3202, + "step": 17592 + }, + { + "epoch": 0.7725391615530698, + "grad_norm": 1.5625, + "learning_rate": 6.151121587559611e-06, + "loss": 0.2966, + "step": 17594 + }, + { + "epoch": 0.7726269800322733, + "grad_norm": 1.4609375, + "learning_rate": 6.146577509486551e-06, + "loss": 0.3459, + "step": 17596 + }, + { + "epoch": 0.7727147985114767, + "grad_norm": 1.5, + "learning_rate": 6.1420348752131095e-06, + "loss": 0.3385, + "step": 17598 + }, + { + "epoch": 0.7728026169906803, + "grad_norm": 1.46875, + "learning_rate": 6.137493685087154e-06, + "loss": 0.2862, + "step": 17600 + }, + { + "epoch": 0.7728904354698838, + "grad_norm": 1.4140625, + "learning_rate": 6.1329539394564596e-06, + "loss": 0.2925, + "step": 17602 + }, + { + "epoch": 0.7729782539490873, + "grad_norm": 1.65625, + "learning_rate": 6.128415638668669e-06, + "loss": 0.2776, + "step": 17604 + }, + { + "epoch": 0.7730660724282907, + "grad_norm": 1.578125, + "learning_rate": 6.123878783071338e-06, + "loss": 0.3116, + "step": 17606 + }, + { + "epoch": 0.7731538909074942, + "grad_norm": 1.5625, + "learning_rate": 6.119343373011896e-06, + "loss": 0.3309, + "step": 17608 + }, + { + "epoch": 0.7732417093866977, + "grad_norm": 1.484375, + "learning_rate": 6.114809408837665e-06, + "loss": 0.3367, + "step": 17610 + }, + { + "epoch": 0.7733295278659011, + "grad_norm": 1.5234375, + "learning_rate": 6.1102768908958555e-06, + "loss": 0.326, + "step": 17612 + }, + { + "epoch": 0.7734173463451046, + "grad_norm": 1.5, + "learning_rate": 6.105745819533562e-06, + "loss": 0.3058, + "step": 17614 + }, + { + "epoch": 0.7735051648243082, + "grad_norm": 1.3984375, + "learning_rate": 6.101216195097792e-06, + "loss": 0.3065, + "step": 17616 + }, + { + "epoch": 0.7735929833035117, + "grad_norm": 1.609375, + "learning_rate": 6.096688017935406e-06, + "loss": 0.3105, + "step": 17618 + }, + { + "epoch": 0.7736808017827151, + "grad_norm": 1.625, + "learning_rate": 6.0921612883931944e-06, + "loss": 0.3188, + "step": 17620 + }, + { + "epoch": 0.7737686202619186, + "grad_norm": 1.59375, + "learning_rate": 6.087636006817801e-06, + "loss": 0.3217, + "step": 17622 + }, + { + "epoch": 0.7738564387411221, + "grad_norm": 1.5078125, + "learning_rate": 6.083112173555769e-06, + "loss": 0.3665, + "step": 17624 + }, + { + "epoch": 0.7739442572203256, + "grad_norm": 1.5859375, + "learning_rate": 6.0785897889535485e-06, + "loss": 0.2938, + "step": 17626 + }, + { + "epoch": 0.774032075699529, + "grad_norm": 1.546875, + "learning_rate": 6.074068853357451e-06, + "loss": 0.3167, + "step": 17628 + }, + { + "epoch": 0.7741198941787326, + "grad_norm": 1.6171875, + "learning_rate": 6.069549367113706e-06, + "loss": 0.3021, + "step": 17630 + }, + { + "epoch": 0.7742077126579361, + "grad_norm": 1.4296875, + "learning_rate": 6.065031330568408e-06, + "loss": 0.2957, + "step": 17632 + }, + { + "epoch": 0.7742955311371396, + "grad_norm": 1.5078125, + "learning_rate": 6.0605147440675415e-06, + "loss": 0.3229, + "step": 17634 + }, + { + "epoch": 0.774383349616343, + "grad_norm": 1.5234375, + "learning_rate": 6.0559996079570025e-06, + "loss": 0.2996, + "step": 17636 + }, + { + "epoch": 0.7744711680955465, + "grad_norm": 1.4765625, + "learning_rate": 6.051485922582548e-06, + "loss": 0.315, + "step": 17638 + }, + { + "epoch": 0.77455898657475, + "grad_norm": 1.5, + "learning_rate": 6.046973688289859e-06, + "loss": 0.3164, + "step": 17640 + }, + { + "epoch": 0.7746468050539534, + "grad_norm": 1.546875, + "learning_rate": 6.042462905424454e-06, + "loss": 0.3116, + "step": 17642 + }, + { + "epoch": 0.7747346235331569, + "grad_norm": 1.5234375, + "learning_rate": 6.0379535743317896e-06, + "loss": 0.3056, + "step": 17644 + }, + { + "epoch": 0.7748224420123605, + "grad_norm": 1.625, + "learning_rate": 6.033445695357187e-06, + "loss": 0.2799, + "step": 17646 + }, + { + "epoch": 0.774910260491564, + "grad_norm": 1.5390625, + "learning_rate": 6.0289392688458544e-06, + "loss": 0.3192, + "step": 17648 + }, + { + "epoch": 0.7749980789707674, + "grad_norm": 1.5625, + "learning_rate": 6.024434295142905e-06, + "loss": 0.3216, + "step": 17650 + }, + { + "epoch": 0.7750858974499709, + "grad_norm": 1.4296875, + "learning_rate": 6.019930774593318e-06, + "loss": 0.3274, + "step": 17652 + }, + { + "epoch": 0.7751737159291744, + "grad_norm": 1.5234375, + "learning_rate": 6.015428707541993e-06, + "loss": 0.325, + "step": 17654 + }, + { + "epoch": 0.7752615344083779, + "grad_norm": 1.5625, + "learning_rate": 6.010928094333684e-06, + "loss": 0.336, + "step": 17656 + }, + { + "epoch": 0.7753493528875813, + "grad_norm": 1.6875, + "learning_rate": 6.006428935313049e-06, + "loss": 0.3216, + "step": 17658 + }, + { + "epoch": 0.7754371713667848, + "grad_norm": 1.5078125, + "learning_rate": 6.001931230824648e-06, + "loss": 0.2961, + "step": 17660 + }, + { + "epoch": 0.7755249898459884, + "grad_norm": 1.5078125, + "learning_rate": 5.997434981212896e-06, + "loss": 0.2915, + "step": 17662 + }, + { + "epoch": 0.7756128083251919, + "grad_norm": 1.546875, + "learning_rate": 5.992940186822138e-06, + "loss": 0.2998, + "step": 17664 + }, + { + "epoch": 0.7757006268043953, + "grad_norm": 1.5234375, + "learning_rate": 5.988446847996579e-06, + "loss": 0.3264, + "step": 17666 + }, + { + "epoch": 0.7757884452835988, + "grad_norm": 1.484375, + "learning_rate": 5.983954965080307e-06, + "loss": 0.3098, + "step": 17668 + }, + { + "epoch": 0.7758762637628023, + "grad_norm": 1.4453125, + "learning_rate": 5.9794645384173314e-06, + "loss": 0.2944, + "step": 17670 + }, + { + "epoch": 0.7759640822420057, + "grad_norm": 1.6171875, + "learning_rate": 5.974975568351521e-06, + "loss": 0.3112, + "step": 17672 + }, + { + "epoch": 0.7760519007212092, + "grad_norm": 1.46875, + "learning_rate": 5.970488055226642e-06, + "loss": 0.3, + "step": 17674 + }, + { + "epoch": 0.7761397192004127, + "grad_norm": 1.6796875, + "learning_rate": 5.966001999386339e-06, + "loss": 0.3396, + "step": 17676 + }, + { + "epoch": 0.7762275376796163, + "grad_norm": 1.46875, + "learning_rate": 5.9615174011741774e-06, + "loss": 0.3165, + "step": 17678 + }, + { + "epoch": 0.7763153561588197, + "grad_norm": 1.671875, + "learning_rate": 5.957034260933567e-06, + "loss": 0.3076, + "step": 17680 + }, + { + "epoch": 0.7764031746380232, + "grad_norm": 1.65625, + "learning_rate": 5.952552579007847e-06, + "loss": 0.2645, + "step": 17682 + }, + { + "epoch": 0.7764909931172267, + "grad_norm": 1.5859375, + "learning_rate": 5.948072355740214e-06, + "loss": 0.3043, + "step": 17684 + }, + { + "epoch": 0.7765788115964302, + "grad_norm": 1.6328125, + "learning_rate": 5.943593591473762e-06, + "loss": 0.295, + "step": 17686 + }, + { + "epoch": 0.7766666300756336, + "grad_norm": 1.4296875, + "learning_rate": 5.939116286551488e-06, + "loss": 0.3101, + "step": 17688 + }, + { + "epoch": 0.7767544485548371, + "grad_norm": 1.53125, + "learning_rate": 5.9346404413162494e-06, + "loss": 0.3455, + "step": 17690 + }, + { + "epoch": 0.7768422670340407, + "grad_norm": 1.4609375, + "learning_rate": 5.930166056110825e-06, + "loss": 0.3092, + "step": 17692 + }, + { + "epoch": 0.7769300855132442, + "grad_norm": 1.4921875, + "learning_rate": 5.925693131277854e-06, + "loss": 0.328, + "step": 17694 + }, + { + "epoch": 0.7770179039924476, + "grad_norm": 1.5078125, + "learning_rate": 5.921221667159868e-06, + "loss": 0.3365, + "step": 17696 + }, + { + "epoch": 0.7771057224716511, + "grad_norm": 1.4296875, + "learning_rate": 5.91675166409931e-06, + "loss": 0.2962, + "step": 17698 + }, + { + "epoch": 0.7771935409508546, + "grad_norm": 1.5078125, + "learning_rate": 5.912283122438481e-06, + "loss": 0.3077, + "step": 17700 + }, + { + "epoch": 0.777281359430058, + "grad_norm": 1.5234375, + "learning_rate": 5.907816042519587e-06, + "loss": 0.3033, + "step": 17702 + }, + { + "epoch": 0.7773691779092615, + "grad_norm": 1.5, + "learning_rate": 5.903350424684712e-06, + "loss": 0.3449, + "step": 17704 + }, + { + "epoch": 0.777456996388465, + "grad_norm": 1.7578125, + "learning_rate": 5.898886269275844e-06, + "loss": 0.2861, + "step": 17706 + }, + { + "epoch": 0.7775448148676686, + "grad_norm": 1.6875, + "learning_rate": 5.894423576634847e-06, + "loss": 0.3205, + "step": 17708 + }, + { + "epoch": 0.777632633346872, + "grad_norm": 1.46875, + "learning_rate": 5.889962347103461e-06, + "loss": 0.3092, + "step": 17710 + }, + { + "epoch": 0.7777204518260755, + "grad_norm": 1.4453125, + "learning_rate": 5.88550258102335e-06, + "loss": 0.3223, + "step": 17712 + }, + { + "epoch": 0.777808270305279, + "grad_norm": 1.7265625, + "learning_rate": 5.881044278736025e-06, + "loss": 0.3207, + "step": 17714 + }, + { + "epoch": 0.7778960887844825, + "grad_norm": 1.5859375, + "learning_rate": 5.87658744058292e-06, + "loss": 0.3131, + "step": 17716 + }, + { + "epoch": 0.7779839072636859, + "grad_norm": 1.5, + "learning_rate": 5.8721320669053335e-06, + "loss": 0.3225, + "step": 17718 + }, + { + "epoch": 0.7780717257428894, + "grad_norm": 1.515625, + "learning_rate": 5.867678158044451e-06, + "loss": 0.3102, + "step": 17720 + }, + { + "epoch": 0.7781595442220929, + "grad_norm": 1.5, + "learning_rate": 5.863225714341367e-06, + "loss": 0.3148, + "step": 17722 + }, + { + "epoch": 0.7782473627012965, + "grad_norm": 1.484375, + "learning_rate": 5.85877473613704e-06, + "loss": 0.324, + "step": 17724 + }, + { + "epoch": 0.7783351811804999, + "grad_norm": 1.484375, + "learning_rate": 5.854325223772339e-06, + "loss": 0.3006, + "step": 17726 + }, + { + "epoch": 0.7784229996597034, + "grad_norm": 1.5234375, + "learning_rate": 5.849877177588e-06, + "loss": 0.3068, + "step": 17728 + }, + { + "epoch": 0.7785108181389069, + "grad_norm": 1.515625, + "learning_rate": 5.845430597924653e-06, + "loss": 0.3072, + "step": 17730 + }, + { + "epoch": 0.7785986366181104, + "grad_norm": 1.5546875, + "learning_rate": 5.840985485122829e-06, + "loss": 0.3408, + "step": 17732 + }, + { + "epoch": 0.7786864550973138, + "grad_norm": 1.4609375, + "learning_rate": 5.836541839522927e-06, + "loss": 0.3022, + "step": 17734 + }, + { + "epoch": 0.7787742735765173, + "grad_norm": 1.578125, + "learning_rate": 5.832099661465248e-06, + "loss": 0.3409, + "step": 17736 + }, + { + "epoch": 0.7788620920557209, + "grad_norm": 1.5234375, + "learning_rate": 5.827658951289963e-06, + "loss": 0.3165, + "step": 17738 + }, + { + "epoch": 0.7789499105349244, + "grad_norm": 1.515625, + "learning_rate": 5.823219709337158e-06, + "loss": 0.3617, + "step": 17740 + }, + { + "epoch": 0.7790377290141278, + "grad_norm": 1.453125, + "learning_rate": 5.818781935946779e-06, + "loss": 0.2971, + "step": 17742 + }, + { + "epoch": 0.7791255474933313, + "grad_norm": 1.5078125, + "learning_rate": 5.814345631458684e-06, + "loss": 0.3356, + "step": 17744 + }, + { + "epoch": 0.7792133659725348, + "grad_norm": 1.5625, + "learning_rate": 5.8099107962125975e-06, + "loss": 0.3173, + "step": 17746 + }, + { + "epoch": 0.7793011844517382, + "grad_norm": 1.4609375, + "learning_rate": 5.80547743054814e-06, + "loss": 0.3085, + "step": 17748 + }, + { + "epoch": 0.7793890029309417, + "grad_norm": 1.4765625, + "learning_rate": 5.801045534804825e-06, + "loss": 0.3229, + "step": 17750 + }, + { + "epoch": 0.7794768214101452, + "grad_norm": 1.484375, + "learning_rate": 5.7966151093220396e-06, + "loss": 0.318, + "step": 17752 + }, + { + "epoch": 0.7795646398893488, + "grad_norm": 1.4921875, + "learning_rate": 5.7921861544390805e-06, + "loss": 0.3038, + "step": 17754 + }, + { + "epoch": 0.7796524583685522, + "grad_norm": 1.546875, + "learning_rate": 5.787758670495108e-06, + "loss": 0.334, + "step": 17756 + }, + { + "epoch": 0.7797402768477557, + "grad_norm": 1.484375, + "learning_rate": 5.783332657829177e-06, + "loss": 0.2941, + "step": 17758 + }, + { + "epoch": 0.7798280953269592, + "grad_norm": 1.484375, + "learning_rate": 5.778908116780244e-06, + "loss": 0.3116, + "step": 17760 + }, + { + "epoch": 0.7799159138061627, + "grad_norm": 1.59375, + "learning_rate": 5.7744850476871335e-06, + "loss": 0.3276, + "step": 17762 + }, + { + "epoch": 0.7800037322853661, + "grad_norm": 1.5546875, + "learning_rate": 5.770063450888569e-06, + "loss": 0.3172, + "step": 17764 + }, + { + "epoch": 0.7800915507645696, + "grad_norm": 1.3984375, + "learning_rate": 5.765643326723147e-06, + "loss": 0.3122, + "step": 17766 + }, + { + "epoch": 0.7801793692437731, + "grad_norm": 1.6015625, + "learning_rate": 5.761224675529375e-06, + "loss": 0.3361, + "step": 17768 + }, + { + "epoch": 0.7802671877229767, + "grad_norm": 1.4765625, + "learning_rate": 5.756807497645633e-06, + "loss": 0.3319, + "step": 17770 + }, + { + "epoch": 0.7803550062021801, + "grad_norm": 1.4375, + "learning_rate": 5.752391793410175e-06, + "loss": 0.3019, + "step": 17772 + }, + { + "epoch": 0.7804428246813836, + "grad_norm": 1.453125, + "learning_rate": 5.7479775631611775e-06, + "loss": 0.3042, + "step": 17774 + }, + { + "epoch": 0.7805306431605871, + "grad_norm": 1.4375, + "learning_rate": 5.743564807236665e-06, + "loss": 0.2863, + "step": 17776 + }, + { + "epoch": 0.7806184616397905, + "grad_norm": 1.578125, + "learning_rate": 5.739153525974583e-06, + "loss": 0.3002, + "step": 17778 + }, + { + "epoch": 0.780706280118994, + "grad_norm": 1.421875, + "learning_rate": 5.73474371971274e-06, + "loss": 0.3111, + "step": 17780 + }, + { + "epoch": 0.7807940985981975, + "grad_norm": 1.5234375, + "learning_rate": 5.730335388788835e-06, + "loss": 0.318, + "step": 17782 + }, + { + "epoch": 0.7808819170774011, + "grad_norm": 1.453125, + "learning_rate": 5.725928533540473e-06, + "loss": 0.3323, + "step": 17784 + }, + { + "epoch": 0.7809697355566045, + "grad_norm": 1.5390625, + "learning_rate": 5.721523154305117e-06, + "loss": 0.3277, + "step": 17786 + }, + { + "epoch": 0.781057554035808, + "grad_norm": 1.4765625, + "learning_rate": 5.717119251420145e-06, + "loss": 0.3311, + "step": 17788 + }, + { + "epoch": 0.7811453725150115, + "grad_norm": 1.4375, + "learning_rate": 5.712716825222803e-06, + "loss": 0.319, + "step": 17790 + }, + { + "epoch": 0.781233190994215, + "grad_norm": 1.65625, + "learning_rate": 5.7083158760502295e-06, + "loss": 0.3544, + "step": 17792 + }, + { + "epoch": 0.7813210094734184, + "grad_norm": 1.5, + "learning_rate": 5.703916404239454e-06, + "loss": 0.31, + "step": 17794 + }, + { + "epoch": 0.7814088279526219, + "grad_norm": 1.5078125, + "learning_rate": 5.699518410127375e-06, + "loss": 0.3007, + "step": 17796 + }, + { + "epoch": 0.7814966464318254, + "grad_norm": 1.515625, + "learning_rate": 5.695121894050812e-06, + "loss": 0.3031, + "step": 17798 + }, + { + "epoch": 0.781584464911029, + "grad_norm": 1.5078125, + "learning_rate": 5.690726856346434e-06, + "loss": 0.3103, + "step": 17800 + }, + { + "epoch": 0.7816722833902324, + "grad_norm": 1.40625, + "learning_rate": 5.686333297350832e-06, + "loss": 0.3037, + "step": 17802 + }, + { + "epoch": 0.7817601018694359, + "grad_norm": 1.484375, + "learning_rate": 5.681941217400446e-06, + "loss": 0.3205, + "step": 17804 + }, + { + "epoch": 0.7818479203486394, + "grad_norm": 1.5, + "learning_rate": 5.677550616831639e-06, + "loss": 0.2974, + "step": 17806 + }, + { + "epoch": 0.7819357388278428, + "grad_norm": 1.5390625, + "learning_rate": 5.673161495980639e-06, + "loss": 0.3513, + "step": 17808 + }, + { + "epoch": 0.7820235573070463, + "grad_norm": 1.4921875, + "learning_rate": 5.668773855183557e-06, + "loss": 0.3199, + "step": 17810 + }, + { + "epoch": 0.7821113757862498, + "grad_norm": 1.5234375, + "learning_rate": 5.664387694776416e-06, + "loss": 0.327, + "step": 17812 + }, + { + "epoch": 0.7821991942654533, + "grad_norm": 1.4765625, + "learning_rate": 5.660003015095092e-06, + "loss": 0.3015, + "step": 17814 + }, + { + "epoch": 0.7822870127446568, + "grad_norm": 1.59375, + "learning_rate": 5.65561981647538e-06, + "loss": 0.2809, + "step": 17816 + }, + { + "epoch": 0.7823748312238603, + "grad_norm": 1.5546875, + "learning_rate": 5.65123809925294e-06, + "loss": 0.3464, + "step": 17818 + }, + { + "epoch": 0.7824626497030638, + "grad_norm": 1.5390625, + "learning_rate": 5.646857863763317e-06, + "loss": 0.335, + "step": 17820 + }, + { + "epoch": 0.7825504681822673, + "grad_norm": 1.4921875, + "learning_rate": 5.642479110341964e-06, + "loss": 0.3084, + "step": 17822 + }, + { + "epoch": 0.7826382866614707, + "grad_norm": 1.6015625, + "learning_rate": 5.638101839324203e-06, + "loss": 0.3195, + "step": 17824 + }, + { + "epoch": 0.7827261051406742, + "grad_norm": 1.5078125, + "learning_rate": 5.633726051045243e-06, + "loss": 0.3401, + "step": 17826 + }, + { + "epoch": 0.7828139236198777, + "grad_norm": 1.5546875, + "learning_rate": 5.629351745840181e-06, + "loss": 0.3235, + "step": 17828 + }, + { + "epoch": 0.7829017420990811, + "grad_norm": 1.515625, + "learning_rate": 5.624978924044008e-06, + "loss": 0.3002, + "step": 17830 + }, + { + "epoch": 0.7829895605782847, + "grad_norm": 1.46875, + "learning_rate": 5.620607585991597e-06, + "loss": 0.288, + "step": 17832 + }, + { + "epoch": 0.7830773790574882, + "grad_norm": 1.453125, + "learning_rate": 5.616237732017693e-06, + "loss": 0.3017, + "step": 17834 + }, + { + "epoch": 0.7831651975366917, + "grad_norm": 1.46875, + "learning_rate": 5.611869362456959e-06, + "loss": 0.3192, + "step": 17836 + }, + { + "epoch": 0.7832530160158951, + "grad_norm": 1.4609375, + "learning_rate": 5.607502477643908e-06, + "loss": 0.2933, + "step": 17838 + }, + { + "epoch": 0.7833408344950986, + "grad_norm": 1.4296875, + "learning_rate": 5.603137077912976e-06, + "loss": 0.3554, + "step": 17840 + }, + { + "epoch": 0.7834286529743021, + "grad_norm": 1.53125, + "learning_rate": 5.598773163598456e-06, + "loss": 0.2942, + "step": 17842 + }, + { + "epoch": 0.7835164714535056, + "grad_norm": 1.4765625, + "learning_rate": 5.5944107350345274e-06, + "loss": 0.3343, + "step": 17844 + }, + { + "epoch": 0.7836042899327091, + "grad_norm": 1.4609375, + "learning_rate": 5.590049792555285e-06, + "loss": 0.3123, + "step": 17846 + }, + { + "epoch": 0.7836921084119126, + "grad_norm": 1.4453125, + "learning_rate": 5.5856903364946785e-06, + "loss": 0.3106, + "step": 17848 + }, + { + "epoch": 0.7837799268911161, + "grad_norm": 1.5234375, + "learning_rate": 5.581332367186562e-06, + "loss": 0.3005, + "step": 17850 + }, + { + "epoch": 0.7838677453703196, + "grad_norm": 1.546875, + "learning_rate": 5.576975884964672e-06, + "loss": 0.3089, + "step": 17852 + }, + { + "epoch": 0.783955563849523, + "grad_norm": 1.7734375, + "learning_rate": 5.572620890162622e-06, + "loss": 0.3218, + "step": 17854 + }, + { + "epoch": 0.7840433823287265, + "grad_norm": 1.4609375, + "learning_rate": 5.568267383113923e-06, + "loss": 0.3051, + "step": 17856 + }, + { + "epoch": 0.78413120080793, + "grad_norm": 1.546875, + "learning_rate": 5.563915364151959e-06, + "loss": 0.2974, + "step": 17858 + }, + { + "epoch": 0.7842190192871334, + "grad_norm": 1.5546875, + "learning_rate": 5.55956483361002e-06, + "loss": 0.3184, + "step": 17860 + }, + { + "epoch": 0.784306837766337, + "grad_norm": 1.484375, + "learning_rate": 5.555215791821261e-06, + "loss": 0.2942, + "step": 17862 + }, + { + "epoch": 0.7843946562455405, + "grad_norm": 1.578125, + "learning_rate": 5.550868239118745e-06, + "loss": 0.3245, + "step": 17864 + }, + { + "epoch": 0.784482474724744, + "grad_norm": 1.5390625, + "learning_rate": 5.5465221758353945e-06, + "loss": 0.2906, + "step": 17866 + }, + { + "epoch": 0.7845702932039474, + "grad_norm": 1.53125, + "learning_rate": 5.542177602304047e-06, + "loss": 0.3384, + "step": 17868 + }, + { + "epoch": 0.7846581116831509, + "grad_norm": 1.4140625, + "learning_rate": 5.537834518857401e-06, + "loss": 0.3252, + "step": 17870 + }, + { + "epoch": 0.7847459301623544, + "grad_norm": 1.5, + "learning_rate": 5.5334929258280485e-06, + "loss": 0.3389, + "step": 17872 + }, + { + "epoch": 0.7848337486415579, + "grad_norm": 1.4453125, + "learning_rate": 5.5291528235484776e-06, + "loss": 0.3351, + "step": 17874 + }, + { + "epoch": 0.7849215671207613, + "grad_norm": 1.4453125, + "learning_rate": 5.524814212351048e-06, + "loss": 0.3114, + "step": 17876 + }, + { + "epoch": 0.7850093855999649, + "grad_norm": 1.4296875, + "learning_rate": 5.520477092568019e-06, + "loss": 0.3462, + "step": 17878 + }, + { + "epoch": 0.7850972040791684, + "grad_norm": 1.4765625, + "learning_rate": 5.516141464531524e-06, + "loss": 0.3045, + "step": 17880 + }, + { + "epoch": 0.7851850225583719, + "grad_norm": 1.5625, + "learning_rate": 5.511807328573579e-06, + "loss": 0.3109, + "step": 17882 + }, + { + "epoch": 0.7852728410375753, + "grad_norm": 1.4921875, + "learning_rate": 5.507474685026115e-06, + "loss": 0.3091, + "step": 17884 + }, + { + "epoch": 0.7853606595167788, + "grad_norm": 1.484375, + "learning_rate": 5.503143534220901e-06, + "loss": 0.3276, + "step": 17886 + }, + { + "epoch": 0.7854484779959823, + "grad_norm": 1.421875, + "learning_rate": 5.4988138764896305e-06, + "loss": 0.3302, + "step": 17888 + }, + { + "epoch": 0.7855362964751857, + "grad_norm": 1.5390625, + "learning_rate": 5.494485712163866e-06, + "loss": 0.3134, + "step": 17890 + }, + { + "epoch": 0.7856241149543893, + "grad_norm": 1.4375, + "learning_rate": 5.490159041575066e-06, + "loss": 0.3505, + "step": 17892 + }, + { + "epoch": 0.7857119334335928, + "grad_norm": 1.546875, + "learning_rate": 5.485833865054563e-06, + "loss": 0.3337, + "step": 17894 + }, + { + "epoch": 0.7857997519127963, + "grad_norm": 1.46875, + "learning_rate": 5.481510182933575e-06, + "loss": 0.2966, + "step": 17896 + }, + { + "epoch": 0.7858875703919997, + "grad_norm": 1.5625, + "learning_rate": 5.4771879955432205e-06, + "loss": 0.316, + "step": 17898 + }, + { + "epoch": 0.7859753888712032, + "grad_norm": 1.5078125, + "learning_rate": 5.472867303214485e-06, + "loss": 0.3278, + "step": 17900 + }, + { + "epoch": 0.7860632073504067, + "grad_norm": 1.4140625, + "learning_rate": 5.46854810627826e-06, + "loss": 0.2878, + "step": 17902 + }, + { + "epoch": 0.7861510258296102, + "grad_norm": 1.484375, + "learning_rate": 5.464230405065301e-06, + "loss": 0.3051, + "step": 17904 + }, + { + "epoch": 0.7862388443088136, + "grad_norm": 1.46875, + "learning_rate": 5.459914199906252e-06, + "loss": 0.3092, + "step": 17906 + }, + { + "epoch": 0.7863266627880172, + "grad_norm": 1.5546875, + "learning_rate": 5.455599491131669e-06, + "loss": 0.3125, + "step": 17908 + }, + { + "epoch": 0.7864144812672207, + "grad_norm": 1.4921875, + "learning_rate": 5.451286279071952e-06, + "loss": 0.3291, + "step": 17910 + }, + { + "epoch": 0.7865022997464242, + "grad_norm": 1.5625, + "learning_rate": 5.446974564057425e-06, + "loss": 0.3274, + "step": 17912 + }, + { + "epoch": 0.7865901182256276, + "grad_norm": 1.578125, + "learning_rate": 5.442664346418275e-06, + "loss": 0.3265, + "step": 17914 + }, + { + "epoch": 0.7866779367048311, + "grad_norm": 1.453125, + "learning_rate": 5.438355626484576e-06, + "loss": 0.3356, + "step": 17916 + }, + { + "epoch": 0.7867657551840346, + "grad_norm": 1.5, + "learning_rate": 5.434048404586292e-06, + "loss": 0.3155, + "step": 17918 + }, + { + "epoch": 0.786853573663238, + "grad_norm": 1.578125, + "learning_rate": 5.429742681053266e-06, + "loss": 0.3409, + "step": 17920 + }, + { + "epoch": 0.7869413921424415, + "grad_norm": 1.4296875, + "learning_rate": 5.4254384562152424e-06, + "loss": 0.3067, + "step": 17922 + }, + { + "epoch": 0.7870292106216451, + "grad_norm": 1.4921875, + "learning_rate": 5.421135730401828e-06, + "loss": 0.3168, + "step": 17924 + }, + { + "epoch": 0.7871170291008486, + "grad_norm": 1.5625, + "learning_rate": 5.416834503942539e-06, + "loss": 0.3115, + "step": 17926 + }, + { + "epoch": 0.787204847580052, + "grad_norm": 1.609375, + "learning_rate": 5.4125347771667585e-06, + "loss": 0.3487, + "step": 17928 + }, + { + "epoch": 0.7872926660592555, + "grad_norm": 1.546875, + "learning_rate": 5.408236550403753e-06, + "loss": 0.3071, + "step": 17930 + }, + { + "epoch": 0.787380484538459, + "grad_norm": 1.4453125, + "learning_rate": 5.403939823982698e-06, + "loss": 0.3221, + "step": 17932 + }, + { + "epoch": 0.7874683030176625, + "grad_norm": 1.5390625, + "learning_rate": 5.39964459823262e-06, + "loss": 0.299, + "step": 17934 + }, + { + "epoch": 0.7875561214968659, + "grad_norm": 1.421875, + "learning_rate": 5.395350873482463e-06, + "loss": 0.2888, + "step": 17936 + }, + { + "epoch": 0.7876439399760695, + "grad_norm": 1.484375, + "learning_rate": 5.391058650061032e-06, + "loss": 0.3037, + "step": 17938 + }, + { + "epoch": 0.787731758455273, + "grad_norm": 1.484375, + "learning_rate": 5.3867679282970345e-06, + "loss": 0.3282, + "step": 17940 + }, + { + "epoch": 0.7878195769344765, + "grad_norm": 1.46875, + "learning_rate": 5.382478708519051e-06, + "loss": 0.3471, + "step": 17942 + }, + { + "epoch": 0.7879073954136799, + "grad_norm": 1.4609375, + "learning_rate": 5.378190991055543e-06, + "loss": 0.292, + "step": 17944 + }, + { + "epoch": 0.7879952138928834, + "grad_norm": 1.625, + "learning_rate": 5.373904776234886e-06, + "loss": 0.3057, + "step": 17946 + }, + { + "epoch": 0.7880830323720869, + "grad_norm": 1.5078125, + "learning_rate": 5.369620064385294e-06, + "loss": 0.3136, + "step": 17948 + }, + { + "epoch": 0.7881708508512903, + "grad_norm": 1.59375, + "learning_rate": 5.36533685583491e-06, + "loss": 0.3183, + "step": 17950 + }, + { + "epoch": 0.7882586693304938, + "grad_norm": 1.5625, + "learning_rate": 5.361055150911729e-06, + "loss": 0.3312, + "step": 17952 + }, + { + "epoch": 0.7883464878096974, + "grad_norm": 1.4765625, + "learning_rate": 5.3567749499436605e-06, + "loss": 0.3276, + "step": 17954 + }, + { + "epoch": 0.7884343062889009, + "grad_norm": 1.5390625, + "learning_rate": 5.352496253258474e-06, + "loss": 0.3156, + "step": 17956 + }, + { + "epoch": 0.7885221247681043, + "grad_norm": 1.4921875, + "learning_rate": 5.348219061183826e-06, + "loss": 0.3053, + "step": 17958 + }, + { + "epoch": 0.7886099432473078, + "grad_norm": 1.515625, + "learning_rate": 5.34394337404728e-06, + "loss": 0.3108, + "step": 17960 + }, + { + "epoch": 0.7886977617265113, + "grad_norm": 1.484375, + "learning_rate": 5.339669192176258e-06, + "loss": 0.3091, + "step": 17962 + }, + { + "epoch": 0.7887855802057148, + "grad_norm": 1.4453125, + "learning_rate": 5.335396515898086e-06, + "loss": 0.2981, + "step": 17964 + }, + { + "epoch": 0.7888733986849182, + "grad_norm": 1.5078125, + "learning_rate": 5.331125345539967e-06, + "loss": 0.344, + "step": 17966 + }, + { + "epoch": 0.7889612171641217, + "grad_norm": 1.46875, + "learning_rate": 5.326855681428974e-06, + "loss": 0.3075, + "step": 17968 + }, + { + "epoch": 0.7890490356433253, + "grad_norm": 1.4921875, + "learning_rate": 5.3225875238920945e-06, + "loss": 0.311, + "step": 17970 + }, + { + "epoch": 0.7891368541225288, + "grad_norm": 1.5, + "learning_rate": 5.318320873256174e-06, + "loss": 0.3374, + "step": 17972 + }, + { + "epoch": 0.7892246726017322, + "grad_norm": 1.4609375, + "learning_rate": 5.314055729847967e-06, + "loss": 0.2925, + "step": 17974 + }, + { + "epoch": 0.7893124910809357, + "grad_norm": 1.5, + "learning_rate": 5.309792093994093e-06, + "loss": 0.3213, + "step": 17976 + }, + { + "epoch": 0.7894003095601392, + "grad_norm": 1.5546875, + "learning_rate": 5.3055299660210555e-06, + "loss": 0.3087, + "step": 17978 + }, + { + "epoch": 0.7894881280393427, + "grad_norm": 1.46875, + "learning_rate": 5.301269346255258e-06, + "loss": 0.3155, + "step": 17980 + }, + { + "epoch": 0.7895759465185461, + "grad_norm": 1.4375, + "learning_rate": 5.2970102350229675e-06, + "loss": 0.3026, + "step": 17982 + }, + { + "epoch": 0.7896637649977497, + "grad_norm": 1.546875, + "learning_rate": 5.292752632650363e-06, + "loss": 0.3308, + "step": 17984 + }, + { + "epoch": 0.7897515834769532, + "grad_norm": 1.53125, + "learning_rate": 5.288496539463481e-06, + "loss": 0.3074, + "step": 17986 + }, + { + "epoch": 0.7898394019561567, + "grad_norm": 1.578125, + "learning_rate": 5.284241955788266e-06, + "loss": 0.3303, + "step": 17988 + }, + { + "epoch": 0.7899272204353601, + "grad_norm": 1.546875, + "learning_rate": 5.279988881950526e-06, + "loss": 0.3099, + "step": 17990 + }, + { + "epoch": 0.7900150389145636, + "grad_norm": 1.4609375, + "learning_rate": 5.2757373182759585e-06, + "loss": 0.3013, + "step": 17992 + }, + { + "epoch": 0.7901028573937671, + "grad_norm": 1.671875, + "learning_rate": 5.271487265090163e-06, + "loss": 0.3248, + "step": 17994 + }, + { + "epoch": 0.7901906758729705, + "grad_norm": 1.8984375, + "learning_rate": 5.2672387227185954e-06, + "loss": 0.3134, + "step": 17996 + }, + { + "epoch": 0.790278494352174, + "grad_norm": 1.5, + "learning_rate": 5.262991691486624e-06, + "loss": 0.3233, + "step": 17998 + }, + { + "epoch": 0.7903663128313776, + "grad_norm": 1.484375, + "learning_rate": 5.25874617171947e-06, + "loss": 0.31, + "step": 18000 + }, + { + "epoch": 0.7904541313105811, + "grad_norm": 1.5703125, + "learning_rate": 5.254502163742275e-06, + "loss": 0.2897, + "step": 18002 + }, + { + "epoch": 0.7905419497897845, + "grad_norm": 1.4765625, + "learning_rate": 5.250259667880039e-06, + "loss": 0.317, + "step": 18004 + }, + { + "epoch": 0.790629768268988, + "grad_norm": 1.4921875, + "learning_rate": 5.246018684457646e-06, + "loss": 0.2996, + "step": 18006 + }, + { + "epoch": 0.7907175867481915, + "grad_norm": 1.4609375, + "learning_rate": 5.241779213799888e-06, + "loss": 0.3249, + "step": 18008 + }, + { + "epoch": 0.790805405227395, + "grad_norm": 1.515625, + "learning_rate": 5.237541256231402e-06, + "loss": 0.3213, + "step": 18010 + }, + { + "epoch": 0.7908932237065984, + "grad_norm": 1.5546875, + "learning_rate": 5.23330481207675e-06, + "loss": 0.3026, + "step": 18012 + }, + { + "epoch": 0.7909810421858019, + "grad_norm": 1.546875, + "learning_rate": 5.22906988166035e-06, + "loss": 0.3241, + "step": 18014 + }, + { + "epoch": 0.7910688606650055, + "grad_norm": 1.4921875, + "learning_rate": 5.224836465306521e-06, + "loss": 0.3207, + "step": 18016 + }, + { + "epoch": 0.791156679144209, + "grad_norm": 1.4296875, + "learning_rate": 5.22060456333946e-06, + "loss": 0.3222, + "step": 18018 + }, + { + "epoch": 0.7912444976234124, + "grad_norm": 1.453125, + "learning_rate": 5.216374176083233e-06, + "loss": 0.3284, + "step": 18020 + }, + { + "epoch": 0.7913323161026159, + "grad_norm": 1.5625, + "learning_rate": 5.212145303861821e-06, + "loss": 0.311, + "step": 18022 + }, + { + "epoch": 0.7914201345818194, + "grad_norm": 1.5078125, + "learning_rate": 5.207917946999058e-06, + "loss": 0.3093, + "step": 18024 + }, + { + "epoch": 0.7915079530610228, + "grad_norm": 1.5546875, + "learning_rate": 5.2036921058186915e-06, + "loss": 0.3083, + "step": 18026 + }, + { + "epoch": 0.7915957715402263, + "grad_norm": 1.59375, + "learning_rate": 5.199467780644329e-06, + "loss": 0.3385, + "step": 18028 + }, + { + "epoch": 0.7916835900194298, + "grad_norm": 1.5546875, + "learning_rate": 5.195244971799462e-06, + "loss": 0.3325, + "step": 18030 + }, + { + "epoch": 0.7917714084986334, + "grad_norm": 1.4921875, + "learning_rate": 5.19102367960749e-06, + "loss": 0.3155, + "step": 18032 + }, + { + "epoch": 0.7918592269778368, + "grad_norm": 1.5625, + "learning_rate": 5.186803904391669e-06, + "loss": 0.3371, + "step": 18034 + }, + { + "epoch": 0.7919470454570403, + "grad_norm": 1.421875, + "learning_rate": 5.1825856464751575e-06, + "loss": 0.3148, + "step": 18036 + }, + { + "epoch": 0.7920348639362438, + "grad_norm": 1.5, + "learning_rate": 5.178368906180989e-06, + "loss": 0.2917, + "step": 18038 + }, + { + "epoch": 0.7921226824154473, + "grad_norm": 1.484375, + "learning_rate": 5.174153683832081e-06, + "loss": 0.3068, + "step": 18040 + }, + { + "epoch": 0.7922105008946507, + "grad_norm": 1.4453125, + "learning_rate": 5.1699399797512375e-06, + "loss": 0.3011, + "step": 18042 + }, + { + "epoch": 0.7922983193738542, + "grad_norm": 1.59375, + "learning_rate": 5.165727794261135e-06, + "loss": 0.3002, + "step": 18044 + }, + { + "epoch": 0.7923861378530578, + "grad_norm": 1.5234375, + "learning_rate": 5.161517127684362e-06, + "loss": 0.3056, + "step": 18046 + }, + { + "epoch": 0.7924739563322613, + "grad_norm": 1.59375, + "learning_rate": 5.157307980343357e-06, + "loss": 0.3399, + "step": 18048 + }, + { + "epoch": 0.7925617748114647, + "grad_norm": 1.578125, + "learning_rate": 5.153100352560467e-06, + "loss": 0.2994, + "step": 18050 + }, + { + "epoch": 0.7926495932906682, + "grad_norm": 1.4921875, + "learning_rate": 5.148894244657912e-06, + "loss": 0.3117, + "step": 18052 + }, + { + "epoch": 0.7927374117698717, + "grad_norm": 1.4765625, + "learning_rate": 5.144689656957785e-06, + "loss": 0.3101, + "step": 18054 + }, + { + "epoch": 0.7928252302490751, + "grad_norm": 1.4765625, + "learning_rate": 5.140486589782092e-06, + "loss": 0.3079, + "step": 18056 + }, + { + "epoch": 0.7929130487282786, + "grad_norm": 1.6015625, + "learning_rate": 5.13628504345269e-06, + "loss": 0.3244, + "step": 18058 + }, + { + "epoch": 0.7930008672074821, + "grad_norm": 1.4765625, + "learning_rate": 5.13208501829135e-06, + "loss": 0.3635, + "step": 18060 + }, + { + "epoch": 0.7930886856866857, + "grad_norm": 1.4609375, + "learning_rate": 5.127886514619698e-06, + "loss": 0.3079, + "step": 18062 + }, + { + "epoch": 0.7931765041658891, + "grad_norm": 1.4140625, + "learning_rate": 5.123689532759254e-06, + "loss": 0.2855, + "step": 18064 + }, + { + "epoch": 0.7932643226450926, + "grad_norm": 1.5703125, + "learning_rate": 5.119494073031439e-06, + "loss": 0.3265, + "step": 18066 + }, + { + "epoch": 0.7933521411242961, + "grad_norm": 1.4140625, + "learning_rate": 5.115300135757534e-06, + "loss": 0.3127, + "step": 18068 + }, + { + "epoch": 0.7934399596034996, + "grad_norm": 1.5625, + "learning_rate": 5.11110772125871e-06, + "loss": 0.3004, + "step": 18070 + }, + { + "epoch": 0.793527778082703, + "grad_norm": 1.5, + "learning_rate": 5.1069168298560176e-06, + "loss": 0.3214, + "step": 18072 + }, + { + "epoch": 0.7936155965619065, + "grad_norm": 1.453125, + "learning_rate": 5.10272746187041e-06, + "loss": 0.3196, + "step": 18074 + }, + { + "epoch": 0.79370341504111, + "grad_norm": 1.4375, + "learning_rate": 5.098539617622697e-06, + "loss": 0.3235, + "step": 18076 + }, + { + "epoch": 0.7937912335203136, + "grad_norm": 1.484375, + "learning_rate": 5.094353297433596e-06, + "loss": 0.3163, + "step": 18078 + }, + { + "epoch": 0.793879051999517, + "grad_norm": 1.4921875, + "learning_rate": 5.090168501623693e-06, + "loss": 0.3085, + "step": 18080 + }, + { + "epoch": 0.7939668704787205, + "grad_norm": 1.4453125, + "learning_rate": 5.085985230513451e-06, + "loss": 0.2908, + "step": 18082 + }, + { + "epoch": 0.794054688957924, + "grad_norm": 1.53125, + "learning_rate": 5.081803484423242e-06, + "loss": 0.3026, + "step": 18084 + }, + { + "epoch": 0.7941425074371274, + "grad_norm": 1.546875, + "learning_rate": 5.077623263673289e-06, + "loss": 0.3191, + "step": 18086 + }, + { + "epoch": 0.7942303259163309, + "grad_norm": 1.5078125, + "learning_rate": 5.07344456858373e-06, + "loss": 0.3342, + "step": 18088 + }, + { + "epoch": 0.7943181443955344, + "grad_norm": 1.546875, + "learning_rate": 5.069267399474559e-06, + "loss": 0.3048, + "step": 18090 + }, + { + "epoch": 0.794405962874738, + "grad_norm": 1.5625, + "learning_rate": 5.0650917566656656e-06, + "loss": 0.2963, + "step": 18092 + }, + { + "epoch": 0.7944937813539414, + "grad_norm": 1.484375, + "learning_rate": 5.0609176404768285e-06, + "loss": 0.3213, + "step": 18094 + }, + { + "epoch": 0.7945815998331449, + "grad_norm": 1.515625, + "learning_rate": 5.056745051227693e-06, + "loss": 0.3337, + "step": 18096 + }, + { + "epoch": 0.7946694183123484, + "grad_norm": 1.53125, + "learning_rate": 5.052573989237808e-06, + "loss": 0.3239, + "step": 18098 + }, + { + "epoch": 0.7947572367915519, + "grad_norm": 1.53125, + "learning_rate": 5.048404454826588e-06, + "loss": 0.3279, + "step": 18100 + }, + { + "epoch": 0.7948450552707553, + "grad_norm": 1.484375, + "learning_rate": 5.044236448313339e-06, + "loss": 0.3332, + "step": 18102 + }, + { + "epoch": 0.7949328737499588, + "grad_norm": 1.578125, + "learning_rate": 5.040069970017247e-06, + "loss": 0.3256, + "step": 18104 + }, + { + "epoch": 0.7950206922291623, + "grad_norm": 1.4765625, + "learning_rate": 5.035905020257373e-06, + "loss": 0.3154, + "step": 18106 + }, + { + "epoch": 0.7951085107083659, + "grad_norm": 1.5546875, + "learning_rate": 5.031741599352685e-06, + "loss": 0.3131, + "step": 18108 + }, + { + "epoch": 0.7951963291875693, + "grad_norm": 1.4375, + "learning_rate": 5.027579707622007e-06, + "loss": 0.3013, + "step": 18110 + }, + { + "epoch": 0.7952841476667728, + "grad_norm": 1.6171875, + "learning_rate": 5.02341934538407e-06, + "loss": 0.3147, + "step": 18112 + }, + { + "epoch": 0.7953719661459763, + "grad_norm": 1.5546875, + "learning_rate": 5.019260512957466e-06, + "loss": 0.3319, + "step": 18114 + }, + { + "epoch": 0.7954597846251797, + "grad_norm": 1.4921875, + "learning_rate": 5.0151032106606764e-06, + "loss": 0.3055, + "step": 18116 + }, + { + "epoch": 0.7955476031043832, + "grad_norm": 1.484375, + "learning_rate": 5.010947438812078e-06, + "loss": 0.3211, + "step": 18118 + }, + { + "epoch": 0.7956354215835867, + "grad_norm": 1.5859375, + "learning_rate": 5.006793197729912e-06, + "loss": 0.3466, + "step": 18120 + }, + { + "epoch": 0.7957232400627902, + "grad_norm": 1.484375, + "learning_rate": 5.002640487732321e-06, + "loss": 0.3254, + "step": 18122 + }, + { + "epoch": 0.7958110585419937, + "grad_norm": 1.4375, + "learning_rate": 4.9984893091373165e-06, + "loss": 0.3338, + "step": 18124 + }, + { + "epoch": 0.7958988770211972, + "grad_norm": 1.53125, + "learning_rate": 4.994339662262787e-06, + "loss": 0.3281, + "step": 18126 + }, + { + "epoch": 0.7959866955004007, + "grad_norm": 1.453125, + "learning_rate": 4.990191547426531e-06, + "loss": 0.3051, + "step": 18128 + }, + { + "epoch": 0.7960745139796042, + "grad_norm": 1.4453125, + "learning_rate": 4.986044964946201e-06, + "loss": 0.3234, + "step": 18130 + }, + { + "epoch": 0.7961623324588076, + "grad_norm": 1.515625, + "learning_rate": 4.981899915139346e-06, + "loss": 0.3067, + "step": 18132 + }, + { + "epoch": 0.7962501509380111, + "grad_norm": 1.5703125, + "learning_rate": 4.977756398323388e-06, + "loss": 0.2966, + "step": 18134 + }, + { + "epoch": 0.7963379694172146, + "grad_norm": 1.4765625, + "learning_rate": 4.97361441481565e-06, + "loss": 0.3272, + "step": 18136 + }, + { + "epoch": 0.7964257878964182, + "grad_norm": 1.546875, + "learning_rate": 4.969473964933313e-06, + "loss": 0.2979, + "step": 18138 + }, + { + "epoch": 0.7965136063756216, + "grad_norm": 1.484375, + "learning_rate": 4.965335048993472e-06, + "loss": 0.2876, + "step": 18140 + }, + { + "epoch": 0.7966014248548251, + "grad_norm": 1.515625, + "learning_rate": 4.961197667313072e-06, + "loss": 0.3546, + "step": 18142 + }, + { + "epoch": 0.7966892433340286, + "grad_norm": 1.5546875, + "learning_rate": 4.957061820208952e-06, + "loss": 0.3265, + "step": 18144 + }, + { + "epoch": 0.796777061813232, + "grad_norm": 1.453125, + "learning_rate": 4.952927507997851e-06, + "loss": 0.3119, + "step": 18146 + }, + { + "epoch": 0.7968648802924355, + "grad_norm": 1.5, + "learning_rate": 4.948794730996359e-06, + "loss": 0.3151, + "step": 18148 + }, + { + "epoch": 0.796952698771639, + "grad_norm": 1.4375, + "learning_rate": 4.9446634895209815e-06, + "loss": 0.3152, + "step": 18150 + }, + { + "epoch": 0.7970405172508425, + "grad_norm": 1.5234375, + "learning_rate": 4.940533783888079e-06, + "loss": 0.3161, + "step": 18152 + }, + { + "epoch": 0.797128335730046, + "grad_norm": 1.5390625, + "learning_rate": 4.936405614413903e-06, + "loss": 0.3029, + "step": 18154 + }, + { + "epoch": 0.7972161542092495, + "grad_norm": 1.5, + "learning_rate": 4.932278981414601e-06, + "loss": 0.3507, + "step": 18156 + }, + { + "epoch": 0.797303972688453, + "grad_norm": 1.4921875, + "learning_rate": 4.92815388520618e-06, + "loss": 0.3181, + "step": 18158 + }, + { + "epoch": 0.7973917911676565, + "grad_norm": 1.453125, + "learning_rate": 4.924030326104556e-06, + "loss": 0.303, + "step": 18160 + }, + { + "epoch": 0.7974796096468599, + "grad_norm": 1.5625, + "learning_rate": 4.9199083044254915e-06, + "loss": 0.3098, + "step": 18162 + }, + { + "epoch": 0.7975674281260634, + "grad_norm": 1.453125, + "learning_rate": 4.915787820484669e-06, + "loss": 0.3075, + "step": 18164 + }, + { + "epoch": 0.7976552466052669, + "grad_norm": 1.4921875, + "learning_rate": 4.911668874597628e-06, + "loss": 0.326, + "step": 18166 + }, + { + "epoch": 0.7977430650844703, + "grad_norm": 1.5234375, + "learning_rate": 4.9075514670797935e-06, + "loss": 0.3283, + "step": 18168 + }, + { + "epoch": 0.7978308835636739, + "grad_norm": 1.484375, + "learning_rate": 4.903435598246492e-06, + "loss": 0.3154, + "step": 18170 + }, + { + "epoch": 0.7979187020428774, + "grad_norm": 1.484375, + "learning_rate": 4.899321268412904e-06, + "loss": 0.3441, + "step": 18172 + }, + { + "epoch": 0.7980065205220809, + "grad_norm": 1.6015625, + "learning_rate": 4.895208477894117e-06, + "loss": 0.3069, + "step": 18174 + }, + { + "epoch": 0.7980943390012843, + "grad_norm": 1.5234375, + "learning_rate": 4.891097227005085e-06, + "loss": 0.307, + "step": 18176 + }, + { + "epoch": 0.7981821574804878, + "grad_norm": 1.46875, + "learning_rate": 4.88698751606064e-06, + "loss": 0.3102, + "step": 18178 + }, + { + "epoch": 0.7982699759596913, + "grad_norm": 1.5, + "learning_rate": 4.882879345375521e-06, + "loss": 0.3094, + "step": 18180 + }, + { + "epoch": 0.7983577944388948, + "grad_norm": 1.515625, + "learning_rate": 4.878772715264315e-06, + "loss": 0.3016, + "step": 18182 + }, + { + "epoch": 0.7984456129180982, + "grad_norm": 1.4453125, + "learning_rate": 4.874667626041526e-06, + "loss": 0.3108, + "step": 18184 + }, + { + "epoch": 0.7985334313973018, + "grad_norm": 1.5703125, + "learning_rate": 4.870564078021514e-06, + "loss": 0.3263, + "step": 18186 + }, + { + "epoch": 0.7986212498765053, + "grad_norm": 1.46875, + "learning_rate": 4.866462071518524e-06, + "loss": 0.2997, + "step": 18188 + }, + { + "epoch": 0.7987090683557088, + "grad_norm": 1.59375, + "learning_rate": 4.862361606846702e-06, + "loss": 0.3249, + "step": 18190 + }, + { + "epoch": 0.7987968868349122, + "grad_norm": 1.671875, + "learning_rate": 4.858262684320056e-06, + "loss": 0.3326, + "step": 18192 + }, + { + "epoch": 0.7988847053141157, + "grad_norm": 1.5546875, + "learning_rate": 4.854165304252481e-06, + "loss": 0.3095, + "step": 18194 + }, + { + "epoch": 0.7989725237933192, + "grad_norm": 1.46875, + "learning_rate": 4.850069466957749e-06, + "loss": 0.3536, + "step": 18196 + }, + { + "epoch": 0.7990603422725227, + "grad_norm": 1.5078125, + "learning_rate": 4.8459751727495335e-06, + "loss": 0.3048, + "step": 18198 + }, + { + "epoch": 0.7991481607517262, + "grad_norm": 1.5390625, + "learning_rate": 4.841882421941365e-06, + "loss": 0.3148, + "step": 18200 + }, + { + "epoch": 0.7992359792309297, + "grad_norm": 1.53125, + "learning_rate": 4.837791214846679e-06, + "loss": 0.3154, + "step": 18202 + }, + { + "epoch": 0.7993237977101332, + "grad_norm": 1.515625, + "learning_rate": 4.833701551778777e-06, + "loss": 0.3204, + "step": 18204 + }, + { + "epoch": 0.7994116161893366, + "grad_norm": 1.4375, + "learning_rate": 4.829613433050837e-06, + "loss": 0.3105, + "step": 18206 + }, + { + "epoch": 0.7994994346685401, + "grad_norm": 1.4453125, + "learning_rate": 4.82552685897594e-06, + "loss": 0.2983, + "step": 18208 + }, + { + "epoch": 0.7995872531477436, + "grad_norm": 1.4921875, + "learning_rate": 4.8214418298670264e-06, + "loss": 0.3086, + "step": 18210 + }, + { + "epoch": 0.7996750716269471, + "grad_norm": 1.5, + "learning_rate": 4.8173583460369435e-06, + "loss": 0.3359, + "step": 18212 + }, + { + "epoch": 0.7997628901061505, + "grad_norm": 1.484375, + "learning_rate": 4.813276407798395e-06, + "loss": 0.2954, + "step": 18214 + }, + { + "epoch": 0.7998507085853541, + "grad_norm": 1.4921875, + "learning_rate": 4.809196015463971e-06, + "loss": 0.3317, + "step": 18216 + }, + { + "epoch": 0.7999385270645576, + "grad_norm": 1.515625, + "learning_rate": 4.805117169346163e-06, + "loss": 0.3251, + "step": 18218 + }, + { + "epoch": 0.8000263455437611, + "grad_norm": 1.4609375, + "learning_rate": 4.801039869757318e-06, + "loss": 0.3396, + "step": 18220 + }, + { + "epoch": 0.8001141640229645, + "grad_norm": 1.609375, + "learning_rate": 4.7969641170096944e-06, + "loss": 0.3243, + "step": 18222 + }, + { + "epoch": 0.800201982502168, + "grad_norm": 1.5, + "learning_rate": 4.792889911415388e-06, + "loss": 0.2899, + "step": 18224 + }, + { + "epoch": 0.8002898009813715, + "grad_norm": 1.4296875, + "learning_rate": 4.788817253286424e-06, + "loss": 0.3137, + "step": 18226 + }, + { + "epoch": 0.800377619460575, + "grad_norm": 1.4296875, + "learning_rate": 4.784746142934676e-06, + "loss": 0.2939, + "step": 18228 + }, + { + "epoch": 0.8004654379397784, + "grad_norm": 1.5234375, + "learning_rate": 4.780676580671911e-06, + "loss": 0.3131, + "step": 18230 + }, + { + "epoch": 0.800553256418982, + "grad_norm": 1.5078125, + "learning_rate": 4.776608566809787e-06, + "loss": 0.3141, + "step": 18232 + }, + { + "epoch": 0.8006410748981855, + "grad_norm": 1.4609375, + "learning_rate": 4.772542101659819e-06, + "loss": 0.3322, + "step": 18234 + }, + { + "epoch": 0.800728893377389, + "grad_norm": 1.4609375, + "learning_rate": 4.7684771855334324e-06, + "loss": 0.3079, + "step": 18236 + }, + { + "epoch": 0.8008167118565924, + "grad_norm": 1.4453125, + "learning_rate": 4.764413818741914e-06, + "loss": 0.3413, + "step": 18238 + }, + { + "epoch": 0.8009045303357959, + "grad_norm": 1.53125, + "learning_rate": 4.760352001596427e-06, + "loss": 0.3347, + "step": 18240 + }, + { + "epoch": 0.8009923488149994, + "grad_norm": 1.5, + "learning_rate": 4.756291734408044e-06, + "loss": 0.3092, + "step": 18242 + }, + { + "epoch": 0.8010801672942028, + "grad_norm": 1.421875, + "learning_rate": 4.752233017487687e-06, + "loss": 0.3148, + "step": 18244 + }, + { + "epoch": 0.8011679857734064, + "grad_norm": 1.53125, + "learning_rate": 4.748175851146186e-06, + "loss": 0.3175, + "step": 18246 + }, + { + "epoch": 0.8012558042526099, + "grad_norm": 1.453125, + "learning_rate": 4.74412023569423e-06, + "loss": 0.2983, + "step": 18248 + }, + { + "epoch": 0.8013436227318134, + "grad_norm": 1.4296875, + "learning_rate": 4.740066171442398e-06, + "loss": 0.3073, + "step": 18250 + }, + { + "epoch": 0.8014314412110168, + "grad_norm": 1.6171875, + "learning_rate": 4.7360136587011665e-06, + "loss": 0.321, + "step": 18252 + }, + { + "epoch": 0.8015192596902203, + "grad_norm": 1.4921875, + "learning_rate": 4.731962697780856e-06, + "loss": 0.3364, + "step": 18254 + }, + { + "epoch": 0.8016070781694238, + "grad_norm": 1.421875, + "learning_rate": 4.727913288991706e-06, + "loss": 0.2831, + "step": 18256 + }, + { + "epoch": 0.8016948966486273, + "grad_norm": 1.5546875, + "learning_rate": 4.723865432643809e-06, + "loss": 0.3107, + "step": 18258 + }, + { + "epoch": 0.8017827151278307, + "grad_norm": 1.59375, + "learning_rate": 4.719819129047165e-06, + "loss": 0.322, + "step": 18260 + }, + { + "epoch": 0.8018705336070343, + "grad_norm": 1.6640625, + "learning_rate": 4.7157743785116255e-06, + "loss": 0.2842, + "step": 18262 + }, + { + "epoch": 0.8019583520862378, + "grad_norm": 1.546875, + "learning_rate": 4.711731181346954e-06, + "loss": 0.3266, + "step": 18264 + }, + { + "epoch": 0.8020461705654413, + "grad_norm": 1.4296875, + "learning_rate": 4.707689537862772e-06, + "loss": 0.2794, + "step": 18266 + }, + { + "epoch": 0.8021339890446447, + "grad_norm": 1.484375, + "learning_rate": 4.703649448368583e-06, + "loss": 0.3098, + "step": 18268 + }, + { + "epoch": 0.8022218075238482, + "grad_norm": 1.5859375, + "learning_rate": 4.699610913173791e-06, + "loss": 0.3132, + "step": 18270 + }, + { + "epoch": 0.8023096260030517, + "grad_norm": 1.5078125, + "learning_rate": 4.695573932587657e-06, + "loss": 0.3035, + "step": 18272 + }, + { + "epoch": 0.8023974444822551, + "grad_norm": 1.4140625, + "learning_rate": 4.691538506919344e-06, + "loss": 0.3148, + "step": 18274 + }, + { + "epoch": 0.8024852629614586, + "grad_norm": 1.4921875, + "learning_rate": 4.68750463647788e-06, + "loss": 0.3232, + "step": 18276 + }, + { + "epoch": 0.8025730814406622, + "grad_norm": 1.4765625, + "learning_rate": 4.683472321572172e-06, + "loss": 0.3216, + "step": 18278 + }, + { + "epoch": 0.8026608999198657, + "grad_norm": 1.40625, + "learning_rate": 4.679441562511033e-06, + "loss": 0.3349, + "step": 18280 + }, + { + "epoch": 0.8027487183990691, + "grad_norm": 1.59375, + "learning_rate": 4.675412359603121e-06, + "loss": 0.2902, + "step": 18282 + }, + { + "epoch": 0.8028365368782726, + "grad_norm": 1.6015625, + "learning_rate": 4.671384713157018e-06, + "loss": 0.3061, + "step": 18284 + }, + { + "epoch": 0.8029243553574761, + "grad_norm": 1.5078125, + "learning_rate": 4.667358623481132e-06, + "loss": 0.3198, + "step": 18286 + }, + { + "epoch": 0.8030121738366796, + "grad_norm": 1.4921875, + "learning_rate": 4.663334090883806e-06, + "loss": 0.2927, + "step": 18288 + }, + { + "epoch": 0.803099992315883, + "grad_norm": 1.4921875, + "learning_rate": 4.659311115673229e-06, + "loss": 0.3303, + "step": 18290 + }, + { + "epoch": 0.8031878107950866, + "grad_norm": 1.4921875, + "learning_rate": 4.655289698157475e-06, + "loss": 0.3044, + "step": 18292 + }, + { + "epoch": 0.8032756292742901, + "grad_norm": 1.53125, + "learning_rate": 4.65126983864452e-06, + "loss": 0.3317, + "step": 18294 + }, + { + "epoch": 0.8033634477534936, + "grad_norm": 1.484375, + "learning_rate": 4.647251537442193e-06, + "loss": 0.2918, + "step": 18296 + }, + { + "epoch": 0.803451266232697, + "grad_norm": 1.46875, + "learning_rate": 4.643234794858229e-06, + "loss": 0.3144, + "step": 18298 + }, + { + "epoch": 0.8035390847119005, + "grad_norm": 1.5, + "learning_rate": 4.639219611200221e-06, + "loss": 0.3063, + "step": 18300 + }, + { + "epoch": 0.803626903191104, + "grad_norm": 1.53125, + "learning_rate": 4.635205986775654e-06, + "loss": 0.3148, + "step": 18302 + }, + { + "epoch": 0.8037147216703074, + "grad_norm": 1.453125, + "learning_rate": 4.6311939218918995e-06, + "loss": 0.3087, + "step": 18304 + }, + { + "epoch": 0.8038025401495109, + "grad_norm": 1.546875, + "learning_rate": 4.627183416856187e-06, + "loss": 0.2936, + "step": 18306 + }, + { + "epoch": 0.8038903586287145, + "grad_norm": 1.53125, + "learning_rate": 4.623174471975664e-06, + "loss": 0.2981, + "step": 18308 + }, + { + "epoch": 0.803978177107918, + "grad_norm": 1.578125, + "learning_rate": 4.619167087557322e-06, + "loss": 0.2947, + "step": 18310 + }, + { + "epoch": 0.8040659955871214, + "grad_norm": 1.5390625, + "learning_rate": 4.615161263908044e-06, + "loss": 0.3003, + "step": 18312 + }, + { + "epoch": 0.8041538140663249, + "grad_norm": 1.5078125, + "learning_rate": 4.611157001334615e-06, + "loss": 0.3225, + "step": 18314 + }, + { + "epoch": 0.8042416325455284, + "grad_norm": 1.5078125, + "learning_rate": 4.607154300143657e-06, + "loss": 0.2862, + "step": 18316 + }, + { + "epoch": 0.8043294510247319, + "grad_norm": 1.4765625, + "learning_rate": 4.603153160641719e-06, + "loss": 0.29, + "step": 18318 + }, + { + "epoch": 0.8044172695039353, + "grad_norm": 1.5234375, + "learning_rate": 4.5991535831351964e-06, + "loss": 0.3264, + "step": 18320 + }, + { + "epoch": 0.8045050879831388, + "grad_norm": 1.484375, + "learning_rate": 4.595155567930387e-06, + "loss": 0.2995, + "step": 18322 + }, + { + "epoch": 0.8045929064623424, + "grad_norm": 1.4765625, + "learning_rate": 4.591159115333454e-06, + "loss": 0.3019, + "step": 18324 + }, + { + "epoch": 0.8046807249415459, + "grad_norm": 1.4140625, + "learning_rate": 4.587164225650445e-06, + "loss": 0.3189, + "step": 18326 + }, + { + "epoch": 0.8047685434207493, + "grad_norm": 1.53125, + "learning_rate": 4.583170899187298e-06, + "loss": 0.3148, + "step": 18328 + }, + { + "epoch": 0.8048563618999528, + "grad_norm": 1.4609375, + "learning_rate": 4.579179136249812e-06, + "loss": 0.318, + "step": 18330 + }, + { + "epoch": 0.8049441803791563, + "grad_norm": 1.5859375, + "learning_rate": 4.5751889371436905e-06, + "loss": 0.3232, + "step": 18332 + }, + { + "epoch": 0.8050319988583597, + "grad_norm": 1.4296875, + "learning_rate": 4.571200302174489e-06, + "loss": 0.3035, + "step": 18334 + }, + { + "epoch": 0.8051198173375632, + "grad_norm": 1.625, + "learning_rate": 4.567213231647669e-06, + "loss": 0.3367, + "step": 18336 + }, + { + "epoch": 0.8052076358167668, + "grad_norm": 1.4921875, + "learning_rate": 4.563227725868561e-06, + "loss": 0.3073, + "step": 18338 + }, + { + "epoch": 0.8052954542959703, + "grad_norm": 1.4921875, + "learning_rate": 4.559243785142367e-06, + "loss": 0.3291, + "step": 18340 + }, + { + "epoch": 0.8053832727751737, + "grad_norm": 1.4453125, + "learning_rate": 4.555261409774187e-06, + "loss": 0.3237, + "step": 18342 + }, + { + "epoch": 0.8054710912543772, + "grad_norm": 1.4609375, + "learning_rate": 4.5512806000689916e-06, + "loss": 0.2984, + "step": 18344 + }, + { + "epoch": 0.8055589097335807, + "grad_norm": 1.3828125, + "learning_rate": 4.547301356331629e-06, + "loss": 0.3249, + "step": 18346 + }, + { + "epoch": 0.8056467282127842, + "grad_norm": 1.4765625, + "learning_rate": 4.543323678866826e-06, + "loss": 0.3269, + "step": 18348 + }, + { + "epoch": 0.8057345466919876, + "grad_norm": 1.453125, + "learning_rate": 4.539347567979205e-06, + "loss": 0.3146, + "step": 18350 + }, + { + "epoch": 0.8058223651711911, + "grad_norm": 1.59375, + "learning_rate": 4.535373023973253e-06, + "loss": 0.3335, + "step": 18352 + }, + { + "epoch": 0.8059101836503947, + "grad_norm": 1.53125, + "learning_rate": 4.531400047153331e-06, + "loss": 0.2805, + "step": 18354 + }, + { + "epoch": 0.8059980021295982, + "grad_norm": 1.53125, + "learning_rate": 4.52742863782371e-06, + "loss": 0.3067, + "step": 18356 + }, + { + "epoch": 0.8060858206088016, + "grad_norm": 1.5234375, + "learning_rate": 4.5234587962885045e-06, + "loss": 0.2892, + "step": 18358 + }, + { + "epoch": 0.8061736390880051, + "grad_norm": 1.5546875, + "learning_rate": 4.519490522851738e-06, + "loss": 0.3257, + "step": 18360 + }, + { + "epoch": 0.8062614575672086, + "grad_norm": 1.5234375, + "learning_rate": 4.515523817817297e-06, + "loss": 0.3211, + "step": 18362 + }, + { + "epoch": 0.806349276046412, + "grad_norm": 1.5078125, + "learning_rate": 4.511558681488945e-06, + "loss": 0.2904, + "step": 18364 + }, + { + "epoch": 0.8064370945256155, + "grad_norm": 1.4765625, + "learning_rate": 4.507595114170349e-06, + "loss": 0.3069, + "step": 18366 + }, + { + "epoch": 0.806524913004819, + "grad_norm": 1.515625, + "learning_rate": 4.503633116165026e-06, + "loss": 0.323, + "step": 18368 + }, + { + "epoch": 0.8066127314840226, + "grad_norm": 1.484375, + "learning_rate": 4.499672687776396e-06, + "loss": 0.3047, + "step": 18370 + }, + { + "epoch": 0.806700549963226, + "grad_norm": 1.5546875, + "learning_rate": 4.495713829307749e-06, + "loss": 0.3107, + "step": 18372 + }, + { + "epoch": 0.8067883684424295, + "grad_norm": 1.546875, + "learning_rate": 4.491756541062242e-06, + "loss": 0.3108, + "step": 18374 + }, + { + "epoch": 0.806876186921633, + "grad_norm": 1.4765625, + "learning_rate": 4.48780082334295e-06, + "loss": 0.3186, + "step": 18376 + }, + { + "epoch": 0.8069640054008365, + "grad_norm": 1.546875, + "learning_rate": 4.483846676452777e-06, + "loss": 0.3069, + "step": 18378 + }, + { + "epoch": 0.8070518238800399, + "grad_norm": 1.4765625, + "learning_rate": 4.479894100694545e-06, + "loss": 0.3001, + "step": 18380 + }, + { + "epoch": 0.8071396423592434, + "grad_norm": 1.5078125, + "learning_rate": 4.4759430963709406e-06, + "loss": 0.3085, + "step": 18382 + }, + { + "epoch": 0.8072274608384469, + "grad_norm": 1.7578125, + "learning_rate": 4.471993663784538e-06, + "loss": 0.3099, + "step": 18384 + }, + { + "epoch": 0.8073152793176505, + "grad_norm": 1.5625, + "learning_rate": 4.468045803237783e-06, + "loss": 0.2982, + "step": 18386 + }, + { + "epoch": 0.8074030977968539, + "grad_norm": 1.5, + "learning_rate": 4.464099515032993e-06, + "loss": 0.3134, + "step": 18388 + }, + { + "epoch": 0.8074909162760574, + "grad_norm": 1.46875, + "learning_rate": 4.460154799472394e-06, + "loss": 0.3161, + "step": 18390 + }, + { + "epoch": 0.8075787347552609, + "grad_norm": 1.4375, + "learning_rate": 4.456211656858056e-06, + "loss": 0.3432, + "step": 18392 + }, + { + "epoch": 0.8076665532344643, + "grad_norm": 1.5703125, + "learning_rate": 4.452270087491961e-06, + "loss": 0.317, + "step": 18394 + }, + { + "epoch": 0.8077543717136678, + "grad_norm": 1.4609375, + "learning_rate": 4.448330091675943e-06, + "loss": 0.3255, + "step": 18396 + }, + { + "epoch": 0.8078421901928713, + "grad_norm": 1.515625, + "learning_rate": 4.444391669711737e-06, + "loss": 0.2994, + "step": 18398 + }, + { + "epoch": 0.8079300086720749, + "grad_norm": 1.4609375, + "learning_rate": 4.440454821900947e-06, + "loss": 0.3137, + "step": 18400 + }, + { + "epoch": 0.8080178271512783, + "grad_norm": 1.46875, + "learning_rate": 4.436519548545049e-06, + "loss": 0.3011, + "step": 18402 + }, + { + "epoch": 0.8081056456304818, + "grad_norm": 1.4609375, + "learning_rate": 4.432585849945417e-06, + "loss": 0.3021, + "step": 18404 + }, + { + "epoch": 0.8081934641096853, + "grad_norm": 1.4453125, + "learning_rate": 4.428653726403292e-06, + "loss": 0.321, + "step": 18406 + }, + { + "epoch": 0.8082812825888888, + "grad_norm": 1.421875, + "learning_rate": 4.424723178219798e-06, + "loss": 0.304, + "step": 18408 + }, + { + "epoch": 0.8083691010680922, + "grad_norm": 1.5078125, + "learning_rate": 4.4207942056959275e-06, + "loss": 0.3304, + "step": 18410 + }, + { + "epoch": 0.8084569195472957, + "grad_norm": 1.4453125, + "learning_rate": 4.416866809132575e-06, + "loss": 0.2906, + "step": 18412 + }, + { + "epoch": 0.8085447380264992, + "grad_norm": 1.53125, + "learning_rate": 4.412940988830497e-06, + "loss": 0.3207, + "step": 18414 + }, + { + "epoch": 0.8086325565057028, + "grad_norm": 1.515625, + "learning_rate": 4.409016745090327e-06, + "loss": 0.3424, + "step": 18416 + }, + { + "epoch": 0.8087203749849062, + "grad_norm": 1.453125, + "learning_rate": 4.405094078212599e-06, + "loss": 0.3357, + "step": 18418 + }, + { + "epoch": 0.8088081934641097, + "grad_norm": 1.53125, + "learning_rate": 4.4011729884976955e-06, + "loss": 0.3276, + "step": 18420 + }, + { + "epoch": 0.8088960119433132, + "grad_norm": 1.46875, + "learning_rate": 4.397253476245908e-06, + "loss": 0.3036, + "step": 18422 + }, + { + "epoch": 0.8089838304225166, + "grad_norm": 1.484375, + "learning_rate": 4.393335541757387e-06, + "loss": 0.3055, + "step": 18424 + }, + { + "epoch": 0.8090716489017201, + "grad_norm": 1.515625, + "learning_rate": 4.389419185332167e-06, + "loss": 0.3363, + "step": 18426 + }, + { + "epoch": 0.8091594673809236, + "grad_norm": 1.5546875, + "learning_rate": 4.3855044072701715e-06, + "loss": 0.3146, + "step": 18428 + }, + { + "epoch": 0.8092472858601271, + "grad_norm": 1.375, + "learning_rate": 4.381591207871183e-06, + "loss": 0.311, + "step": 18430 + }, + { + "epoch": 0.8093351043393306, + "grad_norm": 1.53125, + "learning_rate": 4.377679587434888e-06, + "loss": 0.3498, + "step": 18432 + }, + { + "epoch": 0.8094229228185341, + "grad_norm": 1.5, + "learning_rate": 4.373769546260836e-06, + "loss": 0.3082, + "step": 18434 + }, + { + "epoch": 0.8095107412977376, + "grad_norm": 1.640625, + "learning_rate": 4.369861084648455e-06, + "loss": 0.3477, + "step": 18436 + }, + { + "epoch": 0.8095985597769411, + "grad_norm": 1.5390625, + "learning_rate": 4.365954202897058e-06, + "loss": 0.3237, + "step": 18438 + }, + { + "epoch": 0.8096863782561445, + "grad_norm": 1.5546875, + "learning_rate": 4.362048901305829e-06, + "loss": 0.3204, + "step": 18440 + }, + { + "epoch": 0.809774196735348, + "grad_norm": 1.4609375, + "learning_rate": 4.358145180173847e-06, + "loss": 0.3125, + "step": 18442 + }, + { + "epoch": 0.8098620152145515, + "grad_norm": 1.5, + "learning_rate": 4.354243039800049e-06, + "loss": 0.2819, + "step": 18444 + }, + { + "epoch": 0.8099498336937551, + "grad_norm": 1.4375, + "learning_rate": 4.350342480483277e-06, + "loss": 0.3345, + "step": 18446 + }, + { + "epoch": 0.8100376521729585, + "grad_norm": 1.515625, + "learning_rate": 4.346443502522226e-06, + "loss": 0.3361, + "step": 18448 + }, + { + "epoch": 0.810125470652162, + "grad_norm": 1.4609375, + "learning_rate": 4.3425461062154755e-06, + "loss": 0.3344, + "step": 18450 + }, + { + "epoch": 0.8102132891313655, + "grad_norm": 1.4453125, + "learning_rate": 4.338650291861504e-06, + "loss": 0.2927, + "step": 18452 + }, + { + "epoch": 0.810301107610569, + "grad_norm": 1.46875, + "learning_rate": 4.334756059758638e-06, + "loss": 0.3279, + "step": 18454 + }, + { + "epoch": 0.8103889260897724, + "grad_norm": 1.5234375, + "learning_rate": 4.330863410205116e-06, + "loss": 0.327, + "step": 18456 + }, + { + "epoch": 0.8104767445689759, + "grad_norm": 1.484375, + "learning_rate": 4.326972343499025e-06, + "loss": 0.3323, + "step": 18458 + }, + { + "epoch": 0.8105645630481794, + "grad_norm": 1.5, + "learning_rate": 4.323082859938343e-06, + "loss": 0.3349, + "step": 18460 + }, + { + "epoch": 0.810652381527383, + "grad_norm": 1.5078125, + "learning_rate": 4.319194959820941e-06, + "loss": 0.312, + "step": 18462 + }, + { + "epoch": 0.8107402000065864, + "grad_norm": 1.5625, + "learning_rate": 4.315308643444537e-06, + "loss": 0.3173, + "step": 18464 + }, + { + "epoch": 0.8108280184857899, + "grad_norm": 1.5, + "learning_rate": 4.3114239111067625e-06, + "loss": 0.3198, + "step": 18466 + }, + { + "epoch": 0.8109158369649934, + "grad_norm": 1.46875, + "learning_rate": 4.307540763105103e-06, + "loss": 0.3422, + "step": 18468 + }, + { + "epoch": 0.8110036554441968, + "grad_norm": 1.4609375, + "learning_rate": 4.303659199736934e-06, + "loss": 0.3171, + "step": 18470 + }, + { + "epoch": 0.8110914739234003, + "grad_norm": 1.5078125, + "learning_rate": 4.299779221299499e-06, + "loss": 0.3135, + "step": 18472 + }, + { + "epoch": 0.8111792924026038, + "grad_norm": 1.546875, + "learning_rate": 4.295900828089938e-06, + "loss": 0.3031, + "step": 18474 + }, + { + "epoch": 0.8112671108818073, + "grad_norm": 1.625, + "learning_rate": 4.292024020405255e-06, + "loss": 0.3092, + "step": 18476 + }, + { + "epoch": 0.8113549293610108, + "grad_norm": 1.59375, + "learning_rate": 4.288148798542332e-06, + "loss": 0.3243, + "step": 18478 + }, + { + "epoch": 0.8114427478402143, + "grad_norm": 1.59375, + "learning_rate": 4.284275162797943e-06, + "loss": 0.3145, + "step": 18480 + }, + { + "epoch": 0.8115305663194178, + "grad_norm": 1.484375, + "learning_rate": 4.2804031134687255e-06, + "loss": 0.2968, + "step": 18482 + }, + { + "epoch": 0.8116183847986213, + "grad_norm": 1.4609375, + "learning_rate": 4.276532650851206e-06, + "loss": 0.3296, + "step": 18484 + }, + { + "epoch": 0.8117062032778247, + "grad_norm": 1.484375, + "learning_rate": 4.272663775241787e-06, + "loss": 0.3132, + "step": 18486 + }, + { + "epoch": 0.8117940217570282, + "grad_norm": 1.5, + "learning_rate": 4.268796486936738e-06, + "loss": 0.3123, + "step": 18488 + }, + { + "epoch": 0.8118818402362317, + "grad_norm": 1.4921875, + "learning_rate": 4.264930786232227e-06, + "loss": 0.3005, + "step": 18490 + }, + { + "epoch": 0.8119696587154352, + "grad_norm": 1.5, + "learning_rate": 4.2610666734242825e-06, + "loss": 0.3299, + "step": 18492 + }, + { + "epoch": 0.8120574771946387, + "grad_norm": 1.5, + "learning_rate": 4.2572041488088325e-06, + "loss": 0.2917, + "step": 18494 + }, + { + "epoch": 0.8121452956738422, + "grad_norm": 1.5546875, + "learning_rate": 4.253343212681657e-06, + "loss": 0.335, + "step": 18496 + }, + { + "epoch": 0.8122331141530457, + "grad_norm": 1.5078125, + "learning_rate": 4.249483865338435e-06, + "loss": 0.324, + "step": 18498 + }, + { + "epoch": 0.8123209326322491, + "grad_norm": 1.546875, + "learning_rate": 4.24562610707471e-06, + "loss": 0.3325, + "step": 18500 + }, + { + "epoch": 0.8124087511114526, + "grad_norm": 1.4375, + "learning_rate": 4.241769938185907e-06, + "loss": 0.3297, + "step": 18502 + }, + { + "epoch": 0.8124965695906561, + "grad_norm": 1.4296875, + "learning_rate": 4.237915358967348e-06, + "loss": 0.3245, + "step": 18504 + }, + { + "epoch": 0.8125843880698596, + "grad_norm": 1.453125, + "learning_rate": 4.234062369714198e-06, + "loss": 0.2839, + "step": 18506 + }, + { + "epoch": 0.8126722065490631, + "grad_norm": 1.4609375, + "learning_rate": 4.230210970721538e-06, + "loss": 0.3228, + "step": 18508 + }, + { + "epoch": 0.8127600250282666, + "grad_norm": 1.4375, + "learning_rate": 4.226361162284298e-06, + "loss": 0.3114, + "step": 18510 + }, + { + "epoch": 0.8128478435074701, + "grad_norm": 1.5859375, + "learning_rate": 4.222512944697296e-06, + "loss": 0.3034, + "step": 18512 + }, + { + "epoch": 0.8129356619866736, + "grad_norm": 1.546875, + "learning_rate": 4.218666318255238e-06, + "loss": 0.2847, + "step": 18514 + }, + { + "epoch": 0.813023480465877, + "grad_norm": 1.453125, + "learning_rate": 4.21482128325269e-06, + "loss": 0.3017, + "step": 18516 + }, + { + "epoch": 0.8131112989450805, + "grad_norm": 1.6171875, + "learning_rate": 4.210977839984117e-06, + "loss": 0.3169, + "step": 18518 + }, + { + "epoch": 0.813199117424284, + "grad_norm": 1.5625, + "learning_rate": 4.207135988743844e-06, + "loss": 0.3233, + "step": 18520 + }, + { + "epoch": 0.8132869359034874, + "grad_norm": 1.5625, + "learning_rate": 4.203295729826076e-06, + "loss": 0.3397, + "step": 18522 + }, + { + "epoch": 0.813374754382691, + "grad_norm": 1.453125, + "learning_rate": 4.199457063524911e-06, + "loss": 0.3195, + "step": 18524 + }, + { + "epoch": 0.8134625728618945, + "grad_norm": 1.53125, + "learning_rate": 4.1956199901343055e-06, + "loss": 0.3229, + "step": 18526 + }, + { + "epoch": 0.813550391341098, + "grad_norm": 1.53125, + "learning_rate": 4.191784509948121e-06, + "loss": 0.2953, + "step": 18528 + }, + { + "epoch": 0.8136382098203014, + "grad_norm": 1.515625, + "learning_rate": 4.187950623260053e-06, + "loss": 0.3143, + "step": 18530 + }, + { + "epoch": 0.8137260282995049, + "grad_norm": 1.4921875, + "learning_rate": 4.184118330363721e-06, + "loss": 0.3555, + "step": 18532 + }, + { + "epoch": 0.8138138467787084, + "grad_norm": 1.4296875, + "learning_rate": 4.180287631552593e-06, + "loss": 0.3137, + "step": 18534 + }, + { + "epoch": 0.8139016652579119, + "grad_norm": 1.46875, + "learning_rate": 4.176458527120034e-06, + "loss": 0.3324, + "step": 18536 + }, + { + "epoch": 0.8139894837371154, + "grad_norm": 1.5546875, + "learning_rate": 4.172631017359274e-06, + "loss": 0.3093, + "step": 18538 + }, + { + "epoch": 0.8140773022163189, + "grad_norm": 1.5390625, + "learning_rate": 4.168805102563414e-06, + "loss": 0.3012, + "step": 18540 + }, + { + "epoch": 0.8141651206955224, + "grad_norm": 1.484375, + "learning_rate": 4.164980783025463e-06, + "loss": 0.2968, + "step": 18542 + }, + { + "epoch": 0.8142529391747259, + "grad_norm": 1.5234375, + "learning_rate": 4.1611580590382695e-06, + "loss": 0.3131, + "step": 18544 + }, + { + "epoch": 0.8143407576539293, + "grad_norm": 1.4453125, + "learning_rate": 4.157336930894593e-06, + "loss": 0.3175, + "step": 18546 + }, + { + "epoch": 0.8144285761331328, + "grad_norm": 1.484375, + "learning_rate": 4.153517398887053e-06, + "loss": 0.2955, + "step": 18548 + }, + { + "epoch": 0.8145163946123363, + "grad_norm": 1.5, + "learning_rate": 4.14969946330814e-06, + "loss": 0.3392, + "step": 18550 + }, + { + "epoch": 0.8146042130915397, + "grad_norm": 1.5703125, + "learning_rate": 4.145883124450245e-06, + "loss": 0.3102, + "step": 18552 + }, + { + "epoch": 0.8146920315707433, + "grad_norm": 1.4453125, + "learning_rate": 4.142068382605615e-06, + "loss": 0.3169, + "step": 18554 + }, + { + "epoch": 0.8147798500499468, + "grad_norm": 1.5, + "learning_rate": 4.138255238066397e-06, + "loss": 0.3488, + "step": 18556 + }, + { + "epoch": 0.8148676685291503, + "grad_norm": 1.6015625, + "learning_rate": 4.13444369112459e-06, + "loss": 0.3287, + "step": 18558 + }, + { + "epoch": 0.8149554870083537, + "grad_norm": 1.40625, + "learning_rate": 4.130633742072087e-06, + "loss": 0.3307, + "step": 18560 + }, + { + "epoch": 0.8150433054875572, + "grad_norm": 1.5703125, + "learning_rate": 4.126825391200656e-06, + "loss": 0.3352, + "step": 18562 + }, + { + "epoch": 0.8151311239667607, + "grad_norm": 1.5234375, + "learning_rate": 4.123018638801935e-06, + "loss": 0.3291, + "step": 18564 + }, + { + "epoch": 0.8152189424459642, + "grad_norm": 1.4609375, + "learning_rate": 4.119213485167456e-06, + "loss": 0.3104, + "step": 18566 + }, + { + "epoch": 0.8153067609251676, + "grad_norm": 1.4765625, + "learning_rate": 4.115409930588606e-06, + "loss": 0.312, + "step": 18568 + }, + { + "epoch": 0.8153945794043712, + "grad_norm": 1.5390625, + "learning_rate": 4.111607975356679e-06, + "loss": 0.3169, + "step": 18570 + }, + { + "epoch": 0.8154823978835747, + "grad_norm": 1.5625, + "learning_rate": 4.1078076197628214e-06, + "loss": 0.3016, + "step": 18572 + }, + { + "epoch": 0.8155702163627782, + "grad_norm": 1.4609375, + "learning_rate": 4.104008864098055e-06, + "loss": 0.3218, + "step": 18574 + }, + { + "epoch": 0.8156580348419816, + "grad_norm": 1.4765625, + "learning_rate": 4.100211708653306e-06, + "loss": 0.305, + "step": 18576 + }, + { + "epoch": 0.8157458533211851, + "grad_norm": 1.5, + "learning_rate": 4.0964161537193486e-06, + "loss": 0.3193, + "step": 18578 + }, + { + "epoch": 0.8158336718003886, + "grad_norm": 1.4609375, + "learning_rate": 4.092622199586859e-06, + "loss": 0.3265, + "step": 18580 + }, + { + "epoch": 0.815921490279592, + "grad_norm": 1.5390625, + "learning_rate": 4.088829846546374e-06, + "loss": 0.2984, + "step": 18582 + }, + { + "epoch": 0.8160093087587955, + "grad_norm": 1.6015625, + "learning_rate": 4.085039094888307e-06, + "loss": 0.3128, + "step": 18584 + }, + { + "epoch": 0.8160971272379991, + "grad_norm": 1.5078125, + "learning_rate": 4.0812499449029624e-06, + "loss": 0.3125, + "step": 18586 + }, + { + "epoch": 0.8161849457172026, + "grad_norm": 1.46875, + "learning_rate": 4.077462396880508e-06, + "loss": 0.3213, + "step": 18588 + }, + { + "epoch": 0.816272764196406, + "grad_norm": 1.4375, + "learning_rate": 4.073676451111011e-06, + "loss": 0.2875, + "step": 18590 + }, + { + "epoch": 0.8163605826756095, + "grad_norm": 1.4609375, + "learning_rate": 4.069892107884374e-06, + "loss": 0.3203, + "step": 18592 + }, + { + "epoch": 0.816448401154813, + "grad_norm": 1.4921875, + "learning_rate": 4.066109367490426e-06, + "loss": 0.3497, + "step": 18594 + }, + { + "epoch": 0.8165362196340165, + "grad_norm": 1.4921875, + "learning_rate": 4.062328230218831e-06, + "loss": 0.2914, + "step": 18596 + }, + { + "epoch": 0.8166240381132199, + "grad_norm": 1.640625, + "learning_rate": 4.0585486963591655e-06, + "loss": 0.3261, + "step": 18598 + }, + { + "epoch": 0.8167118565924235, + "grad_norm": 1.46875, + "learning_rate": 4.0547707662008634e-06, + "loss": 0.3163, + "step": 18600 + }, + { + "epoch": 0.816799675071627, + "grad_norm": 1.5, + "learning_rate": 4.050994440033229e-06, + "loss": 0.3222, + "step": 18602 + }, + { + "epoch": 0.8168874935508305, + "grad_norm": 1.5234375, + "learning_rate": 4.04721971814547e-06, + "loss": 0.3136, + "step": 18604 + }, + { + "epoch": 0.8169753120300339, + "grad_norm": 1.4921875, + "learning_rate": 4.0434466008266395e-06, + "loss": 0.3103, + "step": 18606 + }, + { + "epoch": 0.8170631305092374, + "grad_norm": 1.4296875, + "learning_rate": 4.0396750883657e-06, + "loss": 0.3185, + "step": 18608 + }, + { + "epoch": 0.8171509489884409, + "grad_norm": 1.5, + "learning_rate": 4.035905181051464e-06, + "loss": 0.3172, + "step": 18610 + }, + { + "epoch": 0.8172387674676443, + "grad_norm": 1.515625, + "learning_rate": 4.0321368791726325e-06, + "loss": 0.3128, + "step": 18612 + }, + { + "epoch": 0.8173265859468478, + "grad_norm": 1.3984375, + "learning_rate": 4.028370183017788e-06, + "loss": 0.3079, + "step": 18614 + }, + { + "epoch": 0.8174144044260514, + "grad_norm": 1.578125, + "learning_rate": 4.024605092875378e-06, + "loss": 0.3365, + "step": 18616 + }, + { + "epoch": 0.8175022229052549, + "grad_norm": 1.46875, + "learning_rate": 4.020841609033743e-06, + "loss": 0.3228, + "step": 18618 + }, + { + "epoch": 0.8175900413844583, + "grad_norm": 1.4296875, + "learning_rate": 4.0170797317810875e-06, + "loss": 0.3432, + "step": 18620 + }, + { + "epoch": 0.8176778598636618, + "grad_norm": 1.46875, + "learning_rate": 4.013319461405493e-06, + "loss": 0.3294, + "step": 18622 + }, + { + "epoch": 0.8177656783428653, + "grad_norm": 1.4296875, + "learning_rate": 4.009560798194928e-06, + "loss": 0.3218, + "step": 18624 + }, + { + "epoch": 0.8178534968220688, + "grad_norm": 1.4375, + "learning_rate": 4.005803742437222e-06, + "loss": 0.3066, + "step": 18626 + }, + { + "epoch": 0.8179413153012722, + "grad_norm": 1.5234375, + "learning_rate": 4.002048294420105e-06, + "loss": 0.3282, + "step": 18628 + }, + { + "epoch": 0.8180291337804757, + "grad_norm": 1.4453125, + "learning_rate": 3.998294454431157e-06, + "loss": 0.3375, + "step": 18630 + }, + { + "epoch": 0.8181169522596793, + "grad_norm": 1.5625, + "learning_rate": 3.99454222275786e-06, + "loss": 0.3129, + "step": 18632 + }, + { + "epoch": 0.8182047707388828, + "grad_norm": 1.3828125, + "learning_rate": 3.990791599687554e-06, + "loss": 0.332, + "step": 18634 + }, + { + "epoch": 0.8182925892180862, + "grad_norm": 1.578125, + "learning_rate": 3.987042585507458e-06, + "loss": 0.325, + "step": 18636 + }, + { + "epoch": 0.8183804076972897, + "grad_norm": 1.5625, + "learning_rate": 3.983295180504685e-06, + "loss": 0.2998, + "step": 18638 + }, + { + "epoch": 0.8184682261764932, + "grad_norm": 1.4921875, + "learning_rate": 3.979549384966197e-06, + "loss": 0.2963, + "step": 18640 + }, + { + "epoch": 0.8185560446556966, + "grad_norm": 1.4765625, + "learning_rate": 3.975805199178865e-06, + "loss": 0.3278, + "step": 18642 + }, + { + "epoch": 0.8186438631349001, + "grad_norm": 1.4765625, + "learning_rate": 3.972062623429409e-06, + "loss": 0.3575, + "step": 18644 + }, + { + "epoch": 0.8187316816141037, + "grad_norm": 1.453125, + "learning_rate": 3.968321658004431e-06, + "loss": 0.3186, + "step": 18646 + }, + { + "epoch": 0.8188195000933072, + "grad_norm": 1.5859375, + "learning_rate": 3.964582303190428e-06, + "loss": 0.3378, + "step": 18648 + }, + { + "epoch": 0.8189073185725106, + "grad_norm": 1.578125, + "learning_rate": 3.9608445592737575e-06, + "loss": 0.311, + "step": 18650 + }, + { + "epoch": 0.8189951370517141, + "grad_norm": 1.515625, + "learning_rate": 3.957108426540654e-06, + "loss": 0.2803, + "step": 18652 + }, + { + "epoch": 0.8190829555309176, + "grad_norm": 1.4453125, + "learning_rate": 3.953373905277222e-06, + "loss": 0.2866, + "step": 18654 + }, + { + "epoch": 0.8191707740101211, + "grad_norm": 1.5, + "learning_rate": 3.949640995769471e-06, + "loss": 0.3107, + "step": 18656 + }, + { + "epoch": 0.8192585924893245, + "grad_norm": 1.4375, + "learning_rate": 3.945909698303249e-06, + "loss": 0.321, + "step": 18658 + }, + { + "epoch": 0.819346410968528, + "grad_norm": 1.5625, + "learning_rate": 3.942180013164318e-06, + "loss": 0.305, + "step": 18660 + }, + { + "epoch": 0.8194342294477316, + "grad_norm": 1.4453125, + "learning_rate": 3.938451940638291e-06, + "loss": 0.3157, + "step": 18662 + }, + { + "epoch": 0.8195220479269351, + "grad_norm": 1.5, + "learning_rate": 3.934725481010653e-06, + "loss": 0.3003, + "step": 18664 + }, + { + "epoch": 0.8196098664061385, + "grad_norm": 1.4921875, + "learning_rate": 3.931000634566798e-06, + "loss": 0.343, + "step": 18666 + }, + { + "epoch": 0.819697684885342, + "grad_norm": 1.4921875, + "learning_rate": 3.927277401591956e-06, + "loss": 0.3026, + "step": 18668 + }, + { + "epoch": 0.8197855033645455, + "grad_norm": 1.5, + "learning_rate": 3.923555782371269e-06, + "loss": 0.3104, + "step": 18670 + }, + { + "epoch": 0.819873321843749, + "grad_norm": 1.5703125, + "learning_rate": 3.919835777189732e-06, + "loss": 0.3062, + "step": 18672 + }, + { + "epoch": 0.8199611403229524, + "grad_norm": 1.4921875, + "learning_rate": 3.916117386332219e-06, + "loss": 0.31, + "step": 18674 + }, + { + "epoch": 0.8200489588021559, + "grad_norm": 1.5390625, + "learning_rate": 3.912400610083494e-06, + "loss": 0.2994, + "step": 18676 + }, + { + "epoch": 0.8201367772813595, + "grad_norm": 1.5703125, + "learning_rate": 3.908685448728183e-06, + "loss": 0.3164, + "step": 18678 + }, + { + "epoch": 0.820224595760563, + "grad_norm": 1.5, + "learning_rate": 3.9049719025508e-06, + "loss": 0.3125, + "step": 18680 + }, + { + "epoch": 0.8203124142397664, + "grad_norm": 1.5234375, + "learning_rate": 3.901259971835728e-06, + "loss": 0.3175, + "step": 18682 + }, + { + "epoch": 0.8204002327189699, + "grad_norm": 1.5234375, + "learning_rate": 3.897549656867222e-06, + "loss": 0.3205, + "step": 18684 + }, + { + "epoch": 0.8204880511981734, + "grad_norm": 1.484375, + "learning_rate": 3.893840957929423e-06, + "loss": 0.3137, + "step": 18686 + }, + { + "epoch": 0.8205758696773768, + "grad_norm": 1.578125, + "learning_rate": 3.8901338753063375e-06, + "loss": 0.3512, + "step": 18688 + }, + { + "epoch": 0.8206636881565803, + "grad_norm": 1.4609375, + "learning_rate": 3.886428409281867e-06, + "loss": 0.3016, + "step": 18690 + }, + { + "epoch": 0.8207515066357839, + "grad_norm": 1.578125, + "learning_rate": 3.882724560139764e-06, + "loss": 0.3361, + "step": 18692 + }, + { + "epoch": 0.8208393251149874, + "grad_norm": 1.5, + "learning_rate": 3.879022328163681e-06, + "loss": 0.3345, + "step": 18694 + }, + { + "epoch": 0.8209271435941908, + "grad_norm": 1.6328125, + "learning_rate": 3.875321713637131e-06, + "loss": 0.3463, + "step": 18696 + }, + { + "epoch": 0.8210149620733943, + "grad_norm": 1.5703125, + "learning_rate": 3.8716227168435035e-06, + "loss": 0.3222, + "step": 18698 + }, + { + "epoch": 0.8211027805525978, + "grad_norm": 1.5, + "learning_rate": 3.867925338066078e-06, + "loss": 0.3165, + "step": 18700 + }, + { + "epoch": 0.8211905990318012, + "grad_norm": 1.53125, + "learning_rate": 3.864229577587991e-06, + "loss": 0.2992, + "step": 18702 + }, + { + "epoch": 0.8212784175110047, + "grad_norm": 1.5234375, + "learning_rate": 3.860535435692275e-06, + "loss": 0.3192, + "step": 18704 + }, + { + "epoch": 0.8213662359902082, + "grad_norm": 1.5078125, + "learning_rate": 3.856842912661823e-06, + "loss": 0.3102, + "step": 18706 + }, + { + "epoch": 0.8214540544694118, + "grad_norm": 1.4609375, + "learning_rate": 3.853152008779401e-06, + "loss": 0.3308, + "step": 18708 + }, + { + "epoch": 0.8215418729486152, + "grad_norm": 1.453125, + "learning_rate": 3.8494627243276764e-06, + "loss": 0.3053, + "step": 18710 + }, + { + "epoch": 0.8216296914278187, + "grad_norm": 1.4453125, + "learning_rate": 3.8457750595891656e-06, + "loss": 0.3065, + "step": 18712 + }, + { + "epoch": 0.8217175099070222, + "grad_norm": 1.4609375, + "learning_rate": 3.84208901484627e-06, + "loss": 0.3132, + "step": 18714 + }, + { + "epoch": 0.8218053283862257, + "grad_norm": 1.53125, + "learning_rate": 3.838404590381267e-06, + "loss": 0.3347, + "step": 18716 + }, + { + "epoch": 0.8218931468654291, + "grad_norm": 1.46875, + "learning_rate": 3.834721786476317e-06, + "loss": 0.2712, + "step": 18718 + }, + { + "epoch": 0.8219809653446326, + "grad_norm": 1.4921875, + "learning_rate": 3.831040603413441e-06, + "loss": 0.3004, + "step": 18720 + }, + { + "epoch": 0.8220687838238361, + "grad_norm": 1.5078125, + "learning_rate": 3.827361041474556e-06, + "loss": 0.3115, + "step": 18722 + }, + { + "epoch": 0.8221566023030397, + "grad_norm": 1.46875, + "learning_rate": 3.823683100941436e-06, + "loss": 0.3081, + "step": 18724 + }, + { + "epoch": 0.8222444207822431, + "grad_norm": 1.5, + "learning_rate": 3.820006782095736e-06, + "loss": 0.3414, + "step": 18726 + }, + { + "epoch": 0.8223322392614466, + "grad_norm": 1.4921875, + "learning_rate": 3.816332085218999e-06, + "loss": 0.3236, + "step": 18728 + }, + { + "epoch": 0.8224200577406501, + "grad_norm": 1.4921875, + "learning_rate": 3.812659010592626e-06, + "loss": 0.3281, + "step": 18730 + }, + { + "epoch": 0.8225078762198536, + "grad_norm": 1.4453125, + "learning_rate": 3.808987558497906e-06, + "loss": 0.302, + "step": 18732 + }, + { + "epoch": 0.822595694699057, + "grad_norm": 1.4765625, + "learning_rate": 3.8053177292160015e-06, + "loss": 0.3059, + "step": 18734 + }, + { + "epoch": 0.8226835131782605, + "grad_norm": 1.4765625, + "learning_rate": 3.801649523027942e-06, + "loss": 0.325, + "step": 18736 + }, + { + "epoch": 0.822771331657464, + "grad_norm": 1.5703125, + "learning_rate": 3.7979829402146477e-06, + "loss": 0.3035, + "step": 18738 + }, + { + "epoch": 0.8228591501366675, + "grad_norm": 1.46875, + "learning_rate": 3.794317981056894e-06, + "loss": 0.3197, + "step": 18740 + }, + { + "epoch": 0.822946968615871, + "grad_norm": 1.4765625, + "learning_rate": 3.7906546458353677e-06, + "loss": 0.3018, + "step": 18742 + }, + { + "epoch": 0.8230347870950745, + "grad_norm": 1.4375, + "learning_rate": 3.78699293483058e-06, + "loss": 0.3275, + "step": 18744 + }, + { + "epoch": 0.823122605574278, + "grad_norm": 1.484375, + "learning_rate": 3.783332848322965e-06, + "loss": 0.3183, + "step": 18746 + }, + { + "epoch": 0.8232104240534814, + "grad_norm": 1.484375, + "learning_rate": 3.7796743865928045e-06, + "loss": 0.3437, + "step": 18748 + }, + { + "epoch": 0.8232982425326849, + "grad_norm": 1.3984375, + "learning_rate": 3.776017549920263e-06, + "loss": 0.3303, + "step": 18750 + }, + { + "epoch": 0.8233860610118884, + "grad_norm": 1.453125, + "learning_rate": 3.772362338585389e-06, + "loss": 0.3062, + "step": 18752 + }, + { + "epoch": 0.823473879491092, + "grad_norm": 1.4609375, + "learning_rate": 3.7687087528680915e-06, + "loss": 0.3194, + "step": 18754 + }, + { + "epoch": 0.8235616979702954, + "grad_norm": 1.546875, + "learning_rate": 3.7650567930481716e-06, + "loss": 0.3231, + "step": 18756 + }, + { + "epoch": 0.8236495164494989, + "grad_norm": 1.5, + "learning_rate": 3.761406459405292e-06, + "loss": 0.3186, + "step": 18758 + }, + { + "epoch": 0.8237373349287024, + "grad_norm": 1.453125, + "learning_rate": 3.7577577522189935e-06, + "loss": 0.3101, + "step": 18760 + }, + { + "epoch": 0.8238251534079059, + "grad_norm": 1.5703125, + "learning_rate": 3.754110671768704e-06, + "loss": 0.313, + "step": 18762 + }, + { + "epoch": 0.8239129718871093, + "grad_norm": 1.4140625, + "learning_rate": 3.750465218333704e-06, + "loss": 0.3161, + "step": 18764 + }, + { + "epoch": 0.8240007903663128, + "grad_norm": 1.546875, + "learning_rate": 3.746821392193181e-06, + "loss": 0.3347, + "step": 18766 + }, + { + "epoch": 0.8240886088455163, + "grad_norm": 1.484375, + "learning_rate": 3.7431791936261672e-06, + "loss": 0.3277, + "step": 18768 + }, + { + "epoch": 0.8241764273247199, + "grad_norm": 1.5546875, + "learning_rate": 3.739538622911584e-06, + "loss": 0.3013, + "step": 18770 + }, + { + "epoch": 0.8242642458039233, + "grad_norm": 1.5, + "learning_rate": 3.7358996803282335e-06, + "loss": 0.3254, + "step": 18772 + }, + { + "epoch": 0.8243520642831268, + "grad_norm": 1.4296875, + "learning_rate": 3.7322623661547816e-06, + "loss": 0.3235, + "step": 18774 + }, + { + "epoch": 0.8244398827623303, + "grad_norm": 1.4375, + "learning_rate": 3.7286266806697777e-06, + "loss": 0.3175, + "step": 18776 + }, + { + "epoch": 0.8245277012415337, + "grad_norm": 1.4375, + "learning_rate": 3.7249926241516353e-06, + "loss": 0.2844, + "step": 18778 + }, + { + "epoch": 0.8246155197207372, + "grad_norm": 1.5234375, + "learning_rate": 3.721360196878662e-06, + "loss": 0.3042, + "step": 18780 + }, + { + "epoch": 0.8247033381999407, + "grad_norm": 1.4765625, + "learning_rate": 3.7177293991290273e-06, + "loss": 0.3065, + "step": 18782 + }, + { + "epoch": 0.8247911566791442, + "grad_norm": 1.5078125, + "learning_rate": 3.7141002311807698e-06, + "loss": 0.2974, + "step": 18784 + }, + { + "epoch": 0.8248789751583477, + "grad_norm": 1.4375, + "learning_rate": 3.7104726933118282e-06, + "loss": 0.3056, + "step": 18786 + }, + { + "epoch": 0.8249667936375512, + "grad_norm": 1.421875, + "learning_rate": 3.7068467857999832e-06, + "loss": 0.3465, + "step": 18788 + }, + { + "epoch": 0.8250546121167547, + "grad_norm": 1.515625, + "learning_rate": 3.703222508922921e-06, + "loss": 0.3242, + "step": 18790 + }, + { + "epoch": 0.8251424305959582, + "grad_norm": 1.5625, + "learning_rate": 3.699599862958178e-06, + "loss": 0.3172, + "step": 18792 + }, + { + "epoch": 0.8252302490751616, + "grad_norm": 1.6484375, + "learning_rate": 3.69597884818319e-06, + "loss": 0.3146, + "step": 18794 + }, + { + "epoch": 0.8253180675543651, + "grad_norm": 1.4296875, + "learning_rate": 3.69235946487525e-06, + "loss": 0.3353, + "step": 18796 + }, + { + "epoch": 0.8254058860335686, + "grad_norm": 1.4609375, + "learning_rate": 3.688741713311522e-06, + "loss": 0.3312, + "step": 18798 + }, + { + "epoch": 0.8254937045127722, + "grad_norm": 1.453125, + "learning_rate": 3.6851255937690704e-06, + "loss": 0.3205, + "step": 18800 + }, + { + "epoch": 0.8255815229919756, + "grad_norm": 1.4375, + "learning_rate": 3.6815111065248043e-06, + "loss": 0.2822, + "step": 18802 + }, + { + "epoch": 0.8256693414711791, + "grad_norm": 1.671875, + "learning_rate": 3.677898251855538e-06, + "loss": 0.3132, + "step": 18804 + }, + { + "epoch": 0.8257571599503826, + "grad_norm": 1.53125, + "learning_rate": 3.6742870300379232e-06, + "loss": 0.3054, + "step": 18806 + }, + { + "epoch": 0.825844978429586, + "grad_norm": 1.4453125, + "learning_rate": 3.6706774413485275e-06, + "loss": 0.2958, + "step": 18808 + }, + { + "epoch": 0.8259327969087895, + "grad_norm": 1.5078125, + "learning_rate": 3.667069486063765e-06, + "loss": 0.3208, + "step": 18810 + }, + { + "epoch": 0.826020615387993, + "grad_norm": 1.515625, + "learning_rate": 3.6634631644599295e-06, + "loss": 0.3191, + "step": 18812 + }, + { + "epoch": 0.8261084338671965, + "grad_norm": 1.5, + "learning_rate": 3.659858476813205e-06, + "loss": 0.3186, + "step": 18814 + }, + { + "epoch": 0.8261962523464, + "grad_norm": 1.46875, + "learning_rate": 3.6562554233996295e-06, + "loss": 0.3111, + "step": 18816 + }, + { + "epoch": 0.8262840708256035, + "grad_norm": 1.4453125, + "learning_rate": 3.6526540044951346e-06, + "loss": 0.3027, + "step": 18818 + }, + { + "epoch": 0.826371889304807, + "grad_norm": 1.4140625, + "learning_rate": 3.649054220375514e-06, + "loss": 0.2807, + "step": 18820 + }, + { + "epoch": 0.8264597077840105, + "grad_norm": 1.5, + "learning_rate": 3.6454560713164334e-06, + "loss": 0.3191, + "step": 18822 + }, + { + "epoch": 0.8265475262632139, + "grad_norm": 1.484375, + "learning_rate": 3.6418595575934524e-06, + "loss": 0.3282, + "step": 18824 + }, + { + "epoch": 0.8266353447424174, + "grad_norm": 1.4609375, + "learning_rate": 3.6382646794819785e-06, + "loss": 0.3097, + "step": 18826 + }, + { + "epoch": 0.8267231632216209, + "grad_norm": 1.578125, + "learning_rate": 3.6346714372573224e-06, + "loss": 0.3229, + "step": 18828 + }, + { + "epoch": 0.8268109817008243, + "grad_norm": 1.46875, + "learning_rate": 3.6310798311946503e-06, + "loss": 0.3392, + "step": 18830 + }, + { + "epoch": 0.8268988001800279, + "grad_norm": 1.421875, + "learning_rate": 3.627489861569003e-06, + "loss": 0.2793, + "step": 18832 + }, + { + "epoch": 0.8269866186592314, + "grad_norm": 1.484375, + "learning_rate": 3.623901528655313e-06, + "loss": 0.3158, + "step": 18834 + }, + { + "epoch": 0.8270744371384349, + "grad_norm": 1.4375, + "learning_rate": 3.620314832728361e-06, + "loss": 0.3022, + "step": 18836 + }, + { + "epoch": 0.8271622556176383, + "grad_norm": 1.515625, + "learning_rate": 3.616729774062827e-06, + "loss": 0.2894, + "step": 18838 + }, + { + "epoch": 0.8272500740968418, + "grad_norm": 1.53125, + "learning_rate": 3.613146352933247e-06, + "loss": 0.2916, + "step": 18840 + }, + { + "epoch": 0.8273378925760453, + "grad_norm": 1.3828125, + "learning_rate": 3.6095645696140547e-06, + "loss": 0.3082, + "step": 18842 + }, + { + "epoch": 0.8274257110552488, + "grad_norm": 1.3984375, + "learning_rate": 3.6059844243795327e-06, + "loss": 0.3201, + "step": 18844 + }, + { + "epoch": 0.8275135295344523, + "grad_norm": 1.765625, + "learning_rate": 3.6024059175038455e-06, + "loss": 0.293, + "step": 18846 + }, + { + "epoch": 0.8276013480136558, + "grad_norm": 1.4453125, + "learning_rate": 3.5988290492610488e-06, + "loss": 0.3128, + "step": 18848 + }, + { + "epoch": 0.8276891664928593, + "grad_norm": 1.40625, + "learning_rate": 3.5952538199250515e-06, + "loss": 0.2913, + "step": 18850 + }, + { + "epoch": 0.8277769849720628, + "grad_norm": 1.46875, + "learning_rate": 3.5916802297696506e-06, + "loss": 0.3064, + "step": 18852 + }, + { + "epoch": 0.8278648034512662, + "grad_norm": 1.546875, + "learning_rate": 3.5881082790685026e-06, + "loss": 0.322, + "step": 18854 + }, + { + "epoch": 0.8279526219304697, + "grad_norm": 1.4765625, + "learning_rate": 3.5845379680951614e-06, + "loss": 0.3225, + "step": 18856 + }, + { + "epoch": 0.8280404404096732, + "grad_norm": 1.5390625, + "learning_rate": 3.580969297123038e-06, + "loss": 0.3068, + "step": 18858 + }, + { + "epoch": 0.8281282588888766, + "grad_norm": 1.7578125, + "learning_rate": 3.577402266425414e-06, + "loss": 0.3302, + "step": 18860 + }, + { + "epoch": 0.8282160773680802, + "grad_norm": 1.5078125, + "learning_rate": 3.573836876275466e-06, + "loss": 0.3012, + "step": 18862 + }, + { + "epoch": 0.8283038958472837, + "grad_norm": 1.5, + "learning_rate": 3.5702731269462193e-06, + "loss": 0.338, + "step": 18864 + }, + { + "epoch": 0.8283917143264872, + "grad_norm": 1.4609375, + "learning_rate": 3.5667110187106056e-06, + "loss": 0.3062, + "step": 18866 + }, + { + "epoch": 0.8284795328056906, + "grad_norm": 1.4140625, + "learning_rate": 3.5631505518413904e-06, + "loss": 0.2983, + "step": 18868 + }, + { + "epoch": 0.8285673512848941, + "grad_norm": 1.5234375, + "learning_rate": 3.5595917266112474e-06, + "loss": 0.309, + "step": 18870 + }, + { + "epoch": 0.8286551697640976, + "grad_norm": 1.546875, + "learning_rate": 3.5560345432927104e-06, + "loss": 0.3137, + "step": 18872 + }, + { + "epoch": 0.8287429882433011, + "grad_norm": 1.4453125, + "learning_rate": 3.552479002158185e-06, + "loss": 0.3057, + "step": 18874 + }, + { + "epoch": 0.8288308067225045, + "grad_norm": 1.4765625, + "learning_rate": 3.5489251034799608e-06, + "loss": 0.328, + "step": 18876 + }, + { + "epoch": 0.8289186252017081, + "grad_norm": 1.46875, + "learning_rate": 3.545372847530193e-06, + "loss": 0.3488, + "step": 18878 + }, + { + "epoch": 0.8290064436809116, + "grad_norm": 1.5390625, + "learning_rate": 3.5418222345809186e-06, + "loss": 0.3083, + "step": 18880 + }, + { + "epoch": 0.8290942621601151, + "grad_norm": 1.6640625, + "learning_rate": 3.5382732649040405e-06, + "loss": 0.3395, + "step": 18882 + }, + { + "epoch": 0.8291820806393185, + "grad_norm": 1.53125, + "learning_rate": 3.5347259387713354e-06, + "loss": 0.314, + "step": 18884 + }, + { + "epoch": 0.829269899118522, + "grad_norm": 1.5078125, + "learning_rate": 3.5311802564544723e-06, + "loss": 0.2924, + "step": 18886 + }, + { + "epoch": 0.8293577175977255, + "grad_norm": 1.46875, + "learning_rate": 3.527636218224961e-06, + "loss": 0.3303, + "step": 18888 + }, + { + "epoch": 0.829445536076929, + "grad_norm": 1.578125, + "learning_rate": 3.5240938243542244e-06, + "loss": 0.3289, + "step": 18890 + }, + { + "epoch": 0.8295333545561325, + "grad_norm": 1.4375, + "learning_rate": 3.5205530751135307e-06, + "loss": 0.3181, + "step": 18892 + }, + { + "epoch": 0.829621173035336, + "grad_norm": 1.6015625, + "learning_rate": 3.5170139707740247e-06, + "loss": 0.3109, + "step": 18894 + }, + { + "epoch": 0.8297089915145395, + "grad_norm": 1.5625, + "learning_rate": 3.5134765116067504e-06, + "loss": 0.3041, + "step": 18896 + }, + { + "epoch": 0.829796809993743, + "grad_norm": 1.453125, + "learning_rate": 3.5099406978825857e-06, + "loss": 0.3023, + "step": 18898 + }, + { + "epoch": 0.8298846284729464, + "grad_norm": 1.578125, + "learning_rate": 3.5064065298723163e-06, + "loss": 0.3104, + "step": 18900 + }, + { + "epoch": 0.8299724469521499, + "grad_norm": 1.46875, + "learning_rate": 3.502874007846585e-06, + "loss": 0.3267, + "step": 18902 + }, + { + "epoch": 0.8300602654313534, + "grad_norm": 1.4609375, + "learning_rate": 3.499343132075919e-06, + "loss": 0.2919, + "step": 18904 + }, + { + "epoch": 0.8301480839105568, + "grad_norm": 1.59375, + "learning_rate": 3.495813902830711e-06, + "loss": 0.3234, + "step": 18906 + }, + { + "epoch": 0.8302359023897604, + "grad_norm": 1.4609375, + "learning_rate": 3.492286320381222e-06, + "loss": 0.3001, + "step": 18908 + }, + { + "epoch": 0.8303237208689639, + "grad_norm": 1.5859375, + "learning_rate": 3.488760384997608e-06, + "loss": 0.2751, + "step": 18910 + }, + { + "epoch": 0.8304115393481674, + "grad_norm": 1.53125, + "learning_rate": 3.485236096949876e-06, + "loss": 0.2929, + "step": 18912 + }, + { + "epoch": 0.8304993578273708, + "grad_norm": 1.4453125, + "learning_rate": 3.4817134565079264e-06, + "loss": 0.3345, + "step": 18914 + }, + { + "epoch": 0.8305871763065743, + "grad_norm": 1.53125, + "learning_rate": 3.478192463941518e-06, + "loss": 0.3347, + "step": 18916 + }, + { + "epoch": 0.8306749947857778, + "grad_norm": 1.546875, + "learning_rate": 3.4746731195202857e-06, + "loss": 0.3297, + "step": 18918 + }, + { + "epoch": 0.8307628132649812, + "grad_norm": 1.484375, + "learning_rate": 3.47115542351375e-06, + "loss": 0.2782, + "step": 18920 + }, + { + "epoch": 0.8308506317441847, + "grad_norm": 1.375, + "learning_rate": 3.467639376191287e-06, + "loss": 0.3137, + "step": 18922 + }, + { + "epoch": 0.8309384502233883, + "grad_norm": 1.4140625, + "learning_rate": 3.4641249778221694e-06, + "loss": 0.3205, + "step": 18924 + }, + { + "epoch": 0.8310262687025918, + "grad_norm": 1.4609375, + "learning_rate": 3.460612228675522e-06, + "loss": 0.3171, + "step": 18926 + }, + { + "epoch": 0.8311140871817952, + "grad_norm": 1.484375, + "learning_rate": 3.4571011290203543e-06, + "loss": 0.2997, + "step": 18928 + }, + { + "epoch": 0.8312019056609987, + "grad_norm": 1.46875, + "learning_rate": 3.4535916791255424e-06, + "loss": 0.3184, + "step": 18930 + }, + { + "epoch": 0.8312897241402022, + "grad_norm": 1.53125, + "learning_rate": 3.4500838792598467e-06, + "loss": 0.3275, + "step": 18932 + }, + { + "epoch": 0.8313775426194057, + "grad_norm": 1.5, + "learning_rate": 3.4465777296918965e-06, + "loss": 0.3294, + "step": 18934 + }, + { + "epoch": 0.8314653610986091, + "grad_norm": 1.4765625, + "learning_rate": 3.443073230690183e-06, + "loss": 0.3117, + "step": 18936 + }, + { + "epoch": 0.8315531795778126, + "grad_norm": 1.484375, + "learning_rate": 3.439570382523094e-06, + "loss": 0.2968, + "step": 18938 + }, + { + "epoch": 0.8316409980570162, + "grad_norm": 1.484375, + "learning_rate": 3.4360691854588707e-06, + "loss": 0.3137, + "step": 18940 + }, + { + "epoch": 0.8317288165362197, + "grad_norm": 1.4765625, + "learning_rate": 3.4325696397656425e-06, + "loss": 0.3005, + "step": 18942 + }, + { + "epoch": 0.8318166350154231, + "grad_norm": 1.4765625, + "learning_rate": 3.429071745711401e-06, + "loss": 0.3195, + "step": 18944 + }, + { + "epoch": 0.8319044534946266, + "grad_norm": 1.5078125, + "learning_rate": 3.42557550356401e-06, + "loss": 0.3318, + "step": 18946 + }, + { + "epoch": 0.8319922719738301, + "grad_norm": 1.453125, + "learning_rate": 3.4220809135912247e-06, + "loss": 0.3077, + "step": 18948 + }, + { + "epoch": 0.8320800904530335, + "grad_norm": 1.390625, + "learning_rate": 3.418587976060653e-06, + "loss": 0.2822, + "step": 18950 + }, + { + "epoch": 0.832167908932237, + "grad_norm": 1.5234375, + "learning_rate": 3.4150966912397888e-06, + "loss": 0.329, + "step": 18952 + }, + { + "epoch": 0.8322557274114406, + "grad_norm": 1.4296875, + "learning_rate": 3.4116070593959963e-06, + "loss": 0.3254, + "step": 18954 + }, + { + "epoch": 0.8323435458906441, + "grad_norm": 1.5234375, + "learning_rate": 3.4081190807965043e-06, + "loss": 0.3085, + "step": 18956 + }, + { + "epoch": 0.8324313643698475, + "grad_norm": 1.46875, + "learning_rate": 3.4046327557084396e-06, + "loss": 0.3283, + "step": 18958 + }, + { + "epoch": 0.832519182849051, + "grad_norm": 1.40625, + "learning_rate": 3.4011480843987643e-06, + "loss": 0.3184, + "step": 18960 + }, + { + "epoch": 0.8326070013282545, + "grad_norm": 1.4453125, + "learning_rate": 3.3976650671343535e-06, + "loss": 0.3031, + "step": 18962 + }, + { + "epoch": 0.832694819807458, + "grad_norm": 1.609375, + "learning_rate": 3.3941837041819247e-06, + "loss": 0.3436, + "step": 18964 + }, + { + "epoch": 0.8327826382866614, + "grad_norm": 1.4921875, + "learning_rate": 3.3907039958080895e-06, + "loss": 0.3419, + "step": 18966 + }, + { + "epoch": 0.8328704567658649, + "grad_norm": 1.5078125, + "learning_rate": 3.3872259422793263e-06, + "loss": 0.3346, + "step": 18968 + }, + { + "epoch": 0.8329582752450685, + "grad_norm": 1.5234375, + "learning_rate": 3.383749543861972e-06, + "loss": 0.3143, + "step": 18970 + }, + { + "epoch": 0.833046093724272, + "grad_norm": 1.4765625, + "learning_rate": 3.3802748008222667e-06, + "loss": 0.3023, + "step": 18972 + }, + { + "epoch": 0.8331339122034754, + "grad_norm": 1.6015625, + "learning_rate": 3.3768017134262945e-06, + "loss": 0.3151, + "step": 18974 + }, + { + "epoch": 0.8332217306826789, + "grad_norm": 1.5390625, + "learning_rate": 3.3733302819400376e-06, + "loss": 0.3068, + "step": 18976 + }, + { + "epoch": 0.8333095491618824, + "grad_norm": 1.3984375, + "learning_rate": 3.36986050662933e-06, + "loss": 0.3282, + "step": 18978 + }, + { + "epoch": 0.8333973676410859, + "grad_norm": 1.3984375, + "learning_rate": 3.366392387759884e-06, + "loss": 0.3174, + "step": 18980 + }, + { + "epoch": 0.8334851861202893, + "grad_norm": 1.4453125, + "learning_rate": 3.3629259255973017e-06, + "loss": 0.3048, + "step": 18982 + }, + { + "epoch": 0.8335730045994928, + "grad_norm": 1.4140625, + "learning_rate": 3.3594611204070313e-06, + "loss": 0.3117, + "step": 18984 + }, + { + "epoch": 0.8336608230786964, + "grad_norm": 1.46875, + "learning_rate": 3.355997972454425e-06, + "loss": 0.3045, + "step": 18986 + }, + { + "epoch": 0.8337486415578998, + "grad_norm": 1.5390625, + "learning_rate": 3.3525364820046782e-06, + "loss": 0.3071, + "step": 18988 + }, + { + "epoch": 0.8338364600371033, + "grad_norm": 1.46875, + "learning_rate": 3.34907664932288e-06, + "loss": 0.2948, + "step": 18990 + }, + { + "epoch": 0.8339242785163068, + "grad_norm": 1.546875, + "learning_rate": 3.3456184746739753e-06, + "loss": 0.3179, + "step": 18992 + }, + { + "epoch": 0.8340120969955103, + "grad_norm": 1.453125, + "learning_rate": 3.3421619583228036e-06, + "loss": 0.3317, + "step": 18994 + }, + { + "epoch": 0.8340999154747137, + "grad_norm": 1.4296875, + "learning_rate": 3.338707100534061e-06, + "loss": 0.2953, + "step": 18996 + }, + { + "epoch": 0.8341877339539172, + "grad_norm": 1.4140625, + "learning_rate": 3.335253901572316e-06, + "loss": 0.2883, + "step": 18998 + }, + { + "epoch": 0.8342755524331208, + "grad_norm": 1.5625, + "learning_rate": 3.3318023617020273e-06, + "loss": 0.3376, + "step": 19000 + }, + { + "epoch": 0.8343633709123243, + "grad_norm": 1.4375, + "learning_rate": 3.3283524811875023e-06, + "loss": 0.3111, + "step": 19002 + }, + { + "epoch": 0.8344511893915277, + "grad_norm": 1.421875, + "learning_rate": 3.3249042602929437e-06, + "loss": 0.3274, + "step": 19004 + }, + { + "epoch": 0.8345390078707312, + "grad_norm": 1.4921875, + "learning_rate": 3.3214576992824125e-06, + "loss": 0.336, + "step": 19006 + }, + { + "epoch": 0.8346268263499347, + "grad_norm": 1.46875, + "learning_rate": 3.3180127984198423e-06, + "loss": 0.3123, + "step": 19008 + }, + { + "epoch": 0.8347146448291382, + "grad_norm": 1.4921875, + "learning_rate": 3.314569557969055e-06, + "loss": 0.3108, + "step": 19010 + }, + { + "epoch": 0.8348024633083416, + "grad_norm": 1.40625, + "learning_rate": 3.311127978193726e-06, + "loss": 0.3298, + "step": 19012 + }, + { + "epoch": 0.8348902817875451, + "grad_norm": 1.4765625, + "learning_rate": 3.3076880593574196e-06, + "loss": 0.3208, + "step": 19014 + }, + { + "epoch": 0.8349781002667487, + "grad_norm": 1.6328125, + "learning_rate": 3.3042498017235607e-06, + "loss": 0.342, + "step": 19016 + }, + { + "epoch": 0.8350659187459522, + "grad_norm": 1.4375, + "learning_rate": 3.300813205555453e-06, + "loss": 0.2855, + "step": 19018 + }, + { + "epoch": 0.8351537372251556, + "grad_norm": 1.46875, + "learning_rate": 3.2973782711162722e-06, + "loss": 0.3106, + "step": 19020 + }, + { + "epoch": 0.8352415557043591, + "grad_norm": 1.46875, + "learning_rate": 3.29394499866906e-06, + "loss": 0.3279, + "step": 19022 + }, + { + "epoch": 0.8353293741835626, + "grad_norm": 1.4375, + "learning_rate": 3.2905133884767486e-06, + "loss": 0.3162, + "step": 19024 + }, + { + "epoch": 0.835417192662766, + "grad_norm": 1.4609375, + "learning_rate": 3.287083440802122e-06, + "loss": 0.3103, + "step": 19026 + }, + { + "epoch": 0.8355050111419695, + "grad_norm": 1.4453125, + "learning_rate": 3.2836551559078525e-06, + "loss": 0.2965, + "step": 19028 + }, + { + "epoch": 0.835592829621173, + "grad_norm": 1.40625, + "learning_rate": 3.280228534056479e-06, + "loss": 0.3235, + "step": 19030 + }, + { + "epoch": 0.8356806481003766, + "grad_norm": 1.46875, + "learning_rate": 3.2768035755104064e-06, + "loss": 0.29, + "step": 19032 + }, + { + "epoch": 0.83576846657958, + "grad_norm": 1.453125, + "learning_rate": 3.273380280531929e-06, + "loss": 0.3043, + "step": 19034 + }, + { + "epoch": 0.8358562850587835, + "grad_norm": 1.5078125, + "learning_rate": 3.2699586493831895e-06, + "loss": 0.3144, + "step": 19036 + }, + { + "epoch": 0.835944103537987, + "grad_norm": 1.4375, + "learning_rate": 3.266538682326234e-06, + "loss": 0.3357, + "step": 19038 + }, + { + "epoch": 0.8360319220171905, + "grad_norm": 1.4453125, + "learning_rate": 3.2631203796229555e-06, + "loss": 0.3015, + "step": 19040 + }, + { + "epoch": 0.8361197404963939, + "grad_norm": 1.4140625, + "learning_rate": 3.259703741535122e-06, + "loss": 0.2977, + "step": 19042 + }, + { + "epoch": 0.8362075589755974, + "grad_norm": 1.3984375, + "learning_rate": 3.256288768324395e-06, + "loss": 0.299, + "step": 19044 + }, + { + "epoch": 0.836295377454801, + "grad_norm": 1.5234375, + "learning_rate": 3.2528754602522804e-06, + "loss": 0.3027, + "step": 19046 + }, + { + "epoch": 0.8363831959340045, + "grad_norm": 1.6015625, + "learning_rate": 3.249463817580181e-06, + "loss": 0.3362, + "step": 19048 + }, + { + "epoch": 0.8364710144132079, + "grad_norm": 1.5234375, + "learning_rate": 3.2460538405693573e-06, + "loss": 0.31, + "step": 19050 + }, + { + "epoch": 0.8365588328924114, + "grad_norm": 1.46875, + "learning_rate": 3.242645529480945e-06, + "loss": 0.3407, + "step": 19052 + }, + { + "epoch": 0.8366466513716149, + "grad_norm": 1.5390625, + "learning_rate": 3.239238884575949e-06, + "loss": 0.3327, + "step": 19054 + }, + { + "epoch": 0.8367344698508183, + "grad_norm": 1.4609375, + "learning_rate": 3.23583390611526e-06, + "loss": 0.3321, + "step": 19056 + }, + { + "epoch": 0.8368222883300218, + "grad_norm": 1.4765625, + "learning_rate": 3.2324305943596256e-06, + "loss": 0.2949, + "step": 19058 + }, + { + "epoch": 0.8369101068092253, + "grad_norm": 1.5078125, + "learning_rate": 3.2290289495696705e-06, + "loss": 0.2996, + "step": 19060 + }, + { + "epoch": 0.8369979252884289, + "grad_norm": 1.5390625, + "learning_rate": 3.2256289720059035e-06, + "loss": 0.3195, + "step": 19062 + }, + { + "epoch": 0.8370857437676323, + "grad_norm": 1.5390625, + "learning_rate": 3.2222306619286852e-06, + "loss": 0.2868, + "step": 19064 + }, + { + "epoch": 0.8371735622468358, + "grad_norm": 1.4765625, + "learning_rate": 3.2188340195982685e-06, + "loss": 0.3008, + "step": 19066 + }, + { + "epoch": 0.8372613807260393, + "grad_norm": 1.453125, + "learning_rate": 3.2154390452747625e-06, + "loss": 0.3051, + "step": 19068 + }, + { + "epoch": 0.8373491992052428, + "grad_norm": 1.546875, + "learning_rate": 3.2120457392181503e-06, + "loss": 0.3128, + "step": 19070 + }, + { + "epoch": 0.8374370176844462, + "grad_norm": 1.46875, + "learning_rate": 3.208654101688305e-06, + "loss": 0.3044, + "step": 19072 + }, + { + "epoch": 0.8375248361636497, + "grad_norm": 1.3984375, + "learning_rate": 3.205264132944946e-06, + "loss": 0.3149, + "step": 19074 + }, + { + "epoch": 0.8376126546428532, + "grad_norm": 1.4765625, + "learning_rate": 3.2018758332476916e-06, + "loss": 0.3165, + "step": 19076 + }, + { + "epoch": 0.8377004731220568, + "grad_norm": 1.4375, + "learning_rate": 3.198489202856009e-06, + "loss": 0.2965, + "step": 19078 + }, + { + "epoch": 0.8377882916012602, + "grad_norm": 1.5078125, + "learning_rate": 3.195104242029251e-06, + "loss": 0.3335, + "step": 19080 + }, + { + "epoch": 0.8378761100804637, + "grad_norm": 1.6171875, + "learning_rate": 3.191720951026636e-06, + "loss": 0.3275, + "step": 19082 + }, + { + "epoch": 0.8379639285596672, + "grad_norm": 1.515625, + "learning_rate": 3.188339330107254e-06, + "loss": 0.3249, + "step": 19084 + }, + { + "epoch": 0.8380517470388706, + "grad_norm": 1.4609375, + "learning_rate": 3.184959379530081e-06, + "loss": 0.3148, + "step": 19086 + }, + { + "epoch": 0.8381395655180741, + "grad_norm": 1.46875, + "learning_rate": 3.1815810995539404e-06, + "loss": 0.3339, + "step": 19088 + }, + { + "epoch": 0.8382273839972776, + "grad_norm": 1.40625, + "learning_rate": 3.178204490437556e-06, + "loss": 0.3135, + "step": 19090 + }, + { + "epoch": 0.8383152024764811, + "grad_norm": 1.4765625, + "learning_rate": 3.174829552439504e-06, + "loss": 0.3102, + "step": 19092 + }, + { + "epoch": 0.8384030209556846, + "grad_norm": 1.5703125, + "learning_rate": 3.1714562858182277e-06, + "loss": 0.3405, + "step": 19094 + }, + { + "epoch": 0.8384908394348881, + "grad_norm": 1.5546875, + "learning_rate": 3.168084690832071e-06, + "loss": 0.3078, + "step": 19096 + }, + { + "epoch": 0.8385786579140916, + "grad_norm": 1.4765625, + "learning_rate": 3.1647147677392157e-06, + "loss": 0.3009, + "step": 19098 + }, + { + "epoch": 0.8386664763932951, + "grad_norm": 1.515625, + "learning_rate": 3.161346516797742e-06, + "loss": 0.3162, + "step": 19100 + }, + { + "epoch": 0.8387542948724985, + "grad_norm": 1.53125, + "learning_rate": 3.157979938265587e-06, + "loss": 0.3176, + "step": 19102 + }, + { + "epoch": 0.838842113351702, + "grad_norm": 1.5546875, + "learning_rate": 3.1546150324005595e-06, + "loss": 0.3332, + "step": 19104 + }, + { + "epoch": 0.8389299318309055, + "grad_norm": 1.4453125, + "learning_rate": 3.1512517994603556e-06, + "loss": 0.2953, + "step": 19106 + }, + { + "epoch": 0.8390177503101091, + "grad_norm": 1.4140625, + "learning_rate": 3.1478902397025196e-06, + "loss": 0.3121, + "step": 19108 + }, + { + "epoch": 0.8391055687893125, + "grad_norm": 1.4375, + "learning_rate": 3.1445303533845005e-06, + "loss": 0.303, + "step": 19110 + }, + { + "epoch": 0.839193387268516, + "grad_norm": 1.5, + "learning_rate": 3.1411721407635738e-06, + "loss": 0.3029, + "step": 19112 + }, + { + "epoch": 0.8392812057477195, + "grad_norm": 1.5703125, + "learning_rate": 3.13781560209693e-06, + "loss": 0.3016, + "step": 19114 + }, + { + "epoch": 0.8393690242269229, + "grad_norm": 1.4609375, + "learning_rate": 3.1344607376416008e-06, + "loss": 0.3275, + "step": 19116 + }, + { + "epoch": 0.8394568427061264, + "grad_norm": 1.4453125, + "learning_rate": 3.131107547654519e-06, + "loss": 0.2944, + "step": 19118 + }, + { + "epoch": 0.8395446611853299, + "grad_norm": 1.515625, + "learning_rate": 3.1277560323924593e-06, + "loss": 0.3159, + "step": 19120 + }, + { + "epoch": 0.8396324796645334, + "grad_norm": 1.4453125, + "learning_rate": 3.1244061921120838e-06, + "loss": 0.3307, + "step": 19122 + }, + { + "epoch": 0.8397202981437369, + "grad_norm": 1.53125, + "learning_rate": 3.1210580270699285e-06, + "loss": 0.3018, + "step": 19124 + }, + { + "epoch": 0.8398081166229404, + "grad_norm": 1.4453125, + "learning_rate": 3.1177115375223915e-06, + "loss": 0.3169, + "step": 19126 + }, + { + "epoch": 0.8398959351021439, + "grad_norm": 1.5546875, + "learning_rate": 3.1143667237257563e-06, + "loss": 0.3231, + "step": 19128 + }, + { + "epoch": 0.8399837535813474, + "grad_norm": 1.4296875, + "learning_rate": 3.1110235859361597e-06, + "loss": 0.3246, + "step": 19130 + }, + { + "epoch": 0.8400715720605508, + "grad_norm": 1.4375, + "learning_rate": 3.107682124409622e-06, + "loss": 0.2926, + "step": 19132 + }, + { + "epoch": 0.8401593905397543, + "grad_norm": 1.625, + "learning_rate": 3.1043423394020417e-06, + "loss": 0.3063, + "step": 19134 + }, + { + "epoch": 0.8402472090189578, + "grad_norm": 1.4453125, + "learning_rate": 3.1010042311691663e-06, + "loss": 0.3252, + "step": 19136 + }, + { + "epoch": 0.8403350274981612, + "grad_norm": 1.4921875, + "learning_rate": 3.0976677999666414e-06, + "loss": 0.3084, + "step": 19138 + }, + { + "epoch": 0.8404228459773648, + "grad_norm": 1.4765625, + "learning_rate": 3.094333046049966e-06, + "loss": 0.3037, + "step": 19140 + }, + { + "epoch": 0.8405106644565683, + "grad_norm": 1.5859375, + "learning_rate": 3.0909999696745185e-06, + "loss": 0.3232, + "step": 19142 + }, + { + "epoch": 0.8405984829357718, + "grad_norm": 1.4453125, + "learning_rate": 3.0876685710955476e-06, + "loss": 0.3094, + "step": 19144 + }, + { + "epoch": 0.8406863014149752, + "grad_norm": 1.53125, + "learning_rate": 3.084338850568161e-06, + "loss": 0.2993, + "step": 19146 + }, + { + "epoch": 0.8407741198941787, + "grad_norm": 1.546875, + "learning_rate": 3.081010808347365e-06, + "loss": 0.314, + "step": 19148 + }, + { + "epoch": 0.8408619383733822, + "grad_norm": 1.4765625, + "learning_rate": 3.0776844446880115e-06, + "loss": 0.3055, + "step": 19150 + }, + { + "epoch": 0.8409497568525857, + "grad_norm": 1.40625, + "learning_rate": 3.074359759844844e-06, + "loss": 0.2917, + "step": 19152 + }, + { + "epoch": 0.8410375753317892, + "grad_norm": 1.5078125, + "learning_rate": 3.071036754072462e-06, + "loss": 0.3387, + "step": 19154 + }, + { + "epoch": 0.8411253938109927, + "grad_norm": 1.453125, + "learning_rate": 3.067715427625334e-06, + "loss": 0.3268, + "step": 19156 + }, + { + "epoch": 0.8412132122901962, + "grad_norm": 1.4296875, + "learning_rate": 3.0643957807578253e-06, + "loss": 0.2954, + "step": 19158 + }, + { + "epoch": 0.8413010307693997, + "grad_norm": 1.359375, + "learning_rate": 3.061077813724139e-06, + "loss": 0.3172, + "step": 19160 + }, + { + "epoch": 0.8413888492486031, + "grad_norm": 1.453125, + "learning_rate": 3.0577615267783773e-06, + "loss": 0.2994, + "step": 19162 + }, + { + "epoch": 0.8414766677278066, + "grad_norm": 1.453125, + "learning_rate": 3.0544469201744976e-06, + "loss": 0.3058, + "step": 19164 + }, + { + "epoch": 0.8415644862070101, + "grad_norm": 1.4765625, + "learning_rate": 3.0511339941663303e-06, + "loss": 0.313, + "step": 19166 + }, + { + "epoch": 0.8416523046862135, + "grad_norm": 1.4296875, + "learning_rate": 3.0478227490075866e-06, + "loss": 0.313, + "step": 19168 + }, + { + "epoch": 0.8417401231654171, + "grad_norm": 1.4453125, + "learning_rate": 3.044513184951833e-06, + "loss": 0.3098, + "step": 19170 + }, + { + "epoch": 0.8418279416446206, + "grad_norm": 1.515625, + "learning_rate": 3.0412053022525367e-06, + "loss": 0.2796, + "step": 19172 + }, + { + "epoch": 0.8419157601238241, + "grad_norm": 1.4921875, + "learning_rate": 3.037899101162989e-06, + "loss": 0.3176, + "step": 19174 + }, + { + "epoch": 0.8420035786030275, + "grad_norm": 1.46875, + "learning_rate": 3.034594581936398e-06, + "loss": 0.2792, + "step": 19176 + }, + { + "epoch": 0.842091397082231, + "grad_norm": 1.5078125, + "learning_rate": 3.0312917448258205e-06, + "loss": 0.3012, + "step": 19178 + }, + { + "epoch": 0.8421792155614345, + "grad_norm": 1.375, + "learning_rate": 3.0279905900841815e-06, + "loss": 0.3114, + "step": 19180 + }, + { + "epoch": 0.842267034040638, + "grad_norm": 1.4765625, + "learning_rate": 3.024691117964298e-06, + "loss": 0.3407, + "step": 19182 + }, + { + "epoch": 0.8423548525198414, + "grad_norm": 1.5, + "learning_rate": 3.0213933287188272e-06, + "loss": 0.2972, + "step": 19184 + }, + { + "epoch": 0.842442670999045, + "grad_norm": 1.5, + "learning_rate": 3.0180972226003327e-06, + "loss": 0.2992, + "step": 19186 + }, + { + "epoch": 0.8425304894782485, + "grad_norm": 1.4296875, + "learning_rate": 3.014802799861216e-06, + "loss": 0.2968, + "step": 19188 + }, + { + "epoch": 0.842618307957452, + "grad_norm": 1.484375, + "learning_rate": 3.011510060753775e-06, + "loss": 0.3046, + "step": 19190 + }, + { + "epoch": 0.8427061264366554, + "grad_norm": 1.4375, + "learning_rate": 3.008219005530166e-06, + "loss": 0.3093, + "step": 19192 + }, + { + "epoch": 0.8427939449158589, + "grad_norm": 1.5234375, + "learning_rate": 3.0049296344424103e-06, + "loss": 0.3324, + "step": 19194 + }, + { + "epoch": 0.8428817633950624, + "grad_norm": 1.5625, + "learning_rate": 3.001641947742423e-06, + "loss": 0.3031, + "step": 19196 + }, + { + "epoch": 0.8429695818742658, + "grad_norm": 1.53125, + "learning_rate": 2.9983559456819633e-06, + "loss": 0.3161, + "step": 19198 + }, + { + "epoch": 0.8430574003534694, + "grad_norm": 1.4375, + "learning_rate": 2.9950716285126827e-06, + "loss": 0.3365, + "step": 19200 + }, + { + "epoch": 0.8431452188326729, + "grad_norm": 1.390625, + "learning_rate": 2.9917889964860917e-06, + "loss": 0.3279, + "step": 19202 + }, + { + "epoch": 0.8432330373118764, + "grad_norm": 1.5, + "learning_rate": 2.9885080498535777e-06, + "loss": 0.3001, + "step": 19204 + }, + { + "epoch": 0.8433208557910798, + "grad_norm": 1.5, + "learning_rate": 2.9852287888663928e-06, + "loss": 0.2887, + "step": 19206 + }, + { + "epoch": 0.8434086742702833, + "grad_norm": 1.5703125, + "learning_rate": 2.9819512137756577e-06, + "loss": 0.3053, + "step": 19208 + }, + { + "epoch": 0.8434964927494868, + "grad_norm": 1.4453125, + "learning_rate": 2.9786753248323833e-06, + "loss": 0.3567, + "step": 19210 + }, + { + "epoch": 0.8435843112286903, + "grad_norm": 1.5234375, + "learning_rate": 2.9754011222874275e-06, + "loss": 0.2944, + "step": 19212 + }, + { + "epoch": 0.8436721297078937, + "grad_norm": 1.4609375, + "learning_rate": 2.972128606391536e-06, + "loss": 0.3047, + "step": 19214 + }, + { + "epoch": 0.8437599481870973, + "grad_norm": 1.4609375, + "learning_rate": 2.9688577773953176e-06, + "loss": 0.2938, + "step": 19216 + }, + { + "epoch": 0.8438477666663008, + "grad_norm": 1.453125, + "learning_rate": 2.965588635549249e-06, + "loss": 0.3215, + "step": 19218 + }, + { + "epoch": 0.8439355851455043, + "grad_norm": 1.6171875, + "learning_rate": 2.9623211811036862e-06, + "loss": 0.3035, + "step": 19220 + }, + { + "epoch": 0.8440234036247077, + "grad_norm": 1.4609375, + "learning_rate": 2.959055414308845e-06, + "loss": 0.3194, + "step": 19222 + }, + { + "epoch": 0.8441112221039112, + "grad_norm": 1.453125, + "learning_rate": 2.9557913354148316e-06, + "loss": 0.3443, + "step": 19224 + }, + { + "epoch": 0.8441990405831147, + "grad_norm": 1.4765625, + "learning_rate": 2.952528944671601e-06, + "loss": 0.3636, + "step": 19226 + }, + { + "epoch": 0.8442868590623182, + "grad_norm": 1.5703125, + "learning_rate": 2.9492682423289843e-06, + "loss": 0.2937, + "step": 19228 + }, + { + "epoch": 0.8443746775415216, + "grad_norm": 1.421875, + "learning_rate": 2.9460092286366955e-06, + "loss": 0.3119, + "step": 19230 + }, + { + "epoch": 0.8444624960207252, + "grad_norm": 1.4921875, + "learning_rate": 2.9427519038443014e-06, + "loss": 0.2929, + "step": 19232 + }, + { + "epoch": 0.8445503144999287, + "grad_norm": 1.515625, + "learning_rate": 2.939496268201264e-06, + "loss": 0.3141, + "step": 19234 + }, + { + "epoch": 0.8446381329791322, + "grad_norm": 1.4453125, + "learning_rate": 2.936242321956881e-06, + "loss": 0.3232, + "step": 19236 + }, + { + "epoch": 0.8447259514583356, + "grad_norm": 1.4765625, + "learning_rate": 2.9329900653603553e-06, + "loss": 0.3165, + "step": 19238 + }, + { + "epoch": 0.8448137699375391, + "grad_norm": 1.609375, + "learning_rate": 2.929739498660741e-06, + "loss": 0.3192, + "step": 19240 + }, + { + "epoch": 0.8449015884167426, + "grad_norm": 1.4375, + "learning_rate": 2.9264906221069586e-06, + "loss": 0.3393, + "step": 19242 + }, + { + "epoch": 0.844989406895946, + "grad_norm": 1.5234375, + "learning_rate": 2.9232434359478256e-06, + "loss": 0.3182, + "step": 19244 + }, + { + "epoch": 0.8450772253751496, + "grad_norm": 1.4296875, + "learning_rate": 2.919997940431993e-06, + "loss": 0.2901, + "step": 19246 + }, + { + "epoch": 0.8451650438543531, + "grad_norm": 1.5234375, + "learning_rate": 2.916754135808017e-06, + "loss": 0.2945, + "step": 19248 + }, + { + "epoch": 0.8452528623335566, + "grad_norm": 1.4921875, + "learning_rate": 2.9135120223242996e-06, + "loss": 0.3016, + "step": 19250 + }, + { + "epoch": 0.84534068081276, + "grad_norm": 1.4375, + "learning_rate": 2.9102716002291307e-06, + "loss": 0.2988, + "step": 19252 + }, + { + "epoch": 0.8454284992919635, + "grad_norm": 1.7109375, + "learning_rate": 2.907032869770657e-06, + "loss": 0.3238, + "step": 19254 + }, + { + "epoch": 0.845516317771167, + "grad_norm": 1.5078125, + "learning_rate": 2.903795831196898e-06, + "loss": 0.3231, + "step": 19256 + }, + { + "epoch": 0.8456041362503705, + "grad_norm": 1.4375, + "learning_rate": 2.900560484755754e-06, + "loss": 0.3126, + "step": 19258 + }, + { + "epoch": 0.8456919547295739, + "grad_norm": 1.4609375, + "learning_rate": 2.8973268306949824e-06, + "loss": 0.3204, + "step": 19260 + }, + { + "epoch": 0.8457797732087775, + "grad_norm": 1.5703125, + "learning_rate": 2.894094869262226e-06, + "loss": 0.3058, + "step": 19262 + }, + { + "epoch": 0.845867591687981, + "grad_norm": 1.3984375, + "learning_rate": 2.8908646007049843e-06, + "loss": 0.3002, + "step": 19264 + }, + { + "epoch": 0.8459554101671845, + "grad_norm": 1.5234375, + "learning_rate": 2.887636025270629e-06, + "loss": 0.3175, + "step": 19266 + }, + { + "epoch": 0.8460432286463879, + "grad_norm": 1.5234375, + "learning_rate": 2.8844091432064097e-06, + "loss": 0.3171, + "step": 19268 + }, + { + "epoch": 0.8461310471255914, + "grad_norm": 1.4375, + "learning_rate": 2.8811839547594337e-06, + "loss": 0.3075, + "step": 19270 + }, + { + "epoch": 0.8462188656047949, + "grad_norm": 1.4921875, + "learning_rate": 2.8779604601766957e-06, + "loss": 0.3231, + "step": 19272 + }, + { + "epoch": 0.8463066840839983, + "grad_norm": 1.59375, + "learning_rate": 2.874738659705048e-06, + "loss": 0.3302, + "step": 19274 + }, + { + "epoch": 0.8463945025632018, + "grad_norm": 1.484375, + "learning_rate": 2.8715185535912182e-06, + "loss": 0.3165, + "step": 19276 + }, + { + "epoch": 0.8464823210424054, + "grad_norm": 1.5390625, + "learning_rate": 2.868300142081806e-06, + "loss": 0.3322, + "step": 19278 + }, + { + "epoch": 0.8465701395216089, + "grad_norm": 1.46875, + "learning_rate": 2.865083425423265e-06, + "loss": 0.3214, + "step": 19280 + }, + { + "epoch": 0.8466579580008123, + "grad_norm": 1.4765625, + "learning_rate": 2.8618684038619504e-06, + "loss": 0.327, + "step": 19282 + }, + { + "epoch": 0.8467457764800158, + "grad_norm": 1.484375, + "learning_rate": 2.858655077644054e-06, + "loss": 0.314, + "step": 19284 + }, + { + "epoch": 0.8468335949592193, + "grad_norm": 1.609375, + "learning_rate": 2.855443447015663e-06, + "loss": 0.3064, + "step": 19286 + }, + { + "epoch": 0.8469214134384228, + "grad_norm": 1.4765625, + "learning_rate": 2.8522335122227214e-06, + "loss": 0.29, + "step": 19288 + }, + { + "epoch": 0.8470092319176262, + "grad_norm": 1.6953125, + "learning_rate": 2.849025273511044e-06, + "loss": 0.3206, + "step": 19290 + }, + { + "epoch": 0.8470970503968297, + "grad_norm": 1.453125, + "learning_rate": 2.8458187311263233e-06, + "loss": 0.327, + "step": 19292 + }, + { + "epoch": 0.8471848688760333, + "grad_norm": 1.5078125, + "learning_rate": 2.842613885314116e-06, + "loss": 0.3152, + "step": 19294 + }, + { + "epoch": 0.8472726873552368, + "grad_norm": 1.5, + "learning_rate": 2.83941073631985e-06, + "loss": 0.3103, + "step": 19296 + }, + { + "epoch": 0.8473605058344402, + "grad_norm": 1.5234375, + "learning_rate": 2.836209284388816e-06, + "loss": 0.3559, + "step": 19298 + }, + { + "epoch": 0.8474483243136437, + "grad_norm": 1.4453125, + "learning_rate": 2.8330095297661925e-06, + "loss": 0.3054, + "step": 19300 + }, + { + "epoch": 0.8475361427928472, + "grad_norm": 1.453125, + "learning_rate": 2.8298114726970138e-06, + "loss": 0.2983, + "step": 19302 + }, + { + "epoch": 0.8476239612720506, + "grad_norm": 1.421875, + "learning_rate": 2.8266151134261815e-06, + "loss": 0.2949, + "step": 19304 + }, + { + "epoch": 0.8477117797512541, + "grad_norm": 1.546875, + "learning_rate": 2.8234204521984857e-06, + "loss": 0.3187, + "step": 19306 + }, + { + "epoch": 0.8477995982304577, + "grad_norm": 1.4765625, + "learning_rate": 2.8202274892585617e-06, + "loss": 0.3119, + "step": 19308 + }, + { + "epoch": 0.8478874167096612, + "grad_norm": 1.546875, + "learning_rate": 2.8170362248509386e-06, + "loss": 0.3204, + "step": 19310 + }, + { + "epoch": 0.8479752351888646, + "grad_norm": 1.53125, + "learning_rate": 2.8138466592199986e-06, + "loss": 0.3137, + "step": 19312 + }, + { + "epoch": 0.8480630536680681, + "grad_norm": 1.484375, + "learning_rate": 2.8106587926099963e-06, + "loss": 0.2915, + "step": 19314 + }, + { + "epoch": 0.8481508721472716, + "grad_norm": 1.453125, + "learning_rate": 2.8074726252650645e-06, + "loss": 0.322, + "step": 19316 + }, + { + "epoch": 0.8482386906264751, + "grad_norm": 1.484375, + "learning_rate": 2.804288157429197e-06, + "loss": 0.3051, + "step": 19318 + }, + { + "epoch": 0.8483265091056785, + "grad_norm": 1.5234375, + "learning_rate": 2.8011053893462676e-06, + "loss": 0.3313, + "step": 19320 + }, + { + "epoch": 0.848414327584882, + "grad_norm": 1.515625, + "learning_rate": 2.7979243212600035e-06, + "loss": 0.331, + "step": 19322 + }, + { + "epoch": 0.8485021460640856, + "grad_norm": 1.4921875, + "learning_rate": 2.7947449534140217e-06, + "loss": 0.3045, + "step": 19324 + }, + { + "epoch": 0.848589964543289, + "grad_norm": 1.40625, + "learning_rate": 2.7915672860517937e-06, + "loss": 0.3273, + "step": 19326 + }, + { + "epoch": 0.8486777830224925, + "grad_norm": 1.484375, + "learning_rate": 2.7883913194166693e-06, + "loss": 0.3184, + "step": 19328 + }, + { + "epoch": 0.848765601501696, + "grad_norm": 1.4765625, + "learning_rate": 2.7852170537518594e-06, + "loss": 0.3319, + "step": 19330 + }, + { + "epoch": 0.8488534199808995, + "grad_norm": 1.6953125, + "learning_rate": 2.782044489300448e-06, + "loss": 0.313, + "step": 19332 + }, + { + "epoch": 0.8489412384601029, + "grad_norm": 1.546875, + "learning_rate": 2.7788736263054033e-06, + "loss": 0.3191, + "step": 19334 + }, + { + "epoch": 0.8490290569393064, + "grad_norm": 1.5078125, + "learning_rate": 2.775704465009535e-06, + "loss": 0.3005, + "step": 19336 + }, + { + "epoch": 0.8491168754185099, + "grad_norm": 1.40625, + "learning_rate": 2.772537005655554e-06, + "loss": 0.2972, + "step": 19338 + }, + { + "epoch": 0.8492046938977135, + "grad_norm": 1.46875, + "learning_rate": 2.7693712484860135e-06, + "loss": 0.3037, + "step": 19340 + }, + { + "epoch": 0.8492925123769169, + "grad_norm": 1.484375, + "learning_rate": 2.7662071937433502e-06, + "loss": 0.3017, + "step": 19342 + }, + { + "epoch": 0.8493803308561204, + "grad_norm": 1.53125, + "learning_rate": 2.7630448416698734e-06, + "loss": 0.304, + "step": 19344 + }, + { + "epoch": 0.8494681493353239, + "grad_norm": 1.46875, + "learning_rate": 2.75988419250775e-06, + "loss": 0.2853, + "step": 19346 + }, + { + "epoch": 0.8495559678145274, + "grad_norm": 1.53125, + "learning_rate": 2.7567252464990283e-06, + "loss": 0.3149, + "step": 19348 + }, + { + "epoch": 0.8496437862937308, + "grad_norm": 1.4453125, + "learning_rate": 2.753568003885623e-06, + "loss": 0.2996, + "step": 19350 + }, + { + "epoch": 0.8497316047729343, + "grad_norm": 1.421875, + "learning_rate": 2.750412464909305e-06, + "loss": 0.3227, + "step": 19352 + }, + { + "epoch": 0.8498194232521379, + "grad_norm": 1.46875, + "learning_rate": 2.7472586298117385e-06, + "loss": 0.311, + "step": 19354 + }, + { + "epoch": 0.8499072417313414, + "grad_norm": 1.4453125, + "learning_rate": 2.7441064988344424e-06, + "loss": 0.3119, + "step": 19356 + }, + { + "epoch": 0.8499950602105448, + "grad_norm": 1.671875, + "learning_rate": 2.740956072218806e-06, + "loss": 0.3566, + "step": 19358 + }, + { + "epoch": 0.8500828786897483, + "grad_norm": 1.453125, + "learning_rate": 2.737807350206084e-06, + "loss": 0.3247, + "step": 19360 + }, + { + "epoch": 0.8501706971689518, + "grad_norm": 1.4921875, + "learning_rate": 2.734660333037417e-06, + "loss": 0.3071, + "step": 19362 + }, + { + "epoch": 0.8502585156481552, + "grad_norm": 1.4375, + "learning_rate": 2.7315150209537984e-06, + "loss": 0.3137, + "step": 19364 + }, + { + "epoch": 0.8503463341273587, + "grad_norm": 1.5625, + "learning_rate": 2.7283714141960905e-06, + "loss": 0.3234, + "step": 19366 + }, + { + "epoch": 0.8504341526065622, + "grad_norm": 1.515625, + "learning_rate": 2.7252295130050483e-06, + "loss": 0.2924, + "step": 19368 + }, + { + "epoch": 0.8505219710857658, + "grad_norm": 1.4609375, + "learning_rate": 2.7220893176212595e-06, + "loss": 0.3097, + "step": 19370 + }, + { + "epoch": 0.8506097895649692, + "grad_norm": 1.4921875, + "learning_rate": 2.718950828285219e-06, + "loss": 0.2995, + "step": 19372 + }, + { + "epoch": 0.8506976080441727, + "grad_norm": 1.5078125, + "learning_rate": 2.7158140452372666e-06, + "loss": 0.3103, + "step": 19374 + }, + { + "epoch": 0.8507854265233762, + "grad_norm": 1.4765625, + "learning_rate": 2.712678968717608e-06, + "loss": 0.2902, + "step": 19376 + }, + { + "epoch": 0.8508732450025797, + "grad_norm": 1.484375, + "learning_rate": 2.7095455989663452e-06, + "loss": 0.315, + "step": 19378 + }, + { + "epoch": 0.8509610634817831, + "grad_norm": 1.5546875, + "learning_rate": 2.7064139362234174e-06, + "loss": 0.3209, + "step": 19380 + }, + { + "epoch": 0.8510488819609866, + "grad_norm": 1.59375, + "learning_rate": 2.7032839807286575e-06, + "loss": 0.3109, + "step": 19382 + }, + { + "epoch": 0.8511367004401901, + "grad_norm": 1.6640625, + "learning_rate": 2.700155732721751e-06, + "loss": 0.3309, + "step": 19384 + }, + { + "epoch": 0.8512245189193937, + "grad_norm": 1.515625, + "learning_rate": 2.6970291924422768e-06, + "loss": 0.2905, + "step": 19386 + }, + { + "epoch": 0.8513123373985971, + "grad_norm": 1.453125, + "learning_rate": 2.6939043601296425e-06, + "loss": 0.3297, + "step": 19388 + }, + { + "epoch": 0.8514001558778006, + "grad_norm": 1.625, + "learning_rate": 2.690781236023163e-06, + "loss": 0.347, + "step": 19390 + }, + { + "epoch": 0.8514879743570041, + "grad_norm": 1.4296875, + "learning_rate": 2.687659820362004e-06, + "loss": 0.2977, + "step": 19392 + }, + { + "epoch": 0.8515757928362075, + "grad_norm": 1.53125, + "learning_rate": 2.6845401133852005e-06, + "loss": 0.3306, + "step": 19394 + }, + { + "epoch": 0.851663611315411, + "grad_norm": 1.4375, + "learning_rate": 2.6814221153316694e-06, + "loss": 0.3197, + "step": 19396 + }, + { + "epoch": 0.8517514297946145, + "grad_norm": 1.5390625, + "learning_rate": 2.6783058264401776e-06, + "loss": 0.2946, + "step": 19398 + }, + { + "epoch": 0.8518392482738181, + "grad_norm": 1.5078125, + "learning_rate": 2.675191246949382e-06, + "loss": 0.3442, + "step": 19400 + }, + { + "epoch": 0.8519270667530215, + "grad_norm": 1.4296875, + "learning_rate": 2.672078377097792e-06, + "loss": 0.292, + "step": 19402 + }, + { + "epoch": 0.852014885232225, + "grad_norm": 1.4296875, + "learning_rate": 2.6689672171237885e-06, + "loss": 0.3111, + "step": 19404 + }, + { + "epoch": 0.8521027037114285, + "grad_norm": 1.515625, + "learning_rate": 2.6658577672656337e-06, + "loss": 0.2968, + "step": 19406 + }, + { + "epoch": 0.852190522190632, + "grad_norm": 1.4921875, + "learning_rate": 2.6627500277614376e-06, + "loss": 0.303, + "step": 19408 + }, + { + "epoch": 0.8522783406698354, + "grad_norm": 1.46875, + "learning_rate": 2.6596439988492065e-06, + "loss": 0.2896, + "step": 19410 + }, + { + "epoch": 0.8523661591490389, + "grad_norm": 1.546875, + "learning_rate": 2.6565396807667895e-06, + "loss": 0.3206, + "step": 19412 + }, + { + "epoch": 0.8524539776282424, + "grad_norm": 1.546875, + "learning_rate": 2.6534370737519188e-06, + "loss": 0.3197, + "step": 19414 + }, + { + "epoch": 0.852541796107446, + "grad_norm": 1.5, + "learning_rate": 2.6503361780421983e-06, + "loss": 0.2924, + "step": 19416 + }, + { + "epoch": 0.8526296145866494, + "grad_norm": 1.46875, + "learning_rate": 2.647236993875088e-06, + "loss": 0.3023, + "step": 19418 + }, + { + "epoch": 0.8527174330658529, + "grad_norm": 1.421875, + "learning_rate": 2.644139521487929e-06, + "loss": 0.316, + "step": 19420 + }, + { + "epoch": 0.8528052515450564, + "grad_norm": 1.5078125, + "learning_rate": 2.64104376111792e-06, + "loss": 0.3262, + "step": 19422 + }, + { + "epoch": 0.8528930700242598, + "grad_norm": 1.5078125, + "learning_rate": 2.6379497130021443e-06, + "loss": 0.322, + "step": 19424 + }, + { + "epoch": 0.8529808885034633, + "grad_norm": 1.609375, + "learning_rate": 2.6348573773775394e-06, + "loss": 0.31, + "step": 19426 + }, + { + "epoch": 0.8530687069826668, + "grad_norm": 1.4375, + "learning_rate": 2.6317667544809134e-06, + "loss": 0.3069, + "step": 19428 + }, + { + "epoch": 0.8531565254618703, + "grad_norm": 1.5859375, + "learning_rate": 2.6286778445489574e-06, + "loss": 0.3245, + "step": 19430 + }, + { + "epoch": 0.8532443439410738, + "grad_norm": 1.515625, + "learning_rate": 2.6255906478182075e-06, + "loss": 0.3098, + "step": 19432 + }, + { + "epoch": 0.8533321624202773, + "grad_norm": 1.5, + "learning_rate": 2.6225051645250965e-06, + "loss": 0.2982, + "step": 19434 + }, + { + "epoch": 0.8534199808994808, + "grad_norm": 1.484375, + "learning_rate": 2.619421394905902e-06, + "loss": 0.3077, + "step": 19436 + }, + { + "epoch": 0.8535077993786843, + "grad_norm": 1.515625, + "learning_rate": 2.61633933919678e-06, + "loss": 0.2876, + "step": 19438 + }, + { + "epoch": 0.8535956178578877, + "grad_norm": 1.4296875, + "learning_rate": 2.613258997633761e-06, + "loss": 0.3006, + "step": 19440 + }, + { + "epoch": 0.8536834363370912, + "grad_norm": 1.5234375, + "learning_rate": 2.610180370452728e-06, + "loss": 0.3323, + "step": 19442 + }, + { + "epoch": 0.8537712548162947, + "grad_norm": 1.484375, + "learning_rate": 2.6071034578894567e-06, + "loss": 0.3108, + "step": 19444 + }, + { + "epoch": 0.8538590732954981, + "grad_norm": 1.5546875, + "learning_rate": 2.6040282601795697e-06, + "loss": 0.3092, + "step": 19446 + }, + { + "epoch": 0.8539468917747017, + "grad_norm": 1.4609375, + "learning_rate": 2.6009547775585648e-06, + "loss": 0.3037, + "step": 19448 + }, + { + "epoch": 0.8540347102539052, + "grad_norm": 1.4765625, + "learning_rate": 2.597883010261809e-06, + "loss": 0.3239, + "step": 19450 + }, + { + "epoch": 0.8541225287331087, + "grad_norm": 1.453125, + "learning_rate": 2.594812958524548e-06, + "loss": 0.3089, + "step": 19452 + }, + { + "epoch": 0.8542103472123121, + "grad_norm": 1.4765625, + "learning_rate": 2.591744622581882e-06, + "loss": 0.2894, + "step": 19454 + }, + { + "epoch": 0.8542981656915156, + "grad_norm": 1.390625, + "learning_rate": 2.588678002668779e-06, + "loss": 0.2996, + "step": 19456 + }, + { + "epoch": 0.8543859841707191, + "grad_norm": 1.5546875, + "learning_rate": 2.585613099020093e-06, + "loss": 0.32, + "step": 19458 + }, + { + "epoch": 0.8544738026499226, + "grad_norm": 1.46875, + "learning_rate": 2.5825499118705213e-06, + "loss": 0.2837, + "step": 19460 + }, + { + "epoch": 0.8545616211291261, + "grad_norm": 1.4765625, + "learning_rate": 2.5794884414546584e-06, + "loss": 0.3122, + "step": 19462 + }, + { + "epoch": 0.8546494396083296, + "grad_norm": 1.6328125, + "learning_rate": 2.5764286880069434e-06, + "loss": 0.3227, + "step": 19464 + }, + { + "epoch": 0.8547372580875331, + "grad_norm": 1.4296875, + "learning_rate": 2.573370651761692e-06, + "loss": 0.3458, + "step": 19466 + }, + { + "epoch": 0.8548250765667366, + "grad_norm": 1.609375, + "learning_rate": 2.5703143329530943e-06, + "loss": 0.3311, + "step": 19468 + }, + { + "epoch": 0.85491289504594, + "grad_norm": 1.46875, + "learning_rate": 2.5672597318152e-06, + "loss": 0.2945, + "step": 19470 + }, + { + "epoch": 0.8550007135251435, + "grad_norm": 1.4609375, + "learning_rate": 2.5642068485819347e-06, + "loss": 0.3096, + "step": 19472 + }, + { + "epoch": 0.855088532004347, + "grad_norm": 1.5, + "learning_rate": 2.5611556834870905e-06, + "loss": 0.3302, + "step": 19474 + }, + { + "epoch": 0.8551763504835505, + "grad_norm": 1.53125, + "learning_rate": 2.5581062367643154e-06, + "loss": 0.3125, + "step": 19476 + }, + { + "epoch": 0.855264168962754, + "grad_norm": 1.71875, + "learning_rate": 2.555058508647154e-06, + "loss": 0.2861, + "step": 19478 + }, + { + "epoch": 0.8553519874419575, + "grad_norm": 1.4375, + "learning_rate": 2.5520124993689858e-06, + "loss": 0.3093, + "step": 19480 + }, + { + "epoch": 0.855439805921161, + "grad_norm": 1.578125, + "learning_rate": 2.548968209163083e-06, + "loss": 0.2958, + "step": 19482 + }, + { + "epoch": 0.8555276244003645, + "grad_norm": 1.4921875, + "learning_rate": 2.5459256382625723e-06, + "loss": 0.3501, + "step": 19484 + }, + { + "epoch": 0.8556154428795679, + "grad_norm": 1.5625, + "learning_rate": 2.542884786900465e-06, + "loss": 0.3296, + "step": 19486 + }, + { + "epoch": 0.8557032613587714, + "grad_norm": 1.5234375, + "learning_rate": 2.539845655309625e-06, + "loss": 0.3145, + "step": 19488 + }, + { + "epoch": 0.8557910798379749, + "grad_norm": 1.4375, + "learning_rate": 2.5368082437227825e-06, + "loss": 0.3172, + "step": 19490 + }, + { + "epoch": 0.8558788983171783, + "grad_norm": 1.5234375, + "learning_rate": 2.533772552372554e-06, + "loss": 0.3262, + "step": 19492 + }, + { + "epoch": 0.8559667167963819, + "grad_norm": 1.421875, + "learning_rate": 2.5307385814914043e-06, + "loss": 0.307, + "step": 19494 + }, + { + "epoch": 0.8560545352755854, + "grad_norm": 1.484375, + "learning_rate": 2.5277063313116854e-06, + "loss": 0.3223, + "step": 19496 + }, + { + "epoch": 0.8561423537547889, + "grad_norm": 1.4453125, + "learning_rate": 2.5246758020656013e-06, + "loss": 0.3303, + "step": 19498 + }, + { + "epoch": 0.8562301722339923, + "grad_norm": 1.484375, + "learning_rate": 2.52164699398523e-06, + "loss": 0.3126, + "step": 19500 + }, + { + "epoch": 0.8563179907131958, + "grad_norm": 1.5078125, + "learning_rate": 2.518619907302522e-06, + "loss": 0.2908, + "step": 19502 + }, + { + "epoch": 0.8564058091923993, + "grad_norm": 1.3984375, + "learning_rate": 2.515594542249289e-06, + "loss": 0.2945, + "step": 19504 + }, + { + "epoch": 0.8564936276716028, + "grad_norm": 1.5, + "learning_rate": 2.512570899057218e-06, + "loss": 0.3543, + "step": 19506 + }, + { + "epoch": 0.8565814461508063, + "grad_norm": 1.4765625, + "learning_rate": 2.5095489779578595e-06, + "loss": 0.3082, + "step": 19508 + }, + { + "epoch": 0.8566692646300098, + "grad_norm": 1.4609375, + "learning_rate": 2.506528779182632e-06, + "loss": 0.3024, + "step": 19510 + }, + { + "epoch": 0.8567570831092133, + "grad_norm": 1.5546875, + "learning_rate": 2.5035103029628166e-06, + "loss": 0.3181, + "step": 19512 + }, + { + "epoch": 0.8568449015884168, + "grad_norm": 1.4453125, + "learning_rate": 2.5004935495295813e-06, + "loss": 0.2928, + "step": 19514 + }, + { + "epoch": 0.8569327200676202, + "grad_norm": 1.453125, + "learning_rate": 2.497478519113941e-06, + "loss": 0.295, + "step": 19516 + }, + { + "epoch": 0.8570205385468237, + "grad_norm": 1.5234375, + "learning_rate": 2.4944652119467864e-06, + "loss": 0.3238, + "step": 19518 + }, + { + "epoch": 0.8571083570260272, + "grad_norm": 1.5390625, + "learning_rate": 2.4914536282588856e-06, + "loss": 0.3081, + "step": 19520 + }, + { + "epoch": 0.8571961755052306, + "grad_norm": 1.5, + "learning_rate": 2.488443768280857e-06, + "loss": 0.3123, + "step": 19522 + }, + { + "epoch": 0.8572839939844342, + "grad_norm": 1.4375, + "learning_rate": 2.4854356322432078e-06, + "loss": 0.3266, + "step": 19524 + }, + { + "epoch": 0.8573718124636377, + "grad_norm": 1.515625, + "learning_rate": 2.482429220376292e-06, + "loss": 0.3134, + "step": 19526 + }, + { + "epoch": 0.8574596309428412, + "grad_norm": 1.4296875, + "learning_rate": 2.4794245329103406e-06, + "loss": 0.3128, + "step": 19528 + }, + { + "epoch": 0.8575474494220446, + "grad_norm": 1.4765625, + "learning_rate": 2.476421570075463e-06, + "loss": 0.3205, + "step": 19530 + }, + { + "epoch": 0.8576352679012481, + "grad_norm": 1.421875, + "learning_rate": 2.4734203321016143e-06, + "loss": 0.3162, + "step": 19532 + }, + { + "epoch": 0.8577230863804516, + "grad_norm": 1.4609375, + "learning_rate": 2.470420819218641e-06, + "loss": 0.3417, + "step": 19534 + }, + { + "epoch": 0.857810904859655, + "grad_norm": 1.5078125, + "learning_rate": 2.4674230316562435e-06, + "loss": 0.3085, + "step": 19536 + }, + { + "epoch": 0.8578987233388585, + "grad_norm": 1.453125, + "learning_rate": 2.4644269696439847e-06, + "loss": 0.3028, + "step": 19538 + }, + { + "epoch": 0.8579865418180621, + "grad_norm": 1.4296875, + "learning_rate": 2.461432633411323e-06, + "loss": 0.3199, + "step": 19540 + }, + { + "epoch": 0.8580743602972656, + "grad_norm": 1.546875, + "learning_rate": 2.4584400231875416e-06, + "loss": 0.3034, + "step": 19542 + }, + { + "epoch": 0.858162178776469, + "grad_norm": 1.4921875, + "learning_rate": 2.4554491392018347e-06, + "loss": 0.3213, + "step": 19544 + }, + { + "epoch": 0.8582499972556725, + "grad_norm": 1.4765625, + "learning_rate": 2.4524599816832276e-06, + "loss": 0.3014, + "step": 19546 + }, + { + "epoch": 0.858337815734876, + "grad_norm": 1.4453125, + "learning_rate": 2.4494725508606488e-06, + "loss": 0.2933, + "step": 19548 + }, + { + "epoch": 0.8584256342140795, + "grad_norm": 1.4765625, + "learning_rate": 2.446486846962867e-06, + "loss": 0.3237, + "step": 19550 + }, + { + "epoch": 0.8585134526932829, + "grad_norm": 1.484375, + "learning_rate": 2.4435028702185227e-06, + "loss": 0.3021, + "step": 19552 + }, + { + "epoch": 0.8586012711724865, + "grad_norm": 1.4453125, + "learning_rate": 2.4405206208561403e-06, + "loss": 0.2859, + "step": 19554 + }, + { + "epoch": 0.85868908965169, + "grad_norm": 1.4296875, + "learning_rate": 2.437540099104094e-06, + "loss": 0.3134, + "step": 19556 + }, + { + "epoch": 0.8587769081308935, + "grad_norm": 1.5078125, + "learning_rate": 2.4345613051906384e-06, + "loss": 0.3277, + "step": 19558 + }, + { + "epoch": 0.8588647266100969, + "grad_norm": 1.453125, + "learning_rate": 2.431584239343887e-06, + "loss": 0.2996, + "step": 19560 + }, + { + "epoch": 0.8589525450893004, + "grad_norm": 1.4921875, + "learning_rate": 2.4286089017918233e-06, + "loss": 0.3341, + "step": 19562 + }, + { + "epoch": 0.8590403635685039, + "grad_norm": 1.484375, + "learning_rate": 2.4256352927623015e-06, + "loss": 0.2953, + "step": 19564 + }, + { + "epoch": 0.8591281820477074, + "grad_norm": 1.484375, + "learning_rate": 2.4226634124830366e-06, + "loss": 0.2946, + "step": 19566 + }, + { + "epoch": 0.8592160005269108, + "grad_norm": 1.40625, + "learning_rate": 2.419693261181627e-06, + "loss": 0.3183, + "step": 19568 + }, + { + "epoch": 0.8593038190061144, + "grad_norm": 1.4765625, + "learning_rate": 2.4167248390855187e-06, + "loss": 0.2981, + "step": 19570 + }, + { + "epoch": 0.8593916374853179, + "grad_norm": 1.4609375, + "learning_rate": 2.413758146422035e-06, + "loss": 0.3042, + "step": 19572 + }, + { + "epoch": 0.8594794559645214, + "grad_norm": 1.4921875, + "learning_rate": 2.4107931834183667e-06, + "loss": 0.321, + "step": 19574 + }, + { + "epoch": 0.8595672744437248, + "grad_norm": 1.515625, + "learning_rate": 2.4078299503015654e-06, + "loss": 0.3173, + "step": 19576 + }, + { + "epoch": 0.8596550929229283, + "grad_norm": 1.421875, + "learning_rate": 2.4048684472985694e-06, + "loss": 0.3045, + "step": 19578 + }, + { + "epoch": 0.8597429114021318, + "grad_norm": 1.59375, + "learning_rate": 2.401908674636158e-06, + "loss": 0.3217, + "step": 19580 + }, + { + "epoch": 0.8598307298813352, + "grad_norm": 1.484375, + "learning_rate": 2.398950632541003e-06, + "loss": 0.3452, + "step": 19582 + }, + { + "epoch": 0.8599185483605387, + "grad_norm": 1.40625, + "learning_rate": 2.3959943212396223e-06, + "loss": 0.312, + "step": 19584 + }, + { + "epoch": 0.8600063668397423, + "grad_norm": 1.4921875, + "learning_rate": 2.3930397409584194e-06, + "loss": 0.3037, + "step": 19586 + }, + { + "epoch": 0.8600941853189458, + "grad_norm": 1.3984375, + "learning_rate": 2.3900868919236515e-06, + "loss": 0.2944, + "step": 19588 + }, + { + "epoch": 0.8601820037981492, + "grad_norm": 1.515625, + "learning_rate": 2.387135774361446e-06, + "loss": 0.3224, + "step": 19590 + }, + { + "epoch": 0.8602698222773527, + "grad_norm": 1.5, + "learning_rate": 2.384186388497808e-06, + "loss": 0.3191, + "step": 19592 + }, + { + "epoch": 0.8603576407565562, + "grad_norm": 1.4921875, + "learning_rate": 2.3812387345585914e-06, + "loss": 0.311, + "step": 19594 + }, + { + "epoch": 0.8604454592357597, + "grad_norm": 1.515625, + "learning_rate": 2.378292812769542e-06, + "loss": 0.3169, + "step": 19596 + }, + { + "epoch": 0.8605332777149631, + "grad_norm": 1.4765625, + "learning_rate": 2.3753486233562496e-06, + "loss": 0.2885, + "step": 19598 + }, + { + "epoch": 0.8606210961941667, + "grad_norm": 1.515625, + "learning_rate": 2.3724061665441777e-06, + "loss": 0.33, + "step": 19600 + }, + { + "epoch": 0.8607089146733702, + "grad_norm": 1.46875, + "learning_rate": 2.3694654425586772e-06, + "loss": 0.2909, + "step": 19602 + }, + { + "epoch": 0.8607967331525737, + "grad_norm": 1.53125, + "learning_rate": 2.3665264516249287e-06, + "loss": 0.292, + "step": 19604 + }, + { + "epoch": 0.8608845516317771, + "grad_norm": 1.4453125, + "learning_rate": 2.363589193968013e-06, + "loss": 0.2853, + "step": 19606 + }, + { + "epoch": 0.8609723701109806, + "grad_norm": 1.5859375, + "learning_rate": 2.3606536698128595e-06, + "loss": 0.2869, + "step": 19608 + }, + { + "epoch": 0.8610601885901841, + "grad_norm": 1.4765625, + "learning_rate": 2.357719879384279e-06, + "loss": 0.2986, + "step": 19610 + }, + { + "epoch": 0.8611480070693875, + "grad_norm": 1.375, + "learning_rate": 2.3547878229069397e-06, + "loss": 0.2786, + "step": 19612 + }, + { + "epoch": 0.861235825548591, + "grad_norm": 1.484375, + "learning_rate": 2.35185750060537e-06, + "loss": 0.2973, + "step": 19614 + }, + { + "epoch": 0.8613236440277946, + "grad_norm": 1.5, + "learning_rate": 2.3489289127039906e-06, + "loss": 0.3174, + "step": 19616 + }, + { + "epoch": 0.8614114625069981, + "grad_norm": 1.5234375, + "learning_rate": 2.3460020594270575e-06, + "loss": 0.3406, + "step": 19618 + }, + { + "epoch": 0.8614992809862015, + "grad_norm": 1.5390625, + "learning_rate": 2.343076940998723e-06, + "loss": 0.3065, + "step": 19620 + }, + { + "epoch": 0.861587099465405, + "grad_norm": 1.4609375, + "learning_rate": 2.340153557642985e-06, + "loss": 0.335, + "step": 19622 + }, + { + "epoch": 0.8616749179446085, + "grad_norm": 1.5546875, + "learning_rate": 2.3372319095837174e-06, + "loss": 0.2874, + "step": 19624 + }, + { + "epoch": 0.861762736423812, + "grad_norm": 1.4765625, + "learning_rate": 2.334311997044666e-06, + "loss": 0.2934, + "step": 19626 + }, + { + "epoch": 0.8618505549030154, + "grad_norm": 1.453125, + "learning_rate": 2.3313938202494307e-06, + "loss": 0.3114, + "step": 19628 + }, + { + "epoch": 0.8619383733822189, + "grad_norm": 1.53125, + "learning_rate": 2.328477379421498e-06, + "loss": 0.3105, + "step": 19630 + }, + { + "epoch": 0.8620261918614225, + "grad_norm": 1.5234375, + "learning_rate": 2.3255626747841985e-06, + "loss": 0.319, + "step": 19632 + }, + { + "epoch": 0.862114010340626, + "grad_norm": 1.4296875, + "learning_rate": 2.3226497065607478e-06, + "loss": 0.2853, + "step": 19634 + }, + { + "epoch": 0.8622018288198294, + "grad_norm": 1.46875, + "learning_rate": 2.3197384749742175e-06, + "loss": 0.3208, + "step": 19636 + }, + { + "epoch": 0.8622896472990329, + "grad_norm": 1.484375, + "learning_rate": 2.316828980247546e-06, + "loss": 0.3086, + "step": 19638 + }, + { + "epoch": 0.8623774657782364, + "grad_norm": 1.4375, + "learning_rate": 2.3139212226035543e-06, + "loss": 0.2994, + "step": 19640 + }, + { + "epoch": 0.8624652842574398, + "grad_norm": 1.53125, + "learning_rate": 2.3110152022649103e-06, + "loss": 0.3229, + "step": 19642 + }, + { + "epoch": 0.8625531027366433, + "grad_norm": 1.5390625, + "learning_rate": 2.308110919454165e-06, + "loss": 0.3214, + "step": 19644 + }, + { + "epoch": 0.8626409212158468, + "grad_norm": 1.515625, + "learning_rate": 2.305208374393719e-06, + "loss": 0.3302, + "step": 19646 + }, + { + "epoch": 0.8627287396950504, + "grad_norm": 1.4765625, + "learning_rate": 2.3023075673058615e-06, + "loss": 0.3134, + "step": 19648 + }, + { + "epoch": 0.8628165581742538, + "grad_norm": 1.4609375, + "learning_rate": 2.299408498412731e-06, + "loss": 0.319, + "step": 19650 + }, + { + "epoch": 0.8629043766534573, + "grad_norm": 1.5234375, + "learning_rate": 2.2965111679363355e-06, + "loss": 0.2917, + "step": 19652 + }, + { + "epoch": 0.8629921951326608, + "grad_norm": 1.703125, + "learning_rate": 2.293615576098562e-06, + "loss": 0.3199, + "step": 19654 + }, + { + "epoch": 0.8630800136118643, + "grad_norm": 1.4296875, + "learning_rate": 2.290721723121145e-06, + "loss": 0.3266, + "step": 19656 + }, + { + "epoch": 0.8631678320910677, + "grad_norm": 1.5, + "learning_rate": 2.2878296092257085e-06, + "loss": 0.323, + "step": 19658 + }, + { + "epoch": 0.8632556505702712, + "grad_norm": 1.390625, + "learning_rate": 2.2849392346337235e-06, + "loss": 0.328, + "step": 19660 + }, + { + "epoch": 0.8633434690494748, + "grad_norm": 1.4765625, + "learning_rate": 2.2820505995665378e-06, + "loss": 0.3261, + "step": 19662 + }, + { + "epoch": 0.8634312875286783, + "grad_norm": 1.453125, + "learning_rate": 2.2791637042453628e-06, + "loss": 0.3211, + "step": 19664 + }, + { + "epoch": 0.8635191060078817, + "grad_norm": 1.5390625, + "learning_rate": 2.2762785488912724e-06, + "loss": 0.3188, + "step": 19666 + }, + { + "epoch": 0.8636069244870852, + "grad_norm": 1.4609375, + "learning_rate": 2.273395133725223e-06, + "loss": 0.3157, + "step": 19668 + }, + { + "epoch": 0.8636947429662887, + "grad_norm": 1.359375, + "learning_rate": 2.270513458968018e-06, + "loss": 0.3233, + "step": 19670 + }, + { + "epoch": 0.8637825614454921, + "grad_norm": 1.3984375, + "learning_rate": 2.267633524840346e-06, + "loss": 0.3448, + "step": 19672 + }, + { + "epoch": 0.8638703799246956, + "grad_norm": 1.484375, + "learning_rate": 2.2647553315627485e-06, + "loss": 0.3084, + "step": 19674 + }, + { + "epoch": 0.8639581984038991, + "grad_norm": 1.578125, + "learning_rate": 2.261878879355633e-06, + "loss": 0.297, + "step": 19676 + }, + { + "epoch": 0.8640460168831027, + "grad_norm": 1.5625, + "learning_rate": 2.2590041684392854e-06, + "loss": 0.3149, + "step": 19678 + }, + { + "epoch": 0.8641338353623061, + "grad_norm": 1.4609375, + "learning_rate": 2.256131199033848e-06, + "loss": 0.3068, + "step": 19680 + }, + { + "epoch": 0.8642216538415096, + "grad_norm": 1.5390625, + "learning_rate": 2.2532599713593396e-06, + "loss": 0.3197, + "step": 19682 + }, + { + "epoch": 0.8643094723207131, + "grad_norm": 1.4609375, + "learning_rate": 2.2503904856356366e-06, + "loss": 0.3218, + "step": 19684 + }, + { + "epoch": 0.8643972907999166, + "grad_norm": 1.3984375, + "learning_rate": 2.247522742082478e-06, + "loss": 0.3111, + "step": 19686 + }, + { + "epoch": 0.86448510927912, + "grad_norm": 1.421875, + "learning_rate": 2.244656740919485e-06, + "loss": 0.3329, + "step": 19688 + }, + { + "epoch": 0.8645729277583235, + "grad_norm": 1.4921875, + "learning_rate": 2.241792482366131e-06, + "loss": 0.3249, + "step": 19690 + }, + { + "epoch": 0.864660746237527, + "grad_norm": 1.4765625, + "learning_rate": 2.238929966641767e-06, + "loss": 0.3023, + "step": 19692 + }, + { + "epoch": 0.8647485647167306, + "grad_norm": 1.453125, + "learning_rate": 2.2360691939656036e-06, + "loss": 0.3127, + "step": 19694 + }, + { + "epoch": 0.864836383195934, + "grad_norm": 1.4609375, + "learning_rate": 2.2332101645567176e-06, + "loss": 0.3544, + "step": 19696 + }, + { + "epoch": 0.8649242016751375, + "grad_norm": 1.4375, + "learning_rate": 2.2303528786340565e-06, + "loss": 0.31, + "step": 19698 + }, + { + "epoch": 0.865012020154341, + "grad_norm": 1.421875, + "learning_rate": 2.227497336416423e-06, + "loss": 0.3246, + "step": 19700 + }, + { + "epoch": 0.8650998386335444, + "grad_norm": 1.5234375, + "learning_rate": 2.224643538122509e-06, + "loss": 0.3037, + "step": 19702 + }, + { + "epoch": 0.8651876571127479, + "grad_norm": 1.4375, + "learning_rate": 2.2217914839708477e-06, + "loss": 0.3307, + "step": 19704 + }, + { + "epoch": 0.8652754755919514, + "grad_norm": 1.4453125, + "learning_rate": 2.2189411741798587e-06, + "loss": 0.3167, + "step": 19706 + }, + { + "epoch": 0.865363294071155, + "grad_norm": 1.53125, + "learning_rate": 2.216092608967818e-06, + "loss": 0.3497, + "step": 19708 + }, + { + "epoch": 0.8654511125503584, + "grad_norm": 1.46875, + "learning_rate": 2.213245788552859e-06, + "loss": 0.3256, + "step": 19710 + }, + { + "epoch": 0.8655389310295619, + "grad_norm": 1.4375, + "learning_rate": 2.2104007131530046e-06, + "loss": 0.2998, + "step": 19712 + }, + { + "epoch": 0.8656267495087654, + "grad_norm": 1.421875, + "learning_rate": 2.207557382986125e-06, + "loss": 0.3256, + "step": 19714 + }, + { + "epoch": 0.8657145679879689, + "grad_norm": 1.5546875, + "learning_rate": 2.2047157982699686e-06, + "loss": 0.3113, + "step": 19716 + }, + { + "epoch": 0.8658023864671723, + "grad_norm": 1.4453125, + "learning_rate": 2.201875959222133e-06, + "loss": 0.2938, + "step": 19718 + }, + { + "epoch": 0.8658902049463758, + "grad_norm": 1.5546875, + "learning_rate": 2.199037866060108e-06, + "loss": 0.3194, + "step": 19720 + }, + { + "epoch": 0.8659780234255793, + "grad_norm": 1.4765625, + "learning_rate": 2.1962015190012315e-06, + "loss": 0.2859, + "step": 19722 + }, + { + "epoch": 0.8660658419047829, + "grad_norm": 1.4453125, + "learning_rate": 2.1933669182627044e-06, + "loss": 0.3604, + "step": 19724 + }, + { + "epoch": 0.8661536603839863, + "grad_norm": 1.46875, + "learning_rate": 2.1905340640616085e-06, + "loss": 0.2944, + "step": 19726 + }, + { + "epoch": 0.8662414788631898, + "grad_norm": 1.390625, + "learning_rate": 2.187702956614879e-06, + "loss": 0.2785, + "step": 19728 + }, + { + "epoch": 0.8663292973423933, + "grad_norm": 1.515625, + "learning_rate": 2.184873596139328e-06, + "loss": 0.3243, + "step": 19730 + }, + { + "epoch": 0.8664171158215968, + "grad_norm": 1.5, + "learning_rate": 2.1820459828516215e-06, + "loss": 0.3116, + "step": 19732 + }, + { + "epoch": 0.8665049343008002, + "grad_norm": 1.453125, + "learning_rate": 2.1792201169683085e-06, + "loss": 0.3112, + "step": 19734 + }, + { + "epoch": 0.8665927527800037, + "grad_norm": 1.390625, + "learning_rate": 2.1763959987057882e-06, + "loss": 0.3129, + "step": 19736 + }, + { + "epoch": 0.8666805712592072, + "grad_norm": 1.40625, + "learning_rate": 2.1735736282803314e-06, + "loss": 0.3219, + "step": 19738 + }, + { + "epoch": 0.8667683897384107, + "grad_norm": 1.4609375, + "learning_rate": 2.17075300590808e-06, + "loss": 0.295, + "step": 19740 + }, + { + "epoch": 0.8668562082176142, + "grad_norm": 1.4375, + "learning_rate": 2.1679341318050327e-06, + "loss": 0.2926, + "step": 19742 + }, + { + "epoch": 0.8669440266968177, + "grad_norm": 1.515625, + "learning_rate": 2.1651170061870675e-06, + "loss": 0.3007, + "step": 19744 + }, + { + "epoch": 0.8670318451760212, + "grad_norm": 1.46875, + "learning_rate": 2.1623016292699137e-06, + "loss": 0.3406, + "step": 19746 + }, + { + "epoch": 0.8671196636552246, + "grad_norm": 1.4296875, + "learning_rate": 2.159488001269175e-06, + "loss": 0.3446, + "step": 19748 + }, + { + "epoch": 0.8672074821344281, + "grad_norm": 1.515625, + "learning_rate": 2.156676122400322e-06, + "loss": 0.3072, + "step": 19750 + }, + { + "epoch": 0.8672953006136316, + "grad_norm": 1.4765625, + "learning_rate": 2.153865992878684e-06, + "loss": 0.3402, + "step": 19752 + }, + { + "epoch": 0.8673831190928352, + "grad_norm": 1.4375, + "learning_rate": 2.151057612919474e-06, + "loss": 0.2902, + "step": 19754 + }, + { + "epoch": 0.8674709375720386, + "grad_norm": 1.4921875, + "learning_rate": 2.14825098273774e-06, + "loss": 0.286, + "step": 19756 + }, + { + "epoch": 0.8675587560512421, + "grad_norm": 1.5390625, + "learning_rate": 2.1454461025484292e-06, + "loss": 0.2923, + "step": 19758 + }, + { + "epoch": 0.8676465745304456, + "grad_norm": 1.5546875, + "learning_rate": 2.1426429725663335e-06, + "loss": 0.3102, + "step": 19760 + }, + { + "epoch": 0.867734393009649, + "grad_norm": 1.515625, + "learning_rate": 2.1398415930061146e-06, + "loss": 0.3185, + "step": 19762 + }, + { + "epoch": 0.8678222114888525, + "grad_norm": 1.3984375, + "learning_rate": 2.1370419640823094e-06, + "loss": 0.305, + "step": 19764 + }, + { + "epoch": 0.867910029968056, + "grad_norm": 1.4296875, + "learning_rate": 2.134244086009307e-06, + "loss": 0.3049, + "step": 19766 + }, + { + "epoch": 0.8679978484472595, + "grad_norm": 1.5078125, + "learning_rate": 2.1314479590013786e-06, + "loss": 0.3173, + "step": 19768 + }, + { + "epoch": 0.868085666926463, + "grad_norm": 1.46875, + "learning_rate": 2.1286535832726467e-06, + "loss": 0.3196, + "step": 19770 + }, + { + "epoch": 0.8681734854056665, + "grad_norm": 1.5390625, + "learning_rate": 2.125860959037104e-06, + "loss": 0.3416, + "step": 19772 + }, + { + "epoch": 0.86826130388487, + "grad_norm": 1.5234375, + "learning_rate": 2.1230700865086134e-06, + "loss": 0.3315, + "step": 19774 + }, + { + "epoch": 0.8683491223640735, + "grad_norm": 1.4375, + "learning_rate": 2.1202809659008976e-06, + "loss": 0.317, + "step": 19776 + }, + { + "epoch": 0.8684369408432769, + "grad_norm": 1.390625, + "learning_rate": 2.1174935974275556e-06, + "loss": 0.3034, + "step": 19778 + }, + { + "epoch": 0.8685247593224804, + "grad_norm": 1.453125, + "learning_rate": 2.114707981302033e-06, + "loss": 0.3161, + "step": 19780 + }, + { + "epoch": 0.8686125778016839, + "grad_norm": 1.546875, + "learning_rate": 2.1119241177376626e-06, + "loss": 0.2952, + "step": 19782 + }, + { + "epoch": 0.8687003962808874, + "grad_norm": 1.4609375, + "learning_rate": 2.1091420069476316e-06, + "loss": 0.3344, + "step": 19784 + }, + { + "epoch": 0.8687882147600909, + "grad_norm": 1.4609375, + "learning_rate": 2.1063616491449917e-06, + "loss": 0.3011, + "step": 19786 + }, + { + "epoch": 0.8688760332392944, + "grad_norm": 1.4375, + "learning_rate": 2.1035830445426648e-06, + "loss": 0.3266, + "step": 19788 + }, + { + "epoch": 0.8689638517184979, + "grad_norm": 1.4296875, + "learning_rate": 2.1008061933534327e-06, + "loss": 0.3156, + "step": 19790 + }, + { + "epoch": 0.8690516701977014, + "grad_norm": 1.546875, + "learning_rate": 2.098031095789957e-06, + "loss": 0.3302, + "step": 19792 + }, + { + "epoch": 0.8691394886769048, + "grad_norm": 1.5078125, + "learning_rate": 2.095257752064747e-06, + "loss": 0.309, + "step": 19794 + }, + { + "epoch": 0.8692273071561083, + "grad_norm": 1.5703125, + "learning_rate": 2.092486162390192e-06, + "loss": 0.3126, + "step": 19796 + }, + { + "epoch": 0.8693151256353118, + "grad_norm": 1.5390625, + "learning_rate": 2.089716326978536e-06, + "loss": 0.3339, + "step": 19798 + }, + { + "epoch": 0.8694029441145152, + "grad_norm": 1.4765625, + "learning_rate": 2.086948246041892e-06, + "loss": 0.3164, + "step": 19800 + }, + { + "epoch": 0.8694907625937188, + "grad_norm": 1.484375, + "learning_rate": 2.0841819197922494e-06, + "loss": 0.3027, + "step": 19802 + }, + { + "epoch": 0.8695785810729223, + "grad_norm": 1.46875, + "learning_rate": 2.0814173484414447e-06, + "loss": 0.3284, + "step": 19804 + }, + { + "epoch": 0.8696663995521258, + "grad_norm": 1.4453125, + "learning_rate": 2.0786545322011965e-06, + "loss": 0.3101, + "step": 19806 + }, + { + "epoch": 0.8697542180313292, + "grad_norm": 1.5703125, + "learning_rate": 2.07589347128308e-06, + "loss": 0.3168, + "step": 19808 + }, + { + "epoch": 0.8698420365105327, + "grad_norm": 1.5703125, + "learning_rate": 2.0731341658985324e-06, + "loss": 0.3165, + "step": 19810 + }, + { + "epoch": 0.8699298549897362, + "grad_norm": 1.5078125, + "learning_rate": 2.0703766162588706e-06, + "loss": 0.3176, + "step": 19812 + }, + { + "epoch": 0.8700176734689397, + "grad_norm": 1.4765625, + "learning_rate": 2.0676208225752585e-06, + "loss": 0.3196, + "step": 19814 + }, + { + "epoch": 0.8701054919481432, + "grad_norm": 1.515625, + "learning_rate": 2.06486678505875e-06, + "loss": 0.3524, + "step": 19816 + }, + { + "epoch": 0.8701933104273467, + "grad_norm": 1.4609375, + "learning_rate": 2.0621145039202343e-06, + "loss": 0.2923, + "step": 19818 + }, + { + "epoch": 0.8702811289065502, + "grad_norm": 1.4140625, + "learning_rate": 2.0593639793704905e-06, + "loss": 0.3128, + "step": 19820 + }, + { + "epoch": 0.8703689473857537, + "grad_norm": 1.4921875, + "learning_rate": 2.0566152116201528e-06, + "loss": 0.3216, + "step": 19822 + }, + { + "epoch": 0.8704567658649571, + "grad_norm": 1.4375, + "learning_rate": 2.053868200879719e-06, + "loss": 0.2935, + "step": 19824 + }, + { + "epoch": 0.8705445843441606, + "grad_norm": 1.4765625, + "learning_rate": 2.0511229473595626e-06, + "loss": 0.3261, + "step": 19826 + }, + { + "epoch": 0.8706324028233641, + "grad_norm": 1.5546875, + "learning_rate": 2.0483794512699077e-06, + "loss": 0.305, + "step": 19828 + }, + { + "epoch": 0.8707202213025675, + "grad_norm": 1.421875, + "learning_rate": 2.045637712820861e-06, + "loss": 0.3249, + "step": 19830 + }, + { + "epoch": 0.8708080397817711, + "grad_norm": 1.453125, + "learning_rate": 2.042897732222379e-06, + "loss": 0.3237, + "step": 19832 + }, + { + "epoch": 0.8708958582609746, + "grad_norm": 1.4140625, + "learning_rate": 2.040159509684289e-06, + "loss": 0.3407, + "step": 19834 + }, + { + "epoch": 0.8709836767401781, + "grad_norm": 1.4765625, + "learning_rate": 2.0374230454162924e-06, + "loss": 0.3136, + "step": 19836 + }, + { + "epoch": 0.8710714952193815, + "grad_norm": 1.4453125, + "learning_rate": 2.034688339627938e-06, + "loss": 0.3151, + "step": 19838 + }, + { + "epoch": 0.871159313698585, + "grad_norm": 1.390625, + "learning_rate": 2.031955392528662e-06, + "loss": 0.3123, + "step": 19840 + }, + { + "epoch": 0.8712471321777885, + "grad_norm": 1.5234375, + "learning_rate": 2.0292242043277455e-06, + "loss": 0.3193, + "step": 19842 + }, + { + "epoch": 0.871334950656992, + "grad_norm": 1.484375, + "learning_rate": 2.0264947752343446e-06, + "loss": 0.2988, + "step": 19844 + }, + { + "epoch": 0.8714227691361954, + "grad_norm": 1.5, + "learning_rate": 2.023767105457483e-06, + "loss": 0.3288, + "step": 19846 + }, + { + "epoch": 0.871510587615399, + "grad_norm": 1.4453125, + "learning_rate": 2.0210411952060444e-06, + "loss": 0.2927, + "step": 19848 + }, + { + "epoch": 0.8715984060946025, + "grad_norm": 1.53125, + "learning_rate": 2.0183170446887807e-06, + "loss": 0.2949, + "step": 19850 + }, + { + "epoch": 0.871686224573806, + "grad_norm": 1.4609375, + "learning_rate": 2.0155946541143e-06, + "loss": 0.3076, + "step": 19852 + }, + { + "epoch": 0.8717740430530094, + "grad_norm": 1.6796875, + "learning_rate": 2.012874023691097e-06, + "loss": 0.3349, + "step": 19854 + }, + { + "epoch": 0.8718618615322129, + "grad_norm": 1.5703125, + "learning_rate": 2.010155153627505e-06, + "loss": 0.3102, + "step": 19856 + }, + { + "epoch": 0.8719496800114164, + "grad_norm": 1.5, + "learning_rate": 2.0074380441317488e-06, + "loss": 0.3419, + "step": 19858 + }, + { + "epoch": 0.8720374984906198, + "grad_norm": 1.4375, + "learning_rate": 2.004722695411898e-06, + "loss": 0.3193, + "step": 19860 + }, + { + "epoch": 0.8721253169698234, + "grad_norm": 1.484375, + "learning_rate": 2.0020091076758922e-06, + "loss": 0.3176, + "step": 19862 + }, + { + "epoch": 0.8722131354490269, + "grad_norm": 1.421875, + "learning_rate": 1.9992972811315458e-06, + "loss": 0.2944, + "step": 19864 + }, + { + "epoch": 0.8723009539282304, + "grad_norm": 1.421875, + "learning_rate": 1.9965872159865227e-06, + "loss": 0.3135, + "step": 19866 + }, + { + "epoch": 0.8723887724074338, + "grad_norm": 1.4609375, + "learning_rate": 1.9938789124483687e-06, + "loss": 0.3138, + "step": 19868 + }, + { + "epoch": 0.8724765908866373, + "grad_norm": 1.5078125, + "learning_rate": 1.991172370724484e-06, + "loss": 0.322, + "step": 19870 + }, + { + "epoch": 0.8725644093658408, + "grad_norm": 1.546875, + "learning_rate": 1.988467591022131e-06, + "loss": 0.2898, + "step": 19872 + }, + { + "epoch": 0.8726522278450443, + "grad_norm": 1.46875, + "learning_rate": 1.9857645735484486e-06, + "loss": 0.3098, + "step": 19874 + }, + { + "epoch": 0.8727400463242477, + "grad_norm": 1.421875, + "learning_rate": 1.9830633185104282e-06, + "loss": 0.3381, + "step": 19876 + }, + { + "epoch": 0.8728278648034513, + "grad_norm": 1.4609375, + "learning_rate": 1.980363826114945e-06, + "loss": 0.2921, + "step": 19878 + }, + { + "epoch": 0.8729156832826548, + "grad_norm": 1.5078125, + "learning_rate": 1.977666096568709e-06, + "loss": 0.2974, + "step": 19880 + }, + { + "epoch": 0.8730035017618583, + "grad_norm": 1.4921875, + "learning_rate": 1.974970130078327e-06, + "loss": 0.3526, + "step": 19882 + }, + { + "epoch": 0.8730913202410617, + "grad_norm": 1.4453125, + "learning_rate": 1.9722759268502507e-06, + "loss": 0.3244, + "step": 19884 + }, + { + "epoch": 0.8731791387202652, + "grad_norm": 1.4375, + "learning_rate": 1.9695834870908007e-06, + "loss": 0.3194, + "step": 19886 + }, + { + "epoch": 0.8732669571994687, + "grad_norm": 1.5, + "learning_rate": 1.966892811006174e-06, + "loss": 0.3328, + "step": 19888 + }, + { + "epoch": 0.8733547756786721, + "grad_norm": 1.4453125, + "learning_rate": 1.964203898802411e-06, + "loss": 0.3276, + "step": 19890 + }, + { + "epoch": 0.8734425941578756, + "grad_norm": 1.46875, + "learning_rate": 1.961516750685441e-06, + "loss": 0.2992, + "step": 19892 + }, + { + "epoch": 0.8735304126370792, + "grad_norm": 1.421875, + "learning_rate": 1.9588313668610385e-06, + "loss": 0.3253, + "step": 19894 + }, + { + "epoch": 0.8736182311162827, + "grad_norm": 1.4921875, + "learning_rate": 1.9561477475348504e-06, + "loss": 0.329, + "step": 19896 + }, + { + "epoch": 0.8737060495954861, + "grad_norm": 1.46875, + "learning_rate": 1.9534658929123984e-06, + "loss": 0.2999, + "step": 19898 + }, + { + "epoch": 0.8737938680746896, + "grad_norm": 1.4609375, + "learning_rate": 1.9507858031990454e-06, + "loss": 0.3202, + "step": 19900 + }, + { + "epoch": 0.8738816865538931, + "grad_norm": 1.4296875, + "learning_rate": 1.9481074786000473e-06, + "loss": 0.2992, + "step": 19902 + }, + { + "epoch": 0.8739695050330966, + "grad_norm": 1.5078125, + "learning_rate": 1.945430919320504e-06, + "loss": 0.3101, + "step": 19904 + }, + { + "epoch": 0.8740573235123, + "grad_norm": 1.40625, + "learning_rate": 1.9427561255653816e-06, + "loss": 0.3208, + "step": 19906 + }, + { + "epoch": 0.8741451419915036, + "grad_norm": 1.4296875, + "learning_rate": 1.9400830975395272e-06, + "loss": 0.3327, + "step": 19908 + }, + { + "epoch": 0.8742329604707071, + "grad_norm": 1.5390625, + "learning_rate": 1.937411835447639e-06, + "loss": 0.3426, + "step": 19910 + }, + { + "epoch": 0.8743207789499106, + "grad_norm": 1.3984375, + "learning_rate": 1.9347423394942774e-06, + "loss": 0.299, + "step": 19912 + }, + { + "epoch": 0.874408597429114, + "grad_norm": 1.4765625, + "learning_rate": 1.9320746098838744e-06, + "loss": 0.3203, + "step": 19914 + }, + { + "epoch": 0.8744964159083175, + "grad_norm": 1.484375, + "learning_rate": 1.9294086468207325e-06, + "loss": 0.3169, + "step": 19916 + }, + { + "epoch": 0.874584234387521, + "grad_norm": 1.5078125, + "learning_rate": 1.9267444505090022e-06, + "loss": 0.3429, + "step": 19918 + }, + { + "epoch": 0.8746720528667244, + "grad_norm": 1.4375, + "learning_rate": 1.9240820211527148e-06, + "loss": 0.2993, + "step": 19920 + }, + { + "epoch": 0.8747598713459279, + "grad_norm": 1.59375, + "learning_rate": 1.9214213589557608e-06, + "loss": 0.3321, + "step": 19922 + }, + { + "epoch": 0.8748476898251315, + "grad_norm": 1.4609375, + "learning_rate": 1.918762464121887e-06, + "loss": 0.2825, + "step": 19924 + }, + { + "epoch": 0.874935508304335, + "grad_norm": 1.5625, + "learning_rate": 1.9161053368547207e-06, + "loss": 0.3315, + "step": 19926 + }, + { + "epoch": 0.8750233267835384, + "grad_norm": 1.5078125, + "learning_rate": 1.913449977357737e-06, + "loss": 0.3192, + "step": 19928 + }, + { + "epoch": 0.8751111452627419, + "grad_norm": 1.4609375, + "learning_rate": 1.9107963858342905e-06, + "loss": 0.323, + "step": 19930 + }, + { + "epoch": 0.8751989637419454, + "grad_norm": 1.4921875, + "learning_rate": 1.9081445624875933e-06, + "loss": 0.3143, + "step": 19932 + }, + { + "epoch": 0.8752867822211489, + "grad_norm": 1.453125, + "learning_rate": 1.9054945075207164e-06, + "loss": 0.3259, + "step": 19934 + }, + { + "epoch": 0.8753746007003523, + "grad_norm": 1.546875, + "learning_rate": 1.9028462211366117e-06, + "loss": 0.3299, + "step": 19936 + }, + { + "epoch": 0.8754624191795558, + "grad_norm": 1.4453125, + "learning_rate": 1.9001997035380776e-06, + "loss": 0.3065, + "step": 19938 + }, + { + "epoch": 0.8755502376587594, + "grad_norm": 1.5859375, + "learning_rate": 1.897554954927791e-06, + "loss": 0.3281, + "step": 19940 + }, + { + "epoch": 0.8756380561379629, + "grad_norm": 1.546875, + "learning_rate": 1.894911975508276e-06, + "loss": 0.3459, + "step": 19942 + }, + { + "epoch": 0.8757258746171663, + "grad_norm": 1.515625, + "learning_rate": 1.8922707654819488e-06, + "loss": 0.3059, + "step": 19944 + }, + { + "epoch": 0.8758136930963698, + "grad_norm": 1.5234375, + "learning_rate": 1.8896313250510612e-06, + "loss": 0.2982, + "step": 19946 + }, + { + "epoch": 0.8759015115755733, + "grad_norm": 1.53125, + "learning_rate": 1.8869936544177458e-06, + "loss": 0.3176, + "step": 19948 + }, + { + "epoch": 0.8759893300547767, + "grad_norm": 1.4921875, + "learning_rate": 1.8843577537839996e-06, + "loss": 0.2997, + "step": 19950 + }, + { + "epoch": 0.8760771485339802, + "grad_norm": 1.515625, + "learning_rate": 1.881723623351675e-06, + "loss": 0.3199, + "step": 19952 + }, + { + "epoch": 0.8761649670131838, + "grad_norm": 1.4453125, + "learning_rate": 1.8790912633224994e-06, + "loss": 0.3363, + "step": 19954 + }, + { + "epoch": 0.8762527854923873, + "grad_norm": 1.5625, + "learning_rate": 1.876460673898059e-06, + "loss": 0.3124, + "step": 19956 + }, + { + "epoch": 0.8763406039715907, + "grad_norm": 1.421875, + "learning_rate": 1.8738318552797978e-06, + "loss": 0.2969, + "step": 19958 + }, + { + "epoch": 0.8764284224507942, + "grad_norm": 1.5, + "learning_rate": 1.8712048076690442e-06, + "loss": 0.2909, + "step": 19960 + }, + { + "epoch": 0.8765162409299977, + "grad_norm": 1.4453125, + "learning_rate": 1.8685795312669619e-06, + "loss": 0.3397, + "step": 19962 + }, + { + "epoch": 0.8766040594092012, + "grad_norm": 1.4921875, + "learning_rate": 1.8659560262746123e-06, + "loss": 0.3175, + "step": 19964 + }, + { + "epoch": 0.8766918778884046, + "grad_norm": 1.546875, + "learning_rate": 1.8633342928928931e-06, + "loss": 0.2963, + "step": 19966 + }, + { + "epoch": 0.8767796963676081, + "grad_norm": 1.5, + "learning_rate": 1.8607143313225773e-06, + "loss": 0.296, + "step": 19968 + }, + { + "epoch": 0.8768675148468117, + "grad_norm": 1.5078125, + "learning_rate": 1.8580961417643123e-06, + "loss": 0.3398, + "step": 19970 + }, + { + "epoch": 0.8769553333260152, + "grad_norm": 1.6015625, + "learning_rate": 1.855479724418585e-06, + "loss": 0.3342, + "step": 19972 + }, + { + "epoch": 0.8770431518052186, + "grad_norm": 1.515625, + "learning_rate": 1.8528650794857716e-06, + "loss": 0.3249, + "step": 19974 + }, + { + "epoch": 0.8771309702844221, + "grad_norm": 1.4453125, + "learning_rate": 1.850252207166095e-06, + "loss": 0.3134, + "step": 19976 + }, + { + "epoch": 0.8772187887636256, + "grad_norm": 1.4921875, + "learning_rate": 1.847641107659659e-06, + "loss": 0.323, + "step": 19978 + }, + { + "epoch": 0.877306607242829, + "grad_norm": 1.5546875, + "learning_rate": 1.845031781166412e-06, + "loss": 0.2874, + "step": 19980 + }, + { + "epoch": 0.8773944257220325, + "grad_norm": 1.4140625, + "learning_rate": 1.8424242278861858e-06, + "loss": 0.3032, + "step": 19982 + }, + { + "epoch": 0.877482244201236, + "grad_norm": 1.4609375, + "learning_rate": 1.8398184480186654e-06, + "loss": 0.3245, + "step": 19984 + }, + { + "epoch": 0.8775700626804396, + "grad_norm": 1.4453125, + "learning_rate": 1.8372144417633935e-06, + "loss": 0.3182, + "step": 19986 + }, + { + "epoch": 0.877657881159643, + "grad_norm": 1.5078125, + "learning_rate": 1.834612209319797e-06, + "loss": 0.3319, + "step": 19988 + }, + { + "epoch": 0.8777456996388465, + "grad_norm": 1.453125, + "learning_rate": 1.832011750887147e-06, + "loss": 0.2969, + "step": 19990 + }, + { + "epoch": 0.87783351811805, + "grad_norm": 1.46875, + "learning_rate": 1.8294130666645926e-06, + "loss": 0.3246, + "step": 19992 + }, + { + "epoch": 0.8779213365972535, + "grad_norm": 1.5, + "learning_rate": 1.8268161568511438e-06, + "loss": 0.3301, + "step": 19994 + }, + { + "epoch": 0.8780091550764569, + "grad_norm": 1.5078125, + "learning_rate": 1.8242210216456612e-06, + "loss": 0.2898, + "step": 19996 + }, + { + "epoch": 0.8780969735556604, + "grad_norm": 1.4296875, + "learning_rate": 1.8216276612468941e-06, + "loss": 0.3018, + "step": 19998 + }, + { + "epoch": 0.8781847920348639, + "grad_norm": 1.609375, + "learning_rate": 1.8190360758534392e-06, + "loss": 0.277, + "step": 20000 + }, + { + "epoch": 0.8782726105140675, + "grad_norm": 1.5859375, + "learning_rate": 1.8164462656637544e-06, + "loss": 0.3097, + "step": 20002 + }, + { + "epoch": 0.8783604289932709, + "grad_norm": 1.453125, + "learning_rate": 1.8138582308761698e-06, + "loss": 0.3035, + "step": 20004 + }, + { + "epoch": 0.8784482474724744, + "grad_norm": 1.4375, + "learning_rate": 1.8112719716888853e-06, + "loss": 0.3346, + "step": 20006 + }, + { + "epoch": 0.8785360659516779, + "grad_norm": 1.4296875, + "learning_rate": 1.808687488299951e-06, + "loss": 0.3034, + "step": 20008 + }, + { + "epoch": 0.8786238844308814, + "grad_norm": 1.375, + "learning_rate": 1.8061047809072834e-06, + "loss": 0.3205, + "step": 20010 + }, + { + "epoch": 0.8787117029100848, + "grad_norm": 1.5546875, + "learning_rate": 1.8035238497086743e-06, + "loss": 0.3309, + "step": 20012 + }, + { + "epoch": 0.8787995213892883, + "grad_norm": 1.4609375, + "learning_rate": 1.8009446949017683e-06, + "loss": 0.3232, + "step": 20014 + }, + { + "epoch": 0.8788873398684919, + "grad_norm": 1.5078125, + "learning_rate": 1.7983673166840825e-06, + "loss": 0.3017, + "step": 20016 + }, + { + "epoch": 0.8789751583476954, + "grad_norm": 1.5078125, + "learning_rate": 1.7957917152529869e-06, + "loss": 0.3277, + "step": 20018 + }, + { + "epoch": 0.8790629768268988, + "grad_norm": 1.5, + "learning_rate": 1.7932178908057208e-06, + "loss": 0.2818, + "step": 20020 + }, + { + "epoch": 0.8791507953061023, + "grad_norm": 1.5078125, + "learning_rate": 1.7906458435393986e-06, + "loss": 0.3283, + "step": 20022 + }, + { + "epoch": 0.8792386137853058, + "grad_norm": 1.3984375, + "learning_rate": 1.7880755736509741e-06, + "loss": 0.3126, + "step": 20024 + }, + { + "epoch": 0.8793264322645092, + "grad_norm": 1.4375, + "learning_rate": 1.7855070813372926e-06, + "loss": 0.3258, + "step": 20026 + }, + { + "epoch": 0.8794142507437127, + "grad_norm": 1.53125, + "learning_rate": 1.7829403667950412e-06, + "loss": 0.3533, + "step": 20028 + }, + { + "epoch": 0.8795020692229162, + "grad_norm": 1.4921875, + "learning_rate": 1.7803754302207848e-06, + "loss": 0.3357, + "step": 20030 + }, + { + "epoch": 0.8795898877021198, + "grad_norm": 1.453125, + "learning_rate": 1.7778122718109441e-06, + "loss": 0.318, + "step": 20032 + }, + { + "epoch": 0.8796777061813232, + "grad_norm": 1.453125, + "learning_rate": 1.7752508917618011e-06, + "loss": 0.3047, + "step": 20034 + }, + { + "epoch": 0.8797655246605267, + "grad_norm": 1.4453125, + "learning_rate": 1.7726912902695152e-06, + "loss": 0.3174, + "step": 20036 + }, + { + "epoch": 0.8798533431397302, + "grad_norm": 1.5859375, + "learning_rate": 1.770133467530094e-06, + "loss": 0.3134, + "step": 20038 + }, + { + "epoch": 0.8799411616189337, + "grad_norm": 1.3984375, + "learning_rate": 1.7675774237394278e-06, + "loss": 0.2914, + "step": 20040 + }, + { + "epoch": 0.8800289800981371, + "grad_norm": 1.5546875, + "learning_rate": 1.765023159093246e-06, + "loss": 0.3218, + "step": 20042 + }, + { + "epoch": 0.8801167985773406, + "grad_norm": 1.484375, + "learning_rate": 1.7624706737871643e-06, + "loss": 0.3444, + "step": 20044 + }, + { + "epoch": 0.8802046170565441, + "grad_norm": 1.4765625, + "learning_rate": 1.7599199680166462e-06, + "loss": 0.3358, + "step": 20046 + }, + { + "epoch": 0.8802924355357477, + "grad_norm": 1.5546875, + "learning_rate": 1.7573710419770267e-06, + "loss": 0.3213, + "step": 20048 + }, + { + "epoch": 0.8803802540149511, + "grad_norm": 1.4921875, + "learning_rate": 1.7548238958635082e-06, + "loss": 0.2897, + "step": 20050 + }, + { + "epoch": 0.8804680724941546, + "grad_norm": 1.453125, + "learning_rate": 1.7522785298711457e-06, + "loss": 0.3444, + "step": 20052 + }, + { + "epoch": 0.8805558909733581, + "grad_norm": 1.5390625, + "learning_rate": 1.7497349441948668e-06, + "loss": 0.2972, + "step": 20054 + }, + { + "epoch": 0.8806437094525615, + "grad_norm": 1.46875, + "learning_rate": 1.7471931390294627e-06, + "loss": 0.3189, + "step": 20056 + }, + { + "epoch": 0.880731527931765, + "grad_norm": 1.484375, + "learning_rate": 1.744653114569575e-06, + "loss": 0.3152, + "step": 20058 + }, + { + "epoch": 0.8808193464109685, + "grad_norm": 1.5, + "learning_rate": 1.7421148710097312e-06, + "loss": 0.3031, + "step": 20060 + }, + { + "epoch": 0.8809071648901721, + "grad_norm": 1.4921875, + "learning_rate": 1.7395784085443068e-06, + "loss": 0.3525, + "step": 20062 + }, + { + "epoch": 0.8809949833693755, + "grad_norm": 1.4375, + "learning_rate": 1.737043727367546e-06, + "loss": 0.3036, + "step": 20064 + }, + { + "epoch": 0.881082801848579, + "grad_norm": 1.4921875, + "learning_rate": 1.7345108276735467e-06, + "loss": 0.3163, + "step": 20066 + }, + { + "epoch": 0.8811706203277825, + "grad_norm": 1.4375, + "learning_rate": 1.7319797096562867e-06, + "loss": 0.3123, + "step": 20068 + }, + { + "epoch": 0.881258438806986, + "grad_norm": 1.4765625, + "learning_rate": 1.729450373509603e-06, + "loss": 0.2987, + "step": 20070 + }, + { + "epoch": 0.8813462572861894, + "grad_norm": 1.546875, + "learning_rate": 1.726922819427182e-06, + "loss": 0.3379, + "step": 20072 + }, + { + "epoch": 0.8814340757653929, + "grad_norm": 1.484375, + "learning_rate": 1.724397047602594e-06, + "loss": 0.3099, + "step": 20074 + }, + { + "epoch": 0.8815218942445964, + "grad_norm": 1.4296875, + "learning_rate": 1.7218730582292563e-06, + "loss": 0.3147, + "step": 20076 + }, + { + "epoch": 0.8816097127238, + "grad_norm": 1.5390625, + "learning_rate": 1.719350851500462e-06, + "loss": 0.3137, + "step": 20078 + }, + { + "epoch": 0.8816975312030034, + "grad_norm": 1.4375, + "learning_rate": 1.7168304276093616e-06, + "loss": 0.3025, + "step": 20080 + }, + { + "epoch": 0.8817853496822069, + "grad_norm": 1.390625, + "learning_rate": 1.714311786748965e-06, + "loss": 0.3322, + "step": 20082 + }, + { + "epoch": 0.8818731681614104, + "grad_norm": 1.4609375, + "learning_rate": 1.7117949291121565e-06, + "loss": 0.3313, + "step": 20084 + }, + { + "epoch": 0.8819609866406138, + "grad_norm": 1.4453125, + "learning_rate": 1.7092798548916738e-06, + "loss": 0.3306, + "step": 20086 + }, + { + "epoch": 0.8820488051198173, + "grad_norm": 1.5078125, + "learning_rate": 1.706766564280124e-06, + "loss": 0.3208, + "step": 20088 + }, + { + "epoch": 0.8821366235990208, + "grad_norm": 1.4296875, + "learning_rate": 1.704255057469975e-06, + "loss": 0.3011, + "step": 20090 + }, + { + "epoch": 0.8822244420782243, + "grad_norm": 1.4375, + "learning_rate": 1.7017453346535595e-06, + "loss": 0.3052, + "step": 20092 + }, + { + "epoch": 0.8823122605574278, + "grad_norm": 1.453125, + "learning_rate": 1.6992373960230707e-06, + "loss": 0.3114, + "step": 20094 + }, + { + "epoch": 0.8824000790366313, + "grad_norm": 1.484375, + "learning_rate": 1.6967312417705634e-06, + "loss": 0.2927, + "step": 20096 + }, + { + "epoch": 0.8824878975158348, + "grad_norm": 1.4921875, + "learning_rate": 1.6942268720879672e-06, + "loss": 0.2854, + "step": 20098 + }, + { + "epoch": 0.8825757159950383, + "grad_norm": 1.5078125, + "learning_rate": 1.6917242871670596e-06, + "loss": 0.3352, + "step": 20100 + }, + { + "epoch": 0.8826635344742417, + "grad_norm": 1.453125, + "learning_rate": 1.6892234871994983e-06, + "loss": 0.3241, + "step": 20102 + }, + { + "epoch": 0.8827513529534452, + "grad_norm": 1.5234375, + "learning_rate": 1.686724472376791e-06, + "loss": 0.3028, + "step": 20104 + }, + { + "epoch": 0.8828391714326487, + "grad_norm": 1.4921875, + "learning_rate": 1.6842272428903073e-06, + "loss": 0.3198, + "step": 20106 + }, + { + "epoch": 0.8829269899118523, + "grad_norm": 1.484375, + "learning_rate": 1.6817317989312935e-06, + "loss": 0.3084, + "step": 20108 + }, + { + "epoch": 0.8830148083910557, + "grad_norm": 1.484375, + "learning_rate": 1.6792381406908475e-06, + "loss": 0.3003, + "step": 20110 + }, + { + "epoch": 0.8831026268702592, + "grad_norm": 1.4765625, + "learning_rate": 1.6767462683599355e-06, + "loss": 0.3089, + "step": 20112 + }, + { + "epoch": 0.8831904453494627, + "grad_norm": 1.4765625, + "learning_rate": 1.6742561821293829e-06, + "loss": 0.2854, + "step": 20114 + }, + { + "epoch": 0.8832782638286661, + "grad_norm": 1.5, + "learning_rate": 1.671767882189887e-06, + "loss": 0.3033, + "step": 20116 + }, + { + "epoch": 0.8833660823078696, + "grad_norm": 1.46875, + "learning_rate": 1.6692813687320008e-06, + "loss": 0.33, + "step": 20118 + }, + { + "epoch": 0.8834539007870731, + "grad_norm": 1.625, + "learning_rate": 1.6667966419461333e-06, + "loss": 0.3175, + "step": 20120 + }, + { + "epoch": 0.8835417192662766, + "grad_norm": 1.46875, + "learning_rate": 1.6643137020225824e-06, + "loss": 0.3049, + "step": 20122 + }, + { + "epoch": 0.8836295377454801, + "grad_norm": 1.3984375, + "learning_rate": 1.6618325491514736e-06, + "loss": 0.3186, + "step": 20124 + }, + { + "epoch": 0.8837173562246836, + "grad_norm": 1.5390625, + "learning_rate": 1.6593531835228244e-06, + "loss": 0.3149, + "step": 20126 + }, + { + "epoch": 0.8838051747038871, + "grad_norm": 1.5, + "learning_rate": 1.6568756053265023e-06, + "loss": 0.3189, + "step": 20128 + }, + { + "epoch": 0.8838929931830906, + "grad_norm": 1.71875, + "learning_rate": 1.6543998147522444e-06, + "loss": 0.3124, + "step": 20130 + }, + { + "epoch": 0.883980811662294, + "grad_norm": 1.40625, + "learning_rate": 1.6519258119896463e-06, + "loss": 0.2992, + "step": 20132 + }, + { + "epoch": 0.8840686301414975, + "grad_norm": 1.46875, + "learning_rate": 1.6494535972281623e-06, + "loss": 0.3163, + "step": 20134 + }, + { + "epoch": 0.884156448620701, + "grad_norm": 1.4921875, + "learning_rate": 1.6469831706571237e-06, + "loss": 0.3111, + "step": 20136 + }, + { + "epoch": 0.8842442670999044, + "grad_norm": 1.4453125, + "learning_rate": 1.6445145324657075e-06, + "loss": 0.3072, + "step": 20138 + }, + { + "epoch": 0.884332085579108, + "grad_norm": 1.5, + "learning_rate": 1.6420476828429704e-06, + "loss": 0.3367, + "step": 20140 + }, + { + "epoch": 0.8844199040583115, + "grad_norm": 1.453125, + "learning_rate": 1.6395826219778226e-06, + "loss": 0.3162, + "step": 20142 + }, + { + "epoch": 0.884507722537515, + "grad_norm": 1.4921875, + "learning_rate": 1.6371193500590325e-06, + "loss": 0.3177, + "step": 20144 + }, + { + "epoch": 0.8845955410167184, + "grad_norm": 1.4140625, + "learning_rate": 1.6346578672752467e-06, + "loss": 0.3382, + "step": 20146 + }, + { + "epoch": 0.8846833594959219, + "grad_norm": 1.59375, + "learning_rate": 1.6321981738149584e-06, + "loss": 0.3199, + "step": 20148 + }, + { + "epoch": 0.8847711779751254, + "grad_norm": 1.4921875, + "learning_rate": 1.6297402698665392e-06, + "loss": 0.3121, + "step": 20150 + }, + { + "epoch": 0.8848589964543289, + "grad_norm": 1.4921875, + "learning_rate": 1.6272841556182139e-06, + "loss": 0.3434, + "step": 20152 + }, + { + "epoch": 0.8849468149335324, + "grad_norm": 1.5, + "learning_rate": 1.6248298312580706e-06, + "loss": 0.2925, + "step": 20154 + }, + { + "epoch": 0.8850346334127359, + "grad_norm": 1.5390625, + "learning_rate": 1.6223772969740592e-06, + "loss": 0.3513, + "step": 20156 + }, + { + "epoch": 0.8851224518919394, + "grad_norm": 1.4765625, + "learning_rate": 1.6199265529539987e-06, + "loss": 0.3147, + "step": 20158 + }, + { + "epoch": 0.8852102703711429, + "grad_norm": 1.4296875, + "learning_rate": 1.6174775993855695e-06, + "loss": 0.303, + "step": 20160 + }, + { + "epoch": 0.8852980888503463, + "grad_norm": 1.4765625, + "learning_rate": 1.615030436456308e-06, + "loss": 0.3321, + "step": 20162 + }, + { + "epoch": 0.8853859073295498, + "grad_norm": 1.3828125, + "learning_rate": 1.6125850643536221e-06, + "loss": 0.3239, + "step": 20164 + }, + { + "epoch": 0.8854737258087533, + "grad_norm": 1.4765625, + "learning_rate": 1.6101414832647822e-06, + "loss": 0.3308, + "step": 20166 + }, + { + "epoch": 0.8855615442879567, + "grad_norm": 1.46875, + "learning_rate": 1.6076996933769073e-06, + "loss": 0.3111, + "step": 20168 + }, + { + "epoch": 0.8856493627671603, + "grad_norm": 1.5078125, + "learning_rate": 1.6052596948770037e-06, + "loss": 0.3035, + "step": 20170 + }, + { + "epoch": 0.8857371812463638, + "grad_norm": 1.5, + "learning_rate": 1.6028214879519166e-06, + "loss": 0.3051, + "step": 20172 + }, + { + "epoch": 0.8858249997255673, + "grad_norm": 1.5234375, + "learning_rate": 1.6003850727883713e-06, + "loss": 0.2882, + "step": 20174 + }, + { + "epoch": 0.8859128182047707, + "grad_norm": 1.4296875, + "learning_rate": 1.5979504495729436e-06, + "loss": 0.3004, + "step": 20176 + }, + { + "epoch": 0.8860006366839742, + "grad_norm": 1.578125, + "learning_rate": 1.5955176184920844e-06, + "loss": 0.3127, + "step": 20178 + }, + { + "epoch": 0.8860884551631777, + "grad_norm": 1.4765625, + "learning_rate": 1.5930865797320948e-06, + "loss": 0.3208, + "step": 20180 + }, + { + "epoch": 0.8861762736423812, + "grad_norm": 1.5859375, + "learning_rate": 1.5906573334791424e-06, + "loss": 0.321, + "step": 20182 + }, + { + "epoch": 0.8862640921215846, + "grad_norm": 1.5234375, + "learning_rate": 1.588229879919273e-06, + "loss": 0.3277, + "step": 20184 + }, + { + "epoch": 0.8863519106007882, + "grad_norm": 1.46875, + "learning_rate": 1.5858042192383626e-06, + "loss": 0.3402, + "step": 20186 + }, + { + "epoch": 0.8864397290799917, + "grad_norm": 1.5078125, + "learning_rate": 1.5833803516221795e-06, + "loss": 0.321, + "step": 20188 + }, + { + "epoch": 0.8865275475591952, + "grad_norm": 1.3984375, + "learning_rate": 1.580958277256342e-06, + "loss": 0.3062, + "step": 20190 + }, + { + "epoch": 0.8866153660383986, + "grad_norm": 1.5703125, + "learning_rate": 1.5785379963263347e-06, + "loss": 0.3405, + "step": 20192 + }, + { + "epoch": 0.8867031845176021, + "grad_norm": 1.6484375, + "learning_rate": 1.5761195090175041e-06, + "loss": 0.3248, + "step": 20194 + }, + { + "epoch": 0.8867910029968056, + "grad_norm": 1.453125, + "learning_rate": 1.5737028155150517e-06, + "loss": 0.3085, + "step": 20196 + }, + { + "epoch": 0.886878821476009, + "grad_norm": 1.546875, + "learning_rate": 1.5712879160040578e-06, + "loss": 0.3068, + "step": 20198 + }, + { + "epoch": 0.8869666399552125, + "grad_norm": 1.4609375, + "learning_rate": 1.568874810669449e-06, + "loss": 0.3134, + "step": 20200 + }, + { + "epoch": 0.8870544584344161, + "grad_norm": 1.46875, + "learning_rate": 1.5664634996960248e-06, + "loss": 0.3131, + "step": 20202 + }, + { + "epoch": 0.8871422769136196, + "grad_norm": 1.4609375, + "learning_rate": 1.564053983268446e-06, + "loss": 0.318, + "step": 20204 + }, + { + "epoch": 0.887230095392823, + "grad_norm": 1.515625, + "learning_rate": 1.5616462615712257e-06, + "loss": 0.2956, + "step": 20206 + }, + { + "epoch": 0.8873179138720265, + "grad_norm": 1.421875, + "learning_rate": 1.5592403347887558e-06, + "loss": 0.2928, + "step": 20208 + }, + { + "epoch": 0.88740573235123, + "grad_norm": 1.4296875, + "learning_rate": 1.5568362031052803e-06, + "loss": 0.3236, + "step": 20210 + }, + { + "epoch": 0.8874935508304335, + "grad_norm": 1.4453125, + "learning_rate": 1.5544338667049075e-06, + "loss": 0.3086, + "step": 20212 + }, + { + "epoch": 0.8875813693096369, + "grad_norm": 1.5859375, + "learning_rate": 1.5520333257716125e-06, + "loss": 0.3167, + "step": 20214 + }, + { + "epoch": 0.8876691877888405, + "grad_norm": 1.5, + "learning_rate": 1.549634580489226e-06, + "loss": 0.3088, + "step": 20216 + }, + { + "epoch": 0.887757006268044, + "grad_norm": 1.4921875, + "learning_rate": 1.5472376310414428e-06, + "loss": 0.2893, + "step": 20218 + }, + { + "epoch": 0.8878448247472475, + "grad_norm": 1.546875, + "learning_rate": 1.5448424776118215e-06, + "loss": 0.3047, + "step": 20220 + }, + { + "epoch": 0.8879326432264509, + "grad_norm": 1.4609375, + "learning_rate": 1.5424491203837903e-06, + "loss": 0.2981, + "step": 20222 + }, + { + "epoch": 0.8880204617056544, + "grad_norm": 1.46875, + "learning_rate": 1.5400575595406226e-06, + "loss": 0.3243, + "step": 20224 + }, + { + "epoch": 0.8881082801848579, + "grad_norm": 1.4921875, + "learning_rate": 1.5376677952654767e-06, + "loss": 0.3125, + "step": 20226 + }, + { + "epoch": 0.8881960986640614, + "grad_norm": 1.53125, + "learning_rate": 1.5352798277413539e-06, + "loss": 0.3246, + "step": 20228 + }, + { + "epoch": 0.8882839171432648, + "grad_norm": 1.5546875, + "learning_rate": 1.5328936571511244e-06, + "loss": 0.3198, + "step": 20230 + }, + { + "epoch": 0.8883717356224684, + "grad_norm": 1.5703125, + "learning_rate": 1.530509283677528e-06, + "loss": 0.3416, + "step": 20232 + }, + { + "epoch": 0.8884595541016719, + "grad_norm": 1.5234375, + "learning_rate": 1.5281267075031497e-06, + "loss": 0.313, + "step": 20234 + }, + { + "epoch": 0.8885473725808753, + "grad_norm": 1.4765625, + "learning_rate": 1.5257459288104625e-06, + "loss": 0.3235, + "step": 20236 + }, + { + "epoch": 0.8886351910600788, + "grad_norm": 1.4296875, + "learning_rate": 1.5233669477817737e-06, + "loss": 0.2899, + "step": 20238 + }, + { + "epoch": 0.8887230095392823, + "grad_norm": 1.4453125, + "learning_rate": 1.520989764599276e-06, + "loss": 0.3083, + "step": 20240 + }, + { + "epoch": 0.8888108280184858, + "grad_norm": 1.4921875, + "learning_rate": 1.5186143794450103e-06, + "loss": 0.3121, + "step": 20242 + }, + { + "epoch": 0.8888986464976892, + "grad_norm": 1.4140625, + "learning_rate": 1.516240792500878e-06, + "loss": 0.3316, + "step": 20244 + }, + { + "epoch": 0.8889864649768927, + "grad_norm": 1.515625, + "learning_rate": 1.5138690039486675e-06, + "loss": 0.2773, + "step": 20246 + }, + { + "epoch": 0.8890742834560963, + "grad_norm": 1.4609375, + "learning_rate": 1.5114990139699886e-06, + "loss": 0.3228, + "step": 20248 + }, + { + "epoch": 0.8891621019352998, + "grad_norm": 1.53125, + "learning_rate": 1.5091308227463492e-06, + "loss": 0.3265, + "step": 20250 + }, + { + "epoch": 0.8892499204145032, + "grad_norm": 1.4453125, + "learning_rate": 1.5067644304590983e-06, + "loss": 0.3161, + "step": 20252 + }, + { + "epoch": 0.8893377388937067, + "grad_norm": 1.4375, + "learning_rate": 1.5043998372894634e-06, + "loss": 0.3223, + "step": 20254 + }, + { + "epoch": 0.8894255573729102, + "grad_norm": 1.4765625, + "learning_rate": 1.5020370434185221e-06, + "loss": 0.3077, + "step": 20256 + }, + { + "epoch": 0.8895133758521137, + "grad_norm": 1.5390625, + "learning_rate": 1.4996760490272127e-06, + "loss": 0.3164, + "step": 20258 + }, + { + "epoch": 0.8896011943313171, + "grad_norm": 1.5234375, + "learning_rate": 1.4973168542963489e-06, + "loss": 0.299, + "step": 20260 + }, + { + "epoch": 0.8896890128105207, + "grad_norm": 1.484375, + "learning_rate": 1.494959459406589e-06, + "loss": 0.3543, + "step": 20262 + }, + { + "epoch": 0.8897768312897242, + "grad_norm": 1.515625, + "learning_rate": 1.4926038645384749e-06, + "loss": 0.3334, + "step": 20264 + }, + { + "epoch": 0.8898646497689277, + "grad_norm": 1.46875, + "learning_rate": 1.4902500698723897e-06, + "loss": 0.3223, + "step": 20266 + }, + { + "epoch": 0.8899524682481311, + "grad_norm": 1.5234375, + "learning_rate": 1.4878980755885868e-06, + "loss": 0.3124, + "step": 20268 + }, + { + "epoch": 0.8900402867273346, + "grad_norm": 1.46875, + "learning_rate": 1.4855478818671887e-06, + "loss": 0.3265, + "step": 20270 + }, + { + "epoch": 0.8901281052065381, + "grad_norm": 1.40625, + "learning_rate": 1.483199488888168e-06, + "loss": 0.3368, + "step": 20272 + }, + { + "epoch": 0.8902159236857415, + "grad_norm": 1.4453125, + "learning_rate": 1.4808528968313695e-06, + "loss": 0.3398, + "step": 20274 + }, + { + "epoch": 0.890303742164945, + "grad_norm": 1.3828125, + "learning_rate": 1.4785081058764972e-06, + "loss": 0.3304, + "step": 20276 + }, + { + "epoch": 0.8903915606441486, + "grad_norm": 1.4609375, + "learning_rate": 1.4761651162031099e-06, + "loss": 0.3183, + "step": 20278 + }, + { + "epoch": 0.8904793791233521, + "grad_norm": 1.5234375, + "learning_rate": 1.473823927990639e-06, + "loss": 0.3324, + "step": 20280 + }, + { + "epoch": 0.8905671976025555, + "grad_norm": 1.4375, + "learning_rate": 1.4714845414183665e-06, + "loss": 0.3068, + "step": 20282 + }, + { + "epoch": 0.890655016081759, + "grad_norm": 1.4453125, + "learning_rate": 1.4691469566654514e-06, + "loss": 0.3171, + "step": 20284 + }, + { + "epoch": 0.8907428345609625, + "grad_norm": 1.5078125, + "learning_rate": 1.466811173910898e-06, + "loss": 0.3047, + "step": 20286 + }, + { + "epoch": 0.890830653040166, + "grad_norm": 1.53125, + "learning_rate": 1.4644771933335938e-06, + "loss": 0.3207, + "step": 20288 + }, + { + "epoch": 0.8909184715193694, + "grad_norm": 1.53125, + "learning_rate": 1.4621450151122652e-06, + "loss": 0.3226, + "step": 20290 + }, + { + "epoch": 0.8910062899985729, + "grad_norm": 1.4140625, + "learning_rate": 1.4598146394255109e-06, + "loss": 0.3179, + "step": 20292 + }, + { + "epoch": 0.8910941084777765, + "grad_norm": 1.46875, + "learning_rate": 1.4574860664517964e-06, + "loss": 0.2931, + "step": 20294 + }, + { + "epoch": 0.89118192695698, + "grad_norm": 1.4921875, + "learning_rate": 1.4551592963694404e-06, + "loss": 0.305, + "step": 20296 + }, + { + "epoch": 0.8912697454361834, + "grad_norm": 1.6171875, + "learning_rate": 1.4528343293566337e-06, + "loss": 0.288, + "step": 20298 + }, + { + "epoch": 0.8913575639153869, + "grad_norm": 1.5390625, + "learning_rate": 1.4505111655914199e-06, + "loss": 0.3151, + "step": 20300 + }, + { + "epoch": 0.8914453823945904, + "grad_norm": 1.484375, + "learning_rate": 1.4481898052517013e-06, + "loss": 0.339, + "step": 20302 + }, + { + "epoch": 0.8915332008737938, + "grad_norm": 1.4375, + "learning_rate": 1.4458702485152549e-06, + "loss": 0.2906, + "step": 20304 + }, + { + "epoch": 0.8916210193529973, + "grad_norm": 1.5859375, + "learning_rate": 1.4435524955597134e-06, + "loss": 0.3058, + "step": 20306 + }, + { + "epoch": 0.8917088378322009, + "grad_norm": 1.4609375, + "learning_rate": 1.4412365465625689e-06, + "loss": 0.3228, + "step": 20308 + }, + { + "epoch": 0.8917966563114044, + "grad_norm": 1.5078125, + "learning_rate": 1.4389224017011704e-06, + "loss": 0.3381, + "step": 20310 + }, + { + "epoch": 0.8918844747906078, + "grad_norm": 1.5, + "learning_rate": 1.436610061152749e-06, + "loss": 0.323, + "step": 20312 + }, + { + "epoch": 0.8919722932698113, + "grad_norm": 1.5625, + "learning_rate": 1.4342995250943735e-06, + "loss": 0.3117, + "step": 20314 + }, + { + "epoch": 0.8920601117490148, + "grad_norm": 1.4375, + "learning_rate": 1.431990793702992e-06, + "loss": 0.3206, + "step": 20316 + }, + { + "epoch": 0.8921479302282183, + "grad_norm": 1.4921875, + "learning_rate": 1.429683867155407e-06, + "loss": 0.2996, + "step": 20318 + }, + { + "epoch": 0.8922357487074217, + "grad_norm": 1.515625, + "learning_rate": 1.4273787456282771e-06, + "loss": 0.3233, + "step": 20320 + }, + { + "epoch": 0.8923235671866252, + "grad_norm": 1.4453125, + "learning_rate": 1.4250754292981366e-06, + "loss": 0.3058, + "step": 20322 + }, + { + "epoch": 0.8924113856658288, + "grad_norm": 1.453125, + "learning_rate": 1.4227739183413663e-06, + "loss": 0.3134, + "step": 20324 + }, + { + "epoch": 0.8924992041450323, + "grad_norm": 1.484375, + "learning_rate": 1.420474212934228e-06, + "loss": 0.3056, + "step": 20326 + }, + { + "epoch": 0.8925870226242357, + "grad_norm": 1.515625, + "learning_rate": 1.4181763132528253e-06, + "loss": 0.3122, + "step": 20328 + }, + { + "epoch": 0.8926748411034392, + "grad_norm": 1.4609375, + "learning_rate": 1.4158802194731285e-06, + "loss": 0.2905, + "step": 20330 + }, + { + "epoch": 0.8927626595826427, + "grad_norm": 1.515625, + "learning_rate": 1.4135859317709832e-06, + "loss": 0.3222, + "step": 20332 + }, + { + "epoch": 0.8928504780618461, + "grad_norm": 1.4140625, + "learning_rate": 1.4112934503220766e-06, + "loss": 0.2815, + "step": 20334 + }, + { + "epoch": 0.8929382965410496, + "grad_norm": 1.484375, + "learning_rate": 1.4090027753019763e-06, + "loss": 0.3155, + "step": 20336 + }, + { + "epoch": 0.8930261150202531, + "grad_norm": 1.453125, + "learning_rate": 1.4067139068861007e-06, + "loss": 0.3158, + "step": 20338 + }, + { + "epoch": 0.8931139334994567, + "grad_norm": 1.625, + "learning_rate": 1.404426845249729e-06, + "loss": 0.3042, + "step": 20340 + }, + { + "epoch": 0.8932017519786601, + "grad_norm": 1.34375, + "learning_rate": 1.4021415905680042e-06, + "loss": 0.3318, + "step": 20342 + }, + { + "epoch": 0.8932895704578636, + "grad_norm": 1.453125, + "learning_rate": 1.399858143015928e-06, + "loss": 0.3627, + "step": 20344 + }, + { + "epoch": 0.8933773889370671, + "grad_norm": 1.4453125, + "learning_rate": 1.39757650276838e-06, + "loss": 0.3007, + "step": 20346 + }, + { + "epoch": 0.8934652074162706, + "grad_norm": 1.5, + "learning_rate": 1.3952966700000764e-06, + "loss": 0.3236, + "step": 20348 + }, + { + "epoch": 0.893553025895474, + "grad_norm": 1.53125, + "learning_rate": 1.393018644885613e-06, + "loss": 0.3086, + "step": 20350 + }, + { + "epoch": 0.8936408443746775, + "grad_norm": 1.5, + "learning_rate": 1.3907424275994452e-06, + "loss": 0.284, + "step": 20352 + }, + { + "epoch": 0.893728662853881, + "grad_norm": 1.421875, + "learning_rate": 1.3884680183158748e-06, + "loss": 0.2977, + "step": 20354 + }, + { + "epoch": 0.8938164813330846, + "grad_norm": 1.4296875, + "learning_rate": 1.3861954172090907e-06, + "loss": 0.301, + "step": 20356 + }, + { + "epoch": 0.893904299812288, + "grad_norm": 1.546875, + "learning_rate": 1.3839246244531145e-06, + "loss": 0.3359, + "step": 20358 + }, + { + "epoch": 0.8939921182914915, + "grad_norm": 1.53125, + "learning_rate": 1.3816556402218573e-06, + "loss": 0.3188, + "step": 20360 + }, + { + "epoch": 0.894079936770695, + "grad_norm": 1.4140625, + "learning_rate": 1.3793884646890742e-06, + "loss": 0.3213, + "step": 20362 + }, + { + "epoch": 0.8941677552498984, + "grad_norm": 1.4921875, + "learning_rate": 1.3771230980283795e-06, + "loss": 0.3277, + "step": 20364 + }, + { + "epoch": 0.8942555737291019, + "grad_norm": 1.5859375, + "learning_rate": 1.3748595404132648e-06, + "loss": 0.3196, + "step": 20366 + }, + { + "epoch": 0.8943433922083054, + "grad_norm": 1.453125, + "learning_rate": 1.372597792017069e-06, + "loss": 0.3485, + "step": 20368 + }, + { + "epoch": 0.894431210687509, + "grad_norm": 1.4765625, + "learning_rate": 1.3703378530129984e-06, + "loss": 0.3237, + "step": 20370 + }, + { + "epoch": 0.8945190291667124, + "grad_norm": 1.4296875, + "learning_rate": 1.3680797235741171e-06, + "loss": 0.3003, + "step": 20372 + }, + { + "epoch": 0.8946068476459159, + "grad_norm": 1.4453125, + "learning_rate": 1.3658234038733591e-06, + "loss": 0.2984, + "step": 20374 + }, + { + "epoch": 0.8946946661251194, + "grad_norm": 1.4453125, + "learning_rate": 1.3635688940835056e-06, + "loss": 0.306, + "step": 20376 + }, + { + "epoch": 0.8947824846043229, + "grad_norm": 1.53125, + "learning_rate": 1.3613161943772156e-06, + "loss": 0.3302, + "step": 20378 + }, + { + "epoch": 0.8948703030835263, + "grad_norm": 1.4140625, + "learning_rate": 1.3590653049269985e-06, + "loss": 0.2974, + "step": 20380 + }, + { + "epoch": 0.8949581215627298, + "grad_norm": 1.4765625, + "learning_rate": 1.3568162259052247e-06, + "loss": 0.3034, + "step": 20382 + }, + { + "epoch": 0.8950459400419333, + "grad_norm": 1.4453125, + "learning_rate": 1.3545689574841342e-06, + "loss": 0.3418, + "step": 20384 + }, + { + "epoch": 0.8951337585211369, + "grad_norm": 1.53125, + "learning_rate": 1.352323499835817e-06, + "loss": 0.3485, + "step": 20386 + }, + { + "epoch": 0.8952215770003403, + "grad_norm": 1.484375, + "learning_rate": 1.3500798531322412e-06, + "loss": 0.3248, + "step": 20388 + }, + { + "epoch": 0.8953093954795438, + "grad_norm": 1.484375, + "learning_rate": 1.3478380175452165e-06, + "loss": 0.3054, + "step": 20390 + }, + { + "epoch": 0.8953972139587473, + "grad_norm": 1.53125, + "learning_rate": 1.3455979932464253e-06, + "loss": 0.3101, + "step": 20392 + }, + { + "epoch": 0.8954850324379507, + "grad_norm": 1.4453125, + "learning_rate": 1.3433597804074132e-06, + "loss": 0.2924, + "step": 20394 + }, + { + "epoch": 0.8955728509171542, + "grad_norm": 1.4296875, + "learning_rate": 1.3411233791995743e-06, + "loss": 0.2853, + "step": 20396 + }, + { + "epoch": 0.8956606693963577, + "grad_norm": 1.578125, + "learning_rate": 1.3388887897941877e-06, + "loss": 0.3011, + "step": 20398 + }, + { + "epoch": 0.8957484878755612, + "grad_norm": 1.484375, + "learning_rate": 1.336656012362361e-06, + "loss": 0.3603, + "step": 20400 + }, + { + "epoch": 0.8958363063547647, + "grad_norm": 1.4765625, + "learning_rate": 1.3344250470750941e-06, + "loss": 0.3254, + "step": 20402 + }, + { + "epoch": 0.8959241248339682, + "grad_norm": 1.453125, + "learning_rate": 1.3321958941032303e-06, + "loss": 0.294, + "step": 20404 + }, + { + "epoch": 0.8960119433131717, + "grad_norm": 1.5078125, + "learning_rate": 1.3299685536174749e-06, + "loss": 0.3111, + "step": 20406 + }, + { + "epoch": 0.8960997617923752, + "grad_norm": 1.5234375, + "learning_rate": 1.3277430257884055e-06, + "loss": 0.3373, + "step": 20408 + }, + { + "epoch": 0.8961875802715786, + "grad_norm": 1.546875, + "learning_rate": 1.3255193107864438e-06, + "loss": 0.3106, + "step": 20410 + }, + { + "epoch": 0.8962753987507821, + "grad_norm": 1.3828125, + "learning_rate": 1.3232974087818955e-06, + "loss": 0.328, + "step": 20412 + }, + { + "epoch": 0.8963632172299856, + "grad_norm": 1.453125, + "learning_rate": 1.321077319944905e-06, + "loss": 0.3167, + "step": 20414 + }, + { + "epoch": 0.8964510357091892, + "grad_norm": 1.5390625, + "learning_rate": 1.3188590444454863e-06, + "loss": 0.3185, + "step": 20416 + }, + { + "epoch": 0.8965388541883926, + "grad_norm": 1.5, + "learning_rate": 1.3166425824535227e-06, + "loss": 0.2994, + "step": 20418 + }, + { + "epoch": 0.8966266726675961, + "grad_norm": 1.46875, + "learning_rate": 1.3144279341387427e-06, + "loss": 0.3108, + "step": 20420 + }, + { + "epoch": 0.8967144911467996, + "grad_norm": 1.4921875, + "learning_rate": 1.312215099670755e-06, + "loss": 0.3096, + "step": 20422 + }, + { + "epoch": 0.896802309626003, + "grad_norm": 1.5390625, + "learning_rate": 1.3100040792190127e-06, + "loss": 0.3016, + "step": 20424 + }, + { + "epoch": 0.8968901281052065, + "grad_norm": 1.4140625, + "learning_rate": 1.3077948729528333e-06, + "loss": 0.3203, + "step": 20426 + }, + { + "epoch": 0.89697794658441, + "grad_norm": 1.484375, + "learning_rate": 1.3055874810414037e-06, + "loss": 0.3113, + "step": 20428 + }, + { + "epoch": 0.8970657650636135, + "grad_norm": 1.421875, + "learning_rate": 1.3033819036537665e-06, + "loss": 0.3138, + "step": 20430 + }, + { + "epoch": 0.897153583542817, + "grad_norm": 1.484375, + "learning_rate": 1.3011781409588225e-06, + "loss": 0.336, + "step": 20432 + }, + { + "epoch": 0.8972414020220205, + "grad_norm": 1.546875, + "learning_rate": 1.298976193125334e-06, + "loss": 0.304, + "step": 20434 + }, + { + "epoch": 0.897329220501224, + "grad_norm": 1.453125, + "learning_rate": 1.2967760603219358e-06, + "loss": 0.3237, + "step": 20436 + }, + { + "epoch": 0.8974170389804275, + "grad_norm": 1.453125, + "learning_rate": 1.294577742717104e-06, + "loss": 0.3243, + "step": 20438 + }, + { + "epoch": 0.8975048574596309, + "grad_norm": 1.4609375, + "learning_rate": 1.2923812404791958e-06, + "loss": 0.2941, + "step": 20440 + }, + { + "epoch": 0.8975926759388344, + "grad_norm": 1.4765625, + "learning_rate": 1.2901865537764124e-06, + "loss": 0.3097, + "step": 20442 + }, + { + "epoch": 0.8976804944180379, + "grad_norm": 1.453125, + "learning_rate": 1.2879936827768253e-06, + "loss": 0.3195, + "step": 20444 + }, + { + "epoch": 0.8977683128972413, + "grad_norm": 1.4296875, + "learning_rate": 1.2858026276483691e-06, + "loss": 0.3152, + "step": 20446 + }, + { + "epoch": 0.8978561313764449, + "grad_norm": 1.3828125, + "learning_rate": 1.2836133885588297e-06, + "loss": 0.3098, + "step": 20448 + }, + { + "epoch": 0.8979439498556484, + "grad_norm": 1.484375, + "learning_rate": 1.2814259656758643e-06, + "loss": 0.322, + "step": 20450 + }, + { + "epoch": 0.8980317683348519, + "grad_norm": 1.4765625, + "learning_rate": 1.2792403591669832e-06, + "loss": 0.2979, + "step": 20452 + }, + { + "epoch": 0.8981195868140553, + "grad_norm": 1.5234375, + "learning_rate": 1.277056569199561e-06, + "loss": 0.2918, + "step": 20454 + }, + { + "epoch": 0.8982074052932588, + "grad_norm": 1.4765625, + "learning_rate": 1.2748745959408365e-06, + "loss": 0.3074, + "step": 20456 + }, + { + "epoch": 0.8982952237724623, + "grad_norm": 1.40625, + "learning_rate": 1.2726944395578978e-06, + "loss": 0.2898, + "step": 20458 + }, + { + "epoch": 0.8983830422516658, + "grad_norm": 1.546875, + "learning_rate": 1.270516100217714e-06, + "loss": 0.3297, + "step": 20460 + }, + { + "epoch": 0.8984708607308693, + "grad_norm": 1.5390625, + "learning_rate": 1.2683395780870883e-06, + "loss": 0.3182, + "step": 20462 + }, + { + "epoch": 0.8985586792100728, + "grad_norm": 1.578125, + "learning_rate": 1.266164873332712e-06, + "loss": 0.3075, + "step": 20464 + }, + { + "epoch": 0.8986464976892763, + "grad_norm": 1.515625, + "learning_rate": 1.2639919861211158e-06, + "loss": 0.3312, + "step": 20466 + }, + { + "epoch": 0.8987343161684798, + "grad_norm": 1.53125, + "learning_rate": 1.261820916618703e-06, + "loss": 0.3273, + "step": 20468 + }, + { + "epoch": 0.8988221346476832, + "grad_norm": 1.4296875, + "learning_rate": 1.2596516649917373e-06, + "loss": 0.3323, + "step": 20470 + }, + { + "epoch": 0.8989099531268867, + "grad_norm": 1.4609375, + "learning_rate": 1.2574842314063335e-06, + "loss": 0.3316, + "step": 20472 + }, + { + "epoch": 0.8989977716060902, + "grad_norm": 1.46875, + "learning_rate": 1.2553186160284837e-06, + "loss": 0.3054, + "step": 20474 + }, + { + "epoch": 0.8990855900852937, + "grad_norm": 1.5390625, + "learning_rate": 1.2531548190240244e-06, + "loss": 0.2856, + "step": 20476 + }, + { + "epoch": 0.8991734085644972, + "grad_norm": 1.4140625, + "learning_rate": 1.2509928405586596e-06, + "loss": 0.2936, + "step": 20478 + }, + { + "epoch": 0.8992612270437007, + "grad_norm": 1.4375, + "learning_rate": 1.2488326807979594e-06, + "loss": 0.3209, + "step": 20480 + }, + { + "epoch": 0.8993490455229042, + "grad_norm": 1.4921875, + "learning_rate": 1.2466743399073415e-06, + "loss": 0.3034, + "step": 20482 + }, + { + "epoch": 0.8994368640021076, + "grad_norm": 1.5546875, + "learning_rate": 1.2445178180521016e-06, + "loss": 0.3014, + "step": 20484 + }, + { + "epoch": 0.8995246824813111, + "grad_norm": 1.4921875, + "learning_rate": 1.2423631153973824e-06, + "loss": 0.3369, + "step": 20486 + }, + { + "epoch": 0.8996125009605146, + "grad_norm": 1.421875, + "learning_rate": 1.2402102321081854e-06, + "loss": 0.3253, + "step": 20488 + }, + { + "epoch": 0.8997003194397181, + "grad_norm": 1.46875, + "learning_rate": 1.2380591683493926e-06, + "loss": 0.3104, + "step": 20490 + }, + { + "epoch": 0.8997881379189215, + "grad_norm": 1.46875, + "learning_rate": 1.2359099242857164e-06, + "loss": 0.2955, + "step": 20492 + }, + { + "epoch": 0.8998759563981251, + "grad_norm": 1.4296875, + "learning_rate": 1.2337625000817616e-06, + "loss": 0.3055, + "step": 20494 + }, + { + "epoch": 0.8999637748773286, + "grad_norm": 1.5078125, + "learning_rate": 1.2316168959019658e-06, + "loss": 0.3278, + "step": 20496 + }, + { + "epoch": 0.9000515933565321, + "grad_norm": 1.484375, + "learning_rate": 1.2294731119106479e-06, + "loss": 0.319, + "step": 20498 + }, + { + "epoch": 0.9001394118357355, + "grad_norm": 1.5078125, + "learning_rate": 1.2273311482719764e-06, + "loss": 0.3076, + "step": 20500 + }, + { + "epoch": 0.900227230314939, + "grad_norm": 1.40625, + "learning_rate": 1.2251910051499865e-06, + "loss": 0.2989, + "step": 20502 + }, + { + "epoch": 0.9003150487941425, + "grad_norm": 1.5703125, + "learning_rate": 1.2230526827085698e-06, + "loss": 0.3129, + "step": 20504 + }, + { + "epoch": 0.900402867273346, + "grad_norm": 1.3984375, + "learning_rate": 1.2209161811114727e-06, + "loss": 0.3273, + "step": 20506 + }, + { + "epoch": 0.9004906857525495, + "grad_norm": 1.75, + "learning_rate": 1.2187815005223202e-06, + "loss": 0.3165, + "step": 20508 + }, + { + "epoch": 0.900578504231753, + "grad_norm": 1.546875, + "learning_rate": 1.2166486411045786e-06, + "loss": 0.33, + "step": 20510 + }, + { + "epoch": 0.9006663227109565, + "grad_norm": 1.484375, + "learning_rate": 1.2145176030215866e-06, + "loss": 0.3129, + "step": 20512 + }, + { + "epoch": 0.90075414119016, + "grad_norm": 1.421875, + "learning_rate": 1.2123883864365388e-06, + "loss": 0.3177, + "step": 20514 + }, + { + "epoch": 0.9008419596693634, + "grad_norm": 1.546875, + "learning_rate": 1.210260991512488e-06, + "loss": 0.3488, + "step": 20516 + }, + { + "epoch": 0.9009297781485669, + "grad_norm": 1.5625, + "learning_rate": 1.208135418412354e-06, + "loss": 0.3142, + "step": 20518 + }, + { + "epoch": 0.9010175966277704, + "grad_norm": 1.5390625, + "learning_rate": 1.2060116672989118e-06, + "loss": 0.3306, + "step": 20520 + }, + { + "epoch": 0.9011054151069738, + "grad_norm": 1.453125, + "learning_rate": 1.203889738334807e-06, + "loss": 0.3044, + "step": 20522 + }, + { + "epoch": 0.9011932335861774, + "grad_norm": 1.515625, + "learning_rate": 1.2017696316825228e-06, + "loss": 0.3305, + "step": 20524 + }, + { + "epoch": 0.9012810520653809, + "grad_norm": 1.453125, + "learning_rate": 1.1996513475044297e-06, + "loss": 0.3275, + "step": 20526 + }, + { + "epoch": 0.9013688705445844, + "grad_norm": 1.515625, + "learning_rate": 1.1975348859627395e-06, + "loss": 0.3127, + "step": 20528 + }, + { + "epoch": 0.9014566890237878, + "grad_norm": 1.5078125, + "learning_rate": 1.195420247219531e-06, + "loss": 0.3118, + "step": 20530 + }, + { + "epoch": 0.9015445075029913, + "grad_norm": 1.453125, + "learning_rate": 1.193307431436752e-06, + "loss": 0.3125, + "step": 20532 + }, + { + "epoch": 0.9016323259821948, + "grad_norm": 1.4296875, + "learning_rate": 1.1911964387761904e-06, + "loss": 0.2804, + "step": 20534 + }, + { + "epoch": 0.9017201444613983, + "grad_norm": 1.4765625, + "learning_rate": 1.1890872693995165e-06, + "loss": 0.3351, + "step": 20536 + }, + { + "epoch": 0.9018079629406017, + "grad_norm": 1.4375, + "learning_rate": 1.1869799234682482e-06, + "loss": 0.3175, + "step": 20538 + }, + { + "epoch": 0.9018957814198053, + "grad_norm": 1.5078125, + "learning_rate": 1.1848744011437623e-06, + "loss": 0.3165, + "step": 20540 + }, + { + "epoch": 0.9019835998990088, + "grad_norm": 1.5625, + "learning_rate": 1.1827707025873074e-06, + "loss": 0.3231, + "step": 20542 + }, + { + "epoch": 0.9020714183782123, + "grad_norm": 1.453125, + "learning_rate": 1.1806688279599798e-06, + "loss": 0.3045, + "step": 20544 + }, + { + "epoch": 0.9021592368574157, + "grad_norm": 1.5078125, + "learning_rate": 1.1785687774227422e-06, + "loss": 0.3261, + "step": 20546 + }, + { + "epoch": 0.9022470553366192, + "grad_norm": 1.484375, + "learning_rate": 1.1764705511364215e-06, + "loss": 0.2926, + "step": 20548 + }, + { + "epoch": 0.9023348738158227, + "grad_norm": 1.453125, + "learning_rate": 1.1743741492616922e-06, + "loss": 0.2985, + "step": 20550 + }, + { + "epoch": 0.9024226922950261, + "grad_norm": 1.4296875, + "learning_rate": 1.172279571959109e-06, + "loss": 0.2882, + "step": 20552 + }, + { + "epoch": 0.9025105107742296, + "grad_norm": 1.484375, + "learning_rate": 1.170186819389063e-06, + "loss": 0.3183, + "step": 20554 + }, + { + "epoch": 0.9025983292534332, + "grad_norm": 1.5625, + "learning_rate": 1.1680958917118235e-06, + "loss": 0.2962, + "step": 20556 + }, + { + "epoch": 0.9026861477326367, + "grad_norm": 1.4765625, + "learning_rate": 1.1660067890875092e-06, + "loss": 0.3091, + "step": 20558 + }, + { + "epoch": 0.9027739662118401, + "grad_norm": 1.5625, + "learning_rate": 1.1639195116761148e-06, + "loss": 0.328, + "step": 20560 + }, + { + "epoch": 0.9028617846910436, + "grad_norm": 1.46875, + "learning_rate": 1.161834059637476e-06, + "loss": 0.2942, + "step": 20562 + }, + { + "epoch": 0.9029496031702471, + "grad_norm": 1.5, + "learning_rate": 1.1597504331312986e-06, + "loss": 0.2917, + "step": 20564 + }, + { + "epoch": 0.9030374216494506, + "grad_norm": 1.4296875, + "learning_rate": 1.1576686323171493e-06, + "loss": 0.297, + "step": 20566 + }, + { + "epoch": 0.903125240128654, + "grad_norm": 1.5546875, + "learning_rate": 1.1555886573544478e-06, + "loss": 0.336, + "step": 20568 + }, + { + "epoch": 0.9032130586078576, + "grad_norm": 1.625, + "learning_rate": 1.1535105084024862e-06, + "loss": 0.3405, + "step": 20570 + }, + { + "epoch": 0.9033008770870611, + "grad_norm": 1.5234375, + "learning_rate": 1.1514341856204037e-06, + "loss": 0.3324, + "step": 20572 + }, + { + "epoch": 0.9033886955662646, + "grad_norm": 1.65625, + "learning_rate": 1.149359689167212e-06, + "loss": 0.3195, + "step": 20574 + }, + { + "epoch": 0.903476514045468, + "grad_norm": 1.578125, + "learning_rate": 1.1472870192017732e-06, + "loss": 0.319, + "step": 20576 + }, + { + "epoch": 0.9035643325246715, + "grad_norm": 1.484375, + "learning_rate": 1.1452161758828074e-06, + "loss": 0.3324, + "step": 20578 + }, + { + "epoch": 0.903652151003875, + "grad_norm": 1.4921875, + "learning_rate": 1.1431471593689097e-06, + "loss": 0.3128, + "step": 20580 + }, + { + "epoch": 0.9037399694830784, + "grad_norm": 1.4765625, + "learning_rate": 1.1410799698185204e-06, + "loss": 0.3055, + "step": 20582 + }, + { + "epoch": 0.9038277879622819, + "grad_norm": 1.46875, + "learning_rate": 1.139014607389946e-06, + "loss": 0.2876, + "step": 20584 + }, + { + "epoch": 0.9039156064414855, + "grad_norm": 1.4140625, + "learning_rate": 1.136951072241349e-06, + "loss": 0.3179, + "step": 20586 + }, + { + "epoch": 0.904003424920689, + "grad_norm": 1.46875, + "learning_rate": 1.134889364530764e-06, + "loss": 0.3113, + "step": 20588 + }, + { + "epoch": 0.9040912433998924, + "grad_norm": 1.5625, + "learning_rate": 1.1328294844160732e-06, + "loss": 0.3197, + "step": 20590 + }, + { + "epoch": 0.9041790618790959, + "grad_norm": 1.4453125, + "learning_rate": 1.1307714320550167e-06, + "loss": 0.3094, + "step": 20592 + }, + { + "epoch": 0.9042668803582994, + "grad_norm": 1.53125, + "learning_rate": 1.1287152076052104e-06, + "loss": 0.2775, + "step": 20594 + }, + { + "epoch": 0.9043546988375029, + "grad_norm": 1.4765625, + "learning_rate": 1.1266608112241118e-06, + "loss": 0.3146, + "step": 20596 + }, + { + "epoch": 0.9044425173167063, + "grad_norm": 1.3984375, + "learning_rate": 1.1246082430690558e-06, + "loss": 0.2975, + "step": 20598 + }, + { + "epoch": 0.9045303357959098, + "grad_norm": 1.453125, + "learning_rate": 1.1225575032972223e-06, + "loss": 0.306, + "step": 20600 + }, + { + "epoch": 0.9046181542751134, + "grad_norm": 1.4609375, + "learning_rate": 1.1205085920656556e-06, + "loss": 0.3098, + "step": 20602 + }, + { + "epoch": 0.9047059727543169, + "grad_norm": 1.453125, + "learning_rate": 1.1184615095312684e-06, + "loss": 0.3128, + "step": 20604 + }, + { + "epoch": 0.9047937912335203, + "grad_norm": 1.5078125, + "learning_rate": 1.1164162558508217e-06, + "loss": 0.2778, + "step": 20606 + }, + { + "epoch": 0.9048816097127238, + "grad_norm": 1.4453125, + "learning_rate": 1.114372831180946e-06, + "loss": 0.3463, + "step": 20608 + }, + { + "epoch": 0.9049694281919273, + "grad_norm": 1.515625, + "learning_rate": 1.112331235678124e-06, + "loss": 0.3002, + "step": 20610 + }, + { + "epoch": 0.9050572466711307, + "grad_norm": 1.4765625, + "learning_rate": 1.1102914694987004e-06, + "loss": 0.3101, + "step": 20612 + }, + { + "epoch": 0.9051450651503342, + "grad_norm": 1.5078125, + "learning_rate": 1.1082535327988864e-06, + "loss": 0.313, + "step": 20614 + }, + { + "epoch": 0.9052328836295378, + "grad_norm": 1.4765625, + "learning_rate": 1.1062174257347402e-06, + "loss": 0.3067, + "step": 20616 + }, + { + "epoch": 0.9053207021087413, + "grad_norm": 1.515625, + "learning_rate": 1.1041831484621956e-06, + "loss": 0.3436, + "step": 20618 + }, + { + "epoch": 0.9054085205879447, + "grad_norm": 1.59375, + "learning_rate": 1.102150701137028e-06, + "loss": 0.3385, + "step": 20620 + }, + { + "epoch": 0.9054963390671482, + "grad_norm": 1.4765625, + "learning_rate": 1.1001200839148934e-06, + "loss": 0.3233, + "step": 20622 + }, + { + "epoch": 0.9055841575463517, + "grad_norm": 1.484375, + "learning_rate": 1.0980912969512897e-06, + "loss": 0.3154, + "step": 20624 + }, + { + "epoch": 0.9056719760255552, + "grad_norm": 1.4921875, + "learning_rate": 1.0960643404015813e-06, + "loss": 0.3367, + "step": 20626 + }, + { + "epoch": 0.9057597945047586, + "grad_norm": 1.515625, + "learning_rate": 1.0940392144210027e-06, + "loss": 0.3401, + "step": 20628 + }, + { + "epoch": 0.9058476129839621, + "grad_norm": 1.4765625, + "learning_rate": 1.092015919164624e-06, + "loss": 0.3034, + "step": 20630 + }, + { + "epoch": 0.9059354314631657, + "grad_norm": 1.46875, + "learning_rate": 1.089994454787402e-06, + "loss": 0.3058, + "step": 20632 + }, + { + "epoch": 0.9060232499423692, + "grad_norm": 1.4765625, + "learning_rate": 1.0879748214441348e-06, + "loss": 0.3177, + "step": 20634 + }, + { + "epoch": 0.9061110684215726, + "grad_norm": 1.515625, + "learning_rate": 1.0859570192894908e-06, + "loss": 0.32, + "step": 20636 + }, + { + "epoch": 0.9061988869007761, + "grad_norm": 1.375, + "learning_rate": 1.0839410484779876e-06, + "loss": 0.2909, + "step": 20638 + }, + { + "epoch": 0.9062867053799796, + "grad_norm": 1.453125, + "learning_rate": 1.0819269091640134e-06, + "loss": 0.3363, + "step": 20640 + }, + { + "epoch": 0.906374523859183, + "grad_norm": 1.421875, + "learning_rate": 1.0799146015018109e-06, + "loss": 0.3271, + "step": 20642 + }, + { + "epoch": 0.9064623423383865, + "grad_norm": 1.546875, + "learning_rate": 1.077904125645482e-06, + "loss": 0.3047, + "step": 20644 + }, + { + "epoch": 0.90655016081759, + "grad_norm": 1.671875, + "learning_rate": 1.0758954817489897e-06, + "loss": 0.3081, + "step": 20646 + }, + { + "epoch": 0.9066379792967936, + "grad_norm": 1.546875, + "learning_rate": 1.0738886699661528e-06, + "loss": 0.2996, + "step": 20648 + }, + { + "epoch": 0.906725797775997, + "grad_norm": 1.4296875, + "learning_rate": 1.0718836904506617e-06, + "loss": 0.3251, + "step": 20650 + }, + { + "epoch": 0.9068136162552005, + "grad_norm": 1.4609375, + "learning_rate": 1.0698805433560528e-06, + "loss": 0.3055, + "step": 20652 + }, + { + "epoch": 0.906901434734404, + "grad_norm": 1.5, + "learning_rate": 1.0678792288357249e-06, + "loss": 0.3177, + "step": 20654 + }, + { + "epoch": 0.9069892532136075, + "grad_norm": 1.484375, + "learning_rate": 1.0658797470429443e-06, + "loss": 0.3137, + "step": 20656 + }, + { + "epoch": 0.9070770716928109, + "grad_norm": 1.4609375, + "learning_rate": 1.0638820981308305e-06, + "loss": 0.3099, + "step": 20658 + }, + { + "epoch": 0.9071648901720144, + "grad_norm": 1.4609375, + "learning_rate": 1.0618862822523639e-06, + "loss": 0.3337, + "step": 20660 + }, + { + "epoch": 0.907252708651218, + "grad_norm": 1.4375, + "learning_rate": 1.0598922995603861e-06, + "loss": 0.3277, + "step": 20662 + }, + { + "epoch": 0.9073405271304215, + "grad_norm": 1.46875, + "learning_rate": 1.0579001502075887e-06, + "loss": 0.334, + "step": 20664 + }, + { + "epoch": 0.9074283456096249, + "grad_norm": 1.46875, + "learning_rate": 1.055909834346544e-06, + "loss": 0.3086, + "step": 20666 + }, + { + "epoch": 0.9075161640888284, + "grad_norm": 1.453125, + "learning_rate": 1.0539213521296582e-06, + "loss": 0.3132, + "step": 20668 + }, + { + "epoch": 0.9076039825680319, + "grad_norm": 1.4296875, + "learning_rate": 1.0519347037092175e-06, + "loss": 0.3115, + "step": 20670 + }, + { + "epoch": 0.9076918010472353, + "grad_norm": 1.4765625, + "learning_rate": 1.0499498892373616e-06, + "loss": 0.3169, + "step": 20672 + }, + { + "epoch": 0.9077796195264388, + "grad_norm": 1.515625, + "learning_rate": 1.0479669088660827e-06, + "loss": 0.3559, + "step": 20674 + }, + { + "epoch": 0.9078674380056423, + "grad_norm": 1.46875, + "learning_rate": 1.0459857627472396e-06, + "loss": 0.3287, + "step": 20676 + }, + { + "epoch": 0.9079552564848459, + "grad_norm": 1.515625, + "learning_rate": 1.0440064510325448e-06, + "loss": 0.3209, + "step": 20678 + }, + { + "epoch": 0.9080430749640493, + "grad_norm": 1.4765625, + "learning_rate": 1.0420289738735822e-06, + "loss": 0.3123, + "step": 20680 + }, + { + "epoch": 0.9081308934432528, + "grad_norm": 1.4375, + "learning_rate": 1.0400533314217837e-06, + "loss": 0.332, + "step": 20682 + }, + { + "epoch": 0.9082187119224563, + "grad_norm": 1.484375, + "learning_rate": 1.0380795238284446e-06, + "loss": 0.299, + "step": 20684 + }, + { + "epoch": 0.9083065304016598, + "grad_norm": 1.4921875, + "learning_rate": 1.0361075512447193e-06, + "loss": 0.3013, + "step": 20686 + }, + { + "epoch": 0.9083943488808632, + "grad_norm": 1.4921875, + "learning_rate": 1.0341374138216203e-06, + "loss": 0.314, + "step": 20688 + }, + { + "epoch": 0.9084821673600667, + "grad_norm": 1.4921875, + "learning_rate": 1.0321691117100268e-06, + "loss": 0.3044, + "step": 20690 + }, + { + "epoch": 0.9085699858392702, + "grad_norm": 1.4296875, + "learning_rate": 1.0302026450606656e-06, + "loss": 0.3105, + "step": 20692 + }, + { + "epoch": 0.9086578043184738, + "grad_norm": 1.484375, + "learning_rate": 1.0282380140241326e-06, + "loss": 0.3198, + "step": 20694 + }, + { + "epoch": 0.9087456227976772, + "grad_norm": 1.421875, + "learning_rate": 1.02627521875088e-06, + "loss": 0.3752, + "step": 20696 + }, + { + "epoch": 0.9088334412768807, + "grad_norm": 1.4453125, + "learning_rate": 1.0243142593912153e-06, + "loss": 0.3131, + "step": 20698 + }, + { + "epoch": 0.9089212597560842, + "grad_norm": 1.421875, + "learning_rate": 1.0223551360953154e-06, + "loss": 0.2975, + "step": 20700 + }, + { + "epoch": 0.9090090782352876, + "grad_norm": 1.421875, + "learning_rate": 1.020397849013205e-06, + "loss": 0.3036, + "step": 20702 + }, + { + "epoch": 0.9090968967144911, + "grad_norm": 1.5546875, + "learning_rate": 1.018442398294775e-06, + "loss": 0.285, + "step": 20704 + }, + { + "epoch": 0.9091847151936946, + "grad_norm": 1.5234375, + "learning_rate": 1.016488784089778e-06, + "loss": 0.3174, + "step": 20706 + }, + { + "epoch": 0.9092725336728981, + "grad_norm": 1.5078125, + "learning_rate": 1.0145370065478194e-06, + "loss": 0.3333, + "step": 20708 + }, + { + "epoch": 0.9093603521521016, + "grad_norm": 1.5703125, + "learning_rate": 1.01258706581836e-06, + "loss": 0.311, + "step": 20710 + }, + { + "epoch": 0.9094481706313051, + "grad_norm": 1.6171875, + "learning_rate": 1.010638962050739e-06, + "loss": 0.3376, + "step": 20712 + }, + { + "epoch": 0.9095359891105086, + "grad_norm": 1.46875, + "learning_rate": 1.0086926953941368e-06, + "loss": 0.3444, + "step": 20714 + }, + { + "epoch": 0.9096238075897121, + "grad_norm": 1.59375, + "learning_rate": 1.0067482659975953e-06, + "loss": 0.2992, + "step": 20716 + }, + { + "epoch": 0.9097116260689155, + "grad_norm": 1.4609375, + "learning_rate": 1.0048056740100286e-06, + "loss": 0.337, + "step": 20718 + }, + { + "epoch": 0.909799444548119, + "grad_norm": 1.3984375, + "learning_rate": 1.0028649195801903e-06, + "loss": 0.3046, + "step": 20720 + }, + { + "epoch": 0.9098872630273225, + "grad_norm": 1.4453125, + "learning_rate": 1.0009260028567113e-06, + "loss": 0.3162, + "step": 20722 + }, + { + "epoch": 0.9099750815065261, + "grad_norm": 1.421875, + "learning_rate": 9.989889239880729e-07, + "loss": 0.301, + "step": 20724 + }, + { + "epoch": 0.9100628999857295, + "grad_norm": 1.4453125, + "learning_rate": 9.970536831226145e-07, + "loss": 0.301, + "step": 20726 + }, + { + "epoch": 0.910150718464933, + "grad_norm": 1.484375, + "learning_rate": 9.951202804085402e-07, + "loss": 0.3301, + "step": 20728 + }, + { + "epoch": 0.9102385369441365, + "grad_norm": 1.5546875, + "learning_rate": 9.931887159939062e-07, + "loss": 0.302, + "step": 20730 + }, + { + "epoch": 0.91032635542334, + "grad_norm": 1.5234375, + "learning_rate": 9.91258990026639e-07, + "loss": 0.292, + "step": 20732 + }, + { + "epoch": 0.9104141739025434, + "grad_norm": 1.4921875, + "learning_rate": 9.893311026545116e-07, + "loss": 0.33, + "step": 20734 + }, + { + "epoch": 0.9105019923817469, + "grad_norm": 1.453125, + "learning_rate": 9.874050540251672e-07, + "loss": 0.2974, + "step": 20736 + }, + { + "epoch": 0.9105898108609504, + "grad_norm": 1.4921875, + "learning_rate": 9.85480844286099e-07, + "loss": 0.3126, + "step": 20738 + }, + { + "epoch": 0.910677629340154, + "grad_norm": 1.4609375, + "learning_rate": 9.835584735846588e-07, + "loss": 0.3131, + "step": 20740 + }, + { + "epoch": 0.9107654478193574, + "grad_norm": 1.375, + "learning_rate": 9.816379420680727e-07, + "loss": 0.3123, + "step": 20742 + }, + { + "epoch": 0.9108532662985609, + "grad_norm": 1.4609375, + "learning_rate": 9.797192498834096e-07, + "loss": 0.3226, + "step": 20744 + }, + { + "epoch": 0.9109410847777644, + "grad_norm": 1.453125, + "learning_rate": 9.778023971776045e-07, + "loss": 0.324, + "step": 20746 + }, + { + "epoch": 0.9110289032569678, + "grad_norm": 1.53125, + "learning_rate": 9.758873840974514e-07, + "loss": 0.3356, + "step": 20748 + }, + { + "epoch": 0.9111167217361713, + "grad_norm": 1.53125, + "learning_rate": 9.739742107895994e-07, + "loss": 0.3209, + "step": 20750 + }, + { + "epoch": 0.9112045402153748, + "grad_norm": 1.546875, + "learning_rate": 9.720628774005647e-07, + "loss": 0.2962, + "step": 20752 + }, + { + "epoch": 0.9112923586945783, + "grad_norm": 1.5078125, + "learning_rate": 9.701533840767108e-07, + "loss": 0.3132, + "step": 20754 + }, + { + "epoch": 0.9113801771737818, + "grad_norm": 1.5, + "learning_rate": 9.682457309642735e-07, + "loss": 0.2998, + "step": 20756 + }, + { + "epoch": 0.9114679956529853, + "grad_norm": 1.4140625, + "learning_rate": 9.663399182093386e-07, + "loss": 0.338, + "step": 20758 + }, + { + "epoch": 0.9115558141321888, + "grad_norm": 1.4140625, + "learning_rate": 9.644359459578533e-07, + "loss": 0.329, + "step": 20760 + }, + { + "epoch": 0.9116436326113923, + "grad_norm": 1.4921875, + "learning_rate": 9.62533814355626e-07, + "loss": 0.2874, + "step": 20762 + }, + { + "epoch": 0.9117314510905957, + "grad_norm": 1.46875, + "learning_rate": 9.606335235483182e-07, + "loss": 0.3186, + "step": 20764 + }, + { + "epoch": 0.9118192695697992, + "grad_norm": 1.453125, + "learning_rate": 9.58735073681466e-07, + "loss": 0.3224, + "step": 20766 + }, + { + "epoch": 0.9119070880490027, + "grad_norm": 1.453125, + "learning_rate": 9.568384649004363e-07, + "loss": 0.337, + "step": 20768 + }, + { + "epoch": 0.9119949065282063, + "grad_norm": 1.4375, + "learning_rate": 9.549436973504855e-07, + "loss": 0.2885, + "step": 20770 + }, + { + "epoch": 0.9120827250074097, + "grad_norm": 1.53125, + "learning_rate": 9.530507711767056e-07, + "loss": 0.3455, + "step": 20772 + }, + { + "epoch": 0.9121705434866132, + "grad_norm": 1.4921875, + "learning_rate": 9.511596865240669e-07, + "loss": 0.3165, + "step": 20774 + }, + { + "epoch": 0.9122583619658167, + "grad_norm": 1.4609375, + "learning_rate": 9.49270443537384e-07, + "loss": 0.294, + "step": 20776 + }, + { + "epoch": 0.9123461804450201, + "grad_norm": 1.4296875, + "learning_rate": 9.473830423613328e-07, + "loss": 0.339, + "step": 20778 + }, + { + "epoch": 0.9124339989242236, + "grad_norm": 1.4296875, + "learning_rate": 9.454974831404561e-07, + "loss": 0.3194, + "step": 20780 + }, + { + "epoch": 0.9125218174034271, + "grad_norm": 1.421875, + "learning_rate": 9.436137660191469e-07, + "loss": 0.3044, + "step": 20782 + }, + { + "epoch": 0.9126096358826306, + "grad_norm": 1.53125, + "learning_rate": 9.417318911416644e-07, + "loss": 0.315, + "step": 20784 + }, + { + "epoch": 0.9126974543618341, + "grad_norm": 1.5, + "learning_rate": 9.398518586521188e-07, + "loss": 0.2958, + "step": 20786 + }, + { + "epoch": 0.9127852728410376, + "grad_norm": 1.4296875, + "learning_rate": 9.379736686944862e-07, + "loss": 0.305, + "step": 20788 + }, + { + "epoch": 0.9128730913202411, + "grad_norm": 1.5234375, + "learning_rate": 9.36097321412599e-07, + "loss": 0.322, + "step": 20790 + }, + { + "epoch": 0.9129609097994446, + "grad_norm": 1.4453125, + "learning_rate": 9.342228169501449e-07, + "loss": 0.3145, + "step": 20792 + }, + { + "epoch": 0.913048728278648, + "grad_norm": 1.53125, + "learning_rate": 9.323501554506786e-07, + "loss": 0.2959, + "step": 20794 + }, + { + "epoch": 0.9131365467578515, + "grad_norm": 1.484375, + "learning_rate": 9.304793370576076e-07, + "loss": 0.311, + "step": 20796 + }, + { + "epoch": 0.913224365237055, + "grad_norm": 1.546875, + "learning_rate": 9.286103619141978e-07, + "loss": 0.3461, + "step": 20798 + }, + { + "epoch": 0.9133121837162584, + "grad_norm": 1.4765625, + "learning_rate": 9.267432301635792e-07, + "loss": 0.2936, + "step": 20800 + }, + { + "epoch": 0.913400002195462, + "grad_norm": 1.421875, + "learning_rate": 9.248779419487292e-07, + "loss": 0.289, + "step": 20802 + }, + { + "epoch": 0.9134878206746655, + "grad_norm": 1.453125, + "learning_rate": 9.230144974125027e-07, + "loss": 0.3029, + "step": 20804 + }, + { + "epoch": 0.913575639153869, + "grad_norm": 1.453125, + "learning_rate": 9.211528966975941e-07, + "loss": 0.3303, + "step": 20806 + }, + { + "epoch": 0.9136634576330724, + "grad_norm": 1.4375, + "learning_rate": 9.192931399465698e-07, + "loss": 0.3011, + "step": 20808 + }, + { + "epoch": 0.9137512761122759, + "grad_norm": 1.46875, + "learning_rate": 9.174352273018521e-07, + "loss": 0.3299, + "step": 20810 + }, + { + "epoch": 0.9138390945914794, + "grad_norm": 1.4375, + "learning_rate": 9.155791589057133e-07, + "loss": 0.3313, + "step": 20812 + }, + { + "epoch": 0.9139269130706829, + "grad_norm": 1.3984375, + "learning_rate": 9.137249349002979e-07, + "loss": 0.3033, + "step": 20814 + }, + { + "epoch": 0.9140147315498864, + "grad_norm": 1.4453125, + "learning_rate": 9.118725554276008e-07, + "loss": 0.3177, + "step": 20816 + }, + { + "epoch": 0.9141025500290899, + "grad_norm": 1.546875, + "learning_rate": 9.10022020629478e-07, + "loss": 0.3122, + "step": 20818 + }, + { + "epoch": 0.9141903685082934, + "grad_norm": 1.5234375, + "learning_rate": 9.081733306476437e-07, + "loss": 0.3234, + "step": 20820 + }, + { + "epoch": 0.9142781869874969, + "grad_norm": 1.5, + "learning_rate": 9.063264856236708e-07, + "loss": 0.3054, + "step": 20822 + }, + { + "epoch": 0.9143660054667003, + "grad_norm": 1.4296875, + "learning_rate": 9.044814856989908e-07, + "loss": 0.3372, + "step": 20824 + }, + { + "epoch": 0.9144538239459038, + "grad_norm": 1.4296875, + "learning_rate": 9.026383310148933e-07, + "loss": 0.3083, + "step": 20826 + }, + { + "epoch": 0.9145416424251073, + "grad_norm": 1.4375, + "learning_rate": 9.00797021712535e-07, + "loss": 0.323, + "step": 20828 + }, + { + "epoch": 0.9146294609043107, + "grad_norm": 1.640625, + "learning_rate": 8.989575579329113e-07, + "loss": 0.3073, + "step": 20830 + }, + { + "epoch": 0.9147172793835143, + "grad_norm": 1.484375, + "learning_rate": 8.971199398168983e-07, + "loss": 0.3294, + "step": 20832 + }, + { + "epoch": 0.9148050978627178, + "grad_norm": 1.4921875, + "learning_rate": 8.95284167505217e-07, + "loss": 0.3088, + "step": 20834 + }, + { + "epoch": 0.9148929163419213, + "grad_norm": 1.4765625, + "learning_rate": 8.934502411384549e-07, + "loss": 0.3042, + "step": 20836 + }, + { + "epoch": 0.9149807348211247, + "grad_norm": 1.4921875, + "learning_rate": 8.916181608570495e-07, + "loss": 0.3142, + "step": 20838 + }, + { + "epoch": 0.9150685533003282, + "grad_norm": 1.4296875, + "learning_rate": 8.897879268013027e-07, + "loss": 0.3137, + "step": 20840 + }, + { + "epoch": 0.9151563717795317, + "grad_norm": 1.484375, + "learning_rate": 8.879595391113798e-07, + "loss": 0.3189, + "step": 20842 + }, + { + "epoch": 0.9152441902587352, + "grad_norm": 1.4453125, + "learning_rate": 8.861329979272915e-07, + "loss": 0.3058, + "step": 20844 + }, + { + "epoch": 0.9153320087379386, + "grad_norm": 1.390625, + "learning_rate": 8.843083033889227e-07, + "loss": 0.3186, + "step": 20846 + }, + { + "epoch": 0.9154198272171422, + "grad_norm": 1.46875, + "learning_rate": 8.824854556360062e-07, + "loss": 0.3289, + "step": 20848 + }, + { + "epoch": 0.9155076456963457, + "grad_norm": 1.40625, + "learning_rate": 8.806644548081305e-07, + "loss": 0.3385, + "step": 20850 + }, + { + "epoch": 0.9155954641755492, + "grad_norm": 1.53125, + "learning_rate": 8.788453010447534e-07, + "loss": 0.3497, + "step": 20852 + }, + { + "epoch": 0.9156832826547526, + "grad_norm": 1.4375, + "learning_rate": 8.770279944851856e-07, + "loss": 0.3319, + "step": 20854 + }, + { + "epoch": 0.9157711011339561, + "grad_norm": 1.4765625, + "learning_rate": 8.752125352685992e-07, + "loss": 0.3535, + "step": 20856 + }, + { + "epoch": 0.9158589196131596, + "grad_norm": 1.4921875, + "learning_rate": 8.73398923534019e-07, + "loss": 0.3008, + "step": 20858 + }, + { + "epoch": 0.915946738092363, + "grad_norm": 1.421875, + "learning_rate": 8.715871594203312e-07, + "loss": 0.3212, + "step": 20860 + }, + { + "epoch": 0.9160345565715666, + "grad_norm": 1.53125, + "learning_rate": 8.697772430662859e-07, + "loss": 0.2935, + "step": 20862 + }, + { + "epoch": 0.9161223750507701, + "grad_norm": 1.4140625, + "learning_rate": 8.679691746104807e-07, + "loss": 0.3038, + "step": 20864 + }, + { + "epoch": 0.9162101935299736, + "grad_norm": 1.46875, + "learning_rate": 8.661629541913824e-07, + "loss": 0.2944, + "step": 20866 + }, + { + "epoch": 0.916298012009177, + "grad_norm": 1.5546875, + "learning_rate": 8.643585819473055e-07, + "loss": 0.3247, + "step": 20868 + }, + { + "epoch": 0.9163858304883805, + "grad_norm": 1.4765625, + "learning_rate": 8.625560580164394e-07, + "loss": 0.3316, + "step": 20870 + }, + { + "epoch": 0.916473648967584, + "grad_norm": 1.5390625, + "learning_rate": 8.60755382536818e-07, + "loss": 0.2913, + "step": 20872 + }, + { + "epoch": 0.9165614674467875, + "grad_norm": 1.578125, + "learning_rate": 8.589565556463314e-07, + "loss": 0.346, + "step": 20874 + }, + { + "epoch": 0.9166492859259909, + "grad_norm": 1.53125, + "learning_rate": 8.571595774827413e-07, + "loss": 0.3234, + "step": 20876 + }, + { + "epoch": 0.9167371044051945, + "grad_norm": 1.515625, + "learning_rate": 8.553644481836542e-07, + "loss": 0.2973, + "step": 20878 + }, + { + "epoch": 0.916824922884398, + "grad_norm": 1.609375, + "learning_rate": 8.535711678865493e-07, + "loss": 0.2843, + "step": 20880 + }, + { + "epoch": 0.9169127413636015, + "grad_norm": 1.4921875, + "learning_rate": 8.517797367287555e-07, + "loss": 0.3122, + "step": 20882 + }, + { + "epoch": 0.9170005598428049, + "grad_norm": 1.4296875, + "learning_rate": 8.49990154847452e-07, + "loss": 0.2951, + "step": 20884 + }, + { + "epoch": 0.9170883783220084, + "grad_norm": 1.4453125, + "learning_rate": 8.482024223796958e-07, + "loss": 0.3027, + "step": 20886 + }, + { + "epoch": 0.9171761968012119, + "grad_norm": 1.4375, + "learning_rate": 8.464165394623829e-07, + "loss": 0.3013, + "step": 20888 + }, + { + "epoch": 0.9172640152804153, + "grad_norm": 1.5, + "learning_rate": 8.446325062322902e-07, + "loss": 0.3374, + "step": 20890 + }, + { + "epoch": 0.9173518337596188, + "grad_norm": 1.46875, + "learning_rate": 8.428503228260221e-07, + "loss": 0.3171, + "step": 20892 + }, + { + "epoch": 0.9174396522388224, + "grad_norm": 1.421875, + "learning_rate": 8.410699893800722e-07, + "loss": 0.3274, + "step": 20894 + }, + { + "epoch": 0.9175274707180259, + "grad_norm": 1.4921875, + "learning_rate": 8.392915060307704e-07, + "loss": 0.2995, + "step": 20896 + }, + { + "epoch": 0.9176152891972293, + "grad_norm": 1.453125, + "learning_rate": 8.37514872914319e-07, + "loss": 0.3186, + "step": 20898 + }, + { + "epoch": 0.9177031076764328, + "grad_norm": 1.4765625, + "learning_rate": 8.3574009016677e-07, + "loss": 0.3063, + "step": 20900 + }, + { + "epoch": 0.9177909261556363, + "grad_norm": 1.546875, + "learning_rate": 8.339671579240371e-07, + "loss": 0.3366, + "step": 20902 + }, + { + "epoch": 0.9178787446348398, + "grad_norm": 1.546875, + "learning_rate": 8.321960763218922e-07, + "loss": 0.3206, + "step": 20904 + }, + { + "epoch": 0.9179665631140432, + "grad_norm": 1.484375, + "learning_rate": 8.304268454959657e-07, + "loss": 0.3006, + "step": 20906 + }, + { + "epoch": 0.9180543815932467, + "grad_norm": 1.453125, + "learning_rate": 8.286594655817465e-07, + "loss": 0.3366, + "step": 20908 + }, + { + "epoch": 0.9181422000724503, + "grad_norm": 1.40625, + "learning_rate": 8.268939367145789e-07, + "loss": 0.3098, + "step": 20910 + }, + { + "epoch": 0.9182300185516538, + "grad_norm": 1.4765625, + "learning_rate": 8.251302590296661e-07, + "loss": 0.3572, + "step": 20912 + }, + { + "epoch": 0.9183178370308572, + "grad_norm": 1.4296875, + "learning_rate": 8.23368432662075e-07, + "loss": 0.271, + "step": 20914 + }, + { + "epoch": 0.9184056555100607, + "grad_norm": 1.515625, + "learning_rate": 8.216084577467226e-07, + "loss": 0.3109, + "step": 20916 + }, + { + "epoch": 0.9184934739892642, + "grad_norm": 1.46875, + "learning_rate": 8.198503344183955e-07, + "loss": 0.3085, + "step": 20918 + }, + { + "epoch": 0.9185812924684676, + "grad_norm": 1.4921875, + "learning_rate": 8.180940628117223e-07, + "loss": 0.3106, + "step": 20920 + }, + { + "epoch": 0.9186691109476711, + "grad_norm": 1.484375, + "learning_rate": 8.163396430612063e-07, + "loss": 0.3375, + "step": 20922 + }, + { + "epoch": 0.9187569294268747, + "grad_norm": 1.5, + "learning_rate": 8.145870753011958e-07, + "loss": 0.3231, + "step": 20924 + }, + { + "epoch": 0.9188447479060782, + "grad_norm": 1.5, + "learning_rate": 8.128363596659e-07, + "loss": 0.2954, + "step": 20926 + }, + { + "epoch": 0.9189325663852816, + "grad_norm": 1.46875, + "learning_rate": 8.110874962894005e-07, + "loss": 0.3181, + "step": 20928 + }, + { + "epoch": 0.9190203848644851, + "grad_norm": 1.484375, + "learning_rate": 8.093404853056125e-07, + "loss": 0.3185, + "step": 20930 + }, + { + "epoch": 0.9191082033436886, + "grad_norm": 1.4296875, + "learning_rate": 8.075953268483344e-07, + "loss": 0.3389, + "step": 20932 + }, + { + "epoch": 0.9191960218228921, + "grad_norm": 1.5078125, + "learning_rate": 8.058520210512066e-07, + "loss": 0.3182, + "step": 20934 + }, + { + "epoch": 0.9192838403020955, + "grad_norm": 1.59375, + "learning_rate": 8.041105680477251e-07, + "loss": 0.3405, + "step": 20936 + }, + { + "epoch": 0.919371658781299, + "grad_norm": 1.453125, + "learning_rate": 8.02370967971261e-07, + "loss": 0.3015, + "step": 20938 + }, + { + "epoch": 0.9194594772605026, + "grad_norm": 1.4296875, + "learning_rate": 8.006332209550244e-07, + "loss": 0.297, + "step": 20940 + }, + { + "epoch": 0.9195472957397061, + "grad_norm": 1.4453125, + "learning_rate": 7.988973271321004e-07, + "loss": 0.3436, + "step": 20942 + }, + { + "epoch": 0.9196351142189095, + "grad_norm": 1.421875, + "learning_rate": 7.971632866354189e-07, + "loss": 0.3226, + "step": 20944 + }, + { + "epoch": 0.919722932698113, + "grad_norm": 1.4296875, + "learning_rate": 7.954310995977737e-07, + "loss": 0.3318, + "step": 20946 + }, + { + "epoch": 0.9198107511773165, + "grad_norm": 1.421875, + "learning_rate": 7.937007661518198e-07, + "loss": 0.3187, + "step": 20948 + }, + { + "epoch": 0.91989856965652, + "grad_norm": 1.4609375, + "learning_rate": 7.919722864300649e-07, + "loss": 0.3513, + "step": 20950 + }, + { + "epoch": 0.9199863881357234, + "grad_norm": 1.4609375, + "learning_rate": 7.902456605648756e-07, + "loss": 0.3281, + "step": 20952 + }, + { + "epoch": 0.9200742066149269, + "grad_norm": 1.4140625, + "learning_rate": 7.885208886884709e-07, + "loss": 0.2876, + "step": 20954 + }, + { + "epoch": 0.9201620250941305, + "grad_norm": 1.5, + "learning_rate": 7.867979709329454e-07, + "loss": 0.2869, + "step": 20956 + }, + { + "epoch": 0.920249843573334, + "grad_norm": 1.4609375, + "learning_rate": 7.85076907430235e-07, + "loss": 0.3085, + "step": 20958 + }, + { + "epoch": 0.9203376620525374, + "grad_norm": 1.5625, + "learning_rate": 7.833576983121399e-07, + "loss": 0.3395, + "step": 20960 + }, + { + "epoch": 0.9204254805317409, + "grad_norm": 1.453125, + "learning_rate": 7.816403437103159e-07, + "loss": 0.3473, + "step": 20962 + }, + { + "epoch": 0.9205132990109444, + "grad_norm": 1.4921875, + "learning_rate": 7.799248437562801e-07, + "loss": 0.2932, + "step": 20964 + }, + { + "epoch": 0.9206011174901478, + "grad_norm": 1.40625, + "learning_rate": 7.782111985814077e-07, + "loss": 0.31, + "step": 20966 + }, + { + "epoch": 0.9206889359693513, + "grad_norm": 1.53125, + "learning_rate": 7.764994083169247e-07, + "loss": 0.3037, + "step": 20968 + }, + { + "epoch": 0.9207767544485549, + "grad_norm": 1.4296875, + "learning_rate": 7.747894730939259e-07, + "loss": 0.3276, + "step": 20970 + }, + { + "epoch": 0.9208645729277584, + "grad_norm": 1.40625, + "learning_rate": 7.730813930433567e-07, + "loss": 0.285, + "step": 20972 + }, + { + "epoch": 0.9209523914069618, + "grad_norm": 1.484375, + "learning_rate": 7.713751682960207e-07, + "loss": 0.3125, + "step": 20974 + }, + { + "epoch": 0.9210402098861653, + "grad_norm": 1.4375, + "learning_rate": 7.69670798982583e-07, + "loss": 0.3246, + "step": 20976 + }, + { + "epoch": 0.9211280283653688, + "grad_norm": 1.6015625, + "learning_rate": 7.67968285233564e-07, + "loss": 0.3272, + "step": 20978 + }, + { + "epoch": 0.9212158468445723, + "grad_norm": 1.609375, + "learning_rate": 7.662676271793429e-07, + "loss": 0.2933, + "step": 20980 + }, + { + "epoch": 0.9213036653237757, + "grad_norm": 1.5625, + "learning_rate": 7.64568824950157e-07, + "loss": 0.3296, + "step": 20982 + }, + { + "epoch": 0.9213914838029792, + "grad_norm": 1.46875, + "learning_rate": 7.628718786760997e-07, + "loss": 0.3073, + "step": 20984 + }, + { + "epoch": 0.9214793022821828, + "grad_norm": 1.4375, + "learning_rate": 7.611767884871251e-07, + "loss": 0.3272, + "step": 20986 + }, + { + "epoch": 0.9215671207613862, + "grad_norm": 1.5703125, + "learning_rate": 7.59483554513038e-07, + "loss": 0.3255, + "step": 20988 + }, + { + "epoch": 0.9216549392405897, + "grad_norm": 1.453125, + "learning_rate": 7.57792176883515e-07, + "loss": 0.3086, + "step": 20990 + }, + { + "epoch": 0.9217427577197932, + "grad_norm": 1.453125, + "learning_rate": 7.561026557280748e-07, + "loss": 0.3066, + "step": 20992 + }, + { + "epoch": 0.9218305761989967, + "grad_norm": 1.46875, + "learning_rate": 7.544149911761084e-07, + "loss": 0.3216, + "step": 20994 + }, + { + "epoch": 0.9219183946782001, + "grad_norm": 1.4140625, + "learning_rate": 7.527291833568539e-07, + "loss": 0.3202, + "step": 20996 + }, + { + "epoch": 0.9220062131574036, + "grad_norm": 1.4296875, + "learning_rate": 7.510452323994083e-07, + "loss": 0.3295, + "step": 20998 + }, + { + "epoch": 0.9220940316366071, + "grad_norm": 1.4296875, + "learning_rate": 7.493631384327348e-07, + "loss": 0.2978, + "step": 21000 + }, + { + "epoch": 0.9221818501158107, + "grad_norm": 1.46875, + "learning_rate": 7.476829015856446e-07, + "loss": 0.3027, + "step": 21002 + }, + { + "epoch": 0.9222696685950141, + "grad_norm": 1.4453125, + "learning_rate": 7.460045219868095e-07, + "loss": 0.3039, + "step": 21004 + }, + { + "epoch": 0.9223574870742176, + "grad_norm": 1.59375, + "learning_rate": 7.443279997647657e-07, + "loss": 0.3202, + "step": 21006 + }, + { + "epoch": 0.9224453055534211, + "grad_norm": 1.4453125, + "learning_rate": 7.426533350478937e-07, + "loss": 0.2813, + "step": 21008 + }, + { + "epoch": 0.9225331240326246, + "grad_norm": 1.46875, + "learning_rate": 7.409805279644494e-07, + "loss": 0.3119, + "step": 21010 + }, + { + "epoch": 0.922620942511828, + "grad_norm": 1.453125, + "learning_rate": 7.393095786425275e-07, + "loss": 0.3038, + "step": 21012 + }, + { + "epoch": 0.9227087609910315, + "grad_norm": 1.46875, + "learning_rate": 7.376404872100978e-07, + "loss": 0.3249, + "step": 21014 + }, + { + "epoch": 0.9227965794702351, + "grad_norm": 1.53125, + "learning_rate": 7.359732537949693e-07, + "loss": 0.3023, + "step": 21016 + }, + { + "epoch": 0.9228843979494386, + "grad_norm": 1.3984375, + "learning_rate": 7.343078785248315e-07, + "loss": 0.3033, + "step": 21018 + }, + { + "epoch": 0.922972216428642, + "grad_norm": 1.5, + "learning_rate": 7.326443615272099e-07, + "loss": 0.3115, + "step": 21020 + }, + { + "epoch": 0.9230600349078455, + "grad_norm": 1.546875, + "learning_rate": 7.309827029295002e-07, + "loss": 0.3266, + "step": 21022 + }, + { + "epoch": 0.923147853387049, + "grad_norm": 1.4609375, + "learning_rate": 7.29322902858956e-07, + "loss": 0.3321, + "step": 21024 + }, + { + "epoch": 0.9232356718662524, + "grad_norm": 1.6640625, + "learning_rate": 7.276649614426784e-07, + "loss": 0.3087, + "step": 21026 + }, + { + "epoch": 0.9233234903454559, + "grad_norm": 1.390625, + "learning_rate": 7.26008878807638e-07, + "loss": 0.3076, + "step": 21028 + }, + { + "epoch": 0.9234113088246594, + "grad_norm": 1.4765625, + "learning_rate": 7.243546550806557e-07, + "loss": 0.3151, + "step": 21030 + }, + { + "epoch": 0.923499127303863, + "grad_norm": 1.5078125, + "learning_rate": 7.227022903884134e-07, + "loss": 0.2999, + "step": 21032 + }, + { + "epoch": 0.9235869457830664, + "grad_norm": 1.4453125, + "learning_rate": 7.210517848574516e-07, + "loss": 0.335, + "step": 21034 + }, + { + "epoch": 0.9236747642622699, + "grad_norm": 1.5078125, + "learning_rate": 7.194031386141608e-07, + "loss": 0.3007, + "step": 21036 + }, + { + "epoch": 0.9237625827414734, + "grad_norm": 1.3828125, + "learning_rate": 7.177563517848013e-07, + "loss": 0.2695, + "step": 21038 + }, + { + "epoch": 0.9238504012206769, + "grad_norm": 1.515625, + "learning_rate": 7.161114244954775e-07, + "loss": 0.3298, + "step": 21040 + }, + { + "epoch": 0.9239382196998803, + "grad_norm": 1.4296875, + "learning_rate": 7.144683568721694e-07, + "loss": 0.3179, + "step": 21042 + }, + { + "epoch": 0.9240260381790838, + "grad_norm": 1.5234375, + "learning_rate": 7.128271490406873e-07, + "loss": 0.3385, + "step": 21044 + }, + { + "epoch": 0.9241138566582873, + "grad_norm": 1.4375, + "learning_rate": 7.111878011267309e-07, + "loss": 0.3113, + "step": 21046 + }, + { + "epoch": 0.9242016751374909, + "grad_norm": 1.453125, + "learning_rate": 7.095503132558329e-07, + "loss": 0.3134, + "step": 21048 + }, + { + "epoch": 0.9242894936166943, + "grad_norm": 1.40625, + "learning_rate": 7.07914685553393e-07, + "loss": 0.2692, + "step": 21050 + }, + { + "epoch": 0.9243773120958978, + "grad_norm": 1.5078125, + "learning_rate": 7.062809181446695e-07, + "loss": 0.3174, + "step": 21052 + }, + { + "epoch": 0.9244651305751013, + "grad_norm": 1.5, + "learning_rate": 7.046490111547788e-07, + "loss": 0.3495, + "step": 21054 + }, + { + "epoch": 0.9245529490543047, + "grad_norm": 1.4453125, + "learning_rate": 7.030189647086904e-07, + "loss": 0.3221, + "step": 21056 + }, + { + "epoch": 0.9246407675335082, + "grad_norm": 1.46875, + "learning_rate": 7.013907789312352e-07, + "loss": 0.3218, + "step": 21058 + }, + { + "epoch": 0.9247285860127117, + "grad_norm": 1.390625, + "learning_rate": 6.997644539470938e-07, + "loss": 0.3023, + "step": 21060 + }, + { + "epoch": 0.9248164044919152, + "grad_norm": 1.515625, + "learning_rate": 6.981399898808222e-07, + "loss": 0.3328, + "step": 21062 + }, + { + "epoch": 0.9249042229711187, + "grad_norm": 1.5234375, + "learning_rate": 6.965173868568098e-07, + "loss": 0.3167, + "step": 21064 + }, + { + "epoch": 0.9249920414503222, + "grad_norm": 1.4921875, + "learning_rate": 6.948966449993266e-07, + "loss": 0.3245, + "step": 21066 + }, + { + "epoch": 0.9250798599295257, + "grad_norm": 1.4453125, + "learning_rate": 6.932777644324844e-07, + "loss": 0.3237, + "step": 21068 + }, + { + "epoch": 0.9251676784087292, + "grad_norm": 1.4453125, + "learning_rate": 6.916607452802538e-07, + "loss": 0.319, + "step": 21070 + }, + { + "epoch": 0.9252554968879326, + "grad_norm": 1.5, + "learning_rate": 6.90045587666474e-07, + "loss": 0.3122, + "step": 21072 + }, + { + "epoch": 0.9253433153671361, + "grad_norm": 1.5859375, + "learning_rate": 6.884322917148328e-07, + "loss": 0.3193, + "step": 21074 + }, + { + "epoch": 0.9254311338463396, + "grad_norm": 1.46875, + "learning_rate": 6.868208575488699e-07, + "loss": 0.3228, + "step": 21076 + }, + { + "epoch": 0.9255189523255432, + "grad_norm": 1.4765625, + "learning_rate": 6.852112852919951e-07, + "loss": 0.3195, + "step": 21078 + }, + { + "epoch": 0.9256067708047466, + "grad_norm": 1.5234375, + "learning_rate": 6.836035750674708e-07, + "loss": 0.3395, + "step": 21080 + }, + { + "epoch": 0.9256945892839501, + "grad_norm": 1.4765625, + "learning_rate": 6.819977269984123e-07, + "loss": 0.3088, + "step": 21082 + }, + { + "epoch": 0.9257824077631536, + "grad_norm": 1.5234375, + "learning_rate": 6.803937412077965e-07, + "loss": 0.3288, + "step": 21084 + }, + { + "epoch": 0.925870226242357, + "grad_norm": 1.5, + "learning_rate": 6.787916178184583e-07, + "loss": 0.3166, + "step": 21086 + }, + { + "epoch": 0.9259580447215605, + "grad_norm": 1.4140625, + "learning_rate": 6.771913569530857e-07, + "loss": 0.2714, + "step": 21088 + }, + { + "epoch": 0.926045863200764, + "grad_norm": 1.453125, + "learning_rate": 6.755929587342336e-07, + "loss": 0.3068, + "step": 21090 + }, + { + "epoch": 0.9261336816799675, + "grad_norm": 1.453125, + "learning_rate": 6.739964232843038e-07, + "loss": 0.3145, + "step": 21092 + }, + { + "epoch": 0.926221500159171, + "grad_norm": 1.4921875, + "learning_rate": 6.72401750725557e-07, + "loss": 0.3062, + "step": 21094 + }, + { + "epoch": 0.9263093186383745, + "grad_norm": 1.5078125, + "learning_rate": 6.708089411801177e-07, + "loss": 0.2989, + "step": 21096 + }, + { + "epoch": 0.926397137117578, + "grad_norm": 1.421875, + "learning_rate": 6.692179947699579e-07, + "loss": 0.2991, + "step": 21098 + }, + { + "epoch": 0.9264849555967815, + "grad_norm": 1.5703125, + "learning_rate": 6.676289116169188e-07, + "loss": 0.342, + "step": 21100 + }, + { + "epoch": 0.9265727740759849, + "grad_norm": 1.4765625, + "learning_rate": 6.660416918426892e-07, + "loss": 0.2854, + "step": 21102 + }, + { + "epoch": 0.9266605925551884, + "grad_norm": 1.4296875, + "learning_rate": 6.644563355688277e-07, + "loss": 0.3511, + "step": 21104 + }, + { + "epoch": 0.9267484110343919, + "grad_norm": 1.53125, + "learning_rate": 6.628728429167258e-07, + "loss": 0.3465, + "step": 21106 + }, + { + "epoch": 0.9268362295135953, + "grad_norm": 1.578125, + "learning_rate": 6.612912140076588e-07, + "loss": 0.3519, + "step": 21108 + }, + { + "epoch": 0.9269240479927989, + "grad_norm": 1.515625, + "learning_rate": 6.597114489627437e-07, + "loss": 0.3272, + "step": 21110 + }, + { + "epoch": 0.9270118664720024, + "grad_norm": 1.4453125, + "learning_rate": 6.581335479029588e-07, + "loss": 0.2801, + "step": 21112 + }, + { + "epoch": 0.9270996849512059, + "grad_norm": 1.484375, + "learning_rate": 6.565575109491462e-07, + "loss": 0.3199, + "step": 21114 + }, + { + "epoch": 0.9271875034304093, + "grad_norm": 1.46875, + "learning_rate": 6.5498333822199e-07, + "loss": 0.2785, + "step": 21116 + }, + { + "epoch": 0.9272753219096128, + "grad_norm": 1.40625, + "learning_rate": 6.534110298420493e-07, + "loss": 0.3323, + "step": 21118 + }, + { + "epoch": 0.9273631403888163, + "grad_norm": 1.453125, + "learning_rate": 6.518405859297277e-07, + "loss": 0.3108, + "step": 21120 + }, + { + "epoch": 0.9274509588680198, + "grad_norm": 1.53125, + "learning_rate": 6.502720066052903e-07, + "loss": 0.315, + "step": 21122 + }, + { + "epoch": 0.9275387773472233, + "grad_norm": 1.421875, + "learning_rate": 6.487052919888603e-07, + "loss": 0.3075, + "step": 21124 + }, + { + "epoch": 0.9276265958264268, + "grad_norm": 1.4140625, + "learning_rate": 6.47140442200414e-07, + "loss": 0.3272, + "step": 21126 + }, + { + "epoch": 0.9277144143056303, + "grad_norm": 1.3984375, + "learning_rate": 6.455774573597917e-07, + "loss": 0.3082, + "step": 21128 + }, + { + "epoch": 0.9278022327848338, + "grad_norm": 1.4765625, + "learning_rate": 6.440163375866892e-07, + "loss": 0.3059, + "step": 21130 + }, + { + "epoch": 0.9278900512640372, + "grad_norm": 1.546875, + "learning_rate": 6.424570830006498e-07, + "loss": 0.3184, + "step": 21132 + }, + { + "epoch": 0.9279778697432407, + "grad_norm": 1.46875, + "learning_rate": 6.408996937210892e-07, + "loss": 0.3014, + "step": 21134 + }, + { + "epoch": 0.9280656882224442, + "grad_norm": 1.4453125, + "learning_rate": 6.393441698672647e-07, + "loss": 0.3257, + "step": 21136 + }, + { + "epoch": 0.9281535067016476, + "grad_norm": 1.4296875, + "learning_rate": 6.377905115583088e-07, + "loss": 0.2957, + "step": 21138 + }, + { + "epoch": 0.9282413251808512, + "grad_norm": 1.484375, + "learning_rate": 6.362387189131902e-07, + "loss": 0.2928, + "step": 21140 + }, + { + "epoch": 0.9283291436600547, + "grad_norm": 1.4765625, + "learning_rate": 6.346887920507555e-07, + "loss": 0.3254, + "step": 21142 + }, + { + "epoch": 0.9284169621392582, + "grad_norm": 1.4921875, + "learning_rate": 6.33140731089693e-07, + "loss": 0.3156, + "step": 21144 + }, + { + "epoch": 0.9285047806184616, + "grad_norm": 1.4375, + "learning_rate": 6.315945361485498e-07, + "loss": 0.3211, + "step": 21146 + }, + { + "epoch": 0.9285925990976651, + "grad_norm": 1.5078125, + "learning_rate": 6.300502073457448e-07, + "loss": 0.3218, + "step": 21148 + }, + { + "epoch": 0.9286804175768686, + "grad_norm": 1.515625, + "learning_rate": 6.285077447995307e-07, + "loss": 0.2848, + "step": 21150 + }, + { + "epoch": 0.9287682360560721, + "grad_norm": 1.453125, + "learning_rate": 6.26967148628041e-07, + "loss": 0.306, + "step": 21152 + }, + { + "epoch": 0.9288560545352755, + "grad_norm": 1.515625, + "learning_rate": 6.254284189492476e-07, + "loss": 0.3131, + "step": 21154 + }, + { + "epoch": 0.9289438730144791, + "grad_norm": 1.4609375, + "learning_rate": 6.238915558809899e-07, + "loss": 0.3059, + "step": 21156 + }, + { + "epoch": 0.9290316914936826, + "grad_norm": 1.46875, + "learning_rate": 6.223565595409597e-07, + "loss": 0.2828, + "step": 21158 + }, + { + "epoch": 0.9291195099728861, + "grad_norm": 1.53125, + "learning_rate": 6.208234300467048e-07, + "loss": 0.3251, + "step": 21160 + }, + { + "epoch": 0.9292073284520895, + "grad_norm": 1.46875, + "learning_rate": 6.192921675156394e-07, + "loss": 0.3106, + "step": 21162 + }, + { + "epoch": 0.929295146931293, + "grad_norm": 1.4765625, + "learning_rate": 6.177627720650226e-07, + "loss": 0.3275, + "step": 21164 + }, + { + "epoch": 0.9293829654104965, + "grad_norm": 1.5078125, + "learning_rate": 6.1623524381198e-07, + "loss": 0.2936, + "step": 21166 + }, + { + "epoch": 0.9294707838897, + "grad_norm": 1.453125, + "learning_rate": 6.14709582873485e-07, + "loss": 0.2886, + "step": 21168 + }, + { + "epoch": 0.9295586023689035, + "grad_norm": 1.46875, + "learning_rate": 6.131857893663772e-07, + "loss": 0.3234, + "step": 21170 + }, + { + "epoch": 0.929646420848107, + "grad_norm": 1.515625, + "learning_rate": 6.116638634073496e-07, + "loss": 0.3274, + "step": 21172 + }, + { + "epoch": 0.9297342393273105, + "grad_norm": 1.4921875, + "learning_rate": 6.101438051129449e-07, + "loss": 0.3255, + "step": 21174 + }, + { + "epoch": 0.929822057806514, + "grad_norm": 1.5078125, + "learning_rate": 6.086256145995783e-07, + "loss": 0.3073, + "step": 21176 + }, + { + "epoch": 0.9299098762857174, + "grad_norm": 1.5234375, + "learning_rate": 6.071092919835042e-07, + "loss": 0.3081, + "step": 21178 + }, + { + "epoch": 0.9299976947649209, + "grad_norm": 1.46875, + "learning_rate": 6.055948373808517e-07, + "loss": 0.3165, + "step": 21180 + }, + { + "epoch": 0.9300855132441244, + "grad_norm": 1.46875, + "learning_rate": 6.04082250907595e-07, + "loss": 0.3216, + "step": 21182 + }, + { + "epoch": 0.9301733317233278, + "grad_norm": 1.4375, + "learning_rate": 6.025715326795633e-07, + "loss": 0.3113, + "step": 21184 + }, + { + "epoch": 0.9302611502025314, + "grad_norm": 1.6015625, + "learning_rate": 6.01062682812456e-07, + "loss": 0.3103, + "step": 21186 + }, + { + "epoch": 0.9303489686817349, + "grad_norm": 1.4453125, + "learning_rate": 5.995557014218168e-07, + "loss": 0.2916, + "step": 21188 + }, + { + "epoch": 0.9304367871609384, + "grad_norm": 1.484375, + "learning_rate": 5.980505886230503e-07, + "loss": 0.2992, + "step": 21190 + }, + { + "epoch": 0.9305246056401418, + "grad_norm": 1.5390625, + "learning_rate": 5.965473445314201e-07, + "loss": 0.2912, + "step": 21192 + }, + { + "epoch": 0.9306124241193453, + "grad_norm": 1.515625, + "learning_rate": 5.950459692620425e-07, + "loss": 0.2946, + "step": 21194 + }, + { + "epoch": 0.9307002425985488, + "grad_norm": 1.5546875, + "learning_rate": 5.935464629298975e-07, + "loss": 0.3201, + "step": 21196 + }, + { + "epoch": 0.9307880610777522, + "grad_norm": 1.4765625, + "learning_rate": 5.920488256498131e-07, + "loss": 0.3221, + "step": 21198 + }, + { + "epoch": 0.9308758795569557, + "grad_norm": 1.4609375, + "learning_rate": 5.905530575364831e-07, + "loss": 0.3377, + "step": 21200 + }, + { + "epoch": 0.9309636980361593, + "grad_norm": 1.484375, + "learning_rate": 5.89059158704447e-07, + "loss": 0.3286, + "step": 21202 + }, + { + "epoch": 0.9310515165153628, + "grad_norm": 1.6640625, + "learning_rate": 5.875671292681157e-07, + "loss": 0.2927, + "step": 21204 + }, + { + "epoch": 0.9311393349945662, + "grad_norm": 1.5, + "learning_rate": 5.860769693417451e-07, + "loss": 0.3109, + "step": 21206 + }, + { + "epoch": 0.9312271534737697, + "grad_norm": 1.546875, + "learning_rate": 5.845886790394495e-07, + "loss": 0.3093, + "step": 21208 + }, + { + "epoch": 0.9313149719529732, + "grad_norm": 1.5234375, + "learning_rate": 5.8310225847521e-07, + "loss": 0.3427, + "step": 21210 + }, + { + "epoch": 0.9314027904321767, + "grad_norm": 1.4921875, + "learning_rate": 5.816177077628493e-07, + "loss": 0.3308, + "step": 21212 + }, + { + "epoch": 0.9314906089113801, + "grad_norm": 1.5546875, + "learning_rate": 5.801350270160599e-07, + "loss": 0.3064, + "step": 21214 + }, + { + "epoch": 0.9315784273905837, + "grad_norm": 1.515625, + "learning_rate": 5.786542163483843e-07, + "loss": 0.3105, + "step": 21216 + }, + { + "epoch": 0.9316662458697872, + "grad_norm": 1.5, + "learning_rate": 5.771752758732207e-07, + "loss": 0.3202, + "step": 21218 + }, + { + "epoch": 0.9317540643489907, + "grad_norm": 1.421875, + "learning_rate": 5.756982057038312e-07, + "loss": 0.3233, + "step": 21220 + }, + { + "epoch": 0.9318418828281941, + "grad_norm": 1.453125, + "learning_rate": 5.742230059533255e-07, + "loss": 0.3054, + "step": 21222 + }, + { + "epoch": 0.9319297013073976, + "grad_norm": 1.46875, + "learning_rate": 5.727496767346796e-07, + "loss": 0.3086, + "step": 21224 + }, + { + "epoch": 0.9320175197866011, + "grad_norm": 1.4375, + "learning_rate": 5.712782181607202e-07, + "loss": 0.3207, + "step": 21226 + }, + { + "epoch": 0.9321053382658046, + "grad_norm": 1.5703125, + "learning_rate": 5.698086303441292e-07, + "loss": 0.3119, + "step": 21228 + }, + { + "epoch": 0.932193156745008, + "grad_norm": 1.515625, + "learning_rate": 5.683409133974499e-07, + "loss": 0.304, + "step": 21230 + }, + { + "epoch": 0.9322809752242116, + "grad_norm": 1.546875, + "learning_rate": 5.668750674330786e-07, + "loss": 0.3022, + "step": 21232 + }, + { + "epoch": 0.9323687937034151, + "grad_norm": 1.578125, + "learning_rate": 5.654110925632756e-07, + "loss": 0.2937, + "step": 21234 + }, + { + "epoch": 0.9324566121826185, + "grad_norm": 1.5703125, + "learning_rate": 5.639489889001426e-07, + "loss": 0.3227, + "step": 21236 + }, + { + "epoch": 0.932544430661822, + "grad_norm": 1.4921875, + "learning_rate": 5.624887565556596e-07, + "loss": 0.3311, + "step": 21238 + }, + { + "epoch": 0.9326322491410255, + "grad_norm": 1.4375, + "learning_rate": 5.610303956416402e-07, + "loss": 0.3161, + "step": 21240 + }, + { + "epoch": 0.932720067620229, + "grad_norm": 1.5, + "learning_rate": 5.595739062697752e-07, + "loss": 0.2872, + "step": 21242 + }, + { + "epoch": 0.9328078860994324, + "grad_norm": 1.4609375, + "learning_rate": 5.581192885516006e-07, + "loss": 0.3168, + "step": 21244 + }, + { + "epoch": 0.9328957045786359, + "grad_norm": 1.515625, + "learning_rate": 5.566665425985052e-07, + "loss": 0.3393, + "step": 21246 + }, + { + "epoch": 0.9329835230578395, + "grad_norm": 1.4140625, + "learning_rate": 5.552156685217497e-07, + "loss": 0.3114, + "step": 21248 + }, + { + "epoch": 0.933071341537043, + "grad_norm": 1.46875, + "learning_rate": 5.537666664324342e-07, + "loss": 0.2953, + "step": 21250 + }, + { + "epoch": 0.9331591600162464, + "grad_norm": 1.4296875, + "learning_rate": 5.523195364415312e-07, + "loss": 0.3083, + "step": 21252 + }, + { + "epoch": 0.9332469784954499, + "grad_norm": 1.5, + "learning_rate": 5.508742786598575e-07, + "loss": 0.3182, + "step": 21254 + }, + { + "epoch": 0.9333347969746534, + "grad_norm": 1.484375, + "learning_rate": 5.494308931980913e-07, + "loss": 0.3156, + "step": 21256 + }, + { + "epoch": 0.9334226154538569, + "grad_norm": 1.4375, + "learning_rate": 5.479893801667718e-07, + "loss": 0.321, + "step": 21258 + }, + { + "epoch": 0.9335104339330603, + "grad_norm": 1.4921875, + "learning_rate": 5.465497396762831e-07, + "loss": 0.3114, + "step": 21260 + }, + { + "epoch": 0.9335982524122638, + "grad_norm": 1.484375, + "learning_rate": 5.451119718368786e-07, + "loss": 0.3205, + "step": 21262 + }, + { + "epoch": 0.9336860708914674, + "grad_norm": 1.4921875, + "learning_rate": 5.436760767586618e-07, + "loss": 0.3173, + "step": 21264 + }, + { + "epoch": 0.9337738893706709, + "grad_norm": 1.4921875, + "learning_rate": 5.422420545515949e-07, + "loss": 0.2962, + "step": 21266 + }, + { + "epoch": 0.9338617078498743, + "grad_norm": 1.421875, + "learning_rate": 5.408099053254929e-07, + "loss": 0.3042, + "step": 21268 + }, + { + "epoch": 0.9339495263290778, + "grad_norm": 1.4140625, + "learning_rate": 5.393796291900316e-07, + "loss": 0.3332, + "step": 21270 + }, + { + "epoch": 0.9340373448082813, + "grad_norm": 1.5078125, + "learning_rate": 5.379512262547431e-07, + "loss": 0.3221, + "step": 21272 + }, + { + "epoch": 0.9341251632874847, + "grad_norm": 1.4609375, + "learning_rate": 5.365246966290094e-07, + "loss": 0.3113, + "step": 21274 + }, + { + "epoch": 0.9342129817666882, + "grad_norm": 1.5546875, + "learning_rate": 5.351000404220846e-07, + "loss": 0.2909, + "step": 21276 + }, + { + "epoch": 0.9343008002458918, + "grad_norm": 1.453125, + "learning_rate": 5.336772577430593e-07, + "loss": 0.2881, + "step": 21278 + }, + { + "epoch": 0.9343886187250953, + "grad_norm": 1.4453125, + "learning_rate": 5.322563487008964e-07, + "loss": 0.3233, + "step": 21280 + }, + { + "epoch": 0.9344764372042987, + "grad_norm": 1.40625, + "learning_rate": 5.308373134044059e-07, + "loss": 0.297, + "step": 21282 + }, + { + "epoch": 0.9345642556835022, + "grad_norm": 1.421875, + "learning_rate": 5.294201519622594e-07, + "loss": 0.3024, + "step": 21284 + }, + { + "epoch": 0.9346520741627057, + "grad_norm": 1.4296875, + "learning_rate": 5.280048644829866e-07, + "loss": 0.2991, + "step": 21286 + }, + { + "epoch": 0.9347398926419092, + "grad_norm": 1.453125, + "learning_rate": 5.265914510749676e-07, + "loss": 0.325, + "step": 21288 + }, + { + "epoch": 0.9348277111211126, + "grad_norm": 1.421875, + "learning_rate": 5.251799118464407e-07, + "loss": 0.311, + "step": 21290 + }, + { + "epoch": 0.9349155296003161, + "grad_norm": 1.5, + "learning_rate": 5.237702469055028e-07, + "loss": 0.3254, + "step": 21292 + }, + { + "epoch": 0.9350033480795197, + "grad_norm": 1.4296875, + "learning_rate": 5.223624563601065e-07, + "loss": 0.3102, + "step": 21294 + }, + { + "epoch": 0.9350911665587232, + "grad_norm": 1.5234375, + "learning_rate": 5.209565403180627e-07, + "loss": 0.3131, + "step": 21296 + }, + { + "epoch": 0.9351789850379266, + "grad_norm": 1.46875, + "learning_rate": 5.195524988870326e-07, + "loss": 0.3322, + "step": 21298 + }, + { + "epoch": 0.9352668035171301, + "grad_norm": 1.625, + "learning_rate": 5.18150332174544e-07, + "loss": 0.3213, + "step": 21300 + }, + { + "epoch": 0.9353546219963336, + "grad_norm": 1.4765625, + "learning_rate": 5.167500402879665e-07, + "loss": 0.2772, + "step": 21302 + }, + { + "epoch": 0.935442440475537, + "grad_norm": 1.421875, + "learning_rate": 5.153516233345451e-07, + "loss": 0.3016, + "step": 21304 + }, + { + "epoch": 0.9355302589547405, + "grad_norm": 1.4453125, + "learning_rate": 5.139550814213634e-07, + "loss": 0.2926, + "step": 21306 + }, + { + "epoch": 0.935618077433944, + "grad_norm": 1.5546875, + "learning_rate": 5.125604146553692e-07, + "loss": 0.3425, + "step": 21308 + }, + { + "epoch": 0.9357058959131476, + "grad_norm": 1.4921875, + "learning_rate": 5.111676231433715e-07, + "loss": 0.3165, + "step": 21310 + }, + { + "epoch": 0.935793714392351, + "grad_norm": 1.4140625, + "learning_rate": 5.097767069920267e-07, + "loss": 0.313, + "step": 21312 + }, + { + "epoch": 0.9358815328715545, + "grad_norm": 1.4375, + "learning_rate": 5.083876663078523e-07, + "loss": 0.3176, + "step": 21314 + }, + { + "epoch": 0.935969351350758, + "grad_norm": 1.4140625, + "learning_rate": 5.070005011972218e-07, + "loss": 0.3126, + "step": 21316 + }, + { + "epoch": 0.9360571698299615, + "grad_norm": 1.46875, + "learning_rate": 5.056152117663665e-07, + "loss": 0.3096, + "step": 21318 + }, + { + "epoch": 0.9361449883091649, + "grad_norm": 1.4609375, + "learning_rate": 5.042317981213684e-07, + "loss": 0.3051, + "step": 21320 + }, + { + "epoch": 0.9362328067883684, + "grad_norm": 1.46875, + "learning_rate": 5.028502603681678e-07, + "loss": 0.3365, + "step": 21322 + }, + { + "epoch": 0.936320625267572, + "grad_norm": 1.546875, + "learning_rate": 5.01470598612569e-07, + "loss": 0.3218, + "step": 21324 + }, + { + "epoch": 0.9364084437467755, + "grad_norm": 1.578125, + "learning_rate": 5.000928129602234e-07, + "loss": 0.2954, + "step": 21326 + }, + { + "epoch": 0.9364962622259789, + "grad_norm": 1.6875, + "learning_rate": 4.987169035166467e-07, + "loss": 0.3334, + "step": 21328 + }, + { + "epoch": 0.9365840807051824, + "grad_norm": 1.5, + "learning_rate": 4.973428703872018e-07, + "loss": 0.3243, + "step": 21330 + }, + { + "epoch": 0.9366718991843859, + "grad_norm": 1.453125, + "learning_rate": 4.959707136771103e-07, + "loss": 0.297, + "step": 21332 + }, + { + "epoch": 0.9367597176635893, + "grad_norm": 1.5546875, + "learning_rate": 4.9460043349146e-07, + "loss": 0.3222, + "step": 21334 + }, + { + "epoch": 0.9368475361427928, + "grad_norm": 1.5546875, + "learning_rate": 4.932320299351784e-07, + "loss": 0.3216, + "step": 21336 + }, + { + "epoch": 0.9369353546219963, + "grad_norm": 1.4921875, + "learning_rate": 4.918655031130648e-07, + "loss": 0.3081, + "step": 21338 + }, + { + "epoch": 0.9370231731011999, + "grad_norm": 1.4296875, + "learning_rate": 4.905008531297661e-07, + "loss": 0.3371, + "step": 21340 + }, + { + "epoch": 0.9371109915804033, + "grad_norm": 1.53125, + "learning_rate": 4.891380800897877e-07, + "loss": 0.3587, + "step": 21342 + }, + { + "epoch": 0.9371988100596068, + "grad_norm": 1.3984375, + "learning_rate": 4.877771840974904e-07, + "loss": 0.2993, + "step": 21344 + }, + { + "epoch": 0.9372866285388103, + "grad_norm": 1.546875, + "learning_rate": 4.86418165257091e-07, + "loss": 0.3142, + "step": 21346 + }, + { + "epoch": 0.9373744470180138, + "grad_norm": 1.4140625, + "learning_rate": 4.850610236726672e-07, + "loss": 0.3118, + "step": 21348 + }, + { + "epoch": 0.9374622654972172, + "grad_norm": 1.484375, + "learning_rate": 4.83705759448147e-07, + "loss": 0.2905, + "step": 21350 + }, + { + "epoch": 0.9375500839764207, + "grad_norm": 1.5390625, + "learning_rate": 4.823523726873169e-07, + "loss": 0.3095, + "step": 21352 + }, + { + "epoch": 0.9376379024556242, + "grad_norm": 1.46875, + "learning_rate": 4.810008634938163e-07, + "loss": 0.3168, + "step": 21354 + }, + { + "epoch": 0.9377257209348278, + "grad_norm": 1.4453125, + "learning_rate": 4.796512319711482e-07, + "loss": 0.3462, + "step": 21356 + }, + { + "epoch": 0.9378135394140312, + "grad_norm": 1.515625, + "learning_rate": 4.783034782226691e-07, + "loss": 0.3285, + "step": 21358 + }, + { + "epoch": 0.9379013578932347, + "grad_norm": 1.421875, + "learning_rate": 4.769576023515854e-07, + "loss": 0.2981, + "step": 21360 + }, + { + "epoch": 0.9379891763724382, + "grad_norm": 1.421875, + "learning_rate": 4.7561360446096714e-07, + "loss": 0.3216, + "step": 21362 + }, + { + "epoch": 0.9380769948516416, + "grad_norm": 1.421875, + "learning_rate": 4.742714846537377e-07, + "loss": 0.298, + "step": 21364 + }, + { + "epoch": 0.9381648133308451, + "grad_norm": 1.515625, + "learning_rate": 4.729312430326788e-07, + "loss": 0.3255, + "step": 21366 + }, + { + "epoch": 0.9382526318100486, + "grad_norm": 1.5546875, + "learning_rate": 4.7159287970042485e-07, + "loss": 0.3263, + "step": 21368 + }, + { + "epoch": 0.9383404502892522, + "grad_norm": 1.46875, + "learning_rate": 4.702563947594663e-07, + "loss": 0.3215, + "step": 21370 + }, + { + "epoch": 0.9384282687684556, + "grad_norm": 1.4375, + "learning_rate": 4.6892178831215437e-07, + "loss": 0.3175, + "step": 21372 + }, + { + "epoch": 0.9385160872476591, + "grad_norm": 1.40625, + "learning_rate": 4.6758906046069084e-07, + "loss": 0.3307, + "step": 21374 + }, + { + "epoch": 0.9386039057268626, + "grad_norm": 1.4609375, + "learning_rate": 4.662582113071412e-07, + "loss": 0.313, + "step": 21376 + }, + { + "epoch": 0.9386917242060661, + "grad_norm": 1.46875, + "learning_rate": 4.649292409534184e-07, + "loss": 0.2962, + "step": 21378 + }, + { + "epoch": 0.9387795426852695, + "grad_norm": 1.5, + "learning_rate": 4.6360214950129375e-07, + "loss": 0.3318, + "step": 21380 + }, + { + "epoch": 0.938867361164473, + "grad_norm": 1.4765625, + "learning_rate": 4.6227693705239993e-07, + "loss": 0.2868, + "step": 21382 + }, + { + "epoch": 0.9389551796436765, + "grad_norm": 1.515625, + "learning_rate": 4.6095360370821685e-07, + "loss": 0.3004, + "step": 21384 + }, + { + "epoch": 0.9390429981228801, + "grad_norm": 1.53125, + "learning_rate": 4.5963214957009394e-07, + "loss": 0.3312, + "step": 21386 + }, + { + "epoch": 0.9391308166020835, + "grad_norm": 1.3984375, + "learning_rate": 4.5831257473921973e-07, + "loss": 0.3139, + "step": 21388 + }, + { + "epoch": 0.939218635081287, + "grad_norm": 1.4765625, + "learning_rate": 4.569948793166551e-07, + "loss": 0.2907, + "step": 21390 + }, + { + "epoch": 0.9393064535604905, + "grad_norm": 1.4765625, + "learning_rate": 4.5567906340330537e-07, + "loss": 0.3269, + "step": 21392 + }, + { + "epoch": 0.939394272039694, + "grad_norm": 1.515625, + "learning_rate": 4.543651270999344e-07, + "loss": 0.3064, + "step": 21394 + }, + { + "epoch": 0.9394820905188974, + "grad_norm": 1.4765625, + "learning_rate": 4.530530705071673e-07, + "loss": 0.3008, + "step": 21396 + }, + { + "epoch": 0.9395699089981009, + "grad_norm": 1.375, + "learning_rate": 4.5174289372547917e-07, + "loss": 0.3103, + "step": 21398 + }, + { + "epoch": 0.9396577274773044, + "grad_norm": 1.421875, + "learning_rate": 4.504345968552065e-07, + "loss": 0.2845, + "step": 21400 + }, + { + "epoch": 0.9397455459565079, + "grad_norm": 1.3828125, + "learning_rate": 4.4912817999653576e-07, + "loss": 0.3474, + "step": 21402 + }, + { + "epoch": 0.9398333644357114, + "grad_norm": 1.484375, + "learning_rate": 4.4782364324951476e-07, + "loss": 0.2995, + "step": 21404 + }, + { + "epoch": 0.9399211829149149, + "grad_norm": 1.5234375, + "learning_rate": 4.4652098671404685e-07, + "loss": 0.2957, + "step": 21406 + }, + { + "epoch": 0.9400090013941184, + "grad_norm": 1.46875, + "learning_rate": 4.452202104898828e-07, + "loss": 0.305, + "step": 21408 + }, + { + "epoch": 0.9400968198733218, + "grad_norm": 1.453125, + "learning_rate": 4.4392131467664853e-07, + "loss": 0.3167, + "step": 21410 + }, + { + "epoch": 0.9401846383525253, + "grad_norm": 1.5234375, + "learning_rate": 4.4262429937380057e-07, + "loss": 0.3332, + "step": 21412 + }, + { + "epoch": 0.9402724568317288, + "grad_norm": 1.46875, + "learning_rate": 4.4132916468067333e-07, + "loss": 0.2977, + "step": 21414 + }, + { + "epoch": 0.9403602753109322, + "grad_norm": 1.4609375, + "learning_rate": 4.400359106964458e-07, + "loss": 0.3122, + "step": 21416 + }, + { + "epoch": 0.9404480937901358, + "grad_norm": 1.484375, + "learning_rate": 4.387445375201527e-07, + "loss": 0.3073, + "step": 21418 + }, + { + "epoch": 0.9405359122693393, + "grad_norm": 1.4765625, + "learning_rate": 4.374550452506926e-07, + "loss": 0.3325, + "step": 21420 + }, + { + "epoch": 0.9406237307485428, + "grad_norm": 1.5, + "learning_rate": 4.3616743398681157e-07, + "loss": 0.283, + "step": 21422 + }, + { + "epoch": 0.9407115492277462, + "grad_norm": 1.484375, + "learning_rate": 4.348817038271197e-07, + "loss": 0.3195, + "step": 21424 + }, + { + "epoch": 0.9407993677069497, + "grad_norm": 1.453125, + "learning_rate": 4.335978548700742e-07, + "loss": 0.3162, + "step": 21426 + }, + { + "epoch": 0.9408871861861532, + "grad_norm": 1.4375, + "learning_rate": 4.323158872139937e-07, + "loss": 0.2864, + "step": 21428 + }, + { + "epoch": 0.9409750046653567, + "grad_norm": 1.453125, + "learning_rate": 4.3103580095705256e-07, + "loss": 0.3208, + "step": 21430 + }, + { + "epoch": 0.9410628231445602, + "grad_norm": 1.421875, + "learning_rate": 4.2975759619727775e-07, + "loss": 0.3209, + "step": 21432 + }, + { + "epoch": 0.9411506416237637, + "grad_norm": 1.40625, + "learning_rate": 4.284812730325577e-07, + "loss": 0.3148, + "step": 21434 + }, + { + "epoch": 0.9412384601029672, + "grad_norm": 1.5234375, + "learning_rate": 4.272068315606309e-07, + "loss": 0.3439, + "step": 21436 + }, + { + "epoch": 0.9413262785821707, + "grad_norm": 1.5, + "learning_rate": 4.2593427187909705e-07, + "loss": 0.2771, + "step": 21438 + }, + { + "epoch": 0.9414140970613741, + "grad_norm": 1.46875, + "learning_rate": 4.246635940854088e-07, + "loss": 0.3518, + "step": 21440 + }, + { + "epoch": 0.9415019155405776, + "grad_norm": 1.484375, + "learning_rate": 4.233947982768716e-07, + "loss": 0.3015, + "step": 21442 + }, + { + "epoch": 0.9415897340197811, + "grad_norm": 1.46875, + "learning_rate": 4.2212788455065213e-07, + "loss": 0.3217, + "step": 21444 + }, + { + "epoch": 0.9416775524989845, + "grad_norm": 1.4609375, + "learning_rate": 4.2086285300377004e-07, + "loss": 0.2966, + "step": 21446 + }, + { + "epoch": 0.9417653709781881, + "grad_norm": 1.4375, + "learning_rate": 4.195997037331034e-07, + "loss": 0.3075, + "step": 21448 + }, + { + "epoch": 0.9418531894573916, + "grad_norm": 1.3984375, + "learning_rate": 4.183384368353832e-07, + "loss": 0.3088, + "step": 21450 + }, + { + "epoch": 0.9419410079365951, + "grad_norm": 1.46875, + "learning_rate": 4.170790524071988e-07, + "loss": 0.3209, + "step": 21452 + }, + { + "epoch": 0.9420288264157985, + "grad_norm": 1.4140625, + "learning_rate": 4.158215505449953e-07, + "loss": 0.2918, + "step": 21454 + }, + { + "epoch": 0.942116644895002, + "grad_norm": 1.484375, + "learning_rate": 4.145659313450678e-07, + "loss": 0.3079, + "step": 21456 + }, + { + "epoch": 0.9422044633742055, + "grad_norm": 1.4921875, + "learning_rate": 4.1331219490357563e-07, + "loss": 0.298, + "step": 21458 + }, + { + "epoch": 0.942292281853409, + "grad_norm": 1.5234375, + "learning_rate": 4.12060341316528e-07, + "loss": 0.31, + "step": 21460 + }, + { + "epoch": 0.9423801003326124, + "grad_norm": 1.515625, + "learning_rate": 4.1081037067979553e-07, + "loss": 0.329, + "step": 21462 + }, + { + "epoch": 0.942467918811816, + "grad_norm": 1.5, + "learning_rate": 4.095622830891016e-07, + "loss": 0.3058, + "step": 21464 + }, + { + "epoch": 0.9425557372910195, + "grad_norm": 1.4609375, + "learning_rate": 4.08316078640017e-07, + "loss": 0.3173, + "step": 21466 + }, + { + "epoch": 0.942643555770223, + "grad_norm": 1.5859375, + "learning_rate": 4.070717574279875e-07, + "loss": 0.339, + "step": 21468 + }, + { + "epoch": 0.9427313742494264, + "grad_norm": 1.4375, + "learning_rate": 4.058293195482954e-07, + "loss": 0.3143, + "step": 21470 + }, + { + "epoch": 0.9428191927286299, + "grad_norm": 1.5, + "learning_rate": 4.045887650960922e-07, + "loss": 0.3114, + "step": 21472 + }, + { + "epoch": 0.9429070112078334, + "grad_norm": 1.4609375, + "learning_rate": 4.0335009416637426e-07, + "loss": 0.3175, + "step": 21474 + }, + { + "epoch": 0.9429948296870369, + "grad_norm": 1.484375, + "learning_rate": 4.021133068540045e-07, + "loss": 0.312, + "step": 21476 + }, + { + "epoch": 0.9430826481662404, + "grad_norm": 1.515625, + "learning_rate": 4.008784032536933e-07, + "loss": 0.3322, + "step": 21478 + }, + { + "epoch": 0.9431704666454439, + "grad_norm": 1.4609375, + "learning_rate": 3.9964538346000945e-07, + "loss": 0.2827, + "step": 21480 + }, + { + "epoch": 0.9432582851246474, + "grad_norm": 1.4375, + "learning_rate": 3.9841424756738023e-07, + "loss": 0.3179, + "step": 21482 + }, + { + "epoch": 0.9433461036038508, + "grad_norm": 1.6171875, + "learning_rate": 3.9718499567008573e-07, + "loss": 0.3429, + "step": 21484 + }, + { + "epoch": 0.9434339220830543, + "grad_norm": 1.46875, + "learning_rate": 3.959576278622618e-07, + "loss": 0.3006, + "step": 21486 + }, + { + "epoch": 0.9435217405622578, + "grad_norm": 1.5234375, + "learning_rate": 3.9473214423789983e-07, + "loss": 0.3245, + "step": 21488 + }, + { + "epoch": 0.9436095590414613, + "grad_norm": 1.4765625, + "learning_rate": 3.9350854489084985e-07, + "loss": 0.3254, + "step": 21490 + }, + { + "epoch": 0.9436973775206647, + "grad_norm": 1.4609375, + "learning_rate": 3.922868299148119e-07, + "loss": 0.3228, + "step": 21492 + }, + { + "epoch": 0.9437851959998683, + "grad_norm": 1.4921875, + "learning_rate": 3.9106699940335004e-07, + "loss": 0.3083, + "step": 21494 + }, + { + "epoch": 0.9438730144790718, + "grad_norm": 1.4765625, + "learning_rate": 3.898490534498755e-07, + "loss": 0.2993, + "step": 21496 + }, + { + "epoch": 0.9439608329582753, + "grad_norm": 1.453125, + "learning_rate": 3.8863299214765834e-07, + "loss": 0.3122, + "step": 21498 + }, + { + "epoch": 0.9440486514374787, + "grad_norm": 1.4296875, + "learning_rate": 3.874188155898295e-07, + "loss": 0.3229, + "step": 21500 + }, + { + "epoch": 0.9441364699166822, + "grad_norm": 1.5390625, + "learning_rate": 3.862065238693674e-07, + "loss": 0.3165, + "step": 21502 + }, + { + "epoch": 0.9442242883958857, + "grad_norm": 1.4921875, + "learning_rate": 3.8499611707910887e-07, + "loss": 0.3188, + "step": 21504 + }, + { + "epoch": 0.9443121068750892, + "grad_norm": 1.5234375, + "learning_rate": 3.8378759531174924e-07, + "loss": 0.2803, + "step": 21506 + }, + { + "epoch": 0.9443999253542926, + "grad_norm": 1.421875, + "learning_rate": 3.825809586598339e-07, + "loss": 0.3319, + "step": 21508 + }, + { + "epoch": 0.9444877438334962, + "grad_norm": 1.484375, + "learning_rate": 3.8137620721577227e-07, + "loss": 0.3344, + "step": 21510 + }, + { + "epoch": 0.9445755623126997, + "grad_norm": 1.4609375, + "learning_rate": 3.8017334107182113e-07, + "loss": 0.2868, + "step": 21512 + }, + { + "epoch": 0.9446633807919032, + "grad_norm": 1.4140625, + "learning_rate": 3.789723603200984e-07, + "loss": 0.2926, + "step": 21514 + }, + { + "epoch": 0.9447511992711066, + "grad_norm": 1.40625, + "learning_rate": 3.7777326505257503e-07, + "loss": 0.2918, + "step": 21516 + }, + { + "epoch": 0.9448390177503101, + "grad_norm": 1.546875, + "learning_rate": 3.765760553610748e-07, + "loss": 0.2923, + "step": 21518 + }, + { + "epoch": 0.9449268362295136, + "grad_norm": 1.5390625, + "learning_rate": 3.7538073133728256e-07, + "loss": 0.3119, + "step": 21520 + }, + { + "epoch": 0.945014654708717, + "grad_norm": 1.53125, + "learning_rate": 3.7418729307273913e-07, + "loss": 0.3021, + "step": 21522 + }, + { + "epoch": 0.9451024731879206, + "grad_norm": 1.4921875, + "learning_rate": 3.7299574065883527e-07, + "loss": 0.3275, + "step": 21524 + }, + { + "epoch": 0.9451902916671241, + "grad_norm": 1.4921875, + "learning_rate": 3.718060741868229e-07, + "loss": 0.3329, + "step": 21526 + }, + { + "epoch": 0.9452781101463276, + "grad_norm": 1.5390625, + "learning_rate": 3.706182937478014e-07, + "loss": 0.3302, + "step": 21528 + }, + { + "epoch": 0.945365928625531, + "grad_norm": 1.4765625, + "learning_rate": 3.69432399432737e-07, + "loss": 0.3176, + "step": 21530 + }, + { + "epoch": 0.9454537471047345, + "grad_norm": 1.484375, + "learning_rate": 3.6824839133244303e-07, + "loss": 0.313, + "step": 21532 + }, + { + "epoch": 0.945541565583938, + "grad_norm": 1.484375, + "learning_rate": 3.6706626953759427e-07, + "loss": 0.3476, + "step": 21534 + }, + { + "epoch": 0.9456293840631415, + "grad_norm": 1.4296875, + "learning_rate": 3.6588603413871267e-07, + "loss": 0.321, + "step": 21536 + }, + { + "epoch": 0.9457172025423449, + "grad_norm": 1.5234375, + "learning_rate": 3.6470768522618713e-07, + "loss": 0.3198, + "step": 21538 + }, + { + "epoch": 0.9458050210215485, + "grad_norm": 1.4296875, + "learning_rate": 3.6353122289025096e-07, + "loss": 0.3369, + "step": 21540 + }, + { + "epoch": 0.945892839500752, + "grad_norm": 1.53125, + "learning_rate": 3.623566472209988e-07, + "loss": 0.304, + "step": 21542 + }, + { + "epoch": 0.9459806579799555, + "grad_norm": 1.4453125, + "learning_rate": 3.6118395830838095e-07, + "loss": 0.3101, + "step": 21544 + }, + { + "epoch": 0.9460684764591589, + "grad_norm": 1.46875, + "learning_rate": 3.6001315624220046e-07, + "loss": 0.3317, + "step": 21546 + }, + { + "epoch": 0.9461562949383624, + "grad_norm": 1.5078125, + "learning_rate": 3.588442411121218e-07, + "loss": 0.2873, + "step": 21548 + }, + { + "epoch": 0.9462441134175659, + "grad_norm": 1.4296875, + "learning_rate": 3.5767721300765666e-07, + "loss": 0.3251, + "step": 21550 + }, + { + "epoch": 0.9463319318967693, + "grad_norm": 1.5, + "learning_rate": 3.5651207201817527e-07, + "loss": 0.3384, + "step": 21552 + }, + { + "epoch": 0.9464197503759728, + "grad_norm": 1.4296875, + "learning_rate": 3.553488182329118e-07, + "loss": 0.3002, + "step": 21554 + }, + { + "epoch": 0.9465075688551764, + "grad_norm": 1.4921875, + "learning_rate": 3.5418745174093936e-07, + "loss": 0.3427, + "step": 21556 + }, + { + "epoch": 0.9465953873343799, + "grad_norm": 1.546875, + "learning_rate": 3.530279726312008e-07, + "loss": 0.3356, + "step": 21558 + }, + { + "epoch": 0.9466832058135833, + "grad_norm": 1.4296875, + "learning_rate": 3.5187038099248893e-07, + "loss": 0.2836, + "step": 21560 + }, + { + "epoch": 0.9467710242927868, + "grad_norm": 1.4609375, + "learning_rate": 3.5071467691345226e-07, + "loss": 0.3062, + "step": 21562 + }, + { + "epoch": 0.9468588427719903, + "grad_norm": 1.6328125, + "learning_rate": 3.495608604825951e-07, + "loss": 0.297, + "step": 21564 + }, + { + "epoch": 0.9469466612511938, + "grad_norm": 1.4453125, + "learning_rate": 3.4840893178827715e-07, + "loss": 0.3281, + "step": 21566 + }, + { + "epoch": 0.9470344797303972, + "grad_norm": 1.6015625, + "learning_rate": 3.472588909187113e-07, + "loss": 0.3245, + "step": 21568 + }, + { + "epoch": 0.9471222982096008, + "grad_norm": 1.46875, + "learning_rate": 3.461107379619688e-07, + "loss": 0.3271, + "step": 21570 + }, + { + "epoch": 0.9472101166888043, + "grad_norm": 1.453125, + "learning_rate": 3.4496447300597647e-07, + "loss": 0.297, + "step": 21572 + }, + { + "epoch": 0.9472979351680078, + "grad_norm": 1.5, + "learning_rate": 3.438200961385141e-07, + "loss": 0.2927, + "step": 21574 + }, + { + "epoch": 0.9473857536472112, + "grad_norm": 1.453125, + "learning_rate": 3.426776074472199e-07, + "loss": 0.3157, + "step": 21576 + }, + { + "epoch": 0.9474735721264147, + "grad_norm": 1.5234375, + "learning_rate": 3.41537007019585e-07, + "loss": 0.3015, + "step": 21578 + }, + { + "epoch": 0.9475613906056182, + "grad_norm": 1.5234375, + "learning_rate": 3.4039829494295626e-07, + "loss": 0.3131, + "step": 21580 + }, + { + "epoch": 0.9476492090848216, + "grad_norm": 1.546875, + "learning_rate": 3.392614713045389e-07, + "loss": 0.3216, + "step": 21582 + }, + { + "epoch": 0.9477370275640251, + "grad_norm": 1.4921875, + "learning_rate": 3.381265361913882e-07, + "loss": 0.3391, + "step": 21584 + }, + { + "epoch": 0.9478248460432287, + "grad_norm": 1.5859375, + "learning_rate": 3.369934896904209e-07, + "loss": 0.2709, + "step": 21586 + }, + { + "epoch": 0.9479126645224322, + "grad_norm": 1.5703125, + "learning_rate": 3.3586233188840355e-07, + "loss": 0.2907, + "step": 21588 + }, + { + "epoch": 0.9480004830016356, + "grad_norm": 1.4375, + "learning_rate": 3.3473306287196136e-07, + "loss": 0.3163, + "step": 21590 + }, + { + "epoch": 0.9480883014808391, + "grad_norm": 1.453125, + "learning_rate": 3.336056827275752e-07, + "loss": 0.2975, + "step": 21592 + }, + { + "epoch": 0.9481761199600426, + "grad_norm": 1.6015625, + "learning_rate": 3.3248019154157595e-07, + "loss": 0.3388, + "step": 21594 + }, + { + "epoch": 0.9482639384392461, + "grad_norm": 1.5390625, + "learning_rate": 3.3135658940015857e-07, + "loss": 0.2906, + "step": 21596 + }, + { + "epoch": 0.9483517569184495, + "grad_norm": 1.453125, + "learning_rate": 3.302348763893681e-07, + "loss": 0.3181, + "step": 21598 + }, + { + "epoch": 0.948439575397653, + "grad_norm": 1.4609375, + "learning_rate": 3.2911505259510255e-07, + "loss": 0.3341, + "step": 21600 + }, + { + "epoch": 0.9485273938768566, + "grad_norm": 1.421875, + "learning_rate": 3.2799711810312107e-07, + "loss": 0.3411, + "step": 21602 + }, + { + "epoch": 0.9486152123560601, + "grad_norm": 1.5625, + "learning_rate": 3.268810729990329e-07, + "loss": 0.3265, + "step": 21604 + }, + { + "epoch": 0.9487030308352635, + "grad_norm": 1.453125, + "learning_rate": 3.2576691736831144e-07, + "loss": 0.3259, + "step": 21606 + }, + { + "epoch": 0.948790849314467, + "grad_norm": 1.53125, + "learning_rate": 3.246546512962689e-07, + "loss": 0.3057, + "step": 21608 + }, + { + "epoch": 0.9488786677936705, + "grad_norm": 1.4453125, + "learning_rate": 3.2354427486809e-07, + "loss": 0.3533, + "step": 21610 + }, + { + "epoch": 0.9489664862728739, + "grad_norm": 1.5234375, + "learning_rate": 3.224357881688067e-07, + "loss": 0.3012, + "step": 21612 + }, + { + "epoch": 0.9490543047520774, + "grad_norm": 1.453125, + "learning_rate": 3.2132919128330664e-07, + "loss": 0.3049, + "step": 21614 + }, + { + "epoch": 0.9491421232312809, + "grad_norm": 1.46875, + "learning_rate": 3.202244842963331e-07, + "loss": 0.2996, + "step": 21616 + }, + { + "epoch": 0.9492299417104845, + "grad_norm": 1.5703125, + "learning_rate": 3.191216672924824e-07, + "loss": 0.2839, + "step": 21618 + }, + { + "epoch": 0.9493177601896879, + "grad_norm": 1.4375, + "learning_rate": 3.180207403562119e-07, + "loss": 0.3177, + "step": 21620 + }, + { + "epoch": 0.9494055786688914, + "grad_norm": 1.53125, + "learning_rate": 3.1692170357183193e-07, + "loss": 0.3231, + "step": 21622 + }, + { + "epoch": 0.9494933971480949, + "grad_norm": 1.421875, + "learning_rate": 3.15824557023503e-07, + "loss": 0.3116, + "step": 21624 + }, + { + "epoch": 0.9495812156272984, + "grad_norm": 1.546875, + "learning_rate": 3.1472930079524674e-07, + "loss": 0.2935, + "step": 21626 + }, + { + "epoch": 0.9496690341065018, + "grad_norm": 1.5234375, + "learning_rate": 3.136359349709378e-07, + "loss": 0.3204, + "step": 21628 + }, + { + "epoch": 0.9497568525857053, + "grad_norm": 1.546875, + "learning_rate": 3.1254445963430914e-07, + "loss": 0.31, + "step": 21630 + }, + { + "epoch": 0.9498446710649089, + "grad_norm": 1.5, + "learning_rate": 3.114548748689411e-07, + "loss": 0.3214, + "step": 21632 + }, + { + "epoch": 0.9499324895441124, + "grad_norm": 1.4453125, + "learning_rate": 3.1036718075827806e-07, + "loss": 0.3303, + "step": 21634 + }, + { + "epoch": 0.9500203080233158, + "grad_norm": 1.4453125, + "learning_rate": 3.092813773856118e-07, + "loss": 0.3243, + "step": 21636 + }, + { + "epoch": 0.9501081265025193, + "grad_norm": 1.5078125, + "learning_rate": 3.0819746483410075e-07, + "loss": 0.317, + "step": 21638 + }, + { + "epoch": 0.9501959449817228, + "grad_norm": 1.5078125, + "learning_rate": 3.0711544318674514e-07, + "loss": 0.3354, + "step": 21640 + }, + { + "epoch": 0.9502837634609262, + "grad_norm": 1.46875, + "learning_rate": 3.060353125264065e-07, + "loss": 0.2811, + "step": 21642 + }, + { + "epoch": 0.9503715819401297, + "grad_norm": 1.4375, + "learning_rate": 3.049570729358076e-07, + "loss": 0.3008, + "step": 21644 + }, + { + "epoch": 0.9504594004193332, + "grad_norm": 1.453125, + "learning_rate": 3.0388072449751e-07, + "loss": 0.2968, + "step": 21646 + }, + { + "epoch": 0.9505472188985368, + "grad_norm": 1.515625, + "learning_rate": 3.028062672939508e-07, + "loss": 0.3307, + "step": 21648 + }, + { + "epoch": 0.9506350373777402, + "grad_norm": 1.484375, + "learning_rate": 3.017337014074084e-07, + "loss": 0.3145, + "step": 21650 + }, + { + "epoch": 0.9507228558569437, + "grad_norm": 1.4296875, + "learning_rate": 3.0066302692001724e-07, + "loss": 0.3367, + "step": 21652 + }, + { + "epoch": 0.9508106743361472, + "grad_norm": 1.5078125, + "learning_rate": 2.995942439137728e-07, + "loss": 0.3307, + "step": 21654 + }, + { + "epoch": 0.9508984928153507, + "grad_norm": 1.5, + "learning_rate": 2.9852735247052346e-07, + "loss": 0.3209, + "step": 21656 + }, + { + "epoch": 0.9509863112945541, + "grad_norm": 1.4609375, + "learning_rate": 2.9746235267197053e-07, + "loss": 0.3235, + "step": 21658 + }, + { + "epoch": 0.9510741297737576, + "grad_norm": 1.421875, + "learning_rate": 2.9639924459967105e-07, + "loss": 0.3129, + "step": 21660 + }, + { + "epoch": 0.9511619482529611, + "grad_norm": 1.4921875, + "learning_rate": 2.9533802833504043e-07, + "loss": 0.3338, + "step": 21662 + }, + { + "epoch": 0.9512497667321647, + "grad_norm": 1.46875, + "learning_rate": 2.94278703959347e-07, + "loss": 0.3112, + "step": 21664 + }, + { + "epoch": 0.9513375852113681, + "grad_norm": 1.4375, + "learning_rate": 2.932212715537092e-07, + "loss": 0.311, + "step": 21666 + }, + { + "epoch": 0.9514254036905716, + "grad_norm": 1.3828125, + "learning_rate": 2.9216573119911217e-07, + "loss": 0.3196, + "step": 21668 + }, + { + "epoch": 0.9515132221697751, + "grad_norm": 1.5078125, + "learning_rate": 2.9111208297638303e-07, + "loss": 0.2995, + "step": 21670 + }, + { + "epoch": 0.9516010406489785, + "grad_norm": 1.5390625, + "learning_rate": 2.9006032696621833e-07, + "loss": 0.3024, + "step": 21672 + }, + { + "epoch": 0.951688859128182, + "grad_norm": 1.5078125, + "learning_rate": 2.890104632491536e-07, + "loss": 0.3242, + "step": 21674 + }, + { + "epoch": 0.9517766776073855, + "grad_norm": 1.4140625, + "learning_rate": 2.879624919055912e-07, + "loss": 0.3083, + "step": 21676 + }, + { + "epoch": 0.9518644960865891, + "grad_norm": 1.640625, + "learning_rate": 2.869164130157864e-07, + "loss": 0.3083, + "step": 21678 + }, + { + "epoch": 0.9519523145657925, + "grad_norm": 1.4609375, + "learning_rate": 2.858722266598474e-07, + "loss": 0.287, + "step": 21680 + }, + { + "epoch": 0.952040133044996, + "grad_norm": 1.3984375, + "learning_rate": 2.8482993291773506e-07, + "loss": 0.3142, + "step": 21682 + }, + { + "epoch": 0.9521279515241995, + "grad_norm": 1.5546875, + "learning_rate": 2.8378953186927457e-07, + "loss": 0.3063, + "step": 21684 + }, + { + "epoch": 0.952215770003403, + "grad_norm": 1.4765625, + "learning_rate": 2.827510235941355e-07, + "loss": 0.3172, + "step": 21686 + }, + { + "epoch": 0.9523035884826064, + "grad_norm": 1.484375, + "learning_rate": 2.817144081718459e-07, + "loss": 0.3375, + "step": 21688 + }, + { + "epoch": 0.9523914069618099, + "grad_norm": 1.5078125, + "learning_rate": 2.8067968568179505e-07, + "loss": 0.283, + "step": 21690 + }, + { + "epoch": 0.9524792254410134, + "grad_norm": 1.453125, + "learning_rate": 2.7964685620321953e-07, + "loss": 0.3213, + "step": 21692 + }, + { + "epoch": 0.952567043920217, + "grad_norm": 1.5234375, + "learning_rate": 2.786159198152116e-07, + "loss": 0.3321, + "step": 21694 + }, + { + "epoch": 0.9526548623994204, + "grad_norm": 1.46875, + "learning_rate": 2.775868765967221e-07, + "loss": 0.3309, + "step": 21696 + }, + { + "epoch": 0.9527426808786239, + "grad_norm": 1.4296875, + "learning_rate": 2.7655972662655736e-07, + "loss": 0.301, + "step": 21698 + }, + { + "epoch": 0.9528304993578274, + "grad_norm": 1.4296875, + "learning_rate": 2.755344699833767e-07, + "loss": 0.3061, + "step": 21700 + }, + { + "epoch": 0.9529183178370308, + "grad_norm": 1.5390625, + "learning_rate": 2.7451110674569237e-07, + "loss": 0.3429, + "step": 21702 + }, + { + "epoch": 0.9530061363162343, + "grad_norm": 1.4453125, + "learning_rate": 2.7348963699187214e-07, + "loss": 0.3205, + "step": 21704 + }, + { + "epoch": 0.9530939547954378, + "grad_norm": 1.4921875, + "learning_rate": 2.7247006080014513e-07, + "loss": 0.324, + "step": 21706 + }, + { + "epoch": 0.9531817732746413, + "grad_norm": 1.53125, + "learning_rate": 2.714523782485878e-07, + "loss": 0.3208, + "step": 21708 + }, + { + "epoch": 0.9532695917538448, + "grad_norm": 1.4921875, + "learning_rate": 2.7043658941513783e-07, + "loss": 0.3202, + "step": 21710 + }, + { + "epoch": 0.9533574102330483, + "grad_norm": 1.5390625, + "learning_rate": 2.6942269437758015e-07, + "loss": 0.2927, + "step": 21712 + }, + { + "epoch": 0.9534452287122518, + "grad_norm": 1.4765625, + "learning_rate": 2.6841069321355827e-07, + "loss": 0.337, + "step": 21714 + }, + { + "epoch": 0.9535330471914553, + "grad_norm": 1.46875, + "learning_rate": 2.674005860005768e-07, + "loss": 0.2998, + "step": 21716 + }, + { + "epoch": 0.9536208656706587, + "grad_norm": 1.515625, + "learning_rate": 2.6639237281598783e-07, + "loss": 0.2922, + "step": 21718 + }, + { + "epoch": 0.9537086841498622, + "grad_norm": 1.40625, + "learning_rate": 2.6538605373699897e-07, + "loss": 0.2955, + "step": 21720 + }, + { + "epoch": 0.9537965026290657, + "grad_norm": 1.46875, + "learning_rate": 2.6438162884067365e-07, + "loss": 0.3054, + "step": 21722 + }, + { + "epoch": 0.9538843211082693, + "grad_norm": 1.53125, + "learning_rate": 2.6337909820393634e-07, + "loss": 0.326, + "step": 21724 + }, + { + "epoch": 0.9539721395874727, + "grad_norm": 1.546875, + "learning_rate": 2.623784619035535e-07, + "loss": 0.2911, + "step": 21726 + }, + { + "epoch": 0.9540599580666762, + "grad_norm": 1.46875, + "learning_rate": 2.613797200161611e-07, + "loss": 0.3035, + "step": 21728 + }, + { + "epoch": 0.9541477765458797, + "grad_norm": 1.4921875, + "learning_rate": 2.6038287261823944e-07, + "loss": 0.3146, + "step": 21730 + }, + { + "epoch": 0.9542355950250831, + "grad_norm": 1.4375, + "learning_rate": 2.593879197861249e-07, + "loss": 0.298, + "step": 21732 + }, + { + "epoch": 0.9543234135042866, + "grad_norm": 1.484375, + "learning_rate": 2.5839486159601746e-07, + "loss": 0.2863, + "step": 21734 + }, + { + "epoch": 0.9544112319834901, + "grad_norm": 1.4609375, + "learning_rate": 2.5740369812396193e-07, + "loss": 0.2981, + "step": 21736 + }, + { + "epoch": 0.9544990504626936, + "grad_norm": 1.4453125, + "learning_rate": 2.5641442944586144e-07, + "loss": 0.2854, + "step": 21738 + }, + { + "epoch": 0.9545868689418971, + "grad_norm": 1.515625, + "learning_rate": 2.554270556374777e-07, + "loss": 0.2901, + "step": 21740 + }, + { + "epoch": 0.9546746874211006, + "grad_norm": 1.453125, + "learning_rate": 2.544415767744196e-07, + "loss": 0.3035, + "step": 21742 + }, + { + "epoch": 0.9547625059003041, + "grad_norm": 1.4765625, + "learning_rate": 2.5345799293215734e-07, + "loss": 0.3136, + "step": 21744 + }, + { + "epoch": 0.9548503243795076, + "grad_norm": 1.546875, + "learning_rate": 2.5247630418601673e-07, + "loss": 0.3432, + "step": 21746 + }, + { + "epoch": 0.954938142858711, + "grad_norm": 1.46875, + "learning_rate": 2.5149651061117105e-07, + "loss": 0.3208, + "step": 21748 + }, + { + "epoch": 0.9550259613379145, + "grad_norm": 1.515625, + "learning_rate": 2.505186122826547e-07, + "loss": 0.3199, + "step": 21750 + }, + { + "epoch": 0.955113779817118, + "grad_norm": 1.53125, + "learning_rate": 2.495426092753578e-07, + "loss": 0.32, + "step": 21752 + }, + { + "epoch": 0.9552015982963215, + "grad_norm": 1.4921875, + "learning_rate": 2.485685016640177e-07, + "loss": 0.3055, + "step": 21754 + }, + { + "epoch": 0.955289416775525, + "grad_norm": 1.421875, + "learning_rate": 2.4759628952323867e-07, + "loss": 0.2988, + "step": 21756 + }, + { + "epoch": 0.9553772352547285, + "grad_norm": 1.484375, + "learning_rate": 2.466259729274667e-07, + "loss": 0.3319, + "step": 21758 + }, + { + "epoch": 0.955465053733932, + "grad_norm": 1.4375, + "learning_rate": 2.456575519510118e-07, + "loss": 0.3322, + "step": 21760 + }, + { + "epoch": 0.9555528722131355, + "grad_norm": 1.421875, + "learning_rate": 2.446910266680369e-07, + "loss": 0.3233, + "step": 21762 + }, + { + "epoch": 0.9556406906923389, + "grad_norm": 1.421875, + "learning_rate": 2.4372639715255776e-07, + "loss": 0.3157, + "step": 21764 + }, + { + "epoch": 0.9557285091715424, + "grad_norm": 1.5625, + "learning_rate": 2.4276366347844305e-07, + "loss": 0.2984, + "step": 21766 + }, + { + "epoch": 0.9558163276507459, + "grad_norm": 1.484375, + "learning_rate": 2.4180282571942546e-07, + "loss": 0.3368, + "step": 21768 + }, + { + "epoch": 0.9559041461299493, + "grad_norm": 1.4296875, + "learning_rate": 2.4084388394907954e-07, + "loss": 0.3091, + "step": 21770 + }, + { + "epoch": 0.9559919646091529, + "grad_norm": 1.6796875, + "learning_rate": 2.398868382408437e-07, + "loss": 0.3215, + "step": 21772 + }, + { + "epoch": 0.9560797830883564, + "grad_norm": 1.484375, + "learning_rate": 2.389316886680121e-07, + "loss": 0.315, + "step": 21774 + }, + { + "epoch": 0.9561676015675599, + "grad_norm": 1.4609375, + "learning_rate": 2.3797843530372344e-07, + "loss": 0.3483, + "step": 21776 + }, + { + "epoch": 0.9562554200467633, + "grad_norm": 1.5, + "learning_rate": 2.370270782209888e-07, + "loss": 0.3088, + "step": 21778 + }, + { + "epoch": 0.9563432385259668, + "grad_norm": 1.4765625, + "learning_rate": 2.360776174926499e-07, + "loss": 0.3143, + "step": 21780 + }, + { + "epoch": 0.9564310570051703, + "grad_norm": 1.4375, + "learning_rate": 2.3513005319142634e-07, + "loss": 0.3293, + "step": 21782 + }, + { + "epoch": 0.9565188754843738, + "grad_norm": 1.4765625, + "learning_rate": 2.3418438538987952e-07, + "loss": 0.3208, + "step": 21784 + }, + { + "epoch": 0.9566066939635773, + "grad_norm": 1.4609375, + "learning_rate": 2.332406141604293e-07, + "loss": 0.3239, + "step": 21786 + }, + { + "epoch": 0.9566945124427808, + "grad_norm": 1.546875, + "learning_rate": 2.3229873957534841e-07, + "loss": 0.3204, + "step": 21788 + }, + { + "epoch": 0.9567823309219843, + "grad_norm": 1.46875, + "learning_rate": 2.3135876170676806e-07, + "loss": 0.3359, + "step": 21790 + }, + { + "epoch": 0.9568701494011878, + "grad_norm": 1.5390625, + "learning_rate": 2.3042068062667232e-07, + "loss": 0.3261, + "step": 21792 + }, + { + "epoch": 0.9569579678803912, + "grad_norm": 1.53125, + "learning_rate": 2.294844964068954e-07, + "loss": 0.3228, + "step": 21794 + }, + { + "epoch": 0.9570457863595947, + "grad_norm": 1.484375, + "learning_rate": 2.2855020911913826e-07, + "loss": 0.3152, + "step": 21796 + }, + { + "epoch": 0.9571336048387982, + "grad_norm": 1.4921875, + "learning_rate": 2.2761781883494094e-07, + "loss": 0.316, + "step": 21798 + }, + { + "epoch": 0.9572214233180016, + "grad_norm": 1.46875, + "learning_rate": 2.2668732562571016e-07, + "loss": 0.283, + "step": 21800 + }, + { + "epoch": 0.9573092417972052, + "grad_norm": 1.484375, + "learning_rate": 2.2575872956270283e-07, + "loss": 0.3198, + "step": 21802 + }, + { + "epoch": 0.9573970602764087, + "grad_norm": 1.453125, + "learning_rate": 2.248320307170315e-07, + "loss": 0.3325, + "step": 21804 + }, + { + "epoch": 0.9574848787556122, + "grad_norm": 1.4296875, + "learning_rate": 2.2390722915966167e-07, + "loss": 0.3085, + "step": 21806 + }, + { + "epoch": 0.9575726972348156, + "grad_norm": 1.53125, + "learning_rate": 2.2298432496141441e-07, + "loss": 0.346, + "step": 21808 + }, + { + "epoch": 0.9576605157140191, + "grad_norm": 1.5, + "learning_rate": 2.2206331819296934e-07, + "loss": 0.3222, + "step": 21810 + }, + { + "epoch": 0.9577483341932226, + "grad_norm": 1.46875, + "learning_rate": 2.2114420892485333e-07, + "loss": 0.3223, + "step": 21812 + }, + { + "epoch": 0.957836152672426, + "grad_norm": 1.515625, + "learning_rate": 2.2022699722745454e-07, + "loss": 0.3287, + "step": 21814 + }, + { + "epoch": 0.9579239711516295, + "grad_norm": 1.515625, + "learning_rate": 2.1931168317101125e-07, + "loss": 0.3536, + "step": 21816 + }, + { + "epoch": 0.9580117896308331, + "grad_norm": 1.4140625, + "learning_rate": 2.1839826682562015e-07, + "loss": 0.3204, + "step": 21818 + }, + { + "epoch": 0.9580996081100366, + "grad_norm": 1.5, + "learning_rate": 2.1748674826123084e-07, + "loss": 0.3263, + "step": 21820 + }, + { + "epoch": 0.95818742658924, + "grad_norm": 1.546875, + "learning_rate": 2.165771275476458e-07, + "loss": 0.3258, + "step": 21822 + }, + { + "epoch": 0.9582752450684435, + "grad_norm": 1.5078125, + "learning_rate": 2.1566940475452602e-07, + "loss": 0.3304, + "step": 21824 + }, + { + "epoch": 0.958363063547647, + "grad_norm": 1.3984375, + "learning_rate": 2.147635799513853e-07, + "loss": 0.304, + "step": 21826 + }, + { + "epoch": 0.9584508820268505, + "grad_norm": 1.453125, + "learning_rate": 2.1385965320759038e-07, + "loss": 0.3063, + "step": 21828 + }, + { + "epoch": 0.9585387005060539, + "grad_norm": 1.5, + "learning_rate": 2.1295762459236368e-07, + "loss": 0.3192, + "step": 21830 + }, + { + "epoch": 0.9586265189852575, + "grad_norm": 1.4453125, + "learning_rate": 2.1205749417478604e-07, + "loss": 0.3364, + "step": 21832 + }, + { + "epoch": 0.958714337464461, + "grad_norm": 1.453125, + "learning_rate": 2.1115926202378565e-07, + "loss": 0.3031, + "step": 21834 + }, + { + "epoch": 0.9588021559436645, + "grad_norm": 1.546875, + "learning_rate": 2.1026292820815195e-07, + "loss": 0.3354, + "step": 21836 + }, + { + "epoch": 0.9588899744228679, + "grad_norm": 1.453125, + "learning_rate": 2.0936849279652727e-07, + "loss": 0.2824, + "step": 21838 + }, + { + "epoch": 0.9589777929020714, + "grad_norm": 1.4375, + "learning_rate": 2.0847595585740676e-07, + "loss": 0.3315, + "step": 21840 + }, + { + "epoch": 0.9590656113812749, + "grad_norm": 1.5078125, + "learning_rate": 2.075853174591358e-07, + "loss": 0.288, + "step": 21842 + }, + { + "epoch": 0.9591534298604784, + "grad_norm": 1.4765625, + "learning_rate": 2.0669657766992923e-07, + "loss": 0.317, + "step": 21844 + }, + { + "epoch": 0.9592412483396818, + "grad_norm": 1.3984375, + "learning_rate": 2.058097365578382e-07, + "loss": 0.2892, + "step": 21846 + }, + { + "epoch": 0.9593290668188854, + "grad_norm": 1.4765625, + "learning_rate": 2.0492479419078336e-07, + "loss": 0.3132, + "step": 21848 + }, + { + "epoch": 0.9594168852980889, + "grad_norm": 1.5390625, + "learning_rate": 2.0404175063653275e-07, + "loss": 0.3096, + "step": 21850 + }, + { + "epoch": 0.9595047037772924, + "grad_norm": 1.4453125, + "learning_rate": 2.0316060596270726e-07, + "loss": 0.3124, + "step": 21852 + }, + { + "epoch": 0.9595925222564958, + "grad_norm": 1.5859375, + "learning_rate": 2.0228136023678623e-07, + "loss": 0.3051, + "step": 21854 + }, + { + "epoch": 0.9596803407356993, + "grad_norm": 1.4765625, + "learning_rate": 2.0140401352610195e-07, + "loss": 0.2949, + "step": 21856 + }, + { + "epoch": 0.9597681592149028, + "grad_norm": 1.46875, + "learning_rate": 2.0052856589784507e-07, + "loss": 0.3101, + "step": 21858 + }, + { + "epoch": 0.9598559776941062, + "grad_norm": 1.4453125, + "learning_rate": 1.9965501741905645e-07, + "loss": 0.302, + "step": 21860 + }, + { + "epoch": 0.9599437961733097, + "grad_norm": 1.5078125, + "learning_rate": 1.9878336815662978e-07, + "loss": 0.2926, + "step": 21862 + }, + { + "epoch": 0.9600316146525133, + "grad_norm": 1.5078125, + "learning_rate": 1.9791361817732002e-07, + "loss": 0.3298, + "step": 21864 + }, + { + "epoch": 0.9601194331317168, + "grad_norm": 1.578125, + "learning_rate": 1.9704576754772663e-07, + "loss": 0.3247, + "step": 21866 + }, + { + "epoch": 0.9602072516109202, + "grad_norm": 1.5546875, + "learning_rate": 1.961798163343187e-07, + "loss": 0.2966, + "step": 21868 + }, + { + "epoch": 0.9602950700901237, + "grad_norm": 1.515625, + "learning_rate": 1.953157646034043e-07, + "loss": 0.3175, + "step": 21870 + }, + { + "epoch": 0.9603828885693272, + "grad_norm": 1.453125, + "learning_rate": 1.9445361242115545e-07, + "loss": 0.3292, + "step": 21872 + }, + { + "epoch": 0.9604707070485307, + "grad_norm": 1.5078125, + "learning_rate": 1.935933598535944e-07, + "loss": 0.3042, + "step": 21874 + }, + { + "epoch": 0.9605585255277341, + "grad_norm": 1.4296875, + "learning_rate": 1.9273500696659896e-07, + "loss": 0.3043, + "step": 21876 + }, + { + "epoch": 0.9606463440069377, + "grad_norm": 1.4765625, + "learning_rate": 1.9187855382590547e-07, + "loss": 0.3103, + "step": 21878 + }, + { + "epoch": 0.9607341624861412, + "grad_norm": 1.4375, + "learning_rate": 1.9102400049710035e-07, + "loss": 0.3412, + "step": 21880 + }, + { + "epoch": 0.9608219809653447, + "grad_norm": 1.4765625, + "learning_rate": 1.901713470456229e-07, + "loss": 0.3092, + "step": 21882 + }, + { + "epoch": 0.9609097994445481, + "grad_norm": 1.5859375, + "learning_rate": 1.893205935367709e-07, + "loss": 0.3328, + "step": 21884 + }, + { + "epoch": 0.9609976179237516, + "grad_norm": 1.4140625, + "learning_rate": 1.88471740035695e-07, + "loss": 0.3059, + "step": 21886 + }, + { + "epoch": 0.9610854364029551, + "grad_norm": 1.5078125, + "learning_rate": 1.8762478660740156e-07, + "loss": 0.3126, + "step": 21888 + }, + { + "epoch": 0.9611732548821585, + "grad_norm": 1.4140625, + "learning_rate": 1.8677973331674982e-07, + "loss": 0.3218, + "step": 21890 + }, + { + "epoch": 0.961261073361362, + "grad_norm": 1.3984375, + "learning_rate": 1.8593658022845462e-07, + "loss": 0.2991, + "step": 21892 + }, + { + "epoch": 0.9613488918405656, + "grad_norm": 1.453125, + "learning_rate": 1.8509532740708102e-07, + "loss": 0.3121, + "step": 21894 + }, + { + "epoch": 0.9614367103197691, + "grad_norm": 1.5078125, + "learning_rate": 1.842559749170608e-07, + "loss": 0.2948, + "step": 21896 + }, + { + "epoch": 0.9615245287989725, + "grad_norm": 1.5, + "learning_rate": 1.834185228226648e-07, + "loss": 0.3117, + "step": 21898 + }, + { + "epoch": 0.961612347278176, + "grad_norm": 1.5, + "learning_rate": 1.8258297118802502e-07, + "loss": 0.3318, + "step": 21900 + }, + { + "epoch": 0.9617001657573795, + "grad_norm": 1.453125, + "learning_rate": 1.8174932007713476e-07, + "loss": 0.3209, + "step": 21902 + }, + { + "epoch": 0.961787984236583, + "grad_norm": 1.515625, + "learning_rate": 1.80917569553829e-07, + "loss": 0.3037, + "step": 21904 + }, + { + "epoch": 0.9618758027157864, + "grad_norm": 1.4296875, + "learning_rate": 1.8008771968180403e-07, + "loss": 0.2937, + "step": 21906 + }, + { + "epoch": 0.9619636211949899, + "grad_norm": 1.5390625, + "learning_rate": 1.7925977052461186e-07, + "loss": 0.3217, + "step": 21908 + }, + { + "epoch": 0.9620514396741935, + "grad_norm": 1.4921875, + "learning_rate": 1.7843372214565723e-07, + "loss": 0.3387, + "step": 21910 + }, + { + "epoch": 0.962139258153397, + "grad_norm": 1.46875, + "learning_rate": 1.7760957460819793e-07, + "loss": 0.3066, + "step": 21912 + }, + { + "epoch": 0.9622270766326004, + "grad_norm": 1.5078125, + "learning_rate": 1.7678732797534735e-07, + "loss": 0.3127, + "step": 21914 + }, + { + "epoch": 0.9623148951118039, + "grad_norm": 1.515625, + "learning_rate": 1.7596698231007459e-07, + "loss": 0.3274, + "step": 21916 + }, + { + "epoch": 0.9624027135910074, + "grad_norm": 1.4765625, + "learning_rate": 1.7514853767519878e-07, + "loss": 0.3352, + "step": 21918 + }, + { + "epoch": 0.9624905320702108, + "grad_norm": 1.5, + "learning_rate": 1.7433199413340317e-07, + "loss": 0.2812, + "step": 21920 + }, + { + "epoch": 0.9625783505494143, + "grad_norm": 1.4140625, + "learning_rate": 1.7351735174721274e-07, + "loss": 0.286, + "step": 21922 + }, + { + "epoch": 0.9626661690286179, + "grad_norm": 1.4375, + "learning_rate": 1.7270461057901367e-07, + "loss": 0.3296, + "step": 21924 + }, + { + "epoch": 0.9627539875078214, + "grad_norm": 1.4140625, + "learning_rate": 1.7189377069104784e-07, + "loss": 0.306, + "step": 21926 + }, + { + "epoch": 0.9628418059870248, + "grad_norm": 1.4765625, + "learning_rate": 1.7108483214540726e-07, + "loss": 0.3067, + "step": 21928 + }, + { + "epoch": 0.9629296244662283, + "grad_norm": 1.46875, + "learning_rate": 1.702777950040424e-07, + "loss": 0.3172, + "step": 21930 + }, + { + "epoch": 0.9630174429454318, + "grad_norm": 1.5390625, + "learning_rate": 1.6947265932875655e-07, + "loss": 0.3347, + "step": 21932 + }, + { + "epoch": 0.9631052614246353, + "grad_norm": 1.4375, + "learning_rate": 1.6866942518120877e-07, + "loss": 0.2929, + "step": 21934 + }, + { + "epoch": 0.9631930799038387, + "grad_norm": 1.3984375, + "learning_rate": 1.6786809262290816e-07, + "loss": 0.3185, + "step": 21936 + }, + { + "epoch": 0.9632808983830422, + "grad_norm": 1.4609375, + "learning_rate": 1.6706866171521952e-07, + "loss": 0.3116, + "step": 21938 + }, + { + "epoch": 0.9633687168622458, + "grad_norm": 1.484375, + "learning_rate": 1.662711325193689e-07, + "loss": 0.307, + "step": 21940 + }, + { + "epoch": 0.9634565353414493, + "grad_norm": 1.515625, + "learning_rate": 1.6547550509642406e-07, + "loss": 0.3371, + "step": 21942 + }, + { + "epoch": 0.9635443538206527, + "grad_norm": 1.421875, + "learning_rate": 1.6468177950731967e-07, + "loss": 0.3152, + "step": 21944 + }, + { + "epoch": 0.9636321722998562, + "grad_norm": 1.421875, + "learning_rate": 1.638899558128404e-07, + "loss": 0.317, + "step": 21946 + }, + { + "epoch": 0.9637199907790597, + "grad_norm": 1.4140625, + "learning_rate": 1.631000340736183e-07, + "loss": 0.3107, + "step": 21948 + }, + { + "epoch": 0.9638078092582631, + "grad_norm": 1.484375, + "learning_rate": 1.623120143501522e-07, + "loss": 0.3131, + "step": 21950 + }, + { + "epoch": 0.9638956277374666, + "grad_norm": 1.46875, + "learning_rate": 1.6152589670278552e-07, + "loss": 0.3149, + "step": 21952 + }, + { + "epoch": 0.9639834462166701, + "grad_norm": 1.5234375, + "learning_rate": 1.6074168119172006e-07, + "loss": 0.3243, + "step": 21954 + }, + { + "epoch": 0.9640712646958737, + "grad_norm": 1.3984375, + "learning_rate": 1.5995936787700782e-07, + "loss": 0.3016, + "step": 21956 + }, + { + "epoch": 0.9641590831750771, + "grad_norm": 1.4140625, + "learning_rate": 1.5917895681856475e-07, + "loss": 0.3022, + "step": 21958 + }, + { + "epoch": 0.9642469016542806, + "grad_norm": 1.4921875, + "learning_rate": 1.5840044807615138e-07, + "loss": 0.3238, + "step": 21960 + }, + { + "epoch": 0.9643347201334841, + "grad_norm": 1.453125, + "learning_rate": 1.5762384170938947e-07, + "loss": 0.3044, + "step": 21962 + }, + { + "epoch": 0.9644225386126876, + "grad_norm": 1.4765625, + "learning_rate": 1.5684913777774536e-07, + "loss": 0.3587, + "step": 21964 + }, + { + "epoch": 0.964510357091891, + "grad_norm": 1.46875, + "learning_rate": 1.560763363405493e-07, + "loss": 0.306, + "step": 21966 + }, + { + "epoch": 0.9645981755710945, + "grad_norm": 1.421875, + "learning_rate": 1.5530543745698457e-07, + "loss": 0.3222, + "step": 21968 + }, + { + "epoch": 0.964685994050298, + "grad_norm": 1.59375, + "learning_rate": 1.5453644118608447e-07, + "loss": 0.3017, + "step": 21970 + }, + { + "epoch": 0.9647738125295016, + "grad_norm": 1.546875, + "learning_rate": 1.5376934758674077e-07, + "loss": 0.3307, + "step": 21972 + }, + { + "epoch": 0.964861631008705, + "grad_norm": 1.453125, + "learning_rate": 1.530041567176954e-07, + "loss": 0.3025, + "step": 21974 + }, + { + "epoch": 0.9649494494879085, + "grad_norm": 1.4609375, + "learning_rate": 1.5224086863754594e-07, + "loss": 0.3043, + "step": 21976 + }, + { + "epoch": 0.965037267967112, + "grad_norm": 1.421875, + "learning_rate": 1.5147948340475115e-07, + "loss": 0.3172, + "step": 21978 + }, + { + "epoch": 0.9651250864463154, + "grad_norm": 1.484375, + "learning_rate": 1.5072000107761164e-07, + "loss": 0.3221, + "step": 21980 + }, + { + "epoch": 0.9652129049255189, + "grad_norm": 1.515625, + "learning_rate": 1.4996242171429197e-07, + "loss": 0.3245, + "step": 21982 + }, + { + "epoch": 0.9653007234047224, + "grad_norm": 1.3984375, + "learning_rate": 1.4920674537280688e-07, + "loss": 0.3153, + "step": 21984 + }, + { + "epoch": 0.965388541883926, + "grad_norm": 1.4609375, + "learning_rate": 1.4845297211102393e-07, + "loss": 0.3186, + "step": 21986 + }, + { + "epoch": 0.9654763603631294, + "grad_norm": 1.328125, + "learning_rate": 1.4770110198667197e-07, + "loss": 0.3236, + "step": 21988 + }, + { + "epoch": 0.9655641788423329, + "grad_norm": 1.5703125, + "learning_rate": 1.4695113505732715e-07, + "loss": 0.3182, + "step": 21990 + }, + { + "epoch": 0.9656519973215364, + "grad_norm": 1.5, + "learning_rate": 1.4620307138042412e-07, + "loss": 0.303, + "step": 21992 + }, + { + "epoch": 0.9657398158007399, + "grad_norm": 1.5546875, + "learning_rate": 1.4545691101324476e-07, + "loss": 0.337, + "step": 21994 + }, + { + "epoch": 0.9658276342799433, + "grad_norm": 1.5625, + "learning_rate": 1.447126540129351e-07, + "loss": 0.3146, + "step": 21996 + }, + { + "epoch": 0.9659154527591468, + "grad_norm": 1.4453125, + "learning_rate": 1.439703004364884e-07, + "loss": 0.3166, + "step": 21998 + }, + { + "epoch": 0.9660032712383503, + "grad_norm": 1.421875, + "learning_rate": 1.4322985034075366e-07, + "loss": 0.2911, + "step": 22000 + } + ], + "logging_steps": 2, + "max_steps": 22774, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "total_flos": 1.498954685106157e+20, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}