diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,42028 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.21271221366941864, + "eval_steps": 1000, + "global_step": 12000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.7726017805784885e-05, + "grad_norm": 636.0, + "learning_rate": 2.9411764705882356e-07, + "loss": 14.7402, + "step": 1 + }, + { + "epoch": 3.545203561156977e-05, + "grad_norm": 620.0, + "learning_rate": 5.882352941176471e-07, + "loss": 13.8226, + "step": 2 + }, + { + "epoch": 7.090407122313954e-05, + "grad_norm": 648.0, + "learning_rate": 1.1764705882352942e-06, + "loss": 14.3719, + "step": 4 + }, + { + "epoch": 0.00010635610683470932, + "grad_norm": 556.0, + "learning_rate": 1.7647058823529412e-06, + "loss": 13.7975, + "step": 6 + }, + { + "epoch": 0.00014180814244627908, + "grad_norm": 672.0, + "learning_rate": 2.3529411764705885e-06, + "loss": 14.2378, + "step": 8 + }, + { + "epoch": 0.00017726017805784887, + "grad_norm": 552.0, + "learning_rate": 2.9411764705882355e-06, + "loss": 13.8208, + "step": 10 + }, + { + "epoch": 0.00021271221366941863, + "grad_norm": 492.0, + "learning_rate": 3.5294117647058825e-06, + "loss": 12.2904, + "step": 12 + }, + { + "epoch": 0.0002481642492809884, + "grad_norm": 450.0, + "learning_rate": 4.11764705882353e-06, + "loss": 12.2658, + "step": 14 + }, + { + "epoch": 0.00028361628489255816, + "grad_norm": 338.0, + "learning_rate": 4.705882352941177e-06, + "loss": 10.4614, + "step": 16 + }, + { + "epoch": 0.00031906832050412795, + "grad_norm": 209.0, + "learning_rate": 5.294117647058824e-06, + "loss": 8.4051, + "step": 18 + }, + { + "epoch": 0.00035452035611569774, + "grad_norm": 125.5, + "learning_rate": 5.882352941176471e-06, + "loss": 7.5553, + "step": 20 + }, + { + "epoch": 0.00038997239172726747, + "grad_norm": 90.0, + "learning_rate": 6.470588235294119e-06, + "loss": 7.013, + "step": 22 + }, + { + "epoch": 0.00042542442733883726, + "grad_norm": 81.0, + "learning_rate": 7.058823529411765e-06, + "loss": 6.7415, + "step": 24 + }, + { + "epoch": 0.00046087646295040705, + "grad_norm": 103.0, + "learning_rate": 7.647058823529413e-06, + "loss": 6.0122, + "step": 26 + }, + { + "epoch": 0.0004963284985619768, + "grad_norm": 108.0, + "learning_rate": 8.23529411764706e-06, + "loss": 5.1164, + "step": 28 + }, + { + "epoch": 0.0005317805341735466, + "grad_norm": 68.5, + "learning_rate": 8.823529411764707e-06, + "loss": 4.0865, + "step": 30 + }, + { + "epoch": 0.0005672325697851163, + "grad_norm": 70.0, + "learning_rate": 9.411764705882354e-06, + "loss": 3.4705, + "step": 32 + }, + { + "epoch": 0.0006026846053966862, + "grad_norm": 56.0, + "learning_rate": 1e-05, + "loss": 2.6308, + "step": 34 + }, + { + "epoch": 0.0006381366410082559, + "grad_norm": 19.5, + "learning_rate": 1.0588235294117648e-05, + "loss": 2.1415, + "step": 36 + }, + { + "epoch": 0.0006735886766198256, + "grad_norm": 13.4375, + "learning_rate": 1.1176470588235295e-05, + "loss": 1.9987, + "step": 38 + }, + { + "epoch": 0.0007090407122313955, + "grad_norm": 13.625, + "learning_rate": 1.1764705882352942e-05, + "loss": 1.8561, + "step": 40 + }, + { + "epoch": 0.0007444927478429652, + "grad_norm": 10.875, + "learning_rate": 1.2352941176470589e-05, + "loss": 1.8186, + "step": 42 + }, + { + "epoch": 0.0007799447834545349, + "grad_norm": 6.8125, + "learning_rate": 1.2941176470588238e-05, + "loss": 1.7927, + "step": 44 + }, + { + "epoch": 0.0008153968190661048, + "grad_norm": 8.4375, + "learning_rate": 1.3529411764705883e-05, + "loss": 1.6379, + "step": 46 + }, + { + "epoch": 0.0008508488546776745, + "grad_norm": 5.5, + "learning_rate": 1.411764705882353e-05, + "loss": 1.5964, + "step": 48 + }, + { + "epoch": 0.0008863008902892443, + "grad_norm": 5.3125, + "learning_rate": 1.4705882352941177e-05, + "loss": 1.4906, + "step": 50 + }, + { + "epoch": 0.0009217529259008141, + "grad_norm": 5.59375, + "learning_rate": 1.5294117647058826e-05, + "loss": 1.4529, + "step": 52 + }, + { + "epoch": 0.0009572049615123838, + "grad_norm": 4.46875, + "learning_rate": 1.588235294117647e-05, + "loss": 1.4715, + "step": 54 + }, + { + "epoch": 0.0009926569971239537, + "grad_norm": 4.46875, + "learning_rate": 1.647058823529412e-05, + "loss": 1.3749, + "step": 56 + }, + { + "epoch": 0.0010281090327355234, + "grad_norm": 3.0625, + "learning_rate": 1.7058823529411767e-05, + "loss": 1.3041, + "step": 58 + }, + { + "epoch": 0.0010635610683470932, + "grad_norm": 3.015625, + "learning_rate": 1.7647058823529414e-05, + "loss": 1.2544, + "step": 60 + }, + { + "epoch": 0.0010990131039586629, + "grad_norm": 3.40625, + "learning_rate": 1.8235294117647057e-05, + "loss": 1.3137, + "step": 62 + }, + { + "epoch": 0.0011344651395702326, + "grad_norm": 3.15625, + "learning_rate": 1.8823529411764708e-05, + "loss": 1.2781, + "step": 64 + }, + { + "epoch": 0.0011699171751818026, + "grad_norm": 3.40625, + "learning_rate": 1.9411764705882355e-05, + "loss": 1.2188, + "step": 66 + }, + { + "epoch": 0.0012053692107933723, + "grad_norm": 3.984375, + "learning_rate": 2e-05, + "loss": 1.2637, + "step": 68 + }, + { + "epoch": 0.001240821246404942, + "grad_norm": 3.625, + "learning_rate": 2.058823529411765e-05, + "loss": 1.2316, + "step": 70 + }, + { + "epoch": 0.0012762732820165118, + "grad_norm": 4.0625, + "learning_rate": 2.1176470588235296e-05, + "loss": 1.1779, + "step": 72 + }, + { + "epoch": 0.0013117253176280815, + "grad_norm": 3.28125, + "learning_rate": 2.1764705882352943e-05, + "loss": 1.214, + "step": 74 + }, + { + "epoch": 0.0013471773532396513, + "grad_norm": 3.4375, + "learning_rate": 2.235294117647059e-05, + "loss": 1.3211, + "step": 76 + }, + { + "epoch": 0.0013826293888512212, + "grad_norm": 3.765625, + "learning_rate": 2.2941176470588237e-05, + "loss": 1.1991, + "step": 78 + }, + { + "epoch": 0.001418081424462791, + "grad_norm": 3.15625, + "learning_rate": 2.3529411764705884e-05, + "loss": 1.234, + "step": 80 + }, + { + "epoch": 0.0014535334600743607, + "grad_norm": 3.09375, + "learning_rate": 2.411764705882353e-05, + "loss": 1.2325, + "step": 82 + }, + { + "epoch": 0.0014889854956859304, + "grad_norm": 3.796875, + "learning_rate": 2.4705882352941178e-05, + "loss": 1.2462, + "step": 84 + }, + { + "epoch": 0.0015244375312975001, + "grad_norm": 3.21875, + "learning_rate": 2.5294117647058825e-05, + "loss": 1.1721, + "step": 86 + }, + { + "epoch": 0.0015598895669090699, + "grad_norm": 5.9375, + "learning_rate": 2.5882352941176475e-05, + "loss": 1.2239, + "step": 88 + }, + { + "epoch": 0.0015953416025206398, + "grad_norm": 3.3125, + "learning_rate": 2.647058823529412e-05, + "loss": 1.1905, + "step": 90 + }, + { + "epoch": 0.0016307936381322096, + "grad_norm": 2.953125, + "learning_rate": 2.7058823529411766e-05, + "loss": 1.1956, + "step": 92 + }, + { + "epoch": 0.0016662456737437793, + "grad_norm": 3.71875, + "learning_rate": 2.7647058823529416e-05, + "loss": 1.1939, + "step": 94 + }, + { + "epoch": 0.001701697709355349, + "grad_norm": 3.625, + "learning_rate": 2.823529411764706e-05, + "loss": 1.1435, + "step": 96 + }, + { + "epoch": 0.0017371497449669188, + "grad_norm": 4.4375, + "learning_rate": 2.8823529411764703e-05, + "loss": 1.1985, + "step": 98 + }, + { + "epoch": 0.0017726017805784885, + "grad_norm": 3.890625, + "learning_rate": 2.9411764705882354e-05, + "loss": 1.1163, + "step": 100 + }, + { + "epoch": 0.0018080538161900585, + "grad_norm": 3.921875, + "learning_rate": 3e-05, + "loss": 1.1596, + "step": 102 + }, + { + "epoch": 0.0018435058518016282, + "grad_norm": 3.828125, + "learning_rate": 3.058823529411765e-05, + "loss": 1.2044, + "step": 104 + }, + { + "epoch": 0.001878957887413198, + "grad_norm": 3.390625, + "learning_rate": 3.11764705882353e-05, + "loss": 1.177, + "step": 106 + }, + { + "epoch": 0.0019144099230247677, + "grad_norm": 4.40625, + "learning_rate": 3.176470588235294e-05, + "loss": 1.099, + "step": 108 + }, + { + "epoch": 0.0019498619586363374, + "grad_norm": 3.703125, + "learning_rate": 3.235294117647059e-05, + "loss": 1.0945, + "step": 110 + }, + { + "epoch": 0.0019853139942479074, + "grad_norm": 3.84375, + "learning_rate": 3.294117647058824e-05, + "loss": 1.1515, + "step": 112 + }, + { + "epoch": 0.002020766029859477, + "grad_norm": 4.0, + "learning_rate": 3.352941176470588e-05, + "loss": 1.1436, + "step": 114 + }, + { + "epoch": 0.002056218065471047, + "grad_norm": 4.5, + "learning_rate": 3.411764705882353e-05, + "loss": 1.1481, + "step": 116 + }, + { + "epoch": 0.0020916701010826166, + "grad_norm": 3.6875, + "learning_rate": 3.470588235294118e-05, + "loss": 1.1345, + "step": 118 + }, + { + "epoch": 0.0021271221366941863, + "grad_norm": 4.75, + "learning_rate": 3.529411764705883e-05, + "loss": 1.1136, + "step": 120 + }, + { + "epoch": 0.002162574172305756, + "grad_norm": 3.203125, + "learning_rate": 3.5882352941176474e-05, + "loss": 1.1009, + "step": 122 + }, + { + "epoch": 0.0021980262079173258, + "grad_norm": 3.640625, + "learning_rate": 3.6470588235294114e-05, + "loss": 1.1125, + "step": 124 + }, + { + "epoch": 0.0022334782435288955, + "grad_norm": 3.5625, + "learning_rate": 3.705882352941177e-05, + "loss": 1.1209, + "step": 126 + }, + { + "epoch": 0.0022689302791404652, + "grad_norm": 4.5, + "learning_rate": 3.7647058823529415e-05, + "loss": 1.1285, + "step": 128 + }, + { + "epoch": 0.002304382314752035, + "grad_norm": 4.0, + "learning_rate": 3.8235294117647055e-05, + "loss": 1.1407, + "step": 130 + }, + { + "epoch": 0.002339834350363605, + "grad_norm": 3.78125, + "learning_rate": 3.882352941176471e-05, + "loss": 1.1664, + "step": 132 + }, + { + "epoch": 0.002375286385975175, + "grad_norm": 3.53125, + "learning_rate": 3.9411764705882356e-05, + "loss": 1.1043, + "step": 134 + }, + { + "epoch": 0.0024107384215867446, + "grad_norm": 3.71875, + "learning_rate": 4e-05, + "loss": 1.1204, + "step": 136 + }, + { + "epoch": 0.0024461904571983144, + "grad_norm": 3.546875, + "learning_rate": 4.058823529411765e-05, + "loss": 1.0994, + "step": 138 + }, + { + "epoch": 0.002481642492809884, + "grad_norm": 3.640625, + "learning_rate": 4.11764705882353e-05, + "loss": 1.0434, + "step": 140 + }, + { + "epoch": 0.002517094528421454, + "grad_norm": 3.515625, + "learning_rate": 4.1764705882352944e-05, + "loss": 1.0621, + "step": 142 + }, + { + "epoch": 0.0025525465640330236, + "grad_norm": 3.6875, + "learning_rate": 4.235294117647059e-05, + "loss": 1.1062, + "step": 144 + }, + { + "epoch": 0.0025879985996445933, + "grad_norm": 3.40625, + "learning_rate": 4.294117647058823e-05, + "loss": 1.0814, + "step": 146 + }, + { + "epoch": 0.002623450635256163, + "grad_norm": 3.203125, + "learning_rate": 4.3529411764705885e-05, + "loss": 1.0785, + "step": 148 + }, + { + "epoch": 0.0026589026708677328, + "grad_norm": 3.484375, + "learning_rate": 4.411764705882353e-05, + "loss": 1.0745, + "step": 150 + }, + { + "epoch": 0.0026943547064793025, + "grad_norm": 3.796875, + "learning_rate": 4.470588235294118e-05, + "loss": 1.1046, + "step": 152 + }, + { + "epoch": 0.0027298067420908722, + "grad_norm": 3.734375, + "learning_rate": 4.5294117647058826e-05, + "loss": 1.0708, + "step": 154 + }, + { + "epoch": 0.0027652587777024424, + "grad_norm": 3.84375, + "learning_rate": 4.588235294117647e-05, + "loss": 1.1357, + "step": 156 + }, + { + "epoch": 0.002800710813314012, + "grad_norm": 3.859375, + "learning_rate": 4.647058823529412e-05, + "loss": 1.1327, + "step": 158 + }, + { + "epoch": 0.002836162848925582, + "grad_norm": 3.6875, + "learning_rate": 4.705882352941177e-05, + "loss": 1.06, + "step": 160 + }, + { + "epoch": 0.0028716148845371516, + "grad_norm": 3.640625, + "learning_rate": 4.7647058823529414e-05, + "loss": 1.0476, + "step": 162 + }, + { + "epoch": 0.0029070669201487214, + "grad_norm": 5.15625, + "learning_rate": 4.823529411764706e-05, + "loss": 1.0745, + "step": 164 + }, + { + "epoch": 0.002942518955760291, + "grad_norm": 3.171875, + "learning_rate": 4.882352941176471e-05, + "loss": 1.0779, + "step": 166 + }, + { + "epoch": 0.002977970991371861, + "grad_norm": 3.90625, + "learning_rate": 4.9411764705882355e-05, + "loss": 1.1239, + "step": 168 + }, + { + "epoch": 0.0030134230269834306, + "grad_norm": 3.59375, + "learning_rate": 5e-05, + "loss": 1.0765, + "step": 170 + }, + { + "epoch": 0.0030488750625950003, + "grad_norm": 5.0625, + "learning_rate": 4.999999984400261e-05, + "loss": 1.0344, + "step": 172 + }, + { + "epoch": 0.00308432709820657, + "grad_norm": 3.8125, + "learning_rate": 4.999999937601042e-05, + "loss": 1.1249, + "step": 174 + }, + { + "epoch": 0.0031197791338181398, + "grad_norm": 3.9375, + "learning_rate": 4.9999998596023444e-05, + "loss": 1.0789, + "step": 176 + }, + { + "epoch": 0.0031552311694297095, + "grad_norm": 3.46875, + "learning_rate": 4.9999997504041694e-05, + "loss": 1.0495, + "step": 178 + }, + { + "epoch": 0.0031906832050412797, + "grad_norm": 4.28125, + "learning_rate": 4.999999610006519e-05, + "loss": 1.0348, + "step": 180 + }, + { + "epoch": 0.0032261352406528494, + "grad_norm": 3.640625, + "learning_rate": 4.999999438409393e-05, + "loss": 1.0471, + "step": 182 + }, + { + "epoch": 0.003261587276264419, + "grad_norm": 3.546875, + "learning_rate": 4.9999992356127956e-05, + "loss": 1.0496, + "step": 184 + }, + { + "epoch": 0.003297039311875989, + "grad_norm": 4.125, + "learning_rate": 4.9999990016167286e-05, + "loss": 1.0611, + "step": 186 + }, + { + "epoch": 0.0033324913474875586, + "grad_norm": 3.328125, + "learning_rate": 4.999998736421194e-05, + "loss": 1.0915, + "step": 188 + }, + { + "epoch": 0.0033679433830991284, + "grad_norm": 3.78125, + "learning_rate": 4.999998440026197e-05, + "loss": 1.0573, + "step": 190 + }, + { + "epoch": 0.003403395418710698, + "grad_norm": 3.9375, + "learning_rate": 4.999998112431738e-05, + "loss": 1.0416, + "step": 192 + }, + { + "epoch": 0.003438847454322268, + "grad_norm": 3.703125, + "learning_rate": 4.999997753637825e-05, + "loss": 1.0162, + "step": 194 + }, + { + "epoch": 0.0034742994899338376, + "grad_norm": 3.046875, + "learning_rate": 4.999997363644461e-05, + "loss": 1.004, + "step": 196 + }, + { + "epoch": 0.0035097515255454073, + "grad_norm": 4.65625, + "learning_rate": 4.99999694245165e-05, + "loss": 1.0658, + "step": 198 + }, + { + "epoch": 0.003545203561156977, + "grad_norm": 3.65625, + "learning_rate": 4.9999964900593975e-05, + "loss": 0.9414, + "step": 200 + }, + { + "epoch": 0.0035806555967685468, + "grad_norm": 3.8125, + "learning_rate": 4.9999960064677104e-05, + "loss": 1.0537, + "step": 202 + }, + { + "epoch": 0.003616107632380117, + "grad_norm": 4.8125, + "learning_rate": 4.9999954916765934e-05, + "loss": 1.0669, + "step": 204 + }, + { + "epoch": 0.0036515596679916867, + "grad_norm": 3.703125, + "learning_rate": 4.999994945686053e-05, + "loss": 0.9318, + "step": 206 + }, + { + "epoch": 0.0036870117036032564, + "grad_norm": 3.71875, + "learning_rate": 4.999994368496097e-05, + "loss": 1.0402, + "step": 208 + }, + { + "epoch": 0.003722463739214826, + "grad_norm": 4.40625, + "learning_rate": 4.999993760106732e-05, + "loss": 1.0629, + "step": 210 + }, + { + "epoch": 0.003757915774826396, + "grad_norm": 3.796875, + "learning_rate": 4.999993120517965e-05, + "loss": 1.0532, + "step": 212 + }, + { + "epoch": 0.0037933678104379656, + "grad_norm": 3.3125, + "learning_rate": 4.999992449729806e-05, + "loss": 1.0498, + "step": 214 + }, + { + "epoch": 0.0038288198460495354, + "grad_norm": 3.28125, + "learning_rate": 4.9999917477422594e-05, + "loss": 0.9999, + "step": 216 + }, + { + "epoch": 0.003864271881661105, + "grad_norm": 3.859375, + "learning_rate": 4.9999910145553386e-05, + "loss": 1.0786, + "step": 218 + }, + { + "epoch": 0.003899723917272675, + "grad_norm": 3.46875, + "learning_rate": 4.99999025016905e-05, + "loss": 1.0175, + "step": 220 + }, + { + "epoch": 0.0039351759528842446, + "grad_norm": 3.640625, + "learning_rate": 4.9999894545834034e-05, + "loss": 1.0597, + "step": 222 + }, + { + "epoch": 0.003970627988495815, + "grad_norm": 3.8125, + "learning_rate": 4.99998862779841e-05, + "loss": 0.9901, + "step": 224 + }, + { + "epoch": 0.004006080024107384, + "grad_norm": 3.625, + "learning_rate": 4.9999877698140783e-05, + "loss": 1.053, + "step": 226 + }, + { + "epoch": 0.004041532059718954, + "grad_norm": 3.640625, + "learning_rate": 4.999986880630421e-05, + "loss": 1.0277, + "step": 228 + }, + { + "epoch": 0.0040769840953305235, + "grad_norm": 3.140625, + "learning_rate": 4.9999859602474474e-05, + "loss": 0.9884, + "step": 230 + }, + { + "epoch": 0.004112436130942094, + "grad_norm": 3.375, + "learning_rate": 4.999985008665169e-05, + "loss": 0.9866, + "step": 232 + }, + { + "epoch": 0.004147888166553663, + "grad_norm": 3.234375, + "learning_rate": 4.9999840258835994e-05, + "loss": 0.976, + "step": 234 + }, + { + "epoch": 0.004183340202165233, + "grad_norm": 3.328125, + "learning_rate": 4.99998301190275e-05, + "loss": 1.0003, + "step": 236 + }, + { + "epoch": 0.0042187922377768024, + "grad_norm": 3.40625, + "learning_rate": 4.999981966722633e-05, + "loss": 1.0255, + "step": 238 + }, + { + "epoch": 0.004254244273388373, + "grad_norm": 3.140625, + "learning_rate": 4.999980890343262e-05, + "loss": 1.0533, + "step": 240 + }, + { + "epoch": 0.004289696308999943, + "grad_norm": 3.15625, + "learning_rate": 4.99997978276465e-05, + "loss": 1.0061, + "step": 242 + }, + { + "epoch": 0.004325148344611512, + "grad_norm": 3.625, + "learning_rate": 4.999978643986811e-05, + "loss": 1.0097, + "step": 244 + }, + { + "epoch": 0.004360600380223082, + "grad_norm": 3.4375, + "learning_rate": 4.9999774740097597e-05, + "loss": 0.996, + "step": 246 + }, + { + "epoch": 0.0043960524158346516, + "grad_norm": 3.6875, + "learning_rate": 4.9999762728335094e-05, + "loss": 0.9974, + "step": 248 + }, + { + "epoch": 0.004431504451446222, + "grad_norm": 3.1875, + "learning_rate": 4.999975040458076e-05, + "loss": 0.9845, + "step": 250 + }, + { + "epoch": 0.004466956487057791, + "grad_norm": 3.46875, + "learning_rate": 4.999973776883475e-05, + "loss": 1.0087, + "step": 252 + }, + { + "epoch": 0.004502408522669361, + "grad_norm": 4.1875, + "learning_rate": 4.9999724821097226e-05, + "loss": 1.0533, + "step": 254 + }, + { + "epoch": 0.0045378605582809305, + "grad_norm": 3.984375, + "learning_rate": 4.999971156136833e-05, + "loss": 1.0019, + "step": 256 + }, + { + "epoch": 0.004573312593892501, + "grad_norm": 4.40625, + "learning_rate": 4.999969798964825e-05, + "loss": 1.0433, + "step": 258 + }, + { + "epoch": 0.00460876462950407, + "grad_norm": 3.34375, + "learning_rate": 4.999968410593715e-05, + "loss": 0.9849, + "step": 260 + }, + { + "epoch": 0.00464421666511564, + "grad_norm": 3.453125, + "learning_rate": 4.999966991023519e-05, + "loss": 0.9957, + "step": 262 + }, + { + "epoch": 0.00467966870072721, + "grad_norm": 3.296875, + "learning_rate": 4.999965540254257e-05, + "loss": 0.9741, + "step": 264 + }, + { + "epoch": 0.00471512073633878, + "grad_norm": 3.140625, + "learning_rate": 4.999964058285944e-05, + "loss": 1.0172, + "step": 266 + }, + { + "epoch": 0.00475057277195035, + "grad_norm": 4.0625, + "learning_rate": 4.9999625451186014e-05, + "loss": 1.0139, + "step": 268 + }, + { + "epoch": 0.004786024807561919, + "grad_norm": 3.875, + "learning_rate": 4.999961000752247e-05, + "loss": 1.0714, + "step": 270 + }, + { + "epoch": 0.004821476843173489, + "grad_norm": 3.640625, + "learning_rate": 4.999959425186899e-05, + "loss": 1.044, + "step": 272 + }, + { + "epoch": 0.0048569288787850586, + "grad_norm": 3.796875, + "learning_rate": 4.999957818422579e-05, + "loss": 0.9591, + "step": 274 + }, + { + "epoch": 0.004892380914396629, + "grad_norm": 4.09375, + "learning_rate": 4.999956180459306e-05, + "loss": 0.9922, + "step": 276 + }, + { + "epoch": 0.004927832950008198, + "grad_norm": 3.546875, + "learning_rate": 4.9999545112971004e-05, + "loss": 1.0256, + "step": 278 + }, + { + "epoch": 0.004963284985619768, + "grad_norm": 4.03125, + "learning_rate": 4.9999528109359825e-05, + "loss": 1.0298, + "step": 280 + }, + { + "epoch": 0.0049987370212313375, + "grad_norm": 3.546875, + "learning_rate": 4.999951079375976e-05, + "loss": 1.0022, + "step": 282 + }, + { + "epoch": 0.005034189056842908, + "grad_norm": 3.9375, + "learning_rate": 4.9999493166170994e-05, + "loss": 1.0502, + "step": 284 + }, + { + "epoch": 0.005069641092454477, + "grad_norm": 3.625, + "learning_rate": 4.999947522659376e-05, + "loss": 1.0071, + "step": 286 + }, + { + "epoch": 0.005105093128066047, + "grad_norm": 3.34375, + "learning_rate": 4.999945697502829e-05, + "loss": 1.0428, + "step": 288 + }, + { + "epoch": 0.005140545163677617, + "grad_norm": 3.46875, + "learning_rate": 4.9999438411474794e-05, + "loss": 1.0481, + "step": 290 + }, + { + "epoch": 0.005175997199289187, + "grad_norm": 3.265625, + "learning_rate": 4.999941953593352e-05, + "loss": 1.0106, + "step": 292 + }, + { + "epoch": 0.005211449234900757, + "grad_norm": 3.1875, + "learning_rate": 4.99994003484047e-05, + "loss": 0.9525, + "step": 294 + }, + { + "epoch": 0.005246901270512326, + "grad_norm": 3.4375, + "learning_rate": 4.9999380848888566e-05, + "loss": 0.9956, + "step": 296 + }, + { + "epoch": 0.005282353306123896, + "grad_norm": 3.609375, + "learning_rate": 4.9999361037385366e-05, + "loss": 1.002, + "step": 298 + }, + { + "epoch": 0.0053178053417354655, + "grad_norm": 4.1875, + "learning_rate": 4.9999340913895347e-05, + "loss": 1.0014, + "step": 300 + }, + { + "epoch": 0.005353257377347036, + "grad_norm": 3.640625, + "learning_rate": 4.9999320478418766e-05, + "loss": 1.0703, + "step": 302 + }, + { + "epoch": 0.005388709412958605, + "grad_norm": 3.40625, + "learning_rate": 4.999929973095586e-05, + "loss": 0.9922, + "step": 304 + }, + { + "epoch": 0.005424161448570175, + "grad_norm": 3.3125, + "learning_rate": 4.999927867150691e-05, + "loss": 1.0016, + "step": 306 + }, + { + "epoch": 0.0054596134841817445, + "grad_norm": 3.3125, + "learning_rate": 4.999925730007217e-05, + "loss": 1.0145, + "step": 308 + }, + { + "epoch": 0.005495065519793315, + "grad_norm": 3.453125, + "learning_rate": 4.999923561665191e-05, + "loss": 1.0087, + "step": 310 + }, + { + "epoch": 0.005530517555404885, + "grad_norm": 3.234375, + "learning_rate": 4.999921362124639e-05, + "loss": 0.9935, + "step": 312 + }, + { + "epoch": 0.005565969591016454, + "grad_norm": 3.28125, + "learning_rate": 4.9999191313855884e-05, + "loss": 1.013, + "step": 314 + }, + { + "epoch": 0.005601421626628024, + "grad_norm": 3.28125, + "learning_rate": 4.999916869448069e-05, + "loss": 0.9903, + "step": 316 + }, + { + "epoch": 0.005636873662239594, + "grad_norm": 3.265625, + "learning_rate": 4.999914576312107e-05, + "loss": 0.9984, + "step": 318 + }, + { + "epoch": 0.005672325697851164, + "grad_norm": 3.5625, + "learning_rate": 4.999912251977732e-05, + "loss": 1.0084, + "step": 320 + }, + { + "epoch": 0.005707777733462733, + "grad_norm": 3.390625, + "learning_rate": 4.999909896444972e-05, + "loss": 0.9749, + "step": 322 + }, + { + "epoch": 0.005743229769074303, + "grad_norm": 3.609375, + "learning_rate": 4.9999075097138584e-05, + "loss": 1.0102, + "step": 324 + }, + { + "epoch": 0.0057786818046858725, + "grad_norm": 3.453125, + "learning_rate": 4.9999050917844194e-05, + "loss": 0.9708, + "step": 326 + }, + { + "epoch": 0.005814133840297443, + "grad_norm": 3.453125, + "learning_rate": 4.9999026426566854e-05, + "loss": 1.0564, + "step": 328 + }, + { + "epoch": 0.005849585875909012, + "grad_norm": 3.609375, + "learning_rate": 4.9999001623306876e-05, + "loss": 0.9852, + "step": 330 + }, + { + "epoch": 0.005885037911520582, + "grad_norm": 3.8125, + "learning_rate": 4.999897650806455e-05, + "loss": 0.9472, + "step": 332 + }, + { + "epoch": 0.0059204899471321515, + "grad_norm": 3.59375, + "learning_rate": 4.9998951080840214e-05, + "loss": 0.9868, + "step": 334 + }, + { + "epoch": 0.005955941982743722, + "grad_norm": 3.546875, + "learning_rate": 4.9998925341634187e-05, + "loss": 1.0065, + "step": 336 + }, + { + "epoch": 0.005991394018355292, + "grad_norm": 3.40625, + "learning_rate": 4.999889929044676e-05, + "loss": 1.008, + "step": 338 + }, + { + "epoch": 0.006026846053966861, + "grad_norm": 3.8125, + "learning_rate": 4.9998872927278284e-05, + "loss": 1.0051, + "step": 340 + }, + { + "epoch": 0.006062298089578431, + "grad_norm": 3.640625, + "learning_rate": 4.9998846252129086e-05, + "loss": 0.9748, + "step": 342 + }, + { + "epoch": 0.006097750125190001, + "grad_norm": 3.53125, + "learning_rate": 4.999881926499949e-05, + "loss": 0.9838, + "step": 344 + }, + { + "epoch": 0.006133202160801571, + "grad_norm": 3.25, + "learning_rate": 4.999879196588983e-05, + "loss": 0.9938, + "step": 346 + }, + { + "epoch": 0.00616865419641314, + "grad_norm": 3.1875, + "learning_rate": 4.999876435480045e-05, + "loss": 0.9663, + "step": 348 + }, + { + "epoch": 0.00620410623202471, + "grad_norm": 3.75, + "learning_rate": 4.9998736431731715e-05, + "loss": 0.963, + "step": 350 + }, + { + "epoch": 0.0062395582676362795, + "grad_norm": 4.09375, + "learning_rate": 4.9998708196683945e-05, + "loss": 0.9871, + "step": 352 + }, + { + "epoch": 0.00627501030324785, + "grad_norm": 3.625, + "learning_rate": 4.999867964965751e-05, + "loss": 0.9627, + "step": 354 + }, + { + "epoch": 0.006310462338859419, + "grad_norm": 3.421875, + "learning_rate": 4.999865079065275e-05, + "loss": 1.0292, + "step": 356 + }, + { + "epoch": 0.006345914374470989, + "grad_norm": 3.234375, + "learning_rate": 4.9998621619670036e-05, + "loss": 0.9694, + "step": 358 + }, + { + "epoch": 0.006381366410082559, + "grad_norm": 3.40625, + "learning_rate": 4.9998592136709746e-05, + "loss": 0.9795, + "step": 360 + }, + { + "epoch": 0.006416818445694129, + "grad_norm": 3.6875, + "learning_rate": 4.999856234177221e-05, + "loss": 0.9889, + "step": 362 + }, + { + "epoch": 0.006452270481305699, + "grad_norm": 3.3125, + "learning_rate": 4.999853223485784e-05, + "loss": 0.9322, + "step": 364 + }, + { + "epoch": 0.006487722516917268, + "grad_norm": 3.515625, + "learning_rate": 4.999850181596699e-05, + "loss": 1.0098, + "step": 366 + }, + { + "epoch": 0.006523174552528838, + "grad_norm": 3.734375, + "learning_rate": 4.9998471085100045e-05, + "loss": 0.9977, + "step": 368 + }, + { + "epoch": 0.006558626588140408, + "grad_norm": 3.28125, + "learning_rate": 4.999844004225739e-05, + "loss": 0.9989, + "step": 370 + }, + { + "epoch": 0.006594078623751978, + "grad_norm": 3.046875, + "learning_rate": 4.9998408687439405e-05, + "loss": 0.9484, + "step": 372 + }, + { + "epoch": 0.006629530659363547, + "grad_norm": 2.921875, + "learning_rate": 4.999837702064649e-05, + "loss": 0.9547, + "step": 374 + }, + { + "epoch": 0.006664982694975117, + "grad_norm": 3.5, + "learning_rate": 4.999834504187904e-05, + "loss": 1.0299, + "step": 376 + }, + { + "epoch": 0.0067004347305866865, + "grad_norm": 3.171875, + "learning_rate": 4.999831275113744e-05, + "loss": 1.0, + "step": 378 + }, + { + "epoch": 0.006735886766198257, + "grad_norm": 3.453125, + "learning_rate": 4.9998280148422104e-05, + "loss": 1.0383, + "step": 380 + }, + { + "epoch": 0.006771338801809826, + "grad_norm": 3.5625, + "learning_rate": 4.999824723373345e-05, + "loss": 0.9557, + "step": 382 + }, + { + "epoch": 0.006806790837421396, + "grad_norm": 3.0625, + "learning_rate": 4.999821400707186e-05, + "loss": 0.9847, + "step": 384 + }, + { + "epoch": 0.006842242873032966, + "grad_norm": 3.28125, + "learning_rate": 4.9998180468437786e-05, + "loss": 0.9431, + "step": 386 + }, + { + "epoch": 0.006877694908644536, + "grad_norm": 3.9375, + "learning_rate": 4.9998146617831616e-05, + "loss": 1.0244, + "step": 388 + }, + { + "epoch": 0.006913146944256106, + "grad_norm": 2.75, + "learning_rate": 4.999811245525378e-05, + "loss": 0.9302, + "step": 390 + }, + { + "epoch": 0.006948598979867675, + "grad_norm": 3.078125, + "learning_rate": 4.9998077980704716e-05, + "loss": 0.9856, + "step": 392 + }, + { + "epoch": 0.006984051015479245, + "grad_norm": 2.90625, + "learning_rate": 4.999804319418484e-05, + "loss": 0.978, + "step": 394 + }, + { + "epoch": 0.007019503051090815, + "grad_norm": 3.71875, + "learning_rate": 4.999800809569459e-05, + "loss": 1.0063, + "step": 396 + }, + { + "epoch": 0.007054955086702385, + "grad_norm": 3.453125, + "learning_rate": 4.999797268523441e-05, + "loss": 1.0083, + "step": 398 + }, + { + "epoch": 0.007090407122313954, + "grad_norm": 3.53125, + "learning_rate": 4.9997936962804736e-05, + "loss": 1.0052, + "step": 400 + }, + { + "epoch": 0.007125859157925524, + "grad_norm": 3.21875, + "learning_rate": 4.999790092840602e-05, + "loss": 0.9596, + "step": 402 + }, + { + "epoch": 0.0071613111935370935, + "grad_norm": 3.015625, + "learning_rate": 4.99978645820387e-05, + "loss": 0.899, + "step": 404 + }, + { + "epoch": 0.007196763229148664, + "grad_norm": 3.21875, + "learning_rate": 4.999782792370323e-05, + "loss": 0.9701, + "step": 406 + }, + { + "epoch": 0.007232215264760234, + "grad_norm": 3.046875, + "learning_rate": 4.999779095340009e-05, + "loss": 0.9746, + "step": 408 + }, + { + "epoch": 0.007267667300371803, + "grad_norm": 2.984375, + "learning_rate": 4.999775367112972e-05, + "loss": 1.0319, + "step": 410 + }, + { + "epoch": 0.007303119335983373, + "grad_norm": 3.5625, + "learning_rate": 4.999771607689259e-05, + "loss": 0.9785, + "step": 412 + }, + { + "epoch": 0.007338571371594943, + "grad_norm": 3.296875, + "learning_rate": 4.9997678170689164e-05, + "loss": 0.968, + "step": 414 + }, + { + "epoch": 0.007374023407206513, + "grad_norm": 3.671875, + "learning_rate": 4.999763995251993e-05, + "loss": 0.9797, + "step": 416 + }, + { + "epoch": 0.007409475442818082, + "grad_norm": 3.125, + "learning_rate": 4.999760142238535e-05, + "loss": 1.0253, + "step": 418 + }, + { + "epoch": 0.007444927478429652, + "grad_norm": 3.34375, + "learning_rate": 4.9997562580285914e-05, + "loss": 0.9627, + "step": 420 + }, + { + "epoch": 0.007480379514041222, + "grad_norm": 3.125, + "learning_rate": 4.999752342622211e-05, + "loss": 0.9922, + "step": 422 + }, + { + "epoch": 0.007515831549652792, + "grad_norm": 3.53125, + "learning_rate": 4.9997483960194405e-05, + "loss": 0.973, + "step": 424 + }, + { + "epoch": 0.007551283585264361, + "grad_norm": 3.296875, + "learning_rate": 4.9997444182203316e-05, + "loss": 0.9356, + "step": 426 + }, + { + "epoch": 0.007586735620875931, + "grad_norm": 3.390625, + "learning_rate": 4.9997404092249336e-05, + "loss": 0.9873, + "step": 428 + }, + { + "epoch": 0.0076221876564875005, + "grad_norm": 3.25, + "learning_rate": 4.999736369033295e-05, + "loss": 0.952, + "step": 430 + }, + { + "epoch": 0.007657639692099071, + "grad_norm": 3.6875, + "learning_rate": 4.999732297645467e-05, + "loss": 0.9977, + "step": 432 + }, + { + "epoch": 0.007693091727710641, + "grad_norm": 3.578125, + "learning_rate": 4.999728195061502e-05, + "loss": 1.0054, + "step": 434 + }, + { + "epoch": 0.00772854376332221, + "grad_norm": 3.328125, + "learning_rate": 4.999724061281448e-05, + "loss": 0.9507, + "step": 436 + }, + { + "epoch": 0.00776399579893378, + "grad_norm": 3.0625, + "learning_rate": 4.9997198963053595e-05, + "loss": 0.9529, + "step": 438 + }, + { + "epoch": 0.00779944783454535, + "grad_norm": 2.96875, + "learning_rate": 4.999715700133287e-05, + "loss": 1.0338, + "step": 440 + }, + { + "epoch": 0.00783489987015692, + "grad_norm": 3.296875, + "learning_rate": 4.999711472765283e-05, + "loss": 0.9645, + "step": 442 + }, + { + "epoch": 0.007870351905768489, + "grad_norm": 3.3125, + "learning_rate": 4.999707214201401e-05, + "loss": 0.9345, + "step": 444 + }, + { + "epoch": 0.007905803941380058, + "grad_norm": 3.40625, + "learning_rate": 4.999702924441693e-05, + "loss": 0.9372, + "step": 446 + }, + { + "epoch": 0.00794125597699163, + "grad_norm": 3.578125, + "learning_rate": 4.999698603486214e-05, + "loss": 0.9688, + "step": 448 + }, + { + "epoch": 0.007976708012603199, + "grad_norm": 3.296875, + "learning_rate": 4.999694251335016e-05, + "loss": 0.9641, + "step": 450 + }, + { + "epoch": 0.008012160048214768, + "grad_norm": 3.125, + "learning_rate": 4.9996898679881556e-05, + "loss": 0.9635, + "step": 452 + }, + { + "epoch": 0.008047612083826337, + "grad_norm": 3.515625, + "learning_rate": 4.999685453445685e-05, + "loss": 0.9986, + "step": 454 + }, + { + "epoch": 0.008083064119437908, + "grad_norm": 3.484375, + "learning_rate": 4.999681007707662e-05, + "loss": 0.9567, + "step": 456 + }, + { + "epoch": 0.008118516155049478, + "grad_norm": 3.09375, + "learning_rate": 4.9996765307741394e-05, + "loss": 0.8916, + "step": 458 + }, + { + "epoch": 0.008153968190661047, + "grad_norm": 2.6875, + "learning_rate": 4.999672022645175e-05, + "loss": 0.9889, + "step": 460 + }, + { + "epoch": 0.008189420226272618, + "grad_norm": 2.953125, + "learning_rate": 4.999667483320825e-05, + "loss": 0.9913, + "step": 462 + }, + { + "epoch": 0.008224872261884187, + "grad_norm": 2.921875, + "learning_rate": 4.999662912801144e-05, + "loss": 0.9699, + "step": 464 + }, + { + "epoch": 0.008260324297495757, + "grad_norm": 3.0625, + "learning_rate": 4.999658311086191e-05, + "loss": 0.9943, + "step": 466 + }, + { + "epoch": 0.008295776333107326, + "grad_norm": 3.203125, + "learning_rate": 4.9996536781760236e-05, + "loss": 0.9744, + "step": 468 + }, + { + "epoch": 0.008331228368718897, + "grad_norm": 3.625, + "learning_rate": 4.999649014070698e-05, + "loss": 0.9958, + "step": 470 + }, + { + "epoch": 0.008366680404330466, + "grad_norm": 2.875, + "learning_rate": 4.999644318770274e-05, + "loss": 0.9638, + "step": 472 + }, + { + "epoch": 0.008402132439942036, + "grad_norm": 3.28125, + "learning_rate": 4.99963959227481e-05, + "loss": 0.9533, + "step": 474 + }, + { + "epoch": 0.008437584475553605, + "grad_norm": 3.171875, + "learning_rate": 4.999634834584363e-05, + "loss": 0.9881, + "step": 476 + }, + { + "epoch": 0.008473036511165176, + "grad_norm": 2.890625, + "learning_rate": 4.999630045698995e-05, + "loss": 0.9871, + "step": 478 + }, + { + "epoch": 0.008508488546776745, + "grad_norm": 2.90625, + "learning_rate": 4.9996252256187646e-05, + "loss": 0.9489, + "step": 480 + }, + { + "epoch": 0.008543940582388315, + "grad_norm": 3.046875, + "learning_rate": 4.999620374343732e-05, + "loss": 0.9673, + "step": 482 + }, + { + "epoch": 0.008579392617999886, + "grad_norm": 3.21875, + "learning_rate": 4.999615491873957e-05, + "loss": 0.9671, + "step": 484 + }, + { + "epoch": 0.008614844653611455, + "grad_norm": 3.5, + "learning_rate": 4.999610578209502e-05, + "loss": 0.9086, + "step": 486 + }, + { + "epoch": 0.008650296689223024, + "grad_norm": 3.390625, + "learning_rate": 4.9996056333504275e-05, + "loss": 0.9864, + "step": 488 + }, + { + "epoch": 0.008685748724834593, + "grad_norm": 3.0625, + "learning_rate": 4.999600657296796e-05, + "loss": 0.9359, + "step": 490 + }, + { + "epoch": 0.008721200760446165, + "grad_norm": 3.34375, + "learning_rate": 4.9995956500486676e-05, + "loss": 0.9322, + "step": 492 + }, + { + "epoch": 0.008756652796057734, + "grad_norm": 3.375, + "learning_rate": 4.999590611606107e-05, + "loss": 0.9402, + "step": 494 + }, + { + "epoch": 0.008792104831669303, + "grad_norm": 3.109375, + "learning_rate": 4.9995855419691765e-05, + "loss": 0.9617, + "step": 496 + }, + { + "epoch": 0.008827556867280872, + "grad_norm": 3.28125, + "learning_rate": 4.999580441137938e-05, + "loss": 0.9414, + "step": 498 + }, + { + "epoch": 0.008863008902892443, + "grad_norm": 3.46875, + "learning_rate": 4.999575309112456e-05, + "loss": 0.9602, + "step": 500 + }, + { + "epoch": 0.008898460938504013, + "grad_norm": 2.859375, + "learning_rate": 4.999570145892796e-05, + "loss": 0.9457, + "step": 502 + }, + { + "epoch": 0.008933912974115582, + "grad_norm": 3.1875, + "learning_rate": 4.999564951479021e-05, + "loss": 1.0017, + "step": 504 + }, + { + "epoch": 0.008969365009727153, + "grad_norm": 3.421875, + "learning_rate": 4.9995597258711954e-05, + "loss": 0.9839, + "step": 506 + }, + { + "epoch": 0.009004817045338722, + "grad_norm": 3.296875, + "learning_rate": 4.999554469069386e-05, + "loss": 0.9671, + "step": 508 + }, + { + "epoch": 0.009040269080950292, + "grad_norm": 3.125, + "learning_rate": 4.9995491810736564e-05, + "loss": 0.9471, + "step": 510 + }, + { + "epoch": 0.009075721116561861, + "grad_norm": 3.5, + "learning_rate": 4.999543861884074e-05, + "loss": 0.9322, + "step": 512 + }, + { + "epoch": 0.009111173152173432, + "grad_norm": 3.5, + "learning_rate": 4.9995385115007055e-05, + "loss": 0.9606, + "step": 514 + }, + { + "epoch": 0.009146625187785001, + "grad_norm": 3.03125, + "learning_rate": 4.999533129923616e-05, + "loss": 0.965, + "step": 516 + }, + { + "epoch": 0.00918207722339657, + "grad_norm": 3.140625, + "learning_rate": 4.999527717152874e-05, + "loss": 1.0181, + "step": 518 + }, + { + "epoch": 0.00921752925900814, + "grad_norm": 3.09375, + "learning_rate": 4.999522273188547e-05, + "loss": 0.9654, + "step": 520 + }, + { + "epoch": 0.009252981294619711, + "grad_norm": 3.109375, + "learning_rate": 4.9995167980307024e-05, + "loss": 1.0039, + "step": 522 + }, + { + "epoch": 0.00928843333023128, + "grad_norm": 3.0, + "learning_rate": 4.9995112916794084e-05, + "loss": 0.9443, + "step": 524 + }, + { + "epoch": 0.00932388536584285, + "grad_norm": 3.609375, + "learning_rate": 4.999505754134734e-05, + "loss": 0.9385, + "step": 526 + }, + { + "epoch": 0.00935933740145442, + "grad_norm": 3.125, + "learning_rate": 4.999500185396749e-05, + "loss": 0.9158, + "step": 528 + }, + { + "epoch": 0.00939478943706599, + "grad_norm": 2.859375, + "learning_rate": 4.999494585465523e-05, + "loss": 0.9583, + "step": 530 + }, + { + "epoch": 0.00943024147267756, + "grad_norm": 3.03125, + "learning_rate": 4.999488954341124e-05, + "loss": 0.9829, + "step": 532 + }, + { + "epoch": 0.009465693508289129, + "grad_norm": 3.1875, + "learning_rate": 4.999483292023624e-05, + "loss": 0.9174, + "step": 534 + }, + { + "epoch": 0.0095011455439007, + "grad_norm": 3.359375, + "learning_rate": 4.9994775985130925e-05, + "loss": 0.9316, + "step": 536 + }, + { + "epoch": 0.009536597579512269, + "grad_norm": 3.109375, + "learning_rate": 4.999471873809602e-05, + "loss": 1.006, + "step": 538 + }, + { + "epoch": 0.009572049615123838, + "grad_norm": 3.15625, + "learning_rate": 4.999466117913223e-05, + "loss": 1.008, + "step": 540 + }, + { + "epoch": 0.009607501650735407, + "grad_norm": 3.359375, + "learning_rate": 4.999460330824027e-05, + "loss": 0.9878, + "step": 542 + }, + { + "epoch": 0.009642953686346978, + "grad_norm": 3.0625, + "learning_rate": 4.999454512542087e-05, + "loss": 0.9953, + "step": 544 + }, + { + "epoch": 0.009678405721958548, + "grad_norm": 3.0625, + "learning_rate": 4.9994486630674744e-05, + "loss": 0.9758, + "step": 546 + }, + { + "epoch": 0.009713857757570117, + "grad_norm": 2.84375, + "learning_rate": 4.999442782400264e-05, + "loss": 0.9526, + "step": 548 + }, + { + "epoch": 0.009749309793181686, + "grad_norm": 3.4375, + "learning_rate": 4.999436870540528e-05, + "loss": 0.9034, + "step": 550 + }, + { + "epoch": 0.009784761828793257, + "grad_norm": 3.40625, + "learning_rate": 4.999430927488341e-05, + "loss": 0.9539, + "step": 552 + }, + { + "epoch": 0.009820213864404827, + "grad_norm": 3.078125, + "learning_rate": 4.999424953243776e-05, + "loss": 0.9947, + "step": 554 + }, + { + "epoch": 0.009855665900016396, + "grad_norm": 3.625, + "learning_rate": 4.999418947806908e-05, + "loss": 0.9475, + "step": 556 + }, + { + "epoch": 0.009891117935627967, + "grad_norm": 3.34375, + "learning_rate": 4.999412911177813e-05, + "loss": 0.9541, + "step": 558 + }, + { + "epoch": 0.009926569971239536, + "grad_norm": 3.75, + "learning_rate": 4.999406843356564e-05, + "loss": 0.9163, + "step": 560 + }, + { + "epoch": 0.009962022006851106, + "grad_norm": 3.28125, + "learning_rate": 4.99940074434324e-05, + "loss": 0.932, + "step": 562 + }, + { + "epoch": 0.009997474042462675, + "grad_norm": 3.734375, + "learning_rate": 4.9993946141379145e-05, + "loss": 0.9848, + "step": 564 + }, + { + "epoch": 0.010032926078074246, + "grad_norm": 3.5625, + "learning_rate": 4.999388452740664e-05, + "loss": 0.9937, + "step": 566 + }, + { + "epoch": 0.010068378113685815, + "grad_norm": 3.515625, + "learning_rate": 4.999382260151567e-05, + "loss": 0.9689, + "step": 568 + }, + { + "epoch": 0.010103830149297385, + "grad_norm": 3.21875, + "learning_rate": 4.9993760363706996e-05, + "loss": 0.9488, + "step": 570 + }, + { + "epoch": 0.010139282184908954, + "grad_norm": 3.15625, + "learning_rate": 4.9993697813981404e-05, + "loss": 0.9778, + "step": 572 + }, + { + "epoch": 0.010174734220520525, + "grad_norm": 3.109375, + "learning_rate": 4.999363495233966e-05, + "loss": 0.9597, + "step": 574 + }, + { + "epoch": 0.010210186256132094, + "grad_norm": 3.125, + "learning_rate": 4.9993571778782565e-05, + "loss": 0.968, + "step": 576 + }, + { + "epoch": 0.010245638291743664, + "grad_norm": 3.25, + "learning_rate": 4.9993508293310905e-05, + "loss": 0.9588, + "step": 578 + }, + { + "epoch": 0.010281090327355235, + "grad_norm": 2.734375, + "learning_rate": 4.999344449592546e-05, + "loss": 0.9334, + "step": 580 + }, + { + "epoch": 0.010316542362966804, + "grad_norm": 3.25, + "learning_rate": 4.999338038662703e-05, + "loss": 0.975, + "step": 582 + }, + { + "epoch": 0.010351994398578373, + "grad_norm": 3.28125, + "learning_rate": 4.999331596541643e-05, + "loss": 0.9033, + "step": 584 + }, + { + "epoch": 0.010387446434189943, + "grad_norm": 3.609375, + "learning_rate": 4.999325123229444e-05, + "loss": 0.9773, + "step": 586 + }, + { + "epoch": 0.010422898469801514, + "grad_norm": 3.234375, + "learning_rate": 4.9993186187261884e-05, + "loss": 0.9611, + "step": 588 + }, + { + "epoch": 0.010458350505413083, + "grad_norm": 3.203125, + "learning_rate": 4.999312083031957e-05, + "loss": 0.9743, + "step": 590 + }, + { + "epoch": 0.010493802541024652, + "grad_norm": 3.109375, + "learning_rate": 4.999305516146832e-05, + "loss": 0.9143, + "step": 592 + }, + { + "epoch": 0.010529254576636221, + "grad_norm": 3.265625, + "learning_rate": 4.999298918070894e-05, + "loss": 0.9472, + "step": 594 + }, + { + "epoch": 0.010564706612247792, + "grad_norm": 3.140625, + "learning_rate": 4.999292288804227e-05, + "loss": 0.934, + "step": 596 + }, + { + "epoch": 0.010600158647859362, + "grad_norm": 3.109375, + "learning_rate": 4.999285628346912e-05, + "loss": 0.9666, + "step": 598 + }, + { + "epoch": 0.010635610683470931, + "grad_norm": 2.765625, + "learning_rate": 4.999278936699033e-05, + "loss": 0.9855, + "step": 600 + }, + { + "epoch": 0.010671062719082502, + "grad_norm": 3.109375, + "learning_rate": 4.999272213860674e-05, + "loss": 0.9246, + "step": 602 + }, + { + "epoch": 0.010706514754694071, + "grad_norm": 3.578125, + "learning_rate": 4.999265459831917e-05, + "loss": 0.9739, + "step": 604 + }, + { + "epoch": 0.01074196679030564, + "grad_norm": 3.015625, + "learning_rate": 4.999258674612849e-05, + "loss": 0.993, + "step": 606 + }, + { + "epoch": 0.01077741882591721, + "grad_norm": 3.359375, + "learning_rate": 4.999251858203553e-05, + "loss": 0.9682, + "step": 608 + }, + { + "epoch": 0.010812870861528781, + "grad_norm": 3.109375, + "learning_rate": 4.9992450106041135e-05, + "loss": 0.9582, + "step": 610 + }, + { + "epoch": 0.01084832289714035, + "grad_norm": 2.921875, + "learning_rate": 4.999238131814617e-05, + "loss": 0.9464, + "step": 612 + }, + { + "epoch": 0.01088377493275192, + "grad_norm": 3.078125, + "learning_rate": 4.99923122183515e-05, + "loss": 0.908, + "step": 614 + }, + { + "epoch": 0.010919226968363489, + "grad_norm": 3.171875, + "learning_rate": 4.999224280665798e-05, + "loss": 0.9571, + "step": 616 + }, + { + "epoch": 0.01095467900397506, + "grad_norm": 3.015625, + "learning_rate": 4.9992173083066466e-05, + "loss": 0.9992, + "step": 618 + }, + { + "epoch": 0.01099013103958663, + "grad_norm": 3.0, + "learning_rate": 4.999210304757784e-05, + "loss": 0.9529, + "step": 620 + }, + { + "epoch": 0.011025583075198199, + "grad_norm": 3.03125, + "learning_rate": 4.999203270019298e-05, + "loss": 0.9182, + "step": 622 + }, + { + "epoch": 0.01106103511080977, + "grad_norm": 3.140625, + "learning_rate": 4.9991962040912756e-05, + "loss": 0.9034, + "step": 624 + }, + { + "epoch": 0.011096487146421339, + "grad_norm": 3.140625, + "learning_rate": 4.999189106973804e-05, + "loss": 0.9601, + "step": 626 + }, + { + "epoch": 0.011131939182032908, + "grad_norm": 3.59375, + "learning_rate": 4.999181978666974e-05, + "loss": 0.9431, + "step": 628 + }, + { + "epoch": 0.011167391217644478, + "grad_norm": 2.71875, + "learning_rate": 4.999174819170873e-05, + "loss": 0.8976, + "step": 630 + }, + { + "epoch": 0.011202843253256049, + "grad_norm": 3.03125, + "learning_rate": 4.999167628485591e-05, + "loss": 0.9586, + "step": 632 + }, + { + "epoch": 0.011238295288867618, + "grad_norm": 2.78125, + "learning_rate": 4.999160406611218e-05, + "loss": 0.933, + "step": 634 + }, + { + "epoch": 0.011273747324479187, + "grad_norm": 3.21875, + "learning_rate": 4.999153153547842e-05, + "loss": 0.9944, + "step": 636 + }, + { + "epoch": 0.011309199360090757, + "grad_norm": 3.4375, + "learning_rate": 4.999145869295557e-05, + "loss": 0.9273, + "step": 638 + }, + { + "epoch": 0.011344651395702328, + "grad_norm": 3.25, + "learning_rate": 4.999138553854451e-05, + "loss": 0.8649, + "step": 640 + }, + { + "epoch": 0.011380103431313897, + "grad_norm": 3.28125, + "learning_rate": 4.999131207224617e-05, + "loss": 0.9654, + "step": 642 + }, + { + "epoch": 0.011415555466925466, + "grad_norm": 2.890625, + "learning_rate": 4.999123829406146e-05, + "loss": 0.9769, + "step": 644 + }, + { + "epoch": 0.011451007502537035, + "grad_norm": 3.0625, + "learning_rate": 4.9991164203991295e-05, + "loss": 0.9016, + "step": 646 + }, + { + "epoch": 0.011486459538148606, + "grad_norm": 3.53125, + "learning_rate": 4.999108980203662e-05, + "loss": 0.9064, + "step": 648 + }, + { + "epoch": 0.011521911573760176, + "grad_norm": 2.953125, + "learning_rate": 4.999101508819833e-05, + "loss": 0.8923, + "step": 650 + }, + { + "epoch": 0.011557363609371745, + "grad_norm": 3.140625, + "learning_rate": 4.9990940062477386e-05, + "loss": 0.9613, + "step": 652 + }, + { + "epoch": 0.011592815644983316, + "grad_norm": 3.421875, + "learning_rate": 4.999086472487472e-05, + "loss": 1.0074, + "step": 654 + }, + { + "epoch": 0.011628267680594885, + "grad_norm": 3.015625, + "learning_rate": 4.9990789075391264e-05, + "loss": 0.9456, + "step": 656 + }, + { + "epoch": 0.011663719716206455, + "grad_norm": 3.046875, + "learning_rate": 4.9990713114027966e-05, + "loss": 0.9861, + "step": 658 + }, + { + "epoch": 0.011699171751818024, + "grad_norm": 3.25, + "learning_rate": 4.9990636840785775e-05, + "loss": 0.9855, + "step": 660 + }, + { + "epoch": 0.011734623787429595, + "grad_norm": 3.234375, + "learning_rate": 4.999056025566564e-05, + "loss": 0.9234, + "step": 662 + }, + { + "epoch": 0.011770075823041164, + "grad_norm": 3.015625, + "learning_rate": 4.9990483358668514e-05, + "loss": 0.9737, + "step": 664 + }, + { + "epoch": 0.011805527858652734, + "grad_norm": 2.859375, + "learning_rate": 4.999040614979536e-05, + "loss": 0.9562, + "step": 666 + }, + { + "epoch": 0.011840979894264303, + "grad_norm": 3.125, + "learning_rate": 4.999032862904715e-05, + "loss": 0.9459, + "step": 668 + }, + { + "epoch": 0.011876431929875874, + "grad_norm": 3.03125, + "learning_rate": 4.999025079642484e-05, + "loss": 0.9167, + "step": 670 + }, + { + "epoch": 0.011911883965487443, + "grad_norm": 2.9375, + "learning_rate": 4.999017265192941e-05, + "loss": 0.9068, + "step": 672 + }, + { + "epoch": 0.011947336001099013, + "grad_norm": 2.9375, + "learning_rate": 4.999009419556182e-05, + "loss": 0.9414, + "step": 674 + }, + { + "epoch": 0.011982788036710584, + "grad_norm": 3.59375, + "learning_rate": 4.999001542732307e-05, + "loss": 0.9365, + "step": 676 + }, + { + "epoch": 0.012018240072322153, + "grad_norm": 3.359375, + "learning_rate": 4.9989936347214125e-05, + "loss": 0.9686, + "step": 678 + }, + { + "epoch": 0.012053692107933722, + "grad_norm": 3.65625, + "learning_rate": 4.9989856955235985e-05, + "loss": 0.9942, + "step": 680 + }, + { + "epoch": 0.012089144143545292, + "grad_norm": 3.453125, + "learning_rate": 4.998977725138964e-05, + "loss": 0.9682, + "step": 682 + }, + { + "epoch": 0.012124596179156863, + "grad_norm": 3.390625, + "learning_rate": 4.998969723567607e-05, + "loss": 0.9719, + "step": 684 + }, + { + "epoch": 0.012160048214768432, + "grad_norm": 2.96875, + "learning_rate": 4.998961690809628e-05, + "loss": 0.901, + "step": 686 + }, + { + "epoch": 0.012195500250380001, + "grad_norm": 2.984375, + "learning_rate": 4.998953626865128e-05, + "loss": 0.9027, + "step": 688 + }, + { + "epoch": 0.01223095228599157, + "grad_norm": 3.09375, + "learning_rate": 4.9989455317342076e-05, + "loss": 0.943, + "step": 690 + }, + { + "epoch": 0.012266404321603142, + "grad_norm": 3.59375, + "learning_rate": 4.9989374054169676e-05, + "loss": 0.9789, + "step": 692 + }, + { + "epoch": 0.01230185635721471, + "grad_norm": 3.125, + "learning_rate": 4.998929247913509e-05, + "loss": 0.9397, + "step": 694 + }, + { + "epoch": 0.01233730839282628, + "grad_norm": 2.84375, + "learning_rate": 4.998921059223933e-05, + "loss": 0.9447, + "step": 696 + }, + { + "epoch": 0.012372760428437851, + "grad_norm": 3.390625, + "learning_rate": 4.9989128393483445e-05, + "loss": 0.946, + "step": 698 + }, + { + "epoch": 0.01240821246404942, + "grad_norm": 2.734375, + "learning_rate": 4.9989045882868426e-05, + "loss": 0.9301, + "step": 700 + }, + { + "epoch": 0.01244366449966099, + "grad_norm": 3.0625, + "learning_rate": 4.998896306039533e-05, + "loss": 0.9113, + "step": 702 + }, + { + "epoch": 0.012479116535272559, + "grad_norm": 2.734375, + "learning_rate": 4.9988879926065174e-05, + "loss": 0.9572, + "step": 704 + }, + { + "epoch": 0.01251456857088413, + "grad_norm": 3.453125, + "learning_rate": 4.998879647987901e-05, + "loss": 0.9825, + "step": 706 + }, + { + "epoch": 0.0125500206064957, + "grad_norm": 3.265625, + "learning_rate": 4.998871272183786e-05, + "loss": 0.9766, + "step": 708 + }, + { + "epoch": 0.012585472642107269, + "grad_norm": 2.875, + "learning_rate": 4.998862865194278e-05, + "loss": 0.9208, + "step": 710 + }, + { + "epoch": 0.012620924677718838, + "grad_norm": 2.96875, + "learning_rate": 4.998854427019483e-05, + "loss": 0.9375, + "step": 712 + }, + { + "epoch": 0.012656376713330409, + "grad_norm": 3.140625, + "learning_rate": 4.998845957659504e-05, + "loss": 0.9579, + "step": 714 + }, + { + "epoch": 0.012691828748941978, + "grad_norm": 3.328125, + "learning_rate": 4.9988374571144484e-05, + "loss": 0.9579, + "step": 716 + }, + { + "epoch": 0.012727280784553548, + "grad_norm": 3.078125, + "learning_rate": 4.9988289253844214e-05, + "loss": 0.8884, + "step": 718 + }, + { + "epoch": 0.012762732820165119, + "grad_norm": 3.0, + "learning_rate": 4.998820362469531e-05, + "loss": 0.9132, + "step": 720 + }, + { + "epoch": 0.012798184855776688, + "grad_norm": 3.109375, + "learning_rate": 4.998811768369882e-05, + "loss": 0.958, + "step": 722 + }, + { + "epoch": 0.012833636891388257, + "grad_norm": 3.0625, + "learning_rate": 4.998803143085583e-05, + "loss": 0.9573, + "step": 724 + }, + { + "epoch": 0.012869088926999827, + "grad_norm": 3.328125, + "learning_rate": 4.9987944866167405e-05, + "loss": 0.9226, + "step": 726 + }, + { + "epoch": 0.012904540962611398, + "grad_norm": 3.328125, + "learning_rate": 4.998785798963464e-05, + "loss": 0.9214, + "step": 728 + }, + { + "epoch": 0.012939992998222967, + "grad_norm": 3.328125, + "learning_rate": 4.9987770801258617e-05, + "loss": 0.9307, + "step": 730 + }, + { + "epoch": 0.012975445033834536, + "grad_norm": 3.078125, + "learning_rate": 4.998768330104041e-05, + "loss": 0.9398, + "step": 732 + }, + { + "epoch": 0.013010897069446106, + "grad_norm": 3.03125, + "learning_rate": 4.998759548898112e-05, + "loss": 0.9196, + "step": 734 + }, + { + "epoch": 0.013046349105057677, + "grad_norm": 2.921875, + "learning_rate": 4.998750736508184e-05, + "loss": 0.9937, + "step": 736 + }, + { + "epoch": 0.013081801140669246, + "grad_norm": 3.265625, + "learning_rate": 4.998741892934368e-05, + "loss": 0.963, + "step": 738 + }, + { + "epoch": 0.013117253176280815, + "grad_norm": 3.6875, + "learning_rate": 4.998733018176774e-05, + "loss": 0.9119, + "step": 740 + }, + { + "epoch": 0.013152705211892384, + "grad_norm": 3.234375, + "learning_rate": 4.9987241122355116e-05, + "loss": 0.861, + "step": 742 + }, + { + "epoch": 0.013188157247503956, + "grad_norm": 3.4375, + "learning_rate": 4.9987151751106934e-05, + "loss": 0.9322, + "step": 744 + }, + { + "epoch": 0.013223609283115525, + "grad_norm": 3.8125, + "learning_rate": 4.998706206802429e-05, + "loss": 0.9604, + "step": 746 + }, + { + "epoch": 0.013259061318727094, + "grad_norm": 3.125, + "learning_rate": 4.9986972073108326e-05, + "loss": 0.9062, + "step": 748 + }, + { + "epoch": 0.013294513354338665, + "grad_norm": 3.1875, + "learning_rate": 4.998688176636015e-05, + "loss": 0.9155, + "step": 750 + }, + { + "epoch": 0.013329965389950234, + "grad_norm": 3.375, + "learning_rate": 4.99867911477809e-05, + "loss": 0.897, + "step": 752 + }, + { + "epoch": 0.013365417425561804, + "grad_norm": 2.953125, + "learning_rate": 4.9986700217371694e-05, + "loss": 0.9338, + "step": 754 + }, + { + "epoch": 0.013400869461173373, + "grad_norm": 3.171875, + "learning_rate": 4.998660897513367e-05, + "loss": 0.9416, + "step": 756 + }, + { + "epoch": 0.013436321496784944, + "grad_norm": 3.125, + "learning_rate": 4.9986517421067986e-05, + "loss": 0.9024, + "step": 758 + }, + { + "epoch": 0.013471773532396513, + "grad_norm": 3.09375, + "learning_rate": 4.9986425555175766e-05, + "loss": 0.9484, + "step": 760 + }, + { + "epoch": 0.013507225568008083, + "grad_norm": 2.953125, + "learning_rate": 4.998633337745815e-05, + "loss": 0.8972, + "step": 762 + }, + { + "epoch": 0.013542677603619652, + "grad_norm": 3.390625, + "learning_rate": 4.99862408879163e-05, + "loss": 0.9608, + "step": 764 + }, + { + "epoch": 0.013578129639231223, + "grad_norm": 3.375, + "learning_rate": 4.998614808655137e-05, + "loss": 0.9107, + "step": 766 + }, + { + "epoch": 0.013613581674842792, + "grad_norm": 3.109375, + "learning_rate": 4.9986054973364516e-05, + "loss": 0.9088, + "step": 768 + }, + { + "epoch": 0.013649033710454362, + "grad_norm": 3.234375, + "learning_rate": 4.99859615483569e-05, + "loss": 0.9374, + "step": 770 + }, + { + "epoch": 0.013684485746065933, + "grad_norm": 3.125, + "learning_rate": 4.9985867811529685e-05, + "loss": 0.9202, + "step": 772 + }, + { + "epoch": 0.013719937781677502, + "grad_norm": 2.71875, + "learning_rate": 4.998577376288405e-05, + "loss": 0.9546, + "step": 774 + }, + { + "epoch": 0.013755389817289071, + "grad_norm": 2.953125, + "learning_rate": 4.998567940242116e-05, + "loss": 0.9569, + "step": 776 + }, + { + "epoch": 0.01379084185290064, + "grad_norm": 3.0, + "learning_rate": 4.9985584730142185e-05, + "loss": 0.9551, + "step": 778 + }, + { + "epoch": 0.013826293888512212, + "grad_norm": 2.78125, + "learning_rate": 4.998548974604833e-05, + "loss": 0.9098, + "step": 780 + }, + { + "epoch": 0.013861745924123781, + "grad_norm": 3.15625, + "learning_rate": 4.998539445014077e-05, + "loss": 0.9948, + "step": 782 + }, + { + "epoch": 0.01389719795973535, + "grad_norm": 3.125, + "learning_rate": 4.998529884242068e-05, + "loss": 0.9789, + "step": 784 + }, + { + "epoch": 0.01393264999534692, + "grad_norm": 3.390625, + "learning_rate": 4.998520292288927e-05, + "loss": 0.9182, + "step": 786 + }, + { + "epoch": 0.01396810203095849, + "grad_norm": 2.921875, + "learning_rate": 4.998510669154773e-05, + "loss": 0.9248, + "step": 788 + }, + { + "epoch": 0.01400355406657006, + "grad_norm": 3.125, + "learning_rate": 4.998501014839726e-05, + "loss": 0.9428, + "step": 790 + }, + { + "epoch": 0.01403900610218163, + "grad_norm": 2.953125, + "learning_rate": 4.998491329343907e-05, + "loss": 0.9488, + "step": 792 + }, + { + "epoch": 0.0140744581377932, + "grad_norm": 3.234375, + "learning_rate": 4.998481612667437e-05, + "loss": 0.9789, + "step": 794 + }, + { + "epoch": 0.01410991017340477, + "grad_norm": 3.28125, + "learning_rate": 4.9984718648104365e-05, + "loss": 0.9332, + "step": 796 + }, + { + "epoch": 0.014145362209016339, + "grad_norm": 3.203125, + "learning_rate": 4.998462085773027e-05, + "loss": 0.9086, + "step": 798 + }, + { + "epoch": 0.014180814244627908, + "grad_norm": 3.375, + "learning_rate": 4.998452275555332e-05, + "loss": 0.9826, + "step": 800 + }, + { + "epoch": 0.01421626628023948, + "grad_norm": 3.09375, + "learning_rate": 4.9984424341574724e-05, + "loss": 0.903, + "step": 802 + }, + { + "epoch": 0.014251718315851048, + "grad_norm": 3.984375, + "learning_rate": 4.998432561579572e-05, + "loss": 0.9578, + "step": 804 + }, + { + "epoch": 0.014287170351462618, + "grad_norm": 3.171875, + "learning_rate": 4.998422657821753e-05, + "loss": 0.9535, + "step": 806 + }, + { + "epoch": 0.014322622387074187, + "grad_norm": 3.375, + "learning_rate": 4.99841272288414e-05, + "loss": 0.945, + "step": 808 + }, + { + "epoch": 0.014358074422685758, + "grad_norm": 2.890625, + "learning_rate": 4.998402756766857e-05, + "loss": 0.9676, + "step": 810 + }, + { + "epoch": 0.014393526458297327, + "grad_norm": 2.859375, + "learning_rate": 4.9983927594700275e-05, + "loss": 0.874, + "step": 812 + }, + { + "epoch": 0.014428978493908897, + "grad_norm": 3.125, + "learning_rate": 4.998382730993777e-05, + "loss": 0.935, + "step": 814 + }, + { + "epoch": 0.014464430529520468, + "grad_norm": 2.71875, + "learning_rate": 4.99837267133823e-05, + "loss": 0.9059, + "step": 816 + }, + { + "epoch": 0.014499882565132037, + "grad_norm": 3.0625, + "learning_rate": 4.998362580503513e-05, + "loss": 0.9293, + "step": 818 + }, + { + "epoch": 0.014535334600743606, + "grad_norm": 3.3125, + "learning_rate": 4.998352458489751e-05, + "loss": 1.0035, + "step": 820 + }, + { + "epoch": 0.014570786636355176, + "grad_norm": 3.4375, + "learning_rate": 4.99834230529707e-05, + "loss": 0.9548, + "step": 822 + }, + { + "epoch": 0.014606238671966747, + "grad_norm": 3.171875, + "learning_rate": 4.998332120925598e-05, + "loss": 0.9474, + "step": 824 + }, + { + "epoch": 0.014641690707578316, + "grad_norm": 2.984375, + "learning_rate": 4.9983219053754627e-05, + "loss": 0.9112, + "step": 826 + }, + { + "epoch": 0.014677142743189885, + "grad_norm": 3.109375, + "learning_rate": 4.9983116586467896e-05, + "loss": 0.9462, + "step": 828 + }, + { + "epoch": 0.014712594778801455, + "grad_norm": 3.09375, + "learning_rate": 4.998301380739706e-05, + "loss": 0.9302, + "step": 830 + }, + { + "epoch": 0.014748046814413026, + "grad_norm": 2.734375, + "learning_rate": 4.998291071654343e-05, + "loss": 0.9114, + "step": 832 + }, + { + "epoch": 0.014783498850024595, + "grad_norm": 2.765625, + "learning_rate": 4.9982807313908273e-05, + "loss": 0.9189, + "step": 834 + }, + { + "epoch": 0.014818950885636164, + "grad_norm": 3.0625, + "learning_rate": 4.998270359949289e-05, + "loss": 0.9278, + "step": 836 + }, + { + "epoch": 0.014854402921247734, + "grad_norm": 3.28125, + "learning_rate": 4.998259957329856e-05, + "loss": 0.9242, + "step": 838 + }, + { + "epoch": 0.014889854956859305, + "grad_norm": 2.9375, + "learning_rate": 4.99824952353266e-05, + "loss": 0.9375, + "step": 840 + }, + { + "epoch": 0.014925306992470874, + "grad_norm": 3.078125, + "learning_rate": 4.9982390585578295e-05, + "loss": 0.9858, + "step": 842 + }, + { + "epoch": 0.014960759028082443, + "grad_norm": 3.34375, + "learning_rate": 4.9982285624054956e-05, + "loss": 0.9225, + "step": 844 + }, + { + "epoch": 0.014996211063694014, + "grad_norm": 2.8125, + "learning_rate": 4.9982180350757905e-05, + "loss": 0.9209, + "step": 846 + }, + { + "epoch": 0.015031663099305584, + "grad_norm": 3.453125, + "learning_rate": 4.998207476568845e-05, + "loss": 0.9432, + "step": 848 + }, + { + "epoch": 0.015067115134917153, + "grad_norm": 3.28125, + "learning_rate": 4.99819688688479e-05, + "loss": 0.9355, + "step": 850 + }, + { + "epoch": 0.015102567170528722, + "grad_norm": 2.96875, + "learning_rate": 4.998186266023758e-05, + "loss": 0.9108, + "step": 852 + }, + { + "epoch": 0.015138019206140293, + "grad_norm": 3.171875, + "learning_rate": 4.998175613985882e-05, + "loss": 0.9086, + "step": 854 + }, + { + "epoch": 0.015173471241751862, + "grad_norm": 2.90625, + "learning_rate": 4.998164930771294e-05, + "loss": 0.8956, + "step": 856 + }, + { + "epoch": 0.015208923277363432, + "grad_norm": 3.125, + "learning_rate": 4.998154216380129e-05, + "loss": 0.9252, + "step": 858 + }, + { + "epoch": 0.015244375312975001, + "grad_norm": 3.28125, + "learning_rate": 4.9981434708125184e-05, + "loss": 0.9134, + "step": 860 + }, + { + "epoch": 0.015279827348586572, + "grad_norm": 3.015625, + "learning_rate": 4.9981326940685985e-05, + "loss": 0.9403, + "step": 862 + }, + { + "epoch": 0.015315279384198141, + "grad_norm": 2.9375, + "learning_rate": 4.998121886148503e-05, + "loss": 0.8918, + "step": 864 + }, + { + "epoch": 0.01535073141980971, + "grad_norm": 3.3125, + "learning_rate": 4.998111047052366e-05, + "loss": 0.9087, + "step": 866 + }, + { + "epoch": 0.015386183455421282, + "grad_norm": 3.625, + "learning_rate": 4.9981001767803245e-05, + "loss": 0.9514, + "step": 868 + }, + { + "epoch": 0.015421635491032851, + "grad_norm": 3.296875, + "learning_rate": 4.998089275332513e-05, + "loss": 0.9317, + "step": 870 + }, + { + "epoch": 0.01545708752664442, + "grad_norm": 3.40625, + "learning_rate": 4.998078342709067e-05, + "loss": 0.874, + "step": 872 + }, + { + "epoch": 0.01549253956225599, + "grad_norm": 3.1875, + "learning_rate": 4.9980673789101234e-05, + "loss": 0.9233, + "step": 874 + }, + { + "epoch": 0.01552799159786756, + "grad_norm": 3.046875, + "learning_rate": 4.99805638393582e-05, + "loss": 0.9601, + "step": 876 + }, + { + "epoch": 0.01556344363347913, + "grad_norm": 3.078125, + "learning_rate": 4.998045357786293e-05, + "loss": 0.9901, + "step": 878 + }, + { + "epoch": 0.0155988956690907, + "grad_norm": 2.703125, + "learning_rate": 4.9980343004616795e-05, + "loss": 0.9001, + "step": 880 + }, + { + "epoch": 0.01563434770470227, + "grad_norm": 3.15625, + "learning_rate": 4.998023211962119e-05, + "loss": 0.9135, + "step": 882 + }, + { + "epoch": 0.01566979974031384, + "grad_norm": 2.59375, + "learning_rate": 4.998012092287749e-05, + "loss": 0.9025, + "step": 884 + }, + { + "epoch": 0.015705251775925407, + "grad_norm": 2.84375, + "learning_rate": 4.998000941438709e-05, + "loss": 0.9301, + "step": 886 + }, + { + "epoch": 0.015740703811536978, + "grad_norm": 3.015625, + "learning_rate": 4.997989759415136e-05, + "loss": 0.9281, + "step": 888 + }, + { + "epoch": 0.01577615584714855, + "grad_norm": 2.921875, + "learning_rate": 4.997978546217172e-05, + "loss": 0.8869, + "step": 890 + }, + { + "epoch": 0.015811607882760117, + "grad_norm": 3.375, + "learning_rate": 4.9979673018449555e-05, + "loss": 0.9427, + "step": 892 + }, + { + "epoch": 0.015847059918371688, + "grad_norm": 2.984375, + "learning_rate": 4.9979560262986284e-05, + "loss": 0.9285, + "step": 894 + }, + { + "epoch": 0.01588251195398326, + "grad_norm": 3.265625, + "learning_rate": 4.9979447195783304e-05, + "loss": 0.9488, + "step": 896 + }, + { + "epoch": 0.015917963989594826, + "grad_norm": 3.171875, + "learning_rate": 4.997933381684202e-05, + "loss": 0.9196, + "step": 898 + }, + { + "epoch": 0.015953416025206398, + "grad_norm": 3.140625, + "learning_rate": 4.997922012616385e-05, + "loss": 0.9429, + "step": 900 + }, + { + "epoch": 0.01598886806081797, + "grad_norm": 3.0625, + "learning_rate": 4.997910612375022e-05, + "loss": 0.9284, + "step": 902 + }, + { + "epoch": 0.016024320096429536, + "grad_norm": 3.09375, + "learning_rate": 4.997899180960255e-05, + "loss": 0.9539, + "step": 904 + }, + { + "epoch": 0.016059772132041107, + "grad_norm": 3.359375, + "learning_rate": 4.997887718372226e-05, + "loss": 0.9394, + "step": 906 + }, + { + "epoch": 0.016095224167652675, + "grad_norm": 3.765625, + "learning_rate": 4.997876224611079e-05, + "loss": 0.9208, + "step": 908 + }, + { + "epoch": 0.016130676203264246, + "grad_norm": 3.5, + "learning_rate": 4.9978646996769563e-05, + "loss": 0.9476, + "step": 910 + }, + { + "epoch": 0.016166128238875817, + "grad_norm": 2.890625, + "learning_rate": 4.997853143570003e-05, + "loss": 0.9158, + "step": 912 + }, + { + "epoch": 0.016201580274487384, + "grad_norm": 3.046875, + "learning_rate": 4.997841556290362e-05, + "loss": 0.9036, + "step": 914 + }, + { + "epoch": 0.016237032310098955, + "grad_norm": 2.984375, + "learning_rate": 4.99782993783818e-05, + "loss": 0.8826, + "step": 916 + }, + { + "epoch": 0.016272484345710526, + "grad_norm": 2.765625, + "learning_rate": 4.997818288213599e-05, + "loss": 0.906, + "step": 918 + }, + { + "epoch": 0.016307936381322094, + "grad_norm": 3.109375, + "learning_rate": 4.997806607416767e-05, + "loss": 0.9132, + "step": 920 + }, + { + "epoch": 0.016343388416933665, + "grad_norm": 2.84375, + "learning_rate": 4.997794895447829e-05, + "loss": 0.9249, + "step": 922 + }, + { + "epoch": 0.016378840452545236, + "grad_norm": 3.46875, + "learning_rate": 4.9977831523069305e-05, + "loss": 0.942, + "step": 924 + }, + { + "epoch": 0.016414292488156804, + "grad_norm": 3.25, + "learning_rate": 4.9977713779942195e-05, + "loss": 0.9376, + "step": 926 + }, + { + "epoch": 0.016449744523768375, + "grad_norm": 3.15625, + "learning_rate": 4.99775957250984e-05, + "loss": 0.8812, + "step": 928 + }, + { + "epoch": 0.016485196559379942, + "grad_norm": 3.234375, + "learning_rate": 4.997747735853943e-05, + "loss": 0.9415, + "step": 930 + }, + { + "epoch": 0.016520648594991513, + "grad_norm": 3.3125, + "learning_rate": 4.997735868026674e-05, + "loss": 0.9229, + "step": 932 + }, + { + "epoch": 0.016556100630603084, + "grad_norm": 2.984375, + "learning_rate": 4.9977239690281816e-05, + "loss": 0.9193, + "step": 934 + }, + { + "epoch": 0.016591552666214652, + "grad_norm": 2.859375, + "learning_rate": 4.9977120388586144e-05, + "loss": 0.9391, + "step": 936 + }, + { + "epoch": 0.016627004701826223, + "grad_norm": 3.1875, + "learning_rate": 4.99770007751812e-05, + "loss": 0.8891, + "step": 938 + }, + { + "epoch": 0.016662456737437794, + "grad_norm": 2.828125, + "learning_rate": 4.99768808500685e-05, + "loss": 0.9539, + "step": 940 + }, + { + "epoch": 0.01669790877304936, + "grad_norm": 3.140625, + "learning_rate": 4.997676061324953e-05, + "loss": 0.9205, + "step": 942 + }, + { + "epoch": 0.016733360808660933, + "grad_norm": 3.25, + "learning_rate": 4.997664006472579e-05, + "loss": 0.9477, + "step": 944 + }, + { + "epoch": 0.016768812844272504, + "grad_norm": 2.96875, + "learning_rate": 4.997651920449878e-05, + "loss": 0.9004, + "step": 946 + }, + { + "epoch": 0.01680426487988407, + "grad_norm": 2.96875, + "learning_rate": 4.9976398032570015e-05, + "loss": 0.8733, + "step": 948 + }, + { + "epoch": 0.016839716915495642, + "grad_norm": 3.015625, + "learning_rate": 4.9976276548941005e-05, + "loss": 0.9192, + "step": 950 + }, + { + "epoch": 0.01687516895110721, + "grad_norm": 2.96875, + "learning_rate": 4.997615475361327e-05, + "loss": 0.9484, + "step": 952 + }, + { + "epoch": 0.01691062098671878, + "grad_norm": 3.078125, + "learning_rate": 4.997603264658832e-05, + "loss": 0.8731, + "step": 954 + }, + { + "epoch": 0.016946073022330352, + "grad_norm": 3.65625, + "learning_rate": 4.997591022786768e-05, + "loss": 0.9159, + "step": 956 + }, + { + "epoch": 0.01698152505794192, + "grad_norm": 2.953125, + "learning_rate": 4.997578749745288e-05, + "loss": 0.9644, + "step": 958 + }, + { + "epoch": 0.01701697709355349, + "grad_norm": 3.0, + "learning_rate": 4.997566445534547e-05, + "loss": 0.9076, + "step": 960 + }, + { + "epoch": 0.01705242912916506, + "grad_norm": 2.6875, + "learning_rate": 4.997554110154696e-05, + "loss": 0.9178, + "step": 962 + }, + { + "epoch": 0.01708788116477663, + "grad_norm": 3.34375, + "learning_rate": 4.99754174360589e-05, + "loss": 0.935, + "step": 964 + }, + { + "epoch": 0.0171233332003882, + "grad_norm": 2.65625, + "learning_rate": 4.997529345888284e-05, + "loss": 0.9035, + "step": 966 + }, + { + "epoch": 0.01715878523599977, + "grad_norm": 3.046875, + "learning_rate": 4.9975169170020306e-05, + "loss": 0.8921, + "step": 968 + }, + { + "epoch": 0.01719423727161134, + "grad_norm": 3.3125, + "learning_rate": 4.997504456947287e-05, + "loss": 0.8597, + "step": 970 + }, + { + "epoch": 0.01722968930722291, + "grad_norm": 3.703125, + "learning_rate": 4.997491965724208e-05, + "loss": 0.9268, + "step": 972 + }, + { + "epoch": 0.017265141342834477, + "grad_norm": 3.21875, + "learning_rate": 4.99747944333295e-05, + "loss": 0.9444, + "step": 974 + }, + { + "epoch": 0.01730059337844605, + "grad_norm": 3.40625, + "learning_rate": 4.997466889773668e-05, + "loss": 0.9268, + "step": 976 + }, + { + "epoch": 0.01733604541405762, + "grad_norm": 2.96875, + "learning_rate": 4.997454305046519e-05, + "loss": 0.9425, + "step": 978 + }, + { + "epoch": 0.017371497449669187, + "grad_norm": 3.0, + "learning_rate": 4.9974416891516615e-05, + "loss": 0.9128, + "step": 980 + }, + { + "epoch": 0.017406949485280758, + "grad_norm": 3.140625, + "learning_rate": 4.9974290420892514e-05, + "loss": 0.9078, + "step": 982 + }, + { + "epoch": 0.01744240152089233, + "grad_norm": 3.203125, + "learning_rate": 4.9974163638594475e-05, + "loss": 0.9496, + "step": 984 + }, + { + "epoch": 0.017477853556503897, + "grad_norm": 2.9375, + "learning_rate": 4.9974036544624063e-05, + "loss": 0.9128, + "step": 986 + }, + { + "epoch": 0.017513305592115468, + "grad_norm": 3.046875, + "learning_rate": 4.997390913898289e-05, + "loss": 0.8829, + "step": 988 + }, + { + "epoch": 0.01754875762772704, + "grad_norm": 2.71875, + "learning_rate": 4.997378142167253e-05, + "loss": 0.8951, + "step": 990 + }, + { + "epoch": 0.017584209663338606, + "grad_norm": 2.875, + "learning_rate": 4.997365339269457e-05, + "loss": 0.9845, + "step": 992 + }, + { + "epoch": 0.017619661698950177, + "grad_norm": 3.34375, + "learning_rate": 4.997352505205062e-05, + "loss": 0.8814, + "step": 994 + }, + { + "epoch": 0.017655113734561745, + "grad_norm": 3.546875, + "learning_rate": 4.997339639974229e-05, + "loss": 0.9382, + "step": 996 + }, + { + "epoch": 0.017690565770173316, + "grad_norm": 2.609375, + "learning_rate": 4.9973267435771165e-05, + "loss": 0.9112, + "step": 998 + }, + { + "epoch": 0.017726017805784887, + "grad_norm": 3.03125, + "learning_rate": 4.9973138160138865e-05, + "loss": 0.886, + "step": 1000 + }, + { + "epoch": 0.017761469841396454, + "grad_norm": 2.875, + "learning_rate": 4.9973008572847e-05, + "loss": 0.9003, + "step": 1002 + }, + { + "epoch": 0.017796921877008025, + "grad_norm": 2.921875, + "learning_rate": 4.9972878673897194e-05, + "loss": 0.9066, + "step": 1004 + }, + { + "epoch": 0.017832373912619597, + "grad_norm": 2.765625, + "learning_rate": 4.997274846329106e-05, + "loss": 0.8871, + "step": 1006 + }, + { + "epoch": 0.017867825948231164, + "grad_norm": 2.6875, + "learning_rate": 4.9972617941030236e-05, + "loss": 0.8963, + "step": 1008 + }, + { + "epoch": 0.017903277983842735, + "grad_norm": 3.15625, + "learning_rate": 4.9972487107116336e-05, + "loss": 0.8911, + "step": 1010 + }, + { + "epoch": 0.017938730019454306, + "grad_norm": 3.265625, + "learning_rate": 4.9972355961550995e-05, + "loss": 0.9564, + "step": 1012 + }, + { + "epoch": 0.017974182055065874, + "grad_norm": 3.40625, + "learning_rate": 4.997222450433585e-05, + "loss": 0.9578, + "step": 1014 + }, + { + "epoch": 0.018009634090677445, + "grad_norm": 3.046875, + "learning_rate": 4.9972092735472556e-05, + "loss": 0.9335, + "step": 1016 + }, + { + "epoch": 0.018045086126289012, + "grad_norm": 3.34375, + "learning_rate": 4.997196065496274e-05, + "loss": 0.9272, + "step": 1018 + }, + { + "epoch": 0.018080538161900583, + "grad_norm": 2.921875, + "learning_rate": 4.997182826280805e-05, + "loss": 0.9273, + "step": 1020 + }, + { + "epoch": 0.018115990197512154, + "grad_norm": 2.796875, + "learning_rate": 4.9971695559010155e-05, + "loss": 0.8782, + "step": 1022 + }, + { + "epoch": 0.018151442233123722, + "grad_norm": 3.03125, + "learning_rate": 4.997156254357069e-05, + "loss": 0.935, + "step": 1024 + }, + { + "epoch": 0.018186894268735293, + "grad_norm": 3.671875, + "learning_rate": 4.997142921649134e-05, + "loss": 0.9054, + "step": 1026 + }, + { + "epoch": 0.018222346304346864, + "grad_norm": 3.25, + "learning_rate": 4.997129557777375e-05, + "loss": 0.911, + "step": 1028 + }, + { + "epoch": 0.01825779833995843, + "grad_norm": 2.71875, + "learning_rate": 4.9971161627419585e-05, + "loss": 0.9384, + "step": 1030 + }, + { + "epoch": 0.018293250375570003, + "grad_norm": 3.390625, + "learning_rate": 4.9971027365430526e-05, + "loss": 0.9084, + "step": 1032 + }, + { + "epoch": 0.018328702411181574, + "grad_norm": 2.921875, + "learning_rate": 4.997089279180825e-05, + "loss": 0.8903, + "step": 1034 + }, + { + "epoch": 0.01836415444679314, + "grad_norm": 2.8125, + "learning_rate": 4.997075790655443e-05, + "loss": 0.9273, + "step": 1036 + }, + { + "epoch": 0.018399606482404712, + "grad_norm": 2.65625, + "learning_rate": 4.997062270967075e-05, + "loss": 0.9002, + "step": 1038 + }, + { + "epoch": 0.01843505851801628, + "grad_norm": 3.21875, + "learning_rate": 4.9970487201158903e-05, + "loss": 0.887, + "step": 1040 + }, + { + "epoch": 0.01847051055362785, + "grad_norm": 3.265625, + "learning_rate": 4.997035138102058e-05, + "loss": 0.9246, + "step": 1042 + }, + { + "epoch": 0.018505962589239422, + "grad_norm": 2.90625, + "learning_rate": 4.997021524925747e-05, + "loss": 0.9138, + "step": 1044 + }, + { + "epoch": 0.01854141462485099, + "grad_norm": 3.140625, + "learning_rate": 4.997007880587127e-05, + "loss": 0.9084, + "step": 1046 + }, + { + "epoch": 0.01857686666046256, + "grad_norm": 3.28125, + "learning_rate": 4.99699420508637e-05, + "loss": 0.9483, + "step": 1048 + }, + { + "epoch": 0.01861231869607413, + "grad_norm": 3.09375, + "learning_rate": 4.996980498423644e-05, + "loss": 0.9521, + "step": 1050 + }, + { + "epoch": 0.0186477707316857, + "grad_norm": 3.109375, + "learning_rate": 4.996966760599122e-05, + "loss": 0.9273, + "step": 1052 + }, + { + "epoch": 0.01868322276729727, + "grad_norm": 2.875, + "learning_rate": 4.996952991612975e-05, + "loss": 0.8884, + "step": 1054 + }, + { + "epoch": 0.01871867480290884, + "grad_norm": 2.640625, + "learning_rate": 4.996939191465375e-05, + "loss": 0.9442, + "step": 1056 + }, + { + "epoch": 0.01875412683852041, + "grad_norm": 2.890625, + "learning_rate": 4.9969253601564935e-05, + "loss": 0.9197, + "step": 1058 + }, + { + "epoch": 0.01878957887413198, + "grad_norm": 3.265625, + "learning_rate": 4.996911497686503e-05, + "loss": 0.9621, + "step": 1060 + }, + { + "epoch": 0.018825030909743547, + "grad_norm": 2.953125, + "learning_rate": 4.9968976040555785e-05, + "loss": 0.9149, + "step": 1062 + }, + { + "epoch": 0.01886048294535512, + "grad_norm": 3.96875, + "learning_rate": 4.9968836792638904e-05, + "loss": 0.9447, + "step": 1064 + }, + { + "epoch": 0.01889593498096669, + "grad_norm": 3.046875, + "learning_rate": 4.996869723311616e-05, + "loss": 0.9461, + "step": 1066 + }, + { + "epoch": 0.018931387016578257, + "grad_norm": 2.671875, + "learning_rate": 4.996855736198926e-05, + "loss": 0.9095, + "step": 1068 + }, + { + "epoch": 0.018966839052189828, + "grad_norm": 3.21875, + "learning_rate": 4.996841717925997e-05, + "loss": 0.9161, + "step": 1070 + }, + { + "epoch": 0.0190022910878014, + "grad_norm": 3.03125, + "learning_rate": 4.996827668493003e-05, + "loss": 0.9484, + "step": 1072 + }, + { + "epoch": 0.019037743123412967, + "grad_norm": 3.421875, + "learning_rate": 4.99681358790012e-05, + "loss": 0.9372, + "step": 1074 + }, + { + "epoch": 0.019073195159024538, + "grad_norm": 2.65625, + "learning_rate": 4.996799476147524e-05, + "loss": 0.9262, + "step": 1076 + }, + { + "epoch": 0.019108647194636105, + "grad_norm": 3.234375, + "learning_rate": 4.99678533323539e-05, + "loss": 0.9125, + "step": 1078 + }, + { + "epoch": 0.019144099230247676, + "grad_norm": 2.828125, + "learning_rate": 4.996771159163895e-05, + "loss": 0.8768, + "step": 1080 + }, + { + "epoch": 0.019179551265859247, + "grad_norm": 3.046875, + "learning_rate": 4.996756953933216e-05, + "loss": 0.9753, + "step": 1082 + }, + { + "epoch": 0.019215003301470815, + "grad_norm": 2.953125, + "learning_rate": 4.996742717543531e-05, + "loss": 0.8453, + "step": 1084 + }, + { + "epoch": 0.019250455337082386, + "grad_norm": 3.140625, + "learning_rate": 4.996728449995016e-05, + "loss": 0.96, + "step": 1086 + }, + { + "epoch": 0.019285907372693957, + "grad_norm": 3.34375, + "learning_rate": 4.99671415128785e-05, + "loss": 0.9682, + "step": 1088 + }, + { + "epoch": 0.019321359408305525, + "grad_norm": 3.640625, + "learning_rate": 4.996699821422212e-05, + "loss": 0.922, + "step": 1090 + }, + { + "epoch": 0.019356811443917096, + "grad_norm": 3.125, + "learning_rate": 4.99668546039828e-05, + "loss": 0.8806, + "step": 1092 + }, + { + "epoch": 0.019392263479528667, + "grad_norm": 3.046875, + "learning_rate": 4.996671068216233e-05, + "loss": 0.8965, + "step": 1094 + }, + { + "epoch": 0.019427715515140234, + "grad_norm": 2.890625, + "learning_rate": 4.996656644876252e-05, + "loss": 0.9415, + "step": 1096 + }, + { + "epoch": 0.019463167550751805, + "grad_norm": 2.9375, + "learning_rate": 4.996642190378515e-05, + "loss": 0.9455, + "step": 1098 + }, + { + "epoch": 0.019498619586363373, + "grad_norm": 3.375, + "learning_rate": 4.996627704723205e-05, + "loss": 0.9358, + "step": 1100 + }, + { + "epoch": 0.019534071621974944, + "grad_norm": 3.359375, + "learning_rate": 4.9966131879105003e-05, + "loss": 0.9367, + "step": 1102 + }, + { + "epoch": 0.019569523657586515, + "grad_norm": 3.0, + "learning_rate": 4.996598639940583e-05, + "loss": 0.9078, + "step": 1104 + }, + { + "epoch": 0.019604975693198082, + "grad_norm": 3.4375, + "learning_rate": 4.996584060813635e-05, + "loss": 0.9562, + "step": 1106 + }, + { + "epoch": 0.019640427728809653, + "grad_norm": 3.421875, + "learning_rate": 4.996569450529838e-05, + "loss": 0.9156, + "step": 1108 + }, + { + "epoch": 0.019675879764421225, + "grad_norm": 3.109375, + "learning_rate": 4.996554809089373e-05, + "loss": 0.9195, + "step": 1110 + }, + { + "epoch": 0.019711331800032792, + "grad_norm": 2.84375, + "learning_rate": 4.9965401364924254e-05, + "loss": 0.8973, + "step": 1112 + }, + { + "epoch": 0.019746783835644363, + "grad_norm": 3.375, + "learning_rate": 4.996525432739176e-05, + "loss": 0.9351, + "step": 1114 + }, + { + "epoch": 0.019782235871255934, + "grad_norm": 3.15625, + "learning_rate": 4.99651069782981e-05, + "loss": 0.9017, + "step": 1116 + }, + { + "epoch": 0.0198176879068675, + "grad_norm": 2.90625, + "learning_rate": 4.99649593176451e-05, + "loss": 0.9318, + "step": 1118 + }, + { + "epoch": 0.019853139942479073, + "grad_norm": 3.25, + "learning_rate": 4.996481134543461e-05, + "loss": 0.8764, + "step": 1120 + }, + { + "epoch": 0.01988859197809064, + "grad_norm": 3.390625, + "learning_rate": 4.996466306166847e-05, + "loss": 0.9343, + "step": 1122 + }, + { + "epoch": 0.01992404401370221, + "grad_norm": 3.265625, + "learning_rate": 4.996451446634854e-05, + "loss": 0.9393, + "step": 1124 + }, + { + "epoch": 0.019959496049313782, + "grad_norm": 2.75, + "learning_rate": 4.996436555947667e-05, + "loss": 0.9263, + "step": 1126 + }, + { + "epoch": 0.01999494808492535, + "grad_norm": 3.25, + "learning_rate": 4.996421634105471e-05, + "loss": 0.9262, + "step": 1128 + }, + { + "epoch": 0.02003040012053692, + "grad_norm": 3.265625, + "learning_rate": 4.996406681108453e-05, + "loss": 0.9101, + "step": 1130 + }, + { + "epoch": 0.020065852156148492, + "grad_norm": 3.046875, + "learning_rate": 4.9963916969568e-05, + "loss": 0.9742, + "step": 1132 + }, + { + "epoch": 0.02010130419176006, + "grad_norm": 3.046875, + "learning_rate": 4.996376681650698e-05, + "loss": 0.9335, + "step": 1134 + }, + { + "epoch": 0.02013675622737163, + "grad_norm": 3.34375, + "learning_rate": 4.996361635190336e-05, + "loss": 0.9756, + "step": 1136 + }, + { + "epoch": 0.0201722082629832, + "grad_norm": 3.4375, + "learning_rate": 4.9963465575759006e-05, + "loss": 0.9531, + "step": 1138 + }, + { + "epoch": 0.02020766029859477, + "grad_norm": 2.78125, + "learning_rate": 4.9963314488075795e-05, + "loss": 0.8895, + "step": 1140 + }, + { + "epoch": 0.02024311233420634, + "grad_norm": 2.75, + "learning_rate": 4.996316308885562e-05, + "loss": 0.8957, + "step": 1142 + }, + { + "epoch": 0.020278564369817908, + "grad_norm": 2.96875, + "learning_rate": 4.9963011378100376e-05, + "loss": 0.9058, + "step": 1144 + }, + { + "epoch": 0.02031401640542948, + "grad_norm": 3.0, + "learning_rate": 4.996285935581194e-05, + "loss": 0.9273, + "step": 1146 + }, + { + "epoch": 0.02034946844104105, + "grad_norm": 3.5625, + "learning_rate": 4.996270702199223e-05, + "loss": 0.9218, + "step": 1148 + }, + { + "epoch": 0.020384920476652617, + "grad_norm": 2.921875, + "learning_rate": 4.996255437664312e-05, + "loss": 0.9251, + "step": 1150 + }, + { + "epoch": 0.02042037251226419, + "grad_norm": 3.109375, + "learning_rate": 4.996240141976654e-05, + "loss": 0.892, + "step": 1152 + }, + { + "epoch": 0.02045582454787576, + "grad_norm": 3.203125, + "learning_rate": 4.996224815136439e-05, + "loss": 0.9465, + "step": 1154 + }, + { + "epoch": 0.020491276583487327, + "grad_norm": 3.1875, + "learning_rate": 4.996209457143858e-05, + "loss": 0.9364, + "step": 1156 + }, + { + "epoch": 0.020526728619098898, + "grad_norm": 3.03125, + "learning_rate": 4.996194067999103e-05, + "loss": 0.9111, + "step": 1158 + }, + { + "epoch": 0.02056218065471047, + "grad_norm": 3.546875, + "learning_rate": 4.996178647702366e-05, + "loss": 0.8519, + "step": 1160 + }, + { + "epoch": 0.020597632690322037, + "grad_norm": 2.890625, + "learning_rate": 4.996163196253839e-05, + "loss": 0.976, + "step": 1162 + }, + { + "epoch": 0.020633084725933608, + "grad_norm": 2.8125, + "learning_rate": 4.996147713653716e-05, + "loss": 0.8858, + "step": 1164 + }, + { + "epoch": 0.020668536761545175, + "grad_norm": 2.875, + "learning_rate": 4.9961321999021886e-05, + "loss": 0.926, + "step": 1166 + }, + { + "epoch": 0.020703988797156746, + "grad_norm": 3.0, + "learning_rate": 4.996116654999452e-05, + "loss": 0.9177, + "step": 1168 + }, + { + "epoch": 0.020739440832768317, + "grad_norm": 2.859375, + "learning_rate": 4.996101078945699e-05, + "loss": 0.9003, + "step": 1170 + }, + { + "epoch": 0.020774892868379885, + "grad_norm": 2.71875, + "learning_rate": 4.9960854717411243e-05, + "loss": 0.9368, + "step": 1172 + }, + { + "epoch": 0.020810344903991456, + "grad_norm": 2.890625, + "learning_rate": 4.9960698333859234e-05, + "loss": 0.9016, + "step": 1174 + }, + { + "epoch": 0.020845796939603027, + "grad_norm": 3.1875, + "learning_rate": 4.9960541638802903e-05, + "loss": 0.8804, + "step": 1176 + }, + { + "epoch": 0.020881248975214595, + "grad_norm": 3.109375, + "learning_rate": 4.9960384632244216e-05, + "loss": 0.9707, + "step": 1178 + }, + { + "epoch": 0.020916701010826166, + "grad_norm": 2.859375, + "learning_rate": 4.9960227314185124e-05, + "loss": 0.8634, + "step": 1180 + }, + { + "epoch": 0.020952153046437737, + "grad_norm": 2.84375, + "learning_rate": 4.99600696846276e-05, + "loss": 0.9332, + "step": 1182 + }, + { + "epoch": 0.020987605082049304, + "grad_norm": 3.125, + "learning_rate": 4.99599117435736e-05, + "loss": 0.9643, + "step": 1184 + }, + { + "epoch": 0.021023057117660875, + "grad_norm": 3.421875, + "learning_rate": 4.9959753491025095e-05, + "loss": 0.888, + "step": 1186 + }, + { + "epoch": 0.021058509153272443, + "grad_norm": 2.859375, + "learning_rate": 4.9959594926984074e-05, + "loss": 0.9664, + "step": 1188 + }, + { + "epoch": 0.021093961188884014, + "grad_norm": 3.109375, + "learning_rate": 4.99594360514525e-05, + "loss": 0.9423, + "step": 1190 + }, + { + "epoch": 0.021129413224495585, + "grad_norm": 2.796875, + "learning_rate": 4.995927686443237e-05, + "loss": 0.8912, + "step": 1192 + }, + { + "epoch": 0.021164865260107153, + "grad_norm": 3.359375, + "learning_rate": 4.9959117365925654e-05, + "loss": 0.9002, + "step": 1194 + }, + { + "epoch": 0.021200317295718724, + "grad_norm": 3.296875, + "learning_rate": 4.995895755593436e-05, + "loss": 0.9246, + "step": 1196 + }, + { + "epoch": 0.021235769331330295, + "grad_norm": 2.90625, + "learning_rate": 4.995879743446047e-05, + "loss": 0.8685, + "step": 1198 + }, + { + "epoch": 0.021271221366941862, + "grad_norm": 2.8125, + "learning_rate": 4.995863700150599e-05, + "loss": 0.9362, + "step": 1200 + }, + { + "epoch": 0.021306673402553433, + "grad_norm": 2.8125, + "learning_rate": 4.9958476257072914e-05, + "loss": 0.9406, + "step": 1202 + }, + { + "epoch": 0.021342125438165004, + "grad_norm": 3.0, + "learning_rate": 4.995831520116326e-05, + "loss": 0.9096, + "step": 1204 + }, + { + "epoch": 0.021377577473776572, + "grad_norm": 3.296875, + "learning_rate": 4.9958153833779027e-05, + "loss": 0.8959, + "step": 1206 + }, + { + "epoch": 0.021413029509388143, + "grad_norm": 2.828125, + "learning_rate": 4.995799215492223e-05, + "loss": 0.8913, + "step": 1208 + }, + { + "epoch": 0.02144848154499971, + "grad_norm": 3.203125, + "learning_rate": 4.9957830164594893e-05, + "loss": 0.9452, + "step": 1210 + }, + { + "epoch": 0.02148393358061128, + "grad_norm": 2.84375, + "learning_rate": 4.995766786279903e-05, + "loss": 0.9203, + "step": 1212 + }, + { + "epoch": 0.021519385616222853, + "grad_norm": 2.59375, + "learning_rate": 4.9957505249536676e-05, + "loss": 0.9122, + "step": 1214 + }, + { + "epoch": 0.02155483765183442, + "grad_norm": 2.640625, + "learning_rate": 4.995734232480985e-05, + "loss": 0.8905, + "step": 1216 + }, + { + "epoch": 0.02159028968744599, + "grad_norm": 2.859375, + "learning_rate": 4.995717908862059e-05, + "loss": 0.9156, + "step": 1218 + }, + { + "epoch": 0.021625741723057562, + "grad_norm": 3.03125, + "learning_rate": 4.995701554097094e-05, + "loss": 0.9158, + "step": 1220 + }, + { + "epoch": 0.02166119375866913, + "grad_norm": 2.96875, + "learning_rate": 4.995685168186293e-05, + "loss": 0.8903, + "step": 1222 + }, + { + "epoch": 0.0216966457942807, + "grad_norm": 3.453125, + "learning_rate": 4.9956687511298604e-05, + "loss": 0.9174, + "step": 1224 + }, + { + "epoch": 0.021732097829892272, + "grad_norm": 3.234375, + "learning_rate": 4.995652302928002e-05, + "loss": 0.9353, + "step": 1226 + }, + { + "epoch": 0.02176754986550384, + "grad_norm": 2.8125, + "learning_rate": 4.995635823580922e-05, + "loss": 0.9103, + "step": 1228 + }, + { + "epoch": 0.02180300190111541, + "grad_norm": 3.390625, + "learning_rate": 4.9956193130888276e-05, + "loss": 0.9258, + "step": 1230 + }, + { + "epoch": 0.021838453936726978, + "grad_norm": 3.1875, + "learning_rate": 4.995602771451924e-05, + "loss": 0.8866, + "step": 1232 + }, + { + "epoch": 0.02187390597233855, + "grad_norm": 3.1875, + "learning_rate": 4.9955861986704175e-05, + "loss": 0.9572, + "step": 1234 + }, + { + "epoch": 0.02190935800795012, + "grad_norm": 2.9375, + "learning_rate": 4.9955695947445145e-05, + "loss": 0.9017, + "step": 1236 + }, + { + "epoch": 0.021944810043561688, + "grad_norm": 3.046875, + "learning_rate": 4.995552959674423e-05, + "loss": 0.9137, + "step": 1238 + }, + { + "epoch": 0.02198026207917326, + "grad_norm": 2.875, + "learning_rate": 4.995536293460351e-05, + "loss": 0.8681, + "step": 1240 + }, + { + "epoch": 0.02201571411478483, + "grad_norm": 3.34375, + "learning_rate": 4.995519596102506e-05, + "loss": 0.9348, + "step": 1242 + }, + { + "epoch": 0.022051166150396397, + "grad_norm": 2.90625, + "learning_rate": 4.995502867601095e-05, + "loss": 0.9005, + "step": 1244 + }, + { + "epoch": 0.02208661818600797, + "grad_norm": 3.296875, + "learning_rate": 4.995486107956329e-05, + "loss": 0.9338, + "step": 1246 + }, + { + "epoch": 0.02212207022161954, + "grad_norm": 2.90625, + "learning_rate": 4.995469317168415e-05, + "loss": 0.8978, + "step": 1248 + }, + { + "epoch": 0.022157522257231107, + "grad_norm": 2.828125, + "learning_rate": 4.9954524952375646e-05, + "loss": 0.8685, + "step": 1250 + }, + { + "epoch": 0.022192974292842678, + "grad_norm": 2.75, + "learning_rate": 4.995435642163987e-05, + "loss": 0.9427, + "step": 1252 + }, + { + "epoch": 0.022228426328454245, + "grad_norm": 3.015625, + "learning_rate": 4.995418757947892e-05, + "loss": 0.915, + "step": 1254 + }, + { + "epoch": 0.022263878364065817, + "grad_norm": 2.71875, + "learning_rate": 4.99540184258949e-05, + "loss": 0.8801, + "step": 1256 + }, + { + "epoch": 0.022299330399677388, + "grad_norm": 3.265625, + "learning_rate": 4.995384896088994e-05, + "loss": 0.9515, + "step": 1258 + }, + { + "epoch": 0.022334782435288955, + "grad_norm": 2.96875, + "learning_rate": 4.995367918446613e-05, + "loss": 0.9193, + "step": 1260 + }, + { + "epoch": 0.022370234470900526, + "grad_norm": 2.9375, + "learning_rate": 4.995350909662561e-05, + "loss": 0.9507, + "step": 1262 + }, + { + "epoch": 0.022405686506512097, + "grad_norm": 2.953125, + "learning_rate": 4.995333869737049e-05, + "loss": 0.8967, + "step": 1264 + }, + { + "epoch": 0.022441138542123665, + "grad_norm": 2.671875, + "learning_rate": 4.9953167986702905e-05, + "loss": 0.9278, + "step": 1266 + }, + { + "epoch": 0.022476590577735236, + "grad_norm": 2.984375, + "learning_rate": 4.9952996964624976e-05, + "loss": 0.8715, + "step": 1268 + }, + { + "epoch": 0.022512042613346807, + "grad_norm": 3.734375, + "learning_rate": 4.995282563113885e-05, + "loss": 0.8882, + "step": 1270 + }, + { + "epoch": 0.022547494648958374, + "grad_norm": 2.90625, + "learning_rate": 4.9952653986246646e-05, + "loss": 0.9051, + "step": 1272 + }, + { + "epoch": 0.022582946684569945, + "grad_norm": 3.109375, + "learning_rate": 4.995248202995052e-05, + "loss": 0.9401, + "step": 1274 + }, + { + "epoch": 0.022618398720181513, + "grad_norm": 2.671875, + "learning_rate": 4.9952309762252624e-05, + "loss": 0.8906, + "step": 1276 + }, + { + "epoch": 0.022653850755793084, + "grad_norm": 3.078125, + "learning_rate": 4.995213718315509e-05, + "loss": 0.9692, + "step": 1278 + }, + { + "epoch": 0.022689302791404655, + "grad_norm": 3.140625, + "learning_rate": 4.995196429266009e-05, + "loss": 0.8744, + "step": 1280 + }, + { + "epoch": 0.022724754827016223, + "grad_norm": 3.296875, + "learning_rate": 4.995179109076976e-05, + "loss": 0.8542, + "step": 1282 + }, + { + "epoch": 0.022760206862627794, + "grad_norm": 2.8125, + "learning_rate": 4.9951617577486285e-05, + "loss": 0.8996, + "step": 1284 + }, + { + "epoch": 0.022795658898239365, + "grad_norm": 2.953125, + "learning_rate": 4.995144375281182e-05, + "loss": 0.9078, + "step": 1286 + }, + { + "epoch": 0.022831110933850932, + "grad_norm": 3.03125, + "learning_rate": 4.9951269616748534e-05, + "loss": 0.9386, + "step": 1288 + }, + { + "epoch": 0.022866562969462503, + "grad_norm": 3.46875, + "learning_rate": 4.995109516929859e-05, + "loss": 0.8791, + "step": 1290 + }, + { + "epoch": 0.02290201500507407, + "grad_norm": 2.84375, + "learning_rate": 4.995092041046419e-05, + "loss": 0.8965, + "step": 1292 + }, + { + "epoch": 0.022937467040685642, + "grad_norm": 2.90625, + "learning_rate": 4.995074534024748e-05, + "loss": 0.9523, + "step": 1294 + }, + { + "epoch": 0.022972919076297213, + "grad_norm": 2.78125, + "learning_rate": 4.9950569958650684e-05, + "loss": 0.9555, + "step": 1296 + }, + { + "epoch": 0.02300837111190878, + "grad_norm": 3.546875, + "learning_rate": 4.995039426567596e-05, + "loss": 0.8951, + "step": 1298 + }, + { + "epoch": 0.02304382314752035, + "grad_norm": 2.90625, + "learning_rate": 4.995021826132552e-05, + "loss": 0.8788, + "step": 1300 + }, + { + "epoch": 0.023079275183131923, + "grad_norm": 3.09375, + "learning_rate": 4.995004194560155e-05, + "loss": 0.9304, + "step": 1302 + }, + { + "epoch": 0.02311472721874349, + "grad_norm": 3.203125, + "learning_rate": 4.9949865318506254e-05, + "loss": 0.9267, + "step": 1304 + }, + { + "epoch": 0.02315017925435506, + "grad_norm": 2.96875, + "learning_rate": 4.994968838004184e-05, + "loss": 0.8883, + "step": 1306 + }, + { + "epoch": 0.023185631289966632, + "grad_norm": 2.890625, + "learning_rate": 4.99495111302105e-05, + "loss": 0.9201, + "step": 1308 + }, + { + "epoch": 0.0232210833255782, + "grad_norm": 2.984375, + "learning_rate": 4.9949333569014464e-05, + "loss": 0.9868, + "step": 1310 + }, + { + "epoch": 0.02325653536118977, + "grad_norm": 2.703125, + "learning_rate": 4.994915569645594e-05, + "loss": 0.8611, + "step": 1312 + }, + { + "epoch": 0.02329198739680134, + "grad_norm": 2.65625, + "learning_rate": 4.994897751253715e-05, + "loss": 0.9253, + "step": 1314 + }, + { + "epoch": 0.02332743943241291, + "grad_norm": 2.953125, + "learning_rate": 4.9948799017260325e-05, + "loss": 0.9014, + "step": 1316 + }, + { + "epoch": 0.02336289146802448, + "grad_norm": 2.765625, + "learning_rate": 4.994862021062767e-05, + "loss": 0.9189, + "step": 1318 + }, + { + "epoch": 0.023398343503636048, + "grad_norm": 3.390625, + "learning_rate": 4.994844109264145e-05, + "loss": 0.8979, + "step": 1320 + }, + { + "epoch": 0.02343379553924762, + "grad_norm": 3.015625, + "learning_rate": 4.994826166330386e-05, + "loss": 0.8681, + "step": 1322 + }, + { + "epoch": 0.02346924757485919, + "grad_norm": 2.671875, + "learning_rate": 4.994808192261718e-05, + "loss": 0.8864, + "step": 1324 + }, + { + "epoch": 0.023504699610470758, + "grad_norm": 3.046875, + "learning_rate": 4.994790187058363e-05, + "loss": 0.9201, + "step": 1326 + }, + { + "epoch": 0.02354015164608233, + "grad_norm": 3.0625, + "learning_rate": 4.994772150720545e-05, + "loss": 0.8995, + "step": 1328 + }, + { + "epoch": 0.0235756036816939, + "grad_norm": 3.109375, + "learning_rate": 4.9947540832484904e-05, + "loss": 0.8978, + "step": 1330 + }, + { + "epoch": 0.023611055717305467, + "grad_norm": 2.875, + "learning_rate": 4.994735984642426e-05, + "loss": 0.903, + "step": 1332 + }, + { + "epoch": 0.02364650775291704, + "grad_norm": 2.875, + "learning_rate": 4.9947178549025745e-05, + "loss": 0.907, + "step": 1334 + }, + { + "epoch": 0.023681959788528606, + "grad_norm": 2.984375, + "learning_rate": 4.9946996940291644e-05, + "loss": 0.8918, + "step": 1336 + }, + { + "epoch": 0.023717411824140177, + "grad_norm": 2.984375, + "learning_rate": 4.9946815020224215e-05, + "loss": 0.8664, + "step": 1338 + }, + { + "epoch": 0.023752863859751748, + "grad_norm": 3.265625, + "learning_rate": 4.994663278882573e-05, + "loss": 0.9018, + "step": 1340 + }, + { + "epoch": 0.023788315895363316, + "grad_norm": 2.921875, + "learning_rate": 4.994645024609847e-05, + "loss": 0.875, + "step": 1342 + }, + { + "epoch": 0.023823767930974887, + "grad_norm": 3.125, + "learning_rate": 4.9946267392044696e-05, + "loss": 0.9201, + "step": 1344 + }, + { + "epoch": 0.023859219966586458, + "grad_norm": 3.234375, + "learning_rate": 4.994608422666671e-05, + "loss": 0.929, + "step": 1346 + }, + { + "epoch": 0.023894672002198025, + "grad_norm": 3.34375, + "learning_rate": 4.994590074996679e-05, + "loss": 0.929, + "step": 1348 + }, + { + "epoch": 0.023930124037809596, + "grad_norm": 2.859375, + "learning_rate": 4.994571696194722e-05, + "loss": 0.9022, + "step": 1350 + }, + { + "epoch": 0.023965576073421167, + "grad_norm": 2.984375, + "learning_rate": 4.99455328626103e-05, + "loss": 0.8865, + "step": 1352 + }, + { + "epoch": 0.024001028109032735, + "grad_norm": 2.84375, + "learning_rate": 4.994534845195832e-05, + "loss": 0.8835, + "step": 1354 + }, + { + "epoch": 0.024036480144644306, + "grad_norm": 2.9375, + "learning_rate": 4.99451637299936e-05, + "loss": 0.9338, + "step": 1356 + }, + { + "epoch": 0.024071932180255873, + "grad_norm": 3.0625, + "learning_rate": 4.9944978696718416e-05, + "loss": 0.9045, + "step": 1358 + }, + { + "epoch": 0.024107384215867445, + "grad_norm": 2.625, + "learning_rate": 4.99447933521351e-05, + "loss": 0.9218, + "step": 1360 + }, + { + "epoch": 0.024142836251479016, + "grad_norm": 3.09375, + "learning_rate": 4.994460769624596e-05, + "loss": 0.9163, + "step": 1362 + }, + { + "epoch": 0.024178288287090583, + "grad_norm": 3.09375, + "learning_rate": 4.994442172905331e-05, + "loss": 0.8869, + "step": 1364 + }, + { + "epoch": 0.024213740322702154, + "grad_norm": 3.046875, + "learning_rate": 4.994423545055948e-05, + "loss": 0.9001, + "step": 1366 + }, + { + "epoch": 0.024249192358313725, + "grad_norm": 3.015625, + "learning_rate": 4.994404886076678e-05, + "loss": 0.9176, + "step": 1368 + }, + { + "epoch": 0.024284644393925293, + "grad_norm": 3.109375, + "learning_rate": 4.994386195967754e-05, + "loss": 0.9031, + "step": 1370 + }, + { + "epoch": 0.024320096429536864, + "grad_norm": 2.90625, + "learning_rate": 4.994367474729411e-05, + "loss": 0.8798, + "step": 1372 + }, + { + "epoch": 0.024355548465148435, + "grad_norm": 3.40625, + "learning_rate": 4.994348722361881e-05, + "loss": 0.8774, + "step": 1374 + }, + { + "epoch": 0.024391000500760002, + "grad_norm": 3.15625, + "learning_rate": 4.9943299388653984e-05, + "loss": 0.9432, + "step": 1376 + }, + { + "epoch": 0.024426452536371573, + "grad_norm": 3.390625, + "learning_rate": 4.9943111242401974e-05, + "loss": 0.9392, + "step": 1378 + }, + { + "epoch": 0.02446190457198314, + "grad_norm": 3.109375, + "learning_rate": 4.994292278486514e-05, + "loss": 0.8717, + "step": 1380 + }, + { + "epoch": 0.024497356607594712, + "grad_norm": 3.046875, + "learning_rate": 4.994273401604582e-05, + "loss": 0.8945, + "step": 1382 + }, + { + "epoch": 0.024532808643206283, + "grad_norm": 3.140625, + "learning_rate": 4.994254493594637e-05, + "loss": 0.9413, + "step": 1384 + }, + { + "epoch": 0.02456826067881785, + "grad_norm": 2.625, + "learning_rate": 4.994235554456916e-05, + "loss": 0.889, + "step": 1386 + }, + { + "epoch": 0.02460371271442942, + "grad_norm": 3.125, + "learning_rate": 4.9942165841916545e-05, + "loss": 0.9052, + "step": 1388 + }, + { + "epoch": 0.024639164750040993, + "grad_norm": 2.796875, + "learning_rate": 4.99419758279909e-05, + "loss": 0.9182, + "step": 1390 + }, + { + "epoch": 0.02467461678565256, + "grad_norm": 2.90625, + "learning_rate": 4.9941785502794586e-05, + "loss": 0.8578, + "step": 1392 + }, + { + "epoch": 0.02471006882126413, + "grad_norm": 2.828125, + "learning_rate": 4.994159486632999e-05, + "loss": 0.8931, + "step": 1394 + }, + { + "epoch": 0.024745520856875702, + "grad_norm": 3.21875, + "learning_rate": 4.994140391859947e-05, + "loss": 0.9515, + "step": 1396 + }, + { + "epoch": 0.02478097289248727, + "grad_norm": 3.34375, + "learning_rate": 4.994121265960544e-05, + "loss": 0.9105, + "step": 1398 + }, + { + "epoch": 0.02481642492809884, + "grad_norm": 3.171875, + "learning_rate": 4.994102108935027e-05, + "loss": 0.9351, + "step": 1400 + }, + { + "epoch": 0.02485187696371041, + "grad_norm": 2.546875, + "learning_rate": 4.9940829207836346e-05, + "loss": 0.8385, + "step": 1402 + }, + { + "epoch": 0.02488732899932198, + "grad_norm": 2.90625, + "learning_rate": 4.994063701506607e-05, + "loss": 0.8899, + "step": 1404 + }, + { + "epoch": 0.02492278103493355, + "grad_norm": 2.875, + "learning_rate": 4.994044451104184e-05, + "loss": 0.9206, + "step": 1406 + }, + { + "epoch": 0.024958233070545118, + "grad_norm": 3.015625, + "learning_rate": 4.994025169576605e-05, + "loss": 0.9093, + "step": 1408 + }, + { + "epoch": 0.02499368510615669, + "grad_norm": 3.046875, + "learning_rate": 4.9940058569241125e-05, + "loss": 0.9178, + "step": 1410 + }, + { + "epoch": 0.02502913714176826, + "grad_norm": 2.8125, + "learning_rate": 4.993986513146945e-05, + "loss": 0.8901, + "step": 1412 + }, + { + "epoch": 0.025064589177379828, + "grad_norm": 2.796875, + "learning_rate": 4.993967138245347e-05, + "loss": 0.8781, + "step": 1414 + }, + { + "epoch": 0.0251000412129914, + "grad_norm": 2.765625, + "learning_rate": 4.9939477322195574e-05, + "loss": 0.9092, + "step": 1416 + }, + { + "epoch": 0.02513549324860297, + "grad_norm": 3.375, + "learning_rate": 4.9939282950698195e-05, + "loss": 0.9014, + "step": 1418 + }, + { + "epoch": 0.025170945284214537, + "grad_norm": 3.53125, + "learning_rate": 4.9939088267963763e-05, + "loss": 0.8958, + "step": 1420 + }, + { + "epoch": 0.02520639731982611, + "grad_norm": 3.078125, + "learning_rate": 4.9938893273994706e-05, + "loss": 0.918, + "step": 1422 + }, + { + "epoch": 0.025241849355437676, + "grad_norm": 2.859375, + "learning_rate": 4.9938697968793454e-05, + "loss": 0.8702, + "step": 1424 + }, + { + "epoch": 0.025277301391049247, + "grad_norm": 3.765625, + "learning_rate": 4.9938502352362435e-05, + "loss": 0.9012, + "step": 1426 + }, + { + "epoch": 0.025312753426660818, + "grad_norm": 2.765625, + "learning_rate": 4.9938306424704114e-05, + "loss": 0.8525, + "step": 1428 + }, + { + "epoch": 0.025348205462272386, + "grad_norm": 2.984375, + "learning_rate": 4.993811018582092e-05, + "loss": 0.909, + "step": 1430 + }, + { + "epoch": 0.025383657497883957, + "grad_norm": 3.15625, + "learning_rate": 4.993791363571531e-05, + "loss": 0.8888, + "step": 1432 + }, + { + "epoch": 0.025419109533495528, + "grad_norm": 2.96875, + "learning_rate": 4.993771677438972e-05, + "loss": 0.9036, + "step": 1434 + }, + { + "epoch": 0.025454561569107095, + "grad_norm": 3.046875, + "learning_rate": 4.993751960184663e-05, + "loss": 0.8765, + "step": 1436 + }, + { + "epoch": 0.025490013604718666, + "grad_norm": 3.546875, + "learning_rate": 4.993732211808848e-05, + "loss": 0.9343, + "step": 1438 + }, + { + "epoch": 0.025525465640330237, + "grad_norm": 3.203125, + "learning_rate": 4.993712432311775e-05, + "loss": 0.8502, + "step": 1440 + }, + { + "epoch": 0.025560917675941805, + "grad_norm": 3.046875, + "learning_rate": 4.9936926216936905e-05, + "loss": 0.9446, + "step": 1442 + }, + { + "epoch": 0.025596369711553376, + "grad_norm": 3.015625, + "learning_rate": 4.993672779954841e-05, + "loss": 0.9214, + "step": 1444 + }, + { + "epoch": 0.025631821747164944, + "grad_norm": 2.640625, + "learning_rate": 4.993652907095475e-05, + "loss": 0.8599, + "step": 1446 + }, + { + "epoch": 0.025667273782776515, + "grad_norm": 2.9375, + "learning_rate": 4.99363300311584e-05, + "loss": 0.9155, + "step": 1448 + }, + { + "epoch": 0.025702725818388086, + "grad_norm": 2.90625, + "learning_rate": 4.993613068016184e-05, + "loss": 0.9058, + "step": 1450 + }, + { + "epoch": 0.025738177853999653, + "grad_norm": 2.9375, + "learning_rate": 4.993593101796756e-05, + "loss": 0.9138, + "step": 1452 + }, + { + "epoch": 0.025773629889611224, + "grad_norm": 2.765625, + "learning_rate": 4.993573104457806e-05, + "loss": 0.888, + "step": 1454 + }, + { + "epoch": 0.025809081925222795, + "grad_norm": 2.828125, + "learning_rate": 4.993553075999584e-05, + "loss": 0.9234, + "step": 1456 + }, + { + "epoch": 0.025844533960834363, + "grad_norm": 3.015625, + "learning_rate": 4.9935330164223376e-05, + "loss": 0.91, + "step": 1458 + }, + { + "epoch": 0.025879985996445934, + "grad_norm": 2.890625, + "learning_rate": 4.993512925726319e-05, + "loss": 0.9006, + "step": 1460 + }, + { + "epoch": 0.025915438032057505, + "grad_norm": 2.890625, + "learning_rate": 4.993492803911778e-05, + "loss": 0.8324, + "step": 1462 + }, + { + "epoch": 0.025950890067669072, + "grad_norm": 2.734375, + "learning_rate": 4.993472650978968e-05, + "loss": 0.8797, + "step": 1464 + }, + { + "epoch": 0.025986342103280644, + "grad_norm": 2.859375, + "learning_rate": 4.993452466928137e-05, + "loss": 0.932, + "step": 1466 + }, + { + "epoch": 0.02602179413889221, + "grad_norm": 3.203125, + "learning_rate": 4.993432251759538e-05, + "loss": 0.8812, + "step": 1468 + }, + { + "epoch": 0.026057246174503782, + "grad_norm": 3.078125, + "learning_rate": 4.993412005473425e-05, + "loss": 0.9186, + "step": 1470 + }, + { + "epoch": 0.026092698210115353, + "grad_norm": 3.359375, + "learning_rate": 4.993391728070049e-05, + "loss": 0.9155, + "step": 1472 + }, + { + "epoch": 0.02612815024572692, + "grad_norm": 3.0625, + "learning_rate": 4.993371419549664e-05, + "loss": 0.8935, + "step": 1474 + }, + { + "epoch": 0.026163602281338492, + "grad_norm": 2.875, + "learning_rate": 4.9933510799125224e-05, + "loss": 0.8947, + "step": 1476 + }, + { + "epoch": 0.026199054316950063, + "grad_norm": 3.078125, + "learning_rate": 4.9933307091588796e-05, + "loss": 0.8763, + "step": 1478 + }, + { + "epoch": 0.02623450635256163, + "grad_norm": 2.875, + "learning_rate": 4.993310307288988e-05, + "loss": 0.8889, + "step": 1480 + }, + { + "epoch": 0.0262699583881732, + "grad_norm": 2.65625, + "learning_rate": 4.993289874303103e-05, + "loss": 0.8904, + "step": 1482 + }, + { + "epoch": 0.02630541042378477, + "grad_norm": 3.328125, + "learning_rate": 4.993269410201481e-05, + "loss": 0.9644, + "step": 1484 + }, + { + "epoch": 0.02634086245939634, + "grad_norm": 2.84375, + "learning_rate": 4.993248914984375e-05, + "loss": 0.9051, + "step": 1486 + }, + { + "epoch": 0.02637631449500791, + "grad_norm": 2.8125, + "learning_rate": 4.9932283886520413e-05, + "loss": 0.9146, + "step": 1488 + }, + { + "epoch": 0.02641176653061948, + "grad_norm": 2.609375, + "learning_rate": 4.993207831204738e-05, + "loss": 0.8888, + "step": 1490 + }, + { + "epoch": 0.02644721856623105, + "grad_norm": 2.90625, + "learning_rate": 4.9931872426427196e-05, + "loss": 0.9044, + "step": 1492 + }, + { + "epoch": 0.02648267060184262, + "grad_norm": 2.875, + "learning_rate": 4.9931666229662435e-05, + "loss": 0.955, + "step": 1494 + }, + { + "epoch": 0.02651812263745419, + "grad_norm": 3.046875, + "learning_rate": 4.993145972175567e-05, + "loss": 0.9151, + "step": 1496 + }, + { + "epoch": 0.02655357467306576, + "grad_norm": 3.09375, + "learning_rate": 4.993125290270949e-05, + "loss": 0.917, + "step": 1498 + }, + { + "epoch": 0.02658902670867733, + "grad_norm": 3.25, + "learning_rate": 4.993104577252646e-05, + "loss": 0.9137, + "step": 1500 + }, + { + "epoch": 0.026624478744288898, + "grad_norm": 2.71875, + "learning_rate": 4.993083833120917e-05, + "loss": 0.9038, + "step": 1502 + }, + { + "epoch": 0.02665993077990047, + "grad_norm": 2.765625, + "learning_rate": 4.993063057876022e-05, + "loss": 0.9221, + "step": 1504 + }, + { + "epoch": 0.026695382815512037, + "grad_norm": 3.3125, + "learning_rate": 4.993042251518218e-05, + "loss": 0.8918, + "step": 1506 + }, + { + "epoch": 0.026730834851123608, + "grad_norm": 3.15625, + "learning_rate": 4.993021414047767e-05, + "loss": 0.9025, + "step": 1508 + }, + { + "epoch": 0.02676628688673518, + "grad_norm": 3.0625, + "learning_rate": 4.9930005454649276e-05, + "loss": 0.8823, + "step": 1510 + }, + { + "epoch": 0.026801738922346746, + "grad_norm": 2.96875, + "learning_rate": 4.9929796457699606e-05, + "loss": 0.8694, + "step": 1512 + }, + { + "epoch": 0.026837190957958317, + "grad_norm": 3.03125, + "learning_rate": 4.9929587149631265e-05, + "loss": 0.9034, + "step": 1514 + }, + { + "epoch": 0.026872642993569888, + "grad_norm": 3.171875, + "learning_rate": 4.9929377530446876e-05, + "loss": 0.9044, + "step": 1516 + }, + { + "epoch": 0.026908095029181456, + "grad_norm": 2.859375, + "learning_rate": 4.992916760014904e-05, + "loss": 0.9068, + "step": 1518 + }, + { + "epoch": 0.026943547064793027, + "grad_norm": 2.609375, + "learning_rate": 4.992895735874039e-05, + "loss": 0.8867, + "step": 1520 + }, + { + "epoch": 0.026978999100404598, + "grad_norm": 2.90625, + "learning_rate": 4.9928746806223545e-05, + "loss": 0.8914, + "step": 1522 + }, + { + "epoch": 0.027014451136016165, + "grad_norm": 3.046875, + "learning_rate": 4.992853594260114e-05, + "loss": 0.9118, + "step": 1524 + }, + { + "epoch": 0.027049903171627736, + "grad_norm": 2.65625, + "learning_rate": 4.992832476787579e-05, + "loss": 0.8973, + "step": 1526 + }, + { + "epoch": 0.027085355207239304, + "grad_norm": 2.828125, + "learning_rate": 4.992811328205013e-05, + "loss": 0.9082, + "step": 1528 + }, + { + "epoch": 0.027120807242850875, + "grad_norm": 2.765625, + "learning_rate": 4.992790148512682e-05, + "loss": 0.8552, + "step": 1530 + }, + { + "epoch": 0.027156259278462446, + "grad_norm": 3.25, + "learning_rate": 4.992768937710849e-05, + "loss": 0.9192, + "step": 1532 + }, + { + "epoch": 0.027191711314074014, + "grad_norm": 2.984375, + "learning_rate": 4.992747695799779e-05, + "loss": 0.9207, + "step": 1534 + }, + { + "epoch": 0.027227163349685585, + "grad_norm": 2.703125, + "learning_rate": 4.992726422779737e-05, + "loss": 0.9069, + "step": 1536 + }, + { + "epoch": 0.027262615385297156, + "grad_norm": 2.890625, + "learning_rate": 4.9927051186509876e-05, + "loss": 0.8553, + "step": 1538 + }, + { + "epoch": 0.027298067420908723, + "grad_norm": 2.96875, + "learning_rate": 4.992683783413798e-05, + "loss": 0.9094, + "step": 1540 + }, + { + "epoch": 0.027333519456520294, + "grad_norm": 2.921875, + "learning_rate": 4.9926624170684345e-05, + "loss": 0.9098, + "step": 1542 + }, + { + "epoch": 0.027368971492131865, + "grad_norm": 2.921875, + "learning_rate": 4.9926410196151625e-05, + "loss": 0.8634, + "step": 1544 + }, + { + "epoch": 0.027404423527743433, + "grad_norm": 3.484375, + "learning_rate": 4.99261959105425e-05, + "loss": 0.9041, + "step": 1546 + }, + { + "epoch": 0.027439875563355004, + "grad_norm": 2.953125, + "learning_rate": 4.992598131385964e-05, + "loss": 0.8923, + "step": 1548 + }, + { + "epoch": 0.02747532759896657, + "grad_norm": 2.921875, + "learning_rate": 4.992576640610572e-05, + "loss": 0.9115, + "step": 1550 + }, + { + "epoch": 0.027510779634578143, + "grad_norm": 2.5625, + "learning_rate": 4.992555118728344e-05, + "loss": 0.8475, + "step": 1552 + }, + { + "epoch": 0.027546231670189714, + "grad_norm": 3.1875, + "learning_rate": 4.992533565739547e-05, + "loss": 0.9031, + "step": 1554 + }, + { + "epoch": 0.02758168370580128, + "grad_norm": 3.0625, + "learning_rate": 4.99251198164445e-05, + "loss": 0.8947, + "step": 1556 + }, + { + "epoch": 0.027617135741412852, + "grad_norm": 2.625, + "learning_rate": 4.992490366443322e-05, + "loss": 0.8605, + "step": 1558 + }, + { + "epoch": 0.027652587777024423, + "grad_norm": 3.0625, + "learning_rate": 4.992468720136434e-05, + "loss": 0.8773, + "step": 1560 + }, + { + "epoch": 0.02768803981263599, + "grad_norm": 3.046875, + "learning_rate": 4.9924470427240556e-05, + "loss": 0.9447, + "step": 1562 + }, + { + "epoch": 0.027723491848247562, + "grad_norm": 2.484375, + "learning_rate": 4.992425334206457e-05, + "loss": 0.9087, + "step": 1564 + }, + { + "epoch": 0.027758943883859133, + "grad_norm": 2.890625, + "learning_rate": 4.992403594583909e-05, + "loss": 0.9347, + "step": 1566 + }, + { + "epoch": 0.0277943959194707, + "grad_norm": 2.96875, + "learning_rate": 4.9923818238566844e-05, + "loss": 0.894, + "step": 1568 + }, + { + "epoch": 0.02782984795508227, + "grad_norm": 3.25, + "learning_rate": 4.9923600220250526e-05, + "loss": 0.855, + "step": 1570 + }, + { + "epoch": 0.02786529999069384, + "grad_norm": 3.1875, + "learning_rate": 4.9923381890892874e-05, + "loss": 0.8794, + "step": 1572 + }, + { + "epoch": 0.02790075202630541, + "grad_norm": 2.78125, + "learning_rate": 4.9923163250496606e-05, + "loss": 0.8898, + "step": 1574 + }, + { + "epoch": 0.02793620406191698, + "grad_norm": 3.25, + "learning_rate": 4.992294429906445e-05, + "loss": 0.9033, + "step": 1576 + }, + { + "epoch": 0.02797165609752855, + "grad_norm": 2.703125, + "learning_rate": 4.9922725036599146e-05, + "loss": 0.9061, + "step": 1578 + }, + { + "epoch": 0.02800710813314012, + "grad_norm": 2.9375, + "learning_rate": 4.992250546310342e-05, + "loss": 0.8935, + "step": 1580 + }, + { + "epoch": 0.02804256016875169, + "grad_norm": 2.890625, + "learning_rate": 4.992228557858002e-05, + "loss": 0.8952, + "step": 1582 + }, + { + "epoch": 0.02807801220436326, + "grad_norm": 2.984375, + "learning_rate": 4.992206538303168e-05, + "loss": 0.8433, + "step": 1584 + }, + { + "epoch": 0.02811346423997483, + "grad_norm": 2.859375, + "learning_rate": 4.992184487646116e-05, + "loss": 0.8904, + "step": 1586 + }, + { + "epoch": 0.0281489162755864, + "grad_norm": 2.921875, + "learning_rate": 4.992162405887121e-05, + "loss": 0.8927, + "step": 1588 + }, + { + "epoch": 0.028184368311197968, + "grad_norm": 3.046875, + "learning_rate": 4.992140293026458e-05, + "loss": 0.908, + "step": 1590 + }, + { + "epoch": 0.02821982034680954, + "grad_norm": 3.109375, + "learning_rate": 4.992118149064403e-05, + "loss": 0.8945, + "step": 1592 + }, + { + "epoch": 0.028255272382421107, + "grad_norm": 3.0625, + "learning_rate": 4.9920959740012326e-05, + "loss": 0.9162, + "step": 1594 + }, + { + "epoch": 0.028290724418032678, + "grad_norm": 3.078125, + "learning_rate": 4.9920737678372234e-05, + "loss": 0.8884, + "step": 1596 + }, + { + "epoch": 0.02832617645364425, + "grad_norm": 3.0625, + "learning_rate": 4.9920515305726526e-05, + "loss": 0.8824, + "step": 1598 + }, + { + "epoch": 0.028361628489255816, + "grad_norm": 3.0625, + "learning_rate": 4.992029262207798e-05, + "loss": 0.8814, + "step": 1600 + }, + { + "epoch": 0.028397080524867387, + "grad_norm": 2.921875, + "learning_rate": 4.9920069627429375e-05, + "loss": 0.9012, + "step": 1602 + }, + { + "epoch": 0.02843253256047896, + "grad_norm": 3.28125, + "learning_rate": 4.991984632178349e-05, + "loss": 0.9315, + "step": 1604 + }, + { + "epoch": 0.028467984596090526, + "grad_norm": 2.734375, + "learning_rate": 4.9919622705143106e-05, + "loss": 0.9475, + "step": 1606 + }, + { + "epoch": 0.028503436631702097, + "grad_norm": 3.125, + "learning_rate": 4.991939877751103e-05, + "loss": 0.9051, + "step": 1608 + }, + { + "epoch": 0.028538888667313668, + "grad_norm": 3.078125, + "learning_rate": 4.991917453889004e-05, + "loss": 0.887, + "step": 1610 + }, + { + "epoch": 0.028574340702925236, + "grad_norm": 3.03125, + "learning_rate": 4.991894998928295e-05, + "loss": 0.9247, + "step": 1612 + }, + { + "epoch": 0.028609792738536807, + "grad_norm": 3.078125, + "learning_rate": 4.991872512869256e-05, + "loss": 0.9195, + "step": 1614 + }, + { + "epoch": 0.028645244774148374, + "grad_norm": 3.078125, + "learning_rate": 4.9918499957121654e-05, + "loss": 0.9334, + "step": 1616 + }, + { + "epoch": 0.028680696809759945, + "grad_norm": 2.84375, + "learning_rate": 4.991827447457307e-05, + "loss": 0.8793, + "step": 1618 + }, + { + "epoch": 0.028716148845371516, + "grad_norm": 2.6875, + "learning_rate": 4.991804868104961e-05, + "loss": 0.9034, + "step": 1620 + }, + { + "epoch": 0.028751600880983084, + "grad_norm": 3.0625, + "learning_rate": 4.991782257655408e-05, + "loss": 0.9069, + "step": 1622 + }, + { + "epoch": 0.028787052916594655, + "grad_norm": 2.9375, + "learning_rate": 4.991759616108933e-05, + "loss": 0.8551, + "step": 1624 + }, + { + "epoch": 0.028822504952206226, + "grad_norm": 2.9375, + "learning_rate": 4.991736943465816e-05, + "loss": 0.9543, + "step": 1626 + }, + { + "epoch": 0.028857956987817793, + "grad_norm": 2.546875, + "learning_rate": 4.991714239726342e-05, + "loss": 0.9071, + "step": 1628 + }, + { + "epoch": 0.028893409023429364, + "grad_norm": 3.125, + "learning_rate": 4.991691504890792e-05, + "loss": 0.9125, + "step": 1630 + }, + { + "epoch": 0.028928861059040935, + "grad_norm": 3.09375, + "learning_rate": 4.991668738959452e-05, + "loss": 0.9668, + "step": 1632 + }, + { + "epoch": 0.028964313094652503, + "grad_norm": 3.015625, + "learning_rate": 4.991645941932604e-05, + "loss": 0.8547, + "step": 1634 + }, + { + "epoch": 0.028999765130264074, + "grad_norm": 2.921875, + "learning_rate": 4.9916231138105354e-05, + "loss": 0.8627, + "step": 1636 + }, + { + "epoch": 0.02903521716587564, + "grad_norm": 2.84375, + "learning_rate": 4.991600254593527e-05, + "loss": 0.8693, + "step": 1638 + }, + { + "epoch": 0.029070669201487213, + "grad_norm": 2.8125, + "learning_rate": 4.9915773642818684e-05, + "loss": 0.8523, + "step": 1640 + }, + { + "epoch": 0.029106121237098784, + "grad_norm": 2.9375, + "learning_rate": 4.991554442875842e-05, + "loss": 0.8958, + "step": 1642 + }, + { + "epoch": 0.02914157327271035, + "grad_norm": 3.03125, + "learning_rate": 4.991531490375736e-05, + "loss": 0.8493, + "step": 1644 + }, + { + "epoch": 0.029177025308321922, + "grad_norm": 3.078125, + "learning_rate": 4.9915085067818355e-05, + "loss": 0.9503, + "step": 1646 + }, + { + "epoch": 0.029212477343933493, + "grad_norm": 2.90625, + "learning_rate": 4.9914854920944276e-05, + "loss": 0.9112, + "step": 1648 + }, + { + "epoch": 0.02924792937954506, + "grad_norm": 3.125, + "learning_rate": 4.9914624463138e-05, + "loss": 0.9116, + "step": 1650 + }, + { + "epoch": 0.029283381415156632, + "grad_norm": 3.265625, + "learning_rate": 4.991439369440239e-05, + "loss": 0.899, + "step": 1652 + }, + { + "epoch": 0.029318833450768203, + "grad_norm": 2.90625, + "learning_rate": 4.9914162614740355e-05, + "loss": 0.8528, + "step": 1654 + }, + { + "epoch": 0.02935428548637977, + "grad_norm": 3.15625, + "learning_rate": 4.991393122415475e-05, + "loss": 0.9004, + "step": 1656 + }, + { + "epoch": 0.02938973752199134, + "grad_norm": 2.921875, + "learning_rate": 4.991369952264847e-05, + "loss": 0.915, + "step": 1658 + }, + { + "epoch": 0.02942518955760291, + "grad_norm": 2.734375, + "learning_rate": 4.991346751022441e-05, + "loss": 0.8629, + "step": 1660 + }, + { + "epoch": 0.02946064159321448, + "grad_norm": 2.921875, + "learning_rate": 4.9913235186885464e-05, + "loss": 0.8786, + "step": 1662 + }, + { + "epoch": 0.02949609362882605, + "grad_norm": 3.78125, + "learning_rate": 4.991300255263454e-05, + "loss": 0.8372, + "step": 1664 + }, + { + "epoch": 0.02953154566443762, + "grad_norm": 2.890625, + "learning_rate": 4.991276960747452e-05, + "loss": 0.8547, + "step": 1666 + }, + { + "epoch": 0.02956699770004919, + "grad_norm": 3.078125, + "learning_rate": 4.9912536351408334e-05, + "loss": 0.9204, + "step": 1668 + }, + { + "epoch": 0.02960244973566076, + "grad_norm": 3.28125, + "learning_rate": 4.991230278443888e-05, + "loss": 0.8767, + "step": 1670 + }, + { + "epoch": 0.02963790177127233, + "grad_norm": 2.59375, + "learning_rate": 4.9912068906569076e-05, + "loss": 0.8974, + "step": 1672 + }, + { + "epoch": 0.0296733538068839, + "grad_norm": 3.109375, + "learning_rate": 4.991183471780184e-05, + "loss": 0.8934, + "step": 1674 + }, + { + "epoch": 0.029708805842495467, + "grad_norm": 2.953125, + "learning_rate": 4.9911600218140107e-05, + "loss": 0.9092, + "step": 1676 + }, + { + "epoch": 0.029744257878107038, + "grad_norm": 3.1875, + "learning_rate": 4.9911365407586774e-05, + "loss": 0.8898, + "step": 1678 + }, + { + "epoch": 0.02977970991371861, + "grad_norm": 2.921875, + "learning_rate": 4.9911130286144805e-05, + "loss": 0.8761, + "step": 1680 + }, + { + "epoch": 0.029815161949330177, + "grad_norm": 2.90625, + "learning_rate": 4.9910894853817106e-05, + "loss": 0.8469, + "step": 1682 + }, + { + "epoch": 0.029850613984941748, + "grad_norm": 3.328125, + "learning_rate": 4.991065911060663e-05, + "loss": 0.8584, + "step": 1684 + }, + { + "epoch": 0.02988606602055332, + "grad_norm": 3.078125, + "learning_rate": 4.991042305651632e-05, + "loss": 0.8896, + "step": 1686 + }, + { + "epoch": 0.029921518056164886, + "grad_norm": 2.875, + "learning_rate": 4.9910186691549123e-05, + "loss": 0.8259, + "step": 1688 + }, + { + "epoch": 0.029956970091776457, + "grad_norm": 2.90625, + "learning_rate": 4.990995001570798e-05, + "loss": 0.9028, + "step": 1690 + }, + { + "epoch": 0.02999242212738803, + "grad_norm": 2.9375, + "learning_rate": 4.9909713028995845e-05, + "loss": 0.8824, + "step": 1692 + }, + { + "epoch": 0.030027874162999596, + "grad_norm": 2.96875, + "learning_rate": 4.9909475731415686e-05, + "loss": 0.9096, + "step": 1694 + }, + { + "epoch": 0.030063326198611167, + "grad_norm": 2.953125, + "learning_rate": 4.990923812297046e-05, + "loss": 0.8724, + "step": 1696 + }, + { + "epoch": 0.030098778234222735, + "grad_norm": 2.9375, + "learning_rate": 4.990900020366313e-05, + "loss": 0.8796, + "step": 1698 + }, + { + "epoch": 0.030134230269834306, + "grad_norm": 3.140625, + "learning_rate": 4.990876197349665e-05, + "loss": 0.8311, + "step": 1700 + }, + { + "epoch": 0.030169682305445877, + "grad_norm": 3.53125, + "learning_rate": 4.9908523432474024e-05, + "loss": 0.9301, + "step": 1702 + }, + { + "epoch": 0.030205134341057444, + "grad_norm": 2.5625, + "learning_rate": 4.9908284580598206e-05, + "loss": 0.8491, + "step": 1704 + }, + { + "epoch": 0.030240586376669015, + "grad_norm": 2.609375, + "learning_rate": 4.99080454178722e-05, + "loss": 0.9107, + "step": 1706 + }, + { + "epoch": 0.030276038412280586, + "grad_norm": 2.875, + "learning_rate": 4.990780594429896e-05, + "loss": 0.9217, + "step": 1708 + }, + { + "epoch": 0.030311490447892154, + "grad_norm": 3.203125, + "learning_rate": 4.990756615988149e-05, + "loss": 0.9592, + "step": 1710 + }, + { + "epoch": 0.030346942483503725, + "grad_norm": 2.625, + "learning_rate": 4.9907326064622786e-05, + "loss": 0.8798, + "step": 1712 + }, + { + "epoch": 0.030382394519115296, + "grad_norm": 2.890625, + "learning_rate": 4.990708565852584e-05, + "loss": 0.9163, + "step": 1714 + }, + { + "epoch": 0.030417846554726864, + "grad_norm": 2.859375, + "learning_rate": 4.9906844941593654e-05, + "loss": 0.9099, + "step": 1716 + }, + { + "epoch": 0.030453298590338435, + "grad_norm": 2.9375, + "learning_rate": 4.990660391382923e-05, + "loss": 0.9131, + "step": 1718 + }, + { + "epoch": 0.030488750625950002, + "grad_norm": 3.09375, + "learning_rate": 4.9906362575235575e-05, + "loss": 0.8907, + "step": 1720 + }, + { + "epoch": 0.030524202661561573, + "grad_norm": 3.0, + "learning_rate": 4.9906120925815706e-05, + "loss": 0.8797, + "step": 1722 + }, + { + "epoch": 0.030559654697173144, + "grad_norm": 3.265625, + "learning_rate": 4.990587896557263e-05, + "loss": 0.934, + "step": 1724 + }, + { + "epoch": 0.030595106732784712, + "grad_norm": 2.890625, + "learning_rate": 4.990563669450938e-05, + "loss": 0.8834, + "step": 1726 + }, + { + "epoch": 0.030630558768396283, + "grad_norm": 2.859375, + "learning_rate": 4.990539411262897e-05, + "loss": 0.9206, + "step": 1728 + }, + { + "epoch": 0.030666010804007854, + "grad_norm": 2.8125, + "learning_rate": 4.990515121993442e-05, + "loss": 0.8585, + "step": 1730 + }, + { + "epoch": 0.03070146283961942, + "grad_norm": 2.890625, + "learning_rate": 4.990490801642878e-05, + "loss": 0.8668, + "step": 1732 + }, + { + "epoch": 0.030736914875230992, + "grad_norm": 2.84375, + "learning_rate": 4.990466450211507e-05, + "loss": 0.911, + "step": 1734 + }, + { + "epoch": 0.030772366910842563, + "grad_norm": 3.078125, + "learning_rate": 4.990442067699634e-05, + "loss": 0.8918, + "step": 1736 + }, + { + "epoch": 0.03080781894645413, + "grad_norm": 2.71875, + "learning_rate": 4.990417654107562e-05, + "loss": 0.881, + "step": 1738 + }, + { + "epoch": 0.030843270982065702, + "grad_norm": 2.6875, + "learning_rate": 4.990393209435596e-05, + "loss": 0.9107, + "step": 1740 + }, + { + "epoch": 0.03087872301767727, + "grad_norm": 3.359375, + "learning_rate": 4.990368733684043e-05, + "loss": 0.9067, + "step": 1742 + }, + { + "epoch": 0.03091417505328884, + "grad_norm": 3.15625, + "learning_rate": 4.9903442268532066e-05, + "loss": 0.853, + "step": 1744 + }, + { + "epoch": 0.03094962708890041, + "grad_norm": 3.21875, + "learning_rate": 4.990319688943392e-05, + "loss": 0.8587, + "step": 1746 + }, + { + "epoch": 0.03098507912451198, + "grad_norm": 2.796875, + "learning_rate": 4.990295119954906e-05, + "loss": 0.8913, + "step": 1748 + }, + { + "epoch": 0.03102053116012355, + "grad_norm": 2.90625, + "learning_rate": 4.990270519888057e-05, + "loss": 0.9057, + "step": 1750 + }, + { + "epoch": 0.03105598319573512, + "grad_norm": 2.75, + "learning_rate": 4.99024588874315e-05, + "loss": 0.8899, + "step": 1752 + }, + { + "epoch": 0.03109143523134669, + "grad_norm": 2.890625, + "learning_rate": 4.990221226520493e-05, + "loss": 0.8811, + "step": 1754 + }, + { + "epoch": 0.03112688726695826, + "grad_norm": 3.03125, + "learning_rate": 4.990196533220394e-05, + "loss": 0.8812, + "step": 1756 + }, + { + "epoch": 0.03116233930256983, + "grad_norm": 2.953125, + "learning_rate": 4.99017180884316e-05, + "loss": 0.8764, + "step": 1758 + }, + { + "epoch": 0.0311977913381814, + "grad_norm": 2.71875, + "learning_rate": 4.9901470533891014e-05, + "loss": 0.9297, + "step": 1760 + }, + { + "epoch": 0.03123324337379297, + "grad_norm": 3.09375, + "learning_rate": 4.9901222668585266e-05, + "loss": 0.8929, + "step": 1762 + }, + { + "epoch": 0.03126869540940454, + "grad_norm": 2.828125, + "learning_rate": 4.9900974492517435e-05, + "loss": 0.8731, + "step": 1764 + }, + { + "epoch": 0.03130414744501611, + "grad_norm": 2.734375, + "learning_rate": 4.990072600569064e-05, + "loss": 0.8794, + "step": 1766 + }, + { + "epoch": 0.03133959948062768, + "grad_norm": 2.6875, + "learning_rate": 4.9900477208107957e-05, + "loss": 0.8343, + "step": 1768 + }, + { + "epoch": 0.03137505151623925, + "grad_norm": 3.015625, + "learning_rate": 4.9900228099772516e-05, + "loss": 0.9005, + "step": 1770 + }, + { + "epoch": 0.031410503551850814, + "grad_norm": 2.625, + "learning_rate": 4.9899978680687406e-05, + "loss": 0.879, + "step": 1772 + }, + { + "epoch": 0.03144595558746239, + "grad_norm": 2.84375, + "learning_rate": 4.9899728950855764e-05, + "loss": 0.876, + "step": 1774 + }, + { + "epoch": 0.031481407623073956, + "grad_norm": 3.171875, + "learning_rate": 4.989947891028067e-05, + "loss": 0.9064, + "step": 1776 + }, + { + "epoch": 0.031516859658685524, + "grad_norm": 3.09375, + "learning_rate": 4.989922855896528e-05, + "loss": 0.8831, + "step": 1778 + }, + { + "epoch": 0.0315523116942971, + "grad_norm": 3.03125, + "learning_rate": 4.98989778969127e-05, + "loss": 0.8988, + "step": 1780 + }, + { + "epoch": 0.031587763729908666, + "grad_norm": 3.203125, + "learning_rate": 4.989872692412606e-05, + "loss": 0.8684, + "step": 1782 + }, + { + "epoch": 0.031623215765520234, + "grad_norm": 3.578125, + "learning_rate": 4.98984756406085e-05, + "loss": 0.8947, + "step": 1784 + }, + { + "epoch": 0.03165866780113181, + "grad_norm": 3.3125, + "learning_rate": 4.989822404636314e-05, + "loss": 0.8621, + "step": 1786 + }, + { + "epoch": 0.031694119836743376, + "grad_norm": 2.796875, + "learning_rate": 4.9897972141393135e-05, + "loss": 0.872, + "step": 1788 + }, + { + "epoch": 0.03172957187235494, + "grad_norm": 2.90625, + "learning_rate": 4.989771992570163e-05, + "loss": 0.893, + "step": 1790 + }, + { + "epoch": 0.03176502390796652, + "grad_norm": 2.96875, + "learning_rate": 4.9897467399291756e-05, + "loss": 0.916, + "step": 1792 + }, + { + "epoch": 0.031800475943578085, + "grad_norm": 2.375, + "learning_rate": 4.989721456216668e-05, + "loss": 0.8348, + "step": 1794 + }, + { + "epoch": 0.03183592797918965, + "grad_norm": 2.796875, + "learning_rate": 4.989696141432955e-05, + "loss": 0.8458, + "step": 1796 + }, + { + "epoch": 0.03187138001480123, + "grad_norm": 2.96875, + "learning_rate": 4.9896707955783526e-05, + "loss": 0.9631, + "step": 1798 + }, + { + "epoch": 0.031906832050412795, + "grad_norm": 3.0, + "learning_rate": 4.989645418653177e-05, + "loss": 0.9095, + "step": 1800 + }, + { + "epoch": 0.03194228408602436, + "grad_norm": 3.046875, + "learning_rate": 4.9896200106577465e-05, + "loss": 0.8879, + "step": 1802 + }, + { + "epoch": 0.03197773612163594, + "grad_norm": 3.140625, + "learning_rate": 4.9895945715923754e-05, + "loss": 0.9092, + "step": 1804 + }, + { + "epoch": 0.032013188157247505, + "grad_norm": 2.578125, + "learning_rate": 4.989569101457383e-05, + "loss": 0.843, + "step": 1806 + }, + { + "epoch": 0.03204864019285907, + "grad_norm": 3.046875, + "learning_rate": 4.989543600253087e-05, + "loss": 0.9134, + "step": 1808 + }, + { + "epoch": 0.03208409222847065, + "grad_norm": 2.984375, + "learning_rate": 4.989518067979805e-05, + "loss": 0.8729, + "step": 1810 + }, + { + "epoch": 0.032119544264082214, + "grad_norm": 2.828125, + "learning_rate": 4.989492504637856e-05, + "loss": 0.8983, + "step": 1812 + }, + { + "epoch": 0.03215499629969378, + "grad_norm": 3.0625, + "learning_rate": 4.989466910227559e-05, + "loss": 0.8304, + "step": 1814 + }, + { + "epoch": 0.03219044833530535, + "grad_norm": 3.328125, + "learning_rate": 4.9894412847492345e-05, + "loss": 0.8675, + "step": 1816 + }, + { + "epoch": 0.032225900370916924, + "grad_norm": 3.34375, + "learning_rate": 4.9894156282032e-05, + "loss": 0.9471, + "step": 1818 + }, + { + "epoch": 0.03226135240652849, + "grad_norm": 3.234375, + "learning_rate": 4.989389940589778e-05, + "loss": 0.9131, + "step": 1820 + }, + { + "epoch": 0.03229680444214006, + "grad_norm": 2.90625, + "learning_rate": 4.989364221909287e-05, + "loss": 0.909, + "step": 1822 + }, + { + "epoch": 0.032332256477751634, + "grad_norm": 3.046875, + "learning_rate": 4.98933847216205e-05, + "loss": 0.8465, + "step": 1824 + }, + { + "epoch": 0.0323677085133632, + "grad_norm": 2.96875, + "learning_rate": 4.989312691348387e-05, + "loss": 0.8581, + "step": 1826 + }, + { + "epoch": 0.03240316054897477, + "grad_norm": 2.6875, + "learning_rate": 4.98928687946862e-05, + "loss": 0.9257, + "step": 1828 + }, + { + "epoch": 0.03243861258458634, + "grad_norm": 2.65625, + "learning_rate": 4.989261036523071e-05, + "loss": 0.8431, + "step": 1830 + }, + { + "epoch": 0.03247406462019791, + "grad_norm": 2.96875, + "learning_rate": 4.989235162512064e-05, + "loss": 0.9564, + "step": 1832 + }, + { + "epoch": 0.03250951665580948, + "grad_norm": 2.984375, + "learning_rate": 4.989209257435919e-05, + "loss": 0.8758, + "step": 1834 + }, + { + "epoch": 0.03254496869142105, + "grad_norm": 2.828125, + "learning_rate": 4.989183321294961e-05, + "loss": 0.8783, + "step": 1836 + }, + { + "epoch": 0.03258042072703262, + "grad_norm": 3.28125, + "learning_rate": 4.989157354089515e-05, + "loss": 0.927, + "step": 1838 + }, + { + "epoch": 0.03261587276264419, + "grad_norm": 3.390625, + "learning_rate": 4.9891313558199025e-05, + "loss": 0.9246, + "step": 1840 + }, + { + "epoch": 0.03265132479825576, + "grad_norm": 2.96875, + "learning_rate": 4.98910532648645e-05, + "loss": 0.8639, + "step": 1842 + }, + { + "epoch": 0.03268677683386733, + "grad_norm": 3.03125, + "learning_rate": 4.9890792660894806e-05, + "loss": 0.856, + "step": 1844 + }, + { + "epoch": 0.0327222288694789, + "grad_norm": 2.984375, + "learning_rate": 4.989053174629321e-05, + "loss": 0.8842, + "step": 1846 + }, + { + "epoch": 0.03275768090509047, + "grad_norm": 2.921875, + "learning_rate": 4.989027052106295e-05, + "loss": 0.8997, + "step": 1848 + }, + { + "epoch": 0.03279313294070204, + "grad_norm": 3.03125, + "learning_rate": 4.989000898520732e-05, + "loss": 0.8674, + "step": 1850 + }, + { + "epoch": 0.03282858497631361, + "grad_norm": 2.921875, + "learning_rate": 4.988974713872955e-05, + "loss": 0.8712, + "step": 1852 + }, + { + "epoch": 0.03286403701192518, + "grad_norm": 3.234375, + "learning_rate": 4.9889484981632913e-05, + "loss": 0.9267, + "step": 1854 + }, + { + "epoch": 0.03289948904753675, + "grad_norm": 3.0625, + "learning_rate": 4.98892225139207e-05, + "loss": 0.8959, + "step": 1856 + }, + { + "epoch": 0.03293494108314832, + "grad_norm": 2.96875, + "learning_rate": 4.9888959735596165e-05, + "loss": 0.887, + "step": 1858 + }, + { + "epoch": 0.032970393118759884, + "grad_norm": 2.984375, + "learning_rate": 4.9888696646662606e-05, + "loss": 0.8938, + "step": 1860 + }, + { + "epoch": 0.03300584515437146, + "grad_norm": 3.71875, + "learning_rate": 4.9888433247123284e-05, + "loss": 0.8704, + "step": 1862 + }, + { + "epoch": 0.03304129718998303, + "grad_norm": 2.546875, + "learning_rate": 4.988816953698151e-05, + "loss": 0.926, + "step": 1864 + }, + { + "epoch": 0.033076749225594594, + "grad_norm": 2.875, + "learning_rate": 4.9887905516240555e-05, + "loss": 0.8907, + "step": 1866 + }, + { + "epoch": 0.03311220126120617, + "grad_norm": 3.078125, + "learning_rate": 4.988764118490373e-05, + "loss": 0.9096, + "step": 1868 + }, + { + "epoch": 0.033147653296817736, + "grad_norm": 3.078125, + "learning_rate": 4.988737654297432e-05, + "loss": 0.8758, + "step": 1870 + }, + { + "epoch": 0.033183105332429304, + "grad_norm": 2.90625, + "learning_rate": 4.988711159045564e-05, + "loss": 0.9465, + "step": 1872 + }, + { + "epoch": 0.03321855736804088, + "grad_norm": 2.65625, + "learning_rate": 4.988684632735099e-05, + "loss": 0.8731, + "step": 1874 + }, + { + "epoch": 0.033254009403652446, + "grad_norm": 3.0625, + "learning_rate": 4.988658075366368e-05, + "loss": 0.9215, + "step": 1876 + }, + { + "epoch": 0.03328946143926401, + "grad_norm": 2.515625, + "learning_rate": 4.9886314869397025e-05, + "loss": 0.9032, + "step": 1878 + }, + { + "epoch": 0.03332491347487559, + "grad_norm": 2.875, + "learning_rate": 4.9886048674554347e-05, + "loss": 0.8975, + "step": 1880 + }, + { + "epoch": 0.033360365510487155, + "grad_norm": 3.078125, + "learning_rate": 4.988578216913896e-05, + "loss": 0.9586, + "step": 1882 + }, + { + "epoch": 0.03339581754609872, + "grad_norm": 2.9375, + "learning_rate": 4.98855153531542e-05, + "loss": 0.8882, + "step": 1884 + }, + { + "epoch": 0.0334312695817103, + "grad_norm": 2.90625, + "learning_rate": 4.9885248226603397e-05, + "loss": 0.8855, + "step": 1886 + }, + { + "epoch": 0.033466721617321865, + "grad_norm": 2.90625, + "learning_rate": 4.9884980789489865e-05, + "loss": 0.8807, + "step": 1888 + }, + { + "epoch": 0.03350217365293343, + "grad_norm": 3.140625, + "learning_rate": 4.988471304181697e-05, + "loss": 0.8932, + "step": 1890 + }, + { + "epoch": 0.03353762568854501, + "grad_norm": 2.765625, + "learning_rate": 4.988444498358803e-05, + "loss": 0.8452, + "step": 1892 + }, + { + "epoch": 0.033573077724156575, + "grad_norm": 2.859375, + "learning_rate": 4.98841766148064e-05, + "loss": 0.8801, + "step": 1894 + }, + { + "epoch": 0.03360852975976814, + "grad_norm": 3.234375, + "learning_rate": 4.9883907935475436e-05, + "loss": 0.8889, + "step": 1896 + }, + { + "epoch": 0.03364398179537972, + "grad_norm": 2.71875, + "learning_rate": 4.988363894559847e-05, + "loss": 0.8927, + "step": 1898 + }, + { + "epoch": 0.033679433830991284, + "grad_norm": 2.796875, + "learning_rate": 4.988336964517889e-05, + "loss": 0.8917, + "step": 1900 + }, + { + "epoch": 0.03371488586660285, + "grad_norm": 2.6875, + "learning_rate": 4.988310003422003e-05, + "loss": 0.8618, + "step": 1902 + }, + { + "epoch": 0.03375033790221442, + "grad_norm": 2.6875, + "learning_rate": 4.9882830112725264e-05, + "loss": 0.9061, + "step": 1904 + }, + { + "epoch": 0.033785789937825994, + "grad_norm": 2.984375, + "learning_rate": 4.9882559880697964e-05, + "loss": 0.8871, + "step": 1906 + }, + { + "epoch": 0.03382124197343756, + "grad_norm": 2.75, + "learning_rate": 4.9882289338141494e-05, + "loss": 0.9005, + "step": 1908 + }, + { + "epoch": 0.03385669400904913, + "grad_norm": 2.734375, + "learning_rate": 4.988201848505925e-05, + "loss": 0.8677, + "step": 1910 + }, + { + "epoch": 0.033892146044660704, + "grad_norm": 2.71875, + "learning_rate": 4.988174732145458e-05, + "loss": 0.8617, + "step": 1912 + }, + { + "epoch": 0.03392759808027227, + "grad_norm": 3.015625, + "learning_rate": 4.988147584733089e-05, + "loss": 0.8704, + "step": 1914 + }, + { + "epoch": 0.03396305011588384, + "grad_norm": 3.03125, + "learning_rate": 4.9881204062691575e-05, + "loss": 0.8464, + "step": 1916 + }, + { + "epoch": 0.03399850215149541, + "grad_norm": 3.0625, + "learning_rate": 4.988093196754001e-05, + "loss": 0.9313, + "step": 1918 + }, + { + "epoch": 0.03403395418710698, + "grad_norm": 3.078125, + "learning_rate": 4.9880659561879596e-05, + "loss": 0.8973, + "step": 1920 + }, + { + "epoch": 0.03406940622271855, + "grad_norm": 2.75, + "learning_rate": 4.988038684571373e-05, + "loss": 0.9628, + "step": 1922 + }, + { + "epoch": 0.03410485825833012, + "grad_norm": 2.828125, + "learning_rate": 4.988011381904581e-05, + "loss": 0.87, + "step": 1924 + }, + { + "epoch": 0.03414031029394169, + "grad_norm": 3.0, + "learning_rate": 4.987984048187927e-05, + "loss": 0.9069, + "step": 1926 + }, + { + "epoch": 0.03417576232955326, + "grad_norm": 3.078125, + "learning_rate": 4.98795668342175e-05, + "loss": 0.9091, + "step": 1928 + }, + { + "epoch": 0.03421121436516483, + "grad_norm": 2.828125, + "learning_rate": 4.987929287606391e-05, + "loss": 0.8688, + "step": 1930 + }, + { + "epoch": 0.0342466664007764, + "grad_norm": 2.9375, + "learning_rate": 4.9879018607421927e-05, + "loss": 0.8515, + "step": 1932 + }, + { + "epoch": 0.03428211843638797, + "grad_norm": 2.765625, + "learning_rate": 4.9878744028294974e-05, + "loss": 0.899, + "step": 1934 + }, + { + "epoch": 0.03431757047199954, + "grad_norm": 3.0625, + "learning_rate": 4.987846913868648e-05, + "loss": 0.8567, + "step": 1936 + }, + { + "epoch": 0.03435302250761111, + "grad_norm": 3.125, + "learning_rate": 4.987819393859987e-05, + "loss": 0.8997, + "step": 1938 + }, + { + "epoch": 0.03438847454322268, + "grad_norm": 2.765625, + "learning_rate": 4.987791842803858e-05, + "loss": 0.8767, + "step": 1940 + }, + { + "epoch": 0.03442392657883425, + "grad_norm": 2.828125, + "learning_rate": 4.9877642607006056e-05, + "loss": 0.8827, + "step": 1942 + }, + { + "epoch": 0.03445937861444582, + "grad_norm": 3.0, + "learning_rate": 4.9877366475505735e-05, + "loss": 0.8539, + "step": 1944 + }, + { + "epoch": 0.03449483065005739, + "grad_norm": 2.859375, + "learning_rate": 4.9877090033541065e-05, + "loss": 0.9004, + "step": 1946 + }, + { + "epoch": 0.034530282685668955, + "grad_norm": 2.96875, + "learning_rate": 4.987681328111548e-05, + "loss": 0.8699, + "step": 1948 + }, + { + "epoch": 0.03456573472128053, + "grad_norm": 2.859375, + "learning_rate": 4.987653621823245e-05, + "loss": 0.8887, + "step": 1950 + }, + { + "epoch": 0.0346011867568921, + "grad_norm": 2.96875, + "learning_rate": 4.987625884489544e-05, + "loss": 0.9072, + "step": 1952 + }, + { + "epoch": 0.034636638792503664, + "grad_norm": 2.921875, + "learning_rate": 4.9875981161107885e-05, + "loss": 0.9238, + "step": 1954 + }, + { + "epoch": 0.03467209082811524, + "grad_norm": 3.046875, + "learning_rate": 4.987570316687328e-05, + "loss": 0.9007, + "step": 1956 + }, + { + "epoch": 0.034707542863726806, + "grad_norm": 3.09375, + "learning_rate": 4.987542486219507e-05, + "loss": 0.8812, + "step": 1958 + }, + { + "epoch": 0.034742994899338374, + "grad_norm": 3.421875, + "learning_rate": 4.987514624707675e-05, + "loss": 0.9161, + "step": 1960 + }, + { + "epoch": 0.03477844693494995, + "grad_norm": 3.28125, + "learning_rate": 4.9874867321521776e-05, + "loss": 0.893, + "step": 1962 + }, + { + "epoch": 0.034813898970561516, + "grad_norm": 2.828125, + "learning_rate": 4.9874588085533644e-05, + "loss": 0.8903, + "step": 1964 + }, + { + "epoch": 0.034849351006173084, + "grad_norm": 3.359375, + "learning_rate": 4.987430853911583e-05, + "loss": 0.9052, + "step": 1966 + }, + { + "epoch": 0.03488480304178466, + "grad_norm": 3.109375, + "learning_rate": 4.987402868227183e-05, + "loss": 0.8775, + "step": 1968 + }, + { + "epoch": 0.034920255077396226, + "grad_norm": 2.515625, + "learning_rate": 4.9873748515005134e-05, + "loss": 0.861, + "step": 1970 + }, + { + "epoch": 0.03495570711300779, + "grad_norm": 2.75, + "learning_rate": 4.987346803731924e-05, + "loss": 0.8776, + "step": 1972 + }, + { + "epoch": 0.03499115914861937, + "grad_norm": 2.703125, + "learning_rate": 4.987318724921764e-05, + "loss": 0.8981, + "step": 1974 + }, + { + "epoch": 0.035026611184230935, + "grad_norm": 3.0625, + "learning_rate": 4.987290615070385e-05, + "loss": 0.8984, + "step": 1976 + }, + { + "epoch": 0.0350620632198425, + "grad_norm": 2.96875, + "learning_rate": 4.987262474178136e-05, + "loss": 0.9129, + "step": 1978 + }, + { + "epoch": 0.03509751525545408, + "grad_norm": 2.78125, + "learning_rate": 4.98723430224537e-05, + "loss": 0.9045, + "step": 1980 + }, + { + "epoch": 0.035132967291065645, + "grad_norm": 2.78125, + "learning_rate": 4.9872060992724387e-05, + "loss": 0.8857, + "step": 1982 + }, + { + "epoch": 0.03516841932667721, + "grad_norm": 2.953125, + "learning_rate": 4.9871778652596926e-05, + "loss": 0.8699, + "step": 1984 + }, + { + "epoch": 0.03520387136228878, + "grad_norm": 2.90625, + "learning_rate": 4.9871496002074845e-05, + "loss": 0.8877, + "step": 1986 + }, + { + "epoch": 0.035239323397900355, + "grad_norm": 2.765625, + "learning_rate": 4.987121304116168e-05, + "loss": 0.8973, + "step": 1988 + }, + { + "epoch": 0.03527477543351192, + "grad_norm": 2.84375, + "learning_rate": 4.987092976986095e-05, + "loss": 0.8566, + "step": 1990 + }, + { + "epoch": 0.03531022746912349, + "grad_norm": 2.78125, + "learning_rate": 4.9870646188176205e-05, + "loss": 0.8753, + "step": 1992 + }, + { + "epoch": 0.035345679504735064, + "grad_norm": 2.6875, + "learning_rate": 4.9870362296110974e-05, + "loss": 0.9174, + "step": 1994 + }, + { + "epoch": 0.03538113154034663, + "grad_norm": 2.9375, + "learning_rate": 4.9870078093668795e-05, + "loss": 0.8929, + "step": 1996 + }, + { + "epoch": 0.0354165835759582, + "grad_norm": 2.8125, + "learning_rate": 4.986979358085323e-05, + "loss": 0.8878, + "step": 1998 + }, + { + "epoch": 0.035452035611569774, + "grad_norm": 2.828125, + "learning_rate": 4.9869508757667816e-05, + "loss": 0.912, + "step": 2000 + }, + { + "epoch": 0.03548748764718134, + "grad_norm": 2.6875, + "learning_rate": 4.986922362411611e-05, + "loss": 0.8657, + "step": 2002 + }, + { + "epoch": 0.03552293968279291, + "grad_norm": 2.78125, + "learning_rate": 4.986893818020168e-05, + "loss": 0.8613, + "step": 2004 + }, + { + "epoch": 0.03555839171840448, + "grad_norm": 3.0, + "learning_rate": 4.986865242592807e-05, + "loss": 0.8583, + "step": 2006 + }, + { + "epoch": 0.03559384375401605, + "grad_norm": 3.15625, + "learning_rate": 4.9868366361298873e-05, + "loss": 0.8839, + "step": 2008 + }, + { + "epoch": 0.03562929578962762, + "grad_norm": 3.109375, + "learning_rate": 4.986807998631764e-05, + "loss": 0.8905, + "step": 2010 + }, + { + "epoch": 0.03566474782523919, + "grad_norm": 2.90625, + "learning_rate": 4.9867793300987945e-05, + "loss": 0.8616, + "step": 2012 + }, + { + "epoch": 0.03570019986085076, + "grad_norm": 3.015625, + "learning_rate": 4.986750630531336e-05, + "loss": 0.8757, + "step": 2014 + }, + { + "epoch": 0.03573565189646233, + "grad_norm": 2.640625, + "learning_rate": 4.9867218999297486e-05, + "loss": 0.8668, + "step": 2016 + }, + { + "epoch": 0.0357711039320739, + "grad_norm": 3.046875, + "learning_rate": 4.98669313829439e-05, + "loss": 0.8505, + "step": 2018 + }, + { + "epoch": 0.03580655596768547, + "grad_norm": 2.875, + "learning_rate": 4.986664345625619e-05, + "loss": 0.89, + "step": 2020 + }, + { + "epoch": 0.03584200800329704, + "grad_norm": 2.6875, + "learning_rate": 4.986635521923794e-05, + "loss": 0.8667, + "step": 2022 + }, + { + "epoch": 0.03587746003890861, + "grad_norm": 3.109375, + "learning_rate": 4.9866066671892765e-05, + "loss": 0.954, + "step": 2024 + }, + { + "epoch": 0.03591291207452018, + "grad_norm": 2.703125, + "learning_rate": 4.9865777814224254e-05, + "loss": 0.9229, + "step": 2026 + }, + { + "epoch": 0.03594836411013175, + "grad_norm": 3.140625, + "learning_rate": 4.9865488646236014e-05, + "loss": 0.8738, + "step": 2028 + }, + { + "epoch": 0.035983816145743315, + "grad_norm": 3.0, + "learning_rate": 4.9865199167931657e-05, + "loss": 0.9409, + "step": 2030 + }, + { + "epoch": 0.03601926818135489, + "grad_norm": 3.171875, + "learning_rate": 4.986490937931479e-05, + "loss": 0.8501, + "step": 2032 + }, + { + "epoch": 0.03605472021696646, + "grad_norm": 2.625, + "learning_rate": 4.9864619280389036e-05, + "loss": 0.8757, + "step": 2034 + }, + { + "epoch": 0.036090172252578025, + "grad_norm": 2.875, + "learning_rate": 4.986432887115801e-05, + "loss": 0.8583, + "step": 2036 + }, + { + "epoch": 0.0361256242881896, + "grad_norm": 2.703125, + "learning_rate": 4.986403815162534e-05, + "loss": 0.8564, + "step": 2038 + }, + { + "epoch": 0.03616107632380117, + "grad_norm": 3.046875, + "learning_rate": 4.9863747121794654e-05, + "loss": 0.9092, + "step": 2040 + }, + { + "epoch": 0.036196528359412734, + "grad_norm": 3.09375, + "learning_rate": 4.986345578166958e-05, + "loss": 0.9119, + "step": 2042 + }, + { + "epoch": 0.03623198039502431, + "grad_norm": 3.25, + "learning_rate": 4.986316413125376e-05, + "loss": 0.8834, + "step": 2044 + }, + { + "epoch": 0.036267432430635876, + "grad_norm": 2.921875, + "learning_rate": 4.9862872170550826e-05, + "loss": 0.8866, + "step": 2046 + }, + { + "epoch": 0.036302884466247444, + "grad_norm": 2.9375, + "learning_rate": 4.9862579899564435e-05, + "loss": 0.9096, + "step": 2048 + }, + { + "epoch": 0.03633833650185902, + "grad_norm": 2.9375, + "learning_rate": 4.9862287318298213e-05, + "loss": 0.8794, + "step": 2050 + }, + { + "epoch": 0.036373788537470586, + "grad_norm": 2.875, + "learning_rate": 4.9861994426755834e-05, + "loss": 0.909, + "step": 2052 + }, + { + "epoch": 0.036409240573082154, + "grad_norm": 3.234375, + "learning_rate": 4.986170122494093e-05, + "loss": 0.8997, + "step": 2054 + }, + { + "epoch": 0.03644469260869373, + "grad_norm": 2.453125, + "learning_rate": 4.9861407712857185e-05, + "loss": 0.8905, + "step": 2056 + }, + { + "epoch": 0.036480144644305296, + "grad_norm": 2.96875, + "learning_rate": 4.9861113890508246e-05, + "loss": 0.9056, + "step": 2058 + }, + { + "epoch": 0.03651559667991686, + "grad_norm": 3.015625, + "learning_rate": 4.986081975789778e-05, + "loss": 0.9046, + "step": 2060 + }, + { + "epoch": 0.03655104871552844, + "grad_norm": 2.875, + "learning_rate": 4.986052531502947e-05, + "loss": 0.9152, + "step": 2062 + }, + { + "epoch": 0.036586500751140005, + "grad_norm": 2.828125, + "learning_rate": 4.9860230561906976e-05, + "loss": 0.8837, + "step": 2064 + }, + { + "epoch": 0.03662195278675157, + "grad_norm": 3.53125, + "learning_rate": 4.985993549853398e-05, + "loss": 0.9131, + "step": 2066 + }, + { + "epoch": 0.03665740482236315, + "grad_norm": 3.078125, + "learning_rate": 4.9859640124914174e-05, + "loss": 0.8591, + "step": 2068 + }, + { + "epoch": 0.036692856857974715, + "grad_norm": 3.203125, + "learning_rate": 4.9859344441051234e-05, + "loss": 0.9129, + "step": 2070 + }, + { + "epoch": 0.03672830889358628, + "grad_norm": 2.890625, + "learning_rate": 4.985904844694885e-05, + "loss": 0.8509, + "step": 2072 + }, + { + "epoch": 0.03676376092919785, + "grad_norm": 3.078125, + "learning_rate": 4.985875214261073e-05, + "loss": 0.8665, + "step": 2074 + }, + { + "epoch": 0.036799212964809425, + "grad_norm": 2.75, + "learning_rate": 4.985845552804055e-05, + "loss": 0.8363, + "step": 2076 + }, + { + "epoch": 0.03683466500042099, + "grad_norm": 2.65625, + "learning_rate": 4.985815860324203e-05, + "loss": 0.8305, + "step": 2078 + }, + { + "epoch": 0.03687011703603256, + "grad_norm": 2.890625, + "learning_rate": 4.985786136821886e-05, + "loss": 0.8186, + "step": 2080 + }, + { + "epoch": 0.036905569071644134, + "grad_norm": 2.875, + "learning_rate": 4.985756382297476e-05, + "loss": 0.8541, + "step": 2082 + }, + { + "epoch": 0.0369410211072557, + "grad_norm": 2.65625, + "learning_rate": 4.985726596751344e-05, + "loss": 0.877, + "step": 2084 + }, + { + "epoch": 0.03697647314286727, + "grad_norm": 2.90625, + "learning_rate": 4.985696780183863e-05, + "loss": 0.9137, + "step": 2086 + }, + { + "epoch": 0.037011925178478844, + "grad_norm": 2.984375, + "learning_rate": 4.985666932595403e-05, + "loss": 0.89, + "step": 2088 + }, + { + "epoch": 0.03704737721409041, + "grad_norm": 2.921875, + "learning_rate": 4.985637053986337e-05, + "loss": 0.8867, + "step": 2090 + }, + { + "epoch": 0.03708282924970198, + "grad_norm": 3.390625, + "learning_rate": 4.985607144357039e-05, + "loss": 0.8978, + "step": 2092 + }, + { + "epoch": 0.037118281285313554, + "grad_norm": 3.09375, + "learning_rate": 4.9855772037078814e-05, + "loss": 0.8873, + "step": 2094 + }, + { + "epoch": 0.03715373332092512, + "grad_norm": 2.859375, + "learning_rate": 4.9855472320392375e-05, + "loss": 0.9264, + "step": 2096 + }, + { + "epoch": 0.03718918535653669, + "grad_norm": 3.09375, + "learning_rate": 4.9855172293514826e-05, + "loss": 0.8793, + "step": 2098 + }, + { + "epoch": 0.03722463739214826, + "grad_norm": 2.765625, + "learning_rate": 4.98548719564499e-05, + "loss": 0.8243, + "step": 2100 + }, + { + "epoch": 0.03726008942775983, + "grad_norm": 2.6875, + "learning_rate": 4.985457130920135e-05, + "loss": 0.8314, + "step": 2102 + }, + { + "epoch": 0.0372955414633714, + "grad_norm": 2.578125, + "learning_rate": 4.9854270351772925e-05, + "loss": 0.8263, + "step": 2104 + }, + { + "epoch": 0.03733099349898297, + "grad_norm": 2.96875, + "learning_rate": 4.9853969084168386e-05, + "loss": 0.8607, + "step": 2106 + }, + { + "epoch": 0.03736644553459454, + "grad_norm": 2.921875, + "learning_rate": 4.985366750639148e-05, + "loss": 0.8569, + "step": 2108 + }, + { + "epoch": 0.03740189757020611, + "grad_norm": 2.953125, + "learning_rate": 4.985336561844599e-05, + "loss": 0.8951, + "step": 2110 + }, + { + "epoch": 0.03743734960581768, + "grad_norm": 2.96875, + "learning_rate": 4.985306342033567e-05, + "loss": 0.9011, + "step": 2112 + }, + { + "epoch": 0.03747280164142925, + "grad_norm": 2.96875, + "learning_rate": 4.98527609120643e-05, + "loss": 0.884, + "step": 2114 + }, + { + "epoch": 0.03750825367704082, + "grad_norm": 2.6875, + "learning_rate": 4.9852458093635646e-05, + "loss": 0.8846, + "step": 2116 + }, + { + "epoch": 0.037543705712652385, + "grad_norm": 3.109375, + "learning_rate": 4.985215496505349e-05, + "loss": 0.8899, + "step": 2118 + }, + { + "epoch": 0.03757915774826396, + "grad_norm": 2.9375, + "learning_rate": 4.985185152632162e-05, + "loss": 0.8511, + "step": 2120 + }, + { + "epoch": 0.03761460978387553, + "grad_norm": 3.140625, + "learning_rate": 4.985154777744382e-05, + "loss": 0.8845, + "step": 2122 + }, + { + "epoch": 0.037650061819487095, + "grad_norm": 2.921875, + "learning_rate": 4.9851243718423875e-05, + "loss": 0.8729, + "step": 2124 + }, + { + "epoch": 0.03768551385509867, + "grad_norm": 3.4375, + "learning_rate": 4.9850939349265587e-05, + "loss": 0.8871, + "step": 2126 + }, + { + "epoch": 0.03772096589071024, + "grad_norm": 3.015625, + "learning_rate": 4.985063466997275e-05, + "loss": 0.9002, + "step": 2128 + }, + { + "epoch": 0.037756417926321804, + "grad_norm": 2.65625, + "learning_rate": 4.985032968054917e-05, + "loss": 0.8798, + "step": 2130 + }, + { + "epoch": 0.03779186996193338, + "grad_norm": 3.359375, + "learning_rate": 4.985002438099865e-05, + "loss": 0.9102, + "step": 2132 + }, + { + "epoch": 0.037827321997544947, + "grad_norm": 2.828125, + "learning_rate": 4.984971877132501e-05, + "loss": 0.8697, + "step": 2134 + }, + { + "epoch": 0.037862774033156514, + "grad_norm": 2.984375, + "learning_rate": 4.984941285153204e-05, + "loss": 0.9321, + "step": 2136 + }, + { + "epoch": 0.03789822606876809, + "grad_norm": 2.734375, + "learning_rate": 4.984910662162359e-05, + "loss": 0.8666, + "step": 2138 + }, + { + "epoch": 0.037933678104379656, + "grad_norm": 2.953125, + "learning_rate": 4.9848800081603455e-05, + "loss": 0.8904, + "step": 2140 + }, + { + "epoch": 0.037969130139991224, + "grad_norm": 2.84375, + "learning_rate": 4.9848493231475466e-05, + "loss": 0.9229, + "step": 2142 + }, + { + "epoch": 0.0380045821756028, + "grad_norm": 2.703125, + "learning_rate": 4.9848186071243463e-05, + "loss": 0.8472, + "step": 2144 + }, + { + "epoch": 0.038040034211214366, + "grad_norm": 3.0625, + "learning_rate": 4.984787860091128e-05, + "loss": 0.8923, + "step": 2146 + }, + { + "epoch": 0.03807548624682593, + "grad_norm": 3.1875, + "learning_rate": 4.984757082048274e-05, + "loss": 0.8802, + "step": 2148 + }, + { + "epoch": 0.03811093828243751, + "grad_norm": 2.671875, + "learning_rate": 4.9847262729961694e-05, + "loss": 0.8569, + "step": 2150 + }, + { + "epoch": 0.038146390318049075, + "grad_norm": 2.8125, + "learning_rate": 4.984695432935198e-05, + "loss": 0.9036, + "step": 2152 + }, + { + "epoch": 0.03818184235366064, + "grad_norm": 2.953125, + "learning_rate": 4.9846645618657463e-05, + "loss": 0.8576, + "step": 2154 + }, + { + "epoch": 0.03821729438927221, + "grad_norm": 2.609375, + "learning_rate": 4.984633659788197e-05, + "loss": 0.8861, + "step": 2156 + }, + { + "epoch": 0.038252746424883785, + "grad_norm": 2.65625, + "learning_rate": 4.984602726702938e-05, + "loss": 0.9025, + "step": 2158 + }, + { + "epoch": 0.03828819846049535, + "grad_norm": 2.796875, + "learning_rate": 4.984571762610354e-05, + "loss": 0.8429, + "step": 2160 + }, + { + "epoch": 0.03832365049610692, + "grad_norm": 2.9375, + "learning_rate": 4.984540767510833e-05, + "loss": 0.8757, + "step": 2162 + }, + { + "epoch": 0.038359102531718495, + "grad_norm": 2.953125, + "learning_rate": 4.984509741404759e-05, + "loss": 0.8641, + "step": 2164 + }, + { + "epoch": 0.03839455456733006, + "grad_norm": 2.5625, + "learning_rate": 4.984478684292522e-05, + "loss": 0.8471, + "step": 2166 + }, + { + "epoch": 0.03843000660294163, + "grad_norm": 2.90625, + "learning_rate": 4.984447596174507e-05, + "loss": 0.8905, + "step": 2168 + }, + { + "epoch": 0.038465458638553204, + "grad_norm": 3.015625, + "learning_rate": 4.984416477051106e-05, + "loss": 0.8571, + "step": 2170 + }, + { + "epoch": 0.03850091067416477, + "grad_norm": 2.984375, + "learning_rate": 4.984385326922703e-05, + "loss": 0.8629, + "step": 2172 + }, + { + "epoch": 0.03853636270977634, + "grad_norm": 2.921875, + "learning_rate": 4.9843541457896894e-05, + "loss": 0.8954, + "step": 2174 + }, + { + "epoch": 0.038571814745387914, + "grad_norm": 2.75, + "learning_rate": 4.9843229336524526e-05, + "loss": 0.8768, + "step": 2176 + }, + { + "epoch": 0.03860726678099948, + "grad_norm": 2.71875, + "learning_rate": 4.984291690511384e-05, + "loss": 0.8464, + "step": 2178 + }, + { + "epoch": 0.03864271881661105, + "grad_norm": 2.828125, + "learning_rate": 4.984260416366872e-05, + "loss": 0.9009, + "step": 2180 + }, + { + "epoch": 0.038678170852222624, + "grad_norm": 2.84375, + "learning_rate": 4.9842291112193076e-05, + "loss": 0.8651, + "step": 2182 + }, + { + "epoch": 0.03871362288783419, + "grad_norm": 2.90625, + "learning_rate": 4.9841977750690815e-05, + "loss": 0.8651, + "step": 2184 + }, + { + "epoch": 0.03874907492344576, + "grad_norm": 2.796875, + "learning_rate": 4.984166407916584e-05, + "loss": 0.8658, + "step": 2186 + }, + { + "epoch": 0.03878452695905733, + "grad_norm": 2.859375, + "learning_rate": 4.984135009762208e-05, + "loss": 0.9066, + "step": 2188 + }, + { + "epoch": 0.0388199789946689, + "grad_norm": 3.09375, + "learning_rate": 4.984103580606344e-05, + "loss": 0.9346, + "step": 2190 + }, + { + "epoch": 0.03885543103028047, + "grad_norm": 2.984375, + "learning_rate": 4.984072120449385e-05, + "loss": 0.8453, + "step": 2192 + }, + { + "epoch": 0.03889088306589204, + "grad_norm": 3.0625, + "learning_rate": 4.984040629291723e-05, + "loss": 0.8937, + "step": 2194 + }, + { + "epoch": 0.03892633510150361, + "grad_norm": 2.875, + "learning_rate": 4.9840091071337514e-05, + "loss": 0.844, + "step": 2196 + }, + { + "epoch": 0.03896178713711518, + "grad_norm": 3.015625, + "learning_rate": 4.9839775539758635e-05, + "loss": 0.843, + "step": 2198 + }, + { + "epoch": 0.038997239172726746, + "grad_norm": 3.015625, + "learning_rate": 4.983945969818453e-05, + "loss": 0.8641, + "step": 2200 + }, + { + "epoch": 0.03903269120833832, + "grad_norm": 2.9375, + "learning_rate": 4.9839143546619146e-05, + "loss": 0.9192, + "step": 2202 + }, + { + "epoch": 0.03906814324394989, + "grad_norm": 3.3125, + "learning_rate": 4.983882708506642e-05, + "loss": 0.8461, + "step": 2204 + }, + { + "epoch": 0.039103595279561455, + "grad_norm": 2.625, + "learning_rate": 4.9838510313530304e-05, + "loss": 0.8884, + "step": 2206 + }, + { + "epoch": 0.03913904731517303, + "grad_norm": 2.875, + "learning_rate": 4.983819323201476e-05, + "loss": 0.8661, + "step": 2208 + }, + { + "epoch": 0.0391744993507846, + "grad_norm": 2.828125, + "learning_rate": 4.9837875840523734e-05, + "loss": 0.9119, + "step": 2210 + }, + { + "epoch": 0.039209951386396165, + "grad_norm": 2.96875, + "learning_rate": 4.983755813906119e-05, + "loss": 0.929, + "step": 2212 + }, + { + "epoch": 0.03924540342200774, + "grad_norm": 2.78125, + "learning_rate": 4.9837240127631094e-05, + "loss": 0.8929, + "step": 2214 + }, + { + "epoch": 0.03928085545761931, + "grad_norm": 2.921875, + "learning_rate": 4.9836921806237416e-05, + "loss": 0.8669, + "step": 2216 + }, + { + "epoch": 0.039316307493230875, + "grad_norm": 2.765625, + "learning_rate": 4.983660317488412e-05, + "loss": 0.8984, + "step": 2218 + }, + { + "epoch": 0.03935175952884245, + "grad_norm": 3.0625, + "learning_rate": 4.98362842335752e-05, + "loss": 0.9475, + "step": 2220 + }, + { + "epoch": 0.03938721156445402, + "grad_norm": 2.875, + "learning_rate": 4.983596498231462e-05, + "loss": 0.8664, + "step": 2222 + }, + { + "epoch": 0.039422663600065584, + "grad_norm": 2.625, + "learning_rate": 4.983564542110637e-05, + "loss": 0.89, + "step": 2224 + }, + { + "epoch": 0.03945811563567716, + "grad_norm": 2.921875, + "learning_rate": 4.9835325549954446e-05, + "loss": 0.864, + "step": 2226 + }, + { + "epoch": 0.039493567671288726, + "grad_norm": 2.953125, + "learning_rate": 4.983500536886282e-05, + "loss": 0.8795, + "step": 2228 + }, + { + "epoch": 0.039529019706900294, + "grad_norm": 3.125, + "learning_rate": 4.9834684877835506e-05, + "loss": 0.894, + "step": 2230 + }, + { + "epoch": 0.03956447174251187, + "grad_norm": 3.265625, + "learning_rate": 4.9834364076876493e-05, + "loss": 0.8737, + "step": 2232 + }, + { + "epoch": 0.039599923778123436, + "grad_norm": 2.734375, + "learning_rate": 4.983404296598979e-05, + "loss": 0.866, + "step": 2234 + }, + { + "epoch": 0.039635375813735, + "grad_norm": 3.46875, + "learning_rate": 4.9833721545179415e-05, + "loss": 0.8949, + "step": 2236 + }, + { + "epoch": 0.03967082784934658, + "grad_norm": 2.78125, + "learning_rate": 4.983339981444936e-05, + "loss": 0.8475, + "step": 2238 + }, + { + "epoch": 0.039706279884958146, + "grad_norm": 3.0, + "learning_rate": 4.9833077773803635e-05, + "loss": 0.868, + "step": 2240 + }, + { + "epoch": 0.03974173192056971, + "grad_norm": 2.984375, + "learning_rate": 4.983275542324629e-05, + "loss": 0.8695, + "step": 2242 + }, + { + "epoch": 0.03977718395618128, + "grad_norm": 2.671875, + "learning_rate": 4.983243276278132e-05, + "loss": 0.9194, + "step": 2244 + }, + { + "epoch": 0.039812635991792855, + "grad_norm": 2.40625, + "learning_rate": 4.9832109792412764e-05, + "loss": 0.8362, + "step": 2246 + }, + { + "epoch": 0.03984808802740442, + "grad_norm": 3.171875, + "learning_rate": 4.983178651214465e-05, + "loss": 0.9008, + "step": 2248 + }, + { + "epoch": 0.03988354006301599, + "grad_norm": 3.25, + "learning_rate": 4.983146292198101e-05, + "loss": 0.8607, + "step": 2250 + }, + { + "epoch": 0.039918992098627565, + "grad_norm": 3.5625, + "learning_rate": 4.9831139021925886e-05, + "loss": 0.9423, + "step": 2252 + }, + { + "epoch": 0.03995444413423913, + "grad_norm": 2.984375, + "learning_rate": 4.9830814811983316e-05, + "loss": 0.9248, + "step": 2254 + }, + { + "epoch": 0.0399898961698507, + "grad_norm": 3.0, + "learning_rate": 4.983049029215735e-05, + "loss": 0.9025, + "step": 2256 + }, + { + "epoch": 0.040025348205462274, + "grad_norm": 2.84375, + "learning_rate": 4.983016546245205e-05, + "loss": 0.8686, + "step": 2258 + }, + { + "epoch": 0.04006080024107384, + "grad_norm": 3.015625, + "learning_rate": 4.982984032287145e-05, + "loss": 0.8586, + "step": 2260 + }, + { + "epoch": 0.04009625227668541, + "grad_norm": 2.875, + "learning_rate": 4.98295148734196e-05, + "loss": 0.8336, + "step": 2262 + }, + { + "epoch": 0.040131704312296984, + "grad_norm": 2.984375, + "learning_rate": 4.982918911410059e-05, + "loss": 0.8966, + "step": 2264 + }, + { + "epoch": 0.04016715634790855, + "grad_norm": 2.90625, + "learning_rate": 4.982886304491847e-05, + "loss": 0.8924, + "step": 2266 + }, + { + "epoch": 0.04020260838352012, + "grad_norm": 2.953125, + "learning_rate": 4.9828536665877304e-05, + "loss": 0.8736, + "step": 2268 + }, + { + "epoch": 0.040238060419131694, + "grad_norm": 3.015625, + "learning_rate": 4.9828209976981174e-05, + "loss": 0.864, + "step": 2270 + }, + { + "epoch": 0.04027351245474326, + "grad_norm": 3.09375, + "learning_rate": 4.9827882978234165e-05, + "loss": 0.8624, + "step": 2272 + }, + { + "epoch": 0.04030896449035483, + "grad_norm": 2.984375, + "learning_rate": 4.9827555669640335e-05, + "loss": 0.8635, + "step": 2274 + }, + { + "epoch": 0.0403444165259664, + "grad_norm": 2.890625, + "learning_rate": 4.982722805120379e-05, + "loss": 0.8958, + "step": 2276 + }, + { + "epoch": 0.04037986856157797, + "grad_norm": 2.765625, + "learning_rate": 4.982690012292861e-05, + "loss": 0.9135, + "step": 2278 + }, + { + "epoch": 0.04041532059718954, + "grad_norm": 2.875, + "learning_rate": 4.9826571884818886e-05, + "loss": 0.8516, + "step": 2280 + }, + { + "epoch": 0.04045077263280111, + "grad_norm": 2.96875, + "learning_rate": 4.982624333687871e-05, + "loss": 0.8412, + "step": 2282 + }, + { + "epoch": 0.04048622466841268, + "grad_norm": 3.203125, + "learning_rate": 4.982591447911219e-05, + "loss": 0.9317, + "step": 2284 + }, + { + "epoch": 0.04052167670402425, + "grad_norm": 2.8125, + "learning_rate": 4.982558531152343e-05, + "loss": 0.8704, + "step": 2286 + }, + { + "epoch": 0.040557128739635816, + "grad_norm": 2.71875, + "learning_rate": 4.982525583411654e-05, + "loss": 0.8408, + "step": 2288 + }, + { + "epoch": 0.04059258077524739, + "grad_norm": 3.09375, + "learning_rate": 4.982492604689562e-05, + "loss": 0.8701, + "step": 2290 + }, + { + "epoch": 0.04062803281085896, + "grad_norm": 2.796875, + "learning_rate": 4.98245959498648e-05, + "loss": 0.8691, + "step": 2292 + }, + { + "epoch": 0.040663484846470525, + "grad_norm": 2.796875, + "learning_rate": 4.9824265543028195e-05, + "loss": 0.8744, + "step": 2294 + }, + { + "epoch": 0.0406989368820821, + "grad_norm": 3.015625, + "learning_rate": 4.9823934826389916e-05, + "loss": 0.863, + "step": 2296 + }, + { + "epoch": 0.04073438891769367, + "grad_norm": 3.046875, + "learning_rate": 4.982360379995411e-05, + "loss": 0.8763, + "step": 2298 + }, + { + "epoch": 0.040769840953305235, + "grad_norm": 2.734375, + "learning_rate": 4.98232724637249e-05, + "loss": 0.8696, + "step": 2300 + }, + { + "epoch": 0.04080529298891681, + "grad_norm": 2.875, + "learning_rate": 4.982294081770641e-05, + "loss": 0.9117, + "step": 2302 + }, + { + "epoch": 0.04084074502452838, + "grad_norm": 2.75, + "learning_rate": 4.9822608861902795e-05, + "loss": 0.9322, + "step": 2304 + }, + { + "epoch": 0.040876197060139945, + "grad_norm": 2.6875, + "learning_rate": 4.9822276596318195e-05, + "loss": 0.8368, + "step": 2306 + }, + { + "epoch": 0.04091164909575152, + "grad_norm": 3.0, + "learning_rate": 4.982194402095675e-05, + "loss": 0.8842, + "step": 2308 + }, + { + "epoch": 0.04094710113136309, + "grad_norm": 2.90625, + "learning_rate": 4.9821611135822615e-05, + "loss": 0.8414, + "step": 2310 + }, + { + "epoch": 0.040982553166974654, + "grad_norm": 2.59375, + "learning_rate": 4.982127794091994e-05, + "loss": 0.8953, + "step": 2312 + }, + { + "epoch": 0.04101800520258623, + "grad_norm": 2.921875, + "learning_rate": 4.982094443625289e-05, + "loss": 0.9333, + "step": 2314 + }, + { + "epoch": 0.041053457238197796, + "grad_norm": 2.8125, + "learning_rate": 4.982061062182562e-05, + "loss": 0.876, + "step": 2316 + }, + { + "epoch": 0.041088909273809364, + "grad_norm": 2.890625, + "learning_rate": 4.98202764976423e-05, + "loss": 0.8729, + "step": 2318 + }, + { + "epoch": 0.04112436130942094, + "grad_norm": 2.796875, + "learning_rate": 4.9819942063707105e-05, + "loss": 0.9321, + "step": 2320 + }, + { + "epoch": 0.041159813345032506, + "grad_norm": 2.609375, + "learning_rate": 4.981960732002419e-05, + "loss": 0.867, + "step": 2322 + }, + { + "epoch": 0.041195265380644074, + "grad_norm": 3.21875, + "learning_rate": 4.9819272266597755e-05, + "loss": 0.8381, + "step": 2324 + }, + { + "epoch": 0.04123071741625565, + "grad_norm": 2.984375, + "learning_rate": 4.981893690343197e-05, + "loss": 0.847, + "step": 2326 + }, + { + "epoch": 0.041266169451867216, + "grad_norm": 2.84375, + "learning_rate": 4.981860123053102e-05, + "loss": 0.8989, + "step": 2328 + }, + { + "epoch": 0.04130162148747878, + "grad_norm": 2.59375, + "learning_rate": 4.98182652478991e-05, + "loss": 0.8794, + "step": 2330 + }, + { + "epoch": 0.04133707352309035, + "grad_norm": 2.625, + "learning_rate": 4.9817928955540395e-05, + "loss": 0.8438, + "step": 2332 + }, + { + "epoch": 0.041372525558701925, + "grad_norm": 2.90625, + "learning_rate": 4.9817592353459104e-05, + "loss": 0.8878, + "step": 2334 + }, + { + "epoch": 0.04140797759431349, + "grad_norm": 2.703125, + "learning_rate": 4.981725544165943e-05, + "loss": 0.8578, + "step": 2336 + }, + { + "epoch": 0.04144342962992506, + "grad_norm": 3.046875, + "learning_rate": 4.981691822014558e-05, + "loss": 0.8464, + "step": 2338 + }, + { + "epoch": 0.041478881665536635, + "grad_norm": 2.734375, + "learning_rate": 4.981658068892176e-05, + "loss": 0.8744, + "step": 2340 + }, + { + "epoch": 0.0415143337011482, + "grad_norm": 2.921875, + "learning_rate": 4.9816242847992176e-05, + "loss": 0.8701, + "step": 2342 + }, + { + "epoch": 0.04154978573675977, + "grad_norm": 2.75, + "learning_rate": 4.981590469736106e-05, + "loss": 0.9046, + "step": 2344 + }, + { + "epoch": 0.041585237772371345, + "grad_norm": 3.1875, + "learning_rate": 4.981556623703262e-05, + "loss": 0.841, + "step": 2346 + }, + { + "epoch": 0.04162068980798291, + "grad_norm": 3.046875, + "learning_rate": 4.981522746701107e-05, + "loss": 0.9096, + "step": 2348 + }, + { + "epoch": 0.04165614184359448, + "grad_norm": 2.703125, + "learning_rate": 4.981488838730066e-05, + "loss": 0.8283, + "step": 2350 + }, + { + "epoch": 0.041691593879206054, + "grad_norm": 2.765625, + "learning_rate": 4.9814548997905616e-05, + "loss": 0.8977, + "step": 2352 + }, + { + "epoch": 0.04172704591481762, + "grad_norm": 3.28125, + "learning_rate": 4.981420929883016e-05, + "loss": 0.8914, + "step": 2354 + }, + { + "epoch": 0.04176249795042919, + "grad_norm": 2.96875, + "learning_rate": 4.981386929007854e-05, + "loss": 0.8514, + "step": 2356 + }, + { + "epoch": 0.041797949986040764, + "grad_norm": 2.828125, + "learning_rate": 4.981352897165501e-05, + "loss": 0.8564, + "step": 2358 + }, + { + "epoch": 0.04183340202165233, + "grad_norm": 3.015625, + "learning_rate": 4.98131883435638e-05, + "loss": 0.9608, + "step": 2360 + }, + { + "epoch": 0.0418688540572639, + "grad_norm": 2.984375, + "learning_rate": 4.981284740580916e-05, + "loss": 0.9024, + "step": 2362 + }, + { + "epoch": 0.041904306092875473, + "grad_norm": 2.921875, + "learning_rate": 4.9812506158395365e-05, + "loss": 0.906, + "step": 2364 + }, + { + "epoch": 0.04193975812848704, + "grad_norm": 2.8125, + "learning_rate": 4.981216460132666e-05, + "loss": 0.8399, + "step": 2366 + }, + { + "epoch": 0.04197521016409861, + "grad_norm": 2.9375, + "learning_rate": 4.9811822734607304e-05, + "loss": 0.8214, + "step": 2368 + }, + { + "epoch": 0.042010662199710176, + "grad_norm": 3.0625, + "learning_rate": 4.981148055824157e-05, + "loss": 0.8488, + "step": 2370 + }, + { + "epoch": 0.04204611423532175, + "grad_norm": 3.09375, + "learning_rate": 4.981113807223372e-05, + "loss": 0.9123, + "step": 2372 + }, + { + "epoch": 0.04208156627093332, + "grad_norm": 2.96875, + "learning_rate": 4.981079527658804e-05, + "loss": 0.8889, + "step": 2374 + }, + { + "epoch": 0.042117018306544886, + "grad_norm": 2.5, + "learning_rate": 4.981045217130881e-05, + "loss": 0.871, + "step": 2376 + }, + { + "epoch": 0.04215247034215646, + "grad_norm": 2.9375, + "learning_rate": 4.9810108756400294e-05, + "loss": 0.8747, + "step": 2378 + }, + { + "epoch": 0.04218792237776803, + "grad_norm": 3.015625, + "learning_rate": 4.980976503186679e-05, + "loss": 0.831, + "step": 2380 + }, + { + "epoch": 0.042223374413379595, + "grad_norm": 2.53125, + "learning_rate": 4.980942099771259e-05, + "loss": 0.8865, + "step": 2382 + }, + { + "epoch": 0.04225882644899117, + "grad_norm": 3.125, + "learning_rate": 4.980907665394198e-05, + "loss": 0.8713, + "step": 2384 + }, + { + "epoch": 0.04229427848460274, + "grad_norm": 3.1875, + "learning_rate": 4.980873200055927e-05, + "loss": 0.8802, + "step": 2386 + }, + { + "epoch": 0.042329730520214305, + "grad_norm": 3.265625, + "learning_rate": 4.980838703756874e-05, + "loss": 0.8807, + "step": 2388 + }, + { + "epoch": 0.04236518255582588, + "grad_norm": 3.015625, + "learning_rate": 4.980804176497471e-05, + "loss": 0.8972, + "step": 2390 + }, + { + "epoch": 0.04240063459143745, + "grad_norm": 2.84375, + "learning_rate": 4.980769618278149e-05, + "loss": 0.8419, + "step": 2392 + }, + { + "epoch": 0.042436086627049015, + "grad_norm": 2.75, + "learning_rate": 4.980735029099338e-05, + "loss": 0.8692, + "step": 2394 + }, + { + "epoch": 0.04247153866266059, + "grad_norm": 2.609375, + "learning_rate": 4.9807004089614714e-05, + "loss": 0.8991, + "step": 2396 + }, + { + "epoch": 0.04250699069827216, + "grad_norm": 2.890625, + "learning_rate": 4.98066575786498e-05, + "loss": 0.9319, + "step": 2398 + }, + { + "epoch": 0.042542442733883724, + "grad_norm": 2.734375, + "learning_rate": 4.980631075810297e-05, + "loss": 0.8623, + "step": 2400 + }, + { + "epoch": 0.0425778947694953, + "grad_norm": 2.8125, + "learning_rate": 4.980596362797855e-05, + "loss": 0.8915, + "step": 2402 + }, + { + "epoch": 0.042613346805106866, + "grad_norm": 2.703125, + "learning_rate": 4.9805616188280855e-05, + "loss": 0.843, + "step": 2404 + }, + { + "epoch": 0.042648798840718434, + "grad_norm": 3.1875, + "learning_rate": 4.980526843901425e-05, + "loss": 0.9275, + "step": 2406 + }, + { + "epoch": 0.04268425087633001, + "grad_norm": 3.28125, + "learning_rate": 4.9804920380183064e-05, + "loss": 0.8761, + "step": 2408 + }, + { + "epoch": 0.042719702911941576, + "grad_norm": 2.859375, + "learning_rate": 4.980457201179163e-05, + "loss": 0.8647, + "step": 2410 + }, + { + "epoch": 0.042755154947553144, + "grad_norm": 3.171875, + "learning_rate": 4.980422333384431e-05, + "loss": 0.8317, + "step": 2412 + }, + { + "epoch": 0.04279060698316471, + "grad_norm": 2.625, + "learning_rate": 4.9803874346345445e-05, + "loss": 0.8614, + "step": 2414 + }, + { + "epoch": 0.042826059018776286, + "grad_norm": 3.0625, + "learning_rate": 4.980352504929939e-05, + "loss": 0.8696, + "step": 2416 + }, + { + "epoch": 0.04286151105438785, + "grad_norm": 3.203125, + "learning_rate": 4.980317544271052e-05, + "loss": 0.8983, + "step": 2418 + }, + { + "epoch": 0.04289696308999942, + "grad_norm": 2.96875, + "learning_rate": 4.980282552658318e-05, + "loss": 0.8849, + "step": 2420 + }, + { + "epoch": 0.042932415125610995, + "grad_norm": 2.6875, + "learning_rate": 4.9802475300921744e-05, + "loss": 0.8875, + "step": 2422 + }, + { + "epoch": 0.04296786716122256, + "grad_norm": 2.84375, + "learning_rate": 4.9802124765730586e-05, + "loss": 0.8473, + "step": 2424 + }, + { + "epoch": 0.04300331919683413, + "grad_norm": 2.953125, + "learning_rate": 4.980177392101407e-05, + "loss": 0.8613, + "step": 2426 + }, + { + "epoch": 0.043038771232445705, + "grad_norm": 3.03125, + "learning_rate": 4.980142276677658e-05, + "loss": 0.8858, + "step": 2428 + }, + { + "epoch": 0.04307422326805727, + "grad_norm": 2.796875, + "learning_rate": 4.9801071303022514e-05, + "loss": 0.8649, + "step": 2430 + }, + { + "epoch": 0.04310967530366884, + "grad_norm": 2.84375, + "learning_rate": 4.9800719529756236e-05, + "loss": 0.8845, + "step": 2432 + }, + { + "epoch": 0.043145127339280415, + "grad_norm": 2.734375, + "learning_rate": 4.980036744698214e-05, + "loss": 0.8627, + "step": 2434 + }, + { + "epoch": 0.04318057937489198, + "grad_norm": 3.171875, + "learning_rate": 4.9800015054704625e-05, + "loss": 0.878, + "step": 2436 + }, + { + "epoch": 0.04321603141050355, + "grad_norm": 3.3125, + "learning_rate": 4.979966235292809e-05, + "loss": 0.8916, + "step": 2438 + }, + { + "epoch": 0.043251483446115124, + "grad_norm": 2.84375, + "learning_rate": 4.979930934165693e-05, + "loss": 0.8307, + "step": 2440 + }, + { + "epoch": 0.04328693548172669, + "grad_norm": 2.75, + "learning_rate": 4.979895602089556e-05, + "loss": 0.8751, + "step": 2442 + }, + { + "epoch": 0.04332238751733826, + "grad_norm": 2.9375, + "learning_rate": 4.979860239064838e-05, + "loss": 0.8464, + "step": 2444 + }, + { + "epoch": 0.043357839552949834, + "grad_norm": 3.375, + "learning_rate": 4.9798248450919804e-05, + "loss": 0.8977, + "step": 2446 + }, + { + "epoch": 0.0433932915885614, + "grad_norm": 2.828125, + "learning_rate": 4.9797894201714266e-05, + "loss": 0.8583, + "step": 2448 + }, + { + "epoch": 0.04342874362417297, + "grad_norm": 2.84375, + "learning_rate": 4.979753964303616e-05, + "loss": 0.9041, + "step": 2450 + }, + { + "epoch": 0.043464195659784544, + "grad_norm": 3.390625, + "learning_rate": 4.9797184774889926e-05, + "loss": 0.8823, + "step": 2452 + }, + { + "epoch": 0.04349964769539611, + "grad_norm": 2.765625, + "learning_rate": 4.979682959728e-05, + "loss": 0.8753, + "step": 2454 + }, + { + "epoch": 0.04353509973100768, + "grad_norm": 2.84375, + "learning_rate": 4.97964741102108e-05, + "loss": 0.9116, + "step": 2456 + }, + { + "epoch": 0.043570551766619246, + "grad_norm": 2.765625, + "learning_rate": 4.9796118313686765e-05, + "loss": 0.8646, + "step": 2458 + }, + { + "epoch": 0.04360600380223082, + "grad_norm": 2.859375, + "learning_rate": 4.979576220771234e-05, + "loss": 0.852, + "step": 2460 + }, + { + "epoch": 0.04364145583784239, + "grad_norm": 3.015625, + "learning_rate": 4.979540579229197e-05, + "loss": 0.8815, + "step": 2462 + }, + { + "epoch": 0.043676907873453956, + "grad_norm": 3.078125, + "learning_rate": 4.979504906743009e-05, + "loss": 0.8904, + "step": 2464 + }, + { + "epoch": 0.04371235990906553, + "grad_norm": 2.578125, + "learning_rate": 4.9794692033131176e-05, + "loss": 0.8501, + "step": 2466 + }, + { + "epoch": 0.0437478119446771, + "grad_norm": 2.890625, + "learning_rate": 4.979433468939966e-05, + "loss": 0.8315, + "step": 2468 + }, + { + "epoch": 0.043783263980288666, + "grad_norm": 2.875, + "learning_rate": 4.9793977036240015e-05, + "loss": 0.8595, + "step": 2470 + }, + { + "epoch": 0.04381871601590024, + "grad_norm": 2.84375, + "learning_rate": 4.979361907365671e-05, + "loss": 0.8838, + "step": 2472 + }, + { + "epoch": 0.04385416805151181, + "grad_norm": 2.765625, + "learning_rate": 4.979326080165419e-05, + "loss": 0.8831, + "step": 2474 + }, + { + "epoch": 0.043889620087123375, + "grad_norm": 2.5, + "learning_rate": 4.9792902220236936e-05, + "loss": 0.8185, + "step": 2476 + }, + { + "epoch": 0.04392507212273495, + "grad_norm": 2.921875, + "learning_rate": 4.979254332940944e-05, + "loss": 0.8773, + "step": 2478 + }, + { + "epoch": 0.04396052415834652, + "grad_norm": 3.0625, + "learning_rate": 4.9792184129176153e-05, + "loss": 0.8821, + "step": 2480 + }, + { + "epoch": 0.043995976193958085, + "grad_norm": 2.859375, + "learning_rate": 4.979182461954158e-05, + "loss": 0.8751, + "step": 2482 + }, + { + "epoch": 0.04403142822956966, + "grad_norm": 3.25, + "learning_rate": 4.97914648005102e-05, + "loss": 0.9332, + "step": 2484 + }, + { + "epoch": 0.04406688026518123, + "grad_norm": 2.890625, + "learning_rate": 4.979110467208651e-05, + "loss": 0.8437, + "step": 2486 + }, + { + "epoch": 0.044102332300792794, + "grad_norm": 3.1875, + "learning_rate": 4.9790744234274986e-05, + "loss": 0.8874, + "step": 2488 + }, + { + "epoch": 0.04413778433640437, + "grad_norm": 3.015625, + "learning_rate": 4.979038348708014e-05, + "loss": 0.8942, + "step": 2490 + }, + { + "epoch": 0.04417323637201594, + "grad_norm": 3.25, + "learning_rate": 4.9790022430506463e-05, + "loss": 0.8691, + "step": 2492 + }, + { + "epoch": 0.044208688407627504, + "grad_norm": 2.84375, + "learning_rate": 4.978966106455848e-05, + "loss": 0.9051, + "step": 2494 + }, + { + "epoch": 0.04424414044323908, + "grad_norm": 2.984375, + "learning_rate": 4.9789299389240694e-05, + "loss": 0.8664, + "step": 2496 + }, + { + "epoch": 0.044279592478850646, + "grad_norm": 2.984375, + "learning_rate": 4.9788937404557615e-05, + "loss": 0.8914, + "step": 2498 + }, + { + "epoch": 0.044315044514462214, + "grad_norm": 2.921875, + "learning_rate": 4.9788575110513755e-05, + "loss": 0.8917, + "step": 2500 + }, + { + "epoch": 0.04435049655007378, + "grad_norm": 2.890625, + "learning_rate": 4.978821250711364e-05, + "loss": 0.8945, + "step": 2502 + }, + { + "epoch": 0.044385948585685356, + "grad_norm": 2.671875, + "learning_rate": 4.978784959436179e-05, + "loss": 0.8685, + "step": 2504 + }, + { + "epoch": 0.04442140062129692, + "grad_norm": 2.859375, + "learning_rate": 4.9787486372262746e-05, + "loss": 0.8572, + "step": 2506 + }, + { + "epoch": 0.04445685265690849, + "grad_norm": 2.703125, + "learning_rate": 4.978712284082104e-05, + "loss": 0.8378, + "step": 2508 + }, + { + "epoch": 0.044492304692520065, + "grad_norm": 3.015625, + "learning_rate": 4.9786759000041197e-05, + "loss": 0.8705, + "step": 2510 + }, + { + "epoch": 0.04452775672813163, + "grad_norm": 3.234375, + "learning_rate": 4.978639484992777e-05, + "loss": 0.874, + "step": 2512 + }, + { + "epoch": 0.0445632087637432, + "grad_norm": 3.078125, + "learning_rate": 4.9786030390485295e-05, + "loss": 0.9042, + "step": 2514 + }, + { + "epoch": 0.044598660799354775, + "grad_norm": 3.171875, + "learning_rate": 4.9785665621718325e-05, + "loss": 0.8862, + "step": 2516 + }, + { + "epoch": 0.04463411283496634, + "grad_norm": 2.578125, + "learning_rate": 4.978530054363141e-05, + "loss": 0.8207, + "step": 2518 + }, + { + "epoch": 0.04466956487057791, + "grad_norm": 2.828125, + "learning_rate": 4.97849351562291e-05, + "loss": 0.8353, + "step": 2520 + }, + { + "epoch": 0.044705016906189485, + "grad_norm": 2.875, + "learning_rate": 4.978456945951597e-05, + "loss": 0.8698, + "step": 2522 + }, + { + "epoch": 0.04474046894180105, + "grad_norm": 2.6875, + "learning_rate": 4.978420345349657e-05, + "loss": 0.9051, + "step": 2524 + }, + { + "epoch": 0.04477592097741262, + "grad_norm": 3.140625, + "learning_rate": 4.978383713817548e-05, + "loss": 0.8803, + "step": 2526 + }, + { + "epoch": 0.044811373013024194, + "grad_norm": 2.71875, + "learning_rate": 4.9783470513557255e-05, + "loss": 0.8891, + "step": 2528 + }, + { + "epoch": 0.04484682504863576, + "grad_norm": 3.03125, + "learning_rate": 4.978310357964648e-05, + "loss": 0.8661, + "step": 2530 + }, + { + "epoch": 0.04488227708424733, + "grad_norm": 3.25, + "learning_rate": 4.978273633644775e-05, + "loss": 0.8592, + "step": 2532 + }, + { + "epoch": 0.044917729119858904, + "grad_norm": 3.125, + "learning_rate": 4.978236878396562e-05, + "loss": 0.8831, + "step": 2534 + }, + { + "epoch": 0.04495318115547047, + "grad_norm": 2.8125, + "learning_rate": 4.978200092220469e-05, + "loss": 0.9059, + "step": 2536 + }, + { + "epoch": 0.04498863319108204, + "grad_norm": 3.1875, + "learning_rate": 4.978163275116955e-05, + "loss": 0.9341, + "step": 2538 + }, + { + "epoch": 0.045024085226693614, + "grad_norm": 2.625, + "learning_rate": 4.978126427086479e-05, + "loss": 0.8293, + "step": 2540 + }, + { + "epoch": 0.04505953726230518, + "grad_norm": 3.09375, + "learning_rate": 4.978089548129502e-05, + "loss": 0.8728, + "step": 2542 + }, + { + "epoch": 0.04509498929791675, + "grad_norm": 2.515625, + "learning_rate": 4.9780526382464846e-05, + "loss": 0.8605, + "step": 2544 + }, + { + "epoch": 0.045130441333528316, + "grad_norm": 2.859375, + "learning_rate": 4.978015697437884e-05, + "loss": 0.9002, + "step": 2546 + }, + { + "epoch": 0.04516589336913989, + "grad_norm": 2.8125, + "learning_rate": 4.977978725704165e-05, + "loss": 0.9107, + "step": 2548 + }, + { + "epoch": 0.04520134540475146, + "grad_norm": 2.875, + "learning_rate": 4.977941723045788e-05, + "loss": 0.8659, + "step": 2550 + }, + { + "epoch": 0.045236797440363026, + "grad_norm": 2.546875, + "learning_rate": 4.9779046894632145e-05, + "loss": 0.8158, + "step": 2552 + }, + { + "epoch": 0.0452722494759746, + "grad_norm": 2.953125, + "learning_rate": 4.9778676249569055e-05, + "loss": 0.8675, + "step": 2554 + }, + { + "epoch": 0.04530770151158617, + "grad_norm": 3.15625, + "learning_rate": 4.977830529527325e-05, + "loss": 0.8723, + "step": 2556 + }, + { + "epoch": 0.045343153547197736, + "grad_norm": 3.0, + "learning_rate": 4.977793403174936e-05, + "loss": 0.8759, + "step": 2558 + }, + { + "epoch": 0.04537860558280931, + "grad_norm": 2.921875, + "learning_rate": 4.9777562459002016e-05, + "loss": 0.8769, + "step": 2560 + }, + { + "epoch": 0.04541405761842088, + "grad_norm": 2.65625, + "learning_rate": 4.977719057703585e-05, + "loss": 0.8584, + "step": 2562 + }, + { + "epoch": 0.045449509654032445, + "grad_norm": 2.84375, + "learning_rate": 4.977681838585551e-05, + "loss": 0.8723, + "step": 2564 + }, + { + "epoch": 0.04548496168964402, + "grad_norm": 2.84375, + "learning_rate": 4.977644588546563e-05, + "loss": 0.8643, + "step": 2566 + }, + { + "epoch": 0.04552041372525559, + "grad_norm": 3.125, + "learning_rate": 4.977607307587087e-05, + "loss": 0.9124, + "step": 2568 + }, + { + "epoch": 0.045555865760867155, + "grad_norm": 2.921875, + "learning_rate": 4.977569995707588e-05, + "loss": 0.8738, + "step": 2570 + }, + { + "epoch": 0.04559131779647873, + "grad_norm": 2.78125, + "learning_rate": 4.977532652908531e-05, + "loss": 0.8377, + "step": 2572 + }, + { + "epoch": 0.0456267698320903, + "grad_norm": 3.015625, + "learning_rate": 4.977495279190383e-05, + "loss": 0.8863, + "step": 2574 + }, + { + "epoch": 0.045662221867701865, + "grad_norm": 2.8125, + "learning_rate": 4.9774578745536095e-05, + "loss": 0.8753, + "step": 2576 + }, + { + "epoch": 0.04569767390331344, + "grad_norm": 2.6875, + "learning_rate": 4.9774204389986776e-05, + "loss": 0.8668, + "step": 2578 + }, + { + "epoch": 0.04573312593892501, + "grad_norm": 2.46875, + "learning_rate": 4.977382972526056e-05, + "loss": 0.8861, + "step": 2580 + }, + { + "epoch": 0.045768577974536574, + "grad_norm": 2.796875, + "learning_rate": 4.9773454751362095e-05, + "loss": 0.8983, + "step": 2582 + }, + { + "epoch": 0.04580403001014814, + "grad_norm": 3.0, + "learning_rate": 4.977307946829608e-05, + "loss": 0.8544, + "step": 2584 + }, + { + "epoch": 0.045839482045759716, + "grad_norm": 3.171875, + "learning_rate": 4.977270387606719e-05, + "loss": 0.8897, + "step": 2586 + }, + { + "epoch": 0.045874934081371284, + "grad_norm": 3.171875, + "learning_rate": 4.977232797468012e-05, + "loss": 0.8848, + "step": 2588 + }, + { + "epoch": 0.04591038611698285, + "grad_norm": 2.953125, + "learning_rate": 4.977195176413955e-05, + "loss": 0.9034, + "step": 2590 + }, + { + "epoch": 0.045945838152594426, + "grad_norm": 2.796875, + "learning_rate": 4.9771575244450186e-05, + "loss": 0.8224, + "step": 2592 + }, + { + "epoch": 0.045981290188205994, + "grad_norm": 2.984375, + "learning_rate": 4.977119841561672e-05, + "loss": 0.8883, + "step": 2594 + }, + { + "epoch": 0.04601674222381756, + "grad_norm": 2.390625, + "learning_rate": 4.9770821277643867e-05, + "loss": 0.8868, + "step": 2596 + }, + { + "epoch": 0.046052194259429136, + "grad_norm": 2.96875, + "learning_rate": 4.977044383053631e-05, + "loss": 0.8422, + "step": 2598 + }, + { + "epoch": 0.0460876462950407, + "grad_norm": 2.828125, + "learning_rate": 4.977006607429878e-05, + "loss": 0.8772, + "step": 2600 + }, + { + "epoch": 0.04612309833065227, + "grad_norm": 3.015625, + "learning_rate": 4.976968800893598e-05, + "loss": 0.8761, + "step": 2602 + }, + { + "epoch": 0.046158550366263845, + "grad_norm": 2.78125, + "learning_rate": 4.9769309634452644e-05, + "loss": 0.8523, + "step": 2604 + }, + { + "epoch": 0.04619400240187541, + "grad_norm": 2.921875, + "learning_rate": 4.976893095085347e-05, + "loss": 0.8927, + "step": 2606 + }, + { + "epoch": 0.04622945443748698, + "grad_norm": 2.859375, + "learning_rate": 4.976855195814321e-05, + "loss": 0.8732, + "step": 2608 + }, + { + "epoch": 0.046264906473098555, + "grad_norm": 2.828125, + "learning_rate": 4.976817265632657e-05, + "loss": 0.8469, + "step": 2610 + }, + { + "epoch": 0.04630035850871012, + "grad_norm": 2.703125, + "learning_rate": 4.9767793045408293e-05, + "loss": 0.8568, + "step": 2612 + }, + { + "epoch": 0.04633581054432169, + "grad_norm": 2.84375, + "learning_rate": 4.976741312539313e-05, + "loss": 0.8712, + "step": 2614 + }, + { + "epoch": 0.046371262579933265, + "grad_norm": 2.96875, + "learning_rate": 4.976703289628579e-05, + "loss": 0.8554, + "step": 2616 + }, + { + "epoch": 0.04640671461554483, + "grad_norm": 2.796875, + "learning_rate": 4.9766652358091054e-05, + "loss": 0.8811, + "step": 2618 + }, + { + "epoch": 0.0464421666511564, + "grad_norm": 2.984375, + "learning_rate": 4.976627151081365e-05, + "loss": 0.9108, + "step": 2620 + }, + { + "epoch": 0.046477618686767974, + "grad_norm": 2.671875, + "learning_rate": 4.9765890354458335e-05, + "loss": 0.8402, + "step": 2622 + }, + { + "epoch": 0.04651307072237954, + "grad_norm": 2.9375, + "learning_rate": 4.976550888902987e-05, + "loss": 0.8537, + "step": 2624 + }, + { + "epoch": 0.04654852275799111, + "grad_norm": 2.671875, + "learning_rate": 4.976512711453301e-05, + "loss": 0.8806, + "step": 2626 + }, + { + "epoch": 0.04658397479360268, + "grad_norm": 2.9375, + "learning_rate": 4.976474503097252e-05, + "loss": 0.8581, + "step": 2628 + }, + { + "epoch": 0.04661942682921425, + "grad_norm": 2.734375, + "learning_rate": 4.976436263835317e-05, + "loss": 0.8177, + "step": 2630 + }, + { + "epoch": 0.04665487886482582, + "grad_norm": 2.828125, + "learning_rate": 4.976397993667974e-05, + "loss": 0.8971, + "step": 2632 + }, + { + "epoch": 0.046690330900437386, + "grad_norm": 3.046875, + "learning_rate": 4.976359692595699e-05, + "loss": 0.8278, + "step": 2634 + }, + { + "epoch": 0.04672578293604896, + "grad_norm": 3.046875, + "learning_rate": 4.976321360618972e-05, + "loss": 0.8896, + "step": 2636 + }, + { + "epoch": 0.04676123497166053, + "grad_norm": 2.859375, + "learning_rate": 4.976282997738269e-05, + "loss": 0.8968, + "step": 2638 + }, + { + "epoch": 0.046796687007272096, + "grad_norm": 2.890625, + "learning_rate": 4.97624460395407e-05, + "loss": 0.8867, + "step": 2640 + }, + { + "epoch": 0.04683213904288367, + "grad_norm": 3.078125, + "learning_rate": 4.9762061792668546e-05, + "loss": 0.8456, + "step": 2642 + }, + { + "epoch": 0.04686759107849524, + "grad_norm": 2.703125, + "learning_rate": 4.976167723677102e-05, + "loss": 0.8924, + "step": 2644 + }, + { + "epoch": 0.046903043114106806, + "grad_norm": 3.0, + "learning_rate": 4.976129237185291e-05, + "loss": 0.8889, + "step": 2646 + }, + { + "epoch": 0.04693849514971838, + "grad_norm": 3.328125, + "learning_rate": 4.976090719791904e-05, + "loss": 0.8419, + "step": 2648 + }, + { + "epoch": 0.04697394718532995, + "grad_norm": 2.765625, + "learning_rate": 4.97605217149742e-05, + "loss": 0.8749, + "step": 2650 + }, + { + "epoch": 0.047009399220941515, + "grad_norm": 3.203125, + "learning_rate": 4.97601359230232e-05, + "loss": 0.8336, + "step": 2652 + }, + { + "epoch": 0.04704485125655309, + "grad_norm": 3.21875, + "learning_rate": 4.9759749822070864e-05, + "loss": 0.8508, + "step": 2654 + }, + { + "epoch": 0.04708030329216466, + "grad_norm": 2.8125, + "learning_rate": 4.975936341212202e-05, + "loss": 0.8548, + "step": 2656 + }, + { + "epoch": 0.047115755327776225, + "grad_norm": 2.828125, + "learning_rate": 4.9758976693181464e-05, + "loss": 0.8688, + "step": 2658 + }, + { + "epoch": 0.0471512073633878, + "grad_norm": 2.8125, + "learning_rate": 4.9758589665254044e-05, + "loss": 0.9214, + "step": 2660 + }, + { + "epoch": 0.04718665939899937, + "grad_norm": 2.734375, + "learning_rate": 4.975820232834457e-05, + "loss": 0.8447, + "step": 2662 + }, + { + "epoch": 0.047222111434610935, + "grad_norm": 2.640625, + "learning_rate": 4.97578146824579e-05, + "loss": 0.8716, + "step": 2664 + }, + { + "epoch": 0.04725756347022251, + "grad_norm": 2.65625, + "learning_rate": 4.975742672759885e-05, + "loss": 0.86, + "step": 2666 + }, + { + "epoch": 0.04729301550583408, + "grad_norm": 3.03125, + "learning_rate": 4.975703846377228e-05, + "loss": 0.8478, + "step": 2668 + }, + { + "epoch": 0.047328467541445644, + "grad_norm": 2.59375, + "learning_rate": 4.975664989098302e-05, + "loss": 0.842, + "step": 2670 + }, + { + "epoch": 0.04736391957705721, + "grad_norm": 2.546875, + "learning_rate": 4.975626100923593e-05, + "loss": 0.8574, + "step": 2672 + }, + { + "epoch": 0.047399371612668786, + "grad_norm": 2.8125, + "learning_rate": 4.975587181853586e-05, + "loss": 0.8438, + "step": 2674 + }, + { + "epoch": 0.047434823648280354, + "grad_norm": 2.53125, + "learning_rate": 4.9755482318887656e-05, + "loss": 0.8721, + "step": 2676 + }, + { + "epoch": 0.04747027568389192, + "grad_norm": 2.953125, + "learning_rate": 4.9755092510296204e-05, + "loss": 0.8788, + "step": 2678 + }, + { + "epoch": 0.047505727719503496, + "grad_norm": 2.84375, + "learning_rate": 4.975470239276634e-05, + "loss": 0.8623, + "step": 2680 + }, + { + "epoch": 0.047541179755115064, + "grad_norm": 2.75, + "learning_rate": 4.975431196630296e-05, + "loss": 0.8361, + "step": 2682 + }, + { + "epoch": 0.04757663179072663, + "grad_norm": 3.03125, + "learning_rate": 4.975392123091091e-05, + "loss": 0.8689, + "step": 2684 + }, + { + "epoch": 0.047612083826338206, + "grad_norm": 2.75, + "learning_rate": 4.975353018659508e-05, + "loss": 0.8697, + "step": 2686 + }, + { + "epoch": 0.04764753586194977, + "grad_norm": 2.59375, + "learning_rate": 4.975313883336036e-05, + "loss": 0.8277, + "step": 2688 + }, + { + "epoch": 0.04768298789756134, + "grad_norm": 3.0, + "learning_rate": 4.9752747171211614e-05, + "loss": 0.8991, + "step": 2690 + }, + { + "epoch": 0.047718439933172915, + "grad_norm": 2.640625, + "learning_rate": 4.9752355200153735e-05, + "loss": 0.8643, + "step": 2692 + }, + { + "epoch": 0.04775389196878448, + "grad_norm": 2.84375, + "learning_rate": 4.975196292019163e-05, + "loss": 0.8467, + "step": 2694 + }, + { + "epoch": 0.04778934400439605, + "grad_norm": 3.328125, + "learning_rate": 4.9751570331330176e-05, + "loss": 0.8837, + "step": 2696 + }, + { + "epoch": 0.047824796040007625, + "grad_norm": 2.671875, + "learning_rate": 4.975117743357428e-05, + "loss": 0.8534, + "step": 2698 + }, + { + "epoch": 0.04786024807561919, + "grad_norm": 2.921875, + "learning_rate": 4.975078422692884e-05, + "loss": 0.8852, + "step": 2700 + }, + { + "epoch": 0.04789570011123076, + "grad_norm": 2.921875, + "learning_rate": 4.975039071139878e-05, + "loss": 0.886, + "step": 2702 + }, + { + "epoch": 0.047931152146842335, + "grad_norm": 2.828125, + "learning_rate": 4.9749996886988994e-05, + "loss": 0.88, + "step": 2704 + }, + { + "epoch": 0.0479666041824539, + "grad_norm": 3.015625, + "learning_rate": 4.974960275370439e-05, + "loss": 0.8631, + "step": 2706 + }, + { + "epoch": 0.04800205621806547, + "grad_norm": 3.0625, + "learning_rate": 4.9749208311549916e-05, + "loss": 0.9029, + "step": 2708 + }, + { + "epoch": 0.048037508253677044, + "grad_norm": 2.703125, + "learning_rate": 4.974881356053047e-05, + "loss": 0.8651, + "step": 2710 + }, + { + "epoch": 0.04807296028928861, + "grad_norm": 2.578125, + "learning_rate": 4.974841850065098e-05, + "loss": 0.8155, + "step": 2712 + }, + { + "epoch": 0.04810841232490018, + "grad_norm": 2.921875, + "learning_rate": 4.97480231319164e-05, + "loss": 0.8669, + "step": 2714 + }, + { + "epoch": 0.04814386436051175, + "grad_norm": 3.125, + "learning_rate": 4.974762745433163e-05, + "loss": 0.8886, + "step": 2716 + }, + { + "epoch": 0.04817931639612332, + "grad_norm": 3.0, + "learning_rate": 4.9747231467901625e-05, + "loss": 0.868, + "step": 2718 + }, + { + "epoch": 0.04821476843173489, + "grad_norm": 2.984375, + "learning_rate": 4.974683517263134e-05, + "loss": 0.8811, + "step": 2720 + }, + { + "epoch": 0.04825022046734646, + "grad_norm": 3.109375, + "learning_rate": 4.974643856852569e-05, + "loss": 0.8448, + "step": 2722 + }, + { + "epoch": 0.04828567250295803, + "grad_norm": 3.609375, + "learning_rate": 4.9746041655589644e-05, + "loss": 0.8653, + "step": 2724 + }, + { + "epoch": 0.0483211245385696, + "grad_norm": 2.875, + "learning_rate": 4.974564443382815e-05, + "loss": 0.8171, + "step": 2726 + }, + { + "epoch": 0.048356576574181166, + "grad_norm": 2.6875, + "learning_rate": 4.974524690324618e-05, + "loss": 0.8275, + "step": 2728 + }, + { + "epoch": 0.04839202860979274, + "grad_norm": 3.109375, + "learning_rate": 4.974484906384867e-05, + "loss": 0.8725, + "step": 2730 + }, + { + "epoch": 0.04842748064540431, + "grad_norm": 2.703125, + "learning_rate": 4.9744450915640605e-05, + "loss": 0.8374, + "step": 2732 + }, + { + "epoch": 0.048462932681015876, + "grad_norm": 2.78125, + "learning_rate": 4.9744052458626944e-05, + "loss": 0.9077, + "step": 2734 + }, + { + "epoch": 0.04849838471662745, + "grad_norm": 2.71875, + "learning_rate": 4.9743653692812657e-05, + "loss": 0.8366, + "step": 2736 + }, + { + "epoch": 0.04853383675223902, + "grad_norm": 3.0, + "learning_rate": 4.974325461820273e-05, + "loss": 0.8669, + "step": 2738 + }, + { + "epoch": 0.048569288787850586, + "grad_norm": 2.984375, + "learning_rate": 4.9742855234802146e-05, + "loss": 0.904, + "step": 2740 + }, + { + "epoch": 0.04860474082346216, + "grad_norm": 2.8125, + "learning_rate": 4.9742455542615876e-05, + "loss": 0.854, + "step": 2742 + }, + { + "epoch": 0.04864019285907373, + "grad_norm": 2.828125, + "learning_rate": 4.9742055541648916e-05, + "loss": 0.8762, + "step": 2744 + }, + { + "epoch": 0.048675644894685295, + "grad_norm": 2.71875, + "learning_rate": 4.9741655231906246e-05, + "loss": 0.8533, + "step": 2746 + }, + { + "epoch": 0.04871109693029687, + "grad_norm": 2.65625, + "learning_rate": 4.9741254613392887e-05, + "loss": 0.8636, + "step": 2748 + }, + { + "epoch": 0.04874654896590844, + "grad_norm": 2.75, + "learning_rate": 4.974085368611381e-05, + "loss": 0.8573, + "step": 2750 + }, + { + "epoch": 0.048782001001520005, + "grad_norm": 2.8125, + "learning_rate": 4.9740452450074044e-05, + "loss": 0.9177, + "step": 2752 + }, + { + "epoch": 0.04881745303713157, + "grad_norm": 2.75, + "learning_rate": 4.9740050905278577e-05, + "loss": 0.8821, + "step": 2754 + }, + { + "epoch": 0.04885290507274315, + "grad_norm": 2.640625, + "learning_rate": 4.973964905173243e-05, + "loss": 0.8554, + "step": 2756 + }, + { + "epoch": 0.048888357108354714, + "grad_norm": 2.6875, + "learning_rate": 4.973924688944061e-05, + "loss": 0.8452, + "step": 2758 + }, + { + "epoch": 0.04892380914396628, + "grad_norm": 2.953125, + "learning_rate": 4.973884441840816e-05, + "loss": 0.8852, + "step": 2760 + }, + { + "epoch": 0.048959261179577857, + "grad_norm": 2.828125, + "learning_rate": 4.9738441638640064e-05, + "loss": 0.8495, + "step": 2762 + }, + { + "epoch": 0.048994713215189424, + "grad_norm": 2.875, + "learning_rate": 4.973803855014138e-05, + "loss": 0.883, + "step": 2764 + }, + { + "epoch": 0.04903016525080099, + "grad_norm": 3.109375, + "learning_rate": 4.973763515291713e-05, + "loss": 0.8985, + "step": 2766 + }, + { + "epoch": 0.049065617286412566, + "grad_norm": 3.09375, + "learning_rate": 4.9737231446972334e-05, + "loss": 0.8508, + "step": 2768 + }, + { + "epoch": 0.049101069322024134, + "grad_norm": 3.140625, + "learning_rate": 4.973682743231205e-05, + "loss": 0.8939, + "step": 2770 + }, + { + "epoch": 0.0491365213576357, + "grad_norm": 3.21875, + "learning_rate": 4.973642310894131e-05, + "loss": 0.8824, + "step": 2772 + }, + { + "epoch": 0.049171973393247276, + "grad_norm": 3.109375, + "learning_rate": 4.9736018476865165e-05, + "loss": 0.9063, + "step": 2774 + }, + { + "epoch": 0.04920742542885884, + "grad_norm": 2.75, + "learning_rate": 4.973561353608866e-05, + "loss": 0.8409, + "step": 2776 + }, + { + "epoch": 0.04924287746447041, + "grad_norm": 2.984375, + "learning_rate": 4.973520828661684e-05, + "loss": 0.8812, + "step": 2778 + }, + { + "epoch": 0.049278329500081985, + "grad_norm": 2.859375, + "learning_rate": 4.973480272845479e-05, + "loss": 0.8498, + "step": 2780 + }, + { + "epoch": 0.04931378153569355, + "grad_norm": 2.6875, + "learning_rate": 4.9734396861607543e-05, + "loss": 0.8471, + "step": 2782 + }, + { + "epoch": 0.04934923357130512, + "grad_norm": 2.65625, + "learning_rate": 4.973399068608018e-05, + "loss": 0.8652, + "step": 2784 + }, + { + "epoch": 0.049384685606916695, + "grad_norm": 2.828125, + "learning_rate": 4.973358420187776e-05, + "loss": 0.8479, + "step": 2786 + }, + { + "epoch": 0.04942013764252826, + "grad_norm": 2.5625, + "learning_rate": 4.973317740900536e-05, + "loss": 0.8945, + "step": 2788 + }, + { + "epoch": 0.04945558967813983, + "grad_norm": 2.640625, + "learning_rate": 4.973277030746806e-05, + "loss": 0.85, + "step": 2790 + }, + { + "epoch": 0.049491041713751405, + "grad_norm": 3.09375, + "learning_rate": 4.973236289727094e-05, + "loss": 0.8745, + "step": 2792 + }, + { + "epoch": 0.04952649374936297, + "grad_norm": 2.515625, + "learning_rate": 4.9731955178419075e-05, + "loss": 0.842, + "step": 2794 + }, + { + "epoch": 0.04956194578497454, + "grad_norm": 2.859375, + "learning_rate": 4.9731547150917566e-05, + "loss": 0.9042, + "step": 2796 + }, + { + "epoch": 0.04959739782058611, + "grad_norm": 3.046875, + "learning_rate": 4.973113881477151e-05, + "loss": 0.885, + "step": 2798 + }, + { + "epoch": 0.04963284985619768, + "grad_norm": 2.765625, + "learning_rate": 4.973073016998598e-05, + "loss": 0.8983, + "step": 2800 + }, + { + "epoch": 0.04966830189180925, + "grad_norm": 2.609375, + "learning_rate": 4.973032121656609e-05, + "loss": 0.8418, + "step": 2802 + }, + { + "epoch": 0.04970375392742082, + "grad_norm": 2.78125, + "learning_rate": 4.9729911954516946e-05, + "loss": 0.8868, + "step": 2804 + }, + { + "epoch": 0.04973920596303239, + "grad_norm": 2.78125, + "learning_rate": 4.9729502383843643e-05, + "loss": 0.838, + "step": 2806 + }, + { + "epoch": 0.04977465799864396, + "grad_norm": 3.015625, + "learning_rate": 4.972909250455131e-05, + "loss": 0.8609, + "step": 2808 + }, + { + "epoch": 0.04981011003425553, + "grad_norm": 3.03125, + "learning_rate": 4.972868231664505e-05, + "loss": 0.8558, + "step": 2810 + }, + { + "epoch": 0.0498455620698671, + "grad_norm": 3.015625, + "learning_rate": 4.972827182012999e-05, + "loss": 0.858, + "step": 2812 + }, + { + "epoch": 0.04988101410547867, + "grad_norm": 2.875, + "learning_rate": 4.972786101501125e-05, + "loss": 0.8594, + "step": 2814 + }, + { + "epoch": 0.049916466141090236, + "grad_norm": 2.9375, + "learning_rate": 4.972744990129394e-05, + "loss": 0.9087, + "step": 2816 + }, + { + "epoch": 0.04995191817670181, + "grad_norm": 2.75, + "learning_rate": 4.972703847898321e-05, + "loss": 0.8705, + "step": 2818 + }, + { + "epoch": 0.04998737021231338, + "grad_norm": 2.8125, + "learning_rate": 4.97266267480842e-05, + "loss": 0.8763, + "step": 2820 + }, + { + "epoch": 0.050022822247924946, + "grad_norm": 2.828125, + "learning_rate": 4.972621470860204e-05, + "loss": 0.8277, + "step": 2822 + }, + { + "epoch": 0.05005827428353652, + "grad_norm": 2.75, + "learning_rate": 4.9725802360541854e-05, + "loss": 0.8925, + "step": 2824 + }, + { + "epoch": 0.05009372631914809, + "grad_norm": 2.765625, + "learning_rate": 4.972538970390882e-05, + "loss": 0.8644, + "step": 2826 + }, + { + "epoch": 0.050129178354759656, + "grad_norm": 3.375, + "learning_rate": 4.972497673870806e-05, + "loss": 0.895, + "step": 2828 + }, + { + "epoch": 0.05016463039037123, + "grad_norm": 2.71875, + "learning_rate": 4.9724563464944754e-05, + "loss": 0.8475, + "step": 2830 + }, + { + "epoch": 0.0502000824259828, + "grad_norm": 2.875, + "learning_rate": 4.972414988262404e-05, + "loss": 0.8468, + "step": 2832 + }, + { + "epoch": 0.050235534461594365, + "grad_norm": 2.78125, + "learning_rate": 4.9723735991751076e-05, + "loss": 0.9013, + "step": 2834 + }, + { + "epoch": 0.05027098649720594, + "grad_norm": 2.953125, + "learning_rate": 4.972332179233105e-05, + "loss": 0.9036, + "step": 2836 + }, + { + "epoch": 0.05030643853281751, + "grad_norm": 3.03125, + "learning_rate": 4.972290728436911e-05, + "loss": 0.8596, + "step": 2838 + }, + { + "epoch": 0.050341890568429075, + "grad_norm": 2.875, + "learning_rate": 4.9722492467870434e-05, + "loss": 0.8977, + "step": 2840 + }, + { + "epoch": 0.05037734260404064, + "grad_norm": 2.78125, + "learning_rate": 4.9722077342840204e-05, + "loss": 0.8696, + "step": 2842 + }, + { + "epoch": 0.05041279463965222, + "grad_norm": 2.984375, + "learning_rate": 4.97216619092836e-05, + "loss": 0.8877, + "step": 2844 + }, + { + "epoch": 0.050448246675263785, + "grad_norm": 3.1875, + "learning_rate": 4.97212461672058e-05, + "loss": 0.8741, + "step": 2846 + }, + { + "epoch": 0.05048369871087535, + "grad_norm": 2.921875, + "learning_rate": 4.9720830116612004e-05, + "loss": 0.877, + "step": 2848 + }, + { + "epoch": 0.05051915074648693, + "grad_norm": 2.875, + "learning_rate": 4.97204137575074e-05, + "loss": 0.8109, + "step": 2850 + }, + { + "epoch": 0.050554602782098494, + "grad_norm": 2.90625, + "learning_rate": 4.9719997089897176e-05, + "loss": 0.8455, + "step": 2852 + }, + { + "epoch": 0.05059005481771006, + "grad_norm": 2.734375, + "learning_rate": 4.971958011378653e-05, + "loss": 0.8942, + "step": 2854 + }, + { + "epoch": 0.050625506853321636, + "grad_norm": 3.109375, + "learning_rate": 4.9719162829180684e-05, + "loss": 0.8563, + "step": 2856 + }, + { + "epoch": 0.050660958888933204, + "grad_norm": 2.921875, + "learning_rate": 4.971874523608483e-05, + "loss": 0.8732, + "step": 2858 + }, + { + "epoch": 0.05069641092454477, + "grad_norm": 2.859375, + "learning_rate": 4.9718327334504175e-05, + "loss": 0.8292, + "step": 2860 + }, + { + "epoch": 0.050731862960156346, + "grad_norm": 2.734375, + "learning_rate": 4.971790912444396e-05, + "loss": 0.9058, + "step": 2862 + }, + { + "epoch": 0.05076731499576791, + "grad_norm": 2.796875, + "learning_rate": 4.971749060590938e-05, + "loss": 0.883, + "step": 2864 + }, + { + "epoch": 0.05080276703137948, + "grad_norm": 2.890625, + "learning_rate": 4.9717071778905667e-05, + "loss": 0.8377, + "step": 2866 + }, + { + "epoch": 0.050838219066991056, + "grad_norm": 3.125, + "learning_rate": 4.971665264343804e-05, + "loss": 0.8734, + "step": 2868 + }, + { + "epoch": 0.05087367110260262, + "grad_norm": 2.796875, + "learning_rate": 4.971623319951174e-05, + "loss": 0.8644, + "step": 2870 + }, + { + "epoch": 0.05090912313821419, + "grad_norm": 2.390625, + "learning_rate": 4.9715813447132e-05, + "loss": 0.8551, + "step": 2872 + }, + { + "epoch": 0.050944575173825765, + "grad_norm": 2.78125, + "learning_rate": 4.9715393386304056e-05, + "loss": 0.8708, + "step": 2874 + }, + { + "epoch": 0.05098002720943733, + "grad_norm": 3.109375, + "learning_rate": 4.971497301703315e-05, + "loss": 0.8547, + "step": 2876 + }, + { + "epoch": 0.0510154792450489, + "grad_norm": 3.328125, + "learning_rate": 4.9714552339324525e-05, + "loss": 0.8751, + "step": 2878 + }, + { + "epoch": 0.051050931280660475, + "grad_norm": 2.84375, + "learning_rate": 4.9714131353183435e-05, + "loss": 0.8485, + "step": 2880 + }, + { + "epoch": 0.05108638331627204, + "grad_norm": 3.140625, + "learning_rate": 4.971371005861514e-05, + "loss": 0.8416, + "step": 2882 + }, + { + "epoch": 0.05112183535188361, + "grad_norm": 2.8125, + "learning_rate": 4.971328845562488e-05, + "loss": 0.9008, + "step": 2884 + }, + { + "epoch": 0.05115728738749518, + "grad_norm": 2.46875, + "learning_rate": 4.971286654421793e-05, + "loss": 0.822, + "step": 2886 + }, + { + "epoch": 0.05119273942310675, + "grad_norm": 2.953125, + "learning_rate": 4.971244432439956e-05, + "loss": 0.8887, + "step": 2888 + }, + { + "epoch": 0.05122819145871832, + "grad_norm": 2.78125, + "learning_rate": 4.971202179617502e-05, + "loss": 0.8879, + "step": 2890 + }, + { + "epoch": 0.05126364349432989, + "grad_norm": 2.84375, + "learning_rate": 4.971159895954961e-05, + "loss": 0.8847, + "step": 2892 + }, + { + "epoch": 0.05129909552994146, + "grad_norm": 2.828125, + "learning_rate": 4.971117581452859e-05, + "loss": 0.872, + "step": 2894 + }, + { + "epoch": 0.05133454756555303, + "grad_norm": 2.921875, + "learning_rate": 4.971075236111724e-05, + "loss": 0.861, + "step": 2896 + }, + { + "epoch": 0.0513699996011646, + "grad_norm": 2.671875, + "learning_rate": 4.9710328599320846e-05, + "loss": 0.8889, + "step": 2898 + }, + { + "epoch": 0.05140545163677617, + "grad_norm": 2.78125, + "learning_rate": 4.970990452914469e-05, + "loss": 0.8878, + "step": 2900 + }, + { + "epoch": 0.05144090367238774, + "grad_norm": 2.609375, + "learning_rate": 4.970948015059408e-05, + "loss": 0.8651, + "step": 2902 + }, + { + "epoch": 0.051476355707999306, + "grad_norm": 2.921875, + "learning_rate": 4.9709055463674304e-05, + "loss": 0.8522, + "step": 2904 + }, + { + "epoch": 0.05151180774361088, + "grad_norm": 2.78125, + "learning_rate": 4.970863046839066e-05, + "loss": 0.8409, + "step": 2906 + }, + { + "epoch": 0.05154725977922245, + "grad_norm": 3.0, + "learning_rate": 4.970820516474846e-05, + "loss": 0.8737, + "step": 2908 + }, + { + "epoch": 0.051582711814834016, + "grad_norm": 3.09375, + "learning_rate": 4.9707779552753e-05, + "loss": 0.87, + "step": 2910 + }, + { + "epoch": 0.05161816385044559, + "grad_norm": 2.78125, + "learning_rate": 4.97073536324096e-05, + "loss": 0.8184, + "step": 2912 + }, + { + "epoch": 0.05165361588605716, + "grad_norm": 2.765625, + "learning_rate": 4.9706927403723574e-05, + "loss": 0.9204, + "step": 2914 + }, + { + "epoch": 0.051689067921668726, + "grad_norm": 2.5625, + "learning_rate": 4.970650086670024e-05, + "loss": 0.8298, + "step": 2916 + }, + { + "epoch": 0.0517245199572803, + "grad_norm": 3.015625, + "learning_rate": 4.9706074021344916e-05, + "loss": 0.8427, + "step": 2918 + }, + { + "epoch": 0.05175997199289187, + "grad_norm": 2.78125, + "learning_rate": 4.970564686766294e-05, + "loss": 0.8631, + "step": 2920 + }, + { + "epoch": 0.051795424028503435, + "grad_norm": 2.875, + "learning_rate": 4.9705219405659635e-05, + "loss": 0.8462, + "step": 2922 + }, + { + "epoch": 0.05183087606411501, + "grad_norm": 2.890625, + "learning_rate": 4.970479163534034e-05, + "loss": 0.8948, + "step": 2924 + }, + { + "epoch": 0.05186632809972658, + "grad_norm": 2.546875, + "learning_rate": 4.970436355671039e-05, + "loss": 0.8498, + "step": 2926 + }, + { + "epoch": 0.051901780135338145, + "grad_norm": 2.765625, + "learning_rate": 4.970393516977513e-05, + "loss": 0.885, + "step": 2928 + }, + { + "epoch": 0.05193723217094971, + "grad_norm": 2.59375, + "learning_rate": 4.9703506474539894e-05, + "loss": 0.8458, + "step": 2930 + }, + { + "epoch": 0.05197268420656129, + "grad_norm": 3.015625, + "learning_rate": 4.970307747101005e-05, + "loss": 0.8869, + "step": 2932 + }, + { + "epoch": 0.052008136242172855, + "grad_norm": 2.625, + "learning_rate": 4.9702648159190944e-05, + "loss": 0.8571, + "step": 2934 + }, + { + "epoch": 0.05204358827778442, + "grad_norm": 2.9375, + "learning_rate": 4.970221853908794e-05, + "loss": 0.8827, + "step": 2936 + }, + { + "epoch": 0.052079040313396, + "grad_norm": 2.765625, + "learning_rate": 4.9701788610706384e-05, + "loss": 0.8422, + "step": 2938 + }, + { + "epoch": 0.052114492349007564, + "grad_norm": 2.6875, + "learning_rate": 4.970135837405166e-05, + "loss": 0.8821, + "step": 2940 + }, + { + "epoch": 0.05214994438461913, + "grad_norm": 2.890625, + "learning_rate": 4.970092782912912e-05, + "loss": 0.8914, + "step": 2942 + }, + { + "epoch": 0.052185396420230706, + "grad_norm": 2.734375, + "learning_rate": 4.970049697594415e-05, + "loss": 0.8454, + "step": 2944 + }, + { + "epoch": 0.052220848455842274, + "grad_norm": 2.796875, + "learning_rate": 4.9700065814502125e-05, + "loss": 0.8119, + "step": 2946 + }, + { + "epoch": 0.05225630049145384, + "grad_norm": 2.796875, + "learning_rate": 4.9699634344808425e-05, + "loss": 0.904, + "step": 2948 + }, + { + "epoch": 0.052291752527065416, + "grad_norm": 2.921875, + "learning_rate": 4.969920256686842e-05, + "loss": 0.8324, + "step": 2950 + }, + { + "epoch": 0.052327204562676984, + "grad_norm": 2.734375, + "learning_rate": 4.9698770480687515e-05, + "loss": 0.8968, + "step": 2952 + }, + { + "epoch": 0.05236265659828855, + "grad_norm": 3.109375, + "learning_rate": 4.9698338086271114e-05, + "loss": 0.8728, + "step": 2954 + }, + { + "epoch": 0.052398108633900126, + "grad_norm": 2.90625, + "learning_rate": 4.969790538362458e-05, + "loss": 0.8537, + "step": 2956 + }, + { + "epoch": 0.05243356066951169, + "grad_norm": 2.8125, + "learning_rate": 4.969747237275334e-05, + "loss": 0.8752, + "step": 2958 + }, + { + "epoch": 0.05246901270512326, + "grad_norm": 3.015625, + "learning_rate": 4.9697039053662785e-05, + "loss": 0.8249, + "step": 2960 + }, + { + "epoch": 0.052504464740734835, + "grad_norm": 2.421875, + "learning_rate": 4.969660542635833e-05, + "loss": 0.8757, + "step": 2962 + }, + { + "epoch": 0.0525399167763464, + "grad_norm": 2.96875, + "learning_rate": 4.969617149084538e-05, + "loss": 0.8594, + "step": 2964 + }, + { + "epoch": 0.05257536881195797, + "grad_norm": 2.65625, + "learning_rate": 4.969573724712936e-05, + "loss": 0.8425, + "step": 2966 + }, + { + "epoch": 0.05261082084756954, + "grad_norm": 2.8125, + "learning_rate": 4.969530269521568e-05, + "loss": 0.8623, + "step": 2968 + }, + { + "epoch": 0.05264627288318111, + "grad_norm": 2.953125, + "learning_rate": 4.969486783510976e-05, + "loss": 0.8533, + "step": 2970 + }, + { + "epoch": 0.05268172491879268, + "grad_norm": 2.9375, + "learning_rate": 4.9694432666817036e-05, + "loss": 0.8297, + "step": 2972 + }, + { + "epoch": 0.05271717695440425, + "grad_norm": 2.296875, + "learning_rate": 4.969399719034295e-05, + "loss": 0.8356, + "step": 2974 + }, + { + "epoch": 0.05275262899001582, + "grad_norm": 3.3125, + "learning_rate": 4.96935614056929e-05, + "loss": 0.8644, + "step": 2976 + }, + { + "epoch": 0.05278808102562739, + "grad_norm": 3.375, + "learning_rate": 4.9693125312872356e-05, + "loss": 0.8769, + "step": 2978 + }, + { + "epoch": 0.05282353306123896, + "grad_norm": 3.28125, + "learning_rate": 4.969268891188676e-05, + "loss": 0.8532, + "step": 2980 + }, + { + "epoch": 0.05285898509685053, + "grad_norm": 3.109375, + "learning_rate": 4.969225220274154e-05, + "loss": 0.8512, + "step": 2982 + }, + { + "epoch": 0.0528944371324621, + "grad_norm": 3.1875, + "learning_rate": 4.9691815185442155e-05, + "loss": 0.869, + "step": 2984 + }, + { + "epoch": 0.05292988916807367, + "grad_norm": 2.65625, + "learning_rate": 4.9691377859994056e-05, + "loss": 0.8594, + "step": 2986 + }, + { + "epoch": 0.05296534120368524, + "grad_norm": 2.765625, + "learning_rate": 4.9690940226402716e-05, + "loss": 0.8524, + "step": 2988 + }, + { + "epoch": 0.05300079323929681, + "grad_norm": 3.09375, + "learning_rate": 4.969050228467358e-05, + "loss": 0.8984, + "step": 2990 + }, + { + "epoch": 0.05303624527490838, + "grad_norm": 3.03125, + "learning_rate": 4.9690064034812114e-05, + "loss": 0.8926, + "step": 2992 + }, + { + "epoch": 0.05307169731051995, + "grad_norm": 2.71875, + "learning_rate": 4.9689625476823795e-05, + "loss": 0.8046, + "step": 2994 + }, + { + "epoch": 0.05310714934613152, + "grad_norm": 2.796875, + "learning_rate": 4.968918661071409e-05, + "loss": 0.8651, + "step": 2996 + }, + { + "epoch": 0.053142601381743086, + "grad_norm": 3.21875, + "learning_rate": 4.968874743648848e-05, + "loss": 0.913, + "step": 2998 + }, + { + "epoch": 0.05317805341735466, + "grad_norm": 2.78125, + "learning_rate": 4.968830795415245e-05, + "loss": 0.8606, + "step": 3000 + }, + { + "epoch": 0.05321350545296623, + "grad_norm": 3.40625, + "learning_rate": 4.9687868163711474e-05, + "loss": 0.8876, + "step": 3002 + }, + { + "epoch": 0.053248957488577796, + "grad_norm": 2.828125, + "learning_rate": 4.968742806517104e-05, + "loss": 0.8648, + "step": 3004 + }, + { + "epoch": 0.05328440952418937, + "grad_norm": 2.9375, + "learning_rate": 4.9686987658536646e-05, + "loss": 0.8341, + "step": 3006 + }, + { + "epoch": 0.05331986155980094, + "grad_norm": 2.921875, + "learning_rate": 4.968654694381379e-05, + "loss": 0.8511, + "step": 3008 + }, + { + "epoch": 0.053355313595412505, + "grad_norm": 3.171875, + "learning_rate": 4.968610592100797e-05, + "loss": 0.8549, + "step": 3010 + }, + { + "epoch": 0.05339076563102407, + "grad_norm": 2.984375, + "learning_rate": 4.96856645901247e-05, + "loss": 0.8929, + "step": 3012 + }, + { + "epoch": 0.05342621766663565, + "grad_norm": 2.71875, + "learning_rate": 4.968522295116947e-05, + "loss": 0.8663, + "step": 3014 + }, + { + "epoch": 0.053461669702247215, + "grad_norm": 2.609375, + "learning_rate": 4.9684781004147795e-05, + "loss": 0.8973, + "step": 3016 + }, + { + "epoch": 0.05349712173785878, + "grad_norm": 2.625, + "learning_rate": 4.96843387490652e-05, + "loss": 0.8569, + "step": 3018 + }, + { + "epoch": 0.05353257377347036, + "grad_norm": 2.828125, + "learning_rate": 4.96838961859272e-05, + "loss": 0.8066, + "step": 3020 + }, + { + "epoch": 0.053568025809081925, + "grad_norm": 2.796875, + "learning_rate": 4.9683453314739314e-05, + "loss": 0.9235, + "step": 3022 + }, + { + "epoch": 0.05360347784469349, + "grad_norm": 2.890625, + "learning_rate": 4.968301013550707e-05, + "loss": 0.8878, + "step": 3024 + }, + { + "epoch": 0.05363892988030507, + "grad_norm": 2.765625, + "learning_rate": 4.9682566648236007e-05, + "loss": 0.894, + "step": 3026 + }, + { + "epoch": 0.053674381915916634, + "grad_norm": 2.53125, + "learning_rate": 4.968212285293165e-05, + "loss": 0.8596, + "step": 3028 + }, + { + "epoch": 0.0537098339515282, + "grad_norm": 3.03125, + "learning_rate": 4.9681678749599536e-05, + "loss": 0.8688, + "step": 3030 + }, + { + "epoch": 0.053745285987139776, + "grad_norm": 2.75, + "learning_rate": 4.9681234338245214e-05, + "loss": 0.872, + "step": 3032 + }, + { + "epoch": 0.053780738022751344, + "grad_norm": 2.671875, + "learning_rate": 4.968078961887423e-05, + "loss": 0.859, + "step": 3034 + }, + { + "epoch": 0.05381619005836291, + "grad_norm": 2.796875, + "learning_rate": 4.968034459149213e-05, + "loss": 0.8821, + "step": 3036 + }, + { + "epoch": 0.053851642093974486, + "grad_norm": 2.78125, + "learning_rate": 4.967989925610447e-05, + "loss": 0.8507, + "step": 3038 + }, + { + "epoch": 0.053887094129586054, + "grad_norm": 3.453125, + "learning_rate": 4.9679453612716816e-05, + "loss": 0.8604, + "step": 3040 + }, + { + "epoch": 0.05392254616519762, + "grad_norm": 2.71875, + "learning_rate": 4.967900766133471e-05, + "loss": 0.8521, + "step": 3042 + }, + { + "epoch": 0.053957998200809196, + "grad_norm": 2.984375, + "learning_rate": 4.9678561401963736e-05, + "loss": 0.9058, + "step": 3044 + }, + { + "epoch": 0.05399345023642076, + "grad_norm": 2.671875, + "learning_rate": 4.967811483460946e-05, + "loss": 0.8202, + "step": 3046 + }, + { + "epoch": 0.05402890227203233, + "grad_norm": 3.09375, + "learning_rate": 4.967766795927744e-05, + "loss": 0.8616, + "step": 3048 + }, + { + "epoch": 0.054064354307643905, + "grad_norm": 2.78125, + "learning_rate": 4.967722077597327e-05, + "loss": 0.8462, + "step": 3050 + }, + { + "epoch": 0.05409980634325547, + "grad_norm": 2.6875, + "learning_rate": 4.967677328470252e-05, + "loss": 0.8751, + "step": 3052 + }, + { + "epoch": 0.05413525837886704, + "grad_norm": 3.03125, + "learning_rate": 4.967632548547078e-05, + "loss": 0.9041, + "step": 3054 + }, + { + "epoch": 0.05417071041447861, + "grad_norm": 2.59375, + "learning_rate": 4.967587737828364e-05, + "loss": 0.8535, + "step": 3056 + }, + { + "epoch": 0.05420616245009018, + "grad_norm": 3.078125, + "learning_rate": 4.967542896314669e-05, + "loss": 0.823, + "step": 3058 + }, + { + "epoch": 0.05424161448570175, + "grad_norm": 2.5625, + "learning_rate": 4.967498024006553e-05, + "loss": 0.847, + "step": 3060 + }, + { + "epoch": 0.05427706652131332, + "grad_norm": 2.859375, + "learning_rate": 4.967453120904575e-05, + "loss": 0.8793, + "step": 3062 + }, + { + "epoch": 0.05431251855692489, + "grad_norm": 3.03125, + "learning_rate": 4.967408187009296e-05, + "loss": 0.87, + "step": 3064 + }, + { + "epoch": 0.05434797059253646, + "grad_norm": 2.6875, + "learning_rate": 4.967363222321277e-05, + "loss": 0.8522, + "step": 3066 + }, + { + "epoch": 0.05438342262814803, + "grad_norm": 2.671875, + "learning_rate": 4.967318226841079e-05, + "loss": 0.8765, + "step": 3068 + }, + { + "epoch": 0.0544188746637596, + "grad_norm": 3.0625, + "learning_rate": 4.967273200569263e-05, + "loss": 0.8881, + "step": 3070 + }, + { + "epoch": 0.05445432669937117, + "grad_norm": 2.734375, + "learning_rate": 4.9672281435063915e-05, + "loss": 0.8436, + "step": 3072 + }, + { + "epoch": 0.05448977873498274, + "grad_norm": 2.8125, + "learning_rate": 4.967183055653027e-05, + "loss": 0.8846, + "step": 3074 + }, + { + "epoch": 0.05452523077059431, + "grad_norm": 2.75, + "learning_rate": 4.9671379370097314e-05, + "loss": 0.8671, + "step": 3076 + }, + { + "epoch": 0.05456068280620588, + "grad_norm": 3.25, + "learning_rate": 4.967092787577068e-05, + "loss": 0.8623, + "step": 3078 + }, + { + "epoch": 0.05459613484181745, + "grad_norm": 3.171875, + "learning_rate": 4.9670476073556015e-05, + "loss": 0.8926, + "step": 3080 + }, + { + "epoch": 0.05463158687742902, + "grad_norm": 2.9375, + "learning_rate": 4.967002396345894e-05, + "loss": 0.8009, + "step": 3082 + }, + { + "epoch": 0.05466703891304059, + "grad_norm": 3.0625, + "learning_rate": 4.966957154548511e-05, + "loss": 0.8722, + "step": 3084 + }, + { + "epoch": 0.054702490948652156, + "grad_norm": 2.9375, + "learning_rate": 4.966911881964016e-05, + "loss": 0.8625, + "step": 3086 + }, + { + "epoch": 0.05473794298426373, + "grad_norm": 2.734375, + "learning_rate": 4.9668665785929744e-05, + "loss": 0.8557, + "step": 3088 + }, + { + "epoch": 0.0547733950198753, + "grad_norm": 2.71875, + "learning_rate": 4.966821244435952e-05, + "loss": 0.8461, + "step": 3090 + }, + { + "epoch": 0.054808847055486866, + "grad_norm": 2.640625, + "learning_rate": 4.966775879493514e-05, + "loss": 0.8348, + "step": 3092 + }, + { + "epoch": 0.05484429909109844, + "grad_norm": 2.796875, + "learning_rate": 4.9667304837662265e-05, + "loss": 0.8429, + "step": 3094 + }, + { + "epoch": 0.05487975112671001, + "grad_norm": 2.8125, + "learning_rate": 4.9666850572546575e-05, + "loss": 0.8614, + "step": 3096 + }, + { + "epoch": 0.054915203162321576, + "grad_norm": 3.109375, + "learning_rate": 4.966639599959372e-05, + "loss": 0.9079, + "step": 3098 + }, + { + "epoch": 0.05495065519793314, + "grad_norm": 2.640625, + "learning_rate": 4.9665941118809375e-05, + "loss": 0.8368, + "step": 3100 + }, + { + "epoch": 0.05498610723354472, + "grad_norm": 2.859375, + "learning_rate": 4.966548593019923e-05, + "loss": 0.8453, + "step": 3102 + }, + { + "epoch": 0.055021559269156285, + "grad_norm": 2.84375, + "learning_rate": 4.966503043376896e-05, + "loss": 0.9079, + "step": 3104 + }, + { + "epoch": 0.05505701130476785, + "grad_norm": 2.515625, + "learning_rate": 4.966457462952424e-05, + "loss": 0.8182, + "step": 3106 + }, + { + "epoch": 0.05509246334037943, + "grad_norm": 2.71875, + "learning_rate": 4.966411851747078e-05, + "loss": 0.8326, + "step": 3108 + }, + { + "epoch": 0.055127915375990995, + "grad_norm": 2.78125, + "learning_rate": 4.9663662097614245e-05, + "loss": 0.8604, + "step": 3110 + }, + { + "epoch": 0.05516336741160256, + "grad_norm": 2.703125, + "learning_rate": 4.9663205369960345e-05, + "loss": 0.8305, + "step": 3112 + }, + { + "epoch": 0.05519881944721414, + "grad_norm": 2.4375, + "learning_rate": 4.9662748334514784e-05, + "loss": 0.8346, + "step": 3114 + }, + { + "epoch": 0.055234271482825704, + "grad_norm": 2.78125, + "learning_rate": 4.9662290991283254e-05, + "loss": 0.914, + "step": 3116 + }, + { + "epoch": 0.05526972351843727, + "grad_norm": 2.828125, + "learning_rate": 4.966183334027148e-05, + "loss": 0.8923, + "step": 3118 + }, + { + "epoch": 0.05530517555404885, + "grad_norm": 3.125, + "learning_rate": 4.966137538148515e-05, + "loss": 0.878, + "step": 3120 + }, + { + "epoch": 0.055340627589660414, + "grad_norm": 2.609375, + "learning_rate": 4.966091711493e-05, + "loss": 0.8524, + "step": 3122 + }, + { + "epoch": 0.05537607962527198, + "grad_norm": 2.5, + "learning_rate": 4.966045854061174e-05, + "loss": 0.8129, + "step": 3124 + }, + { + "epoch": 0.055411531660883556, + "grad_norm": 2.859375, + "learning_rate": 4.965999965853609e-05, + "loss": 0.8476, + "step": 3126 + }, + { + "epoch": 0.055446983696495124, + "grad_norm": 3.09375, + "learning_rate": 4.965954046870879e-05, + "loss": 0.8393, + "step": 3128 + }, + { + "epoch": 0.05548243573210669, + "grad_norm": 3.15625, + "learning_rate": 4.965908097113555e-05, + "loss": 0.8137, + "step": 3130 + }, + { + "epoch": 0.055517887767718266, + "grad_norm": 2.859375, + "learning_rate": 4.965862116582212e-05, + "loss": 0.8436, + "step": 3132 + }, + { + "epoch": 0.05555333980332983, + "grad_norm": 2.703125, + "learning_rate": 4.965816105277423e-05, + "loss": 0.8907, + "step": 3134 + }, + { + "epoch": 0.0555887918389414, + "grad_norm": 3.015625, + "learning_rate": 4.965770063199763e-05, + "loss": 0.8394, + "step": 3136 + }, + { + "epoch": 0.05562424387455297, + "grad_norm": 2.765625, + "learning_rate": 4.9657239903498064e-05, + "loss": 0.848, + "step": 3138 + }, + { + "epoch": 0.05565969591016454, + "grad_norm": 2.984375, + "learning_rate": 4.965677886728128e-05, + "loss": 0.8919, + "step": 3140 + }, + { + "epoch": 0.05569514794577611, + "grad_norm": 3.046875, + "learning_rate": 4.965631752335302e-05, + "loss": 0.8882, + "step": 3142 + }, + { + "epoch": 0.05573059998138768, + "grad_norm": 2.765625, + "learning_rate": 4.965585587171907e-05, + "loss": 0.8359, + "step": 3144 + }, + { + "epoch": 0.05576605201699925, + "grad_norm": 2.625, + "learning_rate": 4.9655393912385164e-05, + "loss": 0.8831, + "step": 3146 + }, + { + "epoch": 0.05580150405261082, + "grad_norm": 2.734375, + "learning_rate": 4.9654931645357075e-05, + "loss": 0.8672, + "step": 3148 + }, + { + "epoch": 0.05583695608822239, + "grad_norm": 2.65625, + "learning_rate": 4.965446907064059e-05, + "loss": 0.8515, + "step": 3150 + }, + { + "epoch": 0.05587240812383396, + "grad_norm": 3.203125, + "learning_rate": 4.965400618824145e-05, + "loss": 0.9172, + "step": 3152 + }, + { + "epoch": 0.05590786015944553, + "grad_norm": 2.703125, + "learning_rate": 4.965354299816545e-05, + "loss": 0.8372, + "step": 3154 + }, + { + "epoch": 0.0559433121950571, + "grad_norm": 2.625, + "learning_rate": 4.965307950041837e-05, + "loss": 0.8744, + "step": 3156 + }, + { + "epoch": 0.05597876423066867, + "grad_norm": 3.03125, + "learning_rate": 4.9652615695006e-05, + "loss": 0.8757, + "step": 3158 + }, + { + "epoch": 0.05601421626628024, + "grad_norm": 3.25, + "learning_rate": 4.965215158193411e-05, + "loss": 0.8774, + "step": 3160 + }, + { + "epoch": 0.05604966830189181, + "grad_norm": 2.9375, + "learning_rate": 4.9651687161208505e-05, + "loss": 0.8498, + "step": 3162 + }, + { + "epoch": 0.05608512033750338, + "grad_norm": 3.46875, + "learning_rate": 4.9651222432834985e-05, + "loss": 0.9136, + "step": 3164 + }, + { + "epoch": 0.05612057237311495, + "grad_norm": 3.09375, + "learning_rate": 4.9650757396819335e-05, + "loss": 0.8381, + "step": 3166 + }, + { + "epoch": 0.05615602440872652, + "grad_norm": 3.0, + "learning_rate": 4.965029205316737e-05, + "loss": 0.8501, + "step": 3168 + }, + { + "epoch": 0.05619147644433809, + "grad_norm": 2.953125, + "learning_rate": 4.9649826401884904e-05, + "loss": 0.8755, + "step": 3170 + }, + { + "epoch": 0.05622692847994966, + "grad_norm": 3.03125, + "learning_rate": 4.964936044297773e-05, + "loss": 0.9018, + "step": 3172 + }, + { + "epoch": 0.056262380515561226, + "grad_norm": 2.953125, + "learning_rate": 4.964889417645166e-05, + "loss": 0.8797, + "step": 3174 + }, + { + "epoch": 0.0562978325511728, + "grad_norm": 2.6875, + "learning_rate": 4.964842760231254e-05, + "loss": 0.8833, + "step": 3176 + }, + { + "epoch": 0.05633328458678437, + "grad_norm": 2.9375, + "learning_rate": 4.964796072056618e-05, + "loss": 0.8543, + "step": 3178 + }, + { + "epoch": 0.056368736622395936, + "grad_norm": 2.796875, + "learning_rate": 4.96474935312184e-05, + "loss": 0.9254, + "step": 3180 + }, + { + "epoch": 0.056404188658007504, + "grad_norm": 3.234375, + "learning_rate": 4.964702603427504e-05, + "loss": 0.8885, + "step": 3182 + }, + { + "epoch": 0.05643964069361908, + "grad_norm": 3.0, + "learning_rate": 4.964655822974191e-05, + "loss": 0.8349, + "step": 3184 + }, + { + "epoch": 0.056475092729230646, + "grad_norm": 2.703125, + "learning_rate": 4.964609011762488e-05, + "loss": 0.8539, + "step": 3186 + }, + { + "epoch": 0.05651054476484221, + "grad_norm": 2.6875, + "learning_rate": 4.964562169792978e-05, + "loss": 0.8605, + "step": 3188 + }, + { + "epoch": 0.05654599680045379, + "grad_norm": 2.703125, + "learning_rate": 4.964515297066245e-05, + "loss": 0.8885, + "step": 3190 + }, + { + "epoch": 0.056581448836065355, + "grad_norm": 2.34375, + "learning_rate": 4.964468393582875e-05, + "loss": 0.812, + "step": 3192 + }, + { + "epoch": 0.05661690087167692, + "grad_norm": 2.5, + "learning_rate": 4.964421459343452e-05, + "loss": 0.8265, + "step": 3194 + }, + { + "epoch": 0.0566523529072885, + "grad_norm": 2.5625, + "learning_rate": 4.9643744943485626e-05, + "loss": 0.866, + "step": 3196 + }, + { + "epoch": 0.056687804942900065, + "grad_norm": 2.6875, + "learning_rate": 4.964327498598793e-05, + "loss": 0.8846, + "step": 3198 + }, + { + "epoch": 0.05672325697851163, + "grad_norm": 2.734375, + "learning_rate": 4.96428047209473e-05, + "loss": 0.8741, + "step": 3200 + }, + { + "epoch": 0.05675870901412321, + "grad_norm": 3.03125, + "learning_rate": 4.9642334148369595e-05, + "loss": 0.8554, + "step": 3202 + }, + { + "epoch": 0.056794161049734775, + "grad_norm": 3.078125, + "learning_rate": 4.964186326826069e-05, + "loss": 0.8448, + "step": 3204 + }, + { + "epoch": 0.05682961308534634, + "grad_norm": 2.6875, + "learning_rate": 4.964139208062647e-05, + "loss": 0.8586, + "step": 3206 + }, + { + "epoch": 0.05686506512095792, + "grad_norm": 2.96875, + "learning_rate": 4.964092058547281e-05, + "loss": 0.9298, + "step": 3208 + }, + { + "epoch": 0.056900517156569484, + "grad_norm": 2.90625, + "learning_rate": 4.964044878280558e-05, + "loss": 0.8457, + "step": 3210 + }, + { + "epoch": 0.05693596919218105, + "grad_norm": 2.703125, + "learning_rate": 4.963997667263069e-05, + "loss": 0.9313, + "step": 3212 + }, + { + "epoch": 0.056971421227792626, + "grad_norm": 2.765625, + "learning_rate": 4.9639504254954026e-05, + "loss": 0.8426, + "step": 3214 + }, + { + "epoch": 0.057006873263404194, + "grad_norm": 2.984375, + "learning_rate": 4.963903152978148e-05, + "loss": 0.8391, + "step": 3216 + }, + { + "epoch": 0.05704232529901576, + "grad_norm": 2.65625, + "learning_rate": 4.9638558497118956e-05, + "loss": 0.8089, + "step": 3218 + }, + { + "epoch": 0.057077777334627336, + "grad_norm": 2.9375, + "learning_rate": 4.963808515697235e-05, + "loss": 0.8812, + "step": 3220 + }, + { + "epoch": 0.057113229370238904, + "grad_norm": 2.5625, + "learning_rate": 4.963761150934757e-05, + "loss": 0.831, + "step": 3222 + }, + { + "epoch": 0.05714868140585047, + "grad_norm": 2.953125, + "learning_rate": 4.9637137554250535e-05, + "loss": 0.9013, + "step": 3224 + }, + { + "epoch": 0.05718413344146204, + "grad_norm": 2.984375, + "learning_rate": 4.963666329168715e-05, + "loss": 0.8752, + "step": 3226 + }, + { + "epoch": 0.05721958547707361, + "grad_norm": 2.921875, + "learning_rate": 4.963618872166334e-05, + "loss": 0.882, + "step": 3228 + }, + { + "epoch": 0.05725503751268518, + "grad_norm": 3.0625, + "learning_rate": 4.9635713844185025e-05, + "loss": 0.9392, + "step": 3230 + }, + { + "epoch": 0.05729048954829675, + "grad_norm": 2.515625, + "learning_rate": 4.9635238659258136e-05, + "loss": 0.7893, + "step": 3232 + }, + { + "epoch": 0.05732594158390832, + "grad_norm": 2.796875, + "learning_rate": 4.96347631668886e-05, + "loss": 0.8776, + "step": 3234 + }, + { + "epoch": 0.05736139361951989, + "grad_norm": 2.671875, + "learning_rate": 4.9634287367082346e-05, + "loss": 0.8508, + "step": 3236 + }, + { + "epoch": 0.05739684565513146, + "grad_norm": 2.515625, + "learning_rate": 4.963381125984532e-05, + "loss": 0.8219, + "step": 3238 + }, + { + "epoch": 0.05743229769074303, + "grad_norm": 2.859375, + "learning_rate": 4.963333484518346e-05, + "loss": 0.8711, + "step": 3240 + }, + { + "epoch": 0.0574677497263546, + "grad_norm": 2.953125, + "learning_rate": 4.963285812310271e-05, + "loss": 0.892, + "step": 3242 + }, + { + "epoch": 0.05750320176196617, + "grad_norm": 2.921875, + "learning_rate": 4.9632381093609024e-05, + "loss": 0.8696, + "step": 3244 + }, + { + "epoch": 0.05753865379757774, + "grad_norm": 2.921875, + "learning_rate": 4.963190375670835e-05, + "loss": 0.8536, + "step": 3246 + }, + { + "epoch": 0.05757410583318931, + "grad_norm": 2.921875, + "learning_rate": 4.963142611240665e-05, + "loss": 0.883, + "step": 3248 + }, + { + "epoch": 0.05760955786880088, + "grad_norm": 2.859375, + "learning_rate": 4.963094816070988e-05, + "loss": 0.9322, + "step": 3250 + }, + { + "epoch": 0.05764500990441245, + "grad_norm": 2.75, + "learning_rate": 4.963046990162401e-05, + "loss": 0.8714, + "step": 3252 + }, + { + "epoch": 0.05768046194002402, + "grad_norm": 2.8125, + "learning_rate": 4.9629991335155e-05, + "loss": 0.8551, + "step": 3254 + }, + { + "epoch": 0.05771591397563559, + "grad_norm": 2.703125, + "learning_rate": 4.962951246130884e-05, + "loss": 0.9051, + "step": 3256 + }, + { + "epoch": 0.05775136601124716, + "grad_norm": 2.71875, + "learning_rate": 4.962903328009149e-05, + "loss": 0.8693, + "step": 3258 + }, + { + "epoch": 0.05778681804685873, + "grad_norm": 2.609375, + "learning_rate": 4.962855379150893e-05, + "loss": 0.8398, + "step": 3260 + }, + { + "epoch": 0.057822270082470296, + "grad_norm": 2.90625, + "learning_rate": 4.962807399556715e-05, + "loss": 0.8445, + "step": 3262 + }, + { + "epoch": 0.05785772211808187, + "grad_norm": 2.671875, + "learning_rate": 4.962759389227213e-05, + "loss": 0.8498, + "step": 3264 + }, + { + "epoch": 0.05789317415369344, + "grad_norm": 3.0, + "learning_rate": 4.962711348162987e-05, + "loss": 0.8548, + "step": 3266 + }, + { + "epoch": 0.057928626189305006, + "grad_norm": 2.671875, + "learning_rate": 4.962663276364637e-05, + "loss": 0.8758, + "step": 3268 + }, + { + "epoch": 0.057964078224916574, + "grad_norm": 2.78125, + "learning_rate": 4.962615173832762e-05, + "loss": 0.8511, + "step": 3270 + }, + { + "epoch": 0.05799953026052815, + "grad_norm": 2.84375, + "learning_rate": 4.9625670405679626e-05, + "loss": 0.8197, + "step": 3272 + }, + { + "epoch": 0.058034982296139716, + "grad_norm": 2.8125, + "learning_rate": 4.96251887657084e-05, + "loss": 0.8447, + "step": 3274 + }, + { + "epoch": 0.05807043433175128, + "grad_norm": 2.96875, + "learning_rate": 4.962470681841993e-05, + "loss": 0.8452, + "step": 3276 + }, + { + "epoch": 0.05810588636736286, + "grad_norm": 2.890625, + "learning_rate": 4.962422456382026e-05, + "loss": 0.8332, + "step": 3278 + }, + { + "epoch": 0.058141338402974425, + "grad_norm": 2.859375, + "learning_rate": 4.96237420019154e-05, + "loss": 0.8803, + "step": 3280 + }, + { + "epoch": 0.05817679043858599, + "grad_norm": 2.8125, + "learning_rate": 4.9623259132711365e-05, + "loss": 0.7956, + "step": 3282 + }, + { + "epoch": 0.05821224247419757, + "grad_norm": 2.953125, + "learning_rate": 4.9622775956214187e-05, + "loss": 0.8399, + "step": 3284 + }, + { + "epoch": 0.058247694509809135, + "grad_norm": 2.75, + "learning_rate": 4.962229247242989e-05, + "loss": 0.8275, + "step": 3286 + }, + { + "epoch": 0.0582831465454207, + "grad_norm": 3.265625, + "learning_rate": 4.9621808681364506e-05, + "loss": 0.8737, + "step": 3288 + }, + { + "epoch": 0.05831859858103228, + "grad_norm": 2.859375, + "learning_rate": 4.9621324583024085e-05, + "loss": 0.8652, + "step": 3290 + }, + { + "epoch": 0.058354050616643845, + "grad_norm": 2.65625, + "learning_rate": 4.962084017741466e-05, + "loss": 0.8329, + "step": 3292 + }, + { + "epoch": 0.05838950265225541, + "grad_norm": 2.859375, + "learning_rate": 4.962035546454228e-05, + "loss": 0.8641, + "step": 3294 + }, + { + "epoch": 0.05842495468786699, + "grad_norm": 2.78125, + "learning_rate": 4.961987044441299e-05, + "loss": 0.8403, + "step": 3296 + }, + { + "epoch": 0.058460406723478554, + "grad_norm": 2.796875, + "learning_rate": 4.961938511703284e-05, + "loss": 0.854, + "step": 3298 + }, + { + "epoch": 0.05849585875909012, + "grad_norm": 2.984375, + "learning_rate": 4.96188994824079e-05, + "loss": 0.8576, + "step": 3300 + }, + { + "epoch": 0.058531310794701696, + "grad_norm": 2.890625, + "learning_rate": 4.961841354054422e-05, + "loss": 0.8808, + "step": 3302 + }, + { + "epoch": 0.058566762830313264, + "grad_norm": 2.671875, + "learning_rate": 4.961792729144786e-05, + "loss": 0.8476, + "step": 3304 + }, + { + "epoch": 0.05860221486592483, + "grad_norm": 2.890625, + "learning_rate": 4.96174407351249e-05, + "loss": 0.8627, + "step": 3306 + }, + { + "epoch": 0.058637666901536406, + "grad_norm": 2.65625, + "learning_rate": 4.9616953871581406e-05, + "loss": 0.8171, + "step": 3308 + }, + { + "epoch": 0.058673118937147974, + "grad_norm": 2.875, + "learning_rate": 4.9616466700823455e-05, + "loss": 0.8769, + "step": 3310 + }, + { + "epoch": 0.05870857097275954, + "grad_norm": 2.765625, + "learning_rate": 4.961597922285712e-05, + "loss": 0.8673, + "step": 3312 + }, + { + "epoch": 0.05874402300837111, + "grad_norm": 3.4375, + "learning_rate": 4.96154914376885e-05, + "loss": 0.8796, + "step": 3314 + }, + { + "epoch": 0.05877947504398268, + "grad_norm": 2.875, + "learning_rate": 4.961500334532368e-05, + "loss": 0.8608, + "step": 3316 + }, + { + "epoch": 0.05881492707959425, + "grad_norm": 3.15625, + "learning_rate": 4.9614514945768734e-05, + "loss": 0.8851, + "step": 3318 + }, + { + "epoch": 0.05885037911520582, + "grad_norm": 2.984375, + "learning_rate": 4.961402623902978e-05, + "loss": 0.8761, + "step": 3320 + }, + { + "epoch": 0.05888583115081739, + "grad_norm": 2.90625, + "learning_rate": 4.9613537225112893e-05, + "loss": 0.8832, + "step": 3322 + }, + { + "epoch": 0.05892128318642896, + "grad_norm": 3.15625, + "learning_rate": 4.961304790402419e-05, + "loss": 0.8012, + "step": 3324 + }, + { + "epoch": 0.05895673522204053, + "grad_norm": 2.953125, + "learning_rate": 4.961255827576978e-05, + "loss": 0.8719, + "step": 3326 + }, + { + "epoch": 0.0589921872576521, + "grad_norm": 2.71875, + "learning_rate": 4.9612068340355766e-05, + "loss": 0.9218, + "step": 3328 + }, + { + "epoch": 0.05902763929326367, + "grad_norm": 2.96875, + "learning_rate": 4.961157809778827e-05, + "loss": 0.862, + "step": 3330 + }, + { + "epoch": 0.05906309132887524, + "grad_norm": 2.6875, + "learning_rate": 4.96110875480734e-05, + "loss": 0.8544, + "step": 3332 + }, + { + "epoch": 0.05909854336448681, + "grad_norm": 2.875, + "learning_rate": 4.9610596691217284e-05, + "loss": 0.8294, + "step": 3334 + }, + { + "epoch": 0.05913399540009838, + "grad_norm": 2.71875, + "learning_rate": 4.961010552722605e-05, + "loss": 0.8704, + "step": 3336 + }, + { + "epoch": 0.05916944743570995, + "grad_norm": 2.40625, + "learning_rate": 4.960961405610582e-05, + "loss": 0.7835, + "step": 3338 + }, + { + "epoch": 0.05920489947132152, + "grad_norm": 2.9375, + "learning_rate": 4.960912227786274e-05, + "loss": 0.8676, + "step": 3340 + }, + { + "epoch": 0.05924035150693309, + "grad_norm": 2.90625, + "learning_rate": 4.9608630192502935e-05, + "loss": 0.86, + "step": 3342 + }, + { + "epoch": 0.05927580354254466, + "grad_norm": 2.921875, + "learning_rate": 4.960813780003256e-05, + "loss": 0.7854, + "step": 3344 + }, + { + "epoch": 0.05931125557815623, + "grad_norm": 2.453125, + "learning_rate": 4.960764510045774e-05, + "loss": 0.8139, + "step": 3346 + }, + { + "epoch": 0.0593467076137678, + "grad_norm": 2.765625, + "learning_rate": 4.960715209378464e-05, + "loss": 0.8455, + "step": 3348 + }, + { + "epoch": 0.05938215964937937, + "grad_norm": 2.90625, + "learning_rate": 4.96066587800194e-05, + "loss": 0.9059, + "step": 3350 + }, + { + "epoch": 0.059417611684990934, + "grad_norm": 2.640625, + "learning_rate": 4.960616515916819e-05, + "loss": 0.8427, + "step": 3352 + }, + { + "epoch": 0.05945306372060251, + "grad_norm": 2.953125, + "learning_rate": 4.960567123123716e-05, + "loss": 0.8606, + "step": 3354 + }, + { + "epoch": 0.059488515756214076, + "grad_norm": 3.015625, + "learning_rate": 4.960517699623248e-05, + "loss": 0.9067, + "step": 3356 + }, + { + "epoch": 0.059523967791825644, + "grad_norm": 2.953125, + "learning_rate": 4.960468245416032e-05, + "loss": 0.873, + "step": 3358 + }, + { + "epoch": 0.05955941982743722, + "grad_norm": 2.734375, + "learning_rate": 4.9604187605026845e-05, + "loss": 0.8933, + "step": 3360 + }, + { + "epoch": 0.059594871863048786, + "grad_norm": 3.265625, + "learning_rate": 4.960369244883823e-05, + "loss": 0.8775, + "step": 3362 + }, + { + "epoch": 0.05963032389866035, + "grad_norm": 2.828125, + "learning_rate": 4.960319698560066e-05, + "loss": 0.8348, + "step": 3364 + }, + { + "epoch": 0.05966577593427193, + "grad_norm": 2.53125, + "learning_rate": 4.960270121532031e-05, + "loss": 0.8505, + "step": 3366 + }, + { + "epoch": 0.059701227969883496, + "grad_norm": 2.84375, + "learning_rate": 4.960220513800339e-05, + "loss": 0.8609, + "step": 3368 + }, + { + "epoch": 0.05973668000549506, + "grad_norm": 2.796875, + "learning_rate": 4.960170875365606e-05, + "loss": 0.8479, + "step": 3370 + }, + { + "epoch": 0.05977213204110664, + "grad_norm": 2.765625, + "learning_rate": 4.960121206228453e-05, + "loss": 0.8407, + "step": 3372 + }, + { + "epoch": 0.059807584076718205, + "grad_norm": 3.046875, + "learning_rate": 4.9600715063895e-05, + "loss": 0.8408, + "step": 3374 + }, + { + "epoch": 0.05984303611232977, + "grad_norm": 2.765625, + "learning_rate": 4.960021775849367e-05, + "loss": 0.8333, + "step": 3376 + }, + { + "epoch": 0.05987848814794135, + "grad_norm": 2.875, + "learning_rate": 4.959972014608675e-05, + "loss": 0.8421, + "step": 3378 + }, + { + "epoch": 0.059913940183552915, + "grad_norm": 2.796875, + "learning_rate": 4.959922222668044e-05, + "loss": 0.8293, + "step": 3380 + }, + { + "epoch": 0.05994939221916448, + "grad_norm": 2.6875, + "learning_rate": 4.959872400028096e-05, + "loss": 0.8708, + "step": 3382 + }, + { + "epoch": 0.05998484425477606, + "grad_norm": 2.625, + "learning_rate": 4.959822546689453e-05, + "loss": 0.8583, + "step": 3384 + }, + { + "epoch": 0.060020296290387624, + "grad_norm": 2.78125, + "learning_rate": 4.959772662652737e-05, + "loss": 0.8624, + "step": 3386 + }, + { + "epoch": 0.06005574832599919, + "grad_norm": 2.96875, + "learning_rate": 4.959722747918571e-05, + "loss": 0.879, + "step": 3388 + }, + { + "epoch": 0.060091200361610767, + "grad_norm": 2.75, + "learning_rate": 4.9596728024875774e-05, + "loss": 0.8485, + "step": 3390 + }, + { + "epoch": 0.060126652397222334, + "grad_norm": 2.875, + "learning_rate": 4.959622826360378e-05, + "loss": 0.8543, + "step": 3392 + }, + { + "epoch": 0.0601621044328339, + "grad_norm": 2.96875, + "learning_rate": 4.9595728195375996e-05, + "loss": 0.8681, + "step": 3394 + }, + { + "epoch": 0.06019755646844547, + "grad_norm": 2.890625, + "learning_rate": 4.959522782019864e-05, + "loss": 0.8927, + "step": 3396 + }, + { + "epoch": 0.060233008504057044, + "grad_norm": 2.59375, + "learning_rate": 4.9594727138077967e-05, + "loss": 0.8581, + "step": 3398 + }, + { + "epoch": 0.06026846053966861, + "grad_norm": 2.625, + "learning_rate": 4.9594226149020226e-05, + "loss": 0.8652, + "step": 3400 + }, + { + "epoch": 0.06030391257528018, + "grad_norm": 2.890625, + "learning_rate": 4.959372485303165e-05, + "loss": 0.8713, + "step": 3402 + }, + { + "epoch": 0.06033936461089175, + "grad_norm": 2.96875, + "learning_rate": 4.9593223250118524e-05, + "loss": 0.8531, + "step": 3404 + }, + { + "epoch": 0.06037481664650332, + "grad_norm": 3.203125, + "learning_rate": 4.95927213402871e-05, + "loss": 0.869, + "step": 3406 + }, + { + "epoch": 0.06041026868211489, + "grad_norm": 2.734375, + "learning_rate": 4.959221912354362e-05, + "loss": 0.8642, + "step": 3408 + }, + { + "epoch": 0.06044572071772646, + "grad_norm": 2.734375, + "learning_rate": 4.959171659989438e-05, + "loss": 0.8605, + "step": 3410 + }, + { + "epoch": 0.06048117275333803, + "grad_norm": 2.703125, + "learning_rate": 4.959121376934563e-05, + "loss": 0.858, + "step": 3412 + }, + { + "epoch": 0.0605166247889496, + "grad_norm": 2.75, + "learning_rate": 4.959071063190366e-05, + "loss": 0.8314, + "step": 3414 + }, + { + "epoch": 0.06055207682456117, + "grad_norm": 2.84375, + "learning_rate": 4.959020718757474e-05, + "loss": 0.8443, + "step": 3416 + }, + { + "epoch": 0.06058752886017274, + "grad_norm": 2.609375, + "learning_rate": 4.9589703436365156e-05, + "loss": 0.8542, + "step": 3418 + }, + { + "epoch": 0.06062298089578431, + "grad_norm": 2.90625, + "learning_rate": 4.9589199378281194e-05, + "loss": 0.844, + "step": 3420 + }, + { + "epoch": 0.06065843293139588, + "grad_norm": 2.828125, + "learning_rate": 4.958869501332914e-05, + "loss": 0.8895, + "step": 3422 + }, + { + "epoch": 0.06069388496700745, + "grad_norm": 2.921875, + "learning_rate": 4.958819034151531e-05, + "loss": 0.8338, + "step": 3424 + }, + { + "epoch": 0.06072933700261902, + "grad_norm": 2.609375, + "learning_rate": 4.958768536284597e-05, + "loss": 0.8165, + "step": 3426 + }, + { + "epoch": 0.06076478903823059, + "grad_norm": 2.625, + "learning_rate": 4.9587180077327444e-05, + "loss": 0.7967, + "step": 3428 + }, + { + "epoch": 0.06080024107384216, + "grad_norm": 2.671875, + "learning_rate": 4.958667448496604e-05, + "loss": 0.8665, + "step": 3430 + }, + { + "epoch": 0.06083569310945373, + "grad_norm": 2.890625, + "learning_rate": 4.958616858576804e-05, + "loss": 0.8432, + "step": 3432 + }, + { + "epoch": 0.0608711451450653, + "grad_norm": 2.734375, + "learning_rate": 4.9585662379739796e-05, + "loss": 0.8507, + "step": 3434 + }, + { + "epoch": 0.06090659718067687, + "grad_norm": 2.921875, + "learning_rate": 4.95851558668876e-05, + "loss": 0.8548, + "step": 3436 + }, + { + "epoch": 0.06094204921628844, + "grad_norm": 2.78125, + "learning_rate": 4.958464904721778e-05, + "loss": 0.8712, + "step": 3438 + }, + { + "epoch": 0.060977501251900004, + "grad_norm": 2.9375, + "learning_rate": 4.9584141920736656e-05, + "loss": 0.8419, + "step": 3440 + }, + { + "epoch": 0.06101295328751158, + "grad_norm": 2.765625, + "learning_rate": 4.9583634487450565e-05, + "loss": 0.8738, + "step": 3442 + }, + { + "epoch": 0.061048405323123146, + "grad_norm": 2.6875, + "learning_rate": 4.9583126747365834e-05, + "loss": 0.8648, + "step": 3444 + }, + { + "epoch": 0.061083857358734714, + "grad_norm": 2.6875, + "learning_rate": 4.9582618700488805e-05, + "loss": 0.8645, + "step": 3446 + }, + { + "epoch": 0.06111930939434629, + "grad_norm": 2.8125, + "learning_rate": 4.9582110346825814e-05, + "loss": 0.8694, + "step": 3448 + }, + { + "epoch": 0.061154761429957856, + "grad_norm": 2.890625, + "learning_rate": 4.95816016863832e-05, + "loss": 0.8641, + "step": 3450 + }, + { + "epoch": 0.061190213465569424, + "grad_norm": 3.171875, + "learning_rate": 4.9581092719167324e-05, + "loss": 0.8866, + "step": 3452 + }, + { + "epoch": 0.061225665501181, + "grad_norm": 3.015625, + "learning_rate": 4.958058344518452e-05, + "loss": 0.895, + "step": 3454 + }, + { + "epoch": 0.061261117536792566, + "grad_norm": 2.609375, + "learning_rate": 4.958007386444117e-05, + "loss": 0.8642, + "step": 3456 + }, + { + "epoch": 0.06129656957240413, + "grad_norm": 2.46875, + "learning_rate": 4.957956397694361e-05, + "loss": 0.8568, + "step": 3458 + }, + { + "epoch": 0.06133202160801571, + "grad_norm": 3.015625, + "learning_rate": 4.957905378269821e-05, + "loss": 0.8839, + "step": 3460 + }, + { + "epoch": 0.061367473643627275, + "grad_norm": 2.703125, + "learning_rate": 4.9578543281711345e-05, + "loss": 0.842, + "step": 3462 + }, + { + "epoch": 0.06140292567923884, + "grad_norm": 2.578125, + "learning_rate": 4.9578032473989364e-05, + "loss": 0.8433, + "step": 3464 + }, + { + "epoch": 0.06143837771485042, + "grad_norm": 2.8125, + "learning_rate": 4.957752135953867e-05, + "loss": 0.8188, + "step": 3466 + }, + { + "epoch": 0.061473829750461985, + "grad_norm": 3.0, + "learning_rate": 4.9577009938365624e-05, + "loss": 0.8758, + "step": 3468 + }, + { + "epoch": 0.06150928178607355, + "grad_norm": 3.203125, + "learning_rate": 4.957649821047662e-05, + "loss": 0.8724, + "step": 3470 + }, + { + "epoch": 0.06154473382168513, + "grad_norm": 2.96875, + "learning_rate": 4.957598617587803e-05, + "loss": 0.8746, + "step": 3472 + }, + { + "epoch": 0.061580185857296695, + "grad_norm": 2.9375, + "learning_rate": 4.957547383457625e-05, + "loss": 0.8618, + "step": 3474 + }, + { + "epoch": 0.06161563789290826, + "grad_norm": 2.828125, + "learning_rate": 4.957496118657768e-05, + "loss": 0.8622, + "step": 3476 + }, + { + "epoch": 0.06165108992851984, + "grad_norm": 2.890625, + "learning_rate": 4.957444823188871e-05, + "loss": 0.8822, + "step": 3478 + }, + { + "epoch": 0.061686541964131404, + "grad_norm": 2.84375, + "learning_rate": 4.9573934970515744e-05, + "loss": 0.8869, + "step": 3480 + }, + { + "epoch": 0.06172199399974297, + "grad_norm": 2.84375, + "learning_rate": 4.957342140246519e-05, + "loss": 0.8593, + "step": 3482 + }, + { + "epoch": 0.06175744603535454, + "grad_norm": 2.65625, + "learning_rate": 4.957290752774346e-05, + "loss": 0.8421, + "step": 3484 + }, + { + "epoch": 0.061792898070966114, + "grad_norm": 2.78125, + "learning_rate": 4.957239334635696e-05, + "loss": 0.8195, + "step": 3486 + }, + { + "epoch": 0.06182835010657768, + "grad_norm": 2.71875, + "learning_rate": 4.95718788583121e-05, + "loss": 0.8544, + "step": 3488 + }, + { + "epoch": 0.06186380214218925, + "grad_norm": 2.6875, + "learning_rate": 4.957136406361532e-05, + "loss": 0.8591, + "step": 3490 + }, + { + "epoch": 0.06189925417780082, + "grad_norm": 2.421875, + "learning_rate": 4.957084896227303e-05, + "loss": 0.8311, + "step": 3492 + }, + { + "epoch": 0.06193470621341239, + "grad_norm": 3.125, + "learning_rate": 4.957033355429166e-05, + "loss": 0.8216, + "step": 3494 + }, + { + "epoch": 0.06197015824902396, + "grad_norm": 3.265625, + "learning_rate": 4.9569817839677646e-05, + "loss": 0.8608, + "step": 3496 + }, + { + "epoch": 0.06200561028463553, + "grad_norm": 2.953125, + "learning_rate": 4.956930181843742e-05, + "loss": 0.8534, + "step": 3498 + }, + { + "epoch": 0.0620410623202471, + "grad_norm": 2.8125, + "learning_rate": 4.956878549057743e-05, + "loss": 0.8135, + "step": 3500 + }, + { + "epoch": 0.06207651435585867, + "grad_norm": 2.640625, + "learning_rate": 4.956826885610412e-05, + "loss": 0.8465, + "step": 3502 + }, + { + "epoch": 0.06211196639147024, + "grad_norm": 2.6875, + "learning_rate": 4.9567751915023925e-05, + "loss": 0.8692, + "step": 3504 + }, + { + "epoch": 0.06214741842708181, + "grad_norm": 3.546875, + "learning_rate": 4.9567234667343305e-05, + "loss": 0.8903, + "step": 3506 + }, + { + "epoch": 0.06218287046269338, + "grad_norm": 2.84375, + "learning_rate": 4.9566717113068715e-05, + "loss": 0.8463, + "step": 3508 + }, + { + "epoch": 0.06221832249830495, + "grad_norm": 2.65625, + "learning_rate": 4.9566199252206605e-05, + "loss": 0.8344, + "step": 3510 + }, + { + "epoch": 0.06225377453391652, + "grad_norm": 2.546875, + "learning_rate": 4.956568108476345e-05, + "loss": 0.8576, + "step": 3512 + }, + { + "epoch": 0.06228922656952809, + "grad_norm": 2.890625, + "learning_rate": 4.956516261074571e-05, + "loss": 0.8552, + "step": 3514 + }, + { + "epoch": 0.06232467860513966, + "grad_norm": 2.953125, + "learning_rate": 4.956464383015986e-05, + "loss": 0.8446, + "step": 3516 + }, + { + "epoch": 0.06236013064075123, + "grad_norm": 2.71875, + "learning_rate": 4.956412474301237e-05, + "loss": 0.8456, + "step": 3518 + }, + { + "epoch": 0.0623955826763628, + "grad_norm": 2.78125, + "learning_rate": 4.9563605349309714e-05, + "loss": 0.8691, + "step": 3520 + }, + { + "epoch": 0.062431034711974365, + "grad_norm": 2.8125, + "learning_rate": 4.9563085649058395e-05, + "loss": 0.8493, + "step": 3522 + }, + { + "epoch": 0.06246648674758594, + "grad_norm": 2.9375, + "learning_rate": 4.956256564226487e-05, + "loss": 0.8059, + "step": 3524 + }, + { + "epoch": 0.06250193878319751, + "grad_norm": 2.640625, + "learning_rate": 4.9562045328935644e-05, + "loss": 0.8029, + "step": 3526 + }, + { + "epoch": 0.06253739081880907, + "grad_norm": 2.734375, + "learning_rate": 4.9561524709077215e-05, + "loss": 0.8365, + "step": 3528 + }, + { + "epoch": 0.06257284285442065, + "grad_norm": 2.703125, + "learning_rate": 4.9561003782696055e-05, + "loss": 0.8374, + "step": 3530 + }, + { + "epoch": 0.06260829489003222, + "grad_norm": 2.828125, + "learning_rate": 4.95604825497987e-05, + "loss": 0.8928, + "step": 3532 + }, + { + "epoch": 0.06264374692564378, + "grad_norm": 2.578125, + "learning_rate": 4.955996101039164e-05, + "loss": 0.8294, + "step": 3534 + }, + { + "epoch": 0.06267919896125536, + "grad_norm": 3.0, + "learning_rate": 4.955943916448137e-05, + "loss": 0.85, + "step": 3536 + }, + { + "epoch": 0.06271465099686693, + "grad_norm": 2.78125, + "learning_rate": 4.9558917012074425e-05, + "loss": 0.8115, + "step": 3538 + }, + { + "epoch": 0.0627501030324785, + "grad_norm": 3.0, + "learning_rate": 4.955839455317731e-05, + "loss": 0.8667, + "step": 3540 + }, + { + "epoch": 0.06278555506809007, + "grad_norm": 2.625, + "learning_rate": 4.955787178779654e-05, + "loss": 0.8485, + "step": 3542 + }, + { + "epoch": 0.06282100710370163, + "grad_norm": 3.109375, + "learning_rate": 4.9557348715938646e-05, + "loss": 0.8938, + "step": 3544 + }, + { + "epoch": 0.0628564591393132, + "grad_norm": 2.625, + "learning_rate": 4.9556825337610156e-05, + "loss": 0.8341, + "step": 3546 + }, + { + "epoch": 0.06289191117492478, + "grad_norm": 2.921875, + "learning_rate": 4.9556301652817604e-05, + "loss": 0.8989, + "step": 3548 + }, + { + "epoch": 0.06292736321053634, + "grad_norm": 2.90625, + "learning_rate": 4.955577766156752e-05, + "loss": 0.853, + "step": 3550 + }, + { + "epoch": 0.06296281524614791, + "grad_norm": 3.03125, + "learning_rate": 4.955525336386645e-05, + "loss": 0.8993, + "step": 3552 + }, + { + "epoch": 0.06299826728175949, + "grad_norm": 2.484375, + "learning_rate": 4.9554728759720925e-05, + "loss": 0.8373, + "step": 3554 + }, + { + "epoch": 0.06303371931737105, + "grad_norm": 2.703125, + "learning_rate": 4.95542038491375e-05, + "loss": 0.8574, + "step": 3556 + }, + { + "epoch": 0.06306917135298262, + "grad_norm": 3.0, + "learning_rate": 4.9553678632122724e-05, + "loss": 0.9052, + "step": 3558 + }, + { + "epoch": 0.0631046233885942, + "grad_norm": 3.0625, + "learning_rate": 4.955315310868316e-05, + "loss": 0.8823, + "step": 3560 + }, + { + "epoch": 0.06314007542420576, + "grad_norm": 3.046875, + "learning_rate": 4.955262727882536e-05, + "loss": 0.8426, + "step": 3562 + }, + { + "epoch": 0.06317552745981733, + "grad_norm": 2.65625, + "learning_rate": 4.9552101142555874e-05, + "loss": 0.8356, + "step": 3564 + }, + { + "epoch": 0.0632109794954289, + "grad_norm": 2.84375, + "learning_rate": 4.9551574699881285e-05, + "loss": 0.8112, + "step": 3566 + }, + { + "epoch": 0.06324643153104047, + "grad_norm": 2.734375, + "learning_rate": 4.955104795080816e-05, + "loss": 0.8955, + "step": 3568 + }, + { + "epoch": 0.06328188356665204, + "grad_norm": 2.921875, + "learning_rate": 4.955052089534308e-05, + "loss": 0.8834, + "step": 3570 + }, + { + "epoch": 0.06331733560226362, + "grad_norm": 2.5625, + "learning_rate": 4.9549993533492595e-05, + "loss": 0.8221, + "step": 3572 + }, + { + "epoch": 0.06335278763787518, + "grad_norm": 2.640625, + "learning_rate": 4.9549465865263314e-05, + "loss": 0.9124, + "step": 3574 + }, + { + "epoch": 0.06338823967348675, + "grad_norm": 2.671875, + "learning_rate": 4.954893789066181e-05, + "loss": 0.8662, + "step": 3576 + }, + { + "epoch": 0.06342369170909833, + "grad_norm": 3.0, + "learning_rate": 4.954840960969467e-05, + "loss": 0.8301, + "step": 3578 + }, + { + "epoch": 0.06345914374470989, + "grad_norm": 2.8125, + "learning_rate": 4.9547881022368495e-05, + "loss": 0.8467, + "step": 3580 + }, + { + "epoch": 0.06349459578032146, + "grad_norm": 2.90625, + "learning_rate": 4.954735212868988e-05, + "loss": 0.8899, + "step": 3582 + }, + { + "epoch": 0.06353004781593304, + "grad_norm": 2.96875, + "learning_rate": 4.954682292866542e-05, + "loss": 0.8437, + "step": 3584 + }, + { + "epoch": 0.0635654998515446, + "grad_norm": 3.0625, + "learning_rate": 4.9546293422301724e-05, + "loss": 0.8412, + "step": 3586 + }, + { + "epoch": 0.06360095188715617, + "grad_norm": 2.8125, + "learning_rate": 4.95457636096054e-05, + "loss": 0.7977, + "step": 3588 + }, + { + "epoch": 0.06363640392276775, + "grad_norm": 2.703125, + "learning_rate": 4.9545233490583057e-05, + "loss": 0.8696, + "step": 3590 + }, + { + "epoch": 0.0636718559583793, + "grad_norm": 2.453125, + "learning_rate": 4.954470306524131e-05, + "loss": 0.8215, + "step": 3592 + }, + { + "epoch": 0.06370730799399088, + "grad_norm": 2.796875, + "learning_rate": 4.954417233358678e-05, + "loss": 0.8238, + "step": 3594 + }, + { + "epoch": 0.06374276002960245, + "grad_norm": 2.796875, + "learning_rate": 4.9543641295626096e-05, + "loss": 0.8648, + "step": 3596 + }, + { + "epoch": 0.06377821206521402, + "grad_norm": 2.4375, + "learning_rate": 4.9543109951365886e-05, + "loss": 0.8195, + "step": 3598 + }, + { + "epoch": 0.06381366410082559, + "grad_norm": 2.640625, + "learning_rate": 4.954257830081276e-05, + "loss": 0.8593, + "step": 3600 + }, + { + "epoch": 0.06384911613643716, + "grad_norm": 3.03125, + "learning_rate": 4.954204634397338e-05, + "loss": 0.8781, + "step": 3602 + }, + { + "epoch": 0.06388456817204873, + "grad_norm": 2.84375, + "learning_rate": 4.9541514080854375e-05, + "loss": 0.8603, + "step": 3604 + }, + { + "epoch": 0.0639200202076603, + "grad_norm": 2.8125, + "learning_rate": 4.954098151146238e-05, + "loss": 0.8483, + "step": 3606 + }, + { + "epoch": 0.06395547224327187, + "grad_norm": 2.734375, + "learning_rate": 4.954044863580405e-05, + "loss": 0.8716, + "step": 3608 + }, + { + "epoch": 0.06399092427888343, + "grad_norm": 2.8125, + "learning_rate": 4.953991545388603e-05, + "loss": 0.8673, + "step": 3610 + }, + { + "epoch": 0.06402637631449501, + "grad_norm": 3.125, + "learning_rate": 4.953938196571498e-05, + "loss": 0.8617, + "step": 3612 + }, + { + "epoch": 0.06406182835010658, + "grad_norm": 3.109375, + "learning_rate": 4.953884817129755e-05, + "loss": 0.857, + "step": 3614 + }, + { + "epoch": 0.06409728038571814, + "grad_norm": 2.8125, + "learning_rate": 4.953831407064041e-05, + "loss": 0.853, + "step": 3616 + }, + { + "epoch": 0.06413273242132972, + "grad_norm": 2.921875, + "learning_rate": 4.9537779663750225e-05, + "loss": 0.8622, + "step": 3618 + }, + { + "epoch": 0.0641681844569413, + "grad_norm": 2.578125, + "learning_rate": 4.953724495063365e-05, + "loss": 0.8242, + "step": 3620 + }, + { + "epoch": 0.06420363649255285, + "grad_norm": 2.34375, + "learning_rate": 4.953670993129738e-05, + "loss": 0.8159, + "step": 3622 + }, + { + "epoch": 0.06423908852816443, + "grad_norm": 2.671875, + "learning_rate": 4.953617460574807e-05, + "loss": 0.8065, + "step": 3624 + }, + { + "epoch": 0.064274540563776, + "grad_norm": 2.84375, + "learning_rate": 4.9535638973992416e-05, + "loss": 0.8516, + "step": 3626 + }, + { + "epoch": 0.06430999259938756, + "grad_norm": 2.90625, + "learning_rate": 4.95351030360371e-05, + "loss": 0.8336, + "step": 3628 + }, + { + "epoch": 0.06434544463499914, + "grad_norm": 2.765625, + "learning_rate": 4.9534566791888804e-05, + "loss": 0.8426, + "step": 3630 + }, + { + "epoch": 0.0643808966706107, + "grad_norm": 2.9375, + "learning_rate": 4.953403024155423e-05, + "loss": 0.849, + "step": 3632 + }, + { + "epoch": 0.06441634870622227, + "grad_norm": 2.9375, + "learning_rate": 4.9533493385040067e-05, + "loss": 0.8788, + "step": 3634 + }, + { + "epoch": 0.06445180074183385, + "grad_norm": 2.65625, + "learning_rate": 4.9532956222353014e-05, + "loss": 0.8311, + "step": 3636 + }, + { + "epoch": 0.06448725277744541, + "grad_norm": 2.796875, + "learning_rate": 4.953241875349978e-05, + "loss": 0.8983, + "step": 3638 + }, + { + "epoch": 0.06452270481305698, + "grad_norm": 2.9375, + "learning_rate": 4.9531880978487065e-05, + "loss": 0.8764, + "step": 3640 + }, + { + "epoch": 0.06455815684866856, + "grad_norm": 2.765625, + "learning_rate": 4.953134289732159e-05, + "loss": 0.835, + "step": 3642 + }, + { + "epoch": 0.06459360888428012, + "grad_norm": 2.796875, + "learning_rate": 4.9530804510010065e-05, + "loss": 0.8383, + "step": 3644 + }, + { + "epoch": 0.06462906091989169, + "grad_norm": 2.859375, + "learning_rate": 4.953026581655921e-05, + "loss": 0.856, + "step": 3646 + }, + { + "epoch": 0.06466451295550327, + "grad_norm": 2.921875, + "learning_rate": 4.952972681697574e-05, + "loss": 0.8197, + "step": 3648 + }, + { + "epoch": 0.06469996499111483, + "grad_norm": 2.765625, + "learning_rate": 4.9529187511266395e-05, + "loss": 0.8752, + "step": 3650 + }, + { + "epoch": 0.0647354170267264, + "grad_norm": 2.84375, + "learning_rate": 4.9528647899437894e-05, + "loss": 0.8281, + "step": 3652 + }, + { + "epoch": 0.06477086906233798, + "grad_norm": 2.875, + "learning_rate": 4.9528107981496985e-05, + "loss": 0.8465, + "step": 3654 + }, + { + "epoch": 0.06480632109794954, + "grad_norm": 2.9375, + "learning_rate": 4.952756775745039e-05, + "loss": 0.8236, + "step": 3656 + }, + { + "epoch": 0.06484177313356111, + "grad_norm": 3.03125, + "learning_rate": 4.952702722730486e-05, + "loss": 0.8802, + "step": 3658 + }, + { + "epoch": 0.06487722516917269, + "grad_norm": 3.0625, + "learning_rate": 4.952648639106714e-05, + "loss": 0.8799, + "step": 3660 + }, + { + "epoch": 0.06491267720478425, + "grad_norm": 2.796875, + "learning_rate": 4.9525945248743974e-05, + "loss": 0.8126, + "step": 3662 + }, + { + "epoch": 0.06494812924039582, + "grad_norm": 2.453125, + "learning_rate": 4.952540380034212e-05, + "loss": 0.842, + "step": 3664 + }, + { + "epoch": 0.0649835812760074, + "grad_norm": 2.765625, + "learning_rate": 4.952486204586834e-05, + "loss": 0.8368, + "step": 3666 + }, + { + "epoch": 0.06501903331161896, + "grad_norm": 2.765625, + "learning_rate": 4.952431998532939e-05, + "loss": 0.8623, + "step": 3668 + }, + { + "epoch": 0.06505448534723053, + "grad_norm": 2.8125, + "learning_rate": 4.952377761873203e-05, + "loss": 0.8137, + "step": 3670 + }, + { + "epoch": 0.0650899373828421, + "grad_norm": 2.828125, + "learning_rate": 4.952323494608303e-05, + "loss": 0.8041, + "step": 3672 + }, + { + "epoch": 0.06512538941845367, + "grad_norm": 2.6875, + "learning_rate": 4.9522691967389175e-05, + "loss": 0.8597, + "step": 3674 + }, + { + "epoch": 0.06516084145406524, + "grad_norm": 2.875, + "learning_rate": 4.952214868265723e-05, + "loss": 0.8554, + "step": 3676 + }, + { + "epoch": 0.06519629348967682, + "grad_norm": 2.921875, + "learning_rate": 4.952160509189397e-05, + "loss": 0.8432, + "step": 3678 + }, + { + "epoch": 0.06523174552528838, + "grad_norm": 2.953125, + "learning_rate": 4.952106119510619e-05, + "loss": 0.8919, + "step": 3680 + }, + { + "epoch": 0.06526719756089995, + "grad_norm": 2.9375, + "learning_rate": 4.9520516992300675e-05, + "loss": 0.862, + "step": 3682 + }, + { + "epoch": 0.06530264959651153, + "grad_norm": 2.5625, + "learning_rate": 4.9519972483484214e-05, + "loss": 0.8214, + "step": 3684 + }, + { + "epoch": 0.06533810163212309, + "grad_norm": 2.71875, + "learning_rate": 4.9519427668663603e-05, + "loss": 0.8427, + "step": 3686 + }, + { + "epoch": 0.06537355366773466, + "grad_norm": 2.953125, + "learning_rate": 4.9518882547845645e-05, + "loss": 0.8962, + "step": 3688 + }, + { + "epoch": 0.06540900570334623, + "grad_norm": 2.921875, + "learning_rate": 4.951833712103714e-05, + "loss": 0.8666, + "step": 3690 + }, + { + "epoch": 0.0654444577389578, + "grad_norm": 2.71875, + "learning_rate": 4.951779138824489e-05, + "loss": 0.8367, + "step": 3692 + }, + { + "epoch": 0.06547990977456937, + "grad_norm": 2.703125, + "learning_rate": 4.951724534947571e-05, + "loss": 0.8496, + "step": 3694 + }, + { + "epoch": 0.06551536181018094, + "grad_norm": 2.859375, + "learning_rate": 4.9516699004736415e-05, + "loss": 0.878, + "step": 3696 + }, + { + "epoch": 0.0655508138457925, + "grad_norm": 2.421875, + "learning_rate": 4.9516152354033826e-05, + "loss": 0.8568, + "step": 3698 + }, + { + "epoch": 0.06558626588140408, + "grad_norm": 3.09375, + "learning_rate": 4.9515605397374765e-05, + "loss": 0.8316, + "step": 3700 + }, + { + "epoch": 0.06562171791701565, + "grad_norm": 2.65625, + "learning_rate": 4.951505813476605e-05, + "loss": 0.8079, + "step": 3702 + }, + { + "epoch": 0.06565716995262721, + "grad_norm": 2.5, + "learning_rate": 4.951451056621451e-05, + "loss": 0.9063, + "step": 3704 + }, + { + "epoch": 0.06569262198823879, + "grad_norm": 2.890625, + "learning_rate": 4.9513962691726986e-05, + "loss": 0.8466, + "step": 3706 + }, + { + "epoch": 0.06572807402385036, + "grad_norm": 2.796875, + "learning_rate": 4.9513414511310325e-05, + "loss": 0.917, + "step": 3708 + }, + { + "epoch": 0.06576352605946192, + "grad_norm": 2.75, + "learning_rate": 4.951286602497135e-05, + "loss": 0.8531, + "step": 3710 + }, + { + "epoch": 0.0657989780950735, + "grad_norm": 2.828125, + "learning_rate": 4.951231723271691e-05, + "loss": 0.8474, + "step": 3712 + }, + { + "epoch": 0.06583443013068506, + "grad_norm": 2.4375, + "learning_rate": 4.951176813455386e-05, + "loss": 0.839, + "step": 3714 + }, + { + "epoch": 0.06586988216629663, + "grad_norm": 2.6875, + "learning_rate": 4.951121873048905e-05, + "loss": 0.8619, + "step": 3716 + }, + { + "epoch": 0.06590533420190821, + "grad_norm": 2.90625, + "learning_rate": 4.951066902052933e-05, + "loss": 0.8007, + "step": 3718 + }, + { + "epoch": 0.06594078623751977, + "grad_norm": 3.21875, + "learning_rate": 4.951011900468157e-05, + "loss": 0.9208, + "step": 3720 + }, + { + "epoch": 0.06597623827313134, + "grad_norm": 2.96875, + "learning_rate": 4.9509568682952627e-05, + "loss": 0.8499, + "step": 3722 + }, + { + "epoch": 0.06601169030874292, + "grad_norm": 2.875, + "learning_rate": 4.9509018055349374e-05, + "loss": 0.8741, + "step": 3724 + }, + { + "epoch": 0.06604714234435448, + "grad_norm": 2.828125, + "learning_rate": 4.950846712187868e-05, + "loss": 0.8479, + "step": 3726 + }, + { + "epoch": 0.06608259437996605, + "grad_norm": 2.734375, + "learning_rate": 4.950791588254742e-05, + "loss": 0.8415, + "step": 3728 + }, + { + "epoch": 0.06611804641557763, + "grad_norm": 3.03125, + "learning_rate": 4.950736433736248e-05, + "loss": 0.9057, + "step": 3730 + }, + { + "epoch": 0.06615349845118919, + "grad_norm": 2.96875, + "learning_rate": 4.9506812486330734e-05, + "loss": 0.8189, + "step": 3732 + }, + { + "epoch": 0.06618895048680076, + "grad_norm": 2.828125, + "learning_rate": 4.950626032945907e-05, + "loss": 0.884, + "step": 3734 + }, + { + "epoch": 0.06622440252241234, + "grad_norm": 2.859375, + "learning_rate": 4.950570786675438e-05, + "loss": 0.8709, + "step": 3736 + }, + { + "epoch": 0.0662598545580239, + "grad_norm": 2.796875, + "learning_rate": 4.9505155098223565e-05, + "loss": 0.845, + "step": 3738 + }, + { + "epoch": 0.06629530659363547, + "grad_norm": 2.8125, + "learning_rate": 4.9504602023873514e-05, + "loss": 0.8461, + "step": 3740 + }, + { + "epoch": 0.06633075862924705, + "grad_norm": 3.125, + "learning_rate": 4.950404864371114e-05, + "loss": 0.8842, + "step": 3742 + }, + { + "epoch": 0.06636621066485861, + "grad_norm": 2.796875, + "learning_rate": 4.950349495774333e-05, + "loss": 0.8245, + "step": 3744 + }, + { + "epoch": 0.06640166270047018, + "grad_norm": 2.796875, + "learning_rate": 4.9502940965977026e-05, + "loss": 0.8499, + "step": 3746 + }, + { + "epoch": 0.06643711473608176, + "grad_norm": 2.8125, + "learning_rate": 4.950238666841911e-05, + "loss": 0.8319, + "step": 3748 + }, + { + "epoch": 0.06647256677169332, + "grad_norm": 3.140625, + "learning_rate": 4.950183206507651e-05, + "loss": 0.8837, + "step": 3750 + }, + { + "epoch": 0.06650801880730489, + "grad_norm": 2.65625, + "learning_rate": 4.9501277155956164e-05, + "loss": 0.8457, + "step": 3752 + }, + { + "epoch": 0.06654347084291647, + "grad_norm": 2.4375, + "learning_rate": 4.9500721941064964e-05, + "loss": 0.8314, + "step": 3754 + }, + { + "epoch": 0.06657892287852803, + "grad_norm": 3.09375, + "learning_rate": 4.9500166420409866e-05, + "loss": 0.8802, + "step": 3756 + }, + { + "epoch": 0.0666143749141396, + "grad_norm": 2.796875, + "learning_rate": 4.949961059399779e-05, + "loss": 0.8275, + "step": 3758 + }, + { + "epoch": 0.06664982694975118, + "grad_norm": 2.75, + "learning_rate": 4.9499054461835684e-05, + "loss": 0.8344, + "step": 3760 + }, + { + "epoch": 0.06668527898536274, + "grad_norm": 2.953125, + "learning_rate": 4.949849802393047e-05, + "loss": 0.8534, + "step": 3762 + }, + { + "epoch": 0.06672073102097431, + "grad_norm": 3.015625, + "learning_rate": 4.9497941280289116e-05, + "loss": 0.8407, + "step": 3764 + }, + { + "epoch": 0.06675618305658589, + "grad_norm": 2.921875, + "learning_rate": 4.949738423091855e-05, + "loss": 0.8525, + "step": 3766 + }, + { + "epoch": 0.06679163509219745, + "grad_norm": 2.8125, + "learning_rate": 4.949682687582573e-05, + "loss": 0.8505, + "step": 3768 + }, + { + "epoch": 0.06682708712780902, + "grad_norm": 2.609375, + "learning_rate": 4.9496269215017624e-05, + "loss": 0.829, + "step": 3770 + }, + { + "epoch": 0.0668625391634206, + "grad_norm": 2.890625, + "learning_rate": 4.949571124850116e-05, + "loss": 0.8904, + "step": 3772 + }, + { + "epoch": 0.06689799119903216, + "grad_norm": 2.828125, + "learning_rate": 4.949515297628334e-05, + "loss": 0.8788, + "step": 3774 + }, + { + "epoch": 0.06693344323464373, + "grad_norm": 2.765625, + "learning_rate": 4.949459439837111e-05, + "loss": 0.817, + "step": 3776 + }, + { + "epoch": 0.0669688952702553, + "grad_norm": 2.625, + "learning_rate": 4.949403551477144e-05, + "loss": 0.8703, + "step": 3778 + }, + { + "epoch": 0.06700434730586687, + "grad_norm": 2.515625, + "learning_rate": 4.9493476325491306e-05, + "loss": 0.8258, + "step": 3780 + }, + { + "epoch": 0.06703979934147844, + "grad_norm": 2.71875, + "learning_rate": 4.949291683053769e-05, + "loss": 0.859, + "step": 3782 + }, + { + "epoch": 0.06707525137709001, + "grad_norm": 2.90625, + "learning_rate": 4.949235702991757e-05, + "loss": 0.8668, + "step": 3784 + }, + { + "epoch": 0.06711070341270158, + "grad_norm": 2.859375, + "learning_rate": 4.9491796923637945e-05, + "loss": 0.8673, + "step": 3786 + }, + { + "epoch": 0.06714615544831315, + "grad_norm": 3.0625, + "learning_rate": 4.949123651170579e-05, + "loss": 0.8853, + "step": 3788 + }, + { + "epoch": 0.06718160748392472, + "grad_norm": 2.984375, + "learning_rate": 4.9490675794128105e-05, + "loss": 0.8029, + "step": 3790 + }, + { + "epoch": 0.06721705951953628, + "grad_norm": 3.0625, + "learning_rate": 4.9490114770911886e-05, + "loss": 0.858, + "step": 3792 + }, + { + "epoch": 0.06725251155514786, + "grad_norm": 2.75, + "learning_rate": 4.948955344206414e-05, + "loss": 0.8789, + "step": 3794 + }, + { + "epoch": 0.06728796359075943, + "grad_norm": 2.75, + "learning_rate": 4.948899180759187e-05, + "loss": 0.8451, + "step": 3796 + }, + { + "epoch": 0.067323415626371, + "grad_norm": 2.796875, + "learning_rate": 4.948842986750207e-05, + "loss": 0.8629, + "step": 3798 + }, + { + "epoch": 0.06735886766198257, + "grad_norm": 2.8125, + "learning_rate": 4.948786762180178e-05, + "loss": 0.8486, + "step": 3800 + }, + { + "epoch": 0.06739431969759413, + "grad_norm": 2.890625, + "learning_rate": 4.9487305070498e-05, + "loss": 0.8094, + "step": 3802 + }, + { + "epoch": 0.0674297717332057, + "grad_norm": 2.71875, + "learning_rate": 4.9486742213597745e-05, + "loss": 0.8793, + "step": 3804 + }, + { + "epoch": 0.06746522376881728, + "grad_norm": 2.65625, + "learning_rate": 4.9486179051108054e-05, + "loss": 0.8378, + "step": 3806 + }, + { + "epoch": 0.06750067580442884, + "grad_norm": 2.703125, + "learning_rate": 4.9485615583035946e-05, + "loss": 0.8348, + "step": 3808 + }, + { + "epoch": 0.06753612784004041, + "grad_norm": 2.5625, + "learning_rate": 4.948505180938846e-05, + "loss": 0.83, + "step": 3810 + }, + { + "epoch": 0.06757157987565199, + "grad_norm": 2.671875, + "learning_rate": 4.9484487730172624e-05, + "loss": 0.8492, + "step": 3812 + }, + { + "epoch": 0.06760703191126355, + "grad_norm": 2.875, + "learning_rate": 4.948392334539548e-05, + "loss": 0.9011, + "step": 3814 + }, + { + "epoch": 0.06764248394687512, + "grad_norm": 2.828125, + "learning_rate": 4.948335865506407e-05, + "loss": 0.8411, + "step": 3816 + }, + { + "epoch": 0.0676779359824867, + "grad_norm": 2.8125, + "learning_rate": 4.948279365918544e-05, + "loss": 0.8459, + "step": 3818 + }, + { + "epoch": 0.06771338801809826, + "grad_norm": 2.671875, + "learning_rate": 4.948222835776666e-05, + "loss": 0.8186, + "step": 3820 + }, + { + "epoch": 0.06774884005370983, + "grad_norm": 2.90625, + "learning_rate": 4.948166275081476e-05, + "loss": 0.8643, + "step": 3822 + }, + { + "epoch": 0.06778429208932141, + "grad_norm": 3.125, + "learning_rate": 4.9481096838336804e-05, + "loss": 0.8403, + "step": 3824 + }, + { + "epoch": 0.06781974412493297, + "grad_norm": 2.859375, + "learning_rate": 4.948053062033986e-05, + "loss": 0.8526, + "step": 3826 + }, + { + "epoch": 0.06785519616054454, + "grad_norm": 2.890625, + "learning_rate": 4.9479964096831e-05, + "loss": 0.8543, + "step": 3828 + }, + { + "epoch": 0.06789064819615612, + "grad_norm": 2.734375, + "learning_rate": 4.947939726781729e-05, + "loss": 0.8389, + "step": 3830 + }, + { + "epoch": 0.06792610023176768, + "grad_norm": 2.6875, + "learning_rate": 4.947883013330579e-05, + "loss": 0.8496, + "step": 3832 + }, + { + "epoch": 0.06796155226737925, + "grad_norm": 2.59375, + "learning_rate": 4.947826269330359e-05, + "loss": 0.8223, + "step": 3834 + }, + { + "epoch": 0.06799700430299083, + "grad_norm": 2.625, + "learning_rate": 4.947769494781777e-05, + "loss": 0.8621, + "step": 3836 + }, + { + "epoch": 0.06803245633860239, + "grad_norm": 2.703125, + "learning_rate": 4.947712689685542e-05, + "loss": 0.903, + "step": 3838 + }, + { + "epoch": 0.06806790837421396, + "grad_norm": 2.59375, + "learning_rate": 4.947655854042362e-05, + "loss": 0.845, + "step": 3840 + }, + { + "epoch": 0.06810336040982554, + "grad_norm": 2.828125, + "learning_rate": 4.947598987852947e-05, + "loss": 0.858, + "step": 3842 + }, + { + "epoch": 0.0681388124454371, + "grad_norm": 2.625, + "learning_rate": 4.947542091118006e-05, + "loss": 0.8404, + "step": 3844 + }, + { + "epoch": 0.06817426448104867, + "grad_norm": 2.84375, + "learning_rate": 4.9474851638382504e-05, + "loss": 0.8334, + "step": 3846 + }, + { + "epoch": 0.06820971651666025, + "grad_norm": 2.96875, + "learning_rate": 4.9474282060143885e-05, + "loss": 0.8613, + "step": 3848 + }, + { + "epoch": 0.0682451685522718, + "grad_norm": 2.984375, + "learning_rate": 4.947371217647133e-05, + "loss": 0.8584, + "step": 3850 + }, + { + "epoch": 0.06828062058788338, + "grad_norm": 3.046875, + "learning_rate": 4.947314198737195e-05, + "loss": 0.8718, + "step": 3852 + }, + { + "epoch": 0.06831607262349496, + "grad_norm": 2.953125, + "learning_rate": 4.947257149285285e-05, + "loss": 0.8363, + "step": 3854 + }, + { + "epoch": 0.06835152465910652, + "grad_norm": 2.609375, + "learning_rate": 4.947200069292115e-05, + "loss": 0.8623, + "step": 3856 + }, + { + "epoch": 0.06838697669471809, + "grad_norm": 2.84375, + "learning_rate": 4.9471429587583985e-05, + "loss": 0.8581, + "step": 3858 + }, + { + "epoch": 0.06842242873032967, + "grad_norm": 2.625, + "learning_rate": 4.947085817684848e-05, + "loss": 0.85, + "step": 3860 + }, + { + "epoch": 0.06845788076594123, + "grad_norm": 2.765625, + "learning_rate": 4.947028646072175e-05, + "loss": 0.8351, + "step": 3862 + }, + { + "epoch": 0.0684933328015528, + "grad_norm": 2.9375, + "learning_rate": 4.9469714439210954e-05, + "loss": 0.8576, + "step": 3864 + }, + { + "epoch": 0.06852878483716437, + "grad_norm": 2.625, + "learning_rate": 4.946914211232321e-05, + "loss": 0.8174, + "step": 3866 + }, + { + "epoch": 0.06856423687277594, + "grad_norm": 2.921875, + "learning_rate": 4.946856948006567e-05, + "loss": 0.8789, + "step": 3868 + }, + { + "epoch": 0.06859968890838751, + "grad_norm": 2.828125, + "learning_rate": 4.946799654244548e-05, + "loss": 0.8533, + "step": 3870 + }, + { + "epoch": 0.06863514094399908, + "grad_norm": 2.8125, + "learning_rate": 4.9467423299469796e-05, + "loss": 0.8777, + "step": 3872 + }, + { + "epoch": 0.06867059297961065, + "grad_norm": 2.796875, + "learning_rate": 4.9466849751145754e-05, + "loss": 0.8299, + "step": 3874 + }, + { + "epoch": 0.06870604501522222, + "grad_norm": 3.078125, + "learning_rate": 4.946627589748053e-05, + "loss": 0.8746, + "step": 3876 + }, + { + "epoch": 0.0687414970508338, + "grad_norm": 2.8125, + "learning_rate": 4.9465701738481276e-05, + "loss": 0.8488, + "step": 3878 + }, + { + "epoch": 0.06877694908644535, + "grad_norm": 2.90625, + "learning_rate": 4.9465127274155165e-05, + "loss": 0.8468, + "step": 3880 + }, + { + "epoch": 0.06881240112205693, + "grad_norm": 2.953125, + "learning_rate": 4.9464552504509353e-05, + "loss": 0.8292, + "step": 3882 + }, + { + "epoch": 0.0688478531576685, + "grad_norm": 2.8125, + "learning_rate": 4.946397742955103e-05, + "loss": 0.8555, + "step": 3884 + }, + { + "epoch": 0.06888330519328006, + "grad_norm": 2.671875, + "learning_rate": 4.946340204928736e-05, + "loss": 0.857, + "step": 3886 + }, + { + "epoch": 0.06891875722889164, + "grad_norm": 3.203125, + "learning_rate": 4.946282636372553e-05, + "loss": 0.8918, + "step": 3888 + }, + { + "epoch": 0.0689542092645032, + "grad_norm": 2.6875, + "learning_rate": 4.946225037287272e-05, + "loss": 0.8305, + "step": 3890 + }, + { + "epoch": 0.06898966130011477, + "grad_norm": 2.875, + "learning_rate": 4.946167407673612e-05, + "loss": 0.9015, + "step": 3892 + }, + { + "epoch": 0.06902511333572635, + "grad_norm": 2.921875, + "learning_rate": 4.9461097475322925e-05, + "loss": 0.8525, + "step": 3894 + }, + { + "epoch": 0.06906056537133791, + "grad_norm": 2.671875, + "learning_rate": 4.946052056864032e-05, + "loss": 0.8384, + "step": 3896 + }, + { + "epoch": 0.06909601740694948, + "grad_norm": 2.65625, + "learning_rate": 4.945994335669552e-05, + "loss": 0.7948, + "step": 3898 + }, + { + "epoch": 0.06913146944256106, + "grad_norm": 3.25, + "learning_rate": 4.945936583949573e-05, + "loss": 0.8752, + "step": 3900 + }, + { + "epoch": 0.06916692147817262, + "grad_norm": 2.671875, + "learning_rate": 4.945878801704814e-05, + "loss": 0.8423, + "step": 3902 + }, + { + "epoch": 0.0692023735137842, + "grad_norm": 2.390625, + "learning_rate": 4.945820988935997e-05, + "loss": 0.7475, + "step": 3904 + }, + { + "epoch": 0.06923782554939577, + "grad_norm": 3.109375, + "learning_rate": 4.945763145643844e-05, + "loss": 0.8427, + "step": 3906 + }, + { + "epoch": 0.06927327758500733, + "grad_norm": 2.59375, + "learning_rate": 4.9457052718290756e-05, + "loss": 0.8373, + "step": 3908 + }, + { + "epoch": 0.0693087296206189, + "grad_norm": 2.953125, + "learning_rate": 4.945647367492415e-05, + "loss": 0.8304, + "step": 3910 + }, + { + "epoch": 0.06934418165623048, + "grad_norm": 2.765625, + "learning_rate": 4.945589432634584e-05, + "loss": 0.8281, + "step": 3912 + }, + { + "epoch": 0.06937963369184204, + "grad_norm": 2.8125, + "learning_rate": 4.945531467256307e-05, + "loss": 0.8817, + "step": 3914 + }, + { + "epoch": 0.06941508572745361, + "grad_norm": 2.578125, + "learning_rate": 4.9454734713583075e-05, + "loss": 0.8514, + "step": 3916 + }, + { + "epoch": 0.06945053776306519, + "grad_norm": 2.890625, + "learning_rate": 4.945415444941307e-05, + "loss": 0.8647, + "step": 3918 + }, + { + "epoch": 0.06948598979867675, + "grad_norm": 2.90625, + "learning_rate": 4.9453573880060324e-05, + "loss": 0.8591, + "step": 3920 + }, + { + "epoch": 0.06952144183428832, + "grad_norm": 2.734375, + "learning_rate": 4.945299300553206e-05, + "loss": 0.8554, + "step": 3922 + }, + { + "epoch": 0.0695568938698999, + "grad_norm": 2.78125, + "learning_rate": 4.945241182583554e-05, + "loss": 0.8347, + "step": 3924 + }, + { + "epoch": 0.06959234590551146, + "grad_norm": 2.640625, + "learning_rate": 4.9451830340978014e-05, + "loss": 0.8306, + "step": 3926 + }, + { + "epoch": 0.06962779794112303, + "grad_norm": 2.765625, + "learning_rate": 4.945124855096673e-05, + "loss": 0.8073, + "step": 3928 + }, + { + "epoch": 0.0696632499767346, + "grad_norm": 2.734375, + "learning_rate": 4.9450666455808965e-05, + "loss": 0.8574, + "step": 3930 + }, + { + "epoch": 0.06969870201234617, + "grad_norm": 3.03125, + "learning_rate": 4.945008405551197e-05, + "loss": 0.8346, + "step": 3932 + }, + { + "epoch": 0.06973415404795774, + "grad_norm": 2.78125, + "learning_rate": 4.9449501350083024e-05, + "loss": 0.8432, + "step": 3934 + }, + { + "epoch": 0.06976960608356932, + "grad_norm": 2.875, + "learning_rate": 4.944891833952939e-05, + "loss": 0.8358, + "step": 3936 + }, + { + "epoch": 0.06980505811918088, + "grad_norm": 2.734375, + "learning_rate": 4.944833502385835e-05, + "loss": 0.8209, + "step": 3938 + }, + { + "epoch": 0.06984051015479245, + "grad_norm": 2.84375, + "learning_rate": 4.944775140307718e-05, + "loss": 0.8509, + "step": 3940 + }, + { + "epoch": 0.06987596219040403, + "grad_norm": 2.828125, + "learning_rate": 4.944716747719317e-05, + "loss": 0.8755, + "step": 3942 + }, + { + "epoch": 0.06991141422601559, + "grad_norm": 3.15625, + "learning_rate": 4.9446583246213594e-05, + "loss": 0.8793, + "step": 3944 + }, + { + "epoch": 0.06994686626162716, + "grad_norm": 2.625, + "learning_rate": 4.944599871014576e-05, + "loss": 0.8635, + "step": 3946 + }, + { + "epoch": 0.06998231829723874, + "grad_norm": 2.515625, + "learning_rate": 4.944541386899694e-05, + "loss": 0.8418, + "step": 3948 + }, + { + "epoch": 0.0700177703328503, + "grad_norm": 2.671875, + "learning_rate": 4.9444828722774455e-05, + "loss": 0.8428, + "step": 3950 + }, + { + "epoch": 0.07005322236846187, + "grad_norm": 2.9375, + "learning_rate": 4.94442432714856e-05, + "loss": 0.9022, + "step": 3952 + }, + { + "epoch": 0.07008867440407344, + "grad_norm": 2.9375, + "learning_rate": 4.9443657515137674e-05, + "loss": 0.836, + "step": 3954 + }, + { + "epoch": 0.070124126439685, + "grad_norm": 2.828125, + "learning_rate": 4.9443071453738e-05, + "loss": 0.8242, + "step": 3956 + }, + { + "epoch": 0.07015957847529658, + "grad_norm": 2.703125, + "learning_rate": 4.9442485087293886e-05, + "loss": 0.8566, + "step": 3958 + }, + { + "epoch": 0.07019503051090815, + "grad_norm": 2.96875, + "learning_rate": 4.944189841581265e-05, + "loss": 0.8319, + "step": 3960 + }, + { + "epoch": 0.07023048254651972, + "grad_norm": 2.640625, + "learning_rate": 4.944131143930161e-05, + "loss": 0.8564, + "step": 3962 + }, + { + "epoch": 0.07026593458213129, + "grad_norm": 2.6875, + "learning_rate": 4.944072415776809e-05, + "loss": 0.862, + "step": 3964 + }, + { + "epoch": 0.07030138661774286, + "grad_norm": 3.015625, + "learning_rate": 4.944013657121942e-05, + "loss": 0.8417, + "step": 3966 + }, + { + "epoch": 0.07033683865335442, + "grad_norm": 2.828125, + "learning_rate": 4.943954867966295e-05, + "loss": 0.845, + "step": 3968 + }, + { + "epoch": 0.070372290688966, + "grad_norm": 2.765625, + "learning_rate": 4.943896048310599e-05, + "loss": 0.8121, + "step": 3970 + }, + { + "epoch": 0.07040774272457756, + "grad_norm": 2.828125, + "learning_rate": 4.94383719815559e-05, + "loss": 0.8761, + "step": 3972 + }, + { + "epoch": 0.07044319476018913, + "grad_norm": 3.421875, + "learning_rate": 4.9437783175020015e-05, + "loss": 0.8621, + "step": 3974 + }, + { + "epoch": 0.07047864679580071, + "grad_norm": 3.03125, + "learning_rate": 4.943719406350569e-05, + "loss": 0.8443, + "step": 3976 + }, + { + "epoch": 0.07051409883141227, + "grad_norm": 2.75, + "learning_rate": 4.943660464702027e-05, + "loss": 0.8103, + "step": 3978 + }, + { + "epoch": 0.07054955086702384, + "grad_norm": 2.5625, + "learning_rate": 4.943601492557112e-05, + "loss": 0.8744, + "step": 3980 + }, + { + "epoch": 0.07058500290263542, + "grad_norm": 2.859375, + "learning_rate": 4.9435424899165586e-05, + "loss": 0.8371, + "step": 3982 + }, + { + "epoch": 0.07062045493824698, + "grad_norm": 3.234375, + "learning_rate": 4.943483456781104e-05, + "loss": 0.8318, + "step": 3984 + }, + { + "epoch": 0.07065590697385855, + "grad_norm": 2.859375, + "learning_rate": 4.943424393151485e-05, + "loss": 0.8621, + "step": 3986 + }, + { + "epoch": 0.07069135900947013, + "grad_norm": 2.90625, + "learning_rate": 4.9433652990284375e-05, + "loss": 0.8897, + "step": 3988 + }, + { + "epoch": 0.07072681104508169, + "grad_norm": 2.875, + "learning_rate": 4.943306174412701e-05, + "loss": 0.8527, + "step": 3990 + }, + { + "epoch": 0.07076226308069326, + "grad_norm": 2.8125, + "learning_rate": 4.943247019305012e-05, + "loss": 0.831, + "step": 3992 + }, + { + "epoch": 0.07079771511630484, + "grad_norm": 2.859375, + "learning_rate": 4.943187833706109e-05, + "loss": 0.8465, + "step": 3994 + }, + { + "epoch": 0.0708331671519164, + "grad_norm": 2.875, + "learning_rate": 4.943128617616731e-05, + "loss": 0.865, + "step": 3996 + }, + { + "epoch": 0.07086861918752797, + "grad_norm": 2.9375, + "learning_rate": 4.943069371037618e-05, + "loss": 0.8672, + "step": 3998 + }, + { + "epoch": 0.07090407122313955, + "grad_norm": 2.953125, + "learning_rate": 4.943010093969506e-05, + "loss": 0.833, + "step": 4000 + }, + { + "epoch": 0.07093952325875111, + "grad_norm": 2.890625, + "learning_rate": 4.9429507864131375e-05, + "loss": 0.8364, + "step": 4002 + }, + { + "epoch": 0.07097497529436268, + "grad_norm": 2.765625, + "learning_rate": 4.942891448369252e-05, + "loss": 0.8505, + "step": 4004 + }, + { + "epoch": 0.07101042732997426, + "grad_norm": 2.984375, + "learning_rate": 4.942832079838591e-05, + "loss": 0.8213, + "step": 4006 + }, + { + "epoch": 0.07104587936558582, + "grad_norm": 3.0, + "learning_rate": 4.9427726808218935e-05, + "loss": 0.8397, + "step": 4008 + }, + { + "epoch": 0.07108133140119739, + "grad_norm": 3.390625, + "learning_rate": 4.9427132513199015e-05, + "loss": 0.8628, + "step": 4010 + }, + { + "epoch": 0.07111678343680897, + "grad_norm": 2.84375, + "learning_rate": 4.942653791333357e-05, + "loss": 0.8477, + "step": 4012 + }, + { + "epoch": 0.07115223547242053, + "grad_norm": 2.765625, + "learning_rate": 4.942594300863003e-05, + "loss": 0.9068, + "step": 4014 + }, + { + "epoch": 0.0711876875080321, + "grad_norm": 2.90625, + "learning_rate": 4.94253477990958e-05, + "loss": 0.8385, + "step": 4016 + }, + { + "epoch": 0.07122313954364368, + "grad_norm": 2.6875, + "learning_rate": 4.942475228473832e-05, + "loss": 0.8585, + "step": 4018 + }, + { + "epoch": 0.07125859157925524, + "grad_norm": 2.875, + "learning_rate": 4.942415646556501e-05, + "loss": 0.8674, + "step": 4020 + }, + { + "epoch": 0.07129404361486681, + "grad_norm": 2.78125, + "learning_rate": 4.9423560341583325e-05, + "loss": 0.841, + "step": 4022 + }, + { + "epoch": 0.07132949565047839, + "grad_norm": 2.953125, + "learning_rate": 4.942296391280069e-05, + "loss": 0.8507, + "step": 4024 + }, + { + "epoch": 0.07136494768608995, + "grad_norm": 2.890625, + "learning_rate": 4.9422367179224555e-05, + "loss": 0.8421, + "step": 4026 + }, + { + "epoch": 0.07140039972170152, + "grad_norm": 2.8125, + "learning_rate": 4.942177014086236e-05, + "loss": 0.8639, + "step": 4028 + }, + { + "epoch": 0.0714358517573131, + "grad_norm": 2.65625, + "learning_rate": 4.9421172797721566e-05, + "loss": 0.8645, + "step": 4030 + }, + { + "epoch": 0.07147130379292466, + "grad_norm": 2.96875, + "learning_rate": 4.942057514980962e-05, + "loss": 0.8383, + "step": 4032 + }, + { + "epoch": 0.07150675582853623, + "grad_norm": 2.921875, + "learning_rate": 4.9419977197133984e-05, + "loss": 0.8567, + "step": 4034 + }, + { + "epoch": 0.0715422078641478, + "grad_norm": 2.578125, + "learning_rate": 4.941937893970211e-05, + "loss": 0.8575, + "step": 4036 + }, + { + "epoch": 0.07157765989975937, + "grad_norm": 2.96875, + "learning_rate": 4.941878037752148e-05, + "loss": 0.8374, + "step": 4038 + }, + { + "epoch": 0.07161311193537094, + "grad_norm": 2.671875, + "learning_rate": 4.941818151059956e-05, + "loss": 0.8745, + "step": 4040 + }, + { + "epoch": 0.07164856397098252, + "grad_norm": 2.71875, + "learning_rate": 4.9417582338943815e-05, + "loss": 0.8833, + "step": 4042 + }, + { + "epoch": 0.07168401600659408, + "grad_norm": 2.796875, + "learning_rate": 4.9416982862561726e-05, + "loss": 0.8612, + "step": 4044 + }, + { + "epoch": 0.07171946804220565, + "grad_norm": 2.671875, + "learning_rate": 4.941638308146078e-05, + "loss": 0.819, + "step": 4046 + }, + { + "epoch": 0.07175492007781722, + "grad_norm": 3.296875, + "learning_rate": 4.941578299564846e-05, + "loss": 0.837, + "step": 4048 + }, + { + "epoch": 0.07179037211342879, + "grad_norm": 2.96875, + "learning_rate": 4.9415182605132255e-05, + "loss": 0.895, + "step": 4050 + }, + { + "epoch": 0.07182582414904036, + "grad_norm": 2.625, + "learning_rate": 4.9414581909919656e-05, + "loss": 0.8425, + "step": 4052 + }, + { + "epoch": 0.07186127618465193, + "grad_norm": 2.484375, + "learning_rate": 4.941398091001815e-05, + "loss": 0.8123, + "step": 4054 + }, + { + "epoch": 0.0718967282202635, + "grad_norm": 2.609375, + "learning_rate": 4.9413379605435264e-05, + "loss": 0.8288, + "step": 4056 + }, + { + "epoch": 0.07193218025587507, + "grad_norm": 2.625, + "learning_rate": 4.9412777996178474e-05, + "loss": 0.8381, + "step": 4058 + }, + { + "epoch": 0.07196763229148663, + "grad_norm": 2.765625, + "learning_rate": 4.9412176082255304e-05, + "loss": 0.9139, + "step": 4060 + }, + { + "epoch": 0.0720030843270982, + "grad_norm": 2.78125, + "learning_rate": 4.941157386367326e-05, + "loss": 0.8468, + "step": 4062 + }, + { + "epoch": 0.07203853636270978, + "grad_norm": 2.859375, + "learning_rate": 4.941097134043986e-05, + "loss": 0.8892, + "step": 4064 + }, + { + "epoch": 0.07207398839832134, + "grad_norm": 2.734375, + "learning_rate": 4.9410368512562624e-05, + "loss": 0.8498, + "step": 4066 + }, + { + "epoch": 0.07210944043393291, + "grad_norm": 2.53125, + "learning_rate": 4.940976538004907e-05, + "loss": 0.8358, + "step": 4068 + }, + { + "epoch": 0.07214489246954449, + "grad_norm": 2.875, + "learning_rate": 4.9409161942906724e-05, + "loss": 0.8611, + "step": 4070 + }, + { + "epoch": 0.07218034450515605, + "grad_norm": 2.8125, + "learning_rate": 4.940855820114312e-05, + "loss": 0.8264, + "step": 4072 + }, + { + "epoch": 0.07221579654076762, + "grad_norm": 2.859375, + "learning_rate": 4.94079541547658e-05, + "loss": 0.9031, + "step": 4074 + }, + { + "epoch": 0.0722512485763792, + "grad_norm": 2.5, + "learning_rate": 4.94073498037823e-05, + "loss": 0.7923, + "step": 4076 + }, + { + "epoch": 0.07228670061199076, + "grad_norm": 3.171875, + "learning_rate": 4.940674514820015e-05, + "loss": 0.8308, + "step": 4078 + }, + { + "epoch": 0.07232215264760233, + "grad_norm": 2.84375, + "learning_rate": 4.9406140188026905e-05, + "loss": 0.8739, + "step": 4080 + }, + { + "epoch": 0.07235760468321391, + "grad_norm": 2.984375, + "learning_rate": 4.940553492327012e-05, + "loss": 0.8497, + "step": 4082 + }, + { + "epoch": 0.07239305671882547, + "grad_norm": 2.71875, + "learning_rate": 4.9404929353937336e-05, + "loss": 0.8896, + "step": 4084 + }, + { + "epoch": 0.07242850875443704, + "grad_norm": 2.765625, + "learning_rate": 4.9404323480036116e-05, + "loss": 0.8515, + "step": 4086 + }, + { + "epoch": 0.07246396079004862, + "grad_norm": 2.625, + "learning_rate": 4.940371730157403e-05, + "loss": 0.831, + "step": 4088 + }, + { + "epoch": 0.07249941282566018, + "grad_norm": 2.890625, + "learning_rate": 4.940311081855863e-05, + "loss": 0.8463, + "step": 4090 + }, + { + "epoch": 0.07253486486127175, + "grad_norm": 2.9375, + "learning_rate": 4.940250403099749e-05, + "loss": 0.8772, + "step": 4092 + }, + { + "epoch": 0.07257031689688333, + "grad_norm": 2.765625, + "learning_rate": 4.9401896938898185e-05, + "loss": 0.837, + "step": 4094 + }, + { + "epoch": 0.07260576893249489, + "grad_norm": 3.03125, + "learning_rate": 4.940128954226828e-05, + "loss": 0.863, + "step": 4096 + }, + { + "epoch": 0.07264122096810646, + "grad_norm": 2.75, + "learning_rate": 4.940068184111537e-05, + "loss": 0.8407, + "step": 4098 + }, + { + "epoch": 0.07267667300371804, + "grad_norm": 2.71875, + "learning_rate": 4.9400073835447035e-05, + "loss": 0.8389, + "step": 4100 + }, + { + "epoch": 0.0727121250393296, + "grad_norm": 2.96875, + "learning_rate": 4.939946552527086e-05, + "loss": 0.8972, + "step": 4102 + }, + { + "epoch": 0.07274757707494117, + "grad_norm": 2.703125, + "learning_rate": 4.939885691059444e-05, + "loss": 0.8153, + "step": 4104 + }, + { + "epoch": 0.07278302911055275, + "grad_norm": 3.1875, + "learning_rate": 4.939824799142536e-05, + "loss": 0.8642, + "step": 4106 + }, + { + "epoch": 0.07281848114616431, + "grad_norm": 2.890625, + "learning_rate": 4.939763876777122e-05, + "loss": 0.8415, + "step": 4108 + }, + { + "epoch": 0.07285393318177588, + "grad_norm": 2.53125, + "learning_rate": 4.939702923963965e-05, + "loss": 0.819, + "step": 4110 + }, + { + "epoch": 0.07288938521738746, + "grad_norm": 3.078125, + "learning_rate": 4.9396419407038226e-05, + "loss": 0.863, + "step": 4112 + }, + { + "epoch": 0.07292483725299902, + "grad_norm": 2.765625, + "learning_rate": 4.939580926997457e-05, + "loss": 0.8575, + "step": 4114 + }, + { + "epoch": 0.07296028928861059, + "grad_norm": 3.15625, + "learning_rate": 4.9395198828456294e-05, + "loss": 0.8643, + "step": 4116 + }, + { + "epoch": 0.07299574132422217, + "grad_norm": 2.703125, + "learning_rate": 4.9394588082491024e-05, + "loss": 0.8993, + "step": 4118 + }, + { + "epoch": 0.07303119335983373, + "grad_norm": 2.515625, + "learning_rate": 4.939397703208637e-05, + "loss": 0.8818, + "step": 4120 + }, + { + "epoch": 0.0730666453954453, + "grad_norm": 3.046875, + "learning_rate": 4.939336567724996e-05, + "loss": 0.8545, + "step": 4122 + }, + { + "epoch": 0.07310209743105688, + "grad_norm": 2.6875, + "learning_rate": 4.9392754017989435e-05, + "loss": 0.8493, + "step": 4124 + }, + { + "epoch": 0.07313754946666844, + "grad_norm": 2.625, + "learning_rate": 4.9392142054312416e-05, + "loss": 0.83, + "step": 4126 + }, + { + "epoch": 0.07317300150228001, + "grad_norm": 3.046875, + "learning_rate": 4.939152978622655e-05, + "loss": 0.9047, + "step": 4128 + }, + { + "epoch": 0.07320845353789159, + "grad_norm": 2.375, + "learning_rate": 4.939091721373946e-05, + "loss": 0.8372, + "step": 4130 + }, + { + "epoch": 0.07324390557350315, + "grad_norm": 2.8125, + "learning_rate": 4.9390304336858814e-05, + "loss": 0.8822, + "step": 4132 + }, + { + "epoch": 0.07327935760911472, + "grad_norm": 2.53125, + "learning_rate": 4.9389691155592256e-05, + "loss": 0.7915, + "step": 4134 + }, + { + "epoch": 0.0733148096447263, + "grad_norm": 2.640625, + "learning_rate": 4.938907766994742e-05, + "loss": 0.8189, + "step": 4136 + }, + { + "epoch": 0.07335026168033786, + "grad_norm": 2.671875, + "learning_rate": 4.938846387993198e-05, + "loss": 0.8671, + "step": 4138 + }, + { + "epoch": 0.07338571371594943, + "grad_norm": 2.890625, + "learning_rate": 4.9387849785553584e-05, + "loss": 0.8696, + "step": 4140 + }, + { + "epoch": 0.07342116575156099, + "grad_norm": 2.640625, + "learning_rate": 4.938723538681991e-05, + "loss": 0.7995, + "step": 4142 + }, + { + "epoch": 0.07345661778717257, + "grad_norm": 2.53125, + "learning_rate": 4.9386620683738616e-05, + "loss": 0.8268, + "step": 4144 + }, + { + "epoch": 0.07349206982278414, + "grad_norm": 2.890625, + "learning_rate": 4.9386005676317385e-05, + "loss": 0.7975, + "step": 4146 + }, + { + "epoch": 0.0735275218583957, + "grad_norm": 2.6875, + "learning_rate": 4.9385390364563864e-05, + "loss": 0.8414, + "step": 4148 + }, + { + "epoch": 0.07356297389400727, + "grad_norm": 3.09375, + "learning_rate": 4.9384774748485764e-05, + "loss": 0.8708, + "step": 4150 + }, + { + "epoch": 0.07359842592961885, + "grad_norm": 3.015625, + "learning_rate": 4.938415882809074e-05, + "loss": 0.9021, + "step": 4152 + }, + { + "epoch": 0.07363387796523041, + "grad_norm": 2.796875, + "learning_rate": 4.938354260338651e-05, + "loss": 0.8506, + "step": 4154 + }, + { + "epoch": 0.07366933000084198, + "grad_norm": 2.578125, + "learning_rate": 4.938292607438074e-05, + "loss": 0.8132, + "step": 4156 + }, + { + "epoch": 0.07370478203645356, + "grad_norm": 2.921875, + "learning_rate": 4.938230924108113e-05, + "loss": 0.8767, + "step": 4158 + }, + { + "epoch": 0.07374023407206512, + "grad_norm": 2.84375, + "learning_rate": 4.938169210349538e-05, + "loss": 0.8324, + "step": 4160 + }, + { + "epoch": 0.0737756861076767, + "grad_norm": 2.609375, + "learning_rate": 4.938107466163119e-05, + "loss": 0.8516, + "step": 4162 + }, + { + "epoch": 0.07381113814328827, + "grad_norm": 2.5625, + "learning_rate": 4.938045691549626e-05, + "loss": 0.8218, + "step": 4164 + }, + { + "epoch": 0.07384659017889983, + "grad_norm": 2.875, + "learning_rate": 4.937983886509832e-05, + "loss": 0.8524, + "step": 4166 + }, + { + "epoch": 0.0738820422145114, + "grad_norm": 2.859375, + "learning_rate": 4.937922051044506e-05, + "loss": 0.8436, + "step": 4168 + }, + { + "epoch": 0.07391749425012298, + "grad_norm": 2.765625, + "learning_rate": 4.93786018515442e-05, + "loss": 0.8616, + "step": 4170 + }, + { + "epoch": 0.07395294628573454, + "grad_norm": 2.765625, + "learning_rate": 4.937798288840347e-05, + "loss": 0.8628, + "step": 4172 + }, + { + "epoch": 0.07398839832134611, + "grad_norm": 2.875, + "learning_rate": 4.9377363621030596e-05, + "loss": 0.8646, + "step": 4174 + }, + { + "epoch": 0.07402385035695769, + "grad_norm": 3.109375, + "learning_rate": 4.93767440494333e-05, + "loss": 0.8985, + "step": 4176 + }, + { + "epoch": 0.07405930239256925, + "grad_norm": 2.640625, + "learning_rate": 4.937612417361932e-05, + "loss": 0.8776, + "step": 4178 + }, + { + "epoch": 0.07409475442818082, + "grad_norm": 2.734375, + "learning_rate": 4.937550399359638e-05, + "loss": 0.8236, + "step": 4180 + }, + { + "epoch": 0.0741302064637924, + "grad_norm": 2.78125, + "learning_rate": 4.937488350937223e-05, + "loss": 0.8464, + "step": 4182 + }, + { + "epoch": 0.07416565849940396, + "grad_norm": 2.59375, + "learning_rate": 4.937426272095461e-05, + "loss": 0.8683, + "step": 4184 + }, + { + "epoch": 0.07420111053501553, + "grad_norm": 2.875, + "learning_rate": 4.937364162835127e-05, + "loss": 0.8014, + "step": 4186 + }, + { + "epoch": 0.07423656257062711, + "grad_norm": 2.71875, + "learning_rate": 4.9373020231569956e-05, + "loss": 0.8531, + "step": 4188 + }, + { + "epoch": 0.07427201460623867, + "grad_norm": 3.0625, + "learning_rate": 4.937239853061843e-05, + "loss": 0.8679, + "step": 4190 + }, + { + "epoch": 0.07430746664185024, + "grad_norm": 2.9375, + "learning_rate": 4.9371776525504446e-05, + "loss": 0.8294, + "step": 4192 + }, + { + "epoch": 0.07434291867746182, + "grad_norm": 2.75, + "learning_rate": 4.937115421623577e-05, + "loss": 0.8347, + "step": 4194 + }, + { + "epoch": 0.07437837071307338, + "grad_norm": 2.6875, + "learning_rate": 4.937053160282016e-05, + "loss": 0.843, + "step": 4196 + }, + { + "epoch": 0.07441382274868495, + "grad_norm": 2.796875, + "learning_rate": 4.936990868526539e-05, + "loss": 0.8282, + "step": 4198 + }, + { + "epoch": 0.07444927478429653, + "grad_norm": 2.71875, + "learning_rate": 4.936928546357924e-05, + "loss": 0.8278, + "step": 4200 + }, + { + "epoch": 0.07448472681990809, + "grad_norm": 2.734375, + "learning_rate": 4.9368661937769475e-05, + "loss": 0.8537, + "step": 4202 + }, + { + "epoch": 0.07452017885551966, + "grad_norm": 2.78125, + "learning_rate": 4.936803810784389e-05, + "loss": 0.8322, + "step": 4204 + }, + { + "epoch": 0.07455563089113124, + "grad_norm": 2.9375, + "learning_rate": 4.936741397381027e-05, + "loss": 0.8855, + "step": 4206 + }, + { + "epoch": 0.0745910829267428, + "grad_norm": 3.0, + "learning_rate": 4.93667895356764e-05, + "loss": 0.851, + "step": 4208 + }, + { + "epoch": 0.07462653496235437, + "grad_norm": 2.703125, + "learning_rate": 4.9366164793450066e-05, + "loss": 0.8417, + "step": 4210 + }, + { + "epoch": 0.07466198699796595, + "grad_norm": 2.71875, + "learning_rate": 4.936553974713907e-05, + "loss": 0.8607, + "step": 4212 + }, + { + "epoch": 0.0746974390335775, + "grad_norm": 2.78125, + "learning_rate": 4.936491439675122e-05, + "loss": 0.816, + "step": 4214 + }, + { + "epoch": 0.07473289106918908, + "grad_norm": 2.90625, + "learning_rate": 4.9364288742294306e-05, + "loss": 0.8346, + "step": 4216 + }, + { + "epoch": 0.07476834310480066, + "grad_norm": 3.015625, + "learning_rate": 4.9363662783776146e-05, + "loss": 0.8284, + "step": 4218 + }, + { + "epoch": 0.07480379514041222, + "grad_norm": 2.75, + "learning_rate": 4.9363036521204546e-05, + "loss": 0.8666, + "step": 4220 + }, + { + "epoch": 0.07483924717602379, + "grad_norm": 2.5, + "learning_rate": 4.936240995458733e-05, + "loss": 0.8194, + "step": 4222 + }, + { + "epoch": 0.07487469921163536, + "grad_norm": 2.640625, + "learning_rate": 4.9361783083932304e-05, + "loss": 0.8879, + "step": 4224 + }, + { + "epoch": 0.07491015124724693, + "grad_norm": 2.8125, + "learning_rate": 4.93611559092473e-05, + "loss": 0.8462, + "step": 4226 + }, + { + "epoch": 0.0749456032828585, + "grad_norm": 2.6875, + "learning_rate": 4.936052843054015e-05, + "loss": 0.8267, + "step": 4228 + }, + { + "epoch": 0.07498105531847006, + "grad_norm": 2.671875, + "learning_rate": 4.935990064781868e-05, + "loss": 0.897, + "step": 4230 + }, + { + "epoch": 0.07501650735408164, + "grad_norm": 2.640625, + "learning_rate": 4.935927256109072e-05, + "loss": 0.8587, + "step": 4232 + }, + { + "epoch": 0.07505195938969321, + "grad_norm": 2.578125, + "learning_rate": 4.935864417036412e-05, + "loss": 0.8527, + "step": 4234 + }, + { + "epoch": 0.07508741142530477, + "grad_norm": 2.8125, + "learning_rate": 4.93580154756467e-05, + "loss": 0.8459, + "step": 4236 + }, + { + "epoch": 0.07512286346091634, + "grad_norm": 3.09375, + "learning_rate": 4.9357386476946334e-05, + "loss": 0.8307, + "step": 4238 + }, + { + "epoch": 0.07515831549652792, + "grad_norm": 2.59375, + "learning_rate": 4.935675717427085e-05, + "loss": 0.8156, + "step": 4240 + }, + { + "epoch": 0.07519376753213948, + "grad_norm": 2.90625, + "learning_rate": 4.935612756762811e-05, + "loss": 0.8588, + "step": 4242 + }, + { + "epoch": 0.07522921956775105, + "grad_norm": 2.859375, + "learning_rate": 4.935549765702597e-05, + "loss": 0.8664, + "step": 4244 + }, + { + "epoch": 0.07526467160336263, + "grad_norm": 2.890625, + "learning_rate": 4.93548674424723e-05, + "loss": 0.8555, + "step": 4246 + }, + { + "epoch": 0.07530012363897419, + "grad_norm": 2.84375, + "learning_rate": 4.935423692397495e-05, + "loss": 0.8634, + "step": 4248 + }, + { + "epoch": 0.07533557567458576, + "grad_norm": 2.71875, + "learning_rate": 4.93536061015418e-05, + "loss": 0.835, + "step": 4250 + }, + { + "epoch": 0.07537102771019734, + "grad_norm": 2.78125, + "learning_rate": 4.935297497518071e-05, + "loss": 0.8366, + "step": 4252 + }, + { + "epoch": 0.0754064797458089, + "grad_norm": 3.015625, + "learning_rate": 4.935234354489958e-05, + "loss": 0.7994, + "step": 4254 + }, + { + "epoch": 0.07544193178142047, + "grad_norm": 3.03125, + "learning_rate": 4.935171181070626e-05, + "loss": 0.8496, + "step": 4256 + }, + { + "epoch": 0.07547738381703205, + "grad_norm": 3.0, + "learning_rate": 4.935107977260865e-05, + "loss": 0.8336, + "step": 4258 + }, + { + "epoch": 0.07551283585264361, + "grad_norm": 2.75, + "learning_rate": 4.9350447430614647e-05, + "loss": 0.8468, + "step": 4260 + }, + { + "epoch": 0.07554828788825518, + "grad_norm": 3.125, + "learning_rate": 4.934981478473213e-05, + "loss": 0.8597, + "step": 4262 + }, + { + "epoch": 0.07558373992386676, + "grad_norm": 2.96875, + "learning_rate": 4.934918183496898e-05, + "loss": 0.82, + "step": 4264 + }, + { + "epoch": 0.07561919195947832, + "grad_norm": 2.703125, + "learning_rate": 4.934854858133313e-05, + "loss": 0.8744, + "step": 4266 + }, + { + "epoch": 0.07565464399508989, + "grad_norm": 2.765625, + "learning_rate": 4.934791502383246e-05, + "loss": 0.8545, + "step": 4268 + }, + { + "epoch": 0.07569009603070147, + "grad_norm": 3.0625, + "learning_rate": 4.934728116247488e-05, + "loss": 0.8635, + "step": 4270 + }, + { + "epoch": 0.07572554806631303, + "grad_norm": 2.859375, + "learning_rate": 4.93466469972683e-05, + "loss": 0.8592, + "step": 4272 + }, + { + "epoch": 0.0757610001019246, + "grad_norm": 2.96875, + "learning_rate": 4.934601252822064e-05, + "loss": 0.8474, + "step": 4274 + }, + { + "epoch": 0.07579645213753618, + "grad_norm": 2.921875, + "learning_rate": 4.9345377755339815e-05, + "loss": 0.9021, + "step": 4276 + }, + { + "epoch": 0.07583190417314774, + "grad_norm": 2.9375, + "learning_rate": 4.934474267863375e-05, + "loss": 0.8184, + "step": 4278 + }, + { + "epoch": 0.07586735620875931, + "grad_norm": 2.78125, + "learning_rate": 4.934410729811036e-05, + "loss": 0.8187, + "step": 4280 + }, + { + "epoch": 0.07590280824437089, + "grad_norm": 2.96875, + "learning_rate": 4.9343471613777584e-05, + "loss": 0.8448, + "step": 4282 + }, + { + "epoch": 0.07593826027998245, + "grad_norm": 2.453125, + "learning_rate": 4.934283562564335e-05, + "loss": 0.8228, + "step": 4284 + }, + { + "epoch": 0.07597371231559402, + "grad_norm": 2.671875, + "learning_rate": 4.93421993337156e-05, + "loss": 0.8537, + "step": 4286 + }, + { + "epoch": 0.0760091643512056, + "grad_norm": 2.625, + "learning_rate": 4.934156273800228e-05, + "loss": 0.8121, + "step": 4288 + }, + { + "epoch": 0.07604461638681716, + "grad_norm": 2.625, + "learning_rate": 4.934092583851132e-05, + "loss": 0.8435, + "step": 4290 + }, + { + "epoch": 0.07608006842242873, + "grad_norm": 2.78125, + "learning_rate": 4.934028863525067e-05, + "loss": 0.7778, + "step": 4292 + }, + { + "epoch": 0.0761155204580403, + "grad_norm": 3.203125, + "learning_rate": 4.933965112822829e-05, + "loss": 0.8711, + "step": 4294 + }, + { + "epoch": 0.07615097249365187, + "grad_norm": 3.109375, + "learning_rate": 4.9339013317452145e-05, + "loss": 0.8524, + "step": 4296 + }, + { + "epoch": 0.07618642452926344, + "grad_norm": 2.71875, + "learning_rate": 4.933837520293017e-05, + "loss": 0.842, + "step": 4298 + }, + { + "epoch": 0.07622187656487502, + "grad_norm": 2.75, + "learning_rate": 4.933773678467035e-05, + "loss": 0.8656, + "step": 4300 + }, + { + "epoch": 0.07625732860048658, + "grad_norm": 2.828125, + "learning_rate": 4.9337098062680635e-05, + "loss": 0.8061, + "step": 4302 + }, + { + "epoch": 0.07629278063609815, + "grad_norm": 2.921875, + "learning_rate": 4.933645903696901e-05, + "loss": 0.8413, + "step": 4304 + }, + { + "epoch": 0.07632823267170973, + "grad_norm": 2.5625, + "learning_rate": 4.933581970754345e-05, + "loss": 0.8499, + "step": 4306 + }, + { + "epoch": 0.07636368470732129, + "grad_norm": 2.796875, + "learning_rate": 4.9335180074411926e-05, + "loss": 0.8621, + "step": 4308 + }, + { + "epoch": 0.07639913674293286, + "grad_norm": 3.015625, + "learning_rate": 4.933454013758242e-05, + "loss": 0.8514, + "step": 4310 + }, + { + "epoch": 0.07643458877854442, + "grad_norm": 2.859375, + "learning_rate": 4.933389989706292e-05, + "loss": 0.8217, + "step": 4312 + }, + { + "epoch": 0.076470040814156, + "grad_norm": 2.734375, + "learning_rate": 4.933325935286142e-05, + "loss": 0.8471, + "step": 4314 + }, + { + "epoch": 0.07650549284976757, + "grad_norm": 2.890625, + "learning_rate": 4.933261850498592e-05, + "loss": 0.8507, + "step": 4316 + }, + { + "epoch": 0.07654094488537913, + "grad_norm": 2.71875, + "learning_rate": 4.93319773534444e-05, + "loss": 0.873, + "step": 4318 + }, + { + "epoch": 0.0765763969209907, + "grad_norm": 2.84375, + "learning_rate": 4.9331335898244866e-05, + "loss": 0.8346, + "step": 4320 + }, + { + "epoch": 0.07661184895660228, + "grad_norm": 2.9375, + "learning_rate": 4.933069413939534e-05, + "loss": 0.8501, + "step": 4322 + }, + { + "epoch": 0.07664730099221384, + "grad_norm": 2.734375, + "learning_rate": 4.933005207690381e-05, + "loss": 0.8429, + "step": 4324 + }, + { + "epoch": 0.07668275302782541, + "grad_norm": 2.5625, + "learning_rate": 4.93294097107783e-05, + "loss": 0.811, + "step": 4326 + }, + { + "epoch": 0.07671820506343699, + "grad_norm": 2.6875, + "learning_rate": 4.9328767041026824e-05, + "loss": 0.8589, + "step": 4328 + }, + { + "epoch": 0.07675365709904855, + "grad_norm": 2.875, + "learning_rate": 4.9328124067657406e-05, + "loss": 0.8443, + "step": 4330 + }, + { + "epoch": 0.07678910913466012, + "grad_norm": 2.890625, + "learning_rate": 4.932748079067806e-05, + "loss": 0.8548, + "step": 4332 + }, + { + "epoch": 0.0768245611702717, + "grad_norm": 2.515625, + "learning_rate": 4.932683721009683e-05, + "loss": 0.8519, + "step": 4334 + }, + { + "epoch": 0.07686001320588326, + "grad_norm": 2.546875, + "learning_rate": 4.9326193325921734e-05, + "loss": 0.8667, + "step": 4336 + }, + { + "epoch": 0.07689546524149483, + "grad_norm": 2.9375, + "learning_rate": 4.932554913816081e-05, + "loss": 0.815, + "step": 4338 + }, + { + "epoch": 0.07693091727710641, + "grad_norm": 2.9375, + "learning_rate": 4.9324904646822104e-05, + "loss": 0.8384, + "step": 4340 + }, + { + "epoch": 0.07696636931271797, + "grad_norm": 2.78125, + "learning_rate": 4.932425985191365e-05, + "loss": 0.8983, + "step": 4342 + }, + { + "epoch": 0.07700182134832954, + "grad_norm": 2.5, + "learning_rate": 4.9323614753443506e-05, + "loss": 0.8064, + "step": 4344 + }, + { + "epoch": 0.07703727338394112, + "grad_norm": 2.859375, + "learning_rate": 4.932296935141971e-05, + "loss": 0.8356, + "step": 4346 + }, + { + "epoch": 0.07707272541955268, + "grad_norm": 2.921875, + "learning_rate": 4.932232364585032e-05, + "loss": 0.8279, + "step": 4348 + }, + { + "epoch": 0.07710817745516425, + "grad_norm": 2.8125, + "learning_rate": 4.93216776367434e-05, + "loss": 0.8228, + "step": 4350 + }, + { + "epoch": 0.07714362949077583, + "grad_norm": 2.78125, + "learning_rate": 4.9321031324107016e-05, + "loss": 0.8484, + "step": 4352 + }, + { + "epoch": 0.07717908152638739, + "grad_norm": 2.734375, + "learning_rate": 4.932038470794922e-05, + "loss": 0.833, + "step": 4354 + }, + { + "epoch": 0.07721453356199896, + "grad_norm": 2.953125, + "learning_rate": 4.931973778827809e-05, + "loss": 0.8317, + "step": 4356 + }, + { + "epoch": 0.07724998559761054, + "grad_norm": 2.65625, + "learning_rate": 4.931909056510169e-05, + "loss": 0.8346, + "step": 4358 + }, + { + "epoch": 0.0772854376332221, + "grad_norm": 2.609375, + "learning_rate": 4.931844303842811e-05, + "loss": 0.8479, + "step": 4360 + }, + { + "epoch": 0.07732088966883367, + "grad_norm": 2.703125, + "learning_rate": 4.931779520826543e-05, + "loss": 0.8641, + "step": 4362 + }, + { + "epoch": 0.07735634170444525, + "grad_norm": 2.703125, + "learning_rate": 4.931714707462173e-05, + "loss": 0.8309, + "step": 4364 + }, + { + "epoch": 0.07739179374005681, + "grad_norm": 2.84375, + "learning_rate": 4.93164986375051e-05, + "loss": 0.9095, + "step": 4366 + }, + { + "epoch": 0.07742724577566838, + "grad_norm": 2.765625, + "learning_rate": 4.931584989692363e-05, + "loss": 0.8253, + "step": 4368 + }, + { + "epoch": 0.07746269781127996, + "grad_norm": 2.609375, + "learning_rate": 4.9315200852885415e-05, + "loss": 0.8257, + "step": 4370 + }, + { + "epoch": 0.07749814984689152, + "grad_norm": 3.1875, + "learning_rate": 4.931455150539856e-05, + "loss": 0.8824, + "step": 4372 + }, + { + "epoch": 0.07753360188250309, + "grad_norm": 2.578125, + "learning_rate": 4.931390185447117e-05, + "loss": 0.845, + "step": 4374 + }, + { + "epoch": 0.07756905391811467, + "grad_norm": 3.0, + "learning_rate": 4.9313251900111346e-05, + "loss": 0.8355, + "step": 4376 + }, + { + "epoch": 0.07760450595372623, + "grad_norm": 2.484375, + "learning_rate": 4.93126016423272e-05, + "loss": 0.8556, + "step": 4378 + }, + { + "epoch": 0.0776399579893378, + "grad_norm": 2.53125, + "learning_rate": 4.931195108112685e-05, + "loss": 0.8141, + "step": 4380 + }, + { + "epoch": 0.07767541002494938, + "grad_norm": 2.828125, + "learning_rate": 4.9311300216518416e-05, + "loss": 0.8393, + "step": 4382 + }, + { + "epoch": 0.07771086206056094, + "grad_norm": 2.703125, + "learning_rate": 4.931064904851003e-05, + "loss": 0.8447, + "step": 4384 + }, + { + "epoch": 0.07774631409617251, + "grad_norm": 2.671875, + "learning_rate": 4.930999757710979e-05, + "loss": 0.8567, + "step": 4386 + }, + { + "epoch": 0.07778176613178409, + "grad_norm": 2.75, + "learning_rate": 4.930934580232585e-05, + "loss": 0.8612, + "step": 4388 + }, + { + "epoch": 0.07781721816739565, + "grad_norm": 3.109375, + "learning_rate": 4.930869372416634e-05, + "loss": 0.8599, + "step": 4390 + }, + { + "epoch": 0.07785267020300722, + "grad_norm": 2.734375, + "learning_rate": 4.930804134263939e-05, + "loss": 0.861, + "step": 4392 + }, + { + "epoch": 0.0778881222386188, + "grad_norm": 2.546875, + "learning_rate": 4.930738865775315e-05, + "loss": 0.8451, + "step": 4394 + }, + { + "epoch": 0.07792357427423036, + "grad_norm": 2.890625, + "learning_rate": 4.930673566951577e-05, + "loss": 0.8911, + "step": 4396 + }, + { + "epoch": 0.07795902630984193, + "grad_norm": 2.8125, + "learning_rate": 4.9306082377935384e-05, + "loss": 0.8285, + "step": 4398 + }, + { + "epoch": 0.07799447834545349, + "grad_norm": 3.109375, + "learning_rate": 4.930542878302015e-05, + "loss": 0.8523, + "step": 4400 + }, + { + "epoch": 0.07802993038106507, + "grad_norm": 3.265625, + "learning_rate": 4.9304774884778224e-05, + "loss": 0.866, + "step": 4402 + }, + { + "epoch": 0.07806538241667664, + "grad_norm": 2.953125, + "learning_rate": 4.930412068321778e-05, + "loss": 0.8531, + "step": 4404 + }, + { + "epoch": 0.0781008344522882, + "grad_norm": 2.984375, + "learning_rate": 4.930346617834697e-05, + "loss": 0.8139, + "step": 4406 + }, + { + "epoch": 0.07813628648789978, + "grad_norm": 2.84375, + "learning_rate": 4.9302811370173966e-05, + "loss": 0.836, + "step": 4408 + }, + { + "epoch": 0.07817173852351135, + "grad_norm": 2.96875, + "learning_rate": 4.930215625870693e-05, + "loss": 0.8246, + "step": 4410 + }, + { + "epoch": 0.07820719055912291, + "grad_norm": 2.40625, + "learning_rate": 4.930150084395405e-05, + "loss": 0.7908, + "step": 4412 + }, + { + "epoch": 0.07824264259473449, + "grad_norm": 2.515625, + "learning_rate": 4.93008451259235e-05, + "loss": 0.8515, + "step": 4414 + }, + { + "epoch": 0.07827809463034606, + "grad_norm": 2.640625, + "learning_rate": 4.9300189104623466e-05, + "loss": 0.8549, + "step": 4416 + }, + { + "epoch": 0.07831354666595762, + "grad_norm": 2.609375, + "learning_rate": 4.929953278006213e-05, + "loss": 0.842, + "step": 4418 + }, + { + "epoch": 0.0783489987015692, + "grad_norm": 2.9375, + "learning_rate": 4.929887615224769e-05, + "loss": 0.8326, + "step": 4420 + }, + { + "epoch": 0.07838445073718077, + "grad_norm": 2.96875, + "learning_rate": 4.9298219221188336e-05, + "loss": 0.844, + "step": 4422 + }, + { + "epoch": 0.07841990277279233, + "grad_norm": 2.90625, + "learning_rate": 4.929756198689227e-05, + "loss": 0.8625, + "step": 4424 + }, + { + "epoch": 0.0784553548084039, + "grad_norm": 2.375, + "learning_rate": 4.9296904449367685e-05, + "loss": 0.8317, + "step": 4426 + }, + { + "epoch": 0.07849080684401548, + "grad_norm": 2.609375, + "learning_rate": 4.9296246608622795e-05, + "loss": 0.795, + "step": 4428 + }, + { + "epoch": 0.07852625887962704, + "grad_norm": 2.578125, + "learning_rate": 4.929558846466581e-05, + "loss": 0.8097, + "step": 4430 + }, + { + "epoch": 0.07856171091523861, + "grad_norm": 3.09375, + "learning_rate": 4.929493001750494e-05, + "loss": 0.834, + "step": 4432 + }, + { + "epoch": 0.07859716295085019, + "grad_norm": 2.609375, + "learning_rate": 4.9294271267148405e-05, + "loss": 0.8514, + "step": 4434 + }, + { + "epoch": 0.07863261498646175, + "grad_norm": 2.984375, + "learning_rate": 4.929361221360442e-05, + "loss": 0.8629, + "step": 4436 + }, + { + "epoch": 0.07866806702207332, + "grad_norm": 2.40625, + "learning_rate": 4.929295285688122e-05, + "loss": 0.854, + "step": 4438 + }, + { + "epoch": 0.0787035190576849, + "grad_norm": 2.765625, + "learning_rate": 4.929229319698703e-05, + "loss": 0.8652, + "step": 4440 + }, + { + "epoch": 0.07873897109329646, + "grad_norm": 2.703125, + "learning_rate": 4.929163323393008e-05, + "loss": 0.8475, + "step": 4442 + }, + { + "epoch": 0.07877442312890803, + "grad_norm": 2.96875, + "learning_rate": 4.92909729677186e-05, + "loss": 0.8511, + "step": 4444 + }, + { + "epoch": 0.07880987516451961, + "grad_norm": 2.5, + "learning_rate": 4.929031239836084e-05, + "loss": 0.7948, + "step": 4446 + }, + { + "epoch": 0.07884532720013117, + "grad_norm": 3.25, + "learning_rate": 4.9289651525865046e-05, + "loss": 0.8679, + "step": 4448 + }, + { + "epoch": 0.07888077923574274, + "grad_norm": 2.796875, + "learning_rate": 4.928899035023945e-05, + "loss": 0.8649, + "step": 4450 + }, + { + "epoch": 0.07891623127135432, + "grad_norm": 2.765625, + "learning_rate": 4.9288328871492315e-05, + "loss": 0.8551, + "step": 4452 + }, + { + "epoch": 0.07895168330696588, + "grad_norm": 2.71875, + "learning_rate": 4.9287667089631904e-05, + "loss": 0.7862, + "step": 4454 + }, + { + "epoch": 0.07898713534257745, + "grad_norm": 2.578125, + "learning_rate": 4.9287005004666465e-05, + "loss": 0.8378, + "step": 4456 + }, + { + "epoch": 0.07902258737818903, + "grad_norm": 2.796875, + "learning_rate": 4.928634261660425e-05, + "loss": 0.8194, + "step": 4458 + }, + { + "epoch": 0.07905803941380059, + "grad_norm": 2.84375, + "learning_rate": 4.9285679925453545e-05, + "loss": 0.8504, + "step": 4460 + }, + { + "epoch": 0.07909349144941216, + "grad_norm": 2.609375, + "learning_rate": 4.928501693122262e-05, + "loss": 0.877, + "step": 4462 + }, + { + "epoch": 0.07912894348502374, + "grad_norm": 2.625, + "learning_rate": 4.928435363391973e-05, + "loss": 0.8103, + "step": 4464 + }, + { + "epoch": 0.0791643955206353, + "grad_norm": 3.015625, + "learning_rate": 4.9283690033553174e-05, + "loss": 0.8439, + "step": 4466 + }, + { + "epoch": 0.07919984755624687, + "grad_norm": 2.6875, + "learning_rate": 4.928302613013122e-05, + "loss": 0.7667, + "step": 4468 + }, + { + "epoch": 0.07923529959185845, + "grad_norm": 2.765625, + "learning_rate": 4.9282361923662156e-05, + "loss": 0.8139, + "step": 4470 + }, + { + "epoch": 0.07927075162747, + "grad_norm": 2.828125, + "learning_rate": 4.928169741415428e-05, + "loss": 0.8062, + "step": 4472 + }, + { + "epoch": 0.07930620366308158, + "grad_norm": 2.890625, + "learning_rate": 4.928103260161587e-05, + "loss": 0.8375, + "step": 4474 + }, + { + "epoch": 0.07934165569869316, + "grad_norm": 2.796875, + "learning_rate": 4.928036748605523e-05, + "loss": 0.874, + "step": 4476 + }, + { + "epoch": 0.07937710773430472, + "grad_norm": 2.796875, + "learning_rate": 4.927970206748067e-05, + "loss": 0.8288, + "step": 4478 + }, + { + "epoch": 0.07941255976991629, + "grad_norm": 2.734375, + "learning_rate": 4.927903634590048e-05, + "loss": 0.8458, + "step": 4480 + }, + { + "epoch": 0.07944801180552787, + "grad_norm": 2.671875, + "learning_rate": 4.927837032132297e-05, + "loss": 0.8423, + "step": 4482 + }, + { + "epoch": 0.07948346384113943, + "grad_norm": 2.859375, + "learning_rate": 4.927770399375646e-05, + "loss": 0.8363, + "step": 4484 + }, + { + "epoch": 0.079518915876751, + "grad_norm": 2.78125, + "learning_rate": 4.9277037363209256e-05, + "loss": 0.8574, + "step": 4486 + }, + { + "epoch": 0.07955436791236256, + "grad_norm": 2.65625, + "learning_rate": 4.927637042968969e-05, + "loss": 0.8135, + "step": 4488 + }, + { + "epoch": 0.07958981994797414, + "grad_norm": 2.75, + "learning_rate": 4.927570319320607e-05, + "loss": 0.8335, + "step": 4490 + }, + { + "epoch": 0.07962527198358571, + "grad_norm": 2.6875, + "learning_rate": 4.9275035653766735e-05, + "loss": 0.848, + "step": 4492 + }, + { + "epoch": 0.07966072401919727, + "grad_norm": 2.71875, + "learning_rate": 4.927436781138001e-05, + "loss": 0.8307, + "step": 4494 + }, + { + "epoch": 0.07969617605480885, + "grad_norm": 2.96875, + "learning_rate": 4.927369966605423e-05, + "loss": 0.8645, + "step": 4496 + }, + { + "epoch": 0.07973162809042042, + "grad_norm": 2.875, + "learning_rate": 4.927303121779773e-05, + "loss": 0.8851, + "step": 4498 + }, + { + "epoch": 0.07976708012603198, + "grad_norm": 2.6875, + "learning_rate": 4.927236246661886e-05, + "loss": 0.8013, + "step": 4500 + }, + { + "epoch": 0.07980253216164356, + "grad_norm": 2.96875, + "learning_rate": 4.927169341252596e-05, + "loss": 0.8538, + "step": 4502 + }, + { + "epoch": 0.07983798419725513, + "grad_norm": 2.609375, + "learning_rate": 4.927102405552738e-05, + "loss": 0.828, + "step": 4504 + }, + { + "epoch": 0.07987343623286669, + "grad_norm": 2.9375, + "learning_rate": 4.927035439563149e-05, + "loss": 0.8303, + "step": 4506 + }, + { + "epoch": 0.07990888826847826, + "grad_norm": 2.625, + "learning_rate": 4.926968443284662e-05, + "loss": 0.8525, + "step": 4508 + }, + { + "epoch": 0.07994434030408984, + "grad_norm": 2.71875, + "learning_rate": 4.926901416718114e-05, + "loss": 0.8281, + "step": 4510 + }, + { + "epoch": 0.0799797923397014, + "grad_norm": 2.515625, + "learning_rate": 4.926834359864342e-05, + "loss": 0.8465, + "step": 4512 + }, + { + "epoch": 0.08001524437531297, + "grad_norm": 2.484375, + "learning_rate": 4.9267672727241834e-05, + "loss": 0.8424, + "step": 4514 + }, + { + "epoch": 0.08005069641092455, + "grad_norm": 3.09375, + "learning_rate": 4.926700155298474e-05, + "loss": 0.792, + "step": 4516 + }, + { + "epoch": 0.08008614844653611, + "grad_norm": 2.921875, + "learning_rate": 4.926633007588053e-05, + "loss": 0.8201, + "step": 4518 + }, + { + "epoch": 0.08012160048214768, + "grad_norm": 2.75, + "learning_rate": 4.926565829593756e-05, + "loss": 0.8458, + "step": 4520 + }, + { + "epoch": 0.08015705251775926, + "grad_norm": 2.484375, + "learning_rate": 4.9264986213164235e-05, + "loss": 0.8366, + "step": 4522 + }, + { + "epoch": 0.08019250455337082, + "grad_norm": 2.71875, + "learning_rate": 4.926431382756894e-05, + "loss": 0.8485, + "step": 4524 + }, + { + "epoch": 0.0802279565889824, + "grad_norm": 3.21875, + "learning_rate": 4.926364113916006e-05, + "loss": 0.8815, + "step": 4526 + }, + { + "epoch": 0.08026340862459397, + "grad_norm": 3.328125, + "learning_rate": 4.926296814794599e-05, + "loss": 0.844, + "step": 4528 + }, + { + "epoch": 0.08029886066020553, + "grad_norm": 2.875, + "learning_rate": 4.926229485393513e-05, + "loss": 0.7989, + "step": 4530 + }, + { + "epoch": 0.0803343126958171, + "grad_norm": 2.625, + "learning_rate": 4.926162125713589e-05, + "loss": 0.8728, + "step": 4532 + }, + { + "epoch": 0.08036976473142868, + "grad_norm": 2.625, + "learning_rate": 4.9260947357556666e-05, + "loss": 0.8449, + "step": 4534 + }, + { + "epoch": 0.08040521676704024, + "grad_norm": 2.546875, + "learning_rate": 4.926027315520588e-05, + "loss": 0.8506, + "step": 4536 + }, + { + "epoch": 0.08044066880265181, + "grad_norm": 3.078125, + "learning_rate": 4.925959865009193e-05, + "loss": 0.8575, + "step": 4538 + }, + { + "epoch": 0.08047612083826339, + "grad_norm": 2.9375, + "learning_rate": 4.925892384222324e-05, + "loss": 0.8483, + "step": 4540 + }, + { + "epoch": 0.08051157287387495, + "grad_norm": 2.859375, + "learning_rate": 4.9258248731608235e-05, + "loss": 0.8541, + "step": 4542 + }, + { + "epoch": 0.08054702490948652, + "grad_norm": 2.765625, + "learning_rate": 4.9257573318255344e-05, + "loss": 0.8851, + "step": 4544 + }, + { + "epoch": 0.0805824769450981, + "grad_norm": 2.859375, + "learning_rate": 4.9256897602172986e-05, + "loss": 0.8394, + "step": 4546 + }, + { + "epoch": 0.08061792898070966, + "grad_norm": 2.640625, + "learning_rate": 4.92562215833696e-05, + "loss": 0.8504, + "step": 4548 + }, + { + "epoch": 0.08065338101632123, + "grad_norm": 3.125, + "learning_rate": 4.925554526185362e-05, + "loss": 0.8731, + "step": 4550 + }, + { + "epoch": 0.0806888330519328, + "grad_norm": 2.65625, + "learning_rate": 4.925486863763349e-05, + "loss": 0.8335, + "step": 4552 + }, + { + "epoch": 0.08072428508754437, + "grad_norm": 2.734375, + "learning_rate": 4.925419171071765e-05, + "loss": 0.8089, + "step": 4554 + }, + { + "epoch": 0.08075973712315594, + "grad_norm": 2.953125, + "learning_rate": 4.9253514481114535e-05, + "loss": 0.8675, + "step": 4556 + }, + { + "epoch": 0.08079518915876752, + "grad_norm": 3.0, + "learning_rate": 4.925283694883263e-05, + "loss": 0.8822, + "step": 4558 + }, + { + "epoch": 0.08083064119437908, + "grad_norm": 2.734375, + "learning_rate": 4.9252159113880365e-05, + "loss": 0.8938, + "step": 4560 + }, + { + "epoch": 0.08086609322999065, + "grad_norm": 2.921875, + "learning_rate": 4.925148097626621e-05, + "loss": 0.8671, + "step": 4562 + }, + { + "epoch": 0.08090154526560223, + "grad_norm": 2.546875, + "learning_rate": 4.9250802535998605e-05, + "loss": 0.8473, + "step": 4564 + }, + { + "epoch": 0.08093699730121379, + "grad_norm": 2.65625, + "learning_rate": 4.925012379308606e-05, + "loss": 0.7768, + "step": 4566 + }, + { + "epoch": 0.08097244933682536, + "grad_norm": 2.984375, + "learning_rate": 4.924944474753701e-05, + "loss": 0.8361, + "step": 4568 + }, + { + "epoch": 0.08100790137243692, + "grad_norm": 3.015625, + "learning_rate": 4.9248765399359934e-05, + "loss": 0.9217, + "step": 4570 + }, + { + "epoch": 0.0810433534080485, + "grad_norm": 2.921875, + "learning_rate": 4.924808574856332e-05, + "loss": 0.8699, + "step": 4572 + }, + { + "epoch": 0.08107880544366007, + "grad_norm": 2.640625, + "learning_rate": 4.9247405795155655e-05, + "loss": 0.8332, + "step": 4574 + }, + { + "epoch": 0.08111425747927163, + "grad_norm": 2.84375, + "learning_rate": 4.924672553914541e-05, + "loss": 0.8051, + "step": 4576 + }, + { + "epoch": 0.0811497095148832, + "grad_norm": 2.796875, + "learning_rate": 4.9246044980541084e-05, + "loss": 0.8353, + "step": 4578 + }, + { + "epoch": 0.08118516155049478, + "grad_norm": 2.6875, + "learning_rate": 4.924536411935116e-05, + "loss": 0.8489, + "step": 4580 + }, + { + "epoch": 0.08122061358610634, + "grad_norm": 2.6875, + "learning_rate": 4.924468295558415e-05, + "loss": 0.8225, + "step": 4582 + }, + { + "epoch": 0.08125606562171792, + "grad_norm": 2.640625, + "learning_rate": 4.9244001489248535e-05, + "loss": 0.8798, + "step": 4584 + }, + { + "epoch": 0.08129151765732949, + "grad_norm": 2.84375, + "learning_rate": 4.924331972035284e-05, + "loss": 0.8132, + "step": 4586 + }, + { + "epoch": 0.08132696969294105, + "grad_norm": 2.890625, + "learning_rate": 4.924263764890557e-05, + "loss": 0.8357, + "step": 4588 + }, + { + "epoch": 0.08136242172855263, + "grad_norm": 2.671875, + "learning_rate": 4.924195527491522e-05, + "loss": 0.8486, + "step": 4590 + }, + { + "epoch": 0.0813978737641642, + "grad_norm": 2.40625, + "learning_rate": 4.924127259839032e-05, + "loss": 0.7952, + "step": 4592 + }, + { + "epoch": 0.08143332579977576, + "grad_norm": 2.9375, + "learning_rate": 4.924058961933939e-05, + "loss": 0.8301, + "step": 4594 + }, + { + "epoch": 0.08146877783538733, + "grad_norm": 2.90625, + "learning_rate": 4.923990633777095e-05, + "loss": 0.8903, + "step": 4596 + }, + { + "epoch": 0.08150422987099891, + "grad_norm": 2.703125, + "learning_rate": 4.923922275369352e-05, + "loss": 0.8317, + "step": 4598 + }, + { + "epoch": 0.08153968190661047, + "grad_norm": 2.796875, + "learning_rate": 4.923853886711565e-05, + "loss": 0.8319, + "step": 4600 + }, + { + "epoch": 0.08157513394222204, + "grad_norm": 2.609375, + "learning_rate": 4.9237854678045855e-05, + "loss": 0.8296, + "step": 4602 + }, + { + "epoch": 0.08161058597783362, + "grad_norm": 2.625, + "learning_rate": 4.923717018649269e-05, + "loss": 0.8253, + "step": 4604 + }, + { + "epoch": 0.08164603801344518, + "grad_norm": 3.171875, + "learning_rate": 4.923648539246468e-05, + "loss": 0.8293, + "step": 4606 + }, + { + "epoch": 0.08168149004905675, + "grad_norm": 3.421875, + "learning_rate": 4.923580029597039e-05, + "loss": 0.8476, + "step": 4608 + }, + { + "epoch": 0.08171694208466833, + "grad_norm": 2.921875, + "learning_rate": 4.923511489701835e-05, + "loss": 0.8445, + "step": 4610 + }, + { + "epoch": 0.08175239412027989, + "grad_norm": 2.75, + "learning_rate": 4.923442919561714e-05, + "loss": 0.8342, + "step": 4612 + }, + { + "epoch": 0.08178784615589146, + "grad_norm": 3.140625, + "learning_rate": 4.9233743191775286e-05, + "loss": 0.8761, + "step": 4614 + }, + { + "epoch": 0.08182329819150304, + "grad_norm": 3.25, + "learning_rate": 4.923305688550137e-05, + "loss": 0.8501, + "step": 4616 + }, + { + "epoch": 0.0818587502271146, + "grad_norm": 2.453125, + "learning_rate": 4.9232370276803955e-05, + "loss": 0.8057, + "step": 4618 + }, + { + "epoch": 0.08189420226272617, + "grad_norm": 2.484375, + "learning_rate": 4.923168336569159e-05, + "loss": 0.8029, + "step": 4620 + }, + { + "epoch": 0.08192965429833775, + "grad_norm": 3.140625, + "learning_rate": 4.923099615217288e-05, + "loss": 0.8876, + "step": 4622 + }, + { + "epoch": 0.08196510633394931, + "grad_norm": 3.234375, + "learning_rate": 4.9230308636256385e-05, + "loss": 0.8402, + "step": 4624 + }, + { + "epoch": 0.08200055836956088, + "grad_norm": 2.84375, + "learning_rate": 4.922962081795068e-05, + "loss": 0.8426, + "step": 4626 + }, + { + "epoch": 0.08203601040517246, + "grad_norm": 2.8125, + "learning_rate": 4.922893269726435e-05, + "loss": 0.8373, + "step": 4628 + }, + { + "epoch": 0.08207146244078402, + "grad_norm": 2.5625, + "learning_rate": 4.922824427420599e-05, + "loss": 0.8424, + "step": 4630 + }, + { + "epoch": 0.08210691447639559, + "grad_norm": 2.890625, + "learning_rate": 4.9227555548784196e-05, + "loss": 0.8827, + "step": 4632 + }, + { + "epoch": 0.08214236651200717, + "grad_norm": 2.703125, + "learning_rate": 4.922686652100754e-05, + "loss": 0.8642, + "step": 4634 + }, + { + "epoch": 0.08217781854761873, + "grad_norm": 3.03125, + "learning_rate": 4.9226177190884645e-05, + "loss": 0.8704, + "step": 4636 + }, + { + "epoch": 0.0822132705832303, + "grad_norm": 2.765625, + "learning_rate": 4.922548755842411e-05, + "loss": 0.8341, + "step": 4638 + }, + { + "epoch": 0.08224872261884188, + "grad_norm": 2.921875, + "learning_rate": 4.922479762363453e-05, + "loss": 0.8625, + "step": 4640 + }, + { + "epoch": 0.08228417465445344, + "grad_norm": 2.75, + "learning_rate": 4.922410738652452e-05, + "loss": 0.813, + "step": 4642 + }, + { + "epoch": 0.08231962669006501, + "grad_norm": 3.03125, + "learning_rate": 4.92234168471027e-05, + "loss": 0.8251, + "step": 4644 + }, + { + "epoch": 0.08235507872567659, + "grad_norm": 3.359375, + "learning_rate": 4.922272600537767e-05, + "loss": 0.8559, + "step": 4646 + }, + { + "epoch": 0.08239053076128815, + "grad_norm": 2.75, + "learning_rate": 4.922203486135808e-05, + "loss": 0.8383, + "step": 4648 + }, + { + "epoch": 0.08242598279689972, + "grad_norm": 2.6875, + "learning_rate": 4.9221343415052534e-05, + "loss": 0.8475, + "step": 4650 + }, + { + "epoch": 0.0824614348325113, + "grad_norm": 2.890625, + "learning_rate": 4.922065166646966e-05, + "loss": 0.8242, + "step": 4652 + }, + { + "epoch": 0.08249688686812286, + "grad_norm": 2.765625, + "learning_rate": 4.921995961561811e-05, + "loss": 0.8398, + "step": 4654 + }, + { + "epoch": 0.08253233890373443, + "grad_norm": 2.671875, + "learning_rate": 4.921926726250651e-05, + "loss": 0.8347, + "step": 4656 + }, + { + "epoch": 0.08256779093934599, + "grad_norm": 2.875, + "learning_rate": 4.9218574607143485e-05, + "loss": 0.8158, + "step": 4658 + }, + { + "epoch": 0.08260324297495757, + "grad_norm": 3.3125, + "learning_rate": 4.92178816495377e-05, + "loss": 0.8181, + "step": 4660 + }, + { + "epoch": 0.08263869501056914, + "grad_norm": 2.8125, + "learning_rate": 4.9217188389697796e-05, + "loss": 0.8476, + "step": 4662 + }, + { + "epoch": 0.0826741470461807, + "grad_norm": 3.109375, + "learning_rate": 4.921649482763243e-05, + "loss": 0.8557, + "step": 4664 + }, + { + "epoch": 0.08270959908179228, + "grad_norm": 2.671875, + "learning_rate": 4.921580096335025e-05, + "loss": 0.8052, + "step": 4666 + }, + { + "epoch": 0.08274505111740385, + "grad_norm": 2.578125, + "learning_rate": 4.921510679685992e-05, + "loss": 0.8763, + "step": 4668 + }, + { + "epoch": 0.08278050315301541, + "grad_norm": 2.703125, + "learning_rate": 4.921441232817009e-05, + "loss": 0.8867, + "step": 4670 + }, + { + "epoch": 0.08281595518862699, + "grad_norm": 2.765625, + "learning_rate": 4.921371755728945e-05, + "loss": 0.8104, + "step": 4672 + }, + { + "epoch": 0.08285140722423856, + "grad_norm": 2.71875, + "learning_rate": 4.921302248422665e-05, + "loss": 0.8524, + "step": 4674 + }, + { + "epoch": 0.08288685925985012, + "grad_norm": 2.84375, + "learning_rate": 4.9212327108990376e-05, + "loss": 0.837, + "step": 4676 + }, + { + "epoch": 0.0829223112954617, + "grad_norm": 2.828125, + "learning_rate": 4.92116314315893e-05, + "loss": 0.8133, + "step": 4678 + }, + { + "epoch": 0.08295776333107327, + "grad_norm": 2.734375, + "learning_rate": 4.921093545203211e-05, + "loss": 0.8389, + "step": 4680 + }, + { + "epoch": 0.08299321536668483, + "grad_norm": 2.765625, + "learning_rate": 4.921023917032749e-05, + "loss": 0.8538, + "step": 4682 + }, + { + "epoch": 0.0830286674022964, + "grad_norm": 2.484375, + "learning_rate": 4.920954258648413e-05, + "loss": 0.8015, + "step": 4684 + }, + { + "epoch": 0.08306411943790798, + "grad_norm": 2.796875, + "learning_rate": 4.9208845700510707e-05, + "loss": 0.8108, + "step": 4686 + }, + { + "epoch": 0.08309957147351954, + "grad_norm": 2.640625, + "learning_rate": 4.920814851241595e-05, + "loss": 0.8173, + "step": 4688 + }, + { + "epoch": 0.08313502350913111, + "grad_norm": 2.703125, + "learning_rate": 4.9207451022208525e-05, + "loss": 0.8193, + "step": 4690 + }, + { + "epoch": 0.08317047554474269, + "grad_norm": 2.71875, + "learning_rate": 4.9206753229897165e-05, + "loss": 0.8298, + "step": 4692 + }, + { + "epoch": 0.08320592758035425, + "grad_norm": 2.765625, + "learning_rate": 4.9206055135490563e-05, + "loss": 0.8196, + "step": 4694 + }, + { + "epoch": 0.08324137961596582, + "grad_norm": 2.671875, + "learning_rate": 4.920535673899743e-05, + "loss": 0.8335, + "step": 4696 + }, + { + "epoch": 0.0832768316515774, + "grad_norm": 2.796875, + "learning_rate": 4.9204658040426496e-05, + "loss": 0.8865, + "step": 4698 + }, + { + "epoch": 0.08331228368718896, + "grad_norm": 2.484375, + "learning_rate": 4.9203959039786465e-05, + "loss": 0.82, + "step": 4700 + }, + { + "epoch": 0.08334773572280053, + "grad_norm": 2.875, + "learning_rate": 4.920325973708607e-05, + "loss": 0.8465, + "step": 4702 + }, + { + "epoch": 0.08338318775841211, + "grad_norm": 2.5, + "learning_rate": 4.920256013233403e-05, + "loss": 0.8028, + "step": 4704 + }, + { + "epoch": 0.08341863979402367, + "grad_norm": 2.75, + "learning_rate": 4.920186022553909e-05, + "loss": 0.838, + "step": 4706 + }, + { + "epoch": 0.08345409182963524, + "grad_norm": 2.75, + "learning_rate": 4.9201160016709964e-05, + "loss": 0.8395, + "step": 4708 + }, + { + "epoch": 0.08348954386524682, + "grad_norm": 2.984375, + "learning_rate": 4.920045950585541e-05, + "loss": 0.8705, + "step": 4710 + }, + { + "epoch": 0.08352499590085838, + "grad_norm": 2.71875, + "learning_rate": 4.919975869298416e-05, + "loss": 0.8168, + "step": 4712 + }, + { + "epoch": 0.08356044793646995, + "grad_norm": 2.8125, + "learning_rate": 4.919905757810496e-05, + "loss": 0.8021, + "step": 4714 + }, + { + "epoch": 0.08359589997208153, + "grad_norm": 3.125, + "learning_rate": 4.9198356161226555e-05, + "loss": 0.8569, + "step": 4716 + }, + { + "epoch": 0.08363135200769309, + "grad_norm": 2.53125, + "learning_rate": 4.919765444235771e-05, + "loss": 0.8266, + "step": 4718 + }, + { + "epoch": 0.08366680404330466, + "grad_norm": 2.75, + "learning_rate": 4.919695242150718e-05, + "loss": 0.8239, + "step": 4720 + }, + { + "epoch": 0.08370225607891624, + "grad_norm": 2.859375, + "learning_rate": 4.919625009868373e-05, + "loss": 0.8637, + "step": 4722 + }, + { + "epoch": 0.0837377081145278, + "grad_norm": 2.9375, + "learning_rate": 4.919554747389611e-05, + "loss": 0.8091, + "step": 4724 + }, + { + "epoch": 0.08377316015013937, + "grad_norm": 2.921875, + "learning_rate": 4.9194844547153095e-05, + "loss": 0.8482, + "step": 4726 + }, + { + "epoch": 0.08380861218575095, + "grad_norm": 2.921875, + "learning_rate": 4.919414131846346e-05, + "loss": 0.8627, + "step": 4728 + }, + { + "epoch": 0.08384406422136251, + "grad_norm": 2.75, + "learning_rate": 4.9193437787835987e-05, + "loss": 0.8841, + "step": 4730 + }, + { + "epoch": 0.08387951625697408, + "grad_norm": 3.015625, + "learning_rate": 4.9192733955279446e-05, + "loss": 0.8262, + "step": 4732 + }, + { + "epoch": 0.08391496829258566, + "grad_norm": 2.921875, + "learning_rate": 4.919202982080262e-05, + "loss": 0.8459, + "step": 4734 + }, + { + "epoch": 0.08395042032819722, + "grad_norm": 2.75, + "learning_rate": 4.919132538441431e-05, + "loss": 0.8286, + "step": 4736 + }, + { + "epoch": 0.08398587236380879, + "grad_norm": 2.4375, + "learning_rate": 4.919062064612329e-05, + "loss": 0.834, + "step": 4738 + }, + { + "epoch": 0.08402132439942035, + "grad_norm": 2.78125, + "learning_rate": 4.918991560593837e-05, + "loss": 0.8834, + "step": 4740 + }, + { + "epoch": 0.08405677643503193, + "grad_norm": 2.625, + "learning_rate": 4.9189210263868335e-05, + "loss": 0.7786, + "step": 4742 + }, + { + "epoch": 0.0840922284706435, + "grad_norm": 3.015625, + "learning_rate": 4.9188504619922e-05, + "loss": 0.8179, + "step": 4744 + }, + { + "epoch": 0.08412768050625506, + "grad_norm": 3.0625, + "learning_rate": 4.918779867410817e-05, + "loss": 0.8539, + "step": 4746 + }, + { + "epoch": 0.08416313254186664, + "grad_norm": 2.578125, + "learning_rate": 4.9187092426435634e-05, + "loss": 0.8234, + "step": 4748 + }, + { + "epoch": 0.08419858457747821, + "grad_norm": 2.671875, + "learning_rate": 4.918638587691323e-05, + "loss": 0.841, + "step": 4750 + }, + { + "epoch": 0.08423403661308977, + "grad_norm": 3.4375, + "learning_rate": 4.918567902554977e-05, + "loss": 0.8399, + "step": 4752 + }, + { + "epoch": 0.08426948864870135, + "grad_norm": 2.65625, + "learning_rate": 4.918497187235407e-05, + "loss": 0.8286, + "step": 4754 + }, + { + "epoch": 0.08430494068431292, + "grad_norm": 2.859375, + "learning_rate": 4.918426441733496e-05, + "loss": 0.8949, + "step": 4756 + }, + { + "epoch": 0.08434039271992448, + "grad_norm": 2.9375, + "learning_rate": 4.918355666050127e-05, + "loss": 0.8858, + "step": 4758 + }, + { + "epoch": 0.08437584475553606, + "grad_norm": 2.828125, + "learning_rate": 4.918284860186183e-05, + "loss": 0.8442, + "step": 4760 + }, + { + "epoch": 0.08441129679114763, + "grad_norm": 2.796875, + "learning_rate": 4.918214024142547e-05, + "loss": 0.859, + "step": 4762 + }, + { + "epoch": 0.08444674882675919, + "grad_norm": 2.875, + "learning_rate": 4.918143157920104e-05, + "loss": 0.8918, + "step": 4764 + }, + { + "epoch": 0.08448220086237077, + "grad_norm": 2.765625, + "learning_rate": 4.918072261519738e-05, + "loss": 0.8376, + "step": 4766 + }, + { + "epoch": 0.08451765289798234, + "grad_norm": 2.890625, + "learning_rate": 4.9180013349423346e-05, + "loss": 0.8575, + "step": 4768 + }, + { + "epoch": 0.0845531049335939, + "grad_norm": 2.71875, + "learning_rate": 4.917930378188778e-05, + "loss": 0.8454, + "step": 4770 + }, + { + "epoch": 0.08458855696920548, + "grad_norm": 2.859375, + "learning_rate": 4.917859391259952e-05, + "loss": 0.8589, + "step": 4772 + }, + { + "epoch": 0.08462400900481705, + "grad_norm": 2.578125, + "learning_rate": 4.917788374156747e-05, + "loss": 0.8413, + "step": 4774 + }, + { + "epoch": 0.08465946104042861, + "grad_norm": 2.546875, + "learning_rate": 4.917717326880045e-05, + "loss": 0.8128, + "step": 4776 + }, + { + "epoch": 0.08469491307604018, + "grad_norm": 3.265625, + "learning_rate": 4.917646249430735e-05, + "loss": 0.8709, + "step": 4778 + }, + { + "epoch": 0.08473036511165176, + "grad_norm": 2.46875, + "learning_rate": 4.917575141809703e-05, + "loss": 0.8053, + "step": 4780 + }, + { + "epoch": 0.08476581714726332, + "grad_norm": 2.578125, + "learning_rate": 4.917504004017837e-05, + "loss": 0.8852, + "step": 4782 + }, + { + "epoch": 0.0848012691828749, + "grad_norm": 2.734375, + "learning_rate": 4.917432836056025e-05, + "loss": 0.8599, + "step": 4784 + }, + { + "epoch": 0.08483672121848647, + "grad_norm": 2.6875, + "learning_rate": 4.917361637925154e-05, + "loss": 0.8266, + "step": 4786 + }, + { + "epoch": 0.08487217325409803, + "grad_norm": 2.765625, + "learning_rate": 4.9172904096261136e-05, + "loss": 0.8163, + "step": 4788 + }, + { + "epoch": 0.0849076252897096, + "grad_norm": 3.0625, + "learning_rate": 4.917219151159792e-05, + "loss": 0.856, + "step": 4790 + }, + { + "epoch": 0.08494307732532118, + "grad_norm": 2.953125, + "learning_rate": 4.917147862527079e-05, + "loss": 0.8377, + "step": 4792 + }, + { + "epoch": 0.08497852936093274, + "grad_norm": 2.71875, + "learning_rate": 4.9170765437288644e-05, + "loss": 0.8261, + "step": 4794 + }, + { + "epoch": 0.08501398139654431, + "grad_norm": 2.9375, + "learning_rate": 4.917005194766038e-05, + "loss": 0.8279, + "step": 4796 + }, + { + "epoch": 0.08504943343215589, + "grad_norm": 3.109375, + "learning_rate": 4.9169338156394904e-05, + "loss": 0.8655, + "step": 4798 + }, + { + "epoch": 0.08508488546776745, + "grad_norm": 2.90625, + "learning_rate": 4.916862406350112e-05, + "loss": 0.8507, + "step": 4800 + }, + { + "epoch": 0.08512033750337902, + "grad_norm": 2.671875, + "learning_rate": 4.9167909668987935e-05, + "loss": 0.8584, + "step": 4802 + }, + { + "epoch": 0.0851557895389906, + "grad_norm": 2.6875, + "learning_rate": 4.9167194972864275e-05, + "loss": 0.8287, + "step": 4804 + }, + { + "epoch": 0.08519124157460216, + "grad_norm": 2.828125, + "learning_rate": 4.916647997513906e-05, + "loss": 0.8062, + "step": 4806 + }, + { + "epoch": 0.08522669361021373, + "grad_norm": 3.03125, + "learning_rate": 4.91657646758212e-05, + "loss": 0.8694, + "step": 4808 + }, + { + "epoch": 0.08526214564582531, + "grad_norm": 3.109375, + "learning_rate": 4.9165049074919646e-05, + "loss": 0.8329, + "step": 4810 + }, + { + "epoch": 0.08529759768143687, + "grad_norm": 2.546875, + "learning_rate": 4.91643331724433e-05, + "loss": 0.8391, + "step": 4812 + }, + { + "epoch": 0.08533304971704844, + "grad_norm": 2.625, + "learning_rate": 4.91636169684011e-05, + "loss": 0.8347, + "step": 4814 + }, + { + "epoch": 0.08536850175266002, + "grad_norm": 2.78125, + "learning_rate": 4.916290046280201e-05, + "loss": 0.8175, + "step": 4816 + }, + { + "epoch": 0.08540395378827158, + "grad_norm": 2.84375, + "learning_rate": 4.916218365565495e-05, + "loss": 0.8785, + "step": 4818 + }, + { + "epoch": 0.08543940582388315, + "grad_norm": 2.703125, + "learning_rate": 4.916146654696887e-05, + "loss": 0.8315, + "step": 4820 + }, + { + "epoch": 0.08547485785949473, + "grad_norm": 2.5625, + "learning_rate": 4.916074913675272e-05, + "loss": 0.8088, + "step": 4822 + }, + { + "epoch": 0.08551030989510629, + "grad_norm": 2.4375, + "learning_rate": 4.916003142501545e-05, + "loss": 0.818, + "step": 4824 + }, + { + "epoch": 0.08554576193071786, + "grad_norm": 2.703125, + "learning_rate": 4.915931341176603e-05, + "loss": 0.8604, + "step": 4826 + }, + { + "epoch": 0.08558121396632942, + "grad_norm": 2.921875, + "learning_rate": 4.915859509701341e-05, + "loss": 0.8509, + "step": 4828 + }, + { + "epoch": 0.085616666001941, + "grad_norm": 2.796875, + "learning_rate": 4.915787648076654e-05, + "loss": 0.8354, + "step": 4830 + }, + { + "epoch": 0.08565211803755257, + "grad_norm": 2.828125, + "learning_rate": 4.9157157563034414e-05, + "loss": 0.858, + "step": 4832 + }, + { + "epoch": 0.08568757007316413, + "grad_norm": 2.875, + "learning_rate": 4.915643834382599e-05, + "loss": 0.8217, + "step": 4834 + }, + { + "epoch": 0.0857230221087757, + "grad_norm": 2.796875, + "learning_rate": 4.915571882315024e-05, + "loss": 0.839, + "step": 4836 + }, + { + "epoch": 0.08575847414438728, + "grad_norm": 2.765625, + "learning_rate": 4.9154999001016165e-05, + "loss": 0.8276, + "step": 4838 + }, + { + "epoch": 0.08579392617999884, + "grad_norm": 2.65625, + "learning_rate": 4.915427887743273e-05, + "loss": 0.8278, + "step": 4840 + }, + { + "epoch": 0.08582937821561042, + "grad_norm": 2.734375, + "learning_rate": 4.915355845240892e-05, + "loss": 0.8404, + "step": 4842 + }, + { + "epoch": 0.08586483025122199, + "grad_norm": 2.765625, + "learning_rate": 4.915283772595373e-05, + "loss": 0.8471, + "step": 4844 + }, + { + "epoch": 0.08590028228683355, + "grad_norm": 2.734375, + "learning_rate": 4.915211669807616e-05, + "loss": 0.8204, + "step": 4846 + }, + { + "epoch": 0.08593573432244513, + "grad_norm": 2.796875, + "learning_rate": 4.915139536878521e-05, + "loss": 0.8382, + "step": 4848 + }, + { + "epoch": 0.0859711863580567, + "grad_norm": 2.6875, + "learning_rate": 4.915067373808987e-05, + "loss": 0.8489, + "step": 4850 + }, + { + "epoch": 0.08600663839366826, + "grad_norm": 2.765625, + "learning_rate": 4.914995180599915e-05, + "loss": 0.8831, + "step": 4852 + }, + { + "epoch": 0.08604209042927984, + "grad_norm": 2.890625, + "learning_rate": 4.914922957252206e-05, + "loss": 0.8454, + "step": 4854 + }, + { + "epoch": 0.08607754246489141, + "grad_norm": 2.78125, + "learning_rate": 4.914850703766762e-05, + "loss": 0.8631, + "step": 4856 + }, + { + "epoch": 0.08611299450050297, + "grad_norm": 2.84375, + "learning_rate": 4.914778420144484e-05, + "loss": 0.8594, + "step": 4858 + }, + { + "epoch": 0.08614844653611455, + "grad_norm": 2.859375, + "learning_rate": 4.914706106386274e-05, + "loss": 0.8227, + "step": 4860 + }, + { + "epoch": 0.08618389857172612, + "grad_norm": 2.65625, + "learning_rate": 4.914633762493035e-05, + "loss": 0.8217, + "step": 4862 + }, + { + "epoch": 0.08621935060733768, + "grad_norm": 2.703125, + "learning_rate": 4.91456138846567e-05, + "loss": 0.8461, + "step": 4864 + }, + { + "epoch": 0.08625480264294925, + "grad_norm": 2.65625, + "learning_rate": 4.914488984305081e-05, + "loss": 0.8445, + "step": 4866 + }, + { + "epoch": 0.08629025467856083, + "grad_norm": 2.671875, + "learning_rate": 4.914416550012173e-05, + "loss": 0.8095, + "step": 4868 + }, + { + "epoch": 0.08632570671417239, + "grad_norm": 2.78125, + "learning_rate": 4.9143440855878496e-05, + "loss": 0.8082, + "step": 4870 + }, + { + "epoch": 0.08636115874978396, + "grad_norm": 2.625, + "learning_rate": 4.914271591033014e-05, + "loss": 0.8297, + "step": 4872 + }, + { + "epoch": 0.08639661078539554, + "grad_norm": 2.625, + "learning_rate": 4.914199066348573e-05, + "loss": 0.8541, + "step": 4874 + }, + { + "epoch": 0.0864320628210071, + "grad_norm": 2.828125, + "learning_rate": 4.9141265115354294e-05, + "loss": 0.804, + "step": 4876 + }, + { + "epoch": 0.08646751485661867, + "grad_norm": 2.734375, + "learning_rate": 4.9140539265944894e-05, + "loss": 0.8892, + "step": 4878 + }, + { + "epoch": 0.08650296689223025, + "grad_norm": 2.75, + "learning_rate": 4.9139813115266604e-05, + "loss": 0.8561, + "step": 4880 + }, + { + "epoch": 0.08653841892784181, + "grad_norm": 2.671875, + "learning_rate": 4.913908666332847e-05, + "loss": 0.8416, + "step": 4882 + }, + { + "epoch": 0.08657387096345338, + "grad_norm": 2.921875, + "learning_rate": 4.9138359910139556e-05, + "loss": 0.8247, + "step": 4884 + }, + { + "epoch": 0.08660932299906496, + "grad_norm": 2.65625, + "learning_rate": 4.9137632855708946e-05, + "loss": 0.8197, + "step": 4886 + }, + { + "epoch": 0.08664477503467652, + "grad_norm": 2.890625, + "learning_rate": 4.9136905500045705e-05, + "loss": 0.8617, + "step": 4888 + }, + { + "epoch": 0.0866802270702881, + "grad_norm": 2.734375, + "learning_rate": 4.9136177843158906e-05, + "loss": 0.8368, + "step": 4890 + }, + { + "epoch": 0.08671567910589967, + "grad_norm": 2.8125, + "learning_rate": 4.9135449885057636e-05, + "loss": 0.8302, + "step": 4892 + }, + { + "epoch": 0.08675113114151123, + "grad_norm": 2.53125, + "learning_rate": 4.913472162575098e-05, + "loss": 0.8489, + "step": 4894 + }, + { + "epoch": 0.0867865831771228, + "grad_norm": 2.875, + "learning_rate": 4.913399306524803e-05, + "loss": 0.8218, + "step": 4896 + }, + { + "epoch": 0.08682203521273438, + "grad_norm": 2.859375, + "learning_rate": 4.913326420355787e-05, + "loss": 0.8156, + "step": 4898 + }, + { + "epoch": 0.08685748724834594, + "grad_norm": 2.46875, + "learning_rate": 4.913253504068959e-05, + "loss": 0.8496, + "step": 4900 + }, + { + "epoch": 0.08689293928395751, + "grad_norm": 2.953125, + "learning_rate": 4.9131805576652315e-05, + "loss": 0.8293, + "step": 4902 + }, + { + "epoch": 0.08692839131956909, + "grad_norm": 2.75, + "learning_rate": 4.913107581145512e-05, + "loss": 0.8324, + "step": 4904 + }, + { + "epoch": 0.08696384335518065, + "grad_norm": 2.6875, + "learning_rate": 4.913034574510713e-05, + "loss": 0.8057, + "step": 4906 + }, + { + "epoch": 0.08699929539079222, + "grad_norm": 2.859375, + "learning_rate": 4.9129615377617445e-05, + "loss": 0.8315, + "step": 4908 + }, + { + "epoch": 0.08703474742640378, + "grad_norm": 3.0625, + "learning_rate": 4.912888470899519e-05, + "loss": 0.8528, + "step": 4910 + }, + { + "epoch": 0.08707019946201536, + "grad_norm": 3.0625, + "learning_rate": 4.912815373924948e-05, + "loss": 0.8459, + "step": 4912 + }, + { + "epoch": 0.08710565149762693, + "grad_norm": 3.484375, + "learning_rate": 4.912742246838944e-05, + "loss": 0.8431, + "step": 4914 + }, + { + "epoch": 0.08714110353323849, + "grad_norm": 2.703125, + "learning_rate": 4.9126690896424195e-05, + "loss": 0.8502, + "step": 4916 + }, + { + "epoch": 0.08717655556885007, + "grad_norm": 2.875, + "learning_rate": 4.912595902336287e-05, + "loss": 0.8689, + "step": 4918 + }, + { + "epoch": 0.08721200760446164, + "grad_norm": 2.890625, + "learning_rate": 4.912522684921459e-05, + "loss": 0.8495, + "step": 4920 + }, + { + "epoch": 0.0872474596400732, + "grad_norm": 2.875, + "learning_rate": 4.9124494373988516e-05, + "loss": 0.8241, + "step": 4922 + }, + { + "epoch": 0.08728291167568478, + "grad_norm": 2.84375, + "learning_rate": 4.912376159769378e-05, + "loss": 0.8331, + "step": 4924 + }, + { + "epoch": 0.08731836371129635, + "grad_norm": 2.625, + "learning_rate": 4.912302852033952e-05, + "loss": 0.8012, + "step": 4926 + }, + { + "epoch": 0.08735381574690791, + "grad_norm": 2.5625, + "learning_rate": 4.912229514193488e-05, + "loss": 0.8198, + "step": 4928 + }, + { + "epoch": 0.08738926778251949, + "grad_norm": 2.90625, + "learning_rate": 4.912156146248903e-05, + "loss": 0.812, + "step": 4930 + }, + { + "epoch": 0.08742471981813106, + "grad_norm": 2.875, + "learning_rate": 4.912082748201112e-05, + "loss": 0.8546, + "step": 4932 + }, + { + "epoch": 0.08746017185374262, + "grad_norm": 2.921875, + "learning_rate": 4.9120093200510295e-05, + "loss": 0.8525, + "step": 4934 + }, + { + "epoch": 0.0874956238893542, + "grad_norm": 2.53125, + "learning_rate": 4.911935861799574e-05, + "loss": 0.8227, + "step": 4936 + }, + { + "epoch": 0.08753107592496577, + "grad_norm": 3.171875, + "learning_rate": 4.911862373447661e-05, + "loss": 0.7985, + "step": 4938 + }, + { + "epoch": 0.08756652796057733, + "grad_norm": 2.65625, + "learning_rate": 4.911788854996209e-05, + "loss": 0.8347, + "step": 4940 + }, + { + "epoch": 0.0876019799961889, + "grad_norm": 2.84375, + "learning_rate": 4.911715306446133e-05, + "loss": 0.8388, + "step": 4942 + }, + { + "epoch": 0.08763743203180048, + "grad_norm": 3.0, + "learning_rate": 4.911641727798353e-05, + "loss": 0.8764, + "step": 4944 + }, + { + "epoch": 0.08767288406741204, + "grad_norm": 2.96875, + "learning_rate": 4.911568119053786e-05, + "loss": 0.8823, + "step": 4946 + }, + { + "epoch": 0.08770833610302362, + "grad_norm": 3.125, + "learning_rate": 4.911494480213351e-05, + "loss": 0.8387, + "step": 4948 + }, + { + "epoch": 0.08774378813863519, + "grad_norm": 2.671875, + "learning_rate": 4.911420811277968e-05, + "loss": 0.7991, + "step": 4950 + }, + { + "epoch": 0.08777924017424675, + "grad_norm": 2.65625, + "learning_rate": 4.9113471122485556e-05, + "loss": 0.805, + "step": 4952 + }, + { + "epoch": 0.08781469220985832, + "grad_norm": 2.671875, + "learning_rate": 4.9112733831260325e-05, + "loss": 0.8836, + "step": 4954 + }, + { + "epoch": 0.0878501442454699, + "grad_norm": 2.84375, + "learning_rate": 4.911199623911321e-05, + "loss": 0.8398, + "step": 4956 + }, + { + "epoch": 0.08788559628108146, + "grad_norm": 2.953125, + "learning_rate": 4.911125834605339e-05, + "loss": 0.837, + "step": 4958 + }, + { + "epoch": 0.08792104831669303, + "grad_norm": 2.9375, + "learning_rate": 4.9110520152090104e-05, + "loss": 0.7892, + "step": 4960 + }, + { + "epoch": 0.08795650035230461, + "grad_norm": 3.046875, + "learning_rate": 4.910978165723253e-05, + "loss": 0.8841, + "step": 4962 + }, + { + "epoch": 0.08799195238791617, + "grad_norm": 3.546875, + "learning_rate": 4.910904286148992e-05, + "loss": 0.8603, + "step": 4964 + }, + { + "epoch": 0.08802740442352774, + "grad_norm": 2.796875, + "learning_rate": 4.910830376487146e-05, + "loss": 0.8726, + "step": 4966 + }, + { + "epoch": 0.08806285645913932, + "grad_norm": 2.59375, + "learning_rate": 4.9107564367386404e-05, + "loss": 0.8372, + "step": 4968 + }, + { + "epoch": 0.08809830849475088, + "grad_norm": 2.5625, + "learning_rate": 4.9106824669043964e-05, + "loss": 0.8159, + "step": 4970 + }, + { + "epoch": 0.08813376053036245, + "grad_norm": 2.9375, + "learning_rate": 4.910608466985337e-05, + "loss": 0.8235, + "step": 4972 + }, + { + "epoch": 0.08816921256597403, + "grad_norm": 2.765625, + "learning_rate": 4.910534436982386e-05, + "loss": 0.8469, + "step": 4974 + }, + { + "epoch": 0.08820466460158559, + "grad_norm": 2.71875, + "learning_rate": 4.910460376896468e-05, + "loss": 0.8395, + "step": 4976 + }, + { + "epoch": 0.08824011663719716, + "grad_norm": 3.3125, + "learning_rate": 4.9103862867285056e-05, + "loss": 0.8427, + "step": 4978 + }, + { + "epoch": 0.08827556867280874, + "grad_norm": 2.53125, + "learning_rate": 4.910312166479425e-05, + "loss": 0.8158, + "step": 4980 + }, + { + "epoch": 0.0883110207084203, + "grad_norm": 2.921875, + "learning_rate": 4.910238016150151e-05, + "loss": 0.8614, + "step": 4982 + }, + { + "epoch": 0.08834647274403187, + "grad_norm": 2.703125, + "learning_rate": 4.910163835741607e-05, + "loss": 0.8538, + "step": 4984 + }, + { + "epoch": 0.08838192477964345, + "grad_norm": 2.703125, + "learning_rate": 4.9100896252547215e-05, + "loss": 0.8273, + "step": 4986 + }, + { + "epoch": 0.08841737681525501, + "grad_norm": 2.625, + "learning_rate": 4.91001538469042e-05, + "loss": 0.792, + "step": 4988 + }, + { + "epoch": 0.08845282885086658, + "grad_norm": 2.578125, + "learning_rate": 4.909941114049627e-05, + "loss": 0.8621, + "step": 4990 + }, + { + "epoch": 0.08848828088647816, + "grad_norm": 2.6875, + "learning_rate": 4.9098668133332714e-05, + "loss": 0.8404, + "step": 4992 + }, + { + "epoch": 0.08852373292208972, + "grad_norm": 2.71875, + "learning_rate": 4.90979248254228e-05, + "loss": 0.8261, + "step": 4994 + }, + { + "epoch": 0.08855918495770129, + "grad_norm": 2.75, + "learning_rate": 4.9097181216775805e-05, + "loss": 0.8467, + "step": 4996 + }, + { + "epoch": 0.08859463699331285, + "grad_norm": 2.703125, + "learning_rate": 4.9096437307401004e-05, + "loss": 0.8357, + "step": 4998 + }, + { + "epoch": 0.08863008902892443, + "grad_norm": 2.625, + "learning_rate": 4.909569309730769e-05, + "loss": 0.8499, + "step": 5000 + }, + { + "epoch": 0.088665541064536, + "grad_norm": 3.09375, + "learning_rate": 4.909494858650514e-05, + "loss": 0.8532, + "step": 5002 + }, + { + "epoch": 0.08870099310014756, + "grad_norm": 3.0625, + "learning_rate": 4.909420377500266e-05, + "loss": 0.8132, + "step": 5004 + }, + { + "epoch": 0.08873644513575914, + "grad_norm": 2.96875, + "learning_rate": 4.909345866280952e-05, + "loss": 0.8429, + "step": 5006 + }, + { + "epoch": 0.08877189717137071, + "grad_norm": 2.796875, + "learning_rate": 4.909271324993504e-05, + "loss": 0.8266, + "step": 5008 + }, + { + "epoch": 0.08880734920698227, + "grad_norm": 2.65625, + "learning_rate": 4.909196753638852e-05, + "loss": 0.8255, + "step": 5010 + }, + { + "epoch": 0.08884280124259385, + "grad_norm": 2.953125, + "learning_rate": 4.9091221522179264e-05, + "loss": 0.8205, + "step": 5012 + }, + { + "epoch": 0.08887825327820542, + "grad_norm": 2.546875, + "learning_rate": 4.909047520731658e-05, + "loss": 0.8872, + "step": 5014 + }, + { + "epoch": 0.08891370531381698, + "grad_norm": 2.625, + "learning_rate": 4.908972859180978e-05, + "loss": 0.8599, + "step": 5016 + }, + { + "epoch": 0.08894915734942856, + "grad_norm": 2.703125, + "learning_rate": 4.9088981675668185e-05, + "loss": 0.8741, + "step": 5018 + }, + { + "epoch": 0.08898460938504013, + "grad_norm": 2.9375, + "learning_rate": 4.9088234458901114e-05, + "loss": 0.839, + "step": 5020 + }, + { + "epoch": 0.08902006142065169, + "grad_norm": 2.84375, + "learning_rate": 4.90874869415179e-05, + "loss": 0.8378, + "step": 5022 + }, + { + "epoch": 0.08905551345626327, + "grad_norm": 2.9375, + "learning_rate": 4.908673912352786e-05, + "loss": 0.8216, + "step": 5024 + }, + { + "epoch": 0.08909096549187484, + "grad_norm": 2.796875, + "learning_rate": 4.908599100494034e-05, + "loss": 0.8167, + "step": 5026 + }, + { + "epoch": 0.0891264175274864, + "grad_norm": 2.59375, + "learning_rate": 4.908524258576467e-05, + "loss": 0.8109, + "step": 5028 + }, + { + "epoch": 0.08916186956309798, + "grad_norm": 2.453125, + "learning_rate": 4.908449386601019e-05, + "loss": 0.8242, + "step": 5030 + }, + { + "epoch": 0.08919732159870955, + "grad_norm": 2.859375, + "learning_rate": 4.908374484568623e-05, + "loss": 0.8618, + "step": 5032 + }, + { + "epoch": 0.08923277363432111, + "grad_norm": 3.046875, + "learning_rate": 4.9082995524802157e-05, + "loss": 0.8933, + "step": 5034 + }, + { + "epoch": 0.08926822566993269, + "grad_norm": 2.828125, + "learning_rate": 4.908224590336732e-05, + "loss": 0.9062, + "step": 5036 + }, + { + "epoch": 0.08930367770554426, + "grad_norm": 2.703125, + "learning_rate": 4.908149598139107e-05, + "loss": 0.81, + "step": 5038 + }, + { + "epoch": 0.08933912974115582, + "grad_norm": 2.84375, + "learning_rate": 4.908074575888276e-05, + "loss": 0.8303, + "step": 5040 + }, + { + "epoch": 0.0893745817767674, + "grad_norm": 2.5, + "learning_rate": 4.907999523585176e-05, + "loss": 0.836, + "step": 5042 + }, + { + "epoch": 0.08941003381237897, + "grad_norm": 2.59375, + "learning_rate": 4.907924441230743e-05, + "loss": 0.7865, + "step": 5044 + }, + { + "epoch": 0.08944548584799053, + "grad_norm": 2.859375, + "learning_rate": 4.907849328825916e-05, + "loss": 0.8685, + "step": 5046 + }, + { + "epoch": 0.0894809378836021, + "grad_norm": 2.671875, + "learning_rate": 4.9077741863716296e-05, + "loss": 0.8481, + "step": 5048 + }, + { + "epoch": 0.08951638991921368, + "grad_norm": 2.640625, + "learning_rate": 4.907699013868824e-05, + "loss": 0.8459, + "step": 5050 + }, + { + "epoch": 0.08955184195482524, + "grad_norm": 2.59375, + "learning_rate": 4.9076238113184344e-05, + "loss": 0.8189, + "step": 5052 + }, + { + "epoch": 0.08958729399043681, + "grad_norm": 2.71875, + "learning_rate": 4.9075485787214024e-05, + "loss": 0.818, + "step": 5054 + }, + { + "epoch": 0.08962274602604839, + "grad_norm": 3.15625, + "learning_rate": 4.907473316078666e-05, + "loss": 0.8193, + "step": 5056 + }, + { + "epoch": 0.08965819806165995, + "grad_norm": 3.0, + "learning_rate": 4.9073980233911635e-05, + "loss": 0.8221, + "step": 5058 + }, + { + "epoch": 0.08969365009727152, + "grad_norm": 2.75, + "learning_rate": 4.907322700659835e-05, + "loss": 0.8591, + "step": 5060 + }, + { + "epoch": 0.0897291021328831, + "grad_norm": 2.609375, + "learning_rate": 4.907247347885621e-05, + "loss": 0.8252, + "step": 5062 + }, + { + "epoch": 0.08976455416849466, + "grad_norm": 2.65625, + "learning_rate": 4.907171965069461e-05, + "loss": 0.7966, + "step": 5064 + }, + { + "epoch": 0.08980000620410623, + "grad_norm": 2.78125, + "learning_rate": 4.907096552212296e-05, + "loss": 0.82, + "step": 5066 + }, + { + "epoch": 0.08983545823971781, + "grad_norm": 3.0, + "learning_rate": 4.907021109315068e-05, + "loss": 0.8558, + "step": 5068 + }, + { + "epoch": 0.08987091027532937, + "grad_norm": 2.796875, + "learning_rate": 4.906945636378718e-05, + "loss": 0.8481, + "step": 5070 + }, + { + "epoch": 0.08990636231094094, + "grad_norm": 2.875, + "learning_rate": 4.906870133404187e-05, + "loss": 0.8009, + "step": 5072 + }, + { + "epoch": 0.08994181434655252, + "grad_norm": 2.796875, + "learning_rate": 4.906794600392419e-05, + "loss": 0.834, + "step": 5074 + }, + { + "epoch": 0.08997726638216408, + "grad_norm": 2.65625, + "learning_rate": 4.906719037344355e-05, + "loss": 0.8621, + "step": 5076 + }, + { + "epoch": 0.09001271841777565, + "grad_norm": 3.0, + "learning_rate": 4.906643444260939e-05, + "loss": 0.8749, + "step": 5078 + }, + { + "epoch": 0.09004817045338723, + "grad_norm": 2.65625, + "learning_rate": 4.906567821143114e-05, + "loss": 0.7919, + "step": 5080 + }, + { + "epoch": 0.09008362248899879, + "grad_norm": 2.890625, + "learning_rate": 4.906492167991824e-05, + "loss": 0.8311, + "step": 5082 + }, + { + "epoch": 0.09011907452461036, + "grad_norm": 3.046875, + "learning_rate": 4.906416484808013e-05, + "loss": 0.8423, + "step": 5084 + }, + { + "epoch": 0.09015452656022192, + "grad_norm": 3.15625, + "learning_rate": 4.9063407715926255e-05, + "loss": 0.8415, + "step": 5086 + }, + { + "epoch": 0.0901899785958335, + "grad_norm": 2.625, + "learning_rate": 4.906265028346606e-05, + "loss": 0.8695, + "step": 5088 + }, + { + "epoch": 0.09022543063144507, + "grad_norm": 2.765625, + "learning_rate": 4.9061892550709e-05, + "loss": 0.8689, + "step": 5090 + }, + { + "epoch": 0.09026088266705663, + "grad_norm": 2.5625, + "learning_rate": 4.906113451766454e-05, + "loss": 0.8314, + "step": 5092 + }, + { + "epoch": 0.09029633470266821, + "grad_norm": 2.78125, + "learning_rate": 4.906037618434213e-05, + "loss": 0.834, + "step": 5094 + }, + { + "epoch": 0.09033178673827978, + "grad_norm": 2.734375, + "learning_rate": 4.905961755075124e-05, + "loss": 0.8216, + "step": 5096 + }, + { + "epoch": 0.09036723877389134, + "grad_norm": 2.375, + "learning_rate": 4.905885861690133e-05, + "loss": 0.7918, + "step": 5098 + }, + { + "epoch": 0.09040269080950292, + "grad_norm": 2.5625, + "learning_rate": 4.905809938280187e-05, + "loss": 0.8244, + "step": 5100 + }, + { + "epoch": 0.09043814284511449, + "grad_norm": 2.5625, + "learning_rate": 4.905733984846235e-05, + "loss": 0.8276, + "step": 5102 + }, + { + "epoch": 0.09047359488072605, + "grad_norm": 2.953125, + "learning_rate": 4.905658001389223e-05, + "loss": 0.8623, + "step": 5104 + }, + { + "epoch": 0.09050904691633763, + "grad_norm": 2.875, + "learning_rate": 4.905581987910101e-05, + "loss": 0.812, + "step": 5106 + }, + { + "epoch": 0.0905444989519492, + "grad_norm": 2.875, + "learning_rate": 4.905505944409816e-05, + "loss": 0.8412, + "step": 5108 + }, + { + "epoch": 0.09057995098756076, + "grad_norm": 2.71875, + "learning_rate": 4.905429870889319e-05, + "loss": 0.8368, + "step": 5110 + }, + { + "epoch": 0.09061540302317234, + "grad_norm": 2.71875, + "learning_rate": 4.9053537673495576e-05, + "loss": 0.8673, + "step": 5112 + }, + { + "epoch": 0.09065085505878391, + "grad_norm": 2.78125, + "learning_rate": 4.9052776337914816e-05, + "loss": 0.8744, + "step": 5114 + }, + { + "epoch": 0.09068630709439547, + "grad_norm": 2.9375, + "learning_rate": 4.905201470216043e-05, + "loss": 0.7991, + "step": 5116 + }, + { + "epoch": 0.09072175913000705, + "grad_norm": 2.984375, + "learning_rate": 4.90512527662419e-05, + "loss": 0.8332, + "step": 5118 + }, + { + "epoch": 0.09075721116561862, + "grad_norm": 3.0, + "learning_rate": 4.9050490530168743e-05, + "loss": 0.8392, + "step": 5120 + }, + { + "epoch": 0.09079266320123018, + "grad_norm": 2.796875, + "learning_rate": 4.9049727993950486e-05, + "loss": 0.8683, + "step": 5122 + }, + { + "epoch": 0.09082811523684176, + "grad_norm": 2.671875, + "learning_rate": 4.904896515759663e-05, + "loss": 0.8318, + "step": 5124 + }, + { + "epoch": 0.09086356727245333, + "grad_norm": 2.75, + "learning_rate": 4.904820202111669e-05, + "loss": 0.8412, + "step": 5126 + }, + { + "epoch": 0.09089901930806489, + "grad_norm": 3.125, + "learning_rate": 4.90474385845202e-05, + "loss": 0.8313, + "step": 5128 + }, + { + "epoch": 0.09093447134367647, + "grad_norm": 2.984375, + "learning_rate": 4.9046674847816685e-05, + "loss": 0.8337, + "step": 5130 + }, + { + "epoch": 0.09096992337928804, + "grad_norm": 2.390625, + "learning_rate": 4.904591081101568e-05, + "loss": 0.8189, + "step": 5132 + }, + { + "epoch": 0.0910053754148996, + "grad_norm": 3.078125, + "learning_rate": 4.904514647412672e-05, + "loss": 0.8416, + "step": 5134 + }, + { + "epoch": 0.09104082745051117, + "grad_norm": 2.78125, + "learning_rate": 4.904438183715933e-05, + "loss": 0.8311, + "step": 5136 + }, + { + "epoch": 0.09107627948612275, + "grad_norm": 2.609375, + "learning_rate": 4.904361690012307e-05, + "loss": 0.8309, + "step": 5138 + }, + { + "epoch": 0.09111173152173431, + "grad_norm": 2.9375, + "learning_rate": 4.904285166302748e-05, + "loss": 0.8497, + "step": 5140 + }, + { + "epoch": 0.09114718355734588, + "grad_norm": 2.828125, + "learning_rate": 4.9042086125882104e-05, + "loss": 0.869, + "step": 5142 + }, + { + "epoch": 0.09118263559295746, + "grad_norm": 2.78125, + "learning_rate": 4.9041320288696505e-05, + "loss": 0.8205, + "step": 5144 + }, + { + "epoch": 0.09121808762856902, + "grad_norm": 2.921875, + "learning_rate": 4.9040554151480235e-05, + "loss": 0.825, + "step": 5146 + }, + { + "epoch": 0.0912535396641806, + "grad_norm": 2.859375, + "learning_rate": 4.903978771424285e-05, + "loss": 0.8514, + "step": 5148 + }, + { + "epoch": 0.09128899169979217, + "grad_norm": 2.703125, + "learning_rate": 4.903902097699393e-05, + "loss": 0.8235, + "step": 5150 + }, + { + "epoch": 0.09132444373540373, + "grad_norm": 2.78125, + "learning_rate": 4.903825393974303e-05, + "loss": 0.869, + "step": 5152 + }, + { + "epoch": 0.0913598957710153, + "grad_norm": 2.9375, + "learning_rate": 4.903748660249973e-05, + "loss": 0.8642, + "step": 5154 + }, + { + "epoch": 0.09139534780662688, + "grad_norm": 2.609375, + "learning_rate": 4.90367189652736e-05, + "loss": 0.8293, + "step": 5156 + }, + { + "epoch": 0.09143079984223844, + "grad_norm": 2.859375, + "learning_rate": 4.903595102807423e-05, + "loss": 0.8324, + "step": 5158 + }, + { + "epoch": 0.09146625187785001, + "grad_norm": 2.578125, + "learning_rate": 4.903518279091119e-05, + "loss": 0.8141, + "step": 5160 + }, + { + "epoch": 0.09150170391346159, + "grad_norm": 2.78125, + "learning_rate": 4.903441425379408e-05, + "loss": 0.8715, + "step": 5162 + }, + { + "epoch": 0.09153715594907315, + "grad_norm": 2.6875, + "learning_rate": 4.9033645416732486e-05, + "loss": 0.8393, + "step": 5164 + }, + { + "epoch": 0.09157260798468472, + "grad_norm": 2.59375, + "learning_rate": 4.903287627973601e-05, + "loss": 0.8631, + "step": 5166 + }, + { + "epoch": 0.09160806002029628, + "grad_norm": 2.65625, + "learning_rate": 4.903210684281423e-05, + "loss": 0.8291, + "step": 5168 + }, + { + "epoch": 0.09164351205590786, + "grad_norm": 2.84375, + "learning_rate": 4.903133710597677e-05, + "loss": 0.8772, + "step": 5170 + }, + { + "epoch": 0.09167896409151943, + "grad_norm": 2.96875, + "learning_rate": 4.903056706923322e-05, + "loss": 0.851, + "step": 5172 + }, + { + "epoch": 0.091714416127131, + "grad_norm": 2.703125, + "learning_rate": 4.902979673259321e-05, + "loss": 0.8378, + "step": 5174 + }, + { + "epoch": 0.09174986816274257, + "grad_norm": 2.84375, + "learning_rate": 4.902902609606634e-05, + "loss": 0.8006, + "step": 5176 + }, + { + "epoch": 0.09178532019835414, + "grad_norm": 2.765625, + "learning_rate": 4.902825515966223e-05, + "loss": 0.8601, + "step": 5178 + }, + { + "epoch": 0.0918207722339657, + "grad_norm": 2.65625, + "learning_rate": 4.902748392339049e-05, + "loss": 0.8244, + "step": 5180 + }, + { + "epoch": 0.09185622426957728, + "grad_norm": 2.6875, + "learning_rate": 4.902671238726076e-05, + "loss": 0.8326, + "step": 5182 + }, + { + "epoch": 0.09189167630518885, + "grad_norm": 3.109375, + "learning_rate": 4.902594055128267e-05, + "loss": 0.8521, + "step": 5184 + }, + { + "epoch": 0.09192712834080041, + "grad_norm": 2.65625, + "learning_rate": 4.902516841546584e-05, + "loss": 0.8358, + "step": 5186 + }, + { + "epoch": 0.09196258037641199, + "grad_norm": 2.875, + "learning_rate": 4.9024395979819915e-05, + "loss": 0.8225, + "step": 5188 + }, + { + "epoch": 0.09199803241202356, + "grad_norm": 2.625, + "learning_rate": 4.902362324435453e-05, + "loss": 0.8264, + "step": 5190 + }, + { + "epoch": 0.09203348444763512, + "grad_norm": 2.671875, + "learning_rate": 4.902285020907933e-05, + "loss": 0.8246, + "step": 5192 + }, + { + "epoch": 0.0920689364832467, + "grad_norm": 3.046875, + "learning_rate": 4.902207687400396e-05, + "loss": 0.8338, + "step": 5194 + }, + { + "epoch": 0.09210438851885827, + "grad_norm": 2.890625, + "learning_rate": 4.902130323913808e-05, + "loss": 0.8269, + "step": 5196 + }, + { + "epoch": 0.09213984055446983, + "grad_norm": 2.75, + "learning_rate": 4.9020529304491345e-05, + "loss": 0.8291, + "step": 5198 + }, + { + "epoch": 0.0921752925900814, + "grad_norm": 2.78125, + "learning_rate": 4.90197550700734e-05, + "loss": 0.8452, + "step": 5200 + }, + { + "epoch": 0.09221074462569298, + "grad_norm": 3.046875, + "learning_rate": 4.901898053589391e-05, + "loss": 0.8634, + "step": 5202 + }, + { + "epoch": 0.09224619666130454, + "grad_norm": 2.890625, + "learning_rate": 4.901820570196255e-05, + "loss": 0.8451, + "step": 5204 + }, + { + "epoch": 0.09228164869691612, + "grad_norm": 2.703125, + "learning_rate": 4.901743056828899e-05, + "loss": 0.9183, + "step": 5206 + }, + { + "epoch": 0.09231710073252769, + "grad_norm": 3.109375, + "learning_rate": 4.901665513488289e-05, + "loss": 0.8343, + "step": 5208 + }, + { + "epoch": 0.09235255276813925, + "grad_norm": 2.90625, + "learning_rate": 4.901587940175394e-05, + "loss": 0.8573, + "step": 5210 + }, + { + "epoch": 0.09238800480375083, + "grad_norm": 2.671875, + "learning_rate": 4.9015103368911816e-05, + "loss": 0.8048, + "step": 5212 + }, + { + "epoch": 0.0924234568393624, + "grad_norm": 2.75, + "learning_rate": 4.901432703636622e-05, + "loss": 0.8261, + "step": 5214 + }, + { + "epoch": 0.09245890887497396, + "grad_norm": 2.765625, + "learning_rate": 4.901355040412681e-05, + "loss": 0.8197, + "step": 5216 + }, + { + "epoch": 0.09249436091058554, + "grad_norm": 2.875, + "learning_rate": 4.901277347220329e-05, + "loss": 0.8323, + "step": 5218 + }, + { + "epoch": 0.09252981294619711, + "grad_norm": 2.796875, + "learning_rate": 4.901199624060536e-05, + "loss": 0.7934, + "step": 5220 + }, + { + "epoch": 0.09256526498180867, + "grad_norm": 2.9375, + "learning_rate": 4.901121870934272e-05, + "loss": 0.8816, + "step": 5222 + }, + { + "epoch": 0.09260071701742024, + "grad_norm": 2.984375, + "learning_rate": 4.901044087842507e-05, + "loss": 0.8536, + "step": 5224 + }, + { + "epoch": 0.09263616905303182, + "grad_norm": 2.953125, + "learning_rate": 4.9009662747862115e-05, + "loss": 0.837, + "step": 5226 + }, + { + "epoch": 0.09267162108864338, + "grad_norm": 2.65625, + "learning_rate": 4.900888431766359e-05, + "loss": 0.8126, + "step": 5228 + }, + { + "epoch": 0.09270707312425495, + "grad_norm": 2.796875, + "learning_rate": 4.900810558783917e-05, + "loss": 0.8512, + "step": 5230 + }, + { + "epoch": 0.09274252515986653, + "grad_norm": 2.59375, + "learning_rate": 4.9007326558398595e-05, + "loss": 0.8265, + "step": 5232 + }, + { + "epoch": 0.09277797719547809, + "grad_norm": 2.71875, + "learning_rate": 4.9006547229351587e-05, + "loss": 0.8007, + "step": 5234 + }, + { + "epoch": 0.09281342923108966, + "grad_norm": 2.609375, + "learning_rate": 4.9005767600707866e-05, + "loss": 0.8372, + "step": 5236 + }, + { + "epoch": 0.09284888126670124, + "grad_norm": 2.59375, + "learning_rate": 4.900498767247717e-05, + "loss": 0.8367, + "step": 5238 + }, + { + "epoch": 0.0928843333023128, + "grad_norm": 2.65625, + "learning_rate": 4.9004207444669224e-05, + "loss": 0.8489, + "step": 5240 + }, + { + "epoch": 0.09291978533792437, + "grad_norm": 2.71875, + "learning_rate": 4.900342691729378e-05, + "loss": 0.828, + "step": 5242 + }, + { + "epoch": 0.09295523737353595, + "grad_norm": 2.671875, + "learning_rate": 4.900264609036056e-05, + "loss": 0.7999, + "step": 5244 + }, + { + "epoch": 0.09299068940914751, + "grad_norm": 2.734375, + "learning_rate": 4.900186496387931e-05, + "loss": 0.831, + "step": 5246 + }, + { + "epoch": 0.09302614144475908, + "grad_norm": 2.78125, + "learning_rate": 4.900108353785979e-05, + "loss": 0.8824, + "step": 5248 + }, + { + "epoch": 0.09306159348037066, + "grad_norm": 2.828125, + "learning_rate": 4.900030181231175e-05, + "loss": 0.826, + "step": 5250 + }, + { + "epoch": 0.09309704551598222, + "grad_norm": 2.859375, + "learning_rate": 4.899951978724494e-05, + "loss": 0.8538, + "step": 5252 + }, + { + "epoch": 0.0931324975515938, + "grad_norm": 2.828125, + "learning_rate": 4.899873746266912e-05, + "loss": 0.8773, + "step": 5254 + }, + { + "epoch": 0.09316794958720535, + "grad_norm": 3.015625, + "learning_rate": 4.8997954838594055e-05, + "loss": 0.8843, + "step": 5256 + }, + { + "epoch": 0.09320340162281693, + "grad_norm": 2.890625, + "learning_rate": 4.899717191502951e-05, + "loss": 0.7928, + "step": 5258 + }, + { + "epoch": 0.0932388536584285, + "grad_norm": 2.875, + "learning_rate": 4.8996388691985265e-05, + "loss": 0.8471, + "step": 5260 + }, + { + "epoch": 0.09327430569404006, + "grad_norm": 2.703125, + "learning_rate": 4.899560516947108e-05, + "loss": 0.8468, + "step": 5262 + }, + { + "epoch": 0.09330975772965164, + "grad_norm": 2.5, + "learning_rate": 4.899482134749674e-05, + "loss": 0.8251, + "step": 5264 + }, + { + "epoch": 0.09334520976526321, + "grad_norm": 2.671875, + "learning_rate": 4.8994037226072037e-05, + "loss": 0.7813, + "step": 5266 + }, + { + "epoch": 0.09338066180087477, + "grad_norm": 2.75, + "learning_rate": 4.899325280520674e-05, + "loss": 0.8214, + "step": 5268 + }, + { + "epoch": 0.09341611383648635, + "grad_norm": 2.859375, + "learning_rate": 4.899246808491065e-05, + "loss": 0.853, + "step": 5270 + }, + { + "epoch": 0.09345156587209792, + "grad_norm": 2.796875, + "learning_rate": 4.899168306519355e-05, + "loss": 0.8129, + "step": 5272 + }, + { + "epoch": 0.09348701790770948, + "grad_norm": 2.75, + "learning_rate": 4.899089774606525e-05, + "loss": 0.8524, + "step": 5274 + }, + { + "epoch": 0.09352246994332106, + "grad_norm": 2.484375, + "learning_rate": 4.8990112127535525e-05, + "loss": 0.7781, + "step": 5276 + }, + { + "epoch": 0.09355792197893263, + "grad_norm": 2.65625, + "learning_rate": 4.898932620961422e-05, + "loss": 0.8256, + "step": 5278 + }, + { + "epoch": 0.09359337401454419, + "grad_norm": 2.609375, + "learning_rate": 4.89885399923111e-05, + "loss": 0.8313, + "step": 5280 + }, + { + "epoch": 0.09362882605015577, + "grad_norm": 2.6875, + "learning_rate": 4.8987753475636014e-05, + "loss": 0.8584, + "step": 5282 + }, + { + "epoch": 0.09366427808576734, + "grad_norm": 3.046875, + "learning_rate": 4.8986966659598756e-05, + "loss": 0.8305, + "step": 5284 + }, + { + "epoch": 0.0936997301213789, + "grad_norm": 2.59375, + "learning_rate": 4.8986179544209146e-05, + "loss": 0.7863, + "step": 5286 + }, + { + "epoch": 0.09373518215699048, + "grad_norm": 2.46875, + "learning_rate": 4.8985392129477014e-05, + "loss": 0.8056, + "step": 5288 + }, + { + "epoch": 0.09377063419260205, + "grad_norm": 2.546875, + "learning_rate": 4.898460441541218e-05, + "loss": 0.8324, + "step": 5290 + }, + { + "epoch": 0.09380608622821361, + "grad_norm": 2.875, + "learning_rate": 4.8983816402024484e-05, + "loss": 0.8534, + "step": 5292 + }, + { + "epoch": 0.09384153826382519, + "grad_norm": 2.921875, + "learning_rate": 4.8983028089323756e-05, + "loss": 0.8487, + "step": 5294 + }, + { + "epoch": 0.09387699029943676, + "grad_norm": 2.703125, + "learning_rate": 4.8982239477319824e-05, + "loss": 0.8318, + "step": 5296 + }, + { + "epoch": 0.09391244233504832, + "grad_norm": 2.890625, + "learning_rate": 4.8981450566022545e-05, + "loss": 0.8097, + "step": 5298 + }, + { + "epoch": 0.0939478943706599, + "grad_norm": 3.171875, + "learning_rate": 4.898066135544176e-05, + "loss": 0.8471, + "step": 5300 + }, + { + "epoch": 0.09398334640627147, + "grad_norm": 2.703125, + "learning_rate": 4.897987184558731e-05, + "loss": 0.8206, + "step": 5302 + }, + { + "epoch": 0.09401879844188303, + "grad_norm": 2.96875, + "learning_rate": 4.897908203646906e-05, + "loss": 0.8759, + "step": 5304 + }, + { + "epoch": 0.0940542504774946, + "grad_norm": 2.28125, + "learning_rate": 4.897829192809686e-05, + "loss": 0.7891, + "step": 5306 + }, + { + "epoch": 0.09408970251310618, + "grad_norm": 2.765625, + "learning_rate": 4.897750152048057e-05, + "loss": 0.8524, + "step": 5308 + }, + { + "epoch": 0.09412515454871774, + "grad_norm": 2.734375, + "learning_rate": 4.8976710813630056e-05, + "loss": 0.8392, + "step": 5310 + }, + { + "epoch": 0.09416060658432931, + "grad_norm": 2.828125, + "learning_rate": 4.897591980755518e-05, + "loss": 0.84, + "step": 5312 + }, + { + "epoch": 0.09419605861994089, + "grad_norm": 2.53125, + "learning_rate": 4.897512850226582e-05, + "loss": 0.7987, + "step": 5314 + }, + { + "epoch": 0.09423151065555245, + "grad_norm": 2.875, + "learning_rate": 4.8974336897771855e-05, + "loss": 0.853, + "step": 5316 + }, + { + "epoch": 0.09426696269116402, + "grad_norm": 2.765625, + "learning_rate": 4.8973544994083154e-05, + "loss": 0.8278, + "step": 5318 + }, + { + "epoch": 0.0943024147267756, + "grad_norm": 2.8125, + "learning_rate": 4.897275279120961e-05, + "loss": 0.8601, + "step": 5320 + }, + { + "epoch": 0.09433786676238716, + "grad_norm": 2.65625, + "learning_rate": 4.89719602891611e-05, + "loss": 0.8513, + "step": 5322 + }, + { + "epoch": 0.09437331879799873, + "grad_norm": 2.625, + "learning_rate": 4.8971167487947525e-05, + "loss": 0.8074, + "step": 5324 + }, + { + "epoch": 0.09440877083361031, + "grad_norm": 2.96875, + "learning_rate": 4.897037438757876e-05, + "loss": 0.8192, + "step": 5326 + }, + { + "epoch": 0.09444422286922187, + "grad_norm": 3.140625, + "learning_rate": 4.8969580988064724e-05, + "loss": 0.8364, + "step": 5328 + }, + { + "epoch": 0.09447967490483344, + "grad_norm": 2.9375, + "learning_rate": 4.896878728941531e-05, + "loss": 0.832, + "step": 5330 + }, + { + "epoch": 0.09451512694044502, + "grad_norm": 2.515625, + "learning_rate": 4.896799329164043e-05, + "loss": 0.8042, + "step": 5332 + }, + { + "epoch": 0.09455057897605658, + "grad_norm": 2.546875, + "learning_rate": 4.8967198994749966e-05, + "loss": 0.8357, + "step": 5334 + }, + { + "epoch": 0.09458603101166815, + "grad_norm": 2.96875, + "learning_rate": 4.896640439875386e-05, + "loss": 0.8761, + "step": 5336 + }, + { + "epoch": 0.09462148304727971, + "grad_norm": 2.65625, + "learning_rate": 4.896560950366202e-05, + "loss": 0.8303, + "step": 5338 + }, + { + "epoch": 0.09465693508289129, + "grad_norm": 2.84375, + "learning_rate": 4.896481430948437e-05, + "loss": 0.8626, + "step": 5340 + }, + { + "epoch": 0.09469238711850286, + "grad_norm": 2.65625, + "learning_rate": 4.896401881623083e-05, + "loss": 0.8361, + "step": 5342 + }, + { + "epoch": 0.09472783915411442, + "grad_norm": 3.03125, + "learning_rate": 4.8963223023911315e-05, + "loss": 0.8183, + "step": 5344 + }, + { + "epoch": 0.094763291189726, + "grad_norm": 3.0, + "learning_rate": 4.8962426932535775e-05, + "loss": 0.8441, + "step": 5346 + }, + { + "epoch": 0.09479874322533757, + "grad_norm": 2.8125, + "learning_rate": 4.8961630542114135e-05, + "loss": 0.8247, + "step": 5348 + }, + { + "epoch": 0.09483419526094913, + "grad_norm": 2.6875, + "learning_rate": 4.896083385265634e-05, + "loss": 0.8313, + "step": 5350 + }, + { + "epoch": 0.09486964729656071, + "grad_norm": 2.75, + "learning_rate": 4.896003686417233e-05, + "loss": 0.8515, + "step": 5352 + }, + { + "epoch": 0.09490509933217228, + "grad_norm": 2.71875, + "learning_rate": 4.895923957667204e-05, + "loss": 0.8214, + "step": 5354 + }, + { + "epoch": 0.09494055136778384, + "grad_norm": 2.875, + "learning_rate": 4.895844199016544e-05, + "loss": 0.8345, + "step": 5356 + }, + { + "epoch": 0.09497600340339542, + "grad_norm": 2.875, + "learning_rate": 4.895764410466248e-05, + "loss": 0.8197, + "step": 5358 + }, + { + "epoch": 0.09501145543900699, + "grad_norm": 2.84375, + "learning_rate": 4.89568459201731e-05, + "loss": 0.86, + "step": 5360 + }, + { + "epoch": 0.09504690747461855, + "grad_norm": 3.046875, + "learning_rate": 4.8956047436707276e-05, + "loss": 0.8541, + "step": 5362 + }, + { + "epoch": 0.09508235951023013, + "grad_norm": 2.765625, + "learning_rate": 4.8955248654274974e-05, + "loss": 0.8416, + "step": 5364 + }, + { + "epoch": 0.0951178115458417, + "grad_norm": 2.953125, + "learning_rate": 4.8954449572886154e-05, + "loss": 0.8919, + "step": 5366 + }, + { + "epoch": 0.09515326358145326, + "grad_norm": 2.96875, + "learning_rate": 4.895365019255079e-05, + "loss": 0.8213, + "step": 5368 + }, + { + "epoch": 0.09518871561706484, + "grad_norm": 2.671875, + "learning_rate": 4.895285051327887e-05, + "loss": 0.816, + "step": 5370 + }, + { + "epoch": 0.09522416765267641, + "grad_norm": 2.796875, + "learning_rate": 4.895205053508036e-05, + "loss": 0.8613, + "step": 5372 + }, + { + "epoch": 0.09525961968828797, + "grad_norm": 2.75, + "learning_rate": 4.895125025796525e-05, + "loss": 0.8261, + "step": 5374 + }, + { + "epoch": 0.09529507172389955, + "grad_norm": 2.5, + "learning_rate": 4.895044968194352e-05, + "loss": 0.81, + "step": 5376 + }, + { + "epoch": 0.09533052375951112, + "grad_norm": 2.5625, + "learning_rate": 4.894964880702517e-05, + "loss": 0.8326, + "step": 5378 + }, + { + "epoch": 0.09536597579512268, + "grad_norm": 3.28125, + "learning_rate": 4.894884763322019e-05, + "loss": 0.8157, + "step": 5380 + }, + { + "epoch": 0.09540142783073426, + "grad_norm": 2.71875, + "learning_rate": 4.894804616053858e-05, + "loss": 0.8442, + "step": 5382 + }, + { + "epoch": 0.09543687986634583, + "grad_norm": 2.765625, + "learning_rate": 4.8947244388990345e-05, + "loss": 0.8449, + "step": 5384 + }, + { + "epoch": 0.09547233190195739, + "grad_norm": 2.609375, + "learning_rate": 4.894644231858548e-05, + "loss": 0.8198, + "step": 5386 + }, + { + "epoch": 0.09550778393756897, + "grad_norm": 2.8125, + "learning_rate": 4.894563994933401e-05, + "loss": 0.8566, + "step": 5388 + }, + { + "epoch": 0.09554323597318054, + "grad_norm": 2.75, + "learning_rate": 4.8944837281245934e-05, + "loss": 0.8476, + "step": 5390 + }, + { + "epoch": 0.0955786880087921, + "grad_norm": 2.984375, + "learning_rate": 4.894403431433129e-05, + "loss": 0.8254, + "step": 5392 + }, + { + "epoch": 0.09561414004440368, + "grad_norm": 2.703125, + "learning_rate": 4.894323104860007e-05, + "loss": 0.8202, + "step": 5394 + }, + { + "epoch": 0.09564959208001525, + "grad_norm": 2.453125, + "learning_rate": 4.894242748406232e-05, + "loss": 0.8245, + "step": 5396 + }, + { + "epoch": 0.09568504411562681, + "grad_norm": 3.453125, + "learning_rate": 4.894162362072806e-05, + "loss": 0.8343, + "step": 5398 + }, + { + "epoch": 0.09572049615123839, + "grad_norm": 2.546875, + "learning_rate": 4.8940819458607323e-05, + "loss": 0.8675, + "step": 5400 + }, + { + "epoch": 0.09575594818684996, + "grad_norm": 2.75, + "learning_rate": 4.894001499771015e-05, + "loss": 0.8272, + "step": 5402 + }, + { + "epoch": 0.09579140022246152, + "grad_norm": 2.8125, + "learning_rate": 4.8939210238046577e-05, + "loss": 0.7998, + "step": 5404 + }, + { + "epoch": 0.0958268522580731, + "grad_norm": 2.609375, + "learning_rate": 4.8938405179626644e-05, + "loss": 0.8126, + "step": 5406 + }, + { + "epoch": 0.09586230429368467, + "grad_norm": 2.65625, + "learning_rate": 4.89375998224604e-05, + "loss": 0.8555, + "step": 5408 + }, + { + "epoch": 0.09589775632929623, + "grad_norm": 2.609375, + "learning_rate": 4.8936794166557895e-05, + "loss": 0.8274, + "step": 5410 + }, + { + "epoch": 0.0959332083649078, + "grad_norm": 2.828125, + "learning_rate": 4.893598821192918e-05, + "loss": 0.8555, + "step": 5412 + }, + { + "epoch": 0.09596866040051938, + "grad_norm": 2.53125, + "learning_rate": 4.893518195858433e-05, + "loss": 0.776, + "step": 5414 + }, + { + "epoch": 0.09600411243613094, + "grad_norm": 2.890625, + "learning_rate": 4.8934375406533384e-05, + "loss": 0.8853, + "step": 5416 + }, + { + "epoch": 0.09603956447174251, + "grad_norm": 2.890625, + "learning_rate": 4.8933568555786416e-05, + "loss": 0.8464, + "step": 5418 + }, + { + "epoch": 0.09607501650735409, + "grad_norm": 2.671875, + "learning_rate": 4.8932761406353506e-05, + "loss": 0.847, + "step": 5420 + }, + { + "epoch": 0.09611046854296565, + "grad_norm": 2.625, + "learning_rate": 4.893195395824472e-05, + "loss": 0.8339, + "step": 5422 + }, + { + "epoch": 0.09614592057857722, + "grad_norm": 2.703125, + "learning_rate": 4.8931146211470126e-05, + "loss": 0.8212, + "step": 5424 + }, + { + "epoch": 0.09618137261418878, + "grad_norm": 2.875, + "learning_rate": 4.8930338166039815e-05, + "loss": 0.812, + "step": 5426 + }, + { + "epoch": 0.09621682464980036, + "grad_norm": 2.921875, + "learning_rate": 4.892952982196387e-05, + "loss": 0.846, + "step": 5428 + }, + { + "epoch": 0.09625227668541193, + "grad_norm": 2.765625, + "learning_rate": 4.892872117925237e-05, + "loss": 0.8056, + "step": 5430 + }, + { + "epoch": 0.0962877287210235, + "grad_norm": 2.65625, + "learning_rate": 4.8927912237915416e-05, + "loss": 0.8156, + "step": 5432 + }, + { + "epoch": 0.09632318075663507, + "grad_norm": 2.515625, + "learning_rate": 4.8927102997963105e-05, + "loss": 0.7889, + "step": 5434 + }, + { + "epoch": 0.09635863279224664, + "grad_norm": 2.9375, + "learning_rate": 4.8926293459405524e-05, + "loss": 0.8546, + "step": 5436 + }, + { + "epoch": 0.0963940848278582, + "grad_norm": 2.609375, + "learning_rate": 4.8925483622252796e-05, + "loss": 0.8052, + "step": 5438 + }, + { + "epoch": 0.09642953686346978, + "grad_norm": 2.671875, + "learning_rate": 4.892467348651501e-05, + "loss": 0.8228, + "step": 5440 + }, + { + "epoch": 0.09646498889908135, + "grad_norm": 2.5, + "learning_rate": 4.892386305220228e-05, + "loss": 0.8544, + "step": 5442 + }, + { + "epoch": 0.09650044093469291, + "grad_norm": 2.9375, + "learning_rate": 4.892305231932473e-05, + "loss": 0.8331, + "step": 5444 + }, + { + "epoch": 0.09653589297030449, + "grad_norm": 2.96875, + "learning_rate": 4.892224128789246e-05, + "loss": 0.7923, + "step": 5446 + }, + { + "epoch": 0.09657134500591606, + "grad_norm": 2.46875, + "learning_rate": 4.8921429957915606e-05, + "loss": 0.853, + "step": 5448 + }, + { + "epoch": 0.09660679704152762, + "grad_norm": 2.6875, + "learning_rate": 4.8920618329404286e-05, + "loss": 0.8734, + "step": 5450 + }, + { + "epoch": 0.0966422490771392, + "grad_norm": 2.921875, + "learning_rate": 4.891980640236864e-05, + "loss": 0.8263, + "step": 5452 + }, + { + "epoch": 0.09667770111275077, + "grad_norm": 3.109375, + "learning_rate": 4.891899417681878e-05, + "loss": 0.8675, + "step": 5454 + }, + { + "epoch": 0.09671315314836233, + "grad_norm": 2.703125, + "learning_rate": 4.891818165276486e-05, + "loss": 0.7996, + "step": 5456 + }, + { + "epoch": 0.09674860518397391, + "grad_norm": 2.921875, + "learning_rate": 4.891736883021701e-05, + "loss": 0.8463, + "step": 5458 + }, + { + "epoch": 0.09678405721958548, + "grad_norm": 2.828125, + "learning_rate": 4.891655570918539e-05, + "loss": 0.8306, + "step": 5460 + }, + { + "epoch": 0.09681950925519704, + "grad_norm": 3.203125, + "learning_rate": 4.8915742289680136e-05, + "loss": 0.8614, + "step": 5462 + }, + { + "epoch": 0.09685496129080862, + "grad_norm": 2.703125, + "learning_rate": 4.891492857171139e-05, + "loss": 0.8186, + "step": 5464 + }, + { + "epoch": 0.09689041332642019, + "grad_norm": 2.6875, + "learning_rate": 4.891411455528932e-05, + "loss": 0.7825, + "step": 5466 + }, + { + "epoch": 0.09692586536203175, + "grad_norm": 2.828125, + "learning_rate": 4.891330024042408e-05, + "loss": 0.8715, + "step": 5468 + }, + { + "epoch": 0.09696131739764333, + "grad_norm": 2.5, + "learning_rate": 4.8912485627125835e-05, + "loss": 0.823, + "step": 5470 + }, + { + "epoch": 0.0969967694332549, + "grad_norm": 2.875, + "learning_rate": 4.891167071540475e-05, + "loss": 0.8334, + "step": 5472 + }, + { + "epoch": 0.09703222146886646, + "grad_norm": 3.0, + "learning_rate": 4.8910855505271e-05, + "loss": 0.8304, + "step": 5474 + }, + { + "epoch": 0.09706767350447804, + "grad_norm": 2.921875, + "learning_rate": 4.891003999673475e-05, + "loss": 0.8579, + "step": 5476 + }, + { + "epoch": 0.09710312554008961, + "grad_norm": 2.609375, + "learning_rate": 4.890922418980617e-05, + "loss": 0.8458, + "step": 5478 + }, + { + "epoch": 0.09713857757570117, + "grad_norm": 2.765625, + "learning_rate": 4.890840808449547e-05, + "loss": 0.7615, + "step": 5480 + }, + { + "epoch": 0.09717402961131275, + "grad_norm": 2.703125, + "learning_rate": 4.89075916808128e-05, + "loss": 0.8277, + "step": 5482 + }, + { + "epoch": 0.09720948164692432, + "grad_norm": 2.4375, + "learning_rate": 4.8906774978768376e-05, + "loss": 0.8035, + "step": 5484 + }, + { + "epoch": 0.09724493368253588, + "grad_norm": 2.6875, + "learning_rate": 4.8905957978372377e-05, + "loss": 0.8383, + "step": 5486 + }, + { + "epoch": 0.09728038571814746, + "grad_norm": 2.734375, + "learning_rate": 4.890514067963501e-05, + "loss": 0.8408, + "step": 5488 + }, + { + "epoch": 0.09731583775375903, + "grad_norm": 2.828125, + "learning_rate": 4.8904323082566456e-05, + "loss": 0.8368, + "step": 5490 + }, + { + "epoch": 0.09735128978937059, + "grad_norm": 2.6875, + "learning_rate": 4.890350518717693e-05, + "loss": 0.8349, + "step": 5492 + }, + { + "epoch": 0.09738674182498216, + "grad_norm": 2.671875, + "learning_rate": 4.890268699347664e-05, + "loss": 0.8517, + "step": 5494 + }, + { + "epoch": 0.09742219386059374, + "grad_norm": 2.859375, + "learning_rate": 4.890186850147579e-05, + "loss": 0.8121, + "step": 5496 + }, + { + "epoch": 0.0974576458962053, + "grad_norm": 2.96875, + "learning_rate": 4.890104971118461e-05, + "loss": 0.8557, + "step": 5498 + }, + { + "epoch": 0.09749309793181687, + "grad_norm": 2.640625, + "learning_rate": 4.8900230622613294e-05, + "loss": 0.8497, + "step": 5500 + }, + { + "epoch": 0.09752854996742845, + "grad_norm": 2.515625, + "learning_rate": 4.889941123577209e-05, + "loss": 0.8452, + "step": 5502 + }, + { + "epoch": 0.09756400200304001, + "grad_norm": 2.953125, + "learning_rate": 4.8898591550671205e-05, + "loss": 0.825, + "step": 5504 + }, + { + "epoch": 0.09759945403865158, + "grad_norm": 2.84375, + "learning_rate": 4.889777156732088e-05, + "loss": 0.8256, + "step": 5506 + }, + { + "epoch": 0.09763490607426314, + "grad_norm": 2.96875, + "learning_rate": 4.889695128573134e-05, + "loss": 0.8618, + "step": 5508 + }, + { + "epoch": 0.09767035810987472, + "grad_norm": 2.765625, + "learning_rate": 4.889613070591283e-05, + "loss": 0.8529, + "step": 5510 + }, + { + "epoch": 0.0977058101454863, + "grad_norm": 2.625, + "learning_rate": 4.889530982787558e-05, + "loss": 0.825, + "step": 5512 + }, + { + "epoch": 0.09774126218109785, + "grad_norm": 2.84375, + "learning_rate": 4.8894488651629844e-05, + "loss": 0.8353, + "step": 5514 + }, + { + "epoch": 0.09777671421670943, + "grad_norm": 2.734375, + "learning_rate": 4.889366717718587e-05, + "loss": 0.8779, + "step": 5516 + }, + { + "epoch": 0.097812166252321, + "grad_norm": 2.609375, + "learning_rate": 4.8892845404553897e-05, + "loss": 0.7985, + "step": 5518 + }, + { + "epoch": 0.09784761828793256, + "grad_norm": 2.984375, + "learning_rate": 4.88920233337442e-05, + "loss": 0.8459, + "step": 5520 + }, + { + "epoch": 0.09788307032354414, + "grad_norm": 2.65625, + "learning_rate": 4.8891200964767014e-05, + "loss": 0.7836, + "step": 5522 + }, + { + "epoch": 0.09791852235915571, + "grad_norm": 2.796875, + "learning_rate": 4.889037829763262e-05, + "loss": 0.8491, + "step": 5524 + }, + { + "epoch": 0.09795397439476727, + "grad_norm": 2.703125, + "learning_rate": 4.888955533235129e-05, + "loss": 0.844, + "step": 5526 + }, + { + "epoch": 0.09798942643037885, + "grad_norm": 2.625, + "learning_rate": 4.888873206893328e-05, + "loss": 0.8221, + "step": 5528 + }, + { + "epoch": 0.09802487846599042, + "grad_norm": 2.71875, + "learning_rate": 4.888790850738887e-05, + "loss": 0.8318, + "step": 5530 + }, + { + "epoch": 0.09806033050160198, + "grad_norm": 2.734375, + "learning_rate": 4.888708464772834e-05, + "loss": 0.8489, + "step": 5532 + }, + { + "epoch": 0.09809578253721356, + "grad_norm": 2.78125, + "learning_rate": 4.8886260489961963e-05, + "loss": 0.8574, + "step": 5534 + }, + { + "epoch": 0.09813123457282513, + "grad_norm": 2.765625, + "learning_rate": 4.888543603410004e-05, + "loss": 0.8249, + "step": 5536 + }, + { + "epoch": 0.09816668660843669, + "grad_norm": 3.015625, + "learning_rate": 4.888461128015283e-05, + "loss": 0.8662, + "step": 5538 + }, + { + "epoch": 0.09820213864404827, + "grad_norm": 2.921875, + "learning_rate": 4.888378622813067e-05, + "loss": 0.8338, + "step": 5540 + }, + { + "epoch": 0.09823759067965984, + "grad_norm": 2.6875, + "learning_rate": 4.888296087804383e-05, + "loss": 0.8295, + "step": 5542 + }, + { + "epoch": 0.0982730427152714, + "grad_norm": 2.75, + "learning_rate": 4.88821352299026e-05, + "loss": 0.8539, + "step": 5544 + }, + { + "epoch": 0.09830849475088298, + "grad_norm": 2.53125, + "learning_rate": 4.8881309283717305e-05, + "loss": 0.8205, + "step": 5546 + }, + { + "epoch": 0.09834394678649455, + "grad_norm": 2.375, + "learning_rate": 4.888048303949824e-05, + "loss": 0.8314, + "step": 5548 + }, + { + "epoch": 0.09837939882210611, + "grad_norm": 2.84375, + "learning_rate": 4.887965649725572e-05, + "loss": 0.878, + "step": 5550 + }, + { + "epoch": 0.09841485085771769, + "grad_norm": 2.859375, + "learning_rate": 4.887882965700006e-05, + "loss": 0.8663, + "step": 5552 + }, + { + "epoch": 0.09845030289332926, + "grad_norm": 2.46875, + "learning_rate": 4.8878002518741585e-05, + "loss": 0.785, + "step": 5554 + }, + { + "epoch": 0.09848575492894082, + "grad_norm": 2.5625, + "learning_rate": 4.8877175082490606e-05, + "loss": 0.7979, + "step": 5556 + }, + { + "epoch": 0.0985212069645524, + "grad_norm": 2.703125, + "learning_rate": 4.8876347348257454e-05, + "loss": 0.8108, + "step": 5558 + }, + { + "epoch": 0.09855665900016397, + "grad_norm": 2.59375, + "learning_rate": 4.8875519316052464e-05, + "loss": 0.8471, + "step": 5560 + }, + { + "epoch": 0.09859211103577553, + "grad_norm": 3.3125, + "learning_rate": 4.8874690985885975e-05, + "loss": 0.8127, + "step": 5562 + }, + { + "epoch": 0.0986275630713871, + "grad_norm": 2.859375, + "learning_rate": 4.88738623577683e-05, + "loss": 0.8511, + "step": 5564 + }, + { + "epoch": 0.09866301510699868, + "grad_norm": 2.65625, + "learning_rate": 4.8873033431709804e-05, + "loss": 0.8387, + "step": 5566 + }, + { + "epoch": 0.09869846714261024, + "grad_norm": 2.734375, + "learning_rate": 4.887220420772082e-05, + "loss": 0.8615, + "step": 5568 + }, + { + "epoch": 0.09873391917822182, + "grad_norm": 2.71875, + "learning_rate": 4.88713746858117e-05, + "loss": 0.8746, + "step": 5570 + }, + { + "epoch": 0.09876937121383339, + "grad_norm": 2.703125, + "learning_rate": 4.88705448659928e-05, + "loss": 0.8258, + "step": 5572 + }, + { + "epoch": 0.09880482324944495, + "grad_norm": 2.765625, + "learning_rate": 4.8869714748274464e-05, + "loss": 0.8558, + "step": 5574 + }, + { + "epoch": 0.09884027528505653, + "grad_norm": 2.59375, + "learning_rate": 4.8868884332667066e-05, + "loss": 0.8201, + "step": 5576 + }, + { + "epoch": 0.0988757273206681, + "grad_norm": 3.046875, + "learning_rate": 4.886805361918096e-05, + "loss": 0.8155, + "step": 5578 + }, + { + "epoch": 0.09891117935627966, + "grad_norm": 2.828125, + "learning_rate": 4.886722260782652e-05, + "loss": 0.8609, + "step": 5580 + }, + { + "epoch": 0.09894663139189123, + "grad_norm": 3.015625, + "learning_rate": 4.886639129861411e-05, + "loss": 0.833, + "step": 5582 + }, + { + "epoch": 0.09898208342750281, + "grad_norm": 2.734375, + "learning_rate": 4.886555969155411e-05, + "loss": 0.8938, + "step": 5584 + }, + { + "epoch": 0.09901753546311437, + "grad_norm": 2.9375, + "learning_rate": 4.8864727786656886e-05, + "loss": 0.8474, + "step": 5586 + }, + { + "epoch": 0.09905298749872594, + "grad_norm": 2.84375, + "learning_rate": 4.8863895583932836e-05, + "loss": 0.7775, + "step": 5588 + }, + { + "epoch": 0.09908843953433752, + "grad_norm": 3.015625, + "learning_rate": 4.886306308339235e-05, + "loss": 0.853, + "step": 5590 + }, + { + "epoch": 0.09912389156994908, + "grad_norm": 2.421875, + "learning_rate": 4.8862230285045794e-05, + "loss": 0.8057, + "step": 5592 + }, + { + "epoch": 0.09915934360556065, + "grad_norm": 2.5625, + "learning_rate": 4.886139718890358e-05, + "loss": 0.8473, + "step": 5594 + }, + { + "epoch": 0.09919479564117221, + "grad_norm": 3.140625, + "learning_rate": 4.88605637949761e-05, + "loss": 0.829, + "step": 5596 + }, + { + "epoch": 0.09923024767678379, + "grad_norm": 2.703125, + "learning_rate": 4.885973010327375e-05, + "loss": 0.8445, + "step": 5598 + }, + { + "epoch": 0.09926569971239536, + "grad_norm": 2.921875, + "learning_rate": 4.885889611380694e-05, + "loss": 0.8265, + "step": 5600 + }, + { + "epoch": 0.09930115174800692, + "grad_norm": 3.109375, + "learning_rate": 4.885806182658607e-05, + "loss": 0.8717, + "step": 5602 + }, + { + "epoch": 0.0993366037836185, + "grad_norm": 2.859375, + "learning_rate": 4.885722724162156e-05, + "loss": 0.8219, + "step": 5604 + }, + { + "epoch": 0.09937205581923007, + "grad_norm": 2.84375, + "learning_rate": 4.885639235892383e-05, + "loss": 0.8432, + "step": 5606 + }, + { + "epoch": 0.09940750785484163, + "grad_norm": 2.484375, + "learning_rate": 4.8855557178503286e-05, + "loss": 0.8082, + "step": 5608 + }, + { + "epoch": 0.09944295989045321, + "grad_norm": 2.875, + "learning_rate": 4.885472170037037e-05, + "loss": 0.836, + "step": 5610 + }, + { + "epoch": 0.09947841192606478, + "grad_norm": 2.671875, + "learning_rate": 4.885388592453548e-05, + "loss": 0.8404, + "step": 5612 + }, + { + "epoch": 0.09951386396167634, + "grad_norm": 2.6875, + "learning_rate": 4.8853049851009064e-05, + "loss": 0.8383, + "step": 5614 + }, + { + "epoch": 0.09954931599728792, + "grad_norm": 3.078125, + "learning_rate": 4.885221347980156e-05, + "loss": 0.8697, + "step": 5616 + }, + { + "epoch": 0.09958476803289949, + "grad_norm": 2.796875, + "learning_rate": 4.8851376810923396e-05, + "loss": 0.8104, + "step": 5618 + }, + { + "epoch": 0.09962022006851105, + "grad_norm": 2.90625, + "learning_rate": 4.8850539844385017e-05, + "loss": 0.8173, + "step": 5620 + }, + { + "epoch": 0.09965567210412263, + "grad_norm": 2.453125, + "learning_rate": 4.8849702580196874e-05, + "loss": 0.8496, + "step": 5622 + }, + { + "epoch": 0.0996911241397342, + "grad_norm": 2.78125, + "learning_rate": 4.8848865018369404e-05, + "loss": 0.8188, + "step": 5624 + }, + { + "epoch": 0.09972657617534576, + "grad_norm": 2.875, + "learning_rate": 4.884802715891307e-05, + "loss": 0.8529, + "step": 5626 + }, + { + "epoch": 0.09976202821095734, + "grad_norm": 2.59375, + "learning_rate": 4.8847189001838324e-05, + "loss": 0.8529, + "step": 5628 + }, + { + "epoch": 0.09979748024656891, + "grad_norm": 2.9375, + "learning_rate": 4.884635054715562e-05, + "loss": 0.8271, + "step": 5630 + }, + { + "epoch": 0.09983293228218047, + "grad_norm": 3.109375, + "learning_rate": 4.884551179487543e-05, + "loss": 0.8768, + "step": 5632 + }, + { + "epoch": 0.09986838431779205, + "grad_norm": 2.78125, + "learning_rate": 4.884467274500822e-05, + "loss": 0.8548, + "step": 5634 + }, + { + "epoch": 0.09990383635340362, + "grad_norm": 2.8125, + "learning_rate": 4.884383339756447e-05, + "loss": 0.8354, + "step": 5636 + }, + { + "epoch": 0.09993928838901518, + "grad_norm": 2.8125, + "learning_rate": 4.8842993752554635e-05, + "loss": 0.8216, + "step": 5638 + }, + { + "epoch": 0.09997474042462676, + "grad_norm": 2.59375, + "learning_rate": 4.884215380998921e-05, + "loss": 0.8471, + "step": 5640 + }, + { + "epoch": 0.10001019246023833, + "grad_norm": 2.96875, + "learning_rate": 4.884131356987867e-05, + "loss": 0.8059, + "step": 5642 + }, + { + "epoch": 0.10004564449584989, + "grad_norm": 2.828125, + "learning_rate": 4.8840473032233494e-05, + "loss": 0.8661, + "step": 5644 + }, + { + "epoch": 0.10008109653146147, + "grad_norm": 2.75, + "learning_rate": 4.883963219706419e-05, + "loss": 0.7908, + "step": 5646 + }, + { + "epoch": 0.10011654856707304, + "grad_norm": 2.90625, + "learning_rate": 4.883879106438124e-05, + "loss": 0.8687, + "step": 5648 + }, + { + "epoch": 0.1001520006026846, + "grad_norm": 2.765625, + "learning_rate": 4.883794963419514e-05, + "loss": 0.8066, + "step": 5650 + }, + { + "epoch": 0.10018745263829618, + "grad_norm": 2.84375, + "learning_rate": 4.88371079065164e-05, + "loss": 0.827, + "step": 5652 + }, + { + "epoch": 0.10022290467390775, + "grad_norm": 2.625, + "learning_rate": 4.883626588135551e-05, + "loss": 0.8052, + "step": 5654 + }, + { + "epoch": 0.10025835670951931, + "grad_norm": 2.8125, + "learning_rate": 4.883542355872299e-05, + "loss": 0.8144, + "step": 5656 + }, + { + "epoch": 0.10029380874513089, + "grad_norm": 2.71875, + "learning_rate": 4.883458093862935e-05, + "loss": 0.8118, + "step": 5658 + }, + { + "epoch": 0.10032926078074246, + "grad_norm": 3.0625, + "learning_rate": 4.88337380210851e-05, + "loss": 0.8369, + "step": 5660 + }, + { + "epoch": 0.10036471281635402, + "grad_norm": 3.015625, + "learning_rate": 4.883289480610077e-05, + "loss": 0.8589, + "step": 5662 + }, + { + "epoch": 0.1004001648519656, + "grad_norm": 2.59375, + "learning_rate": 4.883205129368688e-05, + "loss": 0.8265, + "step": 5664 + }, + { + "epoch": 0.10043561688757717, + "grad_norm": 3.125, + "learning_rate": 4.883120748385394e-05, + "loss": 0.8489, + "step": 5666 + }, + { + "epoch": 0.10047106892318873, + "grad_norm": 2.765625, + "learning_rate": 4.8830363376612497e-05, + "loss": 0.8463, + "step": 5668 + }, + { + "epoch": 0.1005065209588003, + "grad_norm": 2.78125, + "learning_rate": 4.8829518971973085e-05, + "loss": 0.8248, + "step": 5670 + }, + { + "epoch": 0.10054197299441188, + "grad_norm": 2.96875, + "learning_rate": 4.8828674269946254e-05, + "loss": 0.8391, + "step": 5672 + }, + { + "epoch": 0.10057742503002344, + "grad_norm": 2.703125, + "learning_rate": 4.882782927054251e-05, + "loss": 0.8326, + "step": 5674 + }, + { + "epoch": 0.10061287706563501, + "grad_norm": 2.78125, + "learning_rate": 4.882698397377243e-05, + "loss": 0.8218, + "step": 5676 + }, + { + "epoch": 0.10064832910124658, + "grad_norm": 2.609375, + "learning_rate": 4.882613837964655e-05, + "loss": 0.8307, + "step": 5678 + }, + { + "epoch": 0.10068378113685815, + "grad_norm": 2.796875, + "learning_rate": 4.882529248817543e-05, + "loss": 0.8427, + "step": 5680 + }, + { + "epoch": 0.10071923317246972, + "grad_norm": 2.875, + "learning_rate": 4.882444629936962e-05, + "loss": 0.8637, + "step": 5682 + }, + { + "epoch": 0.10075468520808128, + "grad_norm": 2.84375, + "learning_rate": 4.882359981323968e-05, + "loss": 0.8579, + "step": 5684 + }, + { + "epoch": 0.10079013724369286, + "grad_norm": 2.734375, + "learning_rate": 4.8822753029796174e-05, + "loss": 0.8361, + "step": 5686 + }, + { + "epoch": 0.10082558927930443, + "grad_norm": 2.46875, + "learning_rate": 4.8821905949049685e-05, + "loss": 0.8359, + "step": 5688 + }, + { + "epoch": 0.100861041314916, + "grad_norm": 2.953125, + "learning_rate": 4.882105857101076e-05, + "loss": 0.8288, + "step": 5690 + }, + { + "epoch": 0.10089649335052757, + "grad_norm": 2.6875, + "learning_rate": 4.882021089568999e-05, + "loss": 0.8332, + "step": 5692 + }, + { + "epoch": 0.10093194538613914, + "grad_norm": 2.640625, + "learning_rate": 4.881936292309795e-05, + "loss": 0.8297, + "step": 5694 + }, + { + "epoch": 0.1009673974217507, + "grad_norm": 2.765625, + "learning_rate": 4.881851465324522e-05, + "loss": 0.8713, + "step": 5696 + }, + { + "epoch": 0.10100284945736228, + "grad_norm": 2.53125, + "learning_rate": 4.881766608614238e-05, + "loss": 0.8066, + "step": 5698 + }, + { + "epoch": 0.10103830149297385, + "grad_norm": 3.03125, + "learning_rate": 4.881681722180004e-05, + "loss": 0.8149, + "step": 5700 + }, + { + "epoch": 0.10107375352858541, + "grad_norm": 2.828125, + "learning_rate": 4.881596806022878e-05, + "loss": 0.7993, + "step": 5702 + }, + { + "epoch": 0.10110920556419699, + "grad_norm": 2.796875, + "learning_rate": 4.881511860143919e-05, + "loss": 0.7886, + "step": 5704 + }, + { + "epoch": 0.10114465759980856, + "grad_norm": 2.65625, + "learning_rate": 4.881426884544189e-05, + "loss": 0.8257, + "step": 5706 + }, + { + "epoch": 0.10118010963542012, + "grad_norm": 2.734375, + "learning_rate": 4.881341879224747e-05, + "loss": 0.8037, + "step": 5708 + }, + { + "epoch": 0.1012155616710317, + "grad_norm": 2.484375, + "learning_rate": 4.881256844186655e-05, + "loss": 0.8028, + "step": 5710 + }, + { + "epoch": 0.10125101370664327, + "grad_norm": 3.25, + "learning_rate": 4.881171779430973e-05, + "loss": 0.8512, + "step": 5712 + }, + { + "epoch": 0.10128646574225483, + "grad_norm": 2.578125, + "learning_rate": 4.881086684958763e-05, + "loss": 0.7972, + "step": 5714 + }, + { + "epoch": 0.10132191777786641, + "grad_norm": 3.25, + "learning_rate": 4.881001560771087e-05, + "loss": 0.8728, + "step": 5716 + }, + { + "epoch": 0.10135736981347798, + "grad_norm": 2.90625, + "learning_rate": 4.8809164068690084e-05, + "loss": 0.8728, + "step": 5718 + }, + { + "epoch": 0.10139282184908954, + "grad_norm": 3.03125, + "learning_rate": 4.880831223253588e-05, + "loss": 0.8824, + "step": 5720 + }, + { + "epoch": 0.10142827388470112, + "grad_norm": 2.71875, + "learning_rate": 4.8807460099258906e-05, + "loss": 0.8078, + "step": 5722 + }, + { + "epoch": 0.10146372592031269, + "grad_norm": 2.828125, + "learning_rate": 4.8806607668869786e-05, + "loss": 0.8533, + "step": 5724 + }, + { + "epoch": 0.10149917795592425, + "grad_norm": 2.765625, + "learning_rate": 4.880575494137916e-05, + "loss": 0.8264, + "step": 5726 + }, + { + "epoch": 0.10153462999153583, + "grad_norm": 2.515625, + "learning_rate": 4.880490191679767e-05, + "loss": 0.8368, + "step": 5728 + }, + { + "epoch": 0.1015700820271474, + "grad_norm": 2.578125, + "learning_rate": 4.880404859513596e-05, + "loss": 0.8025, + "step": 5730 + }, + { + "epoch": 0.10160553406275896, + "grad_norm": 2.46875, + "learning_rate": 4.8803194976404685e-05, + "loss": 0.7964, + "step": 5732 + }, + { + "epoch": 0.10164098609837054, + "grad_norm": 2.65625, + "learning_rate": 4.8802341060614495e-05, + "loss": 0.8358, + "step": 5734 + }, + { + "epoch": 0.10167643813398211, + "grad_norm": 2.921875, + "learning_rate": 4.8801486847776044e-05, + "loss": 0.8685, + "step": 5736 + }, + { + "epoch": 0.10171189016959367, + "grad_norm": 2.671875, + "learning_rate": 4.880063233789999e-05, + "loss": 0.8414, + "step": 5738 + }, + { + "epoch": 0.10174734220520525, + "grad_norm": 2.484375, + "learning_rate": 4.8799777530997017e-05, + "loss": 0.8109, + "step": 5740 + }, + { + "epoch": 0.10178279424081682, + "grad_norm": 2.6875, + "learning_rate": 4.8798922427077764e-05, + "loss": 0.8377, + "step": 5742 + }, + { + "epoch": 0.10181824627642838, + "grad_norm": 2.65625, + "learning_rate": 4.8798067026152914e-05, + "loss": 0.8486, + "step": 5744 + }, + { + "epoch": 0.10185369831203996, + "grad_norm": 2.78125, + "learning_rate": 4.879721132823315e-05, + "loss": 0.8192, + "step": 5746 + }, + { + "epoch": 0.10188915034765153, + "grad_norm": 2.890625, + "learning_rate": 4.8796355333329145e-05, + "loss": 0.8028, + "step": 5748 + }, + { + "epoch": 0.10192460238326309, + "grad_norm": 2.734375, + "learning_rate": 4.8795499041451585e-05, + "loss": 0.8343, + "step": 5750 + }, + { + "epoch": 0.10196005441887467, + "grad_norm": 2.875, + "learning_rate": 4.879464245261115e-05, + "loss": 0.8285, + "step": 5752 + }, + { + "epoch": 0.10199550645448624, + "grad_norm": 2.578125, + "learning_rate": 4.879378556681854e-05, + "loss": 0.8591, + "step": 5754 + }, + { + "epoch": 0.1020309584900978, + "grad_norm": 2.65625, + "learning_rate": 4.8792928384084435e-05, + "loss": 0.8476, + "step": 5756 + }, + { + "epoch": 0.10206641052570938, + "grad_norm": 2.59375, + "learning_rate": 4.879207090441954e-05, + "loss": 0.8165, + "step": 5758 + }, + { + "epoch": 0.10210186256132095, + "grad_norm": 2.921875, + "learning_rate": 4.8791213127834555e-05, + "loss": 0.8621, + "step": 5760 + }, + { + "epoch": 0.10213731459693251, + "grad_norm": 2.921875, + "learning_rate": 4.879035505434019e-05, + "loss": 0.8944, + "step": 5762 + }, + { + "epoch": 0.10217276663254408, + "grad_norm": 3.109375, + "learning_rate": 4.878949668394714e-05, + "loss": 0.815, + "step": 5764 + }, + { + "epoch": 0.10220821866815565, + "grad_norm": 3.140625, + "learning_rate": 4.878863801666613e-05, + "loss": 0.8699, + "step": 5766 + }, + { + "epoch": 0.10224367070376722, + "grad_norm": 2.625, + "learning_rate": 4.8787779052507874e-05, + "loss": 0.8312, + "step": 5768 + }, + { + "epoch": 0.1022791227393788, + "grad_norm": 2.515625, + "learning_rate": 4.8786919791483094e-05, + "loss": 0.8142, + "step": 5770 + }, + { + "epoch": 0.10231457477499036, + "grad_norm": 2.828125, + "learning_rate": 4.8786060233602506e-05, + "loss": 0.8463, + "step": 5772 + }, + { + "epoch": 0.10235002681060193, + "grad_norm": 2.734375, + "learning_rate": 4.8785200378876836e-05, + "loss": 0.8769, + "step": 5774 + }, + { + "epoch": 0.1023854788462135, + "grad_norm": 2.875, + "learning_rate": 4.8784340227316825e-05, + "loss": 0.8068, + "step": 5776 + }, + { + "epoch": 0.10242093088182506, + "grad_norm": 2.515625, + "learning_rate": 4.8783479778933207e-05, + "loss": 0.8446, + "step": 5778 + }, + { + "epoch": 0.10245638291743664, + "grad_norm": 2.90625, + "learning_rate": 4.878261903373671e-05, + "loss": 0.8235, + "step": 5780 + }, + { + "epoch": 0.10249183495304821, + "grad_norm": 2.75, + "learning_rate": 4.8781757991738074e-05, + "loss": 0.8487, + "step": 5782 + }, + { + "epoch": 0.10252728698865977, + "grad_norm": 2.875, + "learning_rate": 4.8780896652948054e-05, + "loss": 0.8151, + "step": 5784 + }, + { + "epoch": 0.10256273902427135, + "grad_norm": 2.96875, + "learning_rate": 4.87800350173774e-05, + "loss": 0.8187, + "step": 5786 + }, + { + "epoch": 0.10259819105988292, + "grad_norm": 2.765625, + "learning_rate": 4.8779173085036865e-05, + "loss": 0.8441, + "step": 5788 + }, + { + "epoch": 0.10263364309549448, + "grad_norm": 2.6875, + "learning_rate": 4.87783108559372e-05, + "loss": 0.8383, + "step": 5790 + }, + { + "epoch": 0.10266909513110606, + "grad_norm": 2.953125, + "learning_rate": 4.877744833008917e-05, + "loss": 0.8315, + "step": 5792 + }, + { + "epoch": 0.10270454716671763, + "grad_norm": 2.859375, + "learning_rate": 4.877658550750353e-05, + "loss": 0.8346, + "step": 5794 + }, + { + "epoch": 0.1027399992023292, + "grad_norm": 2.609375, + "learning_rate": 4.877572238819106e-05, + "loss": 0.8366, + "step": 5796 + }, + { + "epoch": 0.10277545123794077, + "grad_norm": 2.9375, + "learning_rate": 4.877485897216253e-05, + "loss": 0.8602, + "step": 5798 + }, + { + "epoch": 0.10281090327355234, + "grad_norm": 2.515625, + "learning_rate": 4.877399525942871e-05, + "loss": 0.8087, + "step": 5800 + }, + { + "epoch": 0.1028463553091639, + "grad_norm": 2.515625, + "learning_rate": 4.877313125000038e-05, + "loss": 0.8278, + "step": 5802 + }, + { + "epoch": 0.10288180734477548, + "grad_norm": 3.03125, + "learning_rate": 4.877226694388832e-05, + "loss": 0.8435, + "step": 5804 + }, + { + "epoch": 0.10291725938038705, + "grad_norm": 3.0, + "learning_rate": 4.877140234110333e-05, + "loss": 0.8759, + "step": 5806 + }, + { + "epoch": 0.10295271141599861, + "grad_norm": 2.671875, + "learning_rate": 4.8770537441656184e-05, + "loss": 0.8405, + "step": 5808 + }, + { + "epoch": 0.10298816345161019, + "grad_norm": 2.6875, + "learning_rate": 4.876967224555768e-05, + "loss": 0.8133, + "step": 5810 + }, + { + "epoch": 0.10302361548722176, + "grad_norm": 2.734375, + "learning_rate": 4.876880675281862e-05, + "loss": 0.8224, + "step": 5812 + }, + { + "epoch": 0.10305906752283332, + "grad_norm": 2.625, + "learning_rate": 4.87679409634498e-05, + "loss": 0.8069, + "step": 5814 + }, + { + "epoch": 0.1030945195584449, + "grad_norm": 2.578125, + "learning_rate": 4.876707487746203e-05, + "loss": 0.8132, + "step": 5816 + }, + { + "epoch": 0.10312997159405647, + "grad_norm": 2.828125, + "learning_rate": 4.8766208494866114e-05, + "loss": 0.8216, + "step": 5818 + }, + { + "epoch": 0.10316542362966803, + "grad_norm": 3.109375, + "learning_rate": 4.8765341815672865e-05, + "loss": 0.8498, + "step": 5820 + }, + { + "epoch": 0.1032008756652796, + "grad_norm": 3.15625, + "learning_rate": 4.87644748398931e-05, + "loss": 0.8971, + "step": 5822 + }, + { + "epoch": 0.10323632770089118, + "grad_norm": 2.6875, + "learning_rate": 4.876360756753764e-05, + "loss": 0.8124, + "step": 5824 + }, + { + "epoch": 0.10327177973650274, + "grad_norm": 2.96875, + "learning_rate": 4.876273999861731e-05, + "loss": 0.8198, + "step": 5826 + }, + { + "epoch": 0.10330723177211432, + "grad_norm": 2.671875, + "learning_rate": 4.8761872133142935e-05, + "loss": 0.8254, + "step": 5828 + }, + { + "epoch": 0.10334268380772589, + "grad_norm": 2.703125, + "learning_rate": 4.876100397112534e-05, + "loss": 0.8737, + "step": 5830 + }, + { + "epoch": 0.10337813584333745, + "grad_norm": 2.59375, + "learning_rate": 4.8760135512575364e-05, + "loss": 0.8547, + "step": 5832 + }, + { + "epoch": 0.10341358787894903, + "grad_norm": 2.59375, + "learning_rate": 4.8759266757503855e-05, + "loss": 0.8565, + "step": 5834 + }, + { + "epoch": 0.1034490399145606, + "grad_norm": 3.078125, + "learning_rate": 4.875839770592163e-05, + "loss": 0.8471, + "step": 5836 + }, + { + "epoch": 0.10348449195017216, + "grad_norm": 2.6875, + "learning_rate": 4.8757528357839564e-05, + "loss": 0.8266, + "step": 5838 + }, + { + "epoch": 0.10351994398578374, + "grad_norm": 2.734375, + "learning_rate": 4.8756658713268486e-05, + "loss": 0.8161, + "step": 5840 + }, + { + "epoch": 0.10355539602139531, + "grad_norm": 2.5, + "learning_rate": 4.8755788772219256e-05, + "loss": 0.8541, + "step": 5842 + }, + { + "epoch": 0.10359084805700687, + "grad_norm": 3.015625, + "learning_rate": 4.8754918534702733e-05, + "loss": 0.8992, + "step": 5844 + }, + { + "epoch": 0.10362630009261845, + "grad_norm": 2.78125, + "learning_rate": 4.875404800072977e-05, + "loss": 0.8513, + "step": 5846 + }, + { + "epoch": 0.10366175212823002, + "grad_norm": 2.734375, + "learning_rate": 4.875317717031124e-05, + "loss": 0.8752, + "step": 5848 + }, + { + "epoch": 0.10369720416384158, + "grad_norm": 2.765625, + "learning_rate": 4.8752306043458e-05, + "loss": 0.8375, + "step": 5850 + }, + { + "epoch": 0.10373265619945315, + "grad_norm": 2.796875, + "learning_rate": 4.875143462018094e-05, + "loss": 0.8408, + "step": 5852 + }, + { + "epoch": 0.10376810823506472, + "grad_norm": 2.703125, + "learning_rate": 4.875056290049091e-05, + "loss": 0.8534, + "step": 5854 + }, + { + "epoch": 0.10380356027067629, + "grad_norm": 2.6875, + "learning_rate": 4.87496908843988e-05, + "loss": 0.7775, + "step": 5856 + }, + { + "epoch": 0.10383901230628786, + "grad_norm": 2.90625, + "learning_rate": 4.874881857191551e-05, + "loss": 0.8615, + "step": 5858 + }, + { + "epoch": 0.10387446434189943, + "grad_norm": 2.71875, + "learning_rate": 4.87479459630519e-05, + "loss": 0.8127, + "step": 5860 + }, + { + "epoch": 0.103909916377511, + "grad_norm": 2.53125, + "learning_rate": 4.874707305781887e-05, + "loss": 0.8257, + "step": 5862 + }, + { + "epoch": 0.10394536841312257, + "grad_norm": 2.578125, + "learning_rate": 4.8746199856227315e-05, + "loss": 0.772, + "step": 5864 + }, + { + "epoch": 0.10398082044873413, + "grad_norm": 2.71875, + "learning_rate": 4.8745326358288133e-05, + "loss": 0.8003, + "step": 5866 + }, + { + "epoch": 0.10401627248434571, + "grad_norm": 2.734375, + "learning_rate": 4.874445256401223e-05, + "loss": 0.8276, + "step": 5868 + }, + { + "epoch": 0.10405172451995728, + "grad_norm": 2.96875, + "learning_rate": 4.87435784734105e-05, + "loss": 0.8251, + "step": 5870 + }, + { + "epoch": 0.10408717655556884, + "grad_norm": 2.515625, + "learning_rate": 4.874270408649385e-05, + "loss": 0.7982, + "step": 5872 + }, + { + "epoch": 0.10412262859118042, + "grad_norm": 3.015625, + "learning_rate": 4.874182940327321e-05, + "loss": 0.8599, + "step": 5874 + }, + { + "epoch": 0.104158080626792, + "grad_norm": 2.75, + "learning_rate": 4.874095442375948e-05, + "loss": 0.8527, + "step": 5876 + }, + { + "epoch": 0.10419353266240355, + "grad_norm": 2.703125, + "learning_rate": 4.874007914796358e-05, + "loss": 0.8557, + "step": 5878 + }, + { + "epoch": 0.10422898469801513, + "grad_norm": 2.515625, + "learning_rate": 4.873920357589644e-05, + "loss": 0.8352, + "step": 5880 + }, + { + "epoch": 0.1042644367336267, + "grad_norm": 2.953125, + "learning_rate": 4.8738327707568974e-05, + "loss": 0.8541, + "step": 5882 + }, + { + "epoch": 0.10429988876923826, + "grad_norm": 2.78125, + "learning_rate": 4.8737451542992136e-05, + "loss": 0.8092, + "step": 5884 + }, + { + "epoch": 0.10433534080484984, + "grad_norm": 2.59375, + "learning_rate": 4.873657508217684e-05, + "loss": 0.8368, + "step": 5886 + }, + { + "epoch": 0.10437079284046141, + "grad_norm": 2.625, + "learning_rate": 4.873569832513403e-05, + "loss": 0.8305, + "step": 5888 + }, + { + "epoch": 0.10440624487607297, + "grad_norm": 2.9375, + "learning_rate": 4.8734821271874656e-05, + "loss": 0.833, + "step": 5890 + }, + { + "epoch": 0.10444169691168455, + "grad_norm": 2.875, + "learning_rate": 4.873394392240965e-05, + "loss": 0.8522, + "step": 5892 + }, + { + "epoch": 0.10447714894729612, + "grad_norm": 2.796875, + "learning_rate": 4.873306627674997e-05, + "loss": 0.8828, + "step": 5894 + }, + { + "epoch": 0.10451260098290768, + "grad_norm": 2.6875, + "learning_rate": 4.873218833490656e-05, + "loss": 0.817, + "step": 5896 + }, + { + "epoch": 0.10454805301851926, + "grad_norm": 2.734375, + "learning_rate": 4.873131009689039e-05, + "loss": 0.82, + "step": 5898 + }, + { + "epoch": 0.10458350505413083, + "grad_norm": 2.5, + "learning_rate": 4.873043156271241e-05, + "loss": 0.8608, + "step": 5900 + }, + { + "epoch": 0.10461895708974239, + "grad_norm": 2.546875, + "learning_rate": 4.8729552732383586e-05, + "loss": 0.8599, + "step": 5902 + }, + { + "epoch": 0.10465440912535397, + "grad_norm": 2.703125, + "learning_rate": 4.872867360591489e-05, + "loss": 0.8522, + "step": 5904 + }, + { + "epoch": 0.10468986116096554, + "grad_norm": 2.78125, + "learning_rate": 4.872779418331729e-05, + "loss": 0.8401, + "step": 5906 + }, + { + "epoch": 0.1047253131965771, + "grad_norm": 2.9375, + "learning_rate": 4.872691446460176e-05, + "loss": 0.8433, + "step": 5908 + }, + { + "epoch": 0.10476076523218868, + "grad_norm": 3.0, + "learning_rate": 4.872603444977927e-05, + "loss": 0.7944, + "step": 5910 + }, + { + "epoch": 0.10479621726780025, + "grad_norm": 2.765625, + "learning_rate": 4.8725154138860826e-05, + "loss": 0.839, + "step": 5912 + }, + { + "epoch": 0.10483166930341181, + "grad_norm": 2.5625, + "learning_rate": 4.872427353185739e-05, + "loss": 0.8284, + "step": 5914 + }, + { + "epoch": 0.10486712133902339, + "grad_norm": 2.953125, + "learning_rate": 4.872339262877996e-05, + "loss": 0.8338, + "step": 5916 + }, + { + "epoch": 0.10490257337463496, + "grad_norm": 2.765625, + "learning_rate": 4.8722511429639536e-05, + "loss": 0.826, + "step": 5918 + }, + { + "epoch": 0.10493802541024652, + "grad_norm": 2.796875, + "learning_rate": 4.872162993444712e-05, + "loss": 0.8543, + "step": 5920 + }, + { + "epoch": 0.1049734774458581, + "grad_norm": 2.6875, + "learning_rate": 4.872074814321369e-05, + "loss": 0.8229, + "step": 5922 + }, + { + "epoch": 0.10500892948146967, + "grad_norm": 2.765625, + "learning_rate": 4.871986605595027e-05, + "loss": 0.7872, + "step": 5924 + }, + { + "epoch": 0.10504438151708123, + "grad_norm": 2.5, + "learning_rate": 4.871898367266785e-05, + "loss": 0.7904, + "step": 5926 + }, + { + "epoch": 0.1050798335526928, + "grad_norm": 2.78125, + "learning_rate": 4.871810099337747e-05, + "loss": 0.8301, + "step": 5928 + }, + { + "epoch": 0.10511528558830438, + "grad_norm": 2.703125, + "learning_rate": 4.871721801809013e-05, + "loss": 0.8046, + "step": 5930 + }, + { + "epoch": 0.10515073762391594, + "grad_norm": 2.609375, + "learning_rate": 4.871633474681684e-05, + "loss": 0.8217, + "step": 5932 + }, + { + "epoch": 0.10518618965952752, + "grad_norm": 2.609375, + "learning_rate": 4.871545117956863e-05, + "loss": 0.8079, + "step": 5934 + }, + { + "epoch": 0.10522164169513908, + "grad_norm": 3.078125, + "learning_rate": 4.871456731635653e-05, + "loss": 0.8503, + "step": 5936 + }, + { + "epoch": 0.10525709373075065, + "grad_norm": 2.71875, + "learning_rate": 4.871368315719158e-05, + "loss": 0.8112, + "step": 5938 + }, + { + "epoch": 0.10529254576636222, + "grad_norm": 2.609375, + "learning_rate": 4.871279870208479e-05, + "loss": 0.8458, + "step": 5940 + }, + { + "epoch": 0.10532799780197379, + "grad_norm": 2.765625, + "learning_rate": 4.8711913951047224e-05, + "loss": 0.8309, + "step": 5942 + }, + { + "epoch": 0.10536344983758536, + "grad_norm": 2.78125, + "learning_rate": 4.8711028904089905e-05, + "loss": 0.775, + "step": 5944 + }, + { + "epoch": 0.10539890187319693, + "grad_norm": 2.53125, + "learning_rate": 4.871014356122388e-05, + "loss": 0.8295, + "step": 5946 + }, + { + "epoch": 0.1054343539088085, + "grad_norm": 2.65625, + "learning_rate": 4.870925792246021e-05, + "loss": 0.8225, + "step": 5948 + }, + { + "epoch": 0.10546980594442007, + "grad_norm": 2.8125, + "learning_rate": 4.870837198780993e-05, + "loss": 0.8658, + "step": 5950 + }, + { + "epoch": 0.10550525798003164, + "grad_norm": 2.578125, + "learning_rate": 4.870748575728411e-05, + "loss": 0.8338, + "step": 5952 + }, + { + "epoch": 0.1055407100156432, + "grad_norm": 2.84375, + "learning_rate": 4.8706599230893805e-05, + "loss": 0.8351, + "step": 5954 + }, + { + "epoch": 0.10557616205125478, + "grad_norm": 2.703125, + "learning_rate": 4.8705712408650086e-05, + "loss": 0.8041, + "step": 5956 + }, + { + "epoch": 0.10561161408686635, + "grad_norm": 2.765625, + "learning_rate": 4.8704825290564004e-05, + "loss": 0.8343, + "step": 5958 + }, + { + "epoch": 0.10564706612247791, + "grad_norm": 2.59375, + "learning_rate": 4.870393787664664e-05, + "loss": 0.8447, + "step": 5960 + }, + { + "epoch": 0.10568251815808949, + "grad_norm": 2.625, + "learning_rate": 4.870305016690908e-05, + "loss": 0.852, + "step": 5962 + }, + { + "epoch": 0.10571797019370106, + "grad_norm": 2.875, + "learning_rate": 4.870216216136238e-05, + "loss": 0.8384, + "step": 5964 + }, + { + "epoch": 0.10575342222931262, + "grad_norm": 2.875, + "learning_rate": 4.870127386001764e-05, + "loss": 0.8305, + "step": 5966 + }, + { + "epoch": 0.1057888742649242, + "grad_norm": 2.625, + "learning_rate": 4.870038526288593e-05, + "loss": 0.8172, + "step": 5968 + }, + { + "epoch": 0.10582432630053577, + "grad_norm": 2.609375, + "learning_rate": 4.8699496369978346e-05, + "loss": 0.8577, + "step": 5970 + }, + { + "epoch": 0.10585977833614733, + "grad_norm": 2.90625, + "learning_rate": 4.8698607181306e-05, + "loss": 0.8757, + "step": 5972 + }, + { + "epoch": 0.10589523037175891, + "grad_norm": 2.75, + "learning_rate": 4.8697717696879965e-05, + "loss": 0.8196, + "step": 5974 + }, + { + "epoch": 0.10593068240737048, + "grad_norm": 2.796875, + "learning_rate": 4.869682791671134e-05, + "loss": 0.84, + "step": 5976 + }, + { + "epoch": 0.10596613444298204, + "grad_norm": 2.8125, + "learning_rate": 4.869593784081124e-05, + "loss": 0.82, + "step": 5978 + }, + { + "epoch": 0.10600158647859362, + "grad_norm": 2.703125, + "learning_rate": 4.869504746919078e-05, + "loss": 0.8291, + "step": 5980 + }, + { + "epoch": 0.10603703851420519, + "grad_norm": 2.9375, + "learning_rate": 4.8694156801861065e-05, + "loss": 0.7903, + "step": 5982 + }, + { + "epoch": 0.10607249054981675, + "grad_norm": 2.59375, + "learning_rate": 4.86932658388332e-05, + "loss": 0.7952, + "step": 5984 + }, + { + "epoch": 0.10610794258542833, + "grad_norm": 3.0625, + "learning_rate": 4.8692374580118314e-05, + "loss": 0.8418, + "step": 5986 + }, + { + "epoch": 0.1061433946210399, + "grad_norm": 2.578125, + "learning_rate": 4.869148302572753e-05, + "loss": 0.8257, + "step": 5988 + }, + { + "epoch": 0.10617884665665146, + "grad_norm": 2.84375, + "learning_rate": 4.869059117567198e-05, + "loss": 0.8086, + "step": 5990 + }, + { + "epoch": 0.10621429869226304, + "grad_norm": 2.921875, + "learning_rate": 4.8689699029962774e-05, + "loss": 0.8294, + "step": 5992 + }, + { + "epoch": 0.10624975072787461, + "grad_norm": 2.953125, + "learning_rate": 4.868880658861106e-05, + "loss": 0.8258, + "step": 5994 + }, + { + "epoch": 0.10628520276348617, + "grad_norm": 2.671875, + "learning_rate": 4.868791385162797e-05, + "loss": 0.8312, + "step": 5996 + }, + { + "epoch": 0.10632065479909775, + "grad_norm": 2.703125, + "learning_rate": 4.868702081902466e-05, + "loss": 0.8579, + "step": 5998 + }, + { + "epoch": 0.10635610683470932, + "grad_norm": 2.953125, + "learning_rate": 4.868612749081226e-05, + "loss": 0.8733, + "step": 6000 + }, + { + "epoch": 0.10639155887032088, + "grad_norm": 2.65625, + "learning_rate": 4.868523386700192e-05, + "loss": 0.8503, + "step": 6002 + }, + { + "epoch": 0.10642701090593246, + "grad_norm": 2.5, + "learning_rate": 4.86843399476048e-05, + "loss": 0.7982, + "step": 6004 + }, + { + "epoch": 0.10646246294154403, + "grad_norm": 3.0, + "learning_rate": 4.8683445732632046e-05, + "loss": 0.8144, + "step": 6006 + }, + { + "epoch": 0.10649791497715559, + "grad_norm": 2.5625, + "learning_rate": 4.868255122209482e-05, + "loss": 0.8306, + "step": 6008 + }, + { + "epoch": 0.10653336701276717, + "grad_norm": 2.53125, + "learning_rate": 4.868165641600429e-05, + "loss": 0.7931, + "step": 6010 + }, + { + "epoch": 0.10656881904837874, + "grad_norm": 2.53125, + "learning_rate": 4.868076131437162e-05, + "loss": 0.8033, + "step": 6012 + }, + { + "epoch": 0.1066042710839903, + "grad_norm": 2.84375, + "learning_rate": 4.8679865917207986e-05, + "loss": 0.805, + "step": 6014 + }, + { + "epoch": 0.10663972311960188, + "grad_norm": 2.625, + "learning_rate": 4.867897022452455e-05, + "loss": 0.8566, + "step": 6016 + }, + { + "epoch": 0.10667517515521345, + "grad_norm": 2.640625, + "learning_rate": 4.8678074236332505e-05, + "loss": 0.8544, + "step": 6018 + }, + { + "epoch": 0.10671062719082501, + "grad_norm": 3.046875, + "learning_rate": 4.867717795264301e-05, + "loss": 0.7666, + "step": 6020 + }, + { + "epoch": 0.10674607922643659, + "grad_norm": 2.734375, + "learning_rate": 4.8676281373467284e-05, + "loss": 0.8344, + "step": 6022 + }, + { + "epoch": 0.10678153126204815, + "grad_norm": 3.0, + "learning_rate": 4.867538449881649e-05, + "loss": 0.8397, + "step": 6024 + }, + { + "epoch": 0.10681698329765972, + "grad_norm": 2.84375, + "learning_rate": 4.8674487328701836e-05, + "loss": 0.8351, + "step": 6026 + }, + { + "epoch": 0.1068524353332713, + "grad_norm": 2.90625, + "learning_rate": 4.86735898631345e-05, + "loss": 0.8393, + "step": 6028 + }, + { + "epoch": 0.10688788736888286, + "grad_norm": 2.671875, + "learning_rate": 4.8672692102125696e-05, + "loss": 0.8348, + "step": 6030 + }, + { + "epoch": 0.10692333940449443, + "grad_norm": 2.828125, + "learning_rate": 4.867179404568663e-05, + "loss": 0.8175, + "step": 6032 + }, + { + "epoch": 0.106958791440106, + "grad_norm": 2.5, + "learning_rate": 4.86708956938285e-05, + "loss": 0.8695, + "step": 6034 + }, + { + "epoch": 0.10699424347571757, + "grad_norm": 2.65625, + "learning_rate": 4.8669997046562524e-05, + "loss": 0.8068, + "step": 6036 + }, + { + "epoch": 0.10702969551132914, + "grad_norm": 2.8125, + "learning_rate": 4.866909810389991e-05, + "loss": 0.8816, + "step": 6038 + }, + { + "epoch": 0.10706514754694071, + "grad_norm": 2.765625, + "learning_rate": 4.866819886585189e-05, + "loss": 0.8582, + "step": 6040 + }, + { + "epoch": 0.10710059958255227, + "grad_norm": 2.578125, + "learning_rate": 4.866729933242968e-05, + "loss": 0.7931, + "step": 6042 + }, + { + "epoch": 0.10713605161816385, + "grad_norm": 3.0, + "learning_rate": 4.866639950364449e-05, + "loss": 0.8141, + "step": 6044 + }, + { + "epoch": 0.10717150365377542, + "grad_norm": 2.828125, + "learning_rate": 4.866549937950757e-05, + "loss": 0.8202, + "step": 6046 + }, + { + "epoch": 0.10720695568938698, + "grad_norm": 2.65625, + "learning_rate": 4.8664598960030154e-05, + "loss": 0.8198, + "step": 6048 + }, + { + "epoch": 0.10724240772499856, + "grad_norm": 2.828125, + "learning_rate": 4.8663698245223466e-05, + "loss": 0.7826, + "step": 6050 + }, + { + "epoch": 0.10727785976061013, + "grad_norm": 2.59375, + "learning_rate": 4.8662797235098754e-05, + "loss": 0.8455, + "step": 6052 + }, + { + "epoch": 0.1073133117962217, + "grad_norm": 2.734375, + "learning_rate": 4.866189592966726e-05, + "loss": 0.8529, + "step": 6054 + }, + { + "epoch": 0.10734876383183327, + "grad_norm": 2.578125, + "learning_rate": 4.8660994328940235e-05, + "loss": 0.809, + "step": 6056 + }, + { + "epoch": 0.10738421586744484, + "grad_norm": 2.625, + "learning_rate": 4.866009243292893e-05, + "loss": 0.8287, + "step": 6058 + }, + { + "epoch": 0.1074196679030564, + "grad_norm": 2.546875, + "learning_rate": 4.86591902416446e-05, + "loss": 0.8295, + "step": 6060 + }, + { + "epoch": 0.10745511993866798, + "grad_norm": 3.03125, + "learning_rate": 4.86582877550985e-05, + "loss": 0.8268, + "step": 6062 + }, + { + "epoch": 0.10749057197427955, + "grad_norm": 2.296875, + "learning_rate": 4.865738497330189e-05, + "loss": 0.8194, + "step": 6064 + }, + { + "epoch": 0.10752602400989111, + "grad_norm": 2.71875, + "learning_rate": 4.865648189626605e-05, + "loss": 0.8585, + "step": 6066 + }, + { + "epoch": 0.10756147604550269, + "grad_norm": 2.75, + "learning_rate": 4.865557852400225e-05, + "loss": 0.8076, + "step": 6068 + }, + { + "epoch": 0.10759692808111426, + "grad_norm": 3.171875, + "learning_rate": 4.8654674856521745e-05, + "loss": 0.7988, + "step": 6070 + }, + { + "epoch": 0.10763238011672582, + "grad_norm": 2.921875, + "learning_rate": 4.865377089383584e-05, + "loss": 0.8241, + "step": 6072 + }, + { + "epoch": 0.1076678321523374, + "grad_norm": 2.71875, + "learning_rate": 4.865286663595578e-05, + "loss": 0.8282, + "step": 6074 + }, + { + "epoch": 0.10770328418794897, + "grad_norm": 2.375, + "learning_rate": 4.865196208289289e-05, + "loss": 0.8366, + "step": 6076 + }, + { + "epoch": 0.10773873622356053, + "grad_norm": 2.8125, + "learning_rate": 4.865105723465843e-05, + "loss": 0.8635, + "step": 6078 + }, + { + "epoch": 0.10777418825917211, + "grad_norm": 2.59375, + "learning_rate": 4.86501520912637e-05, + "loss": 0.7817, + "step": 6080 + }, + { + "epoch": 0.10780964029478368, + "grad_norm": 2.640625, + "learning_rate": 4.8649246652720005e-05, + "loss": 0.8349, + "step": 6082 + }, + { + "epoch": 0.10784509233039524, + "grad_norm": 3.046875, + "learning_rate": 4.864834091903864e-05, + "loss": 0.8655, + "step": 6084 + }, + { + "epoch": 0.10788054436600682, + "grad_norm": 2.671875, + "learning_rate": 4.864743489023089e-05, + "loss": 0.7774, + "step": 6086 + }, + { + "epoch": 0.10791599640161839, + "grad_norm": 2.828125, + "learning_rate": 4.8646528566308094e-05, + "loss": 0.8279, + "step": 6088 + }, + { + "epoch": 0.10795144843722995, + "grad_norm": 2.578125, + "learning_rate": 4.864562194728154e-05, + "loss": 0.8073, + "step": 6090 + }, + { + "epoch": 0.10798690047284153, + "grad_norm": 2.6875, + "learning_rate": 4.8644715033162546e-05, + "loss": 0.8185, + "step": 6092 + }, + { + "epoch": 0.1080223525084531, + "grad_norm": 2.75, + "learning_rate": 4.864380782396244e-05, + "loss": 0.8483, + "step": 6094 + }, + { + "epoch": 0.10805780454406466, + "grad_norm": 2.703125, + "learning_rate": 4.8642900319692536e-05, + "loss": 0.8436, + "step": 6096 + }, + { + "epoch": 0.10809325657967624, + "grad_norm": 2.984375, + "learning_rate": 4.864199252036415e-05, + "loss": 0.8808, + "step": 6098 + }, + { + "epoch": 0.10812870861528781, + "grad_norm": 2.84375, + "learning_rate": 4.864108442598864e-05, + "loss": 0.8295, + "step": 6100 + }, + { + "epoch": 0.10816416065089937, + "grad_norm": 3.0625, + "learning_rate": 4.864017603657731e-05, + "loss": 0.839, + "step": 6102 + }, + { + "epoch": 0.10819961268651095, + "grad_norm": 2.671875, + "learning_rate": 4.86392673521415e-05, + "loss": 0.8202, + "step": 6104 + }, + { + "epoch": 0.1082350647221225, + "grad_norm": 3.015625, + "learning_rate": 4.863835837269257e-05, + "loss": 0.8195, + "step": 6106 + }, + { + "epoch": 0.10827051675773408, + "grad_norm": 2.828125, + "learning_rate": 4.863744909824185e-05, + "loss": 0.7983, + "step": 6108 + }, + { + "epoch": 0.10830596879334566, + "grad_norm": 3.03125, + "learning_rate": 4.8636539528800685e-05, + "loss": 0.8431, + "step": 6110 + }, + { + "epoch": 0.10834142082895722, + "grad_norm": 2.625, + "learning_rate": 4.863562966438042e-05, + "loss": 0.8398, + "step": 6112 + }, + { + "epoch": 0.10837687286456879, + "grad_norm": 2.59375, + "learning_rate": 4.863471950499243e-05, + "loss": 0.8235, + "step": 6114 + }, + { + "epoch": 0.10841232490018037, + "grad_norm": 3.09375, + "learning_rate": 4.8633809050648064e-05, + "loss": 0.8191, + "step": 6116 + }, + { + "epoch": 0.10844777693579193, + "grad_norm": 2.734375, + "learning_rate": 4.8632898301358684e-05, + "loss": 0.81, + "step": 6118 + }, + { + "epoch": 0.1084832289714035, + "grad_norm": 2.828125, + "learning_rate": 4.863198725713565e-05, + "loss": 0.7965, + "step": 6120 + }, + { + "epoch": 0.10851868100701507, + "grad_norm": 2.78125, + "learning_rate": 4.863107591799034e-05, + "loss": 0.805, + "step": 6122 + }, + { + "epoch": 0.10855413304262664, + "grad_norm": 2.671875, + "learning_rate": 4.863016428393413e-05, + "loss": 0.8054, + "step": 6124 + }, + { + "epoch": 0.10858958507823821, + "grad_norm": 2.765625, + "learning_rate": 4.862925235497839e-05, + "loss": 0.7906, + "step": 6126 + }, + { + "epoch": 0.10862503711384978, + "grad_norm": 2.90625, + "learning_rate": 4.8628340131134496e-05, + "loss": 0.8641, + "step": 6128 + }, + { + "epoch": 0.10866048914946135, + "grad_norm": 2.703125, + "learning_rate": 4.862742761241384e-05, + "loss": 0.8086, + "step": 6130 + }, + { + "epoch": 0.10869594118507292, + "grad_norm": 2.796875, + "learning_rate": 4.862651479882782e-05, + "loss": 0.8464, + "step": 6132 + }, + { + "epoch": 0.1087313932206845, + "grad_norm": 2.734375, + "learning_rate": 4.8625601690387804e-05, + "loss": 0.8145, + "step": 6134 + }, + { + "epoch": 0.10876684525629605, + "grad_norm": 2.671875, + "learning_rate": 4.8624688287105195e-05, + "loss": 0.8007, + "step": 6136 + }, + { + "epoch": 0.10880229729190763, + "grad_norm": 2.78125, + "learning_rate": 4.86237745889914e-05, + "loss": 0.8283, + "step": 6138 + }, + { + "epoch": 0.1088377493275192, + "grad_norm": 2.5625, + "learning_rate": 4.8622860596057826e-05, + "loss": 0.7944, + "step": 6140 + }, + { + "epoch": 0.10887320136313076, + "grad_norm": 2.765625, + "learning_rate": 4.862194630831587e-05, + "loss": 0.8282, + "step": 6142 + }, + { + "epoch": 0.10890865339874234, + "grad_norm": 2.640625, + "learning_rate": 4.862103172577695e-05, + "loss": 0.8247, + "step": 6144 + }, + { + "epoch": 0.10894410543435391, + "grad_norm": 2.8125, + "learning_rate": 4.862011684845246e-05, + "loss": 0.8667, + "step": 6146 + }, + { + "epoch": 0.10897955746996547, + "grad_norm": 2.53125, + "learning_rate": 4.861920167635384e-05, + "loss": 0.8452, + "step": 6148 + }, + { + "epoch": 0.10901500950557705, + "grad_norm": 2.921875, + "learning_rate": 4.86182862094925e-05, + "loss": 0.7993, + "step": 6150 + }, + { + "epoch": 0.10905046154118862, + "grad_norm": 2.84375, + "learning_rate": 4.861737044787987e-05, + "loss": 0.8357, + "step": 6152 + }, + { + "epoch": 0.10908591357680018, + "grad_norm": 2.5625, + "learning_rate": 4.861645439152738e-05, + "loss": 0.8498, + "step": 6154 + }, + { + "epoch": 0.10912136561241176, + "grad_norm": 2.78125, + "learning_rate": 4.8615538040446446e-05, + "loss": 0.8273, + "step": 6156 + }, + { + "epoch": 0.10915681764802333, + "grad_norm": 2.71875, + "learning_rate": 4.8614621394648525e-05, + "loss": 0.8296, + "step": 6158 + }, + { + "epoch": 0.1091922696836349, + "grad_norm": 2.71875, + "learning_rate": 4.8613704454145045e-05, + "loss": 0.8243, + "step": 6160 + }, + { + "epoch": 0.10922772171924647, + "grad_norm": 3.015625, + "learning_rate": 4.8612787218947454e-05, + "loss": 0.8467, + "step": 6162 + }, + { + "epoch": 0.10926317375485804, + "grad_norm": 2.625, + "learning_rate": 4.8611869689067194e-05, + "loss": 0.8519, + "step": 6164 + }, + { + "epoch": 0.1092986257904696, + "grad_norm": 2.6875, + "learning_rate": 4.861095186451572e-05, + "loss": 0.8219, + "step": 6166 + }, + { + "epoch": 0.10933407782608118, + "grad_norm": 2.3125, + "learning_rate": 4.861003374530448e-05, + "loss": 0.8044, + "step": 6168 + }, + { + "epoch": 0.10936952986169275, + "grad_norm": 2.65625, + "learning_rate": 4.860911533144494e-05, + "loss": 0.7915, + "step": 6170 + }, + { + "epoch": 0.10940498189730431, + "grad_norm": 3.109375, + "learning_rate": 4.8608196622948554e-05, + "loss": 0.8269, + "step": 6172 + }, + { + "epoch": 0.10944043393291589, + "grad_norm": 2.859375, + "learning_rate": 4.8607277619826796e-05, + "loss": 0.838, + "step": 6174 + }, + { + "epoch": 0.10947588596852746, + "grad_norm": 2.78125, + "learning_rate": 4.860635832209113e-05, + "loss": 0.803, + "step": 6176 + }, + { + "epoch": 0.10951133800413902, + "grad_norm": 2.859375, + "learning_rate": 4.860543872975303e-05, + "loss": 0.8131, + "step": 6178 + }, + { + "epoch": 0.1095467900397506, + "grad_norm": 2.890625, + "learning_rate": 4.8604518842823974e-05, + "loss": 0.8283, + "step": 6180 + }, + { + "epoch": 0.10958224207536217, + "grad_norm": 2.515625, + "learning_rate": 4.860359866131543e-05, + "loss": 0.8426, + "step": 6182 + }, + { + "epoch": 0.10961769411097373, + "grad_norm": 2.578125, + "learning_rate": 4.86026781852389e-05, + "loss": 0.8239, + "step": 6184 + }, + { + "epoch": 0.1096531461465853, + "grad_norm": 3.28125, + "learning_rate": 4.860175741460585e-05, + "loss": 0.8929, + "step": 6186 + }, + { + "epoch": 0.10968859818219688, + "grad_norm": 2.765625, + "learning_rate": 4.8600836349427805e-05, + "loss": 0.8083, + "step": 6188 + }, + { + "epoch": 0.10972405021780844, + "grad_norm": 2.765625, + "learning_rate": 4.8599914989716223e-05, + "loss": 0.8372, + "step": 6190 + }, + { + "epoch": 0.10975950225342002, + "grad_norm": 2.9375, + "learning_rate": 4.859899333548261e-05, + "loss": 0.8588, + "step": 6192 + }, + { + "epoch": 0.10979495428903158, + "grad_norm": 2.6875, + "learning_rate": 4.8598071386738485e-05, + "loss": 0.7989, + "step": 6194 + }, + { + "epoch": 0.10983040632464315, + "grad_norm": 2.453125, + "learning_rate": 4.859714914349535e-05, + "loss": 0.8067, + "step": 6196 + }, + { + "epoch": 0.10986585836025473, + "grad_norm": 2.9375, + "learning_rate": 4.8596226605764704e-05, + "loss": 0.7671, + "step": 6198 + }, + { + "epoch": 0.10990131039586629, + "grad_norm": 2.859375, + "learning_rate": 4.8595303773558064e-05, + "loss": 0.8118, + "step": 6200 + }, + { + "epoch": 0.10993676243147786, + "grad_norm": 2.71875, + "learning_rate": 4.8594380646886945e-05, + "loss": 0.8167, + "step": 6202 + }, + { + "epoch": 0.10997221446708944, + "grad_norm": 2.59375, + "learning_rate": 4.8593457225762873e-05, + "loss": 0.8439, + "step": 6204 + }, + { + "epoch": 0.110007666502701, + "grad_norm": 2.984375, + "learning_rate": 4.859253351019737e-05, + "loss": 0.8135, + "step": 6206 + }, + { + "epoch": 0.11004311853831257, + "grad_norm": 2.71875, + "learning_rate": 4.859160950020196e-05, + "loss": 0.8585, + "step": 6208 + }, + { + "epoch": 0.11007857057392414, + "grad_norm": 2.578125, + "learning_rate": 4.859068519578818e-05, + "loss": 0.8055, + "step": 6210 + }, + { + "epoch": 0.1101140226095357, + "grad_norm": 3.15625, + "learning_rate": 4.8589760596967555e-05, + "loss": 0.8373, + "step": 6212 + }, + { + "epoch": 0.11014947464514728, + "grad_norm": 2.703125, + "learning_rate": 4.858883570375163e-05, + "loss": 0.805, + "step": 6214 + }, + { + "epoch": 0.11018492668075885, + "grad_norm": 3.078125, + "learning_rate": 4.858791051615196e-05, + "loss": 0.8253, + "step": 6216 + }, + { + "epoch": 0.11022037871637042, + "grad_norm": 2.84375, + "learning_rate": 4.8586985034180076e-05, + "loss": 0.8373, + "step": 6218 + }, + { + "epoch": 0.11025583075198199, + "grad_norm": 2.640625, + "learning_rate": 4.858605925784753e-05, + "loss": 0.833, + "step": 6220 + }, + { + "epoch": 0.11029128278759356, + "grad_norm": 2.46875, + "learning_rate": 4.8585133187165876e-05, + "loss": 0.8138, + "step": 6222 + }, + { + "epoch": 0.11032673482320512, + "grad_norm": 2.609375, + "learning_rate": 4.858420682214667e-05, + "loss": 0.8439, + "step": 6224 + }, + { + "epoch": 0.1103621868588167, + "grad_norm": 3.078125, + "learning_rate": 4.858328016280148e-05, + "loss": 0.8632, + "step": 6226 + }, + { + "epoch": 0.11039763889442827, + "grad_norm": 2.671875, + "learning_rate": 4.858235320914187e-05, + "loss": 0.7924, + "step": 6228 + }, + { + "epoch": 0.11043309093003983, + "grad_norm": 2.6875, + "learning_rate": 4.8581425961179396e-05, + "loss": 0.7989, + "step": 6230 + }, + { + "epoch": 0.11046854296565141, + "grad_norm": 2.6875, + "learning_rate": 4.858049841892564e-05, + "loss": 0.796, + "step": 6232 + }, + { + "epoch": 0.11050399500126298, + "grad_norm": 2.546875, + "learning_rate": 4.8579570582392176e-05, + "loss": 0.7959, + "step": 6234 + }, + { + "epoch": 0.11053944703687454, + "grad_norm": 3.21875, + "learning_rate": 4.8578642451590585e-05, + "loss": 0.7785, + "step": 6236 + }, + { + "epoch": 0.11057489907248612, + "grad_norm": 2.796875, + "learning_rate": 4.857771402653244e-05, + "loss": 0.824, + "step": 6238 + }, + { + "epoch": 0.1106103511080977, + "grad_norm": 2.515625, + "learning_rate": 4.857678530722933e-05, + "loss": 0.7858, + "step": 6240 + }, + { + "epoch": 0.11064580314370925, + "grad_norm": 2.984375, + "learning_rate": 4.857585629369287e-05, + "loss": 0.8481, + "step": 6242 + }, + { + "epoch": 0.11068125517932083, + "grad_norm": 2.625, + "learning_rate": 4.857492698593462e-05, + "loss": 0.8059, + "step": 6244 + }, + { + "epoch": 0.1107167072149324, + "grad_norm": 2.40625, + "learning_rate": 4.85739973839662e-05, + "loss": 0.7678, + "step": 6246 + }, + { + "epoch": 0.11075215925054396, + "grad_norm": 2.765625, + "learning_rate": 4.857306748779919e-05, + "loss": 0.814, + "step": 6248 + }, + { + "epoch": 0.11078761128615554, + "grad_norm": 2.578125, + "learning_rate": 4.857213729744521e-05, + "loss": 0.8241, + "step": 6250 + }, + { + "epoch": 0.11082306332176711, + "grad_norm": 2.984375, + "learning_rate": 4.857120681291587e-05, + "loss": 0.8479, + "step": 6252 + }, + { + "epoch": 0.11085851535737867, + "grad_norm": 2.828125, + "learning_rate": 4.8570276034222787e-05, + "loss": 0.7965, + "step": 6254 + }, + { + "epoch": 0.11089396739299025, + "grad_norm": 2.71875, + "learning_rate": 4.856934496137756e-05, + "loss": 0.8589, + "step": 6256 + }, + { + "epoch": 0.11092941942860182, + "grad_norm": 2.84375, + "learning_rate": 4.8568413594391814e-05, + "loss": 0.7966, + "step": 6258 + }, + { + "epoch": 0.11096487146421338, + "grad_norm": 2.84375, + "learning_rate": 4.856748193327718e-05, + "loss": 0.8463, + "step": 6260 + }, + { + "epoch": 0.11100032349982496, + "grad_norm": 2.734375, + "learning_rate": 4.856654997804528e-05, + "loss": 0.8121, + "step": 6262 + }, + { + "epoch": 0.11103577553543653, + "grad_norm": 2.859375, + "learning_rate": 4.8565617728707745e-05, + "loss": 0.838, + "step": 6264 + }, + { + "epoch": 0.11107122757104809, + "grad_norm": 2.78125, + "learning_rate": 4.8564685185276204e-05, + "loss": 0.8355, + "step": 6266 + }, + { + "epoch": 0.11110667960665967, + "grad_norm": 2.578125, + "learning_rate": 4.8563752347762305e-05, + "loss": 0.8608, + "step": 6268 + }, + { + "epoch": 0.11114213164227124, + "grad_norm": 2.859375, + "learning_rate": 4.856281921617768e-05, + "loss": 0.7789, + "step": 6270 + }, + { + "epoch": 0.1111775836778828, + "grad_norm": 2.96875, + "learning_rate": 4.856188579053399e-05, + "loss": 0.8424, + "step": 6272 + }, + { + "epoch": 0.11121303571349438, + "grad_norm": 2.984375, + "learning_rate": 4.856095207084286e-05, + "loss": 0.8807, + "step": 6274 + }, + { + "epoch": 0.11124848774910594, + "grad_norm": 2.953125, + "learning_rate": 4.8560018057115965e-05, + "loss": 0.83, + "step": 6276 + }, + { + "epoch": 0.11128393978471751, + "grad_norm": 2.640625, + "learning_rate": 4.855908374936495e-05, + "loss": 0.806, + "step": 6278 + }, + { + "epoch": 0.11131939182032909, + "grad_norm": 2.6875, + "learning_rate": 4.8558149147601474e-05, + "loss": 0.855, + "step": 6280 + }, + { + "epoch": 0.11135484385594065, + "grad_norm": 2.453125, + "learning_rate": 4.85572142518372e-05, + "loss": 0.7635, + "step": 6282 + }, + { + "epoch": 0.11139029589155222, + "grad_norm": 2.5625, + "learning_rate": 4.855627906208381e-05, + "loss": 0.7634, + "step": 6284 + }, + { + "epoch": 0.1114257479271638, + "grad_norm": 2.703125, + "learning_rate": 4.8555343578352955e-05, + "loss": 0.8574, + "step": 6286 + }, + { + "epoch": 0.11146119996277536, + "grad_norm": 3.109375, + "learning_rate": 4.855440780065632e-05, + "loss": 0.8141, + "step": 6288 + }, + { + "epoch": 0.11149665199838693, + "grad_norm": 2.859375, + "learning_rate": 4.8553471729005577e-05, + "loss": 0.8272, + "step": 6290 + }, + { + "epoch": 0.1115321040339985, + "grad_norm": 2.71875, + "learning_rate": 4.855253536341242e-05, + "loss": 0.8545, + "step": 6292 + }, + { + "epoch": 0.11156755606961007, + "grad_norm": 2.90625, + "learning_rate": 4.8551598703888525e-05, + "loss": 0.8552, + "step": 6294 + }, + { + "epoch": 0.11160300810522164, + "grad_norm": 2.8125, + "learning_rate": 4.855066175044558e-05, + "loss": 0.8481, + "step": 6296 + }, + { + "epoch": 0.11163846014083322, + "grad_norm": 2.609375, + "learning_rate": 4.8549724503095286e-05, + "loss": 0.8124, + "step": 6298 + }, + { + "epoch": 0.11167391217644478, + "grad_norm": 2.828125, + "learning_rate": 4.8548786961849334e-05, + "loss": 0.8208, + "step": 6300 + }, + { + "epoch": 0.11170936421205635, + "grad_norm": 2.75, + "learning_rate": 4.8547849126719426e-05, + "loss": 0.8274, + "step": 6302 + }, + { + "epoch": 0.11174481624766792, + "grad_norm": 2.53125, + "learning_rate": 4.8546910997717264e-05, + "loss": 0.8817, + "step": 6304 + }, + { + "epoch": 0.11178026828327949, + "grad_norm": 3.078125, + "learning_rate": 4.854597257485456e-05, + "loss": 0.8826, + "step": 6306 + }, + { + "epoch": 0.11181572031889106, + "grad_norm": 2.90625, + "learning_rate": 4.8545033858143025e-05, + "loss": 0.8136, + "step": 6308 + }, + { + "epoch": 0.11185117235450263, + "grad_norm": 2.875, + "learning_rate": 4.8544094847594366e-05, + "loss": 0.8205, + "step": 6310 + }, + { + "epoch": 0.1118866243901142, + "grad_norm": 2.65625, + "learning_rate": 4.854315554322031e-05, + "loss": 0.8596, + "step": 6312 + }, + { + "epoch": 0.11192207642572577, + "grad_norm": 2.890625, + "learning_rate": 4.854221594503258e-05, + "loss": 0.8556, + "step": 6314 + }, + { + "epoch": 0.11195752846133734, + "grad_norm": 2.65625, + "learning_rate": 4.854127605304289e-05, + "loss": 0.8152, + "step": 6316 + }, + { + "epoch": 0.1119929804969489, + "grad_norm": 2.8125, + "learning_rate": 4.8540335867262976e-05, + "loss": 0.8278, + "step": 6318 + }, + { + "epoch": 0.11202843253256048, + "grad_norm": 2.6875, + "learning_rate": 4.853939538770458e-05, + "loss": 0.8356, + "step": 6320 + }, + { + "epoch": 0.11206388456817205, + "grad_norm": 2.4375, + "learning_rate": 4.8538454614379435e-05, + "loss": 0.809, + "step": 6322 + }, + { + "epoch": 0.11209933660378361, + "grad_norm": 2.609375, + "learning_rate": 4.8537513547299276e-05, + "loss": 0.8496, + "step": 6324 + }, + { + "epoch": 0.11213478863939519, + "grad_norm": 2.734375, + "learning_rate": 4.853657218647585e-05, + "loss": 0.8314, + "step": 6326 + }, + { + "epoch": 0.11217024067500676, + "grad_norm": 2.890625, + "learning_rate": 4.853563053192091e-05, + "loss": 0.8568, + "step": 6328 + }, + { + "epoch": 0.11220569271061832, + "grad_norm": 2.453125, + "learning_rate": 4.853468858364619e-05, + "loss": 0.8321, + "step": 6330 + }, + { + "epoch": 0.1122411447462299, + "grad_norm": 2.609375, + "learning_rate": 4.853374634166347e-05, + "loss": 0.854, + "step": 6332 + }, + { + "epoch": 0.11227659678184147, + "grad_norm": 2.6875, + "learning_rate": 4.8532803805984496e-05, + "loss": 0.8576, + "step": 6334 + }, + { + "epoch": 0.11231204881745303, + "grad_norm": 3.078125, + "learning_rate": 4.853186097662103e-05, + "loss": 0.8006, + "step": 6336 + }, + { + "epoch": 0.11234750085306461, + "grad_norm": 2.796875, + "learning_rate": 4.853091785358485e-05, + "loss": 0.8828, + "step": 6338 + }, + { + "epoch": 0.11238295288867618, + "grad_norm": 2.71875, + "learning_rate": 4.85299744368877e-05, + "loss": 0.8404, + "step": 6340 + }, + { + "epoch": 0.11241840492428774, + "grad_norm": 2.65625, + "learning_rate": 4.8529030726541383e-05, + "loss": 0.8326, + "step": 6342 + }, + { + "epoch": 0.11245385695989932, + "grad_norm": 3.015625, + "learning_rate": 4.8528086722557656e-05, + "loss": 0.8589, + "step": 6344 + }, + { + "epoch": 0.11248930899551089, + "grad_norm": 2.640625, + "learning_rate": 4.8527142424948305e-05, + "loss": 0.8304, + "step": 6346 + }, + { + "epoch": 0.11252476103112245, + "grad_norm": 2.609375, + "learning_rate": 4.8526197833725126e-05, + "loss": 0.8005, + "step": 6348 + }, + { + "epoch": 0.11256021306673403, + "grad_norm": 2.859375, + "learning_rate": 4.852525294889989e-05, + "loss": 0.8631, + "step": 6350 + }, + { + "epoch": 0.1125956651023456, + "grad_norm": 2.859375, + "learning_rate": 4.85243077704844e-05, + "loss": 0.8739, + "step": 6352 + }, + { + "epoch": 0.11263111713795716, + "grad_norm": 2.484375, + "learning_rate": 4.8523362298490446e-05, + "loss": 0.8413, + "step": 6354 + }, + { + "epoch": 0.11266656917356874, + "grad_norm": 2.671875, + "learning_rate": 4.852241653292984e-05, + "loss": 0.8242, + "step": 6356 + }, + { + "epoch": 0.11270202120918031, + "grad_norm": 2.828125, + "learning_rate": 4.8521470473814365e-05, + "loss": 0.8035, + "step": 6358 + }, + { + "epoch": 0.11273747324479187, + "grad_norm": 2.75, + "learning_rate": 4.852052412115584e-05, + "loss": 0.7923, + "step": 6360 + }, + { + "epoch": 0.11277292528040345, + "grad_norm": 2.875, + "learning_rate": 4.8519577474966074e-05, + "loss": 0.8228, + "step": 6362 + }, + { + "epoch": 0.11280837731601501, + "grad_norm": 2.515625, + "learning_rate": 4.851863053525688e-05, + "loss": 0.7987, + "step": 6364 + }, + { + "epoch": 0.11284382935162658, + "grad_norm": 2.75, + "learning_rate": 4.8517683302040075e-05, + "loss": 0.8717, + "step": 6366 + }, + { + "epoch": 0.11287928138723816, + "grad_norm": 2.859375, + "learning_rate": 4.851673577532748e-05, + "loss": 0.8595, + "step": 6368 + }, + { + "epoch": 0.11291473342284972, + "grad_norm": 2.859375, + "learning_rate": 4.851578795513092e-05, + "loss": 0.8714, + "step": 6370 + }, + { + "epoch": 0.11295018545846129, + "grad_norm": 3.015625, + "learning_rate": 4.851483984146223e-05, + "loss": 0.8443, + "step": 6372 + }, + { + "epoch": 0.11298563749407287, + "grad_norm": 2.84375, + "learning_rate": 4.851389143433323e-05, + "loss": 0.8583, + "step": 6374 + }, + { + "epoch": 0.11302108952968443, + "grad_norm": 2.84375, + "learning_rate": 4.8512942733755764e-05, + "loss": 0.8792, + "step": 6376 + }, + { + "epoch": 0.113056541565296, + "grad_norm": 2.875, + "learning_rate": 4.8511993739741676e-05, + "loss": 0.8085, + "step": 6378 + }, + { + "epoch": 0.11309199360090758, + "grad_norm": 2.78125, + "learning_rate": 4.85110444523028e-05, + "loss": 0.8323, + "step": 6380 + }, + { + "epoch": 0.11312744563651914, + "grad_norm": 2.6875, + "learning_rate": 4.851009487145098e-05, + "loss": 0.8491, + "step": 6382 + }, + { + "epoch": 0.11316289767213071, + "grad_norm": 2.875, + "learning_rate": 4.850914499719807e-05, + "loss": 0.842, + "step": 6384 + }, + { + "epoch": 0.11319834970774229, + "grad_norm": 2.78125, + "learning_rate": 4.850819482955594e-05, + "loss": 0.8243, + "step": 6386 + }, + { + "epoch": 0.11323380174335385, + "grad_norm": 2.75, + "learning_rate": 4.850724436853643e-05, + "loss": 0.8169, + "step": 6388 + }, + { + "epoch": 0.11326925377896542, + "grad_norm": 2.96875, + "learning_rate": 4.8506293614151404e-05, + "loss": 0.83, + "step": 6390 + }, + { + "epoch": 0.113304705814577, + "grad_norm": 3.046875, + "learning_rate": 4.8505342566412734e-05, + "loss": 0.8608, + "step": 6392 + }, + { + "epoch": 0.11334015785018856, + "grad_norm": 3.046875, + "learning_rate": 4.8504391225332277e-05, + "loss": 0.8466, + "step": 6394 + }, + { + "epoch": 0.11337560988580013, + "grad_norm": 2.875, + "learning_rate": 4.8503439590921925e-05, + "loss": 0.8541, + "step": 6396 + }, + { + "epoch": 0.1134110619214117, + "grad_norm": 2.859375, + "learning_rate": 4.850248766319353e-05, + "loss": 0.8495, + "step": 6398 + }, + { + "epoch": 0.11344651395702327, + "grad_norm": 2.8125, + "learning_rate": 4.8501535442159e-05, + "loss": 0.8642, + "step": 6400 + }, + { + "epoch": 0.11348196599263484, + "grad_norm": 2.8125, + "learning_rate": 4.8500582927830185e-05, + "loss": 0.8203, + "step": 6402 + }, + { + "epoch": 0.11351741802824641, + "grad_norm": 2.953125, + "learning_rate": 4.8499630120218994e-05, + "loss": 0.8405, + "step": 6404 + }, + { + "epoch": 0.11355287006385797, + "grad_norm": 3.109375, + "learning_rate": 4.849867701933732e-05, + "loss": 0.8076, + "step": 6406 + }, + { + "epoch": 0.11358832209946955, + "grad_norm": 2.828125, + "learning_rate": 4.849772362519704e-05, + "loss": 0.8343, + "step": 6408 + }, + { + "epoch": 0.11362377413508112, + "grad_norm": 2.5625, + "learning_rate": 4.849676993781008e-05, + "loss": 0.8003, + "step": 6410 + }, + { + "epoch": 0.11365922617069268, + "grad_norm": 3.09375, + "learning_rate": 4.849581595718832e-05, + "loss": 0.8195, + "step": 6412 + }, + { + "epoch": 0.11369467820630426, + "grad_norm": 2.796875, + "learning_rate": 4.849486168334366e-05, + "loss": 0.8063, + "step": 6414 + }, + { + "epoch": 0.11373013024191583, + "grad_norm": 2.8125, + "learning_rate": 4.849390711628803e-05, + "loss": 0.8344, + "step": 6416 + }, + { + "epoch": 0.1137655822775274, + "grad_norm": 2.90625, + "learning_rate": 4.8492952256033333e-05, + "loss": 0.852, + "step": 6418 + }, + { + "epoch": 0.11380103431313897, + "grad_norm": 2.84375, + "learning_rate": 4.849199710259148e-05, + "loss": 0.8153, + "step": 6420 + }, + { + "epoch": 0.11383648634875054, + "grad_norm": 2.609375, + "learning_rate": 4.84910416559744e-05, + "loss": 0.8212, + "step": 6422 + }, + { + "epoch": 0.1138719383843621, + "grad_norm": 2.75, + "learning_rate": 4.8490085916194005e-05, + "loss": 0.8511, + "step": 6424 + }, + { + "epoch": 0.11390739041997368, + "grad_norm": 2.8125, + "learning_rate": 4.848912988326224e-05, + "loss": 0.8078, + "step": 6426 + }, + { + "epoch": 0.11394284245558525, + "grad_norm": 2.71875, + "learning_rate": 4.8488173557191026e-05, + "loss": 0.8286, + "step": 6428 + }, + { + "epoch": 0.11397829449119681, + "grad_norm": 2.59375, + "learning_rate": 4.8487216937992294e-05, + "loss": 0.863, + "step": 6430 + }, + { + "epoch": 0.11401374652680839, + "grad_norm": 2.65625, + "learning_rate": 4.848626002567799e-05, + "loss": 0.8603, + "step": 6432 + }, + { + "epoch": 0.11404919856241996, + "grad_norm": 2.640625, + "learning_rate": 4.8485302820260045e-05, + "loss": 0.847, + "step": 6434 + }, + { + "epoch": 0.11408465059803152, + "grad_norm": 2.640625, + "learning_rate": 4.848434532175042e-05, + "loss": 0.8007, + "step": 6436 + }, + { + "epoch": 0.1141201026336431, + "grad_norm": 2.828125, + "learning_rate": 4.8483387530161054e-05, + "loss": 0.8277, + "step": 6438 + }, + { + "epoch": 0.11415555466925467, + "grad_norm": 2.609375, + "learning_rate": 4.8482429445503905e-05, + "loss": 0.7848, + "step": 6440 + }, + { + "epoch": 0.11419100670486623, + "grad_norm": 2.765625, + "learning_rate": 4.848147106779093e-05, + "loss": 0.804, + "step": 6442 + }, + { + "epoch": 0.11422645874047781, + "grad_norm": 3.0, + "learning_rate": 4.848051239703408e-05, + "loss": 0.8579, + "step": 6444 + }, + { + "epoch": 0.11426191077608938, + "grad_norm": 2.859375, + "learning_rate": 4.8479553433245325e-05, + "loss": 0.8028, + "step": 6446 + }, + { + "epoch": 0.11429736281170094, + "grad_norm": 2.75, + "learning_rate": 4.847859417643664e-05, + "loss": 0.8015, + "step": 6448 + }, + { + "epoch": 0.11433281484731252, + "grad_norm": 2.6875, + "learning_rate": 4.847763462661999e-05, + "loss": 0.8323, + "step": 6450 + }, + { + "epoch": 0.11436826688292408, + "grad_norm": 2.578125, + "learning_rate": 4.8476674783807344e-05, + "loss": 0.8275, + "step": 6452 + }, + { + "epoch": 0.11440371891853565, + "grad_norm": 2.625, + "learning_rate": 4.84757146480107e-05, + "loss": 0.9091, + "step": 6454 + }, + { + "epoch": 0.11443917095414723, + "grad_norm": 2.890625, + "learning_rate": 4.8474754219242016e-05, + "loss": 0.8291, + "step": 6456 + }, + { + "epoch": 0.11447462298975879, + "grad_norm": 2.640625, + "learning_rate": 4.847379349751329e-05, + "loss": 0.8048, + "step": 6458 + }, + { + "epoch": 0.11451007502537036, + "grad_norm": 2.734375, + "learning_rate": 4.8472832482836504e-05, + "loss": 0.8122, + "step": 6460 + }, + { + "epoch": 0.11454552706098194, + "grad_norm": 2.9375, + "learning_rate": 4.847187117522366e-05, + "loss": 0.8276, + "step": 6462 + }, + { + "epoch": 0.1145809790965935, + "grad_norm": 2.765625, + "learning_rate": 4.8470909574686764e-05, + "loss": 0.8586, + "step": 6464 + }, + { + "epoch": 0.11461643113220507, + "grad_norm": 2.75, + "learning_rate": 4.846994768123779e-05, + "loss": 0.8073, + "step": 6466 + }, + { + "epoch": 0.11465188316781665, + "grad_norm": 2.703125, + "learning_rate": 4.846898549488877e-05, + "loss": 0.8771, + "step": 6468 + }, + { + "epoch": 0.1146873352034282, + "grad_norm": 2.890625, + "learning_rate": 4.846802301565169e-05, + "loss": 0.8059, + "step": 6470 + }, + { + "epoch": 0.11472278723903978, + "grad_norm": 2.625, + "learning_rate": 4.8467060243538574e-05, + "loss": 0.8012, + "step": 6472 + }, + { + "epoch": 0.11475823927465136, + "grad_norm": 3.15625, + "learning_rate": 4.8466097178561435e-05, + "loss": 0.865, + "step": 6474 + }, + { + "epoch": 0.11479369131026292, + "grad_norm": 3.078125, + "learning_rate": 4.84651338207323e-05, + "loss": 0.8429, + "step": 6476 + }, + { + "epoch": 0.11482914334587449, + "grad_norm": 2.546875, + "learning_rate": 4.846417017006317e-05, + "loss": 0.8308, + "step": 6478 + }, + { + "epoch": 0.11486459538148606, + "grad_norm": 2.796875, + "learning_rate": 4.8463206226566084e-05, + "loss": 0.8076, + "step": 6480 + }, + { + "epoch": 0.11490004741709763, + "grad_norm": 2.625, + "learning_rate": 4.8462241990253077e-05, + "loss": 0.8155, + "step": 6482 + }, + { + "epoch": 0.1149354994527092, + "grad_norm": 2.90625, + "learning_rate": 4.846127746113617e-05, + "loss": 0.8628, + "step": 6484 + }, + { + "epoch": 0.11497095148832077, + "grad_norm": 2.6875, + "learning_rate": 4.8460312639227414e-05, + "loss": 0.8624, + "step": 6486 + }, + { + "epoch": 0.11500640352393234, + "grad_norm": 2.515625, + "learning_rate": 4.8459347524538834e-05, + "loss": 0.8101, + "step": 6488 + }, + { + "epoch": 0.11504185555954391, + "grad_norm": 2.765625, + "learning_rate": 4.845838211708249e-05, + "loss": 0.8185, + "step": 6490 + }, + { + "epoch": 0.11507730759515548, + "grad_norm": 2.5, + "learning_rate": 4.845741641687042e-05, + "loss": 0.8343, + "step": 6492 + }, + { + "epoch": 0.11511275963076704, + "grad_norm": 2.8125, + "learning_rate": 4.8456450423914677e-05, + "loss": 0.8335, + "step": 6494 + }, + { + "epoch": 0.11514821166637862, + "grad_norm": 2.484375, + "learning_rate": 4.845548413822733e-05, + "loss": 0.8088, + "step": 6496 + }, + { + "epoch": 0.1151836637019902, + "grad_norm": 3.0, + "learning_rate": 4.8454517559820414e-05, + "loss": 0.8173, + "step": 6498 + }, + { + "epoch": 0.11521911573760175, + "grad_norm": 2.65625, + "learning_rate": 4.8453550688706006e-05, + "loss": 0.8532, + "step": 6500 + }, + { + "epoch": 0.11525456777321333, + "grad_norm": 2.609375, + "learning_rate": 4.845258352489618e-05, + "loss": 0.7932, + "step": 6502 + }, + { + "epoch": 0.1152900198088249, + "grad_norm": 2.796875, + "learning_rate": 4.8451616068402985e-05, + "loss": 0.8306, + "step": 6504 + }, + { + "epoch": 0.11532547184443646, + "grad_norm": 3.015625, + "learning_rate": 4.845064831923851e-05, + "loss": 0.8666, + "step": 6506 + }, + { + "epoch": 0.11536092388004804, + "grad_norm": 2.96875, + "learning_rate": 4.8449680277414834e-05, + "loss": 0.8321, + "step": 6508 + }, + { + "epoch": 0.11539637591565961, + "grad_norm": 2.921875, + "learning_rate": 4.8448711942944025e-05, + "loss": 0.8143, + "step": 6510 + }, + { + "epoch": 0.11543182795127117, + "grad_norm": 2.625, + "learning_rate": 4.844774331583818e-05, + "loss": 0.8275, + "step": 6512 + }, + { + "epoch": 0.11546727998688275, + "grad_norm": 2.5, + "learning_rate": 4.8446774396109375e-05, + "loss": 0.8232, + "step": 6514 + }, + { + "epoch": 0.11550273202249432, + "grad_norm": 2.53125, + "learning_rate": 4.844580518376971e-05, + "loss": 0.8541, + "step": 6516 + }, + { + "epoch": 0.11553818405810588, + "grad_norm": 2.875, + "learning_rate": 4.844483567883128e-05, + "loss": 0.857, + "step": 6518 + }, + { + "epoch": 0.11557363609371746, + "grad_norm": 2.96875, + "learning_rate": 4.8443865881306194e-05, + "loss": 0.8258, + "step": 6520 + }, + { + "epoch": 0.11560908812932903, + "grad_norm": 2.703125, + "learning_rate": 4.844289579120653e-05, + "loss": 0.8415, + "step": 6522 + }, + { + "epoch": 0.11564454016494059, + "grad_norm": 2.96875, + "learning_rate": 4.844192540854442e-05, + "loss": 0.8205, + "step": 6524 + }, + { + "epoch": 0.11567999220055217, + "grad_norm": 2.59375, + "learning_rate": 4.844095473333197e-05, + "loss": 0.7767, + "step": 6526 + }, + { + "epoch": 0.11571544423616374, + "grad_norm": 2.8125, + "learning_rate": 4.8439983765581274e-05, + "loss": 0.7606, + "step": 6528 + }, + { + "epoch": 0.1157508962717753, + "grad_norm": 2.421875, + "learning_rate": 4.8439012505304465e-05, + "loss": 0.7859, + "step": 6530 + }, + { + "epoch": 0.11578634830738688, + "grad_norm": 2.859375, + "learning_rate": 4.8438040952513664e-05, + "loss": 0.7816, + "step": 6532 + }, + { + "epoch": 0.11582180034299844, + "grad_norm": 2.625, + "learning_rate": 4.8437069107220994e-05, + "loss": 0.8353, + "step": 6534 + }, + { + "epoch": 0.11585725237861001, + "grad_norm": 3.21875, + "learning_rate": 4.8436096969438584e-05, + "loss": 0.8546, + "step": 6536 + }, + { + "epoch": 0.11589270441422159, + "grad_norm": 2.625, + "learning_rate": 4.8435124539178564e-05, + "loss": 0.7771, + "step": 6538 + }, + { + "epoch": 0.11592815644983315, + "grad_norm": 2.875, + "learning_rate": 4.843415181645306e-05, + "loss": 0.8452, + "step": 6540 + }, + { + "epoch": 0.11596360848544472, + "grad_norm": 2.90625, + "learning_rate": 4.8433178801274244e-05, + "loss": 0.8423, + "step": 6542 + }, + { + "epoch": 0.1159990605210563, + "grad_norm": 2.6875, + "learning_rate": 4.8432205493654224e-05, + "loss": 0.8103, + "step": 6544 + }, + { + "epoch": 0.11603451255666786, + "grad_norm": 2.59375, + "learning_rate": 4.843123189360516e-05, + "loss": 0.8496, + "step": 6546 + }, + { + "epoch": 0.11606996459227943, + "grad_norm": 3.0, + "learning_rate": 4.8430258001139206e-05, + "loss": 0.857, + "step": 6548 + }, + { + "epoch": 0.116105416627891, + "grad_norm": 2.71875, + "learning_rate": 4.842928381626851e-05, + "loss": 0.8203, + "step": 6550 + }, + { + "epoch": 0.11614086866350257, + "grad_norm": 2.84375, + "learning_rate": 4.8428309339005235e-05, + "loss": 0.8282, + "step": 6552 + }, + { + "epoch": 0.11617632069911414, + "grad_norm": 2.953125, + "learning_rate": 4.8427334569361537e-05, + "loss": 0.8114, + "step": 6554 + }, + { + "epoch": 0.11621177273472572, + "grad_norm": 2.8125, + "learning_rate": 4.842635950734958e-05, + "loss": 0.8155, + "step": 6556 + }, + { + "epoch": 0.11624722477033728, + "grad_norm": 2.890625, + "learning_rate": 4.8425384152981545e-05, + "loss": 0.8523, + "step": 6558 + }, + { + "epoch": 0.11628267680594885, + "grad_norm": 2.625, + "learning_rate": 4.8424408506269585e-05, + "loss": 0.8089, + "step": 6560 + }, + { + "epoch": 0.11631812884156043, + "grad_norm": 3.578125, + "learning_rate": 4.842343256722589e-05, + "loss": 0.8448, + "step": 6562 + }, + { + "epoch": 0.11635358087717199, + "grad_norm": 2.734375, + "learning_rate": 4.842245633586264e-05, + "loss": 0.8025, + "step": 6564 + }, + { + "epoch": 0.11638903291278356, + "grad_norm": 2.578125, + "learning_rate": 4.842147981219201e-05, + "loss": 0.8363, + "step": 6566 + }, + { + "epoch": 0.11642448494839513, + "grad_norm": 2.78125, + "learning_rate": 4.842050299622618e-05, + "loss": 0.8789, + "step": 6568 + }, + { + "epoch": 0.1164599369840067, + "grad_norm": 2.5625, + "learning_rate": 4.841952588797736e-05, + "loss": 0.7869, + "step": 6570 + }, + { + "epoch": 0.11649538901961827, + "grad_norm": 2.6875, + "learning_rate": 4.841854848745774e-05, + "loss": 0.8549, + "step": 6572 + }, + { + "epoch": 0.11653084105522984, + "grad_norm": 2.625, + "learning_rate": 4.84175707946795e-05, + "loss": 0.829, + "step": 6574 + }, + { + "epoch": 0.1165662930908414, + "grad_norm": 2.59375, + "learning_rate": 4.8416592809654865e-05, + "loss": 0.8299, + "step": 6576 + }, + { + "epoch": 0.11660174512645298, + "grad_norm": 2.515625, + "learning_rate": 4.841561453239602e-05, + "loss": 0.8258, + "step": 6578 + }, + { + "epoch": 0.11663719716206455, + "grad_norm": 2.640625, + "learning_rate": 4.84146359629152e-05, + "loss": 0.7927, + "step": 6580 + }, + { + "epoch": 0.11667264919767611, + "grad_norm": 2.796875, + "learning_rate": 4.841365710122458e-05, + "loss": 0.8182, + "step": 6582 + }, + { + "epoch": 0.11670810123328769, + "grad_norm": 2.6875, + "learning_rate": 4.84126779473364e-05, + "loss": 0.8024, + "step": 6584 + }, + { + "epoch": 0.11674355326889926, + "grad_norm": 2.671875, + "learning_rate": 4.841169850126288e-05, + "loss": 0.8211, + "step": 6586 + }, + { + "epoch": 0.11677900530451082, + "grad_norm": 2.78125, + "learning_rate": 4.841071876301625e-05, + "loss": 0.8367, + "step": 6588 + }, + { + "epoch": 0.1168144573401224, + "grad_norm": 2.75, + "learning_rate": 4.840973873260871e-05, + "loss": 0.8627, + "step": 6590 + }, + { + "epoch": 0.11684990937573397, + "grad_norm": 2.734375, + "learning_rate": 4.8408758410052514e-05, + "loss": 0.8322, + "step": 6592 + }, + { + "epoch": 0.11688536141134553, + "grad_norm": 2.71875, + "learning_rate": 4.840777779535988e-05, + "loss": 0.8435, + "step": 6594 + }, + { + "epoch": 0.11692081344695711, + "grad_norm": 3.203125, + "learning_rate": 4.840679688854306e-05, + "loss": 0.8276, + "step": 6596 + }, + { + "epoch": 0.11695626548256868, + "grad_norm": 2.71875, + "learning_rate": 4.840581568961429e-05, + "loss": 0.8271, + "step": 6598 + }, + { + "epoch": 0.11699171751818024, + "grad_norm": 2.609375, + "learning_rate": 4.840483419858582e-05, + "loss": 0.8112, + "step": 6600 + }, + { + "epoch": 0.11702716955379182, + "grad_norm": 2.828125, + "learning_rate": 4.8403852415469885e-05, + "loss": 0.8333, + "step": 6602 + }, + { + "epoch": 0.11706262158940339, + "grad_norm": 2.765625, + "learning_rate": 4.840287034027876e-05, + "loss": 0.8411, + "step": 6604 + }, + { + "epoch": 0.11709807362501495, + "grad_norm": 2.375, + "learning_rate": 4.840188797302467e-05, + "loss": 0.824, + "step": 6606 + }, + { + "epoch": 0.11713352566062653, + "grad_norm": 2.640625, + "learning_rate": 4.84009053137199e-05, + "loss": 0.8263, + "step": 6608 + }, + { + "epoch": 0.1171689776962381, + "grad_norm": 2.796875, + "learning_rate": 4.839992236237672e-05, + "loss": 0.8372, + "step": 6610 + }, + { + "epoch": 0.11720442973184966, + "grad_norm": 2.84375, + "learning_rate": 4.8398939119007365e-05, + "loss": 0.8426, + "step": 6612 + }, + { + "epoch": 0.11723988176746124, + "grad_norm": 2.734375, + "learning_rate": 4.839795558362413e-05, + "loss": 0.812, + "step": 6614 + }, + { + "epoch": 0.11727533380307281, + "grad_norm": 2.75, + "learning_rate": 4.839697175623928e-05, + "loss": 0.8206, + "step": 6616 + }, + { + "epoch": 0.11731078583868437, + "grad_norm": 2.8125, + "learning_rate": 4.839598763686509e-05, + "loss": 0.8273, + "step": 6618 + }, + { + "epoch": 0.11734623787429595, + "grad_norm": 2.796875, + "learning_rate": 4.839500322551386e-05, + "loss": 0.8424, + "step": 6620 + }, + { + "epoch": 0.11738168990990751, + "grad_norm": 2.4375, + "learning_rate": 4.839401852219786e-05, + "loss": 0.8293, + "step": 6622 + }, + { + "epoch": 0.11741714194551908, + "grad_norm": 2.90625, + "learning_rate": 4.839303352692938e-05, + "loss": 0.8094, + "step": 6624 + }, + { + "epoch": 0.11745259398113066, + "grad_norm": 2.796875, + "learning_rate": 4.8392048239720703e-05, + "loss": 0.7789, + "step": 6626 + }, + { + "epoch": 0.11748804601674222, + "grad_norm": 2.9375, + "learning_rate": 4.839106266058415e-05, + "loss": 0.8413, + "step": 6628 + }, + { + "epoch": 0.11752349805235379, + "grad_norm": 2.546875, + "learning_rate": 4.8390076789532004e-05, + "loss": 0.7869, + "step": 6630 + }, + { + "epoch": 0.11755895008796537, + "grad_norm": 2.734375, + "learning_rate": 4.838909062657657e-05, + "loss": 0.833, + "step": 6632 + }, + { + "epoch": 0.11759440212357693, + "grad_norm": 2.765625, + "learning_rate": 4.838810417173015e-05, + "loss": 0.8326, + "step": 6634 + }, + { + "epoch": 0.1176298541591885, + "grad_norm": 2.921875, + "learning_rate": 4.8387117425005066e-05, + "loss": 0.8743, + "step": 6636 + }, + { + "epoch": 0.11766530619480008, + "grad_norm": 2.609375, + "learning_rate": 4.8386130386413635e-05, + "loss": 0.8445, + "step": 6638 + }, + { + "epoch": 0.11770075823041164, + "grad_norm": 2.796875, + "learning_rate": 4.8385143055968166e-05, + "loss": 0.8199, + "step": 6640 + }, + { + "epoch": 0.11773621026602321, + "grad_norm": 2.453125, + "learning_rate": 4.838415543368098e-05, + "loss": 0.8384, + "step": 6642 + }, + { + "epoch": 0.11777166230163479, + "grad_norm": 2.6875, + "learning_rate": 4.83831675195644e-05, + "loss": 0.8122, + "step": 6644 + }, + { + "epoch": 0.11780711433724635, + "grad_norm": 2.65625, + "learning_rate": 4.838217931363076e-05, + "loss": 0.8082, + "step": 6646 + }, + { + "epoch": 0.11784256637285792, + "grad_norm": 2.703125, + "learning_rate": 4.8381190815892394e-05, + "loss": 0.829, + "step": 6648 + }, + { + "epoch": 0.1178780184084695, + "grad_norm": 2.59375, + "learning_rate": 4.8380202026361644e-05, + "loss": 0.8042, + "step": 6650 + }, + { + "epoch": 0.11791347044408106, + "grad_norm": 2.734375, + "learning_rate": 4.837921294505083e-05, + "loss": 0.8213, + "step": 6652 + }, + { + "epoch": 0.11794892247969263, + "grad_norm": 2.625, + "learning_rate": 4.837822357197232e-05, + "loss": 0.7967, + "step": 6654 + }, + { + "epoch": 0.1179843745153042, + "grad_norm": 2.8125, + "learning_rate": 4.8377233907138444e-05, + "loss": 0.8582, + "step": 6656 + }, + { + "epoch": 0.11801982655091577, + "grad_norm": 2.65625, + "learning_rate": 4.837624395056155e-05, + "loss": 0.8131, + "step": 6658 + }, + { + "epoch": 0.11805527858652734, + "grad_norm": 2.921875, + "learning_rate": 4.837525370225401e-05, + "loss": 0.8288, + "step": 6660 + }, + { + "epoch": 0.11809073062213891, + "grad_norm": 2.703125, + "learning_rate": 4.8374263162228176e-05, + "loss": 0.8583, + "step": 6662 + }, + { + "epoch": 0.11812618265775048, + "grad_norm": 2.765625, + "learning_rate": 4.83732723304964e-05, + "loss": 0.8188, + "step": 6664 + }, + { + "epoch": 0.11816163469336205, + "grad_norm": 3.0, + "learning_rate": 4.837228120707106e-05, + "loss": 0.8512, + "step": 6666 + }, + { + "epoch": 0.11819708672897362, + "grad_norm": 2.484375, + "learning_rate": 4.837128979196451e-05, + "loss": 0.815, + "step": 6668 + }, + { + "epoch": 0.11823253876458518, + "grad_norm": 3.0625, + "learning_rate": 4.8370298085189134e-05, + "loss": 0.8627, + "step": 6670 + }, + { + "epoch": 0.11826799080019676, + "grad_norm": 2.703125, + "learning_rate": 4.83693060867573e-05, + "loss": 0.8659, + "step": 6672 + }, + { + "epoch": 0.11830344283580833, + "grad_norm": 2.90625, + "learning_rate": 4.8368313796681404e-05, + "loss": 0.8722, + "step": 6674 + }, + { + "epoch": 0.1183388948714199, + "grad_norm": 2.53125, + "learning_rate": 4.8367321214973815e-05, + "loss": 0.8038, + "step": 6676 + }, + { + "epoch": 0.11837434690703147, + "grad_norm": 2.859375, + "learning_rate": 4.836632834164692e-05, + "loss": 0.8188, + "step": 6678 + }, + { + "epoch": 0.11840979894264304, + "grad_norm": 2.671875, + "learning_rate": 4.836533517671312e-05, + "loss": 0.8267, + "step": 6680 + }, + { + "epoch": 0.1184452509782546, + "grad_norm": 2.609375, + "learning_rate": 4.83643417201848e-05, + "loss": 0.8017, + "step": 6682 + }, + { + "epoch": 0.11848070301386618, + "grad_norm": 2.609375, + "learning_rate": 4.836334797207437e-05, + "loss": 0.8188, + "step": 6684 + }, + { + "epoch": 0.11851615504947775, + "grad_norm": 2.71875, + "learning_rate": 4.836235393239421e-05, + "loss": 0.7921, + "step": 6686 + }, + { + "epoch": 0.11855160708508931, + "grad_norm": 2.6875, + "learning_rate": 4.836135960115675e-05, + "loss": 0.8295, + "step": 6688 + }, + { + "epoch": 0.11858705912070089, + "grad_norm": 3.03125, + "learning_rate": 4.8360364978374384e-05, + "loss": 0.8566, + "step": 6690 + }, + { + "epoch": 0.11862251115631246, + "grad_norm": 2.6875, + "learning_rate": 4.835937006405953e-05, + "loss": 0.8308, + "step": 6692 + }, + { + "epoch": 0.11865796319192402, + "grad_norm": 3.03125, + "learning_rate": 4.83583748582246e-05, + "loss": 0.8197, + "step": 6694 + }, + { + "epoch": 0.1186934152275356, + "grad_norm": 2.59375, + "learning_rate": 4.8357379360882014e-05, + "loss": 0.8059, + "step": 6696 + }, + { + "epoch": 0.11872886726314717, + "grad_norm": 2.953125, + "learning_rate": 4.8356383572044206e-05, + "loss": 0.829, + "step": 6698 + }, + { + "epoch": 0.11876431929875873, + "grad_norm": 2.609375, + "learning_rate": 4.835538749172359e-05, + "loss": 0.8027, + "step": 6700 + }, + { + "epoch": 0.11879977133437031, + "grad_norm": 2.796875, + "learning_rate": 4.835439111993261e-05, + "loss": 0.8291, + "step": 6702 + }, + { + "epoch": 0.11883522336998187, + "grad_norm": 2.765625, + "learning_rate": 4.835339445668369e-05, + "loss": 0.8159, + "step": 6704 + }, + { + "epoch": 0.11887067540559344, + "grad_norm": 2.734375, + "learning_rate": 4.8352397501989265e-05, + "loss": 0.8232, + "step": 6706 + }, + { + "epoch": 0.11890612744120502, + "grad_norm": 2.78125, + "learning_rate": 4.835140025586179e-05, + "loss": 0.8433, + "step": 6708 + }, + { + "epoch": 0.11894157947681658, + "grad_norm": 2.59375, + "learning_rate": 4.83504027183137e-05, + "loss": 0.811, + "step": 6710 + }, + { + "epoch": 0.11897703151242815, + "grad_norm": 2.640625, + "learning_rate": 4.8349404889357455e-05, + "loss": 0.8117, + "step": 6712 + }, + { + "epoch": 0.11901248354803973, + "grad_norm": 2.484375, + "learning_rate": 4.8348406769005494e-05, + "loss": 0.8082, + "step": 6714 + }, + { + "epoch": 0.11904793558365129, + "grad_norm": 2.953125, + "learning_rate": 4.834740835727028e-05, + "loss": 0.8391, + "step": 6716 + }, + { + "epoch": 0.11908338761926286, + "grad_norm": 2.71875, + "learning_rate": 4.834640965416427e-05, + "loss": 0.7962, + "step": 6718 + }, + { + "epoch": 0.11911883965487444, + "grad_norm": 2.828125, + "learning_rate": 4.834541065969993e-05, + "loss": 0.8261, + "step": 6720 + }, + { + "epoch": 0.119154291690486, + "grad_norm": 2.78125, + "learning_rate": 4.834441137388973e-05, + "loss": 0.7957, + "step": 6722 + }, + { + "epoch": 0.11918974372609757, + "grad_norm": 2.75, + "learning_rate": 4.834341179674614e-05, + "loss": 0.8004, + "step": 6724 + }, + { + "epoch": 0.11922519576170915, + "grad_norm": 2.796875, + "learning_rate": 4.834241192828164e-05, + "loss": 0.8264, + "step": 6726 + }, + { + "epoch": 0.1192606477973207, + "grad_norm": 2.703125, + "learning_rate": 4.8341411768508684e-05, + "loss": 0.7818, + "step": 6728 + }, + { + "epoch": 0.11929609983293228, + "grad_norm": 2.75, + "learning_rate": 4.8340411317439785e-05, + "loss": 0.8018, + "step": 6730 + }, + { + "epoch": 0.11933155186854386, + "grad_norm": 2.859375, + "learning_rate": 4.833941057508741e-05, + "loss": 0.8283, + "step": 6732 + }, + { + "epoch": 0.11936700390415542, + "grad_norm": 2.484375, + "learning_rate": 4.8338409541464045e-05, + "loss": 0.8298, + "step": 6734 + }, + { + "epoch": 0.11940245593976699, + "grad_norm": 2.734375, + "learning_rate": 4.83374082165822e-05, + "loss": 0.8132, + "step": 6736 + }, + { + "epoch": 0.11943790797537857, + "grad_norm": 2.828125, + "learning_rate": 4.833640660045436e-05, + "loss": 0.8407, + "step": 6738 + }, + { + "epoch": 0.11947336001099013, + "grad_norm": 2.609375, + "learning_rate": 4.8335404693093026e-05, + "loss": 0.8115, + "step": 6740 + }, + { + "epoch": 0.1195088120466017, + "grad_norm": 2.84375, + "learning_rate": 4.83344024945107e-05, + "loss": 0.8323, + "step": 6742 + }, + { + "epoch": 0.11954426408221328, + "grad_norm": 2.953125, + "learning_rate": 4.8333400004719885e-05, + "loss": 0.8535, + "step": 6744 + }, + { + "epoch": 0.11957971611782484, + "grad_norm": 2.625, + "learning_rate": 4.8332397223733104e-05, + "loss": 0.8649, + "step": 6746 + }, + { + "epoch": 0.11961516815343641, + "grad_norm": 2.703125, + "learning_rate": 4.8331394151562864e-05, + "loss": 0.8297, + "step": 6748 + }, + { + "epoch": 0.11965062018904798, + "grad_norm": 2.765625, + "learning_rate": 4.833039078822169e-05, + "loss": 0.8305, + "step": 6750 + }, + { + "epoch": 0.11968607222465955, + "grad_norm": 2.578125, + "learning_rate": 4.832938713372209e-05, + "loss": 0.8449, + "step": 6752 + }, + { + "epoch": 0.11972152426027112, + "grad_norm": 3.0625, + "learning_rate": 4.8328383188076595e-05, + "loss": 0.8273, + "step": 6754 + }, + { + "epoch": 0.1197569762958827, + "grad_norm": 2.828125, + "learning_rate": 4.832737895129775e-05, + "loss": 0.8507, + "step": 6756 + }, + { + "epoch": 0.11979242833149426, + "grad_norm": 2.609375, + "learning_rate": 4.8326374423398066e-05, + "loss": 0.7767, + "step": 6758 + }, + { + "epoch": 0.11982788036710583, + "grad_norm": 2.875, + "learning_rate": 4.832536960439009e-05, + "loss": 0.8543, + "step": 6760 + }, + { + "epoch": 0.1198633324027174, + "grad_norm": 2.75, + "learning_rate": 4.8324364494286364e-05, + "loss": 0.8477, + "step": 6762 + }, + { + "epoch": 0.11989878443832896, + "grad_norm": 2.75, + "learning_rate": 4.832335909309942e-05, + "loss": 0.7939, + "step": 6764 + }, + { + "epoch": 0.11993423647394054, + "grad_norm": 2.640625, + "learning_rate": 4.8322353400841816e-05, + "loss": 0.7968, + "step": 6766 + }, + { + "epoch": 0.11996968850955211, + "grad_norm": 2.71875, + "learning_rate": 4.83213474175261e-05, + "loss": 0.8366, + "step": 6768 + }, + { + "epoch": 0.12000514054516367, + "grad_norm": 2.984375, + "learning_rate": 4.8320341143164815e-05, + "loss": 0.8365, + "step": 6770 + }, + { + "epoch": 0.12004059258077525, + "grad_norm": 2.671875, + "learning_rate": 4.831933457777055e-05, + "loss": 0.8163, + "step": 6772 + }, + { + "epoch": 0.12007604461638682, + "grad_norm": 2.921875, + "learning_rate": 4.8318327721355825e-05, + "loss": 0.8055, + "step": 6774 + }, + { + "epoch": 0.12011149665199838, + "grad_norm": 2.75, + "learning_rate": 4.831732057393324e-05, + "loss": 0.7799, + "step": 6776 + }, + { + "epoch": 0.12014694868760996, + "grad_norm": 2.765625, + "learning_rate": 4.8316313135515343e-05, + "loss": 0.8324, + "step": 6778 + }, + { + "epoch": 0.12018240072322153, + "grad_norm": 3.03125, + "learning_rate": 4.8315305406114726e-05, + "loss": 0.8153, + "step": 6780 + }, + { + "epoch": 0.1202178527588331, + "grad_norm": 2.65625, + "learning_rate": 4.8314297385743945e-05, + "loss": 0.8353, + "step": 6782 + }, + { + "epoch": 0.12025330479444467, + "grad_norm": 3.125, + "learning_rate": 4.831328907441559e-05, + "loss": 0.8471, + "step": 6784 + }, + { + "epoch": 0.12028875683005624, + "grad_norm": 2.796875, + "learning_rate": 4.831228047214224e-05, + "loss": 0.8277, + "step": 6786 + }, + { + "epoch": 0.1203242088656678, + "grad_norm": 2.890625, + "learning_rate": 4.8311271578936496e-05, + "loss": 0.8371, + "step": 6788 + }, + { + "epoch": 0.12035966090127938, + "grad_norm": 2.71875, + "learning_rate": 4.8310262394810934e-05, + "loss": 0.7628, + "step": 6790 + }, + { + "epoch": 0.12039511293689094, + "grad_norm": 2.59375, + "learning_rate": 4.8309252919778146e-05, + "loss": 0.8369, + "step": 6792 + }, + { + "epoch": 0.12043056497250251, + "grad_norm": 2.875, + "learning_rate": 4.830824315385074e-05, + "loss": 0.8366, + "step": 6794 + }, + { + "epoch": 0.12046601700811409, + "grad_norm": 2.875, + "learning_rate": 4.830723309704131e-05, + "loss": 0.8113, + "step": 6796 + }, + { + "epoch": 0.12050146904372565, + "grad_norm": 2.75, + "learning_rate": 4.8306222749362475e-05, + "loss": 0.8502, + "step": 6798 + }, + { + "epoch": 0.12053692107933722, + "grad_norm": 2.890625, + "learning_rate": 4.8305212110826833e-05, + "loss": 0.8319, + "step": 6800 + }, + { + "epoch": 0.1205723731149488, + "grad_norm": 2.890625, + "learning_rate": 4.8304201181446994e-05, + "loss": 0.8532, + "step": 6802 + }, + { + "epoch": 0.12060782515056036, + "grad_norm": 2.96875, + "learning_rate": 4.830318996123557e-05, + "loss": 0.8519, + "step": 6804 + }, + { + "epoch": 0.12064327718617193, + "grad_norm": 2.578125, + "learning_rate": 4.830217845020521e-05, + "loss": 0.8027, + "step": 6806 + }, + { + "epoch": 0.1206787292217835, + "grad_norm": 2.84375, + "learning_rate": 4.83011666483685e-05, + "loss": 0.8347, + "step": 6808 + }, + { + "epoch": 0.12071418125739507, + "grad_norm": 2.375, + "learning_rate": 4.830015455573809e-05, + "loss": 0.8236, + "step": 6810 + }, + { + "epoch": 0.12074963329300664, + "grad_norm": 2.8125, + "learning_rate": 4.82991421723266e-05, + "loss": 0.8405, + "step": 6812 + }, + { + "epoch": 0.12078508532861822, + "grad_norm": 2.828125, + "learning_rate": 4.829812949814667e-05, + "loss": 0.8778, + "step": 6814 + }, + { + "epoch": 0.12082053736422978, + "grad_norm": 2.765625, + "learning_rate": 4.829711653321093e-05, + "loss": 0.8311, + "step": 6816 + }, + { + "epoch": 0.12085598939984135, + "grad_norm": 2.609375, + "learning_rate": 4.829610327753204e-05, + "loss": 0.8165, + "step": 6818 + }, + { + "epoch": 0.12089144143545293, + "grad_norm": 2.75, + "learning_rate": 4.829508973112263e-05, + "loss": 0.8344, + "step": 6820 + }, + { + "epoch": 0.12092689347106449, + "grad_norm": 3.03125, + "learning_rate": 4.829407589399535e-05, + "loss": 0.7916, + "step": 6822 + }, + { + "epoch": 0.12096234550667606, + "grad_norm": 2.734375, + "learning_rate": 4.829306176616285e-05, + "loss": 0.7913, + "step": 6824 + }, + { + "epoch": 0.12099779754228764, + "grad_norm": 2.734375, + "learning_rate": 4.82920473476378e-05, + "loss": 0.7872, + "step": 6826 + }, + { + "epoch": 0.1210332495778992, + "grad_norm": 2.859375, + "learning_rate": 4.8291032638432846e-05, + "loss": 0.8254, + "step": 6828 + }, + { + "epoch": 0.12106870161351077, + "grad_norm": 2.859375, + "learning_rate": 4.8290017638560656e-05, + "loss": 0.8217, + "step": 6830 + }, + { + "epoch": 0.12110415364912235, + "grad_norm": 2.53125, + "learning_rate": 4.8289002348033895e-05, + "loss": 0.7936, + "step": 6832 + }, + { + "epoch": 0.1211396056847339, + "grad_norm": 3.09375, + "learning_rate": 4.828798676686524e-05, + "loss": 0.8685, + "step": 6834 + }, + { + "epoch": 0.12117505772034548, + "grad_norm": 2.59375, + "learning_rate": 4.828697089506736e-05, + "loss": 0.8507, + "step": 6836 + }, + { + "epoch": 0.12121050975595705, + "grad_norm": 2.875, + "learning_rate": 4.8285954732652924e-05, + "loss": 0.826, + "step": 6838 + }, + { + "epoch": 0.12124596179156862, + "grad_norm": 2.578125, + "learning_rate": 4.828493827963464e-05, + "loss": 0.8032, + "step": 6840 + }, + { + "epoch": 0.12128141382718019, + "grad_norm": 2.65625, + "learning_rate": 4.828392153602516e-05, + "loss": 0.8642, + "step": 6842 + }, + { + "epoch": 0.12131686586279176, + "grad_norm": 2.59375, + "learning_rate": 4.82829045018372e-05, + "loss": 0.8316, + "step": 6844 + }, + { + "epoch": 0.12135231789840333, + "grad_norm": 2.703125, + "learning_rate": 4.828188717708343e-05, + "loss": 0.81, + "step": 6846 + }, + { + "epoch": 0.1213877699340149, + "grad_norm": 2.96875, + "learning_rate": 4.828086956177657e-05, + "loss": 0.8534, + "step": 6848 + }, + { + "epoch": 0.12142322196962647, + "grad_norm": 2.546875, + "learning_rate": 4.8279851655929295e-05, + "loss": 0.8427, + "step": 6850 + }, + { + "epoch": 0.12145867400523803, + "grad_norm": 2.71875, + "learning_rate": 4.827883345955433e-05, + "loss": 0.8379, + "step": 6852 + }, + { + "epoch": 0.12149412604084961, + "grad_norm": 2.53125, + "learning_rate": 4.827781497266437e-05, + "loss": 0.7829, + "step": 6854 + }, + { + "epoch": 0.12152957807646118, + "grad_norm": 2.546875, + "learning_rate": 4.827679619527213e-05, + "loss": 0.7964, + "step": 6856 + }, + { + "epoch": 0.12156503011207274, + "grad_norm": 2.703125, + "learning_rate": 4.827577712739031e-05, + "loss": 0.8052, + "step": 6858 + }, + { + "epoch": 0.12160048214768432, + "grad_norm": 2.796875, + "learning_rate": 4.827475776903165e-05, + "loss": 0.8155, + "step": 6860 + }, + { + "epoch": 0.1216359341832959, + "grad_norm": 2.734375, + "learning_rate": 4.827373812020886e-05, + "loss": 0.7722, + "step": 6862 + }, + { + "epoch": 0.12167138621890745, + "grad_norm": 2.890625, + "learning_rate": 4.827271818093466e-05, + "loss": 0.8165, + "step": 6864 + }, + { + "epoch": 0.12170683825451903, + "grad_norm": 2.671875, + "learning_rate": 4.8271697951221794e-05, + "loss": 0.7605, + "step": 6866 + }, + { + "epoch": 0.1217422902901306, + "grad_norm": 2.578125, + "learning_rate": 4.827067743108298e-05, + "loss": 0.8532, + "step": 6868 + }, + { + "epoch": 0.12177774232574216, + "grad_norm": 2.703125, + "learning_rate": 4.826965662053096e-05, + "loss": 0.7734, + "step": 6870 + }, + { + "epoch": 0.12181319436135374, + "grad_norm": 2.5, + "learning_rate": 4.826863551957846e-05, + "loss": 0.8054, + "step": 6872 + }, + { + "epoch": 0.1218486463969653, + "grad_norm": 2.765625, + "learning_rate": 4.826761412823825e-05, + "loss": 0.8259, + "step": 6874 + }, + { + "epoch": 0.12188409843257687, + "grad_norm": 2.921875, + "learning_rate": 4.8266592446523055e-05, + "loss": 0.8487, + "step": 6876 + }, + { + "epoch": 0.12191955046818845, + "grad_norm": 3.046875, + "learning_rate": 4.8265570474445636e-05, + "loss": 0.7975, + "step": 6878 + }, + { + "epoch": 0.12195500250380001, + "grad_norm": 3.015625, + "learning_rate": 4.826454821201875e-05, + "loss": 0.7988, + "step": 6880 + }, + { + "epoch": 0.12199045453941158, + "grad_norm": 2.578125, + "learning_rate": 4.826352565925513e-05, + "loss": 0.8201, + "step": 6882 + }, + { + "epoch": 0.12202590657502316, + "grad_norm": 2.84375, + "learning_rate": 4.826250281616757e-05, + "loss": 0.8149, + "step": 6884 + }, + { + "epoch": 0.12206135861063472, + "grad_norm": 2.84375, + "learning_rate": 4.826147968276881e-05, + "loss": 0.7627, + "step": 6886 + }, + { + "epoch": 0.12209681064624629, + "grad_norm": 2.875, + "learning_rate": 4.826045625907164e-05, + "loss": 0.8573, + "step": 6888 + }, + { + "epoch": 0.12213226268185787, + "grad_norm": 3.0625, + "learning_rate": 4.825943254508881e-05, + "loss": 0.8613, + "step": 6890 + }, + { + "epoch": 0.12216771471746943, + "grad_norm": 2.5625, + "learning_rate": 4.825840854083311e-05, + "loss": 0.8319, + "step": 6892 + }, + { + "epoch": 0.122203166753081, + "grad_norm": 2.8125, + "learning_rate": 4.8257384246317316e-05, + "loss": 0.827, + "step": 6894 + }, + { + "epoch": 0.12223861878869258, + "grad_norm": 2.265625, + "learning_rate": 4.8256359661554215e-05, + "loss": 0.8295, + "step": 6896 + }, + { + "epoch": 0.12227407082430414, + "grad_norm": 2.796875, + "learning_rate": 4.825533478655658e-05, + "loss": 0.8233, + "step": 6898 + }, + { + "epoch": 0.12230952285991571, + "grad_norm": 2.640625, + "learning_rate": 4.825430962133722e-05, + "loss": 0.8086, + "step": 6900 + }, + { + "epoch": 0.12234497489552729, + "grad_norm": 2.765625, + "learning_rate": 4.825328416590891e-05, + "loss": 0.7928, + "step": 6902 + }, + { + "epoch": 0.12238042693113885, + "grad_norm": 2.640625, + "learning_rate": 4.825225842028447e-05, + "loss": 0.7984, + "step": 6904 + }, + { + "epoch": 0.12241587896675042, + "grad_norm": 2.65625, + "learning_rate": 4.8251232384476675e-05, + "loss": 0.8349, + "step": 6906 + }, + { + "epoch": 0.122451331002362, + "grad_norm": 2.8125, + "learning_rate": 4.825020605849835e-05, + "loss": 0.8235, + "step": 6908 + }, + { + "epoch": 0.12248678303797356, + "grad_norm": 2.90625, + "learning_rate": 4.8249179442362294e-05, + "loss": 0.8255, + "step": 6910 + }, + { + "epoch": 0.12252223507358513, + "grad_norm": 2.828125, + "learning_rate": 4.824815253608132e-05, + "loss": 0.842, + "step": 6912 + }, + { + "epoch": 0.1225576871091967, + "grad_norm": 2.921875, + "learning_rate": 4.8247125339668244e-05, + "loss": 0.8267, + "step": 6914 + }, + { + "epoch": 0.12259313914480827, + "grad_norm": 2.609375, + "learning_rate": 4.824609785313589e-05, + "loss": 0.8199, + "step": 6916 + }, + { + "epoch": 0.12262859118041984, + "grad_norm": 2.875, + "learning_rate": 4.824507007649708e-05, + "loss": 0.8283, + "step": 6918 + }, + { + "epoch": 0.12266404321603142, + "grad_norm": 2.703125, + "learning_rate": 4.8244042009764625e-05, + "loss": 0.8408, + "step": 6920 + }, + { + "epoch": 0.12269949525164298, + "grad_norm": 2.859375, + "learning_rate": 4.8243013652951374e-05, + "loss": 0.8198, + "step": 6922 + }, + { + "epoch": 0.12273494728725455, + "grad_norm": 2.71875, + "learning_rate": 4.824198500607016e-05, + "loss": 0.8309, + "step": 6924 + }, + { + "epoch": 0.12277039932286613, + "grad_norm": 2.78125, + "learning_rate": 4.82409560691338e-05, + "loss": 0.7655, + "step": 6926 + }, + { + "epoch": 0.12280585135847769, + "grad_norm": 3.0, + "learning_rate": 4.823992684215516e-05, + "loss": 0.8159, + "step": 6928 + }, + { + "epoch": 0.12284130339408926, + "grad_norm": 3.15625, + "learning_rate": 4.823889732514707e-05, + "loss": 0.8199, + "step": 6930 + }, + { + "epoch": 0.12287675542970083, + "grad_norm": 2.84375, + "learning_rate": 4.823786751812238e-05, + "loss": 0.8167, + "step": 6932 + }, + { + "epoch": 0.1229122074653124, + "grad_norm": 3.046875, + "learning_rate": 4.8236837421093946e-05, + "loss": 0.7913, + "step": 6934 + }, + { + "epoch": 0.12294765950092397, + "grad_norm": 2.625, + "learning_rate": 4.823580703407462e-05, + "loss": 0.8211, + "step": 6936 + }, + { + "epoch": 0.12298311153653554, + "grad_norm": 2.609375, + "learning_rate": 4.823477635707726e-05, + "loss": 0.8176, + "step": 6938 + }, + { + "epoch": 0.1230185635721471, + "grad_norm": 2.78125, + "learning_rate": 4.8233745390114734e-05, + "loss": 0.7938, + "step": 6940 + }, + { + "epoch": 0.12305401560775868, + "grad_norm": 2.9375, + "learning_rate": 4.82327141331999e-05, + "loss": 0.8201, + "step": 6942 + }, + { + "epoch": 0.12308946764337025, + "grad_norm": 2.765625, + "learning_rate": 4.823168258634564e-05, + "loss": 0.8104, + "step": 6944 + }, + { + "epoch": 0.12312491967898181, + "grad_norm": 2.765625, + "learning_rate": 4.823065074956481e-05, + "loss": 0.7937, + "step": 6946 + }, + { + "epoch": 0.12316037171459339, + "grad_norm": 2.625, + "learning_rate": 4.822961862287031e-05, + "loss": 0.8213, + "step": 6948 + }, + { + "epoch": 0.12319582375020496, + "grad_norm": 2.640625, + "learning_rate": 4.8228586206274996e-05, + "loss": 0.8671, + "step": 6950 + }, + { + "epoch": 0.12323127578581652, + "grad_norm": 2.65625, + "learning_rate": 4.8227553499791774e-05, + "loss": 0.8332, + "step": 6952 + }, + { + "epoch": 0.1232667278214281, + "grad_norm": 2.703125, + "learning_rate": 4.8226520503433515e-05, + "loss": 0.7827, + "step": 6954 + }, + { + "epoch": 0.12330217985703967, + "grad_norm": 2.796875, + "learning_rate": 4.8225487217213114e-05, + "loss": 0.8011, + "step": 6956 + }, + { + "epoch": 0.12333763189265123, + "grad_norm": 2.71875, + "learning_rate": 4.822445364114349e-05, + "loss": 0.816, + "step": 6958 + }, + { + "epoch": 0.12337308392826281, + "grad_norm": 2.546875, + "learning_rate": 4.82234197752375e-05, + "loss": 0.8316, + "step": 6960 + }, + { + "epoch": 0.12340853596387437, + "grad_norm": 2.6875, + "learning_rate": 4.822238561950808e-05, + "loss": 0.8169, + "step": 6962 + }, + { + "epoch": 0.12344398799948594, + "grad_norm": 2.875, + "learning_rate": 4.8221351173968124e-05, + "loss": 0.8094, + "step": 6964 + }, + { + "epoch": 0.12347944003509752, + "grad_norm": 2.78125, + "learning_rate": 4.822031643863053e-05, + "loss": 0.8237, + "step": 6966 + }, + { + "epoch": 0.12351489207070908, + "grad_norm": 2.640625, + "learning_rate": 4.821928141350823e-05, + "loss": 0.8092, + "step": 6968 + }, + { + "epoch": 0.12355034410632065, + "grad_norm": 2.359375, + "learning_rate": 4.821824609861414e-05, + "loss": 0.8337, + "step": 6970 + }, + { + "epoch": 0.12358579614193223, + "grad_norm": 2.75, + "learning_rate": 4.821721049396117e-05, + "loss": 0.8329, + "step": 6972 + }, + { + "epoch": 0.12362124817754379, + "grad_norm": 2.65625, + "learning_rate": 4.821617459956225e-05, + "loss": 0.823, + "step": 6974 + }, + { + "epoch": 0.12365670021315536, + "grad_norm": 2.703125, + "learning_rate": 4.8215138415430313e-05, + "loss": 0.8266, + "step": 6976 + }, + { + "epoch": 0.12369215224876694, + "grad_norm": 3.015625, + "learning_rate": 4.821410194157827e-05, + "loss": 0.8009, + "step": 6978 + }, + { + "epoch": 0.1237276042843785, + "grad_norm": 2.71875, + "learning_rate": 4.821306517801908e-05, + "loss": 0.8233, + "step": 6980 + }, + { + "epoch": 0.12376305631999007, + "grad_norm": 2.765625, + "learning_rate": 4.821202812476567e-05, + "loss": 0.8174, + "step": 6982 + }, + { + "epoch": 0.12379850835560165, + "grad_norm": 2.859375, + "learning_rate": 4.821099078183098e-05, + "loss": 0.8217, + "step": 6984 + }, + { + "epoch": 0.12383396039121321, + "grad_norm": 2.609375, + "learning_rate": 4.8209953149227966e-05, + "loss": 0.7908, + "step": 6986 + }, + { + "epoch": 0.12386941242682478, + "grad_norm": 2.640625, + "learning_rate": 4.8208915226969566e-05, + "loss": 0.8364, + "step": 6988 + }, + { + "epoch": 0.12390486446243636, + "grad_norm": 2.953125, + "learning_rate": 4.820787701506874e-05, + "loss": 0.8028, + "step": 6990 + }, + { + "epoch": 0.12394031649804792, + "grad_norm": 3.0, + "learning_rate": 4.820683851353844e-05, + "loss": 0.8089, + "step": 6992 + }, + { + "epoch": 0.12397576853365949, + "grad_norm": 2.640625, + "learning_rate": 4.820579972239163e-05, + "loss": 0.8408, + "step": 6994 + }, + { + "epoch": 0.12401122056927107, + "grad_norm": 2.96875, + "learning_rate": 4.8204760641641275e-05, + "loss": 0.8123, + "step": 6996 + }, + { + "epoch": 0.12404667260488263, + "grad_norm": 2.671875, + "learning_rate": 4.820372127130034e-05, + "loss": 0.7928, + "step": 6998 + }, + { + "epoch": 0.1240821246404942, + "grad_norm": 2.96875, + "learning_rate": 4.8202681611381795e-05, + "loss": 0.8467, + "step": 7000 + }, + { + "epoch": 0.12411757667610578, + "grad_norm": 2.890625, + "learning_rate": 4.8201641661898625e-05, + "loss": 0.8613, + "step": 7002 + }, + { + "epoch": 0.12415302871171734, + "grad_norm": 2.765625, + "learning_rate": 4.820060142286379e-05, + "loss": 0.8444, + "step": 7004 + }, + { + "epoch": 0.12418848074732891, + "grad_norm": 2.59375, + "learning_rate": 4.819956089429028e-05, + "loss": 0.8008, + "step": 7006 + }, + { + "epoch": 0.12422393278294049, + "grad_norm": 3.015625, + "learning_rate": 4.8198520076191085e-05, + "loss": 0.8038, + "step": 7008 + }, + { + "epoch": 0.12425938481855205, + "grad_norm": 2.703125, + "learning_rate": 4.819747896857919e-05, + "loss": 0.8439, + "step": 7010 + }, + { + "epoch": 0.12429483685416362, + "grad_norm": 2.671875, + "learning_rate": 4.819643757146759e-05, + "loss": 0.8163, + "step": 7012 + }, + { + "epoch": 0.1243302888897752, + "grad_norm": 2.796875, + "learning_rate": 4.819539588486929e-05, + "loss": 0.8019, + "step": 7014 + }, + { + "epoch": 0.12436574092538676, + "grad_norm": 2.484375, + "learning_rate": 4.819435390879726e-05, + "loss": 0.8193, + "step": 7016 + }, + { + "epoch": 0.12440119296099833, + "grad_norm": 2.671875, + "learning_rate": 4.8193311643264543e-05, + "loss": 0.8196, + "step": 7018 + }, + { + "epoch": 0.1244366449966099, + "grad_norm": 2.734375, + "learning_rate": 4.819226908828412e-05, + "loss": 0.8168, + "step": 7020 + }, + { + "epoch": 0.12447209703222147, + "grad_norm": 2.703125, + "learning_rate": 4.819122624386902e-05, + "loss": 0.8314, + "step": 7022 + }, + { + "epoch": 0.12450754906783304, + "grad_norm": 2.765625, + "learning_rate": 4.819018311003223e-05, + "loss": 0.7862, + "step": 7024 + }, + { + "epoch": 0.12454300110344461, + "grad_norm": 3.046875, + "learning_rate": 4.818913968678679e-05, + "loss": 0.8643, + "step": 7026 + }, + { + "epoch": 0.12457845313905618, + "grad_norm": 2.578125, + "learning_rate": 4.818809597414572e-05, + "loss": 0.8531, + "step": 7028 + }, + { + "epoch": 0.12461390517466775, + "grad_norm": 2.5625, + "learning_rate": 4.818705197212204e-05, + "loss": 0.8179, + "step": 7030 + }, + { + "epoch": 0.12464935721027932, + "grad_norm": 2.65625, + "learning_rate": 4.818600768072878e-05, + "loss": 0.786, + "step": 7032 + }, + { + "epoch": 0.12468480924589088, + "grad_norm": 2.765625, + "learning_rate": 4.818496309997898e-05, + "loss": 0.7907, + "step": 7034 + }, + { + "epoch": 0.12472026128150246, + "grad_norm": 2.671875, + "learning_rate": 4.8183918229885664e-05, + "loss": 0.7994, + "step": 7036 + }, + { + "epoch": 0.12475571331711403, + "grad_norm": 2.78125, + "learning_rate": 4.8182873070461874e-05, + "loss": 0.8645, + "step": 7038 + }, + { + "epoch": 0.1247911653527256, + "grad_norm": 2.84375, + "learning_rate": 4.818182762172066e-05, + "loss": 0.8562, + "step": 7040 + }, + { + "epoch": 0.12482661738833717, + "grad_norm": 2.9375, + "learning_rate": 4.818078188367506e-05, + "loss": 0.8527, + "step": 7042 + }, + { + "epoch": 0.12486206942394873, + "grad_norm": 3.46875, + "learning_rate": 4.8179735856338144e-05, + "loss": 0.8573, + "step": 7044 + }, + { + "epoch": 0.1248975214595603, + "grad_norm": 3.046875, + "learning_rate": 4.8178689539722946e-05, + "loss": 0.8259, + "step": 7046 + }, + { + "epoch": 0.12493297349517188, + "grad_norm": 2.421875, + "learning_rate": 4.817764293384253e-05, + "loss": 0.8099, + "step": 7048 + }, + { + "epoch": 0.12496842553078344, + "grad_norm": 2.90625, + "learning_rate": 4.817659603870995e-05, + "loss": 0.8493, + "step": 7050 + }, + { + "epoch": 0.12500387756639503, + "grad_norm": 3.203125, + "learning_rate": 4.817554885433829e-05, + "loss": 0.8322, + "step": 7052 + }, + { + "epoch": 0.1250393296020066, + "grad_norm": 2.75, + "learning_rate": 4.8174501380740605e-05, + "loss": 0.8546, + "step": 7054 + }, + { + "epoch": 0.12507478163761815, + "grad_norm": 2.78125, + "learning_rate": 4.817345361792996e-05, + "loss": 0.864, + "step": 7056 + }, + { + "epoch": 0.12511023367322974, + "grad_norm": 2.671875, + "learning_rate": 4.8172405565919456e-05, + "loss": 0.7856, + "step": 7058 + }, + { + "epoch": 0.1251456857088413, + "grad_norm": 2.8125, + "learning_rate": 4.8171357224722144e-05, + "loss": 0.8015, + "step": 7060 + }, + { + "epoch": 0.12518113774445286, + "grad_norm": 2.65625, + "learning_rate": 4.817030859435113e-05, + "loss": 0.8243, + "step": 7062 + }, + { + "epoch": 0.12521658978006445, + "grad_norm": 2.546875, + "learning_rate": 4.816925967481949e-05, + "loss": 0.796, + "step": 7064 + }, + { + "epoch": 0.125252041815676, + "grad_norm": 2.609375, + "learning_rate": 4.816821046614031e-05, + "loss": 0.8522, + "step": 7066 + }, + { + "epoch": 0.12528749385128757, + "grad_norm": 2.875, + "learning_rate": 4.816716096832669e-05, + "loss": 0.7957, + "step": 7068 + }, + { + "epoch": 0.12532294588689916, + "grad_norm": 3.375, + "learning_rate": 4.816611118139173e-05, + "loss": 0.8612, + "step": 7070 + }, + { + "epoch": 0.12535839792251072, + "grad_norm": 2.609375, + "learning_rate": 4.816506110534852e-05, + "loss": 0.8112, + "step": 7072 + }, + { + "epoch": 0.12539384995812228, + "grad_norm": 2.828125, + "learning_rate": 4.8164010740210176e-05, + "loss": 0.7945, + "step": 7074 + }, + { + "epoch": 0.12542930199373387, + "grad_norm": 2.5625, + "learning_rate": 4.8162960085989806e-05, + "loss": 0.8379, + "step": 7076 + }, + { + "epoch": 0.12546475402934543, + "grad_norm": 2.796875, + "learning_rate": 4.816190914270051e-05, + "loss": 0.7771, + "step": 7078 + }, + { + "epoch": 0.125500206064957, + "grad_norm": 2.625, + "learning_rate": 4.816085791035543e-05, + "loss": 0.7982, + "step": 7080 + }, + { + "epoch": 0.12553565810056858, + "grad_norm": 3.046875, + "learning_rate": 4.815980638896765e-05, + "loss": 0.8187, + "step": 7082 + }, + { + "epoch": 0.12557111013618014, + "grad_norm": 3.0, + "learning_rate": 4.8158754578550315e-05, + "loss": 0.8395, + "step": 7084 + }, + { + "epoch": 0.1256065621717917, + "grad_norm": 2.609375, + "learning_rate": 4.815770247911655e-05, + "loss": 0.8151, + "step": 7086 + }, + { + "epoch": 0.12564201420740326, + "grad_norm": 2.921875, + "learning_rate": 4.815665009067948e-05, + "loss": 0.838, + "step": 7088 + }, + { + "epoch": 0.12567746624301485, + "grad_norm": 2.828125, + "learning_rate": 4.815559741325223e-05, + "loss": 0.8211, + "step": 7090 + }, + { + "epoch": 0.1257129182786264, + "grad_norm": 2.796875, + "learning_rate": 4.815454444684796e-05, + "loss": 0.8336, + "step": 7092 + }, + { + "epoch": 0.12574837031423797, + "grad_norm": 2.921875, + "learning_rate": 4.8153491191479795e-05, + "loss": 0.8184, + "step": 7094 + }, + { + "epoch": 0.12578382234984956, + "grad_norm": 2.484375, + "learning_rate": 4.8152437647160884e-05, + "loss": 0.8088, + "step": 7096 + }, + { + "epoch": 0.12581927438546112, + "grad_norm": 2.875, + "learning_rate": 4.815138381390437e-05, + "loss": 0.8312, + "step": 7098 + }, + { + "epoch": 0.12585472642107268, + "grad_norm": 2.390625, + "learning_rate": 4.81503296917234e-05, + "loss": 0.7815, + "step": 7100 + }, + { + "epoch": 0.12589017845668427, + "grad_norm": 2.71875, + "learning_rate": 4.814927528063116e-05, + "loss": 0.8262, + "step": 7102 + }, + { + "epoch": 0.12592563049229583, + "grad_norm": 2.609375, + "learning_rate": 4.814822058064077e-05, + "loss": 0.8283, + "step": 7104 + }, + { + "epoch": 0.1259610825279074, + "grad_norm": 2.703125, + "learning_rate": 4.814716559176541e-05, + "loss": 0.8609, + "step": 7106 + }, + { + "epoch": 0.12599653456351897, + "grad_norm": 2.84375, + "learning_rate": 4.8146110314018245e-05, + "loss": 0.8504, + "step": 7108 + }, + { + "epoch": 0.12603198659913054, + "grad_norm": 2.796875, + "learning_rate": 4.814505474741244e-05, + "loss": 0.8211, + "step": 7110 + }, + { + "epoch": 0.1260674386347421, + "grad_norm": 2.828125, + "learning_rate": 4.814399889196119e-05, + "loss": 0.8408, + "step": 7112 + }, + { + "epoch": 0.12610289067035368, + "grad_norm": 2.515625, + "learning_rate": 4.8142942747677634e-05, + "loss": 0.8322, + "step": 7114 + }, + { + "epoch": 0.12613834270596525, + "grad_norm": 2.75, + "learning_rate": 4.814188631457498e-05, + "loss": 0.79, + "step": 7116 + }, + { + "epoch": 0.1261737947415768, + "grad_norm": 2.96875, + "learning_rate": 4.81408295926664e-05, + "loss": 0.8469, + "step": 7118 + }, + { + "epoch": 0.1262092467771884, + "grad_norm": 2.9375, + "learning_rate": 4.81397725819651e-05, + "loss": 0.8656, + "step": 7120 + }, + { + "epoch": 0.12624469881279995, + "grad_norm": 2.5625, + "learning_rate": 4.813871528248425e-05, + "loss": 0.8063, + "step": 7122 + }, + { + "epoch": 0.12628015084841152, + "grad_norm": 2.765625, + "learning_rate": 4.813765769423705e-05, + "loss": 0.7747, + "step": 7124 + }, + { + "epoch": 0.1263156028840231, + "grad_norm": 2.84375, + "learning_rate": 4.8136599817236706e-05, + "loss": 0.8538, + "step": 7126 + }, + { + "epoch": 0.12635105491963466, + "grad_norm": 2.421875, + "learning_rate": 4.8135541651496414e-05, + "loss": 0.7959, + "step": 7128 + }, + { + "epoch": 0.12638650695524623, + "grad_norm": 2.84375, + "learning_rate": 4.8134483197029376e-05, + "loss": 0.7977, + "step": 7130 + }, + { + "epoch": 0.1264219589908578, + "grad_norm": 2.65625, + "learning_rate": 4.813342445384881e-05, + "loss": 0.8197, + "step": 7132 + }, + { + "epoch": 0.12645741102646937, + "grad_norm": 2.765625, + "learning_rate": 4.8132365421967926e-05, + "loss": 0.8116, + "step": 7134 + }, + { + "epoch": 0.12649286306208093, + "grad_norm": 2.671875, + "learning_rate": 4.813130610139994e-05, + "loss": 0.8418, + "step": 7136 + }, + { + "epoch": 0.12652831509769252, + "grad_norm": 2.6875, + "learning_rate": 4.813024649215807e-05, + "loss": 0.8424, + "step": 7138 + }, + { + "epoch": 0.12656376713330408, + "grad_norm": 2.734375, + "learning_rate": 4.812918659425555e-05, + "loss": 0.7927, + "step": 7140 + }, + { + "epoch": 0.12659921916891564, + "grad_norm": 2.796875, + "learning_rate": 4.812812640770559e-05, + "loss": 0.8142, + "step": 7142 + }, + { + "epoch": 0.12663467120452723, + "grad_norm": 2.546875, + "learning_rate": 4.8127065932521434e-05, + "loss": 0.8158, + "step": 7144 + }, + { + "epoch": 0.1266701232401388, + "grad_norm": 2.765625, + "learning_rate": 4.8126005168716305e-05, + "loss": 0.853, + "step": 7146 + }, + { + "epoch": 0.12670557527575035, + "grad_norm": 2.75, + "learning_rate": 4.812494411630345e-05, + "loss": 0.844, + "step": 7148 + }, + { + "epoch": 0.12674102731136194, + "grad_norm": 2.765625, + "learning_rate": 4.8123882775296113e-05, + "loss": 0.84, + "step": 7150 + }, + { + "epoch": 0.1267764793469735, + "grad_norm": 2.59375, + "learning_rate": 4.812282114570753e-05, + "loss": 0.8119, + "step": 7152 + }, + { + "epoch": 0.12681193138258506, + "grad_norm": 2.96875, + "learning_rate": 4.812175922755096e-05, + "loss": 0.8149, + "step": 7154 + }, + { + "epoch": 0.12684738341819665, + "grad_norm": 2.9375, + "learning_rate": 4.812069702083965e-05, + "loss": 0.8097, + "step": 7156 + }, + { + "epoch": 0.1268828354538082, + "grad_norm": 2.625, + "learning_rate": 4.8119634525586856e-05, + "loss": 0.8152, + "step": 7158 + }, + { + "epoch": 0.12691828748941977, + "grad_norm": 3.015625, + "learning_rate": 4.811857174180584e-05, + "loss": 0.8176, + "step": 7160 + }, + { + "epoch": 0.12695373952503136, + "grad_norm": 2.546875, + "learning_rate": 4.811750866950986e-05, + "loss": 0.804, + "step": 7162 + }, + { + "epoch": 0.12698919156064292, + "grad_norm": 2.640625, + "learning_rate": 4.81164453087122e-05, + "loss": 0.8052, + "step": 7164 + }, + { + "epoch": 0.12702464359625448, + "grad_norm": 2.796875, + "learning_rate": 4.8115381659426105e-05, + "loss": 0.8712, + "step": 7166 + }, + { + "epoch": 0.12706009563186607, + "grad_norm": 2.421875, + "learning_rate": 4.811431772166486e-05, + "loss": 0.8033, + "step": 7168 + }, + { + "epoch": 0.12709554766747763, + "grad_norm": 3.109375, + "learning_rate": 4.8113253495441745e-05, + "loss": 0.8239, + "step": 7170 + }, + { + "epoch": 0.1271309997030892, + "grad_norm": 2.875, + "learning_rate": 4.811218898077005e-05, + "loss": 0.8461, + "step": 7172 + }, + { + "epoch": 0.12716645173870078, + "grad_norm": 2.640625, + "learning_rate": 4.811112417766304e-05, + "loss": 0.8235, + "step": 7174 + }, + { + "epoch": 0.12720190377431234, + "grad_norm": 2.828125, + "learning_rate": 4.811005908613402e-05, + "loss": 0.8135, + "step": 7176 + }, + { + "epoch": 0.1272373558099239, + "grad_norm": 2.640625, + "learning_rate": 4.810899370619627e-05, + "loss": 0.8182, + "step": 7178 + }, + { + "epoch": 0.1272728078455355, + "grad_norm": 2.84375, + "learning_rate": 4.81079280378631e-05, + "loss": 0.812, + "step": 7180 + }, + { + "epoch": 0.12730825988114705, + "grad_norm": 2.875, + "learning_rate": 4.81068620811478e-05, + "loss": 0.8564, + "step": 7182 + }, + { + "epoch": 0.1273437119167586, + "grad_norm": 2.953125, + "learning_rate": 4.810579583606367e-05, + "loss": 0.8287, + "step": 7184 + }, + { + "epoch": 0.1273791639523702, + "grad_norm": 2.828125, + "learning_rate": 4.810472930262402e-05, + "loss": 0.8113, + "step": 7186 + }, + { + "epoch": 0.12741461598798176, + "grad_norm": 2.90625, + "learning_rate": 4.810366248084216e-05, + "loss": 0.8531, + "step": 7188 + }, + { + "epoch": 0.12745006802359332, + "grad_norm": 2.5625, + "learning_rate": 4.810259537073141e-05, + "loss": 0.7819, + "step": 7190 + }, + { + "epoch": 0.1274855200592049, + "grad_norm": 3.015625, + "learning_rate": 4.8101527972305075e-05, + "loss": 0.8039, + "step": 7192 + }, + { + "epoch": 0.12752097209481647, + "grad_norm": 2.984375, + "learning_rate": 4.810046028557649e-05, + "loss": 0.8555, + "step": 7194 + }, + { + "epoch": 0.12755642413042803, + "grad_norm": 2.890625, + "learning_rate": 4.8099392310558966e-05, + "loss": 0.8321, + "step": 7196 + }, + { + "epoch": 0.12759187616603962, + "grad_norm": 2.828125, + "learning_rate": 4.809832404726584e-05, + "loss": 0.8268, + "step": 7198 + }, + { + "epoch": 0.12762732820165118, + "grad_norm": 2.671875, + "learning_rate": 4.8097255495710435e-05, + "loss": 0.845, + "step": 7200 + }, + { + "epoch": 0.12766278023726274, + "grad_norm": 2.8125, + "learning_rate": 4.80961866559061e-05, + "loss": 0.8416, + "step": 7202 + }, + { + "epoch": 0.12769823227287433, + "grad_norm": 3.015625, + "learning_rate": 4.809511752786616e-05, + "loss": 0.8385, + "step": 7204 + }, + { + "epoch": 0.1277336843084859, + "grad_norm": 2.359375, + "learning_rate": 4.809404811160397e-05, + "loss": 0.7694, + "step": 7206 + }, + { + "epoch": 0.12776913634409745, + "grad_norm": 2.734375, + "learning_rate": 4.809297840713287e-05, + "loss": 0.8489, + "step": 7208 + }, + { + "epoch": 0.12780458837970904, + "grad_norm": 2.546875, + "learning_rate": 4.8091908414466206e-05, + "loss": 0.88, + "step": 7210 + }, + { + "epoch": 0.1278400404153206, + "grad_norm": 2.71875, + "learning_rate": 4.8090838133617334e-05, + "loss": 0.8352, + "step": 7212 + }, + { + "epoch": 0.12787549245093216, + "grad_norm": 2.765625, + "learning_rate": 4.808976756459961e-05, + "loss": 0.8255, + "step": 7214 + }, + { + "epoch": 0.12791094448654375, + "grad_norm": 2.546875, + "learning_rate": 4.80886967074264e-05, + "loss": 0.8162, + "step": 7216 + }, + { + "epoch": 0.1279463965221553, + "grad_norm": 2.625, + "learning_rate": 4.808762556211106e-05, + "loss": 0.823, + "step": 7218 + }, + { + "epoch": 0.12798184855776687, + "grad_norm": 2.625, + "learning_rate": 4.808655412866697e-05, + "loss": 0.7868, + "step": 7220 + }, + { + "epoch": 0.12801730059337846, + "grad_norm": 2.734375, + "learning_rate": 4.8085482407107483e-05, + "loss": 0.8302, + "step": 7222 + }, + { + "epoch": 0.12805275262899002, + "grad_norm": 2.6875, + "learning_rate": 4.808441039744599e-05, + "loss": 0.7836, + "step": 7224 + }, + { + "epoch": 0.12808820466460158, + "grad_norm": 3.015625, + "learning_rate": 4.8083338099695864e-05, + "loss": 0.8161, + "step": 7226 + }, + { + "epoch": 0.12812365670021317, + "grad_norm": 2.703125, + "learning_rate": 4.8082265513870484e-05, + "loss": 0.7852, + "step": 7228 + }, + { + "epoch": 0.12815910873582473, + "grad_norm": 2.78125, + "learning_rate": 4.808119263998324e-05, + "loss": 0.8264, + "step": 7230 + }, + { + "epoch": 0.1281945607714363, + "grad_norm": 2.546875, + "learning_rate": 4.808011947804751e-05, + "loss": 0.7741, + "step": 7232 + }, + { + "epoch": 0.12823001280704788, + "grad_norm": 3.140625, + "learning_rate": 4.807904602807671e-05, + "loss": 0.8024, + "step": 7234 + }, + { + "epoch": 0.12826546484265944, + "grad_norm": 2.671875, + "learning_rate": 4.807797229008422e-05, + "loss": 0.8464, + "step": 7236 + }, + { + "epoch": 0.128300916878271, + "grad_norm": 2.765625, + "learning_rate": 4.8076898264083435e-05, + "loss": 0.8072, + "step": 7238 + }, + { + "epoch": 0.1283363689138826, + "grad_norm": 2.875, + "learning_rate": 4.8075823950087774e-05, + "loss": 0.784, + "step": 7240 + }, + { + "epoch": 0.12837182094949415, + "grad_norm": 3.078125, + "learning_rate": 4.807474934811063e-05, + "loss": 0.8259, + "step": 7242 + }, + { + "epoch": 0.1284072729851057, + "grad_norm": 2.828125, + "learning_rate": 4.8073674458165416e-05, + "loss": 0.7873, + "step": 7244 + }, + { + "epoch": 0.1284427250207173, + "grad_norm": 2.734375, + "learning_rate": 4.8072599280265565e-05, + "loss": 0.8289, + "step": 7246 + }, + { + "epoch": 0.12847817705632886, + "grad_norm": 2.765625, + "learning_rate": 4.807152381442447e-05, + "loss": 0.8145, + "step": 7248 + }, + { + "epoch": 0.12851362909194042, + "grad_norm": 2.84375, + "learning_rate": 4.807044806065557e-05, + "loss": 0.8007, + "step": 7250 + }, + { + "epoch": 0.128549081127552, + "grad_norm": 2.71875, + "learning_rate": 4.806937201897228e-05, + "loss": 0.8343, + "step": 7252 + }, + { + "epoch": 0.12858453316316357, + "grad_norm": 2.875, + "learning_rate": 4.8068295689388035e-05, + "loss": 0.8283, + "step": 7254 + }, + { + "epoch": 0.12861998519877513, + "grad_norm": 2.828125, + "learning_rate": 4.806721907191626e-05, + "loss": 0.8135, + "step": 7256 + }, + { + "epoch": 0.1286554372343867, + "grad_norm": 2.625, + "learning_rate": 4.8066142166570397e-05, + "loss": 0.8934, + "step": 7258 + }, + { + "epoch": 0.12869088926999828, + "grad_norm": 2.546875, + "learning_rate": 4.806506497336388e-05, + "loss": 0.8221, + "step": 7260 + }, + { + "epoch": 0.12872634130560984, + "grad_norm": 2.953125, + "learning_rate": 4.8063987492310156e-05, + "loss": 0.8494, + "step": 7262 + }, + { + "epoch": 0.1287617933412214, + "grad_norm": 2.578125, + "learning_rate": 4.806290972342268e-05, + "loss": 0.7975, + "step": 7264 + }, + { + "epoch": 0.128797245376833, + "grad_norm": 2.578125, + "learning_rate": 4.806183166671489e-05, + "loss": 0.801, + "step": 7266 + }, + { + "epoch": 0.12883269741244455, + "grad_norm": 2.6875, + "learning_rate": 4.8060753322200244e-05, + "loss": 0.8517, + "step": 7268 + }, + { + "epoch": 0.1288681494480561, + "grad_norm": 2.828125, + "learning_rate": 4.80596746898922e-05, + "loss": 0.8417, + "step": 7270 + }, + { + "epoch": 0.1289036014836677, + "grad_norm": 2.671875, + "learning_rate": 4.8058595769804224e-05, + "loss": 0.8524, + "step": 7272 + }, + { + "epoch": 0.12893905351927926, + "grad_norm": 2.671875, + "learning_rate": 4.805751656194977e-05, + "loss": 0.817, + "step": 7274 + }, + { + "epoch": 0.12897450555489082, + "grad_norm": 2.640625, + "learning_rate": 4.8056437066342315e-05, + "loss": 0.8042, + "step": 7276 + }, + { + "epoch": 0.1290099575905024, + "grad_norm": 2.625, + "learning_rate": 4.805535728299533e-05, + "loss": 0.8078, + "step": 7278 + }, + { + "epoch": 0.12904540962611397, + "grad_norm": 2.765625, + "learning_rate": 4.805427721192228e-05, + "loss": 0.8814, + "step": 7280 + }, + { + "epoch": 0.12908086166172553, + "grad_norm": 2.765625, + "learning_rate": 4.805319685313666e-05, + "loss": 0.8336, + "step": 7282 + }, + { + "epoch": 0.12911631369733712, + "grad_norm": 2.828125, + "learning_rate": 4.805211620665194e-05, + "loss": 0.8131, + "step": 7284 + }, + { + "epoch": 0.12915176573294868, + "grad_norm": 2.828125, + "learning_rate": 4.805103527248161e-05, + "loss": 0.8335, + "step": 7286 + }, + { + "epoch": 0.12918721776856024, + "grad_norm": 2.703125, + "learning_rate": 4.804995405063916e-05, + "loss": 0.8351, + "step": 7288 + }, + { + "epoch": 0.12922266980417182, + "grad_norm": 2.859375, + "learning_rate": 4.804887254113809e-05, + "loss": 0.8125, + "step": 7290 + }, + { + "epoch": 0.12925812183978339, + "grad_norm": 2.75, + "learning_rate": 4.804779074399189e-05, + "loss": 0.8221, + "step": 7292 + }, + { + "epoch": 0.12929357387539495, + "grad_norm": 3.0, + "learning_rate": 4.8046708659214054e-05, + "loss": 0.8297, + "step": 7294 + }, + { + "epoch": 0.12932902591100653, + "grad_norm": 2.6875, + "learning_rate": 4.8045626286818106e-05, + "loss": 0.8106, + "step": 7296 + }, + { + "epoch": 0.1293644779466181, + "grad_norm": 2.484375, + "learning_rate": 4.804454362681754e-05, + "loss": 0.8042, + "step": 7298 + }, + { + "epoch": 0.12939992998222966, + "grad_norm": 2.9375, + "learning_rate": 4.804346067922587e-05, + "loss": 0.821, + "step": 7300 + }, + { + "epoch": 0.12943538201784124, + "grad_norm": 3.046875, + "learning_rate": 4.80423774440566e-05, + "loss": 0.7792, + "step": 7302 + }, + { + "epoch": 0.1294708340534528, + "grad_norm": 2.671875, + "learning_rate": 4.804129392132327e-05, + "loss": 0.8335, + "step": 7304 + }, + { + "epoch": 0.12950628608906437, + "grad_norm": 2.6875, + "learning_rate": 4.804021011103939e-05, + "loss": 0.8099, + "step": 7306 + }, + { + "epoch": 0.12954173812467595, + "grad_norm": 2.734375, + "learning_rate": 4.8039126013218474e-05, + "loss": 0.8458, + "step": 7308 + }, + { + "epoch": 0.12957719016028751, + "grad_norm": 2.671875, + "learning_rate": 4.803804162787408e-05, + "loss": 0.8252, + "step": 7310 + }, + { + "epoch": 0.12961264219589907, + "grad_norm": 2.953125, + "learning_rate": 4.803695695501972e-05, + "loss": 0.8265, + "step": 7312 + }, + { + "epoch": 0.12964809423151066, + "grad_norm": 2.734375, + "learning_rate": 4.803587199466893e-05, + "loss": 0.8698, + "step": 7314 + }, + { + "epoch": 0.12968354626712222, + "grad_norm": 2.71875, + "learning_rate": 4.8034786746835256e-05, + "loss": 0.8338, + "step": 7316 + }, + { + "epoch": 0.12971899830273378, + "grad_norm": 2.453125, + "learning_rate": 4.803370121153225e-05, + "loss": 0.861, + "step": 7318 + }, + { + "epoch": 0.12975445033834537, + "grad_norm": 2.734375, + "learning_rate": 4.8032615388773445e-05, + "loss": 0.8264, + "step": 7320 + }, + { + "epoch": 0.12978990237395693, + "grad_norm": 2.5625, + "learning_rate": 4.8031529278572394e-05, + "loss": 0.839, + "step": 7322 + }, + { + "epoch": 0.1298253544095685, + "grad_norm": 2.828125, + "learning_rate": 4.803044288094266e-05, + "loss": 0.8518, + "step": 7324 + }, + { + "epoch": 0.12986080644518008, + "grad_norm": 2.90625, + "learning_rate": 4.802935619589779e-05, + "loss": 0.8324, + "step": 7326 + }, + { + "epoch": 0.12989625848079164, + "grad_norm": 2.859375, + "learning_rate": 4.802826922345136e-05, + "loss": 0.7853, + "step": 7328 + }, + { + "epoch": 0.1299317105164032, + "grad_norm": 2.921875, + "learning_rate": 4.802718196361692e-05, + "loss": 0.8865, + "step": 7330 + }, + { + "epoch": 0.1299671625520148, + "grad_norm": 2.796875, + "learning_rate": 4.802609441640805e-05, + "loss": 0.8392, + "step": 7332 + }, + { + "epoch": 0.13000261458762635, + "grad_norm": 3.0, + "learning_rate": 4.8025006581838314e-05, + "loss": 0.8358, + "step": 7334 + }, + { + "epoch": 0.1300380666232379, + "grad_norm": 2.78125, + "learning_rate": 4.802391845992129e-05, + "loss": 0.8317, + "step": 7336 + }, + { + "epoch": 0.1300735186588495, + "grad_norm": 2.671875, + "learning_rate": 4.802283005067057e-05, + "loss": 0.8186, + "step": 7338 + }, + { + "epoch": 0.13010897069446106, + "grad_norm": 2.75, + "learning_rate": 4.802174135409971e-05, + "loss": 0.8265, + "step": 7340 + }, + { + "epoch": 0.13014442273007262, + "grad_norm": 2.421875, + "learning_rate": 4.802065237022233e-05, + "loss": 0.822, + "step": 7342 + }, + { + "epoch": 0.1301798747656842, + "grad_norm": 2.765625, + "learning_rate": 4.801956309905199e-05, + "loss": 0.8397, + "step": 7344 + }, + { + "epoch": 0.13021532680129577, + "grad_norm": 2.765625, + "learning_rate": 4.80184735406023e-05, + "loss": 0.7997, + "step": 7346 + }, + { + "epoch": 0.13025077883690733, + "grad_norm": 2.71875, + "learning_rate": 4.8017383694886855e-05, + "loss": 0.7548, + "step": 7348 + }, + { + "epoch": 0.13028623087251892, + "grad_norm": 3.078125, + "learning_rate": 4.8016293561919256e-05, + "loss": 0.85, + "step": 7350 + }, + { + "epoch": 0.13032168290813048, + "grad_norm": 2.90625, + "learning_rate": 4.8015203141713114e-05, + "loss": 0.8169, + "step": 7352 + }, + { + "epoch": 0.13035713494374204, + "grad_norm": 2.640625, + "learning_rate": 4.801411243428202e-05, + "loss": 0.7821, + "step": 7354 + }, + { + "epoch": 0.13039258697935363, + "grad_norm": 2.6875, + "learning_rate": 4.80130214396396e-05, + "loss": 0.819, + "step": 7356 + }, + { + "epoch": 0.1304280390149652, + "grad_norm": 2.890625, + "learning_rate": 4.801193015779947e-05, + "loss": 0.8274, + "step": 7358 + }, + { + "epoch": 0.13046349105057675, + "grad_norm": 2.734375, + "learning_rate": 4.801083858877524e-05, + "loss": 0.8147, + "step": 7360 + }, + { + "epoch": 0.13049894308618834, + "grad_norm": 2.765625, + "learning_rate": 4.800974673258054e-05, + "loss": 0.8461, + "step": 7362 + }, + { + "epoch": 0.1305343951217999, + "grad_norm": 2.78125, + "learning_rate": 4.8008654589228984e-05, + "loss": 0.8115, + "step": 7364 + }, + { + "epoch": 0.13056984715741146, + "grad_norm": 2.640625, + "learning_rate": 4.800756215873422e-05, + "loss": 0.7553, + "step": 7366 + }, + { + "epoch": 0.13060529919302305, + "grad_norm": 2.640625, + "learning_rate": 4.8006469441109874e-05, + "loss": 0.7717, + "step": 7368 + }, + { + "epoch": 0.1306407512286346, + "grad_norm": 2.609375, + "learning_rate": 4.8005376436369576e-05, + "loss": 0.7747, + "step": 7370 + }, + { + "epoch": 0.13067620326424617, + "grad_norm": 2.9375, + "learning_rate": 4.800428314452697e-05, + "loss": 0.7978, + "step": 7372 + }, + { + "epoch": 0.13071165529985776, + "grad_norm": 3.15625, + "learning_rate": 4.800318956559571e-05, + "loss": 0.8016, + "step": 7374 + }, + { + "epoch": 0.13074710733546932, + "grad_norm": 2.703125, + "learning_rate": 4.800209569958943e-05, + "loss": 0.8473, + "step": 7376 + }, + { + "epoch": 0.13078255937108088, + "grad_norm": 2.921875, + "learning_rate": 4.80010015465218e-05, + "loss": 0.8325, + "step": 7378 + }, + { + "epoch": 0.13081801140669247, + "grad_norm": 2.671875, + "learning_rate": 4.799990710640645e-05, + "loss": 0.8087, + "step": 7380 + }, + { + "epoch": 0.13085346344230403, + "grad_norm": 2.53125, + "learning_rate": 4.799881237925704e-05, + "loss": 0.7808, + "step": 7382 + }, + { + "epoch": 0.1308889154779156, + "grad_norm": 2.734375, + "learning_rate": 4.799771736508725e-05, + "loss": 0.8534, + "step": 7384 + }, + { + "epoch": 0.13092436751352718, + "grad_norm": 2.6875, + "learning_rate": 4.7996622063910744e-05, + "loss": 0.7919, + "step": 7386 + }, + { + "epoch": 0.13095981954913874, + "grad_norm": 3.25, + "learning_rate": 4.7995526475741174e-05, + "loss": 0.8613, + "step": 7388 + }, + { + "epoch": 0.1309952715847503, + "grad_norm": 2.703125, + "learning_rate": 4.799443060059223e-05, + "loss": 0.8076, + "step": 7390 + }, + { + "epoch": 0.1310307236203619, + "grad_norm": 2.734375, + "learning_rate": 4.7993334438477576e-05, + "loss": 0.8378, + "step": 7392 + }, + { + "epoch": 0.13106617565597345, + "grad_norm": 2.625, + "learning_rate": 4.7992237989410904e-05, + "loss": 0.7977, + "step": 7394 + }, + { + "epoch": 0.131101627691585, + "grad_norm": 2.65625, + "learning_rate": 4.799114125340589e-05, + "loss": 0.8135, + "step": 7396 + }, + { + "epoch": 0.1311370797271966, + "grad_norm": 3.015625, + "learning_rate": 4.7990044230476215e-05, + "loss": 0.8572, + "step": 7398 + }, + { + "epoch": 0.13117253176280816, + "grad_norm": 2.796875, + "learning_rate": 4.798894692063559e-05, + "loss": 0.8426, + "step": 7400 + }, + { + "epoch": 0.13120798379841972, + "grad_norm": 2.71875, + "learning_rate": 4.798784932389768e-05, + "loss": 0.8142, + "step": 7402 + }, + { + "epoch": 0.1312434358340313, + "grad_norm": 2.765625, + "learning_rate": 4.798675144027621e-05, + "loss": 0.845, + "step": 7404 + }, + { + "epoch": 0.13127888786964287, + "grad_norm": 2.78125, + "learning_rate": 4.798565326978486e-05, + "loss": 0.7994, + "step": 7406 + }, + { + "epoch": 0.13131433990525443, + "grad_norm": 2.734375, + "learning_rate": 4.798455481243735e-05, + "loss": 0.8078, + "step": 7408 + }, + { + "epoch": 0.13134979194086602, + "grad_norm": 2.25, + "learning_rate": 4.798345606824739e-05, + "loss": 0.7955, + "step": 7410 + }, + { + "epoch": 0.13138524397647758, + "grad_norm": 2.78125, + "learning_rate": 4.7982357037228676e-05, + "loss": 0.7976, + "step": 7412 + }, + { + "epoch": 0.13142069601208914, + "grad_norm": 2.609375, + "learning_rate": 4.798125771939493e-05, + "loss": 0.8133, + "step": 7414 + }, + { + "epoch": 0.13145614804770073, + "grad_norm": 2.8125, + "learning_rate": 4.798015811475989e-05, + "loss": 0.7703, + "step": 7416 + }, + { + "epoch": 0.1314916000833123, + "grad_norm": 2.96875, + "learning_rate": 4.7979058223337246e-05, + "loss": 0.8613, + "step": 7418 + }, + { + "epoch": 0.13152705211892385, + "grad_norm": 2.75, + "learning_rate": 4.797795804514075e-05, + "loss": 0.81, + "step": 7420 + }, + { + "epoch": 0.13156250415453544, + "grad_norm": 2.71875, + "learning_rate": 4.797685758018413e-05, + "loss": 0.7829, + "step": 7422 + }, + { + "epoch": 0.131597956190147, + "grad_norm": 3.046875, + "learning_rate": 4.79757568284811e-05, + "loss": 0.8287, + "step": 7424 + }, + { + "epoch": 0.13163340822575856, + "grad_norm": 2.84375, + "learning_rate": 4.797465579004542e-05, + "loss": 0.8734, + "step": 7426 + }, + { + "epoch": 0.13166886026137012, + "grad_norm": 2.84375, + "learning_rate": 4.797355446489081e-05, + "loss": 0.8128, + "step": 7428 + }, + { + "epoch": 0.1317043122969817, + "grad_norm": 3.015625, + "learning_rate": 4.7972452853031035e-05, + "loss": 0.7994, + "step": 7430 + }, + { + "epoch": 0.13173976433259327, + "grad_norm": 2.71875, + "learning_rate": 4.797135095447983e-05, + "loss": 0.7838, + "step": 7432 + }, + { + "epoch": 0.13177521636820483, + "grad_norm": 2.65625, + "learning_rate": 4.797024876925095e-05, + "loss": 0.7476, + "step": 7434 + }, + { + "epoch": 0.13181066840381642, + "grad_norm": 2.71875, + "learning_rate": 4.796914629735815e-05, + "loss": 0.8098, + "step": 7436 + }, + { + "epoch": 0.13184612043942798, + "grad_norm": 2.625, + "learning_rate": 4.796804353881519e-05, + "loss": 0.8402, + "step": 7438 + }, + { + "epoch": 0.13188157247503954, + "grad_norm": 2.703125, + "learning_rate": 4.7966940493635825e-05, + "loss": 0.8256, + "step": 7440 + }, + { + "epoch": 0.13191702451065113, + "grad_norm": 2.859375, + "learning_rate": 4.796583716183383e-05, + "loss": 0.8504, + "step": 7442 + }, + { + "epoch": 0.1319524765462627, + "grad_norm": 2.90625, + "learning_rate": 4.7964733543422975e-05, + "loss": 0.7709, + "step": 7444 + }, + { + "epoch": 0.13198792858187425, + "grad_norm": 2.609375, + "learning_rate": 4.7963629638417015e-05, + "loss": 0.7819, + "step": 7446 + }, + { + "epoch": 0.13202338061748584, + "grad_norm": 2.53125, + "learning_rate": 4.7962525446829757e-05, + "loss": 0.825, + "step": 7448 + }, + { + "epoch": 0.1320588326530974, + "grad_norm": 2.6875, + "learning_rate": 4.7961420968674955e-05, + "loss": 0.8014, + "step": 7450 + }, + { + "epoch": 0.13209428468870896, + "grad_norm": 2.5, + "learning_rate": 4.796031620396641e-05, + "loss": 0.7856, + "step": 7452 + }, + { + "epoch": 0.13212973672432055, + "grad_norm": 2.453125, + "learning_rate": 4.795921115271789e-05, + "loss": 0.8327, + "step": 7454 + }, + { + "epoch": 0.1321651887599321, + "grad_norm": 2.703125, + "learning_rate": 4.79581058149432e-05, + "loss": 0.8167, + "step": 7456 + }, + { + "epoch": 0.13220064079554367, + "grad_norm": 2.765625, + "learning_rate": 4.7957000190656134e-05, + "loss": 0.7662, + "step": 7458 + }, + { + "epoch": 0.13223609283115526, + "grad_norm": 2.921875, + "learning_rate": 4.7955894279870483e-05, + "loss": 0.8741, + "step": 7460 + }, + { + "epoch": 0.13227154486676682, + "grad_norm": 2.5625, + "learning_rate": 4.7954788082600055e-05, + "loss": 0.808, + "step": 7462 + }, + { + "epoch": 0.13230699690237838, + "grad_norm": 2.609375, + "learning_rate": 4.795368159885866e-05, + "loss": 0.7918, + "step": 7464 + }, + { + "epoch": 0.13234244893798996, + "grad_norm": 2.78125, + "learning_rate": 4.7952574828660086e-05, + "loss": 0.8284, + "step": 7466 + }, + { + "epoch": 0.13237790097360153, + "grad_norm": 2.796875, + "learning_rate": 4.7951467772018164e-05, + "loss": 0.8582, + "step": 7468 + }, + { + "epoch": 0.1324133530092131, + "grad_norm": 2.5625, + "learning_rate": 4.7950360428946705e-05, + "loss": 0.8033, + "step": 7470 + }, + { + "epoch": 0.13244880504482467, + "grad_norm": 3.03125, + "learning_rate": 4.794925279945953e-05, + "loss": 0.8375, + "step": 7472 + }, + { + "epoch": 0.13248425708043624, + "grad_norm": 2.90625, + "learning_rate": 4.794814488357046e-05, + "loss": 0.8369, + "step": 7474 + }, + { + "epoch": 0.1325197091160478, + "grad_norm": 2.609375, + "learning_rate": 4.7947036681293325e-05, + "loss": 0.787, + "step": 7476 + }, + { + "epoch": 0.13255516115165938, + "grad_norm": 2.953125, + "learning_rate": 4.7945928192641944e-05, + "loss": 0.8105, + "step": 7478 + }, + { + "epoch": 0.13259061318727094, + "grad_norm": 2.90625, + "learning_rate": 4.7944819417630165e-05, + "loss": 0.8264, + "step": 7480 + }, + { + "epoch": 0.1326260652228825, + "grad_norm": 2.671875, + "learning_rate": 4.7943710356271816e-05, + "loss": 0.8561, + "step": 7482 + }, + { + "epoch": 0.1326615172584941, + "grad_norm": 2.90625, + "learning_rate": 4.794260100858074e-05, + "loss": 0.7978, + "step": 7484 + }, + { + "epoch": 0.13269696929410565, + "grad_norm": 2.921875, + "learning_rate": 4.794149137457078e-05, + "loss": 0.8171, + "step": 7486 + }, + { + "epoch": 0.13273242132971722, + "grad_norm": 2.671875, + "learning_rate": 4.794038145425579e-05, + "loss": 0.8441, + "step": 7488 + }, + { + "epoch": 0.1327678733653288, + "grad_norm": 2.859375, + "learning_rate": 4.793927124764962e-05, + "loss": 0.7825, + "step": 7490 + }, + { + "epoch": 0.13280332540094036, + "grad_norm": 2.546875, + "learning_rate": 4.7938160754766114e-05, + "loss": 0.837, + "step": 7492 + }, + { + "epoch": 0.13283877743655192, + "grad_norm": 2.71875, + "learning_rate": 4.793704997561915e-05, + "loss": 0.8297, + "step": 7494 + }, + { + "epoch": 0.1328742294721635, + "grad_norm": 2.578125, + "learning_rate": 4.793593891022257e-05, + "loss": 0.7936, + "step": 7496 + }, + { + "epoch": 0.13290968150777507, + "grad_norm": 2.984375, + "learning_rate": 4.793482755859026e-05, + "loss": 0.8175, + "step": 7498 + }, + { + "epoch": 0.13294513354338663, + "grad_norm": 2.84375, + "learning_rate": 4.793371592073607e-05, + "loss": 0.826, + "step": 7500 + }, + { + "epoch": 0.13298058557899822, + "grad_norm": 2.703125, + "learning_rate": 4.793260399667388e-05, + "loss": 0.7818, + "step": 7502 + }, + { + "epoch": 0.13301603761460978, + "grad_norm": 2.90625, + "learning_rate": 4.793149178641758e-05, + "loss": 0.8198, + "step": 7504 + }, + { + "epoch": 0.13305148965022134, + "grad_norm": 2.765625, + "learning_rate": 4.793037928998103e-05, + "loss": 0.8036, + "step": 7506 + }, + { + "epoch": 0.13308694168583293, + "grad_norm": 2.75, + "learning_rate": 4.7929266507378125e-05, + "loss": 0.8362, + "step": 7508 + }, + { + "epoch": 0.1331223937214445, + "grad_norm": 3.0, + "learning_rate": 4.792815343862275e-05, + "loss": 0.8377, + "step": 7510 + }, + { + "epoch": 0.13315784575705605, + "grad_norm": 2.84375, + "learning_rate": 4.792704008372879e-05, + "loss": 0.7947, + "step": 7512 + }, + { + "epoch": 0.13319329779266764, + "grad_norm": 2.96875, + "learning_rate": 4.792592644271015e-05, + "loss": 0.7883, + "step": 7514 + }, + { + "epoch": 0.1332287498282792, + "grad_norm": 2.75, + "learning_rate": 4.792481251558073e-05, + "loss": 0.8371, + "step": 7516 + }, + { + "epoch": 0.13326420186389076, + "grad_norm": 2.671875, + "learning_rate": 4.792369830235441e-05, + "loss": 0.8336, + "step": 7518 + }, + { + "epoch": 0.13329965389950235, + "grad_norm": 3.03125, + "learning_rate": 4.792258380304512e-05, + "loss": 0.8234, + "step": 7520 + }, + { + "epoch": 0.1333351059351139, + "grad_norm": 2.5625, + "learning_rate": 4.7921469017666756e-05, + "loss": 0.7981, + "step": 7522 + }, + { + "epoch": 0.13337055797072547, + "grad_norm": 2.953125, + "learning_rate": 4.792035394623323e-05, + "loss": 0.8175, + "step": 7524 + }, + { + "epoch": 0.13340601000633706, + "grad_norm": 2.765625, + "learning_rate": 4.791923858875847e-05, + "loss": 0.8514, + "step": 7526 + }, + { + "epoch": 0.13344146204194862, + "grad_norm": 2.8125, + "learning_rate": 4.791812294525638e-05, + "loss": 0.8058, + "step": 7528 + }, + { + "epoch": 0.13347691407756018, + "grad_norm": 2.515625, + "learning_rate": 4.791700701574089e-05, + "loss": 0.843, + "step": 7530 + }, + { + "epoch": 0.13351236611317177, + "grad_norm": 2.6875, + "learning_rate": 4.7915890800225926e-05, + "loss": 0.8108, + "step": 7532 + }, + { + "epoch": 0.13354781814878333, + "grad_norm": 2.90625, + "learning_rate": 4.791477429872542e-05, + "loss": 0.8333, + "step": 7534 + }, + { + "epoch": 0.1335832701843949, + "grad_norm": 2.65625, + "learning_rate": 4.7913657511253296e-05, + "loss": 0.835, + "step": 7536 + }, + { + "epoch": 0.13361872222000648, + "grad_norm": 2.796875, + "learning_rate": 4.7912540437823506e-05, + "loss": 0.8367, + "step": 7538 + }, + { + "epoch": 0.13365417425561804, + "grad_norm": 2.75, + "learning_rate": 4.791142307844998e-05, + "loss": 0.8028, + "step": 7540 + }, + { + "epoch": 0.1336896262912296, + "grad_norm": 2.5625, + "learning_rate": 4.7910305433146664e-05, + "loss": 0.7917, + "step": 7542 + }, + { + "epoch": 0.1337250783268412, + "grad_norm": 2.484375, + "learning_rate": 4.790918750192751e-05, + "loss": 0.7507, + "step": 7544 + }, + { + "epoch": 0.13376053036245275, + "grad_norm": 2.8125, + "learning_rate": 4.790806928480647e-05, + "loss": 0.8264, + "step": 7546 + }, + { + "epoch": 0.1337959823980643, + "grad_norm": 2.625, + "learning_rate": 4.79069507817975e-05, + "loss": 0.787, + "step": 7548 + }, + { + "epoch": 0.1338314344336759, + "grad_norm": 2.765625, + "learning_rate": 4.790583199291455e-05, + "loss": 0.8439, + "step": 7550 + }, + { + "epoch": 0.13386688646928746, + "grad_norm": 2.8125, + "learning_rate": 4.790471291817159e-05, + "loss": 0.8312, + "step": 7552 + }, + { + "epoch": 0.13390233850489902, + "grad_norm": 2.578125, + "learning_rate": 4.790359355758258e-05, + "loss": 0.8275, + "step": 7554 + }, + { + "epoch": 0.1339377905405106, + "grad_norm": 2.65625, + "learning_rate": 4.79024739111615e-05, + "loss": 0.807, + "step": 7556 + }, + { + "epoch": 0.13397324257612217, + "grad_norm": 2.71875, + "learning_rate": 4.7901353978922306e-05, + "loss": 0.8654, + "step": 7558 + }, + { + "epoch": 0.13400869461173373, + "grad_norm": 2.46875, + "learning_rate": 4.7900233760878986e-05, + "loss": 0.8133, + "step": 7560 + }, + { + "epoch": 0.13404414664734532, + "grad_norm": 2.640625, + "learning_rate": 4.789911325704552e-05, + "loss": 0.7941, + "step": 7562 + }, + { + "epoch": 0.13407959868295688, + "grad_norm": 2.71875, + "learning_rate": 4.789799246743589e-05, + "loss": 0.8172, + "step": 7564 + }, + { + "epoch": 0.13411505071856844, + "grad_norm": 2.453125, + "learning_rate": 4.789687139206409e-05, + "loss": 0.8277, + "step": 7566 + }, + { + "epoch": 0.13415050275418003, + "grad_norm": 2.484375, + "learning_rate": 4.78957500309441e-05, + "loss": 0.8077, + "step": 7568 + }, + { + "epoch": 0.1341859547897916, + "grad_norm": 3.09375, + "learning_rate": 4.789462838408991e-05, + "loss": 0.8342, + "step": 7570 + }, + { + "epoch": 0.13422140682540315, + "grad_norm": 2.421875, + "learning_rate": 4.789350645151554e-05, + "loss": 0.8314, + "step": 7572 + }, + { + "epoch": 0.13425685886101474, + "grad_norm": 2.703125, + "learning_rate": 4.789238423323497e-05, + "loss": 0.829, + "step": 7574 + }, + { + "epoch": 0.1342923108966263, + "grad_norm": 2.671875, + "learning_rate": 4.789126172926222e-05, + "loss": 0.8237, + "step": 7576 + }, + { + "epoch": 0.13432776293223786, + "grad_norm": 2.796875, + "learning_rate": 4.7890138939611285e-05, + "loss": 0.8177, + "step": 7578 + }, + { + "epoch": 0.13436321496784945, + "grad_norm": 2.984375, + "learning_rate": 4.788901586429618e-05, + "loss": 0.8511, + "step": 7580 + }, + { + "epoch": 0.134398667003461, + "grad_norm": 2.640625, + "learning_rate": 4.7887892503330936e-05, + "loss": 0.8095, + "step": 7582 + }, + { + "epoch": 0.13443411903907257, + "grad_norm": 2.5625, + "learning_rate": 4.7886768856729546e-05, + "loss": 0.8151, + "step": 7584 + }, + { + "epoch": 0.13446957107468416, + "grad_norm": 2.578125, + "learning_rate": 4.788564492450606e-05, + "loss": 0.8282, + "step": 7586 + }, + { + "epoch": 0.13450502311029572, + "grad_norm": 2.828125, + "learning_rate": 4.7884520706674485e-05, + "loss": 0.7904, + "step": 7588 + }, + { + "epoch": 0.13454047514590728, + "grad_norm": 2.828125, + "learning_rate": 4.788339620324887e-05, + "loss": 0.8623, + "step": 7590 + }, + { + "epoch": 0.13457592718151887, + "grad_norm": 2.828125, + "learning_rate": 4.788227141424322e-05, + "loss": 0.8727, + "step": 7592 + }, + { + "epoch": 0.13461137921713043, + "grad_norm": 2.53125, + "learning_rate": 4.78811463396716e-05, + "loss": 0.7833, + "step": 7594 + }, + { + "epoch": 0.134646831252742, + "grad_norm": 2.75, + "learning_rate": 4.788002097954804e-05, + "loss": 0.8188, + "step": 7596 + }, + { + "epoch": 0.13468228328835358, + "grad_norm": 2.921875, + "learning_rate": 4.787889533388658e-05, + "loss": 0.8437, + "step": 7598 + }, + { + "epoch": 0.13471773532396514, + "grad_norm": 2.75, + "learning_rate": 4.787776940270127e-05, + "loss": 0.788, + "step": 7600 + }, + { + "epoch": 0.1347531873595767, + "grad_norm": 3.03125, + "learning_rate": 4.787664318600615e-05, + "loss": 0.8419, + "step": 7602 + }, + { + "epoch": 0.13478863939518826, + "grad_norm": 2.4375, + "learning_rate": 4.787551668381531e-05, + "loss": 0.7876, + "step": 7604 + }, + { + "epoch": 0.13482409143079985, + "grad_norm": 3.09375, + "learning_rate": 4.787438989614278e-05, + "loss": 0.8196, + "step": 7606 + }, + { + "epoch": 0.1348595434664114, + "grad_norm": 2.875, + "learning_rate": 4.7873262823002627e-05, + "loss": 0.8218, + "step": 7608 + }, + { + "epoch": 0.13489499550202297, + "grad_norm": 2.671875, + "learning_rate": 4.787213546440892e-05, + "loss": 0.8086, + "step": 7610 + }, + { + "epoch": 0.13493044753763456, + "grad_norm": 3.03125, + "learning_rate": 4.7871007820375725e-05, + "loss": 0.7893, + "step": 7612 + }, + { + "epoch": 0.13496589957324612, + "grad_norm": 2.875, + "learning_rate": 4.786987989091711e-05, + "loss": 0.8129, + "step": 7614 + }, + { + "epoch": 0.13500135160885768, + "grad_norm": 2.90625, + "learning_rate": 4.786875167604716e-05, + "loss": 0.8359, + "step": 7616 + }, + { + "epoch": 0.13503680364446927, + "grad_norm": 2.578125, + "learning_rate": 4.7867623175779955e-05, + "loss": 0.8402, + "step": 7618 + }, + { + "epoch": 0.13507225568008083, + "grad_norm": 2.890625, + "learning_rate": 4.786649439012958e-05, + "loss": 0.8359, + "step": 7620 + }, + { + "epoch": 0.1351077077156924, + "grad_norm": 2.828125, + "learning_rate": 4.786536531911011e-05, + "loss": 0.8427, + "step": 7622 + }, + { + "epoch": 0.13514315975130398, + "grad_norm": 2.640625, + "learning_rate": 4.7864235962735646e-05, + "loss": 0.8282, + "step": 7624 + }, + { + "epoch": 0.13517861178691554, + "grad_norm": 2.859375, + "learning_rate": 4.7863106321020285e-05, + "loss": 0.7969, + "step": 7626 + }, + { + "epoch": 0.1352140638225271, + "grad_norm": 2.9375, + "learning_rate": 4.7861976393978115e-05, + "loss": 0.7827, + "step": 7628 + }, + { + "epoch": 0.13524951585813869, + "grad_norm": 2.828125, + "learning_rate": 4.7860846181623244e-05, + "loss": 0.8338, + "step": 7630 + }, + { + "epoch": 0.13528496789375025, + "grad_norm": 2.71875, + "learning_rate": 4.785971568396977e-05, + "loss": 0.8179, + "step": 7632 + }, + { + "epoch": 0.1353204199293618, + "grad_norm": 2.8125, + "learning_rate": 4.78585849010318e-05, + "loss": 0.8643, + "step": 7634 + }, + { + "epoch": 0.1353558719649734, + "grad_norm": 2.6875, + "learning_rate": 4.785745383282346e-05, + "loss": 0.8289, + "step": 7636 + }, + { + "epoch": 0.13539132400058496, + "grad_norm": 2.6875, + "learning_rate": 4.785632247935886e-05, + "loss": 0.8129, + "step": 7638 + }, + { + "epoch": 0.13542677603619652, + "grad_norm": 2.671875, + "learning_rate": 4.785519084065211e-05, + "loss": 0.8034, + "step": 7640 + }, + { + "epoch": 0.1354622280718081, + "grad_norm": 3.15625, + "learning_rate": 4.7854058916717336e-05, + "loss": 0.8267, + "step": 7642 + }, + { + "epoch": 0.13549768010741967, + "grad_norm": 2.609375, + "learning_rate": 4.7852926707568676e-05, + "loss": 0.7633, + "step": 7644 + }, + { + "epoch": 0.13553313214303123, + "grad_norm": 2.8125, + "learning_rate": 4.785179421322025e-05, + "loss": 0.8112, + "step": 7646 + }, + { + "epoch": 0.13556858417864281, + "grad_norm": 2.84375, + "learning_rate": 4.785066143368618e-05, + "loss": 0.8385, + "step": 7648 + }, + { + "epoch": 0.13560403621425438, + "grad_norm": 3.046875, + "learning_rate": 4.784952836898062e-05, + "loss": 0.8185, + "step": 7650 + }, + { + "epoch": 0.13563948824986594, + "grad_norm": 2.78125, + "learning_rate": 4.784839501911771e-05, + "loss": 0.8011, + "step": 7652 + }, + { + "epoch": 0.13567494028547752, + "grad_norm": 2.671875, + "learning_rate": 4.7847261384111585e-05, + "loss": 0.8231, + "step": 7654 + }, + { + "epoch": 0.13571039232108909, + "grad_norm": 2.84375, + "learning_rate": 4.7846127463976395e-05, + "loss": 0.7639, + "step": 7656 + }, + { + "epoch": 0.13574584435670065, + "grad_norm": 2.921875, + "learning_rate": 4.78449932587263e-05, + "loss": 0.8732, + "step": 7658 + }, + { + "epoch": 0.13578129639231223, + "grad_norm": 2.671875, + "learning_rate": 4.784385876837545e-05, + "loss": 0.7955, + "step": 7660 + }, + { + "epoch": 0.1358167484279238, + "grad_norm": 2.53125, + "learning_rate": 4.784272399293799e-05, + "loss": 0.848, + "step": 7662 + }, + { + "epoch": 0.13585220046353536, + "grad_norm": 3.109375, + "learning_rate": 4.784158893242809e-05, + "loss": 0.8288, + "step": 7664 + }, + { + "epoch": 0.13588765249914694, + "grad_norm": 2.640625, + "learning_rate": 4.784045358685993e-05, + "loss": 0.763, + "step": 7666 + }, + { + "epoch": 0.1359231045347585, + "grad_norm": 2.78125, + "learning_rate": 4.783931795624766e-05, + "loss": 0.7886, + "step": 7668 + }, + { + "epoch": 0.13595855657037006, + "grad_norm": 2.578125, + "learning_rate": 4.783818204060546e-05, + "loss": 0.8637, + "step": 7670 + }, + { + "epoch": 0.13599400860598165, + "grad_norm": 2.96875, + "learning_rate": 4.78370458399475e-05, + "loss": 0.814, + "step": 7672 + }, + { + "epoch": 0.13602946064159321, + "grad_norm": 2.390625, + "learning_rate": 4.7835909354287975e-05, + "loss": 0.8021, + "step": 7674 + }, + { + "epoch": 0.13606491267720477, + "grad_norm": 2.8125, + "learning_rate": 4.7834772583641054e-05, + "loss": 0.8465, + "step": 7676 + }, + { + "epoch": 0.13610036471281636, + "grad_norm": 2.703125, + "learning_rate": 4.783363552802092e-05, + "loss": 0.8472, + "step": 7678 + }, + { + "epoch": 0.13613581674842792, + "grad_norm": 3.09375, + "learning_rate": 4.783249818744178e-05, + "loss": 0.8313, + "step": 7680 + }, + { + "epoch": 0.13617126878403948, + "grad_norm": 2.796875, + "learning_rate": 4.783136056191781e-05, + "loss": 0.8137, + "step": 7682 + }, + { + "epoch": 0.13620672081965107, + "grad_norm": 2.703125, + "learning_rate": 4.783022265146322e-05, + "loss": 0.8334, + "step": 7684 + }, + { + "epoch": 0.13624217285526263, + "grad_norm": 2.96875, + "learning_rate": 4.782908445609221e-05, + "loss": 0.8134, + "step": 7686 + }, + { + "epoch": 0.1362776248908742, + "grad_norm": 2.828125, + "learning_rate": 4.782794597581898e-05, + "loss": 0.835, + "step": 7688 + }, + { + "epoch": 0.13631307692648578, + "grad_norm": 2.859375, + "learning_rate": 4.782680721065773e-05, + "loss": 0.8511, + "step": 7690 + }, + { + "epoch": 0.13634852896209734, + "grad_norm": 2.84375, + "learning_rate": 4.7825668160622686e-05, + "loss": 0.8473, + "step": 7692 + }, + { + "epoch": 0.1363839809977089, + "grad_norm": 2.546875, + "learning_rate": 4.7824528825728055e-05, + "loss": 0.8053, + "step": 7694 + }, + { + "epoch": 0.1364194330333205, + "grad_norm": 2.765625, + "learning_rate": 4.782338920598807e-05, + "loss": 0.8145, + "step": 7696 + }, + { + "epoch": 0.13645488506893205, + "grad_norm": 2.8125, + "learning_rate": 4.782224930141693e-05, + "loss": 0.8105, + "step": 7698 + }, + { + "epoch": 0.1364903371045436, + "grad_norm": 2.71875, + "learning_rate": 4.7821109112028876e-05, + "loss": 0.7639, + "step": 7700 + }, + { + "epoch": 0.1365257891401552, + "grad_norm": 3.125, + "learning_rate": 4.781996863783813e-05, + "loss": 0.8154, + "step": 7702 + }, + { + "epoch": 0.13656124117576676, + "grad_norm": 2.875, + "learning_rate": 4.781882787885893e-05, + "loss": 0.7771, + "step": 7704 + }, + { + "epoch": 0.13659669321137832, + "grad_norm": 2.59375, + "learning_rate": 4.7817686835105513e-05, + "loss": 0.8352, + "step": 7706 + }, + { + "epoch": 0.1366321452469899, + "grad_norm": 2.84375, + "learning_rate": 4.781654550659211e-05, + "loss": 0.8448, + "step": 7708 + }, + { + "epoch": 0.13666759728260147, + "grad_norm": 2.515625, + "learning_rate": 4.781540389333298e-05, + "loss": 0.8205, + "step": 7710 + }, + { + "epoch": 0.13670304931821303, + "grad_norm": 2.765625, + "learning_rate": 4.781426199534236e-05, + "loss": 0.8376, + "step": 7712 + }, + { + "epoch": 0.13673850135382462, + "grad_norm": 2.734375, + "learning_rate": 4.78131198126345e-05, + "loss": 0.8491, + "step": 7714 + }, + { + "epoch": 0.13677395338943618, + "grad_norm": 2.796875, + "learning_rate": 4.781197734522366e-05, + "loss": 0.8213, + "step": 7716 + }, + { + "epoch": 0.13680940542504774, + "grad_norm": 2.71875, + "learning_rate": 4.781083459312409e-05, + "loss": 0.8204, + "step": 7718 + }, + { + "epoch": 0.13684485746065933, + "grad_norm": 2.546875, + "learning_rate": 4.780969155635006e-05, + "loss": 0.7855, + "step": 7720 + }, + { + "epoch": 0.1368803094962709, + "grad_norm": 2.75, + "learning_rate": 4.7808548234915826e-05, + "loss": 0.7865, + "step": 7722 + }, + { + "epoch": 0.13691576153188245, + "grad_norm": 2.90625, + "learning_rate": 4.7807404628835664e-05, + "loss": 0.8088, + "step": 7724 + }, + { + "epoch": 0.13695121356749404, + "grad_norm": 2.75, + "learning_rate": 4.780626073812383e-05, + "loss": 0.8477, + "step": 7726 + }, + { + "epoch": 0.1369866656031056, + "grad_norm": 2.6875, + "learning_rate": 4.780511656279463e-05, + "loss": 0.8656, + "step": 7728 + }, + { + "epoch": 0.13702211763871716, + "grad_norm": 2.609375, + "learning_rate": 4.7803972102862314e-05, + "loss": 0.8328, + "step": 7730 + }, + { + "epoch": 0.13705756967432875, + "grad_norm": 2.796875, + "learning_rate": 4.780282735834119e-05, + "loss": 0.7767, + "step": 7732 + }, + { + "epoch": 0.1370930217099403, + "grad_norm": 2.625, + "learning_rate": 4.780168232924551e-05, + "loss": 0.8444, + "step": 7734 + }, + { + "epoch": 0.13712847374555187, + "grad_norm": 2.578125, + "learning_rate": 4.78005370155896e-05, + "loss": 0.8089, + "step": 7736 + }, + { + "epoch": 0.13716392578116346, + "grad_norm": 2.90625, + "learning_rate": 4.7799391417387727e-05, + "loss": 0.8253, + "step": 7738 + }, + { + "epoch": 0.13719937781677502, + "grad_norm": 2.703125, + "learning_rate": 4.77982455346542e-05, + "loss": 0.8302, + "step": 7740 + }, + { + "epoch": 0.13723482985238658, + "grad_norm": 2.75, + "learning_rate": 4.779709936740332e-05, + "loss": 0.8361, + "step": 7742 + }, + { + "epoch": 0.13727028188799817, + "grad_norm": 2.671875, + "learning_rate": 4.779595291564939e-05, + "loss": 0.8076, + "step": 7744 + }, + { + "epoch": 0.13730573392360973, + "grad_norm": 2.8125, + "learning_rate": 4.77948061794067e-05, + "loss": 0.8185, + "step": 7746 + }, + { + "epoch": 0.1373411859592213, + "grad_norm": 2.640625, + "learning_rate": 4.7793659158689594e-05, + "loss": 0.7821, + "step": 7748 + }, + { + "epoch": 0.13737663799483288, + "grad_norm": 2.84375, + "learning_rate": 4.779251185351237e-05, + "loss": 0.8564, + "step": 7750 + }, + { + "epoch": 0.13741209003044444, + "grad_norm": 2.484375, + "learning_rate": 4.779136426388934e-05, + "loss": 0.8062, + "step": 7752 + }, + { + "epoch": 0.137447542066056, + "grad_norm": 2.671875, + "learning_rate": 4.779021638983483e-05, + "loss": 0.8118, + "step": 7754 + }, + { + "epoch": 0.1374829941016676, + "grad_norm": 2.71875, + "learning_rate": 4.7789068231363165e-05, + "loss": 0.8128, + "step": 7756 + }, + { + "epoch": 0.13751844613727915, + "grad_norm": 2.8125, + "learning_rate": 4.7787919788488675e-05, + "loss": 0.8501, + "step": 7758 + }, + { + "epoch": 0.1375538981728907, + "grad_norm": 2.984375, + "learning_rate": 4.77867710612257e-05, + "loss": 0.7931, + "step": 7760 + }, + { + "epoch": 0.1375893502085023, + "grad_norm": 3.03125, + "learning_rate": 4.778562204958856e-05, + "loss": 0.882, + "step": 7762 + }, + { + "epoch": 0.13762480224411386, + "grad_norm": 2.953125, + "learning_rate": 4.7784472753591606e-05, + "loss": 0.8161, + "step": 7764 + }, + { + "epoch": 0.13766025427972542, + "grad_norm": 3.1875, + "learning_rate": 4.778332317324918e-05, + "loss": 0.8651, + "step": 7766 + }, + { + "epoch": 0.137695706315337, + "grad_norm": 2.671875, + "learning_rate": 4.7782173308575625e-05, + "loss": 0.823, + "step": 7768 + }, + { + "epoch": 0.13773115835094857, + "grad_norm": 3.109375, + "learning_rate": 4.7781023159585295e-05, + "loss": 0.8671, + "step": 7770 + }, + { + "epoch": 0.13776661038656013, + "grad_norm": 2.71875, + "learning_rate": 4.777987272629253e-05, + "loss": 0.8153, + "step": 7772 + }, + { + "epoch": 0.1378020624221717, + "grad_norm": 2.765625, + "learning_rate": 4.7778722008711704e-05, + "loss": 0.9034, + "step": 7774 + }, + { + "epoch": 0.13783751445778328, + "grad_norm": 2.71875, + "learning_rate": 4.7777571006857174e-05, + "loss": 0.8068, + "step": 7776 + }, + { + "epoch": 0.13787296649339484, + "grad_norm": 2.6875, + "learning_rate": 4.777641972074331e-05, + "loss": 0.7982, + "step": 7778 + }, + { + "epoch": 0.1379084185290064, + "grad_norm": 2.8125, + "learning_rate": 4.7775268150384454e-05, + "loss": 0.8481, + "step": 7780 + }, + { + "epoch": 0.137943870564618, + "grad_norm": 2.703125, + "learning_rate": 4.7774116295795e-05, + "loss": 0.8235, + "step": 7782 + }, + { + "epoch": 0.13797932260022955, + "grad_norm": 2.765625, + "learning_rate": 4.777296415698933e-05, + "loss": 0.7464, + "step": 7784 + }, + { + "epoch": 0.1380147746358411, + "grad_norm": 2.765625, + "learning_rate": 4.7771811733981797e-05, + "loss": 0.8036, + "step": 7786 + }, + { + "epoch": 0.1380502266714527, + "grad_norm": 2.578125, + "learning_rate": 4.777065902678681e-05, + "loss": 0.7683, + "step": 7788 + }, + { + "epoch": 0.13808567870706426, + "grad_norm": 3.03125, + "learning_rate": 4.776950603541873e-05, + "loss": 0.8879, + "step": 7790 + }, + { + "epoch": 0.13812113074267582, + "grad_norm": 2.875, + "learning_rate": 4.776835275989196e-05, + "loss": 0.7814, + "step": 7792 + }, + { + "epoch": 0.1381565827782874, + "grad_norm": 2.703125, + "learning_rate": 4.776719920022089e-05, + "loss": 0.8611, + "step": 7794 + }, + { + "epoch": 0.13819203481389897, + "grad_norm": 2.625, + "learning_rate": 4.776604535641992e-05, + "loss": 0.8142, + "step": 7796 + }, + { + "epoch": 0.13822748684951053, + "grad_norm": 3.015625, + "learning_rate": 4.776489122850344e-05, + "loss": 0.8255, + "step": 7798 + }, + { + "epoch": 0.13826293888512212, + "grad_norm": 2.65625, + "learning_rate": 4.776373681648586e-05, + "loss": 0.7773, + "step": 7800 + }, + { + "epoch": 0.13829839092073368, + "grad_norm": 2.328125, + "learning_rate": 4.776258212038159e-05, + "loss": 0.7941, + "step": 7802 + }, + { + "epoch": 0.13833384295634524, + "grad_norm": 2.921875, + "learning_rate": 4.7761427140205034e-05, + "loss": 0.7812, + "step": 7804 + }, + { + "epoch": 0.13836929499195683, + "grad_norm": 2.65625, + "learning_rate": 4.7760271875970606e-05, + "loss": 0.7889, + "step": 7806 + }, + { + "epoch": 0.1384047470275684, + "grad_norm": 2.78125, + "learning_rate": 4.7759116327692726e-05, + "loss": 0.8312, + "step": 7808 + }, + { + "epoch": 0.13844019906317995, + "grad_norm": 2.609375, + "learning_rate": 4.775796049538582e-05, + "loss": 0.846, + "step": 7810 + }, + { + "epoch": 0.13847565109879154, + "grad_norm": 2.765625, + "learning_rate": 4.77568043790643e-05, + "loss": 0.7984, + "step": 7812 + }, + { + "epoch": 0.1385111031344031, + "grad_norm": 2.890625, + "learning_rate": 4.77556479787426e-05, + "loss": 0.7918, + "step": 7814 + }, + { + "epoch": 0.13854655517001466, + "grad_norm": 2.75, + "learning_rate": 4.7754491294435165e-05, + "loss": 0.8044, + "step": 7816 + }, + { + "epoch": 0.13858200720562625, + "grad_norm": 2.640625, + "learning_rate": 4.775333432615641e-05, + "loss": 0.791, + "step": 7818 + }, + { + "epoch": 0.1386174592412378, + "grad_norm": 2.5625, + "learning_rate": 4.775217707392078e-05, + "loss": 0.7833, + "step": 7820 + }, + { + "epoch": 0.13865291127684937, + "grad_norm": 2.703125, + "learning_rate": 4.7751019537742725e-05, + "loss": 0.8009, + "step": 7822 + }, + { + "epoch": 0.13868836331246095, + "grad_norm": 2.84375, + "learning_rate": 4.774986171763668e-05, + "loss": 0.8026, + "step": 7824 + }, + { + "epoch": 0.13872381534807252, + "grad_norm": 2.921875, + "learning_rate": 4.7748703613617095e-05, + "loss": 0.7954, + "step": 7826 + }, + { + "epoch": 0.13875926738368408, + "grad_norm": 2.59375, + "learning_rate": 4.7747545225698434e-05, + "loss": 0.8351, + "step": 7828 + }, + { + "epoch": 0.13879471941929566, + "grad_norm": 2.53125, + "learning_rate": 4.774638655389514e-05, + "loss": 0.8444, + "step": 7830 + }, + { + "epoch": 0.13883017145490723, + "grad_norm": 2.75, + "learning_rate": 4.7745227598221687e-05, + "loss": 0.8925, + "step": 7832 + }, + { + "epoch": 0.13886562349051879, + "grad_norm": 2.625, + "learning_rate": 4.774406835869253e-05, + "loss": 0.7889, + "step": 7834 + }, + { + "epoch": 0.13890107552613037, + "grad_norm": 2.828125, + "learning_rate": 4.7742908835322136e-05, + "loss": 0.8259, + "step": 7836 + }, + { + "epoch": 0.13893652756174193, + "grad_norm": 2.6875, + "learning_rate": 4.774174902812498e-05, + "loss": 0.8031, + "step": 7838 + }, + { + "epoch": 0.1389719795973535, + "grad_norm": 2.5, + "learning_rate": 4.774058893711553e-05, + "loss": 0.7957, + "step": 7840 + }, + { + "epoch": 0.13900743163296508, + "grad_norm": 3.140625, + "learning_rate": 4.7739428562308266e-05, + "loss": 0.7898, + "step": 7842 + }, + { + "epoch": 0.13904288366857664, + "grad_norm": 2.53125, + "learning_rate": 4.773826790371767e-05, + "loss": 0.7799, + "step": 7844 + }, + { + "epoch": 0.1390783357041882, + "grad_norm": 2.5625, + "learning_rate": 4.773710696135822e-05, + "loss": 0.8168, + "step": 7846 + }, + { + "epoch": 0.1391137877397998, + "grad_norm": 3.359375, + "learning_rate": 4.773594573524442e-05, + "loss": 0.8682, + "step": 7848 + }, + { + "epoch": 0.13914923977541135, + "grad_norm": 2.921875, + "learning_rate": 4.773478422539075e-05, + "loss": 0.8098, + "step": 7850 + }, + { + "epoch": 0.13918469181102291, + "grad_norm": 2.671875, + "learning_rate": 4.773362243181171e-05, + "loss": 0.7966, + "step": 7852 + }, + { + "epoch": 0.1392201438466345, + "grad_norm": 2.609375, + "learning_rate": 4.773246035452179e-05, + "loss": 0.828, + "step": 7854 + }, + { + "epoch": 0.13925559588224606, + "grad_norm": 2.703125, + "learning_rate": 4.773129799353551e-05, + "loss": 0.8082, + "step": 7856 + }, + { + "epoch": 0.13929104791785762, + "grad_norm": 2.65625, + "learning_rate": 4.773013534886735e-05, + "loss": 0.8269, + "step": 7858 + }, + { + "epoch": 0.1393264999534692, + "grad_norm": 2.9375, + "learning_rate": 4.772897242053186e-05, + "loss": 0.8295, + "step": 7860 + }, + { + "epoch": 0.13936195198908077, + "grad_norm": 3.078125, + "learning_rate": 4.772780920854351e-05, + "loss": 0.8027, + "step": 7862 + }, + { + "epoch": 0.13939740402469233, + "grad_norm": 2.84375, + "learning_rate": 4.772664571291684e-05, + "loss": 0.8514, + "step": 7864 + }, + { + "epoch": 0.13943285606030392, + "grad_norm": 2.578125, + "learning_rate": 4.772548193366636e-05, + "loss": 0.7971, + "step": 7866 + }, + { + "epoch": 0.13946830809591548, + "grad_norm": 2.734375, + "learning_rate": 4.772431787080661e-05, + "loss": 0.8007, + "step": 7868 + }, + { + "epoch": 0.13950376013152704, + "grad_norm": 2.484375, + "learning_rate": 4.7723153524352096e-05, + "loss": 0.8558, + "step": 7870 + }, + { + "epoch": 0.13953921216713863, + "grad_norm": 2.796875, + "learning_rate": 4.772198889431736e-05, + "loss": 0.8324, + "step": 7872 + }, + { + "epoch": 0.1395746642027502, + "grad_norm": 3.015625, + "learning_rate": 4.7720823980716934e-05, + "loss": 0.8482, + "step": 7874 + }, + { + "epoch": 0.13961011623836175, + "grad_norm": 3.078125, + "learning_rate": 4.771965878356536e-05, + "loss": 0.8482, + "step": 7876 + }, + { + "epoch": 0.13964556827397334, + "grad_norm": 3.265625, + "learning_rate": 4.771849330287718e-05, + "loss": 0.8271, + "step": 7878 + }, + { + "epoch": 0.1396810203095849, + "grad_norm": 3.0625, + "learning_rate": 4.7717327538666935e-05, + "loss": 0.8187, + "step": 7880 + }, + { + "epoch": 0.13971647234519646, + "grad_norm": 2.46875, + "learning_rate": 4.771616149094917e-05, + "loss": 0.817, + "step": 7882 + }, + { + "epoch": 0.13975192438080805, + "grad_norm": 2.546875, + "learning_rate": 4.771499515973844e-05, + "loss": 0.7927, + "step": 7884 + }, + { + "epoch": 0.1397873764164196, + "grad_norm": 2.8125, + "learning_rate": 4.7713828545049303e-05, + "loss": 0.8102, + "step": 7886 + }, + { + "epoch": 0.13982282845203117, + "grad_norm": 2.671875, + "learning_rate": 4.7712661646896316e-05, + "loss": 0.8759, + "step": 7888 + }, + { + "epoch": 0.13985828048764276, + "grad_norm": 2.84375, + "learning_rate": 4.771149446529405e-05, + "loss": 0.847, + "step": 7890 + }, + { + "epoch": 0.13989373252325432, + "grad_norm": 2.8125, + "learning_rate": 4.771032700025706e-05, + "loss": 0.8383, + "step": 7892 + }, + { + "epoch": 0.13992918455886588, + "grad_norm": 2.390625, + "learning_rate": 4.770915925179991e-05, + "loss": 0.8035, + "step": 7894 + }, + { + "epoch": 0.13996463659447747, + "grad_norm": 2.8125, + "learning_rate": 4.7707991219937194e-05, + "loss": 0.8306, + "step": 7896 + }, + { + "epoch": 0.14000008863008903, + "grad_norm": 2.71875, + "learning_rate": 4.770682290468347e-05, + "loss": 0.8405, + "step": 7898 + }, + { + "epoch": 0.1400355406657006, + "grad_norm": 2.828125, + "learning_rate": 4.7705654306053326e-05, + "loss": 0.8105, + "step": 7900 + }, + { + "epoch": 0.14007099270131218, + "grad_norm": 2.609375, + "learning_rate": 4.770448542406135e-05, + "loss": 0.7714, + "step": 7902 + }, + { + "epoch": 0.14010644473692374, + "grad_norm": 2.65625, + "learning_rate": 4.770331625872212e-05, + "loss": 0.7847, + "step": 7904 + }, + { + "epoch": 0.1401418967725353, + "grad_norm": 2.9375, + "learning_rate": 4.770214681005024e-05, + "loss": 0.8186, + "step": 7906 + }, + { + "epoch": 0.1401773488081469, + "grad_norm": 2.640625, + "learning_rate": 4.7700977078060286e-05, + "loss": 0.799, + "step": 7908 + }, + { + "epoch": 0.14021280084375845, + "grad_norm": 2.65625, + "learning_rate": 4.7699807062766876e-05, + "loss": 0.7855, + "step": 7910 + }, + { + "epoch": 0.14024825287937, + "grad_norm": 2.859375, + "learning_rate": 4.7698636764184597e-05, + "loss": 0.848, + "step": 7912 + }, + { + "epoch": 0.1402837049149816, + "grad_norm": 2.90625, + "learning_rate": 4.769746618232805e-05, + "loss": 0.8345, + "step": 7914 + }, + { + "epoch": 0.14031915695059316, + "grad_norm": 2.765625, + "learning_rate": 4.769629531721187e-05, + "loss": 0.7926, + "step": 7916 + }, + { + "epoch": 0.14035460898620472, + "grad_norm": 2.671875, + "learning_rate": 4.769512416885064e-05, + "loss": 0.8247, + "step": 7918 + }, + { + "epoch": 0.1403900610218163, + "grad_norm": 2.859375, + "learning_rate": 4.7693952737259e-05, + "loss": 0.8146, + "step": 7920 + }, + { + "epoch": 0.14042551305742787, + "grad_norm": 2.578125, + "learning_rate": 4.7692781022451536e-05, + "loss": 0.8202, + "step": 7922 + }, + { + "epoch": 0.14046096509303943, + "grad_norm": 2.71875, + "learning_rate": 4.7691609024442905e-05, + "loss": 0.819, + "step": 7924 + }, + { + "epoch": 0.14049641712865102, + "grad_norm": 2.5625, + "learning_rate": 4.7690436743247727e-05, + "loss": 0.8124, + "step": 7926 + }, + { + "epoch": 0.14053186916426258, + "grad_norm": 2.640625, + "learning_rate": 4.768926417888061e-05, + "loss": 0.7961, + "step": 7928 + }, + { + "epoch": 0.14056732119987414, + "grad_norm": 2.5625, + "learning_rate": 4.76880913313562e-05, + "loss": 0.7982, + "step": 7930 + }, + { + "epoch": 0.14060277323548573, + "grad_norm": 2.71875, + "learning_rate": 4.7686918200689144e-05, + "loss": 0.8538, + "step": 7932 + }, + { + "epoch": 0.1406382252710973, + "grad_norm": 2.8125, + "learning_rate": 4.768574478689408e-05, + "loss": 0.7942, + "step": 7934 + }, + { + "epoch": 0.14067367730670885, + "grad_norm": 2.953125, + "learning_rate": 4.768457108998564e-05, + "loss": 0.802, + "step": 7936 + }, + { + "epoch": 0.14070912934232044, + "grad_norm": 2.46875, + "learning_rate": 4.768339710997847e-05, + "loss": 0.8082, + "step": 7938 + }, + { + "epoch": 0.140744581377932, + "grad_norm": 2.859375, + "learning_rate": 4.768222284688724e-05, + "loss": 0.7945, + "step": 7940 + }, + { + "epoch": 0.14078003341354356, + "grad_norm": 2.890625, + "learning_rate": 4.7681048300726584e-05, + "loss": 0.845, + "step": 7942 + }, + { + "epoch": 0.14081548544915512, + "grad_norm": 2.8125, + "learning_rate": 4.767987347151118e-05, + "loss": 0.8232, + "step": 7944 + }, + { + "epoch": 0.1408509374847667, + "grad_norm": 2.65625, + "learning_rate": 4.767869835925567e-05, + "loss": 0.8091, + "step": 7946 + }, + { + "epoch": 0.14088638952037827, + "grad_norm": 2.4375, + "learning_rate": 4.767752296397473e-05, + "loss": 0.7925, + "step": 7948 + }, + { + "epoch": 0.14092184155598983, + "grad_norm": 2.6875, + "learning_rate": 4.767634728568303e-05, + "loss": 0.8356, + "step": 7950 + }, + { + "epoch": 0.14095729359160142, + "grad_norm": 3.078125, + "learning_rate": 4.7675171324395236e-05, + "loss": 0.8526, + "step": 7952 + }, + { + "epoch": 0.14099274562721298, + "grad_norm": 2.671875, + "learning_rate": 4.767399508012603e-05, + "loss": 0.8233, + "step": 7954 + }, + { + "epoch": 0.14102819766282454, + "grad_norm": 2.734375, + "learning_rate": 4.767281855289009e-05, + "loss": 0.8237, + "step": 7956 + }, + { + "epoch": 0.14106364969843613, + "grad_norm": 2.359375, + "learning_rate": 4.767164174270208e-05, + "loss": 0.8088, + "step": 7958 + }, + { + "epoch": 0.1410991017340477, + "grad_norm": 2.703125, + "learning_rate": 4.767046464957672e-05, + "loss": 0.8382, + "step": 7960 + }, + { + "epoch": 0.14113455376965925, + "grad_norm": 2.859375, + "learning_rate": 4.7669287273528676e-05, + "loss": 0.7871, + "step": 7962 + }, + { + "epoch": 0.14117000580527084, + "grad_norm": 2.671875, + "learning_rate": 4.766810961457265e-05, + "loss": 0.8088, + "step": 7964 + }, + { + "epoch": 0.1412054578408824, + "grad_norm": 2.71875, + "learning_rate": 4.7666931672723346e-05, + "loss": 0.8341, + "step": 7966 + }, + { + "epoch": 0.14124090987649396, + "grad_norm": 2.9375, + "learning_rate": 4.766575344799544e-05, + "loss": 0.816, + "step": 7968 + }, + { + "epoch": 0.14127636191210555, + "grad_norm": 2.640625, + "learning_rate": 4.7664574940403666e-05, + "loss": 0.8126, + "step": 7970 + }, + { + "epoch": 0.1413118139477171, + "grad_norm": 2.515625, + "learning_rate": 4.7663396149962715e-05, + "loss": 0.7982, + "step": 7972 + }, + { + "epoch": 0.14134726598332867, + "grad_norm": 2.734375, + "learning_rate": 4.76622170766873e-05, + "loss": 0.7979, + "step": 7974 + }, + { + "epoch": 0.14138271801894026, + "grad_norm": 2.78125, + "learning_rate": 4.766103772059213e-05, + "loss": 0.8553, + "step": 7976 + }, + { + "epoch": 0.14141817005455182, + "grad_norm": 2.578125, + "learning_rate": 4.7659858081691936e-05, + "loss": 0.7936, + "step": 7978 + }, + { + "epoch": 0.14145362209016338, + "grad_norm": 2.65625, + "learning_rate": 4.7658678160001425e-05, + "loss": 0.8434, + "step": 7980 + }, + { + "epoch": 0.14148907412577497, + "grad_norm": 2.78125, + "learning_rate": 4.7657497955535334e-05, + "loss": 0.8208, + "step": 7982 + }, + { + "epoch": 0.14152452616138653, + "grad_norm": 3.0, + "learning_rate": 4.765631746830839e-05, + "loss": 0.8071, + "step": 7984 + }, + { + "epoch": 0.1415599781969981, + "grad_norm": 2.515625, + "learning_rate": 4.7655136698335326e-05, + "loss": 0.8083, + "step": 7986 + }, + { + "epoch": 0.14159543023260968, + "grad_norm": 2.796875, + "learning_rate": 4.7653955645630866e-05, + "loss": 0.7766, + "step": 7988 + }, + { + "epoch": 0.14163088226822124, + "grad_norm": 3.28125, + "learning_rate": 4.765277431020976e-05, + "loss": 0.8276, + "step": 7990 + }, + { + "epoch": 0.1416663343038328, + "grad_norm": 2.609375, + "learning_rate": 4.7651592692086756e-05, + "loss": 0.8471, + "step": 7992 + }, + { + "epoch": 0.14170178633944439, + "grad_norm": 2.671875, + "learning_rate": 4.7650410791276584e-05, + "loss": 0.8432, + "step": 7994 + }, + { + "epoch": 0.14173723837505595, + "grad_norm": 2.828125, + "learning_rate": 4.764922860779401e-05, + "loss": 0.7798, + "step": 7996 + }, + { + "epoch": 0.1417726904106675, + "grad_norm": 3.03125, + "learning_rate": 4.764804614165377e-05, + "loss": 0.763, + "step": 7998 + }, + { + "epoch": 0.1418081424462791, + "grad_norm": 2.96875, + "learning_rate": 4.7646863392870644e-05, + "loss": 0.7884, + "step": 8000 + }, + { + "epoch": 0.14184359448189066, + "grad_norm": 2.859375, + "learning_rate": 4.764568036145938e-05, + "loss": 0.819, + "step": 8002 + }, + { + "epoch": 0.14187904651750222, + "grad_norm": 2.9375, + "learning_rate": 4.764449704743473e-05, + "loss": 0.8361, + "step": 8004 + }, + { + "epoch": 0.1419144985531138, + "grad_norm": 2.78125, + "learning_rate": 4.764331345081148e-05, + "loss": 0.829, + "step": 8006 + }, + { + "epoch": 0.14194995058872537, + "grad_norm": 2.921875, + "learning_rate": 4.76421295716044e-05, + "loss": 0.8125, + "step": 8008 + }, + { + "epoch": 0.14198540262433693, + "grad_norm": 2.6875, + "learning_rate": 4.7640945409828255e-05, + "loss": 0.848, + "step": 8010 + }, + { + "epoch": 0.14202085465994851, + "grad_norm": 2.828125, + "learning_rate": 4.763976096549782e-05, + "loss": 0.8195, + "step": 8012 + }, + { + "epoch": 0.14205630669556008, + "grad_norm": 2.96875, + "learning_rate": 4.7638576238627886e-05, + "loss": 0.7931, + "step": 8014 + }, + { + "epoch": 0.14209175873117164, + "grad_norm": 2.71875, + "learning_rate": 4.763739122923324e-05, + "loss": 0.8273, + "step": 8016 + }, + { + "epoch": 0.14212721076678322, + "grad_norm": 2.8125, + "learning_rate": 4.763620593732867e-05, + "loss": 0.8068, + "step": 8018 + }, + { + "epoch": 0.14216266280239478, + "grad_norm": 2.6875, + "learning_rate": 4.763502036292896e-05, + "loss": 0.7955, + "step": 8020 + }, + { + "epoch": 0.14219811483800635, + "grad_norm": 2.828125, + "learning_rate": 4.763383450604891e-05, + "loss": 0.8171, + "step": 8022 + }, + { + "epoch": 0.14223356687361793, + "grad_norm": 2.859375, + "learning_rate": 4.763264836670332e-05, + "loss": 0.8004, + "step": 8024 + }, + { + "epoch": 0.1422690189092295, + "grad_norm": 3.140625, + "learning_rate": 4.7631461944906994e-05, + "loss": 0.8078, + "step": 8026 + }, + { + "epoch": 0.14230447094484105, + "grad_norm": 3.171875, + "learning_rate": 4.763027524067473e-05, + "loss": 0.8291, + "step": 8028 + }, + { + "epoch": 0.14233992298045264, + "grad_norm": 2.734375, + "learning_rate": 4.7629088254021354e-05, + "loss": 0.8019, + "step": 8030 + }, + { + "epoch": 0.1423753750160642, + "grad_norm": 3.09375, + "learning_rate": 4.762790098496166e-05, + "loss": 0.7968, + "step": 8032 + }, + { + "epoch": 0.14241082705167576, + "grad_norm": 2.921875, + "learning_rate": 4.7626713433510485e-05, + "loss": 0.7821, + "step": 8034 + }, + { + "epoch": 0.14244627908728735, + "grad_norm": 2.9375, + "learning_rate": 4.762552559968264e-05, + "loss": 0.8094, + "step": 8036 + }, + { + "epoch": 0.1424817311228989, + "grad_norm": 2.640625, + "learning_rate": 4.762433748349294e-05, + "loss": 0.8488, + "step": 8038 + }, + { + "epoch": 0.14251718315851047, + "grad_norm": 3.046875, + "learning_rate": 4.762314908495622e-05, + "loss": 0.8179, + "step": 8040 + }, + { + "epoch": 0.14255263519412206, + "grad_norm": 3.125, + "learning_rate": 4.7621960404087316e-05, + "loss": 0.8421, + "step": 8042 + }, + { + "epoch": 0.14258808722973362, + "grad_norm": 2.90625, + "learning_rate": 4.7620771440901056e-05, + "loss": 0.7974, + "step": 8044 + }, + { + "epoch": 0.14262353926534518, + "grad_norm": 3.03125, + "learning_rate": 4.761958219541228e-05, + "loss": 0.8883, + "step": 8046 + }, + { + "epoch": 0.14265899130095677, + "grad_norm": 2.875, + "learning_rate": 4.761839266763583e-05, + "loss": 0.8937, + "step": 8048 + }, + { + "epoch": 0.14269444333656833, + "grad_norm": 2.765625, + "learning_rate": 4.761720285758655e-05, + "loss": 0.7914, + "step": 8050 + }, + { + "epoch": 0.1427298953721799, + "grad_norm": 2.84375, + "learning_rate": 4.761601276527929e-05, + "loss": 0.7956, + "step": 8052 + }, + { + "epoch": 0.14276534740779148, + "grad_norm": 2.90625, + "learning_rate": 4.76148223907289e-05, + "loss": 0.8424, + "step": 8054 + }, + { + "epoch": 0.14280079944340304, + "grad_norm": 2.59375, + "learning_rate": 4.761363173395024e-05, + "loss": 0.8371, + "step": 8056 + }, + { + "epoch": 0.1428362514790146, + "grad_norm": 2.75, + "learning_rate": 4.761244079495817e-05, + "loss": 0.8222, + "step": 8058 + }, + { + "epoch": 0.1428717035146262, + "grad_norm": 2.875, + "learning_rate": 4.761124957376754e-05, + "loss": 0.7927, + "step": 8060 + }, + { + "epoch": 0.14290715555023775, + "grad_norm": 2.609375, + "learning_rate": 4.761005807039323e-05, + "loss": 0.8782, + "step": 8062 + }, + { + "epoch": 0.1429426075858493, + "grad_norm": 2.765625, + "learning_rate": 4.7608866284850104e-05, + "loss": 0.8218, + "step": 8064 + }, + { + "epoch": 0.1429780596214609, + "grad_norm": 2.78125, + "learning_rate": 4.7607674217153034e-05, + "loss": 0.8434, + "step": 8066 + }, + { + "epoch": 0.14301351165707246, + "grad_norm": 3.421875, + "learning_rate": 4.760648186731689e-05, + "loss": 0.849, + "step": 8068 + }, + { + "epoch": 0.14304896369268402, + "grad_norm": 2.671875, + "learning_rate": 4.7605289235356574e-05, + "loss": 0.7891, + "step": 8070 + }, + { + "epoch": 0.1430844157282956, + "grad_norm": 2.90625, + "learning_rate": 4.760409632128695e-05, + "loss": 0.8308, + "step": 8072 + }, + { + "epoch": 0.14311986776390717, + "grad_norm": 2.484375, + "learning_rate": 4.7602903125122914e-05, + "loss": 0.801, + "step": 8074 + }, + { + "epoch": 0.14315531979951873, + "grad_norm": 2.71875, + "learning_rate": 4.760170964687935e-05, + "loss": 0.8058, + "step": 8076 + }, + { + "epoch": 0.14319077183513032, + "grad_norm": 2.84375, + "learning_rate": 4.760051588657117e-05, + "loss": 0.8482, + "step": 8078 + }, + { + "epoch": 0.14322622387074188, + "grad_norm": 2.71875, + "learning_rate": 4.759932184421325e-05, + "loss": 0.78, + "step": 8080 + }, + { + "epoch": 0.14326167590635344, + "grad_norm": 2.828125, + "learning_rate": 4.75981275198205e-05, + "loss": 0.8269, + "step": 8082 + }, + { + "epoch": 0.14329712794196503, + "grad_norm": 2.734375, + "learning_rate": 4.759693291340783e-05, + "loss": 0.8253, + "step": 8084 + }, + { + "epoch": 0.1433325799775766, + "grad_norm": 2.953125, + "learning_rate": 4.759573802499014e-05, + "loss": 0.7828, + "step": 8086 + }, + { + "epoch": 0.14336803201318815, + "grad_norm": 3.125, + "learning_rate": 4.759454285458235e-05, + "loss": 0.8468, + "step": 8088 + }, + { + "epoch": 0.14340348404879974, + "grad_norm": 2.40625, + "learning_rate": 4.759334740219937e-05, + "loss": 0.8235, + "step": 8090 + }, + { + "epoch": 0.1434389360844113, + "grad_norm": 2.640625, + "learning_rate": 4.7592151667856125e-05, + "loss": 0.7874, + "step": 8092 + }, + { + "epoch": 0.14347438812002286, + "grad_norm": 2.53125, + "learning_rate": 4.759095565156752e-05, + "loss": 0.768, + "step": 8094 + }, + { + "epoch": 0.14350984015563445, + "grad_norm": 2.828125, + "learning_rate": 4.75897593533485e-05, + "loss": 0.8154, + "step": 8096 + }, + { + "epoch": 0.143545292191246, + "grad_norm": 2.734375, + "learning_rate": 4.758856277321398e-05, + "loss": 0.8223, + "step": 8098 + }, + { + "epoch": 0.14358074422685757, + "grad_norm": 2.671875, + "learning_rate": 4.758736591117892e-05, + "loss": 0.8145, + "step": 8100 + }, + { + "epoch": 0.14361619626246916, + "grad_norm": 2.453125, + "learning_rate": 4.7586168767258227e-05, + "loss": 0.7969, + "step": 8102 + }, + { + "epoch": 0.14365164829808072, + "grad_norm": 3.046875, + "learning_rate": 4.758497134146686e-05, + "loss": 0.8054, + "step": 8104 + }, + { + "epoch": 0.14368710033369228, + "grad_norm": 2.796875, + "learning_rate": 4.758377363381974e-05, + "loss": 0.8435, + "step": 8106 + }, + { + "epoch": 0.14372255236930387, + "grad_norm": 2.953125, + "learning_rate": 4.7582575644331836e-05, + "loss": 0.8286, + "step": 8108 + }, + { + "epoch": 0.14375800440491543, + "grad_norm": 2.859375, + "learning_rate": 4.758137737301809e-05, + "loss": 0.8173, + "step": 8110 + }, + { + "epoch": 0.143793456440527, + "grad_norm": 2.640625, + "learning_rate": 4.7580178819893465e-05, + "loss": 0.8239, + "step": 8112 + }, + { + "epoch": 0.14382890847613855, + "grad_norm": 2.734375, + "learning_rate": 4.75789799849729e-05, + "loss": 0.8259, + "step": 8114 + }, + { + "epoch": 0.14386436051175014, + "grad_norm": 2.8125, + "learning_rate": 4.757778086827138e-05, + "loss": 0.8239, + "step": 8116 + }, + { + "epoch": 0.1438998125473617, + "grad_norm": 2.75, + "learning_rate": 4.757658146980385e-05, + "loss": 0.8177, + "step": 8118 + }, + { + "epoch": 0.14393526458297326, + "grad_norm": 2.65625, + "learning_rate": 4.7575381789585296e-05, + "loss": 0.809, + "step": 8120 + }, + { + "epoch": 0.14397071661858485, + "grad_norm": 3.078125, + "learning_rate": 4.7574181827630666e-05, + "loss": 0.8368, + "step": 8122 + }, + { + "epoch": 0.1440061686541964, + "grad_norm": 2.796875, + "learning_rate": 4.757298158395496e-05, + "loss": 0.8409, + "step": 8124 + }, + { + "epoch": 0.14404162068980797, + "grad_norm": 2.734375, + "learning_rate": 4.757178105857313e-05, + "loss": 0.8275, + "step": 8126 + }, + { + "epoch": 0.14407707272541956, + "grad_norm": 2.6875, + "learning_rate": 4.757058025150018e-05, + "loss": 0.8308, + "step": 8128 + }, + { + "epoch": 0.14411252476103112, + "grad_norm": 2.9375, + "learning_rate": 4.7569379162751094e-05, + "loss": 0.804, + "step": 8130 + }, + { + "epoch": 0.14414797679664268, + "grad_norm": 2.6875, + "learning_rate": 4.756817779234086e-05, + "loss": 0.8118, + "step": 8132 + }, + { + "epoch": 0.14418342883225427, + "grad_norm": 2.4375, + "learning_rate": 4.756697614028446e-05, + "loss": 0.8067, + "step": 8134 + }, + { + "epoch": 0.14421888086786583, + "grad_norm": 2.765625, + "learning_rate": 4.75657742065969e-05, + "loss": 0.8501, + "step": 8136 + }, + { + "epoch": 0.1442543329034774, + "grad_norm": 2.9375, + "learning_rate": 4.7564571991293184e-05, + "loss": 0.7915, + "step": 8138 + }, + { + "epoch": 0.14428978493908898, + "grad_norm": 2.78125, + "learning_rate": 4.75633694943883e-05, + "loss": 0.7987, + "step": 8140 + }, + { + "epoch": 0.14432523697470054, + "grad_norm": 2.796875, + "learning_rate": 4.756216671589727e-05, + "loss": 0.8148, + "step": 8142 + }, + { + "epoch": 0.1443606890103121, + "grad_norm": 2.828125, + "learning_rate": 4.75609636558351e-05, + "loss": 0.8114, + "step": 8144 + }, + { + "epoch": 0.1443961410459237, + "grad_norm": 2.703125, + "learning_rate": 4.7559760314216794e-05, + "loss": 0.826, + "step": 8146 + }, + { + "epoch": 0.14443159308153525, + "grad_norm": 2.890625, + "learning_rate": 4.755855669105739e-05, + "loss": 0.8442, + "step": 8148 + }, + { + "epoch": 0.1444670451171468, + "grad_norm": 3.0, + "learning_rate": 4.755735278637189e-05, + "loss": 0.834, + "step": 8150 + }, + { + "epoch": 0.1445024971527584, + "grad_norm": 2.65625, + "learning_rate": 4.755614860017533e-05, + "loss": 0.7876, + "step": 8152 + }, + { + "epoch": 0.14453794918836996, + "grad_norm": 2.796875, + "learning_rate": 4.7554944132482724e-05, + "loss": 0.8251, + "step": 8154 + }, + { + "epoch": 0.14457340122398152, + "grad_norm": 2.65625, + "learning_rate": 4.755373938330912e-05, + "loss": 0.813, + "step": 8156 + }, + { + "epoch": 0.1446088532595931, + "grad_norm": 2.609375, + "learning_rate": 4.755253435266955e-05, + "loss": 0.8088, + "step": 8158 + }, + { + "epoch": 0.14464430529520467, + "grad_norm": 2.6875, + "learning_rate": 4.755132904057904e-05, + "loss": 0.8135, + "step": 8160 + }, + { + "epoch": 0.14467975733081623, + "grad_norm": 2.609375, + "learning_rate": 4.7550123447052646e-05, + "loss": 0.7945, + "step": 8162 + }, + { + "epoch": 0.14471520936642782, + "grad_norm": 2.59375, + "learning_rate": 4.754891757210541e-05, + "loss": 0.8046, + "step": 8164 + }, + { + "epoch": 0.14475066140203938, + "grad_norm": 2.6875, + "learning_rate": 4.754771141575237e-05, + "loss": 0.8994, + "step": 8166 + }, + { + "epoch": 0.14478611343765094, + "grad_norm": 2.625, + "learning_rate": 4.75465049780086e-05, + "loss": 0.8353, + "step": 8168 + }, + { + "epoch": 0.14482156547326253, + "grad_norm": 2.65625, + "learning_rate": 4.754529825888914e-05, + "loss": 0.7793, + "step": 8170 + }, + { + "epoch": 0.1448570175088741, + "grad_norm": 2.671875, + "learning_rate": 4.754409125840905e-05, + "loss": 0.8158, + "step": 8172 + }, + { + "epoch": 0.14489246954448565, + "grad_norm": 2.859375, + "learning_rate": 4.75428839765834e-05, + "loss": 0.8244, + "step": 8174 + }, + { + "epoch": 0.14492792158009724, + "grad_norm": 2.765625, + "learning_rate": 4.754167641342725e-05, + "loss": 0.7988, + "step": 8176 + }, + { + "epoch": 0.1449633736157088, + "grad_norm": 3.15625, + "learning_rate": 4.754046856895568e-05, + "loss": 0.8256, + "step": 8178 + }, + { + "epoch": 0.14499882565132036, + "grad_norm": 2.75, + "learning_rate": 4.753926044318375e-05, + "loss": 0.8186, + "step": 8180 + }, + { + "epoch": 0.14503427768693194, + "grad_norm": 2.703125, + "learning_rate": 4.7538052036126545e-05, + "loss": 0.7826, + "step": 8182 + }, + { + "epoch": 0.1450697297225435, + "grad_norm": 2.78125, + "learning_rate": 4.753684334779914e-05, + "loss": 0.8254, + "step": 8184 + }, + { + "epoch": 0.14510518175815507, + "grad_norm": 2.6875, + "learning_rate": 4.7535634378216636e-05, + "loss": 0.8321, + "step": 8186 + }, + { + "epoch": 0.14514063379376665, + "grad_norm": 3.171875, + "learning_rate": 4.7534425127394106e-05, + "loss": 0.7746, + "step": 8188 + }, + { + "epoch": 0.14517608582937822, + "grad_norm": 2.546875, + "learning_rate": 4.7533215595346636e-05, + "loss": 0.8301, + "step": 8190 + }, + { + "epoch": 0.14521153786498978, + "grad_norm": 2.703125, + "learning_rate": 4.753200578208934e-05, + "loss": 0.813, + "step": 8192 + }, + { + "epoch": 0.14524698990060136, + "grad_norm": 2.875, + "learning_rate": 4.75307956876373e-05, + "loss": 0.808, + "step": 8194 + }, + { + "epoch": 0.14528244193621292, + "grad_norm": 2.765625, + "learning_rate": 4.752958531200562e-05, + "loss": 0.8109, + "step": 8196 + }, + { + "epoch": 0.14531789397182449, + "grad_norm": 2.828125, + "learning_rate": 4.7528374655209407e-05, + "loss": 0.823, + "step": 8198 + }, + { + "epoch": 0.14535334600743607, + "grad_norm": 2.984375, + "learning_rate": 4.752716371726378e-05, + "loss": 0.8127, + "step": 8200 + }, + { + "epoch": 0.14538879804304763, + "grad_norm": 2.796875, + "learning_rate": 4.752595249818383e-05, + "loss": 0.8062, + "step": 8202 + }, + { + "epoch": 0.1454242500786592, + "grad_norm": 2.71875, + "learning_rate": 4.752474099798469e-05, + "loss": 0.7996, + "step": 8204 + }, + { + "epoch": 0.14545970211427078, + "grad_norm": 2.59375, + "learning_rate": 4.752352921668147e-05, + "loss": 0.8083, + "step": 8206 + }, + { + "epoch": 0.14549515414988234, + "grad_norm": 2.765625, + "learning_rate": 4.752231715428931e-05, + "loss": 0.8477, + "step": 8208 + }, + { + "epoch": 0.1455306061854939, + "grad_norm": 2.78125, + "learning_rate": 4.752110481082331e-05, + "loss": 0.841, + "step": 8210 + }, + { + "epoch": 0.1455660582211055, + "grad_norm": 2.859375, + "learning_rate": 4.751989218629861e-05, + "loss": 0.8154, + "step": 8212 + }, + { + "epoch": 0.14560151025671705, + "grad_norm": 2.59375, + "learning_rate": 4.751867928073036e-05, + "loss": 0.7936, + "step": 8214 + }, + { + "epoch": 0.14563696229232861, + "grad_norm": 2.96875, + "learning_rate": 4.751746609413367e-05, + "loss": 0.8145, + "step": 8216 + }, + { + "epoch": 0.1456724143279402, + "grad_norm": 3.171875, + "learning_rate": 4.75162526265237e-05, + "loss": 0.7813, + "step": 8218 + }, + { + "epoch": 0.14570786636355176, + "grad_norm": 3.0, + "learning_rate": 4.7515038877915584e-05, + "loss": 0.8669, + "step": 8220 + }, + { + "epoch": 0.14574331839916332, + "grad_norm": 2.609375, + "learning_rate": 4.7513824848324474e-05, + "loss": 0.8316, + "step": 8222 + }, + { + "epoch": 0.1457787704347749, + "grad_norm": 3.0625, + "learning_rate": 4.751261053776552e-05, + "loss": 0.8761, + "step": 8224 + }, + { + "epoch": 0.14581422247038647, + "grad_norm": 2.78125, + "learning_rate": 4.751139594625388e-05, + "loss": 0.8457, + "step": 8226 + }, + { + "epoch": 0.14584967450599803, + "grad_norm": 2.890625, + "learning_rate": 4.751018107380469e-05, + "loss": 0.8049, + "step": 8228 + }, + { + "epoch": 0.14588512654160962, + "grad_norm": 2.828125, + "learning_rate": 4.750896592043315e-05, + "loss": 0.82, + "step": 8230 + }, + { + "epoch": 0.14592057857722118, + "grad_norm": 2.96875, + "learning_rate": 4.7507750486154387e-05, + "loss": 0.8268, + "step": 8232 + }, + { + "epoch": 0.14595603061283274, + "grad_norm": 3.03125, + "learning_rate": 4.7506534770983595e-05, + "loss": 0.8836, + "step": 8234 + }, + { + "epoch": 0.14599148264844433, + "grad_norm": 3.171875, + "learning_rate": 4.750531877493594e-05, + "loss": 0.8191, + "step": 8236 + }, + { + "epoch": 0.1460269346840559, + "grad_norm": 2.703125, + "learning_rate": 4.7504102498026584e-05, + "loss": 0.8321, + "step": 8238 + }, + { + "epoch": 0.14606238671966745, + "grad_norm": 2.734375, + "learning_rate": 4.7502885940270723e-05, + "loss": 0.8084, + "step": 8240 + }, + { + "epoch": 0.14609783875527904, + "grad_norm": 2.78125, + "learning_rate": 4.7501669101683535e-05, + "loss": 0.7722, + "step": 8242 + }, + { + "epoch": 0.1461332907908906, + "grad_norm": 2.953125, + "learning_rate": 4.750045198228019e-05, + "loss": 0.8216, + "step": 8244 + }, + { + "epoch": 0.14616874282650216, + "grad_norm": 2.71875, + "learning_rate": 4.7499234582075905e-05, + "loss": 0.813, + "step": 8246 + }, + { + "epoch": 0.14620419486211375, + "grad_norm": 2.6875, + "learning_rate": 4.749801690108585e-05, + "loss": 0.8364, + "step": 8248 + }, + { + "epoch": 0.1462396468977253, + "grad_norm": 2.875, + "learning_rate": 4.749679893932524e-05, + "loss": 0.8664, + "step": 8250 + }, + { + "epoch": 0.14627509893333687, + "grad_norm": 2.671875, + "learning_rate": 4.7495580696809254e-05, + "loss": 0.8306, + "step": 8252 + }, + { + "epoch": 0.14631055096894846, + "grad_norm": 2.75, + "learning_rate": 4.7494362173553114e-05, + "loss": 0.817, + "step": 8254 + }, + { + "epoch": 0.14634600300456002, + "grad_norm": 2.765625, + "learning_rate": 4.7493143369572013e-05, + "loss": 0.8451, + "step": 8256 + }, + { + "epoch": 0.14638145504017158, + "grad_norm": 2.65625, + "learning_rate": 4.749192428488117e-05, + "loss": 0.8197, + "step": 8258 + }, + { + "epoch": 0.14641690707578317, + "grad_norm": 2.828125, + "learning_rate": 4.7490704919495796e-05, + "loss": 0.8499, + "step": 8260 + }, + { + "epoch": 0.14645235911139473, + "grad_norm": 2.953125, + "learning_rate": 4.748948527343112e-05, + "loss": 0.8043, + "step": 8262 + }, + { + "epoch": 0.1464878111470063, + "grad_norm": 3.171875, + "learning_rate": 4.748826534670234e-05, + "loss": 0.8115, + "step": 8264 + }, + { + "epoch": 0.14652326318261788, + "grad_norm": 2.90625, + "learning_rate": 4.748704513932469e-05, + "loss": 0.8248, + "step": 8266 + }, + { + "epoch": 0.14655871521822944, + "grad_norm": 2.859375, + "learning_rate": 4.748582465131341e-05, + "loss": 0.8252, + "step": 8268 + }, + { + "epoch": 0.146594167253841, + "grad_norm": 2.765625, + "learning_rate": 4.748460388268372e-05, + "loss": 0.8104, + "step": 8270 + }, + { + "epoch": 0.1466296192894526, + "grad_norm": 2.6875, + "learning_rate": 4.748338283345085e-05, + "loss": 0.8272, + "step": 8272 + }, + { + "epoch": 0.14666507132506415, + "grad_norm": 2.4375, + "learning_rate": 4.7482161503630053e-05, + "loss": 0.7902, + "step": 8274 + }, + { + "epoch": 0.1467005233606757, + "grad_norm": 2.828125, + "learning_rate": 4.7480939893236556e-05, + "loss": 0.8334, + "step": 8276 + }, + { + "epoch": 0.1467359753962873, + "grad_norm": 2.890625, + "learning_rate": 4.7479718002285615e-05, + "loss": 0.7837, + "step": 8278 + }, + { + "epoch": 0.14677142743189886, + "grad_norm": 2.5625, + "learning_rate": 4.747849583079248e-05, + "loss": 0.7474, + "step": 8280 + }, + { + "epoch": 0.14680687946751042, + "grad_norm": 3.109375, + "learning_rate": 4.747727337877239e-05, + "loss": 0.8168, + "step": 8282 + }, + { + "epoch": 0.14684233150312198, + "grad_norm": 2.78125, + "learning_rate": 4.747605064624062e-05, + "loss": 0.8059, + "step": 8284 + }, + { + "epoch": 0.14687778353873357, + "grad_norm": 3.09375, + "learning_rate": 4.747482763321241e-05, + "loss": 0.8225, + "step": 8286 + }, + { + "epoch": 0.14691323557434513, + "grad_norm": 2.734375, + "learning_rate": 4.7473604339703034e-05, + "loss": 0.8391, + "step": 8288 + }, + { + "epoch": 0.1469486876099567, + "grad_norm": 2.84375, + "learning_rate": 4.747238076572777e-05, + "loss": 0.8136, + "step": 8290 + }, + { + "epoch": 0.14698413964556828, + "grad_norm": 2.609375, + "learning_rate": 4.747115691130185e-05, + "loss": 0.8232, + "step": 8292 + }, + { + "epoch": 0.14701959168117984, + "grad_norm": 2.796875, + "learning_rate": 4.746993277644059e-05, + "loss": 0.8548, + "step": 8294 + }, + { + "epoch": 0.1470550437167914, + "grad_norm": 2.75, + "learning_rate": 4.746870836115924e-05, + "loss": 0.8169, + "step": 8296 + }, + { + "epoch": 0.147090495752403, + "grad_norm": 2.71875, + "learning_rate": 4.74674836654731e-05, + "loss": 0.8498, + "step": 8298 + }, + { + "epoch": 0.14712594778801455, + "grad_norm": 2.609375, + "learning_rate": 4.7466258689397434e-05, + "loss": 0.8027, + "step": 8300 + }, + { + "epoch": 0.1471613998236261, + "grad_norm": 2.984375, + "learning_rate": 4.7465033432947546e-05, + "loss": 0.8488, + "step": 8302 + }, + { + "epoch": 0.1471968518592377, + "grad_norm": 2.71875, + "learning_rate": 4.746380789613871e-05, + "loss": 0.842, + "step": 8304 + }, + { + "epoch": 0.14723230389484926, + "grad_norm": 2.6875, + "learning_rate": 4.746258207898624e-05, + "loss": 0.8198, + "step": 8306 + }, + { + "epoch": 0.14726775593046082, + "grad_norm": 2.640625, + "learning_rate": 4.746135598150542e-05, + "loss": 0.8277, + "step": 8308 + }, + { + "epoch": 0.1473032079660724, + "grad_norm": 2.890625, + "learning_rate": 4.746012960371156e-05, + "loss": 0.8142, + "step": 8310 + }, + { + "epoch": 0.14733866000168397, + "grad_norm": 2.671875, + "learning_rate": 4.745890294561995e-05, + "loss": 0.7867, + "step": 8312 + }, + { + "epoch": 0.14737411203729553, + "grad_norm": 2.765625, + "learning_rate": 4.745767600724592e-05, + "loss": 0.8592, + "step": 8314 + }, + { + "epoch": 0.14740956407290712, + "grad_norm": 2.734375, + "learning_rate": 4.745644878860478e-05, + "loss": 0.8329, + "step": 8316 + }, + { + "epoch": 0.14744501610851868, + "grad_norm": 2.640625, + "learning_rate": 4.7455221289711814e-05, + "loss": 0.799, + "step": 8318 + }, + { + "epoch": 0.14748046814413024, + "grad_norm": 2.84375, + "learning_rate": 4.745399351058237e-05, + "loss": 0.8623, + "step": 8320 + }, + { + "epoch": 0.14751592017974183, + "grad_norm": 2.828125, + "learning_rate": 4.7452765451231776e-05, + "loss": 0.8279, + "step": 8322 + }, + { + "epoch": 0.1475513722153534, + "grad_norm": 2.84375, + "learning_rate": 4.745153711167534e-05, + "loss": 0.8329, + "step": 8324 + }, + { + "epoch": 0.14758682425096495, + "grad_norm": 3.03125, + "learning_rate": 4.745030849192839e-05, + "loss": 0.8266, + "step": 8326 + }, + { + "epoch": 0.14762227628657654, + "grad_norm": 2.734375, + "learning_rate": 4.744907959200627e-05, + "loss": 0.779, + "step": 8328 + }, + { + "epoch": 0.1476577283221881, + "grad_norm": 2.75, + "learning_rate": 4.744785041192431e-05, + "loss": 0.8216, + "step": 8330 + }, + { + "epoch": 0.14769318035779966, + "grad_norm": 2.65625, + "learning_rate": 4.7446620951697856e-05, + "loss": 0.8291, + "step": 8332 + }, + { + "epoch": 0.14772863239341125, + "grad_norm": 3.0, + "learning_rate": 4.744539121134225e-05, + "loss": 0.8277, + "step": 8334 + }, + { + "epoch": 0.1477640844290228, + "grad_norm": 2.40625, + "learning_rate": 4.744416119087283e-05, + "loss": 0.8255, + "step": 8336 + }, + { + "epoch": 0.14779953646463437, + "grad_norm": 2.796875, + "learning_rate": 4.744293089030496e-05, + "loss": 0.8148, + "step": 8338 + }, + { + "epoch": 0.14783498850024596, + "grad_norm": 2.6875, + "learning_rate": 4.744170030965398e-05, + "loss": 0.8207, + "step": 8340 + }, + { + "epoch": 0.14787044053585752, + "grad_norm": 2.71875, + "learning_rate": 4.7440469448935264e-05, + "loss": 0.8454, + "step": 8342 + }, + { + "epoch": 0.14790589257146908, + "grad_norm": 2.875, + "learning_rate": 4.743923830816416e-05, + "loss": 0.8418, + "step": 8344 + }, + { + "epoch": 0.14794134460708067, + "grad_norm": 3.0625, + "learning_rate": 4.743800688735603e-05, + "loss": 0.8134, + "step": 8346 + }, + { + "epoch": 0.14797679664269223, + "grad_norm": 2.71875, + "learning_rate": 4.743677518652625e-05, + "loss": 0.8243, + "step": 8348 + }, + { + "epoch": 0.1480122486783038, + "grad_norm": 2.765625, + "learning_rate": 4.743554320569019e-05, + "loss": 0.7993, + "step": 8350 + }, + { + "epoch": 0.14804770071391538, + "grad_norm": 2.765625, + "learning_rate": 4.743431094486323e-05, + "loss": 0.8062, + "step": 8352 + }, + { + "epoch": 0.14808315274952694, + "grad_norm": 3.015625, + "learning_rate": 4.743307840406073e-05, + "loss": 0.8124, + "step": 8354 + }, + { + "epoch": 0.1481186047851385, + "grad_norm": 3.0, + "learning_rate": 4.7431845583298084e-05, + "loss": 0.8467, + "step": 8356 + }, + { + "epoch": 0.14815405682075009, + "grad_norm": 2.671875, + "learning_rate": 4.7430612482590685e-05, + "loss": 0.7828, + "step": 8358 + }, + { + "epoch": 0.14818950885636165, + "grad_norm": 2.828125, + "learning_rate": 4.742937910195391e-05, + "loss": 0.7977, + "step": 8360 + }, + { + "epoch": 0.1482249608919732, + "grad_norm": 2.671875, + "learning_rate": 4.742814544140316e-05, + "loss": 0.8005, + "step": 8362 + }, + { + "epoch": 0.1482604129275848, + "grad_norm": 2.609375, + "learning_rate": 4.742691150095383e-05, + "loss": 0.8071, + "step": 8364 + }, + { + "epoch": 0.14829586496319636, + "grad_norm": 2.875, + "learning_rate": 4.74256772806213e-05, + "loss": 0.821, + "step": 8366 + }, + { + "epoch": 0.14833131699880792, + "grad_norm": 3.0625, + "learning_rate": 4.7424442780421003e-05, + "loss": 0.835, + "step": 8368 + }, + { + "epoch": 0.1483667690344195, + "grad_norm": 3.03125, + "learning_rate": 4.742320800036832e-05, + "loss": 0.7975, + "step": 8370 + }, + { + "epoch": 0.14840222107003107, + "grad_norm": 2.625, + "learning_rate": 4.742197294047869e-05, + "loss": 0.8017, + "step": 8372 + }, + { + "epoch": 0.14843767310564263, + "grad_norm": 2.875, + "learning_rate": 4.742073760076749e-05, + "loss": 0.863, + "step": 8374 + }, + { + "epoch": 0.14847312514125421, + "grad_norm": 2.390625, + "learning_rate": 4.741950198125016e-05, + "loss": 0.7757, + "step": 8376 + }, + { + "epoch": 0.14850857717686577, + "grad_norm": 2.6875, + "learning_rate": 4.7418266081942116e-05, + "loss": 0.8262, + "step": 8378 + }, + { + "epoch": 0.14854402921247734, + "grad_norm": 2.75, + "learning_rate": 4.741702990285878e-05, + "loss": 0.8187, + "step": 8380 + }, + { + "epoch": 0.14857948124808892, + "grad_norm": 2.328125, + "learning_rate": 4.7415793444015574e-05, + "loss": 0.7889, + "step": 8382 + }, + { + "epoch": 0.14861493328370048, + "grad_norm": 2.828125, + "learning_rate": 4.741455670542795e-05, + "loss": 0.7936, + "step": 8384 + }, + { + "epoch": 0.14865038531931205, + "grad_norm": 2.59375, + "learning_rate": 4.741331968711131e-05, + "loss": 0.8174, + "step": 8386 + }, + { + "epoch": 0.14868583735492363, + "grad_norm": 2.75, + "learning_rate": 4.741208238908111e-05, + "loss": 0.8257, + "step": 8388 + }, + { + "epoch": 0.1487212893905352, + "grad_norm": 2.859375, + "learning_rate": 4.7410844811352806e-05, + "loss": 0.8034, + "step": 8390 + }, + { + "epoch": 0.14875674142614675, + "grad_norm": 3.015625, + "learning_rate": 4.740960695394181e-05, + "loss": 0.8, + "step": 8392 + }, + { + "epoch": 0.14879219346175834, + "grad_norm": 2.8125, + "learning_rate": 4.7408368816863596e-05, + "loss": 0.817, + "step": 8394 + }, + { + "epoch": 0.1488276454973699, + "grad_norm": 3.078125, + "learning_rate": 4.7407130400133605e-05, + "loss": 0.7515, + "step": 8396 + }, + { + "epoch": 0.14886309753298146, + "grad_norm": 2.921875, + "learning_rate": 4.7405891703767294e-05, + "loss": 0.8348, + "step": 8398 + }, + { + "epoch": 0.14889854956859305, + "grad_norm": 2.9375, + "learning_rate": 4.740465272778012e-05, + "loss": 0.851, + "step": 8400 + }, + { + "epoch": 0.1489340016042046, + "grad_norm": 2.46875, + "learning_rate": 4.740341347218754e-05, + "loss": 0.8495, + "step": 8402 + }, + { + "epoch": 0.14896945363981617, + "grad_norm": 2.765625, + "learning_rate": 4.7402173937005035e-05, + "loss": 0.8287, + "step": 8404 + }, + { + "epoch": 0.14900490567542776, + "grad_norm": 3.09375, + "learning_rate": 4.7400934122248066e-05, + "loss": 0.8723, + "step": 8406 + }, + { + "epoch": 0.14904035771103932, + "grad_norm": 3.296875, + "learning_rate": 4.7399694027932094e-05, + "loss": 0.8113, + "step": 8408 + }, + { + "epoch": 0.14907580974665088, + "grad_norm": 3.390625, + "learning_rate": 4.7398453654072616e-05, + "loss": 0.8134, + "step": 8410 + }, + { + "epoch": 0.14911126178226247, + "grad_norm": 3.171875, + "learning_rate": 4.7397213000685104e-05, + "loss": 0.7777, + "step": 8412 + }, + { + "epoch": 0.14914671381787403, + "grad_norm": 2.65625, + "learning_rate": 4.739597206778503e-05, + "loss": 0.8102, + "step": 8414 + }, + { + "epoch": 0.1491821658534856, + "grad_norm": 2.453125, + "learning_rate": 4.7394730855387895e-05, + "loss": 0.8204, + "step": 8416 + }, + { + "epoch": 0.14921761788909718, + "grad_norm": 2.9375, + "learning_rate": 4.739348936350918e-05, + "loss": 0.7826, + "step": 8418 + }, + { + "epoch": 0.14925306992470874, + "grad_norm": 2.46875, + "learning_rate": 4.7392247592164384e-05, + "loss": 0.8538, + "step": 8420 + }, + { + "epoch": 0.1492885219603203, + "grad_norm": 2.734375, + "learning_rate": 4.739100554136901e-05, + "loss": 0.8514, + "step": 8422 + }, + { + "epoch": 0.1493239739959319, + "grad_norm": 2.65625, + "learning_rate": 4.738976321113854e-05, + "loss": 0.8351, + "step": 8424 + }, + { + "epoch": 0.14935942603154345, + "grad_norm": 2.625, + "learning_rate": 4.738852060148849e-05, + "loss": 0.7661, + "step": 8426 + }, + { + "epoch": 0.149394878067155, + "grad_norm": 2.53125, + "learning_rate": 4.738727771243437e-05, + "loss": 0.8071, + "step": 8428 + }, + { + "epoch": 0.1494303301027666, + "grad_norm": 2.65625, + "learning_rate": 4.7386034543991674e-05, + "loss": 0.7708, + "step": 8430 + }, + { + "epoch": 0.14946578213837816, + "grad_norm": 2.875, + "learning_rate": 4.738479109617594e-05, + "loss": 0.8564, + "step": 8432 + }, + { + "epoch": 0.14950123417398972, + "grad_norm": 2.859375, + "learning_rate": 4.738354736900268e-05, + "loss": 0.8329, + "step": 8434 + }, + { + "epoch": 0.1495366862096013, + "grad_norm": 2.8125, + "learning_rate": 4.73823033624874e-05, + "loss": 0.8144, + "step": 8436 + }, + { + "epoch": 0.14957213824521287, + "grad_norm": 2.625, + "learning_rate": 4.738105907664565e-05, + "loss": 0.8032, + "step": 8438 + }, + { + "epoch": 0.14960759028082443, + "grad_norm": 2.734375, + "learning_rate": 4.737981451149293e-05, + "loss": 0.8256, + "step": 8440 + }, + { + "epoch": 0.14964304231643602, + "grad_norm": 2.703125, + "learning_rate": 4.737856966704479e-05, + "loss": 0.7986, + "step": 8442 + }, + { + "epoch": 0.14967849435204758, + "grad_norm": 3.484375, + "learning_rate": 4.737732454331677e-05, + "loss": 0.8465, + "step": 8444 + }, + { + "epoch": 0.14971394638765914, + "grad_norm": 3.0625, + "learning_rate": 4.737607914032439e-05, + "loss": 0.8307, + "step": 8446 + }, + { + "epoch": 0.14974939842327073, + "grad_norm": 2.96875, + "learning_rate": 4.737483345808321e-05, + "loss": 0.8057, + "step": 8448 + }, + { + "epoch": 0.1497848504588823, + "grad_norm": 2.484375, + "learning_rate": 4.737358749660877e-05, + "loss": 0.8325, + "step": 8450 + }, + { + "epoch": 0.14982030249449385, + "grad_norm": 2.609375, + "learning_rate": 4.737234125591661e-05, + "loss": 0.8536, + "step": 8452 + }, + { + "epoch": 0.1498557545301054, + "grad_norm": 2.78125, + "learning_rate": 4.7371094736022295e-05, + "loss": 0.8113, + "step": 8454 + }, + { + "epoch": 0.149891206565717, + "grad_norm": 2.734375, + "learning_rate": 4.736984793694138e-05, + "loss": 0.8725, + "step": 8456 + }, + { + "epoch": 0.14992665860132856, + "grad_norm": 2.59375, + "learning_rate": 4.736860085868942e-05, + "loss": 0.8384, + "step": 8458 + }, + { + "epoch": 0.14996211063694012, + "grad_norm": 2.9375, + "learning_rate": 4.736735350128199e-05, + "loss": 0.8097, + "step": 8460 + }, + { + "epoch": 0.1499975626725517, + "grad_norm": 2.453125, + "learning_rate": 4.736610586473463e-05, + "loss": 0.7781, + "step": 8462 + }, + { + "epoch": 0.15003301470816327, + "grad_norm": 2.734375, + "learning_rate": 4.736485794906294e-05, + "loss": 0.8082, + "step": 8464 + }, + { + "epoch": 0.15006846674377483, + "grad_norm": 2.6875, + "learning_rate": 4.7363609754282466e-05, + "loss": 0.8256, + "step": 8466 + }, + { + "epoch": 0.15010391877938642, + "grad_norm": 2.65625, + "learning_rate": 4.736236128040882e-05, + "loss": 0.8137, + "step": 8468 + }, + { + "epoch": 0.15013937081499798, + "grad_norm": 2.734375, + "learning_rate": 4.736111252745755e-05, + "loss": 0.8189, + "step": 8470 + }, + { + "epoch": 0.15017482285060954, + "grad_norm": 2.640625, + "learning_rate": 4.7359863495444254e-05, + "loss": 0.8005, + "step": 8472 + }, + { + "epoch": 0.15021027488622113, + "grad_norm": 2.984375, + "learning_rate": 4.735861418438452e-05, + "loss": 0.8336, + "step": 8474 + }, + { + "epoch": 0.1502457269218327, + "grad_norm": 2.75, + "learning_rate": 4.735736459429394e-05, + "loss": 0.8146, + "step": 8476 + }, + { + "epoch": 0.15028117895744425, + "grad_norm": 2.828125, + "learning_rate": 4.735611472518811e-05, + "loss": 0.8229, + "step": 8478 + }, + { + "epoch": 0.15031663099305584, + "grad_norm": 2.703125, + "learning_rate": 4.7354864577082616e-05, + "loss": 0.821, + "step": 8480 + }, + { + "epoch": 0.1503520830286674, + "grad_norm": 2.84375, + "learning_rate": 4.7353614149993074e-05, + "loss": 0.7703, + "step": 8482 + }, + { + "epoch": 0.15038753506427896, + "grad_norm": 2.546875, + "learning_rate": 4.735236344393508e-05, + "loss": 0.7681, + "step": 8484 + }, + { + "epoch": 0.15042298709989055, + "grad_norm": 2.609375, + "learning_rate": 4.735111245892425e-05, + "loss": 0.8142, + "step": 8486 + }, + { + "epoch": 0.1504584391355021, + "grad_norm": 2.46875, + "learning_rate": 4.734986119497619e-05, + "loss": 0.8485, + "step": 8488 + }, + { + "epoch": 0.15049389117111367, + "grad_norm": 2.71875, + "learning_rate": 4.734860965210651e-05, + "loss": 0.8056, + "step": 8490 + }, + { + "epoch": 0.15052934320672526, + "grad_norm": 3.15625, + "learning_rate": 4.734735783033085e-05, + "loss": 0.8393, + "step": 8492 + }, + { + "epoch": 0.15056479524233682, + "grad_norm": 2.703125, + "learning_rate": 4.734610572966481e-05, + "loss": 0.8382, + "step": 8494 + }, + { + "epoch": 0.15060024727794838, + "grad_norm": 2.546875, + "learning_rate": 4.734485335012403e-05, + "loss": 0.8161, + "step": 8496 + }, + { + "epoch": 0.15063569931355997, + "grad_norm": 3.03125, + "learning_rate": 4.734360069172413e-05, + "loss": 0.8473, + "step": 8498 + }, + { + "epoch": 0.15067115134917153, + "grad_norm": 2.6875, + "learning_rate": 4.7342347754480745e-05, + "loss": 0.8288, + "step": 8500 + }, + { + "epoch": 0.1507066033847831, + "grad_norm": 2.65625, + "learning_rate": 4.734109453840952e-05, + "loss": 0.8326, + "step": 8502 + }, + { + "epoch": 0.15074205542039468, + "grad_norm": 3.015625, + "learning_rate": 4.7339841043526085e-05, + "loss": 0.8067, + "step": 8504 + }, + { + "epoch": 0.15077750745600624, + "grad_norm": 2.96875, + "learning_rate": 4.733858726984609e-05, + "loss": 0.7916, + "step": 8506 + }, + { + "epoch": 0.1508129594916178, + "grad_norm": 2.859375, + "learning_rate": 4.7337333217385173e-05, + "loss": 0.816, + "step": 8508 + }, + { + "epoch": 0.1508484115272294, + "grad_norm": 2.8125, + "learning_rate": 4.7336078886158994e-05, + "loss": 0.8172, + "step": 8510 + }, + { + "epoch": 0.15088386356284095, + "grad_norm": 2.765625, + "learning_rate": 4.7334824276183195e-05, + "loss": 0.7866, + "step": 8512 + }, + { + "epoch": 0.1509193155984525, + "grad_norm": 2.65625, + "learning_rate": 4.733356938747345e-05, + "loss": 0.7956, + "step": 8514 + }, + { + "epoch": 0.1509547676340641, + "grad_norm": 2.84375, + "learning_rate": 4.7332314220045417e-05, + "loss": 0.7696, + "step": 8516 + }, + { + "epoch": 0.15099021966967566, + "grad_norm": 2.921875, + "learning_rate": 4.7331058773914736e-05, + "loss": 0.8321, + "step": 8518 + }, + { + "epoch": 0.15102567170528722, + "grad_norm": 2.78125, + "learning_rate": 4.732980304909711e-05, + "loss": 0.8247, + "step": 8520 + }, + { + "epoch": 0.1510611237408988, + "grad_norm": 2.625, + "learning_rate": 4.7328547045608185e-05, + "loss": 0.8107, + "step": 8522 + }, + { + "epoch": 0.15109657577651037, + "grad_norm": 2.75, + "learning_rate": 4.7327290763463636e-05, + "loss": 0.8323, + "step": 8524 + }, + { + "epoch": 0.15113202781212193, + "grad_norm": 2.78125, + "learning_rate": 4.732603420267916e-05, + "loss": 0.8015, + "step": 8526 + }, + { + "epoch": 0.15116747984773352, + "grad_norm": 2.90625, + "learning_rate": 4.7324777363270424e-05, + "loss": 0.8537, + "step": 8528 + }, + { + "epoch": 0.15120293188334508, + "grad_norm": 3.03125, + "learning_rate": 4.7323520245253114e-05, + "loss": 0.8273, + "step": 8530 + }, + { + "epoch": 0.15123838391895664, + "grad_norm": 2.875, + "learning_rate": 4.732226284864293e-05, + "loss": 0.8292, + "step": 8532 + }, + { + "epoch": 0.15127383595456823, + "grad_norm": 2.59375, + "learning_rate": 4.7321005173455546e-05, + "loss": 0.8115, + "step": 8534 + }, + { + "epoch": 0.15130928799017979, + "grad_norm": 2.71875, + "learning_rate": 4.731974721970667e-05, + "loss": 0.7805, + "step": 8536 + }, + { + "epoch": 0.15134474002579135, + "grad_norm": 2.984375, + "learning_rate": 4.7318488987411994e-05, + "loss": 0.84, + "step": 8538 + }, + { + "epoch": 0.15138019206140294, + "grad_norm": 2.78125, + "learning_rate": 4.7317230476587225e-05, + "loss": 0.8011, + "step": 8540 + }, + { + "epoch": 0.1514156440970145, + "grad_norm": 2.78125, + "learning_rate": 4.7315971687248076e-05, + "loss": 0.7949, + "step": 8542 + }, + { + "epoch": 0.15145109613262606, + "grad_norm": 2.703125, + "learning_rate": 4.731471261941024e-05, + "loss": 0.7571, + "step": 8544 + }, + { + "epoch": 0.15148654816823764, + "grad_norm": 2.984375, + "learning_rate": 4.7313453273089445e-05, + "loss": 0.8316, + "step": 8546 + }, + { + "epoch": 0.1515220002038492, + "grad_norm": 2.828125, + "learning_rate": 4.73121936483014e-05, + "loss": 0.8164, + "step": 8548 + }, + { + "epoch": 0.15155745223946077, + "grad_norm": 2.859375, + "learning_rate": 4.7310933745061813e-05, + "loss": 0.8136, + "step": 8550 + }, + { + "epoch": 0.15159290427507235, + "grad_norm": 2.5625, + "learning_rate": 4.7309673563386426e-05, + "loss": 0.8086, + "step": 8552 + }, + { + "epoch": 0.15162835631068391, + "grad_norm": 2.65625, + "learning_rate": 4.730841310329096e-05, + "loss": 0.8417, + "step": 8554 + }, + { + "epoch": 0.15166380834629548, + "grad_norm": 2.828125, + "learning_rate": 4.730715236479115e-05, + "loss": 0.8629, + "step": 8556 + }, + { + "epoch": 0.15169926038190706, + "grad_norm": 2.921875, + "learning_rate": 4.730589134790272e-05, + "loss": 0.7892, + "step": 8558 + }, + { + "epoch": 0.15173471241751862, + "grad_norm": 2.859375, + "learning_rate": 4.730463005264142e-05, + "loss": 0.8418, + "step": 8560 + }, + { + "epoch": 0.15177016445313019, + "grad_norm": 2.953125, + "learning_rate": 4.7303368479022974e-05, + "loss": 0.8379, + "step": 8562 + }, + { + "epoch": 0.15180561648874177, + "grad_norm": 2.625, + "learning_rate": 4.730210662706314e-05, + "loss": 0.8355, + "step": 8564 + }, + { + "epoch": 0.15184106852435333, + "grad_norm": 2.9375, + "learning_rate": 4.730084449677766e-05, + "loss": 0.8156, + "step": 8566 + }, + { + "epoch": 0.1518765205599649, + "grad_norm": 2.421875, + "learning_rate": 4.7299582088182284e-05, + "loss": 0.8068, + "step": 8568 + }, + { + "epoch": 0.15191197259557648, + "grad_norm": 2.671875, + "learning_rate": 4.729831940129277e-05, + "loss": 0.7711, + "step": 8570 + }, + { + "epoch": 0.15194742463118804, + "grad_norm": 2.640625, + "learning_rate": 4.729705643612486e-05, + "loss": 0.8524, + "step": 8572 + }, + { + "epoch": 0.1519828766667996, + "grad_norm": 2.671875, + "learning_rate": 4.729579319269435e-05, + "loss": 0.7844, + "step": 8574 + }, + { + "epoch": 0.1520183287024112, + "grad_norm": 2.6875, + "learning_rate": 4.729452967101697e-05, + "loss": 0.8294, + "step": 8576 + }, + { + "epoch": 0.15205378073802275, + "grad_norm": 2.78125, + "learning_rate": 4.729326587110852e-05, + "loss": 0.7946, + "step": 8578 + }, + { + "epoch": 0.15208923277363431, + "grad_norm": 2.6875, + "learning_rate": 4.729200179298474e-05, + "loss": 0.8395, + "step": 8580 + }, + { + "epoch": 0.1521246848092459, + "grad_norm": 2.765625, + "learning_rate": 4.729073743666143e-05, + "loss": 0.8131, + "step": 8582 + }, + { + "epoch": 0.15216013684485746, + "grad_norm": 2.875, + "learning_rate": 4.728947280215435e-05, + "loss": 0.7983, + "step": 8584 + }, + { + "epoch": 0.15219558888046902, + "grad_norm": 2.5, + "learning_rate": 4.72882078894793e-05, + "loss": 0.8259, + "step": 8586 + }, + { + "epoch": 0.1522310409160806, + "grad_norm": 2.65625, + "learning_rate": 4.728694269865205e-05, + "loss": 0.7883, + "step": 8588 + }, + { + "epoch": 0.15226649295169217, + "grad_norm": 2.703125, + "learning_rate": 4.728567722968841e-05, + "loss": 0.7958, + "step": 8590 + }, + { + "epoch": 0.15230194498730373, + "grad_norm": 2.859375, + "learning_rate": 4.728441148260415e-05, + "loss": 0.768, + "step": 8592 + }, + { + "epoch": 0.15233739702291532, + "grad_norm": 2.734375, + "learning_rate": 4.728314545741508e-05, + "loss": 0.8115, + "step": 8594 + }, + { + "epoch": 0.15237284905852688, + "grad_norm": 2.859375, + "learning_rate": 4.728187915413699e-05, + "loss": 0.8365, + "step": 8596 + }, + { + "epoch": 0.15240830109413844, + "grad_norm": 2.75, + "learning_rate": 4.72806125727857e-05, + "loss": 0.8323, + "step": 8598 + }, + { + "epoch": 0.15244375312975003, + "grad_norm": 2.96875, + "learning_rate": 4.7279345713377e-05, + "loss": 0.8174, + "step": 8600 + }, + { + "epoch": 0.1524792051653616, + "grad_norm": 2.71875, + "learning_rate": 4.72780785759267e-05, + "loss": 0.7958, + "step": 8602 + }, + { + "epoch": 0.15251465720097315, + "grad_norm": 2.859375, + "learning_rate": 4.727681116045063e-05, + "loss": 0.8139, + "step": 8604 + }, + { + "epoch": 0.15255010923658474, + "grad_norm": 2.703125, + "learning_rate": 4.727554346696459e-05, + "loss": 0.8109, + "step": 8606 + }, + { + "epoch": 0.1525855612721963, + "grad_norm": 2.8125, + "learning_rate": 4.727427549548441e-05, + "loss": 0.8242, + "step": 8608 + }, + { + "epoch": 0.15262101330780786, + "grad_norm": 2.609375, + "learning_rate": 4.727300724602591e-05, + "loss": 0.8205, + "step": 8610 + }, + { + "epoch": 0.15265646534341945, + "grad_norm": 2.640625, + "learning_rate": 4.727173871860492e-05, + "loss": 0.8197, + "step": 8612 + }, + { + "epoch": 0.152691917379031, + "grad_norm": 2.71875, + "learning_rate": 4.727046991323726e-05, + "loss": 0.7871, + "step": 8614 + }, + { + "epoch": 0.15272736941464257, + "grad_norm": 2.75, + "learning_rate": 4.7269200829938784e-05, + "loss": 0.8023, + "step": 8616 + }, + { + "epoch": 0.15276282145025416, + "grad_norm": 2.765625, + "learning_rate": 4.7267931468725326e-05, + "loss": 0.8246, + "step": 8618 + }, + { + "epoch": 0.15279827348586572, + "grad_norm": 2.796875, + "learning_rate": 4.726666182961271e-05, + "loss": 0.8302, + "step": 8620 + }, + { + "epoch": 0.15283372552147728, + "grad_norm": 2.75, + "learning_rate": 4.7265391912616796e-05, + "loss": 0.8324, + "step": 8622 + }, + { + "epoch": 0.15286917755708884, + "grad_norm": 2.78125, + "learning_rate": 4.726412171775343e-05, + "loss": 0.8154, + "step": 8624 + }, + { + "epoch": 0.15290462959270043, + "grad_norm": 2.6875, + "learning_rate": 4.7262851245038456e-05, + "loss": 0.8289, + "step": 8626 + }, + { + "epoch": 0.152940081628312, + "grad_norm": 2.578125, + "learning_rate": 4.7261580494487745e-05, + "loss": 0.8119, + "step": 8628 + }, + { + "epoch": 0.15297553366392355, + "grad_norm": 2.78125, + "learning_rate": 4.726030946611714e-05, + "loss": 0.7956, + "step": 8630 + }, + { + "epoch": 0.15301098569953514, + "grad_norm": 2.9375, + "learning_rate": 4.7259038159942514e-05, + "loss": 0.7943, + "step": 8632 + }, + { + "epoch": 0.1530464377351467, + "grad_norm": 2.921875, + "learning_rate": 4.725776657597972e-05, + "loss": 0.8546, + "step": 8634 + }, + { + "epoch": 0.15308188977075826, + "grad_norm": 2.890625, + "learning_rate": 4.725649471424464e-05, + "loss": 0.8512, + "step": 8636 + }, + { + "epoch": 0.15311734180636985, + "grad_norm": 2.484375, + "learning_rate": 4.7255222574753144e-05, + "loss": 0.8184, + "step": 8638 + }, + { + "epoch": 0.1531527938419814, + "grad_norm": 2.75, + "learning_rate": 4.7253950157521106e-05, + "loss": 0.8186, + "step": 8640 + }, + { + "epoch": 0.15318824587759297, + "grad_norm": 2.875, + "learning_rate": 4.72526774625644e-05, + "loss": 0.811, + "step": 8642 + }, + { + "epoch": 0.15322369791320456, + "grad_norm": 2.84375, + "learning_rate": 4.725140448989892e-05, + "loss": 0.8125, + "step": 8644 + }, + { + "epoch": 0.15325914994881612, + "grad_norm": 2.4375, + "learning_rate": 4.725013123954054e-05, + "loss": 0.8177, + "step": 8646 + }, + { + "epoch": 0.15329460198442768, + "grad_norm": 2.625, + "learning_rate": 4.724885771150516e-05, + "loss": 0.8118, + "step": 8648 + }, + { + "epoch": 0.15333005402003927, + "grad_norm": 2.703125, + "learning_rate": 4.724758390580867e-05, + "loss": 0.7981, + "step": 8650 + }, + { + "epoch": 0.15336550605565083, + "grad_norm": 2.625, + "learning_rate": 4.724630982246696e-05, + "loss": 0.844, + "step": 8652 + }, + { + "epoch": 0.1534009580912624, + "grad_norm": 2.921875, + "learning_rate": 4.724503546149595e-05, + "loss": 0.8135, + "step": 8654 + }, + { + "epoch": 0.15343641012687398, + "grad_norm": 3.171875, + "learning_rate": 4.724376082291152e-05, + "loss": 0.8015, + "step": 8656 + }, + { + "epoch": 0.15347186216248554, + "grad_norm": 2.65625, + "learning_rate": 4.724248590672959e-05, + "loss": 0.8211, + "step": 8658 + }, + { + "epoch": 0.1535073141980971, + "grad_norm": 2.515625, + "learning_rate": 4.7241210712966075e-05, + "loss": 0.7986, + "step": 8660 + }, + { + "epoch": 0.1535427662337087, + "grad_norm": 2.765625, + "learning_rate": 4.723993524163688e-05, + "loss": 0.8151, + "step": 8662 + }, + { + "epoch": 0.15357821826932025, + "grad_norm": 2.765625, + "learning_rate": 4.723865949275792e-05, + "loss": 0.8132, + "step": 8664 + }, + { + "epoch": 0.1536136703049318, + "grad_norm": 2.6875, + "learning_rate": 4.723738346634513e-05, + "loss": 0.797, + "step": 8666 + }, + { + "epoch": 0.1536491223405434, + "grad_norm": 2.671875, + "learning_rate": 4.723610716241442e-05, + "loss": 0.8153, + "step": 8668 + }, + { + "epoch": 0.15368457437615496, + "grad_norm": 2.65625, + "learning_rate": 4.723483058098173e-05, + "loss": 0.8162, + "step": 8670 + }, + { + "epoch": 0.15372002641176652, + "grad_norm": 2.828125, + "learning_rate": 4.723355372206297e-05, + "loss": 0.8034, + "step": 8672 + }, + { + "epoch": 0.1537554784473781, + "grad_norm": 2.546875, + "learning_rate": 4.723227658567411e-05, + "loss": 0.8138, + "step": 8674 + }, + { + "epoch": 0.15379093048298967, + "grad_norm": 2.828125, + "learning_rate": 4.723099917183106e-05, + "loss": 0.8076, + "step": 8676 + }, + { + "epoch": 0.15382638251860123, + "grad_norm": 3.0, + "learning_rate": 4.7229721480549774e-05, + "loss": 0.8268, + "step": 8678 + }, + { + "epoch": 0.15386183455421282, + "grad_norm": 2.828125, + "learning_rate": 4.722844351184619e-05, + "loss": 0.8274, + "step": 8680 + }, + { + "epoch": 0.15389728658982438, + "grad_norm": 2.53125, + "learning_rate": 4.722716526573626e-05, + "loss": 0.7815, + "step": 8682 + }, + { + "epoch": 0.15393273862543594, + "grad_norm": 2.875, + "learning_rate": 4.722588674223594e-05, + "loss": 0.8101, + "step": 8684 + }, + { + "epoch": 0.15396819066104753, + "grad_norm": 2.953125, + "learning_rate": 4.722460794136117e-05, + "loss": 0.8077, + "step": 8686 + }, + { + "epoch": 0.1540036426966591, + "grad_norm": 2.546875, + "learning_rate": 4.7223328863127944e-05, + "loss": 0.8179, + "step": 8688 + }, + { + "epoch": 0.15403909473227065, + "grad_norm": 3.046875, + "learning_rate": 4.722204950755219e-05, + "loss": 0.8085, + "step": 8690 + }, + { + "epoch": 0.15407454676788224, + "grad_norm": 2.859375, + "learning_rate": 4.722076987464989e-05, + "loss": 0.8197, + "step": 8692 + }, + { + "epoch": 0.1541099988034938, + "grad_norm": 2.609375, + "learning_rate": 4.721948996443701e-05, + "loss": 0.7829, + "step": 8694 + }, + { + "epoch": 0.15414545083910536, + "grad_norm": 2.40625, + "learning_rate": 4.7218209776929525e-05, + "loss": 0.818, + "step": 8696 + }, + { + "epoch": 0.15418090287471695, + "grad_norm": 2.65625, + "learning_rate": 4.7216929312143396e-05, + "loss": 0.8165, + "step": 8698 + }, + { + "epoch": 0.1542163549103285, + "grad_norm": 2.65625, + "learning_rate": 4.721564857009463e-05, + "loss": 0.7966, + "step": 8700 + }, + { + "epoch": 0.15425180694594007, + "grad_norm": 2.796875, + "learning_rate": 4.7214367550799196e-05, + "loss": 0.8127, + "step": 8702 + }, + { + "epoch": 0.15428725898155166, + "grad_norm": 2.453125, + "learning_rate": 4.721308625427309e-05, + "loss": 0.8077, + "step": 8704 + }, + { + "epoch": 0.15432271101716322, + "grad_norm": 2.875, + "learning_rate": 4.7211804680532276e-05, + "loss": 0.81, + "step": 8706 + }, + { + "epoch": 0.15435816305277478, + "grad_norm": 2.828125, + "learning_rate": 4.7210522829592774e-05, + "loss": 0.8512, + "step": 8708 + }, + { + "epoch": 0.15439361508838637, + "grad_norm": 2.625, + "learning_rate": 4.7209240701470584e-05, + "loss": 0.8265, + "step": 8710 + }, + { + "epoch": 0.15442906712399793, + "grad_norm": 2.609375, + "learning_rate": 4.7207958296181676e-05, + "loss": 0.8358, + "step": 8712 + }, + { + "epoch": 0.1544645191596095, + "grad_norm": 2.5, + "learning_rate": 4.7206675613742084e-05, + "loss": 0.7722, + "step": 8714 + }, + { + "epoch": 0.15449997119522108, + "grad_norm": 2.78125, + "learning_rate": 4.7205392654167806e-05, + "loss": 0.8044, + "step": 8716 + }, + { + "epoch": 0.15453542323083264, + "grad_norm": 2.578125, + "learning_rate": 4.7204109417474854e-05, + "loss": 0.8226, + "step": 8718 + }, + { + "epoch": 0.1545708752664442, + "grad_norm": 2.78125, + "learning_rate": 4.7202825903679234e-05, + "loss": 0.8345, + "step": 8720 + }, + { + "epoch": 0.15460632730205578, + "grad_norm": 2.65625, + "learning_rate": 4.720154211279698e-05, + "loss": 0.8133, + "step": 8722 + }, + { + "epoch": 0.15464177933766735, + "grad_norm": 2.625, + "learning_rate": 4.72002580448441e-05, + "loss": 0.7914, + "step": 8724 + }, + { + "epoch": 0.1546772313732789, + "grad_norm": 2.71875, + "learning_rate": 4.719897369983663e-05, + "loss": 0.785, + "step": 8726 + }, + { + "epoch": 0.1547126834088905, + "grad_norm": 2.59375, + "learning_rate": 4.7197689077790585e-05, + "loss": 0.789, + "step": 8728 + }, + { + "epoch": 0.15474813544450206, + "grad_norm": 2.78125, + "learning_rate": 4.719640417872201e-05, + "loss": 0.8307, + "step": 8730 + }, + { + "epoch": 0.15478358748011362, + "grad_norm": 2.90625, + "learning_rate": 4.719511900264693e-05, + "loss": 0.8563, + "step": 8732 + }, + { + "epoch": 0.1548190395157252, + "grad_norm": 3.125, + "learning_rate": 4.719383354958138e-05, + "loss": 0.8509, + "step": 8734 + }, + { + "epoch": 0.15485449155133676, + "grad_norm": 2.734375, + "learning_rate": 4.7192547819541423e-05, + "loss": 0.8402, + "step": 8736 + }, + { + "epoch": 0.15488994358694833, + "grad_norm": 3.0625, + "learning_rate": 4.7191261812543084e-05, + "loss": 0.8446, + "step": 8738 + }, + { + "epoch": 0.1549253956225599, + "grad_norm": 3.0, + "learning_rate": 4.718997552860243e-05, + "loss": 0.8627, + "step": 8740 + }, + { + "epoch": 0.15496084765817147, + "grad_norm": 2.546875, + "learning_rate": 4.7188688967735486e-05, + "loss": 0.8258, + "step": 8742 + }, + { + "epoch": 0.15499629969378304, + "grad_norm": 2.921875, + "learning_rate": 4.7187402129958334e-05, + "loss": 0.8241, + "step": 8744 + }, + { + "epoch": 0.15503175172939462, + "grad_norm": 2.6875, + "learning_rate": 4.718611501528703e-05, + "loss": 0.834, + "step": 8746 + }, + { + "epoch": 0.15506720376500618, + "grad_norm": 2.75, + "learning_rate": 4.7184827623737623e-05, + "loss": 0.8331, + "step": 8748 + }, + { + "epoch": 0.15510265580061774, + "grad_norm": 2.703125, + "learning_rate": 4.71835399553262e-05, + "loss": 0.804, + "step": 8750 + }, + { + "epoch": 0.15513810783622933, + "grad_norm": 2.703125, + "learning_rate": 4.718225201006881e-05, + "loss": 0.8106, + "step": 8752 + }, + { + "epoch": 0.1551735598718409, + "grad_norm": 2.921875, + "learning_rate": 4.718096378798153e-05, + "loss": 0.839, + "step": 8754 + }, + { + "epoch": 0.15520901190745245, + "grad_norm": 2.734375, + "learning_rate": 4.717967528908045e-05, + "loss": 0.8135, + "step": 8756 + }, + { + "epoch": 0.15524446394306404, + "grad_norm": 3.0, + "learning_rate": 4.717838651338163e-05, + "loss": 0.7937, + "step": 8758 + }, + { + "epoch": 0.1552799159786756, + "grad_norm": 2.578125, + "learning_rate": 4.717709746090118e-05, + "loss": 0.8099, + "step": 8760 + }, + { + "epoch": 0.15531536801428716, + "grad_norm": 3.125, + "learning_rate": 4.717580813165517e-05, + "loss": 0.7994, + "step": 8762 + }, + { + "epoch": 0.15535082004989875, + "grad_norm": 2.890625, + "learning_rate": 4.717451852565969e-05, + "loss": 0.7946, + "step": 8764 + }, + { + "epoch": 0.1553862720855103, + "grad_norm": 2.9375, + "learning_rate": 4.7173228642930846e-05, + "loss": 0.8289, + "step": 8766 + }, + { + "epoch": 0.15542172412112187, + "grad_norm": 2.625, + "learning_rate": 4.717193848348471e-05, + "loss": 0.8171, + "step": 8768 + }, + { + "epoch": 0.15545717615673346, + "grad_norm": 2.546875, + "learning_rate": 4.7170648047337415e-05, + "loss": 0.8237, + "step": 8770 + }, + { + "epoch": 0.15549262819234502, + "grad_norm": 2.703125, + "learning_rate": 4.7169357334505046e-05, + "loss": 0.8451, + "step": 8772 + }, + { + "epoch": 0.15552808022795658, + "grad_norm": 2.546875, + "learning_rate": 4.7168066345003716e-05, + "loss": 0.822, + "step": 8774 + }, + { + "epoch": 0.15556353226356817, + "grad_norm": 2.625, + "learning_rate": 4.716677507884953e-05, + "loss": 0.8134, + "step": 8776 + }, + { + "epoch": 0.15559898429917973, + "grad_norm": 2.640625, + "learning_rate": 4.7165483536058605e-05, + "loss": 0.8004, + "step": 8778 + }, + { + "epoch": 0.1556344363347913, + "grad_norm": 2.890625, + "learning_rate": 4.716419171664708e-05, + "loss": 0.8202, + "step": 8780 + }, + { + "epoch": 0.15566988837040288, + "grad_norm": 2.625, + "learning_rate": 4.716289962063104e-05, + "loss": 0.8158, + "step": 8782 + }, + { + "epoch": 0.15570534040601444, + "grad_norm": 2.578125, + "learning_rate": 4.716160724802664e-05, + "loss": 0.797, + "step": 8784 + }, + { + "epoch": 0.155740792441626, + "grad_norm": 2.609375, + "learning_rate": 4.716031459884999e-05, + "loss": 0.7925, + "step": 8786 + }, + { + "epoch": 0.1557762444772376, + "grad_norm": 2.8125, + "learning_rate": 4.715902167311723e-05, + "loss": 0.8325, + "step": 8788 + }, + { + "epoch": 0.15581169651284915, + "grad_norm": 2.65625, + "learning_rate": 4.71577284708445e-05, + "loss": 0.8167, + "step": 8790 + }, + { + "epoch": 0.1558471485484607, + "grad_norm": 2.78125, + "learning_rate": 4.7156434992047937e-05, + "loss": 0.8647, + "step": 8792 + }, + { + "epoch": 0.1558826005840723, + "grad_norm": 2.671875, + "learning_rate": 4.715514123674367e-05, + "loss": 0.8149, + "step": 8794 + }, + { + "epoch": 0.15591805261968386, + "grad_norm": 2.78125, + "learning_rate": 4.7153847204947866e-05, + "loss": 0.7531, + "step": 8796 + }, + { + "epoch": 0.15595350465529542, + "grad_norm": 2.90625, + "learning_rate": 4.7152552896676656e-05, + "loss": 0.8169, + "step": 8798 + }, + { + "epoch": 0.15598895669090698, + "grad_norm": 2.671875, + "learning_rate": 4.71512583119462e-05, + "loss": 0.8065, + "step": 8800 + }, + { + "epoch": 0.15602440872651857, + "grad_norm": 2.5625, + "learning_rate": 4.714996345077265e-05, + "loss": 0.8222, + "step": 8802 + }, + { + "epoch": 0.15605986076213013, + "grad_norm": 2.65625, + "learning_rate": 4.714866831317218e-05, + "loss": 0.8257, + "step": 8804 + }, + { + "epoch": 0.1560953127977417, + "grad_norm": 2.875, + "learning_rate": 4.714737289916093e-05, + "loss": 0.8318, + "step": 8806 + }, + { + "epoch": 0.15613076483335328, + "grad_norm": 2.703125, + "learning_rate": 4.714607720875509e-05, + "loss": 0.7956, + "step": 8808 + }, + { + "epoch": 0.15616621686896484, + "grad_norm": 2.890625, + "learning_rate": 4.7144781241970815e-05, + "loss": 0.868, + "step": 8810 + }, + { + "epoch": 0.1562016689045764, + "grad_norm": 2.625, + "learning_rate": 4.7143484998824284e-05, + "loss": 0.8525, + "step": 8812 + }, + { + "epoch": 0.156237120940188, + "grad_norm": 2.5, + "learning_rate": 4.7142188479331674e-05, + "loss": 0.8132, + "step": 8814 + }, + { + "epoch": 0.15627257297579955, + "grad_norm": 2.46875, + "learning_rate": 4.714089168350916e-05, + "loss": 0.8109, + "step": 8816 + }, + { + "epoch": 0.1563080250114111, + "grad_norm": 2.546875, + "learning_rate": 4.713959461137293e-05, + "loss": 0.8043, + "step": 8818 + }, + { + "epoch": 0.1563434770470227, + "grad_norm": 2.828125, + "learning_rate": 4.7138297262939173e-05, + "loss": 0.8104, + "step": 8820 + }, + { + "epoch": 0.15637892908263426, + "grad_norm": 2.640625, + "learning_rate": 4.7136999638224076e-05, + "loss": 0.8069, + "step": 8822 + }, + { + "epoch": 0.15641438111824582, + "grad_norm": 2.984375, + "learning_rate": 4.713570173724383e-05, + "loss": 0.8431, + "step": 8824 + }, + { + "epoch": 0.1564498331538574, + "grad_norm": 2.765625, + "learning_rate": 4.713440356001464e-05, + "loss": 0.8172, + "step": 8826 + }, + { + "epoch": 0.15648528518946897, + "grad_norm": 2.484375, + "learning_rate": 4.713310510655271e-05, + "loss": 0.8077, + "step": 8828 + }, + { + "epoch": 0.15652073722508053, + "grad_norm": 2.875, + "learning_rate": 4.713180637687423e-05, + "loss": 0.8036, + "step": 8830 + }, + { + "epoch": 0.15655618926069212, + "grad_norm": 2.9375, + "learning_rate": 4.713050737099542e-05, + "loss": 0.8204, + "step": 8832 + }, + { + "epoch": 0.15659164129630368, + "grad_norm": 2.671875, + "learning_rate": 4.712920808893249e-05, + "loss": 0.8062, + "step": 8834 + }, + { + "epoch": 0.15662709333191524, + "grad_norm": 2.78125, + "learning_rate": 4.712790853070165e-05, + "loss": 0.8581, + "step": 8836 + }, + { + "epoch": 0.15666254536752683, + "grad_norm": 2.8125, + "learning_rate": 4.712660869631912e-05, + "loss": 0.8173, + "step": 8838 + }, + { + "epoch": 0.1566979974031384, + "grad_norm": 2.671875, + "learning_rate": 4.712530858580111e-05, + "loss": 0.7951, + "step": 8840 + }, + { + "epoch": 0.15673344943874995, + "grad_norm": 2.5625, + "learning_rate": 4.712400819916387e-05, + "loss": 0.7993, + "step": 8842 + }, + { + "epoch": 0.15676890147436154, + "grad_norm": 2.875, + "learning_rate": 4.7122707536423615e-05, + "loss": 0.787, + "step": 8844 + }, + { + "epoch": 0.1568043535099731, + "grad_norm": 2.921875, + "learning_rate": 4.712140659759658e-05, + "loss": 0.7964, + "step": 8846 + }, + { + "epoch": 0.15683980554558466, + "grad_norm": 2.5, + "learning_rate": 4.7120105382698996e-05, + "loss": 0.8216, + "step": 8848 + }, + { + "epoch": 0.15687525758119625, + "grad_norm": 2.890625, + "learning_rate": 4.71188038917471e-05, + "loss": 0.8313, + "step": 8850 + }, + { + "epoch": 0.1569107096168078, + "grad_norm": 2.84375, + "learning_rate": 4.711750212475714e-05, + "loss": 0.8019, + "step": 8852 + }, + { + "epoch": 0.15694616165241937, + "grad_norm": 2.703125, + "learning_rate": 4.711620008174536e-05, + "loss": 0.8438, + "step": 8854 + }, + { + "epoch": 0.15698161368803096, + "grad_norm": 2.9375, + "learning_rate": 4.711489776272802e-05, + "loss": 0.8425, + "step": 8856 + }, + { + "epoch": 0.15701706572364252, + "grad_norm": 2.8125, + "learning_rate": 4.711359516772135e-05, + "loss": 0.8696, + "step": 8858 + }, + { + "epoch": 0.15705251775925408, + "grad_norm": 2.9375, + "learning_rate": 4.711229229674162e-05, + "loss": 0.816, + "step": 8860 + }, + { + "epoch": 0.15708796979486567, + "grad_norm": 2.796875, + "learning_rate": 4.7110989149805095e-05, + "loss": 0.8127, + "step": 8862 + }, + { + "epoch": 0.15712342183047723, + "grad_norm": 2.734375, + "learning_rate": 4.710968572692802e-05, + "loss": 0.7671, + "step": 8864 + }, + { + "epoch": 0.1571588738660888, + "grad_norm": 2.96875, + "learning_rate": 4.710838202812668e-05, + "loss": 0.8565, + "step": 8866 + }, + { + "epoch": 0.15719432590170038, + "grad_norm": 2.703125, + "learning_rate": 4.7107078053417335e-05, + "loss": 0.8202, + "step": 8868 + }, + { + "epoch": 0.15722977793731194, + "grad_norm": 2.6875, + "learning_rate": 4.710577380281626e-05, + "loss": 0.851, + "step": 8870 + }, + { + "epoch": 0.1572652299729235, + "grad_norm": 2.75, + "learning_rate": 4.710446927633973e-05, + "loss": 0.763, + "step": 8872 + }, + { + "epoch": 0.1573006820085351, + "grad_norm": 2.828125, + "learning_rate": 4.7103164474004037e-05, + "loss": 0.7902, + "step": 8874 + }, + { + "epoch": 0.15733613404414665, + "grad_norm": 2.78125, + "learning_rate": 4.710185939582544e-05, + "loss": 0.8135, + "step": 8876 + }, + { + "epoch": 0.1573715860797582, + "grad_norm": 2.78125, + "learning_rate": 4.7100554041820255e-05, + "loss": 0.8237, + "step": 8878 + }, + { + "epoch": 0.1574070381153698, + "grad_norm": 2.484375, + "learning_rate": 4.709924841200475e-05, + "loss": 0.7817, + "step": 8880 + }, + { + "epoch": 0.15744249015098136, + "grad_norm": 2.734375, + "learning_rate": 4.709794250639523e-05, + "loss": 0.7556, + "step": 8882 + }, + { + "epoch": 0.15747794218659292, + "grad_norm": 2.6875, + "learning_rate": 4.709663632500799e-05, + "loss": 0.8198, + "step": 8884 + }, + { + "epoch": 0.1575133942222045, + "grad_norm": 2.5625, + "learning_rate": 4.7095329867859335e-05, + "loss": 0.7842, + "step": 8886 + }, + { + "epoch": 0.15754884625781607, + "grad_norm": 2.6875, + "learning_rate": 4.709402313496556e-05, + "loss": 0.8254, + "step": 8888 + }, + { + "epoch": 0.15758429829342763, + "grad_norm": 2.609375, + "learning_rate": 4.709271612634298e-05, + "loss": 0.8064, + "step": 8890 + }, + { + "epoch": 0.15761975032903922, + "grad_norm": 2.640625, + "learning_rate": 4.7091408842007904e-05, + "loss": 0.7782, + "step": 8892 + }, + { + "epoch": 0.15765520236465078, + "grad_norm": 2.890625, + "learning_rate": 4.709010128197665e-05, + "loss": 0.8461, + "step": 8894 + }, + { + "epoch": 0.15769065440026234, + "grad_norm": 3.171875, + "learning_rate": 4.708879344626553e-05, + "loss": 0.8172, + "step": 8896 + }, + { + "epoch": 0.15772610643587393, + "grad_norm": 2.828125, + "learning_rate": 4.7087485334890866e-05, + "loss": 0.7925, + "step": 8898 + }, + { + "epoch": 0.15776155847148549, + "grad_norm": 2.609375, + "learning_rate": 4.708617694786899e-05, + "loss": 0.839, + "step": 8900 + }, + { + "epoch": 0.15779701050709705, + "grad_norm": 3.0625, + "learning_rate": 4.7084868285216234e-05, + "loss": 0.818, + "step": 8902 + }, + { + "epoch": 0.15783246254270863, + "grad_norm": 2.671875, + "learning_rate": 4.70835593469489e-05, + "loss": 0.8456, + "step": 8904 + }, + { + "epoch": 0.1578679145783202, + "grad_norm": 2.828125, + "learning_rate": 4.708225013308336e-05, + "loss": 0.8227, + "step": 8906 + }, + { + "epoch": 0.15790336661393176, + "grad_norm": 2.984375, + "learning_rate": 4.708094064363594e-05, + "loss": 0.83, + "step": 8908 + }, + { + "epoch": 0.15793881864954334, + "grad_norm": 2.703125, + "learning_rate": 4.707963087862297e-05, + "loss": 0.8352, + "step": 8910 + }, + { + "epoch": 0.1579742706851549, + "grad_norm": 2.9375, + "learning_rate": 4.7078320838060816e-05, + "loss": 0.8151, + "step": 8912 + }, + { + "epoch": 0.15800972272076647, + "grad_norm": 2.796875, + "learning_rate": 4.7077010521965816e-05, + "loss": 0.8006, + "step": 8914 + }, + { + "epoch": 0.15804517475637805, + "grad_norm": 2.609375, + "learning_rate": 4.707569993035431e-05, + "loss": 0.8574, + "step": 8916 + }, + { + "epoch": 0.15808062679198961, + "grad_norm": 2.75, + "learning_rate": 4.707438906324267e-05, + "loss": 0.7731, + "step": 8918 + }, + { + "epoch": 0.15811607882760118, + "grad_norm": 2.96875, + "learning_rate": 4.707307792064727e-05, + "loss": 0.8476, + "step": 8920 + }, + { + "epoch": 0.15815153086321276, + "grad_norm": 2.609375, + "learning_rate": 4.707176650258444e-05, + "loss": 0.8064, + "step": 8922 + }, + { + "epoch": 0.15818698289882432, + "grad_norm": 2.640625, + "learning_rate": 4.707045480907056e-05, + "loss": 0.8639, + "step": 8924 + }, + { + "epoch": 0.15822243493443588, + "grad_norm": 2.703125, + "learning_rate": 4.706914284012201e-05, + "loss": 0.8088, + "step": 8926 + }, + { + "epoch": 0.15825788697004747, + "grad_norm": 2.71875, + "learning_rate": 4.706783059575515e-05, + "loss": 0.7744, + "step": 8928 + }, + { + "epoch": 0.15829333900565903, + "grad_norm": 2.6875, + "learning_rate": 4.706651807598635e-05, + "loss": 0.8192, + "step": 8930 + }, + { + "epoch": 0.1583287910412706, + "grad_norm": 2.890625, + "learning_rate": 4.706520528083202e-05, + "loss": 0.8281, + "step": 8932 + }, + { + "epoch": 0.15836424307688218, + "grad_norm": 2.703125, + "learning_rate": 4.706389221030851e-05, + "loss": 0.7792, + "step": 8934 + }, + { + "epoch": 0.15839969511249374, + "grad_norm": 2.4375, + "learning_rate": 4.706257886443222e-05, + "loss": 0.7977, + "step": 8936 + }, + { + "epoch": 0.1584351471481053, + "grad_norm": 2.734375, + "learning_rate": 4.706126524321954e-05, + "loss": 0.8396, + "step": 8938 + }, + { + "epoch": 0.1584705991837169, + "grad_norm": 2.6875, + "learning_rate": 4.705995134668688e-05, + "loss": 0.8214, + "step": 8940 + }, + { + "epoch": 0.15850605121932845, + "grad_norm": 2.609375, + "learning_rate": 4.7058637174850604e-05, + "loss": 0.823, + "step": 8942 + }, + { + "epoch": 0.15854150325494, + "grad_norm": 2.796875, + "learning_rate": 4.7057322727727145e-05, + "loss": 0.8337, + "step": 8944 + }, + { + "epoch": 0.1585769552905516, + "grad_norm": 2.6875, + "learning_rate": 4.7056008005332886e-05, + "loss": 0.8593, + "step": 8946 + }, + { + "epoch": 0.15861240732616316, + "grad_norm": 3.359375, + "learning_rate": 4.7054693007684245e-05, + "loss": 0.8306, + "step": 8948 + }, + { + "epoch": 0.15864785936177472, + "grad_norm": 2.71875, + "learning_rate": 4.705337773479762e-05, + "loss": 0.8124, + "step": 8950 + }, + { + "epoch": 0.1586833113973863, + "grad_norm": 2.515625, + "learning_rate": 4.7052062186689435e-05, + "loss": 0.8111, + "step": 8952 + }, + { + "epoch": 0.15871876343299787, + "grad_norm": 2.859375, + "learning_rate": 4.705074636337612e-05, + "loss": 0.808, + "step": 8954 + }, + { + "epoch": 0.15875421546860943, + "grad_norm": 2.90625, + "learning_rate": 4.704943026487407e-05, + "loss": 0.8285, + "step": 8956 + }, + { + "epoch": 0.15878966750422102, + "grad_norm": 2.6875, + "learning_rate": 4.704811389119973e-05, + "loss": 0.8562, + "step": 8958 + }, + { + "epoch": 0.15882511953983258, + "grad_norm": 2.515625, + "learning_rate": 4.704679724236952e-05, + "loss": 0.8075, + "step": 8960 + }, + { + "epoch": 0.15886057157544414, + "grad_norm": 2.75, + "learning_rate": 4.704548031839987e-05, + "loss": 0.8646, + "step": 8962 + }, + { + "epoch": 0.15889602361105573, + "grad_norm": 2.578125, + "learning_rate": 4.704416311930722e-05, + "loss": 0.8135, + "step": 8964 + }, + { + "epoch": 0.1589314756466673, + "grad_norm": 2.84375, + "learning_rate": 4.7042845645108e-05, + "loss": 0.8081, + "step": 8966 + }, + { + "epoch": 0.15896692768227885, + "grad_norm": 3.15625, + "learning_rate": 4.7041527895818664e-05, + "loss": 0.7951, + "step": 8968 + }, + { + "epoch": 0.1590023797178904, + "grad_norm": 2.609375, + "learning_rate": 4.704020987145565e-05, + "loss": 0.7943, + "step": 8970 + }, + { + "epoch": 0.159037831753502, + "grad_norm": 2.515625, + "learning_rate": 4.70388915720354e-05, + "loss": 0.7993, + "step": 8972 + }, + { + "epoch": 0.15907328378911356, + "grad_norm": 2.65625, + "learning_rate": 4.703757299757439e-05, + "loss": 0.8204, + "step": 8974 + }, + { + "epoch": 0.15910873582472512, + "grad_norm": 2.625, + "learning_rate": 4.703625414808904e-05, + "loss": 0.8297, + "step": 8976 + }, + { + "epoch": 0.1591441878603367, + "grad_norm": 2.9375, + "learning_rate": 4.703493502359584e-05, + "loss": 0.8265, + "step": 8978 + }, + { + "epoch": 0.15917963989594827, + "grad_norm": 2.75, + "learning_rate": 4.703361562411124e-05, + "loss": 0.8388, + "step": 8980 + }, + { + "epoch": 0.15921509193155983, + "grad_norm": 3.046875, + "learning_rate": 4.7032295949651693e-05, + "loss": 0.8035, + "step": 8982 + }, + { + "epoch": 0.15925054396717142, + "grad_norm": 2.84375, + "learning_rate": 4.70309760002337e-05, + "loss": 0.8611, + "step": 8984 + }, + { + "epoch": 0.15928599600278298, + "grad_norm": 3.015625, + "learning_rate": 4.702965577587371e-05, + "loss": 0.8407, + "step": 8986 + }, + { + "epoch": 0.15932144803839454, + "grad_norm": 2.71875, + "learning_rate": 4.7028335276588195e-05, + "loss": 0.7786, + "step": 8988 + }, + { + "epoch": 0.15935690007400613, + "grad_norm": 2.859375, + "learning_rate": 4.702701450239365e-05, + "loss": 0.8373, + "step": 8990 + }, + { + "epoch": 0.1593923521096177, + "grad_norm": 2.765625, + "learning_rate": 4.7025693453306555e-05, + "loss": 0.8483, + "step": 8992 + }, + { + "epoch": 0.15942780414522925, + "grad_norm": 2.734375, + "learning_rate": 4.702437212934339e-05, + "loss": 0.8224, + "step": 8994 + }, + { + "epoch": 0.15946325618084084, + "grad_norm": 2.921875, + "learning_rate": 4.702305053052065e-05, + "loss": 0.8397, + "step": 8996 + }, + { + "epoch": 0.1594987082164524, + "grad_norm": 2.984375, + "learning_rate": 4.702172865685483e-05, + "loss": 0.8114, + "step": 8998 + }, + { + "epoch": 0.15953416025206396, + "grad_norm": 2.859375, + "learning_rate": 4.702040650836241e-05, + "loss": 0.802, + "step": 9000 + }, + { + "epoch": 0.15956961228767555, + "grad_norm": 2.90625, + "learning_rate": 4.701908408505992e-05, + "loss": 0.8164, + "step": 9002 + }, + { + "epoch": 0.1596050643232871, + "grad_norm": 2.515625, + "learning_rate": 4.701776138696383e-05, + "loss": 0.7726, + "step": 9004 + }, + { + "epoch": 0.15964051635889867, + "grad_norm": 2.984375, + "learning_rate": 4.7016438414090674e-05, + "loss": 0.8649, + "step": 9006 + }, + { + "epoch": 0.15967596839451026, + "grad_norm": 2.6875, + "learning_rate": 4.7015115166456954e-05, + "loss": 0.7944, + "step": 9008 + }, + { + "epoch": 0.15971142043012182, + "grad_norm": 2.765625, + "learning_rate": 4.701379164407917e-05, + "loss": 0.8098, + "step": 9010 + }, + { + "epoch": 0.15974687246573338, + "grad_norm": 2.8125, + "learning_rate": 4.701246784697386e-05, + "loss": 0.8219, + "step": 9012 + }, + { + "epoch": 0.15978232450134497, + "grad_norm": 2.84375, + "learning_rate": 4.701114377515754e-05, + "loss": 0.8315, + "step": 9014 + }, + { + "epoch": 0.15981777653695653, + "grad_norm": 2.390625, + "learning_rate": 4.7009819428646726e-05, + "loss": 0.7966, + "step": 9016 + }, + { + "epoch": 0.1598532285725681, + "grad_norm": 2.578125, + "learning_rate": 4.7008494807457954e-05, + "loss": 0.7995, + "step": 9018 + }, + { + "epoch": 0.15988868060817968, + "grad_norm": 3.078125, + "learning_rate": 4.700716991160775e-05, + "loss": 0.8324, + "step": 9020 + }, + { + "epoch": 0.15992413264379124, + "grad_norm": 2.78125, + "learning_rate": 4.7005844741112646e-05, + "loss": 0.841, + "step": 9022 + }, + { + "epoch": 0.1599595846794028, + "grad_norm": 2.625, + "learning_rate": 4.700451929598918e-05, + "loss": 0.8093, + "step": 9024 + }, + { + "epoch": 0.1599950367150144, + "grad_norm": 3.125, + "learning_rate": 4.70031935762539e-05, + "loss": 0.8544, + "step": 9026 + }, + { + "epoch": 0.16003048875062595, + "grad_norm": 2.8125, + "learning_rate": 4.7001867581923355e-05, + "loss": 0.8258, + "step": 9028 + }, + { + "epoch": 0.1600659407862375, + "grad_norm": 2.59375, + "learning_rate": 4.700054131301407e-05, + "loss": 0.7826, + "step": 9030 + }, + { + "epoch": 0.1601013928218491, + "grad_norm": 2.734375, + "learning_rate": 4.699921476954262e-05, + "loss": 0.8419, + "step": 9032 + }, + { + "epoch": 0.16013684485746066, + "grad_norm": 2.5625, + "learning_rate": 4.699788795152555e-05, + "loss": 0.7868, + "step": 9034 + }, + { + "epoch": 0.16017229689307222, + "grad_norm": 2.5625, + "learning_rate": 4.699656085897942e-05, + "loss": 0.8306, + "step": 9036 + }, + { + "epoch": 0.1602077489286838, + "grad_norm": 2.953125, + "learning_rate": 4.69952334919208e-05, + "loss": 0.8282, + "step": 9038 + }, + { + "epoch": 0.16024320096429537, + "grad_norm": 2.625, + "learning_rate": 4.6993905850366237e-05, + "loss": 0.825, + "step": 9040 + }, + { + "epoch": 0.16027865299990693, + "grad_norm": 2.8125, + "learning_rate": 4.6992577934332315e-05, + "loss": 0.814, + "step": 9042 + }, + { + "epoch": 0.16031410503551852, + "grad_norm": 2.640625, + "learning_rate": 4.6991249743835595e-05, + "loss": 0.8185, + "step": 9044 + }, + { + "epoch": 0.16034955707113008, + "grad_norm": 2.75, + "learning_rate": 4.6989921278892665e-05, + "loss": 0.8195, + "step": 9046 + }, + { + "epoch": 0.16038500910674164, + "grad_norm": 3.015625, + "learning_rate": 4.698859253952009e-05, + "loss": 0.8143, + "step": 9048 + }, + { + "epoch": 0.16042046114235323, + "grad_norm": 2.71875, + "learning_rate": 4.6987263525734474e-05, + "loss": 0.7709, + "step": 9050 + }, + { + "epoch": 0.1604559131779648, + "grad_norm": 2.6875, + "learning_rate": 4.698593423755238e-05, + "loss": 0.8342, + "step": 9052 + }, + { + "epoch": 0.16049136521357635, + "grad_norm": 2.65625, + "learning_rate": 4.6984604674990407e-05, + "loss": 0.8358, + "step": 9054 + }, + { + "epoch": 0.16052681724918794, + "grad_norm": 2.9375, + "learning_rate": 4.698327483806515e-05, + "loss": 0.7821, + "step": 9056 + }, + { + "epoch": 0.1605622692847995, + "grad_norm": 3.28125, + "learning_rate": 4.698194472679319e-05, + "loss": 0.8044, + "step": 9058 + }, + { + "epoch": 0.16059772132041106, + "grad_norm": 3.046875, + "learning_rate": 4.698061434119115e-05, + "loss": 0.8232, + "step": 9060 + }, + { + "epoch": 0.16063317335602265, + "grad_norm": 2.953125, + "learning_rate": 4.697928368127562e-05, + "loss": 0.8248, + "step": 9062 + }, + { + "epoch": 0.1606686253916342, + "grad_norm": 2.84375, + "learning_rate": 4.6977952747063204e-05, + "loss": 0.8056, + "step": 9064 + }, + { + "epoch": 0.16070407742724577, + "grad_norm": 2.78125, + "learning_rate": 4.697662153857052e-05, + "loss": 0.7992, + "step": 9066 + }, + { + "epoch": 0.16073952946285736, + "grad_norm": 2.90625, + "learning_rate": 4.697529005581417e-05, + "loss": 0.8145, + "step": 9068 + }, + { + "epoch": 0.16077498149846892, + "grad_norm": 2.984375, + "learning_rate": 4.697395829881078e-05, + "loss": 0.8375, + "step": 9070 + }, + { + "epoch": 0.16081043353408048, + "grad_norm": 3.0625, + "learning_rate": 4.697262626757697e-05, + "loss": 0.8664, + "step": 9072 + }, + { + "epoch": 0.16084588556969207, + "grad_norm": 2.734375, + "learning_rate": 4.697129396212936e-05, + "loss": 0.8609, + "step": 9074 + }, + { + "epoch": 0.16088133760530363, + "grad_norm": 2.609375, + "learning_rate": 4.696996138248457e-05, + "loss": 0.7714, + "step": 9076 + }, + { + "epoch": 0.1609167896409152, + "grad_norm": 2.671875, + "learning_rate": 4.696862852865925e-05, + "loss": 0.7998, + "step": 9078 + }, + { + "epoch": 0.16095224167652677, + "grad_norm": 2.453125, + "learning_rate": 4.6967295400670016e-05, + "loss": 0.8001, + "step": 9080 + }, + { + "epoch": 0.16098769371213834, + "grad_norm": 2.640625, + "learning_rate": 4.696596199853351e-05, + "loss": 0.8074, + "step": 9082 + }, + { + "epoch": 0.1610231457477499, + "grad_norm": 2.71875, + "learning_rate": 4.6964628322266374e-05, + "loss": 0.842, + "step": 9084 + }, + { + "epoch": 0.16105859778336148, + "grad_norm": 2.765625, + "learning_rate": 4.696329437188525e-05, + "loss": 0.7974, + "step": 9086 + }, + { + "epoch": 0.16109404981897305, + "grad_norm": 2.828125, + "learning_rate": 4.696196014740679e-05, + "loss": 0.7829, + "step": 9088 + }, + { + "epoch": 0.1611295018545846, + "grad_norm": 2.4375, + "learning_rate": 4.696062564884764e-05, + "loss": 0.8241, + "step": 9090 + }, + { + "epoch": 0.1611649538901962, + "grad_norm": 2.8125, + "learning_rate": 4.695929087622446e-05, + "loss": 0.8322, + "step": 9092 + }, + { + "epoch": 0.16120040592580775, + "grad_norm": 2.75, + "learning_rate": 4.6957955829553904e-05, + "loss": 0.779, + "step": 9094 + }, + { + "epoch": 0.16123585796141932, + "grad_norm": 2.625, + "learning_rate": 4.695662050885262e-05, + "loss": 0.7793, + "step": 9096 + }, + { + "epoch": 0.1612713099970309, + "grad_norm": 2.9375, + "learning_rate": 4.69552849141373e-05, + "loss": 0.8895, + "step": 9098 + }, + { + "epoch": 0.16130676203264246, + "grad_norm": 2.625, + "learning_rate": 4.6953949045424587e-05, + "loss": 0.8151, + "step": 9100 + }, + { + "epoch": 0.16134221406825403, + "grad_norm": 2.703125, + "learning_rate": 4.6952612902731165e-05, + "loss": 0.8219, + "step": 9102 + }, + { + "epoch": 0.1613776661038656, + "grad_norm": 2.421875, + "learning_rate": 4.6951276486073706e-05, + "loss": 0.7823, + "step": 9104 + }, + { + "epoch": 0.16141311813947717, + "grad_norm": 2.96875, + "learning_rate": 4.69499397954689e-05, + "loss": 0.845, + "step": 9106 + }, + { + "epoch": 0.16144857017508873, + "grad_norm": 2.765625, + "learning_rate": 4.6948602830933404e-05, + "loss": 0.838, + "step": 9108 + }, + { + "epoch": 0.16148402221070032, + "grad_norm": 2.8125, + "learning_rate": 4.694726559248392e-05, + "loss": 0.7893, + "step": 9110 + }, + { + "epoch": 0.16151947424631188, + "grad_norm": 2.953125, + "learning_rate": 4.6945928080137134e-05, + "loss": 0.8171, + "step": 9112 + }, + { + "epoch": 0.16155492628192344, + "grad_norm": 2.5625, + "learning_rate": 4.694459029390973e-05, + "loss": 0.7906, + "step": 9114 + }, + { + "epoch": 0.16159037831753503, + "grad_norm": 2.765625, + "learning_rate": 4.694325223381842e-05, + "loss": 0.8033, + "step": 9116 + }, + { + "epoch": 0.1616258303531466, + "grad_norm": 3.265625, + "learning_rate": 4.694191389987988e-05, + "loss": 0.8416, + "step": 9118 + }, + { + "epoch": 0.16166128238875815, + "grad_norm": 3.0, + "learning_rate": 4.694057529211084e-05, + "loss": 0.8006, + "step": 9120 + }, + { + "epoch": 0.16169673442436974, + "grad_norm": 2.8125, + "learning_rate": 4.693923641052798e-05, + "loss": 0.8163, + "step": 9122 + }, + { + "epoch": 0.1617321864599813, + "grad_norm": 2.703125, + "learning_rate": 4.693789725514802e-05, + "loss": 0.8369, + "step": 9124 + }, + { + "epoch": 0.16176763849559286, + "grad_norm": 2.921875, + "learning_rate": 4.693655782598768e-05, + "loss": 0.8142, + "step": 9126 + }, + { + "epoch": 0.16180309053120445, + "grad_norm": 2.734375, + "learning_rate": 4.693521812306366e-05, + "loss": 0.8145, + "step": 9128 + }, + { + "epoch": 0.161838542566816, + "grad_norm": 2.78125, + "learning_rate": 4.6933878146392685e-05, + "loss": 0.8574, + "step": 9130 + }, + { + "epoch": 0.16187399460242757, + "grad_norm": 2.78125, + "learning_rate": 4.693253789599148e-05, + "loss": 0.8351, + "step": 9132 + }, + { + "epoch": 0.16190944663803916, + "grad_norm": 2.796875, + "learning_rate": 4.693119737187677e-05, + "loss": 0.8218, + "step": 9134 + }, + { + "epoch": 0.16194489867365072, + "grad_norm": 2.640625, + "learning_rate": 4.692985657406529e-05, + "loss": 0.8088, + "step": 9136 + }, + { + "epoch": 0.16198035070926228, + "grad_norm": 2.484375, + "learning_rate": 4.692851550257377e-05, + "loss": 0.7974, + "step": 9138 + }, + { + "epoch": 0.16201580274487384, + "grad_norm": 2.734375, + "learning_rate": 4.6927174157418934e-05, + "loss": 0.8186, + "step": 9140 + }, + { + "epoch": 0.16205125478048543, + "grad_norm": 2.703125, + "learning_rate": 4.6925832538617536e-05, + "loss": 0.8193, + "step": 9142 + }, + { + "epoch": 0.162086706816097, + "grad_norm": 2.8125, + "learning_rate": 4.692449064618631e-05, + "loss": 0.7591, + "step": 9144 + }, + { + "epoch": 0.16212215885170855, + "grad_norm": 2.609375, + "learning_rate": 4.692314848014202e-05, + "loss": 0.817, + "step": 9146 + }, + { + "epoch": 0.16215761088732014, + "grad_norm": 2.703125, + "learning_rate": 4.6921806040501394e-05, + "loss": 0.8258, + "step": 9148 + }, + { + "epoch": 0.1621930629229317, + "grad_norm": 2.921875, + "learning_rate": 4.6920463327281196e-05, + "loss": 0.8466, + "step": 9150 + }, + { + "epoch": 0.16222851495854326, + "grad_norm": 2.78125, + "learning_rate": 4.691912034049818e-05, + "loss": 0.8232, + "step": 9152 + }, + { + "epoch": 0.16226396699415485, + "grad_norm": 3.390625, + "learning_rate": 4.691777708016911e-05, + "loss": 0.8537, + "step": 9154 + }, + { + "epoch": 0.1622994190297664, + "grad_norm": 2.734375, + "learning_rate": 4.6916433546310746e-05, + "loss": 0.8061, + "step": 9156 + }, + { + "epoch": 0.16233487106537797, + "grad_norm": 2.8125, + "learning_rate": 4.691508973893985e-05, + "loss": 0.8205, + "step": 9158 + }, + { + "epoch": 0.16237032310098956, + "grad_norm": 2.53125, + "learning_rate": 4.691374565807321e-05, + "loss": 0.8232, + "step": 9160 + }, + { + "epoch": 0.16240577513660112, + "grad_norm": 2.53125, + "learning_rate": 4.691240130372758e-05, + "loss": 0.7885, + "step": 9162 + }, + { + "epoch": 0.16244122717221268, + "grad_norm": 2.53125, + "learning_rate": 4.6911056675919754e-05, + "loss": 0.8036, + "step": 9164 + }, + { + "epoch": 0.16247667920782427, + "grad_norm": 2.546875, + "learning_rate": 4.69097117746665e-05, + "loss": 0.807, + "step": 9166 + }, + { + "epoch": 0.16251213124343583, + "grad_norm": 2.828125, + "learning_rate": 4.69083665999846e-05, + "loss": 0.7837, + "step": 9168 + }, + { + "epoch": 0.1625475832790474, + "grad_norm": 2.71875, + "learning_rate": 4.690702115189086e-05, + "loss": 0.8121, + "step": 9170 + }, + { + "epoch": 0.16258303531465898, + "grad_norm": 2.84375, + "learning_rate": 4.690567543040205e-05, + "loss": 0.8367, + "step": 9172 + }, + { + "epoch": 0.16261848735027054, + "grad_norm": 2.84375, + "learning_rate": 4.690432943553498e-05, + "loss": 0.8392, + "step": 9174 + }, + { + "epoch": 0.1626539393858821, + "grad_norm": 3.1875, + "learning_rate": 4.690298316730644e-05, + "loss": 0.8201, + "step": 9176 + }, + { + "epoch": 0.1626893914214937, + "grad_norm": 2.453125, + "learning_rate": 4.690163662573323e-05, + "loss": 0.7778, + "step": 9178 + }, + { + "epoch": 0.16272484345710525, + "grad_norm": 2.578125, + "learning_rate": 4.690028981083215e-05, + "loss": 0.7806, + "step": 9180 + }, + { + "epoch": 0.1627602954927168, + "grad_norm": 2.984375, + "learning_rate": 4.6898942722620024e-05, + "loss": 0.8281, + "step": 9182 + }, + { + "epoch": 0.1627957475283284, + "grad_norm": 2.390625, + "learning_rate": 4.689759536111364e-05, + "loss": 0.78, + "step": 9184 + }, + { + "epoch": 0.16283119956393996, + "grad_norm": 3.109375, + "learning_rate": 4.6896247726329846e-05, + "loss": 0.7946, + "step": 9186 + }, + { + "epoch": 0.16286665159955152, + "grad_norm": 2.875, + "learning_rate": 4.689489981828543e-05, + "loss": 0.8081, + "step": 9188 + }, + { + "epoch": 0.1629021036351631, + "grad_norm": 2.9375, + "learning_rate": 4.6893551636997223e-05, + "loss": 0.8461, + "step": 9190 + }, + { + "epoch": 0.16293755567077467, + "grad_norm": 2.59375, + "learning_rate": 4.689220318248207e-05, + "loss": 0.7735, + "step": 9192 + }, + { + "epoch": 0.16297300770638623, + "grad_norm": 2.578125, + "learning_rate": 4.689085445475676e-05, + "loss": 0.7966, + "step": 9194 + }, + { + "epoch": 0.16300845974199782, + "grad_norm": 2.875, + "learning_rate": 4.688950545383815e-05, + "loss": 0.7951, + "step": 9196 + }, + { + "epoch": 0.16304391177760938, + "grad_norm": 2.953125, + "learning_rate": 4.688815617974307e-05, + "loss": 0.8245, + "step": 9198 + }, + { + "epoch": 0.16307936381322094, + "grad_norm": 3.015625, + "learning_rate": 4.688680663248837e-05, + "loss": 0.8025, + "step": 9200 + }, + { + "epoch": 0.16311481584883253, + "grad_norm": 2.921875, + "learning_rate": 4.688545681209087e-05, + "loss": 0.8242, + "step": 9202 + }, + { + "epoch": 0.1631502678844441, + "grad_norm": 2.921875, + "learning_rate": 4.6884106718567435e-05, + "loss": 0.7914, + "step": 9204 + }, + { + "epoch": 0.16318571992005565, + "grad_norm": 2.734375, + "learning_rate": 4.688275635193491e-05, + "loss": 0.8505, + "step": 9206 + }, + { + "epoch": 0.16322117195566724, + "grad_norm": 2.859375, + "learning_rate": 4.688140571221014e-05, + "loss": 0.8436, + "step": 9208 + }, + { + "epoch": 0.1632566239912788, + "grad_norm": 2.6875, + "learning_rate": 4.6880054799409976e-05, + "loss": 0.8364, + "step": 9210 + }, + { + "epoch": 0.16329207602689036, + "grad_norm": 2.84375, + "learning_rate": 4.687870361355129e-05, + "loss": 0.8385, + "step": 9212 + }, + { + "epoch": 0.16332752806250195, + "grad_norm": 2.828125, + "learning_rate": 4.6877352154650945e-05, + "loss": 0.8277, + "step": 9214 + }, + { + "epoch": 0.1633629800981135, + "grad_norm": 2.859375, + "learning_rate": 4.6876000422725795e-05, + "loss": 0.8034, + "step": 9216 + }, + { + "epoch": 0.16339843213372507, + "grad_norm": 2.828125, + "learning_rate": 4.6874648417792724e-05, + "loss": 0.7925, + "step": 9218 + }, + { + "epoch": 0.16343388416933666, + "grad_norm": 2.71875, + "learning_rate": 4.687329613986859e-05, + "loss": 0.7897, + "step": 9220 + }, + { + "epoch": 0.16346933620494822, + "grad_norm": 2.484375, + "learning_rate": 4.687194358897028e-05, + "loss": 0.812, + "step": 9222 + }, + { + "epoch": 0.16350478824055978, + "grad_norm": 2.75, + "learning_rate": 4.687059076511467e-05, + "loss": 0.7761, + "step": 9224 + }, + { + "epoch": 0.16354024027617137, + "grad_norm": 2.78125, + "learning_rate": 4.686923766831864e-05, + "loss": 0.7916, + "step": 9226 + }, + { + "epoch": 0.16357569231178293, + "grad_norm": 2.96875, + "learning_rate": 4.686788429859907e-05, + "loss": 0.7843, + "step": 9228 + }, + { + "epoch": 0.1636111443473945, + "grad_norm": 3.015625, + "learning_rate": 4.686653065597287e-05, + "loss": 0.8011, + "step": 9230 + }, + { + "epoch": 0.16364659638300608, + "grad_norm": 2.78125, + "learning_rate": 4.686517674045693e-05, + "loss": 0.8351, + "step": 9232 + }, + { + "epoch": 0.16368204841861764, + "grad_norm": 2.90625, + "learning_rate": 4.686382255206813e-05, + "loss": 0.7966, + "step": 9234 + }, + { + "epoch": 0.1637175004542292, + "grad_norm": 2.609375, + "learning_rate": 4.686246809082337e-05, + "loss": 0.8251, + "step": 9236 + }, + { + "epoch": 0.1637529524898408, + "grad_norm": 2.921875, + "learning_rate": 4.6861113356739574e-05, + "loss": 0.807, + "step": 9238 + }, + { + "epoch": 0.16378840452545235, + "grad_norm": 3.0, + "learning_rate": 4.6859758349833626e-05, + "loss": 0.8265, + "step": 9240 + }, + { + "epoch": 0.1638238565610639, + "grad_norm": 2.921875, + "learning_rate": 4.6858403070122456e-05, + "loss": 0.8123, + "step": 9242 + }, + { + "epoch": 0.1638593085966755, + "grad_norm": 2.671875, + "learning_rate": 4.685704751762296e-05, + "loss": 0.8313, + "step": 9244 + }, + { + "epoch": 0.16389476063228706, + "grad_norm": 2.671875, + "learning_rate": 4.6855691692352074e-05, + "loss": 0.7875, + "step": 9246 + }, + { + "epoch": 0.16393021266789862, + "grad_norm": 3.28125, + "learning_rate": 4.6854335594326704e-05, + "loss": 0.8297, + "step": 9248 + }, + { + "epoch": 0.1639656647035102, + "grad_norm": 2.703125, + "learning_rate": 4.685297922356378e-05, + "loss": 0.8047, + "step": 9250 + }, + { + "epoch": 0.16400111673912177, + "grad_norm": 2.609375, + "learning_rate": 4.685162258008022e-05, + "loss": 0.8121, + "step": 9252 + }, + { + "epoch": 0.16403656877473333, + "grad_norm": 3.078125, + "learning_rate": 4.6850265663892964e-05, + "loss": 0.8477, + "step": 9254 + }, + { + "epoch": 0.16407202081034492, + "grad_norm": 2.625, + "learning_rate": 4.684890847501894e-05, + "loss": 0.8041, + "step": 9256 + }, + { + "epoch": 0.16410747284595648, + "grad_norm": 2.6875, + "learning_rate": 4.68475510134751e-05, + "loss": 0.7815, + "step": 9258 + }, + { + "epoch": 0.16414292488156804, + "grad_norm": 2.484375, + "learning_rate": 4.684619327927836e-05, + "loss": 0.8229, + "step": 9260 + }, + { + "epoch": 0.16417837691717962, + "grad_norm": 2.96875, + "learning_rate": 4.6844835272445686e-05, + "loss": 0.8021, + "step": 9262 + }, + { + "epoch": 0.16421382895279119, + "grad_norm": 2.734375, + "learning_rate": 4.684347699299402e-05, + "loss": 0.8477, + "step": 9264 + }, + { + "epoch": 0.16424928098840275, + "grad_norm": 2.578125, + "learning_rate": 4.6842118440940306e-05, + "loss": 0.7934, + "step": 9266 + }, + { + "epoch": 0.16428473302401433, + "grad_norm": 2.484375, + "learning_rate": 4.684075961630151e-05, + "loss": 0.7917, + "step": 9268 + }, + { + "epoch": 0.1643201850596259, + "grad_norm": 2.609375, + "learning_rate": 4.683940051909458e-05, + "loss": 0.7748, + "step": 9270 + }, + { + "epoch": 0.16435563709523746, + "grad_norm": 2.875, + "learning_rate": 4.6838041149336476e-05, + "loss": 0.8254, + "step": 9272 + }, + { + "epoch": 0.16439108913084904, + "grad_norm": 2.734375, + "learning_rate": 4.683668150704417e-05, + "loss": 0.8116, + "step": 9274 + }, + { + "epoch": 0.1644265411664606, + "grad_norm": 2.671875, + "learning_rate": 4.683532159223463e-05, + "loss": 0.8229, + "step": 9276 + }, + { + "epoch": 0.16446199320207217, + "grad_norm": 2.703125, + "learning_rate": 4.683396140492481e-05, + "loss": 0.7687, + "step": 9278 + }, + { + "epoch": 0.16449744523768375, + "grad_norm": 2.71875, + "learning_rate": 4.683260094513171e-05, + "loss": 0.7867, + "step": 9280 + }, + { + "epoch": 0.16453289727329531, + "grad_norm": 2.890625, + "learning_rate": 4.6831240212872305e-05, + "loss": 0.8179, + "step": 9282 + }, + { + "epoch": 0.16456834930890687, + "grad_norm": 2.921875, + "learning_rate": 4.6829879208163564e-05, + "loss": 0.7967, + "step": 9284 + }, + { + "epoch": 0.16460380134451846, + "grad_norm": 2.546875, + "learning_rate": 4.682851793102248e-05, + "loss": 0.7793, + "step": 9286 + }, + { + "epoch": 0.16463925338013002, + "grad_norm": 2.6875, + "learning_rate": 4.682715638146603e-05, + "loss": 0.8315, + "step": 9288 + }, + { + "epoch": 0.16467470541574158, + "grad_norm": 2.6875, + "learning_rate": 4.682579455951122e-05, + "loss": 0.8027, + "step": 9290 + }, + { + "epoch": 0.16471015745135317, + "grad_norm": 2.84375, + "learning_rate": 4.682443246517503e-05, + "loss": 0.8081, + "step": 9292 + }, + { + "epoch": 0.16474560948696473, + "grad_norm": 2.625, + "learning_rate": 4.682307009847448e-05, + "loss": 0.8358, + "step": 9294 + }, + { + "epoch": 0.1647810615225763, + "grad_norm": 2.625, + "learning_rate": 4.6821707459426556e-05, + "loss": 0.8409, + "step": 9296 + }, + { + "epoch": 0.16481651355818788, + "grad_norm": 2.890625, + "learning_rate": 4.682034454804827e-05, + "loss": 0.841, + "step": 9298 + }, + { + "epoch": 0.16485196559379944, + "grad_norm": 2.5625, + "learning_rate": 4.681898136435663e-05, + "loss": 0.736, + "step": 9300 + }, + { + "epoch": 0.164887417629411, + "grad_norm": 2.703125, + "learning_rate": 4.6817617908368646e-05, + "loss": 0.8464, + "step": 9302 + }, + { + "epoch": 0.1649228696650226, + "grad_norm": 2.890625, + "learning_rate": 4.681625418010134e-05, + "loss": 0.8265, + "step": 9304 + }, + { + "epoch": 0.16495832170063415, + "grad_norm": 2.875, + "learning_rate": 4.6814890179571714e-05, + "loss": 0.7931, + "step": 9306 + }, + { + "epoch": 0.1649937737362457, + "grad_norm": 3.0, + "learning_rate": 4.681352590679681e-05, + "loss": 0.8446, + "step": 9308 + }, + { + "epoch": 0.16502922577185727, + "grad_norm": 2.8125, + "learning_rate": 4.681216136179365e-05, + "loss": 0.7961, + "step": 9310 + }, + { + "epoch": 0.16506467780746886, + "grad_norm": 2.71875, + "learning_rate": 4.681079654457925e-05, + "loss": 0.795, + "step": 9312 + }, + { + "epoch": 0.16510012984308042, + "grad_norm": 2.78125, + "learning_rate": 4.680943145517066e-05, + "loss": 0.7826, + "step": 9314 + }, + { + "epoch": 0.16513558187869198, + "grad_norm": 2.671875, + "learning_rate": 4.68080660935849e-05, + "loss": 0.7961, + "step": 9316 + }, + { + "epoch": 0.16517103391430357, + "grad_norm": 2.625, + "learning_rate": 4.680670045983903e-05, + "loss": 0.8179, + "step": 9318 + }, + { + "epoch": 0.16520648594991513, + "grad_norm": 2.375, + "learning_rate": 4.6805334553950064e-05, + "loss": 0.7996, + "step": 9320 + }, + { + "epoch": 0.1652419379855267, + "grad_norm": 2.765625, + "learning_rate": 4.6803968375935076e-05, + "loss": 0.7934, + "step": 9322 + }, + { + "epoch": 0.16527739002113828, + "grad_norm": 2.9375, + "learning_rate": 4.68026019258111e-05, + "loss": 0.8005, + "step": 9324 + }, + { + "epoch": 0.16531284205674984, + "grad_norm": 2.640625, + "learning_rate": 4.6801235203595195e-05, + "loss": 0.8145, + "step": 9326 + }, + { + "epoch": 0.1653482940923614, + "grad_norm": 2.84375, + "learning_rate": 4.679986820930441e-05, + "loss": 0.8435, + "step": 9328 + }, + { + "epoch": 0.165383746127973, + "grad_norm": 2.8125, + "learning_rate": 4.679850094295581e-05, + "loss": 0.7971, + "step": 9330 + }, + { + "epoch": 0.16541919816358455, + "grad_norm": 2.890625, + "learning_rate": 4.6797133404566466e-05, + "loss": 0.7846, + "step": 9332 + }, + { + "epoch": 0.1654546501991961, + "grad_norm": 2.875, + "learning_rate": 4.679576559415344e-05, + "loss": 0.8079, + "step": 9334 + }, + { + "epoch": 0.1654901022348077, + "grad_norm": 3.03125, + "learning_rate": 4.679439751173379e-05, + "loss": 0.8371, + "step": 9336 + }, + { + "epoch": 0.16552555427041926, + "grad_norm": 2.71875, + "learning_rate": 4.67930291573246e-05, + "loss": 0.8212, + "step": 9338 + }, + { + "epoch": 0.16556100630603082, + "grad_norm": 2.78125, + "learning_rate": 4.679166053094295e-05, + "loss": 0.7708, + "step": 9340 + }, + { + "epoch": 0.1655964583416424, + "grad_norm": 2.578125, + "learning_rate": 4.679029163260591e-05, + "loss": 0.8194, + "step": 9342 + }, + { + "epoch": 0.16563191037725397, + "grad_norm": 2.46875, + "learning_rate": 4.6788922462330575e-05, + "loss": 0.7825, + "step": 9344 + }, + { + "epoch": 0.16566736241286553, + "grad_norm": 2.8125, + "learning_rate": 4.6787553020134025e-05, + "loss": 0.8312, + "step": 9346 + }, + { + "epoch": 0.16570281444847712, + "grad_norm": 2.625, + "learning_rate": 4.6786183306033346e-05, + "loss": 0.849, + "step": 9348 + }, + { + "epoch": 0.16573826648408868, + "grad_norm": 2.984375, + "learning_rate": 4.678481332004564e-05, + "loss": 0.8005, + "step": 9350 + }, + { + "epoch": 0.16577371851970024, + "grad_norm": 3.171875, + "learning_rate": 4.6783443062188e-05, + "loss": 0.8119, + "step": 9352 + }, + { + "epoch": 0.16580917055531183, + "grad_norm": 2.765625, + "learning_rate": 4.678207253247753e-05, + "loss": 0.8433, + "step": 9354 + }, + { + "epoch": 0.1658446225909234, + "grad_norm": 2.640625, + "learning_rate": 4.6780701730931334e-05, + "loss": 0.8299, + "step": 9356 + }, + { + "epoch": 0.16588007462653495, + "grad_norm": 2.703125, + "learning_rate": 4.6779330657566513e-05, + "loss": 0.8287, + "step": 9358 + }, + { + "epoch": 0.16591552666214654, + "grad_norm": 2.875, + "learning_rate": 4.677795931240018e-05, + "loss": 0.8628, + "step": 9360 + }, + { + "epoch": 0.1659509786977581, + "grad_norm": 2.828125, + "learning_rate": 4.6776587695449455e-05, + "loss": 0.8101, + "step": 9362 + }, + { + "epoch": 0.16598643073336966, + "grad_norm": 2.859375, + "learning_rate": 4.677521580673145e-05, + "loss": 0.7773, + "step": 9364 + }, + { + "epoch": 0.16602188276898125, + "grad_norm": 2.5, + "learning_rate": 4.677384364626328e-05, + "loss": 0.7934, + "step": 9366 + }, + { + "epoch": 0.1660573348045928, + "grad_norm": 2.71875, + "learning_rate": 4.6772471214062086e-05, + "loss": 0.8239, + "step": 9368 + }, + { + "epoch": 0.16609278684020437, + "grad_norm": 3.125, + "learning_rate": 4.6771098510144984e-05, + "loss": 0.7893, + "step": 9370 + }, + { + "epoch": 0.16612823887581596, + "grad_norm": 2.765625, + "learning_rate": 4.676972553452911e-05, + "loss": 0.8154, + "step": 9372 + }, + { + "epoch": 0.16616369091142752, + "grad_norm": 2.71875, + "learning_rate": 4.676835228723159e-05, + "loss": 0.8276, + "step": 9374 + }, + { + "epoch": 0.16619914294703908, + "grad_norm": 2.96875, + "learning_rate": 4.676697876826957e-05, + "loss": 0.8499, + "step": 9376 + }, + { + "epoch": 0.16623459498265067, + "grad_norm": 2.890625, + "learning_rate": 4.676560497766019e-05, + "loss": 0.8199, + "step": 9378 + }, + { + "epoch": 0.16627004701826223, + "grad_norm": 2.8125, + "learning_rate": 4.6764230915420596e-05, + "loss": 0.8217, + "step": 9380 + }, + { + "epoch": 0.1663054990538738, + "grad_norm": 2.703125, + "learning_rate": 4.676285658156793e-05, + "loss": 0.8098, + "step": 9382 + }, + { + "epoch": 0.16634095108948538, + "grad_norm": 2.65625, + "learning_rate": 4.6761481976119336e-05, + "loss": 0.8065, + "step": 9384 + }, + { + "epoch": 0.16637640312509694, + "grad_norm": 2.765625, + "learning_rate": 4.6760107099091985e-05, + "loss": 0.8115, + "step": 9386 + }, + { + "epoch": 0.1664118551607085, + "grad_norm": 2.84375, + "learning_rate": 4.675873195050304e-05, + "loss": 0.7882, + "step": 9388 + }, + { + "epoch": 0.1664473071963201, + "grad_norm": 2.625, + "learning_rate": 4.6757356530369646e-05, + "loss": 0.7722, + "step": 9390 + }, + { + "epoch": 0.16648275923193165, + "grad_norm": 2.640625, + "learning_rate": 4.6755980838708974e-05, + "loss": 0.7995, + "step": 9392 + }, + { + "epoch": 0.1665182112675432, + "grad_norm": 2.90625, + "learning_rate": 4.6754604875538187e-05, + "loss": 0.7809, + "step": 9394 + }, + { + "epoch": 0.1665536633031548, + "grad_norm": 2.6875, + "learning_rate": 4.6753228640874474e-05, + "loss": 0.7824, + "step": 9396 + }, + { + "epoch": 0.16658911533876636, + "grad_norm": 3.03125, + "learning_rate": 4.6751852134734994e-05, + "loss": 0.8413, + "step": 9398 + }, + { + "epoch": 0.16662456737437792, + "grad_norm": 2.828125, + "learning_rate": 4.6750475357136925e-05, + "loss": 0.8434, + "step": 9400 + }, + { + "epoch": 0.1666600194099895, + "grad_norm": 2.59375, + "learning_rate": 4.6749098308097464e-05, + "loss": 0.7929, + "step": 9402 + }, + { + "epoch": 0.16669547144560107, + "grad_norm": 2.703125, + "learning_rate": 4.674772098763378e-05, + "loss": 0.8189, + "step": 9404 + }, + { + "epoch": 0.16673092348121263, + "grad_norm": 3.0625, + "learning_rate": 4.674634339576306e-05, + "loss": 0.8326, + "step": 9406 + }, + { + "epoch": 0.16676637551682422, + "grad_norm": 2.90625, + "learning_rate": 4.6744965532502514e-05, + "loss": 0.8473, + "step": 9408 + }, + { + "epoch": 0.16680182755243578, + "grad_norm": 2.96875, + "learning_rate": 4.674358739786933e-05, + "loss": 0.8468, + "step": 9410 + }, + { + "epoch": 0.16683727958804734, + "grad_norm": 2.75, + "learning_rate": 4.6742208991880694e-05, + "loss": 0.8275, + "step": 9412 + }, + { + "epoch": 0.16687273162365893, + "grad_norm": 2.734375, + "learning_rate": 4.6740830314553823e-05, + "loss": 0.7788, + "step": 9414 + }, + { + "epoch": 0.1669081836592705, + "grad_norm": 2.515625, + "learning_rate": 4.6739451365905915e-05, + "loss": 0.797, + "step": 9416 + }, + { + "epoch": 0.16694363569488205, + "grad_norm": 3.015625, + "learning_rate": 4.673807214595419e-05, + "loss": 0.8269, + "step": 9418 + }, + { + "epoch": 0.16697908773049364, + "grad_norm": 2.890625, + "learning_rate": 4.6736692654715845e-05, + "loss": 0.8637, + "step": 9420 + }, + { + "epoch": 0.1670145397661052, + "grad_norm": 2.703125, + "learning_rate": 4.673531289220811e-05, + "loss": 0.79, + "step": 9422 + }, + { + "epoch": 0.16704999180171676, + "grad_norm": 2.828125, + "learning_rate": 4.6733932858448184e-05, + "loss": 0.784, + "step": 9424 + }, + { + "epoch": 0.16708544383732835, + "grad_norm": 2.890625, + "learning_rate": 4.6732552553453316e-05, + "loss": 0.7979, + "step": 9426 + }, + { + "epoch": 0.1671208958729399, + "grad_norm": 2.6875, + "learning_rate": 4.6731171977240713e-05, + "loss": 0.8525, + "step": 9428 + }, + { + "epoch": 0.16715634790855147, + "grad_norm": 2.8125, + "learning_rate": 4.672979112982761e-05, + "loss": 0.7991, + "step": 9430 + }, + { + "epoch": 0.16719179994416306, + "grad_norm": 3.015625, + "learning_rate": 4.6728410011231235e-05, + "loss": 0.8529, + "step": 9432 + }, + { + "epoch": 0.16722725197977462, + "grad_norm": 2.921875, + "learning_rate": 4.672702862146884e-05, + "loss": 0.8547, + "step": 9434 + }, + { + "epoch": 0.16726270401538618, + "grad_norm": 2.796875, + "learning_rate": 4.672564696055765e-05, + "loss": 0.8292, + "step": 9436 + }, + { + "epoch": 0.16729815605099776, + "grad_norm": 2.875, + "learning_rate": 4.672426502851491e-05, + "loss": 0.7923, + "step": 9438 + }, + { + "epoch": 0.16733360808660933, + "grad_norm": 2.53125, + "learning_rate": 4.672288282535786e-05, + "loss": 0.7652, + "step": 9440 + }, + { + "epoch": 0.1673690601222209, + "grad_norm": 2.71875, + "learning_rate": 4.6721500351103766e-05, + "loss": 0.8313, + "step": 9442 + }, + { + "epoch": 0.16740451215783247, + "grad_norm": 2.953125, + "learning_rate": 4.672011760576987e-05, + "loss": 0.8155, + "step": 9444 + }, + { + "epoch": 0.16743996419344404, + "grad_norm": 2.703125, + "learning_rate": 4.671873458937342e-05, + "loss": 0.8007, + "step": 9446 + }, + { + "epoch": 0.1674754162290556, + "grad_norm": 2.984375, + "learning_rate": 4.6717351301931704e-05, + "loss": 0.8482, + "step": 9448 + }, + { + "epoch": 0.16751086826466718, + "grad_norm": 3.0625, + "learning_rate": 4.671596774346196e-05, + "loss": 0.8175, + "step": 9450 + }, + { + "epoch": 0.16754632030027874, + "grad_norm": 2.765625, + "learning_rate": 4.671458391398146e-05, + "loss": 0.8452, + "step": 9452 + }, + { + "epoch": 0.1675817723358903, + "grad_norm": 2.875, + "learning_rate": 4.6713199813507474e-05, + "loss": 0.8062, + "step": 9454 + }, + { + "epoch": 0.1676172243715019, + "grad_norm": 3.03125, + "learning_rate": 4.6711815442057276e-05, + "loss": 0.8506, + "step": 9456 + }, + { + "epoch": 0.16765267640711345, + "grad_norm": 2.59375, + "learning_rate": 4.671043079964815e-05, + "loss": 0.7725, + "step": 9458 + }, + { + "epoch": 0.16768812844272502, + "grad_norm": 2.53125, + "learning_rate": 4.670904588629736e-05, + "loss": 0.8275, + "step": 9460 + }, + { + "epoch": 0.1677235804783366, + "grad_norm": 2.90625, + "learning_rate": 4.6707660702022205e-05, + "loss": 0.8068, + "step": 9462 + }, + { + "epoch": 0.16775903251394816, + "grad_norm": 2.765625, + "learning_rate": 4.670627524683997e-05, + "loss": 0.8035, + "step": 9464 + }, + { + "epoch": 0.16779448454955972, + "grad_norm": 2.640625, + "learning_rate": 4.6704889520767935e-05, + "loss": 0.7704, + "step": 9466 + }, + { + "epoch": 0.1678299365851713, + "grad_norm": 2.46875, + "learning_rate": 4.67035035238234e-05, + "loss": 0.8211, + "step": 9468 + }, + { + "epoch": 0.16786538862078287, + "grad_norm": 2.921875, + "learning_rate": 4.6702117256023665e-05, + "loss": 0.846, + "step": 9470 + }, + { + "epoch": 0.16790084065639443, + "grad_norm": 2.546875, + "learning_rate": 4.6700730717386024e-05, + "loss": 0.8129, + "step": 9472 + }, + { + "epoch": 0.16793629269200602, + "grad_norm": 2.875, + "learning_rate": 4.6699343907927785e-05, + "loss": 0.8938, + "step": 9474 + }, + { + "epoch": 0.16797174472761758, + "grad_norm": 2.34375, + "learning_rate": 4.669795682766625e-05, + "loss": 0.7938, + "step": 9476 + }, + { + "epoch": 0.16800719676322914, + "grad_norm": 3.125, + "learning_rate": 4.6696569476618736e-05, + "loss": 0.8289, + "step": 9478 + }, + { + "epoch": 0.1680426487988407, + "grad_norm": 2.578125, + "learning_rate": 4.669518185480255e-05, + "loss": 0.8243, + "step": 9480 + }, + { + "epoch": 0.1680781008344523, + "grad_norm": 2.6875, + "learning_rate": 4.669379396223502e-05, + "loss": 0.8155, + "step": 9482 + }, + { + "epoch": 0.16811355287006385, + "grad_norm": 2.578125, + "learning_rate": 4.669240579893346e-05, + "loss": 0.7802, + "step": 9484 + }, + { + "epoch": 0.16814900490567541, + "grad_norm": 2.75, + "learning_rate": 4.669101736491519e-05, + "loss": 0.8266, + "step": 9486 + }, + { + "epoch": 0.168184456941287, + "grad_norm": 2.40625, + "learning_rate": 4.6689628660197534e-05, + "loss": 0.7776, + "step": 9488 + }, + { + "epoch": 0.16821990897689856, + "grad_norm": 2.671875, + "learning_rate": 4.668823968479784e-05, + "loss": 0.8245, + "step": 9490 + }, + { + "epoch": 0.16825536101251012, + "grad_norm": 2.578125, + "learning_rate": 4.668685043873343e-05, + "loss": 0.7765, + "step": 9492 + }, + { + "epoch": 0.1682908130481217, + "grad_norm": 2.84375, + "learning_rate": 4.6685460922021644e-05, + "loss": 0.8347, + "step": 9494 + }, + { + "epoch": 0.16832626508373327, + "grad_norm": 2.90625, + "learning_rate": 4.668407113467983e-05, + "loss": 0.8044, + "step": 9496 + }, + { + "epoch": 0.16836171711934483, + "grad_norm": 2.59375, + "learning_rate": 4.668268107672531e-05, + "loss": 0.7938, + "step": 9498 + }, + { + "epoch": 0.16839716915495642, + "grad_norm": 2.71875, + "learning_rate": 4.6681290748175457e-05, + "loss": 0.8243, + "step": 9500 + }, + { + "epoch": 0.16843262119056798, + "grad_norm": 2.984375, + "learning_rate": 4.6679900149047604e-05, + "loss": 0.8175, + "step": 9502 + }, + { + "epoch": 0.16846807322617954, + "grad_norm": 2.5, + "learning_rate": 4.667850927935912e-05, + "loss": 0.8063, + "step": 9504 + }, + { + "epoch": 0.16850352526179113, + "grad_norm": 2.765625, + "learning_rate": 4.6677118139127354e-05, + "loss": 0.7868, + "step": 9506 + }, + { + "epoch": 0.1685389772974027, + "grad_norm": 2.703125, + "learning_rate": 4.6675726728369664e-05, + "loss": 0.8022, + "step": 9508 + }, + { + "epoch": 0.16857442933301425, + "grad_norm": 2.890625, + "learning_rate": 4.667433504710342e-05, + "loss": 0.8211, + "step": 9510 + }, + { + "epoch": 0.16860988136862584, + "grad_norm": 2.71875, + "learning_rate": 4.667294309534599e-05, + "loss": 0.8062, + "step": 9512 + }, + { + "epoch": 0.1686453334042374, + "grad_norm": 2.640625, + "learning_rate": 4.6671550873114744e-05, + "loss": 0.7539, + "step": 9514 + }, + { + "epoch": 0.16868078543984896, + "grad_norm": 2.78125, + "learning_rate": 4.667015838042705e-05, + "loss": 0.832, + "step": 9516 + }, + { + "epoch": 0.16871623747546055, + "grad_norm": 2.390625, + "learning_rate": 4.6668765617300306e-05, + "loss": 0.773, + "step": 9518 + }, + { + "epoch": 0.1687516895110721, + "grad_norm": 2.8125, + "learning_rate": 4.6667372583751875e-05, + "loss": 0.832, + "step": 9520 + }, + { + "epoch": 0.16878714154668367, + "grad_norm": 2.6875, + "learning_rate": 4.6665979279799146e-05, + "loss": 0.8179, + "step": 9522 + }, + { + "epoch": 0.16882259358229526, + "grad_norm": 2.65625, + "learning_rate": 4.666458570545951e-05, + "loss": 0.7374, + "step": 9524 + }, + { + "epoch": 0.16885804561790682, + "grad_norm": 2.796875, + "learning_rate": 4.6663191860750354e-05, + "loss": 0.834, + "step": 9526 + }, + { + "epoch": 0.16889349765351838, + "grad_norm": 2.875, + "learning_rate": 4.666179774568909e-05, + "loss": 0.8472, + "step": 9528 + }, + { + "epoch": 0.16892894968912997, + "grad_norm": 2.859375, + "learning_rate": 4.666040336029308e-05, + "loss": 0.7924, + "step": 9530 + }, + { + "epoch": 0.16896440172474153, + "grad_norm": 2.953125, + "learning_rate": 4.6659008704579756e-05, + "loss": 0.8083, + "step": 9532 + }, + { + "epoch": 0.1689998537603531, + "grad_norm": 2.65625, + "learning_rate": 4.665761377856652e-05, + "loss": 0.8192, + "step": 9534 + }, + { + "epoch": 0.16903530579596468, + "grad_norm": 2.671875, + "learning_rate": 4.6656218582270775e-05, + "loss": 0.8148, + "step": 9536 + }, + { + "epoch": 0.16907075783157624, + "grad_norm": 2.859375, + "learning_rate": 4.665482311570992e-05, + "loss": 0.8115, + "step": 9538 + }, + { + "epoch": 0.1691062098671878, + "grad_norm": 3.078125, + "learning_rate": 4.6653427378901395e-05, + "loss": 0.8163, + "step": 9540 + }, + { + "epoch": 0.1691416619027994, + "grad_norm": 2.671875, + "learning_rate": 4.665203137186261e-05, + "loss": 0.8194, + "step": 9542 + }, + { + "epoch": 0.16917711393841095, + "grad_norm": 2.515625, + "learning_rate": 4.665063509461097e-05, + "loss": 0.7891, + "step": 9544 + }, + { + "epoch": 0.1692125659740225, + "grad_norm": 2.796875, + "learning_rate": 4.664923854716392e-05, + "loss": 0.7962, + "step": 9546 + }, + { + "epoch": 0.1692480180096341, + "grad_norm": 2.671875, + "learning_rate": 4.664784172953888e-05, + "loss": 0.7783, + "step": 9548 + }, + { + "epoch": 0.16928347004524566, + "grad_norm": 2.921875, + "learning_rate": 4.664644464175328e-05, + "loss": 0.8456, + "step": 9550 + }, + { + "epoch": 0.16931892208085722, + "grad_norm": 2.65625, + "learning_rate": 4.664504728382457e-05, + "loss": 0.7876, + "step": 9552 + }, + { + "epoch": 0.1693543741164688, + "grad_norm": 2.609375, + "learning_rate": 4.664364965577018e-05, + "loss": 0.8043, + "step": 9554 + }, + { + "epoch": 0.16938982615208037, + "grad_norm": 2.65625, + "learning_rate": 4.664225175760754e-05, + "loss": 0.7579, + "step": 9556 + }, + { + "epoch": 0.16942527818769193, + "grad_norm": 2.8125, + "learning_rate": 4.6640853589354114e-05, + "loss": 0.8425, + "step": 9558 + }, + { + "epoch": 0.16946073022330352, + "grad_norm": 2.390625, + "learning_rate": 4.663945515102733e-05, + "loss": 0.755, + "step": 9560 + }, + { + "epoch": 0.16949618225891508, + "grad_norm": 2.546875, + "learning_rate": 4.663805644264467e-05, + "loss": 0.815, + "step": 9562 + }, + { + "epoch": 0.16953163429452664, + "grad_norm": 2.53125, + "learning_rate": 4.663665746422356e-05, + "loss": 0.7912, + "step": 9564 + }, + { + "epoch": 0.16956708633013823, + "grad_norm": 2.671875, + "learning_rate": 4.6635258215781476e-05, + "loss": 0.8059, + "step": 9566 + }, + { + "epoch": 0.1696025383657498, + "grad_norm": 2.8125, + "learning_rate": 4.663385869733587e-05, + "loss": 0.8317, + "step": 9568 + }, + { + "epoch": 0.16963799040136135, + "grad_norm": 2.859375, + "learning_rate": 4.663245890890423e-05, + "loss": 0.7885, + "step": 9570 + }, + { + "epoch": 0.16967344243697294, + "grad_norm": 2.71875, + "learning_rate": 4.663105885050399e-05, + "loss": 0.7997, + "step": 9572 + }, + { + "epoch": 0.1697088944725845, + "grad_norm": 2.53125, + "learning_rate": 4.662965852215265e-05, + "loss": 0.8182, + "step": 9574 + }, + { + "epoch": 0.16974434650819606, + "grad_norm": 2.625, + "learning_rate": 4.6628257923867676e-05, + "loss": 0.7913, + "step": 9576 + }, + { + "epoch": 0.16977979854380765, + "grad_norm": 2.625, + "learning_rate": 4.6626857055666546e-05, + "loss": 0.7741, + "step": 9578 + }, + { + "epoch": 0.1698152505794192, + "grad_norm": 2.5625, + "learning_rate": 4.662545591756675e-05, + "loss": 0.7387, + "step": 9580 + }, + { + "epoch": 0.16985070261503077, + "grad_norm": 2.8125, + "learning_rate": 4.6624054509585755e-05, + "loss": 0.8389, + "step": 9582 + }, + { + "epoch": 0.16988615465064236, + "grad_norm": 3.0, + "learning_rate": 4.6622652831741074e-05, + "loss": 0.8325, + "step": 9584 + }, + { + "epoch": 0.16992160668625392, + "grad_norm": 2.90625, + "learning_rate": 4.6621250884050195e-05, + "loss": 0.7699, + "step": 9586 + }, + { + "epoch": 0.16995705872186548, + "grad_norm": 2.6875, + "learning_rate": 4.6619848666530594e-05, + "loss": 0.8166, + "step": 9588 + }, + { + "epoch": 0.16999251075747707, + "grad_norm": 2.953125, + "learning_rate": 4.6618446179199795e-05, + "loss": 0.832, + "step": 9590 + }, + { + "epoch": 0.17002796279308863, + "grad_norm": 2.6875, + "learning_rate": 4.661704342207529e-05, + "loss": 0.7962, + "step": 9592 + }, + { + "epoch": 0.1700634148287002, + "grad_norm": 2.828125, + "learning_rate": 4.661564039517458e-05, + "loss": 0.7887, + "step": 9594 + }, + { + "epoch": 0.17009886686431178, + "grad_norm": 2.703125, + "learning_rate": 4.661423709851518e-05, + "loss": 0.817, + "step": 9596 + }, + { + "epoch": 0.17013431889992334, + "grad_norm": 2.90625, + "learning_rate": 4.661283353211461e-05, + "loss": 0.8294, + "step": 9598 + }, + { + "epoch": 0.1701697709355349, + "grad_norm": 2.5, + "learning_rate": 4.661142969599037e-05, + "loss": 0.7648, + "step": 9600 + }, + { + "epoch": 0.17020522297114649, + "grad_norm": 2.71875, + "learning_rate": 4.661002559016e-05, + "loss": 0.8282, + "step": 9602 + }, + { + "epoch": 0.17024067500675805, + "grad_norm": 2.765625, + "learning_rate": 4.6608621214641e-05, + "loss": 0.8384, + "step": 9604 + }, + { + "epoch": 0.1702761270423696, + "grad_norm": 2.875, + "learning_rate": 4.660721656945092e-05, + "loss": 0.7908, + "step": 9606 + }, + { + "epoch": 0.1703115790779812, + "grad_norm": 2.484375, + "learning_rate": 4.6605811654607265e-05, + "loss": 0.8134, + "step": 9608 + }, + { + "epoch": 0.17034703111359276, + "grad_norm": 3.0, + "learning_rate": 4.660440647012759e-05, + "loss": 0.7794, + "step": 9610 + }, + { + "epoch": 0.17038248314920432, + "grad_norm": 2.609375, + "learning_rate": 4.6603001016029425e-05, + "loss": 0.8215, + "step": 9612 + }, + { + "epoch": 0.1704179351848159, + "grad_norm": 2.84375, + "learning_rate": 4.66015952923303e-05, + "loss": 0.8334, + "step": 9614 + }, + { + "epoch": 0.17045338722042747, + "grad_norm": 2.609375, + "learning_rate": 4.660018929904776e-05, + "loss": 0.7975, + "step": 9616 + }, + { + "epoch": 0.17048883925603903, + "grad_norm": 2.828125, + "learning_rate": 4.659878303619937e-05, + "loss": 0.8501, + "step": 9618 + }, + { + "epoch": 0.17052429129165061, + "grad_norm": 2.5625, + "learning_rate": 4.659737650380265e-05, + "loss": 0.7872, + "step": 9620 + }, + { + "epoch": 0.17055974332726218, + "grad_norm": 2.578125, + "learning_rate": 4.6595969701875184e-05, + "loss": 0.8015, + "step": 9622 + }, + { + "epoch": 0.17059519536287374, + "grad_norm": 2.859375, + "learning_rate": 4.6594562630434505e-05, + "loss": 0.7942, + "step": 9624 + }, + { + "epoch": 0.17063064739848532, + "grad_norm": 2.71875, + "learning_rate": 4.659315528949819e-05, + "loss": 0.7957, + "step": 9626 + }, + { + "epoch": 0.17066609943409689, + "grad_norm": 2.390625, + "learning_rate": 4.659174767908379e-05, + "loss": 0.8032, + "step": 9628 + }, + { + "epoch": 0.17070155146970845, + "grad_norm": 2.828125, + "learning_rate": 4.659033979920888e-05, + "loss": 0.8284, + "step": 9630 + }, + { + "epoch": 0.17073700350532003, + "grad_norm": 2.78125, + "learning_rate": 4.658893164989102e-05, + "loss": 0.8575, + "step": 9632 + }, + { + "epoch": 0.1707724555409316, + "grad_norm": 2.640625, + "learning_rate": 4.658752323114779e-05, + "loss": 0.818, + "step": 9634 + }, + { + "epoch": 0.17080790757654316, + "grad_norm": 2.921875, + "learning_rate": 4.6586114542996776e-05, + "loss": 0.8272, + "step": 9636 + }, + { + "epoch": 0.17084335961215474, + "grad_norm": 2.734375, + "learning_rate": 4.6584705585455534e-05, + "loss": 0.8401, + "step": 9638 + }, + { + "epoch": 0.1708788116477663, + "grad_norm": 2.625, + "learning_rate": 4.6583296358541675e-05, + "loss": 0.8057, + "step": 9640 + }, + { + "epoch": 0.17091426368337787, + "grad_norm": 2.46875, + "learning_rate": 4.658188686227277e-05, + "loss": 0.776, + "step": 9642 + }, + { + "epoch": 0.17094971571898945, + "grad_norm": 2.8125, + "learning_rate": 4.658047709666641e-05, + "loss": 0.8276, + "step": 9644 + }, + { + "epoch": 0.17098516775460101, + "grad_norm": 2.828125, + "learning_rate": 4.657906706174019e-05, + "loss": 0.7901, + "step": 9646 + }, + { + "epoch": 0.17102061979021257, + "grad_norm": 2.90625, + "learning_rate": 4.65776567575117e-05, + "loss": 0.8314, + "step": 9648 + }, + { + "epoch": 0.17105607182582414, + "grad_norm": 2.484375, + "learning_rate": 4.6576246183998564e-05, + "loss": 0.7915, + "step": 9650 + }, + { + "epoch": 0.17109152386143572, + "grad_norm": 2.84375, + "learning_rate": 4.657483534121836e-05, + "loss": 0.7905, + "step": 9652 + }, + { + "epoch": 0.17112697589704728, + "grad_norm": 2.671875, + "learning_rate": 4.65734242291887e-05, + "loss": 0.7404, + "step": 9654 + }, + { + "epoch": 0.17116242793265884, + "grad_norm": 2.5625, + "learning_rate": 4.657201284792721e-05, + "loss": 0.829, + "step": 9656 + }, + { + "epoch": 0.17119787996827043, + "grad_norm": 2.375, + "learning_rate": 4.657060119745149e-05, + "loss": 0.7781, + "step": 9658 + }, + { + "epoch": 0.171233332003882, + "grad_norm": 2.5, + "learning_rate": 4.6569189277779154e-05, + "loss": 0.785, + "step": 9660 + }, + { + "epoch": 0.17126878403949355, + "grad_norm": 2.84375, + "learning_rate": 4.6567777088927836e-05, + "loss": 0.7846, + "step": 9662 + }, + { + "epoch": 0.17130423607510514, + "grad_norm": 2.640625, + "learning_rate": 4.656636463091515e-05, + "loss": 0.8225, + "step": 9664 + }, + { + "epoch": 0.1713396881107167, + "grad_norm": 2.796875, + "learning_rate": 4.656495190375872e-05, + "loss": 0.7614, + "step": 9666 + }, + { + "epoch": 0.17137514014632826, + "grad_norm": 2.734375, + "learning_rate": 4.656353890747619e-05, + "loss": 0.8443, + "step": 9668 + }, + { + "epoch": 0.17141059218193985, + "grad_norm": 3.0625, + "learning_rate": 4.656212564208518e-05, + "loss": 0.8243, + "step": 9670 + }, + { + "epoch": 0.1714460442175514, + "grad_norm": 2.640625, + "learning_rate": 4.6560712107603334e-05, + "loss": 0.7915, + "step": 9672 + }, + { + "epoch": 0.17148149625316297, + "grad_norm": 2.8125, + "learning_rate": 4.655929830404829e-05, + "loss": 0.7426, + "step": 9674 + }, + { + "epoch": 0.17151694828877456, + "grad_norm": 2.953125, + "learning_rate": 4.65578842314377e-05, + "loss": 0.7934, + "step": 9676 + }, + { + "epoch": 0.17155240032438612, + "grad_norm": 2.9375, + "learning_rate": 4.65564698897892e-05, + "loss": 0.8462, + "step": 9678 + }, + { + "epoch": 0.17158785235999768, + "grad_norm": 2.890625, + "learning_rate": 4.6555055279120444e-05, + "loss": 0.8183, + "step": 9680 + }, + { + "epoch": 0.17162330439560927, + "grad_norm": 2.859375, + "learning_rate": 4.655364039944909e-05, + "loss": 0.821, + "step": 9682 + }, + { + "epoch": 0.17165875643122083, + "grad_norm": 2.78125, + "learning_rate": 4.6552225250792794e-05, + "loss": 0.7928, + "step": 9684 + }, + { + "epoch": 0.1716942084668324, + "grad_norm": 3.140625, + "learning_rate": 4.655080983316922e-05, + "loss": 0.845, + "step": 9686 + }, + { + "epoch": 0.17172966050244398, + "grad_norm": 2.75, + "learning_rate": 4.654939414659602e-05, + "loss": 0.8374, + "step": 9688 + }, + { + "epoch": 0.17176511253805554, + "grad_norm": 2.609375, + "learning_rate": 4.654797819109087e-05, + "loss": 0.8192, + "step": 9690 + }, + { + "epoch": 0.1718005645736671, + "grad_norm": 2.75, + "learning_rate": 4.654656196667145e-05, + "loss": 0.7864, + "step": 9692 + }, + { + "epoch": 0.1718360166092787, + "grad_norm": 2.625, + "learning_rate": 4.654514547335541e-05, + "loss": 0.8356, + "step": 9694 + }, + { + "epoch": 0.17187146864489025, + "grad_norm": 2.71875, + "learning_rate": 4.6543728711160456e-05, + "loss": 0.819, + "step": 9696 + }, + { + "epoch": 0.1719069206805018, + "grad_norm": 2.859375, + "learning_rate": 4.654231168010425e-05, + "loss": 0.7734, + "step": 9698 + }, + { + "epoch": 0.1719423727161134, + "grad_norm": 2.71875, + "learning_rate": 4.654089438020448e-05, + "loss": 0.8568, + "step": 9700 + }, + { + "epoch": 0.17197782475172496, + "grad_norm": 3.0, + "learning_rate": 4.653947681147883e-05, + "loss": 0.8154, + "step": 9702 + }, + { + "epoch": 0.17201327678733652, + "grad_norm": 2.421875, + "learning_rate": 4.6538058973945004e-05, + "loss": 0.7839, + "step": 9704 + }, + { + "epoch": 0.1720487288229481, + "grad_norm": 2.859375, + "learning_rate": 4.6536640867620686e-05, + "loss": 0.8061, + "step": 9706 + }, + { + "epoch": 0.17208418085855967, + "grad_norm": 2.640625, + "learning_rate": 4.653522249252357e-05, + "loss": 0.8139, + "step": 9708 + }, + { + "epoch": 0.17211963289417123, + "grad_norm": 2.734375, + "learning_rate": 4.6533803848671366e-05, + "loss": 0.8192, + "step": 9710 + }, + { + "epoch": 0.17215508492978282, + "grad_norm": 2.453125, + "learning_rate": 4.6532384936081777e-05, + "loss": 0.8006, + "step": 9712 + }, + { + "epoch": 0.17219053696539438, + "grad_norm": 2.84375, + "learning_rate": 4.65309657547725e-05, + "loss": 0.8358, + "step": 9714 + }, + { + "epoch": 0.17222598900100594, + "grad_norm": 2.75, + "learning_rate": 4.652954630476127e-05, + "loss": 0.8254, + "step": 9716 + }, + { + "epoch": 0.17226144103661753, + "grad_norm": 2.828125, + "learning_rate": 4.652812658606578e-05, + "loss": 0.832, + "step": 9718 + }, + { + "epoch": 0.1722968930722291, + "grad_norm": 2.515625, + "learning_rate": 4.652670659870375e-05, + "loss": 0.7853, + "step": 9720 + }, + { + "epoch": 0.17233234510784065, + "grad_norm": 3.125, + "learning_rate": 4.652528634269291e-05, + "loss": 0.8334, + "step": 9722 + }, + { + "epoch": 0.17236779714345224, + "grad_norm": 2.890625, + "learning_rate": 4.6523865818050984e-05, + "loss": 0.8368, + "step": 9724 + }, + { + "epoch": 0.1724032491790638, + "grad_norm": 2.8125, + "learning_rate": 4.6522445024795694e-05, + "loss": 0.8178, + "step": 9726 + }, + { + "epoch": 0.17243870121467536, + "grad_norm": 2.875, + "learning_rate": 4.6521023962944765e-05, + "loss": 0.819, + "step": 9728 + }, + { + "epoch": 0.17247415325028695, + "grad_norm": 2.6875, + "learning_rate": 4.651960263251594e-05, + "loss": 0.8075, + "step": 9730 + }, + { + "epoch": 0.1725096052858985, + "grad_norm": 2.640625, + "learning_rate": 4.6518181033526966e-05, + "loss": 0.8005, + "step": 9732 + }, + { + "epoch": 0.17254505732151007, + "grad_norm": 2.875, + "learning_rate": 4.6516759165995563e-05, + "loss": 0.8483, + "step": 9734 + }, + { + "epoch": 0.17258050935712166, + "grad_norm": 2.90625, + "learning_rate": 4.651533702993949e-05, + "loss": 0.8286, + "step": 9736 + }, + { + "epoch": 0.17261596139273322, + "grad_norm": 2.84375, + "learning_rate": 4.65139146253765e-05, + "loss": 0.7821, + "step": 9738 + }, + { + "epoch": 0.17265141342834478, + "grad_norm": 2.859375, + "learning_rate": 4.6512491952324334e-05, + "loss": 0.851, + "step": 9740 + }, + { + "epoch": 0.17268686546395637, + "grad_norm": 2.6875, + "learning_rate": 4.6511069010800745e-05, + "loss": 0.7941, + "step": 9742 + }, + { + "epoch": 0.17272231749956793, + "grad_norm": 2.609375, + "learning_rate": 4.6509645800823494e-05, + "loss": 0.7729, + "step": 9744 + }, + { + "epoch": 0.1727577695351795, + "grad_norm": 2.53125, + "learning_rate": 4.650822232241034e-05, + "loss": 0.8009, + "step": 9746 + }, + { + "epoch": 0.17279322157079108, + "grad_norm": 2.484375, + "learning_rate": 4.650679857557906e-05, + "loss": 0.7401, + "step": 9748 + }, + { + "epoch": 0.17282867360640264, + "grad_norm": 2.71875, + "learning_rate": 4.6505374560347415e-05, + "loss": 0.7513, + "step": 9750 + }, + { + "epoch": 0.1728641256420142, + "grad_norm": 2.765625, + "learning_rate": 4.650395027673317e-05, + "loss": 0.8464, + "step": 9752 + }, + { + "epoch": 0.1728995776776258, + "grad_norm": 2.734375, + "learning_rate": 4.650252572475411e-05, + "loss": 0.8083, + "step": 9754 + }, + { + "epoch": 0.17293502971323735, + "grad_norm": 3.0, + "learning_rate": 4.6501100904427996e-05, + "loss": 0.7998, + "step": 9756 + }, + { + "epoch": 0.1729704817488489, + "grad_norm": 2.453125, + "learning_rate": 4.649967581577263e-05, + "loss": 0.7697, + "step": 9758 + }, + { + "epoch": 0.1730059337844605, + "grad_norm": 2.90625, + "learning_rate": 4.649825045880579e-05, + "loss": 0.7944, + "step": 9760 + }, + { + "epoch": 0.17304138582007206, + "grad_norm": 2.875, + "learning_rate": 4.649682483354525e-05, + "loss": 0.8214, + "step": 9762 + }, + { + "epoch": 0.17307683785568362, + "grad_norm": 2.71875, + "learning_rate": 4.649539894000883e-05, + "loss": 0.7868, + "step": 9764 + }, + { + "epoch": 0.1731122898912952, + "grad_norm": 2.546875, + "learning_rate": 4.6493972778214294e-05, + "loss": 0.7774, + "step": 9766 + }, + { + "epoch": 0.17314774192690677, + "grad_norm": 2.96875, + "learning_rate": 4.649254634817946e-05, + "loss": 0.8818, + "step": 9768 + }, + { + "epoch": 0.17318319396251833, + "grad_norm": 2.5, + "learning_rate": 4.6491119649922124e-05, + "loss": 0.7984, + "step": 9770 + }, + { + "epoch": 0.17321864599812992, + "grad_norm": 2.78125, + "learning_rate": 4.64896926834601e-05, + "loss": 0.7955, + "step": 9772 + }, + { + "epoch": 0.17325409803374148, + "grad_norm": 2.6875, + "learning_rate": 4.648826544881117e-05, + "loss": 0.8068, + "step": 9774 + }, + { + "epoch": 0.17328955006935304, + "grad_norm": 2.4375, + "learning_rate": 4.648683794599318e-05, + "loss": 0.8155, + "step": 9776 + }, + { + "epoch": 0.17332500210496463, + "grad_norm": 2.828125, + "learning_rate": 4.648541017502392e-05, + "loss": 0.8327, + "step": 9778 + }, + { + "epoch": 0.1733604541405762, + "grad_norm": 2.78125, + "learning_rate": 4.648398213592121e-05, + "loss": 0.8585, + "step": 9780 + }, + { + "epoch": 0.17339590617618775, + "grad_norm": 2.65625, + "learning_rate": 4.648255382870288e-05, + "loss": 0.8518, + "step": 9782 + }, + { + "epoch": 0.17343135821179934, + "grad_norm": 2.765625, + "learning_rate": 4.648112525338676e-05, + "loss": 0.7884, + "step": 9784 + }, + { + "epoch": 0.1734668102474109, + "grad_norm": 3.1875, + "learning_rate": 4.647969640999066e-05, + "loss": 0.8202, + "step": 9786 + }, + { + "epoch": 0.17350226228302246, + "grad_norm": 2.875, + "learning_rate": 4.6478267298532434e-05, + "loss": 0.8754, + "step": 9788 + }, + { + "epoch": 0.17353771431863405, + "grad_norm": 2.859375, + "learning_rate": 4.6476837919029904e-05, + "loss": 0.8141, + "step": 9790 + }, + { + "epoch": 0.1735731663542456, + "grad_norm": 2.765625, + "learning_rate": 4.647540827150091e-05, + "loss": 0.826, + "step": 9792 + }, + { + "epoch": 0.17360861838985717, + "grad_norm": 2.6875, + "learning_rate": 4.647397835596329e-05, + "loss": 0.804, + "step": 9794 + }, + { + "epoch": 0.17364407042546876, + "grad_norm": 2.703125, + "learning_rate": 4.647254817243489e-05, + "loss": 0.8226, + "step": 9796 + }, + { + "epoch": 0.17367952246108032, + "grad_norm": 2.703125, + "learning_rate": 4.647111772093356e-05, + "loss": 0.8509, + "step": 9798 + }, + { + "epoch": 0.17371497449669188, + "grad_norm": 2.484375, + "learning_rate": 4.646968700147717e-05, + "loss": 0.7859, + "step": 9800 + }, + { + "epoch": 0.17375042653230346, + "grad_norm": 2.78125, + "learning_rate": 4.6468256014083546e-05, + "loss": 0.7955, + "step": 9802 + }, + { + "epoch": 0.17378587856791503, + "grad_norm": 2.828125, + "learning_rate": 4.6466824758770555e-05, + "loss": 0.8422, + "step": 9804 + }, + { + "epoch": 0.17382133060352659, + "grad_norm": 2.53125, + "learning_rate": 4.6465393235556066e-05, + "loss": 0.8353, + "step": 9806 + }, + { + "epoch": 0.17385678263913817, + "grad_norm": 3.015625, + "learning_rate": 4.6463961444457934e-05, + "loss": 0.8482, + "step": 9808 + }, + { + "epoch": 0.17389223467474973, + "grad_norm": 2.828125, + "learning_rate": 4.646252938549405e-05, + "loss": 0.798, + "step": 9810 + }, + { + "epoch": 0.1739276867103613, + "grad_norm": 2.53125, + "learning_rate": 4.646109705868226e-05, + "loss": 0.8061, + "step": 9812 + }, + { + "epoch": 0.17396313874597288, + "grad_norm": 2.515625, + "learning_rate": 4.645966446404044e-05, + "loss": 0.756, + "step": 9814 + }, + { + "epoch": 0.17399859078158444, + "grad_norm": 2.953125, + "learning_rate": 4.645823160158649e-05, + "loss": 0.8085, + "step": 9816 + }, + { + "epoch": 0.174034042817196, + "grad_norm": 2.90625, + "learning_rate": 4.645679847133827e-05, + "loss": 0.83, + "step": 9818 + }, + { + "epoch": 0.17406949485280757, + "grad_norm": 2.734375, + "learning_rate": 4.645536507331368e-05, + "loss": 0.7976, + "step": 9820 + }, + { + "epoch": 0.17410494688841915, + "grad_norm": 3.140625, + "learning_rate": 4.64539314075306e-05, + "loss": 0.8144, + "step": 9822 + }, + { + "epoch": 0.17414039892403071, + "grad_norm": 2.828125, + "learning_rate": 4.645249747400693e-05, + "loss": 0.7811, + "step": 9824 + }, + { + "epoch": 0.17417585095964228, + "grad_norm": 2.59375, + "learning_rate": 4.645106327276056e-05, + "loss": 0.7529, + "step": 9826 + }, + { + "epoch": 0.17421130299525386, + "grad_norm": 2.6875, + "learning_rate": 4.6449628803809384e-05, + "loss": 0.7915, + "step": 9828 + }, + { + "epoch": 0.17424675503086542, + "grad_norm": 3.15625, + "learning_rate": 4.644819406717131e-05, + "loss": 0.7945, + "step": 9830 + }, + { + "epoch": 0.17428220706647699, + "grad_norm": 2.828125, + "learning_rate": 4.6446759062864236e-05, + "loss": 0.8408, + "step": 9832 + }, + { + "epoch": 0.17431765910208857, + "grad_norm": 2.640625, + "learning_rate": 4.644532379090608e-05, + "loss": 0.7478, + "step": 9834 + }, + { + "epoch": 0.17435311113770013, + "grad_norm": 3.09375, + "learning_rate": 4.644388825131475e-05, + "loss": 0.7955, + "step": 9836 + }, + { + "epoch": 0.1743885631733117, + "grad_norm": 2.609375, + "learning_rate": 4.6442452444108166e-05, + "loss": 0.8422, + "step": 9838 + }, + { + "epoch": 0.17442401520892328, + "grad_norm": 2.625, + "learning_rate": 4.644101636930423e-05, + "loss": 0.7869, + "step": 9840 + }, + { + "epoch": 0.17445946724453484, + "grad_norm": 2.6875, + "learning_rate": 4.643958002692088e-05, + "loss": 0.8166, + "step": 9842 + }, + { + "epoch": 0.1744949192801464, + "grad_norm": 2.625, + "learning_rate": 4.643814341697604e-05, + "loss": 0.7817, + "step": 9844 + }, + { + "epoch": 0.174530371315758, + "grad_norm": 2.90625, + "learning_rate": 4.6436706539487636e-05, + "loss": 0.8151, + "step": 9846 + }, + { + "epoch": 0.17456582335136955, + "grad_norm": 2.859375, + "learning_rate": 4.64352693944736e-05, + "loss": 0.7965, + "step": 9848 + }, + { + "epoch": 0.17460127538698111, + "grad_norm": 2.5625, + "learning_rate": 4.643383198195186e-05, + "loss": 0.7691, + "step": 9850 + }, + { + "epoch": 0.1746367274225927, + "grad_norm": 2.703125, + "learning_rate": 4.643239430194036e-05, + "loss": 0.779, + "step": 9852 + }, + { + "epoch": 0.17467217945820426, + "grad_norm": 2.765625, + "learning_rate": 4.6430956354457054e-05, + "loss": 0.7911, + "step": 9854 + }, + { + "epoch": 0.17470763149381582, + "grad_norm": 2.609375, + "learning_rate": 4.642951813951987e-05, + "loss": 0.7949, + "step": 9856 + }, + { + "epoch": 0.1747430835294274, + "grad_norm": 2.578125, + "learning_rate": 4.642807965714676e-05, + "loss": 0.8238, + "step": 9858 + }, + { + "epoch": 0.17477853556503897, + "grad_norm": 2.578125, + "learning_rate": 4.642664090735569e-05, + "loss": 0.7866, + "step": 9860 + }, + { + "epoch": 0.17481398760065053, + "grad_norm": 2.671875, + "learning_rate": 4.64252018901646e-05, + "loss": 0.813, + "step": 9862 + }, + { + "epoch": 0.17484943963626212, + "grad_norm": 2.765625, + "learning_rate": 4.642376260559145e-05, + "loss": 0.7915, + "step": 9864 + }, + { + "epoch": 0.17488489167187368, + "grad_norm": 2.734375, + "learning_rate": 4.6422323053654205e-05, + "loss": 0.8049, + "step": 9866 + }, + { + "epoch": 0.17492034370748524, + "grad_norm": 2.546875, + "learning_rate": 4.6420883234370826e-05, + "loss": 0.8036, + "step": 9868 + }, + { + "epoch": 0.17495579574309683, + "grad_norm": 3.078125, + "learning_rate": 4.641944314775929e-05, + "loss": 0.824, + "step": 9870 + }, + { + "epoch": 0.1749912477787084, + "grad_norm": 2.9375, + "learning_rate": 4.6418002793837564e-05, + "loss": 0.8099, + "step": 9872 + }, + { + "epoch": 0.17502669981431995, + "grad_norm": 2.734375, + "learning_rate": 4.641656217262362e-05, + "loss": 0.8045, + "step": 9874 + }, + { + "epoch": 0.17506215184993154, + "grad_norm": 2.609375, + "learning_rate": 4.6415121284135453e-05, + "loss": 0.8309, + "step": 9876 + }, + { + "epoch": 0.1750976038855431, + "grad_norm": 2.4375, + "learning_rate": 4.641368012839102e-05, + "loss": 0.8056, + "step": 9878 + }, + { + "epoch": 0.17513305592115466, + "grad_norm": 2.671875, + "learning_rate": 4.641223870540833e-05, + "loss": 0.7938, + "step": 9880 + }, + { + "epoch": 0.17516850795676625, + "grad_norm": 2.515625, + "learning_rate": 4.641079701520535e-05, + "loss": 0.7869, + "step": 9882 + }, + { + "epoch": 0.1752039599923778, + "grad_norm": 2.5625, + "learning_rate": 4.64093550578001e-05, + "loss": 0.7944, + "step": 9884 + }, + { + "epoch": 0.17523941202798937, + "grad_norm": 2.71875, + "learning_rate": 4.640791283321054e-05, + "loss": 0.7952, + "step": 9886 + }, + { + "epoch": 0.17527486406360096, + "grad_norm": 2.6875, + "learning_rate": 4.64064703414547e-05, + "loss": 0.8147, + "step": 9888 + }, + { + "epoch": 0.17531031609921252, + "grad_norm": 3.0, + "learning_rate": 4.6405027582550556e-05, + "loss": 0.8132, + "step": 9890 + }, + { + "epoch": 0.17534576813482408, + "grad_norm": 2.609375, + "learning_rate": 4.640358455651613e-05, + "loss": 0.8292, + "step": 9892 + }, + { + "epoch": 0.17538122017043567, + "grad_norm": 2.78125, + "learning_rate": 4.640214126336943e-05, + "loss": 0.7597, + "step": 9894 + }, + { + "epoch": 0.17541667220604723, + "grad_norm": 2.734375, + "learning_rate": 4.640069770312846e-05, + "loss": 0.7683, + "step": 9896 + }, + { + "epoch": 0.1754521242416588, + "grad_norm": 2.9375, + "learning_rate": 4.639925387581125e-05, + "loss": 0.7725, + "step": 9898 + }, + { + "epoch": 0.17548757627727038, + "grad_norm": 2.890625, + "learning_rate": 4.6397809781435805e-05, + "loss": 0.7968, + "step": 9900 + }, + { + "epoch": 0.17552302831288194, + "grad_norm": 2.453125, + "learning_rate": 4.639636542002015e-05, + "loss": 0.7619, + "step": 9902 + }, + { + "epoch": 0.1755584803484935, + "grad_norm": 2.609375, + "learning_rate": 4.639492079158231e-05, + "loss": 0.8393, + "step": 9904 + }, + { + "epoch": 0.1755939323841051, + "grad_norm": 3.109375, + "learning_rate": 4.6393475896140306e-05, + "loss": 0.8553, + "step": 9906 + }, + { + "epoch": 0.17562938441971665, + "grad_norm": 2.75, + "learning_rate": 4.639203073371219e-05, + "loss": 0.8141, + "step": 9908 + }, + { + "epoch": 0.1756648364553282, + "grad_norm": 2.484375, + "learning_rate": 4.639058530431598e-05, + "loss": 0.799, + "step": 9910 + }, + { + "epoch": 0.1757002884909398, + "grad_norm": 2.875, + "learning_rate": 4.638913960796973e-05, + "loss": 0.8397, + "step": 9912 + }, + { + "epoch": 0.17573574052655136, + "grad_norm": 2.765625, + "learning_rate": 4.6387693644691464e-05, + "loss": 0.7984, + "step": 9914 + }, + { + "epoch": 0.17577119256216292, + "grad_norm": 2.375, + "learning_rate": 4.638624741449924e-05, + "loss": 0.7958, + "step": 9916 + }, + { + "epoch": 0.1758066445977745, + "grad_norm": 2.796875, + "learning_rate": 4.63848009174111e-05, + "loss": 0.8291, + "step": 9918 + }, + { + "epoch": 0.17584209663338607, + "grad_norm": 2.53125, + "learning_rate": 4.63833541534451e-05, + "loss": 0.8011, + "step": 9920 + }, + { + "epoch": 0.17587754866899763, + "grad_norm": 2.640625, + "learning_rate": 4.63819071226193e-05, + "loss": 0.7897, + "step": 9922 + }, + { + "epoch": 0.17591300070460922, + "grad_norm": 2.734375, + "learning_rate": 4.638045982495174e-05, + "loss": 0.8142, + "step": 9924 + }, + { + "epoch": 0.17594845274022078, + "grad_norm": 3.0625, + "learning_rate": 4.637901226046051e-05, + "loss": 0.785, + "step": 9926 + }, + { + "epoch": 0.17598390477583234, + "grad_norm": 2.609375, + "learning_rate": 4.6377564429163645e-05, + "loss": 0.7962, + "step": 9928 + }, + { + "epoch": 0.17601935681144393, + "grad_norm": 2.515625, + "learning_rate": 4.6376116331079235e-05, + "loss": 0.7905, + "step": 9930 + }, + { + "epoch": 0.1760548088470555, + "grad_norm": 2.515625, + "learning_rate": 4.637466796622535e-05, + "loss": 0.7657, + "step": 9932 + }, + { + "epoch": 0.17609026088266705, + "grad_norm": 2.859375, + "learning_rate": 4.637321933462006e-05, + "loss": 0.795, + "step": 9934 + }, + { + "epoch": 0.17612571291827864, + "grad_norm": 2.625, + "learning_rate": 4.6371770436281436e-05, + "loss": 0.8107, + "step": 9936 + }, + { + "epoch": 0.1761611649538902, + "grad_norm": 2.46875, + "learning_rate": 4.637032127122757e-05, + "loss": 0.777, + "step": 9938 + }, + { + "epoch": 0.17619661698950176, + "grad_norm": 2.40625, + "learning_rate": 4.636887183947655e-05, + "loss": 0.8037, + "step": 9940 + }, + { + "epoch": 0.17623206902511335, + "grad_norm": 2.703125, + "learning_rate": 4.6367422141046455e-05, + "loss": 0.8038, + "step": 9942 + }, + { + "epoch": 0.1762675210607249, + "grad_norm": 2.859375, + "learning_rate": 4.6365972175955394e-05, + "loss": 0.8317, + "step": 9944 + }, + { + "epoch": 0.17630297309633647, + "grad_norm": 2.59375, + "learning_rate": 4.636452194422144e-05, + "loss": 0.7815, + "step": 9946 + }, + { + "epoch": 0.17633842513194806, + "grad_norm": 2.6875, + "learning_rate": 4.6363071445862704e-05, + "loss": 0.8249, + "step": 9948 + }, + { + "epoch": 0.17637387716755962, + "grad_norm": 2.421875, + "learning_rate": 4.636162068089729e-05, + "loss": 0.7989, + "step": 9950 + }, + { + "epoch": 0.17640932920317118, + "grad_norm": 2.671875, + "learning_rate": 4.636016964934329e-05, + "loss": 0.8027, + "step": 9952 + }, + { + "epoch": 0.17644478123878277, + "grad_norm": 2.953125, + "learning_rate": 4.635871835121883e-05, + "loss": 0.8011, + "step": 9954 + }, + { + "epoch": 0.17648023327439433, + "grad_norm": 2.6875, + "learning_rate": 4.6357266786542006e-05, + "loss": 0.7947, + "step": 9956 + }, + { + "epoch": 0.1765156853100059, + "grad_norm": 2.859375, + "learning_rate": 4.6355814955330954e-05, + "loss": 0.8287, + "step": 9958 + }, + { + "epoch": 0.17655113734561748, + "grad_norm": 2.59375, + "learning_rate": 4.635436285760377e-05, + "loss": 0.7452, + "step": 9960 + }, + { + "epoch": 0.17658658938122904, + "grad_norm": 2.59375, + "learning_rate": 4.635291049337859e-05, + "loss": 0.784, + "step": 9962 + }, + { + "epoch": 0.1766220414168406, + "grad_norm": 2.859375, + "learning_rate": 4.635145786267353e-05, + "loss": 0.7922, + "step": 9964 + }, + { + "epoch": 0.17665749345245219, + "grad_norm": 3.03125, + "learning_rate": 4.635000496550672e-05, + "loss": 0.8115, + "step": 9966 + }, + { + "epoch": 0.17669294548806375, + "grad_norm": 2.84375, + "learning_rate": 4.63485518018963e-05, + "loss": 0.7737, + "step": 9968 + }, + { + "epoch": 0.1767283975236753, + "grad_norm": 2.71875, + "learning_rate": 4.6347098371860396e-05, + "loss": 0.8111, + "step": 9970 + }, + { + "epoch": 0.1767638495592869, + "grad_norm": 2.625, + "learning_rate": 4.634564467541715e-05, + "loss": 0.7941, + "step": 9972 + }, + { + "epoch": 0.17679930159489846, + "grad_norm": 2.65625, + "learning_rate": 4.634419071258472e-05, + "loss": 0.8295, + "step": 9974 + }, + { + "epoch": 0.17683475363051002, + "grad_norm": 2.734375, + "learning_rate": 4.634273648338122e-05, + "loss": 0.8232, + "step": 9976 + }, + { + "epoch": 0.1768702056661216, + "grad_norm": 2.921875, + "learning_rate": 4.6341281987824817e-05, + "loss": 0.7908, + "step": 9978 + }, + { + "epoch": 0.17690565770173317, + "grad_norm": 2.5625, + "learning_rate": 4.6339827225933665e-05, + "loss": 0.8186, + "step": 9980 + }, + { + "epoch": 0.17694110973734473, + "grad_norm": 2.71875, + "learning_rate": 4.633837219772591e-05, + "loss": 0.8195, + "step": 9982 + }, + { + "epoch": 0.17697656177295631, + "grad_norm": 2.609375, + "learning_rate": 4.633691690321971e-05, + "loss": 0.8031, + "step": 9984 + }, + { + "epoch": 0.17701201380856788, + "grad_norm": 2.921875, + "learning_rate": 4.633546134243324e-05, + "loss": 0.8055, + "step": 9986 + }, + { + "epoch": 0.17704746584417944, + "grad_norm": 2.765625, + "learning_rate": 4.633400551538465e-05, + "loss": 0.798, + "step": 9988 + }, + { + "epoch": 0.177082917879791, + "grad_norm": 2.609375, + "learning_rate": 4.633254942209212e-05, + "loss": 0.8628, + "step": 9990 + }, + { + "epoch": 0.17711836991540258, + "grad_norm": 2.734375, + "learning_rate": 4.633109306257381e-05, + "loss": 0.8108, + "step": 9992 + }, + { + "epoch": 0.17715382195101415, + "grad_norm": 2.765625, + "learning_rate": 4.632963643684791e-05, + "loss": 0.791, + "step": 9994 + }, + { + "epoch": 0.1771892739866257, + "grad_norm": 2.671875, + "learning_rate": 4.632817954493258e-05, + "loss": 0.8108, + "step": 9996 + }, + { + "epoch": 0.1772247260222373, + "grad_norm": 2.921875, + "learning_rate": 4.632672238684602e-05, + "loss": 0.8331, + "step": 9998 + }, + { + "epoch": 0.17726017805784886, + "grad_norm": 2.640625, + "learning_rate": 4.6325264962606395e-05, + "loss": 0.7822, + "step": 10000 + }, + { + "epoch": 0.17729563009346042, + "grad_norm": 2.421875, + "learning_rate": 4.6323807272231915e-05, + "loss": 0.7935, + "step": 10002 + }, + { + "epoch": 0.177331082129072, + "grad_norm": 2.65625, + "learning_rate": 4.6322349315740756e-05, + "loss": 0.7826, + "step": 10004 + }, + { + "epoch": 0.17736653416468356, + "grad_norm": 2.578125, + "learning_rate": 4.632089109315113e-05, + "loss": 0.8238, + "step": 10006 + }, + { + "epoch": 0.17740198620029513, + "grad_norm": 2.640625, + "learning_rate": 4.631943260448122e-05, + "loss": 0.8166, + "step": 10008 + }, + { + "epoch": 0.1774374382359067, + "grad_norm": 2.671875, + "learning_rate": 4.631797384974922e-05, + "loss": 0.7851, + "step": 10010 + }, + { + "epoch": 0.17747289027151827, + "grad_norm": 2.953125, + "learning_rate": 4.631651482897336e-05, + "loss": 0.7606, + "step": 10012 + }, + { + "epoch": 0.17750834230712983, + "grad_norm": 2.71875, + "learning_rate": 4.631505554217183e-05, + "loss": 0.8473, + "step": 10014 + }, + { + "epoch": 0.17754379434274142, + "grad_norm": 2.90625, + "learning_rate": 4.6313595989362844e-05, + "loss": 0.8125, + "step": 10016 + }, + { + "epoch": 0.17757924637835298, + "grad_norm": 2.671875, + "learning_rate": 4.631213617056462e-05, + "loss": 0.7986, + "step": 10018 + }, + { + "epoch": 0.17761469841396454, + "grad_norm": 2.90625, + "learning_rate": 4.6310676085795376e-05, + "loss": 0.8402, + "step": 10020 + }, + { + "epoch": 0.17765015044957613, + "grad_norm": 2.953125, + "learning_rate": 4.630921573507333e-05, + "loss": 0.8056, + "step": 10022 + }, + { + "epoch": 0.1776856024851877, + "grad_norm": 2.796875, + "learning_rate": 4.630775511841672e-05, + "loss": 0.7992, + "step": 10024 + }, + { + "epoch": 0.17772105452079925, + "grad_norm": 2.859375, + "learning_rate": 4.630629423584376e-05, + "loss": 0.8099, + "step": 10026 + }, + { + "epoch": 0.17775650655641084, + "grad_norm": 2.75, + "learning_rate": 4.6304833087372676e-05, + "loss": 0.8018, + "step": 10028 + }, + { + "epoch": 0.1777919585920224, + "grad_norm": 2.75, + "learning_rate": 4.6303371673021726e-05, + "loss": 0.8144, + "step": 10030 + }, + { + "epoch": 0.17782741062763396, + "grad_norm": 2.6875, + "learning_rate": 4.630190999280912e-05, + "loss": 0.8091, + "step": 10032 + }, + { + "epoch": 0.17786286266324555, + "grad_norm": 3.015625, + "learning_rate": 4.630044804675313e-05, + "loss": 0.8219, + "step": 10034 + }, + { + "epoch": 0.1778983146988571, + "grad_norm": 2.59375, + "learning_rate": 4.629898583487198e-05, + "loss": 0.8331, + "step": 10036 + }, + { + "epoch": 0.17793376673446867, + "grad_norm": 2.609375, + "learning_rate": 4.629752335718391e-05, + "loss": 0.8037, + "step": 10038 + }, + { + "epoch": 0.17796921877008026, + "grad_norm": 2.78125, + "learning_rate": 4.62960606137072e-05, + "loss": 0.8163, + "step": 10040 + }, + { + "epoch": 0.17800467080569182, + "grad_norm": 2.65625, + "learning_rate": 4.6294597604460086e-05, + "loss": 0.7786, + "step": 10042 + }, + { + "epoch": 0.17804012284130338, + "grad_norm": 2.890625, + "learning_rate": 4.629313432946083e-05, + "loss": 0.8375, + "step": 10044 + }, + { + "epoch": 0.17807557487691497, + "grad_norm": 2.75, + "learning_rate": 4.629167078872769e-05, + "loss": 0.8391, + "step": 10046 + }, + { + "epoch": 0.17811102691252653, + "grad_norm": 2.84375, + "learning_rate": 4.629020698227893e-05, + "loss": 0.7991, + "step": 10048 + }, + { + "epoch": 0.1781464789481381, + "grad_norm": 2.46875, + "learning_rate": 4.6288742910132834e-05, + "loss": 0.7886, + "step": 10050 + }, + { + "epoch": 0.17818193098374968, + "grad_norm": 2.671875, + "learning_rate": 4.628727857230765e-05, + "loss": 0.8491, + "step": 10052 + }, + { + "epoch": 0.17821738301936124, + "grad_norm": 2.484375, + "learning_rate": 4.628581396882166e-05, + "loss": 0.7983, + "step": 10054 + }, + { + "epoch": 0.1782528350549728, + "grad_norm": 2.78125, + "learning_rate": 4.628434909969315e-05, + "loss": 0.8217, + "step": 10056 + }, + { + "epoch": 0.1782882870905844, + "grad_norm": 2.625, + "learning_rate": 4.62828839649404e-05, + "loss": 0.8234, + "step": 10058 + }, + { + "epoch": 0.17832373912619595, + "grad_norm": 2.578125, + "learning_rate": 4.628141856458168e-05, + "loss": 0.8098, + "step": 10060 + }, + { + "epoch": 0.1783591911618075, + "grad_norm": 2.609375, + "learning_rate": 4.6279952898635305e-05, + "loss": 0.7831, + "step": 10062 + }, + { + "epoch": 0.1783946431974191, + "grad_norm": 2.703125, + "learning_rate": 4.627848696711954e-05, + "loss": 0.8053, + "step": 10064 + }, + { + "epoch": 0.17843009523303066, + "grad_norm": 2.796875, + "learning_rate": 4.6277020770052695e-05, + "loss": 0.8616, + "step": 10066 + }, + { + "epoch": 0.17846554726864222, + "grad_norm": 2.71875, + "learning_rate": 4.627555430745305e-05, + "loss": 0.8083, + "step": 10068 + }, + { + "epoch": 0.1785009993042538, + "grad_norm": 2.75, + "learning_rate": 4.6274087579338934e-05, + "loss": 0.7734, + "step": 10070 + }, + { + "epoch": 0.17853645133986537, + "grad_norm": 2.671875, + "learning_rate": 4.6272620585728626e-05, + "loss": 0.7799, + "step": 10072 + }, + { + "epoch": 0.17857190337547693, + "grad_norm": 2.578125, + "learning_rate": 4.627115332664045e-05, + "loss": 0.7787, + "step": 10074 + }, + { + "epoch": 0.17860735541108852, + "grad_norm": 2.515625, + "learning_rate": 4.626968580209271e-05, + "loss": 0.7949, + "step": 10076 + }, + { + "epoch": 0.17864280744670008, + "grad_norm": 2.609375, + "learning_rate": 4.6268218012103716e-05, + "loss": 0.8278, + "step": 10078 + }, + { + "epoch": 0.17867825948231164, + "grad_norm": 3.0, + "learning_rate": 4.6266749956691794e-05, + "loss": 0.8582, + "step": 10080 + }, + { + "epoch": 0.17871371151792323, + "grad_norm": 2.78125, + "learning_rate": 4.626528163587527e-05, + "loss": 0.8595, + "step": 10082 + }, + { + "epoch": 0.1787491635535348, + "grad_norm": 2.671875, + "learning_rate": 4.626381304967244e-05, + "loss": 0.8001, + "step": 10084 + }, + { + "epoch": 0.17878461558914635, + "grad_norm": 2.65625, + "learning_rate": 4.626234419810167e-05, + "loss": 0.7742, + "step": 10086 + }, + { + "epoch": 0.17882006762475794, + "grad_norm": 2.65625, + "learning_rate": 4.626087508118127e-05, + "loss": 0.7925, + "step": 10088 + }, + { + "epoch": 0.1788555196603695, + "grad_norm": 2.578125, + "learning_rate": 4.625940569892958e-05, + "loss": 0.8243, + "step": 10090 + }, + { + "epoch": 0.17889097169598106, + "grad_norm": 2.921875, + "learning_rate": 4.6257936051364927e-05, + "loss": 0.8089, + "step": 10092 + }, + { + "epoch": 0.17892642373159265, + "grad_norm": 2.640625, + "learning_rate": 4.625646613850566e-05, + "loss": 0.815, + "step": 10094 + }, + { + "epoch": 0.1789618757672042, + "grad_norm": 3.0, + "learning_rate": 4.6254995960370126e-05, + "loss": 0.8038, + "step": 10096 + }, + { + "epoch": 0.17899732780281577, + "grad_norm": 2.765625, + "learning_rate": 4.625352551697667e-05, + "loss": 0.7894, + "step": 10098 + }, + { + "epoch": 0.17903277983842736, + "grad_norm": 2.875, + "learning_rate": 4.6252054808343645e-05, + "loss": 0.7992, + "step": 10100 + }, + { + "epoch": 0.17906823187403892, + "grad_norm": 2.84375, + "learning_rate": 4.62505838344894e-05, + "loss": 0.7953, + "step": 10102 + }, + { + "epoch": 0.17910368390965048, + "grad_norm": 2.5625, + "learning_rate": 4.62491125954323e-05, + "loss": 0.8013, + "step": 10104 + }, + { + "epoch": 0.17913913594526207, + "grad_norm": 2.890625, + "learning_rate": 4.624764109119069e-05, + "loss": 0.8029, + "step": 10106 + }, + { + "epoch": 0.17917458798087363, + "grad_norm": 2.875, + "learning_rate": 4.624616932178295e-05, + "loss": 0.8491, + "step": 10108 + }, + { + "epoch": 0.1792100400164852, + "grad_norm": 2.53125, + "learning_rate": 4.624469728722744e-05, + "loss": 0.7669, + "step": 10110 + }, + { + "epoch": 0.17924549205209678, + "grad_norm": 2.640625, + "learning_rate": 4.624322498754253e-05, + "loss": 0.7937, + "step": 10112 + }, + { + "epoch": 0.17928094408770834, + "grad_norm": 2.8125, + "learning_rate": 4.624175242274661e-05, + "loss": 0.8431, + "step": 10114 + }, + { + "epoch": 0.1793163961233199, + "grad_norm": 2.625, + "learning_rate": 4.624027959285804e-05, + "loss": 0.8262, + "step": 10116 + }, + { + "epoch": 0.1793518481589315, + "grad_norm": 2.734375, + "learning_rate": 4.6238806497895194e-05, + "loss": 0.8436, + "step": 10118 + }, + { + "epoch": 0.17938730019454305, + "grad_norm": 2.9375, + "learning_rate": 4.623733313787647e-05, + "loss": 0.8014, + "step": 10120 + }, + { + "epoch": 0.1794227522301546, + "grad_norm": 2.609375, + "learning_rate": 4.623585951282026e-05, + "loss": 0.7842, + "step": 10122 + }, + { + "epoch": 0.1794582042657662, + "grad_norm": 2.609375, + "learning_rate": 4.623438562274494e-05, + "loss": 0.7823, + "step": 10124 + }, + { + "epoch": 0.17949365630137776, + "grad_norm": 2.9375, + "learning_rate": 4.623291146766892e-05, + "loss": 0.8083, + "step": 10126 + }, + { + "epoch": 0.17952910833698932, + "grad_norm": 2.828125, + "learning_rate": 4.623143704761057e-05, + "loss": 0.7822, + "step": 10128 + }, + { + "epoch": 0.1795645603726009, + "grad_norm": 2.78125, + "learning_rate": 4.622996236258832e-05, + "loss": 0.8341, + "step": 10130 + }, + { + "epoch": 0.17960001240821247, + "grad_norm": 2.484375, + "learning_rate": 4.622848741262056e-05, + "loss": 0.8124, + "step": 10132 + }, + { + "epoch": 0.17963546444382403, + "grad_norm": 2.71875, + "learning_rate": 4.6227012197725695e-05, + "loss": 0.8148, + "step": 10134 + }, + { + "epoch": 0.17967091647943562, + "grad_norm": 2.5625, + "learning_rate": 4.622553671792213e-05, + "loss": 0.7565, + "step": 10136 + }, + { + "epoch": 0.17970636851504718, + "grad_norm": 2.578125, + "learning_rate": 4.6224060973228314e-05, + "loss": 0.7274, + "step": 10138 + }, + { + "epoch": 0.17974182055065874, + "grad_norm": 2.96875, + "learning_rate": 4.622258496366262e-05, + "loss": 0.8538, + "step": 10140 + }, + { + "epoch": 0.17977727258627033, + "grad_norm": 2.96875, + "learning_rate": 4.622110868924349e-05, + "loss": 0.8225, + "step": 10142 + }, + { + "epoch": 0.1798127246218819, + "grad_norm": 2.59375, + "learning_rate": 4.6219632149989336e-05, + "loss": 0.7981, + "step": 10144 + }, + { + "epoch": 0.17984817665749345, + "grad_norm": 2.65625, + "learning_rate": 4.62181553459186e-05, + "loss": 0.8034, + "step": 10146 + }, + { + "epoch": 0.17988362869310504, + "grad_norm": 2.828125, + "learning_rate": 4.6216678277049705e-05, + "loss": 0.8419, + "step": 10148 + }, + { + "epoch": 0.1799190807287166, + "grad_norm": 2.609375, + "learning_rate": 4.621520094340108e-05, + "loss": 0.7972, + "step": 10150 + }, + { + "epoch": 0.17995453276432816, + "grad_norm": 2.578125, + "learning_rate": 4.6213723344991163e-05, + "loss": 0.7692, + "step": 10152 + }, + { + "epoch": 0.17998998479993975, + "grad_norm": 2.578125, + "learning_rate": 4.621224548183841e-05, + "loss": 0.7733, + "step": 10154 + }, + { + "epoch": 0.1800254368355513, + "grad_norm": 2.71875, + "learning_rate": 4.621076735396124e-05, + "loss": 0.7879, + "step": 10156 + }, + { + "epoch": 0.18006088887116287, + "grad_norm": 2.875, + "learning_rate": 4.620928896137812e-05, + "loss": 0.7648, + "step": 10158 + }, + { + "epoch": 0.18009634090677445, + "grad_norm": 3.015625, + "learning_rate": 4.620781030410749e-05, + "loss": 0.8248, + "step": 10160 + }, + { + "epoch": 0.18013179294238602, + "grad_norm": 2.328125, + "learning_rate": 4.62063313821678e-05, + "loss": 0.7791, + "step": 10162 + }, + { + "epoch": 0.18016724497799758, + "grad_norm": 2.859375, + "learning_rate": 4.6204852195577506e-05, + "loss": 0.8162, + "step": 10164 + }, + { + "epoch": 0.18020269701360914, + "grad_norm": 3.03125, + "learning_rate": 4.620337274435508e-05, + "loss": 0.8146, + "step": 10166 + }, + { + "epoch": 0.18023814904922072, + "grad_norm": 2.640625, + "learning_rate": 4.6201893028518986e-05, + "loss": 0.825, + "step": 10168 + }, + { + "epoch": 0.18027360108483229, + "grad_norm": 2.75, + "learning_rate": 4.620041304808767e-05, + "loss": 0.7898, + "step": 10170 + }, + { + "epoch": 0.18030905312044385, + "grad_norm": 2.65625, + "learning_rate": 4.619893280307962e-05, + "loss": 0.793, + "step": 10172 + }, + { + "epoch": 0.18034450515605543, + "grad_norm": 2.875, + "learning_rate": 4.619745229351331e-05, + "loss": 0.7651, + "step": 10174 + }, + { + "epoch": 0.180379957191667, + "grad_norm": 2.6875, + "learning_rate": 4.61959715194072e-05, + "loss": 0.7769, + "step": 10176 + }, + { + "epoch": 0.18041540922727856, + "grad_norm": 2.671875, + "learning_rate": 4.619449048077979e-05, + "loss": 0.8221, + "step": 10178 + }, + { + "epoch": 0.18045086126289014, + "grad_norm": 2.71875, + "learning_rate": 4.619300917764955e-05, + "loss": 0.7848, + "step": 10180 + }, + { + "epoch": 0.1804863132985017, + "grad_norm": 2.828125, + "learning_rate": 4.6191527610034965e-05, + "loss": 0.7816, + "step": 10182 + }, + { + "epoch": 0.18052176533411327, + "grad_norm": 2.75, + "learning_rate": 4.619004577795453e-05, + "loss": 0.7701, + "step": 10184 + }, + { + "epoch": 0.18055721736972485, + "grad_norm": 2.640625, + "learning_rate": 4.618856368142674e-05, + "loss": 0.8016, + "step": 10186 + }, + { + "epoch": 0.18059266940533641, + "grad_norm": 2.609375, + "learning_rate": 4.6187081320470096e-05, + "loss": 0.7553, + "step": 10188 + }, + { + "epoch": 0.18062812144094798, + "grad_norm": 2.84375, + "learning_rate": 4.6185598695103075e-05, + "loss": 0.7912, + "step": 10190 + }, + { + "epoch": 0.18066357347655956, + "grad_norm": 2.875, + "learning_rate": 4.6184115805344206e-05, + "loss": 0.8097, + "step": 10192 + }, + { + "epoch": 0.18069902551217112, + "grad_norm": 2.734375, + "learning_rate": 4.6182632651211976e-05, + "loss": 0.7898, + "step": 10194 + }, + { + "epoch": 0.18073447754778268, + "grad_norm": 2.765625, + "learning_rate": 4.618114923272491e-05, + "loss": 0.8266, + "step": 10196 + }, + { + "epoch": 0.18076992958339427, + "grad_norm": 2.859375, + "learning_rate": 4.6179665549901506e-05, + "loss": 0.7944, + "step": 10198 + }, + { + "epoch": 0.18080538161900583, + "grad_norm": 2.59375, + "learning_rate": 4.617818160276029e-05, + "loss": 0.7928, + "step": 10200 + }, + { + "epoch": 0.1808408336546174, + "grad_norm": 2.859375, + "learning_rate": 4.617669739131979e-05, + "loss": 0.8188, + "step": 10202 + }, + { + "epoch": 0.18087628569022898, + "grad_norm": 2.671875, + "learning_rate": 4.617521291559851e-05, + "loss": 0.7336, + "step": 10204 + }, + { + "epoch": 0.18091173772584054, + "grad_norm": 2.53125, + "learning_rate": 4.617372817561497e-05, + "loss": 0.7776, + "step": 10206 + }, + { + "epoch": 0.1809471897614521, + "grad_norm": 2.75, + "learning_rate": 4.617224317138773e-05, + "loss": 0.7534, + "step": 10208 + }, + { + "epoch": 0.1809826417970637, + "grad_norm": 3.0, + "learning_rate": 4.6170757902935296e-05, + "loss": 0.7864, + "step": 10210 + }, + { + "epoch": 0.18101809383267525, + "grad_norm": 2.609375, + "learning_rate": 4.616927237027622e-05, + "loss": 0.8448, + "step": 10212 + }, + { + "epoch": 0.1810535458682868, + "grad_norm": 2.8125, + "learning_rate": 4.616778657342903e-05, + "loss": 0.7974, + "step": 10214 + }, + { + "epoch": 0.1810889979038984, + "grad_norm": 2.625, + "learning_rate": 4.616630051241227e-05, + "loss": 0.8253, + "step": 10216 + }, + { + "epoch": 0.18112444993950996, + "grad_norm": 2.9375, + "learning_rate": 4.616481418724449e-05, + "loss": 0.8596, + "step": 10218 + }, + { + "epoch": 0.18115990197512152, + "grad_norm": 2.765625, + "learning_rate": 4.616332759794424e-05, + "loss": 0.8015, + "step": 10220 + }, + { + "epoch": 0.1811953540107331, + "grad_norm": 3.1875, + "learning_rate": 4.616184074453006e-05, + "loss": 0.857, + "step": 10222 + }, + { + "epoch": 0.18123080604634467, + "grad_norm": 2.671875, + "learning_rate": 4.616035362702053e-05, + "loss": 0.781, + "step": 10224 + }, + { + "epoch": 0.18126625808195623, + "grad_norm": 2.84375, + "learning_rate": 4.615886624543418e-05, + "loss": 0.8163, + "step": 10226 + }, + { + "epoch": 0.18130171011756782, + "grad_norm": 2.921875, + "learning_rate": 4.615737859978959e-05, + "loss": 0.8093, + "step": 10228 + }, + { + "epoch": 0.18133716215317938, + "grad_norm": 2.671875, + "learning_rate": 4.615589069010533e-05, + "loss": 0.8303, + "step": 10230 + }, + { + "epoch": 0.18137261418879094, + "grad_norm": 2.96875, + "learning_rate": 4.615440251639995e-05, + "loss": 0.8302, + "step": 10232 + }, + { + "epoch": 0.18140806622440253, + "grad_norm": 2.328125, + "learning_rate": 4.6152914078692046e-05, + "loss": 0.8092, + "step": 10234 + }, + { + "epoch": 0.1814435182600141, + "grad_norm": 2.53125, + "learning_rate": 4.615142537700017e-05, + "loss": 0.8085, + "step": 10236 + }, + { + "epoch": 0.18147897029562565, + "grad_norm": 2.75, + "learning_rate": 4.614993641134291e-05, + "loss": 0.7855, + "step": 10238 + }, + { + "epoch": 0.18151442233123724, + "grad_norm": 2.78125, + "learning_rate": 4.614844718173885e-05, + "loss": 0.8175, + "step": 10240 + }, + { + "epoch": 0.1815498743668488, + "grad_norm": 2.953125, + "learning_rate": 4.6146957688206585e-05, + "loss": 0.8237, + "step": 10242 + }, + { + "epoch": 0.18158532640246036, + "grad_norm": 2.90625, + "learning_rate": 4.614546793076467e-05, + "loss": 0.8189, + "step": 10244 + }, + { + "epoch": 0.18162077843807195, + "grad_norm": 2.578125, + "learning_rate": 4.614397790943174e-05, + "loss": 0.7813, + "step": 10246 + }, + { + "epoch": 0.1816562304736835, + "grad_norm": 2.71875, + "learning_rate": 4.6142487624226364e-05, + "loss": 0.8565, + "step": 10248 + }, + { + "epoch": 0.18169168250929507, + "grad_norm": 2.78125, + "learning_rate": 4.614099707516715e-05, + "loss": 0.7952, + "step": 10250 + }, + { + "epoch": 0.18172713454490666, + "grad_norm": 2.890625, + "learning_rate": 4.6139506262272684e-05, + "loss": 0.772, + "step": 10252 + }, + { + "epoch": 0.18176258658051822, + "grad_norm": 2.8125, + "learning_rate": 4.613801518556159e-05, + "loss": 0.8336, + "step": 10254 + }, + { + "epoch": 0.18179803861612978, + "grad_norm": 2.65625, + "learning_rate": 4.613652384505247e-05, + "loss": 0.8098, + "step": 10256 + }, + { + "epoch": 0.18183349065174137, + "grad_norm": 3.09375, + "learning_rate": 4.613503224076393e-05, + "loss": 0.7743, + "step": 10258 + }, + { + "epoch": 0.18186894268735293, + "grad_norm": 2.78125, + "learning_rate": 4.613354037271459e-05, + "loss": 0.8096, + "step": 10260 + }, + { + "epoch": 0.1819043947229645, + "grad_norm": 2.828125, + "learning_rate": 4.6132048240923075e-05, + "loss": 0.8208, + "step": 10262 + }, + { + "epoch": 0.18193984675857608, + "grad_norm": 2.9375, + "learning_rate": 4.6130555845408e-05, + "loss": 0.8287, + "step": 10264 + }, + { + "epoch": 0.18197529879418764, + "grad_norm": 3.03125, + "learning_rate": 4.612906318618798e-05, + "loss": 0.8224, + "step": 10266 + }, + { + "epoch": 0.1820107508297992, + "grad_norm": 2.828125, + "learning_rate": 4.612757026328166e-05, + "loss": 0.8103, + "step": 10268 + }, + { + "epoch": 0.1820462028654108, + "grad_norm": 2.875, + "learning_rate": 4.6126077076707665e-05, + "loss": 0.8066, + "step": 10270 + }, + { + "epoch": 0.18208165490102235, + "grad_norm": 2.4375, + "learning_rate": 4.612458362648462e-05, + "loss": 0.7646, + "step": 10272 + }, + { + "epoch": 0.1821171069366339, + "grad_norm": 2.671875, + "learning_rate": 4.612308991263118e-05, + "loss": 0.8349, + "step": 10274 + }, + { + "epoch": 0.1821525589722455, + "grad_norm": 2.78125, + "learning_rate": 4.612159593516597e-05, + "loss": 0.8153, + "step": 10276 + }, + { + "epoch": 0.18218801100785706, + "grad_norm": 2.671875, + "learning_rate": 4.612010169410764e-05, + "loss": 0.7964, + "step": 10278 + }, + { + "epoch": 0.18222346304346862, + "grad_norm": 2.765625, + "learning_rate": 4.611860718947485e-05, + "loss": 0.7736, + "step": 10280 + }, + { + "epoch": 0.1822589150790802, + "grad_norm": 2.75, + "learning_rate": 4.6117112421286235e-05, + "loss": 0.819, + "step": 10282 + }, + { + "epoch": 0.18229436711469177, + "grad_norm": 2.53125, + "learning_rate": 4.611561738956046e-05, + "loss": 0.8259, + "step": 10284 + }, + { + "epoch": 0.18232981915030333, + "grad_norm": 3.203125, + "learning_rate": 4.611412209431617e-05, + "loss": 0.8199, + "step": 10286 + }, + { + "epoch": 0.18236527118591492, + "grad_norm": 2.984375, + "learning_rate": 4.6112626535572035e-05, + "loss": 0.7539, + "step": 10288 + }, + { + "epoch": 0.18240072322152648, + "grad_norm": 2.671875, + "learning_rate": 4.611113071334673e-05, + "loss": 0.7697, + "step": 10290 + }, + { + "epoch": 0.18243617525713804, + "grad_norm": 3.03125, + "learning_rate": 4.610963462765889e-05, + "loss": 0.8111, + "step": 10292 + }, + { + "epoch": 0.18247162729274963, + "grad_norm": 2.84375, + "learning_rate": 4.6108138278527226e-05, + "loss": 0.8284, + "step": 10294 + }, + { + "epoch": 0.1825070793283612, + "grad_norm": 2.828125, + "learning_rate": 4.610664166597039e-05, + "loss": 0.8089, + "step": 10296 + }, + { + "epoch": 0.18254253136397275, + "grad_norm": 2.96875, + "learning_rate": 4.610514479000706e-05, + "loss": 0.8126, + "step": 10298 + }, + { + "epoch": 0.18257798339958434, + "grad_norm": 2.875, + "learning_rate": 4.610364765065591e-05, + "loss": 0.8213, + "step": 10300 + }, + { + "epoch": 0.1826134354351959, + "grad_norm": 2.703125, + "learning_rate": 4.610215024793564e-05, + "loss": 0.7928, + "step": 10302 + }, + { + "epoch": 0.18264888747080746, + "grad_norm": 2.640625, + "learning_rate": 4.6100652581864925e-05, + "loss": 0.7968, + "step": 10304 + }, + { + "epoch": 0.18268433950641905, + "grad_norm": 2.625, + "learning_rate": 4.6099154652462474e-05, + "loss": 0.8257, + "step": 10306 + }, + { + "epoch": 0.1827197915420306, + "grad_norm": 2.71875, + "learning_rate": 4.609765645974695e-05, + "loss": 0.8239, + "step": 10308 + }, + { + "epoch": 0.18275524357764217, + "grad_norm": 2.859375, + "learning_rate": 4.609615800373708e-05, + "loss": 0.8294, + "step": 10310 + }, + { + "epoch": 0.18279069561325376, + "grad_norm": 2.71875, + "learning_rate": 4.609465928445155e-05, + "loss": 0.7955, + "step": 10312 + }, + { + "epoch": 0.18282614764886532, + "grad_norm": 2.765625, + "learning_rate": 4.609316030190906e-05, + "loss": 0.7991, + "step": 10314 + }, + { + "epoch": 0.18286159968447688, + "grad_norm": 2.59375, + "learning_rate": 4.609166105612833e-05, + "loss": 0.7639, + "step": 10316 + }, + { + "epoch": 0.18289705172008847, + "grad_norm": 2.859375, + "learning_rate": 4.6090161547128065e-05, + "loss": 0.8347, + "step": 10318 + }, + { + "epoch": 0.18293250375570003, + "grad_norm": 2.65625, + "learning_rate": 4.6088661774926975e-05, + "loss": 0.7859, + "step": 10320 + }, + { + "epoch": 0.1829679557913116, + "grad_norm": 2.78125, + "learning_rate": 4.608716173954377e-05, + "loss": 0.8036, + "step": 10322 + }, + { + "epoch": 0.18300340782692318, + "grad_norm": 2.5, + "learning_rate": 4.6085661440997185e-05, + "loss": 0.7947, + "step": 10324 + }, + { + "epoch": 0.18303885986253474, + "grad_norm": 2.859375, + "learning_rate": 4.608416087930594e-05, + "loss": 0.7943, + "step": 10326 + }, + { + "epoch": 0.1830743118981463, + "grad_norm": 2.671875, + "learning_rate": 4.608266005448876e-05, + "loss": 0.8049, + "step": 10328 + }, + { + "epoch": 0.18310976393375789, + "grad_norm": 2.59375, + "learning_rate": 4.608115896656437e-05, + "loss": 0.7881, + "step": 10330 + }, + { + "epoch": 0.18314521596936945, + "grad_norm": 2.9375, + "learning_rate": 4.6079657615551495e-05, + "loss": 0.8153, + "step": 10332 + }, + { + "epoch": 0.183180668004981, + "grad_norm": 2.5625, + "learning_rate": 4.60781560014689e-05, + "loss": 0.8135, + "step": 10334 + }, + { + "epoch": 0.18321612004059257, + "grad_norm": 2.421875, + "learning_rate": 4.607665412433531e-05, + "loss": 0.7861, + "step": 10336 + }, + { + "epoch": 0.18325157207620416, + "grad_norm": 2.65625, + "learning_rate": 4.607515198416945e-05, + "loss": 0.8023, + "step": 10338 + }, + { + "epoch": 0.18328702411181572, + "grad_norm": 2.6875, + "learning_rate": 4.6073649580990096e-05, + "loss": 0.7736, + "step": 10340 + }, + { + "epoch": 0.18332247614742728, + "grad_norm": 2.65625, + "learning_rate": 4.607214691481598e-05, + "loss": 0.825, + "step": 10342 + }, + { + "epoch": 0.18335792818303887, + "grad_norm": 2.71875, + "learning_rate": 4.6070643985665864e-05, + "loss": 0.8063, + "step": 10344 + }, + { + "epoch": 0.18339338021865043, + "grad_norm": 2.875, + "learning_rate": 4.6069140793558495e-05, + "loss": 0.806, + "step": 10346 + }, + { + "epoch": 0.183428832254262, + "grad_norm": 2.5625, + "learning_rate": 4.606763733851264e-05, + "loss": 0.8334, + "step": 10348 + }, + { + "epoch": 0.18346428428987357, + "grad_norm": 2.8125, + "learning_rate": 4.606613362054706e-05, + "loss": 0.8018, + "step": 10350 + }, + { + "epoch": 0.18349973632548514, + "grad_norm": 3.0, + "learning_rate": 4.6064629639680514e-05, + "loss": 0.8372, + "step": 10352 + }, + { + "epoch": 0.1835351883610967, + "grad_norm": 2.984375, + "learning_rate": 4.606312539593178e-05, + "loss": 0.8061, + "step": 10354 + }, + { + "epoch": 0.18357064039670828, + "grad_norm": 2.75, + "learning_rate": 4.606162088931963e-05, + "loss": 0.814, + "step": 10356 + }, + { + "epoch": 0.18360609243231985, + "grad_norm": 2.84375, + "learning_rate": 4.606011611986283e-05, + "loss": 0.8303, + "step": 10358 + }, + { + "epoch": 0.1836415444679314, + "grad_norm": 2.828125, + "learning_rate": 4.605861108758018e-05, + "loss": 0.813, + "step": 10360 + }, + { + "epoch": 0.183676996503543, + "grad_norm": 3.03125, + "learning_rate": 4.6057105792490446e-05, + "loss": 0.8015, + "step": 10362 + }, + { + "epoch": 0.18371244853915455, + "grad_norm": 2.546875, + "learning_rate": 4.605560023461242e-05, + "loss": 0.7731, + "step": 10364 + }, + { + "epoch": 0.18374790057476612, + "grad_norm": 2.765625, + "learning_rate": 4.6054094413964876e-05, + "loss": 0.768, + "step": 10366 + }, + { + "epoch": 0.1837833526103777, + "grad_norm": 2.96875, + "learning_rate": 4.605258833056663e-05, + "loss": 0.7992, + "step": 10368 + }, + { + "epoch": 0.18381880464598926, + "grad_norm": 2.796875, + "learning_rate": 4.605108198443647e-05, + "loss": 0.81, + "step": 10370 + }, + { + "epoch": 0.18385425668160083, + "grad_norm": 2.625, + "learning_rate": 4.604957537559318e-05, + "loss": 0.8038, + "step": 10372 + }, + { + "epoch": 0.1838897087172124, + "grad_norm": 2.890625, + "learning_rate": 4.604806850405559e-05, + "loss": 0.8335, + "step": 10374 + }, + { + "epoch": 0.18392516075282397, + "grad_norm": 2.78125, + "learning_rate": 4.604656136984247e-05, + "loss": 0.8094, + "step": 10376 + }, + { + "epoch": 0.18396061278843553, + "grad_norm": 2.578125, + "learning_rate": 4.6045053972972654e-05, + "loss": 0.8018, + "step": 10378 + }, + { + "epoch": 0.18399606482404712, + "grad_norm": 2.6875, + "learning_rate": 4.604354631346495e-05, + "loss": 0.7497, + "step": 10380 + }, + { + "epoch": 0.18403151685965868, + "grad_norm": 2.640625, + "learning_rate": 4.6042038391338174e-05, + "loss": 0.7949, + "step": 10382 + }, + { + "epoch": 0.18406696889527024, + "grad_norm": 2.46875, + "learning_rate": 4.6040530206611146e-05, + "loss": 0.7979, + "step": 10384 + }, + { + "epoch": 0.18410242093088183, + "grad_norm": 2.703125, + "learning_rate": 4.603902175930267e-05, + "loss": 0.8234, + "step": 10386 + }, + { + "epoch": 0.1841378729664934, + "grad_norm": 2.890625, + "learning_rate": 4.603751304943159e-05, + "loss": 0.8141, + "step": 10388 + }, + { + "epoch": 0.18417332500210495, + "grad_norm": 3.171875, + "learning_rate": 4.603600407701673e-05, + "loss": 0.779, + "step": 10390 + }, + { + "epoch": 0.18420877703771654, + "grad_norm": 2.671875, + "learning_rate": 4.603449484207692e-05, + "loss": 0.7836, + "step": 10392 + }, + { + "epoch": 0.1842442290733281, + "grad_norm": 2.75, + "learning_rate": 4.6032985344631e-05, + "loss": 0.8044, + "step": 10394 + }, + { + "epoch": 0.18427968110893966, + "grad_norm": 2.5625, + "learning_rate": 4.60314755846978e-05, + "loss": 0.8166, + "step": 10396 + }, + { + "epoch": 0.18431513314455125, + "grad_norm": 2.734375, + "learning_rate": 4.602996556229616e-05, + "loss": 0.8144, + "step": 10398 + }, + { + "epoch": 0.1843505851801628, + "grad_norm": 2.765625, + "learning_rate": 4.6028455277444936e-05, + "loss": 0.7835, + "step": 10400 + }, + { + "epoch": 0.18438603721577437, + "grad_norm": 2.828125, + "learning_rate": 4.602694473016297e-05, + "loss": 0.8057, + "step": 10402 + }, + { + "epoch": 0.18442148925138596, + "grad_norm": 2.875, + "learning_rate": 4.6025433920469117e-05, + "loss": 0.7827, + "step": 10404 + }, + { + "epoch": 0.18445694128699752, + "grad_norm": 2.6875, + "learning_rate": 4.6023922848382215e-05, + "loss": 0.7865, + "step": 10406 + }, + { + "epoch": 0.18449239332260908, + "grad_norm": 2.96875, + "learning_rate": 4.6022411513921146e-05, + "loss": 0.8172, + "step": 10408 + }, + { + "epoch": 0.18452784535822067, + "grad_norm": 2.90625, + "learning_rate": 4.602089991710475e-05, + "loss": 0.8207, + "step": 10410 + }, + { + "epoch": 0.18456329739383223, + "grad_norm": 2.90625, + "learning_rate": 4.6019388057951916e-05, + "loss": 0.8287, + "step": 10412 + }, + { + "epoch": 0.1845987494294438, + "grad_norm": 2.875, + "learning_rate": 4.6017875936481494e-05, + "loss": 0.801, + "step": 10414 + }, + { + "epoch": 0.18463420146505538, + "grad_norm": 2.734375, + "learning_rate": 4.601636355271235e-05, + "loss": 0.8247, + "step": 10416 + }, + { + "epoch": 0.18466965350066694, + "grad_norm": 2.765625, + "learning_rate": 4.601485090666337e-05, + "loss": 0.8076, + "step": 10418 + }, + { + "epoch": 0.1847051055362785, + "grad_norm": 2.5, + "learning_rate": 4.601333799835343e-05, + "loss": 0.8205, + "step": 10420 + }, + { + "epoch": 0.1847405575718901, + "grad_norm": 2.890625, + "learning_rate": 4.60118248278014e-05, + "loss": 0.7802, + "step": 10422 + }, + { + "epoch": 0.18477600960750165, + "grad_norm": 2.984375, + "learning_rate": 4.601031139502619e-05, + "loss": 0.791, + "step": 10424 + }, + { + "epoch": 0.1848114616431132, + "grad_norm": 2.53125, + "learning_rate": 4.6008797700046647e-05, + "loss": 0.8082, + "step": 10426 + }, + { + "epoch": 0.1848469136787248, + "grad_norm": 2.578125, + "learning_rate": 4.6007283742881704e-05, + "loss": 0.7884, + "step": 10428 + }, + { + "epoch": 0.18488236571433636, + "grad_norm": 2.75, + "learning_rate": 4.6005769523550226e-05, + "loss": 0.7965, + "step": 10430 + }, + { + "epoch": 0.18491781774994792, + "grad_norm": 2.828125, + "learning_rate": 4.600425504207112e-05, + "loss": 0.7819, + "step": 10432 + }, + { + "epoch": 0.1849532697855595, + "grad_norm": 2.640625, + "learning_rate": 4.600274029846329e-05, + "loss": 0.8161, + "step": 10434 + }, + { + "epoch": 0.18498872182117107, + "grad_norm": 2.65625, + "learning_rate": 4.600122529274563e-05, + "loss": 0.7533, + "step": 10436 + }, + { + "epoch": 0.18502417385678263, + "grad_norm": 2.765625, + "learning_rate": 4.599971002493706e-05, + "loss": 0.7857, + "step": 10438 + }, + { + "epoch": 0.18505962589239422, + "grad_norm": 2.9375, + "learning_rate": 4.599819449505647e-05, + "loss": 0.741, + "step": 10440 + }, + { + "epoch": 0.18509507792800578, + "grad_norm": 2.6875, + "learning_rate": 4.5996678703122794e-05, + "loss": 0.7814, + "step": 10442 + }, + { + "epoch": 0.18513052996361734, + "grad_norm": 2.96875, + "learning_rate": 4.5995162649154944e-05, + "loss": 0.8245, + "step": 10444 + }, + { + "epoch": 0.18516598199922893, + "grad_norm": 3.21875, + "learning_rate": 4.5993646333171837e-05, + "loss": 0.8011, + "step": 10446 + }, + { + "epoch": 0.1852014340348405, + "grad_norm": 2.6875, + "learning_rate": 4.599212975519239e-05, + "loss": 0.7667, + "step": 10448 + }, + { + "epoch": 0.18523688607045205, + "grad_norm": 2.84375, + "learning_rate": 4.5990612915235545e-05, + "loss": 0.8283, + "step": 10450 + }, + { + "epoch": 0.18527233810606364, + "grad_norm": 2.859375, + "learning_rate": 4.598909581332021e-05, + "loss": 0.8087, + "step": 10452 + }, + { + "epoch": 0.1853077901416752, + "grad_norm": 2.671875, + "learning_rate": 4.598757844946534e-05, + "loss": 0.8038, + "step": 10454 + }, + { + "epoch": 0.18534324217728676, + "grad_norm": 2.828125, + "learning_rate": 4.598606082368986e-05, + "loss": 0.8039, + "step": 10456 + }, + { + "epoch": 0.18537869421289835, + "grad_norm": 2.96875, + "learning_rate": 4.5984542936012716e-05, + "loss": 0.7779, + "step": 10458 + }, + { + "epoch": 0.1854141462485099, + "grad_norm": 3.0, + "learning_rate": 4.598302478645284e-05, + "loss": 0.8191, + "step": 10460 + }, + { + "epoch": 0.18544959828412147, + "grad_norm": 2.78125, + "learning_rate": 4.5981506375029194e-05, + "loss": 0.7917, + "step": 10462 + }, + { + "epoch": 0.18548505031973306, + "grad_norm": 2.71875, + "learning_rate": 4.597998770176071e-05, + "loss": 0.8076, + "step": 10464 + }, + { + "epoch": 0.18552050235534462, + "grad_norm": 2.390625, + "learning_rate": 4.597846876666635e-05, + "loss": 0.7947, + "step": 10466 + }, + { + "epoch": 0.18555595439095618, + "grad_norm": 2.90625, + "learning_rate": 4.597694956976508e-05, + "loss": 0.8099, + "step": 10468 + }, + { + "epoch": 0.18559140642656777, + "grad_norm": 3.046875, + "learning_rate": 4.597543011107584e-05, + "loss": 0.8352, + "step": 10470 + }, + { + "epoch": 0.18562685846217933, + "grad_norm": 2.890625, + "learning_rate": 4.59739103906176e-05, + "loss": 0.7795, + "step": 10472 + }, + { + "epoch": 0.1856623104977909, + "grad_norm": 2.71875, + "learning_rate": 4.597239040840933e-05, + "loss": 0.7894, + "step": 10474 + }, + { + "epoch": 0.18569776253340248, + "grad_norm": 3.046875, + "learning_rate": 4.5970870164469995e-05, + "loss": 0.8411, + "step": 10476 + }, + { + "epoch": 0.18573321456901404, + "grad_norm": 2.984375, + "learning_rate": 4.5969349658818575e-05, + "loss": 0.7486, + "step": 10478 + }, + { + "epoch": 0.1857686666046256, + "grad_norm": 2.640625, + "learning_rate": 4.596782889147403e-05, + "loss": 0.8125, + "step": 10480 + }, + { + "epoch": 0.1858041186402372, + "grad_norm": 2.671875, + "learning_rate": 4.5966307862455344e-05, + "loss": 0.8195, + "step": 10482 + }, + { + "epoch": 0.18583957067584875, + "grad_norm": 2.84375, + "learning_rate": 4.596478657178151e-05, + "loss": 0.8261, + "step": 10484 + }, + { + "epoch": 0.1858750227114603, + "grad_norm": 2.71875, + "learning_rate": 4.5963265019471504e-05, + "loss": 0.809, + "step": 10486 + }, + { + "epoch": 0.1859104747470719, + "grad_norm": 2.9375, + "learning_rate": 4.596174320554432e-05, + "loss": 0.8152, + "step": 10488 + }, + { + "epoch": 0.18594592678268346, + "grad_norm": 2.59375, + "learning_rate": 4.5960221130018946e-05, + "loss": 0.7875, + "step": 10490 + }, + { + "epoch": 0.18598137881829502, + "grad_norm": 2.859375, + "learning_rate": 4.5958698792914364e-05, + "loss": 0.8312, + "step": 10492 + }, + { + "epoch": 0.1860168308539066, + "grad_norm": 2.671875, + "learning_rate": 4.595717619424961e-05, + "loss": 0.746, + "step": 10494 + }, + { + "epoch": 0.18605228288951817, + "grad_norm": 2.796875, + "learning_rate": 4.595565333404365e-05, + "loss": 0.8165, + "step": 10496 + }, + { + "epoch": 0.18608773492512973, + "grad_norm": 2.625, + "learning_rate": 4.59541302123155e-05, + "loss": 0.7756, + "step": 10498 + }, + { + "epoch": 0.18612318696074132, + "grad_norm": 2.3125, + "learning_rate": 4.595260682908417e-05, + "loss": 0.7439, + "step": 10500 + }, + { + "epoch": 0.18615863899635288, + "grad_norm": 2.875, + "learning_rate": 4.595108318436867e-05, + "loss": 0.8294, + "step": 10502 + }, + { + "epoch": 0.18619409103196444, + "grad_norm": 2.703125, + "learning_rate": 4.594955927818802e-05, + "loss": 0.7616, + "step": 10504 + }, + { + "epoch": 0.186229543067576, + "grad_norm": 2.6875, + "learning_rate": 4.5948035110561236e-05, + "loss": 0.7847, + "step": 10506 + }, + { + "epoch": 0.1862649951031876, + "grad_norm": 2.890625, + "learning_rate": 4.5946510681507326e-05, + "loss": 0.8121, + "step": 10508 + }, + { + "epoch": 0.18630044713879915, + "grad_norm": 2.8125, + "learning_rate": 4.5944985991045333e-05, + "loss": 0.8213, + "step": 10510 + }, + { + "epoch": 0.1863358991744107, + "grad_norm": 2.890625, + "learning_rate": 4.594346103919428e-05, + "loss": 0.8258, + "step": 10512 + }, + { + "epoch": 0.1863713512100223, + "grad_norm": 2.859375, + "learning_rate": 4.594193582597319e-05, + "loss": 0.8081, + "step": 10514 + }, + { + "epoch": 0.18640680324563386, + "grad_norm": 2.796875, + "learning_rate": 4.594041035140111e-05, + "loss": 0.7971, + "step": 10516 + }, + { + "epoch": 0.18644225528124542, + "grad_norm": 2.5625, + "learning_rate": 4.593888461549706e-05, + "loss": 0.7671, + "step": 10518 + }, + { + "epoch": 0.186477707316857, + "grad_norm": 2.796875, + "learning_rate": 4.59373586182801e-05, + "loss": 0.8175, + "step": 10520 + }, + { + "epoch": 0.18651315935246857, + "grad_norm": 2.96875, + "learning_rate": 4.593583235976926e-05, + "loss": 0.7981, + "step": 10522 + }, + { + "epoch": 0.18654861138808013, + "grad_norm": 2.953125, + "learning_rate": 4.593430583998359e-05, + "loss": 0.8102, + "step": 10524 + }, + { + "epoch": 0.18658406342369172, + "grad_norm": 2.921875, + "learning_rate": 4.5932779058942154e-05, + "loss": 0.76, + "step": 10526 + }, + { + "epoch": 0.18661951545930328, + "grad_norm": 2.421875, + "learning_rate": 4.5931252016663985e-05, + "loss": 0.8077, + "step": 10528 + }, + { + "epoch": 0.18665496749491484, + "grad_norm": 2.953125, + "learning_rate": 4.592972471316815e-05, + "loss": 0.8312, + "step": 10530 + }, + { + "epoch": 0.18669041953052642, + "grad_norm": 2.828125, + "learning_rate": 4.5928197148473726e-05, + "loss": 0.7986, + "step": 10532 + }, + { + "epoch": 0.18672587156613799, + "grad_norm": 2.953125, + "learning_rate": 4.5926669322599746e-05, + "loss": 0.8203, + "step": 10534 + }, + { + "epoch": 0.18676132360174955, + "grad_norm": 2.875, + "learning_rate": 4.5925141235565294e-05, + "loss": 0.776, + "step": 10536 + }, + { + "epoch": 0.18679677563736113, + "grad_norm": 2.875, + "learning_rate": 4.592361288738945e-05, + "loss": 0.8089, + "step": 10538 + }, + { + "epoch": 0.1868322276729727, + "grad_norm": 2.875, + "learning_rate": 4.592208427809125e-05, + "loss": 0.7796, + "step": 10540 + }, + { + "epoch": 0.18686767970858426, + "grad_norm": 2.890625, + "learning_rate": 4.592055540768981e-05, + "loss": 0.7896, + "step": 10542 + }, + { + "epoch": 0.18690313174419584, + "grad_norm": 2.40625, + "learning_rate": 4.59190262762042e-05, + "loss": 0.8076, + "step": 10544 + }, + { + "epoch": 0.1869385837798074, + "grad_norm": 2.859375, + "learning_rate": 4.591749688365349e-05, + "loss": 0.8327, + "step": 10546 + }, + { + "epoch": 0.18697403581541897, + "grad_norm": 2.671875, + "learning_rate": 4.5915967230056786e-05, + "loss": 0.766, + "step": 10548 + }, + { + "epoch": 0.18700948785103055, + "grad_norm": 2.8125, + "learning_rate": 4.591443731543316e-05, + "loss": 0.8005, + "step": 10550 + }, + { + "epoch": 0.18704493988664211, + "grad_norm": 2.359375, + "learning_rate": 4.591290713980172e-05, + "loss": 0.8138, + "step": 10552 + }, + { + "epoch": 0.18708039192225367, + "grad_norm": 2.546875, + "learning_rate": 4.591137670318155e-05, + "loss": 0.8359, + "step": 10554 + }, + { + "epoch": 0.18711584395786526, + "grad_norm": 2.46875, + "learning_rate": 4.590984600559175e-05, + "loss": 0.8147, + "step": 10556 + }, + { + "epoch": 0.18715129599347682, + "grad_norm": 2.90625, + "learning_rate": 4.590831504705143e-05, + "loss": 0.8025, + "step": 10558 + }, + { + "epoch": 0.18718674802908838, + "grad_norm": 2.609375, + "learning_rate": 4.590678382757969e-05, + "loss": 0.8017, + "step": 10560 + }, + { + "epoch": 0.18722220006469997, + "grad_norm": 2.78125, + "learning_rate": 4.590525234719565e-05, + "loss": 0.7923, + "step": 10562 + }, + { + "epoch": 0.18725765210031153, + "grad_norm": 2.671875, + "learning_rate": 4.590372060591841e-05, + "loss": 0.7839, + "step": 10564 + }, + { + "epoch": 0.1872931041359231, + "grad_norm": 2.796875, + "learning_rate": 4.5902188603767094e-05, + "loss": 0.814, + "step": 10566 + }, + { + "epoch": 0.18732855617153468, + "grad_norm": 2.671875, + "learning_rate": 4.590065634076082e-05, + "loss": 0.8513, + "step": 10568 + }, + { + "epoch": 0.18736400820714624, + "grad_norm": 2.71875, + "learning_rate": 4.58991238169187e-05, + "loss": 0.7772, + "step": 10570 + }, + { + "epoch": 0.1873994602427578, + "grad_norm": 3.0, + "learning_rate": 4.589759103225987e-05, + "loss": 0.8171, + "step": 10572 + }, + { + "epoch": 0.1874349122783694, + "grad_norm": 2.703125, + "learning_rate": 4.589605798680346e-05, + "loss": 0.8193, + "step": 10574 + }, + { + "epoch": 0.18747036431398095, + "grad_norm": 2.9375, + "learning_rate": 4.5894524680568596e-05, + "loss": 0.8185, + "step": 10576 + }, + { + "epoch": 0.1875058163495925, + "grad_norm": 2.71875, + "learning_rate": 4.589299111357441e-05, + "loss": 0.788, + "step": 10578 + }, + { + "epoch": 0.1875412683852041, + "grad_norm": 2.484375, + "learning_rate": 4.589145728584006e-05, + "loss": 0.7515, + "step": 10580 + }, + { + "epoch": 0.18757672042081566, + "grad_norm": 2.984375, + "learning_rate": 4.588992319738466e-05, + "loss": 0.7798, + "step": 10582 + }, + { + "epoch": 0.18761217245642722, + "grad_norm": 2.453125, + "learning_rate": 4.588838884822738e-05, + "loss": 0.7761, + "step": 10584 + }, + { + "epoch": 0.1876476244920388, + "grad_norm": 3.0, + "learning_rate": 4.5886854238387364e-05, + "loss": 0.8103, + "step": 10586 + }, + { + "epoch": 0.18768307652765037, + "grad_norm": 2.8125, + "learning_rate": 4.588531936788375e-05, + "loss": 0.8216, + "step": 10588 + }, + { + "epoch": 0.18771852856326193, + "grad_norm": 2.671875, + "learning_rate": 4.588378423673569e-05, + "loss": 0.8116, + "step": 10590 + }, + { + "epoch": 0.18775398059887352, + "grad_norm": 2.765625, + "learning_rate": 4.588224884496237e-05, + "loss": 0.8287, + "step": 10592 + }, + { + "epoch": 0.18778943263448508, + "grad_norm": 2.59375, + "learning_rate": 4.588071319258293e-05, + "loss": 0.7659, + "step": 10594 + }, + { + "epoch": 0.18782488467009664, + "grad_norm": 2.5, + "learning_rate": 4.587917727961652e-05, + "loss": 0.8064, + "step": 10596 + }, + { + "epoch": 0.18786033670570823, + "grad_norm": 2.75, + "learning_rate": 4.587764110608235e-05, + "loss": 0.7971, + "step": 10598 + }, + { + "epoch": 0.1878957887413198, + "grad_norm": 2.6875, + "learning_rate": 4.5876104671999556e-05, + "loss": 0.8327, + "step": 10600 + }, + { + "epoch": 0.18793124077693135, + "grad_norm": 2.53125, + "learning_rate": 4.5874567977387326e-05, + "loss": 0.7434, + "step": 10602 + }, + { + "epoch": 0.18796669281254294, + "grad_norm": 2.625, + "learning_rate": 4.5873031022264834e-05, + "loss": 0.8094, + "step": 10604 + }, + { + "epoch": 0.1880021448481545, + "grad_norm": 2.8125, + "learning_rate": 4.5871493806651265e-05, + "loss": 0.7896, + "step": 10606 + }, + { + "epoch": 0.18803759688376606, + "grad_norm": 2.59375, + "learning_rate": 4.58699563305658e-05, + "loss": 0.792, + "step": 10608 + }, + { + "epoch": 0.18807304891937765, + "grad_norm": 2.75, + "learning_rate": 4.586841859402763e-05, + "loss": 0.8111, + "step": 10610 + }, + { + "epoch": 0.1881085009549892, + "grad_norm": 2.984375, + "learning_rate": 4.586688059705593e-05, + "loss": 0.8222, + "step": 10612 + }, + { + "epoch": 0.18814395299060077, + "grad_norm": 3.140625, + "learning_rate": 4.586534233966992e-05, + "loss": 0.8149, + "step": 10614 + }, + { + "epoch": 0.18817940502621236, + "grad_norm": 2.625, + "learning_rate": 4.5863803821888775e-05, + "loss": 0.7729, + "step": 10616 + }, + { + "epoch": 0.18821485706182392, + "grad_norm": 2.65625, + "learning_rate": 4.586226504373171e-05, + "loss": 0.8154, + "step": 10618 + }, + { + "epoch": 0.18825030909743548, + "grad_norm": 2.6875, + "learning_rate": 4.5860726005217924e-05, + "loss": 0.8175, + "step": 10620 + }, + { + "epoch": 0.18828576113304707, + "grad_norm": 2.828125, + "learning_rate": 4.585918670636662e-05, + "loss": 0.8531, + "step": 10622 + }, + { + "epoch": 0.18832121316865863, + "grad_norm": 2.765625, + "learning_rate": 4.5857647147197e-05, + "loss": 0.7576, + "step": 10624 + }, + { + "epoch": 0.1883566652042702, + "grad_norm": 2.578125, + "learning_rate": 4.5856107327728305e-05, + "loss": 0.7785, + "step": 10626 + }, + { + "epoch": 0.18839211723988178, + "grad_norm": 2.8125, + "learning_rate": 4.5854567247979727e-05, + "loss": 0.7932, + "step": 10628 + }, + { + "epoch": 0.18842756927549334, + "grad_norm": 2.890625, + "learning_rate": 4.5853026907970484e-05, + "loss": 0.7907, + "step": 10630 + }, + { + "epoch": 0.1884630213111049, + "grad_norm": 2.671875, + "learning_rate": 4.585148630771983e-05, + "loss": 0.8394, + "step": 10632 + }, + { + "epoch": 0.1884984733467165, + "grad_norm": 2.703125, + "learning_rate": 4.584994544724695e-05, + "loss": 0.8028, + "step": 10634 + }, + { + "epoch": 0.18853392538232805, + "grad_norm": 2.578125, + "learning_rate": 4.5848404326571104e-05, + "loss": 0.8247, + "step": 10636 + }, + { + "epoch": 0.1885693774179396, + "grad_norm": 3.171875, + "learning_rate": 4.584686294571151e-05, + "loss": 0.7953, + "step": 10638 + }, + { + "epoch": 0.1886048294535512, + "grad_norm": 2.640625, + "learning_rate": 4.584532130468741e-05, + "loss": 0.8111, + "step": 10640 + }, + { + "epoch": 0.18864028148916276, + "grad_norm": 2.578125, + "learning_rate": 4.584377940351804e-05, + "loss": 0.7983, + "step": 10642 + }, + { + "epoch": 0.18867573352477432, + "grad_norm": 2.859375, + "learning_rate": 4.584223724222265e-05, + "loss": 0.8223, + "step": 10644 + }, + { + "epoch": 0.1887111855603859, + "grad_norm": 2.4375, + "learning_rate": 4.5840694820820476e-05, + "loss": 0.8173, + "step": 10646 + }, + { + "epoch": 0.18874663759599747, + "grad_norm": 2.625, + "learning_rate": 4.583915213933077e-05, + "loss": 0.7821, + "step": 10648 + }, + { + "epoch": 0.18878208963160903, + "grad_norm": 2.703125, + "learning_rate": 4.583760919777279e-05, + "loss": 0.8053, + "step": 10650 + }, + { + "epoch": 0.18881754166722062, + "grad_norm": 2.859375, + "learning_rate": 4.583606599616579e-05, + "loss": 0.7501, + "step": 10652 + }, + { + "epoch": 0.18885299370283218, + "grad_norm": 2.5625, + "learning_rate": 4.5834522534529015e-05, + "loss": 0.771, + "step": 10654 + }, + { + "epoch": 0.18888844573844374, + "grad_norm": 2.53125, + "learning_rate": 4.5832978812881744e-05, + "loss": 0.8112, + "step": 10656 + }, + { + "epoch": 0.18892389777405533, + "grad_norm": 2.90625, + "learning_rate": 4.583143483124324e-05, + "loss": 0.8547, + "step": 10658 + }, + { + "epoch": 0.1889593498096669, + "grad_norm": 2.46875, + "learning_rate": 4.582989058963276e-05, + "loss": 0.806, + "step": 10660 + }, + { + "epoch": 0.18899480184527845, + "grad_norm": 2.875, + "learning_rate": 4.5828346088069596e-05, + "loss": 0.7513, + "step": 10662 + }, + { + "epoch": 0.18903025388089004, + "grad_norm": 2.6875, + "learning_rate": 4.5826801326573006e-05, + "loss": 0.8386, + "step": 10664 + }, + { + "epoch": 0.1890657059165016, + "grad_norm": 2.59375, + "learning_rate": 4.582525630516227e-05, + "loss": 0.7845, + "step": 10666 + }, + { + "epoch": 0.18910115795211316, + "grad_norm": 2.734375, + "learning_rate": 4.582371102385667e-05, + "loss": 0.8075, + "step": 10668 + }, + { + "epoch": 0.18913660998772475, + "grad_norm": 2.78125, + "learning_rate": 4.5822165482675505e-05, + "loss": 0.8587, + "step": 10670 + }, + { + "epoch": 0.1891720620233363, + "grad_norm": 2.75, + "learning_rate": 4.5820619681638046e-05, + "loss": 0.7742, + "step": 10672 + }, + { + "epoch": 0.18920751405894787, + "grad_norm": 2.625, + "learning_rate": 4.581907362076359e-05, + "loss": 0.839, + "step": 10674 + }, + { + "epoch": 0.18924296609455943, + "grad_norm": 2.6875, + "learning_rate": 4.581752730007143e-05, + "loss": 0.8301, + "step": 10676 + }, + { + "epoch": 0.18927841813017102, + "grad_norm": 2.671875, + "learning_rate": 4.5815980719580864e-05, + "loss": 0.7948, + "step": 10678 + }, + { + "epoch": 0.18931387016578258, + "grad_norm": 2.59375, + "learning_rate": 4.5814433879311194e-05, + "loss": 0.8258, + "step": 10680 + }, + { + "epoch": 0.18934932220139414, + "grad_norm": 3.03125, + "learning_rate": 4.581288677928173e-05, + "loss": 0.8405, + "step": 10682 + }, + { + "epoch": 0.18938477423700573, + "grad_norm": 2.734375, + "learning_rate": 4.581133941951177e-05, + "loss": 0.8275, + "step": 10684 + }, + { + "epoch": 0.1894202262726173, + "grad_norm": 2.828125, + "learning_rate": 4.580979180002063e-05, + "loss": 0.7894, + "step": 10686 + }, + { + "epoch": 0.18945567830822885, + "grad_norm": 2.75, + "learning_rate": 4.580824392082762e-05, + "loss": 0.7929, + "step": 10688 + }, + { + "epoch": 0.18949113034384044, + "grad_norm": 2.625, + "learning_rate": 4.580669578195206e-05, + "loss": 0.8554, + "step": 10690 + }, + { + "epoch": 0.189526582379452, + "grad_norm": 2.75, + "learning_rate": 4.580514738341328e-05, + "loss": 0.7792, + "step": 10692 + }, + { + "epoch": 0.18956203441506356, + "grad_norm": 2.734375, + "learning_rate": 4.580359872523058e-05, + "loss": 0.8322, + "step": 10694 + }, + { + "epoch": 0.18959748645067515, + "grad_norm": 2.71875, + "learning_rate": 4.580204980742331e-05, + "loss": 0.8135, + "step": 10696 + }, + { + "epoch": 0.1896329384862867, + "grad_norm": 2.6875, + "learning_rate": 4.580050063001079e-05, + "loss": 0.8146, + "step": 10698 + }, + { + "epoch": 0.18966839052189827, + "grad_norm": 2.78125, + "learning_rate": 4.579895119301235e-05, + "loss": 0.8429, + "step": 10700 + }, + { + "epoch": 0.18970384255750986, + "grad_norm": 2.703125, + "learning_rate": 4.579740149644734e-05, + "loss": 0.7867, + "step": 10702 + }, + { + "epoch": 0.18973929459312142, + "grad_norm": 2.578125, + "learning_rate": 4.579585154033509e-05, + "loss": 0.8102, + "step": 10704 + }, + { + "epoch": 0.18977474662873298, + "grad_norm": 2.4375, + "learning_rate": 4.5794301324694934e-05, + "loss": 0.8017, + "step": 10706 + }, + { + "epoch": 0.18981019866434456, + "grad_norm": 2.625, + "learning_rate": 4.579275084954623e-05, + "loss": 0.7597, + "step": 10708 + }, + { + "epoch": 0.18984565069995613, + "grad_norm": 2.65625, + "learning_rate": 4.579120011490834e-05, + "loss": 0.8081, + "step": 10710 + }, + { + "epoch": 0.1898811027355677, + "grad_norm": 2.828125, + "learning_rate": 4.5789649120800587e-05, + "loss": 0.7912, + "step": 10712 + }, + { + "epoch": 0.18991655477117927, + "grad_norm": 2.578125, + "learning_rate": 4.5788097867242355e-05, + "loss": 0.7823, + "step": 10714 + }, + { + "epoch": 0.18995200680679084, + "grad_norm": 2.890625, + "learning_rate": 4.578654635425298e-05, + "loss": 0.8361, + "step": 10716 + }, + { + "epoch": 0.1899874588424024, + "grad_norm": 2.609375, + "learning_rate": 4.578499458185185e-05, + "loss": 0.7681, + "step": 10718 + }, + { + "epoch": 0.19002291087801398, + "grad_norm": 2.765625, + "learning_rate": 4.578344255005831e-05, + "loss": 0.8167, + "step": 10720 + }, + { + "epoch": 0.19005836291362554, + "grad_norm": 2.640625, + "learning_rate": 4.578189025889173e-05, + "loss": 0.8504, + "step": 10722 + }, + { + "epoch": 0.1900938149492371, + "grad_norm": 2.6875, + "learning_rate": 4.578033770837149e-05, + "loss": 0.7749, + "step": 10724 + }, + { + "epoch": 0.1901292669848487, + "grad_norm": 2.84375, + "learning_rate": 4.577878489851697e-05, + "loss": 0.8276, + "step": 10726 + }, + { + "epoch": 0.19016471902046025, + "grad_norm": 2.59375, + "learning_rate": 4.577723182934754e-05, + "loss": 0.8341, + "step": 10728 + }, + { + "epoch": 0.19020017105607182, + "grad_norm": 2.75, + "learning_rate": 4.577567850088258e-05, + "loss": 0.7664, + "step": 10730 + }, + { + "epoch": 0.1902356230916834, + "grad_norm": 2.8125, + "learning_rate": 4.577412491314149e-05, + "loss": 0.8106, + "step": 10732 + }, + { + "epoch": 0.19027107512729496, + "grad_norm": 2.484375, + "learning_rate": 4.577257106614364e-05, + "loss": 0.8244, + "step": 10734 + }, + { + "epoch": 0.19030652716290652, + "grad_norm": 2.921875, + "learning_rate": 4.577101695990843e-05, + "loss": 0.8214, + "step": 10736 + }, + { + "epoch": 0.1903419791985181, + "grad_norm": 2.875, + "learning_rate": 4.5769462594455256e-05, + "loss": 0.8786, + "step": 10738 + }, + { + "epoch": 0.19037743123412967, + "grad_norm": 2.546875, + "learning_rate": 4.5767907969803514e-05, + "loss": 0.8052, + "step": 10740 + }, + { + "epoch": 0.19041288326974123, + "grad_norm": 2.71875, + "learning_rate": 4.5766353085972605e-05, + "loss": 0.8261, + "step": 10742 + }, + { + "epoch": 0.19044833530535282, + "grad_norm": 2.828125, + "learning_rate": 4.5764797942981944e-05, + "loss": 0.7922, + "step": 10744 + }, + { + "epoch": 0.19048378734096438, + "grad_norm": 2.828125, + "learning_rate": 4.576324254085092e-05, + "loss": 0.7848, + "step": 10746 + }, + { + "epoch": 0.19051923937657594, + "grad_norm": 2.828125, + "learning_rate": 4.576168687959895e-05, + "loss": 0.8277, + "step": 10748 + }, + { + "epoch": 0.19055469141218753, + "grad_norm": 2.625, + "learning_rate": 4.5760130959245464e-05, + "loss": 0.8246, + "step": 10750 + }, + { + "epoch": 0.1905901434477991, + "grad_norm": 2.59375, + "learning_rate": 4.575857477980986e-05, + "loss": 0.7686, + "step": 10752 + }, + { + "epoch": 0.19062559548341065, + "grad_norm": 2.828125, + "learning_rate": 4.5757018341311565e-05, + "loss": 0.7949, + "step": 10754 + }, + { + "epoch": 0.19066104751902224, + "grad_norm": 2.671875, + "learning_rate": 4.575546164377e-05, + "loss": 0.8289, + "step": 10756 + }, + { + "epoch": 0.1906964995546338, + "grad_norm": 2.71875, + "learning_rate": 4.575390468720461e-05, + "loss": 0.8116, + "step": 10758 + }, + { + "epoch": 0.19073195159024536, + "grad_norm": 2.609375, + "learning_rate": 4.5752347471634804e-05, + "loss": 0.8185, + "step": 10760 + }, + { + "epoch": 0.19076740362585695, + "grad_norm": 2.84375, + "learning_rate": 4.5750789997080035e-05, + "loss": 0.7798, + "step": 10762 + }, + { + "epoch": 0.1908028556614685, + "grad_norm": 2.84375, + "learning_rate": 4.5749232263559716e-05, + "loss": 0.8127, + "step": 10764 + }, + { + "epoch": 0.19083830769708007, + "grad_norm": 2.9375, + "learning_rate": 4.5747674271093306e-05, + "loss": 0.8089, + "step": 10766 + }, + { + "epoch": 0.19087375973269166, + "grad_norm": 2.8125, + "learning_rate": 4.5746116019700234e-05, + "loss": 0.7595, + "step": 10768 + }, + { + "epoch": 0.19090921176830322, + "grad_norm": 2.5625, + "learning_rate": 4.574455750939997e-05, + "loss": 0.7865, + "step": 10770 + }, + { + "epoch": 0.19094466380391478, + "grad_norm": 2.8125, + "learning_rate": 4.574299874021194e-05, + "loss": 0.8002, + "step": 10772 + }, + { + "epoch": 0.19098011583952637, + "grad_norm": 2.671875, + "learning_rate": 4.574143971215561e-05, + "loss": 0.7734, + "step": 10774 + }, + { + "epoch": 0.19101556787513793, + "grad_norm": 2.703125, + "learning_rate": 4.573988042525042e-05, + "loss": 0.794, + "step": 10776 + }, + { + "epoch": 0.1910510199107495, + "grad_norm": 2.921875, + "learning_rate": 4.573832087951586e-05, + "loss": 0.7835, + "step": 10778 + }, + { + "epoch": 0.19108647194636108, + "grad_norm": 2.75, + "learning_rate": 4.5736761074971366e-05, + "loss": 0.814, + "step": 10780 + }, + { + "epoch": 0.19112192398197264, + "grad_norm": 2.796875, + "learning_rate": 4.573520101163641e-05, + "loss": 0.8026, + "step": 10782 + }, + { + "epoch": 0.1911573760175842, + "grad_norm": 2.859375, + "learning_rate": 4.5733640689530465e-05, + "loss": 0.7686, + "step": 10784 + }, + { + "epoch": 0.1911928280531958, + "grad_norm": 2.75, + "learning_rate": 4.5732080108673007e-05, + "loss": 0.8028, + "step": 10786 + }, + { + "epoch": 0.19122828008880735, + "grad_norm": 2.640625, + "learning_rate": 4.573051926908351e-05, + "loss": 0.8185, + "step": 10788 + }, + { + "epoch": 0.1912637321244189, + "grad_norm": 3.25, + "learning_rate": 4.5728958170781446e-05, + "loss": 0.8286, + "step": 10790 + }, + { + "epoch": 0.1912991841600305, + "grad_norm": 2.578125, + "learning_rate": 4.57273968137863e-05, + "loss": 0.8065, + "step": 10792 + }, + { + "epoch": 0.19133463619564206, + "grad_norm": 2.640625, + "learning_rate": 4.572583519811756e-05, + "loss": 0.8441, + "step": 10794 + }, + { + "epoch": 0.19137008823125362, + "grad_norm": 2.546875, + "learning_rate": 4.572427332379472e-05, + "loss": 0.7712, + "step": 10796 + }, + { + "epoch": 0.1914055402668652, + "grad_norm": 2.65625, + "learning_rate": 4.572271119083726e-05, + "loss": 0.7992, + "step": 10798 + }, + { + "epoch": 0.19144099230247677, + "grad_norm": 3.109375, + "learning_rate": 4.5721148799264676e-05, + "loss": 0.7992, + "step": 10800 + }, + { + "epoch": 0.19147644433808833, + "grad_norm": 2.578125, + "learning_rate": 4.571958614909648e-05, + "loss": 0.7961, + "step": 10802 + }, + { + "epoch": 0.19151189637369992, + "grad_norm": 2.8125, + "learning_rate": 4.571802324035216e-05, + "loss": 0.7667, + "step": 10804 + }, + { + "epoch": 0.19154734840931148, + "grad_norm": 2.625, + "learning_rate": 4.5716460073051224e-05, + "loss": 0.788, + "step": 10806 + }, + { + "epoch": 0.19158280044492304, + "grad_norm": 2.53125, + "learning_rate": 4.571489664721318e-05, + "loss": 0.7764, + "step": 10808 + }, + { + "epoch": 0.19161825248053463, + "grad_norm": 2.796875, + "learning_rate": 4.571333296285755e-05, + "loss": 0.7794, + "step": 10810 + }, + { + "epoch": 0.1916537045161462, + "grad_norm": 2.65625, + "learning_rate": 4.571176902000383e-05, + "loss": 0.7814, + "step": 10812 + }, + { + "epoch": 0.19168915655175775, + "grad_norm": 2.921875, + "learning_rate": 4.5710204818671546e-05, + "loss": 0.825, + "step": 10814 + }, + { + "epoch": 0.19172460858736934, + "grad_norm": 3.296875, + "learning_rate": 4.570864035888022e-05, + "loss": 0.8684, + "step": 10816 + }, + { + "epoch": 0.1917600606229809, + "grad_norm": 3.015625, + "learning_rate": 4.570707564064938e-05, + "loss": 0.7767, + "step": 10818 + }, + { + "epoch": 0.19179551265859246, + "grad_norm": 2.859375, + "learning_rate": 4.570551066399854e-05, + "loss": 0.8271, + "step": 10820 + }, + { + "epoch": 0.19183096469420405, + "grad_norm": 2.4375, + "learning_rate": 4.570394542894725e-05, + "loss": 0.7728, + "step": 10822 + }, + { + "epoch": 0.1918664167298156, + "grad_norm": 2.765625, + "learning_rate": 4.5702379935515026e-05, + "loss": 0.7949, + "step": 10824 + }, + { + "epoch": 0.19190186876542717, + "grad_norm": 2.65625, + "learning_rate": 4.570081418372142e-05, + "loss": 0.8263, + "step": 10826 + }, + { + "epoch": 0.19193732080103876, + "grad_norm": 2.953125, + "learning_rate": 4.569924817358596e-05, + "loss": 0.7999, + "step": 10828 + }, + { + "epoch": 0.19197277283665032, + "grad_norm": 2.515625, + "learning_rate": 4.5697681905128195e-05, + "loss": 0.794, + "step": 10830 + }, + { + "epoch": 0.19200822487226188, + "grad_norm": 2.515625, + "learning_rate": 4.569611537836767e-05, + "loss": 0.7843, + "step": 10832 + }, + { + "epoch": 0.19204367690787347, + "grad_norm": 2.84375, + "learning_rate": 4.569454859332394e-05, + "loss": 0.806, + "step": 10834 + }, + { + "epoch": 0.19207912894348503, + "grad_norm": 2.6875, + "learning_rate": 4.569298155001655e-05, + "loss": 0.8376, + "step": 10836 + }, + { + "epoch": 0.1921145809790966, + "grad_norm": 2.734375, + "learning_rate": 4.569141424846506e-05, + "loss": 0.8096, + "step": 10838 + }, + { + "epoch": 0.19215003301470818, + "grad_norm": 2.625, + "learning_rate": 4.568984668868903e-05, + "loss": 0.8156, + "step": 10840 + }, + { + "epoch": 0.19218548505031974, + "grad_norm": 2.828125, + "learning_rate": 4.568827887070802e-05, + "loss": 0.7778, + "step": 10842 + }, + { + "epoch": 0.1922209370859313, + "grad_norm": 2.796875, + "learning_rate": 4.5686710794541595e-05, + "loss": 0.808, + "step": 10844 + }, + { + "epoch": 0.19225638912154286, + "grad_norm": 3.1875, + "learning_rate": 4.568514246020934e-05, + "loss": 0.7928, + "step": 10846 + }, + { + "epoch": 0.19229184115715445, + "grad_norm": 2.828125, + "learning_rate": 4.568357386773081e-05, + "loss": 0.7913, + "step": 10848 + }, + { + "epoch": 0.192327293192766, + "grad_norm": 2.671875, + "learning_rate": 4.5682005017125584e-05, + "loss": 0.7929, + "step": 10850 + }, + { + "epoch": 0.19236274522837757, + "grad_norm": 2.796875, + "learning_rate": 4.568043590841325e-05, + "loss": 0.809, + "step": 10852 + }, + { + "epoch": 0.19239819726398916, + "grad_norm": 2.9375, + "learning_rate": 4.567886654161338e-05, + "loss": 0.7859, + "step": 10854 + }, + { + "epoch": 0.19243364929960072, + "grad_norm": 2.5625, + "learning_rate": 4.567729691674556e-05, + "loss": 0.7684, + "step": 10856 + }, + { + "epoch": 0.19246910133521228, + "grad_norm": 2.8125, + "learning_rate": 4.5675727033829386e-05, + "loss": 0.8059, + "step": 10858 + }, + { + "epoch": 0.19250455337082387, + "grad_norm": 2.8125, + "learning_rate": 4.567415689288444e-05, + "loss": 0.8213, + "step": 10860 + }, + { + "epoch": 0.19254000540643543, + "grad_norm": 2.6875, + "learning_rate": 4.5672586493930325e-05, + "loss": 0.7972, + "step": 10862 + }, + { + "epoch": 0.192575457442047, + "grad_norm": 2.625, + "learning_rate": 4.567101583698663e-05, + "loss": 0.8258, + "step": 10864 + }, + { + "epoch": 0.19261090947765858, + "grad_norm": 2.921875, + "learning_rate": 4.5669444922072965e-05, + "loss": 0.8301, + "step": 10866 + }, + { + "epoch": 0.19264636151327014, + "grad_norm": 2.671875, + "learning_rate": 4.5667873749208946e-05, + "loss": 0.8251, + "step": 10868 + }, + { + "epoch": 0.1926818135488817, + "grad_norm": 2.5625, + "learning_rate": 4.566630231841416e-05, + "loss": 0.8066, + "step": 10870 + }, + { + "epoch": 0.19271726558449329, + "grad_norm": 2.546875, + "learning_rate": 4.566473062970821e-05, + "loss": 0.7789, + "step": 10872 + }, + { + "epoch": 0.19275271762010485, + "grad_norm": 2.6875, + "learning_rate": 4.566315868311074e-05, + "loss": 0.8069, + "step": 10874 + }, + { + "epoch": 0.1927881696557164, + "grad_norm": 2.765625, + "learning_rate": 4.5661586478641356e-05, + "loss": 0.7928, + "step": 10876 + }, + { + "epoch": 0.192823621691328, + "grad_norm": 2.515625, + "learning_rate": 4.5660014016319674e-05, + "loss": 0.7653, + "step": 10878 + }, + { + "epoch": 0.19285907372693956, + "grad_norm": 2.921875, + "learning_rate": 4.5658441296165316e-05, + "loss": 0.8492, + "step": 10880 + }, + { + "epoch": 0.19289452576255112, + "grad_norm": 2.75, + "learning_rate": 4.5656868318197914e-05, + "loss": 0.7798, + "step": 10882 + }, + { + "epoch": 0.1929299777981627, + "grad_norm": 2.6875, + "learning_rate": 4.56552950824371e-05, + "loss": 0.7742, + "step": 10884 + }, + { + "epoch": 0.19296542983377427, + "grad_norm": 2.703125, + "learning_rate": 4.5653721588902506e-05, + "loss": 0.779, + "step": 10886 + }, + { + "epoch": 0.19300088186938583, + "grad_norm": 2.734375, + "learning_rate": 4.565214783761377e-05, + "loss": 0.8233, + "step": 10888 + }, + { + "epoch": 0.19303633390499741, + "grad_norm": 2.8125, + "learning_rate": 4.5650573828590525e-05, + "loss": 0.8177, + "step": 10890 + }, + { + "epoch": 0.19307178594060898, + "grad_norm": 2.625, + "learning_rate": 4.5648999561852424e-05, + "loss": 0.8479, + "step": 10892 + }, + { + "epoch": 0.19310723797622054, + "grad_norm": 2.96875, + "learning_rate": 4.564742503741911e-05, + "loss": 0.8367, + "step": 10894 + }, + { + "epoch": 0.19314269001183212, + "grad_norm": 2.890625, + "learning_rate": 4.564585025531023e-05, + "loss": 0.7768, + "step": 10896 + }, + { + "epoch": 0.19317814204744369, + "grad_norm": 2.75, + "learning_rate": 4.564427521554544e-05, + "loss": 0.8172, + "step": 10898 + }, + { + "epoch": 0.19321359408305525, + "grad_norm": 2.671875, + "learning_rate": 4.564269991814439e-05, + "loss": 0.8101, + "step": 10900 + }, + { + "epoch": 0.19324904611866683, + "grad_norm": 2.859375, + "learning_rate": 4.564112436312675e-05, + "loss": 0.8061, + "step": 10902 + }, + { + "epoch": 0.1932844981542784, + "grad_norm": 2.828125, + "learning_rate": 4.563954855051218e-05, + "loss": 0.8271, + "step": 10904 + }, + { + "epoch": 0.19331995018988996, + "grad_norm": 2.796875, + "learning_rate": 4.563797248032034e-05, + "loss": 0.7935, + "step": 10906 + }, + { + "epoch": 0.19335540222550154, + "grad_norm": 2.578125, + "learning_rate": 4.5636396152570906e-05, + "loss": 0.8145, + "step": 10908 + }, + { + "epoch": 0.1933908542611131, + "grad_norm": 2.890625, + "learning_rate": 4.5634819567283536e-05, + "loss": 0.8593, + "step": 10910 + }, + { + "epoch": 0.19342630629672466, + "grad_norm": 2.765625, + "learning_rate": 4.563324272447792e-05, + "loss": 0.7838, + "step": 10912 + }, + { + "epoch": 0.19346175833233625, + "grad_norm": 2.734375, + "learning_rate": 4.563166562417374e-05, + "loss": 0.8105, + "step": 10914 + }, + { + "epoch": 0.19349721036794781, + "grad_norm": 2.984375, + "learning_rate": 4.563008826639066e-05, + "loss": 0.8196, + "step": 10916 + }, + { + "epoch": 0.19353266240355937, + "grad_norm": 2.6875, + "learning_rate": 4.5628510651148385e-05, + "loss": 0.8281, + "step": 10918 + }, + { + "epoch": 0.19356811443917096, + "grad_norm": 2.625, + "learning_rate": 4.562693277846658e-05, + "loss": 0.7671, + "step": 10920 + }, + { + "epoch": 0.19360356647478252, + "grad_norm": 3.0625, + "learning_rate": 4.562535464836496e-05, + "loss": 0.8377, + "step": 10922 + }, + { + "epoch": 0.19363901851039408, + "grad_norm": 2.578125, + "learning_rate": 4.562377626086321e-05, + "loss": 0.7881, + "step": 10924 + }, + { + "epoch": 0.19367447054600567, + "grad_norm": 2.8125, + "learning_rate": 4.5622197615981025e-05, + "loss": 0.815, + "step": 10926 + }, + { + "epoch": 0.19370992258161723, + "grad_norm": 2.640625, + "learning_rate": 4.5620618713738114e-05, + "loss": 0.7683, + "step": 10928 + }, + { + "epoch": 0.1937453746172288, + "grad_norm": 2.6875, + "learning_rate": 4.561903955415417e-05, + "loss": 0.7652, + "step": 10930 + }, + { + "epoch": 0.19378082665284038, + "grad_norm": 2.578125, + "learning_rate": 4.5617460137248915e-05, + "loss": 0.8001, + "step": 10932 + }, + { + "epoch": 0.19381627868845194, + "grad_norm": 2.65625, + "learning_rate": 4.5615880463042036e-05, + "loss": 0.8007, + "step": 10934 + }, + { + "epoch": 0.1938517307240635, + "grad_norm": 2.84375, + "learning_rate": 4.561430053155328e-05, + "loss": 0.7932, + "step": 10936 + }, + { + "epoch": 0.1938871827596751, + "grad_norm": 2.625, + "learning_rate": 4.561272034280234e-05, + "loss": 0.7869, + "step": 10938 + }, + { + "epoch": 0.19392263479528665, + "grad_norm": 2.8125, + "learning_rate": 4.561113989680894e-05, + "loss": 0.8114, + "step": 10940 + }, + { + "epoch": 0.1939580868308982, + "grad_norm": 2.578125, + "learning_rate": 4.560955919359281e-05, + "loss": 0.7966, + "step": 10942 + }, + { + "epoch": 0.1939935388665098, + "grad_norm": 2.515625, + "learning_rate": 4.560797823317368e-05, + "loss": 0.8255, + "step": 10944 + }, + { + "epoch": 0.19402899090212136, + "grad_norm": 2.875, + "learning_rate": 4.560639701557127e-05, + "loss": 0.8436, + "step": 10946 + }, + { + "epoch": 0.19406444293773292, + "grad_norm": 2.515625, + "learning_rate": 4.560481554080531e-05, + "loss": 0.7665, + "step": 10948 + }, + { + "epoch": 0.1940998949733445, + "grad_norm": 2.734375, + "learning_rate": 4.5603233808895554e-05, + "loss": 0.7843, + "step": 10950 + }, + { + "epoch": 0.19413534700895607, + "grad_norm": 2.8125, + "learning_rate": 4.560165181986172e-05, + "loss": 0.8479, + "step": 10952 + }, + { + "epoch": 0.19417079904456763, + "grad_norm": 3.046875, + "learning_rate": 4.5600069573723577e-05, + "loss": 0.7911, + "step": 10954 + }, + { + "epoch": 0.19420625108017922, + "grad_norm": 2.375, + "learning_rate": 4.559848707050085e-05, + "loss": 0.7645, + "step": 10956 + }, + { + "epoch": 0.19424170311579078, + "grad_norm": 2.78125, + "learning_rate": 4.559690431021329e-05, + "loss": 0.8018, + "step": 10958 + }, + { + "epoch": 0.19427715515140234, + "grad_norm": 2.6875, + "learning_rate": 4.559532129288066e-05, + "loss": 0.8233, + "step": 10960 + }, + { + "epoch": 0.19431260718701393, + "grad_norm": 2.515625, + "learning_rate": 4.559373801852271e-05, + "loss": 0.809, + "step": 10962 + }, + { + "epoch": 0.1943480592226255, + "grad_norm": 2.671875, + "learning_rate": 4.5592154487159197e-05, + "loss": 0.7707, + "step": 10964 + }, + { + "epoch": 0.19438351125823705, + "grad_norm": 2.765625, + "learning_rate": 4.559057069880988e-05, + "loss": 0.7766, + "step": 10966 + }, + { + "epoch": 0.19441896329384864, + "grad_norm": 2.765625, + "learning_rate": 4.558898665349453e-05, + "loss": 0.8001, + "step": 10968 + }, + { + "epoch": 0.1944544153294602, + "grad_norm": 2.921875, + "learning_rate": 4.558740235123292e-05, + "loss": 0.7537, + "step": 10970 + }, + { + "epoch": 0.19448986736507176, + "grad_norm": 3.140625, + "learning_rate": 4.5585817792044815e-05, + "loss": 0.8424, + "step": 10972 + }, + { + "epoch": 0.19452531940068335, + "grad_norm": 2.671875, + "learning_rate": 4.558423297595e-05, + "loss": 0.8104, + "step": 10974 + }, + { + "epoch": 0.1945607714362949, + "grad_norm": 2.640625, + "learning_rate": 4.558264790296823e-05, + "loss": 0.8017, + "step": 10976 + }, + { + "epoch": 0.19459622347190647, + "grad_norm": 2.734375, + "learning_rate": 4.558106257311932e-05, + "loss": 0.8555, + "step": 10978 + }, + { + "epoch": 0.19463167550751806, + "grad_norm": 2.546875, + "learning_rate": 4.557947698642302e-05, + "loss": 0.8092, + "step": 10980 + }, + { + "epoch": 0.19466712754312962, + "grad_norm": 2.890625, + "learning_rate": 4.557789114289913e-05, + "loss": 0.8264, + "step": 10982 + }, + { + "epoch": 0.19470257957874118, + "grad_norm": 2.890625, + "learning_rate": 4.557630504256746e-05, + "loss": 0.7817, + "step": 10984 + }, + { + "epoch": 0.19473803161435277, + "grad_norm": 2.734375, + "learning_rate": 4.5574718685447784e-05, + "loss": 0.7646, + "step": 10986 + }, + { + "epoch": 0.19477348364996433, + "grad_norm": 2.828125, + "learning_rate": 4.55731320715599e-05, + "loss": 0.7924, + "step": 10988 + }, + { + "epoch": 0.1948089356855759, + "grad_norm": 2.90625, + "learning_rate": 4.557154520092361e-05, + "loss": 0.7924, + "step": 10990 + }, + { + "epoch": 0.19484438772118748, + "grad_norm": 2.921875, + "learning_rate": 4.5569958073558724e-05, + "loss": 0.8446, + "step": 10992 + }, + { + "epoch": 0.19487983975679904, + "grad_norm": 2.765625, + "learning_rate": 4.556837068948505e-05, + "loss": 0.83, + "step": 10994 + }, + { + "epoch": 0.1949152917924106, + "grad_norm": 3.0, + "learning_rate": 4.556678304872239e-05, + "loss": 0.7711, + "step": 10996 + }, + { + "epoch": 0.1949507438280222, + "grad_norm": 2.53125, + "learning_rate": 4.556519515129056e-05, + "loss": 0.8246, + "step": 10998 + }, + { + "epoch": 0.19498619586363375, + "grad_norm": 2.609375, + "learning_rate": 4.556360699720938e-05, + "loss": 0.8017, + "step": 11000 + }, + { + "epoch": 0.1950216478992453, + "grad_norm": 3.015625, + "learning_rate": 4.556201858649867e-05, + "loss": 0.838, + "step": 11002 + }, + { + "epoch": 0.1950570999348569, + "grad_norm": 2.734375, + "learning_rate": 4.556042991917825e-05, + "loss": 0.7748, + "step": 11004 + }, + { + "epoch": 0.19509255197046846, + "grad_norm": 2.921875, + "learning_rate": 4.555884099526794e-05, + "loss": 0.8404, + "step": 11006 + }, + { + "epoch": 0.19512800400608002, + "grad_norm": 2.71875, + "learning_rate": 4.555725181478758e-05, + "loss": 0.8018, + "step": 11008 + }, + { + "epoch": 0.1951634560416916, + "grad_norm": 2.75, + "learning_rate": 4.5555662377757e-05, + "loss": 0.8392, + "step": 11010 + }, + { + "epoch": 0.19519890807730317, + "grad_norm": 2.53125, + "learning_rate": 4.5554072684196035e-05, + "loss": 0.8423, + "step": 11012 + }, + { + "epoch": 0.19523436011291473, + "grad_norm": 2.765625, + "learning_rate": 4.555248273412453e-05, + "loss": 0.8131, + "step": 11014 + }, + { + "epoch": 0.1952698121485263, + "grad_norm": 2.65625, + "learning_rate": 4.555089252756232e-05, + "loss": 0.7641, + "step": 11016 + }, + { + "epoch": 0.19530526418413788, + "grad_norm": 2.65625, + "learning_rate": 4.554930206452924e-05, + "loss": 0.7994, + "step": 11018 + }, + { + "epoch": 0.19534071621974944, + "grad_norm": 2.6875, + "learning_rate": 4.554771134504516e-05, + "loss": 0.7984, + "step": 11020 + }, + { + "epoch": 0.195376168255361, + "grad_norm": 2.53125, + "learning_rate": 4.554612036912992e-05, + "loss": 0.7979, + "step": 11022 + }, + { + "epoch": 0.1954116202909726, + "grad_norm": 2.78125, + "learning_rate": 4.554452913680338e-05, + "loss": 0.7939, + "step": 11024 + }, + { + "epoch": 0.19544707232658415, + "grad_norm": 2.546875, + "learning_rate": 4.5542937648085394e-05, + "loss": 0.8338, + "step": 11026 + }, + { + "epoch": 0.1954825243621957, + "grad_norm": 2.84375, + "learning_rate": 4.5541345902995825e-05, + "loss": 0.8299, + "step": 11028 + }, + { + "epoch": 0.1955179763978073, + "grad_norm": 2.609375, + "learning_rate": 4.553975390155454e-05, + "loss": 0.8436, + "step": 11030 + }, + { + "epoch": 0.19555342843341886, + "grad_norm": 3.015625, + "learning_rate": 4.55381616437814e-05, + "loss": 0.7852, + "step": 11032 + }, + { + "epoch": 0.19558888046903042, + "grad_norm": 2.953125, + "learning_rate": 4.553656912969628e-05, + "loss": 0.8327, + "step": 11034 + }, + { + "epoch": 0.195624332504642, + "grad_norm": 2.828125, + "learning_rate": 4.553497635931905e-05, + "loss": 0.7849, + "step": 11036 + }, + { + "epoch": 0.19565978454025357, + "grad_norm": 2.796875, + "learning_rate": 4.55333833326696e-05, + "loss": 0.801, + "step": 11038 + }, + { + "epoch": 0.19569523657586513, + "grad_norm": 3.0, + "learning_rate": 4.55317900497678e-05, + "loss": 0.8602, + "step": 11040 + }, + { + "epoch": 0.19573068861147672, + "grad_norm": 2.59375, + "learning_rate": 4.553019651063354e-05, + "loss": 0.7818, + "step": 11042 + }, + { + "epoch": 0.19576614064708828, + "grad_norm": 2.734375, + "learning_rate": 4.55286027152867e-05, + "loss": 0.7704, + "step": 11044 + }, + { + "epoch": 0.19580159268269984, + "grad_norm": 2.71875, + "learning_rate": 4.5527008663747176e-05, + "loss": 0.7926, + "step": 11046 + }, + { + "epoch": 0.19583704471831143, + "grad_norm": 2.6875, + "learning_rate": 4.552541435603486e-05, + "loss": 0.8085, + "step": 11048 + }, + { + "epoch": 0.195872496753923, + "grad_norm": 2.671875, + "learning_rate": 4.5523819792169646e-05, + "loss": 0.7907, + "step": 11050 + }, + { + "epoch": 0.19590794878953455, + "grad_norm": 2.5625, + "learning_rate": 4.5522224972171435e-05, + "loss": 0.8319, + "step": 11052 + }, + { + "epoch": 0.19594340082514614, + "grad_norm": 2.796875, + "learning_rate": 4.5520629896060134e-05, + "loss": 0.8224, + "step": 11054 + }, + { + "epoch": 0.1959788528607577, + "grad_norm": 2.390625, + "learning_rate": 4.5519034563855646e-05, + "loss": 0.7893, + "step": 11056 + }, + { + "epoch": 0.19601430489636926, + "grad_norm": 2.921875, + "learning_rate": 4.551743897557788e-05, + "loss": 0.8568, + "step": 11058 + }, + { + "epoch": 0.19604975693198085, + "grad_norm": 2.484375, + "learning_rate": 4.551584313124675e-05, + "loss": 0.7999, + "step": 11060 + }, + { + "epoch": 0.1960852089675924, + "grad_norm": 2.671875, + "learning_rate": 4.5514247030882165e-05, + "loss": 0.818, + "step": 11062 + }, + { + "epoch": 0.19612066100320397, + "grad_norm": 2.8125, + "learning_rate": 4.551265067450405e-05, + "loss": 0.7775, + "step": 11064 + }, + { + "epoch": 0.19615611303881555, + "grad_norm": 2.875, + "learning_rate": 4.551105406213233e-05, + "loss": 0.8353, + "step": 11066 + }, + { + "epoch": 0.19619156507442712, + "grad_norm": 3.03125, + "learning_rate": 4.550945719378693e-05, + "loss": 0.8182, + "step": 11068 + }, + { + "epoch": 0.19622701711003868, + "grad_norm": 2.703125, + "learning_rate": 4.550786006948777e-05, + "loss": 0.7792, + "step": 11070 + }, + { + "epoch": 0.19626246914565026, + "grad_norm": 2.640625, + "learning_rate": 4.5506262689254796e-05, + "loss": 0.819, + "step": 11072 + }, + { + "epoch": 0.19629792118126183, + "grad_norm": 2.734375, + "learning_rate": 4.550466505310793e-05, + "loss": 0.8382, + "step": 11074 + }, + { + "epoch": 0.19633337321687339, + "grad_norm": 2.484375, + "learning_rate": 4.550306716106712e-05, + "loss": 0.8173, + "step": 11076 + }, + { + "epoch": 0.19636882525248497, + "grad_norm": 2.90625, + "learning_rate": 4.55014690131523e-05, + "loss": 0.8505, + "step": 11078 + }, + { + "epoch": 0.19640427728809653, + "grad_norm": 2.8125, + "learning_rate": 4.549987060938341e-05, + "loss": 0.8275, + "step": 11080 + }, + { + "epoch": 0.1964397293237081, + "grad_norm": 2.703125, + "learning_rate": 4.5498271949780414e-05, + "loss": 0.765, + "step": 11082 + }, + { + "epoch": 0.19647518135931968, + "grad_norm": 2.6875, + "learning_rate": 4.5496673034363246e-05, + "loss": 0.7818, + "step": 11084 + }, + { + "epoch": 0.19651063339493124, + "grad_norm": 2.625, + "learning_rate": 4.549507386315187e-05, + "loss": 0.7932, + "step": 11086 + }, + { + "epoch": 0.1965460854305428, + "grad_norm": 2.609375, + "learning_rate": 4.5493474436166236e-05, + "loss": 0.817, + "step": 11088 + }, + { + "epoch": 0.1965815374661544, + "grad_norm": 2.71875, + "learning_rate": 4.549187475342632e-05, + "loss": 0.805, + "step": 11090 + }, + { + "epoch": 0.19661698950176595, + "grad_norm": 2.5, + "learning_rate": 4.549027481495207e-05, + "loss": 0.7974, + "step": 11092 + }, + { + "epoch": 0.19665244153737751, + "grad_norm": 2.640625, + "learning_rate": 4.548867462076346e-05, + "loss": 0.8194, + "step": 11094 + }, + { + "epoch": 0.1966878935729891, + "grad_norm": 2.859375, + "learning_rate": 4.548707417088046e-05, + "loss": 0.8188, + "step": 11096 + }, + { + "epoch": 0.19672334560860066, + "grad_norm": 2.921875, + "learning_rate": 4.5485473465323035e-05, + "loss": 0.8054, + "step": 11098 + }, + { + "epoch": 0.19675879764421222, + "grad_norm": 3.171875, + "learning_rate": 4.548387250411117e-05, + "loss": 0.8056, + "step": 11100 + }, + { + "epoch": 0.1967942496798238, + "grad_norm": 2.828125, + "learning_rate": 4.5482271287264845e-05, + "loss": 0.7905, + "step": 11102 + }, + { + "epoch": 0.19682970171543537, + "grad_norm": 2.5625, + "learning_rate": 4.5480669814804036e-05, + "loss": 0.7922, + "step": 11104 + }, + { + "epoch": 0.19686515375104693, + "grad_norm": 2.734375, + "learning_rate": 4.5479068086748746e-05, + "loss": 0.8214, + "step": 11106 + }, + { + "epoch": 0.19690060578665852, + "grad_norm": 2.921875, + "learning_rate": 4.5477466103118936e-05, + "loss": 0.7875, + "step": 11108 + }, + { + "epoch": 0.19693605782227008, + "grad_norm": 2.84375, + "learning_rate": 4.547586386393462e-05, + "loss": 0.816, + "step": 11110 + }, + { + "epoch": 0.19697150985788164, + "grad_norm": 3.1875, + "learning_rate": 4.547426136921579e-05, + "loss": 0.8257, + "step": 11112 + }, + { + "epoch": 0.19700696189349323, + "grad_norm": 2.640625, + "learning_rate": 4.5472658618982446e-05, + "loss": 0.7914, + "step": 11114 + }, + { + "epoch": 0.1970424139291048, + "grad_norm": 2.640625, + "learning_rate": 4.5471055613254574e-05, + "loss": 0.8, + "step": 11116 + }, + { + "epoch": 0.19707786596471635, + "grad_norm": 2.546875, + "learning_rate": 4.54694523520522e-05, + "loss": 0.7953, + "step": 11118 + }, + { + "epoch": 0.19711331800032794, + "grad_norm": 2.84375, + "learning_rate": 4.546784883539533e-05, + "loss": 0.7683, + "step": 11120 + }, + { + "epoch": 0.1971487700359395, + "grad_norm": 2.8125, + "learning_rate": 4.546624506330396e-05, + "loss": 0.7772, + "step": 11122 + }, + { + "epoch": 0.19718422207155106, + "grad_norm": 2.890625, + "learning_rate": 4.546464103579812e-05, + "loss": 0.7998, + "step": 11124 + }, + { + "epoch": 0.19721967410716265, + "grad_norm": 2.84375, + "learning_rate": 4.546303675289782e-05, + "loss": 0.8329, + "step": 11126 + }, + { + "epoch": 0.1972551261427742, + "grad_norm": 2.875, + "learning_rate": 4.5461432214623084e-05, + "loss": 0.7998, + "step": 11128 + }, + { + "epoch": 0.19729057817838577, + "grad_norm": 2.578125, + "learning_rate": 4.545982742099394e-05, + "loss": 0.8297, + "step": 11130 + }, + { + "epoch": 0.19732603021399736, + "grad_norm": 2.671875, + "learning_rate": 4.54582223720304e-05, + "loss": 0.8171, + "step": 11132 + }, + { + "epoch": 0.19736148224960892, + "grad_norm": 2.609375, + "learning_rate": 4.545661706775251e-05, + "loss": 0.7716, + "step": 11134 + }, + { + "epoch": 0.19739693428522048, + "grad_norm": 2.625, + "learning_rate": 4.54550115081803e-05, + "loss": 0.7913, + "step": 11136 + }, + { + "epoch": 0.19743238632083207, + "grad_norm": 2.640625, + "learning_rate": 4.545340569333382e-05, + "loss": 0.7837, + "step": 11138 + }, + { + "epoch": 0.19746783835644363, + "grad_norm": 2.8125, + "learning_rate": 4.545179962323308e-05, + "loss": 0.8132, + "step": 11140 + }, + { + "epoch": 0.1975032903920552, + "grad_norm": 2.953125, + "learning_rate": 4.545019329789815e-05, + "loss": 0.8121, + "step": 11142 + }, + { + "epoch": 0.19753874242766678, + "grad_norm": 2.78125, + "learning_rate": 4.5448586717349065e-05, + "loss": 0.8451, + "step": 11144 + }, + { + "epoch": 0.19757419446327834, + "grad_norm": 3.0625, + "learning_rate": 4.5446979881605874e-05, + "loss": 0.7968, + "step": 11146 + }, + { + "epoch": 0.1976096464988899, + "grad_norm": 2.765625, + "learning_rate": 4.5445372790688634e-05, + "loss": 0.7621, + "step": 11148 + }, + { + "epoch": 0.1976450985345015, + "grad_norm": 2.890625, + "learning_rate": 4.5443765444617404e-05, + "loss": 0.805, + "step": 11150 + }, + { + "epoch": 0.19768055057011305, + "grad_norm": 2.75, + "learning_rate": 4.544215784341224e-05, + "loss": 0.8009, + "step": 11152 + }, + { + "epoch": 0.1977160026057246, + "grad_norm": 2.8125, + "learning_rate": 4.544054998709319e-05, + "loss": 0.8013, + "step": 11154 + }, + { + "epoch": 0.1977514546413362, + "grad_norm": 2.6875, + "learning_rate": 4.543894187568035e-05, + "loss": 0.804, + "step": 11156 + }, + { + "epoch": 0.19778690667694776, + "grad_norm": 2.40625, + "learning_rate": 4.5437333509193765e-05, + "loss": 0.8002, + "step": 11158 + }, + { + "epoch": 0.19782235871255932, + "grad_norm": 2.703125, + "learning_rate": 4.543572488765351e-05, + "loss": 0.8441, + "step": 11160 + }, + { + "epoch": 0.1978578107481709, + "grad_norm": 3.015625, + "learning_rate": 4.5434116011079675e-05, + "loss": 0.8051, + "step": 11162 + }, + { + "epoch": 0.19789326278378247, + "grad_norm": 2.828125, + "learning_rate": 4.543250687949232e-05, + "loss": 0.7755, + "step": 11164 + }, + { + "epoch": 0.19792871481939403, + "grad_norm": 2.78125, + "learning_rate": 4.543089749291154e-05, + "loss": 0.7464, + "step": 11166 + }, + { + "epoch": 0.19796416685500562, + "grad_norm": 2.609375, + "learning_rate": 4.5429287851357416e-05, + "loss": 0.8078, + "step": 11168 + }, + { + "epoch": 0.19799961889061718, + "grad_norm": 2.984375, + "learning_rate": 4.542767795485003e-05, + "loss": 0.7877, + "step": 11170 + }, + { + "epoch": 0.19803507092622874, + "grad_norm": 2.859375, + "learning_rate": 4.542606780340948e-05, + "loss": 0.8076, + "step": 11172 + }, + { + "epoch": 0.19807052296184033, + "grad_norm": 2.609375, + "learning_rate": 4.5424457397055856e-05, + "loss": 0.7988, + "step": 11174 + }, + { + "epoch": 0.1981059749974519, + "grad_norm": 2.703125, + "learning_rate": 4.542284673580927e-05, + "loss": 0.8249, + "step": 11176 + }, + { + "epoch": 0.19814142703306345, + "grad_norm": 2.5, + "learning_rate": 4.5421235819689796e-05, + "loss": 0.7981, + "step": 11178 + }, + { + "epoch": 0.19817687906867504, + "grad_norm": 2.484375, + "learning_rate": 4.541962464871756e-05, + "loss": 0.7837, + "step": 11180 + }, + { + "epoch": 0.1982123311042866, + "grad_norm": 2.640625, + "learning_rate": 4.541801322291266e-05, + "loss": 0.756, + "step": 11182 + }, + { + "epoch": 0.19824778313989816, + "grad_norm": 2.59375, + "learning_rate": 4.54164015422952e-05, + "loss": 0.8059, + "step": 11184 + }, + { + "epoch": 0.19828323517550972, + "grad_norm": 2.71875, + "learning_rate": 4.541478960688531e-05, + "loss": 0.8035, + "step": 11186 + }, + { + "epoch": 0.1983186872111213, + "grad_norm": 2.796875, + "learning_rate": 4.5413177416703094e-05, + "loss": 0.8364, + "step": 11188 + }, + { + "epoch": 0.19835413924673287, + "grad_norm": 2.578125, + "learning_rate": 4.541156497176868e-05, + "loss": 0.7633, + "step": 11190 + }, + { + "epoch": 0.19838959128234443, + "grad_norm": 2.84375, + "learning_rate": 4.540995227210218e-05, + "loss": 0.8082, + "step": 11192 + }, + { + "epoch": 0.19842504331795602, + "grad_norm": 2.875, + "learning_rate": 4.540833931772373e-05, + "loss": 0.8092, + "step": 11194 + }, + { + "epoch": 0.19846049535356758, + "grad_norm": 2.640625, + "learning_rate": 4.540672610865346e-05, + "loss": 0.8039, + "step": 11196 + }, + { + "epoch": 0.19849594738917914, + "grad_norm": 2.59375, + "learning_rate": 4.540511264491149e-05, + "loss": 0.8232, + "step": 11198 + }, + { + "epoch": 0.19853139942479073, + "grad_norm": 2.828125, + "learning_rate": 4.540349892651797e-05, + "loss": 0.8326, + "step": 11200 + }, + { + "epoch": 0.1985668514604023, + "grad_norm": 2.65625, + "learning_rate": 4.5401884953493035e-05, + "loss": 0.8045, + "step": 11202 + }, + { + "epoch": 0.19860230349601385, + "grad_norm": 2.5625, + "learning_rate": 4.540027072585682e-05, + "loss": 0.8091, + "step": 11204 + }, + { + "epoch": 0.19863775553162544, + "grad_norm": 2.828125, + "learning_rate": 4.539865624362948e-05, + "loss": 0.8383, + "step": 11206 + }, + { + "epoch": 0.198673207567237, + "grad_norm": 2.515625, + "learning_rate": 4.5397041506831154e-05, + "loss": 0.7917, + "step": 11208 + }, + { + "epoch": 0.19870865960284856, + "grad_norm": 2.546875, + "learning_rate": 4.5395426515482005e-05, + "loss": 0.7516, + "step": 11210 + }, + { + "epoch": 0.19874411163846015, + "grad_norm": 2.640625, + "learning_rate": 4.5393811269602173e-05, + "loss": 0.7987, + "step": 11212 + }, + { + "epoch": 0.1987795636740717, + "grad_norm": 2.625, + "learning_rate": 4.539219576921183e-05, + "loss": 0.7911, + "step": 11214 + }, + { + "epoch": 0.19881501570968327, + "grad_norm": 2.640625, + "learning_rate": 4.539058001433113e-05, + "loss": 0.8121, + "step": 11216 + }, + { + "epoch": 0.19885046774529486, + "grad_norm": 2.625, + "learning_rate": 4.538896400498024e-05, + "loss": 0.7945, + "step": 11218 + }, + { + "epoch": 0.19888591978090642, + "grad_norm": 2.59375, + "learning_rate": 4.538734774117932e-05, + "loss": 0.8229, + "step": 11220 + }, + { + "epoch": 0.19892137181651798, + "grad_norm": 2.390625, + "learning_rate": 4.538573122294856e-05, + "loss": 0.8015, + "step": 11222 + }, + { + "epoch": 0.19895682385212957, + "grad_norm": 2.8125, + "learning_rate": 4.53841144503081e-05, + "loss": 0.7746, + "step": 11224 + }, + { + "epoch": 0.19899227588774113, + "grad_norm": 2.953125, + "learning_rate": 4.538249742327815e-05, + "loss": 0.8317, + "step": 11226 + }, + { + "epoch": 0.1990277279233527, + "grad_norm": 2.6875, + "learning_rate": 4.5380880141878876e-05, + "loss": 0.7737, + "step": 11228 + }, + { + "epoch": 0.19906317995896428, + "grad_norm": 2.796875, + "learning_rate": 4.5379262606130465e-05, + "loss": 0.7977, + "step": 11230 + }, + { + "epoch": 0.19909863199457584, + "grad_norm": 2.734375, + "learning_rate": 4.53776448160531e-05, + "loss": 0.7838, + "step": 11232 + }, + { + "epoch": 0.1991340840301874, + "grad_norm": 2.546875, + "learning_rate": 4.537602677166697e-05, + "loss": 0.7875, + "step": 11234 + }, + { + "epoch": 0.19916953606579899, + "grad_norm": 2.796875, + "learning_rate": 4.537440847299227e-05, + "loss": 0.798, + "step": 11236 + }, + { + "epoch": 0.19920498810141055, + "grad_norm": 2.578125, + "learning_rate": 4.53727899200492e-05, + "loss": 0.8197, + "step": 11238 + }, + { + "epoch": 0.1992404401370221, + "grad_norm": 2.765625, + "learning_rate": 4.537117111285795e-05, + "loss": 0.7943, + "step": 11240 + }, + { + "epoch": 0.1992758921726337, + "grad_norm": 2.90625, + "learning_rate": 4.536955205143873e-05, + "loss": 0.858, + "step": 11242 + }, + { + "epoch": 0.19931134420824526, + "grad_norm": 3.15625, + "learning_rate": 4.536793273581174e-05, + "loss": 0.781, + "step": 11244 + }, + { + "epoch": 0.19934679624385682, + "grad_norm": 2.796875, + "learning_rate": 4.5366313165997196e-05, + "loss": 0.7907, + "step": 11246 + }, + { + "epoch": 0.1993822482794684, + "grad_norm": 2.515625, + "learning_rate": 4.5364693342015306e-05, + "loss": 0.8009, + "step": 11248 + }, + { + "epoch": 0.19941770031507997, + "grad_norm": 2.640625, + "learning_rate": 4.536307326388628e-05, + "loss": 0.8137, + "step": 11250 + }, + { + "epoch": 0.19945315235069153, + "grad_norm": 2.640625, + "learning_rate": 4.536145293163034e-05, + "loss": 0.7723, + "step": 11252 + }, + { + "epoch": 0.19948860438630311, + "grad_norm": 2.671875, + "learning_rate": 4.535983234526772e-05, + "loss": 0.7916, + "step": 11254 + }, + { + "epoch": 0.19952405642191468, + "grad_norm": 2.578125, + "learning_rate": 4.5358211504818625e-05, + "loss": 0.7999, + "step": 11256 + }, + { + "epoch": 0.19955950845752624, + "grad_norm": 2.953125, + "learning_rate": 4.535659041030329e-05, + "loss": 0.7989, + "step": 11258 + }, + { + "epoch": 0.19959496049313782, + "grad_norm": 2.828125, + "learning_rate": 4.535496906174195e-05, + "loss": 0.8241, + "step": 11260 + }, + { + "epoch": 0.19963041252874938, + "grad_norm": 2.890625, + "learning_rate": 4.535334745915483e-05, + "loss": 0.8053, + "step": 11262 + }, + { + "epoch": 0.19966586456436095, + "grad_norm": 2.578125, + "learning_rate": 4.535172560256218e-05, + "loss": 0.8119, + "step": 11264 + }, + { + "epoch": 0.19970131659997253, + "grad_norm": 2.703125, + "learning_rate": 4.535010349198423e-05, + "loss": 0.7971, + "step": 11266 + }, + { + "epoch": 0.1997367686355841, + "grad_norm": 2.53125, + "learning_rate": 4.5348481127441226e-05, + "loss": 0.8, + "step": 11268 + }, + { + "epoch": 0.19977222067119565, + "grad_norm": 2.734375, + "learning_rate": 4.534685850895342e-05, + "loss": 0.7838, + "step": 11270 + }, + { + "epoch": 0.19980767270680724, + "grad_norm": 2.640625, + "learning_rate": 4.534523563654105e-05, + "loss": 0.7988, + "step": 11272 + }, + { + "epoch": 0.1998431247424188, + "grad_norm": 2.953125, + "learning_rate": 4.5343612510224374e-05, + "loss": 0.8068, + "step": 11274 + }, + { + "epoch": 0.19987857677803036, + "grad_norm": 2.59375, + "learning_rate": 4.534198913002367e-05, + "loss": 0.8058, + "step": 11276 + }, + { + "epoch": 0.19991402881364195, + "grad_norm": 2.6875, + "learning_rate": 4.534036549595916e-05, + "loss": 0.8102, + "step": 11278 + }, + { + "epoch": 0.1999494808492535, + "grad_norm": 2.84375, + "learning_rate": 4.533874160805113e-05, + "loss": 0.7908, + "step": 11280 + }, + { + "epoch": 0.19998493288486507, + "grad_norm": 2.640625, + "learning_rate": 4.5337117466319843e-05, + "loss": 0.7979, + "step": 11282 + }, + { + "epoch": 0.20002038492047666, + "grad_norm": 2.59375, + "learning_rate": 4.533549307078557e-05, + "loss": 0.8293, + "step": 11284 + }, + { + "epoch": 0.20005583695608822, + "grad_norm": 2.625, + "learning_rate": 4.5333868421468574e-05, + "loss": 0.8047, + "step": 11286 + }, + { + "epoch": 0.20009128899169978, + "grad_norm": 2.5, + "learning_rate": 4.533224351838914e-05, + "loss": 0.7941, + "step": 11288 + }, + { + "epoch": 0.20012674102731137, + "grad_norm": 2.734375, + "learning_rate": 4.533061836156753e-05, + "loss": 0.7903, + "step": 11290 + }, + { + "epoch": 0.20016219306292293, + "grad_norm": 2.59375, + "learning_rate": 4.5328992951024054e-05, + "loss": 0.8487, + "step": 11292 + }, + { + "epoch": 0.2001976450985345, + "grad_norm": 2.65625, + "learning_rate": 4.532736728677897e-05, + "loss": 0.8333, + "step": 11294 + }, + { + "epoch": 0.20023309713414608, + "grad_norm": 2.84375, + "learning_rate": 4.5325741368852576e-05, + "loss": 0.8218, + "step": 11296 + }, + { + "epoch": 0.20026854916975764, + "grad_norm": 2.828125, + "learning_rate": 4.532411519726517e-05, + "loss": 0.7909, + "step": 11298 + }, + { + "epoch": 0.2003040012053692, + "grad_norm": 2.890625, + "learning_rate": 4.532248877203703e-05, + "loss": 0.84, + "step": 11300 + }, + { + "epoch": 0.2003394532409808, + "grad_norm": 2.703125, + "learning_rate": 4.532086209318846e-05, + "loss": 0.8219, + "step": 11302 + }, + { + "epoch": 0.20037490527659235, + "grad_norm": 2.984375, + "learning_rate": 4.531923516073978e-05, + "loss": 0.7958, + "step": 11304 + }, + { + "epoch": 0.2004103573122039, + "grad_norm": 2.796875, + "learning_rate": 4.5317607974711265e-05, + "loss": 0.8303, + "step": 11306 + }, + { + "epoch": 0.2004458093478155, + "grad_norm": 2.8125, + "learning_rate": 4.5315980535123246e-05, + "loss": 0.817, + "step": 11308 + }, + { + "epoch": 0.20048126138342706, + "grad_norm": 2.671875, + "learning_rate": 4.531435284199601e-05, + "loss": 0.8366, + "step": 11310 + }, + { + "epoch": 0.20051671341903862, + "grad_norm": 2.8125, + "learning_rate": 4.5312724895349885e-05, + "loss": 0.7865, + "step": 11312 + }, + { + "epoch": 0.2005521654546502, + "grad_norm": 2.625, + "learning_rate": 4.531109669520519e-05, + "loss": 0.8117, + "step": 11314 + }, + { + "epoch": 0.20058761749026177, + "grad_norm": 2.59375, + "learning_rate": 4.530946824158223e-05, + "loss": 0.7994, + "step": 11316 + }, + { + "epoch": 0.20062306952587333, + "grad_norm": 2.734375, + "learning_rate": 4.530783953450134e-05, + "loss": 0.7917, + "step": 11318 + }, + { + "epoch": 0.20065852156148492, + "grad_norm": 2.90625, + "learning_rate": 4.530621057398284e-05, + "loss": 0.8028, + "step": 11320 + }, + { + "epoch": 0.20069397359709648, + "grad_norm": 2.78125, + "learning_rate": 4.530458136004706e-05, + "loss": 0.8238, + "step": 11322 + }, + { + "epoch": 0.20072942563270804, + "grad_norm": 2.953125, + "learning_rate": 4.530295189271434e-05, + "loss": 0.8099, + "step": 11324 + }, + { + "epoch": 0.20076487766831963, + "grad_norm": 2.78125, + "learning_rate": 4.530132217200501e-05, + "loss": 0.8288, + "step": 11326 + }, + { + "epoch": 0.2008003297039312, + "grad_norm": 2.828125, + "learning_rate": 4.52996921979394e-05, + "loss": 0.8228, + "step": 11328 + }, + { + "epoch": 0.20083578173954275, + "grad_norm": 2.6875, + "learning_rate": 4.5298061970537865e-05, + "loss": 0.8187, + "step": 11330 + }, + { + "epoch": 0.20087123377515434, + "grad_norm": 2.640625, + "learning_rate": 4.529643148982074e-05, + "loss": 0.7506, + "step": 11332 + }, + { + "epoch": 0.2009066858107659, + "grad_norm": 2.703125, + "learning_rate": 4.5294800755808385e-05, + "loss": 0.7772, + "step": 11334 + }, + { + "epoch": 0.20094213784637746, + "grad_norm": 2.921875, + "learning_rate": 4.5293169768521135e-05, + "loss": 0.804, + "step": 11336 + }, + { + "epoch": 0.20097758988198905, + "grad_norm": 2.84375, + "learning_rate": 4.529153852797936e-05, + "loss": 0.8106, + "step": 11338 + }, + { + "epoch": 0.2010130419176006, + "grad_norm": 2.671875, + "learning_rate": 4.528990703420341e-05, + "loss": 0.8061, + "step": 11340 + }, + { + "epoch": 0.20104849395321217, + "grad_norm": 2.5625, + "learning_rate": 4.528827528721364e-05, + "loss": 0.7881, + "step": 11342 + }, + { + "epoch": 0.20108394598882376, + "grad_norm": 2.9375, + "learning_rate": 4.528664328703043e-05, + "loss": 0.7986, + "step": 11344 + }, + { + "epoch": 0.20111939802443532, + "grad_norm": 2.609375, + "learning_rate": 4.528501103367413e-05, + "loss": 0.7846, + "step": 11346 + }, + { + "epoch": 0.20115485006004688, + "grad_norm": 2.640625, + "learning_rate": 4.5283378527165125e-05, + "loss": 0.8148, + "step": 11348 + }, + { + "epoch": 0.20119030209565847, + "grad_norm": 2.65625, + "learning_rate": 4.528174576752377e-05, + "loss": 0.7728, + "step": 11350 + }, + { + "epoch": 0.20122575413127003, + "grad_norm": 2.71875, + "learning_rate": 4.528011275477045e-05, + "loss": 0.7757, + "step": 11352 + }, + { + "epoch": 0.2012612061668816, + "grad_norm": 2.59375, + "learning_rate": 4.5278479488925563e-05, + "loss": 0.8256, + "step": 11354 + }, + { + "epoch": 0.20129665820249315, + "grad_norm": 2.71875, + "learning_rate": 4.527684597000946e-05, + "loss": 0.7902, + "step": 11356 + }, + { + "epoch": 0.20133211023810474, + "grad_norm": 2.703125, + "learning_rate": 4.527521219804255e-05, + "loss": 0.8071, + "step": 11358 + }, + { + "epoch": 0.2013675622737163, + "grad_norm": 2.515625, + "learning_rate": 4.527357817304522e-05, + "loss": 0.8052, + "step": 11360 + }, + { + "epoch": 0.20140301430932786, + "grad_norm": 3.1875, + "learning_rate": 4.527194389503784e-05, + "loss": 0.8477, + "step": 11362 + }, + { + "epoch": 0.20143846634493945, + "grad_norm": 2.59375, + "learning_rate": 4.527030936404084e-05, + "loss": 0.7812, + "step": 11364 + }, + { + "epoch": 0.201473918380551, + "grad_norm": 3.25, + "learning_rate": 4.5268674580074594e-05, + "loss": 0.7922, + "step": 11366 + }, + { + "epoch": 0.20150937041616257, + "grad_norm": 2.9375, + "learning_rate": 4.5267039543159504e-05, + "loss": 0.8091, + "step": 11368 + }, + { + "epoch": 0.20154482245177416, + "grad_norm": 2.75, + "learning_rate": 4.5265404253316e-05, + "loss": 0.8252, + "step": 11370 + }, + { + "epoch": 0.20158027448738572, + "grad_norm": 3.125, + "learning_rate": 4.526376871056446e-05, + "loss": 0.7924, + "step": 11372 + }, + { + "epoch": 0.20161572652299728, + "grad_norm": 2.546875, + "learning_rate": 4.5262132914925303e-05, + "loss": 0.8151, + "step": 11374 + }, + { + "epoch": 0.20165117855860887, + "grad_norm": 2.984375, + "learning_rate": 4.526049686641896e-05, + "loss": 0.8193, + "step": 11376 + }, + { + "epoch": 0.20168663059422043, + "grad_norm": 2.75, + "learning_rate": 4.525886056506582e-05, + "loss": 0.8402, + "step": 11378 + }, + { + "epoch": 0.201722082629832, + "grad_norm": 2.53125, + "learning_rate": 4.5257224010886335e-05, + "loss": 0.8217, + "step": 11380 + }, + { + "epoch": 0.20175753466544358, + "grad_norm": 2.625, + "learning_rate": 4.525558720390091e-05, + "loss": 0.8155, + "step": 11382 + }, + { + "epoch": 0.20179298670105514, + "grad_norm": 2.890625, + "learning_rate": 4.525395014412997e-05, + "loss": 0.7995, + "step": 11384 + }, + { + "epoch": 0.2018284387366667, + "grad_norm": 2.515625, + "learning_rate": 4.525231283159395e-05, + "loss": 0.7724, + "step": 11386 + }, + { + "epoch": 0.2018638907722783, + "grad_norm": 2.578125, + "learning_rate": 4.525067526631329e-05, + "loss": 0.7979, + "step": 11388 + }, + { + "epoch": 0.20189934280788985, + "grad_norm": 2.6875, + "learning_rate": 4.524903744830842e-05, + "loss": 0.8032, + "step": 11390 + }, + { + "epoch": 0.2019347948435014, + "grad_norm": 2.75, + "learning_rate": 4.5247399377599773e-05, + "loss": 0.8137, + "step": 11392 + }, + { + "epoch": 0.201970246879113, + "grad_norm": 3.046875, + "learning_rate": 4.524576105420781e-05, + "loss": 0.7718, + "step": 11394 + }, + { + "epoch": 0.20200569891472456, + "grad_norm": 2.796875, + "learning_rate": 4.524412247815296e-05, + "loss": 0.8087, + "step": 11396 + }, + { + "epoch": 0.20204115095033612, + "grad_norm": 2.71875, + "learning_rate": 4.524248364945568e-05, + "loss": 0.8201, + "step": 11398 + }, + { + "epoch": 0.2020766029859477, + "grad_norm": 2.515625, + "learning_rate": 4.5240844568136415e-05, + "loss": 0.7793, + "step": 11400 + }, + { + "epoch": 0.20211205502155927, + "grad_norm": 2.96875, + "learning_rate": 4.5239205234215634e-05, + "loss": 0.8182, + "step": 11402 + }, + { + "epoch": 0.20214750705717083, + "grad_norm": 2.796875, + "learning_rate": 4.523756564771378e-05, + "loss": 0.7857, + "step": 11404 + }, + { + "epoch": 0.20218295909278242, + "grad_norm": 2.59375, + "learning_rate": 4.523592580865132e-05, + "loss": 0.7496, + "step": 11406 + }, + { + "epoch": 0.20221841112839398, + "grad_norm": 2.6875, + "learning_rate": 4.523428571704873e-05, + "loss": 0.7862, + "step": 11408 + }, + { + "epoch": 0.20225386316400554, + "grad_norm": 2.84375, + "learning_rate": 4.523264537292646e-05, + "loss": 0.7743, + "step": 11410 + }, + { + "epoch": 0.20228931519961713, + "grad_norm": 2.34375, + "learning_rate": 4.5231004776305e-05, + "loss": 0.7779, + "step": 11412 + }, + { + "epoch": 0.2023247672352287, + "grad_norm": 2.890625, + "learning_rate": 4.52293639272048e-05, + "loss": 0.7965, + "step": 11414 + }, + { + "epoch": 0.20236021927084025, + "grad_norm": 2.546875, + "learning_rate": 4.522772282564637e-05, + "loss": 0.8085, + "step": 11416 + }, + { + "epoch": 0.20239567130645184, + "grad_norm": 2.53125, + "learning_rate": 4.522608147165016e-05, + "loss": 0.8085, + "step": 11418 + }, + { + "epoch": 0.2024311233420634, + "grad_norm": 2.953125, + "learning_rate": 4.522443986523667e-05, + "loss": 0.8009, + "step": 11420 + }, + { + "epoch": 0.20246657537767496, + "grad_norm": 2.859375, + "learning_rate": 4.522279800642638e-05, + "loss": 0.8008, + "step": 11422 + }, + { + "epoch": 0.20250202741328654, + "grad_norm": 2.640625, + "learning_rate": 4.522115589523978e-05, + "loss": 0.826, + "step": 11424 + }, + { + "epoch": 0.2025374794488981, + "grad_norm": 2.40625, + "learning_rate": 4.521951353169737e-05, + "loss": 0.7952, + "step": 11426 + }, + { + "epoch": 0.20257293148450967, + "grad_norm": 2.890625, + "learning_rate": 4.521787091581964e-05, + "loss": 0.7572, + "step": 11428 + }, + { + "epoch": 0.20260838352012125, + "grad_norm": 2.53125, + "learning_rate": 4.5216228047627096e-05, + "loss": 0.8417, + "step": 11430 + }, + { + "epoch": 0.20264383555573282, + "grad_norm": 2.671875, + "learning_rate": 4.5214584927140236e-05, + "loss": 0.8195, + "step": 11432 + }, + { + "epoch": 0.20267928759134438, + "grad_norm": 2.796875, + "learning_rate": 4.521294155437957e-05, + "loss": 0.7914, + "step": 11434 + }, + { + "epoch": 0.20271473962695596, + "grad_norm": 2.53125, + "learning_rate": 4.52112979293656e-05, + "loss": 0.763, + "step": 11436 + }, + { + "epoch": 0.20275019166256752, + "grad_norm": 2.796875, + "learning_rate": 4.520965405211884e-05, + "loss": 0.8315, + "step": 11438 + }, + { + "epoch": 0.20278564369817909, + "grad_norm": 3.171875, + "learning_rate": 4.520800992265981e-05, + "loss": 0.7956, + "step": 11440 + }, + { + "epoch": 0.20282109573379067, + "grad_norm": 2.765625, + "learning_rate": 4.520636554100902e-05, + "loss": 0.7905, + "step": 11442 + }, + { + "epoch": 0.20285654776940223, + "grad_norm": 2.703125, + "learning_rate": 4.5204720907187004e-05, + "loss": 0.7493, + "step": 11444 + }, + { + "epoch": 0.2028919998050138, + "grad_norm": 2.6875, + "learning_rate": 4.5203076021214274e-05, + "loss": 0.8327, + "step": 11446 + }, + { + "epoch": 0.20292745184062538, + "grad_norm": 2.65625, + "learning_rate": 4.520143088311136e-05, + "loss": 0.8388, + "step": 11448 + }, + { + "epoch": 0.20296290387623694, + "grad_norm": 2.609375, + "learning_rate": 4.5199785492898805e-05, + "loss": 0.794, + "step": 11450 + }, + { + "epoch": 0.2029983559118485, + "grad_norm": 3.109375, + "learning_rate": 4.519813985059712e-05, + "loss": 0.8091, + "step": 11452 + }, + { + "epoch": 0.2030338079474601, + "grad_norm": 2.703125, + "learning_rate": 4.519649395622687e-05, + "loss": 0.8035, + "step": 11454 + }, + { + "epoch": 0.20306925998307165, + "grad_norm": 2.890625, + "learning_rate": 4.5194847809808585e-05, + "loss": 0.8301, + "step": 11456 + }, + { + "epoch": 0.20310471201868321, + "grad_norm": 2.515625, + "learning_rate": 4.519320141136279e-05, + "loss": 0.7984, + "step": 11458 + }, + { + "epoch": 0.2031401640542948, + "grad_norm": 3.015625, + "learning_rate": 4.519155476091006e-05, + "loss": 0.8362, + "step": 11460 + }, + { + "epoch": 0.20317561608990636, + "grad_norm": 3.03125, + "learning_rate": 4.518990785847093e-05, + "loss": 0.8017, + "step": 11462 + }, + { + "epoch": 0.20321106812551792, + "grad_norm": 2.84375, + "learning_rate": 4.5188260704065955e-05, + "loss": 0.8518, + "step": 11464 + }, + { + "epoch": 0.2032465201611295, + "grad_norm": 2.671875, + "learning_rate": 4.518661329771569e-05, + "loss": 0.8023, + "step": 11466 + }, + { + "epoch": 0.20328197219674107, + "grad_norm": 2.84375, + "learning_rate": 4.51849656394407e-05, + "loss": 0.7892, + "step": 11468 + }, + { + "epoch": 0.20331742423235263, + "grad_norm": 2.421875, + "learning_rate": 4.518331772926154e-05, + "loss": 0.7387, + "step": 11470 + }, + { + "epoch": 0.20335287626796422, + "grad_norm": 2.875, + "learning_rate": 4.518166956719877e-05, + "loss": 0.7539, + "step": 11472 + }, + { + "epoch": 0.20338832830357578, + "grad_norm": 2.84375, + "learning_rate": 4.518002115327298e-05, + "loss": 0.8094, + "step": 11474 + }, + { + "epoch": 0.20342378033918734, + "grad_norm": 2.875, + "learning_rate": 4.517837248750473e-05, + "loss": 0.787, + "step": 11476 + }, + { + "epoch": 0.20345923237479893, + "grad_norm": 2.5625, + "learning_rate": 4.517672356991458e-05, + "loss": 0.7691, + "step": 11478 + }, + { + "epoch": 0.2034946844104105, + "grad_norm": 3.015625, + "learning_rate": 4.517507440052313e-05, + "loss": 0.8491, + "step": 11480 + }, + { + "epoch": 0.20353013644602205, + "grad_norm": 2.8125, + "learning_rate": 4.517342497935096e-05, + "loss": 0.8421, + "step": 11482 + }, + { + "epoch": 0.20356558848163364, + "grad_norm": 3.0, + "learning_rate": 4.517177530641864e-05, + "loss": 0.8048, + "step": 11484 + }, + { + "epoch": 0.2036010405172452, + "grad_norm": 2.59375, + "learning_rate": 4.517012538174676e-05, + "loss": 0.8004, + "step": 11486 + }, + { + "epoch": 0.20363649255285676, + "grad_norm": 2.828125, + "learning_rate": 4.516847520535593e-05, + "loss": 0.806, + "step": 11488 + }, + { + "epoch": 0.20367194458846835, + "grad_norm": 3.21875, + "learning_rate": 4.516682477726673e-05, + "loss": 0.7854, + "step": 11490 + }, + { + "epoch": 0.2037073966240799, + "grad_norm": 2.765625, + "learning_rate": 4.5165174097499755e-05, + "loss": 0.7726, + "step": 11492 + }, + { + "epoch": 0.20374284865969147, + "grad_norm": 2.796875, + "learning_rate": 4.5163523166075594e-05, + "loss": 0.8349, + "step": 11494 + }, + { + "epoch": 0.20377830069530306, + "grad_norm": 2.578125, + "learning_rate": 4.516187198301488e-05, + "loss": 0.7926, + "step": 11496 + }, + { + "epoch": 0.20381375273091462, + "grad_norm": 2.75, + "learning_rate": 4.516022054833819e-05, + "loss": 0.8755, + "step": 11498 + }, + { + "epoch": 0.20384920476652618, + "grad_norm": 2.65625, + "learning_rate": 4.515856886206616e-05, + "loss": 0.8133, + "step": 11500 + }, + { + "epoch": 0.20388465680213777, + "grad_norm": 2.484375, + "learning_rate": 4.5156916924219385e-05, + "loss": 0.8109, + "step": 11502 + }, + { + "epoch": 0.20392010883774933, + "grad_norm": 2.46875, + "learning_rate": 4.515526473481848e-05, + "loss": 0.8204, + "step": 11504 + }, + { + "epoch": 0.2039555608733609, + "grad_norm": 2.515625, + "learning_rate": 4.515361229388407e-05, + "loss": 0.78, + "step": 11506 + }, + { + "epoch": 0.20399101290897248, + "grad_norm": 2.734375, + "learning_rate": 4.515195960143678e-05, + "loss": 0.8417, + "step": 11508 + }, + { + "epoch": 0.20402646494458404, + "grad_norm": 2.859375, + "learning_rate": 4.515030665749723e-05, + "loss": 0.8101, + "step": 11510 + }, + { + "epoch": 0.2040619169801956, + "grad_norm": 2.453125, + "learning_rate": 4.514865346208605e-05, + "loss": 0.8105, + "step": 11512 + }, + { + "epoch": 0.2040973690158072, + "grad_norm": 3.015625, + "learning_rate": 4.514700001522387e-05, + "loss": 0.8042, + "step": 11514 + }, + { + "epoch": 0.20413282105141875, + "grad_norm": 2.625, + "learning_rate": 4.514534631693133e-05, + "loss": 0.7869, + "step": 11516 + }, + { + "epoch": 0.2041682730870303, + "grad_norm": 2.71875, + "learning_rate": 4.5143692367229065e-05, + "loss": 0.8014, + "step": 11518 + }, + { + "epoch": 0.2042037251226419, + "grad_norm": 3.046875, + "learning_rate": 4.5142038166137706e-05, + "loss": 0.8336, + "step": 11520 + }, + { + "epoch": 0.20423917715825346, + "grad_norm": 2.8125, + "learning_rate": 4.5140383713677916e-05, + "loss": 0.7753, + "step": 11522 + }, + { + "epoch": 0.20427462919386502, + "grad_norm": 2.859375, + "learning_rate": 4.513872900987032e-05, + "loss": 0.8049, + "step": 11524 + }, + { + "epoch": 0.2043100812294766, + "grad_norm": 2.671875, + "learning_rate": 4.513707405473559e-05, + "loss": 0.793, + "step": 11526 + }, + { + "epoch": 0.20434553326508817, + "grad_norm": 2.875, + "learning_rate": 4.5135418848294366e-05, + "loss": 0.7965, + "step": 11528 + }, + { + "epoch": 0.20438098530069973, + "grad_norm": 3.015625, + "learning_rate": 4.5133763390567316e-05, + "loss": 0.7977, + "step": 11530 + }, + { + "epoch": 0.2044164373363113, + "grad_norm": 2.984375, + "learning_rate": 4.513210768157508e-05, + "loss": 0.8373, + "step": 11532 + }, + { + "epoch": 0.20445188937192288, + "grad_norm": 2.859375, + "learning_rate": 4.5130451721338344e-05, + "loss": 0.8149, + "step": 11534 + }, + { + "epoch": 0.20448734140753444, + "grad_norm": 2.75, + "learning_rate": 4.5128795509877764e-05, + "loss": 0.7718, + "step": 11536 + }, + { + "epoch": 0.204522793443146, + "grad_norm": 2.765625, + "learning_rate": 4.5127139047214006e-05, + "loss": 0.7862, + "step": 11538 + }, + { + "epoch": 0.2045582454787576, + "grad_norm": 2.640625, + "learning_rate": 4.5125482333367744e-05, + "loss": 0.7925, + "step": 11540 + }, + { + "epoch": 0.20459369751436915, + "grad_norm": 2.6875, + "learning_rate": 4.512382536835965e-05, + "loss": 0.8175, + "step": 11542 + }, + { + "epoch": 0.2046291495499807, + "grad_norm": 2.734375, + "learning_rate": 4.512216815221041e-05, + "loss": 0.8373, + "step": 11544 + }, + { + "epoch": 0.2046646015855923, + "grad_norm": 2.734375, + "learning_rate": 4.51205106849407e-05, + "loss": 0.8322, + "step": 11546 + }, + { + "epoch": 0.20470005362120386, + "grad_norm": 2.75, + "learning_rate": 4.511885296657121e-05, + "loss": 0.7942, + "step": 11548 + }, + { + "epoch": 0.20473550565681542, + "grad_norm": 2.546875, + "learning_rate": 4.511719499712264e-05, + "loss": 0.7904, + "step": 11550 + }, + { + "epoch": 0.204770957692427, + "grad_norm": 2.5625, + "learning_rate": 4.511553677661564e-05, + "loss": 0.8381, + "step": 11552 + }, + { + "epoch": 0.20480640972803857, + "grad_norm": 2.59375, + "learning_rate": 4.5113878305070945e-05, + "loss": 0.7896, + "step": 11554 + }, + { + "epoch": 0.20484186176365013, + "grad_norm": 2.875, + "learning_rate": 4.5112219582509244e-05, + "loss": 0.8513, + "step": 11556 + }, + { + "epoch": 0.20487731379926172, + "grad_norm": 3.171875, + "learning_rate": 4.511056060895122e-05, + "loss": 0.7517, + "step": 11558 + }, + { + "epoch": 0.20491276583487328, + "grad_norm": 2.4375, + "learning_rate": 4.510890138441759e-05, + "loss": 0.7744, + "step": 11560 + }, + { + "epoch": 0.20494821787048484, + "grad_norm": 2.828125, + "learning_rate": 4.5107241908929066e-05, + "loss": 0.8118, + "step": 11562 + }, + { + "epoch": 0.20498366990609643, + "grad_norm": 2.8125, + "learning_rate": 4.510558218250635e-05, + "loss": 0.8291, + "step": 11564 + }, + { + "epoch": 0.205019121941708, + "grad_norm": 2.546875, + "learning_rate": 4.5103922205170144e-05, + "loss": 0.794, + "step": 11566 + }, + { + "epoch": 0.20505457397731955, + "grad_norm": 2.703125, + "learning_rate": 4.510226197694119e-05, + "loss": 0.8009, + "step": 11568 + }, + { + "epoch": 0.20509002601293114, + "grad_norm": 2.625, + "learning_rate": 4.510060149784019e-05, + "loss": 0.7495, + "step": 11570 + }, + { + "epoch": 0.2051254780485427, + "grad_norm": 2.796875, + "learning_rate": 4.509894076788787e-05, + "loss": 0.8229, + "step": 11572 + }, + { + "epoch": 0.20516093008415426, + "grad_norm": 2.671875, + "learning_rate": 4.509727978710495e-05, + "loss": 0.8195, + "step": 11574 + }, + { + "epoch": 0.20519638211976585, + "grad_norm": 2.625, + "learning_rate": 4.509561855551217e-05, + "loss": 0.8167, + "step": 11576 + }, + { + "epoch": 0.2052318341553774, + "grad_norm": 2.609375, + "learning_rate": 4.509395707313026e-05, + "loss": 0.7554, + "step": 11578 + }, + { + "epoch": 0.20526728619098897, + "grad_norm": 2.90625, + "learning_rate": 4.509229533997994e-05, + "loss": 0.8383, + "step": 11580 + }, + { + "epoch": 0.20530273822660056, + "grad_norm": 2.875, + "learning_rate": 4.509063335608196e-05, + "loss": 0.7817, + "step": 11582 + }, + { + "epoch": 0.20533819026221212, + "grad_norm": 2.390625, + "learning_rate": 4.5088971121457066e-05, + "loss": 0.7454, + "step": 11584 + }, + { + "epoch": 0.20537364229782368, + "grad_norm": 2.59375, + "learning_rate": 4.508730863612599e-05, + "loss": 0.8176, + "step": 11586 + }, + { + "epoch": 0.20540909433343527, + "grad_norm": 2.703125, + "learning_rate": 4.50856459001095e-05, + "loss": 0.821, + "step": 11588 + }, + { + "epoch": 0.20544454636904683, + "grad_norm": 2.640625, + "learning_rate": 4.5083982913428324e-05, + "loss": 0.843, + "step": 11590 + }, + { + "epoch": 0.2054799984046584, + "grad_norm": 2.65625, + "learning_rate": 4.508231967610322e-05, + "loss": 0.8111, + "step": 11592 + }, + { + "epoch": 0.20551545044026998, + "grad_norm": 2.859375, + "learning_rate": 4.5080656188154955e-05, + "loss": 0.7918, + "step": 11594 + }, + { + "epoch": 0.20555090247588154, + "grad_norm": 2.640625, + "learning_rate": 4.507899244960429e-05, + "loss": 0.7829, + "step": 11596 + }, + { + "epoch": 0.2055863545114931, + "grad_norm": 2.6875, + "learning_rate": 4.5077328460471965e-05, + "loss": 0.8103, + "step": 11598 + }, + { + "epoch": 0.20562180654710469, + "grad_norm": 2.546875, + "learning_rate": 4.5075664220778766e-05, + "loss": 0.8046, + "step": 11600 + }, + { + "epoch": 0.20565725858271625, + "grad_norm": 2.984375, + "learning_rate": 4.5073999730545466e-05, + "loss": 0.7595, + "step": 11602 + }, + { + "epoch": 0.2056927106183278, + "grad_norm": 2.34375, + "learning_rate": 4.507233498979283e-05, + "loss": 0.8069, + "step": 11604 + }, + { + "epoch": 0.2057281626539394, + "grad_norm": 2.5, + "learning_rate": 4.507066999854164e-05, + "loss": 0.8049, + "step": 11606 + }, + { + "epoch": 0.20576361468955096, + "grad_norm": 2.65625, + "learning_rate": 4.506900475681266e-05, + "loss": 0.7755, + "step": 11608 + }, + { + "epoch": 0.20579906672516252, + "grad_norm": 2.5625, + "learning_rate": 4.506733926462668e-05, + "loss": 0.794, + "step": 11610 + }, + { + "epoch": 0.2058345187607741, + "grad_norm": 2.640625, + "learning_rate": 4.5065673522004495e-05, + "loss": 0.7593, + "step": 11612 + }, + { + "epoch": 0.20586997079638567, + "grad_norm": 2.734375, + "learning_rate": 4.5064007528966865e-05, + "loss": 0.7871, + "step": 11614 + }, + { + "epoch": 0.20590542283199723, + "grad_norm": 2.78125, + "learning_rate": 4.506234128553461e-05, + "loss": 0.8246, + "step": 11616 + }, + { + "epoch": 0.20594087486760881, + "grad_norm": 2.90625, + "learning_rate": 4.506067479172852e-05, + "loss": 0.8485, + "step": 11618 + }, + { + "epoch": 0.20597632690322037, + "grad_norm": 2.8125, + "learning_rate": 4.505900804756938e-05, + "loss": 0.8238, + "step": 11620 + }, + { + "epoch": 0.20601177893883194, + "grad_norm": 2.59375, + "learning_rate": 4.5057341053078004e-05, + "loss": 0.7727, + "step": 11622 + }, + { + "epoch": 0.20604723097444352, + "grad_norm": 2.78125, + "learning_rate": 4.505567380827519e-05, + "loss": 0.7404, + "step": 11624 + }, + { + "epoch": 0.20608268301005508, + "grad_norm": 3.046875, + "learning_rate": 4.505400631318174e-05, + "loss": 0.7966, + "step": 11626 + }, + { + "epoch": 0.20611813504566665, + "grad_norm": 2.96875, + "learning_rate": 4.505233856781846e-05, + "loss": 0.8268, + "step": 11628 + }, + { + "epoch": 0.20615358708127823, + "grad_norm": 3.046875, + "learning_rate": 4.5050670572206186e-05, + "loss": 0.803, + "step": 11630 + }, + { + "epoch": 0.2061890391168898, + "grad_norm": 2.90625, + "learning_rate": 4.504900232636571e-05, + "loss": 0.7863, + "step": 11632 + }, + { + "epoch": 0.20622449115250135, + "grad_norm": 2.8125, + "learning_rate": 4.5047333830317865e-05, + "loss": 0.8045, + "step": 11634 + }, + { + "epoch": 0.20625994318811294, + "grad_norm": 2.59375, + "learning_rate": 4.504566508408347e-05, + "loss": 0.8201, + "step": 11636 + }, + { + "epoch": 0.2062953952237245, + "grad_norm": 2.421875, + "learning_rate": 4.5043996087683346e-05, + "loss": 0.7946, + "step": 11638 + }, + { + "epoch": 0.20633084725933606, + "grad_norm": 2.6875, + "learning_rate": 4.504232684113833e-05, + "loss": 0.8265, + "step": 11640 + }, + { + "epoch": 0.20636629929494765, + "grad_norm": 2.375, + "learning_rate": 4.504065734446925e-05, + "loss": 0.794, + "step": 11642 + }, + { + "epoch": 0.2064017513305592, + "grad_norm": 2.625, + "learning_rate": 4.503898759769694e-05, + "loss": 0.7796, + "step": 11644 + }, + { + "epoch": 0.20643720336617077, + "grad_norm": 2.59375, + "learning_rate": 4.5037317600842235e-05, + "loss": 0.7687, + "step": 11646 + }, + { + "epoch": 0.20647265540178236, + "grad_norm": 2.609375, + "learning_rate": 4.503564735392598e-05, + "loss": 0.78, + "step": 11648 + }, + { + "epoch": 0.20650810743739392, + "grad_norm": 2.40625, + "learning_rate": 4.503397685696902e-05, + "loss": 0.7891, + "step": 11650 + }, + { + "epoch": 0.20654355947300548, + "grad_norm": 2.828125, + "learning_rate": 4.5032306109992204e-05, + "loss": 0.7959, + "step": 11652 + }, + { + "epoch": 0.20657901150861707, + "grad_norm": 3.0, + "learning_rate": 4.503063511301638e-05, + "loss": 0.7963, + "step": 11654 + }, + { + "epoch": 0.20661446354422863, + "grad_norm": 2.765625, + "learning_rate": 4.50289638660624e-05, + "loss": 0.7754, + "step": 11656 + }, + { + "epoch": 0.2066499155798402, + "grad_norm": 2.828125, + "learning_rate": 4.502729236915112e-05, + "loss": 0.7979, + "step": 11658 + }, + { + "epoch": 0.20668536761545178, + "grad_norm": 2.53125, + "learning_rate": 4.50256206223034e-05, + "loss": 0.8032, + "step": 11660 + }, + { + "epoch": 0.20672081965106334, + "grad_norm": 2.453125, + "learning_rate": 4.502394862554011e-05, + "loss": 0.8072, + "step": 11662 + }, + { + "epoch": 0.2067562716866749, + "grad_norm": 2.90625, + "learning_rate": 4.5022276378882125e-05, + "loss": 0.8484, + "step": 11664 + }, + { + "epoch": 0.2067917237222865, + "grad_norm": 2.609375, + "learning_rate": 4.5020603882350286e-05, + "loss": 0.7754, + "step": 11666 + }, + { + "epoch": 0.20682717575789805, + "grad_norm": 2.84375, + "learning_rate": 4.501893113596548e-05, + "loss": 0.8538, + "step": 11668 + }, + { + "epoch": 0.2068626277935096, + "grad_norm": 2.546875, + "learning_rate": 4.501725813974858e-05, + "loss": 0.7993, + "step": 11670 + }, + { + "epoch": 0.2068980798291212, + "grad_norm": 2.765625, + "learning_rate": 4.501558489372049e-05, + "loss": 0.8001, + "step": 11672 + }, + { + "epoch": 0.20693353186473276, + "grad_norm": 2.578125, + "learning_rate": 4.5013911397902044e-05, + "loss": 0.8201, + "step": 11674 + }, + { + "epoch": 0.20696898390034432, + "grad_norm": 2.671875, + "learning_rate": 4.5012237652314164e-05, + "loss": 0.7967, + "step": 11676 + }, + { + "epoch": 0.2070044359359559, + "grad_norm": 2.5625, + "learning_rate": 4.5010563656977725e-05, + "loss": 0.8026, + "step": 11678 + }, + { + "epoch": 0.20703988797156747, + "grad_norm": 2.46875, + "learning_rate": 4.5008889411913625e-05, + "loss": 0.7553, + "step": 11680 + }, + { + "epoch": 0.20707534000717903, + "grad_norm": 2.90625, + "learning_rate": 4.500721491714274e-05, + "loss": 0.7963, + "step": 11682 + }, + { + "epoch": 0.20711079204279062, + "grad_norm": 2.8125, + "learning_rate": 4.500554017268599e-05, + "loss": 0.8529, + "step": 11684 + }, + { + "epoch": 0.20714624407840218, + "grad_norm": 2.46875, + "learning_rate": 4.5003865178564265e-05, + "loss": 0.7984, + "step": 11686 + }, + { + "epoch": 0.20718169611401374, + "grad_norm": 2.84375, + "learning_rate": 4.500218993479847e-05, + "loss": 0.8345, + "step": 11688 + }, + { + "epoch": 0.20721714814962533, + "grad_norm": 2.734375, + "learning_rate": 4.5000514441409505e-05, + "loss": 0.8268, + "step": 11690 + }, + { + "epoch": 0.2072526001852369, + "grad_norm": 2.75, + "learning_rate": 4.499883869841828e-05, + "loss": 0.8061, + "step": 11692 + }, + { + "epoch": 0.20728805222084845, + "grad_norm": 2.75, + "learning_rate": 4.499716270584573e-05, + "loss": 0.8296, + "step": 11694 + }, + { + "epoch": 0.20732350425646004, + "grad_norm": 2.828125, + "learning_rate": 4.4995486463712735e-05, + "loss": 0.7848, + "step": 11696 + }, + { + "epoch": 0.2073589562920716, + "grad_norm": 2.703125, + "learning_rate": 4.4993809972040246e-05, + "loss": 0.8256, + "step": 11698 + }, + { + "epoch": 0.20739440832768316, + "grad_norm": 2.546875, + "learning_rate": 4.4992133230849176e-05, + "loss": 0.835, + "step": 11700 + }, + { + "epoch": 0.20742986036329472, + "grad_norm": 2.859375, + "learning_rate": 4.499045624016044e-05, + "loss": 0.7905, + "step": 11702 + }, + { + "epoch": 0.2074653123989063, + "grad_norm": 2.8125, + "learning_rate": 4.498877899999498e-05, + "loss": 0.7935, + "step": 11704 + }, + { + "epoch": 0.20750076443451787, + "grad_norm": 2.890625, + "learning_rate": 4.4987101510373717e-05, + "loss": 0.7899, + "step": 11706 + }, + { + "epoch": 0.20753621647012943, + "grad_norm": 2.78125, + "learning_rate": 4.4985423771317595e-05, + "loss": 0.7984, + "step": 11708 + }, + { + "epoch": 0.20757166850574102, + "grad_norm": 2.640625, + "learning_rate": 4.498374578284754e-05, + "loss": 0.8174, + "step": 11710 + }, + { + "epoch": 0.20760712054135258, + "grad_norm": 2.71875, + "learning_rate": 4.49820675449845e-05, + "loss": 0.8063, + "step": 11712 + }, + { + "epoch": 0.20764257257696414, + "grad_norm": 2.84375, + "learning_rate": 4.498038905774942e-05, + "loss": 0.8267, + "step": 11714 + }, + { + "epoch": 0.20767802461257573, + "grad_norm": 2.65625, + "learning_rate": 4.497871032116325e-05, + "loss": 0.8247, + "step": 11716 + }, + { + "epoch": 0.2077134766481873, + "grad_norm": 2.703125, + "learning_rate": 4.497703133524693e-05, + "loss": 0.762, + "step": 11718 + }, + { + "epoch": 0.20774892868379885, + "grad_norm": 2.765625, + "learning_rate": 4.497535210002143e-05, + "loss": 0.7811, + "step": 11720 + }, + { + "epoch": 0.20778438071941044, + "grad_norm": 2.734375, + "learning_rate": 4.497367261550769e-05, + "loss": 0.8083, + "step": 11722 + }, + { + "epoch": 0.207819832755022, + "grad_norm": 2.671875, + "learning_rate": 4.497199288172668e-05, + "loss": 0.7593, + "step": 11724 + }, + { + "epoch": 0.20785528479063356, + "grad_norm": 2.625, + "learning_rate": 4.497031289869936e-05, + "loss": 0.7981, + "step": 11726 + }, + { + "epoch": 0.20789073682624515, + "grad_norm": 3.0, + "learning_rate": 4.4968632666446684e-05, + "loss": 0.7701, + "step": 11728 + }, + { + "epoch": 0.2079261888618567, + "grad_norm": 2.75, + "learning_rate": 4.4966952184989645e-05, + "loss": 0.8126, + "step": 11730 + }, + { + "epoch": 0.20796164089746827, + "grad_norm": 2.25, + "learning_rate": 4.4965271454349186e-05, + "loss": 0.8052, + "step": 11732 + }, + { + "epoch": 0.20799709293307986, + "grad_norm": 2.515625, + "learning_rate": 4.496359047454631e-05, + "loss": 0.783, + "step": 11734 + }, + { + "epoch": 0.20803254496869142, + "grad_norm": 2.59375, + "learning_rate": 4.496190924560197e-05, + "loss": 0.8078, + "step": 11736 + }, + { + "epoch": 0.20806799700430298, + "grad_norm": 2.71875, + "learning_rate": 4.4960227767537175e-05, + "loss": 0.8013, + "step": 11738 + }, + { + "epoch": 0.20810344903991457, + "grad_norm": 2.59375, + "learning_rate": 4.4958546040372896e-05, + "loss": 0.7807, + "step": 11740 + }, + { + "epoch": 0.20813890107552613, + "grad_norm": 2.984375, + "learning_rate": 4.495686406413011e-05, + "loss": 0.82, + "step": 11742 + }, + { + "epoch": 0.2081743531111377, + "grad_norm": 2.5625, + "learning_rate": 4.495518183882982e-05, + "loss": 0.7697, + "step": 11744 + }, + { + "epoch": 0.20820980514674928, + "grad_norm": 2.46875, + "learning_rate": 4.4953499364493015e-05, + "loss": 0.8185, + "step": 11746 + }, + { + "epoch": 0.20824525718236084, + "grad_norm": 2.6875, + "learning_rate": 4.49518166411407e-05, + "loss": 0.8237, + "step": 11748 + }, + { + "epoch": 0.2082807092179724, + "grad_norm": 2.734375, + "learning_rate": 4.4950133668793856e-05, + "loss": 0.8437, + "step": 11750 + }, + { + "epoch": 0.208316161253584, + "grad_norm": 2.765625, + "learning_rate": 4.4948450447473515e-05, + "loss": 0.8473, + "step": 11752 + }, + { + "epoch": 0.20835161328919555, + "grad_norm": 2.703125, + "learning_rate": 4.494676697720066e-05, + "loss": 0.8198, + "step": 11754 + }, + { + "epoch": 0.2083870653248071, + "grad_norm": 2.71875, + "learning_rate": 4.4945083257996306e-05, + "loss": 0.7779, + "step": 11756 + }, + { + "epoch": 0.2084225173604187, + "grad_norm": 2.484375, + "learning_rate": 4.494339928988147e-05, + "loss": 0.7301, + "step": 11758 + }, + { + "epoch": 0.20845796939603026, + "grad_norm": 2.609375, + "learning_rate": 4.494171507287717e-05, + "loss": 0.8416, + "step": 11760 + }, + { + "epoch": 0.20849342143164182, + "grad_norm": 2.65625, + "learning_rate": 4.4940030607004405e-05, + "loss": 0.8098, + "step": 11762 + }, + { + "epoch": 0.2085288734672534, + "grad_norm": 2.734375, + "learning_rate": 4.4938345892284225e-05, + "loss": 0.7958, + "step": 11764 + }, + { + "epoch": 0.20856432550286497, + "grad_norm": 2.765625, + "learning_rate": 4.493666092873763e-05, + "loss": 0.781, + "step": 11766 + }, + { + "epoch": 0.20859977753847653, + "grad_norm": 2.875, + "learning_rate": 4.493497571638567e-05, + "loss": 0.8235, + "step": 11768 + }, + { + "epoch": 0.20863522957408812, + "grad_norm": 2.71875, + "learning_rate": 4.493329025524936e-05, + "loss": 0.7772, + "step": 11770 + }, + { + "epoch": 0.20867068160969968, + "grad_norm": 2.515625, + "learning_rate": 4.4931604545349735e-05, + "loss": 0.7812, + "step": 11772 + }, + { + "epoch": 0.20870613364531124, + "grad_norm": 2.75, + "learning_rate": 4.492991858670784e-05, + "loss": 0.8014, + "step": 11774 + }, + { + "epoch": 0.20874158568092283, + "grad_norm": 3.328125, + "learning_rate": 4.492823237934472e-05, + "loss": 0.8308, + "step": 11776 + }, + { + "epoch": 0.20877703771653439, + "grad_norm": 2.578125, + "learning_rate": 4.4926545923281404e-05, + "loss": 0.7867, + "step": 11778 + }, + { + "epoch": 0.20881248975214595, + "grad_norm": 2.8125, + "learning_rate": 4.492485921853894e-05, + "loss": 0.793, + "step": 11780 + }, + { + "epoch": 0.20884794178775754, + "grad_norm": 2.96875, + "learning_rate": 4.492317226513839e-05, + "loss": 0.7952, + "step": 11782 + }, + { + "epoch": 0.2088833938233691, + "grad_norm": 2.78125, + "learning_rate": 4.4921485063100796e-05, + "loss": 0.7814, + "step": 11784 + }, + { + "epoch": 0.20891884585898066, + "grad_norm": 2.859375, + "learning_rate": 4.491979761244722e-05, + "loss": 0.8192, + "step": 11786 + }, + { + "epoch": 0.20895429789459224, + "grad_norm": 2.609375, + "learning_rate": 4.491810991319873e-05, + "loss": 0.8639, + "step": 11788 + }, + { + "epoch": 0.2089897499302038, + "grad_norm": 2.578125, + "learning_rate": 4.491642196537635e-05, + "loss": 0.7892, + "step": 11790 + }, + { + "epoch": 0.20902520196581537, + "grad_norm": 2.515625, + "learning_rate": 4.491473376900119e-05, + "loss": 0.795, + "step": 11792 + }, + { + "epoch": 0.20906065400142695, + "grad_norm": 2.609375, + "learning_rate": 4.4913045324094306e-05, + "loss": 0.8182, + "step": 11794 + }, + { + "epoch": 0.20909610603703851, + "grad_norm": 2.671875, + "learning_rate": 4.4911356630676756e-05, + "loss": 0.8547, + "step": 11796 + }, + { + "epoch": 0.20913155807265008, + "grad_norm": 2.75, + "learning_rate": 4.4909667688769616e-05, + "loss": 0.7721, + "step": 11798 + }, + { + "epoch": 0.20916701010826166, + "grad_norm": 2.78125, + "learning_rate": 4.490797849839398e-05, + "loss": 0.8018, + "step": 11800 + }, + { + "epoch": 0.20920246214387322, + "grad_norm": 2.515625, + "learning_rate": 4.4906289059570916e-05, + "loss": 0.7506, + "step": 11802 + }, + { + "epoch": 0.20923791417948479, + "grad_norm": 2.6875, + "learning_rate": 4.490459937232151e-05, + "loss": 0.8056, + "step": 11804 + }, + { + "epoch": 0.20927336621509637, + "grad_norm": 2.8125, + "learning_rate": 4.4902909436666855e-05, + "loss": 0.8051, + "step": 11806 + }, + { + "epoch": 0.20930881825070793, + "grad_norm": 2.96875, + "learning_rate": 4.490121925262803e-05, + "loss": 0.7755, + "step": 11808 + }, + { + "epoch": 0.2093442702863195, + "grad_norm": 2.546875, + "learning_rate": 4.489952882022613e-05, + "loss": 0.8145, + "step": 11810 + }, + { + "epoch": 0.20937972232193108, + "grad_norm": 2.703125, + "learning_rate": 4.4897838139482263e-05, + "loss": 0.7995, + "step": 11812 + }, + { + "epoch": 0.20941517435754264, + "grad_norm": 2.609375, + "learning_rate": 4.489614721041751e-05, + "loss": 0.7982, + "step": 11814 + }, + { + "epoch": 0.2094506263931542, + "grad_norm": 2.8125, + "learning_rate": 4.4894456033053e-05, + "loss": 0.8647, + "step": 11816 + }, + { + "epoch": 0.2094860784287658, + "grad_norm": 2.8125, + "learning_rate": 4.4892764607409806e-05, + "loss": 0.8212, + "step": 11818 + }, + { + "epoch": 0.20952153046437735, + "grad_norm": 2.5625, + "learning_rate": 4.489107293350907e-05, + "loss": 0.7901, + "step": 11820 + }, + { + "epoch": 0.20955698249998891, + "grad_norm": 2.84375, + "learning_rate": 4.488938101137188e-05, + "loss": 0.8199, + "step": 11822 + }, + { + "epoch": 0.2095924345356005, + "grad_norm": 2.875, + "learning_rate": 4.4887688841019346e-05, + "loss": 0.8028, + "step": 11824 + }, + { + "epoch": 0.20962788657121206, + "grad_norm": 3.15625, + "learning_rate": 4.488599642247261e-05, + "loss": 0.796, + "step": 11826 + }, + { + "epoch": 0.20966333860682362, + "grad_norm": 2.671875, + "learning_rate": 4.4884303755752775e-05, + "loss": 0.8427, + "step": 11828 + }, + { + "epoch": 0.2096987906424352, + "grad_norm": 2.921875, + "learning_rate": 4.488261084088098e-05, + "loss": 0.7922, + "step": 11830 + }, + { + "epoch": 0.20973424267804677, + "grad_norm": 2.671875, + "learning_rate": 4.488091767787833e-05, + "loss": 0.798, + "step": 11832 + }, + { + "epoch": 0.20976969471365833, + "grad_norm": 2.84375, + "learning_rate": 4.4879224266765974e-05, + "loss": 0.8005, + "step": 11834 + }, + { + "epoch": 0.20980514674926992, + "grad_norm": 2.984375, + "learning_rate": 4.4877530607565045e-05, + "loss": 0.7842, + "step": 11836 + }, + { + "epoch": 0.20984059878488148, + "grad_norm": 2.96875, + "learning_rate": 4.487583670029667e-05, + "loss": 0.8524, + "step": 11838 + }, + { + "epoch": 0.20987605082049304, + "grad_norm": 3.15625, + "learning_rate": 4.487414254498199e-05, + "loss": 0.8432, + "step": 11840 + }, + { + "epoch": 0.20991150285610463, + "grad_norm": 2.765625, + "learning_rate": 4.4872448141642156e-05, + "loss": 0.7839, + "step": 11842 + }, + { + "epoch": 0.2099469548917162, + "grad_norm": 2.578125, + "learning_rate": 4.48707534902983e-05, + "loss": 0.7926, + "step": 11844 + }, + { + "epoch": 0.20998240692732775, + "grad_norm": 3.015625, + "learning_rate": 4.486905859097158e-05, + "loss": 0.8395, + "step": 11846 + }, + { + "epoch": 0.21001785896293934, + "grad_norm": 2.578125, + "learning_rate": 4.486736344368315e-05, + "loss": 0.7788, + "step": 11848 + }, + { + "epoch": 0.2100533109985509, + "grad_norm": 2.578125, + "learning_rate": 4.4865668048454166e-05, + "loss": 0.7706, + "step": 11850 + }, + { + "epoch": 0.21008876303416246, + "grad_norm": 2.546875, + "learning_rate": 4.486397240530578e-05, + "loss": 0.8222, + "step": 11852 + }, + { + "epoch": 0.21012421506977405, + "grad_norm": 2.71875, + "learning_rate": 4.486227651425916e-05, + "loss": 0.7723, + "step": 11854 + }, + { + "epoch": 0.2101596671053856, + "grad_norm": 2.96875, + "learning_rate": 4.486058037533546e-05, + "loss": 0.8181, + "step": 11856 + }, + { + "epoch": 0.21019511914099717, + "grad_norm": 2.6875, + "learning_rate": 4.4858883988555854e-05, + "loss": 0.7714, + "step": 11858 + }, + { + "epoch": 0.21023057117660876, + "grad_norm": 2.8125, + "learning_rate": 4.485718735394151e-05, + "loss": 0.8509, + "step": 11860 + }, + { + "epoch": 0.21026602321222032, + "grad_norm": 2.953125, + "learning_rate": 4.485549047151361e-05, + "loss": 0.8047, + "step": 11862 + }, + { + "epoch": 0.21030147524783188, + "grad_norm": 2.6875, + "learning_rate": 4.485379334129333e-05, + "loss": 0.809, + "step": 11864 + }, + { + "epoch": 0.21033692728344347, + "grad_norm": 2.890625, + "learning_rate": 4.485209596330183e-05, + "loss": 0.7935, + "step": 11866 + }, + { + "epoch": 0.21037237931905503, + "grad_norm": 2.75, + "learning_rate": 4.485039833756032e-05, + "loss": 0.7642, + "step": 11868 + }, + { + "epoch": 0.2104078313546666, + "grad_norm": 3.09375, + "learning_rate": 4.484870046408996e-05, + "loss": 0.8066, + "step": 11870 + }, + { + "epoch": 0.21044328339027815, + "grad_norm": 2.625, + "learning_rate": 4.4847002342911956e-05, + "loss": 0.8243, + "step": 11872 + }, + { + "epoch": 0.21047873542588974, + "grad_norm": 2.609375, + "learning_rate": 4.48453039740475e-05, + "loss": 0.8057, + "step": 11874 + }, + { + "epoch": 0.2105141874615013, + "grad_norm": 2.796875, + "learning_rate": 4.4843605357517786e-05, + "loss": 0.7966, + "step": 11876 + }, + { + "epoch": 0.21054963949711286, + "grad_norm": 2.890625, + "learning_rate": 4.484190649334401e-05, + "loss": 0.8227, + "step": 11878 + }, + { + "epoch": 0.21058509153272445, + "grad_norm": 2.8125, + "learning_rate": 4.484020738154737e-05, + "loss": 0.857, + "step": 11880 + }, + { + "epoch": 0.210620543568336, + "grad_norm": 2.578125, + "learning_rate": 4.4838508022149074e-05, + "loss": 0.7818, + "step": 11882 + }, + { + "epoch": 0.21065599560394757, + "grad_norm": 2.6875, + "learning_rate": 4.483680841517033e-05, + "loss": 0.7862, + "step": 11884 + }, + { + "epoch": 0.21069144763955916, + "grad_norm": 2.578125, + "learning_rate": 4.4835108560632353e-05, + "loss": 0.7826, + "step": 11886 + }, + { + "epoch": 0.21072689967517072, + "grad_norm": 2.75, + "learning_rate": 4.483340845855635e-05, + "loss": 0.7946, + "step": 11888 + }, + { + "epoch": 0.21076235171078228, + "grad_norm": 3.03125, + "learning_rate": 4.4831708108963546e-05, + "loss": 0.8464, + "step": 11890 + }, + { + "epoch": 0.21079780374639387, + "grad_norm": 3.15625, + "learning_rate": 4.483000751187515e-05, + "loss": 0.8415, + "step": 11892 + }, + { + "epoch": 0.21083325578200543, + "grad_norm": 2.765625, + "learning_rate": 4.4828306667312385e-05, + "loss": 0.8356, + "step": 11894 + }, + { + "epoch": 0.210868707817617, + "grad_norm": 2.578125, + "learning_rate": 4.4826605575296486e-05, + "loss": 0.8171, + "step": 11896 + }, + { + "epoch": 0.21090415985322858, + "grad_norm": 2.59375, + "learning_rate": 4.482490423584868e-05, + "loss": 0.7678, + "step": 11898 + }, + { + "epoch": 0.21093961188884014, + "grad_norm": 2.765625, + "learning_rate": 4.48232026489902e-05, + "loss": 0.8157, + "step": 11900 + }, + { + "epoch": 0.2109750639244517, + "grad_norm": 2.78125, + "learning_rate": 4.482150081474229e-05, + "loss": 0.8076, + "step": 11902 + }, + { + "epoch": 0.2110105159600633, + "grad_norm": 2.984375, + "learning_rate": 4.481979873312616e-05, + "loss": 0.8188, + "step": 11904 + }, + { + "epoch": 0.21104596799567485, + "grad_norm": 2.609375, + "learning_rate": 4.481809640416308e-05, + "loss": 0.7849, + "step": 11906 + }, + { + "epoch": 0.2110814200312864, + "grad_norm": 2.578125, + "learning_rate": 4.481639382787428e-05, + "loss": 0.8167, + "step": 11908 + }, + { + "epoch": 0.211116872066898, + "grad_norm": 2.640625, + "learning_rate": 4.481469100428102e-05, + "loss": 0.7851, + "step": 11910 + }, + { + "epoch": 0.21115232410250956, + "grad_norm": 2.21875, + "learning_rate": 4.4812987933404535e-05, + "loss": 0.7196, + "step": 11912 + }, + { + "epoch": 0.21118777613812112, + "grad_norm": 2.796875, + "learning_rate": 4.481128461526609e-05, + "loss": 0.7946, + "step": 11914 + }, + { + "epoch": 0.2112232281737327, + "grad_norm": 2.828125, + "learning_rate": 4.480958104988694e-05, + "loss": 0.8444, + "step": 11916 + }, + { + "epoch": 0.21125868020934427, + "grad_norm": 2.84375, + "learning_rate": 4.4807877237288344e-05, + "loss": 0.8068, + "step": 11918 + }, + { + "epoch": 0.21129413224495583, + "grad_norm": 2.84375, + "learning_rate": 4.4806173177491564e-05, + "loss": 0.7822, + "step": 11920 + }, + { + "epoch": 0.21132958428056742, + "grad_norm": 2.59375, + "learning_rate": 4.480446887051787e-05, + "loss": 0.8279, + "step": 11922 + }, + { + "epoch": 0.21136503631617898, + "grad_norm": 2.734375, + "learning_rate": 4.4802764316388536e-05, + "loss": 0.7888, + "step": 11924 + }, + { + "epoch": 0.21140048835179054, + "grad_norm": 2.6875, + "learning_rate": 4.480105951512482e-05, + "loss": 0.7546, + "step": 11926 + }, + { + "epoch": 0.21143594038740213, + "grad_norm": 2.59375, + "learning_rate": 4.4799354466748e-05, + "loss": 0.8287, + "step": 11928 + }, + { + "epoch": 0.2114713924230137, + "grad_norm": 2.65625, + "learning_rate": 4.479764917127938e-05, + "loss": 0.807, + "step": 11930 + }, + { + "epoch": 0.21150684445862525, + "grad_norm": 2.765625, + "learning_rate": 4.4795943628740204e-05, + "loss": 0.7858, + "step": 11932 + }, + { + "epoch": 0.21154229649423684, + "grad_norm": 3.03125, + "learning_rate": 4.479423783915177e-05, + "loss": 0.815, + "step": 11934 + }, + { + "epoch": 0.2115777485298484, + "grad_norm": 3.09375, + "learning_rate": 4.4792531802535386e-05, + "loss": 0.8248, + "step": 11936 + }, + { + "epoch": 0.21161320056545996, + "grad_norm": 2.984375, + "learning_rate": 4.4790825518912326e-05, + "loss": 0.8082, + "step": 11938 + }, + { + "epoch": 0.21164865260107155, + "grad_norm": 2.65625, + "learning_rate": 4.478911898830388e-05, + "loss": 0.7982, + "step": 11940 + }, + { + "epoch": 0.2116841046366831, + "grad_norm": 2.84375, + "learning_rate": 4.478741221073136e-05, + "loss": 0.7951, + "step": 11942 + }, + { + "epoch": 0.21171955667229467, + "grad_norm": 3.046875, + "learning_rate": 4.478570518621604e-05, + "loss": 0.8596, + "step": 11944 + }, + { + "epoch": 0.21175500870790626, + "grad_norm": 2.625, + "learning_rate": 4.4783997914779254e-05, + "loss": 0.7852, + "step": 11946 + }, + { + "epoch": 0.21179046074351782, + "grad_norm": 2.53125, + "learning_rate": 4.47822903964423e-05, + "loss": 0.7785, + "step": 11948 + }, + { + "epoch": 0.21182591277912938, + "grad_norm": 2.578125, + "learning_rate": 4.478058263122646e-05, + "loss": 0.7551, + "step": 11950 + }, + { + "epoch": 0.21186136481474097, + "grad_norm": 2.921875, + "learning_rate": 4.4778874619153086e-05, + "loss": 0.8051, + "step": 11952 + }, + { + "epoch": 0.21189681685035253, + "grad_norm": 2.859375, + "learning_rate": 4.4777166360243474e-05, + "loss": 0.7988, + "step": 11954 + }, + { + "epoch": 0.2119322688859641, + "grad_norm": 2.96875, + "learning_rate": 4.4775457854518944e-05, + "loss": 0.8211, + "step": 11956 + }, + { + "epoch": 0.21196772092157568, + "grad_norm": 2.75, + "learning_rate": 4.4773749102000814e-05, + "loss": 0.7679, + "step": 11958 + }, + { + "epoch": 0.21200317295718724, + "grad_norm": 2.640625, + "learning_rate": 4.477204010271042e-05, + "loss": 0.8257, + "step": 11960 + }, + { + "epoch": 0.2120386249927988, + "grad_norm": 2.609375, + "learning_rate": 4.477033085666909e-05, + "loss": 0.8224, + "step": 11962 + }, + { + "epoch": 0.21207407702841038, + "grad_norm": 2.40625, + "learning_rate": 4.4768621363898135e-05, + "loss": 0.7642, + "step": 11964 + }, + { + "epoch": 0.21210952906402195, + "grad_norm": 2.9375, + "learning_rate": 4.47669116244189e-05, + "loss": 0.7883, + "step": 11966 + }, + { + "epoch": 0.2121449810996335, + "grad_norm": 2.65625, + "learning_rate": 4.476520163825274e-05, + "loss": 0.8087, + "step": 11968 + }, + { + "epoch": 0.2121804331352451, + "grad_norm": 2.9375, + "learning_rate": 4.476349140542098e-05, + "loss": 0.8171, + "step": 11970 + }, + { + "epoch": 0.21221588517085666, + "grad_norm": 2.890625, + "learning_rate": 4.476178092594495e-05, + "loss": 0.7746, + "step": 11972 + }, + { + "epoch": 0.21225133720646822, + "grad_norm": 2.921875, + "learning_rate": 4.4760070199846016e-05, + "loss": 0.8099, + "step": 11974 + }, + { + "epoch": 0.2122867892420798, + "grad_norm": 2.65625, + "learning_rate": 4.475835922714552e-05, + "loss": 0.8398, + "step": 11976 + }, + { + "epoch": 0.21232224127769136, + "grad_norm": 2.546875, + "learning_rate": 4.475664800786482e-05, + "loss": 0.7722, + "step": 11978 + }, + { + "epoch": 0.21235769331330293, + "grad_norm": 2.453125, + "learning_rate": 4.475493654202527e-05, + "loss": 0.7342, + "step": 11980 + }, + { + "epoch": 0.2123931453489145, + "grad_norm": 2.765625, + "learning_rate": 4.475322482964823e-05, + "loss": 0.7683, + "step": 11982 + }, + { + "epoch": 0.21242859738452607, + "grad_norm": 2.75, + "learning_rate": 4.475151287075505e-05, + "loss": 0.8195, + "step": 11984 + }, + { + "epoch": 0.21246404942013764, + "grad_norm": 2.640625, + "learning_rate": 4.4749800665367104e-05, + "loss": 0.8022, + "step": 11986 + }, + { + "epoch": 0.21249950145574922, + "grad_norm": 2.84375, + "learning_rate": 4.474808821350576e-05, + "loss": 0.8183, + "step": 11988 + }, + { + "epoch": 0.21253495349136078, + "grad_norm": 2.8125, + "learning_rate": 4.474637551519239e-05, + "loss": 0.8333, + "step": 11990 + }, + { + "epoch": 0.21257040552697234, + "grad_norm": 2.703125, + "learning_rate": 4.474466257044837e-05, + "loss": 0.7638, + "step": 11992 + }, + { + "epoch": 0.21260585756258393, + "grad_norm": 2.515625, + "learning_rate": 4.474294937929506e-05, + "loss": 0.7644, + "step": 11994 + }, + { + "epoch": 0.2126413095981955, + "grad_norm": 2.875, + "learning_rate": 4.474123594175387e-05, + "loss": 0.8115, + "step": 11996 + }, + { + "epoch": 0.21267676163380705, + "grad_norm": 3.25, + "learning_rate": 4.4739522257846154e-05, + "loss": 0.8353, + "step": 11998 + }, + { + "epoch": 0.21271221366941864, + "grad_norm": 2.578125, + "learning_rate": 4.4737808327593325e-05, + "loss": 0.7944, + "step": 12000 + } + ], + "logging_steps": 2, + "max_steps": 56414, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "total_flos": 1.9936181361265607e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}