{ "best_metric": 0.8991827368736267, "best_model_checkpoint": "data/Llama-31-8B_task-1_180-samples_config-3_full/checkpoint-476", "epoch": 35.0, "eval_steps": 500, "global_step": 595, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.058823529411764705, "grad_norm": 1.858755111694336, "learning_rate": 3.9215686274509804e-08, "loss": 2.4463, "step": 1 }, { "epoch": 0.11764705882352941, "grad_norm": 1.8338021039962769, "learning_rate": 7.843137254901961e-08, "loss": 2.4612, "step": 2 }, { "epoch": 0.23529411764705882, "grad_norm": 1.7695642709732056, "learning_rate": 1.5686274509803921e-07, "loss": 2.3799, "step": 4 }, { "epoch": 0.35294117647058826, "grad_norm": 1.7253705263137817, "learning_rate": 2.3529411764705883e-07, "loss": 2.4519, "step": 6 }, { "epoch": 0.47058823529411764, "grad_norm": 2.068340301513672, "learning_rate": 3.1372549019607843e-07, "loss": 2.4357, "step": 8 }, { "epoch": 0.5882352941176471, "grad_norm": 1.71905517578125, "learning_rate": 3.921568627450981e-07, "loss": 2.4114, "step": 10 }, { "epoch": 0.7058823529411765, "grad_norm": 1.9960722923278809, "learning_rate": 4.7058823529411767e-07, "loss": 2.5452, "step": 12 }, { "epoch": 0.8235294117647058, "grad_norm": 1.846751093864441, "learning_rate": 5.490196078431373e-07, "loss": 2.4838, "step": 14 }, { "epoch": 0.9411764705882353, "grad_norm": 1.7892955541610718, "learning_rate": 6.274509803921569e-07, "loss": 2.4542, "step": 16 }, { "epoch": 1.0, "eval_loss": 2.4258534908294678, "eval_runtime": 14.49, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 17 }, { "epoch": 1.0588235294117647, "grad_norm": 1.7462923526763916, "learning_rate": 7.058823529411766e-07, "loss": 2.4066, "step": 18 }, { "epoch": 1.1764705882352942, "grad_norm": 1.6423271894454956, "learning_rate": 7.843137254901962e-07, "loss": 2.4084, "step": 20 }, { "epoch": 1.2941176470588236, "grad_norm": 1.6562241315841675, "learning_rate": 8.627450980392157e-07, "loss": 2.4685, "step": 22 }, { "epoch": 1.4117647058823528, "grad_norm": 1.5601104497909546, "learning_rate": 9.411764705882353e-07, "loss": 2.3986, "step": 24 }, { "epoch": 1.5294117647058822, "grad_norm": 1.689158320426941, "learning_rate": 1.019607843137255e-06, "loss": 2.512, "step": 26 }, { "epoch": 1.6470588235294117, "grad_norm": 1.7012155055999756, "learning_rate": 1.0980392156862745e-06, "loss": 2.434, "step": 28 }, { "epoch": 1.7647058823529411, "grad_norm": 1.533742070198059, "learning_rate": 1.1764705882352942e-06, "loss": 2.4145, "step": 30 }, { "epoch": 1.8823529411764706, "grad_norm": 1.6920032501220703, "learning_rate": 1.2549019607843137e-06, "loss": 2.4019, "step": 32 }, { "epoch": 2.0, "grad_norm": 1.5552300214767456, "learning_rate": 1.3333333333333334e-06, "loss": 2.4022, "step": 34 }, { "epoch": 2.0, "eval_loss": 2.3881916999816895, "eval_runtime": 14.4942, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 34 }, { "epoch": 2.1176470588235294, "grad_norm": 1.5124330520629883, "learning_rate": 1.4117647058823531e-06, "loss": 2.3961, "step": 36 }, { "epoch": 2.235294117647059, "grad_norm": 1.4209071397781372, "learning_rate": 1.4901960784313726e-06, "loss": 2.4343, "step": 38 }, { "epoch": 2.3529411764705883, "grad_norm": 1.6290644407272339, "learning_rate": 1.5686274509803923e-06, "loss": 2.3528, "step": 40 }, { "epoch": 2.4705882352941178, "grad_norm": 1.5042191743850708, "learning_rate": 1.6470588235294118e-06, "loss": 2.3555, "step": 42 }, { "epoch": 2.588235294117647, "grad_norm": 1.6211644411087036, "learning_rate": 1.7254901960784315e-06, "loss": 2.3944, "step": 44 }, { "epoch": 2.7058823529411766, "grad_norm": 1.5416394472122192, "learning_rate": 1.8039215686274512e-06, "loss": 2.3917, "step": 46 }, { "epoch": 2.8235294117647056, "grad_norm": 1.697242259979248, "learning_rate": 1.8823529411764707e-06, "loss": 2.3457, "step": 48 }, { "epoch": 2.9411764705882355, "grad_norm": 1.8130015134811401, "learning_rate": 1.96078431372549e-06, "loss": 2.3317, "step": 50 }, { "epoch": 3.0, "eval_loss": 2.314044952392578, "eval_runtime": 14.4935, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 51 }, { "epoch": 3.0588235294117645, "grad_norm": 1.830562710762024, "learning_rate": 2.03921568627451e-06, "loss": 2.2938, "step": 52 }, { "epoch": 3.176470588235294, "grad_norm": 1.8372972011566162, "learning_rate": 2.1176470588235296e-06, "loss": 2.3541, "step": 54 }, { "epoch": 3.2941176470588234, "grad_norm": 1.8857609033584595, "learning_rate": 2.196078431372549e-06, "loss": 2.2888, "step": 56 }, { "epoch": 3.411764705882353, "grad_norm": 1.7633429765701294, "learning_rate": 2.274509803921569e-06, "loss": 2.2616, "step": 58 }, { "epoch": 3.5294117647058822, "grad_norm": 1.5656747817993164, "learning_rate": 2.3529411764705885e-06, "loss": 2.2801, "step": 60 }, { "epoch": 3.6470588235294117, "grad_norm": 1.6285021305084229, "learning_rate": 2.431372549019608e-06, "loss": 2.3144, "step": 62 }, { "epoch": 3.764705882352941, "grad_norm": 1.531112790107727, "learning_rate": 2.5098039215686274e-06, "loss": 2.2294, "step": 64 }, { "epoch": 3.8823529411764706, "grad_norm": 1.527350664138794, "learning_rate": 2.5882352941176473e-06, "loss": 2.1976, "step": 66 }, { "epoch": 4.0, "grad_norm": 2.009220838546753, "learning_rate": 2.666666666666667e-06, "loss": 2.2607, "step": 68 }, { "epoch": 4.0, "eval_loss": 2.2050342559814453, "eval_runtime": 14.4977, "eval_samples_per_second": 2.483, "eval_steps_per_second": 2.483, "step": 68 }, { "epoch": 4.117647058823529, "grad_norm": 2.025496006011963, "learning_rate": 2.7450980392156867e-06, "loss": 2.1834, "step": 70 }, { "epoch": 4.235294117647059, "grad_norm": 1.714682698249817, "learning_rate": 2.8235294117647062e-06, "loss": 2.1785, "step": 72 }, { "epoch": 4.352941176470588, "grad_norm": 1.1758134365081787, "learning_rate": 2.901960784313726e-06, "loss": 2.2148, "step": 74 }, { "epoch": 4.470588235294118, "grad_norm": 1.2965394258499146, "learning_rate": 2.980392156862745e-06, "loss": 2.1797, "step": 76 }, { "epoch": 4.588235294117647, "grad_norm": 1.1413812637329102, "learning_rate": 3.058823529411765e-06, "loss": 2.1743, "step": 78 }, { "epoch": 4.705882352941177, "grad_norm": 1.0636754035949707, "learning_rate": 3.1372549019607846e-06, "loss": 2.0559, "step": 80 }, { "epoch": 4.823529411764706, "grad_norm": 1.1126306056976318, "learning_rate": 3.2156862745098045e-06, "loss": 2.1355, "step": 82 }, { "epoch": 4.9411764705882355, "grad_norm": 1.1904844045639038, "learning_rate": 3.2941176470588236e-06, "loss": 2.1352, "step": 84 }, { "epoch": 5.0, "eval_loss": 2.0643370151519775, "eval_runtime": 14.5066, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "step": 85 }, { "epoch": 5.0588235294117645, "grad_norm": 1.0779309272766113, "learning_rate": 3.3725490196078435e-06, "loss": 2.0248, "step": 86 }, { "epoch": 5.176470588235294, "grad_norm": 1.107112169265747, "learning_rate": 3.450980392156863e-06, "loss": 2.1049, "step": 88 }, { "epoch": 5.294117647058823, "grad_norm": 0.9876514077186584, "learning_rate": 3.529411764705883e-06, "loss": 2.0005, "step": 90 }, { "epoch": 5.411764705882353, "grad_norm": 1.073117733001709, "learning_rate": 3.6078431372549024e-06, "loss": 1.9499, "step": 92 }, { "epoch": 5.529411764705882, "grad_norm": 1.0594408512115479, "learning_rate": 3.6862745098039223e-06, "loss": 1.9899, "step": 94 }, { "epoch": 5.647058823529412, "grad_norm": 1.0870219469070435, "learning_rate": 3.7647058823529414e-06, "loss": 1.9852, "step": 96 }, { "epoch": 5.764705882352941, "grad_norm": 0.9945081472396851, "learning_rate": 3.843137254901962e-06, "loss": 1.9981, "step": 98 }, { "epoch": 5.882352941176471, "grad_norm": 0.8944886326789856, "learning_rate": 3.92156862745098e-06, "loss": 1.9126, "step": 100 }, { "epoch": 6.0, "grad_norm": 0.8814469575881958, "learning_rate": 4.000000000000001e-06, "loss": 1.9456, "step": 102 }, { "epoch": 6.0, "eval_loss": 1.888541340827942, "eval_runtime": 14.5125, "eval_samples_per_second": 2.481, "eval_steps_per_second": 2.481, "step": 102 }, { "epoch": 6.117647058823529, "grad_norm": 0.8919170498847961, "learning_rate": 4.07843137254902e-06, "loss": 1.8157, "step": 104 }, { "epoch": 6.235294117647059, "grad_norm": 0.8234829902648926, "learning_rate": 4.15686274509804e-06, "loss": 1.9187, "step": 106 }, { "epoch": 6.352941176470588, "grad_norm": 0.8216582536697388, "learning_rate": 4.235294117647059e-06, "loss": 1.8121, "step": 108 }, { "epoch": 6.470588235294118, "grad_norm": 0.8760618567466736, "learning_rate": 4.313725490196079e-06, "loss": 1.8794, "step": 110 }, { "epoch": 6.588235294117647, "grad_norm": 0.90522301197052, "learning_rate": 4.392156862745098e-06, "loss": 1.7899, "step": 112 }, { "epoch": 6.705882352941177, "grad_norm": 0.8919849395751953, "learning_rate": 4.4705882352941184e-06, "loss": 1.7929, "step": 114 }, { "epoch": 6.823529411764706, "grad_norm": 1.0193332433700562, "learning_rate": 4.549019607843138e-06, "loss": 1.7409, "step": 116 }, { "epoch": 6.9411764705882355, "grad_norm": 0.9497600793838501, "learning_rate": 4.627450980392157e-06, "loss": 1.7528, "step": 118 }, { "epoch": 7.0, "eval_loss": 1.7024633884429932, "eval_runtime": 14.5072, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "step": 119 }, { "epoch": 7.0588235294117645, "grad_norm": 0.9311454892158508, "learning_rate": 4.705882352941177e-06, "loss": 1.7333, "step": 120 }, { "epoch": 7.176470588235294, "grad_norm": 1.0313152074813843, "learning_rate": 4.784313725490196e-06, "loss": 1.7217, "step": 122 }, { "epoch": 7.294117647058823, "grad_norm": 1.1278079748153687, "learning_rate": 4.862745098039216e-06, "loss": 1.6414, "step": 124 }, { "epoch": 7.411764705882353, "grad_norm": 0.9751306176185608, "learning_rate": 4.941176470588236e-06, "loss": 1.6047, "step": 126 }, { "epoch": 7.529411764705882, "grad_norm": 0.9619643688201904, "learning_rate": 5.019607843137255e-06, "loss": 1.5733, "step": 128 }, { "epoch": 7.647058823529412, "grad_norm": 0.9418209195137024, "learning_rate": 5.098039215686274e-06, "loss": 1.5655, "step": 130 }, { "epoch": 7.764705882352941, "grad_norm": 0.986770749092102, "learning_rate": 5.176470588235295e-06, "loss": 1.5325, "step": 132 }, { "epoch": 7.882352941176471, "grad_norm": 0.8657909631729126, "learning_rate": 5.254901960784314e-06, "loss": 1.5042, "step": 134 }, { "epoch": 8.0, "grad_norm": 0.8987972736358643, "learning_rate": 5.333333333333334e-06, "loss": 1.4935, "step": 136 }, { "epoch": 8.0, "eval_loss": 1.467383861541748, "eval_runtime": 14.5108, "eval_samples_per_second": 2.481, "eval_steps_per_second": 2.481, "step": 136 }, { "epoch": 8.117647058823529, "grad_norm": 0.8275275826454163, "learning_rate": 5.411764705882353e-06, "loss": 1.4215, "step": 138 }, { "epoch": 8.235294117647058, "grad_norm": 0.9540057182312012, "learning_rate": 5.4901960784313735e-06, "loss": 1.4698, "step": 140 }, { "epoch": 8.352941176470589, "grad_norm": 0.9684072136878967, "learning_rate": 5.568627450980393e-06, "loss": 1.4359, "step": 142 }, { "epoch": 8.470588235294118, "grad_norm": 0.9229031801223755, "learning_rate": 5.6470588235294125e-06, "loss": 1.3994, "step": 144 }, { "epoch": 8.588235294117647, "grad_norm": 0.8458110094070435, "learning_rate": 5.725490196078431e-06, "loss": 1.3096, "step": 146 }, { "epoch": 8.705882352941176, "grad_norm": 0.9069352746009827, "learning_rate": 5.803921568627452e-06, "loss": 1.2347, "step": 148 }, { "epoch": 8.823529411764707, "grad_norm": 0.8469833731651306, "learning_rate": 5.882352941176471e-06, "loss": 1.332, "step": 150 }, { "epoch": 8.941176470588236, "grad_norm": 0.8933460116386414, "learning_rate": 5.96078431372549e-06, "loss": 1.2733, "step": 152 }, { "epoch": 9.0, "eval_loss": 1.2421215772628784, "eval_runtime": 14.5471, "eval_samples_per_second": 2.475, "eval_steps_per_second": 2.475, "step": 153 }, { "epoch": 9.058823529411764, "grad_norm": 0.8019786477088928, "learning_rate": 6.03921568627451e-06, "loss": 1.1929, "step": 154 }, { "epoch": 9.176470588235293, "grad_norm": 0.7300643920898438, "learning_rate": 6.11764705882353e-06, "loss": 1.2392, "step": 156 }, { "epoch": 9.294117647058824, "grad_norm": 0.809948742389679, "learning_rate": 6.19607843137255e-06, "loss": 1.1685, "step": 158 }, { "epoch": 9.411764705882353, "grad_norm": 0.6852974891662598, "learning_rate": 6.274509803921569e-06, "loss": 1.168, "step": 160 }, { "epoch": 9.529411764705882, "grad_norm": 0.709697961807251, "learning_rate": 6.352941176470589e-06, "loss": 1.1333, "step": 162 }, { "epoch": 9.647058823529411, "grad_norm": 0.7923583388328552, "learning_rate": 6.431372549019609e-06, "loss": 1.1475, "step": 164 }, { "epoch": 9.764705882352942, "grad_norm": 0.7233794927597046, "learning_rate": 6.5098039215686285e-06, "loss": 1.1775, "step": 166 }, { "epoch": 9.882352941176471, "grad_norm": 0.7074316740036011, "learning_rate": 6.588235294117647e-06, "loss": 1.1279, "step": 168 }, { "epoch": 10.0, "grad_norm": 0.6581458449363708, "learning_rate": 6.666666666666667e-06, "loss": 1.1154, "step": 170 }, { "epoch": 10.0, "eval_loss": 1.1133772134780884, "eval_runtime": 14.5122, "eval_samples_per_second": 2.481, "eval_steps_per_second": 2.481, "step": 170 }, { "epoch": 10.117647058823529, "grad_norm": 0.6955820918083191, "learning_rate": 6.745098039215687e-06, "loss": 1.0662, "step": 172 }, { "epoch": 10.235294117647058, "grad_norm": 0.5870165824890137, "learning_rate": 6.8235294117647065e-06, "loss": 1.0219, "step": 174 }, { "epoch": 10.352941176470589, "grad_norm": 0.6177704334259033, "learning_rate": 6.901960784313726e-06, "loss": 1.0537, "step": 176 }, { "epoch": 10.470588235294118, "grad_norm": 0.6390775442123413, "learning_rate": 6.9803921568627454e-06, "loss": 1.1001, "step": 178 }, { "epoch": 10.588235294117647, "grad_norm": 0.4973801374435425, "learning_rate": 7.058823529411766e-06, "loss": 1.0578, "step": 180 }, { "epoch": 10.705882352941176, "grad_norm": 0.518943190574646, "learning_rate": 7.137254901960785e-06, "loss": 1.1447, "step": 182 }, { "epoch": 10.823529411764707, "grad_norm": 0.6414965987205505, "learning_rate": 7.215686274509805e-06, "loss": 1.0872, "step": 184 }, { "epoch": 10.941176470588236, "grad_norm": 0.508786678314209, "learning_rate": 7.294117647058823e-06, "loss": 1.1202, "step": 186 }, { "epoch": 11.0, "eval_loss": 1.0689375400543213, "eval_runtime": 14.505, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "step": 187 }, { "epoch": 11.058823529411764, "grad_norm": 0.48530295491218567, "learning_rate": 7.372549019607845e-06, "loss": 1.0999, "step": 188 }, { "epoch": 11.176470588235293, "grad_norm": 0.5133592486381531, "learning_rate": 7.450980392156863e-06, "loss": 1.0864, "step": 190 }, { "epoch": 11.294117647058824, "grad_norm": 0.49263596534729004, "learning_rate": 7.529411764705883e-06, "loss": 1.0535, "step": 192 }, { "epoch": 11.411764705882353, "grad_norm": 0.4610048532485962, "learning_rate": 7.607843137254902e-06, "loss": 1.0462, "step": 194 }, { "epoch": 11.529411764705882, "grad_norm": 0.5121297836303711, "learning_rate": 7.686274509803923e-06, "loss": 1.0862, "step": 196 }, { "epoch": 11.647058823529411, "grad_norm": 0.5441015958786011, "learning_rate": 7.764705882352941e-06, "loss": 1.0068, "step": 198 }, { "epoch": 11.764705882352942, "grad_norm": 0.5135095119476318, "learning_rate": 7.84313725490196e-06, "loss": 1.0548, "step": 200 }, { "epoch": 11.882352941176471, "grad_norm": 0.4792177081108093, "learning_rate": 7.92156862745098e-06, "loss": 0.9711, "step": 202 }, { "epoch": 12.0, "grad_norm": 0.45314979553222656, "learning_rate": 8.000000000000001e-06, "loss": 0.9449, "step": 204 }, { "epoch": 12.0, "eval_loss": 1.0450434684753418, "eval_runtime": 14.5066, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "step": 204 }, { "epoch": 12.117647058823529, "grad_norm": 0.5007625818252563, "learning_rate": 8.07843137254902e-06, "loss": 1.0258, "step": 206 }, { "epoch": 12.235294117647058, "grad_norm": 0.5184361934661865, "learning_rate": 8.15686274509804e-06, "loss": 1.0845, "step": 208 }, { "epoch": 12.352941176470589, "grad_norm": 0.44266751408576965, "learning_rate": 8.23529411764706e-06, "loss": 1.0005, "step": 210 }, { "epoch": 12.470588235294118, "grad_norm": 0.5165805220603943, "learning_rate": 8.31372549019608e-06, "loss": 1.0242, "step": 212 }, { "epoch": 12.588235294117647, "grad_norm": 0.5037981867790222, "learning_rate": 8.392156862745099e-06, "loss": 0.9857, "step": 214 }, { "epoch": 12.705882352941176, "grad_norm": 0.5604737997055054, "learning_rate": 8.470588235294118e-06, "loss": 1.0086, "step": 216 }, { "epoch": 12.823529411764707, "grad_norm": 0.6752682328224182, "learning_rate": 8.549019607843138e-06, "loss": 1.0277, "step": 218 }, { "epoch": 12.941176470588236, "grad_norm": 0.5517321228981018, "learning_rate": 8.627450980392157e-06, "loss": 0.9973, "step": 220 }, { "epoch": 13.0, "eval_loss": 1.0252662897109985, "eval_runtime": 14.5028, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "step": 221 }, { "epoch": 13.058823529411764, "grad_norm": 0.6104453802108765, "learning_rate": 8.705882352941177e-06, "loss": 0.9835, "step": 222 }, { "epoch": 13.176470588235293, "grad_norm": 0.47119539976119995, "learning_rate": 8.784313725490196e-06, "loss": 1.0076, "step": 224 }, { "epoch": 13.294117647058824, "grad_norm": 0.4882214367389679, "learning_rate": 8.862745098039216e-06, "loss": 0.9808, "step": 226 }, { "epoch": 13.411764705882353, "grad_norm": 0.7123433947563171, "learning_rate": 8.941176470588237e-06, "loss": 0.9676, "step": 228 }, { "epoch": 13.529411764705882, "grad_norm": 0.5918748378753662, "learning_rate": 9.019607843137256e-06, "loss": 1.0068, "step": 230 }, { "epoch": 13.647058823529411, "grad_norm": 0.5302197337150574, "learning_rate": 9.098039215686276e-06, "loss": 0.9573, "step": 232 }, { "epoch": 13.764705882352942, "grad_norm": 0.5693833827972412, "learning_rate": 9.176470588235294e-06, "loss": 0.9914, "step": 234 }, { "epoch": 13.882352941176471, "grad_norm": 0.490904837846756, "learning_rate": 9.254901960784315e-06, "loss": 1.032, "step": 236 }, { "epoch": 14.0, "grad_norm": 0.5507678389549255, "learning_rate": 9.333333333333334e-06, "loss": 1.0562, "step": 238 }, { "epoch": 14.0, "eval_loss": 1.0090599060058594, "eval_runtime": 14.506, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "step": 238 }, { "epoch": 14.117647058823529, "grad_norm": 0.6389086246490479, "learning_rate": 9.411764705882354e-06, "loss": 0.9853, "step": 240 }, { "epoch": 14.235294117647058, "grad_norm": 0.5049781203269958, "learning_rate": 9.490196078431373e-06, "loss": 1.0067, "step": 242 }, { "epoch": 14.352941176470589, "grad_norm": 0.7086266279220581, "learning_rate": 9.568627450980393e-06, "loss": 0.9387, "step": 244 }, { "epoch": 14.470588235294118, "grad_norm": 0.5628448128700256, "learning_rate": 9.647058823529412e-06, "loss": 1.0068, "step": 246 }, { "epoch": 14.588235294117647, "grad_norm": 0.6910731196403503, "learning_rate": 9.725490196078432e-06, "loss": 1.0007, "step": 248 }, { "epoch": 14.705882352941176, "grad_norm": 0.6134346127510071, "learning_rate": 9.803921568627451e-06, "loss": 0.9456, "step": 250 }, { "epoch": 14.823529411764707, "grad_norm": 0.6747128963470459, "learning_rate": 9.882352941176472e-06, "loss": 0.9506, "step": 252 }, { "epoch": 14.941176470588236, "grad_norm": 0.5889897346496582, "learning_rate": 9.960784313725492e-06, "loss": 0.9947, "step": 254 }, { "epoch": 15.0, "eval_loss": 0.9928128719329834, "eval_runtime": 14.4936, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 255 }, { "epoch": 15.058823529411764, "grad_norm": 0.5487807989120483, "learning_rate": 9.999995315380667e-06, "loss": 0.9354, "step": 256 }, { "epoch": 15.176470588235293, "grad_norm": 0.6178866624832153, "learning_rate": 9.99995783847866e-06, "loss": 0.9655, "step": 258 }, { "epoch": 15.294117647058824, "grad_norm": 0.5696916580200195, "learning_rate": 9.999882884955554e-06, "loss": 0.9468, "step": 260 }, { "epoch": 15.411764705882353, "grad_norm": 0.6009863615036011, "learning_rate": 9.99977045537315e-06, "loss": 0.9852, "step": 262 }, { "epoch": 15.529411764705882, "grad_norm": 0.6040264368057251, "learning_rate": 9.999620550574155e-06, "loss": 0.9553, "step": 264 }, { "epoch": 15.647058823529411, "grad_norm": 0.6321269869804382, "learning_rate": 9.999433171682158e-06, "loss": 0.9419, "step": 266 }, { "epoch": 15.764705882352942, "grad_norm": 0.6273146867752075, "learning_rate": 9.999208320101643e-06, "loss": 0.9715, "step": 268 }, { "epoch": 15.882352941176471, "grad_norm": 0.6734570860862732, "learning_rate": 9.998945997517957e-06, "loss": 0.918, "step": 270 }, { "epoch": 16.0, "grad_norm": 0.7102432250976562, "learning_rate": 9.99864620589731e-06, "loss": 1.0096, "step": 272 }, { "epoch": 16.0, "eval_loss": 0.9803969264030457, "eval_runtime": 14.5083, "eval_samples_per_second": 2.481, "eval_steps_per_second": 2.481, "step": 272 }, { "epoch": 16.11764705882353, "grad_norm": 0.6154859066009521, "learning_rate": 9.998308947486753e-06, "loss": 0.8898, "step": 274 }, { "epoch": 16.235294117647058, "grad_norm": 0.6435267329216003, "learning_rate": 9.997934224814173e-06, "loss": 0.9271, "step": 276 }, { "epoch": 16.352941176470587, "grad_norm": 0.7057787775993347, "learning_rate": 9.997522040688258e-06, "loss": 1.0, "step": 278 }, { "epoch": 16.470588235294116, "grad_norm": 0.6257563233375549, "learning_rate": 9.997072398198492e-06, "loss": 0.973, "step": 280 }, { "epoch": 16.58823529411765, "grad_norm": 0.6798095703125, "learning_rate": 9.996585300715117e-06, "loss": 0.9625, "step": 282 }, { "epoch": 16.705882352941178, "grad_norm": 0.7027468681335449, "learning_rate": 9.996060751889114e-06, "loss": 0.9529, "step": 284 }, { "epoch": 16.823529411764707, "grad_norm": 0.6210634708404541, "learning_rate": 9.995498755652186e-06, "loss": 0.8968, "step": 286 }, { "epoch": 16.941176470588236, "grad_norm": 0.6995490789413452, "learning_rate": 9.994899316216709e-06, "loss": 0.9222, "step": 288 }, { "epoch": 17.0, "eval_loss": 0.9691942930221558, "eval_runtime": 14.5044, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "step": 289 }, { "epoch": 17.058823529411764, "grad_norm": 0.6503624320030212, "learning_rate": 9.994262438075713e-06, "loss": 0.9487, "step": 290 }, { "epoch": 17.176470588235293, "grad_norm": 0.6647483706474304, "learning_rate": 9.993588126002848e-06, "loss": 0.9163, "step": 292 }, { "epoch": 17.294117647058822, "grad_norm": 0.7215944528579712, "learning_rate": 9.992876385052346e-06, "loss": 0.8638, "step": 294 }, { "epoch": 17.41176470588235, "grad_norm": 0.7234969139099121, "learning_rate": 9.992127220558976e-06, "loss": 0.9037, "step": 296 }, { "epoch": 17.529411764705884, "grad_norm": 0.7656229138374329, "learning_rate": 9.991340638138022e-06, "loss": 0.9633, "step": 298 }, { "epoch": 17.647058823529413, "grad_norm": 0.6850258111953735, "learning_rate": 9.990516643685222e-06, "loss": 0.9458, "step": 300 }, { "epoch": 17.764705882352942, "grad_norm": 0.7975447773933411, "learning_rate": 9.98965524337673e-06, "loss": 0.9801, "step": 302 }, { "epoch": 17.88235294117647, "grad_norm": 0.7075424790382385, "learning_rate": 9.988756443669081e-06, "loss": 0.888, "step": 304 }, { "epoch": 18.0, "grad_norm": 0.85096675157547, "learning_rate": 9.987820251299121e-06, "loss": 0.8838, "step": 306 }, { "epoch": 18.0, "eval_loss": 0.9602956771850586, "eval_runtime": 14.5129, "eval_samples_per_second": 2.481, "eval_steps_per_second": 2.481, "step": 306 }, { "epoch": 18.11764705882353, "grad_norm": 0.698685884475708, "learning_rate": 9.98684667328398e-06, "loss": 0.8838, "step": 308 }, { "epoch": 18.235294117647058, "grad_norm": 0.7671274542808533, "learning_rate": 9.985835716921e-06, "loss": 0.9012, "step": 310 }, { "epoch": 18.352941176470587, "grad_norm": 0.8342521786689758, "learning_rate": 9.984787389787689e-06, "loss": 0.9412, "step": 312 }, { "epoch": 18.470588235294116, "grad_norm": 0.6886960864067078, "learning_rate": 9.983701699741668e-06, "loss": 0.8946, "step": 314 }, { "epoch": 18.58823529411765, "grad_norm": 0.7856888175010681, "learning_rate": 9.982578654920601e-06, "loss": 0.9169, "step": 316 }, { "epoch": 18.705882352941178, "grad_norm": 0.7338317036628723, "learning_rate": 9.981418263742148e-06, "loss": 0.8584, "step": 318 }, { "epoch": 18.823529411764707, "grad_norm": 0.727165699005127, "learning_rate": 9.980220534903889e-06, "loss": 0.9385, "step": 320 }, { "epoch": 18.941176470588236, "grad_norm": 0.777866542339325, "learning_rate": 9.978985477383264e-06, "loss": 0.8942, "step": 322 }, { "epoch": 19.0, "eval_loss": 0.951096773147583, "eval_runtime": 14.4924, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 323 }, { "epoch": 19.058823529411764, "grad_norm": 0.6845978498458862, "learning_rate": 9.97771310043751e-06, "loss": 0.8752, "step": 324 }, { "epoch": 19.176470588235293, "grad_norm": 0.7632399201393127, "learning_rate": 9.97640341360358e-06, "loss": 0.9616, "step": 326 }, { "epoch": 19.294117647058822, "grad_norm": 0.7852567434310913, "learning_rate": 9.975056426698094e-06, "loss": 0.8884, "step": 328 }, { "epoch": 19.41176470588235, "grad_norm": 0.7355157136917114, "learning_rate": 9.973672149817232e-06, "loss": 0.8175, "step": 330 }, { "epoch": 19.529411764705884, "grad_norm": 0.7707788348197937, "learning_rate": 9.972250593336689e-06, "loss": 0.8878, "step": 332 }, { "epoch": 19.647058823529413, "grad_norm": 1.0082019567489624, "learning_rate": 9.970791767911581e-06, "loss": 0.9118, "step": 334 }, { "epoch": 19.764705882352942, "grad_norm": 0.8013073205947876, "learning_rate": 9.96929568447637e-06, "loss": 0.8724, "step": 336 }, { "epoch": 19.88235294117647, "grad_norm": 0.6911207437515259, "learning_rate": 9.967762354244778e-06, "loss": 0.8832, "step": 338 }, { "epoch": 20.0, "grad_norm": 0.8336138725280762, "learning_rate": 9.966191788709716e-06, "loss": 0.9058, "step": 340 }, { "epoch": 20.0, "eval_loss": 0.9431850910186768, "eval_runtime": 14.5083, "eval_samples_per_second": 2.481, "eval_steps_per_second": 2.481, "step": 340 }, { "epoch": 20.11764705882353, "grad_norm": 0.7745249271392822, "learning_rate": 9.964583999643174e-06, "loss": 0.878, "step": 342 }, { "epoch": 20.235294117647058, "grad_norm": 0.7922182083129883, "learning_rate": 9.962938999096159e-06, "loss": 0.8275, "step": 344 }, { "epoch": 20.352941176470587, "grad_norm": 0.8610040545463562, "learning_rate": 9.961256799398584e-06, "loss": 0.94, "step": 346 }, { "epoch": 20.470588235294116, "grad_norm": 0.9406768083572388, "learning_rate": 9.95953741315919e-06, "loss": 0.8779, "step": 348 }, { "epoch": 20.58823529411765, "grad_norm": 0.8344603180885315, "learning_rate": 9.957780853265441e-06, "loss": 0.8318, "step": 350 }, { "epoch": 20.705882352941178, "grad_norm": 0.8624390363693237, "learning_rate": 9.955987132883435e-06, "loss": 0.8644, "step": 352 }, { "epoch": 20.823529411764707, "grad_norm": 0.7996507287025452, "learning_rate": 9.954156265457801e-06, "loss": 0.8656, "step": 354 }, { "epoch": 20.941176470588236, "grad_norm": 0.9234054684638977, "learning_rate": 9.952288264711601e-06, "loss": 0.8837, "step": 356 }, { "epoch": 21.0, "eval_loss": 0.9354000091552734, "eval_runtime": 14.5044, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "step": 357 }, { "epoch": 21.058823529411764, "grad_norm": 0.793875515460968, "learning_rate": 9.950383144646221e-06, "loss": 0.8662, "step": 358 }, { "epoch": 21.176470588235293, "grad_norm": 0.8161793947219849, "learning_rate": 9.948440919541277e-06, "loss": 0.8713, "step": 360 }, { "epoch": 21.294117647058822, "grad_norm": 0.9452466368675232, "learning_rate": 9.946461603954499e-06, "loss": 0.9299, "step": 362 }, { "epoch": 21.41176470588235, "grad_norm": 0.8712689876556396, "learning_rate": 9.944445212721619e-06, "loss": 0.84, "step": 364 }, { "epoch": 21.529411764705884, "grad_norm": 0.8613099455833435, "learning_rate": 9.942391760956277e-06, "loss": 0.8523, "step": 366 }, { "epoch": 21.647058823529413, "grad_norm": 1.0285900831222534, "learning_rate": 9.940301264049885e-06, "loss": 0.8411, "step": 368 }, { "epoch": 21.764705882352942, "grad_norm": 0.9434134364128113, "learning_rate": 9.938173737671531e-06, "loss": 0.819, "step": 370 }, { "epoch": 21.88235294117647, "grad_norm": 0.9282283782958984, "learning_rate": 9.936009197767847e-06, "loss": 0.8783, "step": 372 }, { "epoch": 22.0, "grad_norm": 0.9603204131126404, "learning_rate": 9.933807660562898e-06, "loss": 0.795, "step": 374 }, { "epoch": 22.0, "eval_loss": 0.9314696788787842, "eval_runtime": 14.5014, "eval_samples_per_second": 2.483, "eval_steps_per_second": 2.483, "step": 374 }, { "epoch": 22.11764705882353, "grad_norm": 0.9283419847488403, "learning_rate": 9.931569142558057e-06, "loss": 0.8911, "step": 376 }, { "epoch": 22.235294117647058, "grad_norm": 0.985173761844635, "learning_rate": 9.929293660531889e-06, "loss": 0.8351, "step": 378 }, { "epoch": 22.352941176470587, "grad_norm": 0.9488443732261658, "learning_rate": 9.926981231540007e-06, "loss": 0.8245, "step": 380 }, { "epoch": 22.470588235294116, "grad_norm": 1.0252861976623535, "learning_rate": 9.924631872914967e-06, "loss": 0.8096, "step": 382 }, { "epoch": 22.58823529411765, "grad_norm": 0.8986847996711731, "learning_rate": 9.922245602266119e-06, "loss": 0.8311, "step": 384 }, { "epoch": 22.705882352941178, "grad_norm": 0.9069613218307495, "learning_rate": 9.919822437479488e-06, "loss": 0.7961, "step": 386 }, { "epoch": 22.823529411764707, "grad_norm": 0.8006130456924438, "learning_rate": 9.91736239671763e-06, "loss": 0.866, "step": 388 }, { "epoch": 22.941176470588236, "grad_norm": 0.8258039355278015, "learning_rate": 9.91486549841951e-06, "loss": 0.8395, "step": 390 }, { "epoch": 23.0, "eval_loss": 0.9243198037147522, "eval_runtime": 14.5142, "eval_samples_per_second": 2.48, "eval_steps_per_second": 2.48, "step": 391 }, { "epoch": 23.058823529411764, "grad_norm": 1.0394818782806396, "learning_rate": 9.912331761300341e-06, "loss": 0.787, "step": 392 }, { "epoch": 23.176470588235293, "grad_norm": 0.9367055892944336, "learning_rate": 9.909761204351469e-06, "loss": 0.8501, "step": 394 }, { "epoch": 23.294117647058822, "grad_norm": 1.0531871318817139, "learning_rate": 9.90715384684021e-06, "loss": 0.8855, "step": 396 }, { "epoch": 23.41176470588235, "grad_norm": 0.9447432160377502, "learning_rate": 9.904509708309723e-06, "loss": 0.7717, "step": 398 }, { "epoch": 23.529411764705884, "grad_norm": 0.9809987545013428, "learning_rate": 9.901828808578846e-06, "loss": 0.7949, "step": 400 }, { "epoch": 23.647058823529413, "grad_norm": 0.9656151533126831, "learning_rate": 9.899111167741966e-06, "loss": 0.8286, "step": 402 }, { "epoch": 23.764705882352942, "grad_norm": 1.0195831060409546, "learning_rate": 9.896356806168851e-06, "loss": 0.8478, "step": 404 }, { "epoch": 23.88235294117647, "grad_norm": 1.144056797027588, "learning_rate": 9.89356574450451e-06, "loss": 0.7723, "step": 406 }, { "epoch": 24.0, "grad_norm": 1.0349133014678955, "learning_rate": 9.890738003669029e-06, "loss": 0.8308, "step": 408 }, { "epoch": 24.0, "eval_loss": 0.9168965816497803, "eval_runtime": 14.494, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 408 }, { "epoch": 24.11764705882353, "grad_norm": 1.003952980041504, "learning_rate": 9.887873604857424e-06, "loss": 0.8492, "step": 410 }, { "epoch": 24.235294117647058, "grad_norm": 1.1212753057479858, "learning_rate": 9.884972569539471e-06, "loss": 0.8037, "step": 412 }, { "epoch": 24.352941176470587, "grad_norm": 0.995343029499054, "learning_rate": 9.882034919459556e-06, "loss": 0.765, "step": 414 }, { "epoch": 24.470588235294116, "grad_norm": 1.0651168823242188, "learning_rate": 9.879060676636502e-06, "loss": 0.8008, "step": 416 }, { "epoch": 24.58823529411765, "grad_norm": 1.1323087215423584, "learning_rate": 9.876049863363415e-06, "loss": 0.8154, "step": 418 }, { "epoch": 24.705882352941178, "grad_norm": 1.118166446685791, "learning_rate": 9.873002502207502e-06, "loss": 0.7665, "step": 420 }, { "epoch": 24.823529411764707, "grad_norm": 1.1308856010437012, "learning_rate": 9.86991861600992e-06, "loss": 0.8056, "step": 422 }, { "epoch": 24.941176470588236, "grad_norm": 1.0739870071411133, "learning_rate": 9.866798227885588e-06, "loss": 0.7863, "step": 424 }, { "epoch": 25.0, "eval_loss": 0.9137818217277527, "eval_runtime": 14.4961, "eval_samples_per_second": 2.483, "eval_steps_per_second": 2.483, "step": 425 }, { "epoch": 25.058823529411764, "grad_norm": 0.947708785533905, "learning_rate": 9.863641361223025e-06, "loss": 0.746, "step": 426 }, { "epoch": 25.176470588235293, "grad_norm": 1.226585030555725, "learning_rate": 9.860448039684169e-06, "loss": 0.7622, "step": 428 }, { "epoch": 25.294117647058822, "grad_norm": 1.2553542852401733, "learning_rate": 9.857218287204204e-06, "loss": 0.784, "step": 430 }, { "epoch": 25.41176470588235, "grad_norm": 1.130286455154419, "learning_rate": 9.853952127991374e-06, "loss": 0.78, "step": 432 }, { "epoch": 25.529411764705884, "grad_norm": 1.2538301944732666, "learning_rate": 9.850649586526808e-06, "loss": 0.7608, "step": 434 }, { "epoch": 25.647058823529413, "grad_norm": 1.170310378074646, "learning_rate": 9.847310687564335e-06, "loss": 0.8389, "step": 436 }, { "epoch": 25.764705882352942, "grad_norm": 0.9732166528701782, "learning_rate": 9.843935456130295e-06, "loss": 0.8158, "step": 438 }, { "epoch": 25.88235294117647, "grad_norm": 1.2474738359451294, "learning_rate": 9.840523917523354e-06, "loss": 0.7528, "step": 440 }, { "epoch": 26.0, "grad_norm": 1.130893349647522, "learning_rate": 9.83707609731432e-06, "loss": 0.7468, "step": 442 }, { "epoch": 26.0, "eval_loss": 0.9068209528923035, "eval_runtime": 14.5043, "eval_samples_per_second": 2.482, "eval_steps_per_second": 2.482, "step": 442 }, { "epoch": 26.11764705882353, "grad_norm": 1.0452311038970947, "learning_rate": 9.833592021345938e-06, "loss": 0.7589, "step": 444 }, { "epoch": 26.235294117647058, "grad_norm": 0.9809611439704895, "learning_rate": 9.830071715732708e-06, "loss": 0.8016, "step": 446 }, { "epoch": 26.352941176470587, "grad_norm": 1.0656489133834839, "learning_rate": 9.826515206860683e-06, "loss": 0.7417, "step": 448 }, { "epoch": 26.470588235294116, "grad_norm": 1.1188890933990479, "learning_rate": 9.822922521387277e-06, "loss": 0.7569, "step": 450 }, { "epoch": 26.58823529411765, "grad_norm": 1.087983250617981, "learning_rate": 9.819293686241057e-06, "loss": 0.7596, "step": 452 }, { "epoch": 26.705882352941178, "grad_norm": 1.0073840618133545, "learning_rate": 9.81562872862155e-06, "loss": 0.7423, "step": 454 }, { "epoch": 26.823529411764707, "grad_norm": 1.0083576440811157, "learning_rate": 9.811927675999035e-06, "loss": 0.7533, "step": 456 }, { "epoch": 26.941176470588236, "grad_norm": 1.0545302629470825, "learning_rate": 9.808190556114333e-06, "loss": 0.7658, "step": 458 }, { "epoch": 27.0, "eval_loss": 0.9007807970046997, "eval_runtime": 14.5307, "eval_samples_per_second": 2.478, "eval_steps_per_second": 2.478, "step": 459 }, { "epoch": 27.058823529411764, "grad_norm": 0.9539656043052673, "learning_rate": 9.804417396978605e-06, "loss": 0.7658, "step": 460 }, { "epoch": 27.176470588235293, "grad_norm": 1.044712781906128, "learning_rate": 9.800608226873143e-06, "loss": 0.6566, "step": 462 }, { "epoch": 27.294117647058822, "grad_norm": 1.3112603425979614, "learning_rate": 9.796763074349147e-06, "loss": 0.8283, "step": 464 }, { "epoch": 27.41176470588235, "grad_norm": 1.1589727401733398, "learning_rate": 9.792881968227533e-06, "loss": 0.6633, "step": 466 }, { "epoch": 27.529411764705884, "grad_norm": 0.9757166504859924, "learning_rate": 9.788964937598688e-06, "loss": 0.7725, "step": 468 }, { "epoch": 27.647058823529413, "grad_norm": 1.1313936710357666, "learning_rate": 9.78501201182228e-06, "loss": 0.7581, "step": 470 }, { "epoch": 27.764705882352942, "grad_norm": 1.1437342166900635, "learning_rate": 9.781023220527013e-06, "loss": 0.7226, "step": 472 }, { "epoch": 27.88235294117647, "grad_norm": 1.1630206108093262, "learning_rate": 9.776998593610428e-06, "loss": 0.7693, "step": 474 }, { "epoch": 28.0, "grad_norm": 1.0083279609680176, "learning_rate": 9.77293816123866e-06, "loss": 0.7128, "step": 476 }, { "epoch": 28.0, "eval_loss": 0.8991827368736267, "eval_runtime": 14.5006, "eval_samples_per_second": 2.483, "eval_steps_per_second": 2.483, "step": 476 }, { "epoch": 28.11764705882353, "grad_norm": 1.1530383825302124, "learning_rate": 9.768841953846225e-06, "loss": 0.6908, "step": 478 }, { "epoch": 28.235294117647058, "grad_norm": 1.0489223003387451, "learning_rate": 9.764710002135784e-06, "loss": 0.675, "step": 480 }, { "epoch": 28.352941176470587, "grad_norm": 1.2449612617492676, "learning_rate": 9.760542337077914e-06, "loss": 0.7516, "step": 482 }, { "epoch": 28.470588235294116, "grad_norm": 1.1940374374389648, "learning_rate": 9.75633898991088e-06, "loss": 0.7681, "step": 484 }, { "epoch": 28.58823529411765, "grad_norm": 1.1063061952590942, "learning_rate": 9.752099992140401e-06, "loss": 0.7693, "step": 486 }, { "epoch": 28.705882352941178, "grad_norm": 1.1479785442352295, "learning_rate": 9.747825375539401e-06, "loss": 0.7108, "step": 488 }, { "epoch": 28.823529411764707, "grad_norm": 1.2331879138946533, "learning_rate": 9.743515172147793e-06, "loss": 0.7786, "step": 490 }, { "epoch": 28.941176470588236, "grad_norm": 1.1679853200912476, "learning_rate": 9.739169414272219e-06, "loss": 0.6474, "step": 492 }, { "epoch": 29.0, "eval_loss": 0.906444787979126, "eval_runtime": 14.4932, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 493 }, { "epoch": 29.058823529411764, "grad_norm": 1.0444296598434448, "learning_rate": 9.734788134485817e-06, "loss": 0.6756, "step": 494 }, { "epoch": 29.176470588235293, "grad_norm": 1.5380338430404663, "learning_rate": 9.73037136562798e-06, "loss": 0.7099, "step": 496 }, { "epoch": 29.294117647058822, "grad_norm": 1.166580080986023, "learning_rate": 9.7259191408041e-06, "loss": 0.7595, "step": 498 }, { "epoch": 29.41176470588235, "grad_norm": 1.2345106601715088, "learning_rate": 9.721431493385322e-06, "loss": 0.7026, "step": 500 }, { "epoch": 29.529411764705884, "grad_norm": 1.0901451110839844, "learning_rate": 9.71690845700831e-06, "loss": 0.6719, "step": 502 }, { "epoch": 29.647058823529413, "grad_norm": 1.1619518995285034, "learning_rate": 9.71235006557497e-06, "loss": 0.7517, "step": 504 }, { "epoch": 29.764705882352942, "grad_norm": 1.1259740591049194, "learning_rate": 9.707756353252213e-06, "loss": 0.7052, "step": 506 }, { "epoch": 29.88235294117647, "grad_norm": 1.1172682046890259, "learning_rate": 9.70312735447169e-06, "loss": 0.655, "step": 508 }, { "epoch": 30.0, "grad_norm": 1.306216835975647, "learning_rate": 9.698463103929542e-06, "loss": 0.6387, "step": 510 }, { "epoch": 30.0, "eval_loss": 0.9089268445968628, "eval_runtime": 14.4939, "eval_samples_per_second": 2.484, "eval_steps_per_second": 2.484, "step": 510 }, { "epoch": 30.11764705882353, "grad_norm": 1.258402705192566, "learning_rate": 9.693763636586135e-06, "loss": 0.716, "step": 512 }, { "epoch": 30.235294117647058, "grad_norm": 1.143336296081543, "learning_rate": 9.689028987665797e-06, "loss": 0.6283, "step": 514 }, { "epoch": 30.352941176470587, "grad_norm": 1.1861103773117065, "learning_rate": 9.684259192656554e-06, "loss": 0.6445, "step": 516 }, { "epoch": 30.470588235294116, "grad_norm": 1.2192977666854858, "learning_rate": 9.679454287309868e-06, "loss": 0.6928, "step": 518 }, { "epoch": 30.58823529411765, "grad_norm": 1.3194884061813354, "learning_rate": 9.674614307640368e-06, "loss": 0.733, "step": 520 }, { "epoch": 30.705882352941178, "grad_norm": 1.3853224515914917, "learning_rate": 9.669739289925578e-06, "loss": 0.6438, "step": 522 }, { "epoch": 30.823529411764707, "grad_norm": 1.4584524631500244, "learning_rate": 9.664829270705638e-06, "loss": 0.7003, "step": 524 }, { "epoch": 30.941176470588236, "grad_norm": 1.637763500213623, "learning_rate": 9.659884286783052e-06, "loss": 0.6846, "step": 526 }, { "epoch": 31.0, "eval_loss": 0.9096066355705261, "eval_runtime": 14.5201, "eval_samples_per_second": 2.479, "eval_steps_per_second": 2.479, "step": 527 }, { "epoch": 31.058823529411764, "grad_norm": 1.399101972579956, "learning_rate": 9.654904375222384e-06, "loss": 0.617, "step": 528 }, { "epoch": 31.176470588235293, "grad_norm": 1.3545421361923218, "learning_rate": 9.649889573350006e-06, "loss": 0.6534, "step": 530 }, { "epoch": 31.294117647058822, "grad_norm": 1.4606151580810547, "learning_rate": 9.644839918753796e-06, "loss": 0.6815, "step": 532 }, { "epoch": 31.41176470588235, "grad_norm": 1.435264229774475, "learning_rate": 9.639755449282874e-06, "loss": 0.6696, "step": 534 }, { "epoch": 31.529411764705884, "grad_norm": 1.2791359424591064, "learning_rate": 9.634636203047309e-06, "loss": 0.642, "step": 536 }, { "epoch": 31.647058823529413, "grad_norm": 1.2923133373260498, "learning_rate": 9.629482218417834e-06, "loss": 0.712, "step": 538 }, { "epoch": 31.764705882352942, "grad_norm": 1.2450653314590454, "learning_rate": 9.62429353402556e-06, "loss": 0.6357, "step": 540 }, { "epoch": 31.88235294117647, "grad_norm": 1.31989586353302, "learning_rate": 9.619070188761687e-06, "loss": 0.6692, "step": 542 }, { "epoch": 32.0, "grad_norm": 1.3321213722229004, "learning_rate": 9.613812221777212e-06, "loss": 0.6424, "step": 544 }, { "epoch": 32.0, "eval_loss": 0.9172940850257874, "eval_runtime": 14.4984, "eval_samples_per_second": 2.483, "eval_steps_per_second": 2.483, "step": 544 }, { "epoch": 32.11764705882353, "grad_norm": 1.2186630964279175, "learning_rate": 9.608519672482635e-06, "loss": 0.5872, "step": 546 }, { "epoch": 32.23529411764706, "grad_norm": 1.5495742559432983, "learning_rate": 9.603192580547664e-06, "loss": 0.6069, "step": 548 }, { "epoch": 32.35294117647059, "grad_norm": 1.551956295967102, "learning_rate": 9.597830985900913e-06, "loss": 0.6971, "step": 550 }, { "epoch": 32.470588235294116, "grad_norm": 1.5809985399246216, "learning_rate": 9.592434928729617e-06, "loss": 0.6887, "step": 552 }, { "epoch": 32.588235294117645, "grad_norm": 1.5837764739990234, "learning_rate": 9.58700444947931e-06, "loss": 0.6228, "step": 554 }, { "epoch": 32.705882352941174, "grad_norm": 1.4612311124801636, "learning_rate": 9.581539588853539e-06, "loss": 0.6002, "step": 556 }, { "epoch": 32.8235294117647, "grad_norm": 1.4830561876296997, "learning_rate": 9.576040387813553e-06, "loss": 0.6673, "step": 558 }, { "epoch": 32.94117647058823, "grad_norm": 1.5311380624771118, "learning_rate": 9.570506887577994e-06, "loss": 0.6598, "step": 560 }, { "epoch": 33.0, "eval_loss": 0.9237830638885498, "eval_runtime": 14.5075, "eval_samples_per_second": 2.481, "eval_steps_per_second": 2.481, "step": 561 }, { "epoch": 33.05882352941177, "grad_norm": 1.3405797481536865, "learning_rate": 9.564939129622591e-06, "loss": 0.6105, "step": 562 }, { "epoch": 33.1764705882353, "grad_norm": 1.4336148500442505, "learning_rate": 9.559337155679843e-06, "loss": 0.572, "step": 564 }, { "epoch": 33.294117647058826, "grad_norm": 1.4750621318817139, "learning_rate": 9.553701007738717e-06, "loss": 0.5598, "step": 566 }, { "epoch": 33.411764705882355, "grad_norm": 1.4853854179382324, "learning_rate": 9.54803072804433e-06, "loss": 0.6175, "step": 568 }, { "epoch": 33.529411764705884, "grad_norm": 1.5611326694488525, "learning_rate": 9.542326359097619e-06, "loss": 0.5898, "step": 570 }, { "epoch": 33.64705882352941, "grad_norm": 1.4341068267822266, "learning_rate": 9.536587943655043e-06, "loss": 0.6158, "step": 572 }, { "epoch": 33.76470588235294, "grad_norm": 1.3872367143630981, "learning_rate": 9.530815524728245e-06, "loss": 0.6776, "step": 574 }, { "epoch": 33.88235294117647, "grad_norm": 1.3841159343719482, "learning_rate": 9.525009145583746e-06, "loss": 0.6208, "step": 576 }, { "epoch": 34.0, "grad_norm": 1.5026782751083374, "learning_rate": 9.519168849742603e-06, "loss": 0.6634, "step": 578 }, { "epoch": 34.0, "eval_loss": 0.9289535880088806, "eval_runtime": 14.5116, "eval_samples_per_second": 2.481, "eval_steps_per_second": 2.481, "step": 578 }, { "epoch": 34.11764705882353, "grad_norm": 1.5542646646499634, "learning_rate": 9.5132946809801e-06, "loss": 0.6259, "step": 580 }, { "epoch": 34.23529411764706, "grad_norm": 1.337219476699829, "learning_rate": 9.507386683325404e-06, "loss": 0.5992, "step": 582 }, { "epoch": 34.35294117647059, "grad_norm": 1.744362235069275, "learning_rate": 9.501444901061248e-06, "loss": 0.5903, "step": 584 }, { "epoch": 34.470588235294116, "grad_norm": 1.5578619241714478, "learning_rate": 9.495469378723592e-06, "loss": 0.5371, "step": 586 }, { "epoch": 34.588235294117645, "grad_norm": 1.679646611213684, "learning_rate": 9.489460161101291e-06, "loss": 0.617, "step": 588 }, { "epoch": 34.705882352941174, "grad_norm": 1.5505824089050293, "learning_rate": 9.483417293235759e-06, "loss": 0.6008, "step": 590 }, { "epoch": 34.8235294117647, "grad_norm": 1.9452924728393555, "learning_rate": 9.477340820420633e-06, "loss": 0.5852, "step": 592 }, { "epoch": 34.94117647058823, "grad_norm": 1.5196162462234497, "learning_rate": 9.471230788201429e-06, "loss": 0.5893, "step": 594 }, { "epoch": 35.0, "eval_loss": 0.9399586915969849, "eval_runtime": 14.5128, "eval_samples_per_second": 2.481, "eval_steps_per_second": 2.481, "step": 595 }, { "epoch": 35.0, "step": 595, "total_flos": 7.576813686279373e+16, "train_loss": 1.1527870081052058, "train_runtime": 5617.142, "train_samples_per_second": 3.632, "train_steps_per_second": 0.454 } ], "logging_steps": 2, "max_steps": 2550, "num_input_tokens_seen": 0, "num_train_epochs": 150, "save_steps": 25, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 7, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.576813686279373e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }