{
"best_metric": 0.8991827368736267,
"best_model_checkpoint": "data/Llama-31-8B_task-1_180-samples_config-3_full/checkpoint-476",
"epoch": 35.0,
"eval_steps": 500,
"global_step": 595,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.058823529411764705,
"grad_norm": 1.858755111694336,
"learning_rate": 3.9215686274509804e-08,
"loss": 2.4463,
"step": 1
},
{
"epoch": 0.11764705882352941,
"grad_norm": 1.8338021039962769,
"learning_rate": 7.843137254901961e-08,
"loss": 2.4612,
"step": 2
},
{
"epoch": 0.23529411764705882,
"grad_norm": 1.7695642709732056,
"learning_rate": 1.5686274509803921e-07,
"loss": 2.3799,
"step": 4
},
{
"epoch": 0.35294117647058826,
"grad_norm": 1.7253705263137817,
"learning_rate": 2.3529411764705883e-07,
"loss": 2.4519,
"step": 6
},
{
"epoch": 0.47058823529411764,
"grad_norm": 2.068340301513672,
"learning_rate": 3.1372549019607843e-07,
"loss": 2.4357,
"step": 8
},
{
"epoch": 0.5882352941176471,
"grad_norm": 1.71905517578125,
"learning_rate": 3.921568627450981e-07,
"loss": 2.4114,
"step": 10
},
{
"epoch": 0.7058823529411765,
"grad_norm": 1.9960722923278809,
"learning_rate": 4.7058823529411767e-07,
"loss": 2.5452,
"step": 12
},
{
"epoch": 0.8235294117647058,
"grad_norm": 1.846751093864441,
"learning_rate": 5.490196078431373e-07,
"loss": 2.4838,
"step": 14
},
{
"epoch": 0.9411764705882353,
"grad_norm": 1.7892955541610718,
"learning_rate": 6.274509803921569e-07,
"loss": 2.4542,
"step": 16
},
{
"epoch": 1.0,
"eval_loss": 2.4258534908294678,
"eval_runtime": 14.49,
"eval_samples_per_second": 2.484,
"eval_steps_per_second": 2.484,
"step": 17
},
{
"epoch": 1.0588235294117647,
"grad_norm": 1.7462923526763916,
"learning_rate": 7.058823529411766e-07,
"loss": 2.4066,
"step": 18
},
{
"epoch": 1.1764705882352942,
"grad_norm": 1.6423271894454956,
"learning_rate": 7.843137254901962e-07,
"loss": 2.4084,
"step": 20
},
{
"epoch": 1.2941176470588236,
"grad_norm": 1.6562241315841675,
"learning_rate": 8.627450980392157e-07,
"loss": 2.4685,
"step": 22
},
{
"epoch": 1.4117647058823528,
"grad_norm": 1.5601104497909546,
"learning_rate": 9.411764705882353e-07,
"loss": 2.3986,
"step": 24
},
{
"epoch": 1.5294117647058822,
"grad_norm": 1.689158320426941,
"learning_rate": 1.019607843137255e-06,
"loss": 2.512,
"step": 26
},
{
"epoch": 1.6470588235294117,
"grad_norm": 1.7012155055999756,
"learning_rate": 1.0980392156862745e-06,
"loss": 2.434,
"step": 28
},
{
"epoch": 1.7647058823529411,
"grad_norm": 1.533742070198059,
"learning_rate": 1.1764705882352942e-06,
"loss": 2.4145,
"step": 30
},
{
"epoch": 1.8823529411764706,
"grad_norm": 1.6920032501220703,
"learning_rate": 1.2549019607843137e-06,
"loss": 2.4019,
"step": 32
},
{
"epoch": 2.0,
"grad_norm": 1.5552300214767456,
"learning_rate": 1.3333333333333334e-06,
"loss": 2.4022,
"step": 34
},
{
"epoch": 2.0,
"eval_loss": 2.3881916999816895,
"eval_runtime": 14.4942,
"eval_samples_per_second": 2.484,
"eval_steps_per_second": 2.484,
"step": 34
},
{
"epoch": 2.1176470588235294,
"grad_norm": 1.5124330520629883,
"learning_rate": 1.4117647058823531e-06,
"loss": 2.3961,
"step": 36
},
{
"epoch": 2.235294117647059,
"grad_norm": 1.4209071397781372,
"learning_rate": 1.4901960784313726e-06,
"loss": 2.4343,
"step": 38
},
{
"epoch": 2.3529411764705883,
"grad_norm": 1.6290644407272339,
"learning_rate": 1.5686274509803923e-06,
"loss": 2.3528,
"step": 40
},
{
"epoch": 2.4705882352941178,
"grad_norm": 1.5042191743850708,
"learning_rate": 1.6470588235294118e-06,
"loss": 2.3555,
"step": 42
},
{
"epoch": 2.588235294117647,
"grad_norm": 1.6211644411087036,
"learning_rate": 1.7254901960784315e-06,
"loss": 2.3944,
"step": 44
},
{
"epoch": 2.7058823529411766,
"grad_norm": 1.5416394472122192,
"learning_rate": 1.8039215686274512e-06,
"loss": 2.3917,
"step": 46
},
{
"epoch": 2.8235294117647056,
"grad_norm": 1.697242259979248,
"learning_rate": 1.8823529411764707e-06,
"loss": 2.3457,
"step": 48
},
{
"epoch": 2.9411764705882355,
"grad_norm": 1.8130015134811401,
"learning_rate": 1.96078431372549e-06,
"loss": 2.3317,
"step": 50
},
{
"epoch": 3.0,
"eval_loss": 2.314044952392578,
"eval_runtime": 14.4935,
"eval_samples_per_second": 2.484,
"eval_steps_per_second": 2.484,
"step": 51
},
{
"epoch": 3.0588235294117645,
"grad_norm": 1.830562710762024,
"learning_rate": 2.03921568627451e-06,
"loss": 2.2938,
"step": 52
},
{
"epoch": 3.176470588235294,
"grad_norm": 1.8372972011566162,
"learning_rate": 2.1176470588235296e-06,
"loss": 2.3541,
"step": 54
},
{
"epoch": 3.2941176470588234,
"grad_norm": 1.8857609033584595,
"learning_rate": 2.196078431372549e-06,
"loss": 2.2888,
"step": 56
},
{
"epoch": 3.411764705882353,
"grad_norm": 1.7633429765701294,
"learning_rate": 2.274509803921569e-06,
"loss": 2.2616,
"step": 58
},
{
"epoch": 3.5294117647058822,
"grad_norm": 1.5656747817993164,
"learning_rate": 2.3529411764705885e-06,
"loss": 2.2801,
"step": 60
},
{
"epoch": 3.6470588235294117,
"grad_norm": 1.6285021305084229,
"learning_rate": 2.431372549019608e-06,
"loss": 2.3144,
"step": 62
},
{
"epoch": 3.764705882352941,
"grad_norm": 1.531112790107727,
"learning_rate": 2.5098039215686274e-06,
"loss": 2.2294,
"step": 64
},
{
"epoch": 3.8823529411764706,
"grad_norm": 1.527350664138794,
"learning_rate": 2.5882352941176473e-06,
"loss": 2.1976,
"step": 66
},
{
"epoch": 4.0,
"grad_norm": 2.009220838546753,
"learning_rate": 2.666666666666667e-06,
"loss": 2.2607,
"step": 68
},
{
"epoch": 4.0,
"eval_loss": 2.2050342559814453,
"eval_runtime": 14.4977,
"eval_samples_per_second": 2.483,
"eval_steps_per_second": 2.483,
"step": 68
},
{
"epoch": 4.117647058823529,
"grad_norm": 2.025496006011963,
"learning_rate": 2.7450980392156867e-06,
"loss": 2.1834,
"step": 70
},
{
"epoch": 4.235294117647059,
"grad_norm": 1.714682698249817,
"learning_rate": 2.8235294117647062e-06,
"loss": 2.1785,
"step": 72
},
{
"epoch": 4.352941176470588,
"grad_norm": 1.1758134365081787,
"learning_rate": 2.901960784313726e-06,
"loss": 2.2148,
"step": 74
},
{
"epoch": 4.470588235294118,
"grad_norm": 1.2965394258499146,
"learning_rate": 2.980392156862745e-06,
"loss": 2.1797,
"step": 76
},
{
"epoch": 4.588235294117647,
"grad_norm": 1.1413812637329102,
"learning_rate": 3.058823529411765e-06,
"loss": 2.1743,
"step": 78
},
{
"epoch": 4.705882352941177,
"grad_norm": 1.0636754035949707,
"learning_rate": 3.1372549019607846e-06,
"loss": 2.0559,
"step": 80
},
{
"epoch": 4.823529411764706,
"grad_norm": 1.1126306056976318,
"learning_rate": 3.2156862745098045e-06,
"loss": 2.1355,
"step": 82
},
{
"epoch": 4.9411764705882355,
"grad_norm": 1.1904844045639038,
"learning_rate": 3.2941176470588236e-06,
"loss": 2.1352,
"step": 84
},
{
"epoch": 5.0,
"eval_loss": 2.0643370151519775,
"eval_runtime": 14.5066,
"eval_samples_per_second": 2.482,
"eval_steps_per_second": 2.482,
"step": 85
},
{
"epoch": 5.0588235294117645,
"grad_norm": 1.0779309272766113,
"learning_rate": 3.3725490196078435e-06,
"loss": 2.0248,
"step": 86
},
{
"epoch": 5.176470588235294,
"grad_norm": 1.107112169265747,
"learning_rate": 3.450980392156863e-06,
"loss": 2.1049,
"step": 88
},
{
"epoch": 5.294117647058823,
"grad_norm": 0.9876514077186584,
"learning_rate": 3.529411764705883e-06,
"loss": 2.0005,
"step": 90
},
{
"epoch": 5.411764705882353,
"grad_norm": 1.073117733001709,
"learning_rate": 3.6078431372549024e-06,
"loss": 1.9499,
"step": 92
},
{
"epoch": 5.529411764705882,
"grad_norm": 1.0594408512115479,
"learning_rate": 3.6862745098039223e-06,
"loss": 1.9899,
"step": 94
},
{
"epoch": 5.647058823529412,
"grad_norm": 1.0870219469070435,
"learning_rate": 3.7647058823529414e-06,
"loss": 1.9852,
"step": 96
},
{
"epoch": 5.764705882352941,
"grad_norm": 0.9945081472396851,
"learning_rate": 3.843137254901962e-06,
"loss": 1.9981,
"step": 98
},
{
"epoch": 5.882352941176471,
"grad_norm": 0.8944886326789856,
"learning_rate": 3.92156862745098e-06,
"loss": 1.9126,
"step": 100
},
{
"epoch": 6.0,
"grad_norm": 0.8814469575881958,
"learning_rate": 4.000000000000001e-06,
"loss": 1.9456,
"step": 102
},
{
"epoch": 6.0,
"eval_loss": 1.888541340827942,
"eval_runtime": 14.5125,
"eval_samples_per_second": 2.481,
"eval_steps_per_second": 2.481,
"step": 102
},
{
"epoch": 6.117647058823529,
"grad_norm": 0.8919170498847961,
"learning_rate": 4.07843137254902e-06,
"loss": 1.8157,
"step": 104
},
{
"epoch": 6.235294117647059,
"grad_norm": 0.8234829902648926,
"learning_rate": 4.15686274509804e-06,
"loss": 1.9187,
"step": 106
},
{
"epoch": 6.352941176470588,
"grad_norm": 0.8216582536697388,
"learning_rate": 4.235294117647059e-06,
"loss": 1.8121,
"step": 108
},
{
"epoch": 6.470588235294118,
"grad_norm": 0.8760618567466736,
"learning_rate": 4.313725490196079e-06,
"loss": 1.8794,
"step": 110
},
{
"epoch": 6.588235294117647,
"grad_norm": 0.90522301197052,
"learning_rate": 4.392156862745098e-06,
"loss": 1.7899,
"step": 112
},
{
"epoch": 6.705882352941177,
"grad_norm": 0.8919849395751953,
"learning_rate": 4.4705882352941184e-06,
"loss": 1.7929,
"step": 114
},
{
"epoch": 6.823529411764706,
"grad_norm": 1.0193332433700562,
"learning_rate": 4.549019607843138e-06,
"loss": 1.7409,
"step": 116
},
{
"epoch": 6.9411764705882355,
"grad_norm": 0.9497600793838501,
"learning_rate": 4.627450980392157e-06,
"loss": 1.7528,
"step": 118
},
{
"epoch": 7.0,
"eval_loss": 1.7024633884429932,
"eval_runtime": 14.5072,
"eval_samples_per_second": 2.482,
"eval_steps_per_second": 2.482,
"step": 119
},
{
"epoch": 7.0588235294117645,
"grad_norm": 0.9311454892158508,
"learning_rate": 4.705882352941177e-06,
"loss": 1.7333,
"step": 120
},
{
"epoch": 7.176470588235294,
"grad_norm": 1.0313152074813843,
"learning_rate": 4.784313725490196e-06,
"loss": 1.7217,
"step": 122
},
{
"epoch": 7.294117647058823,
"grad_norm": 1.1278079748153687,
"learning_rate": 4.862745098039216e-06,
"loss": 1.6414,
"step": 124
},
{
"epoch": 7.411764705882353,
"grad_norm": 0.9751306176185608,
"learning_rate": 4.941176470588236e-06,
"loss": 1.6047,
"step": 126
},
{
"epoch": 7.529411764705882,
"grad_norm": 0.9619643688201904,
"learning_rate": 5.019607843137255e-06,
"loss": 1.5733,
"step": 128
},
{
"epoch": 7.647058823529412,
"grad_norm": 0.9418209195137024,
"learning_rate": 5.098039215686274e-06,
"loss": 1.5655,
"step": 130
},
{
"epoch": 7.764705882352941,
"grad_norm": 0.986770749092102,
"learning_rate": 5.176470588235295e-06,
"loss": 1.5325,
"step": 132
},
{
"epoch": 7.882352941176471,
"grad_norm": 0.8657909631729126,
"learning_rate": 5.254901960784314e-06,
"loss": 1.5042,
"step": 134
},
{
"epoch": 8.0,
"grad_norm": 0.8987972736358643,
"learning_rate": 5.333333333333334e-06,
"loss": 1.4935,
"step": 136
},
{
"epoch": 8.0,
"eval_loss": 1.467383861541748,
"eval_runtime": 14.5108,
"eval_samples_per_second": 2.481,
"eval_steps_per_second": 2.481,
"step": 136
},
{
"epoch": 8.117647058823529,
"grad_norm": 0.8275275826454163,
"learning_rate": 5.411764705882353e-06,
"loss": 1.4215,
"step": 138
},
{
"epoch": 8.235294117647058,
"grad_norm": 0.9540057182312012,
"learning_rate": 5.4901960784313735e-06,
"loss": 1.4698,
"step": 140
},
{
"epoch": 8.352941176470589,
"grad_norm": 0.9684072136878967,
"learning_rate": 5.568627450980393e-06,
"loss": 1.4359,
"step": 142
},
{
"epoch": 8.470588235294118,
"grad_norm": 0.9229031801223755,
"learning_rate": 5.6470588235294125e-06,
"loss": 1.3994,
"step": 144
},
{
"epoch": 8.588235294117647,
"grad_norm": 0.8458110094070435,
"learning_rate": 5.725490196078431e-06,
"loss": 1.3096,
"step": 146
},
{
"epoch": 8.705882352941176,
"grad_norm": 0.9069352746009827,
"learning_rate": 5.803921568627452e-06,
"loss": 1.2347,
"step": 148
},
{
"epoch": 8.823529411764707,
"grad_norm": 0.8469833731651306,
"learning_rate": 5.882352941176471e-06,
"loss": 1.332,
"step": 150
},
{
"epoch": 8.941176470588236,
"grad_norm": 0.8933460116386414,
"learning_rate": 5.96078431372549e-06,
"loss": 1.2733,
"step": 152
},
{
"epoch": 9.0,
"eval_loss": 1.2421215772628784,
"eval_runtime": 14.5471,
"eval_samples_per_second": 2.475,
"eval_steps_per_second": 2.475,
"step": 153
},
{
"epoch": 9.058823529411764,
"grad_norm": 0.8019786477088928,
"learning_rate": 6.03921568627451e-06,
"loss": 1.1929,
"step": 154
},
{
"epoch": 9.176470588235293,
"grad_norm": 0.7300643920898438,
"learning_rate": 6.11764705882353e-06,
"loss": 1.2392,
"step": 156
},
{
"epoch": 9.294117647058824,
"grad_norm": 0.809948742389679,
"learning_rate": 6.19607843137255e-06,
"loss": 1.1685,
"step": 158
},
{
"epoch": 9.411764705882353,
"grad_norm": 0.6852974891662598,
"learning_rate": 6.274509803921569e-06,
"loss": 1.168,
"step": 160
},
{
"epoch": 9.529411764705882,
"grad_norm": 0.709697961807251,
"learning_rate": 6.352941176470589e-06,
"loss": 1.1333,
"step": 162
},
{
"epoch": 9.647058823529411,
"grad_norm": 0.7923583388328552,
"learning_rate": 6.431372549019609e-06,
"loss": 1.1475,
"step": 164
},
{
"epoch": 9.764705882352942,
"grad_norm": 0.7233794927597046,
"learning_rate": 6.5098039215686285e-06,
"loss": 1.1775,
"step": 166
},
{
"epoch": 9.882352941176471,
"grad_norm": 0.7074316740036011,
"learning_rate": 6.588235294117647e-06,
"loss": 1.1279,
"step": 168
},
{
"epoch": 10.0,
"grad_norm": 0.6581458449363708,
"learning_rate": 6.666666666666667e-06,
"loss": 1.1154,
"step": 170
},
{
"epoch": 10.0,
"eval_loss": 1.1133772134780884,
"eval_runtime": 14.5122,
"eval_samples_per_second": 2.481,
"eval_steps_per_second": 2.481,
"step": 170
},
{
"epoch": 10.117647058823529,
"grad_norm": 0.6955820918083191,
"learning_rate": 6.745098039215687e-06,
"loss": 1.0662,
"step": 172
},
{
"epoch": 10.235294117647058,
"grad_norm": 0.5870165824890137,
"learning_rate": 6.8235294117647065e-06,
"loss": 1.0219,
"step": 174
},
{
"epoch": 10.352941176470589,
"grad_norm": 0.6177704334259033,
"learning_rate": 6.901960784313726e-06,
"loss": 1.0537,
"step": 176
},
{
"epoch": 10.470588235294118,
"grad_norm": 0.6390775442123413,
"learning_rate": 6.9803921568627454e-06,
"loss": 1.1001,
"step": 178
},
{
"epoch": 10.588235294117647,
"grad_norm": 0.4973801374435425,
"learning_rate": 7.058823529411766e-06,
"loss": 1.0578,
"step": 180
},
{
"epoch": 10.705882352941176,
"grad_norm": 0.518943190574646,
"learning_rate": 7.137254901960785e-06,
"loss": 1.1447,
"step": 182
},
{
"epoch": 10.823529411764707,
"grad_norm": 0.6414965987205505,
"learning_rate": 7.215686274509805e-06,
"loss": 1.0872,
"step": 184
},
{
"epoch": 10.941176470588236,
"grad_norm": 0.508786678314209,
"learning_rate": 7.294117647058823e-06,
"loss": 1.1202,
"step": 186
},
{
"epoch": 11.0,
"eval_loss": 1.0689375400543213,
"eval_runtime": 14.505,
"eval_samples_per_second": 2.482,
"eval_steps_per_second": 2.482,
"step": 187
},
{
"epoch": 11.058823529411764,
"grad_norm": 0.48530295491218567,
"learning_rate": 7.372549019607845e-06,
"loss": 1.0999,
"step": 188
},
{
"epoch": 11.176470588235293,
"grad_norm": 0.5133592486381531,
"learning_rate": 7.450980392156863e-06,
"loss": 1.0864,
"step": 190
},
{
"epoch": 11.294117647058824,
"grad_norm": 0.49263596534729004,
"learning_rate": 7.529411764705883e-06,
"loss": 1.0535,
"step": 192
},
{
"epoch": 11.411764705882353,
"grad_norm": 0.4610048532485962,
"learning_rate": 7.607843137254902e-06,
"loss": 1.0462,
"step": 194
},
{
"epoch": 11.529411764705882,
"grad_norm": 0.5121297836303711,
"learning_rate": 7.686274509803923e-06,
"loss": 1.0862,
"step": 196
},
{
"epoch": 11.647058823529411,
"grad_norm": 0.5441015958786011,
"learning_rate": 7.764705882352941e-06,
"loss": 1.0068,
"step": 198
},
{
"epoch": 11.764705882352942,
"grad_norm": 0.5135095119476318,
"learning_rate": 7.84313725490196e-06,
"loss": 1.0548,
"step": 200
},
{
"epoch": 11.882352941176471,
"grad_norm": 0.4792177081108093,
"learning_rate": 7.92156862745098e-06,
"loss": 0.9711,
"step": 202
},
{
"epoch": 12.0,
"grad_norm": 0.45314979553222656,
"learning_rate": 8.000000000000001e-06,
"loss": 0.9449,
"step": 204
},
{
"epoch": 12.0,
"eval_loss": 1.0450434684753418,
"eval_runtime": 14.5066,
"eval_samples_per_second": 2.482,
"eval_steps_per_second": 2.482,
"step": 204
},
{
"epoch": 12.117647058823529,
"grad_norm": 0.5007625818252563,
"learning_rate": 8.07843137254902e-06,
"loss": 1.0258,
"step": 206
},
{
"epoch": 12.235294117647058,
"grad_norm": 0.5184361934661865,
"learning_rate": 8.15686274509804e-06,
"loss": 1.0845,
"step": 208
},
{
"epoch": 12.352941176470589,
"grad_norm": 0.44266751408576965,
"learning_rate": 8.23529411764706e-06,
"loss": 1.0005,
"step": 210
},
{
"epoch": 12.470588235294118,
"grad_norm": 0.5165805220603943,
"learning_rate": 8.31372549019608e-06,
"loss": 1.0242,
"step": 212
},
{
"epoch": 12.588235294117647,
"grad_norm": 0.5037981867790222,
"learning_rate": 8.392156862745099e-06,
"loss": 0.9857,
"step": 214
},
{
"epoch": 12.705882352941176,
"grad_norm": 0.5604737997055054,
"learning_rate": 8.470588235294118e-06,
"loss": 1.0086,
"step": 216
},
{
"epoch": 12.823529411764707,
"grad_norm": 0.6752682328224182,
"learning_rate": 8.549019607843138e-06,
"loss": 1.0277,
"step": 218
},
{
"epoch": 12.941176470588236,
"grad_norm": 0.5517321228981018,
"learning_rate": 8.627450980392157e-06,
"loss": 0.9973,
"step": 220
},
{
"epoch": 13.0,
"eval_loss": 1.0252662897109985,
"eval_runtime": 14.5028,
"eval_samples_per_second": 2.482,
"eval_steps_per_second": 2.482,
"step": 221
},
{
"epoch": 13.058823529411764,
"grad_norm": 0.6104453802108765,
"learning_rate": 8.705882352941177e-06,
"loss": 0.9835,
"step": 222
},
{
"epoch": 13.176470588235293,
"grad_norm": 0.47119539976119995,
"learning_rate": 8.784313725490196e-06,
"loss": 1.0076,
"step": 224
},
{
"epoch": 13.294117647058824,
"grad_norm": 0.4882214367389679,
"learning_rate": 8.862745098039216e-06,
"loss": 0.9808,
"step": 226
},
{
"epoch": 13.411764705882353,
"grad_norm": 0.7123433947563171,
"learning_rate": 8.941176470588237e-06,
"loss": 0.9676,
"step": 228
},
{
"epoch": 13.529411764705882,
"grad_norm": 0.5918748378753662,
"learning_rate": 9.019607843137256e-06,
"loss": 1.0068,
"step": 230
},
{
"epoch": 13.647058823529411,
"grad_norm": 0.5302197337150574,
"learning_rate": 9.098039215686276e-06,
"loss": 0.9573,
"step": 232
},
{
"epoch": 13.764705882352942,
"grad_norm": 0.5693833827972412,
"learning_rate": 9.176470588235294e-06,
"loss": 0.9914,
"step": 234
},
{
"epoch": 13.882352941176471,
"grad_norm": 0.490904837846756,
"learning_rate": 9.254901960784315e-06,
"loss": 1.032,
"step": 236
},
{
"epoch": 14.0,
"grad_norm": 0.5507678389549255,
"learning_rate": 9.333333333333334e-06,
"loss": 1.0562,
"step": 238
},
{
"epoch": 14.0,
"eval_loss": 1.0090599060058594,
"eval_runtime": 14.506,
"eval_samples_per_second": 2.482,
"eval_steps_per_second": 2.482,
"step": 238
},
{
"epoch": 14.117647058823529,
"grad_norm": 0.6389086246490479,
"learning_rate": 9.411764705882354e-06,
"loss": 0.9853,
"step": 240
},
{
"epoch": 14.235294117647058,
"grad_norm": 0.5049781203269958,
"learning_rate": 9.490196078431373e-06,
"loss": 1.0067,
"step": 242
},
{
"epoch": 14.352941176470589,
"grad_norm": 0.7086266279220581,
"learning_rate": 9.568627450980393e-06,
"loss": 0.9387,
"step": 244
},
{
"epoch": 14.470588235294118,
"grad_norm": 0.5628448128700256,
"learning_rate": 9.647058823529412e-06,
"loss": 1.0068,
"step": 246
},
{
"epoch": 14.588235294117647,
"grad_norm": 0.6910731196403503,
"learning_rate": 9.725490196078432e-06,
"loss": 1.0007,
"step": 248
},
{
"epoch": 14.705882352941176,
"grad_norm": 0.6134346127510071,
"learning_rate": 9.803921568627451e-06,
"loss": 0.9456,
"step": 250
},
{
"epoch": 14.823529411764707,
"grad_norm": 0.6747128963470459,
"learning_rate": 9.882352941176472e-06,
"loss": 0.9506,
"step": 252
},
{
"epoch": 14.941176470588236,
"grad_norm": 0.5889897346496582,
"learning_rate": 9.960784313725492e-06,
"loss": 0.9947,
"step": 254
},
{
"epoch": 15.0,
"eval_loss": 0.9928128719329834,
"eval_runtime": 14.4936,
"eval_samples_per_second": 2.484,
"eval_steps_per_second": 2.484,
"step": 255
},
{
"epoch": 15.058823529411764,
"grad_norm": 0.5487807989120483,
"learning_rate": 9.999995315380667e-06,
"loss": 0.9354,
"step": 256
},
{
"epoch": 15.176470588235293,
"grad_norm": 0.6178866624832153,
"learning_rate": 9.99995783847866e-06,
"loss": 0.9655,
"step": 258
},
{
"epoch": 15.294117647058824,
"grad_norm": 0.5696916580200195,
"learning_rate": 9.999882884955554e-06,
"loss": 0.9468,
"step": 260
},
{
"epoch": 15.411764705882353,
"grad_norm": 0.6009863615036011,
"learning_rate": 9.99977045537315e-06,
"loss": 0.9852,
"step": 262
},
{
"epoch": 15.529411764705882,
"grad_norm": 0.6040264368057251,
"learning_rate": 9.999620550574155e-06,
"loss": 0.9553,
"step": 264
},
{
"epoch": 15.647058823529411,
"grad_norm": 0.6321269869804382,
"learning_rate": 9.999433171682158e-06,
"loss": 0.9419,
"step": 266
},
{
"epoch": 15.764705882352942,
"grad_norm": 0.6273146867752075,
"learning_rate": 9.999208320101643e-06,
"loss": 0.9715,
"step": 268
},
{
"epoch": 15.882352941176471,
"grad_norm": 0.6734570860862732,
"learning_rate": 9.998945997517957e-06,
"loss": 0.918,
"step": 270
},
{
"epoch": 16.0,
"grad_norm": 0.7102432250976562,
"learning_rate": 9.99864620589731e-06,
"loss": 1.0096,
"step": 272
},
{
"epoch": 16.0,
"eval_loss": 0.9803969264030457,
"eval_runtime": 14.5083,
"eval_samples_per_second": 2.481,
"eval_steps_per_second": 2.481,
"step": 272
},
{
"epoch": 16.11764705882353,
"grad_norm": 0.6154859066009521,
"learning_rate": 9.998308947486753e-06,
"loss": 0.8898,
"step": 274
},
{
"epoch": 16.235294117647058,
"grad_norm": 0.6435267329216003,
"learning_rate": 9.997934224814173e-06,
"loss": 0.9271,
"step": 276
},
{
"epoch": 16.352941176470587,
"grad_norm": 0.7057787775993347,
"learning_rate": 9.997522040688258e-06,
"loss": 1.0,
"step": 278
},
{
"epoch": 16.470588235294116,
"grad_norm": 0.6257563233375549,
"learning_rate": 9.997072398198492e-06,
"loss": 0.973,
"step": 280
},
{
"epoch": 16.58823529411765,
"grad_norm": 0.6798095703125,
"learning_rate": 9.996585300715117e-06,
"loss": 0.9625,
"step": 282
},
{
"epoch": 16.705882352941178,
"grad_norm": 0.7027468681335449,
"learning_rate": 9.996060751889114e-06,
"loss": 0.9529,
"step": 284
},
{
"epoch": 16.823529411764707,
"grad_norm": 0.6210634708404541,
"learning_rate": 9.995498755652186e-06,
"loss": 0.8968,
"step": 286
},
{
"epoch": 16.941176470588236,
"grad_norm": 0.6995490789413452,
"learning_rate": 9.994899316216709e-06,
"loss": 0.9222,
"step": 288
},
{
"epoch": 17.0,
"eval_loss": 0.9691942930221558,
"eval_runtime": 14.5044,
"eval_samples_per_second": 2.482,
"eval_steps_per_second": 2.482,
"step": 289
},
{
"epoch": 17.058823529411764,
"grad_norm": 0.6503624320030212,
"learning_rate": 9.994262438075713e-06,
"loss": 0.9487,
"step": 290
},
{
"epoch": 17.176470588235293,
"grad_norm": 0.6647483706474304,
"learning_rate": 9.993588126002848e-06,
"loss": 0.9163,
"step": 292
},
{
"epoch": 17.294117647058822,
"grad_norm": 0.7215944528579712,
"learning_rate": 9.992876385052346e-06,
"loss": 0.8638,
"step": 294
},
{
"epoch": 17.41176470588235,
"grad_norm": 0.7234969139099121,
"learning_rate": 9.992127220558976e-06,
"loss": 0.9037,
"step": 296
},
{
"epoch": 17.529411764705884,
"grad_norm": 0.7656229138374329,
"learning_rate": 9.991340638138022e-06,
"loss": 0.9633,
"step": 298
},
{
"epoch": 17.647058823529413,
"grad_norm": 0.6850258111953735,
"learning_rate": 9.990516643685222e-06,
"loss": 0.9458,
"step": 300
},
{
"epoch": 17.764705882352942,
"grad_norm": 0.7975447773933411,
"learning_rate": 9.98965524337673e-06,
"loss": 0.9801,
"step": 302
},
{
"epoch": 17.88235294117647,
"grad_norm": 0.7075424790382385,
"learning_rate": 9.988756443669081e-06,
"loss": 0.888,
"step": 304
},
{
"epoch": 18.0,
"grad_norm": 0.85096675157547,
"learning_rate": 9.987820251299121e-06,
"loss": 0.8838,
"step": 306
},
{
"epoch": 18.0,
"eval_loss": 0.9602956771850586,
"eval_runtime": 14.5129,
"eval_samples_per_second": 2.481,
"eval_steps_per_second": 2.481,
"step": 306
},
{
"epoch": 18.11764705882353,
"grad_norm": 0.698685884475708,
"learning_rate": 9.98684667328398e-06,
"loss": 0.8838,
"step": 308
},
{
"epoch": 18.235294117647058,
"grad_norm": 0.7671274542808533,
"learning_rate": 9.985835716921e-06,
"loss": 0.9012,
"step": 310
},
{
"epoch": 18.352941176470587,
"grad_norm": 0.8342521786689758,
"learning_rate": 9.984787389787689e-06,
"loss": 0.9412,
"step": 312
},
{
"epoch": 18.470588235294116,
"grad_norm": 0.6886960864067078,
"learning_rate": 9.983701699741668e-06,
"loss": 0.8946,
"step": 314
},
{
"epoch": 18.58823529411765,
"grad_norm": 0.7856888175010681,
"learning_rate": 9.982578654920601e-06,
"loss": 0.9169,
"step": 316
},
{
"epoch": 18.705882352941178,
"grad_norm": 0.7338317036628723,
"learning_rate": 9.981418263742148e-06,
"loss": 0.8584,
"step": 318
},
{
"epoch": 18.823529411764707,
"grad_norm": 0.727165699005127,
"learning_rate": 9.980220534903889e-06,
"loss": 0.9385,
"step": 320
},
{
"epoch": 18.941176470588236,
"grad_norm": 0.777866542339325,
"learning_rate": 9.978985477383264e-06,
"loss": 0.8942,
"step": 322
},
{
"epoch": 19.0,
"eval_loss": 0.951096773147583,
"eval_runtime": 14.4924,
"eval_samples_per_second": 2.484,
"eval_steps_per_second": 2.484,
"step": 323
},
{
"epoch": 19.058823529411764,
"grad_norm": 0.6845978498458862,
"learning_rate": 9.97771310043751e-06,
"loss": 0.8752,
"step": 324
},
{
"epoch": 19.176470588235293,
"grad_norm": 0.7632399201393127,
"learning_rate": 9.97640341360358e-06,
"loss": 0.9616,
"step": 326
},
{
"epoch": 19.294117647058822,
"grad_norm": 0.7852567434310913,
"learning_rate": 9.975056426698094e-06,
"loss": 0.8884,
"step": 328
},
{
"epoch": 19.41176470588235,
"grad_norm": 0.7355157136917114,
"learning_rate": 9.973672149817232e-06,
"loss": 0.8175,
"step": 330
},
{
"epoch": 19.529411764705884,
"grad_norm": 0.7707788348197937,
"learning_rate": 9.972250593336689e-06,
"loss": 0.8878,
"step": 332
},
{
"epoch": 19.647058823529413,
"grad_norm": 1.0082019567489624,
"learning_rate": 9.970791767911581e-06,
"loss": 0.9118,
"step": 334
},
{
"epoch": 19.764705882352942,
"grad_norm": 0.8013073205947876,
"learning_rate": 9.96929568447637e-06,
"loss": 0.8724,
"step": 336
},
{
"epoch": 19.88235294117647,
"grad_norm": 0.6911207437515259,
"learning_rate": 9.967762354244778e-06,
"loss": 0.8832,
"step": 338
},
{
"epoch": 20.0,
"grad_norm": 0.8336138725280762,
"learning_rate": 9.966191788709716e-06,
"loss": 0.9058,
"step": 340
},
{
"epoch": 20.0,
"eval_loss": 0.9431850910186768,
"eval_runtime": 14.5083,
"eval_samples_per_second": 2.481,
"eval_steps_per_second": 2.481,
"step": 340
},
{
"epoch": 20.11764705882353,
"grad_norm": 0.7745249271392822,
"learning_rate": 9.964583999643174e-06,
"loss": 0.878,
"step": 342
},
{
"epoch": 20.235294117647058,
"grad_norm": 0.7922182083129883,
"learning_rate": 9.962938999096159e-06,
"loss": 0.8275,
"step": 344
},
{
"epoch": 20.352941176470587,
"grad_norm": 0.8610040545463562,
"learning_rate": 9.961256799398584e-06,
"loss": 0.94,
"step": 346
},
{
"epoch": 20.470588235294116,
"grad_norm": 0.9406768083572388,
"learning_rate": 9.95953741315919e-06,
"loss": 0.8779,
"step": 348
},
{
"epoch": 20.58823529411765,
"grad_norm": 0.8344603180885315,
"learning_rate": 9.957780853265441e-06,
"loss": 0.8318,
"step": 350
},
{
"epoch": 20.705882352941178,
"grad_norm": 0.8624390363693237,
"learning_rate": 9.955987132883435e-06,
"loss": 0.8644,
"step": 352
},
{
"epoch": 20.823529411764707,
"grad_norm": 0.7996507287025452,
"learning_rate": 9.954156265457801e-06,
"loss": 0.8656,
"step": 354
},
{
"epoch": 20.941176470588236,
"grad_norm": 0.9234054684638977,
"learning_rate": 9.952288264711601e-06,
"loss": 0.8837,
"step": 356
},
{
"epoch": 21.0,
"eval_loss": 0.9354000091552734,
"eval_runtime": 14.5044,
"eval_samples_per_second": 2.482,
"eval_steps_per_second": 2.482,
"step": 357
},
{
"epoch": 21.058823529411764,
"grad_norm": 0.793875515460968,
"learning_rate": 9.950383144646221e-06,
"loss": 0.8662,
"step": 358
},
{
"epoch": 21.176470588235293,
"grad_norm": 0.8161793947219849,
"learning_rate": 9.948440919541277e-06,
"loss": 0.8713,
"step": 360
},
{
"epoch": 21.294117647058822,
"grad_norm": 0.9452466368675232,
"learning_rate": 9.946461603954499e-06,
"loss": 0.9299,
"step": 362
},
{
"epoch": 21.41176470588235,
"grad_norm": 0.8712689876556396,
"learning_rate": 9.944445212721619e-06,
"loss": 0.84,
"step": 364
},
{
"epoch": 21.529411764705884,
"grad_norm": 0.8613099455833435,
"learning_rate": 9.942391760956277e-06,
"loss": 0.8523,
"step": 366
},
{
"epoch": 21.647058823529413,
"grad_norm": 1.0285900831222534,
"learning_rate": 9.940301264049885e-06,
"loss": 0.8411,
"step": 368
},
{
"epoch": 21.764705882352942,
"grad_norm": 0.9434134364128113,
"learning_rate": 9.938173737671531e-06,
"loss": 0.819,
"step": 370
},
{
"epoch": 21.88235294117647,
"grad_norm": 0.9282283782958984,
"learning_rate": 9.936009197767847e-06,
"loss": 0.8783,
"step": 372
},
{
"epoch": 22.0,
"grad_norm": 0.9603204131126404,
"learning_rate": 9.933807660562898e-06,
"loss": 0.795,
"step": 374
},
{
"epoch": 22.0,
"eval_loss": 0.9314696788787842,
"eval_runtime": 14.5014,
"eval_samples_per_second": 2.483,
"eval_steps_per_second": 2.483,
"step": 374
},
{
"epoch": 22.11764705882353,
"grad_norm": 0.9283419847488403,
"learning_rate": 9.931569142558057e-06,
"loss": 0.8911,
"step": 376
},
{
"epoch": 22.235294117647058,
"grad_norm": 0.985173761844635,
"learning_rate": 9.929293660531889e-06,
"loss": 0.8351,
"step": 378
},
{
"epoch": 22.352941176470587,
"grad_norm": 0.9488443732261658,
"learning_rate": 9.926981231540007e-06,
"loss": 0.8245,
"step": 380
},
{
"epoch": 22.470588235294116,
"grad_norm": 1.0252861976623535,
"learning_rate": 9.924631872914967e-06,
"loss": 0.8096,
"step": 382
},
{
"epoch": 22.58823529411765,
"grad_norm": 0.8986847996711731,
"learning_rate": 9.922245602266119e-06,
"loss": 0.8311,
"step": 384
},
{
"epoch": 22.705882352941178,
"grad_norm": 0.9069613218307495,
"learning_rate": 9.919822437479488e-06,
"loss": 0.7961,
"step": 386
},
{
"epoch": 22.823529411764707,
"grad_norm": 0.8006130456924438,
"learning_rate": 9.91736239671763e-06,
"loss": 0.866,
"step": 388
},
{
"epoch": 22.941176470588236,
"grad_norm": 0.8258039355278015,
"learning_rate": 9.91486549841951e-06,
"loss": 0.8395,
"step": 390
},
{
"epoch": 23.0,
"eval_loss": 0.9243198037147522,
"eval_runtime": 14.5142,
"eval_samples_per_second": 2.48,
"eval_steps_per_second": 2.48,
"step": 391
},
{
"epoch": 23.058823529411764,
"grad_norm": 1.0394818782806396,
"learning_rate": 9.912331761300341e-06,
"loss": 0.787,
"step": 392
},
{
"epoch": 23.176470588235293,
"grad_norm": 0.9367055892944336,
"learning_rate": 9.909761204351469e-06,
"loss": 0.8501,
"step": 394
},
{
"epoch": 23.294117647058822,
"grad_norm": 1.0531871318817139,
"learning_rate": 9.90715384684021e-06,
"loss": 0.8855,
"step": 396
},
{
"epoch": 23.41176470588235,
"grad_norm": 0.9447432160377502,
"learning_rate": 9.904509708309723e-06,
"loss": 0.7717,
"step": 398
},
{
"epoch": 23.529411764705884,
"grad_norm": 0.9809987545013428,
"learning_rate": 9.901828808578846e-06,
"loss": 0.7949,
"step": 400
},
{
"epoch": 23.647058823529413,
"grad_norm": 0.9656151533126831,
"learning_rate": 9.899111167741966e-06,
"loss": 0.8286,
"step": 402
},
{
"epoch": 23.764705882352942,
"grad_norm": 1.0195831060409546,
"learning_rate": 9.896356806168851e-06,
"loss": 0.8478,
"step": 404
},
{
"epoch": 23.88235294117647,
"grad_norm": 1.144056797027588,
"learning_rate": 9.89356574450451e-06,
"loss": 0.7723,
"step": 406
},
{
"epoch": 24.0,
"grad_norm": 1.0349133014678955,
"learning_rate": 9.890738003669029e-06,
"loss": 0.8308,
"step": 408
},
{
"epoch": 24.0,
"eval_loss": 0.9168965816497803,
"eval_runtime": 14.494,
"eval_samples_per_second": 2.484,
"eval_steps_per_second": 2.484,
"step": 408
},
{
"epoch": 24.11764705882353,
"grad_norm": 1.003952980041504,
"learning_rate": 9.887873604857424e-06,
"loss": 0.8492,
"step": 410
},
{
"epoch": 24.235294117647058,
"grad_norm": 1.1212753057479858,
"learning_rate": 9.884972569539471e-06,
"loss": 0.8037,
"step": 412
},
{
"epoch": 24.352941176470587,
"grad_norm": 0.995343029499054,
"learning_rate": 9.882034919459556e-06,
"loss": 0.765,
"step": 414
},
{
"epoch": 24.470588235294116,
"grad_norm": 1.0651168823242188,
"learning_rate": 9.879060676636502e-06,
"loss": 0.8008,
"step": 416
},
{
"epoch": 24.58823529411765,
"grad_norm": 1.1323087215423584,
"learning_rate": 9.876049863363415e-06,
"loss": 0.8154,
"step": 418
},
{
"epoch": 24.705882352941178,
"grad_norm": 1.118166446685791,
"learning_rate": 9.873002502207502e-06,
"loss": 0.7665,
"step": 420
},
{
"epoch": 24.823529411764707,
"grad_norm": 1.1308856010437012,
"learning_rate": 9.86991861600992e-06,
"loss": 0.8056,
"step": 422
},
{
"epoch": 24.941176470588236,
"grad_norm": 1.0739870071411133,
"learning_rate": 9.866798227885588e-06,
"loss": 0.7863,
"step": 424
},
{
"epoch": 25.0,
"eval_loss": 0.9137818217277527,
"eval_runtime": 14.4961,
"eval_samples_per_second": 2.483,
"eval_steps_per_second": 2.483,
"step": 425
},
{
"epoch": 25.058823529411764,
"grad_norm": 0.947708785533905,
"learning_rate": 9.863641361223025e-06,
"loss": 0.746,
"step": 426
},
{
"epoch": 25.176470588235293,
"grad_norm": 1.226585030555725,
"learning_rate": 9.860448039684169e-06,
"loss": 0.7622,
"step": 428
},
{
"epoch": 25.294117647058822,
"grad_norm": 1.2553542852401733,
"learning_rate": 9.857218287204204e-06,
"loss": 0.784,
"step": 430
},
{
"epoch": 25.41176470588235,
"grad_norm": 1.130286455154419,
"learning_rate": 9.853952127991374e-06,
"loss": 0.78,
"step": 432
},
{
"epoch": 25.529411764705884,
"grad_norm": 1.2538301944732666,
"learning_rate": 9.850649586526808e-06,
"loss": 0.7608,
"step": 434
},
{
"epoch": 25.647058823529413,
"grad_norm": 1.170310378074646,
"learning_rate": 9.847310687564335e-06,
"loss": 0.8389,
"step": 436
},
{
"epoch": 25.764705882352942,
"grad_norm": 0.9732166528701782,
"learning_rate": 9.843935456130295e-06,
"loss": 0.8158,
"step": 438
},
{
"epoch": 25.88235294117647,
"grad_norm": 1.2474738359451294,
"learning_rate": 9.840523917523354e-06,
"loss": 0.7528,
"step": 440
},
{
"epoch": 26.0,
"grad_norm": 1.130893349647522,
"learning_rate": 9.83707609731432e-06,
"loss": 0.7468,
"step": 442
},
{
"epoch": 26.0,
"eval_loss": 0.9068209528923035,
"eval_runtime": 14.5043,
"eval_samples_per_second": 2.482,
"eval_steps_per_second": 2.482,
"step": 442
},
{
"epoch": 26.11764705882353,
"grad_norm": 1.0452311038970947,
"learning_rate": 9.833592021345938e-06,
"loss": 0.7589,
"step": 444
},
{
"epoch": 26.235294117647058,
"grad_norm": 0.9809611439704895,
"learning_rate": 9.830071715732708e-06,
"loss": 0.8016,
"step": 446
},
{
"epoch": 26.352941176470587,
"grad_norm": 1.0656489133834839,
"learning_rate": 9.826515206860683e-06,
"loss": 0.7417,
"step": 448
},
{
"epoch": 26.470588235294116,
"grad_norm": 1.1188890933990479,
"learning_rate": 9.822922521387277e-06,
"loss": 0.7569,
"step": 450
},
{
"epoch": 26.58823529411765,
"grad_norm": 1.087983250617981,
"learning_rate": 9.819293686241057e-06,
"loss": 0.7596,
"step": 452
},
{
"epoch": 26.705882352941178,
"grad_norm": 1.0073840618133545,
"learning_rate": 9.81562872862155e-06,
"loss": 0.7423,
"step": 454
},
{
"epoch": 26.823529411764707,
"grad_norm": 1.0083576440811157,
"learning_rate": 9.811927675999035e-06,
"loss": 0.7533,
"step": 456
},
{
"epoch": 26.941176470588236,
"grad_norm": 1.0545302629470825,
"learning_rate": 9.808190556114333e-06,
"loss": 0.7658,
"step": 458
},
{
"epoch": 27.0,
"eval_loss": 0.9007807970046997,
"eval_runtime": 14.5307,
"eval_samples_per_second": 2.478,
"eval_steps_per_second": 2.478,
"step": 459
},
{
"epoch": 27.058823529411764,
"grad_norm": 0.9539656043052673,
"learning_rate": 9.804417396978605e-06,
"loss": 0.7658,
"step": 460
},
{
"epoch": 27.176470588235293,
"grad_norm": 1.044712781906128,
"learning_rate": 9.800608226873143e-06,
"loss": 0.6566,
"step": 462
},
{
"epoch": 27.294117647058822,
"grad_norm": 1.3112603425979614,
"learning_rate": 9.796763074349147e-06,
"loss": 0.8283,
"step": 464
},
{
"epoch": 27.41176470588235,
"grad_norm": 1.1589727401733398,
"learning_rate": 9.792881968227533e-06,
"loss": 0.6633,
"step": 466
},
{
"epoch": 27.529411764705884,
"grad_norm": 0.9757166504859924,
"learning_rate": 9.788964937598688e-06,
"loss": 0.7725,
"step": 468
},
{
"epoch": 27.647058823529413,
"grad_norm": 1.1313936710357666,
"learning_rate": 9.78501201182228e-06,
"loss": 0.7581,
"step": 470
},
{
"epoch": 27.764705882352942,
"grad_norm": 1.1437342166900635,
"learning_rate": 9.781023220527013e-06,
"loss": 0.7226,
"step": 472
},
{
"epoch": 27.88235294117647,
"grad_norm": 1.1630206108093262,
"learning_rate": 9.776998593610428e-06,
"loss": 0.7693,
"step": 474
},
{
"epoch": 28.0,
"grad_norm": 1.0083279609680176,
"learning_rate": 9.77293816123866e-06,
"loss": 0.7128,
"step": 476
},
{
"epoch": 28.0,
"eval_loss": 0.8991827368736267,
"eval_runtime": 14.5006,
"eval_samples_per_second": 2.483,
"eval_steps_per_second": 2.483,
"step": 476
},
{
"epoch": 28.11764705882353,
"grad_norm": 1.1530383825302124,
"learning_rate": 9.768841953846225e-06,
"loss": 0.6908,
"step": 478
},
{
"epoch": 28.235294117647058,
"grad_norm": 1.0489223003387451,
"learning_rate": 9.764710002135784e-06,
"loss": 0.675,
"step": 480
},
{
"epoch": 28.352941176470587,
"grad_norm": 1.2449612617492676,
"learning_rate": 9.760542337077914e-06,
"loss": 0.7516,
"step": 482
},
{
"epoch": 28.470588235294116,
"grad_norm": 1.1940374374389648,
"learning_rate": 9.75633898991088e-06,
"loss": 0.7681,
"step": 484
},
{
"epoch": 28.58823529411765,
"grad_norm": 1.1063061952590942,
"learning_rate": 9.752099992140401e-06,
"loss": 0.7693,
"step": 486
},
{
"epoch": 28.705882352941178,
"grad_norm": 1.1479785442352295,
"learning_rate": 9.747825375539401e-06,
"loss": 0.7108,
"step": 488
},
{
"epoch": 28.823529411764707,
"grad_norm": 1.2331879138946533,
"learning_rate": 9.743515172147793e-06,
"loss": 0.7786,
"step": 490
},
{
"epoch": 28.941176470588236,
"grad_norm": 1.1679853200912476,
"learning_rate": 9.739169414272219e-06,
"loss": 0.6474,
"step": 492
},
{
"epoch": 29.0,
"eval_loss": 0.906444787979126,
"eval_runtime": 14.4932,
"eval_samples_per_second": 2.484,
"eval_steps_per_second": 2.484,
"step": 493
},
{
"epoch": 29.058823529411764,
"grad_norm": 1.0444296598434448,
"learning_rate": 9.734788134485817e-06,
"loss": 0.6756,
"step": 494
},
{
"epoch": 29.176470588235293,
"grad_norm": 1.5380338430404663,
"learning_rate": 9.73037136562798e-06,
"loss": 0.7099,
"step": 496
},
{
"epoch": 29.294117647058822,
"grad_norm": 1.166580080986023,
"learning_rate": 9.7259191408041e-06,
"loss": 0.7595,
"step": 498
},
{
"epoch": 29.41176470588235,
"grad_norm": 1.2345106601715088,
"learning_rate": 9.721431493385322e-06,
"loss": 0.7026,
"step": 500
},
{
"epoch": 29.529411764705884,
"grad_norm": 1.0901451110839844,
"learning_rate": 9.71690845700831e-06,
"loss": 0.6719,
"step": 502
},
{
"epoch": 29.647058823529413,
"grad_norm": 1.1619518995285034,
"learning_rate": 9.71235006557497e-06,
"loss": 0.7517,
"step": 504
},
{
"epoch": 29.764705882352942,
"grad_norm": 1.1259740591049194,
"learning_rate": 9.707756353252213e-06,
"loss": 0.7052,
"step": 506
},
{
"epoch": 29.88235294117647,
"grad_norm": 1.1172682046890259,
"learning_rate": 9.70312735447169e-06,
"loss": 0.655,
"step": 508
},
{
"epoch": 30.0,
"grad_norm": 1.306216835975647,
"learning_rate": 9.698463103929542e-06,
"loss": 0.6387,
"step": 510
},
{
"epoch": 30.0,
"eval_loss": 0.9089268445968628,
"eval_runtime": 14.4939,
"eval_samples_per_second": 2.484,
"eval_steps_per_second": 2.484,
"step": 510
},
{
"epoch": 30.11764705882353,
"grad_norm": 1.258402705192566,
"learning_rate": 9.693763636586135e-06,
"loss": 0.716,
"step": 512
},
{
"epoch": 30.235294117647058,
"grad_norm": 1.143336296081543,
"learning_rate": 9.689028987665797e-06,
"loss": 0.6283,
"step": 514
},
{
"epoch": 30.352941176470587,
"grad_norm": 1.1861103773117065,
"learning_rate": 9.684259192656554e-06,
"loss": 0.6445,
"step": 516
},
{
"epoch": 30.470588235294116,
"grad_norm": 1.2192977666854858,
"learning_rate": 9.679454287309868e-06,
"loss": 0.6928,
"step": 518
},
{
"epoch": 30.58823529411765,
"grad_norm": 1.3194884061813354,
"learning_rate": 9.674614307640368e-06,
"loss": 0.733,
"step": 520
},
{
"epoch": 30.705882352941178,
"grad_norm": 1.3853224515914917,
"learning_rate": 9.669739289925578e-06,
"loss": 0.6438,
"step": 522
},
{
"epoch": 30.823529411764707,
"grad_norm": 1.4584524631500244,
"learning_rate": 9.664829270705638e-06,
"loss": 0.7003,
"step": 524
},
{
"epoch": 30.941176470588236,
"grad_norm": 1.637763500213623,
"learning_rate": 9.659884286783052e-06,
"loss": 0.6846,
"step": 526
},
{
"epoch": 31.0,
"eval_loss": 0.9096066355705261,
"eval_runtime": 14.5201,
"eval_samples_per_second": 2.479,
"eval_steps_per_second": 2.479,
"step": 527
},
{
"epoch": 31.058823529411764,
"grad_norm": 1.399101972579956,
"learning_rate": 9.654904375222384e-06,
"loss": 0.617,
"step": 528
},
{
"epoch": 31.176470588235293,
"grad_norm": 1.3545421361923218,
"learning_rate": 9.649889573350006e-06,
"loss": 0.6534,
"step": 530
},
{
"epoch": 31.294117647058822,
"grad_norm": 1.4606151580810547,
"learning_rate": 9.644839918753796e-06,
"loss": 0.6815,
"step": 532
},
{
"epoch": 31.41176470588235,
"grad_norm": 1.435264229774475,
"learning_rate": 9.639755449282874e-06,
"loss": 0.6696,
"step": 534
},
{
"epoch": 31.529411764705884,
"grad_norm": 1.2791359424591064,
"learning_rate": 9.634636203047309e-06,
"loss": 0.642,
"step": 536
},
{
"epoch": 31.647058823529413,
"grad_norm": 1.2923133373260498,
"learning_rate": 9.629482218417834e-06,
"loss": 0.712,
"step": 538
},
{
"epoch": 31.764705882352942,
"grad_norm": 1.2450653314590454,
"learning_rate": 9.62429353402556e-06,
"loss": 0.6357,
"step": 540
},
{
"epoch": 31.88235294117647,
"grad_norm": 1.31989586353302,
"learning_rate": 9.619070188761687e-06,
"loss": 0.6692,
"step": 542
},
{
"epoch": 32.0,
"grad_norm": 1.3321213722229004,
"learning_rate": 9.613812221777212e-06,
"loss": 0.6424,
"step": 544
},
{
"epoch": 32.0,
"eval_loss": 0.9172940850257874,
"eval_runtime": 14.4984,
"eval_samples_per_second": 2.483,
"eval_steps_per_second": 2.483,
"step": 544
},
{
"epoch": 32.11764705882353,
"grad_norm": 1.2186630964279175,
"learning_rate": 9.608519672482635e-06,
"loss": 0.5872,
"step": 546
},
{
"epoch": 32.23529411764706,
"grad_norm": 1.5495742559432983,
"learning_rate": 9.603192580547664e-06,
"loss": 0.6069,
"step": 548
},
{
"epoch": 32.35294117647059,
"grad_norm": 1.551956295967102,
"learning_rate": 9.597830985900913e-06,
"loss": 0.6971,
"step": 550
},
{
"epoch": 32.470588235294116,
"grad_norm": 1.5809985399246216,
"learning_rate": 9.592434928729617e-06,
"loss": 0.6887,
"step": 552
},
{
"epoch": 32.588235294117645,
"grad_norm": 1.5837764739990234,
"learning_rate": 9.58700444947931e-06,
"loss": 0.6228,
"step": 554
},
{
"epoch": 32.705882352941174,
"grad_norm": 1.4612311124801636,
"learning_rate": 9.581539588853539e-06,
"loss": 0.6002,
"step": 556
},
{
"epoch": 32.8235294117647,
"grad_norm": 1.4830561876296997,
"learning_rate": 9.576040387813553e-06,
"loss": 0.6673,
"step": 558
},
{
"epoch": 32.94117647058823,
"grad_norm": 1.5311380624771118,
"learning_rate": 9.570506887577994e-06,
"loss": 0.6598,
"step": 560
},
{
"epoch": 33.0,
"eval_loss": 0.9237830638885498,
"eval_runtime": 14.5075,
"eval_samples_per_second": 2.481,
"eval_steps_per_second": 2.481,
"step": 561
},
{
"epoch": 33.05882352941177,
"grad_norm": 1.3405797481536865,
"learning_rate": 9.564939129622591e-06,
"loss": 0.6105,
"step": 562
},
{
"epoch": 33.1764705882353,
"grad_norm": 1.4336148500442505,
"learning_rate": 9.559337155679843e-06,
"loss": 0.572,
"step": 564
},
{
"epoch": 33.294117647058826,
"grad_norm": 1.4750621318817139,
"learning_rate": 9.553701007738717e-06,
"loss": 0.5598,
"step": 566
},
{
"epoch": 33.411764705882355,
"grad_norm": 1.4853854179382324,
"learning_rate": 9.54803072804433e-06,
"loss": 0.6175,
"step": 568
},
{
"epoch": 33.529411764705884,
"grad_norm": 1.5611326694488525,
"learning_rate": 9.542326359097619e-06,
"loss": 0.5898,
"step": 570
},
{
"epoch": 33.64705882352941,
"grad_norm": 1.4341068267822266,
"learning_rate": 9.536587943655043e-06,
"loss": 0.6158,
"step": 572
},
{
"epoch": 33.76470588235294,
"grad_norm": 1.3872367143630981,
"learning_rate": 9.530815524728245e-06,
"loss": 0.6776,
"step": 574
},
{
"epoch": 33.88235294117647,
"grad_norm": 1.3841159343719482,
"learning_rate": 9.525009145583746e-06,
"loss": 0.6208,
"step": 576
},
{
"epoch": 34.0,
"grad_norm": 1.5026782751083374,
"learning_rate": 9.519168849742603e-06,
"loss": 0.6634,
"step": 578
},
{
"epoch": 34.0,
"eval_loss": 0.9289535880088806,
"eval_runtime": 14.5116,
"eval_samples_per_second": 2.481,
"eval_steps_per_second": 2.481,
"step": 578
},
{
"epoch": 34.11764705882353,
"grad_norm": 1.5542646646499634,
"learning_rate": 9.5132946809801e-06,
"loss": 0.6259,
"step": 580
},
{
"epoch": 34.23529411764706,
"grad_norm": 1.337219476699829,
"learning_rate": 9.507386683325404e-06,
"loss": 0.5992,
"step": 582
},
{
"epoch": 34.35294117647059,
"grad_norm": 1.744362235069275,
"learning_rate": 9.501444901061248e-06,
"loss": 0.5903,
"step": 584
},
{
"epoch": 34.470588235294116,
"grad_norm": 1.5578619241714478,
"learning_rate": 9.495469378723592e-06,
"loss": 0.5371,
"step": 586
},
{
"epoch": 34.588235294117645,
"grad_norm": 1.679646611213684,
"learning_rate": 9.489460161101291e-06,
"loss": 0.617,
"step": 588
},
{
"epoch": 34.705882352941174,
"grad_norm": 1.5505824089050293,
"learning_rate": 9.483417293235759e-06,
"loss": 0.6008,
"step": 590
},
{
"epoch": 34.8235294117647,
"grad_norm": 1.9452924728393555,
"learning_rate": 9.477340820420633e-06,
"loss": 0.5852,
"step": 592
},
{
"epoch": 34.94117647058823,
"grad_norm": 1.5196162462234497,
"learning_rate": 9.471230788201429e-06,
"loss": 0.5893,
"step": 594
},
{
"epoch": 35.0,
"eval_loss": 0.9399586915969849,
"eval_runtime": 14.5128,
"eval_samples_per_second": 2.481,
"eval_steps_per_second": 2.481,
"step": 595
},
{
"epoch": 35.0,
"step": 595,
"total_flos": 7.576813686279373e+16,
"train_loss": 1.1527870081052058,
"train_runtime": 5617.142,
"train_samples_per_second": 3.632,
"train_steps_per_second": 0.454
}
],
"logging_steps": 2,
"max_steps": 2550,
"num_input_tokens_seen": 0,
"num_train_epochs": 150,
"save_steps": 25,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 7,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.576813686279373e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}