|
{ |
|
"best_metric": 0.8991827368736267, |
|
"best_model_checkpoint": "data/Llama-31-8B_task-1_180-samples_config-3_full/checkpoint-476", |
|
"epoch": 35.0, |
|
"eval_steps": 500, |
|
"global_step": 595, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.058823529411764705, |
|
"grad_norm": 1.858755111694336, |
|
"learning_rate": 3.9215686274509804e-08, |
|
"loss": 2.4463, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.11764705882352941, |
|
"grad_norm": 1.8338021039962769, |
|
"learning_rate": 7.843137254901961e-08, |
|
"loss": 2.4612, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.23529411764705882, |
|
"grad_norm": 1.7695642709732056, |
|
"learning_rate": 1.5686274509803921e-07, |
|
"loss": 2.3799, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.35294117647058826, |
|
"grad_norm": 1.7253705263137817, |
|
"learning_rate": 2.3529411764705883e-07, |
|
"loss": 2.4519, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.47058823529411764, |
|
"grad_norm": 2.068340301513672, |
|
"learning_rate": 3.1372549019607843e-07, |
|
"loss": 2.4357, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.5882352941176471, |
|
"grad_norm": 1.71905517578125, |
|
"learning_rate": 3.921568627450981e-07, |
|
"loss": 2.4114, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.7058823529411765, |
|
"grad_norm": 1.9960722923278809, |
|
"learning_rate": 4.7058823529411767e-07, |
|
"loss": 2.5452, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.8235294117647058, |
|
"grad_norm": 1.846751093864441, |
|
"learning_rate": 5.490196078431373e-07, |
|
"loss": 2.4838, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.9411764705882353, |
|
"grad_norm": 1.7892955541610718, |
|
"learning_rate": 6.274509803921569e-07, |
|
"loss": 2.4542, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 2.4258534908294678, |
|
"eval_runtime": 14.49, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 1.0588235294117647, |
|
"grad_norm": 1.7462923526763916, |
|
"learning_rate": 7.058823529411766e-07, |
|
"loss": 2.4066, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 1.1764705882352942, |
|
"grad_norm": 1.6423271894454956, |
|
"learning_rate": 7.843137254901962e-07, |
|
"loss": 2.4084, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.2941176470588236, |
|
"grad_norm": 1.6562241315841675, |
|
"learning_rate": 8.627450980392157e-07, |
|
"loss": 2.4685, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 1.4117647058823528, |
|
"grad_norm": 1.5601104497909546, |
|
"learning_rate": 9.411764705882353e-07, |
|
"loss": 2.3986, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 1.5294117647058822, |
|
"grad_norm": 1.689158320426941, |
|
"learning_rate": 1.019607843137255e-06, |
|
"loss": 2.512, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 1.6470588235294117, |
|
"grad_norm": 1.7012155055999756, |
|
"learning_rate": 1.0980392156862745e-06, |
|
"loss": 2.434, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 1.7647058823529411, |
|
"grad_norm": 1.533742070198059, |
|
"learning_rate": 1.1764705882352942e-06, |
|
"loss": 2.4145, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.8823529411764706, |
|
"grad_norm": 1.6920032501220703, |
|
"learning_rate": 1.2549019607843137e-06, |
|
"loss": 2.4019, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.5552300214767456, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 2.4022, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 2.3881916999816895, |
|
"eval_runtime": 14.4942, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 2.1176470588235294, |
|
"grad_norm": 1.5124330520629883, |
|
"learning_rate": 1.4117647058823531e-06, |
|
"loss": 2.3961, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 2.235294117647059, |
|
"grad_norm": 1.4209071397781372, |
|
"learning_rate": 1.4901960784313726e-06, |
|
"loss": 2.4343, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 2.3529411764705883, |
|
"grad_norm": 1.6290644407272339, |
|
"learning_rate": 1.5686274509803923e-06, |
|
"loss": 2.3528, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 2.4705882352941178, |
|
"grad_norm": 1.5042191743850708, |
|
"learning_rate": 1.6470588235294118e-06, |
|
"loss": 2.3555, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 2.588235294117647, |
|
"grad_norm": 1.6211644411087036, |
|
"learning_rate": 1.7254901960784315e-06, |
|
"loss": 2.3944, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 2.7058823529411766, |
|
"grad_norm": 1.5416394472122192, |
|
"learning_rate": 1.8039215686274512e-06, |
|
"loss": 2.3917, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 2.8235294117647056, |
|
"grad_norm": 1.697242259979248, |
|
"learning_rate": 1.8823529411764707e-06, |
|
"loss": 2.3457, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 2.9411764705882355, |
|
"grad_norm": 1.8130015134811401, |
|
"learning_rate": 1.96078431372549e-06, |
|
"loss": 2.3317, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 2.314044952392578, |
|
"eval_runtime": 14.4935, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 3.0588235294117645, |
|
"grad_norm": 1.830562710762024, |
|
"learning_rate": 2.03921568627451e-06, |
|
"loss": 2.2938, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 3.176470588235294, |
|
"grad_norm": 1.8372972011566162, |
|
"learning_rate": 2.1176470588235296e-06, |
|
"loss": 2.3541, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 3.2941176470588234, |
|
"grad_norm": 1.8857609033584595, |
|
"learning_rate": 2.196078431372549e-06, |
|
"loss": 2.2888, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 3.411764705882353, |
|
"grad_norm": 1.7633429765701294, |
|
"learning_rate": 2.274509803921569e-06, |
|
"loss": 2.2616, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 3.5294117647058822, |
|
"grad_norm": 1.5656747817993164, |
|
"learning_rate": 2.3529411764705885e-06, |
|
"loss": 2.2801, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 3.6470588235294117, |
|
"grad_norm": 1.6285021305084229, |
|
"learning_rate": 2.431372549019608e-06, |
|
"loss": 2.3144, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 3.764705882352941, |
|
"grad_norm": 1.531112790107727, |
|
"learning_rate": 2.5098039215686274e-06, |
|
"loss": 2.2294, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 3.8823529411764706, |
|
"grad_norm": 1.527350664138794, |
|
"learning_rate": 2.5882352941176473e-06, |
|
"loss": 2.1976, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 2.009220838546753, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 2.2607, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 2.2050342559814453, |
|
"eval_runtime": 14.4977, |
|
"eval_samples_per_second": 2.483, |
|
"eval_steps_per_second": 2.483, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 4.117647058823529, |
|
"grad_norm": 2.025496006011963, |
|
"learning_rate": 2.7450980392156867e-06, |
|
"loss": 2.1834, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 4.235294117647059, |
|
"grad_norm": 1.714682698249817, |
|
"learning_rate": 2.8235294117647062e-06, |
|
"loss": 2.1785, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 4.352941176470588, |
|
"grad_norm": 1.1758134365081787, |
|
"learning_rate": 2.901960784313726e-06, |
|
"loss": 2.2148, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 4.470588235294118, |
|
"grad_norm": 1.2965394258499146, |
|
"learning_rate": 2.980392156862745e-06, |
|
"loss": 2.1797, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 4.588235294117647, |
|
"grad_norm": 1.1413812637329102, |
|
"learning_rate": 3.058823529411765e-06, |
|
"loss": 2.1743, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 4.705882352941177, |
|
"grad_norm": 1.0636754035949707, |
|
"learning_rate": 3.1372549019607846e-06, |
|
"loss": 2.0559, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 4.823529411764706, |
|
"grad_norm": 1.1126306056976318, |
|
"learning_rate": 3.2156862745098045e-06, |
|
"loss": 2.1355, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 4.9411764705882355, |
|
"grad_norm": 1.1904844045639038, |
|
"learning_rate": 3.2941176470588236e-06, |
|
"loss": 2.1352, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 2.0643370151519775, |
|
"eval_runtime": 14.5066, |
|
"eval_samples_per_second": 2.482, |
|
"eval_steps_per_second": 2.482, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 5.0588235294117645, |
|
"grad_norm": 1.0779309272766113, |
|
"learning_rate": 3.3725490196078435e-06, |
|
"loss": 2.0248, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 5.176470588235294, |
|
"grad_norm": 1.107112169265747, |
|
"learning_rate": 3.450980392156863e-06, |
|
"loss": 2.1049, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 5.294117647058823, |
|
"grad_norm": 0.9876514077186584, |
|
"learning_rate": 3.529411764705883e-06, |
|
"loss": 2.0005, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 5.411764705882353, |
|
"grad_norm": 1.073117733001709, |
|
"learning_rate": 3.6078431372549024e-06, |
|
"loss": 1.9499, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 5.529411764705882, |
|
"grad_norm": 1.0594408512115479, |
|
"learning_rate": 3.6862745098039223e-06, |
|
"loss": 1.9899, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 5.647058823529412, |
|
"grad_norm": 1.0870219469070435, |
|
"learning_rate": 3.7647058823529414e-06, |
|
"loss": 1.9852, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 5.764705882352941, |
|
"grad_norm": 0.9945081472396851, |
|
"learning_rate": 3.843137254901962e-06, |
|
"loss": 1.9981, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 5.882352941176471, |
|
"grad_norm": 0.8944886326789856, |
|
"learning_rate": 3.92156862745098e-06, |
|
"loss": 1.9126, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.8814469575881958, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.9456, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 1.888541340827942, |
|
"eval_runtime": 14.5125, |
|
"eval_samples_per_second": 2.481, |
|
"eval_steps_per_second": 2.481, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 6.117647058823529, |
|
"grad_norm": 0.8919170498847961, |
|
"learning_rate": 4.07843137254902e-06, |
|
"loss": 1.8157, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 6.235294117647059, |
|
"grad_norm": 0.8234829902648926, |
|
"learning_rate": 4.15686274509804e-06, |
|
"loss": 1.9187, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 6.352941176470588, |
|
"grad_norm": 0.8216582536697388, |
|
"learning_rate": 4.235294117647059e-06, |
|
"loss": 1.8121, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 6.470588235294118, |
|
"grad_norm": 0.8760618567466736, |
|
"learning_rate": 4.313725490196079e-06, |
|
"loss": 1.8794, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 6.588235294117647, |
|
"grad_norm": 0.90522301197052, |
|
"learning_rate": 4.392156862745098e-06, |
|
"loss": 1.7899, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 6.705882352941177, |
|
"grad_norm": 0.8919849395751953, |
|
"learning_rate": 4.4705882352941184e-06, |
|
"loss": 1.7929, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 6.823529411764706, |
|
"grad_norm": 1.0193332433700562, |
|
"learning_rate": 4.549019607843138e-06, |
|
"loss": 1.7409, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 6.9411764705882355, |
|
"grad_norm": 0.9497600793838501, |
|
"learning_rate": 4.627450980392157e-06, |
|
"loss": 1.7528, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 1.7024633884429932, |
|
"eval_runtime": 14.5072, |
|
"eval_samples_per_second": 2.482, |
|
"eval_steps_per_second": 2.482, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 7.0588235294117645, |
|
"grad_norm": 0.9311454892158508, |
|
"learning_rate": 4.705882352941177e-06, |
|
"loss": 1.7333, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 7.176470588235294, |
|
"grad_norm": 1.0313152074813843, |
|
"learning_rate": 4.784313725490196e-06, |
|
"loss": 1.7217, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 7.294117647058823, |
|
"grad_norm": 1.1278079748153687, |
|
"learning_rate": 4.862745098039216e-06, |
|
"loss": 1.6414, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 7.411764705882353, |
|
"grad_norm": 0.9751306176185608, |
|
"learning_rate": 4.941176470588236e-06, |
|
"loss": 1.6047, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 7.529411764705882, |
|
"grad_norm": 0.9619643688201904, |
|
"learning_rate": 5.019607843137255e-06, |
|
"loss": 1.5733, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 7.647058823529412, |
|
"grad_norm": 0.9418209195137024, |
|
"learning_rate": 5.098039215686274e-06, |
|
"loss": 1.5655, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 7.764705882352941, |
|
"grad_norm": 0.986770749092102, |
|
"learning_rate": 5.176470588235295e-06, |
|
"loss": 1.5325, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 7.882352941176471, |
|
"grad_norm": 0.8657909631729126, |
|
"learning_rate": 5.254901960784314e-06, |
|
"loss": 1.5042, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.8987972736358643, |
|
"learning_rate": 5.333333333333334e-06, |
|
"loss": 1.4935, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 1.467383861541748, |
|
"eval_runtime": 14.5108, |
|
"eval_samples_per_second": 2.481, |
|
"eval_steps_per_second": 2.481, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 8.117647058823529, |
|
"grad_norm": 0.8275275826454163, |
|
"learning_rate": 5.411764705882353e-06, |
|
"loss": 1.4215, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 8.235294117647058, |
|
"grad_norm": 0.9540057182312012, |
|
"learning_rate": 5.4901960784313735e-06, |
|
"loss": 1.4698, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 8.352941176470589, |
|
"grad_norm": 0.9684072136878967, |
|
"learning_rate": 5.568627450980393e-06, |
|
"loss": 1.4359, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 8.470588235294118, |
|
"grad_norm": 0.9229031801223755, |
|
"learning_rate": 5.6470588235294125e-06, |
|
"loss": 1.3994, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 8.588235294117647, |
|
"grad_norm": 0.8458110094070435, |
|
"learning_rate": 5.725490196078431e-06, |
|
"loss": 1.3096, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 8.705882352941176, |
|
"grad_norm": 0.9069352746009827, |
|
"learning_rate": 5.803921568627452e-06, |
|
"loss": 1.2347, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 8.823529411764707, |
|
"grad_norm": 0.8469833731651306, |
|
"learning_rate": 5.882352941176471e-06, |
|
"loss": 1.332, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 8.941176470588236, |
|
"grad_norm": 0.8933460116386414, |
|
"learning_rate": 5.96078431372549e-06, |
|
"loss": 1.2733, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 1.2421215772628784, |
|
"eval_runtime": 14.5471, |
|
"eval_samples_per_second": 2.475, |
|
"eval_steps_per_second": 2.475, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 9.058823529411764, |
|
"grad_norm": 0.8019786477088928, |
|
"learning_rate": 6.03921568627451e-06, |
|
"loss": 1.1929, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 9.176470588235293, |
|
"grad_norm": 0.7300643920898438, |
|
"learning_rate": 6.11764705882353e-06, |
|
"loss": 1.2392, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 9.294117647058824, |
|
"grad_norm": 0.809948742389679, |
|
"learning_rate": 6.19607843137255e-06, |
|
"loss": 1.1685, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 9.411764705882353, |
|
"grad_norm": 0.6852974891662598, |
|
"learning_rate": 6.274509803921569e-06, |
|
"loss": 1.168, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 9.529411764705882, |
|
"grad_norm": 0.709697961807251, |
|
"learning_rate": 6.352941176470589e-06, |
|
"loss": 1.1333, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 9.647058823529411, |
|
"grad_norm": 0.7923583388328552, |
|
"learning_rate": 6.431372549019609e-06, |
|
"loss": 1.1475, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 9.764705882352942, |
|
"grad_norm": 0.7233794927597046, |
|
"learning_rate": 6.5098039215686285e-06, |
|
"loss": 1.1775, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 9.882352941176471, |
|
"grad_norm": 0.7074316740036011, |
|
"learning_rate": 6.588235294117647e-06, |
|
"loss": 1.1279, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.6581458449363708, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 1.1154, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 1.1133772134780884, |
|
"eval_runtime": 14.5122, |
|
"eval_samples_per_second": 2.481, |
|
"eval_steps_per_second": 2.481, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 10.117647058823529, |
|
"grad_norm": 0.6955820918083191, |
|
"learning_rate": 6.745098039215687e-06, |
|
"loss": 1.0662, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 10.235294117647058, |
|
"grad_norm": 0.5870165824890137, |
|
"learning_rate": 6.8235294117647065e-06, |
|
"loss": 1.0219, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 10.352941176470589, |
|
"grad_norm": 0.6177704334259033, |
|
"learning_rate": 6.901960784313726e-06, |
|
"loss": 1.0537, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 10.470588235294118, |
|
"grad_norm": 0.6390775442123413, |
|
"learning_rate": 6.9803921568627454e-06, |
|
"loss": 1.1001, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 10.588235294117647, |
|
"grad_norm": 0.4973801374435425, |
|
"learning_rate": 7.058823529411766e-06, |
|
"loss": 1.0578, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 10.705882352941176, |
|
"grad_norm": 0.518943190574646, |
|
"learning_rate": 7.137254901960785e-06, |
|
"loss": 1.1447, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 10.823529411764707, |
|
"grad_norm": 0.6414965987205505, |
|
"learning_rate": 7.215686274509805e-06, |
|
"loss": 1.0872, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 10.941176470588236, |
|
"grad_norm": 0.508786678314209, |
|
"learning_rate": 7.294117647058823e-06, |
|
"loss": 1.1202, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 1.0689375400543213, |
|
"eval_runtime": 14.505, |
|
"eval_samples_per_second": 2.482, |
|
"eval_steps_per_second": 2.482, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 11.058823529411764, |
|
"grad_norm": 0.48530295491218567, |
|
"learning_rate": 7.372549019607845e-06, |
|
"loss": 1.0999, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 11.176470588235293, |
|
"grad_norm": 0.5133592486381531, |
|
"learning_rate": 7.450980392156863e-06, |
|
"loss": 1.0864, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 11.294117647058824, |
|
"grad_norm": 0.49263596534729004, |
|
"learning_rate": 7.529411764705883e-06, |
|
"loss": 1.0535, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 11.411764705882353, |
|
"grad_norm": 0.4610048532485962, |
|
"learning_rate": 7.607843137254902e-06, |
|
"loss": 1.0462, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 11.529411764705882, |
|
"grad_norm": 0.5121297836303711, |
|
"learning_rate": 7.686274509803923e-06, |
|
"loss": 1.0862, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 11.647058823529411, |
|
"grad_norm": 0.5441015958786011, |
|
"learning_rate": 7.764705882352941e-06, |
|
"loss": 1.0068, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 11.764705882352942, |
|
"grad_norm": 0.5135095119476318, |
|
"learning_rate": 7.84313725490196e-06, |
|
"loss": 1.0548, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 11.882352941176471, |
|
"grad_norm": 0.4792177081108093, |
|
"learning_rate": 7.92156862745098e-06, |
|
"loss": 0.9711, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 0.45314979553222656, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.9449, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 1.0450434684753418, |
|
"eval_runtime": 14.5066, |
|
"eval_samples_per_second": 2.482, |
|
"eval_steps_per_second": 2.482, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 12.117647058823529, |
|
"grad_norm": 0.5007625818252563, |
|
"learning_rate": 8.07843137254902e-06, |
|
"loss": 1.0258, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 12.235294117647058, |
|
"grad_norm": 0.5184361934661865, |
|
"learning_rate": 8.15686274509804e-06, |
|
"loss": 1.0845, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 12.352941176470589, |
|
"grad_norm": 0.44266751408576965, |
|
"learning_rate": 8.23529411764706e-06, |
|
"loss": 1.0005, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 12.470588235294118, |
|
"grad_norm": 0.5165805220603943, |
|
"learning_rate": 8.31372549019608e-06, |
|
"loss": 1.0242, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 12.588235294117647, |
|
"grad_norm": 0.5037981867790222, |
|
"learning_rate": 8.392156862745099e-06, |
|
"loss": 0.9857, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 12.705882352941176, |
|
"grad_norm": 0.5604737997055054, |
|
"learning_rate": 8.470588235294118e-06, |
|
"loss": 1.0086, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 12.823529411764707, |
|
"grad_norm": 0.6752682328224182, |
|
"learning_rate": 8.549019607843138e-06, |
|
"loss": 1.0277, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 12.941176470588236, |
|
"grad_norm": 0.5517321228981018, |
|
"learning_rate": 8.627450980392157e-06, |
|
"loss": 0.9973, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 1.0252662897109985, |
|
"eval_runtime": 14.5028, |
|
"eval_samples_per_second": 2.482, |
|
"eval_steps_per_second": 2.482, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 13.058823529411764, |
|
"grad_norm": 0.6104453802108765, |
|
"learning_rate": 8.705882352941177e-06, |
|
"loss": 0.9835, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 13.176470588235293, |
|
"grad_norm": 0.47119539976119995, |
|
"learning_rate": 8.784313725490196e-06, |
|
"loss": 1.0076, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 13.294117647058824, |
|
"grad_norm": 0.4882214367389679, |
|
"learning_rate": 8.862745098039216e-06, |
|
"loss": 0.9808, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 13.411764705882353, |
|
"grad_norm": 0.7123433947563171, |
|
"learning_rate": 8.941176470588237e-06, |
|
"loss": 0.9676, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 13.529411764705882, |
|
"grad_norm": 0.5918748378753662, |
|
"learning_rate": 9.019607843137256e-06, |
|
"loss": 1.0068, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 13.647058823529411, |
|
"grad_norm": 0.5302197337150574, |
|
"learning_rate": 9.098039215686276e-06, |
|
"loss": 0.9573, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 13.764705882352942, |
|
"grad_norm": 0.5693833827972412, |
|
"learning_rate": 9.176470588235294e-06, |
|
"loss": 0.9914, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 13.882352941176471, |
|
"grad_norm": 0.490904837846756, |
|
"learning_rate": 9.254901960784315e-06, |
|
"loss": 1.032, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 0.5507678389549255, |
|
"learning_rate": 9.333333333333334e-06, |
|
"loss": 1.0562, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 1.0090599060058594, |
|
"eval_runtime": 14.506, |
|
"eval_samples_per_second": 2.482, |
|
"eval_steps_per_second": 2.482, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 14.117647058823529, |
|
"grad_norm": 0.6389086246490479, |
|
"learning_rate": 9.411764705882354e-06, |
|
"loss": 0.9853, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 14.235294117647058, |
|
"grad_norm": 0.5049781203269958, |
|
"learning_rate": 9.490196078431373e-06, |
|
"loss": 1.0067, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 14.352941176470589, |
|
"grad_norm": 0.7086266279220581, |
|
"learning_rate": 9.568627450980393e-06, |
|
"loss": 0.9387, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 14.470588235294118, |
|
"grad_norm": 0.5628448128700256, |
|
"learning_rate": 9.647058823529412e-06, |
|
"loss": 1.0068, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 14.588235294117647, |
|
"grad_norm": 0.6910731196403503, |
|
"learning_rate": 9.725490196078432e-06, |
|
"loss": 1.0007, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 14.705882352941176, |
|
"grad_norm": 0.6134346127510071, |
|
"learning_rate": 9.803921568627451e-06, |
|
"loss": 0.9456, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 14.823529411764707, |
|
"grad_norm": 0.6747128963470459, |
|
"learning_rate": 9.882352941176472e-06, |
|
"loss": 0.9506, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 14.941176470588236, |
|
"grad_norm": 0.5889897346496582, |
|
"learning_rate": 9.960784313725492e-06, |
|
"loss": 0.9947, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_loss": 0.9928128719329834, |
|
"eval_runtime": 14.4936, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 15.058823529411764, |
|
"grad_norm": 0.5487807989120483, |
|
"learning_rate": 9.999995315380667e-06, |
|
"loss": 0.9354, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 15.176470588235293, |
|
"grad_norm": 0.6178866624832153, |
|
"learning_rate": 9.99995783847866e-06, |
|
"loss": 0.9655, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 15.294117647058824, |
|
"grad_norm": 0.5696916580200195, |
|
"learning_rate": 9.999882884955554e-06, |
|
"loss": 0.9468, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 15.411764705882353, |
|
"grad_norm": 0.6009863615036011, |
|
"learning_rate": 9.99977045537315e-06, |
|
"loss": 0.9852, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 15.529411764705882, |
|
"grad_norm": 0.6040264368057251, |
|
"learning_rate": 9.999620550574155e-06, |
|
"loss": 0.9553, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 15.647058823529411, |
|
"grad_norm": 0.6321269869804382, |
|
"learning_rate": 9.999433171682158e-06, |
|
"loss": 0.9419, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 15.764705882352942, |
|
"grad_norm": 0.6273146867752075, |
|
"learning_rate": 9.999208320101643e-06, |
|
"loss": 0.9715, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 15.882352941176471, |
|
"grad_norm": 0.6734570860862732, |
|
"learning_rate": 9.998945997517957e-06, |
|
"loss": 0.918, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 0.7102432250976562, |
|
"learning_rate": 9.99864620589731e-06, |
|
"loss": 1.0096, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 0.9803969264030457, |
|
"eval_runtime": 14.5083, |
|
"eval_samples_per_second": 2.481, |
|
"eval_steps_per_second": 2.481, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 16.11764705882353, |
|
"grad_norm": 0.6154859066009521, |
|
"learning_rate": 9.998308947486753e-06, |
|
"loss": 0.8898, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 16.235294117647058, |
|
"grad_norm": 0.6435267329216003, |
|
"learning_rate": 9.997934224814173e-06, |
|
"loss": 0.9271, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 16.352941176470587, |
|
"grad_norm": 0.7057787775993347, |
|
"learning_rate": 9.997522040688258e-06, |
|
"loss": 1.0, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 16.470588235294116, |
|
"grad_norm": 0.6257563233375549, |
|
"learning_rate": 9.997072398198492e-06, |
|
"loss": 0.973, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 16.58823529411765, |
|
"grad_norm": 0.6798095703125, |
|
"learning_rate": 9.996585300715117e-06, |
|
"loss": 0.9625, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 16.705882352941178, |
|
"grad_norm": 0.7027468681335449, |
|
"learning_rate": 9.996060751889114e-06, |
|
"loss": 0.9529, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 16.823529411764707, |
|
"grad_norm": 0.6210634708404541, |
|
"learning_rate": 9.995498755652186e-06, |
|
"loss": 0.8968, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 16.941176470588236, |
|
"grad_norm": 0.6995490789413452, |
|
"learning_rate": 9.994899316216709e-06, |
|
"loss": 0.9222, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_loss": 0.9691942930221558, |
|
"eval_runtime": 14.5044, |
|
"eval_samples_per_second": 2.482, |
|
"eval_steps_per_second": 2.482, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 17.058823529411764, |
|
"grad_norm": 0.6503624320030212, |
|
"learning_rate": 9.994262438075713e-06, |
|
"loss": 0.9487, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 17.176470588235293, |
|
"grad_norm": 0.6647483706474304, |
|
"learning_rate": 9.993588126002848e-06, |
|
"loss": 0.9163, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 17.294117647058822, |
|
"grad_norm": 0.7215944528579712, |
|
"learning_rate": 9.992876385052346e-06, |
|
"loss": 0.8638, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 17.41176470588235, |
|
"grad_norm": 0.7234969139099121, |
|
"learning_rate": 9.992127220558976e-06, |
|
"loss": 0.9037, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 17.529411764705884, |
|
"grad_norm": 0.7656229138374329, |
|
"learning_rate": 9.991340638138022e-06, |
|
"loss": 0.9633, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 17.647058823529413, |
|
"grad_norm": 0.6850258111953735, |
|
"learning_rate": 9.990516643685222e-06, |
|
"loss": 0.9458, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 17.764705882352942, |
|
"grad_norm": 0.7975447773933411, |
|
"learning_rate": 9.98965524337673e-06, |
|
"loss": 0.9801, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 17.88235294117647, |
|
"grad_norm": 0.7075424790382385, |
|
"learning_rate": 9.988756443669081e-06, |
|
"loss": 0.888, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 0.85096675157547, |
|
"learning_rate": 9.987820251299121e-06, |
|
"loss": 0.8838, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_loss": 0.9602956771850586, |
|
"eval_runtime": 14.5129, |
|
"eval_samples_per_second": 2.481, |
|
"eval_steps_per_second": 2.481, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 18.11764705882353, |
|
"grad_norm": 0.698685884475708, |
|
"learning_rate": 9.98684667328398e-06, |
|
"loss": 0.8838, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 18.235294117647058, |
|
"grad_norm": 0.7671274542808533, |
|
"learning_rate": 9.985835716921e-06, |
|
"loss": 0.9012, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 18.352941176470587, |
|
"grad_norm": 0.8342521786689758, |
|
"learning_rate": 9.984787389787689e-06, |
|
"loss": 0.9412, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 18.470588235294116, |
|
"grad_norm": 0.6886960864067078, |
|
"learning_rate": 9.983701699741668e-06, |
|
"loss": 0.8946, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 18.58823529411765, |
|
"grad_norm": 0.7856888175010681, |
|
"learning_rate": 9.982578654920601e-06, |
|
"loss": 0.9169, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 18.705882352941178, |
|
"grad_norm": 0.7338317036628723, |
|
"learning_rate": 9.981418263742148e-06, |
|
"loss": 0.8584, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 18.823529411764707, |
|
"grad_norm": 0.727165699005127, |
|
"learning_rate": 9.980220534903889e-06, |
|
"loss": 0.9385, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 18.941176470588236, |
|
"grad_norm": 0.777866542339325, |
|
"learning_rate": 9.978985477383264e-06, |
|
"loss": 0.8942, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_loss": 0.951096773147583, |
|
"eval_runtime": 14.4924, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 19.058823529411764, |
|
"grad_norm": 0.6845978498458862, |
|
"learning_rate": 9.97771310043751e-06, |
|
"loss": 0.8752, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 19.176470588235293, |
|
"grad_norm": 0.7632399201393127, |
|
"learning_rate": 9.97640341360358e-06, |
|
"loss": 0.9616, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 19.294117647058822, |
|
"grad_norm": 0.7852567434310913, |
|
"learning_rate": 9.975056426698094e-06, |
|
"loss": 0.8884, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 19.41176470588235, |
|
"grad_norm": 0.7355157136917114, |
|
"learning_rate": 9.973672149817232e-06, |
|
"loss": 0.8175, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 19.529411764705884, |
|
"grad_norm": 0.7707788348197937, |
|
"learning_rate": 9.972250593336689e-06, |
|
"loss": 0.8878, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 19.647058823529413, |
|
"grad_norm": 1.0082019567489624, |
|
"learning_rate": 9.970791767911581e-06, |
|
"loss": 0.9118, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 19.764705882352942, |
|
"grad_norm": 0.8013073205947876, |
|
"learning_rate": 9.96929568447637e-06, |
|
"loss": 0.8724, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 19.88235294117647, |
|
"grad_norm": 0.6911207437515259, |
|
"learning_rate": 9.967762354244778e-06, |
|
"loss": 0.8832, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 0.8336138725280762, |
|
"learning_rate": 9.966191788709716e-06, |
|
"loss": 0.9058, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 0.9431850910186768, |
|
"eval_runtime": 14.5083, |
|
"eval_samples_per_second": 2.481, |
|
"eval_steps_per_second": 2.481, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 20.11764705882353, |
|
"grad_norm": 0.7745249271392822, |
|
"learning_rate": 9.964583999643174e-06, |
|
"loss": 0.878, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 20.235294117647058, |
|
"grad_norm": 0.7922182083129883, |
|
"learning_rate": 9.962938999096159e-06, |
|
"loss": 0.8275, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 20.352941176470587, |
|
"grad_norm": 0.8610040545463562, |
|
"learning_rate": 9.961256799398584e-06, |
|
"loss": 0.94, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 20.470588235294116, |
|
"grad_norm": 0.9406768083572388, |
|
"learning_rate": 9.95953741315919e-06, |
|
"loss": 0.8779, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 20.58823529411765, |
|
"grad_norm": 0.8344603180885315, |
|
"learning_rate": 9.957780853265441e-06, |
|
"loss": 0.8318, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 20.705882352941178, |
|
"grad_norm": 0.8624390363693237, |
|
"learning_rate": 9.955987132883435e-06, |
|
"loss": 0.8644, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 20.823529411764707, |
|
"grad_norm": 0.7996507287025452, |
|
"learning_rate": 9.954156265457801e-06, |
|
"loss": 0.8656, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 20.941176470588236, |
|
"grad_norm": 0.9234054684638977, |
|
"learning_rate": 9.952288264711601e-06, |
|
"loss": 0.8837, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_loss": 0.9354000091552734, |
|
"eval_runtime": 14.5044, |
|
"eval_samples_per_second": 2.482, |
|
"eval_steps_per_second": 2.482, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 21.058823529411764, |
|
"grad_norm": 0.793875515460968, |
|
"learning_rate": 9.950383144646221e-06, |
|
"loss": 0.8662, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 21.176470588235293, |
|
"grad_norm": 0.8161793947219849, |
|
"learning_rate": 9.948440919541277e-06, |
|
"loss": 0.8713, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 21.294117647058822, |
|
"grad_norm": 0.9452466368675232, |
|
"learning_rate": 9.946461603954499e-06, |
|
"loss": 0.9299, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 21.41176470588235, |
|
"grad_norm": 0.8712689876556396, |
|
"learning_rate": 9.944445212721619e-06, |
|
"loss": 0.84, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 21.529411764705884, |
|
"grad_norm": 0.8613099455833435, |
|
"learning_rate": 9.942391760956277e-06, |
|
"loss": 0.8523, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 21.647058823529413, |
|
"grad_norm": 1.0285900831222534, |
|
"learning_rate": 9.940301264049885e-06, |
|
"loss": 0.8411, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 21.764705882352942, |
|
"grad_norm": 0.9434134364128113, |
|
"learning_rate": 9.938173737671531e-06, |
|
"loss": 0.819, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 21.88235294117647, |
|
"grad_norm": 0.9282283782958984, |
|
"learning_rate": 9.936009197767847e-06, |
|
"loss": 0.8783, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"grad_norm": 0.9603204131126404, |
|
"learning_rate": 9.933807660562898e-06, |
|
"loss": 0.795, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_loss": 0.9314696788787842, |
|
"eval_runtime": 14.5014, |
|
"eval_samples_per_second": 2.483, |
|
"eval_steps_per_second": 2.483, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 22.11764705882353, |
|
"grad_norm": 0.9283419847488403, |
|
"learning_rate": 9.931569142558057e-06, |
|
"loss": 0.8911, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 22.235294117647058, |
|
"grad_norm": 0.985173761844635, |
|
"learning_rate": 9.929293660531889e-06, |
|
"loss": 0.8351, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 22.352941176470587, |
|
"grad_norm": 0.9488443732261658, |
|
"learning_rate": 9.926981231540007e-06, |
|
"loss": 0.8245, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 22.470588235294116, |
|
"grad_norm": 1.0252861976623535, |
|
"learning_rate": 9.924631872914967e-06, |
|
"loss": 0.8096, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 22.58823529411765, |
|
"grad_norm": 0.8986847996711731, |
|
"learning_rate": 9.922245602266119e-06, |
|
"loss": 0.8311, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 22.705882352941178, |
|
"grad_norm": 0.9069613218307495, |
|
"learning_rate": 9.919822437479488e-06, |
|
"loss": 0.7961, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 22.823529411764707, |
|
"grad_norm": 0.8006130456924438, |
|
"learning_rate": 9.91736239671763e-06, |
|
"loss": 0.866, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 22.941176470588236, |
|
"grad_norm": 0.8258039355278015, |
|
"learning_rate": 9.91486549841951e-06, |
|
"loss": 0.8395, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_loss": 0.9243198037147522, |
|
"eval_runtime": 14.5142, |
|
"eval_samples_per_second": 2.48, |
|
"eval_steps_per_second": 2.48, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 23.058823529411764, |
|
"grad_norm": 1.0394818782806396, |
|
"learning_rate": 9.912331761300341e-06, |
|
"loss": 0.787, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 23.176470588235293, |
|
"grad_norm": 0.9367055892944336, |
|
"learning_rate": 9.909761204351469e-06, |
|
"loss": 0.8501, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 23.294117647058822, |
|
"grad_norm": 1.0531871318817139, |
|
"learning_rate": 9.90715384684021e-06, |
|
"loss": 0.8855, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 23.41176470588235, |
|
"grad_norm": 0.9447432160377502, |
|
"learning_rate": 9.904509708309723e-06, |
|
"loss": 0.7717, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 23.529411764705884, |
|
"grad_norm": 0.9809987545013428, |
|
"learning_rate": 9.901828808578846e-06, |
|
"loss": 0.7949, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 23.647058823529413, |
|
"grad_norm": 0.9656151533126831, |
|
"learning_rate": 9.899111167741966e-06, |
|
"loss": 0.8286, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 23.764705882352942, |
|
"grad_norm": 1.0195831060409546, |
|
"learning_rate": 9.896356806168851e-06, |
|
"loss": 0.8478, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 23.88235294117647, |
|
"grad_norm": 1.144056797027588, |
|
"learning_rate": 9.89356574450451e-06, |
|
"loss": 0.7723, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 1.0349133014678955, |
|
"learning_rate": 9.890738003669029e-06, |
|
"loss": 0.8308, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_loss": 0.9168965816497803, |
|
"eval_runtime": 14.494, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 24.11764705882353, |
|
"grad_norm": 1.003952980041504, |
|
"learning_rate": 9.887873604857424e-06, |
|
"loss": 0.8492, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 24.235294117647058, |
|
"grad_norm": 1.1212753057479858, |
|
"learning_rate": 9.884972569539471e-06, |
|
"loss": 0.8037, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 24.352941176470587, |
|
"grad_norm": 0.995343029499054, |
|
"learning_rate": 9.882034919459556e-06, |
|
"loss": 0.765, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 24.470588235294116, |
|
"grad_norm": 1.0651168823242188, |
|
"learning_rate": 9.879060676636502e-06, |
|
"loss": 0.8008, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 24.58823529411765, |
|
"grad_norm": 1.1323087215423584, |
|
"learning_rate": 9.876049863363415e-06, |
|
"loss": 0.8154, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 24.705882352941178, |
|
"grad_norm": 1.118166446685791, |
|
"learning_rate": 9.873002502207502e-06, |
|
"loss": 0.7665, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 24.823529411764707, |
|
"grad_norm": 1.1308856010437012, |
|
"learning_rate": 9.86991861600992e-06, |
|
"loss": 0.8056, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 24.941176470588236, |
|
"grad_norm": 1.0739870071411133, |
|
"learning_rate": 9.866798227885588e-06, |
|
"loss": 0.7863, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_loss": 0.9137818217277527, |
|
"eval_runtime": 14.4961, |
|
"eval_samples_per_second": 2.483, |
|
"eval_steps_per_second": 2.483, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 25.058823529411764, |
|
"grad_norm": 0.947708785533905, |
|
"learning_rate": 9.863641361223025e-06, |
|
"loss": 0.746, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 25.176470588235293, |
|
"grad_norm": 1.226585030555725, |
|
"learning_rate": 9.860448039684169e-06, |
|
"loss": 0.7622, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 25.294117647058822, |
|
"grad_norm": 1.2553542852401733, |
|
"learning_rate": 9.857218287204204e-06, |
|
"loss": 0.784, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 25.41176470588235, |
|
"grad_norm": 1.130286455154419, |
|
"learning_rate": 9.853952127991374e-06, |
|
"loss": 0.78, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 25.529411764705884, |
|
"grad_norm": 1.2538301944732666, |
|
"learning_rate": 9.850649586526808e-06, |
|
"loss": 0.7608, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 25.647058823529413, |
|
"grad_norm": 1.170310378074646, |
|
"learning_rate": 9.847310687564335e-06, |
|
"loss": 0.8389, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 25.764705882352942, |
|
"grad_norm": 0.9732166528701782, |
|
"learning_rate": 9.843935456130295e-06, |
|
"loss": 0.8158, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 25.88235294117647, |
|
"grad_norm": 1.2474738359451294, |
|
"learning_rate": 9.840523917523354e-06, |
|
"loss": 0.7528, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"grad_norm": 1.130893349647522, |
|
"learning_rate": 9.83707609731432e-06, |
|
"loss": 0.7468, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_loss": 0.9068209528923035, |
|
"eval_runtime": 14.5043, |
|
"eval_samples_per_second": 2.482, |
|
"eval_steps_per_second": 2.482, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 26.11764705882353, |
|
"grad_norm": 1.0452311038970947, |
|
"learning_rate": 9.833592021345938e-06, |
|
"loss": 0.7589, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 26.235294117647058, |
|
"grad_norm": 0.9809611439704895, |
|
"learning_rate": 9.830071715732708e-06, |
|
"loss": 0.8016, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 26.352941176470587, |
|
"grad_norm": 1.0656489133834839, |
|
"learning_rate": 9.826515206860683e-06, |
|
"loss": 0.7417, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 26.470588235294116, |
|
"grad_norm": 1.1188890933990479, |
|
"learning_rate": 9.822922521387277e-06, |
|
"loss": 0.7569, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 26.58823529411765, |
|
"grad_norm": 1.087983250617981, |
|
"learning_rate": 9.819293686241057e-06, |
|
"loss": 0.7596, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 26.705882352941178, |
|
"grad_norm": 1.0073840618133545, |
|
"learning_rate": 9.81562872862155e-06, |
|
"loss": 0.7423, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 26.823529411764707, |
|
"grad_norm": 1.0083576440811157, |
|
"learning_rate": 9.811927675999035e-06, |
|
"loss": 0.7533, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 26.941176470588236, |
|
"grad_norm": 1.0545302629470825, |
|
"learning_rate": 9.808190556114333e-06, |
|
"loss": 0.7658, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_loss": 0.9007807970046997, |
|
"eval_runtime": 14.5307, |
|
"eval_samples_per_second": 2.478, |
|
"eval_steps_per_second": 2.478, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 27.058823529411764, |
|
"grad_norm": 0.9539656043052673, |
|
"learning_rate": 9.804417396978605e-06, |
|
"loss": 0.7658, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 27.176470588235293, |
|
"grad_norm": 1.044712781906128, |
|
"learning_rate": 9.800608226873143e-06, |
|
"loss": 0.6566, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 27.294117647058822, |
|
"grad_norm": 1.3112603425979614, |
|
"learning_rate": 9.796763074349147e-06, |
|
"loss": 0.8283, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 27.41176470588235, |
|
"grad_norm": 1.1589727401733398, |
|
"learning_rate": 9.792881968227533e-06, |
|
"loss": 0.6633, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 27.529411764705884, |
|
"grad_norm": 0.9757166504859924, |
|
"learning_rate": 9.788964937598688e-06, |
|
"loss": 0.7725, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 27.647058823529413, |
|
"grad_norm": 1.1313936710357666, |
|
"learning_rate": 9.78501201182228e-06, |
|
"loss": 0.7581, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 27.764705882352942, |
|
"grad_norm": 1.1437342166900635, |
|
"learning_rate": 9.781023220527013e-06, |
|
"loss": 0.7226, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 27.88235294117647, |
|
"grad_norm": 1.1630206108093262, |
|
"learning_rate": 9.776998593610428e-06, |
|
"loss": 0.7693, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"grad_norm": 1.0083279609680176, |
|
"learning_rate": 9.77293816123866e-06, |
|
"loss": 0.7128, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_loss": 0.8991827368736267, |
|
"eval_runtime": 14.5006, |
|
"eval_samples_per_second": 2.483, |
|
"eval_steps_per_second": 2.483, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 28.11764705882353, |
|
"grad_norm": 1.1530383825302124, |
|
"learning_rate": 9.768841953846225e-06, |
|
"loss": 0.6908, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 28.235294117647058, |
|
"grad_norm": 1.0489223003387451, |
|
"learning_rate": 9.764710002135784e-06, |
|
"loss": 0.675, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 28.352941176470587, |
|
"grad_norm": 1.2449612617492676, |
|
"learning_rate": 9.760542337077914e-06, |
|
"loss": 0.7516, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 28.470588235294116, |
|
"grad_norm": 1.1940374374389648, |
|
"learning_rate": 9.75633898991088e-06, |
|
"loss": 0.7681, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 28.58823529411765, |
|
"grad_norm": 1.1063061952590942, |
|
"learning_rate": 9.752099992140401e-06, |
|
"loss": 0.7693, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 28.705882352941178, |
|
"grad_norm": 1.1479785442352295, |
|
"learning_rate": 9.747825375539401e-06, |
|
"loss": 0.7108, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 28.823529411764707, |
|
"grad_norm": 1.2331879138946533, |
|
"learning_rate": 9.743515172147793e-06, |
|
"loss": 0.7786, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 28.941176470588236, |
|
"grad_norm": 1.1679853200912476, |
|
"learning_rate": 9.739169414272219e-06, |
|
"loss": 0.6474, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_loss": 0.906444787979126, |
|
"eval_runtime": 14.4932, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 29.058823529411764, |
|
"grad_norm": 1.0444296598434448, |
|
"learning_rate": 9.734788134485817e-06, |
|
"loss": 0.6756, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 29.176470588235293, |
|
"grad_norm": 1.5380338430404663, |
|
"learning_rate": 9.73037136562798e-06, |
|
"loss": 0.7099, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 29.294117647058822, |
|
"grad_norm": 1.166580080986023, |
|
"learning_rate": 9.7259191408041e-06, |
|
"loss": 0.7595, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 29.41176470588235, |
|
"grad_norm": 1.2345106601715088, |
|
"learning_rate": 9.721431493385322e-06, |
|
"loss": 0.7026, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 29.529411764705884, |
|
"grad_norm": 1.0901451110839844, |
|
"learning_rate": 9.71690845700831e-06, |
|
"loss": 0.6719, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 29.647058823529413, |
|
"grad_norm": 1.1619518995285034, |
|
"learning_rate": 9.71235006557497e-06, |
|
"loss": 0.7517, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 29.764705882352942, |
|
"grad_norm": 1.1259740591049194, |
|
"learning_rate": 9.707756353252213e-06, |
|
"loss": 0.7052, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 29.88235294117647, |
|
"grad_norm": 1.1172682046890259, |
|
"learning_rate": 9.70312735447169e-06, |
|
"loss": 0.655, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"grad_norm": 1.306216835975647, |
|
"learning_rate": 9.698463103929542e-06, |
|
"loss": 0.6387, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_loss": 0.9089268445968628, |
|
"eval_runtime": 14.4939, |
|
"eval_samples_per_second": 2.484, |
|
"eval_steps_per_second": 2.484, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 30.11764705882353, |
|
"grad_norm": 1.258402705192566, |
|
"learning_rate": 9.693763636586135e-06, |
|
"loss": 0.716, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 30.235294117647058, |
|
"grad_norm": 1.143336296081543, |
|
"learning_rate": 9.689028987665797e-06, |
|
"loss": 0.6283, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 30.352941176470587, |
|
"grad_norm": 1.1861103773117065, |
|
"learning_rate": 9.684259192656554e-06, |
|
"loss": 0.6445, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 30.470588235294116, |
|
"grad_norm": 1.2192977666854858, |
|
"learning_rate": 9.679454287309868e-06, |
|
"loss": 0.6928, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 30.58823529411765, |
|
"grad_norm": 1.3194884061813354, |
|
"learning_rate": 9.674614307640368e-06, |
|
"loss": 0.733, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 30.705882352941178, |
|
"grad_norm": 1.3853224515914917, |
|
"learning_rate": 9.669739289925578e-06, |
|
"loss": 0.6438, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 30.823529411764707, |
|
"grad_norm": 1.4584524631500244, |
|
"learning_rate": 9.664829270705638e-06, |
|
"loss": 0.7003, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 30.941176470588236, |
|
"grad_norm": 1.637763500213623, |
|
"learning_rate": 9.659884286783052e-06, |
|
"loss": 0.6846, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_loss": 0.9096066355705261, |
|
"eval_runtime": 14.5201, |
|
"eval_samples_per_second": 2.479, |
|
"eval_steps_per_second": 2.479, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 31.058823529411764, |
|
"grad_norm": 1.399101972579956, |
|
"learning_rate": 9.654904375222384e-06, |
|
"loss": 0.617, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 31.176470588235293, |
|
"grad_norm": 1.3545421361923218, |
|
"learning_rate": 9.649889573350006e-06, |
|
"loss": 0.6534, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 31.294117647058822, |
|
"grad_norm": 1.4606151580810547, |
|
"learning_rate": 9.644839918753796e-06, |
|
"loss": 0.6815, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 31.41176470588235, |
|
"grad_norm": 1.435264229774475, |
|
"learning_rate": 9.639755449282874e-06, |
|
"loss": 0.6696, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 31.529411764705884, |
|
"grad_norm": 1.2791359424591064, |
|
"learning_rate": 9.634636203047309e-06, |
|
"loss": 0.642, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 31.647058823529413, |
|
"grad_norm": 1.2923133373260498, |
|
"learning_rate": 9.629482218417834e-06, |
|
"loss": 0.712, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 31.764705882352942, |
|
"grad_norm": 1.2450653314590454, |
|
"learning_rate": 9.62429353402556e-06, |
|
"loss": 0.6357, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 31.88235294117647, |
|
"grad_norm": 1.31989586353302, |
|
"learning_rate": 9.619070188761687e-06, |
|
"loss": 0.6692, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"grad_norm": 1.3321213722229004, |
|
"learning_rate": 9.613812221777212e-06, |
|
"loss": 0.6424, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_loss": 0.9172940850257874, |
|
"eval_runtime": 14.4984, |
|
"eval_samples_per_second": 2.483, |
|
"eval_steps_per_second": 2.483, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 32.11764705882353, |
|
"grad_norm": 1.2186630964279175, |
|
"learning_rate": 9.608519672482635e-06, |
|
"loss": 0.5872, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 32.23529411764706, |
|
"grad_norm": 1.5495742559432983, |
|
"learning_rate": 9.603192580547664e-06, |
|
"loss": 0.6069, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 32.35294117647059, |
|
"grad_norm": 1.551956295967102, |
|
"learning_rate": 9.597830985900913e-06, |
|
"loss": 0.6971, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 32.470588235294116, |
|
"grad_norm": 1.5809985399246216, |
|
"learning_rate": 9.592434928729617e-06, |
|
"loss": 0.6887, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 32.588235294117645, |
|
"grad_norm": 1.5837764739990234, |
|
"learning_rate": 9.58700444947931e-06, |
|
"loss": 0.6228, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 32.705882352941174, |
|
"grad_norm": 1.4612311124801636, |
|
"learning_rate": 9.581539588853539e-06, |
|
"loss": 0.6002, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 32.8235294117647, |
|
"grad_norm": 1.4830561876296997, |
|
"learning_rate": 9.576040387813553e-06, |
|
"loss": 0.6673, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 32.94117647058823, |
|
"grad_norm": 1.5311380624771118, |
|
"learning_rate": 9.570506887577994e-06, |
|
"loss": 0.6598, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_loss": 0.9237830638885498, |
|
"eval_runtime": 14.5075, |
|
"eval_samples_per_second": 2.481, |
|
"eval_steps_per_second": 2.481, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 33.05882352941177, |
|
"grad_norm": 1.3405797481536865, |
|
"learning_rate": 9.564939129622591e-06, |
|
"loss": 0.6105, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 33.1764705882353, |
|
"grad_norm": 1.4336148500442505, |
|
"learning_rate": 9.559337155679843e-06, |
|
"loss": 0.572, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 33.294117647058826, |
|
"grad_norm": 1.4750621318817139, |
|
"learning_rate": 9.553701007738717e-06, |
|
"loss": 0.5598, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 33.411764705882355, |
|
"grad_norm": 1.4853854179382324, |
|
"learning_rate": 9.54803072804433e-06, |
|
"loss": 0.6175, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 33.529411764705884, |
|
"grad_norm": 1.5611326694488525, |
|
"learning_rate": 9.542326359097619e-06, |
|
"loss": 0.5898, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 33.64705882352941, |
|
"grad_norm": 1.4341068267822266, |
|
"learning_rate": 9.536587943655043e-06, |
|
"loss": 0.6158, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 33.76470588235294, |
|
"grad_norm": 1.3872367143630981, |
|
"learning_rate": 9.530815524728245e-06, |
|
"loss": 0.6776, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 33.88235294117647, |
|
"grad_norm": 1.3841159343719482, |
|
"learning_rate": 9.525009145583746e-06, |
|
"loss": 0.6208, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"grad_norm": 1.5026782751083374, |
|
"learning_rate": 9.519168849742603e-06, |
|
"loss": 0.6634, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_loss": 0.9289535880088806, |
|
"eval_runtime": 14.5116, |
|
"eval_samples_per_second": 2.481, |
|
"eval_steps_per_second": 2.481, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 34.11764705882353, |
|
"grad_norm": 1.5542646646499634, |
|
"learning_rate": 9.5132946809801e-06, |
|
"loss": 0.6259, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 34.23529411764706, |
|
"grad_norm": 1.337219476699829, |
|
"learning_rate": 9.507386683325404e-06, |
|
"loss": 0.5992, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 34.35294117647059, |
|
"grad_norm": 1.744362235069275, |
|
"learning_rate": 9.501444901061248e-06, |
|
"loss": 0.5903, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 34.470588235294116, |
|
"grad_norm": 1.5578619241714478, |
|
"learning_rate": 9.495469378723592e-06, |
|
"loss": 0.5371, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 34.588235294117645, |
|
"grad_norm": 1.679646611213684, |
|
"learning_rate": 9.489460161101291e-06, |
|
"loss": 0.617, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 34.705882352941174, |
|
"grad_norm": 1.5505824089050293, |
|
"learning_rate": 9.483417293235759e-06, |
|
"loss": 0.6008, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 34.8235294117647, |
|
"grad_norm": 1.9452924728393555, |
|
"learning_rate": 9.477340820420633e-06, |
|
"loss": 0.5852, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 34.94117647058823, |
|
"grad_norm": 1.5196162462234497, |
|
"learning_rate": 9.471230788201429e-06, |
|
"loss": 0.5893, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_loss": 0.9399586915969849, |
|
"eval_runtime": 14.5128, |
|
"eval_samples_per_second": 2.481, |
|
"eval_steps_per_second": 2.481, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"step": 595, |
|
"total_flos": 7.576813686279373e+16, |
|
"train_loss": 1.1527870081052058, |
|
"train_runtime": 5617.142, |
|
"train_samples_per_second": 3.632, |
|
"train_steps_per_second": 0.454 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 2550, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 150, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 7, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.576813686279373e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|