|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 8.16326530612245, |
|
"eval_steps": 500, |
|
"global_step": 2000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.04081632653061224, |
|
"grad_norm": 7.114395618438721, |
|
"learning_rate": 9.981632653061225e-06, |
|
"loss": 0.7362, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08163265306122448, |
|
"grad_norm": 11.572301864624023, |
|
"learning_rate": 9.961224489795919e-06, |
|
"loss": 0.8729, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.12244897959183673, |
|
"grad_norm": 9.383491516113281, |
|
"learning_rate": 9.940816326530614e-06, |
|
"loss": 0.773, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.16326530612244897, |
|
"grad_norm": 7.83120059967041, |
|
"learning_rate": 9.920408163265307e-06, |
|
"loss": 0.7817, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.20408163265306123, |
|
"grad_norm": 10.92087173461914, |
|
"learning_rate": 9.9e-06, |
|
"loss": 0.6256, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.24489795918367346, |
|
"grad_norm": 3.8826725482940674, |
|
"learning_rate": 9.879591836734695e-06, |
|
"loss": 0.5759, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 11.15483283996582, |
|
"learning_rate": 9.859183673469388e-06, |
|
"loss": 0.7333, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.32653061224489793, |
|
"grad_norm": 11.470726013183594, |
|
"learning_rate": 9.838775510204083e-06, |
|
"loss": 0.5943, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.3673469387755102, |
|
"grad_norm": 13.159674644470215, |
|
"learning_rate": 9.818367346938777e-06, |
|
"loss": 0.7804, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.40816326530612246, |
|
"grad_norm": 9.58558464050293, |
|
"learning_rate": 9.79795918367347e-06, |
|
"loss": 0.6491, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.4489795918367347, |
|
"grad_norm": 9.653897285461426, |
|
"learning_rate": 9.777551020408163e-06, |
|
"loss": 0.5919, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.4897959183673469, |
|
"grad_norm": 8.117432594299316, |
|
"learning_rate": 9.757142857142858e-06, |
|
"loss": 0.4571, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.5306122448979592, |
|
"grad_norm": 6.9328460693359375, |
|
"learning_rate": 9.736734693877551e-06, |
|
"loss": 0.6597, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 7.962501049041748, |
|
"learning_rate": 9.716326530612246e-06, |
|
"loss": 0.5132, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.6122448979591837, |
|
"grad_norm": 10.508763313293457, |
|
"learning_rate": 9.69591836734694e-06, |
|
"loss": 0.6893, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.6530612244897959, |
|
"grad_norm": 7.637253761291504, |
|
"learning_rate": 9.675510204081635e-06, |
|
"loss": 0.6142, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.6938775510204082, |
|
"grad_norm": 10.0332670211792, |
|
"learning_rate": 9.655102040816328e-06, |
|
"loss": 0.582, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.7346938775510204, |
|
"grad_norm": 8.150875091552734, |
|
"learning_rate": 9.634693877551021e-06, |
|
"loss": 0.477, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.7755102040816326, |
|
"grad_norm": 10.330913543701172, |
|
"learning_rate": 9.614285714285714e-06, |
|
"loss": 0.5916, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.8163265306122449, |
|
"grad_norm": 11.654999732971191, |
|
"learning_rate": 9.593877551020408e-06, |
|
"loss": 0.6236, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": 8.048078536987305, |
|
"learning_rate": 9.573469387755103e-06, |
|
"loss": 0.6142, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.8979591836734694, |
|
"grad_norm": 9.869592666625977, |
|
"learning_rate": 9.553061224489798e-06, |
|
"loss": 0.625, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.9387755102040817, |
|
"grad_norm": 8.321409225463867, |
|
"learning_rate": 9.532653061224491e-06, |
|
"loss": 0.5767, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.9795918367346939, |
|
"grad_norm": 7.6769256591796875, |
|
"learning_rate": 9.512244897959184e-06, |
|
"loss": 0.5134, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.0204081632653061, |
|
"grad_norm": 9.609123229980469, |
|
"learning_rate": 9.491836734693877e-06, |
|
"loss": 0.5868, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.0612244897959184, |
|
"grad_norm": 9.19683837890625, |
|
"learning_rate": 9.471428571428572e-06, |
|
"loss": 0.5215, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.1020408163265305, |
|
"grad_norm": 7.328164577484131, |
|
"learning_rate": 9.451020408163266e-06, |
|
"loss": 0.5422, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.1428571428571428, |
|
"grad_norm": 6.913904190063477, |
|
"learning_rate": 9.430612244897959e-06, |
|
"loss": 0.5214, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.183673469387755, |
|
"grad_norm": 9.28811264038086, |
|
"learning_rate": 9.410204081632654e-06, |
|
"loss": 0.5319, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.2244897959183674, |
|
"grad_norm": 9.132966041564941, |
|
"learning_rate": 9.389795918367349e-06, |
|
"loss": 0.5581, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.2653061224489797, |
|
"grad_norm": 6.9722065925598145, |
|
"learning_rate": 9.369387755102042e-06, |
|
"loss": 0.431, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.306122448979592, |
|
"grad_norm": 5.06177282333374, |
|
"learning_rate": 9.348979591836736e-06, |
|
"loss": 0.4583, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.346938775510204, |
|
"grad_norm": 7.732840538024902, |
|
"learning_rate": 9.328571428571429e-06, |
|
"loss": 0.4194, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.3877551020408163, |
|
"grad_norm": 8.94101333618164, |
|
"learning_rate": 9.308163265306122e-06, |
|
"loss": 0.4519, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 7.5437750816345215, |
|
"learning_rate": 9.287755102040817e-06, |
|
"loss": 0.5095, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.469387755102041, |
|
"grad_norm": 5.702700138092041, |
|
"learning_rate": 9.26734693877551e-06, |
|
"loss": 0.3936, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.510204081632653, |
|
"grad_norm": 9.153871536254883, |
|
"learning_rate": 9.246938775510205e-06, |
|
"loss": 0.4566, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.5510204081632653, |
|
"grad_norm": 13.249794006347656, |
|
"learning_rate": 9.226530612244899e-06, |
|
"loss": 0.5216, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.5918367346938775, |
|
"grad_norm": 7.065913200378418, |
|
"learning_rate": 9.206122448979594e-06, |
|
"loss": 0.4562, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.6326530612244898, |
|
"grad_norm": 7.559301853179932, |
|
"learning_rate": 9.185714285714287e-06, |
|
"loss": 0.3883, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.6734693877551021, |
|
"grad_norm": 12.103629112243652, |
|
"learning_rate": 9.16530612244898e-06, |
|
"loss": 0.4149, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.7142857142857144, |
|
"grad_norm": 7.9720072746276855, |
|
"learning_rate": 9.144897959183673e-06, |
|
"loss": 0.4718, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.7551020408163265, |
|
"grad_norm": 4.845782279968262, |
|
"learning_rate": 9.124489795918368e-06, |
|
"loss": 0.4304, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.7959183673469388, |
|
"grad_norm": 6.954368591308594, |
|
"learning_rate": 9.104081632653062e-06, |
|
"loss": 0.3436, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.836734693877551, |
|
"grad_norm": 4.751299858093262, |
|
"learning_rate": 9.083673469387757e-06, |
|
"loss": 0.4366, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.8775510204081631, |
|
"grad_norm": 6.507364273071289, |
|
"learning_rate": 9.06326530612245e-06, |
|
"loss": 0.5794, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.9183673469387754, |
|
"grad_norm": 8.891802787780762, |
|
"learning_rate": 9.042857142857143e-06, |
|
"loss": 0.4616, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.9591836734693877, |
|
"grad_norm": 10.056327819824219, |
|
"learning_rate": 9.022448979591838e-06, |
|
"loss": 0.4946, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 7.899660110473633, |
|
"learning_rate": 9.002040816326531e-06, |
|
"loss": 0.4437, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.0408163265306123, |
|
"grad_norm": 6.761326313018799, |
|
"learning_rate": 8.981632653061225e-06, |
|
"loss": 0.4303, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.0816326530612246, |
|
"grad_norm": 8.639615058898926, |
|
"learning_rate": 8.96122448979592e-06, |
|
"loss": 0.3267, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.122448979591837, |
|
"grad_norm": 7.710758209228516, |
|
"learning_rate": 8.940816326530613e-06, |
|
"loss": 0.3559, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.163265306122449, |
|
"grad_norm": 6.812905311584473, |
|
"learning_rate": 8.920408163265308e-06, |
|
"loss": 0.4761, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.204081632653061, |
|
"grad_norm": 7.2431511878967285, |
|
"learning_rate": 8.900000000000001e-06, |
|
"loss": 0.405, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.2448979591836733, |
|
"grad_norm": 7.230724811553955, |
|
"learning_rate": 8.879591836734694e-06, |
|
"loss": 0.3638, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.2857142857142856, |
|
"grad_norm": 9.520208358764648, |
|
"learning_rate": 8.859183673469388e-06, |
|
"loss": 0.3473, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.326530612244898, |
|
"grad_norm": 7.048585414886475, |
|
"learning_rate": 8.838775510204083e-06, |
|
"loss": 0.3652, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.36734693877551, |
|
"grad_norm": 6.979404449462891, |
|
"learning_rate": 8.818367346938776e-06, |
|
"loss": 0.3855, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.4081632653061225, |
|
"grad_norm": 3.765305280685425, |
|
"learning_rate": 8.797959183673471e-06, |
|
"loss": 0.3452, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.4489795918367347, |
|
"grad_norm": 10.533697128295898, |
|
"learning_rate": 8.777551020408164e-06, |
|
"loss": 0.3874, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.489795918367347, |
|
"grad_norm": 8.108145713806152, |
|
"learning_rate": 8.757142857142858e-06, |
|
"loss": 0.3695, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.5306122448979593, |
|
"grad_norm": 7.947360992431641, |
|
"learning_rate": 8.736734693877552e-06, |
|
"loss": 0.408, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.571428571428571, |
|
"grad_norm": 7.8081374168396, |
|
"learning_rate": 8.716326530612246e-06, |
|
"loss": 0.4059, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.612244897959184, |
|
"grad_norm": 8.579155921936035, |
|
"learning_rate": 8.695918367346939e-06, |
|
"loss": 0.3934, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.6530612244897958, |
|
"grad_norm": 6.4387712478637695, |
|
"learning_rate": 8.675510204081632e-06, |
|
"loss": 0.4256, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.693877551020408, |
|
"grad_norm": 8.415692329406738, |
|
"learning_rate": 8.655102040816327e-06, |
|
"loss": 0.3453, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.7346938775510203, |
|
"grad_norm": 8.50904369354248, |
|
"learning_rate": 8.63469387755102e-06, |
|
"loss": 0.4766, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.7755102040816326, |
|
"grad_norm": 4.662519931793213, |
|
"learning_rate": 8.614285714285716e-06, |
|
"loss": 0.3973, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.816326530612245, |
|
"grad_norm": 6.288435935974121, |
|
"learning_rate": 8.593877551020409e-06, |
|
"loss": 0.4408, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 6.625838279724121, |
|
"learning_rate": 8.573469387755102e-06, |
|
"loss": 0.2908, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.8979591836734695, |
|
"grad_norm": 8.510032653808594, |
|
"learning_rate": 8.553061224489797e-06, |
|
"loss": 0.3813, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.938775510204082, |
|
"grad_norm": 11.82463264465332, |
|
"learning_rate": 8.53265306122449e-06, |
|
"loss": 0.4352, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.979591836734694, |
|
"grad_norm": 8.821819305419922, |
|
"learning_rate": 8.512244897959184e-06, |
|
"loss": 0.4318, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 3.020408163265306, |
|
"grad_norm": 8.010713577270508, |
|
"learning_rate": 8.491836734693879e-06, |
|
"loss": 0.2323, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.061224489795918, |
|
"grad_norm": 9.03991985321045, |
|
"learning_rate": 8.471428571428572e-06, |
|
"loss": 0.3603, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.1020408163265305, |
|
"grad_norm": 10.94204044342041, |
|
"learning_rate": 8.451020408163267e-06, |
|
"loss": 0.3576, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 3.142857142857143, |
|
"grad_norm": 7.89410924911499, |
|
"learning_rate": 8.43061224489796e-06, |
|
"loss": 0.2851, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 3.183673469387755, |
|
"grad_norm": 6.53656005859375, |
|
"learning_rate": 8.410204081632653e-06, |
|
"loss": 0.318, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.2244897959183674, |
|
"grad_norm": 6.487284183502197, |
|
"learning_rate": 8.389795918367347e-06, |
|
"loss": 0.317, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 3.2653061224489797, |
|
"grad_norm": 6.947931289672852, |
|
"learning_rate": 8.369387755102042e-06, |
|
"loss": 0.2879, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.306122448979592, |
|
"grad_norm": 4.166048526763916, |
|
"learning_rate": 8.348979591836735e-06, |
|
"loss": 0.3392, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 3.3469387755102042, |
|
"grad_norm": 9.974846839904785, |
|
"learning_rate": 8.32857142857143e-06, |
|
"loss": 0.3663, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 3.387755102040816, |
|
"grad_norm": 9.668428421020508, |
|
"learning_rate": 8.308163265306123e-06, |
|
"loss": 0.3212, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 3.4285714285714284, |
|
"grad_norm": 11.81507396697998, |
|
"learning_rate": 8.287755102040816e-06, |
|
"loss": 0.3241, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 3.4693877551020407, |
|
"grad_norm": 13.690321922302246, |
|
"learning_rate": 8.267346938775511e-06, |
|
"loss": 0.4535, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 3.510204081632653, |
|
"grad_norm": 11.042778968811035, |
|
"learning_rate": 8.246938775510205e-06, |
|
"loss": 0.3826, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 3.5510204081632653, |
|
"grad_norm": 8.57719612121582, |
|
"learning_rate": 8.226530612244898e-06, |
|
"loss": 0.3905, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 3.5918367346938775, |
|
"grad_norm": 7.843425750732422, |
|
"learning_rate": 8.206122448979591e-06, |
|
"loss": 0.3125, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 3.63265306122449, |
|
"grad_norm": 5.9236931800842285, |
|
"learning_rate": 8.185714285714286e-06, |
|
"loss": 0.3512, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 3.673469387755102, |
|
"grad_norm": 8.213603973388672, |
|
"learning_rate": 8.165306122448981e-06, |
|
"loss": 0.4094, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.7142857142857144, |
|
"grad_norm": 3.8083949089050293, |
|
"learning_rate": 8.144897959183674e-06, |
|
"loss": 0.2751, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 3.7551020408163263, |
|
"grad_norm": 12.339240074157715, |
|
"learning_rate": 8.124489795918368e-06, |
|
"loss": 0.3296, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 3.795918367346939, |
|
"grad_norm": 9.532052040100098, |
|
"learning_rate": 8.104081632653061e-06, |
|
"loss": 0.3033, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 3.836734693877551, |
|
"grad_norm": 6.307032108306885, |
|
"learning_rate": 8.083673469387756e-06, |
|
"loss": 0.3765, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 3.877551020408163, |
|
"grad_norm": 7.3003010749816895, |
|
"learning_rate": 8.06326530612245e-06, |
|
"loss": 0.2161, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 3.9183673469387754, |
|
"grad_norm": 7.6572699546813965, |
|
"learning_rate": 8.042857142857143e-06, |
|
"loss": 0.2886, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 3.9591836734693877, |
|
"grad_norm": 6.745776176452637, |
|
"learning_rate": 8.022448979591838e-06, |
|
"loss": 0.3376, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 10.482270240783691, |
|
"learning_rate": 8.002040816326533e-06, |
|
"loss": 0.2657, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 4.040816326530612, |
|
"grad_norm": 6.213717460632324, |
|
"learning_rate": 7.981632653061226e-06, |
|
"loss": 0.2596, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 4.081632653061225, |
|
"grad_norm": 10.256094932556152, |
|
"learning_rate": 7.961224489795919e-06, |
|
"loss": 0.2338, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.122448979591836, |
|
"grad_norm": 8.640167236328125, |
|
"learning_rate": 7.940816326530612e-06, |
|
"loss": 0.2936, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 4.163265306122449, |
|
"grad_norm": 6.750300407409668, |
|
"learning_rate": 7.920408163265306e-06, |
|
"loss": 0.3511, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 4.204081632653061, |
|
"grad_norm": 9.488007545471191, |
|
"learning_rate": 7.9e-06, |
|
"loss": 0.2913, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 4.244897959183674, |
|
"grad_norm": 4.2671003341674805, |
|
"learning_rate": 7.879591836734694e-06, |
|
"loss": 0.2768, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 4.285714285714286, |
|
"grad_norm": 5.375782489776611, |
|
"learning_rate": 7.859183673469389e-06, |
|
"loss": 0.274, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 4.326530612244898, |
|
"grad_norm": 10.316515922546387, |
|
"learning_rate": 7.838775510204082e-06, |
|
"loss": 0.2373, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 4.36734693877551, |
|
"grad_norm": 6.733712673187256, |
|
"learning_rate": 7.818367346938777e-06, |
|
"loss": 0.32, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 4.408163265306122, |
|
"grad_norm": 6.432683944702148, |
|
"learning_rate": 7.79795918367347e-06, |
|
"loss": 0.3149, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 4.448979591836735, |
|
"grad_norm": 8.520115852355957, |
|
"learning_rate": 7.777551020408164e-06, |
|
"loss": 0.3369, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 4.489795918367347, |
|
"grad_norm": 3.52677845954895, |
|
"learning_rate": 7.757142857142857e-06, |
|
"loss": 0.2584, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 4.530612244897959, |
|
"grad_norm": 9.642569541931152, |
|
"learning_rate": 7.736734693877552e-06, |
|
"loss": 0.3076, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 4.571428571428571, |
|
"grad_norm": 11.233070373535156, |
|
"learning_rate": 7.716326530612245e-06, |
|
"loss": 0.3263, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 4.612244897959184, |
|
"grad_norm": 7.919038772583008, |
|
"learning_rate": 7.69591836734694e-06, |
|
"loss": 0.3152, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 4.653061224489796, |
|
"grad_norm": 7.116144180297852, |
|
"learning_rate": 7.675510204081633e-06, |
|
"loss": 0.2809, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 4.6938775510204085, |
|
"grad_norm": 6.757585525512695, |
|
"learning_rate": 7.655102040816327e-06, |
|
"loss": 0.2642, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 4.73469387755102, |
|
"grad_norm": 9.032824516296387, |
|
"learning_rate": 7.634693877551022e-06, |
|
"loss": 0.3546, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 4.775510204081632, |
|
"grad_norm": 7.837385654449463, |
|
"learning_rate": 7.614285714285715e-06, |
|
"loss": 0.3196, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 4.816326530612245, |
|
"grad_norm": 5.3134846687316895, |
|
"learning_rate": 7.593877551020409e-06, |
|
"loss": 0.2068, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 4.857142857142857, |
|
"grad_norm": 8.036781311035156, |
|
"learning_rate": 7.573469387755102e-06, |
|
"loss": 0.24, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 4.8979591836734695, |
|
"grad_norm": 12.13339900970459, |
|
"learning_rate": 7.5530612244897965e-06, |
|
"loss": 0.2795, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 4.938775510204081, |
|
"grad_norm": 5.253749370574951, |
|
"learning_rate": 7.532653061224491e-06, |
|
"loss": 0.2667, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 4.979591836734694, |
|
"grad_norm": 7.921318531036377, |
|
"learning_rate": 7.512244897959185e-06, |
|
"loss": 0.3218, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 5.020408163265306, |
|
"grad_norm": 9.056379318237305, |
|
"learning_rate": 7.491836734693878e-06, |
|
"loss": 0.2324, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 5.061224489795919, |
|
"grad_norm": 5.0674920082092285, |
|
"learning_rate": 7.471428571428571e-06, |
|
"loss": 0.3063, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 5.1020408163265305, |
|
"grad_norm": 12.247352600097656, |
|
"learning_rate": 7.451020408163266e-06, |
|
"loss": 0.3469, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 5.142857142857143, |
|
"grad_norm": 7.278679847717285, |
|
"learning_rate": 7.43061224489796e-06, |
|
"loss": 0.2851, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 5.183673469387755, |
|
"grad_norm": 11.491589546203613, |
|
"learning_rate": 7.410204081632654e-06, |
|
"loss": 0.245, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 5.224489795918367, |
|
"grad_norm": 8.652173042297363, |
|
"learning_rate": 7.389795918367347e-06, |
|
"loss": 0.2328, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 5.26530612244898, |
|
"grad_norm": 7.953402519226074, |
|
"learning_rate": 7.369387755102041e-06, |
|
"loss": 0.2784, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 5.3061224489795915, |
|
"grad_norm": 6.98084831237793, |
|
"learning_rate": 7.348979591836736e-06, |
|
"loss": 0.2379, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 5.346938775510204, |
|
"grad_norm": 7.400093078613281, |
|
"learning_rate": 7.328571428571429e-06, |
|
"loss": 0.2057, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 5.387755102040816, |
|
"grad_norm": 4.131850719451904, |
|
"learning_rate": 7.308163265306123e-06, |
|
"loss": 0.2792, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 5.428571428571429, |
|
"grad_norm": 10.25373363494873, |
|
"learning_rate": 7.287755102040817e-06, |
|
"loss": 0.2642, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 5.469387755102041, |
|
"grad_norm": 5.801217079162598, |
|
"learning_rate": 7.267346938775512e-06, |
|
"loss": 0.2301, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 5.510204081632653, |
|
"grad_norm": 3.3520116806030273, |
|
"learning_rate": 7.246938775510205e-06, |
|
"loss": 0.2728, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 5.551020408163265, |
|
"grad_norm": 5.335054874420166, |
|
"learning_rate": 7.226530612244898e-06, |
|
"loss": 0.2812, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 5.591836734693878, |
|
"grad_norm": 4.713162899017334, |
|
"learning_rate": 7.206122448979592e-06, |
|
"loss": 0.221, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 5.63265306122449, |
|
"grad_norm": 8.641826629638672, |
|
"learning_rate": 7.185714285714286e-06, |
|
"loss": 0.2204, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 5.673469387755102, |
|
"grad_norm": 4.286067485809326, |
|
"learning_rate": 7.165306122448981e-06, |
|
"loss": 0.2446, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 5.714285714285714, |
|
"grad_norm": 7.071314811706543, |
|
"learning_rate": 7.144897959183674e-06, |
|
"loss": 0.1788, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 5.755102040816326, |
|
"grad_norm": 9.792560577392578, |
|
"learning_rate": 7.124489795918368e-06, |
|
"loss": 0.272, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 5.795918367346939, |
|
"grad_norm": 6.726722240447998, |
|
"learning_rate": 7.104081632653061e-06, |
|
"loss": 0.18, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 5.836734693877551, |
|
"grad_norm": 7.378534317016602, |
|
"learning_rate": 7.083673469387755e-06, |
|
"loss": 0.2308, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 5.877551020408164, |
|
"grad_norm": 10.169758796691895, |
|
"learning_rate": 7.0632653061224495e-06, |
|
"loss": 0.296, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 5.918367346938775, |
|
"grad_norm": 5.440324306488037, |
|
"learning_rate": 7.042857142857144e-06, |
|
"loss": 0.2725, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 5.959183673469388, |
|
"grad_norm": 8.175464630126953, |
|
"learning_rate": 7.022448979591837e-06, |
|
"loss": 0.2251, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 6.7653398513793945, |
|
"learning_rate": 7.002040816326531e-06, |
|
"loss": 0.3411, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 6.040816326530612, |
|
"grad_norm": 6.0152716636657715, |
|
"learning_rate": 6.981632653061225e-06, |
|
"loss": 0.2071, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 6.081632653061225, |
|
"grad_norm": 8.066520690917969, |
|
"learning_rate": 6.961224489795919e-06, |
|
"loss": 0.1639, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 6.122448979591836, |
|
"grad_norm": 4.027646541595459, |
|
"learning_rate": 6.940816326530613e-06, |
|
"loss": 0.2069, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 6.163265306122449, |
|
"grad_norm": 5.61140251159668, |
|
"learning_rate": 6.920408163265307e-06, |
|
"loss": 0.2413, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 6.204081632653061, |
|
"grad_norm": 9.809159278869629, |
|
"learning_rate": 6.9e-06, |
|
"loss": 0.26, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 6.244897959183674, |
|
"grad_norm": 6.755568504333496, |
|
"learning_rate": 6.879591836734695e-06, |
|
"loss": 0.1965, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 6.285714285714286, |
|
"grad_norm": 4.21774435043335, |
|
"learning_rate": 6.859183673469388e-06, |
|
"loss": 0.1959, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 6.326530612244898, |
|
"grad_norm": 5.166352272033691, |
|
"learning_rate": 6.838775510204082e-06, |
|
"loss": 0.1941, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 6.36734693877551, |
|
"grad_norm": 10.346336364746094, |
|
"learning_rate": 6.818367346938776e-06, |
|
"loss": 0.2278, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 6.408163265306122, |
|
"grad_norm": 7.703672409057617, |
|
"learning_rate": 6.797959183673471e-06, |
|
"loss": 0.2854, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 6.448979591836735, |
|
"grad_norm": 9.366389274597168, |
|
"learning_rate": 6.777551020408164e-06, |
|
"loss": 0.2229, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 6.489795918367347, |
|
"grad_norm": 7.013561248779297, |
|
"learning_rate": 6.757142857142858e-06, |
|
"loss": 0.2732, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 6.530612244897959, |
|
"grad_norm": 5.82119083404541, |
|
"learning_rate": 6.736734693877551e-06, |
|
"loss": 0.2389, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 6.571428571428571, |
|
"grad_norm": 8.112947463989258, |
|
"learning_rate": 6.716326530612245e-06, |
|
"loss": 0.193, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 6.612244897959184, |
|
"grad_norm": 8.238506317138672, |
|
"learning_rate": 6.6959183673469396e-06, |
|
"loss": 0.2344, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 6.653061224489796, |
|
"grad_norm": 6.531697750091553, |
|
"learning_rate": 6.675510204081634e-06, |
|
"loss": 0.2146, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 6.6938775510204085, |
|
"grad_norm": 6.525634288787842, |
|
"learning_rate": 6.655102040816327e-06, |
|
"loss": 0.1993, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 6.73469387755102, |
|
"grad_norm": 8.847986221313477, |
|
"learning_rate": 6.63469387755102e-06, |
|
"loss": 0.2884, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 6.775510204081632, |
|
"grad_norm": 9.006918907165527, |
|
"learning_rate": 6.614285714285715e-06, |
|
"loss": 0.2173, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 6.816326530612245, |
|
"grad_norm": 9.229476928710938, |
|
"learning_rate": 6.593877551020409e-06, |
|
"loss": 0.201, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 6.857142857142857, |
|
"grad_norm": 5.367541313171387, |
|
"learning_rate": 6.573469387755103e-06, |
|
"loss": 0.2505, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 6.8979591836734695, |
|
"grad_norm": 7.771108150482178, |
|
"learning_rate": 6.553061224489796e-06, |
|
"loss": 0.2225, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 6.938775510204081, |
|
"grad_norm": 5.306410789489746, |
|
"learning_rate": 6.53265306122449e-06, |
|
"loss": 0.2549, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 6.979591836734694, |
|
"grad_norm": 4.359670162200928, |
|
"learning_rate": 6.512244897959185e-06, |
|
"loss": 0.2585, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 7.020408163265306, |
|
"grad_norm": 4.528923034667969, |
|
"learning_rate": 6.491836734693878e-06, |
|
"loss": 0.2489, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 7.061224489795919, |
|
"grad_norm": 4.760287761688232, |
|
"learning_rate": 6.4714285714285715e-06, |
|
"loss": 0.1955, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 7.1020408163265305, |
|
"grad_norm": 9.205543518066406, |
|
"learning_rate": 6.451020408163266e-06, |
|
"loss": 0.2283, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 7.142857142857143, |
|
"grad_norm": 5.910974025726318, |
|
"learning_rate": 6.43061224489796e-06, |
|
"loss": 0.2308, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 7.183673469387755, |
|
"grad_norm": 5.042090892791748, |
|
"learning_rate": 6.410204081632654e-06, |
|
"loss": 0.1883, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 7.224489795918367, |
|
"grad_norm": 5.0842742919921875, |
|
"learning_rate": 6.389795918367347e-06, |
|
"loss": 0.1942, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 7.26530612244898, |
|
"grad_norm": 9.219313621520996, |
|
"learning_rate": 6.369387755102041e-06, |
|
"loss": 0.1946, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 7.3061224489795915, |
|
"grad_norm": 7.5656208992004395, |
|
"learning_rate": 6.348979591836735e-06, |
|
"loss": 0.2016, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 7.346938775510204, |
|
"grad_norm": 5.8213653564453125, |
|
"learning_rate": 6.3285714285714296e-06, |
|
"loss": 0.2808, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 7.387755102040816, |
|
"grad_norm": 8.004778861999512, |
|
"learning_rate": 6.308163265306123e-06, |
|
"loss": 0.2393, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 7.428571428571429, |
|
"grad_norm": 7.5001139640808105, |
|
"learning_rate": 6.287755102040817e-06, |
|
"loss": 0.2007, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 7.469387755102041, |
|
"grad_norm": 9.618229866027832, |
|
"learning_rate": 6.26734693877551e-06, |
|
"loss": 0.2464, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 7.510204081632653, |
|
"grad_norm": 7.257756233215332, |
|
"learning_rate": 6.246938775510205e-06, |
|
"loss": 0.1692, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 7.551020408163265, |
|
"grad_norm": 7.658279895782471, |
|
"learning_rate": 6.2265306122448985e-06, |
|
"loss": 0.2367, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 7.591836734693878, |
|
"grad_norm": 6.590469837188721, |
|
"learning_rate": 6.206122448979593e-06, |
|
"loss": 0.2508, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 7.63265306122449, |
|
"grad_norm": 8.601705551147461, |
|
"learning_rate": 6.185714285714286e-06, |
|
"loss": 0.2168, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 7.673469387755102, |
|
"grad_norm": 6.144942283630371, |
|
"learning_rate": 6.16530612244898e-06, |
|
"loss": 0.2308, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 7.714285714285714, |
|
"grad_norm": 3.256690502166748, |
|
"learning_rate": 6.144897959183674e-06, |
|
"loss": 0.1791, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 7.755102040816326, |
|
"grad_norm": 6.810645580291748, |
|
"learning_rate": 6.124489795918368e-06, |
|
"loss": 0.2335, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 7.795918367346939, |
|
"grad_norm": 3.4259018898010254, |
|
"learning_rate": 6.1040816326530616e-06, |
|
"loss": 0.212, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 7.836734693877551, |
|
"grad_norm": 8.353039741516113, |
|
"learning_rate": 6.083673469387756e-06, |
|
"loss": 0.2312, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 7.877551020408164, |
|
"grad_norm": 5.548733711242676, |
|
"learning_rate": 6.06326530612245e-06, |
|
"loss": 0.2666, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 7.918367346938775, |
|
"grad_norm": 8.386053085327148, |
|
"learning_rate": 6.042857142857144e-06, |
|
"loss": 0.2543, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 7.959183673469388, |
|
"grad_norm": 7.863219261169434, |
|
"learning_rate": 6.022448979591837e-06, |
|
"loss": 0.2171, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 5.328917503356934, |
|
"learning_rate": 6.0020408163265305e-06, |
|
"loss": 0.1592, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 8.040816326530612, |
|
"grad_norm": 11.456307411193848, |
|
"learning_rate": 5.981632653061225e-06, |
|
"loss": 0.2345, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 8.081632653061224, |
|
"grad_norm": 3.9219276905059814, |
|
"learning_rate": 5.96122448979592e-06, |
|
"loss": 0.228, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 8.122448979591837, |
|
"grad_norm": 7.155372142791748, |
|
"learning_rate": 5.940816326530613e-06, |
|
"loss": 0.1879, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 8.16326530612245, |
|
"grad_norm": 8.530526161193848, |
|
"learning_rate": 5.920408163265306e-06, |
|
"loss": 0.1808, |
|
"step": 2000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 4900, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 30, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|