{
  "best_metric": 0.8894595801697186,
  "best_model_checkpoint": "tsec_vit_model/checkpoint-2520",
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 2800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03571428571428571,
      "grad_norm": 0.632759690284729,
      "learning_rate": 1.7857142857142857e-06,
      "loss": 0.7005,
      "step": 10
    },
    {
      "epoch": 0.07142857142857142,
      "grad_norm": 0.6090130805969238,
      "learning_rate": 3.5714285714285714e-06,
      "loss": 0.6888,
      "step": 20
    },
    {
      "epoch": 0.10714285714285714,
      "grad_norm": 0.5027034878730774,
      "learning_rate": 5.357142857142857e-06,
      "loss": 0.6841,
      "step": 30
    },
    {
      "epoch": 0.14285714285714285,
      "grad_norm": 0.7158761620521545,
      "learning_rate": 7.142857142857143e-06,
      "loss": 0.6711,
      "step": 40
    },
    {
      "epoch": 0.17857142857142858,
      "grad_norm": 1.0677605867385864,
      "learning_rate": 8.92857142857143e-06,
      "loss": 0.6439,
      "step": 50
    },
    {
      "epoch": 0.21428571428571427,
      "grad_norm": 0.716444194316864,
      "learning_rate": 1.0714285714285714e-05,
      "loss": 0.6388,
      "step": 60
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.9646293520927429,
      "learning_rate": 1.25e-05,
      "loss": 0.5942,
      "step": 70
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 1.5171552896499634,
      "learning_rate": 1.4285714285714285e-05,
      "loss": 0.587,
      "step": 80
    },
    {
      "epoch": 0.32142857142857145,
      "grad_norm": 1.2230862379074097,
      "learning_rate": 1.6071428571428572e-05,
      "loss": 0.5882,
      "step": 90
    },
    {
      "epoch": 0.35714285714285715,
      "grad_norm": 1.0726110935211182,
      "learning_rate": 1.785714285714286e-05,
      "loss": 0.5506,
      "step": 100
    },
    {
      "epoch": 0.39285714285714285,
      "grad_norm": 2.488288640975952,
      "learning_rate": 1.9642857142857145e-05,
      "loss": 0.5487,
      "step": 110
    },
    {
      "epoch": 0.42857142857142855,
      "grad_norm": 2.186500310897827,
      "learning_rate": 2.1428571428571428e-05,
      "loss": 0.5138,
      "step": 120
    },
    {
      "epoch": 0.4642857142857143,
      "grad_norm": 2.18605637550354,
      "learning_rate": 2.3214285714285715e-05,
      "loss": 0.5203,
      "step": 130
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.5495938062667847,
      "learning_rate": 2.5e-05,
      "loss": 0.5006,
      "step": 140
    },
    {
      "epoch": 0.5357142857142857,
      "grad_norm": 1.3939014673233032,
      "learning_rate": 2.6785714285714288e-05,
      "loss": 0.5171,
      "step": 150
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 1.4516574144363403,
      "learning_rate": 2.857142857142857e-05,
      "loss": 0.4882,
      "step": 160
    },
    {
      "epoch": 0.6071428571428571,
      "grad_norm": 1.5352245569229126,
      "learning_rate": 3.0357142857142857e-05,
      "loss": 0.4809,
      "step": 170
    },
    {
      "epoch": 0.6428571428571429,
      "grad_norm": 2.6514480113983154,
      "learning_rate": 3.2142857142857144e-05,
      "loss": 0.458,
      "step": 180
    },
    {
      "epoch": 0.6785714285714286,
      "grad_norm": 2.1452901363372803,
      "learning_rate": 3.392857142857143e-05,
      "loss": 0.478,
      "step": 190
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 1.5536608695983887,
      "learning_rate": 3.571428571428572e-05,
      "loss": 0.4851,
      "step": 200
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.7489731311798096,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 0.4608,
      "step": 210
    },
    {
      "epoch": 0.7857142857142857,
      "grad_norm": 1.6109980344772339,
      "learning_rate": 3.928571428571429e-05,
      "loss": 0.4611,
      "step": 220
    },
    {
      "epoch": 0.8214285714285714,
      "grad_norm": 2.391719102859497,
      "learning_rate": 4.107142857142857e-05,
      "loss": 0.3998,
      "step": 230
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 3.6933391094207764,
      "learning_rate": 4.2857142857142856e-05,
      "loss": 0.4624,
      "step": 240
    },
    {
      "epoch": 0.8928571428571429,
      "grad_norm": 2.0588369369506836,
      "learning_rate": 4.464285714285715e-05,
      "loss": 0.4437,
      "step": 250
    },
    {
      "epoch": 0.9285714285714286,
      "grad_norm": 2.831156015396118,
      "learning_rate": 4.642857142857143e-05,
      "loss": 0.4652,
      "step": 260
    },
    {
      "epoch": 0.9642857142857143,
      "grad_norm": 2.0229790210723877,
      "learning_rate": 4.8214285714285716e-05,
      "loss": 0.4037,
      "step": 270
    },
    {
      "epoch": 1.0,
      "grad_norm": 2.1834700107574463,
      "learning_rate": 5e-05,
      "loss": 0.4387,
      "step": 280
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.8150960250111657,
      "eval_loss": 0.4179099500179291,
      "eval_runtime": 110.4006,
      "eval_samples_per_second": 40.561,
      "eval_steps_per_second": 2.536,
      "step": 280
    },
    {
      "epoch": 1.0357142857142858,
      "grad_norm": 1.7746831178665161,
      "learning_rate": 4.9801587301587306e-05,
      "loss": 0.4069,
      "step": 290
    },
    {
      "epoch": 1.0714285714285714,
      "grad_norm": 2.6947922706604004,
      "learning_rate": 4.960317460317461e-05,
      "loss": 0.4334,
      "step": 300
    },
    {
      "epoch": 1.1071428571428572,
      "grad_norm": 1.4933383464813232,
      "learning_rate": 4.940476190476191e-05,
      "loss": 0.4409,
      "step": 310
    },
    {
      "epoch": 1.1428571428571428,
      "grad_norm": 1.8830089569091797,
      "learning_rate": 4.9206349206349204e-05,
      "loss": 0.4065,
      "step": 320
    },
    {
      "epoch": 1.1785714285714286,
      "grad_norm": 1.978843092918396,
      "learning_rate": 4.900793650793651e-05,
      "loss": 0.4273,
      "step": 330
    },
    {
      "epoch": 1.2142857142857142,
      "grad_norm": 1.2928149700164795,
      "learning_rate": 4.880952380952381e-05,
      "loss": 0.4376,
      "step": 340
    },
    {
      "epoch": 1.25,
      "grad_norm": 1.5401108264923096,
      "learning_rate": 4.8611111111111115e-05,
      "loss": 0.4061,
      "step": 350
    },
    {
      "epoch": 1.2857142857142856,
      "grad_norm": 2.296520709991455,
      "learning_rate": 4.841269841269841e-05,
      "loss": 0.433,
      "step": 360
    },
    {
      "epoch": 1.3214285714285714,
      "grad_norm": 1.2640879154205322,
      "learning_rate": 4.8214285714285716e-05,
      "loss": 0.4142,
      "step": 370
    },
    {
      "epoch": 1.3571428571428572,
      "grad_norm": 1.3309462070465088,
      "learning_rate": 4.801587301587302e-05,
      "loss": 0.4222,
      "step": 380
    },
    {
      "epoch": 1.3928571428571428,
      "grad_norm": 1.8462568521499634,
      "learning_rate": 4.781746031746032e-05,
      "loss": 0.4483,
      "step": 390
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 1.074401617050171,
      "learning_rate": 4.761904761904762e-05,
      "loss": 0.3762,
      "step": 400
    },
    {
      "epoch": 1.4642857142857144,
      "grad_norm": 1.5691473484039307,
      "learning_rate": 4.7420634920634924e-05,
      "loss": 0.415,
      "step": 410
    },
    {
      "epoch": 1.5,
      "grad_norm": 1.2222189903259277,
      "learning_rate": 4.722222222222222e-05,
      "loss": 0.3862,
      "step": 420
    },
    {
      "epoch": 1.5357142857142856,
      "grad_norm": 1.4210410118103027,
      "learning_rate": 4.7023809523809525e-05,
      "loss": 0.352,
      "step": 430
    },
    {
      "epoch": 1.5714285714285714,
      "grad_norm": 1.632934331893921,
      "learning_rate": 4.682539682539683e-05,
      "loss": 0.3873,
      "step": 440
    },
    {
      "epoch": 1.6071428571428572,
      "grad_norm": 1.3977652788162231,
      "learning_rate": 4.662698412698413e-05,
      "loss": 0.3763,
      "step": 450
    },
    {
      "epoch": 1.6428571428571428,
      "grad_norm": 2.74967622756958,
      "learning_rate": 4.642857142857143e-05,
      "loss": 0.3823,
      "step": 460
    },
    {
      "epoch": 1.6785714285714286,
      "grad_norm": 1.1579408645629883,
      "learning_rate": 4.623015873015873e-05,
      "loss": 0.3967,
      "step": 470
    },
    {
      "epoch": 1.7142857142857144,
      "grad_norm": 2.7291440963745117,
      "learning_rate": 4.603174603174603e-05,
      "loss": 0.4201,
      "step": 480
    },
    {
      "epoch": 1.75,
      "grad_norm": 1.0126214027404785,
      "learning_rate": 4.5833333333333334e-05,
      "loss": 0.3598,
      "step": 490
    },
    {
      "epoch": 1.7857142857142856,
      "grad_norm": 1.6705009937286377,
      "learning_rate": 4.563492063492064e-05,
      "loss": 0.366,
      "step": 500
    },
    {
      "epoch": 1.8214285714285714,
      "grad_norm": 1.6749546527862549,
      "learning_rate": 4.543650793650794e-05,
      "loss": 0.3854,
      "step": 510
    },
    {
      "epoch": 1.8571428571428572,
      "grad_norm": 1.0892480611801147,
      "learning_rate": 4.523809523809524e-05,
      "loss": 0.3563,
      "step": 520
    },
    {
      "epoch": 1.8928571428571428,
      "grad_norm": 1.4169703722000122,
      "learning_rate": 4.503968253968254e-05,
      "loss": 0.3903,
      "step": 530
    },
    {
      "epoch": 1.9285714285714286,
      "grad_norm": 1.393864631652832,
      "learning_rate": 4.4841269841269846e-05,
      "loss": 0.3915,
      "step": 540
    },
    {
      "epoch": 1.9642857142857144,
      "grad_norm": 1.6008644104003906,
      "learning_rate": 4.464285714285715e-05,
      "loss": 0.3665,
      "step": 550
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.0977710485458374,
      "learning_rate": 4.4444444444444447e-05,
      "loss": 0.4239,
      "step": 560
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.8398838767306833,
      "eval_loss": 0.3610800504684448,
      "eval_runtime": 114.7581,
      "eval_samples_per_second": 39.021,
      "eval_steps_per_second": 2.44,
      "step": 560
    },
    {
      "epoch": 2.0357142857142856,
      "grad_norm": 1.4717609882354736,
      "learning_rate": 4.4246031746031744e-05,
      "loss": 0.3609,
      "step": 570
    },
    {
      "epoch": 2.0714285714285716,
      "grad_norm": 1.0943901538848877,
      "learning_rate": 4.404761904761905e-05,
      "loss": 0.3379,
      "step": 580
    },
    {
      "epoch": 2.107142857142857,
      "grad_norm": 1.8704146146774292,
      "learning_rate": 4.384920634920635e-05,
      "loss": 0.364,
      "step": 590
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 1.2977815866470337,
      "learning_rate": 4.3650793650793655e-05,
      "loss": 0.3834,
      "step": 600
    },
    {
      "epoch": 2.1785714285714284,
      "grad_norm": 1.6529649496078491,
      "learning_rate": 4.345238095238096e-05,
      "loss": 0.3688,
      "step": 610
    },
    {
      "epoch": 2.2142857142857144,
      "grad_norm": 1.1914763450622559,
      "learning_rate": 4.3253968253968256e-05,
      "loss": 0.3548,
      "step": 620
    },
    {
      "epoch": 2.25,
      "grad_norm": 1.70374596118927,
      "learning_rate": 4.305555555555556e-05,
      "loss": 0.3323,
      "step": 630
    },
    {
      "epoch": 2.2857142857142856,
      "grad_norm": 1.1331793069839478,
      "learning_rate": 4.2857142857142856e-05,
      "loss": 0.3494,
      "step": 640
    },
    {
      "epoch": 2.3214285714285716,
      "grad_norm": 1.3151099681854248,
      "learning_rate": 4.265873015873016e-05,
      "loss": 0.3177,
      "step": 650
    },
    {
      "epoch": 2.357142857142857,
      "grad_norm": 1.7670910358428955,
      "learning_rate": 4.2460317460317464e-05,
      "loss": 0.3145,
      "step": 660
    },
    {
      "epoch": 2.392857142857143,
      "grad_norm": 1.441829800605774,
      "learning_rate": 4.226190476190476e-05,
      "loss": 0.2968,
      "step": 670
    },
    {
      "epoch": 2.4285714285714284,
      "grad_norm": 1.5992521047592163,
      "learning_rate": 4.2063492063492065e-05,
      "loss": 0.353,
      "step": 680
    },
    {
      "epoch": 2.4642857142857144,
      "grad_norm": 1.076711654663086,
      "learning_rate": 4.186507936507937e-05,
      "loss": 0.3461,
      "step": 690
    },
    {
      "epoch": 2.5,
      "grad_norm": 1.0764169692993164,
      "learning_rate": 4.166666666666667e-05,
      "loss": 0.3345,
      "step": 700
    },
    {
      "epoch": 2.5357142857142856,
      "grad_norm": 1.3735431432724,
      "learning_rate": 4.1468253968253976e-05,
      "loss": 0.3612,
      "step": 710
    },
    {
      "epoch": 2.571428571428571,
      "grad_norm": 1.4703452587127686,
      "learning_rate": 4.126984126984127e-05,
      "loss": 0.3444,
      "step": 720
    },
    {
      "epoch": 2.607142857142857,
      "grad_norm": 1.451257348060608,
      "learning_rate": 4.107142857142857e-05,
      "loss": 0.3287,
      "step": 730
    },
    {
      "epoch": 2.642857142857143,
      "grad_norm": 1.8615336418151855,
      "learning_rate": 4.0873015873015874e-05,
      "loss": 0.3419,
      "step": 740
    },
    {
      "epoch": 2.678571428571429,
      "grad_norm": 2.925323963165283,
      "learning_rate": 4.067460317460318e-05,
      "loss": 0.3217,
      "step": 750
    },
    {
      "epoch": 2.7142857142857144,
      "grad_norm": 1.7260726690292358,
      "learning_rate": 4.047619047619048e-05,
      "loss": 0.3584,
      "step": 760
    },
    {
      "epoch": 2.75,
      "grad_norm": 1.4659258127212524,
      "learning_rate": 4.027777777777778e-05,
      "loss": 0.3821,
      "step": 770
    },
    {
      "epoch": 2.7857142857142856,
      "grad_norm": 1.7859466075897217,
      "learning_rate": 4.007936507936508e-05,
      "loss": 0.3595,
      "step": 780
    },
    {
      "epoch": 2.821428571428571,
      "grad_norm": 1.2320784330368042,
      "learning_rate": 3.9880952380952386e-05,
      "loss": 0.3764,
      "step": 790
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 1.4097161293029785,
      "learning_rate": 3.968253968253968e-05,
      "loss": 0.3282,
      "step": 800
    },
    {
      "epoch": 2.892857142857143,
      "grad_norm": 1.185285210609436,
      "learning_rate": 3.9484126984126986e-05,
      "loss": 0.3214,
      "step": 810
    },
    {
      "epoch": 2.928571428571429,
      "grad_norm": 1.3706032037734985,
      "learning_rate": 3.928571428571429e-05,
      "loss": 0.3092,
      "step": 820
    },
    {
      "epoch": 2.9642857142857144,
      "grad_norm": 2.1342084407806396,
      "learning_rate": 3.908730158730159e-05,
      "loss": 0.3274,
      "step": 830
    },
    {
      "epoch": 3.0,
      "grad_norm": 1.438025712966919,
      "learning_rate": 3.888888888888889e-05,
      "loss": 0.3148,
      "step": 840
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.8599821348816435,
      "eval_loss": 0.31560027599334717,
      "eval_runtime": 116.0276,
      "eval_samples_per_second": 38.594,
      "eval_steps_per_second": 2.413,
      "step": 840
    },
    {
      "epoch": 3.0357142857142856,
      "grad_norm": 1.6471174955368042,
      "learning_rate": 3.8690476190476195e-05,
      "loss": 0.3198,
      "step": 850
    },
    {
      "epoch": 3.0714285714285716,
      "grad_norm": 1.5575629472732544,
      "learning_rate": 3.84920634920635e-05,
      "loss": 0.3127,
      "step": 860
    },
    {
      "epoch": 3.107142857142857,
      "grad_norm": 1.1578933000564575,
      "learning_rate": 3.8293650793650795e-05,
      "loss": 0.3416,
      "step": 870
    },
    {
      "epoch": 3.142857142857143,
      "grad_norm": 1.9167553186416626,
      "learning_rate": 3.809523809523809e-05,
      "loss": 0.3626,
      "step": 880
    },
    {
      "epoch": 3.1785714285714284,
      "grad_norm": 1.038451910018921,
      "learning_rate": 3.7896825396825396e-05,
      "loss": 0.336,
      "step": 890
    },
    {
      "epoch": 3.2142857142857144,
      "grad_norm": 1.2739081382751465,
      "learning_rate": 3.76984126984127e-05,
      "loss": 0.2838,
      "step": 900
    },
    {
      "epoch": 3.25,
      "grad_norm": 1.2061238288879395,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 0.3099,
      "step": 910
    },
    {
      "epoch": 3.2857142857142856,
      "grad_norm": 1.5365681648254395,
      "learning_rate": 3.730158730158731e-05,
      "loss": 0.3163,
      "step": 920
    },
    {
      "epoch": 3.3214285714285716,
      "grad_norm": 2.33838152885437,
      "learning_rate": 3.7103174603174604e-05,
      "loss": 0.3109,
      "step": 930
    },
    {
      "epoch": 3.357142857142857,
      "grad_norm": 1.2280935049057007,
      "learning_rate": 3.690476190476191e-05,
      "loss": 0.3104,
      "step": 940
    },
    {
      "epoch": 3.392857142857143,
      "grad_norm": 1.2506486177444458,
      "learning_rate": 3.6706349206349205e-05,
      "loss": 0.339,
      "step": 950
    },
    {
      "epoch": 3.4285714285714284,
      "grad_norm": 1.0430885553359985,
      "learning_rate": 3.650793650793651e-05,
      "loss": 0.3015,
      "step": 960
    },
    {
      "epoch": 3.4642857142857144,
      "grad_norm": 1.5323641300201416,
      "learning_rate": 3.630952380952381e-05,
      "loss": 0.3408,
      "step": 970
    },
    {
      "epoch": 3.5,
      "grad_norm": 1.0826184749603271,
      "learning_rate": 3.611111111111111e-05,
      "loss": 0.3061,
      "step": 980
    },
    {
      "epoch": 3.5357142857142856,
      "grad_norm": 2.019437313079834,
      "learning_rate": 3.591269841269841e-05,
      "loss": 0.3465,
      "step": 990
    },
    {
      "epoch": 3.571428571428571,
      "grad_norm": 1.5877951383590698,
      "learning_rate": 3.571428571428572e-05,
      "loss": 0.3023,
      "step": 1000
    },
    {
      "epoch": 3.607142857142857,
      "grad_norm": 1.5832830667495728,
      "learning_rate": 3.551587301587302e-05,
      "loss": 0.262,
      "step": 1010
    },
    {
      "epoch": 3.642857142857143,
      "grad_norm": 1.5758150815963745,
      "learning_rate": 3.5317460317460324e-05,
      "loss": 0.3337,
      "step": 1020
    },
    {
      "epoch": 3.678571428571429,
      "grad_norm": 1.255638837814331,
      "learning_rate": 3.511904761904762e-05,
      "loss": 0.2857,
      "step": 1030
    },
    {
      "epoch": 3.7142857142857144,
      "grad_norm": 1.5360593795776367,
      "learning_rate": 3.492063492063492e-05,
      "loss": 0.2964,
      "step": 1040
    },
    {
      "epoch": 3.75,
      "grad_norm": 1.0826270580291748,
      "learning_rate": 3.472222222222222e-05,
      "loss": 0.2872,
      "step": 1050
    },
    {
      "epoch": 3.7857142857142856,
      "grad_norm": 1.6883878707885742,
      "learning_rate": 3.4523809523809526e-05,
      "loss": 0.3016,
      "step": 1060
    },
    {
      "epoch": 3.821428571428571,
      "grad_norm": 1.8018616437911987,
      "learning_rate": 3.432539682539683e-05,
      "loss": 0.3115,
      "step": 1070
    },
    {
      "epoch": 3.857142857142857,
      "grad_norm": 1.9364327192306519,
      "learning_rate": 3.412698412698413e-05,
      "loss": 0.2986,
      "step": 1080
    },
    {
      "epoch": 3.892857142857143,
      "grad_norm": 1.6606508493423462,
      "learning_rate": 3.392857142857143e-05,
      "loss": 0.3017,
      "step": 1090
    },
    {
      "epoch": 3.928571428571429,
      "grad_norm": 2.1849498748779297,
      "learning_rate": 3.3730158730158734e-05,
      "loss": 0.2779,
      "step": 1100
    },
    {
      "epoch": 3.9642857142857144,
      "grad_norm": 1.5579859018325806,
      "learning_rate": 3.353174603174603e-05,
      "loss": 0.3199,
      "step": 1110
    },
    {
      "epoch": 4.0,
      "grad_norm": 2.6287128925323486,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 0.2988,
      "step": 1120
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.8729343456900402,
      "eval_loss": 0.30018478631973267,
      "eval_runtime": 115.4434,
      "eval_samples_per_second": 38.79,
      "eval_steps_per_second": 2.425,
      "step": 1120
    },
    {
      "epoch": 4.035714285714286,
      "grad_norm": 1.5949360132217407,
      "learning_rate": 3.313492063492064e-05,
      "loss": 0.305,
      "step": 1130
    },
    {
      "epoch": 4.071428571428571,
      "grad_norm": 1.678801417350769,
      "learning_rate": 3.2936507936507936e-05,
      "loss": 0.2677,
      "step": 1140
    },
    {
      "epoch": 4.107142857142857,
      "grad_norm": 1.6015031337738037,
      "learning_rate": 3.273809523809524e-05,
      "loss": 0.3167,
      "step": 1150
    },
    {
      "epoch": 4.142857142857143,
      "grad_norm": 1.525894045829773,
      "learning_rate": 3.253968253968254e-05,
      "loss": 0.2745,
      "step": 1160
    },
    {
      "epoch": 4.178571428571429,
      "grad_norm": 1.2955286502838135,
      "learning_rate": 3.234126984126985e-05,
      "loss": 0.2925,
      "step": 1170
    },
    {
      "epoch": 4.214285714285714,
      "grad_norm": 2.1041815280914307,
      "learning_rate": 3.2142857142857144e-05,
      "loss": 0.2782,
      "step": 1180
    },
    {
      "epoch": 4.25,
      "grad_norm": 1.7325941324234009,
      "learning_rate": 3.194444444444444e-05,
      "loss": 0.302,
      "step": 1190
    },
    {
      "epoch": 4.285714285714286,
      "grad_norm": 1.9786794185638428,
      "learning_rate": 3.1746031746031745e-05,
      "loss": 0.2864,
      "step": 1200
    },
    {
      "epoch": 4.321428571428571,
      "grad_norm": 1.3869291543960571,
      "learning_rate": 3.154761904761905e-05,
      "loss": 0.2834,
      "step": 1210
    },
    {
      "epoch": 4.357142857142857,
      "grad_norm": 2.1556620597839355,
      "learning_rate": 3.134920634920635e-05,
      "loss": 0.3011,
      "step": 1220
    },
    {
      "epoch": 4.392857142857143,
      "grad_norm": 1.6058976650238037,
      "learning_rate": 3.1150793650793656e-05,
      "loss": 0.2893,
      "step": 1230
    },
    {
      "epoch": 4.428571428571429,
      "grad_norm": 1.9528658390045166,
      "learning_rate": 3.095238095238095e-05,
      "loss": 0.2725,
      "step": 1240
    },
    {
      "epoch": 4.464285714285714,
      "grad_norm": 1.5343618392944336,
      "learning_rate": 3.075396825396826e-05,
      "loss": 0.2578,
      "step": 1250
    },
    {
      "epoch": 4.5,
      "grad_norm": 1.0154541730880737,
      "learning_rate": 3.055555555555556e-05,
      "loss": 0.2897,
      "step": 1260
    },
    {
      "epoch": 4.535714285714286,
      "grad_norm": 2.355865001678467,
      "learning_rate": 3.0357142857142857e-05,
      "loss": 0.2658,
      "step": 1270
    },
    {
      "epoch": 4.571428571428571,
      "grad_norm": 1.24091374874115,
      "learning_rate": 3.0158730158730158e-05,
      "loss": 0.3198,
      "step": 1280
    },
    {
      "epoch": 4.607142857142857,
      "grad_norm": 1.0385922193527222,
      "learning_rate": 2.996031746031746e-05,
      "loss": 0.285,
      "step": 1290
    },
    {
      "epoch": 4.642857142857143,
      "grad_norm": 1.032619833946228,
      "learning_rate": 2.9761904761904762e-05,
      "loss": 0.2741,
      "step": 1300
    },
    {
      "epoch": 4.678571428571429,
      "grad_norm": 1.587849736213684,
      "learning_rate": 2.9563492063492066e-05,
      "loss": 0.3093,
      "step": 1310
    },
    {
      "epoch": 4.714285714285714,
      "grad_norm": 1.2873854637145996,
      "learning_rate": 2.9365079365079366e-05,
      "loss": 0.299,
      "step": 1320
    },
    {
      "epoch": 4.75,
      "grad_norm": 1.6315929889678955,
      "learning_rate": 2.916666666666667e-05,
      "loss": 0.2776,
      "step": 1330
    },
    {
      "epoch": 4.785714285714286,
      "grad_norm": 1.8490331172943115,
      "learning_rate": 2.8968253968253974e-05,
      "loss": 0.2613,
      "step": 1340
    },
    {
      "epoch": 4.821428571428571,
      "grad_norm": 1.8352344036102295,
      "learning_rate": 2.876984126984127e-05,
      "loss": 0.281,
      "step": 1350
    },
    {
      "epoch": 4.857142857142857,
      "grad_norm": 1.0744656324386597,
      "learning_rate": 2.857142857142857e-05,
      "loss": 0.2622,
      "step": 1360
    },
    {
      "epoch": 4.892857142857143,
      "grad_norm": 1.2477718591690063,
      "learning_rate": 2.8373015873015875e-05,
      "loss": 0.2997,
      "step": 1370
    },
    {
      "epoch": 4.928571428571429,
      "grad_norm": 1.5114529132843018,
      "learning_rate": 2.8174603174603175e-05,
      "loss": 0.2827,
      "step": 1380
    },
    {
      "epoch": 4.964285714285714,
      "grad_norm": 1.3266760110855103,
      "learning_rate": 2.797619047619048e-05,
      "loss": 0.283,
      "step": 1390
    },
    {
      "epoch": 5.0,
      "grad_norm": 2.2647006511688232,
      "learning_rate": 2.777777777777778e-05,
      "loss": 0.2498,
      "step": 1400
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.8693613220187584,
      "eval_loss": 0.30874186754226685,
      "eval_runtime": 116.4377,
      "eval_samples_per_second": 38.458,
      "eval_steps_per_second": 2.405,
      "step": 1400
    },
    {
      "epoch": 5.035714285714286,
      "grad_norm": 1.6704837083816528,
      "learning_rate": 2.7579365079365083e-05,
      "loss": 0.2973,
      "step": 1410
    },
    {
      "epoch": 5.071428571428571,
      "grad_norm": 1.9291003942489624,
      "learning_rate": 2.7380952380952383e-05,
      "loss": 0.2756,
      "step": 1420
    },
    {
      "epoch": 5.107142857142857,
      "grad_norm": 2.44765043258667,
      "learning_rate": 2.718253968253968e-05,
      "loss": 0.2575,
      "step": 1430
    },
    {
      "epoch": 5.142857142857143,
      "grad_norm": 2.0583574771881104,
      "learning_rate": 2.6984126984126984e-05,
      "loss": 0.256,
      "step": 1440
    },
    {
      "epoch": 5.178571428571429,
      "grad_norm": 1.8882899284362793,
      "learning_rate": 2.6785714285714288e-05,
      "loss": 0.299,
      "step": 1450
    },
    {
      "epoch": 5.214285714285714,
      "grad_norm": 1.5178027153015137,
      "learning_rate": 2.6587301587301588e-05,
      "loss": 0.3209,
      "step": 1460
    },
    {
      "epoch": 5.25,
      "grad_norm": 1.4342873096466064,
      "learning_rate": 2.6388888888888892e-05,
      "loss": 0.2828,
      "step": 1470
    },
    {
      "epoch": 5.285714285714286,
      "grad_norm": 1.3157316446304321,
      "learning_rate": 2.6190476190476192e-05,
      "loss": 0.2655,
      "step": 1480
    },
    {
      "epoch": 5.321428571428571,
      "grad_norm": 1.0939053297042847,
      "learning_rate": 2.5992063492063496e-05,
      "loss": 0.278,
      "step": 1490
    },
    {
      "epoch": 5.357142857142857,
      "grad_norm": 1.9465833902359009,
      "learning_rate": 2.5793650793650796e-05,
      "loss": 0.2652,
      "step": 1500
    },
    {
      "epoch": 5.392857142857143,
      "grad_norm": 1.5688917636871338,
      "learning_rate": 2.5595238095238093e-05,
      "loss": 0.2622,
      "step": 1510
    },
    {
      "epoch": 5.428571428571429,
      "grad_norm": 1.3724976778030396,
      "learning_rate": 2.5396825396825397e-05,
      "loss": 0.2524,
      "step": 1520
    },
    {
      "epoch": 5.464285714285714,
      "grad_norm": 1.160733699798584,
      "learning_rate": 2.5198412698412697e-05,
      "loss": 0.2868,
      "step": 1530
    },
    {
      "epoch": 5.5,
      "grad_norm": 1.2799668312072754,
      "learning_rate": 2.5e-05,
      "loss": 0.2713,
      "step": 1540
    },
    {
      "epoch": 5.535714285714286,
      "grad_norm": 1.9062435626983643,
      "learning_rate": 2.4801587301587305e-05,
      "loss": 0.2527,
      "step": 1550
    },
    {
      "epoch": 5.571428571428571,
      "grad_norm": 2.5998220443725586,
      "learning_rate": 2.4603174603174602e-05,
      "loss": 0.2668,
      "step": 1560
    },
    {
      "epoch": 5.607142857142857,
      "grad_norm": 2.346576452255249,
      "learning_rate": 2.4404761904761906e-05,
      "loss": 0.2557,
      "step": 1570
    },
    {
      "epoch": 5.642857142857143,
      "grad_norm": 1.9609774351119995,
      "learning_rate": 2.4206349206349206e-05,
      "loss": 0.2706,
      "step": 1580
    },
    {
      "epoch": 5.678571428571429,
      "grad_norm": 1.8994617462158203,
      "learning_rate": 2.400793650793651e-05,
      "loss": 0.2878,
      "step": 1590
    },
    {
      "epoch": 5.714285714285714,
      "grad_norm": 1.677849531173706,
      "learning_rate": 2.380952380952381e-05,
      "loss": 0.2761,
      "step": 1600
    },
    {
      "epoch": 5.75,
      "grad_norm": 1.6345635652542114,
      "learning_rate": 2.361111111111111e-05,
      "loss": 0.2231,
      "step": 1610
    },
    {
      "epoch": 5.785714285714286,
      "grad_norm": 1.6198554039001465,
      "learning_rate": 2.3412698412698414e-05,
      "loss": 0.2612,
      "step": 1620
    },
    {
      "epoch": 5.821428571428571,
      "grad_norm": 2.052764415740967,
      "learning_rate": 2.3214285714285715e-05,
      "loss": 0.2738,
      "step": 1630
    },
    {
      "epoch": 5.857142857142857,
      "grad_norm": 1.0596450567245483,
      "learning_rate": 2.3015873015873015e-05,
      "loss": 0.2597,
      "step": 1640
    },
    {
      "epoch": 5.892857142857143,
      "grad_norm": 1.9113609790802002,
      "learning_rate": 2.281746031746032e-05,
      "loss": 0.2894,
      "step": 1650
    },
    {
      "epoch": 5.928571428571429,
      "grad_norm": 1.7055829763412476,
      "learning_rate": 2.261904761904762e-05,
      "loss": 0.226,
      "step": 1660
    },
    {
      "epoch": 5.964285714285714,
      "grad_norm": 0.969464898109436,
      "learning_rate": 2.2420634920634923e-05,
      "loss": 0.2977,
      "step": 1670
    },
    {
      "epoch": 6.0,
      "grad_norm": 3.1485939025878906,
      "learning_rate": 2.2222222222222223e-05,
      "loss": 0.3028,
      "step": 1680
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.8715944618133095,
      "eval_loss": 0.29658079147338867,
      "eval_runtime": 113.6842,
      "eval_samples_per_second": 39.39,
      "eval_steps_per_second": 2.463,
      "step": 1680
    },
    {
      "epoch": 6.035714285714286,
      "grad_norm": 1.4179354906082153,
      "learning_rate": 2.2023809523809524e-05,
      "loss": 0.2681,
      "step": 1690
    },
    {
      "epoch": 6.071428571428571,
      "grad_norm": 2.099681854248047,
      "learning_rate": 2.1825396825396827e-05,
      "loss": 0.2441,
      "step": 1700
    },
    {
      "epoch": 6.107142857142857,
      "grad_norm": 1.5406619310379028,
      "learning_rate": 2.1626984126984128e-05,
      "loss": 0.2707,
      "step": 1710
    },
    {
      "epoch": 6.142857142857143,
      "grad_norm": 2.4518625736236572,
      "learning_rate": 2.1428571428571428e-05,
      "loss": 0.2942,
      "step": 1720
    },
    {
      "epoch": 6.178571428571429,
      "grad_norm": 1.1450200080871582,
      "learning_rate": 2.1230158730158732e-05,
      "loss": 0.2336,
      "step": 1730
    },
    {
      "epoch": 6.214285714285714,
      "grad_norm": 1.4170438051223755,
      "learning_rate": 2.1031746031746032e-05,
      "loss": 0.2857,
      "step": 1740
    },
    {
      "epoch": 6.25,
      "grad_norm": 1.2231560945510864,
      "learning_rate": 2.0833333333333336e-05,
      "loss": 0.2507,
      "step": 1750
    },
    {
      "epoch": 6.285714285714286,
      "grad_norm": 1.4597039222717285,
      "learning_rate": 2.0634920634920636e-05,
      "loss": 0.2688,
      "step": 1760
    },
    {
      "epoch": 6.321428571428571,
      "grad_norm": 1.9395854473114014,
      "learning_rate": 2.0436507936507937e-05,
      "loss": 0.2715,
      "step": 1770
    },
    {
      "epoch": 6.357142857142857,
      "grad_norm": 0.9649907946586609,
      "learning_rate": 2.023809523809524e-05,
      "loss": 0.2508,
      "step": 1780
    },
    {
      "epoch": 6.392857142857143,
      "grad_norm": 1.3842933177947998,
      "learning_rate": 2.003968253968254e-05,
      "loss": 0.2818,
      "step": 1790
    },
    {
      "epoch": 6.428571428571429,
      "grad_norm": 1.7550239562988281,
      "learning_rate": 1.984126984126984e-05,
      "loss": 0.2553,
      "step": 1800
    },
    {
      "epoch": 6.464285714285714,
      "grad_norm": 1.04232919216156,
      "learning_rate": 1.9642857142857145e-05,
      "loss": 0.2303,
      "step": 1810
    },
    {
      "epoch": 6.5,
      "grad_norm": 2.306325674057007,
      "learning_rate": 1.9444444444444445e-05,
      "loss": 0.257,
      "step": 1820
    },
    {
      "epoch": 6.535714285714286,
      "grad_norm": 1.704186201095581,
      "learning_rate": 1.924603174603175e-05,
      "loss": 0.2844,
      "step": 1830
    },
    {
      "epoch": 6.571428571428571,
      "grad_norm": 1.3930236101150513,
      "learning_rate": 1.9047619047619046e-05,
      "loss": 0.2784,
      "step": 1840
    },
    {
      "epoch": 6.607142857142857,
      "grad_norm": 1.5494309663772583,
      "learning_rate": 1.884920634920635e-05,
      "loss": 0.2735,
      "step": 1850
    },
    {
      "epoch": 6.642857142857143,
      "grad_norm": 1.5971505641937256,
      "learning_rate": 1.8650793650793654e-05,
      "loss": 0.2466,
      "step": 1860
    },
    {
      "epoch": 6.678571428571429,
      "grad_norm": 1.8419290781021118,
      "learning_rate": 1.8452380952380954e-05,
      "loss": 0.2476,
      "step": 1870
    },
    {
      "epoch": 6.714285714285714,
      "grad_norm": 1.3679064512252808,
      "learning_rate": 1.8253968253968254e-05,
      "loss": 0.2277,
      "step": 1880
    },
    {
      "epoch": 6.75,
      "grad_norm": 1.1586477756500244,
      "learning_rate": 1.8055555555555555e-05,
      "loss": 0.2266,
      "step": 1890
    },
    {
      "epoch": 6.785714285714286,
      "grad_norm": 1.9979654550552368,
      "learning_rate": 1.785714285714286e-05,
      "loss": 0.2747,
      "step": 1900
    },
    {
      "epoch": 6.821428571428571,
      "grad_norm": 1.4875764846801758,
      "learning_rate": 1.7658730158730162e-05,
      "loss": 0.2666,
      "step": 1910
    },
    {
      "epoch": 6.857142857142857,
      "grad_norm": 1.6196482181549072,
      "learning_rate": 1.746031746031746e-05,
      "loss": 0.2532,
      "step": 1920
    },
    {
      "epoch": 6.892857142857143,
      "grad_norm": 2.7135918140411377,
      "learning_rate": 1.7261904761904763e-05,
      "loss": 0.2265,
      "step": 1930
    },
    {
      "epoch": 6.928571428571429,
      "grad_norm": 1.423257827758789,
      "learning_rate": 1.7063492063492063e-05,
      "loss": 0.2621,
      "step": 1940
    },
    {
      "epoch": 6.964285714285714,
      "grad_norm": 2.1509084701538086,
      "learning_rate": 1.6865079365079367e-05,
      "loss": 0.2636,
      "step": 1950
    },
    {
      "epoch": 7.0,
      "grad_norm": 1.562657117843628,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 0.2179,
      "step": 1960
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.8807503349709692,
      "eval_loss": 0.2742190361022949,
      "eval_runtime": 113.8381,
      "eval_samples_per_second": 39.337,
      "eval_steps_per_second": 2.46,
      "step": 1960
    },
    {
      "epoch": 7.035714285714286,
      "grad_norm": 1.4138425588607788,
      "learning_rate": 1.6468253968253968e-05,
      "loss": 0.2251,
      "step": 1970
    },
    {
      "epoch": 7.071428571428571,
      "grad_norm": 2.6958255767822266,
      "learning_rate": 1.626984126984127e-05,
      "loss": 0.2521,
      "step": 1980
    },
    {
      "epoch": 7.107142857142857,
      "grad_norm": 2.014803647994995,
      "learning_rate": 1.6071428571428572e-05,
      "loss": 0.216,
      "step": 1990
    },
    {
      "epoch": 7.142857142857143,
      "grad_norm": 2.1656410694122314,
      "learning_rate": 1.5873015873015872e-05,
      "loss": 0.2085,
      "step": 2000
    },
    {
      "epoch": 7.178571428571429,
      "grad_norm": 2.207980155944824,
      "learning_rate": 1.5674603174603176e-05,
      "loss": 0.263,
      "step": 2010
    },
    {
      "epoch": 7.214285714285714,
      "grad_norm": 1.978162169456482,
      "learning_rate": 1.5476190476190476e-05,
      "loss": 0.2478,
      "step": 2020
    },
    {
      "epoch": 7.25,
      "grad_norm": 1.2705848217010498,
      "learning_rate": 1.527777777777778e-05,
      "loss": 0.2338,
      "step": 2030
    },
    {
      "epoch": 7.285714285714286,
      "grad_norm": 1.6287261247634888,
      "learning_rate": 1.5079365079365079e-05,
      "loss": 0.2063,
      "step": 2040
    },
    {
      "epoch": 7.321428571428571,
      "grad_norm": 2.0518875122070312,
      "learning_rate": 1.4880952380952381e-05,
      "loss": 0.3032,
      "step": 2050
    },
    {
      "epoch": 7.357142857142857,
      "grad_norm": 2.7869536876678467,
      "learning_rate": 1.4682539682539683e-05,
      "loss": 0.2242,
      "step": 2060
    },
    {
      "epoch": 7.392857142857143,
      "grad_norm": 1.007391095161438,
      "learning_rate": 1.4484126984126987e-05,
      "loss": 0.2589,
      "step": 2070
    },
    {
      "epoch": 7.428571428571429,
      "grad_norm": 1.2812851667404175,
      "learning_rate": 1.4285714285714285e-05,
      "loss": 0.2383,
      "step": 2080
    },
    {
      "epoch": 7.464285714285714,
      "grad_norm": 1.1465330123901367,
      "learning_rate": 1.4087301587301587e-05,
      "loss": 0.2284,
      "step": 2090
    },
    {
      "epoch": 7.5,
      "grad_norm": 2.2567813396453857,
      "learning_rate": 1.388888888888889e-05,
      "loss": 0.213,
      "step": 2100
    },
    {
      "epoch": 7.535714285714286,
      "grad_norm": 1.7949641942977905,
      "learning_rate": 1.3690476190476192e-05,
      "loss": 0.2368,
      "step": 2110
    },
    {
      "epoch": 7.571428571428571,
      "grad_norm": 2.443598985671997,
      "learning_rate": 1.3492063492063492e-05,
      "loss": 0.2665,
      "step": 2120
    },
    {
      "epoch": 7.607142857142857,
      "grad_norm": 1.9699336290359497,
      "learning_rate": 1.3293650793650794e-05,
      "loss": 0.2346,
      "step": 2130
    },
    {
      "epoch": 7.642857142857143,
      "grad_norm": 1.543039083480835,
      "learning_rate": 1.3095238095238096e-05,
      "loss": 0.2432,
      "step": 2140
    },
    {
      "epoch": 7.678571428571429,
      "grad_norm": 1.9814691543579102,
      "learning_rate": 1.2896825396825398e-05,
      "loss": 0.2575,
      "step": 2150
    },
    {
      "epoch": 7.714285714285714,
      "grad_norm": 2.1088602542877197,
      "learning_rate": 1.2698412698412699e-05,
      "loss": 0.2346,
      "step": 2160
    },
    {
      "epoch": 7.75,
      "grad_norm": 1.5648256540298462,
      "learning_rate": 1.25e-05,
      "loss": 0.2403,
      "step": 2170
    },
    {
      "epoch": 7.785714285714286,
      "grad_norm": 1.6079583168029785,
      "learning_rate": 1.2301587301587301e-05,
      "loss": 0.2414,
      "step": 2180
    },
    {
      "epoch": 7.821428571428571,
      "grad_norm": 1.2859593629837036,
      "learning_rate": 1.2103174603174603e-05,
      "loss": 0.2756,
      "step": 2190
    },
    {
      "epoch": 7.857142857142857,
      "grad_norm": 2.072089672088623,
      "learning_rate": 1.1904761904761905e-05,
      "loss": 0.2484,
      "step": 2200
    },
    {
      "epoch": 7.892857142857143,
      "grad_norm": 1.623353362083435,
      "learning_rate": 1.1706349206349207e-05,
      "loss": 0.274,
      "step": 2210
    },
    {
      "epoch": 7.928571428571429,
      "grad_norm": 1.70241379737854,
      "learning_rate": 1.1507936507936508e-05,
      "loss": 0.2307,
      "step": 2220
    },
    {
      "epoch": 7.964285714285714,
      "grad_norm": 1.7186700105667114,
      "learning_rate": 1.130952380952381e-05,
      "loss": 0.2223,
      "step": 2230
    },
    {
      "epoch": 8.0,
      "grad_norm": 2.0665862560272217,
      "learning_rate": 1.1111111111111112e-05,
      "loss": 0.2274,
      "step": 2240
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.8814202769093346,
      "eval_loss": 0.2860513925552368,
      "eval_runtime": 115.9278,
      "eval_samples_per_second": 38.627,
      "eval_steps_per_second": 2.415,
      "step": 2240
    },
    {
      "epoch": 8.035714285714286,
      "grad_norm": 1.3822689056396484,
      "learning_rate": 1.0912698412698414e-05,
      "loss": 0.238,
      "step": 2250
    },
    {
      "epoch": 8.071428571428571,
      "grad_norm": 1.4226853847503662,
      "learning_rate": 1.0714285714285714e-05,
      "loss": 0.266,
      "step": 2260
    },
    {
      "epoch": 8.107142857142858,
      "grad_norm": 1.9675803184509277,
      "learning_rate": 1.0515873015873016e-05,
      "loss": 0.2814,
      "step": 2270
    },
    {
      "epoch": 8.142857142857142,
      "grad_norm": 1.8133440017700195,
      "learning_rate": 1.0317460317460318e-05,
      "loss": 0.2486,
      "step": 2280
    },
    {
      "epoch": 8.178571428571429,
      "grad_norm": 1.2323780059814453,
      "learning_rate": 1.011904761904762e-05,
      "loss": 0.2175,
      "step": 2290
    },
    {
      "epoch": 8.214285714285714,
      "grad_norm": 1.7063086032867432,
      "learning_rate": 9.92063492063492e-06,
      "loss": 0.2125,
      "step": 2300
    },
    {
      "epoch": 8.25,
      "grad_norm": 1.532769799232483,
      "learning_rate": 9.722222222222223e-06,
      "loss": 0.1813,
      "step": 2310
    },
    {
      "epoch": 8.285714285714286,
      "grad_norm": 1.5620160102844238,
      "learning_rate": 9.523809523809523e-06,
      "loss": 0.2355,
      "step": 2320
    },
    {
      "epoch": 8.321428571428571,
      "grad_norm": 1.6230847835540771,
      "learning_rate": 9.325396825396827e-06,
      "loss": 0.2199,
      "step": 2330
    },
    {
      "epoch": 8.357142857142858,
      "grad_norm": 1.552085041999817,
      "learning_rate": 9.126984126984127e-06,
      "loss": 0.2479,
      "step": 2340
    },
    {
      "epoch": 8.392857142857142,
      "grad_norm": 1.8006685972213745,
      "learning_rate": 8.92857142857143e-06,
      "loss": 0.2677,
      "step": 2350
    },
    {
      "epoch": 8.428571428571429,
      "grad_norm": 1.2057029008865356,
      "learning_rate": 8.73015873015873e-06,
      "loss": 0.2022,
      "step": 2360
    },
    {
      "epoch": 8.464285714285714,
      "grad_norm": 1.4805638790130615,
      "learning_rate": 8.531746031746032e-06,
      "loss": 0.2158,
      "step": 2370
    },
    {
      "epoch": 8.5,
      "grad_norm": 1.441603422164917,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.2166,
      "step": 2380
    },
    {
      "epoch": 8.535714285714286,
      "grad_norm": 2.171687602996826,
      "learning_rate": 8.134920634920636e-06,
      "loss": 0.2332,
      "step": 2390
    },
    {
      "epoch": 8.571428571428571,
      "grad_norm": 1.5523641109466553,
      "learning_rate": 7.936507936507936e-06,
      "loss": 0.2099,
      "step": 2400
    },
    {
      "epoch": 8.607142857142858,
      "grad_norm": 1.429527759552002,
      "learning_rate": 7.738095238095238e-06,
      "loss": 0.1934,
      "step": 2410
    },
    {
      "epoch": 8.642857142857142,
      "grad_norm": 3.8555209636688232,
      "learning_rate": 7.5396825396825394e-06,
      "loss": 0.2813,
      "step": 2420
    },
    {
      "epoch": 8.678571428571429,
      "grad_norm": 1.118416666984558,
      "learning_rate": 7.3412698412698415e-06,
      "loss": 0.2358,
      "step": 2430
    },
    {
      "epoch": 8.714285714285714,
      "grad_norm": 1.9037988185882568,
      "learning_rate": 7.142857142857143e-06,
      "loss": 0.2091,
      "step": 2440
    },
    {
      "epoch": 8.75,
      "grad_norm": 1.8768919706344604,
      "learning_rate": 6.944444444444445e-06,
      "loss": 0.2176,
      "step": 2450
    },
    {
      "epoch": 8.785714285714286,
      "grad_norm": 1.606123685836792,
      "learning_rate": 6.746031746031746e-06,
      "loss": 0.2388,
      "step": 2460
    },
    {
      "epoch": 8.821428571428571,
      "grad_norm": 2.6942107677459717,
      "learning_rate": 6.547619047619048e-06,
      "loss": 0.2182,
      "step": 2470
    },
    {
      "epoch": 8.857142857142858,
      "grad_norm": 1.449601173400879,
      "learning_rate": 6.349206349206349e-06,
      "loss": 0.2102,
      "step": 2480
    },
    {
      "epoch": 8.892857142857142,
      "grad_norm": 2.651686668395996,
      "learning_rate": 6.1507936507936505e-06,
      "loss": 0.2166,
      "step": 2490
    },
    {
      "epoch": 8.928571428571429,
      "grad_norm": 2.6743762493133545,
      "learning_rate": 5.9523809523809525e-06,
      "loss": 0.2468,
      "step": 2500
    },
    {
      "epoch": 8.964285714285714,
      "grad_norm": 2.29903244972229,
      "learning_rate": 5.753968253968254e-06,
      "loss": 0.2111,
      "step": 2510
    },
    {
      "epoch": 9.0,
      "grad_norm": 1.8269622325897217,
      "learning_rate": 5.555555555555556e-06,
      "loss": 0.2195,
      "step": 2520
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.8894595801697186,
      "eval_loss": 0.26261791586875916,
      "eval_runtime": 114.2956,
      "eval_samples_per_second": 39.179,
      "eval_steps_per_second": 2.45,
      "step": 2520
    },
    {
      "epoch": 9.035714285714286,
      "grad_norm": 3.011329174041748,
      "learning_rate": 5.357142857142857e-06,
      "loss": 0.1827,
      "step": 2530
    },
    {
      "epoch": 9.071428571428571,
      "grad_norm": 2.3060262203216553,
      "learning_rate": 5.158730158730159e-06,
      "loss": 0.1862,
      "step": 2540
    },
    {
      "epoch": 9.107142857142858,
      "grad_norm": 1.7220100164413452,
      "learning_rate": 4.96031746031746e-06,
      "loss": 0.2215,
      "step": 2550
    },
    {
      "epoch": 9.142857142857142,
      "grad_norm": 2.463092803955078,
      "learning_rate": 4.7619047619047615e-06,
      "loss": 0.2228,
      "step": 2560
    },
    {
      "epoch": 9.178571428571429,
      "grad_norm": 1.204136848449707,
      "learning_rate": 4.563492063492064e-06,
      "loss": 0.2439,
      "step": 2570
    },
    {
      "epoch": 9.214285714285714,
      "grad_norm": 2.263396978378296,
      "learning_rate": 4.365079365079365e-06,
      "loss": 0.2266,
      "step": 2580
    },
    {
      "epoch": 9.25,
      "grad_norm": 2.7832555770874023,
      "learning_rate": 4.166666666666667e-06,
      "loss": 0.2089,
      "step": 2590
    },
    {
      "epoch": 9.285714285714286,
      "grad_norm": 2.0564024448394775,
      "learning_rate": 3.968253968253968e-06,
      "loss": 0.1839,
      "step": 2600
    },
    {
      "epoch": 9.321428571428571,
      "grad_norm": 2.0316998958587646,
      "learning_rate": 3.7698412698412697e-06,
      "loss": 0.2424,
      "step": 2610
    },
    {
      "epoch": 9.357142857142858,
      "grad_norm": 2.229687213897705,
      "learning_rate": 3.5714285714285714e-06,
      "loss": 0.2489,
      "step": 2620
    },
    {
      "epoch": 9.392857142857142,
      "grad_norm": 1.7529199123382568,
      "learning_rate": 3.373015873015873e-06,
      "loss": 0.2273,
      "step": 2630
    },
    {
      "epoch": 9.428571428571429,
      "grad_norm": 1.5242239236831665,
      "learning_rate": 3.1746031746031746e-06,
      "loss": 0.2224,
      "step": 2640
    },
    {
      "epoch": 9.464285714285714,
      "grad_norm": 1.5499508380889893,
      "learning_rate": 2.9761904761904763e-06,
      "loss": 0.2338,
      "step": 2650
    },
    {
      "epoch": 9.5,
      "grad_norm": 2.5902230739593506,
      "learning_rate": 2.777777777777778e-06,
      "loss": 0.1909,
      "step": 2660
    },
    {
      "epoch": 9.535714285714286,
      "grad_norm": 1.3243242502212524,
      "learning_rate": 2.5793650793650795e-06,
      "loss": 0.2199,
      "step": 2670
    },
    {
      "epoch": 9.571428571428571,
      "grad_norm": 1.9745112657546997,
      "learning_rate": 2.3809523809523808e-06,
      "loss": 0.2172,
      "step": 2680
    },
    {
      "epoch": 9.607142857142858,
      "grad_norm": 2.10951828956604,
      "learning_rate": 2.1825396825396824e-06,
      "loss": 0.2331,
      "step": 2690
    },
    {
      "epoch": 9.642857142857142,
      "grad_norm": 2.113539457321167,
      "learning_rate": 1.984126984126984e-06,
      "loss": 0.2373,
      "step": 2700
    },
    {
      "epoch": 9.678571428571429,
      "grad_norm": 1.548854112625122,
      "learning_rate": 1.7857142857142857e-06,
      "loss": 0.2035,
      "step": 2710
    },
    {
      "epoch": 9.714285714285714,
      "grad_norm": 1.797196626663208,
      "learning_rate": 1.5873015873015873e-06,
      "loss": 0.2015,
      "step": 2720
    },
    {
      "epoch": 9.75,
      "grad_norm": 1.9279841184616089,
      "learning_rate": 1.388888888888889e-06,
      "loss": 0.2208,
      "step": 2730
    },
    {
      "epoch": 9.785714285714286,
      "grad_norm": 0.950290858745575,
      "learning_rate": 1.1904761904761904e-06,
      "loss": 0.2394,
      "step": 2740
    },
    {
      "epoch": 9.821428571428571,
      "grad_norm": 1.2573094367980957,
      "learning_rate": 9.92063492063492e-07,
      "loss": 0.2491,
      "step": 2750
    },
    {
      "epoch": 9.857142857142858,
      "grad_norm": 1.8162927627563477,
      "learning_rate": 7.936507936507937e-07,
      "loss": 0.2229,
      "step": 2760
    },
    {
      "epoch": 9.892857142857142,
      "grad_norm": 2.0845260620117188,
      "learning_rate": 5.952380952380952e-07,
      "loss": 0.1984,
      "step": 2770
    },
    {
      "epoch": 9.928571428571429,
      "grad_norm": 1.8501282930374146,
      "learning_rate": 3.9682539682539683e-07,
      "loss": 0.2287,
      "step": 2780
    },
    {
      "epoch": 9.964285714285714,
      "grad_norm": 2.941807746887207,
      "learning_rate": 1.9841269841269841e-07,
      "loss": 0.2233,
      "step": 2790
    },
    {
      "epoch": 10.0,
      "grad_norm": 1.2508630752563477,
      "learning_rate": 0.0,
      "loss": 0.1886,
      "step": 2800
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.8865564984368022,
      "eval_loss": 0.2717145085334778,
      "eval_runtime": 113.959,
      "eval_samples_per_second": 39.295,
      "eval_steps_per_second": 2.457,
      "step": 2800
    },
    {
      "epoch": 10.0,
      "step": 2800,
      "total_flos": 1.3877265500181135e+19,
      "train_loss": 0.3087472263830049,
      "train_runtime": 6534.1253,
      "train_samples_per_second": 27.407,
      "train_steps_per_second": 0.429
    }
  ],
  "logging_steps": 10,
  "max_steps": 2800,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3877265500181135e+19,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}