|
{
  "best_metric": 3.214390277862549,
  "best_model_checkpoint": "./gpt2_bias_model_mps/checkpoint-975",
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 975,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.010256410256410256,
      "grad_norm": 15.915234565734863,
      "learning_rate": 4.948717948717949e-05,
      "loss": 4.1387,
      "step": 10
    },
    {
      "epoch": 0.020512820512820513,
      "grad_norm": 14.396087646484375,
      "learning_rate": 4.8974358974358975e-05,
      "loss": 4.0441,
      "step": 20
    },
    {
      "epoch": 0.03076923076923077,
      "grad_norm": 15.571473121643066,
      "learning_rate": 4.846153846153846e-05,
      "loss": 3.9314,
      "step": 30
    },
    {
      "epoch": 0.041025641025641026,
      "grad_norm": 14.709969520568848,
      "learning_rate": 4.7948717948717955e-05,
      "loss": 3.8778,
      "step": 40
    },
    {
      "epoch": 0.05128205128205128,
      "grad_norm": 13.663440704345703,
      "learning_rate": 4.7435897435897435e-05,
      "loss": 3.6624,
      "step": 50
    },
    {
      "epoch": 0.06153846153846154,
      "grad_norm": 13.031949996948242,
      "learning_rate": 4.692307692307693e-05,
      "loss": 3.7466,
      "step": 60
    },
    {
      "epoch": 0.07179487179487179,
      "grad_norm": 13.534045219421387,
      "learning_rate": 4.6410256410256415e-05,
      "loss": 3.886,
      "step": 70
    },
    {
      "epoch": 0.08205128205128205,
      "grad_norm": 13.990560531616211,
      "learning_rate": 4.5897435897435895e-05,
      "loss": 3.8182,
      "step": 80
    },
    {
      "epoch": 0.09230769230769231,
      "grad_norm": 14.367162704467773,
      "learning_rate": 4.538461538461539e-05,
      "loss": 3.6856,
      "step": 90
    },
    {
      "epoch": 0.10256410256410256,
      "grad_norm": 12.511638641357422,
      "learning_rate": 4.4871794871794874e-05,
      "loss": 3.7046,
      "step": 100
    },
    {
      "epoch": 0.11282051282051282,
      "grad_norm": 15.438050270080566,
      "learning_rate": 4.435897435897436e-05,
      "loss": 3.7047,
      "step": 110
    },
    {
      "epoch": 0.12307692307692308,
      "grad_norm": 12.294541358947754,
      "learning_rate": 4.384615384615385e-05,
      "loss": 3.787,
      "step": 120
    },
    {
      "epoch": 0.13333333333333333,
      "grad_norm": 14.492098808288574,
      "learning_rate": 4.3333333333333334e-05,
      "loss": 3.688,
      "step": 130
    },
    {
      "epoch": 0.14358974358974358,
      "grad_norm": 12.25203800201416,
      "learning_rate": 4.282051282051282e-05,
      "loss": 3.5417,
      "step": 140
    },
    {
      "epoch": 0.15384615384615385,
      "grad_norm": 13.325423240661621,
      "learning_rate": 4.230769230769231e-05,
      "loss": 3.5546,
      "step": 150
    },
    {
      "epoch": 0.1641025641025641,
      "grad_norm": 12.688371658325195,
      "learning_rate": 4.17948717948718e-05,
      "loss": 3.6785,
      "step": 160
    },
    {
      "epoch": 0.17435897435897435,
      "grad_norm": 13.553004264831543,
      "learning_rate": 4.128205128205128e-05,
      "loss": 3.575,
      "step": 170
    },
    {
      "epoch": 0.18461538461538463,
      "grad_norm": 12.279559135437012,
      "learning_rate": 4.0769230769230773e-05,
      "loss": 3.643,
      "step": 180
    },
    {
      "epoch": 0.19487179487179487,
      "grad_norm": 11.203484535217285,
      "learning_rate": 4.025641025641026e-05,
      "loss": 3.5966,
      "step": 190
    },
    {
      "epoch": 0.20512820512820512,
      "grad_norm": 11.550701141357422,
      "learning_rate": 3.974358974358974e-05,
      "loss": 3.6116,
      "step": 200
    },
    {
      "epoch": 0.2153846153846154,
      "grad_norm": 12.691393852233887,
      "learning_rate": 3.923076923076923e-05,
      "loss": 3.452,
      "step": 210
    },
    {
      "epoch": 0.22564102564102564,
      "grad_norm": 11.071481704711914,
      "learning_rate": 3.871794871794872e-05,
      "loss": 3.5236,
      "step": 220
    },
    {
      "epoch": 0.2358974358974359,
      "grad_norm": 12.220307350158691,
      "learning_rate": 3.8205128205128206e-05,
      "loss": 3.6149,
      "step": 230
    },
    {
      "epoch": 0.24615384615384617,
      "grad_norm": 12.471100807189941,
      "learning_rate": 3.769230769230769e-05,
      "loss": 3.6393,
      "step": 240
    },
    {
      "epoch": 0.2564102564102564,
      "grad_norm": 10.304001808166504,
      "learning_rate": 3.717948717948718e-05,
      "loss": 3.4771,
      "step": 250
    },
    {
      "epoch": 0.26666666666666666,
      "grad_norm": 10.889674186706543,
      "learning_rate": 3.6666666666666666e-05,
      "loss": 3.5182,
      "step": 260
    },
    {
      "epoch": 0.27692307692307694,
      "grad_norm": 10.550483703613281,
      "learning_rate": 3.615384615384615e-05,
      "loss": 3.3559,
      "step": 270
    },
    {
      "epoch": 0.28717948717948716,
      "grad_norm": 10.34374713897705,
      "learning_rate": 3.5641025641025646e-05,
      "loss": 3.4854,
      "step": 280
    },
    {
      "epoch": 0.29743589743589743,
      "grad_norm": 11.178277969360352,
      "learning_rate": 3.5128205128205125e-05,
      "loss": 3.5015,
      "step": 290
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 10.405166625976562,
      "learning_rate": 3.461538461538462e-05,
      "loss": 3.5436,
      "step": 300
    },
    {
      "epoch": 0.31794871794871793,
      "grad_norm": 10.547426223754883,
      "learning_rate": 3.4102564102564105e-05,
      "loss": 3.4588,
      "step": 310
    },
    {
      "epoch": 0.3282051282051282,
      "grad_norm": 11.255645751953125,
      "learning_rate": 3.358974358974359e-05,
      "loss": 3.4427,
      "step": 320
    },
    {
      "epoch": 0.3384615384615385,
      "grad_norm": 10.89976692199707,
      "learning_rate": 3.307692307692308e-05,
      "loss": 3.4668,
      "step": 330
    },
    {
      "epoch": 0.3487179487179487,
      "grad_norm": 10.082490921020508,
      "learning_rate": 3.2564102564102565e-05,
      "loss": 3.5122,
      "step": 340
    },
    {
      "epoch": 0.358974358974359,
      "grad_norm": 9.010897636413574,
      "learning_rate": 3.205128205128206e-05,
      "loss": 3.4547,
      "step": 350
    },
    {
      "epoch": 0.36923076923076925,
      "grad_norm": 10.768731117248535,
      "learning_rate": 3.153846153846154e-05,
      "loss": 3.4541,
      "step": 360
    },
    {
      "epoch": 0.37948717948717947,
      "grad_norm": 10.812358856201172,
      "learning_rate": 3.102564102564103e-05,
      "loss": 3.3868,
      "step": 370
    },
    {
      "epoch": 0.38974358974358975,
      "grad_norm": 10.063950538635254,
      "learning_rate": 3.0512820512820518e-05,
      "loss": 3.5424,
      "step": 380
    },
    {
      "epoch": 0.4,
      "grad_norm": 9.603002548217773,
      "learning_rate": 3e-05,
      "loss": 3.3761,
      "step": 390
    },
    {
      "epoch": 0.41025641025641024,
      "grad_norm": 9.461718559265137,
      "learning_rate": 2.948717948717949e-05,
      "loss": 3.423,
      "step": 400
    },
    {
      "epoch": 0.4205128205128205,
      "grad_norm": 9.013967514038086,
      "learning_rate": 2.8974358974358977e-05,
      "loss": 3.421,
      "step": 410
    },
    {
      "epoch": 0.4307692307692308,
      "grad_norm": 10.10628604888916,
      "learning_rate": 2.846153846153846e-05,
      "loss": 3.4225,
      "step": 420
    },
    {
      "epoch": 0.441025641025641,
      "grad_norm": 9.19229507446289,
      "learning_rate": 2.794871794871795e-05,
      "loss": 3.4973,
      "step": 430
    },
    {
      "epoch": 0.4512820512820513,
      "grad_norm": 9.309951782226562,
      "learning_rate": 2.743589743589744e-05,
      "loss": 3.4458,
      "step": 440
    },
    {
      "epoch": 0.46153846153846156,
      "grad_norm": 8.606375694274902,
      "learning_rate": 2.6923076923076923e-05,
      "loss": 3.4163,
      "step": 450
    },
    {
      "epoch": 0.4717948717948718,
      "grad_norm": 9.342279434204102,
      "learning_rate": 2.6410256410256413e-05,
      "loss": 3.4543,
      "step": 460
    },
    {
      "epoch": 0.48205128205128206,
      "grad_norm": 10.326979637145996,
      "learning_rate": 2.58974358974359e-05,
      "loss": 3.4219,
      "step": 470
    },
    {
      "epoch": 0.49230769230769234,
      "grad_norm": 9.113346099853516,
      "learning_rate": 2.5384615384615383e-05,
      "loss": 3.4182,
      "step": 480
    },
    {
      "epoch": 0.5025641025641026,
      "grad_norm": 9.695829391479492,
      "learning_rate": 2.4871794871794873e-05,
      "loss": 3.4622,
      "step": 490
    },
    {
      "epoch": 0.5128205128205128,
      "grad_norm": 10.052870750427246,
      "learning_rate": 2.435897435897436e-05,
      "loss": 3.392,
      "step": 500
    },
    {
      "epoch": 0.5230769230769231,
      "grad_norm": 7.836452960968018,
      "learning_rate": 2.384615384615385e-05,
      "loss": 3.4489,
      "step": 510
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 10.3803129196167,
      "learning_rate": 2.3333333333333336e-05,
      "loss": 3.4643,
      "step": 520
    },
    {
      "epoch": 0.5435897435897435,
      "grad_norm": 9.079364776611328,
      "learning_rate": 2.2820512820512822e-05,
      "loss": 3.4824,
      "step": 530
    },
    {
      "epoch": 0.5538461538461539,
      "grad_norm": 9.125578880310059,
      "learning_rate": 2.230769230769231e-05,
      "loss": 3.4399,
      "step": 540
    },
    {
      "epoch": 0.5641025641025641,
      "grad_norm": 8.740405082702637,
      "learning_rate": 2.1794871794871795e-05,
      "loss": 3.4194,
      "step": 550
    },
    {
      "epoch": 0.5743589743589743,
      "grad_norm": 8.519855499267578,
      "learning_rate": 2.1282051282051282e-05,
      "loss": 3.4603,
      "step": 560
    },
    {
      "epoch": 0.5846153846153846,
      "grad_norm": 8.29417610168457,
      "learning_rate": 2.0769230769230772e-05,
      "loss": 3.4312,
      "step": 570
    },
    {
      "epoch": 0.5948717948717949,
      "grad_norm": 8.083531379699707,
      "learning_rate": 2.025641025641026e-05,
      "loss": 3.3395,
      "step": 580
    },
    {
      "epoch": 0.6051282051282051,
      "grad_norm": 7.79611873626709,
      "learning_rate": 1.9743589743589745e-05,
      "loss": 3.319,
      "step": 590
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 9.557352066040039,
      "learning_rate": 1.923076923076923e-05,
      "loss": 3.5313,
      "step": 600
    },
    {
      "epoch": 0.6256410256410256,
      "grad_norm": 8.805798530578613,
      "learning_rate": 1.8717948717948718e-05,
      "loss": 3.4618,
      "step": 610
    },
    {
      "epoch": 0.6358974358974359,
      "grad_norm": 8.321375846862793,
      "learning_rate": 1.8205128205128204e-05,
      "loss": 3.4288,
      "step": 620
    },
    {
      "epoch": 0.6461538461538462,
      "grad_norm": 7.985848903656006,
      "learning_rate": 1.7692307692307694e-05,
      "loss": 3.4127,
      "step": 630
    },
    {
      "epoch": 0.6564102564102564,
      "grad_norm": 8.117752075195312,
      "learning_rate": 1.717948717948718e-05,
      "loss": 3.3404,
      "step": 640
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 9.96267318725586,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 3.3629,
      "step": 650
    },
    {
      "epoch": 0.676923076923077,
      "grad_norm": 7.83555793762207,
      "learning_rate": 1.6153846153846154e-05,
      "loss": 3.4605,
      "step": 660
    },
    {
      "epoch": 0.6871794871794872,
      "grad_norm": 8.735118865966797,
      "learning_rate": 1.564102564102564e-05,
      "loss": 3.4665,
      "step": 670
    },
    {
      "epoch": 0.6974358974358974,
      "grad_norm": 8.65040397644043,
      "learning_rate": 1.5128205128205129e-05,
      "loss": 3.4698,
      "step": 680
    },
    {
      "epoch": 0.7076923076923077,
      "grad_norm": 6.974828243255615,
      "learning_rate": 1.4615384615384617e-05,
      "loss": 3.4783,
      "step": 690
    },
    {
      "epoch": 0.717948717948718,
      "grad_norm": 8.379024505615234,
      "learning_rate": 1.4102564102564104e-05,
      "loss": 3.3683,
      "step": 700
    },
    {
      "epoch": 0.7282051282051282,
      "grad_norm": 8.352046012878418,
      "learning_rate": 1.358974358974359e-05,
      "loss": 3.4313,
      "step": 710
    },
    {
      "epoch": 0.7384615384615385,
      "grad_norm": 7.566000461578369,
      "learning_rate": 1.3076923076923078e-05,
      "loss": 3.3775,
      "step": 720
    },
    {
      "epoch": 0.7487179487179487,
      "grad_norm": 7.003971099853516,
      "learning_rate": 1.2564102564102565e-05,
      "loss": 3.2418,
      "step": 730
    },
    {
      "epoch": 0.7589743589743589,
      "grad_norm": 7.048854351043701,
      "learning_rate": 1.2051282051282051e-05,
      "loss": 3.3531,
      "step": 740
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 7.653282642364502,
      "learning_rate": 1.153846153846154e-05,
      "loss": 3.4099,
      "step": 750
    },
    {
      "epoch": 0.7794871794871795,
      "grad_norm": 7.643819808959961,
      "learning_rate": 1.1025641025641026e-05,
      "loss": 3.3944,
      "step": 760
    },
    {
      "epoch": 0.7897435897435897,
      "grad_norm": 8.362899780273438,
      "learning_rate": 1.0512820512820514e-05,
      "loss": 3.411,
      "step": 770
    },
    {
      "epoch": 0.8,
      "grad_norm": 7.380786895751953,
      "learning_rate": 1e-05,
      "loss": 3.4135,
      "step": 780
    },
    {
      "epoch": 0.8102564102564103,
      "grad_norm": 8.83707332611084,
      "learning_rate": 9.487179487179487e-06,
      "loss": 3.2312,
      "step": 790
    },
    {
      "epoch": 0.8205128205128205,
      "grad_norm": 7.70910120010376,
      "learning_rate": 8.974358974358976e-06,
      "loss": 3.4221,
      "step": 800
    },
    {
      "epoch": 0.8307692307692308,
      "grad_norm": 7.079320430755615,
      "learning_rate": 8.461538461538462e-06,
      "loss": 3.338,
      "step": 810
    },
    {
      "epoch": 0.841025641025641,
      "grad_norm": 8.756903648376465,
      "learning_rate": 7.948717948717949e-06,
      "loss": 3.3683,
      "step": 820
    },
    {
      "epoch": 0.8512820512820513,
      "grad_norm": 8.349058151245117,
      "learning_rate": 7.435897435897436e-06,
      "loss": 3.4499,
      "step": 830
    },
    {
      "epoch": 0.8615384615384616,
      "grad_norm": 8.497963905334473,
      "learning_rate": 6.923076923076923e-06,
      "loss": 3.4448,
      "step": 840
    },
    {
      "epoch": 0.8717948717948718,
      "grad_norm": 8.383698463439941,
      "learning_rate": 6.41025641025641e-06,
      "loss": 3.2929,
      "step": 850
    },
    {
      "epoch": 0.882051282051282,
      "grad_norm": 8.016318321228027,
      "learning_rate": 5.897435897435897e-06,
      "loss": 3.4073,
      "step": 860
    },
    {
      "epoch": 0.8923076923076924,
      "grad_norm": 9.301827430725098,
      "learning_rate": 5.3846153846153855e-06,
      "loss": 3.4206,
      "step": 870
    },
    {
      "epoch": 0.9025641025641026,
      "grad_norm": 7.227042198181152,
      "learning_rate": 4.871794871794872e-06,
      "loss": 3.3388,
      "step": 880
    },
    {
      "epoch": 0.9128205128205128,
      "grad_norm": 6.956933975219727,
      "learning_rate": 4.3589743589743586e-06,
      "loss": 3.2773,
      "step": 890
    },
    {
      "epoch": 0.9230769230769231,
      "grad_norm": 8.344608306884766,
      "learning_rate": 3.846153846153847e-06,
      "loss": 3.2864,
      "step": 900
    },
    {
      "epoch": 0.9333333333333333,
      "grad_norm": 7.429567813873291,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 3.3606,
      "step": 910
    },
    {
      "epoch": 0.9435897435897436,
      "grad_norm": 7.6066389083862305,
      "learning_rate": 2.8205128205128207e-06,
      "loss": 3.3459,
      "step": 920
    },
    {
      "epoch": 0.9538461538461539,
      "grad_norm": 7.413199424743652,
      "learning_rate": 2.307692307692308e-06,
      "loss": 3.3968,
      "step": 930
    },
    {
      "epoch": 0.9641025641025641,
      "grad_norm": 6.853078365325928,
      "learning_rate": 1.7948717948717948e-06,
      "loss": 3.3706,
      "step": 940
    },
    {
      "epoch": 0.9743589743589743,
      "grad_norm": 6.604455947875977,
      "learning_rate": 1.282051282051282e-06,
      "loss": 3.3048,
      "step": 950
    },
    {
      "epoch": 0.9846153846153847,
      "grad_norm": 8.022303581237793,
      "learning_rate": 7.692307692307694e-07,
      "loss": 3.3028,
      "step": 960
    },
    {
      "epoch": 0.9948717948717949,
      "grad_norm": 8.982901573181152,
      "learning_rate": 2.564102564102564e-07,
      "loss": 3.3398,
      "step": 970
    },
    {
      "epoch": 1.0,
      "eval_loss": 3.214390277862549,
      "eval_runtime": 39.3331,
      "eval_samples_per_second": 99.306,
      "eval_steps_per_second": 6.229,
      "step": 975
    }
  ],
  "logging_steps": 10,
  "max_steps": 975,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1018842955776000.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}
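
This is the trainer_state.json that the Hugging Face transformers Trainer writes alongside each saved checkpoint: each log_history entry records the running training loss, gradient norm, and linearly decaying learning rate every logging_steps (10) optimizer steps, and the final entry holds the end-of-epoch evaluation whose eval_loss is what best_metric and best_model_checkpoint point at. Below is a minimal sketch of how one might inspect the log; the file path is an assumption derived from best_model_checkpoint above, and matplotlib is used only for the plot.

import json
import matplotlib.pyplot as plt

# Assumed location: trainer_state.json sits inside the checkpoint directory
# named by "best_model_checkpoint" above. Adjust the path for your own run.
STATE_PATH = "./gpt2_bias_model_mps/checkpoint-975/trainer_state.json"

with open(STATE_PATH) as f:
    state = json.load(f)

# Training entries carry a "loss" key; evaluation entries carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

plt.plot([e["step"] for e in train_logs],
         [e["loss"] for e in train_logs],
         label="training loss")
plt.scatter([e["step"] for e in eval_logs],
            [e["eval_loss"] for e in eval_logs],
            color="red", zorder=3, label="eval loss")
plt.xlabel("global step")
plt.ylabel("loss")
plt.legend()
plt.show()

For this run the plot would show the training loss falling from about 4.14 to the 3.3 range across 975 steps, with a single evaluation point at step 975 whose eval_loss (3.2144) matches best_metric, which is why checkpoint-975 is marked as the best checkpoint.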
|
|