{ "best_metric": 3.214390277862549, "best_model_checkpoint": "./gpt2_bias_model_mps/checkpoint-975", "epoch": 1.0, "eval_steps": 500, "global_step": 975, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010256410256410256, "grad_norm": 15.915234565734863, "learning_rate": 4.948717948717949e-05, "loss": 4.1387, "step": 10 }, { "epoch": 0.020512820512820513, "grad_norm": 14.396087646484375, "learning_rate": 4.8974358974358975e-05, "loss": 4.0441, "step": 20 }, { "epoch": 0.03076923076923077, "grad_norm": 15.571473121643066, "learning_rate": 4.846153846153846e-05, "loss": 3.9314, "step": 30 }, { "epoch": 0.041025641025641026, "grad_norm": 14.709969520568848, "learning_rate": 4.7948717948717955e-05, "loss": 3.8778, "step": 40 }, { "epoch": 0.05128205128205128, "grad_norm": 13.663440704345703, "learning_rate": 4.7435897435897435e-05, "loss": 3.6624, "step": 50 }, { "epoch": 0.06153846153846154, "grad_norm": 13.031949996948242, "learning_rate": 4.692307692307693e-05, "loss": 3.7466, "step": 60 }, { "epoch": 0.07179487179487179, "grad_norm": 13.534045219421387, "learning_rate": 4.6410256410256415e-05, "loss": 3.886, "step": 70 }, { "epoch": 0.08205128205128205, "grad_norm": 13.990560531616211, "learning_rate": 4.5897435897435895e-05, "loss": 3.8182, "step": 80 }, { "epoch": 0.09230769230769231, "grad_norm": 14.367162704467773, "learning_rate": 4.538461538461539e-05, "loss": 3.6856, "step": 90 }, { "epoch": 0.10256410256410256, "grad_norm": 12.511638641357422, "learning_rate": 4.4871794871794874e-05, "loss": 3.7046, "step": 100 }, { "epoch": 0.11282051282051282, "grad_norm": 15.438050270080566, "learning_rate": 4.435897435897436e-05, "loss": 3.7047, "step": 110 }, { "epoch": 0.12307692307692308, "grad_norm": 12.294541358947754, "learning_rate": 4.384615384615385e-05, "loss": 3.787, "step": 120 }, { "epoch": 0.13333333333333333, "grad_norm": 14.492098808288574, "learning_rate": 4.3333333333333334e-05, "loss": 3.688, "step": 130 }, { "epoch": 0.14358974358974358, "grad_norm": 12.25203800201416, "learning_rate": 4.282051282051282e-05, "loss": 3.5417, "step": 140 }, { "epoch": 0.15384615384615385, "grad_norm": 13.325423240661621, "learning_rate": 4.230769230769231e-05, "loss": 3.5546, "step": 150 }, { "epoch": 0.1641025641025641, "grad_norm": 12.688371658325195, "learning_rate": 4.17948717948718e-05, "loss": 3.6785, "step": 160 }, { "epoch": 0.17435897435897435, "grad_norm": 13.553004264831543, "learning_rate": 4.128205128205128e-05, "loss": 3.575, "step": 170 }, { "epoch": 0.18461538461538463, "grad_norm": 12.279559135437012, "learning_rate": 4.0769230769230773e-05, "loss": 3.643, "step": 180 }, { "epoch": 0.19487179487179487, "grad_norm": 11.203484535217285, "learning_rate": 4.025641025641026e-05, "loss": 3.5966, "step": 190 }, { "epoch": 0.20512820512820512, "grad_norm": 11.550701141357422, "learning_rate": 3.974358974358974e-05, "loss": 3.6116, "step": 200 }, { "epoch": 0.2153846153846154, "grad_norm": 12.691393852233887, "learning_rate": 3.923076923076923e-05, "loss": 3.452, "step": 210 }, { "epoch": 0.22564102564102564, "grad_norm": 11.071481704711914, "learning_rate": 3.871794871794872e-05, "loss": 3.5236, "step": 220 }, { "epoch": 0.2358974358974359, "grad_norm": 12.220307350158691, "learning_rate": 3.8205128205128206e-05, "loss": 3.6149, "step": 230 }, { "epoch": 0.24615384615384617, "grad_norm": 12.471100807189941, "learning_rate": 3.769230769230769e-05, "loss": 3.6393, "step": 240 }, { "epoch": 0.2564102564102564, "grad_norm": 10.304001808166504, "learning_rate": 3.717948717948718e-05, "loss": 3.4771, "step": 250 }, { "epoch": 0.26666666666666666, "grad_norm": 10.889674186706543, "learning_rate": 3.6666666666666666e-05, "loss": 3.5182, "step": 260 }, { "epoch": 0.27692307692307694, "grad_norm": 10.550483703613281, "learning_rate": 3.615384615384615e-05, "loss": 3.3559, "step": 270 }, { "epoch": 0.28717948717948716, "grad_norm": 10.34374713897705, "learning_rate": 3.5641025641025646e-05, "loss": 3.4854, "step": 280 }, { "epoch": 0.29743589743589743, "grad_norm": 11.178277969360352, "learning_rate": 3.5128205128205125e-05, "loss": 3.5015, "step": 290 }, { "epoch": 0.3076923076923077, "grad_norm": 10.405166625976562, "learning_rate": 3.461538461538462e-05, "loss": 3.5436, "step": 300 }, { "epoch": 0.31794871794871793, "grad_norm": 10.547426223754883, "learning_rate": 3.4102564102564105e-05, "loss": 3.4588, "step": 310 }, { "epoch": 0.3282051282051282, "grad_norm": 11.255645751953125, "learning_rate": 3.358974358974359e-05, "loss": 3.4427, "step": 320 }, { "epoch": 0.3384615384615385, "grad_norm": 10.89976692199707, "learning_rate": 3.307692307692308e-05, "loss": 3.4668, "step": 330 }, { "epoch": 0.3487179487179487, "grad_norm": 10.082490921020508, "learning_rate": 3.2564102564102565e-05, "loss": 3.5122, "step": 340 }, { "epoch": 0.358974358974359, "grad_norm": 9.010897636413574, "learning_rate": 3.205128205128206e-05, "loss": 3.4547, "step": 350 }, { "epoch": 0.36923076923076925, "grad_norm": 10.768731117248535, "learning_rate": 3.153846153846154e-05, "loss": 3.4541, "step": 360 }, { "epoch": 0.37948717948717947, "grad_norm": 10.812358856201172, "learning_rate": 3.102564102564103e-05, "loss": 3.3868, "step": 370 }, { "epoch": 0.38974358974358975, "grad_norm": 10.063950538635254, "learning_rate": 3.0512820512820518e-05, "loss": 3.5424, "step": 380 }, { "epoch": 0.4, "grad_norm": 9.603002548217773, "learning_rate": 3e-05, "loss": 3.3761, "step": 390 }, { "epoch": 0.41025641025641024, "grad_norm": 9.461718559265137, "learning_rate": 2.948717948717949e-05, "loss": 3.423, "step": 400 }, { "epoch": 0.4205128205128205, "grad_norm": 9.013967514038086, "learning_rate": 2.8974358974358977e-05, "loss": 3.421, "step": 410 }, { "epoch": 0.4307692307692308, "grad_norm": 10.10628604888916, "learning_rate": 2.846153846153846e-05, "loss": 3.4225, "step": 420 }, { "epoch": 0.441025641025641, "grad_norm": 9.19229507446289, "learning_rate": 2.794871794871795e-05, "loss": 3.4973, "step": 430 }, { "epoch": 0.4512820512820513, "grad_norm": 9.309951782226562, "learning_rate": 2.743589743589744e-05, "loss": 3.4458, "step": 440 }, { "epoch": 0.46153846153846156, "grad_norm": 8.606375694274902, "learning_rate": 2.6923076923076923e-05, "loss": 3.4163, "step": 450 }, { "epoch": 0.4717948717948718, "grad_norm": 9.342279434204102, "learning_rate": 2.6410256410256413e-05, "loss": 3.4543, "step": 460 }, { "epoch": 0.48205128205128206, "grad_norm": 10.326979637145996, "learning_rate": 2.58974358974359e-05, "loss": 3.4219, "step": 470 }, { "epoch": 0.49230769230769234, "grad_norm": 9.113346099853516, "learning_rate": 2.5384615384615383e-05, "loss": 3.4182, "step": 480 }, { "epoch": 0.5025641025641026, "grad_norm": 9.695829391479492, "learning_rate": 2.4871794871794873e-05, "loss": 3.4622, "step": 490 }, { "epoch": 0.5128205128205128, "grad_norm": 10.052870750427246, "learning_rate": 2.435897435897436e-05, "loss": 3.392, "step": 500 }, { "epoch": 0.5230769230769231, "grad_norm": 7.836452960968018, "learning_rate": 2.384615384615385e-05, "loss": 3.4489, "step": 510 }, { "epoch": 0.5333333333333333, "grad_norm": 10.3803129196167, "learning_rate": 2.3333333333333336e-05, "loss": 3.4643, "step": 520 }, { "epoch": 0.5435897435897435, "grad_norm": 9.079364776611328, "learning_rate": 2.2820512820512822e-05, "loss": 3.4824, "step": 530 }, { "epoch": 0.5538461538461539, "grad_norm": 9.125578880310059, "learning_rate": 2.230769230769231e-05, "loss": 3.4399, "step": 540 }, { "epoch": 0.5641025641025641, "grad_norm": 8.740405082702637, "learning_rate": 2.1794871794871795e-05, "loss": 3.4194, "step": 550 }, { "epoch": 0.5743589743589743, "grad_norm": 8.519855499267578, "learning_rate": 2.1282051282051282e-05, "loss": 3.4603, "step": 560 }, { "epoch": 0.5846153846153846, "grad_norm": 8.29417610168457, "learning_rate": 2.0769230769230772e-05, "loss": 3.4312, "step": 570 }, { "epoch": 0.5948717948717949, "grad_norm": 8.083531379699707, "learning_rate": 2.025641025641026e-05, "loss": 3.3395, "step": 580 }, { "epoch": 0.6051282051282051, "grad_norm": 7.79611873626709, "learning_rate": 1.9743589743589745e-05, "loss": 3.319, "step": 590 }, { "epoch": 0.6153846153846154, "grad_norm": 9.557352066040039, "learning_rate": 1.923076923076923e-05, "loss": 3.5313, "step": 600 }, { "epoch": 0.6256410256410256, "grad_norm": 8.805798530578613, "learning_rate": 1.8717948717948718e-05, "loss": 3.4618, "step": 610 }, { "epoch": 0.6358974358974359, "grad_norm": 8.321375846862793, "learning_rate": 1.8205128205128204e-05, "loss": 3.4288, "step": 620 }, { "epoch": 0.6461538461538462, "grad_norm": 7.985848903656006, "learning_rate": 1.7692307692307694e-05, "loss": 3.4127, "step": 630 }, { "epoch": 0.6564102564102564, "grad_norm": 8.117752075195312, "learning_rate": 1.717948717948718e-05, "loss": 3.3404, "step": 640 }, { "epoch": 0.6666666666666666, "grad_norm": 9.96267318725586, "learning_rate": 1.6666666666666667e-05, "loss": 3.3629, "step": 650 }, { "epoch": 0.676923076923077, "grad_norm": 7.83555793762207, "learning_rate": 1.6153846153846154e-05, "loss": 3.4605, "step": 660 }, { "epoch": 0.6871794871794872, "grad_norm": 8.735118865966797, "learning_rate": 1.564102564102564e-05, "loss": 3.4665, "step": 670 }, { "epoch": 0.6974358974358974, "grad_norm": 8.65040397644043, "learning_rate": 1.5128205128205129e-05, "loss": 3.4698, "step": 680 }, { "epoch": 0.7076923076923077, "grad_norm": 6.974828243255615, "learning_rate": 1.4615384615384617e-05, "loss": 3.4783, "step": 690 }, { "epoch": 0.717948717948718, "grad_norm": 8.379024505615234, "learning_rate": 1.4102564102564104e-05, "loss": 3.3683, "step": 700 }, { "epoch": 0.7282051282051282, "grad_norm": 8.352046012878418, "learning_rate": 1.358974358974359e-05, "loss": 3.4313, "step": 710 }, { "epoch": 0.7384615384615385, "grad_norm": 7.566000461578369, "learning_rate": 1.3076923076923078e-05, "loss": 3.3775, "step": 720 }, { "epoch": 0.7487179487179487, "grad_norm": 7.003971099853516, "learning_rate": 1.2564102564102565e-05, "loss": 3.2418, "step": 730 }, { "epoch": 0.7589743589743589, "grad_norm": 7.048854351043701, "learning_rate": 1.2051282051282051e-05, "loss": 3.3531, "step": 740 }, { "epoch": 0.7692307692307693, "grad_norm": 7.653282642364502, "learning_rate": 1.153846153846154e-05, "loss": 3.4099, "step": 750 }, { "epoch": 0.7794871794871795, "grad_norm": 7.643819808959961, "learning_rate": 1.1025641025641026e-05, "loss": 3.3944, "step": 760 }, { "epoch": 0.7897435897435897, "grad_norm": 8.362899780273438, "learning_rate": 1.0512820512820514e-05, "loss": 3.411, "step": 770 }, { "epoch": 0.8, "grad_norm": 7.380786895751953, "learning_rate": 1e-05, "loss": 3.4135, "step": 780 }, { "epoch": 0.8102564102564103, "grad_norm": 8.83707332611084, "learning_rate": 9.487179487179487e-06, "loss": 3.2312, "step": 790 }, { "epoch": 0.8205128205128205, "grad_norm": 7.70910120010376, "learning_rate": 8.974358974358976e-06, "loss": 3.4221, "step": 800 }, { "epoch": 0.8307692307692308, "grad_norm": 7.079320430755615, "learning_rate": 8.461538461538462e-06, "loss": 3.338, "step": 810 }, { "epoch": 0.841025641025641, "grad_norm": 8.756903648376465, "learning_rate": 7.948717948717949e-06, "loss": 3.3683, "step": 820 }, { "epoch": 0.8512820512820513, "grad_norm": 8.349058151245117, "learning_rate": 7.435897435897436e-06, "loss": 3.4499, "step": 830 }, { "epoch": 0.8615384615384616, "grad_norm": 8.497963905334473, "learning_rate": 6.923076923076923e-06, "loss": 3.4448, "step": 840 }, { "epoch": 0.8717948717948718, "grad_norm": 8.383698463439941, "learning_rate": 6.41025641025641e-06, "loss": 3.2929, "step": 850 }, { "epoch": 0.882051282051282, "grad_norm": 8.016318321228027, "learning_rate": 5.897435897435897e-06, "loss": 3.4073, "step": 860 }, { "epoch": 0.8923076923076924, "grad_norm": 9.301827430725098, "learning_rate": 5.3846153846153855e-06, "loss": 3.4206, "step": 870 }, { "epoch": 0.9025641025641026, "grad_norm": 7.227042198181152, "learning_rate": 4.871794871794872e-06, "loss": 3.3388, "step": 880 }, { "epoch": 0.9128205128205128, "grad_norm": 6.956933975219727, "learning_rate": 4.3589743589743586e-06, "loss": 3.2773, "step": 890 }, { "epoch": 0.9230769230769231, "grad_norm": 8.344608306884766, "learning_rate": 3.846153846153847e-06, "loss": 3.2864, "step": 900 }, { "epoch": 0.9333333333333333, "grad_norm": 7.429567813873291, "learning_rate": 3.3333333333333333e-06, "loss": 3.3606, "step": 910 }, { "epoch": 0.9435897435897436, "grad_norm": 7.6066389083862305, "learning_rate": 2.8205128205128207e-06, "loss": 3.3459, "step": 920 }, { "epoch": 0.9538461538461539, "grad_norm": 7.413199424743652, "learning_rate": 2.307692307692308e-06, "loss": 3.3968, "step": 930 }, { "epoch": 0.9641025641025641, "grad_norm": 6.853078365325928, "learning_rate": 1.7948717948717948e-06, "loss": 3.3706, "step": 940 }, { "epoch": 0.9743589743589743, "grad_norm": 6.604455947875977, "learning_rate": 1.282051282051282e-06, "loss": 3.3048, "step": 950 }, { "epoch": 0.9846153846153847, "grad_norm": 8.022303581237793, "learning_rate": 7.692307692307694e-07, "loss": 3.3028, "step": 960 }, { "epoch": 0.9948717948717949, "grad_norm": 8.982901573181152, "learning_rate": 2.564102564102564e-07, "loss": 3.3398, "step": 970 }, { "epoch": 1.0, "eval_loss": 3.214390277862549, "eval_runtime": 39.3331, "eval_samples_per_second": 99.306, "eval_steps_per_second": 6.229, "step": 975 } ], "logging_steps": 10, "max_steps": 975, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1018842955776000.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }