UltimoUno's picture
Uploaded checkpoint-5000
cd9ea39 verified
raw
history blame contribute delete
No virus
74.3 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.25,
"eval_steps": 1000,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 118.0,
"learning_rate": 2.2000000000000002e-08,
"loss": 8.296,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 120.0,
"learning_rate": 4.4000000000000004e-08,
"loss": 8.3152,
"step": 20
},
{
"epoch": 0.0,
"grad_norm": 120.0,
"learning_rate": 6.6e-08,
"loss": 8.3217,
"step": 30
},
{
"epoch": 0.0,
"grad_norm": 117.0,
"learning_rate": 8.800000000000001e-08,
"loss": 8.3192,
"step": 40
},
{
"epoch": 0.0,
"grad_norm": 121.5,
"learning_rate": 1.1e-07,
"loss": 8.3432,
"step": 50
},
{
"epoch": 0.0,
"grad_norm": 120.5,
"learning_rate": 1.32e-07,
"loss": 8.3085,
"step": 60
},
{
"epoch": 0.0,
"grad_norm": 120.0,
"learning_rate": 1.5400000000000003e-07,
"loss": 8.3136,
"step": 70
},
{
"epoch": 0.0,
"grad_norm": 116.5,
"learning_rate": 1.7600000000000001e-07,
"loss": 8.3053,
"step": 80
},
{
"epoch": 0.0,
"grad_norm": 119.0,
"learning_rate": 1.98e-07,
"loss": 8.3401,
"step": 90
},
{
"epoch": 0.01,
"grad_norm": 119.0,
"learning_rate": 2.2e-07,
"loss": 8.2908,
"step": 100
},
{
"epoch": 0.01,
"grad_norm": 119.5,
"learning_rate": 2.42e-07,
"loss": 8.3283,
"step": 110
},
{
"epoch": 0.01,
"grad_norm": 119.5,
"learning_rate": 2.64e-07,
"loss": 8.2985,
"step": 120
},
{
"epoch": 0.01,
"grad_norm": 117.0,
"learning_rate": 2.8600000000000005e-07,
"loss": 8.2882,
"step": 130
},
{
"epoch": 0.01,
"grad_norm": 115.5,
"learning_rate": 3.0800000000000006e-07,
"loss": 8.2644,
"step": 140
},
{
"epoch": 0.01,
"grad_norm": 114.5,
"learning_rate": 3.3e-07,
"loss": 8.2033,
"step": 150
},
{
"epoch": 0.01,
"grad_norm": 115.5,
"learning_rate": 3.5200000000000003e-07,
"loss": 8.1953,
"step": 160
},
{
"epoch": 0.01,
"grad_norm": 113.5,
"learning_rate": 3.7400000000000004e-07,
"loss": 8.1654,
"step": 170
},
{
"epoch": 0.01,
"grad_norm": 116.5,
"learning_rate": 3.96e-07,
"loss": 8.1758,
"step": 180
},
{
"epoch": 0.01,
"grad_norm": 116.5,
"learning_rate": 4.18e-07,
"loss": 8.1881,
"step": 190
},
{
"epoch": 0.01,
"grad_norm": 111.0,
"learning_rate": 4.4e-07,
"loss": 8.0965,
"step": 200
},
{
"epoch": 0.01,
"grad_norm": 112.0,
"learning_rate": 4.62e-07,
"loss": 8.1027,
"step": 210
},
{
"epoch": 0.01,
"grad_norm": 112.0,
"learning_rate": 4.84e-07,
"loss": 8.0656,
"step": 220
},
{
"epoch": 0.01,
"grad_norm": 113.0,
"learning_rate": 5.06e-07,
"loss": 8.0485,
"step": 230
},
{
"epoch": 0.01,
"grad_norm": 113.0,
"learning_rate": 5.28e-07,
"loss": 8.0212,
"step": 240
},
{
"epoch": 0.01,
"grad_norm": 110.0,
"learning_rate": 5.5e-07,
"loss": 7.9518,
"step": 250
},
{
"epoch": 0.01,
"grad_norm": 107.5,
"learning_rate": 5.720000000000001e-07,
"loss": 7.9086,
"step": 260
},
{
"epoch": 0.01,
"grad_norm": 106.0,
"learning_rate": 5.94e-07,
"loss": 7.8287,
"step": 270
},
{
"epoch": 0.01,
"grad_norm": 105.0,
"learning_rate": 6.160000000000001e-07,
"loss": 7.7678,
"step": 280
},
{
"epoch": 0.01,
"grad_norm": 108.5,
"learning_rate": 6.38e-07,
"loss": 7.7167,
"step": 290
},
{
"epoch": 0.01,
"grad_norm": 102.0,
"learning_rate": 6.6e-07,
"loss": 7.6403,
"step": 300
},
{
"epoch": 0.02,
"grad_norm": 100.0,
"learning_rate": 6.82e-07,
"loss": 7.5792,
"step": 310
},
{
"epoch": 0.02,
"grad_norm": 100.0,
"learning_rate": 7.040000000000001e-07,
"loss": 7.5038,
"step": 320
},
{
"epoch": 0.02,
"grad_norm": 103.5,
"learning_rate": 7.260000000000001e-07,
"loss": 7.4563,
"step": 330
},
{
"epoch": 0.02,
"grad_norm": 99.5,
"learning_rate": 7.480000000000001e-07,
"loss": 7.3818,
"step": 340
},
{
"epoch": 0.02,
"grad_norm": 104.5,
"learning_rate": 7.7e-07,
"loss": 7.3034,
"step": 350
},
{
"epoch": 0.02,
"grad_norm": 103.0,
"learning_rate": 7.92e-07,
"loss": 7.2297,
"step": 360
},
{
"epoch": 0.02,
"grad_norm": 99.5,
"learning_rate": 8.140000000000001e-07,
"loss": 7.1394,
"step": 370
},
{
"epoch": 0.02,
"grad_norm": 103.0,
"learning_rate": 8.36e-07,
"loss": 7.0762,
"step": 380
},
{
"epoch": 0.02,
"grad_norm": 102.5,
"learning_rate": 8.580000000000001e-07,
"loss": 7.0122,
"step": 390
},
{
"epoch": 0.02,
"grad_norm": 104.5,
"learning_rate": 8.8e-07,
"loss": 6.8935,
"step": 400
},
{
"epoch": 0.02,
"grad_norm": 99.0,
"learning_rate": 9.02e-07,
"loss": 6.7687,
"step": 410
},
{
"epoch": 0.02,
"grad_norm": 94.0,
"learning_rate": 9.24e-07,
"loss": 6.6825,
"step": 420
},
{
"epoch": 0.02,
"grad_norm": 93.5,
"learning_rate": 9.46e-07,
"loss": 6.568,
"step": 430
},
{
"epoch": 0.02,
"grad_norm": 89.5,
"learning_rate": 9.68e-07,
"loss": 6.4486,
"step": 440
},
{
"epoch": 0.02,
"grad_norm": 88.0,
"learning_rate": 9.9e-07,
"loss": 6.3358,
"step": 450
},
{
"epoch": 0.02,
"grad_norm": 85.5,
"learning_rate": 1.012e-06,
"loss": 6.196,
"step": 460
},
{
"epoch": 0.02,
"grad_norm": 84.0,
"learning_rate": 1.034e-06,
"loss": 6.0811,
"step": 470
},
{
"epoch": 0.02,
"grad_norm": 82.0,
"learning_rate": 1.056e-06,
"loss": 5.9429,
"step": 480
},
{
"epoch": 0.02,
"grad_norm": 82.0,
"learning_rate": 1.078e-06,
"loss": 5.8245,
"step": 490
},
{
"epoch": 0.03,
"grad_norm": 81.0,
"learning_rate": 1.1e-06,
"loss": 5.6819,
"step": 500
},
{
"epoch": 0.03,
"grad_norm": 81.5,
"learning_rate": 1.0975555555555557e-06,
"loss": 5.539,
"step": 510
},
{
"epoch": 0.03,
"grad_norm": 81.5,
"learning_rate": 1.095111111111111e-06,
"loss": 5.4188,
"step": 520
},
{
"epoch": 0.03,
"grad_norm": 81.5,
"learning_rate": 1.0926666666666667e-06,
"loss": 5.2885,
"step": 530
},
{
"epoch": 0.03,
"grad_norm": 82.0,
"learning_rate": 1.0902222222222224e-06,
"loss": 5.1496,
"step": 540
},
{
"epoch": 0.03,
"grad_norm": 81.5,
"learning_rate": 1.0877777777777778e-06,
"loss": 5.0336,
"step": 550
},
{
"epoch": 0.03,
"grad_norm": 82.0,
"learning_rate": 1.0853333333333334e-06,
"loss": 4.9006,
"step": 560
},
{
"epoch": 0.03,
"grad_norm": 81.5,
"learning_rate": 1.082888888888889e-06,
"loss": 4.7757,
"step": 570
},
{
"epoch": 0.03,
"grad_norm": 82.0,
"learning_rate": 1.0804444444444445e-06,
"loss": 4.6461,
"step": 580
},
{
"epoch": 0.03,
"grad_norm": 80.5,
"learning_rate": 1.078e-06,
"loss": 4.5267,
"step": 590
},
{
"epoch": 0.03,
"grad_norm": 82.0,
"learning_rate": 1.0755555555555555e-06,
"loss": 4.4198,
"step": 600
},
{
"epoch": 0.03,
"grad_norm": 83.0,
"learning_rate": 1.0731111111111111e-06,
"loss": 4.2895,
"step": 610
},
{
"epoch": 0.03,
"grad_norm": 83.5,
"learning_rate": 1.0706666666666668e-06,
"loss": 4.2099,
"step": 620
},
{
"epoch": 0.03,
"grad_norm": 84.0,
"learning_rate": 1.0682222222222224e-06,
"loss": 4.1181,
"step": 630
},
{
"epoch": 0.03,
"grad_norm": 82.0,
"learning_rate": 1.0657777777777778e-06,
"loss": 4.0069,
"step": 640
},
{
"epoch": 0.03,
"grad_norm": 82.0,
"learning_rate": 1.0633333333333335e-06,
"loss": 3.9045,
"step": 650
},
{
"epoch": 0.03,
"grad_norm": 82.0,
"learning_rate": 1.0608888888888889e-06,
"loss": 3.7913,
"step": 660
},
{
"epoch": 0.03,
"grad_norm": 82.5,
"learning_rate": 1.0584444444444445e-06,
"loss": 3.7172,
"step": 670
},
{
"epoch": 0.03,
"grad_norm": 82.5,
"learning_rate": 1.056e-06,
"loss": 3.6226,
"step": 680
},
{
"epoch": 0.03,
"grad_norm": 82.0,
"learning_rate": 1.0535555555555556e-06,
"loss": 3.5128,
"step": 690
},
{
"epoch": 0.04,
"grad_norm": 81.5,
"learning_rate": 1.0511111111111112e-06,
"loss": 3.4335,
"step": 700
},
{
"epoch": 0.04,
"grad_norm": 81.0,
"learning_rate": 1.0486666666666668e-06,
"loss": 3.3716,
"step": 710
},
{
"epoch": 0.04,
"grad_norm": 79.5,
"learning_rate": 1.0462222222222222e-06,
"loss": 3.2632,
"step": 720
},
{
"epoch": 0.04,
"grad_norm": 80.5,
"learning_rate": 1.0437777777777779e-06,
"loss": 3.2192,
"step": 730
},
{
"epoch": 0.04,
"grad_norm": 80.5,
"learning_rate": 1.0413333333333333e-06,
"loss": 3.1043,
"step": 740
},
{
"epoch": 0.04,
"grad_norm": 81.0,
"learning_rate": 1.038888888888889e-06,
"loss": 3.0207,
"step": 750
},
{
"epoch": 0.04,
"grad_norm": 80.0,
"learning_rate": 1.0364444444444445e-06,
"loss": 2.9501,
"step": 760
},
{
"epoch": 0.04,
"grad_norm": 79.5,
"learning_rate": 1.034e-06,
"loss": 2.9028,
"step": 770
},
{
"epoch": 0.04,
"grad_norm": 78.5,
"learning_rate": 1.0315555555555556e-06,
"loss": 2.8108,
"step": 780
},
{
"epoch": 0.04,
"grad_norm": 78.5,
"learning_rate": 1.0291111111111112e-06,
"loss": 2.7679,
"step": 790
},
{
"epoch": 0.04,
"grad_norm": 77.0,
"learning_rate": 1.0266666666666666e-06,
"loss": 2.6742,
"step": 800
},
{
"epoch": 0.04,
"grad_norm": 77.5,
"learning_rate": 1.0242222222222223e-06,
"loss": 2.6345,
"step": 810
},
{
"epoch": 0.04,
"grad_norm": 77.0,
"learning_rate": 1.021777777777778e-06,
"loss": 2.5581,
"step": 820
},
{
"epoch": 0.04,
"grad_norm": 77.0,
"learning_rate": 1.0193333333333333e-06,
"loss": 2.5076,
"step": 830
},
{
"epoch": 0.04,
"grad_norm": 77.0,
"learning_rate": 1.016888888888889e-06,
"loss": 2.4608,
"step": 840
},
{
"epoch": 0.04,
"grad_norm": 77.0,
"learning_rate": 1.0144444444444446e-06,
"loss": 2.3898,
"step": 850
},
{
"epoch": 0.04,
"grad_norm": 77.0,
"learning_rate": 1.012e-06,
"loss": 2.3806,
"step": 860
},
{
"epoch": 0.04,
"grad_norm": 76.5,
"learning_rate": 1.0095555555555556e-06,
"loss": 2.2935,
"step": 870
},
{
"epoch": 0.04,
"grad_norm": 75.5,
"learning_rate": 1.0071111111111113e-06,
"loss": 2.2645,
"step": 880
},
{
"epoch": 0.04,
"grad_norm": 75.0,
"learning_rate": 1.0046666666666667e-06,
"loss": 2.1927,
"step": 890
},
{
"epoch": 0.04,
"grad_norm": 75.5,
"learning_rate": 1.0022222222222223e-06,
"loss": 2.1659,
"step": 900
},
{
"epoch": 0.05,
"grad_norm": 74.0,
"learning_rate": 9.997777777777777e-07,
"loss": 2.1114,
"step": 910
},
{
"epoch": 0.05,
"grad_norm": 76.5,
"learning_rate": 9.973333333333334e-07,
"loss": 2.0784,
"step": 920
},
{
"epoch": 0.05,
"grad_norm": 74.0,
"learning_rate": 9.94888888888889e-07,
"loss": 2.0061,
"step": 930
},
{
"epoch": 0.05,
"grad_norm": 73.5,
"learning_rate": 9.924444444444446e-07,
"loss": 1.9691,
"step": 940
},
{
"epoch": 0.05,
"grad_norm": 73.0,
"learning_rate": 9.9e-07,
"loss": 1.8852,
"step": 950
},
{
"epoch": 0.05,
"grad_norm": 74.0,
"learning_rate": 9.875555555555557e-07,
"loss": 1.8666,
"step": 960
},
{
"epoch": 0.05,
"grad_norm": 72.0,
"learning_rate": 9.85111111111111e-07,
"loss": 1.8188,
"step": 970
},
{
"epoch": 0.05,
"grad_norm": 71.5,
"learning_rate": 9.826666666666667e-07,
"loss": 1.7199,
"step": 980
},
{
"epoch": 0.05,
"grad_norm": 73.0,
"learning_rate": 9.802222222222221e-07,
"loss": 1.7025,
"step": 990
},
{
"epoch": 0.05,
"grad_norm": 71.0,
"learning_rate": 9.777777777777778e-07,
"loss": 1.6182,
"step": 1000
},
{
"epoch": 0.05,
"eval_loss": 1.607034683227539,
"eval_runtime": 64.6423,
"eval_samples_per_second": 15.47,
"eval_steps_per_second": 15.47,
"step": 1000
},
{
"epoch": 0.05,
"grad_norm": 71.5,
"learning_rate": 9.753333333333334e-07,
"loss": 1.5834,
"step": 1010
},
{
"epoch": 0.05,
"grad_norm": 71.0,
"learning_rate": 9.72888888888889e-07,
"loss": 1.4948,
"step": 1020
},
{
"epoch": 0.05,
"grad_norm": 71.5,
"learning_rate": 9.704444444444445e-07,
"loss": 1.4671,
"step": 1030
},
{
"epoch": 0.05,
"grad_norm": 70.0,
"learning_rate": 9.68e-07,
"loss": 1.341,
"step": 1040
},
{
"epoch": 0.05,
"grad_norm": 69.0,
"learning_rate": 9.655555555555555e-07,
"loss": 1.3023,
"step": 1050
},
{
"epoch": 0.05,
"grad_norm": 69.5,
"learning_rate": 9.631111111111111e-07,
"loss": 1.222,
"step": 1060
},
{
"epoch": 0.05,
"grad_norm": 64.5,
"learning_rate": 9.606666666666668e-07,
"loss": 1.1278,
"step": 1070
},
{
"epoch": 0.05,
"grad_norm": 62.5,
"learning_rate": 9.582222222222222e-07,
"loss": 1.0297,
"step": 1080
},
{
"epoch": 0.05,
"grad_norm": 60.5,
"learning_rate": 9.557777777777778e-07,
"loss": 0.9804,
"step": 1090
},
{
"epoch": 0.06,
"grad_norm": 56.0,
"learning_rate": 9.533333333333335e-07,
"loss": 0.9246,
"step": 1100
},
{
"epoch": 0.06,
"grad_norm": 54.5,
"learning_rate": 9.50888888888889e-07,
"loss": 0.8298,
"step": 1110
},
{
"epoch": 0.06,
"grad_norm": 50.0,
"learning_rate": 9.484444444444445e-07,
"loss": 0.7654,
"step": 1120
},
{
"epoch": 0.06,
"grad_norm": 54.0,
"learning_rate": 9.46e-07,
"loss": 0.7176,
"step": 1130
},
{
"epoch": 0.06,
"grad_norm": 47.25,
"learning_rate": 9.435555555555556e-07,
"loss": 0.6529,
"step": 1140
},
{
"epoch": 0.06,
"grad_norm": 45.75,
"learning_rate": 9.411111111111111e-07,
"loss": 0.6445,
"step": 1150
},
{
"epoch": 0.06,
"grad_norm": 47.75,
"learning_rate": 9.386666666666668e-07,
"loss": 0.6063,
"step": 1160
},
{
"epoch": 0.06,
"grad_norm": 43.25,
"learning_rate": 9.362222222222223e-07,
"loss": 0.5723,
"step": 1170
},
{
"epoch": 0.06,
"grad_norm": 41.0,
"learning_rate": 9.337777777777779e-07,
"loss": 0.5213,
"step": 1180
},
{
"epoch": 0.06,
"grad_norm": 35.75,
"learning_rate": 9.313333333333334e-07,
"loss": 0.5067,
"step": 1190
},
{
"epoch": 0.06,
"grad_norm": 35.0,
"learning_rate": 9.288888888888889e-07,
"loss": 0.4591,
"step": 1200
},
{
"epoch": 0.06,
"grad_norm": 38.5,
"learning_rate": 9.264444444444444e-07,
"loss": 0.4633,
"step": 1210
},
{
"epoch": 0.06,
"grad_norm": 35.0,
"learning_rate": 9.24e-07,
"loss": 0.4209,
"step": 1220
},
{
"epoch": 0.06,
"grad_norm": 33.0,
"learning_rate": 9.215555555555556e-07,
"loss": 0.4244,
"step": 1230
},
{
"epoch": 0.06,
"grad_norm": 29.5,
"learning_rate": 9.191111111111112e-07,
"loss": 0.3825,
"step": 1240
},
{
"epoch": 0.06,
"grad_norm": 28.75,
"learning_rate": 9.166666666666667e-07,
"loss": 0.3652,
"step": 1250
},
{
"epoch": 0.06,
"grad_norm": 26.5,
"learning_rate": 9.142222222222223e-07,
"loss": 0.3491,
"step": 1260
},
{
"epoch": 0.06,
"grad_norm": 30.5,
"learning_rate": 9.117777777777778e-07,
"loss": 0.3503,
"step": 1270
},
{
"epoch": 0.06,
"grad_norm": 26.0,
"learning_rate": 9.093333333333333e-07,
"loss": 0.3352,
"step": 1280
},
{
"epoch": 0.06,
"grad_norm": 24.0,
"learning_rate": 9.06888888888889e-07,
"loss": 0.2929,
"step": 1290
},
{
"epoch": 0.07,
"grad_norm": 21.0,
"learning_rate": 9.044444444444445e-07,
"loss": 0.2818,
"step": 1300
},
{
"epoch": 0.07,
"grad_norm": 26.25,
"learning_rate": 9.02e-07,
"loss": 0.2846,
"step": 1310
},
{
"epoch": 0.07,
"grad_norm": 22.625,
"learning_rate": 8.995555555555556e-07,
"loss": 0.2931,
"step": 1320
},
{
"epoch": 0.07,
"grad_norm": 23.5,
"learning_rate": 8.971111111111112e-07,
"loss": 0.2636,
"step": 1330
},
{
"epoch": 0.07,
"grad_norm": 21.75,
"learning_rate": 8.946666666666667e-07,
"loss": 0.2601,
"step": 1340
},
{
"epoch": 0.07,
"grad_norm": 21.875,
"learning_rate": 8.922222222222223e-07,
"loss": 0.2422,
"step": 1350
},
{
"epoch": 0.07,
"grad_norm": 19.875,
"learning_rate": 8.897777777777778e-07,
"loss": 0.2224,
"step": 1360
},
{
"epoch": 0.07,
"grad_norm": 19.375,
"learning_rate": 8.873333333333334e-07,
"loss": 0.2403,
"step": 1370
},
{
"epoch": 0.07,
"grad_norm": 18.25,
"learning_rate": 8.848888888888889e-07,
"loss": 0.2175,
"step": 1380
},
{
"epoch": 0.07,
"grad_norm": 17.375,
"learning_rate": 8.824444444444444e-07,
"loss": 0.2203,
"step": 1390
},
{
"epoch": 0.07,
"grad_norm": 18.625,
"learning_rate": 8.8e-07,
"loss": 0.1981,
"step": 1400
},
{
"epoch": 0.07,
"grad_norm": 17.25,
"learning_rate": 8.775555555555557e-07,
"loss": 0.1919,
"step": 1410
},
{
"epoch": 0.07,
"grad_norm": 17.625,
"learning_rate": 8.751111111111112e-07,
"loss": 0.1952,
"step": 1420
},
{
"epoch": 0.07,
"grad_norm": 16.125,
"learning_rate": 8.726666666666667e-07,
"loss": 0.1902,
"step": 1430
},
{
"epoch": 0.07,
"grad_norm": 15.125,
"learning_rate": 8.702222222222223e-07,
"loss": 0.1776,
"step": 1440
},
{
"epoch": 0.07,
"grad_norm": 15.4375,
"learning_rate": 8.677777777777778e-07,
"loss": 0.1818,
"step": 1450
},
{
"epoch": 0.07,
"grad_norm": 18.0,
"learning_rate": 8.653333333333333e-07,
"loss": 0.1595,
"step": 1460
},
{
"epoch": 0.07,
"grad_norm": 15.6875,
"learning_rate": 8.62888888888889e-07,
"loss": 0.1515,
"step": 1470
},
{
"epoch": 0.07,
"grad_norm": 18.25,
"learning_rate": 8.604444444444446e-07,
"loss": 0.1645,
"step": 1480
},
{
"epoch": 0.07,
"grad_norm": 16.0,
"learning_rate": 8.580000000000001e-07,
"loss": 0.1548,
"step": 1490
},
{
"epoch": 0.07,
"grad_norm": 12.3125,
"learning_rate": 8.555555555555556e-07,
"loss": 0.1445,
"step": 1500
},
{
"epoch": 0.08,
"grad_norm": 15.4375,
"learning_rate": 8.531111111111111e-07,
"loss": 0.1442,
"step": 1510
},
{
"epoch": 0.08,
"grad_norm": 11.6875,
"learning_rate": 8.506666666666667e-07,
"loss": 0.1349,
"step": 1520
},
{
"epoch": 0.08,
"grad_norm": 12.5625,
"learning_rate": 8.482222222222222e-07,
"loss": 0.1328,
"step": 1530
},
{
"epoch": 0.08,
"grad_norm": 15.4375,
"learning_rate": 8.457777777777778e-07,
"loss": 0.1329,
"step": 1540
},
{
"epoch": 0.08,
"grad_norm": 11.375,
"learning_rate": 8.433333333333334e-07,
"loss": 0.1273,
"step": 1550
},
{
"epoch": 0.08,
"grad_norm": 11.6875,
"learning_rate": 8.40888888888889e-07,
"loss": 0.1383,
"step": 1560
},
{
"epoch": 0.08,
"grad_norm": 13.875,
"learning_rate": 8.384444444444445e-07,
"loss": 0.1265,
"step": 1570
},
{
"epoch": 0.08,
"grad_norm": 10.875,
"learning_rate": 8.36e-07,
"loss": 0.1264,
"step": 1580
},
{
"epoch": 0.08,
"grad_norm": 11.25,
"learning_rate": 8.335555555555555e-07,
"loss": 0.1127,
"step": 1590
},
{
"epoch": 0.08,
"grad_norm": 10.6875,
"learning_rate": 8.311111111111112e-07,
"loss": 0.112,
"step": 1600
},
{
"epoch": 0.08,
"grad_norm": 9.5,
"learning_rate": 8.286666666666667e-07,
"loss": 0.1162,
"step": 1610
},
{
"epoch": 0.08,
"grad_norm": 11.0625,
"learning_rate": 8.262222222222222e-07,
"loss": 0.0977,
"step": 1620
},
{
"epoch": 0.08,
"grad_norm": 12.375,
"learning_rate": 8.237777777777779e-07,
"loss": 0.1047,
"step": 1630
},
{
"epoch": 0.08,
"grad_norm": 12.375,
"learning_rate": 8.213333333333334e-07,
"loss": 0.1005,
"step": 1640
},
{
"epoch": 0.08,
"grad_norm": 9.5625,
"learning_rate": 8.188888888888889e-07,
"loss": 0.1039,
"step": 1650
},
{
"epoch": 0.08,
"grad_norm": 8.625,
"learning_rate": 8.164444444444445e-07,
"loss": 0.0944,
"step": 1660
},
{
"epoch": 0.08,
"grad_norm": 10.1875,
"learning_rate": 8.140000000000001e-07,
"loss": 0.0916,
"step": 1670
},
{
"epoch": 0.08,
"grad_norm": 7.84375,
"learning_rate": 8.115555555555556e-07,
"loss": 0.087,
"step": 1680
},
{
"epoch": 0.08,
"grad_norm": 9.1875,
"learning_rate": 8.091111111111111e-07,
"loss": 0.0923,
"step": 1690
},
{
"epoch": 0.09,
"grad_norm": 10.0,
"learning_rate": 8.066666666666666e-07,
"loss": 0.0992,
"step": 1700
},
{
"epoch": 0.09,
"grad_norm": 9.0,
"learning_rate": 8.042222222222223e-07,
"loss": 0.0859,
"step": 1710
},
{
"epoch": 0.09,
"grad_norm": 10.375,
"learning_rate": 8.017777777777779e-07,
"loss": 0.0862,
"step": 1720
},
{
"epoch": 0.09,
"grad_norm": 8.125,
"learning_rate": 7.993333333333334e-07,
"loss": 0.0773,
"step": 1730
},
{
"epoch": 0.09,
"grad_norm": 7.375,
"learning_rate": 7.96888888888889e-07,
"loss": 0.077,
"step": 1740
},
{
"epoch": 0.09,
"grad_norm": 7.3125,
"learning_rate": 7.944444444444445e-07,
"loss": 0.0738,
"step": 1750
},
{
"epoch": 0.09,
"grad_norm": 7.375,
"learning_rate": 7.92e-07,
"loss": 0.0703,
"step": 1760
},
{
"epoch": 0.09,
"grad_norm": 8.125,
"learning_rate": 7.895555555555555e-07,
"loss": 0.0668,
"step": 1770
},
{
"epoch": 0.09,
"grad_norm": 8.8125,
"learning_rate": 7.87111111111111e-07,
"loss": 0.0806,
"step": 1780
},
{
"epoch": 0.09,
"grad_norm": 8.6875,
"learning_rate": 7.846666666666668e-07,
"loss": 0.0694,
"step": 1790
},
{
"epoch": 0.09,
"grad_norm": 8.8125,
"learning_rate": 7.822222222222223e-07,
"loss": 0.0708,
"step": 1800
},
{
"epoch": 0.09,
"grad_norm": 6.8125,
"learning_rate": 7.797777777777778e-07,
"loss": 0.0711,
"step": 1810
},
{
"epoch": 0.09,
"grad_norm": 8.0625,
"learning_rate": 7.773333333333334e-07,
"loss": 0.0769,
"step": 1820
},
{
"epoch": 0.09,
"grad_norm": 11.0625,
"learning_rate": 7.748888888888889e-07,
"loss": 0.0689,
"step": 1830
},
{
"epoch": 0.09,
"grad_norm": 8.25,
"learning_rate": 7.724444444444444e-07,
"loss": 0.0651,
"step": 1840
},
{
"epoch": 0.09,
"grad_norm": 7.9375,
"learning_rate": 7.7e-07,
"loss": 0.0731,
"step": 1850
},
{
"epoch": 0.09,
"grad_norm": 6.65625,
"learning_rate": 7.675555555555557e-07,
"loss": 0.07,
"step": 1860
},
{
"epoch": 0.09,
"grad_norm": 6.78125,
"learning_rate": 7.651111111111112e-07,
"loss": 0.0634,
"step": 1870
},
{
"epoch": 0.09,
"grad_norm": 8.25,
"learning_rate": 7.626666666666667e-07,
"loss": 0.069,
"step": 1880
},
{
"epoch": 0.09,
"grad_norm": 7.34375,
"learning_rate": 7.602222222222222e-07,
"loss": 0.0701,
"step": 1890
},
{
"epoch": 0.1,
"grad_norm": 7.59375,
"learning_rate": 7.577777777777778e-07,
"loss": 0.0636,
"step": 1900
},
{
"epoch": 0.1,
"grad_norm": 7.46875,
"learning_rate": 7.553333333333334e-07,
"loss": 0.059,
"step": 1910
},
{
"epoch": 0.1,
"grad_norm": 8.625,
"learning_rate": 7.528888888888889e-07,
"loss": 0.063,
"step": 1920
},
{
"epoch": 0.1,
"grad_norm": 7.8125,
"learning_rate": 7.504444444444444e-07,
"loss": 0.0612,
"step": 1930
},
{
"epoch": 0.1,
"grad_norm": 8.125,
"learning_rate": 7.480000000000001e-07,
"loss": 0.0615,
"step": 1940
},
{
"epoch": 0.1,
"grad_norm": 10.0625,
"learning_rate": 7.455555555555556e-07,
"loss": 0.059,
"step": 1950
},
{
"epoch": 0.1,
"grad_norm": 6.09375,
"learning_rate": 7.431111111111111e-07,
"loss": 0.0644,
"step": 1960
},
{
"epoch": 0.1,
"grad_norm": 7.0625,
"learning_rate": 7.406666666666668e-07,
"loss": 0.0658,
"step": 1970
},
{
"epoch": 0.1,
"grad_norm": 5.875,
"learning_rate": 7.382222222222223e-07,
"loss": 0.059,
"step": 1980
},
{
"epoch": 0.1,
"grad_norm": 6.84375,
"learning_rate": 7.357777777777778e-07,
"loss": 0.0617,
"step": 1990
},
{
"epoch": 0.1,
"grad_norm": 6.3125,
"learning_rate": 7.333333333333333e-07,
"loss": 0.0613,
"step": 2000
},
{
"epoch": 0.1,
"eval_loss": 0.06299971044063568,
"eval_runtime": 64.6734,
"eval_samples_per_second": 15.462,
"eval_steps_per_second": 15.462,
"step": 2000
},
{
"epoch": 0.1,
"grad_norm": 8.0,
"learning_rate": 7.308888888888889e-07,
"loss": 0.0596,
"step": 2010
},
{
"epoch": 0.1,
"grad_norm": 5.84375,
"learning_rate": 7.284444444444445e-07,
"loss": 0.0587,
"step": 2020
},
{
"epoch": 0.1,
"grad_norm": 5.78125,
"learning_rate": 7.260000000000001e-07,
"loss": 0.0605,
"step": 2030
},
{
"epoch": 0.1,
"grad_norm": 6.65625,
"learning_rate": 7.235555555555556e-07,
"loss": 0.0557,
"step": 2040
},
{
"epoch": 0.1,
"grad_norm": 5.65625,
"learning_rate": 7.211111111111112e-07,
"loss": 0.0572,
"step": 2050
},
{
"epoch": 0.1,
"grad_norm": 5.84375,
"learning_rate": 7.186666666666667e-07,
"loss": 0.053,
"step": 2060
},
{
"epoch": 0.1,
"grad_norm": 5.78125,
"learning_rate": 7.162222222222222e-07,
"loss": 0.0582,
"step": 2070
},
{
"epoch": 0.1,
"grad_norm": 6.90625,
"learning_rate": 7.137777777777777e-07,
"loss": 0.0644,
"step": 2080
},
{
"epoch": 0.1,
"grad_norm": 7.96875,
"learning_rate": 7.113333333333333e-07,
"loss": 0.0588,
"step": 2090
},
{
"epoch": 0.1,
"grad_norm": 5.15625,
"learning_rate": 7.08888888888889e-07,
"loss": 0.0535,
"step": 2100
},
{
"epoch": 0.11,
"grad_norm": 7.46875,
"learning_rate": 7.064444444444445e-07,
"loss": 0.0573,
"step": 2110
},
{
"epoch": 0.11,
"grad_norm": 6.28125,
"learning_rate": 7.040000000000001e-07,
"loss": 0.0584,
"step": 2120
},
{
"epoch": 0.11,
"grad_norm": 5.75,
"learning_rate": 7.015555555555556e-07,
"loss": 0.0532,
"step": 2130
},
{
"epoch": 0.11,
"grad_norm": 7.15625,
"learning_rate": 6.991111111111111e-07,
"loss": 0.0526,
"step": 2140
},
{
"epoch": 0.11,
"grad_norm": 7.40625,
"learning_rate": 6.966666666666666e-07,
"loss": 0.0621,
"step": 2150
},
{
"epoch": 0.11,
"grad_norm": 9.125,
"learning_rate": 6.942222222222222e-07,
"loss": 0.055,
"step": 2160
},
{
"epoch": 0.11,
"grad_norm": 8.625,
"learning_rate": 6.917777777777779e-07,
"loss": 0.0614,
"step": 2170
},
{
"epoch": 0.11,
"grad_norm": 6.375,
"learning_rate": 6.893333333333334e-07,
"loss": 0.0527,
"step": 2180
},
{
"epoch": 0.11,
"grad_norm": 5.21875,
"learning_rate": 6.868888888888889e-07,
"loss": 0.069,
"step": 2190
},
{
"epoch": 0.11,
"grad_norm": 5.65625,
"learning_rate": 6.844444444444445e-07,
"loss": 0.0524,
"step": 2200
},
{
"epoch": 0.11,
"grad_norm": 5.84375,
"learning_rate": 6.82e-07,
"loss": 0.0596,
"step": 2210
},
{
"epoch": 0.11,
"grad_norm": 5.15625,
"learning_rate": 6.795555555555555e-07,
"loss": 0.0587,
"step": 2220
},
{
"epoch": 0.11,
"grad_norm": 5.375,
"learning_rate": 6.771111111111111e-07,
"loss": 0.0563,
"step": 2230
},
{
"epoch": 0.11,
"grad_norm": 5.71875,
"learning_rate": 6.746666666666667e-07,
"loss": 0.0553,
"step": 2240
},
{
"epoch": 0.11,
"grad_norm": 5.5625,
"learning_rate": 6.722222222222223e-07,
"loss": 0.0557,
"step": 2250
},
{
"epoch": 0.11,
"grad_norm": 6.8125,
"learning_rate": 6.697777777777778e-07,
"loss": 0.0557,
"step": 2260
},
{
"epoch": 0.11,
"grad_norm": 5.8125,
"learning_rate": 6.673333333333334e-07,
"loss": 0.0551,
"step": 2270
},
{
"epoch": 0.11,
"grad_norm": 5.78125,
"learning_rate": 6.648888888888889e-07,
"loss": 0.0535,
"step": 2280
},
{
"epoch": 0.11,
"grad_norm": 5.375,
"learning_rate": 6.624444444444445e-07,
"loss": 0.0501,
"step": 2290
},
{
"epoch": 0.12,
"grad_norm": 6.40625,
"learning_rate": 6.6e-07,
"loss": 0.0586,
"step": 2300
},
{
"epoch": 0.12,
"grad_norm": 7.3125,
"learning_rate": 6.575555555555556e-07,
"loss": 0.0548,
"step": 2310
},
{
"epoch": 0.12,
"grad_norm": 8.5,
"learning_rate": 6.551111111111111e-07,
"loss": 0.0566,
"step": 2320
},
{
"epoch": 0.12,
"grad_norm": 5.5,
"learning_rate": 6.526666666666667e-07,
"loss": 0.0656,
"step": 2330
},
{
"epoch": 0.12,
"grad_norm": 5.625,
"learning_rate": 6.502222222222222e-07,
"loss": 0.052,
"step": 2340
},
{
"epoch": 0.12,
"grad_norm": 6.1875,
"learning_rate": 6.477777777777779e-07,
"loss": 0.0574,
"step": 2350
},
{
"epoch": 0.12,
"grad_norm": 7.53125,
"learning_rate": 6.453333333333334e-07,
"loss": 0.0564,
"step": 2360
},
{
"epoch": 0.12,
"grad_norm": 5.0,
"learning_rate": 6.428888888888889e-07,
"loss": 0.0554,
"step": 2370
},
{
"epoch": 0.12,
"grad_norm": 6.1875,
"learning_rate": 6.404444444444444e-07,
"loss": 0.0513,
"step": 2380
},
{
"epoch": 0.12,
"grad_norm": 7.03125,
"learning_rate": 6.38e-07,
"loss": 0.0525,
"step": 2390
},
{
"epoch": 0.12,
"grad_norm": 5.25,
"learning_rate": 6.355555555555555e-07,
"loss": 0.0522,
"step": 2400
},
{
"epoch": 0.12,
"grad_norm": 6.125,
"learning_rate": 6.331111111111112e-07,
"loss": 0.06,
"step": 2410
},
{
"epoch": 0.12,
"grad_norm": 5.0625,
"learning_rate": 6.306666666666668e-07,
"loss": 0.0542,
"step": 2420
},
{
"epoch": 0.12,
"grad_norm": 5.71875,
"learning_rate": 6.282222222222223e-07,
"loss": 0.0614,
"step": 2430
},
{
"epoch": 0.12,
"grad_norm": 6.8125,
"learning_rate": 6.257777777777778e-07,
"loss": 0.055,
"step": 2440
},
{
"epoch": 0.12,
"grad_norm": 4.78125,
"learning_rate": 6.233333333333333e-07,
"loss": 0.0533,
"step": 2450
},
{
"epoch": 0.12,
"grad_norm": 5.6875,
"learning_rate": 6.208888888888889e-07,
"loss": 0.0595,
"step": 2460
},
{
"epoch": 0.12,
"grad_norm": 7.875,
"learning_rate": 6.184444444444444e-07,
"loss": 0.0589,
"step": 2470
},
{
"epoch": 0.12,
"grad_norm": 6.71875,
"learning_rate": 6.160000000000001e-07,
"loss": 0.0513,
"step": 2480
},
{
"epoch": 0.12,
"grad_norm": 7.125,
"learning_rate": 6.135555555555556e-07,
"loss": 0.0566,
"step": 2490
},
{
"epoch": 0.12,
"grad_norm": 5.875,
"learning_rate": 6.111111111111112e-07,
"loss": 0.0549,
"step": 2500
},
{
"epoch": 0.13,
"grad_norm": 7.125,
"learning_rate": 6.086666666666667e-07,
"loss": 0.0583,
"step": 2510
},
{
"epoch": 0.13,
"grad_norm": 5.875,
"learning_rate": 6.062222222222222e-07,
"loss": 0.0632,
"step": 2520
},
{
"epoch": 0.13,
"grad_norm": 4.875,
"learning_rate": 6.037777777777777e-07,
"loss": 0.058,
"step": 2530
},
{
"epoch": 0.13,
"grad_norm": 5.03125,
"learning_rate": 6.013333333333334e-07,
"loss": 0.0566,
"step": 2540
},
{
"epoch": 0.13,
"grad_norm": 5.8125,
"learning_rate": 5.988888888888889e-07,
"loss": 0.0588,
"step": 2550
},
{
"epoch": 0.13,
"grad_norm": 6.21875,
"learning_rate": 5.964444444444445e-07,
"loss": 0.059,
"step": 2560
},
{
"epoch": 0.13,
"grad_norm": 5.09375,
"learning_rate": 5.94e-07,
"loss": 0.0592,
"step": 2570
},
{
"epoch": 0.13,
"grad_norm": 7.59375,
"learning_rate": 5.915555555555556e-07,
"loss": 0.0599,
"step": 2580
},
{
"epoch": 0.13,
"grad_norm": 5.8125,
"learning_rate": 5.891111111111111e-07,
"loss": 0.0518,
"step": 2590
},
{
"epoch": 0.13,
"grad_norm": 8.0,
"learning_rate": 5.866666666666667e-07,
"loss": 0.0578,
"step": 2600
},
{
"epoch": 0.13,
"grad_norm": 7.125,
"learning_rate": 5.842222222222223e-07,
"loss": 0.0509,
"step": 2610
},
{
"epoch": 0.13,
"grad_norm": 4.96875,
"learning_rate": 5.817777777777778e-07,
"loss": 0.0536,
"step": 2620
},
{
"epoch": 0.13,
"grad_norm": 7.25,
"learning_rate": 5.793333333333333e-07,
"loss": 0.052,
"step": 2630
},
{
"epoch": 0.13,
"grad_norm": 8.8125,
"learning_rate": 5.768888888888889e-07,
"loss": 0.0555,
"step": 2640
},
{
"epoch": 0.13,
"grad_norm": 10.375,
"learning_rate": 5.744444444444445e-07,
"loss": 0.0574,
"step": 2650
},
{
"epoch": 0.13,
"grad_norm": 7.625,
"learning_rate": 5.720000000000001e-07,
"loss": 0.0569,
"step": 2660
},
{
"epoch": 0.13,
"grad_norm": 7.1875,
"learning_rate": 5.695555555555556e-07,
"loss": 0.0565,
"step": 2670
},
{
"epoch": 0.13,
"grad_norm": 6.71875,
"learning_rate": 5.671111111111111e-07,
"loss": 0.0518,
"step": 2680
},
{
"epoch": 0.13,
"grad_norm": 6.71875,
"learning_rate": 5.646666666666667e-07,
"loss": 0.0614,
"step": 2690
},
{
"epoch": 0.14,
"grad_norm": 5.59375,
"learning_rate": 5.622222222222222e-07,
"loss": 0.054,
"step": 2700
},
{
"epoch": 0.14,
"grad_norm": 8.0625,
"learning_rate": 5.597777777777777e-07,
"loss": 0.0573,
"step": 2710
},
{
"epoch": 0.14,
"grad_norm": 8.25,
"learning_rate": 5.573333333333335e-07,
"loss": 0.0602,
"step": 2720
},
{
"epoch": 0.14,
"grad_norm": 8.625,
"learning_rate": 5.54888888888889e-07,
"loss": 0.0643,
"step": 2730
},
{
"epoch": 0.14,
"grad_norm": 7.25,
"learning_rate": 5.524444444444445e-07,
"loss": 0.0589,
"step": 2740
},
{
"epoch": 0.14,
"grad_norm": 5.78125,
"learning_rate": 5.5e-07,
"loss": 0.0508,
"step": 2750
},
{
"epoch": 0.14,
"grad_norm": 5.625,
"learning_rate": 5.475555555555556e-07,
"loss": 0.0644,
"step": 2760
},
{
"epoch": 0.14,
"grad_norm": 6.15625,
"learning_rate": 5.451111111111112e-07,
"loss": 0.0525,
"step": 2770
},
{
"epoch": 0.14,
"grad_norm": 7.5625,
"learning_rate": 5.426666666666667e-07,
"loss": 0.0626,
"step": 2780
},
{
"epoch": 0.14,
"grad_norm": 7.875,
"learning_rate": 5.402222222222222e-07,
"loss": 0.0568,
"step": 2790
},
{
"epoch": 0.14,
"grad_norm": 5.46875,
"learning_rate": 5.377777777777778e-07,
"loss": 0.0535,
"step": 2800
},
{
"epoch": 0.14,
"grad_norm": 5.46875,
"learning_rate": 5.353333333333334e-07,
"loss": 0.0539,
"step": 2810
},
{
"epoch": 0.14,
"grad_norm": 5.15625,
"learning_rate": 5.328888888888889e-07,
"loss": 0.0467,
"step": 2820
},
{
"epoch": 0.14,
"grad_norm": 5.90625,
"learning_rate": 5.304444444444444e-07,
"loss": 0.0613,
"step": 2830
},
{
"epoch": 0.14,
"grad_norm": 6.125,
"learning_rate": 5.28e-07,
"loss": 0.0516,
"step": 2840
},
{
"epoch": 0.14,
"grad_norm": 5.8125,
"learning_rate": 5.255555555555556e-07,
"loss": 0.0555,
"step": 2850
},
{
"epoch": 0.14,
"grad_norm": 8.125,
"learning_rate": 5.231111111111111e-07,
"loss": 0.0608,
"step": 2860
},
{
"epoch": 0.14,
"grad_norm": 7.78125,
"learning_rate": 5.206666666666666e-07,
"loss": 0.0559,
"step": 2870
},
{
"epoch": 0.14,
"grad_norm": 6.28125,
"learning_rate": 5.182222222222223e-07,
"loss": 0.0566,
"step": 2880
},
{
"epoch": 0.14,
"grad_norm": 6.53125,
"learning_rate": 5.157777777777778e-07,
"loss": 0.0475,
"step": 2890
},
{
"epoch": 0.14,
"grad_norm": 5.59375,
"learning_rate": 5.133333333333333e-07,
"loss": 0.0537,
"step": 2900
},
{
"epoch": 0.15,
"grad_norm": 7.75,
"learning_rate": 5.10888888888889e-07,
"loss": 0.0558,
"step": 2910
},
{
"epoch": 0.15,
"grad_norm": 5.9375,
"learning_rate": 5.084444444444445e-07,
"loss": 0.0518,
"step": 2920
},
{
"epoch": 0.15,
"grad_norm": 7.0625,
"learning_rate": 5.06e-07,
"loss": 0.0578,
"step": 2930
},
{
"epoch": 0.15,
"grad_norm": 6.0625,
"learning_rate": 5.035555555555556e-07,
"loss": 0.052,
"step": 2940
},
{
"epoch": 0.15,
"grad_norm": 5.53125,
"learning_rate": 5.011111111111112e-07,
"loss": 0.048,
"step": 2950
},
{
"epoch": 0.15,
"grad_norm": 6.59375,
"learning_rate": 4.986666666666667e-07,
"loss": 0.0566,
"step": 2960
},
{
"epoch": 0.15,
"grad_norm": 5.1875,
"learning_rate": 4.962222222222223e-07,
"loss": 0.0501,
"step": 2970
},
{
"epoch": 0.15,
"grad_norm": 6.09375,
"learning_rate": 4.937777777777778e-07,
"loss": 0.0554,
"step": 2980
},
{
"epoch": 0.15,
"grad_norm": 7.34375,
"learning_rate": 4.913333333333334e-07,
"loss": 0.0582,
"step": 2990
},
{
"epoch": 0.15,
"grad_norm": 5.09375,
"learning_rate": 4.888888888888889e-07,
"loss": 0.0568,
"step": 3000
},
{
"epoch": 0.15,
"eval_loss": 0.05641532689332962,
"eval_runtime": 64.515,
"eval_samples_per_second": 15.5,
"eval_steps_per_second": 15.5,
"step": 3000
},
{
"epoch": 0.15,
"grad_norm": 5.84375,
"learning_rate": 4.864444444444445e-07,
"loss": 0.0486,
"step": 3010
},
{
"epoch": 0.15,
"grad_norm": 6.34375,
"learning_rate": 4.84e-07,
"loss": 0.0504,
"step": 3020
},
{
"epoch": 0.15,
"grad_norm": 6.15625,
"learning_rate": 4.815555555555556e-07,
"loss": 0.0576,
"step": 3030
},
{
"epoch": 0.15,
"grad_norm": 5.4375,
"learning_rate": 4.791111111111111e-07,
"loss": 0.0585,
"step": 3040
},
{
"epoch": 0.15,
"grad_norm": 5.25,
"learning_rate": 4.766666666666667e-07,
"loss": 0.0557,
"step": 3050
},
{
"epoch": 0.15,
"grad_norm": 6.84375,
"learning_rate": 4.7422222222222225e-07,
"loss": 0.0509,
"step": 3060
},
{
"epoch": 0.15,
"grad_norm": 8.6875,
"learning_rate": 4.717777777777778e-07,
"loss": 0.0596,
"step": 3070
},
{
"epoch": 0.15,
"grad_norm": 8.1875,
"learning_rate": 4.693333333333334e-07,
"loss": 0.0532,
"step": 3080
},
{
"epoch": 0.15,
"grad_norm": 5.40625,
"learning_rate": 4.6688888888888893e-07,
"loss": 0.0546,
"step": 3090
},
{
"epoch": 0.15,
"grad_norm": 8.5,
"learning_rate": 4.6444444444444446e-07,
"loss": 0.0614,
"step": 3100
},
{
"epoch": 0.16,
"grad_norm": 5.375,
"learning_rate": 4.62e-07,
"loss": 0.0513,
"step": 3110
},
{
"epoch": 0.16,
"grad_norm": 6.71875,
"learning_rate": 4.595555555555556e-07,
"loss": 0.0542,
"step": 3120
},
{
"epoch": 0.16,
"grad_norm": 5.21875,
"learning_rate": 4.5711111111111114e-07,
"loss": 0.0513,
"step": 3130
},
{
"epoch": 0.16,
"grad_norm": 7.6875,
"learning_rate": 4.5466666666666666e-07,
"loss": 0.0589,
"step": 3140
},
{
"epoch": 0.16,
"grad_norm": 6.09375,
"learning_rate": 4.5222222222222224e-07,
"loss": 0.0515,
"step": 3150
},
{
"epoch": 0.16,
"grad_norm": 5.5,
"learning_rate": 4.497777777777778e-07,
"loss": 0.0581,
"step": 3160
},
{
"epoch": 0.16,
"grad_norm": 7.21875,
"learning_rate": 4.4733333333333334e-07,
"loss": 0.0559,
"step": 3170
},
{
"epoch": 0.16,
"grad_norm": 7.375,
"learning_rate": 4.448888888888889e-07,
"loss": 0.0537,
"step": 3180
},
{
"epoch": 0.16,
"grad_norm": 4.75,
"learning_rate": 4.4244444444444444e-07,
"loss": 0.0473,
"step": 3190
},
{
"epoch": 0.16,
"grad_norm": 7.9375,
"learning_rate": 4.4e-07,
"loss": 0.0494,
"step": 3200
},
{
"epoch": 0.16,
"grad_norm": 8.5,
"learning_rate": 4.375555555555556e-07,
"loss": 0.058,
"step": 3210
},
{
"epoch": 0.16,
"grad_norm": 6.0,
"learning_rate": 4.351111111111111e-07,
"loss": 0.057,
"step": 3220
},
{
"epoch": 0.16,
"grad_norm": 5.125,
"learning_rate": 4.3266666666666665e-07,
"loss": 0.0572,
"step": 3230
},
{
"epoch": 0.16,
"grad_norm": 5.09375,
"learning_rate": 4.302222222222223e-07,
"loss": 0.0525,
"step": 3240
},
{
"epoch": 0.16,
"grad_norm": 6.8125,
"learning_rate": 4.277777777777778e-07,
"loss": 0.0564,
"step": 3250
},
{
"epoch": 0.16,
"grad_norm": 6.5625,
"learning_rate": 4.2533333333333333e-07,
"loss": 0.0533,
"step": 3260
},
{
"epoch": 0.16,
"grad_norm": 5.96875,
"learning_rate": 4.228888888888889e-07,
"loss": 0.0569,
"step": 3270
},
{
"epoch": 0.16,
"grad_norm": 5.3125,
"learning_rate": 4.204444444444445e-07,
"loss": 0.0558,
"step": 3280
},
{
"epoch": 0.16,
"grad_norm": 5.875,
"learning_rate": 4.18e-07,
"loss": 0.0558,
"step": 3290
},
{
"epoch": 0.17,
"grad_norm": 6.375,
"learning_rate": 4.155555555555556e-07,
"loss": 0.0506,
"step": 3300
},
{
"epoch": 0.17,
"grad_norm": 6.28125,
"learning_rate": 4.131111111111111e-07,
"loss": 0.0519,
"step": 3310
},
{
"epoch": 0.17,
"grad_norm": 5.3125,
"learning_rate": 4.106666666666667e-07,
"loss": 0.0542,
"step": 3320
},
{
"epoch": 0.17,
"grad_norm": 5.65625,
"learning_rate": 4.0822222222222227e-07,
"loss": 0.0508,
"step": 3330
},
{
"epoch": 0.17,
"grad_norm": 8.625,
"learning_rate": 4.057777777777778e-07,
"loss": 0.0619,
"step": 3340
},
{
"epoch": 0.17,
"grad_norm": 5.5,
"learning_rate": 4.033333333333333e-07,
"loss": 0.051,
"step": 3350
},
{
"epoch": 0.17,
"grad_norm": 5.15625,
"learning_rate": 4.0088888888888895e-07,
"loss": 0.05,
"step": 3360
},
{
"epoch": 0.17,
"grad_norm": 5.71875,
"learning_rate": 3.984444444444445e-07,
"loss": 0.052,
"step": 3370
},
{
"epoch": 0.17,
"grad_norm": 7.25,
"learning_rate": 3.96e-07,
"loss": 0.053,
"step": 3380
},
{
"epoch": 0.17,
"grad_norm": 6.125,
"learning_rate": 3.935555555555555e-07,
"loss": 0.055,
"step": 3390
},
{
"epoch": 0.17,
"grad_norm": 5.59375,
"learning_rate": 3.9111111111111115e-07,
"loss": 0.0564,
"step": 3400
},
{
"epoch": 0.17,
"grad_norm": 5.8125,
"learning_rate": 3.886666666666667e-07,
"loss": 0.0505,
"step": 3410
},
{
"epoch": 0.17,
"grad_norm": 5.34375,
"learning_rate": 3.862222222222222e-07,
"loss": 0.048,
"step": 3420
},
{
"epoch": 0.17,
"grad_norm": 4.9375,
"learning_rate": 3.8377777777777783e-07,
"loss": 0.0506,
"step": 3430
},
{
"epoch": 0.17,
"grad_norm": 5.1875,
"learning_rate": 3.8133333333333336e-07,
"loss": 0.0634,
"step": 3440
},
{
"epoch": 0.17,
"grad_norm": 8.4375,
"learning_rate": 3.788888888888889e-07,
"loss": 0.0546,
"step": 3450
},
{
"epoch": 0.17,
"grad_norm": 5.625,
"learning_rate": 3.7644444444444446e-07,
"loss": 0.0552,
"step": 3460
},
{
"epoch": 0.17,
"grad_norm": 6.5,
"learning_rate": 3.7400000000000004e-07,
"loss": 0.0512,
"step": 3470
},
{
"epoch": 0.17,
"grad_norm": 5.8125,
"learning_rate": 3.7155555555555557e-07,
"loss": 0.055,
"step": 3480
},
{
"epoch": 0.17,
"grad_norm": 5.21875,
"learning_rate": 3.6911111111111114e-07,
"loss": 0.0464,
"step": 3490
},
{
"epoch": 0.17,
"grad_norm": 7.78125,
"learning_rate": 3.6666666666666667e-07,
"loss": 0.056,
"step": 3500
},
{
"epoch": 0.18,
"grad_norm": 11.8125,
"learning_rate": 3.6422222222222225e-07,
"loss": 0.0629,
"step": 3510
},
{
"epoch": 0.18,
"grad_norm": 5.71875,
"learning_rate": 3.617777777777778e-07,
"loss": 0.0595,
"step": 3520
},
{
"epoch": 0.18,
"grad_norm": 5.75,
"learning_rate": 3.5933333333333335e-07,
"loss": 0.0617,
"step": 3530
},
{
"epoch": 0.18,
"grad_norm": 5.5,
"learning_rate": 3.5688888888888887e-07,
"loss": 0.0556,
"step": 3540
},
{
"epoch": 0.18,
"grad_norm": 5.90625,
"learning_rate": 3.544444444444445e-07,
"loss": 0.0567,
"step": 3550
},
{
"epoch": 0.18,
"grad_norm": 6.03125,
"learning_rate": 3.5200000000000003e-07,
"loss": 0.0518,
"step": 3560
},
{
"epoch": 0.18,
"grad_norm": 7.125,
"learning_rate": 3.4955555555555555e-07,
"loss": 0.0588,
"step": 3570
},
{
"epoch": 0.18,
"grad_norm": 6.21875,
"learning_rate": 3.471111111111111e-07,
"loss": 0.0584,
"step": 3580
},
{
"epoch": 0.18,
"grad_norm": 5.46875,
"learning_rate": 3.446666666666667e-07,
"loss": 0.0668,
"step": 3590
},
{
"epoch": 0.18,
"grad_norm": 5.65625,
"learning_rate": 3.4222222222222223e-07,
"loss": 0.0615,
"step": 3600
},
{
"epoch": 0.18,
"grad_norm": 5.0625,
"learning_rate": 3.3977777777777776e-07,
"loss": 0.0564,
"step": 3610
},
{
"epoch": 0.18,
"grad_norm": 6.21875,
"learning_rate": 3.3733333333333334e-07,
"loss": 0.057,
"step": 3620
},
{
"epoch": 0.18,
"grad_norm": 6.3125,
"learning_rate": 3.348888888888889e-07,
"loss": 0.0579,
"step": 3630
},
{
"epoch": 0.18,
"grad_norm": 6.59375,
"learning_rate": 3.3244444444444444e-07,
"loss": 0.0577,
"step": 3640
},
{
"epoch": 0.18,
"grad_norm": 6.59375,
"learning_rate": 3.3e-07,
"loss": 0.0542,
"step": 3650
},
{
"epoch": 0.18,
"grad_norm": 5.46875,
"learning_rate": 3.2755555555555554e-07,
"loss": 0.0517,
"step": 3660
},
{
"epoch": 0.18,
"grad_norm": 8.6875,
"learning_rate": 3.251111111111111e-07,
"loss": 0.0562,
"step": 3670
},
{
"epoch": 0.18,
"grad_norm": 5.5625,
"learning_rate": 3.226666666666667e-07,
"loss": 0.0543,
"step": 3680
},
{
"epoch": 0.18,
"grad_norm": 7.5625,
"learning_rate": 3.202222222222222e-07,
"loss": 0.0529,
"step": 3690
},
{
"epoch": 0.18,
"grad_norm": 5.90625,
"learning_rate": 3.1777777777777775e-07,
"loss": 0.0544,
"step": 3700
},
{
"epoch": 0.19,
"grad_norm": 7.46875,
"learning_rate": 3.153333333333334e-07,
"loss": 0.0519,
"step": 3710
},
{
"epoch": 0.19,
"grad_norm": 5.5,
"learning_rate": 3.128888888888889e-07,
"loss": 0.0562,
"step": 3720
},
{
"epoch": 0.19,
"grad_norm": 5.375,
"learning_rate": 3.1044444444444443e-07,
"loss": 0.0532,
"step": 3730
},
{
"epoch": 0.19,
"grad_norm": 10.0,
"learning_rate": 3.0800000000000006e-07,
"loss": 0.0572,
"step": 3740
},
{
"epoch": 0.19,
"grad_norm": 5.15625,
"learning_rate": 3.055555555555556e-07,
"loss": 0.0585,
"step": 3750
},
{
"epoch": 0.19,
"grad_norm": 7.21875,
"learning_rate": 3.031111111111111e-07,
"loss": 0.0659,
"step": 3760
},
{
"epoch": 0.19,
"grad_norm": 6.8125,
"learning_rate": 3.006666666666667e-07,
"loss": 0.0634,
"step": 3770
},
{
"epoch": 0.19,
"grad_norm": 5.6875,
"learning_rate": 2.9822222222222226e-07,
"loss": 0.058,
"step": 3780
},
{
"epoch": 0.19,
"grad_norm": 5.46875,
"learning_rate": 2.957777777777778e-07,
"loss": 0.0493,
"step": 3790
},
{
"epoch": 0.19,
"grad_norm": 4.875,
"learning_rate": 2.9333333333333337e-07,
"loss": 0.0499,
"step": 3800
},
{
"epoch": 0.19,
"grad_norm": 5.40625,
"learning_rate": 2.908888888888889e-07,
"loss": 0.0595,
"step": 3810
},
{
"epoch": 0.19,
"grad_norm": 6.59375,
"learning_rate": 2.8844444444444447e-07,
"loss": 0.064,
"step": 3820
},
{
"epoch": 0.19,
"grad_norm": 5.125,
"learning_rate": 2.8600000000000005e-07,
"loss": 0.0475,
"step": 3830
},
{
"epoch": 0.19,
"grad_norm": 9.4375,
"learning_rate": 2.8355555555555557e-07,
"loss": 0.0589,
"step": 3840
},
{
"epoch": 0.19,
"grad_norm": 5.875,
"learning_rate": 2.811111111111111e-07,
"loss": 0.0563,
"step": 3850
},
{
"epoch": 0.19,
"grad_norm": 4.9375,
"learning_rate": 2.7866666666666673e-07,
"loss": 0.0603,
"step": 3860
},
{
"epoch": 0.19,
"grad_norm": 5.78125,
"learning_rate": 2.7622222222222225e-07,
"loss": 0.0541,
"step": 3870
},
{
"epoch": 0.19,
"grad_norm": 6.46875,
"learning_rate": 2.737777777777778e-07,
"loss": 0.0573,
"step": 3880
},
{
"epoch": 0.19,
"grad_norm": 9.25,
"learning_rate": 2.7133333333333335e-07,
"loss": 0.0563,
"step": 3890
},
{
"epoch": 0.2,
"grad_norm": 6.375,
"learning_rate": 2.688888888888889e-07,
"loss": 0.0533,
"step": 3900
},
{
"epoch": 0.2,
"grad_norm": 9.125,
"learning_rate": 2.6644444444444446e-07,
"loss": 0.0528,
"step": 3910
},
{
"epoch": 0.2,
"grad_norm": 6.71875,
"learning_rate": 2.64e-07,
"loss": 0.0605,
"step": 3920
},
{
"epoch": 0.2,
"grad_norm": 6.5,
"learning_rate": 2.6155555555555556e-07,
"loss": 0.0605,
"step": 3930
},
{
"epoch": 0.2,
"grad_norm": 5.34375,
"learning_rate": 2.5911111111111114e-07,
"loss": 0.0563,
"step": 3940
},
{
"epoch": 0.2,
"grad_norm": 5.5625,
"learning_rate": 2.5666666666666666e-07,
"loss": 0.0497,
"step": 3950
},
{
"epoch": 0.2,
"grad_norm": 9.9375,
"learning_rate": 2.5422222222222224e-07,
"loss": 0.0599,
"step": 3960
},
{
"epoch": 0.2,
"grad_norm": 7.625,
"learning_rate": 2.517777777777778e-07,
"loss": 0.0574,
"step": 3970
},
{
"epoch": 0.2,
"grad_norm": 6.53125,
"learning_rate": 2.4933333333333334e-07,
"loss": 0.0586,
"step": 3980
},
{
"epoch": 0.2,
"grad_norm": 5.3125,
"learning_rate": 2.468888888888889e-07,
"loss": 0.0576,
"step": 3990
},
{
"epoch": 0.2,
"grad_norm": 5.21875,
"learning_rate": 2.4444444444444445e-07,
"loss": 0.0594,
"step": 4000
},
{
"epoch": 0.2,
"eval_loss": 0.05571923404932022,
"eval_runtime": 64.3883,
"eval_samples_per_second": 15.531,
"eval_steps_per_second": 15.531,
"step": 4000
},
{
"epoch": 0.2,
"grad_norm": 6.0625,
"learning_rate": 2.42e-07,
"loss": 0.0556,
"step": 4010
},
{
"epoch": 0.2,
"grad_norm": 7.0625,
"learning_rate": 2.3955555555555555e-07,
"loss": 0.0558,
"step": 4020
},
{
"epoch": 0.2,
"grad_norm": 5.75,
"learning_rate": 2.3711111111111113e-07,
"loss": 0.0572,
"step": 4030
},
{
"epoch": 0.2,
"grad_norm": 6.71875,
"learning_rate": 2.346666666666667e-07,
"loss": 0.055,
"step": 4040
},
{
"epoch": 0.2,
"grad_norm": 5.9375,
"learning_rate": 2.3222222222222223e-07,
"loss": 0.0592,
"step": 4050
},
{
"epoch": 0.2,
"grad_norm": 9.125,
"learning_rate": 2.297777777777778e-07,
"loss": 0.0584,
"step": 4060
},
{
"epoch": 0.2,
"grad_norm": 5.8125,
"learning_rate": 2.2733333333333333e-07,
"loss": 0.0599,
"step": 4070
},
{
"epoch": 0.2,
"grad_norm": 8.625,
"learning_rate": 2.248888888888889e-07,
"loss": 0.0609,
"step": 4080
},
{
"epoch": 0.2,
"grad_norm": 5.46875,
"learning_rate": 2.2244444444444446e-07,
"loss": 0.0512,
"step": 4090
},
{
"epoch": 0.2,
"grad_norm": 9.25,
"learning_rate": 2.2e-07,
"loss": 0.0566,
"step": 4100
},
{
"epoch": 0.21,
"grad_norm": 6.75,
"learning_rate": 2.1755555555555556e-07,
"loss": 0.0525,
"step": 4110
},
{
"epoch": 0.21,
"grad_norm": 5.0625,
"learning_rate": 2.1511111111111114e-07,
"loss": 0.061,
"step": 4120
},
{
"epoch": 0.21,
"grad_norm": 6.34375,
"learning_rate": 2.1266666666666667e-07,
"loss": 0.058,
"step": 4130
},
{
"epoch": 0.21,
"grad_norm": 7.34375,
"learning_rate": 2.1022222222222224e-07,
"loss": 0.0636,
"step": 4140
},
{
"epoch": 0.21,
"grad_norm": 6.65625,
"learning_rate": 2.077777777777778e-07,
"loss": 0.0587,
"step": 4150
},
{
"epoch": 0.21,
"grad_norm": 5.5,
"learning_rate": 2.0533333333333335e-07,
"loss": 0.0559,
"step": 4160
},
{
"epoch": 0.21,
"grad_norm": 5.09375,
"learning_rate": 2.028888888888889e-07,
"loss": 0.0551,
"step": 4170
},
{
"epoch": 0.21,
"grad_norm": 5.96875,
"learning_rate": 2.0044444444444447e-07,
"loss": 0.0541,
"step": 4180
},
{
"epoch": 0.21,
"grad_norm": 6.0625,
"learning_rate": 1.98e-07,
"loss": 0.058,
"step": 4190
},
{
"epoch": 0.21,
"grad_norm": 6.65625,
"learning_rate": 1.9555555555555558e-07,
"loss": 0.0593,
"step": 4200
},
{
"epoch": 0.21,
"grad_norm": 8.5,
"learning_rate": 1.931111111111111e-07,
"loss": 0.0623,
"step": 4210
},
{
"epoch": 0.21,
"grad_norm": 5.84375,
"learning_rate": 1.9066666666666668e-07,
"loss": 0.0584,
"step": 4220
},
{
"epoch": 0.21,
"grad_norm": 5.5,
"learning_rate": 1.8822222222222223e-07,
"loss": 0.0508,
"step": 4230
},
{
"epoch": 0.21,
"grad_norm": 5.8125,
"learning_rate": 1.8577777777777778e-07,
"loss": 0.0618,
"step": 4240
},
{
"epoch": 0.21,
"grad_norm": 5.34375,
"learning_rate": 1.8333333333333333e-07,
"loss": 0.053,
"step": 4250
},
{
"epoch": 0.21,
"grad_norm": 6.875,
"learning_rate": 1.808888888888889e-07,
"loss": 0.0559,
"step": 4260
},
{
"epoch": 0.21,
"grad_norm": 7.375,
"learning_rate": 1.7844444444444444e-07,
"loss": 0.0576,
"step": 4270
},
{
"epoch": 0.21,
"grad_norm": 8.3125,
"learning_rate": 1.7600000000000001e-07,
"loss": 0.0547,
"step": 4280
},
{
"epoch": 0.21,
"grad_norm": 6.90625,
"learning_rate": 1.7355555555555554e-07,
"loss": 0.0532,
"step": 4290
},
{
"epoch": 0.21,
"grad_norm": 7.46875,
"learning_rate": 1.7111111111111112e-07,
"loss": 0.0696,
"step": 4300
},
{
"epoch": 0.22,
"grad_norm": 5.3125,
"learning_rate": 1.6866666666666667e-07,
"loss": 0.0569,
"step": 4310
},
{
"epoch": 0.22,
"grad_norm": 7.125,
"learning_rate": 1.6622222222222222e-07,
"loss": 0.059,
"step": 4320
},
{
"epoch": 0.22,
"grad_norm": 5.03125,
"learning_rate": 1.6377777777777777e-07,
"loss": 0.0486,
"step": 4330
},
{
"epoch": 0.22,
"grad_norm": 7.75,
"learning_rate": 1.6133333333333335e-07,
"loss": 0.0557,
"step": 4340
},
{
"epoch": 0.22,
"grad_norm": 5.90625,
"learning_rate": 1.5888888888888887e-07,
"loss": 0.0605,
"step": 4350
},
{
"epoch": 0.22,
"grad_norm": 4.96875,
"learning_rate": 1.5644444444444445e-07,
"loss": 0.0534,
"step": 4360
},
{
"epoch": 0.22,
"grad_norm": 4.78125,
"learning_rate": 1.5400000000000003e-07,
"loss": 0.0513,
"step": 4370
},
{
"epoch": 0.22,
"grad_norm": 10.75,
"learning_rate": 1.5155555555555555e-07,
"loss": 0.057,
"step": 4380
},
{
"epoch": 0.22,
"grad_norm": 6.125,
"learning_rate": 1.4911111111111113e-07,
"loss": 0.0602,
"step": 4390
},
{
"epoch": 0.22,
"grad_norm": 5.5625,
"learning_rate": 1.4666666666666668e-07,
"loss": 0.0546,
"step": 4400
},
{
"epoch": 0.22,
"grad_norm": 5.40625,
"learning_rate": 1.4422222222222223e-07,
"loss": 0.0543,
"step": 4410
},
{
"epoch": 0.22,
"grad_norm": 6.46875,
"learning_rate": 1.4177777777777779e-07,
"loss": 0.0627,
"step": 4420
},
{
"epoch": 0.22,
"grad_norm": 7.375,
"learning_rate": 1.3933333333333336e-07,
"loss": 0.0596,
"step": 4430
},
{
"epoch": 0.22,
"grad_norm": 5.5625,
"learning_rate": 1.368888888888889e-07,
"loss": 0.0593,
"step": 4440
},
{
"epoch": 0.22,
"grad_norm": 6.75,
"learning_rate": 1.3444444444444444e-07,
"loss": 0.0554,
"step": 4450
},
{
"epoch": 0.22,
"grad_norm": 6.21875,
"learning_rate": 1.32e-07,
"loss": 0.0516,
"step": 4460
},
{
"epoch": 0.22,
"grad_norm": 6.21875,
"learning_rate": 1.2955555555555557e-07,
"loss": 0.0529,
"step": 4470
},
{
"epoch": 0.22,
"grad_norm": 7.71875,
"learning_rate": 1.2711111111111112e-07,
"loss": 0.0549,
"step": 4480
},
{
"epoch": 0.22,
"grad_norm": 5.34375,
"learning_rate": 1.2466666666666667e-07,
"loss": 0.0564,
"step": 4490
},
{
"epoch": 0.23,
"grad_norm": 5.96875,
"learning_rate": 1.2222222222222222e-07,
"loss": 0.0576,
"step": 4500
},
{
"epoch": 0.23,
"grad_norm": 6.375,
"learning_rate": 1.1977777777777777e-07,
"loss": 0.0539,
"step": 4510
},
{
"epoch": 0.23,
"grad_norm": 6.25,
"learning_rate": 1.1733333333333335e-07,
"loss": 0.0546,
"step": 4520
},
{
"epoch": 0.23,
"grad_norm": 5.65625,
"learning_rate": 1.148888888888889e-07,
"loss": 0.0555,
"step": 4530
},
{
"epoch": 0.23,
"grad_norm": 5.84375,
"learning_rate": 1.1244444444444445e-07,
"loss": 0.0583,
"step": 4540
},
{
"epoch": 0.23,
"grad_norm": 5.75,
"learning_rate": 1.1e-07,
"loss": 0.0553,
"step": 4550
},
{
"epoch": 0.23,
"grad_norm": 5.875,
"learning_rate": 1.0755555555555557e-07,
"loss": 0.0533,
"step": 4560
},
{
"epoch": 0.23,
"grad_norm": 4.8125,
"learning_rate": 1.0511111111111112e-07,
"loss": 0.0565,
"step": 4570
},
{
"epoch": 0.23,
"grad_norm": 4.9375,
"learning_rate": 1.0266666666666667e-07,
"loss": 0.0573,
"step": 4580
},
{
"epoch": 0.23,
"grad_norm": 6.71875,
"learning_rate": 1.0022222222222224e-07,
"loss": 0.054,
"step": 4590
},
{
"epoch": 0.23,
"grad_norm": 7.21875,
"learning_rate": 9.777777777777779e-08,
"loss": 0.0592,
"step": 4600
},
{
"epoch": 0.23,
"grad_norm": 7.0625,
"learning_rate": 9.533333333333334e-08,
"loss": 0.0591,
"step": 4610
},
{
"epoch": 0.23,
"grad_norm": 6.03125,
"learning_rate": 9.288888888888889e-08,
"loss": 0.0645,
"step": 4620
},
{
"epoch": 0.23,
"grad_norm": 7.0625,
"learning_rate": 9.044444444444446e-08,
"loss": 0.0549,
"step": 4630
},
{
"epoch": 0.23,
"grad_norm": 5.65625,
"learning_rate": 8.800000000000001e-08,
"loss": 0.0554,
"step": 4640
},
{
"epoch": 0.23,
"grad_norm": 7.8125,
"learning_rate": 8.555555555555556e-08,
"loss": 0.0536,
"step": 4650
},
{
"epoch": 0.23,
"grad_norm": 5.6875,
"learning_rate": 8.311111111111111e-08,
"loss": 0.0633,
"step": 4660
},
{
"epoch": 0.23,
"grad_norm": 5.96875,
"learning_rate": 8.066666666666667e-08,
"loss": 0.0529,
"step": 4670
},
{
"epoch": 0.23,
"grad_norm": 7.96875,
"learning_rate": 7.822222222222223e-08,
"loss": 0.0532,
"step": 4680
},
{
"epoch": 0.23,
"grad_norm": 9.5625,
"learning_rate": 7.577777777777778e-08,
"loss": 0.0603,
"step": 4690
},
{
"epoch": 0.23,
"grad_norm": 6.875,
"learning_rate": 7.333333333333334e-08,
"loss": 0.0584,
"step": 4700
},
{
"epoch": 0.24,
"grad_norm": 6.0625,
"learning_rate": 7.088888888888889e-08,
"loss": 0.0577,
"step": 4710
},
{
"epoch": 0.24,
"grad_norm": 7.71875,
"learning_rate": 6.844444444444444e-08,
"loss": 0.0612,
"step": 4720
},
{
"epoch": 0.24,
"grad_norm": 6.28125,
"learning_rate": 6.6e-08,
"loss": 0.0668,
"step": 4730
},
{
"epoch": 0.24,
"grad_norm": 6.09375,
"learning_rate": 6.355555555555556e-08,
"loss": 0.0577,
"step": 4740
},
{
"epoch": 0.24,
"grad_norm": 5.59375,
"learning_rate": 6.111111111111111e-08,
"loss": 0.0578,
"step": 4750
},
{
"epoch": 0.24,
"grad_norm": 5.78125,
"learning_rate": 5.8666666666666676e-08,
"loss": 0.0567,
"step": 4760
},
{
"epoch": 0.24,
"grad_norm": 7.3125,
"learning_rate": 5.622222222222223e-08,
"loss": 0.0609,
"step": 4770
},
{
"epoch": 0.24,
"grad_norm": 6.34375,
"learning_rate": 5.3777777777777785e-08,
"loss": 0.0561,
"step": 4780
},
{
"epoch": 0.24,
"grad_norm": 5.125,
"learning_rate": 5.1333333333333336e-08,
"loss": 0.0531,
"step": 4790
},
{
"epoch": 0.24,
"grad_norm": 5.625,
"learning_rate": 4.8888888888888894e-08,
"loss": 0.0546,
"step": 4800
},
{
"epoch": 0.24,
"grad_norm": 7.21875,
"learning_rate": 4.6444444444444446e-08,
"loss": 0.0525,
"step": 4810
},
{
"epoch": 0.24,
"grad_norm": 5.65625,
"learning_rate": 4.4000000000000004e-08,
"loss": 0.0589,
"step": 4820
},
{
"epoch": 0.24,
"grad_norm": 6.90625,
"learning_rate": 4.1555555555555555e-08,
"loss": 0.0595,
"step": 4830
},
{
"epoch": 0.24,
"grad_norm": 5.5625,
"learning_rate": 3.911111111111111e-08,
"loss": 0.054,
"step": 4840
},
{
"epoch": 0.24,
"grad_norm": 7.375,
"learning_rate": 3.666666666666667e-08,
"loss": 0.0606,
"step": 4850
},
{
"epoch": 0.24,
"grad_norm": 6.375,
"learning_rate": 3.422222222222222e-08,
"loss": 0.0572,
"step": 4860
},
{
"epoch": 0.24,
"grad_norm": 5.21875,
"learning_rate": 3.177777777777778e-08,
"loss": 0.0558,
"step": 4870
},
{
"epoch": 0.24,
"grad_norm": 6.9375,
"learning_rate": 2.9333333333333338e-08,
"loss": 0.0577,
"step": 4880
},
{
"epoch": 0.24,
"grad_norm": 5.46875,
"learning_rate": 2.6888888888888893e-08,
"loss": 0.0521,
"step": 4890
},
{
"epoch": 0.24,
"grad_norm": 4.96875,
"learning_rate": 2.4444444444444447e-08,
"loss": 0.0548,
"step": 4900
},
{
"epoch": 0.25,
"grad_norm": 6.9375,
"learning_rate": 2.2000000000000002e-08,
"loss": 0.0582,
"step": 4910
},
{
"epoch": 0.25,
"grad_norm": 5.0,
"learning_rate": 1.9555555555555556e-08,
"loss": 0.0527,
"step": 4920
},
{
"epoch": 0.25,
"grad_norm": 6.96875,
"learning_rate": 1.711111111111111e-08,
"loss": 0.0521,
"step": 4930
},
{
"epoch": 0.25,
"grad_norm": 5.28125,
"learning_rate": 1.4666666666666669e-08,
"loss": 0.0581,
"step": 4940
},
{
"epoch": 0.25,
"grad_norm": 5.59375,
"learning_rate": 1.2222222222222224e-08,
"loss": 0.0509,
"step": 4950
},
{
"epoch": 0.25,
"grad_norm": 9.1875,
"learning_rate": 9.777777777777778e-09,
"loss": 0.0546,
"step": 4960
},
{
"epoch": 0.25,
"grad_norm": 5.71875,
"learning_rate": 7.3333333333333345e-09,
"loss": 0.0588,
"step": 4970
},
{
"epoch": 0.25,
"grad_norm": 6.15625,
"learning_rate": 4.888888888888889e-09,
"loss": 0.0601,
"step": 4980
},
{
"epoch": 0.25,
"grad_norm": 8.0625,
"learning_rate": 2.4444444444444446e-09,
"loss": 0.0551,
"step": 4990
},
{
"epoch": 0.25,
"grad_norm": 5.25,
"learning_rate": 0.0,
"loss": 0.0534,
"step": 5000
},
{
"epoch": 0.25,
"eval_loss": 0.05834246426820755,
"eval_runtime": 64.5109,
"eval_samples_per_second": 15.501,
"eval_steps_per_second": 15.501,
"step": 5000
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"total_flos": 8.06961020928e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}